def main(cutoff=None, persist=False):
    train_x, train_y, train_positions, train_file_names = get_data(
        main_dir=TRAINING_DIR)

    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    df = pd.DataFrame(data={'label': train_y, 'pos': train_positions})

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)

    X = [preprocessor.process_text(x) for x in train_x]

    X_word_chunks = word_chunks(X, n=300, process=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True)

    fmap = {
        'lexical_features': lexical(X_word_chunks),
        'stop_word_features': ngrams.get_stop_words(X_word_chunks),
        'function_word_features': ngrams.get_function_words(X_word_chunks),
        'pos_tag_features': ngrams.get_pos_tags(X_pos_chunks),
        'process_tag_features': processed_tags(X_word_chunks),
        'word_frequency': wf.average_word_frequency(X_word_chunks),
        'readability_features': readability(X_word_chunks)
    }

    for key, feature in fmap.items():
        df[key] = feature

    if persist:
        df.to_csv(TRAIN_CSV_FILE)
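
# -- Hedged sketch --------------------------------------------------------
# word_chunks / char_chunks are project helpers whose source is not shown in
# these snippets. A minimal sketch of what word_chunks(X, n=300) plausibly
# does -- split each text into consecutive ~n-word segments, overlapping when
# sliding=True -- is given below. The name and behaviour are assumptions,
# not the project's actual implementation (the `process` and `chunks`
# arguments seen elsewhere are ignored here).
def word_chunks_sketch(texts, n=300, sliding=False):
    step = n // 2 if sliding else n        # half-window overlap when sliding
    chunked = []
    for text in texts:
        words = text.split() if isinstance(text, str) else list(text)
        # trailing partial windows are dropped, except for very short texts
        chunked.append([words[i:i + n]
                        for i in range(0, max(len(words) - n + 1, 1), step)])
    return chunked
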
    def pipeline(self, X, pos_tag_x):
        X = [preprocessor.process_text(x) for x in X]

        X_word_chunks = word_chunks(X, n=300, process=True, sliding=True)
        X_char_chunks = char_chunks(X, n=2000, sliding=True)
        X_pos_chunks = word_chunks(pos_tag_x,
                                   n=300,
                                   process=True,
                                   sliding=True)

        max_segments = 20
        lexical_features = sequence.pad_sequences(lexical(X_word_chunks),
                                                  maxlen=max_segments)
        stop_word_features = sequence.pad_sequences(
            self.ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
        function_word_features = sequence.pad_sequences(
            self.ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
        pos_tag_features = sequence.pad_sequences(
            self.ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
        word_frequency = sequence.pad_sequences(
            wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
        readability_features = sequence.pad_sequences(
            readability(X_word_chunks), maxlen=max_segments)

        return np.concatenate([
            lexical_features, stop_word_features, function_word_features,
            pos_tag_features, word_frequency, readability_features
        ],
                              axis=2)
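
# -- Hedged shape check ---------------------------------------------------
# Each extractor above appears to return one (n_segments, n_features) matrix
# per document; pad_sequences pads the segment axis to max_segments, and the
# final concatenate stacks feature dimensions. A self-contained check of that
# shape arithmetic (assuming the Keras pad_sequences; note its default
# dtype='int32' truncates floats, so float features need dtype='float32'):
import numpy as np
from tensorflow.keras.preprocessing import sequence

a = [np.random.rand(5, 3), np.random.rand(12, 3)]  # 2 docs, varying segments
b = [np.random.rand(5, 2), np.random.rand(12, 2)]
a_pad = sequence.pad_sequences(a, maxlen=20, dtype='float32')  # (2, 20, 3)
b_pad = sequence.pad_sequences(b, maxlen=20, dtype='float32')  # (2, 20, 2)
assert np.concatenate([a_pad, b_pad], axis=2).shape == (2, 20, 5)
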
Example #3
def main(cutoff=None, persist=True):
    train_x, train_y, train_positions, train_file_names = get_data(
        #main_dir=TRAINING_DIR,
        external_file=TRAINING_EXTERNAL_FILE
    )

    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)

    X = [preprocessor.process_text(x) for x in train_x]

    X_word_chunks = word_chunks(X, n=300, process=True, sliding=True)
    #print('Word', max([len(s) for s in X_word_chunks]))
    X_char_chunks = char_chunks(X, n=2000, sliding=True)
    #print('Char', max([len(s) for s in X_char_chunks]))

    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True, sliding=True)

    max_segments = 20

    lexical_features = sequence.pad_sequences(lexical(X_word_chunks), maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(readability(X_word_chunks), maxlen=max_segments)

    # lexical_features = lexical(X_word_chunks)
    # stop_word_features = ngrams.get_stop_words(X_word_chunks)
    # function_word_features = ngrams.get_function_words(X_word_chunks)
    # pos_tag_features = ngrams.get_pos_tags(X_pos_chunks)
    # process_tag_features = processed_tags(X_word_chunks)
    # word_frequency = wf.average_word_frequency(X_word_chunks)
    # readability_features = readability(X_word_chunks)
    # tfidf = ngrams.get_word_tfidf(X_word_chunks)


    # paragraph_features was referenced in the original concatenation but is
    # never defined in this snippet, so it is omitted here.
    X = np.concatenate([lexical_features, stop_word_features,
                        function_word_features, pos_tag_features,
                        word_frequency, readability_features], axis=2)

    print(X.shape)

    if persist:
        np.save(TRAIN_X_FILE, X)
        np.save(TRAIN_Y_FILE, train_y)
Example #4
    def pipeline(self, X, pos_tag_x, fit_scalers):
        feature_names = []

        X = [preprocessor.process_text(x) for x in X]

        X_word_chunks = word_chunks(X, chunks=4, process=True, sliding=True)
        X_pos_chunks = word_chunks(pos_tag_x,
                                   chunks=4,
                                   process=True,
                                   sliding=True)

        lexical_features = max_diff(lexical(X_word_chunks, feature_names))

        stop_word_features = max_diff(
            self.ngrams.get_stop_words(X_word_chunks, feature_names))

        function_word_features = max_diff(
            self.ngrams.get_function_words(X_word_chunks, feature_names))

        pos_tag_features = max_diff(
            self.ngrams.get_pos_tags(X_pos_chunks, feature_names))

        process_tag_features = max_diff(
            processed_tags(X_word_chunks, feature_names))

        word_frequency = max_diff(
            wf.average_word_frequency(X_word_chunks, feature_names))
        readability_features = max_diff(
            readability(X_word_chunks, feature_names=feature_names))
        #tfidf = max_diff(self.ngrams.get_word_tfidf(X_word_chunks, feature_names))
        num_par = num_paragraphs(X, feature_names)

        X = np.concatenate(
            (lexical_features, stop_word_features, function_word_features,
             pos_tag_features, process_tag_features, word_frequency,
             readability_features, num_par),
            axis=1)

        X = minmax_scale(X)

        return X, feature_names
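
# -- Hedged sketch --------------------------------------------------------
# max_diff is a project helper not shown in these snippets. A plausible
# sketch, assuming it collapses each document's (n_segments, n_features)
# matrix into a single vector by taking the spread (max - min) of every
# feature across segments (an assumption, not the confirmed behaviour):
import numpy as np

def max_diff_sketch(per_doc_features):
    return np.array([np.asarray(f).max(axis=0) - np.asarray(f).min(axis=0)
                     for f in per_doc_features])
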
def main(cutoff=10000, persist=True):
    train_x, train_y, train_positions, train_file_names = get_data(
        external_file=TRAINING_EXTERNAL_FILE
    )

    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)

    X = [preprocessor.process_text(x) for x in train_x]

    X_word_chunks = word_chunks(X, n=300, process=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True)

    max_segments = 10

    lexical_features = sequence.pad_sequences(lexical(X_word_chunks), maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    process_tag_features = sequence.pad_sequences(processed_tags(X_word_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(readability(X_word_chunks), maxlen=max_segments)

    print(type(lexical_features))

    X = np.concatenate([lexical_features, stop_word_features,
                            function_word_features, pos_tag_features, 
                            process_tag_features, word_frequency,
                            readability_features], axis=2)

    if persist:
        np.save(TRAIN_X_FILE, X)
        np.save(TRAIN_Y_FILE, train_y)
    def pipeline(self, X):
        return gmm(lexical(sliding_sent_chunks(X)), self.params['gmm'])
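
# -- Hedged sketch --------------------------------------------------------
# gmm and sliding_sent_chunks are project helpers not shown here. A minimal
# sketch of a GMM over per-chunk lexical features, assuming the 'gmm' params
# are scikit-learn GaussianMixture kwargs (both that assumption and the
# scoring choice -- per-chunk log-likelihoods -- are hypothetical):
import numpy as np
from sklearn.mixture import GaussianMixture

def gmm_sketch(chunk_features, gmm_params):
    scores = []
    for feats in chunk_features:                 # (n_chunks, n_features)
        gm = GaussianMixture(**gmm_params).fit(feats)
        scores.append(gm.score_samples(feats))   # log-likelihood per chunk
    return scores
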
Example #7
    def fit(self, train_x, train_y, train_positions, test_x=None):
        train_x = [preprocessor.process_text(x) for x in train_x]
        phrase_frequency_func = lambda x: np.array(
            phrase_frequency(x, **self.params['phrase_transformer']))
        frequent_words_diff_func = lambda x: np.array(
            frequent_words_diff(
                x, **self.params['frequent_words_diff_transformer']))
        min_max_lexical_per_segment_func = lambda x: max_diff(lexical(x))
        apostrophe_discrepancies_func = lambda x: np.array(
            apostrophe_discrepancies(x))
        quote_discrepancies_func = lambda x: np.array(quote_discrepancies(x))
        ascii_discrepancies_func = lambda x: np.array(
            ascii_discrepancies(x, **self.params['ascii_transformer']))
        rare_richness_func = lambda x: max_diff(
            WordFrequency().average_word_frequency(x))
        text_length_func = lambda x: np.array(text_length(x))
        readability_func = lambda x: np.array(max_diff(readability(x)))

        self.stack = [
            (phrase_frequency_func, False, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (frequent_words_diff_func, False, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (readability_func, True, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (apostrophe_discrepancies_func, False, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (quote_discrepancies_func, False, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (rare_richness_func, True, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (min_max_lexical_per_segment_func, True, True,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),

            # (ascii_discrepancies_func, False, False, StandardScaler(**self.params['scaler_params']), [
            #     RandomForestClassifier(**self.params['rf_params']),
            #     MLPClassifier(**self.params['mlp_params']),
            #     SVC(**self.params['svm_params']),
            #     AdaBoostClassifier(**self.params['ab_params']),
            # ]),
            (text_length_func, False, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ])
        ]

        self.raw_text_models = [
            LightGbmWithLogReg(),
        ]

        if self.params["use_nn"]:
            self.raw_text_models.append(CharacterCNN())

        if self.params['meta_learner']:
            self.global_model_weights = []

            train_x_zero, train_y_zero, train_x_meta, train_y_meta = self.split_data(
                train_x, train_y)

            self.meta_learner = LogisticRegression(
                **self.params['meta_params'])

            predictions_zero = self.stack_fit_predict(
                train_x_zero,
                train_y_zero,
                train_x_meta,
                train_y_meta,
                test_additional=train_x_meta)
            self.meta_learner.fit(self.convert_to_meta_input(predictions_zero),
                                  train_y_meta)

        self.stack_fit_predict(train_x, train_y, test_additional=test_x)
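
# -- Hedged sketch --------------------------------------------------------
# Each self.stack entry above is a tuple (feature_func, flag_a, flag_b,
# scaler, classifiers); the two booleans are not documented in these
# snippets, and stack_fit_predict's source is not shown. A hedged sketch of
# how one entry might be consumed (an illustration, not the project's code):
def fit_stack_entry_sketch(entry, train_x, train_y):
    feature_func, _flag_a, _flag_b, scaler, classifiers = entry
    features = scaler.fit_transform(feature_func(train_x))
    for clf in classifiers:
        clf.fit(features, train_y)
    return [clf.predict(features) for clf in classifiers]
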
Example #8
    def fit(self, train_x, train_y, train_positions):
        left, right = calculate_weights_count(
            train_x, train_positions, **self.params['split_points_params'])

        phrase_frequency_func = lambda x: np.array(
            phrase_frequency(x, **self.params['phrase_transformer']))
        frequent_words_diff_func = lambda x: np.array(
            frequent_words_diff(
                x, **self.params['frequent_words_diff_transformer']))
        min_max_lexical_per_segment_func = lambda x: max_diff(lexical(x))
        apostrophe_discrepancies_func = lambda x: np.array(
            apostrophe_discrepancies(x))
        quote_discrepancies_func = lambda x: np.array(quote_discrepancies(x))
        ascii_discrepancies_func = lambda x: np.array(
            ascii_discrepancies(x, **self.params['ascii_transformer']))
        rare_richness_func = lambda x: max_diff(
            WordFrequency().average_word_frequency(x))
        text_length_func = lambda x: np.array(text_length(x))
        split_points_func = lambda x: np.array(
            split_points_count(
                x, left, right,
                window_words=self.params['split_points_params']['size'] * 2))
        readability_func = lambda x: np.array(max_diff(readability(x)))

        self.stack = [
            (phrase_frequency_func, False, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (frequent_words_diff_func, False, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (readability_func, True, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (apostrophe_discrepancies_func, False, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (quote_discrepancies_func, False, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (rare_richness_func, True, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (min_max_lexical_per_segment_func, True, True,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (ascii_discrepancies_func, False, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (text_length_func, False, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
            (split_points_func, False, False,
             StandardScaler(**self.params['scaler_params']), [
                 RandomForestClassifier(**self.params['rf_params']),
                 MLPClassifier(**self.params['mlp_params']),
                 SVC(**self.params['svm_params']),
                 AdaBoostClassifier(**self.params['ab_params']),
             ]),
        ]

        if self.params['meta_learner']:
            self.global_model_weights = []

            train_x_zero, train_y_zero, train_x_meta, train_y_meta = self.split_data(
                train_x, train_y)

            self.meta_learner = LogisticRegression(
                **self.params['meta_params'])

            predictions_zero = self.stack_fit_predict(train_x_zero,
                                                      train_y_zero,
                                                      train_x_meta,
                                                      train_y_meta)

            self.meta_learner.fit(self.convert_to_meta_input(predictions_zero),
                                  train_y_meta)

        self.stack_fit_predict(train_x, train_y)
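
# -- Hedged sketch --------------------------------------------------------
# Both fit methods above follow the same stacking scheme: base models are
# fit on one split of the data, and their predictions on the held-out split
# become features for a LogisticRegression meta-learner. split_data and
# convert_to_meta_input are not shown; the sketch below assumes the meta
# input is simply the stacked per-model predictions (an assumption):
import numpy as np
from sklearn.linear_model import LogisticRegression

def fit_meta_learner_sketch(per_model_predictions, y_meta, meta_params=None):
    meta_X = np.column_stack(per_model_predictions)  # (n_meta, n_models)
    return LogisticRegression(**(meta_params or {})).fit(meta_X, y_meta)
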