def main(cutoff=None, persist=False):
    train_x, train_y, train_positions, train_file_names = get_data(
        main_dir=TRAINING_DIR)
    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    df = pd.DataFrame(data={'label': train_y, 'pos': train_positions})

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)
    X = [preprocessor.process_text(x) for x in train_x]
    X_word_chunks = word_chunks(X, n=300, process=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True)

    fmap = {
        'lexical_features': lexical(X_word_chunks),
        'stop_word_features': ngrams.get_stop_words(X_word_chunks),
        'function_word_features': ngrams.get_function_words(X_word_chunks),
        'pos_tag_features': ngrams.get_pos_tags(X_pos_chunks),
        'process_tag_features': processed_tags(X_word_chunks),
        'word_frequency': wf.average_word_frequency(X_word_chunks),
        'readability_features': readability(X_word_chunks),
    }
    for key, feature in fmap.items():
        df[key] = feature

    if persist:
        df.to_csv(TRAIN_CSV_FILE)
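# A minimal, self-contained sketch of the chunking step used above.
# word_chunks(X, n=300) is assumed here to split each (already
# preprocessed) text into consecutive ~n-word segments so that every
# feature extractor can emit one value per segment; the real helper also
# takes `process` and `sliding` options. `word_chunks_sketch` is a
# hypothetical name, illustrative only.
def word_chunks_sketch(texts, n=300):
    chunked = []
    for text in texts:
        words = text.split()
        chunked.append([' '.join(words[i:i + n])
                        for i in range(0, max(len(words), 1), n)])
    return chunked

# Example: a 5-word document with n=2 yields three segments.
# word_chunks_sketch(["a b c d e"], n=2) -> [['a b', 'c d', 'e']]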
def pipeline(self, X, pos_tag_x):
    X = [preprocessor.process_text(x) for x in X]
    X_word_chunks = word_chunks(X, n=300, process=True, sliding=True)
    X_char_chunks = char_chunks(X, n=2000, sliding=True)  # currently unused
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True, sliding=True)

    max_segments = 20
    lexical_features = sequence.pad_sequences(
        lexical(X_word_chunks), maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(
        self.ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(
        self.ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(
        self.ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(
        wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(
        readability(X_word_chunks), maxlen=max_segments)

    return np.concatenate([
        lexical_features, stop_word_features, function_word_features,
        pos_tag_features, word_frequency, readability_features
    ], axis=2)
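# Why the padding: each document yields a variable number of segments,
# but np.concatenate(..., axis=2) needs every feature block shaped
# (num_docs, max_segments, num_features). A numpy-only sketch of the
# pre-padding/pre-truncation that keras' sequence.pad_sequences performs
# by default; `pad_segments_sketch` is a hypothetical name.
# Caution, worth checking in the real code: keras' pad_sequences defaults
# to dtype='int32', so real-valued features are truncated unless an
# explicit dtype='float32' is passed.
import numpy as np

def pad_segments_sketch(per_doc_vectors, maxlen):
    num_features = len(per_doc_vectors[0][0])
    out = np.zeros((len(per_doc_vectors), maxlen, num_features))
    for i, vecs in enumerate(per_doc_vectors):
        vecs = np.asarray(vecs[-maxlen:], dtype=float)  # truncate from the front
        out[i, maxlen - len(vecs):] = vecs              # pad zeros on the left
    return out

# pad_segments_sketch([[[1., 2.]], [[3., 4.], [5., 6.]]], maxlen=20).shape
# -> (2, 20, 2)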
def main(cutoff=None, persist=True):
    train_x, train_y, train_positions, train_file_names = get_data(
        external_file=TRAINING_EXTERNAL_FILE)
    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)
    X = [preprocessor.process_text(x) for x in train_x]
    X_word_chunks = word_chunks(X, n=300, process=True, sliding=True)
    X_char_chunks = char_chunks(X, n=2000, sliding=True)  # currently unused
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True, sliding=True)

    max_segments = 20
    lexical_features = sequence.pad_sequences(
        lexical(X_word_chunks), maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(
        ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(
        ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(
        ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(
        wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(
        readability(X_word_chunks), maxlen=max_segments)

    # `paragraph_features` was referenced in the original concatenation but
    # never defined, so it is dropped here.
    X = np.concatenate([
        lexical_features, stop_word_features, function_word_features,
        pos_tag_features, word_frequency, readability_features
    ], axis=2)
    print(X.shape)

    if persist:
        np.save(TRAIN_X_FILE, X)
        np.save(TRAIN_Y_FILE, train_y)
def pipeline(self, X, pos_tag_x, fit_scalers):
    feature_names = []
    X = [preprocessor.process_text(x) for x in X]
    X_word_chunks = word_chunks(X, chunks=4, process=True, sliding=True)
    X_pos_chunks = word_chunks(pos_tag_x, chunks=4, process=True, sliding=True)

    lexical_features = max_diff(lexical(X_word_chunks, feature_names))
    stop_word_features = max_diff(
        self.ngrams.get_stop_words(X_word_chunks, feature_names))
    function_word_features = max_diff(
        self.ngrams.get_function_words(X_word_chunks, feature_names))
    pos_tag_features = max_diff(
        self.ngrams.get_pos_tags(X_pos_chunks, feature_names))
    process_tag_features = max_diff(
        processed_tags(X_word_chunks, feature_names))
    word_frequency = max_diff(
        wf.average_word_frequency(X_word_chunks, feature_names))
    readability_features = max_diff(
        readability(X_word_chunks, feature_names=feature_names))
    # tfidf = max_diff(self.ngrams.get_word_tfidf(X_word_chunks, feature_names))
    num_par = num_paragraphs(X, feature_names)

    X = np.concatenate(
        (lexical_features, stop_word_features, function_word_features,
         pos_tag_features, process_tag_features, word_frequency,
         readability_features, num_par),
        axis=1)
    X = minmax_scale(X)
    return X, feature_names
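# max_diff is assumed here to collapse each document's per-chunk feature
# matrix into one fixed-length vector: the per-feature spread (max - min)
# across chunks, which grows when one segment's style deviates from the
# rest. A minimal numpy sketch under that assumption; `max_diff_sketch`
# is a hypothetical name.
import numpy as np

def max_diff_sketch(per_doc_chunk_features):
    return np.array([
        np.asarray(chunks).max(axis=0) - np.asarray(chunks).min(axis=0)
        for chunks in per_doc_chunk_features
    ])

# A document whose 4 chunks score [[1, 5], [1, 5], [9, 5], [1, 5]] yields
# [8, 0]: feature 0 varies across chunks, feature 1 does not.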
def main(cutoff=10000, persist=True):
    train_x, train_y, train_positions, train_file_names = get_data(
        external_file=TRAINING_EXTERNAL_FILE)
    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)
    X = [preprocessor.process_text(x) for x in train_x]
    X_word_chunks = word_chunks(X, n=300, process=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True)

    max_segments = 10
    lexical_features = sequence.pad_sequences(
        lexical(X_word_chunks), maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(
        ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(
        ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(
        ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    process_tag_features = sequence.pad_sequences(
        processed_tags(X_word_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(
        wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(
        readability(X_word_chunks), maxlen=max_segments)

    X = np.concatenate([
        lexical_features, stop_word_features, function_word_features,
        pos_tag_features, process_tag_features, word_frequency,
        readability_features
    ], axis=2)

    if persist:
        np.save(TRAIN_X_FILE, X)
        np.save(TRAIN_Y_FILE, train_y)
def pipeline(self, X):
    return gmm(lexical(sliding_sent_chunks(X)), self.params['gmm'])
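# A hedged sketch of the gmm step above, assuming sliding_sent_chunks
# yields per-document lists of sentence-window feature vectors and that
# gmm() wraps sklearn's GaussianMixture with the hyper-parameters in
# params['gmm']. What the real gmm() returns is not shown in this code;
# here a per-document mean log-likelihood is used as one plausible
# summary. `gmm_sketch` is a hypothetical name.
import numpy as np
from sklearn.mixture import GaussianMixture

def gmm_sketch(per_doc_chunk_features, gmm_params):
    scores = []
    for chunks in per_doc_chunk_features:
        chunks = np.asarray(chunks, dtype=float)
        model = GaussianMixture(**gmm_params).fit(chunks)
        scores.append(model.score(chunks))  # mean per-chunk log-likelihood
    return np.array(scores)

# gmm_sketch([[[0.1, 0.2], [0.4, 0.1], [0.2, 0.2]]], {'n_components': 1})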
def fit(self, train_x, train_y, train_positions, test_x=None):
    train_x = [preprocessor.process_text(x) for x in train_x]

    phrase_frequency_func = lambda x: np.array(
        phrase_frequency(x, **self.params['phrase_transformer']))
    frequent_words_diff_func = lambda x: np.array(
        frequent_words_diff(x, **self.params['frequent_words_diff_transformer']))
    min_max_lexical_per_segment_func = lambda x: max_diff(lexical(x))
    apostrophe_discrepancies_func = lambda x: np.array(
        apostrophe_discrepancies(x))
    quote_discrepancies_func = lambda x: np.array(quote_discrepancies(x))
    ascii_discrepancies_func = lambda x: np.array(
        ascii_discrepancies(x, **self.params['ascii_transformer']))
    rare_richness_func = lambda x: max_diff(
        WordFrequency().average_word_frequency(x))
    text_length_func = lambda x: np.array(text_length(x))
    readability_func = lambda x: np.array(max_diff(readability(x)))

    def scaler():
        return StandardScaler(**self.params['scaler_params'])

    def classifiers():
        # Fresh estimator instances for every stack entry.
        return [
            RandomForestClassifier(**self.params['rf_params']),
            MLPClassifier(**self.params['mlp_params']),
            SVC(**self.params['svm_params']),
            AdaBoostClassifier(**self.params['ab_params']),
        ]

    # Each entry: (feature function, two pipeline flags, scaler, base models).
    self.stack = [
        (phrase_frequency_func, False, False, scaler(), classifiers()),
        (frequent_words_diff_func, False, False, scaler(), classifiers()),
        (readability_func, True, False, scaler(), classifiers()),
        (apostrophe_discrepancies_func, False, False, scaler(), classifiers()),
        (quote_discrepancies_func, False, False, scaler(), classifiers()),
        (rare_richness_func, True, False, scaler(), classifiers()),
        (min_max_lexical_per_segment_func, True, True, scaler(), classifiers()),
        # (ascii_discrepancies_func, False, False, scaler(), classifiers()),
        (text_length_func, False, False, scaler(), classifiers()),
    ]

    self.raw_text_models = [
        LightGbmWithLogReg(),
    ]
    if self.params['use_nn']:
        self.raw_text_models.append(CharacterCNN())

    if self.params['meta_learner']:
        self.global_model_weights = []
        train_x_zero, train_y_zero, train_x_meta, train_y_meta = \
            self.split_data(train_x, train_y)
        self.meta_learner = LogisticRegression(**self.params['meta_params'])
        predictions_zero = self.stack_fit_predict(
            train_x_zero, train_y_zero, train_x_meta, train_y_meta,
            test_additional=train_x_meta)
        self.meta_learner.fit(
            self.convert_to_meta_input(predictions_zero), train_y_meta)

    self.stack_fit_predict(train_x, train_y, test_additional=test_x)
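# The meta-learner branch above is a classic two-phase stacking fit. A
# compact, self-contained sklearn sketch of the same idea (the real class
# routes this through split_data / stack_fit_predict /
# convert_to_meta_input; `stacked_fit_sketch` and its base models are
# illustrative stand-ins, not the project's configuration):
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

def stacked_fit_sketch(X, y, seed=0):
    # Phase 1: fit base models on one half, predict the held-out half.
    X_zero, X_meta, y_zero, y_meta = train_test_split(
        X, y, test_size=0.5, random_state=seed)
    base = [RandomForestClassifier(random_state=seed),
            SVC(probability=True, random_state=seed)]
    for model in base:
        model.fit(X_zero, y_zero)
    meta_input = np.column_stack(
        [m.predict_proba(X_meta)[:, 1] for m in base])
    # Phase 2: the meta-learner learns to combine base predictions.
    meta = LogisticRegression().fit(meta_input, y_meta)
    # Finally refit the base models on all available data.
    for model in base:
        model.fit(X, y)
    return base, meta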
def fit(self, train_x, train_y, train_positions):
    left, right = calculate_weights_count(
        train_x, train_positions, **self.params['split_points_params'])

    phrase_frequency_func = lambda x: np.array(
        phrase_frequency(x, **self.params['phrase_transformer']))
    frequent_words_diff_func = lambda x: np.array(
        frequent_words_diff(x, **self.params['frequent_words_diff_transformer']))
    min_max_lexical_per_segment_func = lambda x: max_diff(lexical(x))
    apostrophe_discrepancies_func = lambda x: np.array(
        apostrophe_discrepancies(x))
    quote_discrepancies_func = lambda x: np.array(quote_discrepancies(x))
    ascii_discrepancies_func = lambda x: np.array(
        ascii_discrepancies(x, **self.params['ascii_transformer']))
    rare_richness_func = lambda x: max_diff(
        WordFrequency().average_word_frequency(x))
    text_length_func = lambda x: np.array(text_length(x))
    split_points_func = lambda x: np.array(
        split_points_count(
            x, left, right,
            window_words=self.params['split_points_params']['size'] * 2))
    readability_func = lambda x: np.array(max_diff(readability(x)))

    def scaler():
        return StandardScaler(**self.params['scaler_params'])

    def classifiers():
        # Fresh estimator instances for every stack entry.
        return [
            RandomForestClassifier(**self.params['rf_params']),
            MLPClassifier(**self.params['mlp_params']),
            SVC(**self.params['svm_params']),
            AdaBoostClassifier(**self.params['ab_params']),
        ]

    # Each entry: (feature function, two pipeline flags, scaler, base models).
    self.stack = [
        (phrase_frequency_func, False, False, scaler(), classifiers()),
        (frequent_words_diff_func, False, False, scaler(), classifiers()),
        (readability_func, True, False, scaler(), classifiers()),
        (apostrophe_discrepancies_func, False, False, scaler(), classifiers()),
        (quote_discrepancies_func, False, False, scaler(), classifiers()),
        (rare_richness_func, True, False, scaler(), classifiers()),
        (min_max_lexical_per_segment_func, True, True, scaler(), classifiers()),
        (ascii_discrepancies_func, False, False, scaler(), classifiers()),
        (text_length_func, False, False, scaler(), classifiers()),
        (split_points_func, False, False, scaler(), classifiers()),
    ]

    if self.params['meta_learner']:
        self.global_model_weights = []
        train_x_zero, train_y_zero, train_x_meta, train_y_meta = \
            self.split_data(train_x, train_y)
        self.meta_learner = LogisticRegression(**self.params['meta_params'])
        predictions_zero = self.stack_fit_predict(
            train_x_zero, train_y_zero, train_x_meta, train_y_meta)
        self.meta_learner.fit(
            self.convert_to_meta_input(predictions_zero), train_y_meta)

    self.stack_fit_predict(train_x, train_y)
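# How a single stack entry is presumably consumed by stack_fit_predict:
# extract features, scale them with the entry's own StandardScaler, and
# fit each of the entry's classifiers on the scaled matrix. A hedged
# sketch of that contract (`fit_stack_entry_sketch` is a hypothetical
# name; the meaning of the two boolean flags is not shown in this code):
def fit_stack_entry_sketch(entry, train_x, train_y):
    feature_func, _, _, scaler, models = entry
    features = scaler.fit_transform(feature_func(train_x))
    for model in models:
        model.fit(features, train_y)
    return models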