def fit(self, train_x, train_y, train_positions):
    # POS-tag every document, build the n-gram vocabularies, then run the
    # full feature pipeline, fitting scalers on the training data only.
    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    self.ngrams = NGrams(train_x, pos_tag_x)
    self.train_x, names = self.pipeline(train_x, pos_tag_x, fit_scalers=True)
    self.print_feature_importance(self.train_x, train_y, names)
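
# `pipeline` itself is not shown in this section. A minimal sketch of the
# fit_scalers contract used above, with hypothetical names (a
# FEATURE_EXTRACTORS registry and a self.scalers dict): each feature block is
# scaled with statistics fitted on the training data only, and column names
# are returned for the feature-importance report. Illustrative, not the
# project's actual implementation.
import numpy as np
from sklearn.preprocessing import StandardScaler

def pipeline_sketch(self, docs, pos_tag_docs, fit_scalers=False):
    matrices, names = [], []
    for name, extract in self.FEATURE_EXTRACTORS.items():  # assumed registry
        block = np.asarray(extract(docs, pos_tag_docs), dtype=float)
        if fit_scalers:
            self.scalers[name] = StandardScaler().fit(block)
        matrices.append(self.scalers[name].transform(block))
        names.extend('%s_%d' % (name, i) for i in range(block.shape[1]))
    return np.hstack(matrices), names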

def fit(self, train_x, train_y, train_positions):
    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    self.ngrams = NGrams(train_x, pos_tag_x)
    if self.params['train_from_file']:
        # Reuse features persisted by a previous feature-extraction run
        # instead of recomputing them.
        print("Loading features from file...")
        X = np.load(TRAIN_X_FILE)
        train_y = np.load(TRAIN_Y_FILE)
    else:
        X = self.pipeline(train_x, pos_tag_x)
    # X has shape (samples, segments, features); size the LSTM accordingly.
    self.model = self.get_model((X.shape[1], X.shape[2]))
    print('Fitting LSTM model...')
    self.model.fit(X, np.array(train_y), **self.params['lstm_params'])
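
# `get_model` is not shown here. A minimal sketch of a Keras model compatible
# with the (segments, features) input shape passed above, assuming binary
# labels; the layer size is an illustrative guess, not the project's actual
# architecture:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

def get_model_sketch(input_shape):
    model = Sequential([
        LSTM(64, input_shape=input_shape),  # summarize the segment sequence
        Dense(1, activation='sigmoid'),     # binary style-change decision
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model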

def predict(self, test_x):
    pos_tag_x = [NGrams.to_pos_tags(x) for x in test_x]
    # Reuse the scalers fitted during training; never refit on test data.
    test_x, _ = self.pipeline(test_x, pos_tag_x, fit_scalers=False)
    predictions = self.model.predict(test_x, **self.params['mlp']['predict'])
    return predictions.argmax(axis=-1)
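
# Example round trip for the MLP variant above; `Classifier` and `params`
# are placeholders for the project's actual class and configuration:
# clf = Classifier(params)
# clf.fit(train_x, train_y, train_positions)
# labels = clf.predict(test_x)   # one argmax class index per document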

def main(cutoff=None, persist=False):
    train_x, train_y, train_positions, train_file_names = get_data(
        main_dir=TRAINING_DIR)
    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]
    df = pd.DataFrame(data={'label': train_y, 'pos': train_positions})
    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)
    X = [preprocessor.process_text(x) for x in train_x]
    X_word_chunks = word_chunks(X, n=300, process=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True)
    # One DataFrame column per feature family, computed over 300-word chunks.
    fmap = {
        'lexical_features': lexical(X_word_chunks),
        'stop_word_features': ngrams.get_stop_words(X_word_chunks),
        'function_word_features': ngrams.get_function_words(X_word_chunks),
        'pos_tag_features': ngrams.get_pos_tags(X_pos_chunks),
        'process_tag_features': processed_tags(X_word_chunks),
        'word_frequency': wf.average_word_frequency(X_word_chunks),
        'readability_features': readability(X_word_chunks),
    }
    for key, feature in fmap.items():
        df[key] = feature
    if persist:
        df.to_csv(TRAIN_CSV_FILE)
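
# `word_chunks` comes from the project's chunking module. A minimal sketch of
# the contiguous behaviour its call sites imply: split each document into
# consecutive n-word segments. The meanings of the `process` and `sliding`
# flags (and the 50% overlap) are assumptions for illustration only.
def word_chunks_sketch(docs, n=300, process=True, sliding=False):
    chunked = []
    for doc in docs:
        tokens = doc.split() if isinstance(doc, str) else list(doc)
        step = max(n // 2, 1) if sliding else n  # assumed overlap when sliding
        chunked.append([tokens[i:i + n] for i in range(0, len(tokens), step)])
    return chunked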

def main(cutoff=None, persist=True):
    train_x, train_y, train_positions, train_file_names = get_data(
        external_file=TRAINING_EXTERNAL_FILE
    )
    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]
    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)
    X = [preprocessor.process_text(x) for x in train_x]
    # Sliding windows: overlapping 300-word (and, unused below, 2000-char)
    # segments per document.
    X_word_chunks = word_chunks(X, n=300, process=True, sliding=True)
    X_char_chunks = char_chunks(X, n=2000, sliding=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True, sliding=True)
    # Pad/truncate every document to a fixed number of segments so the
    # per-document feature matrices stack into one 3-D array.
    max_segments = 20
    lexical_features = sequence.pad_sequences(
        lexical(X_word_chunks), maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(
        ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(
        ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(
        ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(
        wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(
        readability(X_word_chunks), maxlen=max_segments)
    # Resulting shape: (samples, max_segments, total_features).
    X = np.concatenate([lexical_features, stop_word_features,
                        function_word_features, pos_tag_features,
                        word_frequency, readability_features], axis=2)
    print(X.shape)
    if persist:
        np.save(TRAIN_X_FILE, X)
        np.save(TRAIN_Y_FILE, train_y)
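
# Sliding windows yield a different number of segments per document, which is
# why every feature array is padded/truncated to max_segments rows before
# np.concatenate. A small, self-contained demonstration of pad_sequences on
# per-segment feature vectors:
import numpy as np
from tensorflow.keras.preprocessing import sequence

doc_a = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]  # 3 segments, 2 features each
doc_b = [[7.0, 8.0]]                          # 1 segment, 2 features
padded = sequence.pad_sequences([doc_a, doc_b], maxlen=2, dtype='float32')
print(padded.shape)  # (2, 2, 2): doc_a truncated, doc_b zero-padded in front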

def main(cutoff=10000, persist=True):
    train_x, train_y, train_positions, train_file_names = get_data(
        external_file=TRAINING_EXTERNAL_FILE
    )
    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]
    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)
    X = [preprocessor.process_text(x) for x in train_x]
    X_word_chunks = word_chunks(X, n=300, process=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True)
    # Non-sliding variant: at most 10 consecutive 300-word segments per document.
    max_segments = 10
    lexical_features = sequence.pad_sequences(
        lexical(X_word_chunks), maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(
        ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(
        ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(
        ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    process_tag_features = sequence.pad_sequences(
        processed_tags(X_word_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(
        wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(
        readability(X_word_chunks), maxlen=max_segments)
    X = np.concatenate([lexical_features, stop_word_features,
                        function_word_features, pos_tag_features,
                        process_tag_features, word_frequency,
                        readability_features], axis=2)
    if persist:
        np.save(TRAIN_X_FILE, X)
        np.save(TRAIN_Y_FILE, train_y)
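
# The arrays persisted above are what the LSTM fit() consumes when
# params['train_from_file'] is set. A quick sanity check after a run,
# reusing the same module constants:
import numpy as np

X = np.load(TRAIN_X_FILE)
y = np.load(TRAIN_Y_FILE)
assert X.ndim == 3 and X.shape[0] == y.shape[0]  # (samples, segments, features)
print(X.shape, y.shape)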

def predict(self, test_x):
    pos_tag_x = [NGrams.to_pos_tags(x) for x in test_x]
    test_x = self.pipeline(test_x, pos_tag_x)
    # predict_classes returns an (n, 1) array of class labels; flatten to 1-D.
    predictions = self.model.predict_classes(test_x)
    return predictions.flatten()
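
# Note: Sequential.predict_classes was removed in TensorFlow 2.6. A drop-in
# equivalent on newer versions, assuming a single sigmoid output unit as in
# the model sketched earlier:
def predict_compat(model, test_x):
    probs = model.predict(test_x)
    return (probs > 0.5).astype('int32').flatten()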

def predict(self, test_x):
    pos_tag_x = [NGrams.to_pos_tags(x) for x in test_x]
    # Apply the scalers fitted during training without refitting them.
    test_x, _ = self.pipeline(test_x, pos_tag_x, fit_scalers=False)
    return self.model.predict(test_x).tolist()
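
# A hedged example of scoring a predict() variant against held-out labels
# with scikit-learn; `clf`, `test_x`, and `test_y` are assumed to exist, and
# predict() is assumed to return class labels (as in the argmax and
# predict_classes variants above, not the raw-probability one):
from sklearn.metrics import accuracy_score, f1_score

predictions = clf.predict(test_x)
print('accuracy:', accuracy_score(test_y, predictions))
print('f1:', f1_score(test_y, predictions))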