def pipeline(self, X, pos_tag_x):
    # Preprocess raw texts, then segment them into overlapping word/char/POS chunks.
    X = [preprocessor.process_text(x) for x in X]
    X_word_chunks = word_chunks(X, n=300, process=True, sliding=True)
    X_char_chunks = char_chunks(X, n=2000, sliding=True)  # computed but not used below
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True, sliding=True)

    # Pad every per-chunk feature sequence to a fixed number of segments.
    max_segments = 20
    lexical_features = sequence.pad_sequences(lexical(X_word_chunks),
                                              maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(
        self.ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(
        self.ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(
        self.ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(
        wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(
        readability(X_word_chunks), maxlen=max_segments)

    # Stack all feature groups along the feature axis:
    # result shape is (documents, segments, features).
    return np.concatenate([
        lexical_features, stop_word_features, function_word_features,
        pos_tag_features, word_frequency, readability_features
    ], axis=2)
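# `word_chunks` / `char_chunks` are project helpers not shown here. Below is
# a minimal sketch of the assumed sliding word-window behaviour: a window of
# `n` tokens with a 50%-overlap step. The overlap factor and whitespace
# tokenisation are assumptions; the real helper also accepts `chunks=` and
# `process=` options not modelled here.
def _word_chunks_sketch(texts, n=300, sliding=True):
    chunked = []
    for text in texts:
        tokens = text.split()
        step = n // 2 if sliding else n  # assumed 50% overlap when sliding
        starts = range(0, max(len(tokens) - n + 1, 1), step)
        chunked.append([' '.join(tokens[i:i + n]) for i in starts])
    return chunked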
def predict(self, test_x):
    predictions = []
    test_x_preprocessed = [preprocessor.process_text(x) for x in test_x]
    test_word_chunks = word_chunks(test_x, **self.params['word_chunk_params'])
    test_word_chunks_preprocessed = word_chunks(test_x_preprocessed,
                                                chunks=3,
                                                process=True,
                                                sliding=True)
    print("Computed word chunks")
    cnt = 0
    if self.params['use_raw_text_models']:
        for model in self.raw_text_models:
            print("Predicting via LightGBM ...")
            test_probs = model.predict_proba(test_x).tolist()
            predictions.append(test_probs)
            if self.params['output_probabilities_test']:
                res = pd.DataFrame(np.array(test_probs))
                res.to_csv('model_{0}_test.csv'.format(cnt))
            cnt += 1
    else:
        print("Skipping raw text models")
    for (transformer, apply_on_word_chunks, preprocess, scaler,
         predictors), model_weights in zip(self.stack,
                                           self.global_model_weights):
        # Pick the representation this stack member was trained on.
        if preprocess:
            raw_test = test_word_chunks_preprocessed if apply_on_word_chunks else test_x_preprocessed
        else:
            raw_test = test_word_chunks if apply_on_word_chunks else test_x
        data_transformed = scaler.transform(transformer(raw_test))
        local_predictions = [
            predictor.predict_proba(data_transformed).tolist()
            for predictor in predictors
        ]
        weighted = self.weight_local_predictions(local_predictions,
                                                 model_weights)
        predictions.append(weighted)
        if self.params['output_probabilities_test']:
            # Persist this stack member's weighted probabilities. (The
            # original referenced `test_probs` here, which is only defined
            # in the raw-text-model branch.)
            res = pd.DataFrame(np.array(weighted))
            res.to_csv('model_{0}_test.csv'.format(cnt))
        cnt += 1
    if self.params['meta_learner']:
        prediction_converted = self.convert_to_meta_input(predictions)
        return self.meta_learner.predict(prediction_converted)
    # Soft vote: sum each class's probability across all models and predict
    # class 1 when its summed probability is higher.
    predictions_prob = [[sum(y) for y in zip(*x)] for x in zip(*predictions)]
    return [x[0] < x[1] for x in predictions_prob]
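# A small worked example of the soft-vote fallback above, with hypothetical
# numbers: two models, two samples, each inner pair is [P(class 0), P(class 1)].
# `zip(*predictions)` groups the models' outputs per sample before summing.
def _soft_vote_example():
    predictions = [[[0.75, 0.25], [0.25, 0.75]],   # model A
                   [[0.5, 0.5], [0.25, 0.75]]]     # model B
    predictions_prob = [[sum(y) for y in zip(*x)] for x in zip(*predictions)]
    assert predictions_prob == [[1.25, 0.75], [0.5, 1.5]]  # per-class sums
    return [x[0] < x[1] for x in predictions_prob]          # -> [False, True]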
def main(cutoff=None, persist=False):
    train_x, train_y, train_positions, train_file_names = get_data(
        main_dir=TRAINING_DIR)
    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]
    df = pd.DataFrame(data={'label': train_y, 'pos': train_positions})

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)
    X = [preprocessor.process_text(x) for x in train_x]
    X_word_chunks = word_chunks(X, n=300, process=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True)

    fmap = {
        'lexical_features': lexical(X_word_chunks),
        'stop_word_features': ngrams.get_stop_words(X_word_chunks),
        'function_word_features': ngrams.get_function_words(X_word_chunks),
        'pos_tag_features': ngrams.get_pos_tags(X_pos_chunks),
        'process_tag_features': processed_tags(X_word_chunks),
        'word_frequency': wf.average_word_frequency(X_word_chunks),
        'readability_features': readability(X_word_chunks)
    }
    for key, feature in fmap.items():
        df[key] = feature
    if persist:
        df.to_csv(TRAIN_CSV_FILE)
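# Note: `DataFrame.to_csv` writes list-valued cells as their string repr, so
# anything reading TRAIN_CSV_FILE back has to parse them. A minimal sketch,
# assuming the feature cells were plain Python lists of numbers (the helper
# name is hypothetical):
def _load_feature_csv(path):
    import ast
    df = pd.read_csv(path, index_col=0)
    for col in df.columns:
        if col not in ('label', 'pos'):
            df[col] = df[col].apply(ast.literal_eval)  # "[...]" -> list
    return df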
def main(cutoff=None, persist=True):
    train_x, train_y, train_positions, train_file_names = get_data(
        #main_dir=TRAINING_DIR,
        external_file=TRAINING_EXTERNAL_FILE)
    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)
    X = [preprocessor.process_text(x) for x in train_x]
    X_word_chunks = word_chunks(X, n=300, process=True, sliding=True)
    #print('Word', max([len(s) for s in X_word_chunks]))
    X_char_chunks = char_chunks(X, n=2000, sliding=True)  # currently unused below
    #print('Char', max([len(s) for s in X_char_chunks]))
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True, sliding=True)

    # Pad each per-chunk feature sequence to a fixed number of segments.
    max_segments = 20
    lexical_features = sequence.pad_sequences(lexical(X_word_chunks),
                                              maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(
        ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(
        ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(
        ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(
        wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(
        readability(X_word_chunks), maxlen=max_segments)

    # Unpadded variants kept for reference:
    # process_tag_features = processed_tags(X_word_chunks)
    # tfidf = ngrams.get_word_tfidf(X_word_chunks)

    # Note: the original concatenation also listed `paragraph_features`,
    # which is never defined in this function; it is dropped here.
    X = np.concatenate([
        lexical_features, stop_word_features, function_word_features,
        pos_tag_features, word_frequency, readability_features
    ], axis=2)
    print(X.shape)
    if persist:
        np.save(TRAIN_X_FILE, X)
        np.save(TRAIN_Y_FILE, train_y)
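# Keras's `sequence.pad_sequences` defaults to dtype='int32', which silently
# truncates float-valued features; if the extractors above return floats,
# dtype='float32' should probably be passed. A small demonstration, assuming
# `sequence` is keras.preprocessing.sequence as the calls above suggest:
def _pad_sequences_dtype_demo():
    padded_int = sequence.pad_sequences([[[0.5, 1.5]]], maxlen=2)
    # -> [[[0, 0], [0, 1]]]: values truncated to int32
    padded_float = sequence.pad_sequences([[[0.5, 1.5]]], maxlen=2,
                                          dtype='float32')
    # -> [[[0.0, 0.0], [0.5, 1.5]]]: values preserved
    return padded_int, padded_float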
def predict(self, test_x):
    test_x_preprocessed = [preprocessor.process_text(x) for x in test_x]
    predictions = []
    test_word_chunks = word_chunks(test_x, **self.params['word_chunk_params'])
    test_word_chunks_preprocessed = word_chunks(test_x_preprocessed,
                                                chunks=3,
                                                process=True,
                                                sliding=True)
    print("Computed word chunks")
    for (transformer, apply_on_word_chunks, preprocess, scaler,
         predictors), model_weights in zip(self.stack,
                                           self.global_model_weights):
        if preprocess:
            raw_test = test_word_chunks_preprocessed if apply_on_word_chunks else test_x_preprocessed
        else:
            raw_test = test_word_chunks if apply_on_word_chunks else test_x
        data_transformed = scaler.transform(transformer(raw_test))
        local_predictions = [
            predictor.predict_proba(data_transformed).tolist()
            for predictor in predictors
        ]
        predictions.append(
            self.weight_local_predictions(local_predictions, model_weights))
    if self.params['meta_learner']:
        prediction_converted = self.convert_to_meta_input(predictions)
        return self.meta_learner.predict(prediction_converted)
    predictions_prob = [[sum(y) for y in zip(*x)] for x in zip(*predictions)]
    return [x[0] < x[1] for x in predictions_prob]
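# `convert_to_meta_input` is defined elsewhere in the repo. A plausible
# sketch, assuming the meta-learner consumes one flat vector of all models'
# class probabilities per test sample; this layout is an assumption, not the
# project's confirmed implementation:
def _convert_to_meta_input_sketch(predictions):
    # predictions: [model][sample][class] -> [sample][model * class]
    return [[p for model_probs in sample for p in model_probs]
            for sample in zip(*predictions)]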
def pipeline(self, X, pos_tag_x, fit_scalers):
    # fit_scalers is accepted for interface compatibility but currently unused.
    feature_names = []
    X = [preprocessor.process_text(x) for x in X]
    X_word_chunks = word_chunks(X, chunks=4, process=True, sliding=True)
    X_pos_chunks = word_chunks(pos_tag_x, chunks=4, process=True, sliding=True)

    # Collapse each per-chunk feature sequence to one vector via max_diff.
    lexical_features = max_diff(lexical(X_word_chunks, feature_names))
    stop_word_features = max_diff(
        self.ngrams.get_stop_words(X_word_chunks, feature_names))
    function_word_features = max_diff(
        self.ngrams.get_function_words(X_word_chunks, feature_names))
    pos_tag_features = max_diff(
        self.ngrams.get_pos_tags(X_pos_chunks, feature_names))
    process_tag_features = max_diff(
        processed_tags(X_word_chunks, feature_names))
    word_frequency = max_diff(
        wf.average_word_frequency(X_word_chunks, feature_names))
    readability_features = max_diff(
        readability(X_word_chunks, feature_names=feature_names))
    #tfidf = max_diff(self.ngrams.get_word_tfidf(X_word_chunks, feature_names))
    num_par = num_paragraphs(X, feature_names)

    X = np.concatenate(
        (lexical_features, stop_word_features, function_word_features,
         pos_tag_features, process_tag_features, word_frequency,
         readability_features, num_par),
        axis=1)
    X = minmax_scale(X)
    return X, feature_names
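# `max_diff` is defined elsewhere in the repo. A minimal sketch of the
# assumed behaviour: for each document, take the maximum pairwise spread of
# every feature across its chunks, so a large value signals a style change
# somewhere in the document. The exact reduction is an assumption.
def _max_diff_sketch(per_chunk_features):
    out = []
    for chunks in per_chunk_features:     # chunks: (n_chunks, n_features)
        arr = np.asarray(chunks, dtype=float)
        out.append(arr.max(axis=0) - arr.min(axis=0))
    return np.array(out)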
def main(cutoff=10000, persist=True):
    train_x, train_y, train_positions, train_file_names = get_data(
        external_file=TRAINING_EXTERNAL_FILE)
    if cutoff:
        train_x = train_x[:cutoff]
        train_y = train_y[:cutoff]
        train_positions = train_positions[:cutoff]

    pos_tag_x = [NGrams.to_pos_tags(x) for x in train_x]
    ngrams = NGrams(train_x, pos_tag_x)
    X = [preprocessor.process_text(x) for x in train_x]
    X_word_chunks = word_chunks(X, n=300, process=True)
    X_pos_chunks = word_chunks(pos_tag_x, n=300, process=True)

    max_segments = 10
    lexical_features = sequence.pad_sequences(lexical(X_word_chunks),
                                              maxlen=max_segments)
    stop_word_features = sequence.pad_sequences(
        ngrams.get_stop_words(X_word_chunks), maxlen=max_segments)
    function_word_features = sequence.pad_sequences(
        ngrams.get_function_words(X_word_chunks), maxlen=max_segments)
    pos_tag_features = sequence.pad_sequences(
        ngrams.get_pos_tags(X_pos_chunks), maxlen=max_segments)
    process_tag_features = sequence.pad_sequences(
        processed_tags(X_word_chunks), maxlen=max_segments)
    word_frequency = sequence.pad_sequences(
        wf.average_word_frequency(X_word_chunks), maxlen=max_segments)
    readability_features = sequence.pad_sequences(
        readability(X_word_chunks), maxlen=max_segments)
    print(type(lexical_features))

    X = np.concatenate([
        lexical_features, stop_word_features, function_word_features,
        pos_tag_features, process_tag_features, word_frequency,
        readability_features
    ], axis=2)
    if persist:
        np.save(TRAIN_X_FILE, X)
        np.save(TRAIN_Y_FILE, train_y)
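# The persisted tensors can be reloaded for model training; a minimal
# sketch, assuming TRAIN_X_FILE / TRAIN_Y_FILE include the '.npy' extension
# (np.save appends it otherwise). The helper name is hypothetical:
def _load_training_tensors():
    X = np.load(TRAIN_X_FILE)  # (documents, max_segments, features)
    y = np.load(TRAIN_Y_FILE)
    return X, y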
def stack_fit_predict(self,
                      train_x,
                      train_y,
                      test_x=None,
                      test_y=None,
                      test_additional=None):
    predictions = []
    train_x_preprocessed = [preprocessor.process_text(x) for x in train_x]
    train_word_chunks = word_chunks(train_x,
                                    **self.params['word_chunk_params'])
    train_word_chunks_preprocessed = word_chunks(train_x_preprocessed,
                                                 chunks=3,
                                                 process=True,
                                                 sliding=True)
    if test_x:
        test_size = len(test_x)
    cnt = 0
    if self.params['use_raw_text_models']:
        for model in self.raw_text_models:
            if self.params['fit_with_train'] and test_additional:
                print("Fitting LightGBM ...")
                model.fit_with_test(train_x, train_y, [], test_additional)
            else:
                print("Fitting LightGBM without train")
                model.fit(train_x, train_y, [])
            if test_x:
                predictions.append(model.predict_proba(test_x).tolist())
            elif self.params['output_probabilities_train']:
                # Train-only mode: persist train-set probabilities instead.
                res = pd.DataFrame(model.predict_proba(train_x))
                res.to_csv("model_{0}_train.csv".format(cnt))
            cnt += 1
    else:
        print("Skipping raw text models")
        # (The original recomputed train_word_chunks here; it is identical
        # to the computation above and has been removed.)

    test_word_chunks = None
    if test_x:
        test_x_preprocessed = [preprocessor.process_text(x) for x in test_x]
        test_word_chunks = word_chunks(test_x,
                                       **self.params['word_chunk_params'])
        test_word_chunks_preprocessed = word_chunks(test_x_preprocessed,
                                                    chunks=3,
                                                    process=True,
                                                    sliding=True)
    print("Computed word chunks")

    for transformer, apply_on_word_chunks, preprocess, scaler, predictors in self.stack:
        if preprocess:
            raw_data = train_word_chunks_preprocessed if apply_on_word_chunks else train_x_preprocessed
        else:
            raw_data = train_word_chunks if apply_on_word_chunks else train_x
        data_transformed_zero = scaler.fit_transform(transformer(raw_data))
        if test_x:
            if preprocess:
                raw_test = test_word_chunks_preprocessed if apply_on_word_chunks else test_x_preprocessed
            else:
                raw_test = test_word_chunks if apply_on_word_chunks else test_x
            data_transformed_meta = scaler.transform(transformer(raw_test))
        local_predictions = []
        for predictor in predictors:
            predictor.fit(data_transformed_zero, train_y)
            if test_x:
                local_predictions.append(
                    predictor.predict_proba(data_transformed_meta).tolist())
            if self.params['output_probabilities_train']:
                res = pd.DataFrame(
                    predictor.predict_proba(data_transformed_zero))
                res.to_csv("model_{0}_train.csv".format(cnt))
            cnt += 1
        if test_x:
            # Weight each predictor by its accuracy on the held-out set.
            scores = []
            for model_predictions in local_predictions:
                acc = [(prob[0] < prob[1]) == truth
                       for prob, truth in zip(model_predictions, test_y)
                       ].count(True) / test_size
                scores.append(acc)
            sum_scores = sum(scores)
            model_weights = [x / sum_scores for x in scores]
            print('Model weights: ', model_weights)
            self.global_model_weights.append(model_weights)
            predictions.append(
                self.weight_local_predictions(local_predictions,
                                              model_weights))
    if test_x:
        return predictions
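# `weight_local_predictions` is defined elsewhere in the repo. A minimal
# sketch, under the assumption that it combines the predictors' probability
# pairs as a weighted sum using the accuracy-derived weights computed above:
def _weight_local_predictions_sketch(local_predictions, model_weights):
    # local_predictions: [model][sample][class]; weights are assumed to sum to 1.
    n_samples = len(local_predictions[0])
    return [[sum(w * probs[i][cls]
                 for w, probs in zip(model_weights, local_predictions))
             for cls in range(2)]
            for i in range(n_samples)]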
def stack_fit_predict(self, train_x, train_y, test_x=None, test_y=None):
    train_x_preprocessed = [preprocessor.process_text(x) for x in train_x]
    train_word_chunks = word_chunks(train_x,
                                    **self.params['word_chunk_params'])
    train_word_chunks_preprocessed = word_chunks(train_x_preprocessed,
                                                 chunks=3,
                                                 process=True,
                                                 sliding=True)
    test_word_chunks = None
    if test_x:
        test_x_preprocessed = [preprocessor.process_text(x) for x in test_x]
        test_word_chunks = word_chunks(test_x,
                                       **self.params['word_chunk_params'])
        test_word_chunks_preprocessed = word_chunks(test_x_preprocessed,
                                                    chunks=3,
                                                    process=True,
                                                    sliding=True)
    print("Computed word chunks")

    predictions = []
    if test_x:
        test_size = len(test_x)
    for transformer, apply_on_word_chunks, preprocess, scaler, predictors in self.stack:
        if preprocess:
            raw_data = train_word_chunks_preprocessed if apply_on_word_chunks else train_x_preprocessed
        else:
            raw_data = train_word_chunks if apply_on_word_chunks else train_x
        data_transformed_zero = scaler.fit_transform(transformer(raw_data))
        if test_x:
            if preprocess:
                raw_test = test_word_chunks_preprocessed if apply_on_word_chunks else test_x_preprocessed
            else:
                raw_test = test_word_chunks if apply_on_word_chunks else test_x
            data_transformed_meta = scaler.transform(transformer(raw_test))
        local_predictions = []
        for predictor in predictors:
            predictor.fit(data_transformed_zero, train_y)
            if test_x:
                local_predictions.append(
                    predictor.predict_proba(data_transformed_meta).tolist())
        if test_x:
            scores = []
            for model_predictions in local_predictions:
                acc = [(prob[0] < prob[1]) == truth
                       for prob, truth in zip(model_predictions, test_y)
                       ].count(True) / test_size
                scores.append(acc)
            sum_scores = sum(scores)
            model_weights = [x / sum_scores for x in scores]
            print('Model weights: ', model_weights)
            self.global_model_weights.append(model_weights)
            predictions.append(
                self.weight_local_predictions(local_predictions,
                                              model_weights))
    if test_x:
        return predictions
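# A hypothetical usage sketch: each stack entry is a 5-tuple
# (transformer, apply_on_word_chunks, preprocess, scaler, predictors).
# The transformer, scaler, and classifier choices below are illustrative,
# not the project's actual configuration, and `ensemble` /
# `my_feature_transformer` are placeholder names:
#
#   from sklearn.preprocessing import StandardScaler
#   from sklearn.linear_model import LogisticRegression
#   from sklearn.ensemble import RandomForestClassifier
#
#   ensemble.stack = [
#       (my_feature_transformer,   # callable: raw texts -> feature matrix
#        True,                     # apply_on_word_chunks
#        False,                    # preprocess
#        StandardScaler(),
#        [LogisticRegression(max_iter=1000), RandomForestClassifier()]),
#   ]
#   predictions = ensemble.stack_fit_predict(train_x, train_y,
#                                            test_x=val_x, test_y=val_y)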