        # (function body truncated in source; earlier lines opened results_file
        # and printed the preceding metrics, ending with "..., file=results_file)")
        print("Support: %d" % (support if support is not None else -1),
              file=results_file)

    # Persist the full classification report and the raw confusion matrix
    # alongside the per-metric results.
    report = classification_report(y_true, y_pred)
    with directory.joinpath("classification_report").open(mode='w') as report_file:
        print(report, file=report_file)

    cm = confusion_matrix(y_true, y_pred)
    with directory.joinpath("confusion_matrix").open(mode='w') as cm_file:
        np.savetxt(cm_file, cm, fmt="%d")


if __name__ == "__main__":
    data = read_dataset_splits(reader=data_readers.read_question_only_data,
                               splits=["tiny", "train", "dev"])
    X_train, y_train = prepare_data(data.train)
    X_dev, y_dev = prepare_data(data.dev)

    # Fit the vocabulary on the training questions only, then build the
    # pretrained-embedding matrix and the RNN on top of it.
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<UNK>",
                          split=' ', lower=True)
    tokenizer.fit_on_texts(X_train)
    embeddings = getFastTextEmbeddings(tokenizer.word_index)
    model = simpleRNN(embeddings, hidden_dim=200)

    X_train = tokenizer.texts_to_sequences(X_train)
    X_dev = tokenizer.texts_to_sequences(X_dev)
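    # --- Hedged sketch, not in the source: the script is cut off above. A
    # typical continuation pads the integer sequences to a fixed length and
    # trains the model; MAX_SEQ_LEN, epochs, and batch_size here are
    # illustrative assumptions, not values from this repository.
    from keras.preprocessing.sequence import pad_sequences

    MAX_SEQ_LEN = 100  # assumed constant, not from the source
    X_train = pad_sequences(X_train, maxlen=MAX_SEQ_LEN)
    X_dev = pad_sequences(X_dev, maxlen=MAX_SEQ_LEN)
    model.fit(X_train, y_train, validation_data=(X_dev, y_dev),
              epochs=10, batch_size=32)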
from model_utils import get_response_time_label, add_cosine_similarity, add_question_length, add_jensen_shannon
from scipy.stats import pearsonr
from collections import Counter
import random
# (remaining project imports truncated in source: Config, data_readers,
# read_dataset_splits, and read_corpus are assumed to be imported here)

random.seed(Config.SEED)


def calc_correlation(data, col_name):
    # Pearson correlation (r, p-value) between response time and a feature
    # column, computed on the training split.
    return pearsonr(data.train['response_time_sec'], data.train[col_name])


if __name__ == '__main__':
    results = {}

    data = read_dataset_splits(
        reader=data_readers.read_question_and_context_data,
        window_size=10,
        include_question_text=True,
        include_context_text=True,
        include_context_speaker=False,
        include_context_times=False)
    data = add_jensen_shannon(data)
    results['question_and_js'] = calc_correlation(data, 'jensen_shannon')

    # Correlate against features built from the N most frequent words in the
    # training corpus.
    df = read_corpus(split='train')
    all_words = [item for sublist in df.text for item in sublist]
    for N_words in [25, 50, 100]:
        top_words = [item[0] for item in Counter(all_words).most_common(N_words)]
        data = read_dataset_splits(
            reader=data_readers.read_question_and_context_data,
            window_size=10,
            # (call truncated in source)
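# --- Illustrative, self-contained example (not from this repository) of how
# calc_correlation's return value is consumed: scipy's pearsonr returns a
# (correlation, p_value) pair. The arrays below are made-up data.
from scipy.stats import pearsonr

response_times = [1.0, 4.0, 2.5, 8.0, 3.0]
question_lengths = [5, 20, 11, 40, 14]
r, p_value = pearsonr(response_times, question_lengths)
print("question_length: r=%.3f, p=%.3g" % (r, p_value))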
    # (Pipeline definition truncated in source; its opening steps are cut off.
    # The sklearn imports, SEED, plot_cm, and dummy_tokenizer are assumed to
    # be defined above the cut.)
        ('tfidf', TfidfTransformer()),
        ('clf', DummyClassifier(random_state=SEED))])

    pipe.fit(train['question'], train['question_class'])
    preds = pipe.predict(dev['question'])
    print("Dummy Classifier:")
    report = classification_report(dev['question_class'], preds)
    print(report)
    cm = confusion_matrix(dev['question_class'], preds)
    plot_cm(cm, title="Dummy Classifier")
    return models


if __name__ == '__main__':
    data = read_dataset_splits(reader=read_question_only_data)
    data = add_classes(data)
    models = run_baselines(data)

    train, dev, test = data['train'], data['dev'], data['test']
    dev_new = copy.deepcopy(dev)

    pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=dummy_tokenizer, lowercase=False)),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(class_weight='balanced', random_state=SEED))
    ])

    # Generate results for logistic regression and output to a CSV file:
    # true class, predicted class, and class probabilities.
    # g = models['Logistic Regression']
    # pipe.set_params(**g)
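    # --- Hedged sketch, not in the source: one way to produce the CSV the
    # comment above describes, assuming dev_new is a pandas DataFrame. The
    # column names and the output path "lr_dev_results.csv" are illustrative
    # assumptions.
    pipe.fit(train['question'], train['question_class'])
    dev_new['predicted_class'] = pipe.predict(dev_new['question'])
    probs = pipe.predict_proba(dev_new['question'])
    for i, cls in enumerate(pipe.classes_):
        dev_new['prob_%s' % cls] = probs[:, i]
    dev_new.to_csv("lr_dev_results.csv", index=False)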
    report = classification_report(y_true, y_pred)
    with directory.joinpath("classification_report").open(mode='w') as report_file:
        print(report, file=report_file)

    cm = confusion_matrix(y_true, y_pred)
    with directory.joinpath("confusion_matrix").open(mode='w') as cm_file:
        np.savetxt(cm_file, cm, fmt="%d")


if __name__ == "__main__":
    data = read_dataset_splits(
        reader=data_readers.read_question_and_context_data,
        splits=["tiny"],
        window_size=WINDOW_SIZE,
        include_question_text=True,
        include_context_text=True,
        include_context_speaker=True,
        include_context_times=False)
    # Only the "tiny" split is loaded, so it stands in for both the train and
    # dev sets (an end-to-end smoke test of the pipeline).
    X_train, y_train = prepare_data(data.tiny)
    X_dev, y_dev = prepare_data(data.tiny)

    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<UNK>",
                          split=' ', lower=True)
    tokenizer.fit_on_texts(X_train)
    embeddings = getFastTextEmbeddings(tokenizer.word_index)
    model = simpleRNN(embeddings, hidden_dim=200)
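# --- Illustrative sketch, not this repository's implementation: the general
# shape of a helper like getFastTextEmbeddings, which maps the tokenizer's
# word_index into a pretrained-embedding matrix. The gensim loader, the
# vectors path, and EMBEDDING_DIM are all assumptions.
import numpy as np
from gensim.models import KeyedVectors

EMBEDDING_DIM = 300  # assumed fastText vector size


def get_fasttext_embeddings_sketch(word_index, vectors_path="fasttext.vec"):
    vectors = KeyedVectors.load_word2vec_format(vectors_path)
    # Row 0 is reserved for the padding index; words without a pretrained
    # vector stay all-zero.
    matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, idx in word_index.items():
        if word in vectors:
            matrix[idx] = vectors[word]
    return matrix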