def _vocab_path():
    # The vocabulary file lives at <repo root>/annotations/vocab.txt, two
    # directory levels above this module.
    parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
    return os.path.join(parent_path, "annotations", "vocab.txt")


def get_data(sections_of_interest=None, mode='experiment', include_sentence_span_splits=False):
    """Load train/validation/test splits plus the fitted inference vectorizer.

    Args:
        sections_of_interest: optional restriction on article sections,
            forwarded to the preprocessor.
        mode: one of
            'experiment' -- 90/10 shuffle-split of the training documents into
                train/val; the official validation set serves as "test" so the
                real test set is never touched during development.
            'paper'      -- the official train/validation/test splits.
            'minimal'    -- tiny 5-document slices of each split (smoke tests).
        include_sentence_span_splits: forwarded to the preprocessor.

    Returns:
        (train_Xy, val_Xy, test_Xy, inference_vectorizer)

    Raises:
        ValueError: for an unrecognized ``mode``.
    """
    random.seed(177)  # fixed seed so the 90/10 shuffle-split is reproducible
    vocab_f = _vocab_path()
    if mode == 'experiment':
        train_docs = list(preprocessor.train_document_ids())
        random.shuffle(train_docs)
        split_index = int(len(train_docs) * .9)
        real_train_docs = train_docs[:split_index]
        real_val_docs = train_docs[split_index:]
        real_train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
            set(real_train_docs), sections_of_interest=sections_of_interest,
            vocabulary_file=vocab_f,
            include_sentence_span_splits=include_sentence_span_splits)
        real_val_Xy = preprocessor.get_Xy(
            set(real_val_docs), inference_vectorizer,
            sections_of_interest=sections_of_interest,
            include_sentence_span_splits=include_sentence_span_splits)
        # In development our "test" set is the official validation ids so we
        # don't cheat by peeking at the real test set.
        real_test_Xy = preprocessor.get_Xy(
            preprocessor.validation_document_ids(), inference_vectorizer,
            sections_of_interest=sections_of_interest,
            include_sentence_span_splits=include_sentence_span_splits)
        return real_train_Xy, real_val_Xy, real_test_Xy, inference_vectorizer
    elif mode == 'paper':
        train_docs = preprocessor.train_document_ids()
        train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
            train_docs, sections_of_interest=sections_of_interest,
            vocabulary_file=vocab_f,
            include_sentence_span_splits=include_sentence_span_splits)
        val_Xy = preprocessor.get_Xy(
            preprocessor.validation_document_ids(), inference_vectorizer,
            sections_of_interest=sections_of_interest,
            include_sentence_span_splits=include_sentence_span_splits)
        test_Xy = preprocessor.get_Xy(
            preprocessor.test_document_ids(), inference_vectorizer,
            sections_of_interest=sections_of_interest,
            include_sentence_span_splits=include_sentence_span_splits)
        return train_Xy, val_Xy, test_Xy, inference_vectorizer
    elif mode == 'minimal':
        train_docs = list(preprocessor.train_document_ids())[:5]
        train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
            train_docs, sections_of_interest=sections_of_interest,
            vocabulary_file=vocab_f,
            include_sentence_span_splits=include_sentence_span_splits)
        val_Xy = preprocessor.get_Xy(
            list(preprocessor.validation_document_ids())[:5], inference_vectorizer,
            sections_of_interest=sections_of_interest,
            include_sentence_span_splits=include_sentence_span_splits)
        test_Xy = preprocessor.get_Xy(
            list(preprocessor.validation_document_ids())[5:10], inference_vectorizer,
            sections_of_interest=sections_of_interest,
            include_sentence_span_splits=include_sentence_span_splits)
        return train_Xy, val_Xy, test_Xy, inference_vectorizer
    else:
        raise ValueError('implement me!')
def load_data(use_test, model_loc):
    """Load train/val/test sets (with sentence spans) and reformat them for easy access.

    Args:
        use_test: when True, evaluate on the official validation and test
            splits; when False, carve a 10% validation slice from the training
            data and use the official validation ids as "test" so the real
            test set stays untouched during development.
        model_loc: path of the saved scan model passed to ``load_model_scan``.

    Returns:
        Bag-of-word style representations of training, validation, and test
        sets (with labels): (x_train, y_train, x_val, y_val, x_test, y_test).
    """
    t_ids = set(preprocessor.train_document_ids())
    te_ids = set(preprocessor.test_document_ids())
    val_ids = set(preprocessor.validation_document_ids())
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        t_ids, sections_of_interest=None, vocabulary_file=None,
        include_sentence_span_splits=True)
    # Load the scan model that reformat() applies below.
    model = load_model_scan(inference_vectorizer, model_loc)
    # NOTE(review): this shuffle is unseeded, so the internal split differs
    # from run to run -- confirm that is intended.
    random.shuffle(train_Xy)
    if not use_test:
        # Internal validation set from the training data: 90% train / 10% val.
        split_index = int(len(train_Xy) * .9)
        val_Xy = train_Xy[split_index:]
        train_Xy = train_Xy[:split_index]
        # The official validation ids stand in for the test set.
        test_Xy = preprocessor.get_Xy(val_ids, inference_vectorizer,
                                      sections_of_interest=None,
                                      include_sentence_span_splits=True)
    else:
        val_Xy = preprocessor.get_Xy(val_ids, inference_vectorizer,
                                     sections_of_interest=None,
                                     include_sentence_span_splits=True)
        test_Xy = preprocessor.get_Xy(te_ids, inference_vectorizer,
                                      sections_of_interest=None,
                                      include_sentence_span_splits=True)
    x_train, y_train = reformat(train_Xy, inference_vectorizer, model)
    x_val, y_val = reformat(val_Xy, inference_vectorizer, model)
    x_test, y_test = reformat(test_Xy, inference_vectorizer, model)
    return x_train, y_train, x_val, y_val, x_test, y_test
def train():
    """Fit the punchline-extractor model and checkpoint the best weights.

    Training assumes access to the evidence_inference preprocessing code
    (https://github.com/jayded/evidence-inference/tree/master/evidence_inference),
    which is not needed in general just to load the trained model.

    Side effects: writes ``punchline_model.json`` (architecture) and
    ``punchline.weights.best.hdf5`` (best weights by validation accuracy).
    """
    from evidence_inference.preprocess.preprocessor import (
        get_Xy, train_document_ids, validation_document_ids, get_train_Xy)

    extractor_model = PunchlineExtractor()

    # Original code fetched train_document_ids() twice and bound an unused
    # te_ids; a single call each suffices.
    tr_ids = list(train_document_ids())
    val_ids = validation_document_ids()
    train_Xy, inference_vectorizer = get_train_Xy(
        tr_ids, sections_of_interest=None, vocabulary_file=None,
        include_sentence_span_splits=False, include_raw_texts=True)

    # Create vectors and targets for the extraction task.
    X_k, y_k = make_Xy(train_Xy, extractor_model.bc)
    print("train data loaded!")

    val_Xy = get_Xy(val_ids, inference_vectorizer, include_raw_texts=True)
    X_kv, y_kv = make_Xy(val_Xy, extractor_model.bc, neg_samples=1)
    print("val data loaded!")

    # Checkpoint the best model (by validation accuracy) while fitting.
    filepath = "punchline.weights.best.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                 save_best_only=True, mode='max')
    callbacks_list = [checkpoint]

    # Persist the architecture separately so the weights can be reloaded later.
    with open("punchline_model.json", "w") as outf:
        outf.write(extractor_model.model.to_json())

    print("fitting punchline extractor!")
    extractor_model.model.fit(X_k, y_k, validation_data=(X_kv, y_kv),
                              callbacks=callbacks_list, epochs=50)
def run_scan_net_regression(loc='./scan_net.pth'):
    """Train the regression scan model and save its weights to ``loc``.

    With the module-level USE_TEST False, a 10% tail of the training data is
    held out for validation and the official validation set stands in for
    test; with USE_TEST True the official validation/test splits are used.
    """
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        set(preprocessor.train_document_ids()), sections_of_interest=None,
        vocabulary_file=None, include_sentence_span_splits=True)
    if not USE_TEST:
        # Internal validation set from the training data: 90% train / 10% val;
        # the official validation ids then stand in for the test set.
        split_index = int(len(train_Xy) * .9)
        val_Xy = train_Xy[split_index:]
        train_Xy = train_Xy[:split_index]
        test_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(),
                                      inference_vectorizer,
                                      sections_of_interest=None,
                                      include_sentence_span_splits=True)
    else:
        val_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(),
                                     inference_vectorizer,
                                     sections_of_interest=None,
                                     include_sentence_span_splits=True)
        test_Xy = preprocessor.get_Xy(preprocessor.test_document_ids(),
                                      inference_vectorizer,
                                      sections_of_interest=None,
                                      include_sentence_span_splits=True)
    train_Xy, val_Xy, test_Xy = (train_reformat(train_Xy), scan_reform(val_Xy),
                                 scan_reform(test_Xy))
    # Train with 100 epochs, batch size 32, and patience 5 (early stopping).
    # (A previous comment claimed 50/1/3, which did not match the call.)
    model = train_scan(inference_vectorizer, train_Xy, val_Xy, test_Xy, 100, 32, 5)
    torch.save(model.state_dict(), loc)
def run_scan_net_redux(loc='scan_net_redux.pth'):
    """Train a ScanNet on the full training split and save its weights to ``loc``.

    With the module-level USE_TEST False, a 10% tail of the training data is
    held out for validation and the official validation set stands in for
    test; with USE_TEST True the official validation/test splits are used.
    """
    # Vocabulary file lives at <repo root>/annotations/vocab.txt.
    parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
    vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        list(preprocessor.train_document_ids()), sections_of_interest=None,
        vocabulary_file=vocab_f, include_sentence_span_splits=True)
    if not USE_TEST:
        # Internal validation set from the training data: 90% train / 10% val;
        # the official validation ids then stand in for the test set.
        split_index = int(len(train_Xy) * .9)
        val_Xy = train_Xy[split_index:]
        train_Xy = train_Xy[:split_index]
        test_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(),
                                      inference_vectorizer,
                                      sections_of_interest=None,
                                      include_sentence_span_splits=True)
    else:
        val_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(),
                                     inference_vectorizer,
                                     sections_of_interest=None,
                                     include_sentence_span_splits=True)
        test_Xy = preprocessor.get_Xy(preprocessor.test_document_ids(),
                                      inference_vectorizer,
                                      sections_of_interest=None,
                                      include_sentence_span_splits=True)
    se_scn = ScanNet(inference_vectorizer, use_attention=use_attn)
    if USE_CUDA:
        se_scn = se_scn.cuda()
    # Train with 50 epochs, batch size 32, and patience 10 (early stopping).
    # (A previous comment claimed 50/1/3, which did not match the call.)
    train_scan(se_scn, inference_vectorizer, train_Xy, val_Xy, test_Xy, 50, 32, 10)
    # Save to the specified path.
    torch.save(se_scn.state_dict(), loc)
def run_scan_net_ico(loc = "scan_net_ICO_no_attn_test.pth"): print("Modules loaded.") parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..')) vocab_f = os.path.join(parent_path, "annotations", "vocab.txt") train_Xy, inference_vectorizer = preprocessor.get_train_Xy(list(preprocessor.train_document_ids()), sections_of_interest=None, vocabulary_file=vocab_f, include_sentence_span_splits=True) print("Train Data Achieved") if not(USE_TEST): # create an internal validation set from the training data; use 90% for training and 10% for validation. split_index = int(len(train_Xy) * .9) val_Xy = train_Xy[split_index:] train_Xy = train_Xy[:split_index] test_Xy = preprocessor.get_Xy(list(preprocessor.validation_document_ids()), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) else: val_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) test_Xy = preprocessor.get_Xy(preprocessor.test_document_ids(), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) print("Test Data Achieved") if USE_CUDA: se_scn = ScanNet(inference_vectorizer, use_attention=use_attn).cuda() else: se_scn = ScanNet(inference_vectorizer, use_attention=use_attn) print("Model loaded") # train with 50 epochs, batch_size of 1, and patience of 3 (early stopping) train_scan(se_scn, inference_vectorizer, train_Xy, val_Xy, 50, 32, 10) acc, f1, prc, rc, auc = test_model(se_scn, test_Xy, inference_vectorizer) # save to specified path #args = parser.parse_args() torch.save(se_scn.state_dict(), loc)
def train_simple_inference_net(n_epochs=30):
    """Fit the simple inference net and checkpoint the best weights.

    Args:
        n_epochs: number of training epochs (default 30).

    Side effects: writes ``inference_model.json`` (architecture) and
    ``inference.weights.best.hdf5`` (best weights by validation accuracy).
    """
    inf_net = SimpleInferenceNet()

    # Original code fetched train_document_ids() twice and bound an unused
    # te_ids; a single call each suffices.
    tr_ids = list(train_document_ids())
    val_ids = validation_document_ids()
    train_Xy, inference_vectorizer = get_train_Xy(
        tr_ids, sections_of_interest=None, vocabulary_file=None,
        include_sentence_span_splits=False, include_raw_texts=True)

    X_k, y_k = make_Xy_inference(train_Xy, inf_net.bc)
    print("train data for inference task loaded!")

    val_Xy = get_Xy(val_ids, inference_vectorizer, include_raw_texts=True)
    X_kv, y_kv = make_Xy_inference(val_Xy, inf_net.bc)
    print("val data loaded!")

    # Checkpoint the best model (by validation accuracy) while fitting.
    filepath = "inference.weights.best.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                 save_best_only=True, mode='max')
    callbacks_list = [checkpoint]

    # Persist the architecture separately so the weights can be reloaded later.
    with open("inference_model.json", "w") as outf:
        outf.write(inf_net.model.to_json())

    print("fitting inference model!")
    inf_net.model.fit(X_k, y_k, validation_data=(X_kv, y_kv),
                      callbacks=callbacks_list, epochs=n_epochs)
import torch import random from evidence_inference.preprocess import preprocessor from evidence_inference.experiments.LR_pipeline import load_model_scan from evidence_inference.models.scan_regression import train_reformat, scan_reform, Bag_of_words from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score USE_CUDA = True USE_TEST = True print("Loading data.") # get training data train_Xy, inference_vectorizer = preprocessor.get_train_Xy( set(list(preprocessor.train_document_ids())), sections_of_interest=None, vocabulary_file=None, include_sentence_span_splits=True) print("Training data loaded.") if not (USE_TEST): split_index = int(len(train_Xy) * .9) val_Xy = train_Xy[split_index:] train_Xy = train_Xy[:split_index] test_Xy = preprocessor.get_Xy(set( list(preprocessor.validation_document_ids())), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits=True) else: val_Xy = preprocessor.get_Xy(set(
import random import numpy as np import torch from transformers import RobertaForSequenceClassification, RobertaTokenizer, PretrainedConfig import sys sys.path.append("../") from evidence_inference.preprocess.preprocessor import get_Xy, train_document_ids, test_document_ids, validation_document_ids, get_train_Xy device = torch.device('cuda') print("loading train docs...") tr_ids = list(train_document_ids()) train_Xy, inference_vectorizer = get_train_Xy( tr_ids[:100], sections_of_interest=None, vocabulary_file=None, include_sentence_span_splits=False, include_raw_texts=True) print("done") val_ids = list(validation_document_ids()) val_Xy = get_Xy(val_ids, inference_vectorizer, include_raw_texts=True) def instances_from_article(article_dict, neg_samples=2, max_instances=6): def filter_empty(snippets): return [s for s in snippets if len(s) > 1] evidence_snippets = filter_empty( [snippet[1].lower() for snippet in article_dict['y']]) positive_snippets = evidence_snippets
weighted by number of prompts for that document. """ tokens = {} # Map article ids to token prompts = {} # Map article ids to num prompts for d in Xy: n_tokens = len(d['article']) tokens[d['a_id']] = n_tokens if d['a_id'] in prompts: prompts[d['a_id']] += 1 else: prompts[d['a_id']] = 1 total_entropy = 0 for art in prompts.keys(): total_entropy += np.log(tokens[art]) * prompts[art] / len(Xy) return total_entropy tr_ids, val_ids, te_ids = preprocessor.train_document_ids( ), preprocessor.validation_document_ids(), preprocessor.test_document_ids() train_Xy, inference_vectorizer = preprocessor.get_train_Xy(tr_ids) val_Xy = preprocessor.get_Xy(val_ids, inference_vectorizer) test_Xy = preprocessor.get_Xy(te_ids, inference_vectorizer) print(calculate_entropy(train_Xy)) print(calculate_entropy(val_Xy)) print(calculate_entropy(test_Xy))