def get_data(sections_of_interest=None, mode='experiment', include_sentence_span_splits=False):
    """Build train/validation/test instances and the fitted vectorizer.

    The global ``random`` state is re-seeded with 177 on every call, which
    makes the shuffle used by ``mode='experiment'`` reproducible (and also
    affects any other user of the global RNG).

    Args:
        sections_of_interest: forwarded unchanged to
            ``preprocessor.get_train_Xy`` and ``preprocessor.get_Xy``.
        mode: which document ids feed each split.
            * ``'experiment'`` - the training ids are shuffled and split
              90%/10% into train/validation; the official validation ids
              serve as the "test" set so the real test set stays untouched.
            * ``'paper'`` - the official train/validation/test ids.
            * ``'minimal'`` - the first 5 training ids, the first 5
              validation ids (validation) and validation ids 5..9 (test).
        include_sentence_span_splits: forwarded unchanged to the preprocessor.

    Returns:
        Tuple ``(train_Xy, val_Xy, test_Xy, inference_vectorizer)``.

    Raises:
        ValueError: if ``mode`` is not one of the three modes above.
    """
    random.seed(177)

    known_modes = ('experiment', 'paper', 'minimal')
    if mode not in known_modes:
        raise ValueError(
            'unknown mode {!r}; expected one of {}'.format(mode, ', '.join(known_modes)))

    # The vocabulary file sits two directories above this module.
    parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
    vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")

    if mode == 'experiment':
        train_docs = list(preprocessor.train_document_ids())
        random.shuffle(train_docs)
        split_index = int(len(train_docs) * .9)
        train_ids = set(train_docs[:split_index])
        val_ids = set(train_docs[split_index:])
        # in development, our "test" set is our validation ids so we don't cheat.
        test_ids = preprocessor.validation_document_ids()
    elif mode == 'paper':
        train_ids = preprocessor.train_document_ids()
        val_ids = preprocessor.validation_document_ids()
        test_ids = preprocessor.test_document_ids()
    else:  # 'minimal'
        train_ids = list(preprocessor.train_document_ids())[:5]
        validation_ids = list(preprocessor.validation_document_ids())
        val_ids = validation_ids[:5]
        test_ids = validation_ids[5:10]

    # The vectorizer is fit on the training ids only and reused for the
    # validation and test splits.
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        train_ids,
        sections_of_interest=sections_of_interest,
        vocabulary_file=vocab_f,
        include_sentence_span_splits=include_sentence_span_splits)
    val_Xy = preprocessor.get_Xy(
        val_ids,
        inference_vectorizer,
        sections_of_interest=sections_of_interest,
        include_sentence_span_splits=include_sentence_span_splits)
    test_Xy = preprocessor.get_Xy(
        test_ids,
        inference_vectorizer,
        sections_of_interest=sections_of_interest,
        include_sentence_span_splits=include_sentence_span_splits)
    return train_Xy, val_Xy, test_Xy, inference_vectorizer
def load_data(use_test, model_loc):
    """Load train/validation/test data and pass each split through ``reformat``.

    Args:
        use_test: when falsy, the shuffled training instances are split
            90%/10% into train/validation and the official validation ids act
            as the test set; when truthy, the official validation and test
            ids are used as-is.
        model_loc: forwarded to ``load_model_scan`` together with the fitted
            vectorizer.

    Returns:
        ``x_train, y_train, x_val, y_val, x_test, y_test`` as produced by
        ``reformat`` for each split.
    """
    train_ids = set(preprocessor.train_document_ids())
    val_ids = set(preprocessor.validation_document_ids())

    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        train_ids,
        sections_of_interest=None,
        vocabulary_file=None,
        include_sentence_span_splits=True)

    # load model
    model = load_model_scan(inference_vectorizer, model_loc)

    # The training instances are shuffled in place in both configurations;
    # the global RNG is not seeded here.
    random.shuffle(train_Xy)
    if not use_test:
        # internal validation set: last 10% of the shuffled training data.
        split_index = int(len(train_Xy) * .9)
        val_Xy = train_Xy[split_index:]
        train_Xy = train_Xy[:split_index]
        test_Xy = preprocessor.get_Xy(
            val_ids,
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)
    else:
        # The test ids are only needed (and only loaded) in this branch.
        test_ids = set(preprocessor.test_document_ids())
        val_Xy = preprocessor.get_Xy(
            val_ids,
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)
        test_Xy = preprocessor.get_Xy(
            test_ids,
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True)

    x_train, y_train = reformat(train_Xy, inference_vectorizer, model)
    x_val, y_val = reformat(val_Xy, inference_vectorizer, model)
    x_test, y_test = reformat(test_Xy, inference_vectorizer, model)
    return x_train, y_train, x_val, y_val, x_test, y_test
def run_scan_net_regression(loc = './scan_net.pth'):
    """Train the scan model via ``train_scan`` and save its ``state_dict`` to ``loc``.

    Reads the module-level ``USE_TEST`` flag: when it is falsy the last 10% of
    the training instances (in the order returned by the preprocessor, no
    shuffle) become the validation set and the official validation ids become
    the test set; otherwise the official validation/test ids are used.
    """
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        set(list(preprocessor.train_document_ids())),
        sections_of_interest=None,
        vocabulary_file=None,
        include_sentence_span_splits=True,
    )

    if USE_TEST:
        val_Xy = preprocessor.get_Xy(
            preprocessor.validation_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True,
        )
        test_Xy = preprocessor.get_Xy(
            preprocessor.test_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True,
        )
    else:
        # carve an internal validation set out of the training data (90/10).
        split_index = int(len(train_Xy) * .9)
        train_Xy, val_Xy = train_Xy[:split_index], train_Xy[split_index:]
        test_Xy = preprocessor.get_Xy(
            preprocessor.validation_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True,
        )

    # Training data goes through train_reformat; evaluation data through scan_reform.
    train_Xy = train_reformat(train_Xy)
    val_Xy = scan_reform(val_Xy)
    test_Xy = scan_reform(test_Xy)

    # presumably (epochs, batch_size, patience) = (100, 32, 5) — confirm
    # against train_scan's signature.
    model = train_scan(inference_vectorizer, train_Xy, val_Xy, test_Xy, 100, 32, 5)
    torch.save(model.state_dict(), loc)
def run_scan_net_redux(loc='scan_net_redux.pth'):
    """Fit a ``ScanNet`` with ``train_scan`` and save its ``state_dict`` to ``loc``.

    Reads the module-level names ``USE_TEST``, ``USE_CUDA`` and ``use_attn``.
    With ``USE_TEST`` falsy, the last 10% of the training instances (no
    shuffle) form the validation set and the official validation ids form the
    test set; otherwise the official validation/test ids are used.
    """
    # The vocabulary file sits two directories above this module.
    here = dirname(abspath(__file__))
    parent_path = abspath(os.path.join(here, '..', '..'))
    vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")

    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        list(preprocessor.train_document_ids()),
        sections_of_interest=None,
        vocabulary_file=vocab_f,
        include_sentence_span_splits=True,
    )

    if USE_TEST:
        val_Xy = preprocessor.get_Xy(
            preprocessor.validation_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True,
        )
        test_Xy = preprocessor.get_Xy(
            preprocessor.test_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True,
        )
    else:
        # carve an internal validation set out of the training data (90/10).
        split_index = int(len(train_Xy) * .9)
        train_Xy, val_Xy = train_Xy[:split_index], train_Xy[split_index:]
        test_Xy = preprocessor.get_Xy(
            preprocessor.validation_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True,
        )

    se_scn = ScanNet(inference_vectorizer, use_attention=use_attn)
    if USE_CUDA:
        se_scn = se_scn.cuda()

    # presumably (epochs, batch_size, patience) = (50, 32, 10) — confirm
    # against train_scan's signature.
    train_scan(se_scn, inference_vectorizer, train_Xy, val_Xy, test_Xy, 50, 32, 10)

    # save to specified path
    torch.save(se_scn.state_dict(), loc)
def run_scan_net_ico(loc = "scan_net_ICO_no_attn_test.pth"):
    """Fit a ``ScanNet``, evaluate it with ``test_model`` and save it to ``loc``.

    Reads the module-level names ``USE_TEST``, ``USE_CUDA`` and ``use_attn``
    and prints progress messages along the way. The metrics returned by
    ``test_model`` are unpacked but not used further inside this function.
    """
    print("Modules loaded.")

    # The vocabulary file sits two directories above this module.
    here = dirname(abspath(__file__))
    parent_path = abspath(os.path.join(here, '..', '..'))
    vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")

    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        list(preprocessor.train_document_ids()),
        sections_of_interest=None,
        vocabulary_file=vocab_f,
        include_sentence_span_splits=True,
    )
    print("Train Data Achieved")

    if USE_TEST:
        val_Xy = preprocessor.get_Xy(
            preprocessor.validation_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True,
        )
        test_Xy = preprocessor.get_Xy(
            preprocessor.test_document_ids(),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True,
        )
    else:
        # carve an internal validation set out of the training data (90/10).
        split_index = int(len(train_Xy) * .9)
        train_Xy, val_Xy = train_Xy[:split_index], train_Xy[split_index:]
        test_Xy = preprocessor.get_Xy(
            list(preprocessor.validation_document_ids()),
            inference_vectorizer,
            sections_of_interest=None,
            include_sentence_span_splits=True,
        )
    print("Test Data Achieved")

    se_scn = ScanNet(inference_vectorizer, use_attention=use_attn)
    if USE_CUDA:
        se_scn = se_scn.cuda()
    print("Model loaded")

    # presumably (epochs, batch_size, patience) = (50, 32, 10) — confirm
    # against train_scan's signature. The test split is not passed here.
    train_scan(se_scn, inference_vectorizer, train_Xy, val_Xy, 50, 32, 10)
    acc, f1, prc, rc, auc = test_model(se_scn, test_Xy, inference_vectorizer)

    # save to specified path
    torch.save(se_scn.state_dict(), loc)
def train():
    """Fit the punchline extractor and checkpoint its best weights.

    Training assumes access to evidence_inference:
    https://github.com/jayded/evidence-inference/tree/master/evidence_inference
    which is not needed in general to load the trained model.

    Side effects: writes the model architecture to ``punchline_model.json``
    and lets the ``ModelCheckpoint`` callback write
    ``punchline.weights.best.hdf5``; prints progress messages.
    """
    from evidence_inference.preprocess.preprocessor import get_Xy, train_document_ids, validation_document_ids, get_train_Xy

    extractor_model = PunchlineExtractor()

    tr_ids = list(train_document_ids())
    val_ids = validation_document_ids()

    train_Xy, inference_vectorizer = get_train_Xy(
        tr_ids,
        sections_of_interest=None,
        vocabulary_file=None,
        include_sentence_span_splits=False,
        include_raw_texts=True)

    # Create vectors and targets for extraction task
    X_k, y_k = make_Xy(train_Xy, extractor_model.bc)
    print("train data loaded!")

    val_Xy = get_Xy(val_ids, inference_vectorizer, include_raw_texts=True)
    X_kv, y_kv = make_Xy(val_Xy, extractor_model.bc, neg_samples=1)
    print("val data loaded!")

    # Fit the model!
    filepath = "punchline.weights.best.hdf5"
    # NOTE(review): 'val_acc' has to match the metric name the compiled model
    # actually logs (some Keras releases log 'val_accuracy') — confirm.
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')
    callbacks_list = [checkpoint]

    with open("punchline_model.json", "w") as outf:
        outf.write(extractor_model.model.to_json())

    print("fitting punchline extractor!")
    extractor_model.model.fit(X_k,
                              y_k,
                              validation_data=(X_kv, y_kv),
                              callbacks=callbacks_list,
                              epochs=50)
def train_simple_inference_net(n_epochs=30):
    """Fit a ``SimpleInferenceNet`` and checkpoint its best weights.

    Args:
        n_epochs: number of epochs passed to ``model.fit``.

    Side effects: writes the model architecture to ``inference_model.json``
    and lets the ``ModelCheckpoint`` callback write
    ``inference.weights.best.hdf5``; prints progress messages.
    """
    inf_net = SimpleInferenceNet()

    tr_ids = list(train_document_ids())
    val_ids = validation_document_ids()

    train_Xy, inference_vectorizer = get_train_Xy(
        tr_ids,
        sections_of_interest=None,
        vocabulary_file=None,
        include_sentence_span_splits=False,
        include_raw_texts=True)

    X_k, y_k = make_Xy_inference(train_Xy, inf_net.bc)
    print("train data for inference task loaded!")

    val_Xy = get_Xy(val_ids, inference_vectorizer, include_raw_texts=True)
    X_kv, y_kv = make_Xy_inference(val_Xy, inf_net.bc)
    print("val data loaded!")

    filepath = "inference.weights.best.hdf5"
    # NOTE(review): 'val_acc' has to match the metric name the compiled model
    # actually logs (some Keras releases log 'val_accuracy') — confirm.
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')
    callbacks_list = [checkpoint]

    with open("inference_model.json", "w") as outf:
        outf.write(inf_net.model.to_json())

    print("fitting inference model!")
    inf_net.model.fit(X_k,
                      y_k,
                      validation_data=(X_kv, y_kv),
                      callbacks=callbacks_list,
                      epochs=n_epochs)
def load_data(use_test, bow=True):
    """Load the data into train/val/test sets of ``[evidence, outcome, intervention, comparator]`` rows.

    Every evidence snippet annotated for a prompt becomes one instance whose
    label is the most common label among that prompt's annotations.

    Args:
        use_test: when falsy, a random 10% of the training documents is held
            out as a dev set that is returned in the validation slot, and the
            official validation documents are returned in the test slot. When
            truthy, the official train/validation/test documents are used.
        bow: when truthy the six lists are passed through ``bag_of_words``;
            otherwise they are returned as-is.

    Returns:
        ``bag_of_words(x_train, y_train, x_val, y_val, x_test, y_test)`` if
        ``bow`` else ``[x_train, y_train, x_val, y_val, x_test, y_test]``.

    Raises:
        ValueError: if a prompt's study id belongs to no known split.

    Note:
        The dev split uses the unseeded global ``random`` state, so it differs
        between runs unless the caller seeds it.
    """
    prompts = preprocessor.read_prompts()
    annotations = preprocessor.read_annotations()

    # filter out prompts for which we do not have annotations for whatever reason
    # this was actually just one case; not sure what was going on there.
    has_annotations = prompts[PROMPT_ID_COL_NAME].isin(annotations[PROMPT_ID_COL_NAME])
    prompts = prompts[has_annotations]

    # Sort into training and validation by article id
    train_doc_ids = preprocessor.train_document_ids()
    val_doc_ids = preprocessor.validation_document_ids()
    test_doc_ids = preprocessor.test_document_ids()

    # get a dev set randomly
    dev_doc_ids = list(train_doc_ids)
    random.shuffle(dev_doc_ids)
    dev_doc_ids = set(dev_doc_ids[:int(len(dev_doc_ids) * .1)])

    x_train, y_train, x_dev, y_dev, x_val, y_val, x_test, y_test = [], [], [], [], [], [], [], []

    # Pull the columns out once instead of re-materialising them per prompt.
    pids = prompts[STUDY_ID_COL].values
    prompt_ids = prompts["PromptID"].values
    outcomes = prompts["Outcome"].values
    interventions = prompts["Intervention"].values
    comparators = prompts["Comparator"].values

    for i, id_ in enumerate(pids):
        annotations_for_prompt = annotations[annotations[PROMPT_ID_COL_NAME] == prompt_ids[i]]
        labels = annotations_for_prompt[[LBL_COL_NAME, EVIDENCE_COL_NAME]].values
        if len(labels) == 0:
            continue

        # extract i/c/o
        out = outcomes[i].lower()
        inter = interventions[i].lower()
        cmp = comparators[i].lower()

        # Majority label across this prompt's annotations. stats.mode returns
        # a length-1 array on older SciPy and a scalar on newer releases;
        # atleast_1d makes the indexing valid for both.
        loss = np.atleast_1d(stats.mode([l1[0] for l1 in labels])[0])[0]

        # add to correct pile: train/val/test
        if id_ in dev_doc_ids and not use_test:
            xs, ys = x_dev, y_dev
        elif id_ in train_doc_ids:
            xs, ys = x_train, y_train
        elif id_ in val_doc_ids:
            xs, ys = x_val, y_val
        elif id_ in test_doc_ids:
            xs, ys = x_test, y_test
        else:
            raise ValueError("Unknown study id {}".format(id_))

        # one instance per annotated reasoning (evidence snippet)
        for annotation in labels:
            xs.append([annotation[1], out, inter, cmp])
            ys.append(loss)

    # transform to np.array
    y_test = np.asarray(y_test)

    # if we are removing the test set, use validation as test set.
    if not use_test:
        x_test = x_val
        y_test = y_val
        x_val = x_dev
        y_val = y_dev

    if bow:
        print("Running bag of words...")
        return bag_of_words(x_train, y_train, x_val, y_val, x_test, y_test)
    return [x_train, y_train, x_val, y_val, x_test, y_test]
print("Loading data.")

# Fit the vectorizer on the complete training split.
train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
    set(preprocessor.train_document_ids()),
    sections_of_interest=None,
    vocabulary_file=None,
    include_sentence_span_splits=True,
)
print("Training data loaded.")

if USE_TEST:
    # Official validation and test documents.
    val_Xy = preprocessor.get_Xy(
        set(preprocessor.validation_document_ids()),
        inference_vectorizer,
        sections_of_interest=None,
        include_sentence_span_splits=True,
    )
    test_Xy = preprocessor.get_Xy(
        set(preprocessor.test_document_ids()),
        inference_vectorizer,
        sections_of_interest=None,
        include_sentence_span_splits=True,
    )
else:
    # Hold out the last 10% of the training instances for validation and
    # evaluate on the official validation documents.
    split_index = int(len(train_Xy) * .9)
    train_Xy, val_Xy = train_Xy[:split_index], train_Xy[split_index:]
    test_Xy = preprocessor.get_Xy(
        set(preprocessor.validation_document_ids()),
        inference_vectorizer,
        sections_of_interest=None,
        include_sentence_span_splits=True,
    )
print("Test data loaded.")
import sys

# Adds "../" to the import search path; presumably so the evidence_inference
# package below resolves — the entry depends on the working directory.
sys.path.append("../")

from evidence_inference.preprocess.preprocessor import get_Xy, train_document_ids, test_document_ids, validation_document_ids, get_train_Xy

# NOTE(review): CUDA is requested unconditionally, with no CPU fallback —
# confirm a GPU is always available where this runs.
device = torch.device('cuda')

print("loading train docs...")
tr_ids = list(train_document_ids())
# Only the first 100 training document ids are passed to get_train_Xy.
train_Xy, inference_vectorizer = get_train_Xy(
    tr_ids[:100], sections_of_interest=None, vocabulary_file=None, include_sentence_span_splits=False, include_raw_texts=True)
print("done")

val_ids = list(validation_document_ids())
val_Xy = get_Xy(val_ids, inference_vectorizer, include_raw_texts=True)


def instances_from_article(article_dict, neg_samples=2, max_instances=6):
    # NOTE(review): only the beginning of this function is visible here; the
    # part that would use neg_samples / max_instances is not shown, so the
    # comments below describe the visible early-exit path only.

    def filter_empty(snippets):
        # Keep only snippets longer than one character.
        return [s for s in snippets if len(s) > 1]

    # Lower-cased element [1] of every entry in article_dict['y'].
    evidence_snippets = filter_empty(
        [snippet[1].lower() for snippet in article_dict['y']])

    positive_snippets = evidence_snippets
    if len(positive_snippets) == 0:
        # Nothing usable for this article: report it and return two empty lists.
        print("no evidence snippets in an article!")
        return ([], [])
def load_data(use_test, bow=True):
    """Load train/val/test data, adding one "mismatched" row to held-out prompts.

    For every prompt whose document is not in the pure training set (training
    ids minus the random dev ids) and that has a neighbouring prompt (previous
    one preferred, otherwise the next one) with the same study id, a copy of
    the prompt's first row is appended with element ``[1]`` replaced by the
    neighbour's, labelled with the prompt's last label.

    Args:
        use_test: when falsy, a random 10% of the training documents is held
            out as a dev set that is returned in the validation slot, and the
            official validation documents are returned in the test slot. When
            truthy, the official train/validation/test documents are used.
        bow: when truthy the six lists are passed through
            ``bag_of_words(..., 5)``; otherwise they are returned as-is.

    Returns:
        ``bag_of_words(x_train, y_train, x_val, y_val, x_test, y_test, 5)`` if
        ``bow`` else ``[x_train, y_train, x_val, y_val, x_test, y_test]``.

    Raises:
        ValueError: if a prompt's study id belongs to no known split.

    Note:
        The dev split uses the unseeded global ``random`` state, so it differs
        between runs unless the caller seeds it.
    """
    prompts = preprocessor.read_prompts()
    annotations = preprocessor.read_annotations()

    # filter out prompts for which we do not have annotations for whatever reason
    # this was actually just one case; not sure what was going on there.
    def have_annotations_for_prompt(prompt_id):
        return len(
            annotations[annotations[PROMPT_ID_COL_NAME] == prompt_id]) > 0

    prompts = [
        prompt for row_idx, prompt in prompts.iterrows()
        if have_annotations_for_prompt(prompt[PROMPT_ID_COL_NAME])
    ]
    prompts = pd.DataFrame(prompts)

    # Sort into training and validation by article id
    train_doc_ids = preprocessor.train_document_ids()
    val_doc_ids = preprocessor.validation_document_ids()
    test_doc_ids = preprocessor.test_document_ids()

    # get a dev set randomly
    dev_doc_ids = list(train_doc_ids)
    random.shuffle(dev_doc_ids)
    dev_doc_ids = set(dev_doc_ids[:int(len(dev_doc_ids) * .1)])

    # Loop-invariant: training documents that were not moved into the dev set.
    pure_train_ids = train_doc_ids - dev_doc_ids

    x_train, y_train, x_dev, y_dev, x_val, y_val, x_test, y_test = [], [], [], [], [], [], [], []
    pids = prompts[STUDY_ID_COL].values
    n_prompts = len(pids)

    for i in range(n_prompts):
        id_, data, losses = parse_prompt_id_data(annotations, prompts, pids, i)

        if id_ not in pure_train_ids:
            # get a reasoning from previous/next prompt id; the bounds check
            # on i + 1 keeps the last prompt from indexing past the end.
            neighbour = None
            if i > 0 and id_ == pids[i - 1]:
                neighbour = i - 1
            elif i + 1 < n_prompts and id_ == pids[i + 1]:
                neighbour = i + 1

            if neighbour is not None:
                _, mismatched_data, _ = parse_prompt_id_data(
                    annotations, prompts, pids, neighbour)
                # add the mismatched data here
                row = copy.deepcopy(data[0])
                row[1] = mismatched_data[0][1]
                data.append(row)
                losses.append(losses[-1])

        for tmp, loss in zip(data, losses):
            # find where to put this section
            if id_ in dev_doc_ids and not use_test:
                x_dev.append(tmp)
                y_dev.append(loss)
            elif id_ in train_doc_ids:
                x_train.append(tmp)
                y_train.append(loss)
            elif id_ in val_doc_ids:
                x_val.append(tmp)
                y_val.append(loss)
            elif id_ in test_doc_ids:
                x_test.append(tmp)
                y_test.append(loss)
            else:
                raise ValueError("Unknown study id {}".format(id_))

    # if we are removing the test set, use validation as test set.
    if not use_test:
        x_test = x_val
        y_test = y_val
        x_val = x_dev
        y_val = y_dev

    if bow:
        return bag_of_words(x_train, y_train, x_val, y_val, x_test, y_test, 5)
    return [x_train, y_train, x_val, y_val, x_test, y_test]
weighted by number of prompts for that document. """ tokens = {} # Map article ids to token prompts = {} # Map article ids to num prompts for d in Xy: n_tokens = len(d['article']) tokens[d['a_id']] = n_tokens if d['a_id'] in prompts: prompts[d['a_id']] += 1 else: prompts[d['a_id']] = 1 total_entropy = 0 for art in prompts.keys(): total_entropy += np.log(tokens[art]) * prompts[art] / len(Xy) return total_entropy tr_ids, val_ids, te_ids = preprocessor.train_document_ids( ), preprocessor.validation_document_ids(), preprocessor.test_document_ids() train_Xy, inference_vectorizer = preprocessor.get_train_Xy(tr_ids) val_Xy = preprocessor.get_Xy(val_ids, inference_vectorizer) test_Xy = preprocessor.get_Xy(te_ids, inference_vectorizer) print(calculate_entropy(train_Xy)) print(calculate_entropy(val_Xy)) print(calculate_entropy(test_Xy))
sens, = [PaddedSequence.autopad(sentences, batch_first=True, padding_value=unk_idx)] sens = sens.cuda() preds = model(sens, batch_size=len(sentences)) pred = preds[0].data.tolist()[0] return pred print("Loading data...") # get training data train_Xy, inference_vectorizer = preprocessor.get_train_Xy(set(list(preprocessor.train_document_ids())), sections_of_interest=None, vocabulary_file="..//..//.//annotations/vocab.txt", include_sentence_span_splits = True) print("Training data loaded...") if not(USE_TEST): split_index = int(len(train_Xy) * .9) val_Xy = train_Xy[split_index:] train_Xy = train_Xy[:split_index] test_Xy = preprocessor.get_Xy(set(list(preprocessor.validation_document_ids())), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) else: val_Xy = preprocessor.get_Xy(set(list(preprocessor.validation_document_ids())), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) test_Xy = preprocessor.get_Xy(set(list(preprocessor.test_document_ids())), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) print("Test data loaded...") # modify training data val_Xy, test_Xy = scan_reform(val_Xy), scan_reform(test_Xy) print("Reformatted data.") # load the model model = load_model_scan(inference_vectorizer, './models/scan_model_neural.pth') print("Model loaded...") # after loading the model, get all predictions.