def load_bias(dataset_name: str, is_train=True) -> Dict[str, np.ndarray]:
  """Loads the output of our bias-only model

  Note that since this produces per-token output, it is only valid on data
  with the same tokenization as our annotated data.
  """
  if dataset_name == "location":
    cache_dir = TRIVIAQA_CP_LOCATION_FILTERED_BIAS
  elif dataset_name == "person":
    cache_dir = TRIVIAQA_CP_PERSON_FILTERED_BIAS
  else:
    raise ValueError(dataset_name)

  part_name = "train" if is_train else "dev"
  src = join(cache_dir, "%s.pkl" % part_name)
  if not exists(src):
    key = (dataset_name, part_name)
    if key not in TRIVIAQA_CP_BIAS_FILE_IDS:
      raise RuntimeError()
    logging.info("Downloading TriviaQA-CP bias for %s to %s" % (dataset_name, src))
    py_utils.download_from_drive(TRIVIAQA_CP_BIAS_FILE_IDS[key], src, progress_bar=False)
  return py_utils.load_pickle(src)
def load_annotated_squad(dataset_name) -> List[AnnotatedSquadParagraph]:
  """Loads SQuAD data that has been tokenized and tagged by CoreNLP"""
  if dataset_name not in DATASETS:
    raise ValueError("Invalid dataset %s" % dataset_name)
  src = join(config.SQUAD_CORENLP, "%s.pkl" % dataset_name)
  if not exists(src):
    logging.info("Downloading pre-processed SQuAD %s to %s" % (dataset_name, src))
    py_utils.download_from_drive(ANNOTATED_SQUAD_FILE_IDS[dataset_name], src)
  return py_utils.load_pickle(src)
def load_annotated_triviaqa(is_train: bool) -> List[AnnotatedTriviaQaExample]:
  """Loads TriviaQA data that has been tokenized and tagged by CoreNLP"""
  dataset_name = "train" if is_train else "dev"
  src = join(config.TRIVIAQA_CP_CORENLP, "%s.pkl" % dataset_name)
  if not exists(src):
    logging.info("Downloading pre-processed TriviaQA %s to %s" % (dataset_name, src))
    py_utils.download_from_drive(
        TRIVIAQA_CP_CORENLP_FILE_IDS[dataset_name], src, progress_bar=True)
  logging.info("Loading CoreNLP TriviaQA %s..." % dataset_name)
  return py_utils.load_pickle(src)
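# Hedged usage sketch (not part of the original file): the per-token bias from
# load_bias above is only meaningful against the CoreNLP tokenization returned by
# load_annotated_triviaqa, so the two would typically be loaded together and joined
# by example id. The attribute name `question_id` and the assumption that the bias
# dict is keyed by the same ids are illustrative guesses, not something defined here.
def _example_join_triviaqa_bias(dataset_name="location", is_train=False):
  examples = load_annotated_triviaqa(is_train)
  bias = load_bias(dataset_name, is_train)
  joined = []
  for ex in examples:
    per_token_bias = bias.get(ex.question_id)  # hypothetical id attribute
    if per_token_bias is not None:
      joined.append((ex, per_token_bias))
  return joined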
def get_squad_tfidf_features(dataset_name, pos_filter):
  """Gets the TF-IDF features we used to train the bias-only model

  :param dataset_name: Name of SQuAD dataset to get features for
  :param pos_filter: POS-filtered TF-IDF scores or not
  :return: Dictionary of question_id -> per-word array of TF-IDF scores
  """
  if pos_filter:
    root = SQUAD_FILTERED_TFIDF_FEATURES
  else:
    root = SQUAD_TFIDF_FEATURES
  src = join(root, dataset_name + ".pkl")
  if not exists(src):
    build_squad_tfidf_features(pos_filter)
  return py_utils.load_pickle(src)
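# Minimal, hypothetical consumer of the features above: each entry is a per-word
# array of TF-IDF scores keyed by question id. The softmax here only illustrates
# how such scores could be turned into a per-word distribution for a bias-only
# model; it is not the actual training code, and "dev" is a placeholder name.
def _example_tfidf_distribution(dataset_name="dev", question_id=None):
  features = get_squad_tfidf_features(dataset_name, pos_filter=True)
  if question_id is None:
    question_id = next(iter(features))
  scores = features[question_id]
  exp_scores = np.exp(scores - scores.max())
  return exp_scores / exp_scores.sum()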
def load_bias(dataset_name, filtered=False) -> Dict[str, np.ndarray]:
  """Loads the output of our bias-only model

  Note that since this produces per-token output, it is only valid on data
  with the same tokenization as our annotated data.
  """
  if filtered:
    bias_ids = SQUAD_FILTERED_BIAS_FILE_IDS
    output_dir = SQUAD_TFIDF_FILTERED_BIAS
  else:
    bias_ids = SQUAD_BIAS_FILE_IDS
    output_dir = SQUAD_TFIDF_BIAS

  if dataset_name not in bias_ids:
    raise ValueError("No bias for %s" % dataset_name)

  src = join(output_dir, "%s.pkl" % dataset_name)
  if not exists(src):
    logging.info("Downloading SQuAD bias for %s to %s" % (dataset_name, src))
    py_utils.download_from_drive(bias_ids[dataset_name], src)
  return py_utils.load_pickle(src)
def load_bias(dataset_name) -> Dict[str, np.ndarray]:
  """Load dictionary of example_id->bias, where bias is a length-3 array
  of log-probabilities"""
  if dataset_name not in MNLI_BIAS_DRIVE_IDS:
    raise ValueError(dataset_name)
  bias_src = join(config.MNLI_WORD_OVERLAP_BIAS, dataset_name + ".pkl")
  if not exists(bias_src):
    logging.info("Downloading MNLI bias to %s..." % bias_src)
    py_utils.download_from_drive(MNLI_BIAS_DRIVE_IDS[dataset_name], bias_src)
  bias = py_utils.load_pickle(bias_src)
  for k, v in bias.items():
    # Convert from entail vs. non-entail to 3-way classes by splitting
    # non-entail into neutral and contradict
    bias[k] = np.array([
        v[0] - np.log(2.),
        v[1],
        v[0] - np.log(2.),
    ])
  return bias
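# Worked illustration (not from the original repo) of the split in load_bias above,
# assuming v = [log p(non-entail), log p(entail)] as the comment suggests. Dividing
# the non-entailment probability evenly between the two non-entailment slots
# (subtracting log 2 in log space) keeps the expanded output a valid distribution.
def _example_split_non_entail():
  two_way = np.log(np.array([0.7, 0.3]))  # hypothetical bias-only output
  three_way = np.array([two_way[0] - np.log(2.), two_way[1], two_way[0] - np.log(2.)])
  # np.exp(three_way) == [0.35, 0.3, 0.35], which still sums to one
  return three_way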
def get_predictions(path, part="hans", batch_size=128, sample=None,
                    n_processes=None, cache=False):
  """Computes softmax predictions for the model in `path` on the `part` eval set,
  optionally caching them to disk."""
  output = join(path, "%s-prediction.pkl" % part)
  if sample is None and exists(output) and cache:
    return py_utils.load_pickle(output)

  logging.info("Computing predictions for %s on %s..." % (path, part))
  logging.info("Loading model...")
  model_dir = ModelDir(path)
  model = model_dir.get_model()
  tokenizer = model.get_tokenizer()

  all_examples = load_eval_set(part, sample)
  all_examples = mnli.tokenize_examples(all_examples, tokenizer, n_processes)
  all_examples.sort(key=lambda x: len(x.premise))

  logging.info("Setup data...")
  voc = set()
  for ex in all_examples:
    voc.update(ex.premise)
    voc.update(ex.hypothesis)
  model.set_vocab(voc)

  with tf.Session(graph=tf.Graph()) as sess:
    ds = mnli.make_dataset(all_examples, shuffle=False)
    fn = model.tensorize_fn()
    ds = ds.map(fn)
    ds = ds.padded_batch(batch_size, ds.output_shapes)
    ds = ds.prefetch(5)
    it = ds.make_initializable_iterator()
    next_op = it.get_next()

    logits = model.apply(False, next_op, None)
    pred_op = tf.nn.softmax(logits)

    logging.info("Initializing...")
    sess.run(tf.local_variables_initializer())
    sess.run(tf.tables_initializer())
    sess.run(it.initializer)

    logging.info("Loading checkpoint...")
    saver = tf.train.Saver()
    saver.restore(sess, model_dir.get_latest_checkpoint())

    predictions = []
    pbar = tqdm(desc="classify", total=len(all_examples), ncols=80)
    while True:
      try:
        predictions.append(sess.run(pred_op))
        pbar.update(len(predictions[-1]))
      except tf.errors.OutOfRangeError:
        break
    pbar.close()

  predictions = np.concatenate(predictions, 0)
  predictions = {k.id: p for p, k in zip(predictions, all_examples)}

  if sample is None and cache:
    with open(output, "wb") as f:
      pickle.dump(predictions, f)

  return predictions
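# Hedged usage sketch: get_predictions returns a dict mapping example id to a
# softmax distribution over the model's classes (three for MNLI), so evaluation
# code can look predictions up by id. The model directory path below is a
# placeholder, not a path from the original repo.
def _example_eval(model_path="/path/to/model"):
  predictions = get_predictions(model_path, part="hans", cache=True)
  for example_id, probs in list(predictions.items())[:5]:
    print(example_id, probs.argmax(), probs)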
def get_model(self) -> TextModel:
  return load_pickle(join(self.dir, "model.pkl"))