class BiEncoderTopXRetriever:
    """Encode mentions with a bi-encoder and query a pre-built searcher for top candidates."""

    def __init__(self, args, vocab, biencoder_onlyfor_encodingmentions, faiss_stored_kb,
                 reader_for_mentions, duidx2encoded_emb):
        """Keep references to the collaborators and build the evaluation batch iterator.

        The mention encoder is switched to eval mode as part of construction.
        """
        self.args = args

        self.mention_encoder = biencoder_onlyfor_encodingmentions
        self.mention_encoder.eval()

        self.faiss_searcher = faiss_stored_kb
        self.reader_for_mentions = reader_for_mentions

        self.sequence_iterator = BasicIterator(batch_size=self.args.batch_size_for_eval)
        self.sequence_iterator.index_with(vocab)

        # Every batch is moved to this fixed device index before encoding.
        self.cuda_device = 0
        self.duidx2encoded_emb = duidx2encoded_emb

    def biencoder_tophits_retrievaler(self, train_or_dev_or_test_flag, how_many_top_hits_preserved=500):
        """Yield ``(candidates, mention_uniq_ids, gold_duidxs)`` for each batch of one split.

        Args:
            train_or_dev_or_test_flag: Passed unchanged to ``reader_for_mentions.read``.
            how_many_top_hits_preserved: Number of hits requested from the searcher
                for every mention.
        """
        dataset = self.reader_for_mentions.read(train_or_dev_or_test_flag)
        batches = tqdm(
            self.sequence_iterator(dataset, num_epochs=1, shuffle=False),
            total=self.sequence_iterator.get_num_batches(dataset),
        )

        with torch.no_grad():
            for raw_batch in batches:
                on_device = nn_util.move_to_device(raw_batch, self.cuda_device)
                mention_ids, mention_embs, gold_ids = \
                    self._extract_mention_idx_encoded_emb_and_its_gold_cuidx(batch=on_device)
                candidates = self.faiss_topx_retriever(
                    encoded_mentions=mention_embs,
                    how_many_top_hits_preserved=how_many_top_hits_preserved,
                )
                yield candidates, mention_ids, gold_ids

    def faiss_topx_retriever(self, encoded_mentions, how_many_top_hits_preserved):
        """Search the stored KB for the nearest entries of each encoded mention.

        When ``args.search_method`` is ``'cossim'`` the mention vectors are
        normalised along dim 1 before searching; any other method searches
        with the vectors as given.

        Args:
            encoded_mentions: numpy array of mention embeddings, one row per mention.
            how_many_top_hits_preserved: Number of hits requested per mention.

        Returns:
            The second element of the searcher's ``search`` result
            (presumably the candidate indices -- confirm against the searcher).
        """
        queries = encoded_mentions
        if self.args.search_method == 'cossim':
            queries = normalize(torch.from_numpy(queries), dim=1).cpu().detach().numpy()

        _, candidate_idxs = self.faiss_searcher.search(queries, how_many_top_hits_preserved)
        return candidate_idxs

    def calc_L2distance(self, h, t):
        """Return the norm of ``h - t`` taken over dim 2."""
        return torch.norm(h - t, dim=2)

    def tonp(self, tsr):
        """Detach a tensor, move it to CPU and convert it to a numpy array."""
        detached = tsr.detach()
        return detached.cpu().numpy()

    def _extract_mention_idx_encoded_emb_and_its_gold_cuidx(self, batch):
        """Run the mention encoder on ``batch`` and return three numpy arrays.

        Returns:
            ``(mention_uniq_id, contextualized_mention, gold_duidx)`` taken from
            the encoder's output dictionary.
        """
        encoded = self.mention_encoder(**batch)
        wanted_keys = ('mention_uniq_id', 'contextualized_mention', 'gold_duidx')
        return tuple(self.tonp(encoded[key]) for key in wanted_keys)
class DataIteratorWrapper:
    """Expose a fixed set of instances as a sized, re-iterable sequence of batches."""

    def __init__(self, vocab: Vocabulary, instances, batch_size, shuffle):
        """Build a caching ``BasicIterator`` indexed with ``vocab`` and remember the data.

        Args:
            vocab: Vocabulary the iterator is indexed with.
            instances: The instances to batch.
            batch_size: Number of instances per batch.
            shuffle: Forwarded to the iterator on every pass.
        """
        batcher = BasicIterator(batch_size=batch_size, cache_instances=True)
        batcher.index_with(vocab)

        self.data_iter = batcher
        self.instances = instances
        self.shuffle = shuffle

    def __len__(self):
        """Number of batches the iterator reports for the stored instances."""
        batch_count = self.data_iter.get_num_batches(self.instances)
        return batch_count

    def __iter__(self):
        """Start one pass (a single epoch) over the stored instances."""
        single_epoch = self.data_iter(self.instances, shuffle=self.shuffle, num_epochs=1)
        return single_epoch
def run(args):
    """Load an archived model, predict over ``args.input_path`` and pickle the results.

    Reads ``device``, ``model_path``, ``input_path``, ``batch_size`` and
    ``output_path`` from ``args``. The pickled object is a dict keyed by
    protein id; the payload per protein depends on ``model.target``.
    """

    def select_output(target, pred, length):
        # Trim the padded prediction to the protein's length and package it
        # under the keys that belong to the model's target type.
        if target == 'dcalpha':
            square = pred[:length, :length]
            # Keep the strict upper triangle and mirror it below the diagonal.
            mirrored = np.triu(square, 1) + np.tril(square.transpose(), -1)
            return {'dcalpha': mirrored}
        if target == 'angles':
            return {'psi': pred[:length, 0], 'phi': pred[:length, 1]}
        return {'coords': pred[:length]}

    print('\nArguments:')
    for name, value in vars(args).items():
        print(f'{name}: {value}')
    print()

    device = args.device
    if device is None:
        if torch.cuda.is_available():
            device = 'cuda'
        else:
            device = 'cpu'

    print('Loading archive ...')
    archive = load_archive(args.model_path)
    config = archive.config.duplicate()
    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    model = archive.model.to(device).eval()

    print('Loading data ...')
    dataset_reader.lazy = False
    dataset = dataset_reader.read(args.input_path)

    iterator = BasicIterator(args.batch_size)
    iterator.index_with(model.vocab)
    num_batches = iterator.get_num_batches(dataset)
    batches = iterator(dataset, num_epochs=1, shuffle=False)

    print('Predicting ...')
    output_dict = {}
    with torch.no_grad():
        for batch in Tqdm.tqdm(batches, total=num_batches):
            batch = move_to_device(batch, model._get_prediction_device())
            outputs = model(**batch)
            predictions = outputs['predictions'].cpu().numpy()
            per_protein = zip(outputs['protein_id'], outputs['length'], predictions)
            for pid, length, pred in per_protein:
                output_dict[pid] = select_output(model.target, pred, length)

    print('Writing to {}'.format(args.output_path))
    with open(args.output_path, 'wb') as fout:
        pickle.dump(output_dict, fout)
    print('All done.')
def latent(data_file, model_dir, epoch, device, impath):
    """Subcommand to analyze the latent space.

    Encodes the premise and hypothesis of every instance in ``data_file``
    with the model's task encoder, computes ``1 - cosine(...)`` per pair and
    logs the mean similarity per label.

    Args:
        data_file: Path handed to ``SNLIMetaReader.read``.
        model_dir: Directory handed to ``prediction_utils.load_model``.
        epoch: Epoch handed to ``prediction_utils.load_model``.
        device: Device the model is loaded on and batches are moved to.
        impath: Not used by this function.
    """
    # Prepare dataset
    reader = SNLIMetaReader()
    instances = reader.read(data_file)

    premises = []
    hypotheses = []
    similarities = []
    labels = []

    iterator = BasicIterator(batch_size=128)
    with torch.no_grad():
        model = prediction_utils.load_model(model_dir, epoch, device)
        model.eval()
        iterator.index_with(model.vocab)

        logger.info(f'Iterating over data: {data_file}')
        generator_tqdm = tqdm(iterator(instances, num_epochs=1, shuffle=False),
                              total=iterator.get_num_batches(instances))
        for batch in generator_tqdm:
            batch = nn_util.move_to_device(batch, device)
            # Only the second element of each encoder result is used here.
            _, zp, _ = model._encode(batch['premise'], model._task_encoder, 0.0)
            _, zh, _ = model._encode(batch['hypothesis'], model._task_encoder, 0.0)

            metadata = batch['metadata']
            premises.extend(' '.join(meta['premise_tokens']) for meta in metadata)
            hypotheses.extend(' '.join(meta['hypothesis_tokens']) for meta in metadata)
            labels.extend(meta['label'] for meta in metadata)

            # NOTE(review): `cosine` is presumably a distance (hence `1 - ...`);
            # confirm it accepts the encoder's output type on `device`.
            similarities.extend(1 - cosine(zpe, zhe) for zpe, zhe in zip(zp, zh))

    df = pd.DataFrame({
        'sentence1': premises,
        'sentence2': hypotheses,
        'similarity': similarities,
        'label': labels
    })
    # Average only the numeric column: the sentence columns hold strings, and
    # a bare `.mean()` over them raises TypeError on pandas >= 2.0.
    logger.info(df.groupby('label')[['similarity']].mean())
class InKBAllEntitiesEncoder:
    """Encode every entity of the KB once and expose the result as ``{duidx: embedding}``."""

    def __init__(self, args, entity_loader_datasetreaderclass, entity_encoder_wrapping_model, vocab):
        """Store the reader, encoder and vocabulary; put the encoder in eval mode."""
        self.args = args
        self.entity_loader_datasetreader = entity_loader_datasetreaderclass
        self.sequence_iterator_for_encoding_entities = BasicIterator(
            batch_size=args.batch_size_for_kb_encoder)
        self.vocab = vocab

        self.entity_encoder_wrapping_model = entity_encoder_wrapping_model
        self.entity_encoder_wrapping_model.eval()

        # Every batch is moved to this fixed device index before encoding.
        self.cuda_device = 0

    def encoding_all_entities(self):
        """Return a dict mapping entity index to its encoded embedding.

        With ``args.load_entities`` set, the dict is loaded from
        ``args.entities_path / args.entities_filename`` instead of being
        computed. With ``args.save_entities`` set, the resulting dict is
        written to that same path.
        """
        duidx2emb = {}
        iterator = self.sequence_iterator_for_encoding_entities

        dataset = self.entity_loader_datasetreader.read('test')
        iterator.index_with(self.vocab)
        batches = tqdm(
            iterator(dataset, num_epochs=1, shuffle=False),
            total=iterator.get_num_batches(dataset),
        )

        print('======Encoding all entites from title and description=====')
        entities_full_path = os.path.join(self.args.entities_path, self.args.entities_filename)

        if not self.args.load_entities:
            with torch.no_grad():
                for raw_batch in batches:
                    on_device = nn_util.move_to_device(raw_batch, self.cuda_device)
                    duidxs, embs = self._extract_cuidx_and_its_encoded_emb(on_device)
                    for duidx, emb in zip(duidxs, embs):
                        duidx2emb[int(duidx)] = emb
        else:
            duidx2emb = pickle_load_object(entities_full_path)

        # NOTE(review): kept at method level (not nested in the compute branch);
        # confirm against the original layout.
        if self.args.save_entities:
            pickle_save_object(duidx2emb, entities_full_path)

        return duidx2emb

    def tonp(self, tsr):
        """Detach a tensor, move it to CPU and convert it to a numpy array."""
        detached = tsr.detach()
        return detached.cpu().numpy()

    def _extract_cuidx_and_its_encoded_emb(self, batch) -> np.ndarray:
        """Run the entity encoder on ``batch``.

        Returns a pair ``(gold_duidx, emb_of_entities_encoded)`` of numpy
        arrays (the annotation names only the element type).
        """
        encoded = self.entity_encoder_wrapping_model(**batch)
        entity_ids = self.tonp(encoded['gold_duidx'])
        entity_embs = self.tonp(encoded['emb_of_entities_encoded'])
        return entity_ids, entity_embs
def evaluate(model: Model, dataset: Dataset, iterator: BasicIterator, cuda_device: int,
             serialization_directory: str) -> Dict[str, Any]:
    """Run ``model`` over ``dataset`` and write its predictions in a CoNLL-like format.

    Args:
        model: Model to evaluate; it is put in eval mode.
        dataset: Dataset whose ``instances`` are evaluated.
        iterator: Iterator used to batch the dataset (one epoch, no shuffling).
        cuda_device: Device index forwarded to the iterator.
        serialization_directory: Directory that receives ``predictions.txt``.

    Returns:
        The dictionary returned by ``model.get_metrics()`` after all
        predictions have been written.

    Raises:
        AssertionError: If the number of gold/predicted span lists differs
            from the number of instances, or tag sequences differ in length
            from their sentence.
        ValueError: If an instance has a ``targets`` field without a ``1`` label.
    """
    model.eval()

    generator = iterator(dataset, num_epochs=1, cuda_device=cuda_device,
                         shuffle=False, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        # Show the running "overall" metrics on the progress bar.
        description = ', '.join([
            "%s: %.5f" % (name, value)
            for name, value in metrics.items() if "overall" in name
        ]) + " ||"
        generator_tqdm.set_description(description)

    metrics = model.get_metrics()
    golds = metrics["gold_spans"]
    predictions = metrics["predicted_spans"]
    assert len(dataset.instances) == len(golds) == len(predictions)

    prediction_file_path = os.path.join(serialization_directory, "predictions.txt")
    # The context manager guarantees the file is flushed and closed even when
    # writing fails part-way; the original left the handle open.
    with open(prediction_file_path, "w+") as prediction_file:
        logger.info("Writing predictions in CoNLL-like format to %s", prediction_file_path)
        triples = zip(dataset.instances, golds, predictions)
        for instance, gold, prediction in tqdm.tqdm(triples, total=len(dataset.instances)):
            fields = instance.fields

            if "targets" in fields:
                verb_index = fields["targets"].labels.index(1)
            elif "verb_indicator" in fields:
                try:
                    # Most sentences have a verbal predicate, but not all.
                    verb_index = fields["verb_indicator"].labels.index(1)
                except ValueError:
                    verb_index = None
            else:
                verb_index = None

            # Optional fields default to None when the instance lacks them.
            frame = fields["frame"].tokens[0].text if "frame" in fields else None
            gf = [g.text for g in fields["gf"].tokens] if "gf" in fields else None
            pt = [p.text for p in fields["pt"].tokens] if "pt" in fields else None

            sentence = [token.text for token in fields["tokens"].tokens]
            gold_tags = convert_spans_to_seq(gold, len(sentence))
            predicted_tags = convert_spans_to_seq(prediction, len(sentence))
            assert len(sentence) == len(gold_tags) == len(predicted_tags)

            write_to_conll_eval_file(prediction_file, verb_index, sentence,
                                     predicted_tags, gold_tags, frame, gf, pt)

    return model.get_metrics()
# In[ ]: filtered_params = [p for name, p in model.named_parameters() if use(name)] # In[ ]: optimizer = torch.optim.Adam(filtered_params, lr=config.lr, weight_decay=0.) # In[ ]: from allennlp.training.learning_rate_schedulers import SlantedTriangular, CosineWithRestarts # use slanted triangular lr scheduler to prevent initial spike in consistency loss lr_sched = SlantedTriangular( optimizer, num_epochs=config.epochs, num_steps_per_epoch=iterator.get_num_batches(train_ds)) # In[ ]: from allennlp.training import TrainerWithCallbacks trainer = TrainerWithCallbacks( model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_ds, validation_dataset=val_ds, callbacks=[stat_rec, wdd, monitor], learning_rate_scheduler=lr_sched, # serialization_dir=DATA_ROOT / "debias_ckpts", cuda_device=0 if torch.cuda.is_available() else -1,
class EvaluatorClass:
    """Link-prediction evaluation and entity-embedding extraction for a KG model."""

    def __init__(self, args, model, vocab, er_vocab, all_entity_num, entity_dim):
        """Store the evaluation settings and build the batch iterator.

        Args:
            args: Namespace providing ``evaluate_on_cpu``, ``cuda_device`` and
                ``batch_size``.
            model: Model to evaluate; put in eval mode, and its
                ``evaluate_flag`` attribute is incremented by one.
            vocab: Vocabulary the iterator is indexed with.
            er_vocab: Mapping from ``(head, relation)`` to the entity indices
                that are zeroed out during filtered ranking.
            all_entity_num: Number of rows of the entity embedding matrix.
            entity_dim: Number of columns of the entity embedding matrix.
        """
        self.args = args
        self.evaluate_on_cpu = self.args.evaluate_on_cpu
        self.is_cuda_available = torch.cuda.is_available()
        self.er_vocab = er_vocab
        self.cuda_device = int(args.cuda_device)
        self.model = model
        self.model.eval()
        self.model.evaluate_flag += 1
        self.sequence_iterator = BasicIterator(batch_size=args.batch_size)
        self.sequence_iterator.index_with(vocab)
        self.all_entity_num = all_entity_num
        self.entity_dim = entity_dim

    def _batches(self, ds):
        """Return a progress-wrapped, single-epoch, unshuffled batch generator.

        Also puts the model in eval mode and, when evaluating on CPU, moves
        the model there.
        """
        generator = self.sequence_iterator(ds, num_epochs=1, shuffle=False)
        self.model.eval()
        if self.evaluate_on_cpu:
            self.model.cpu()
        return tqdm(generator, total=self.sequence_iterator.get_num_batches(ds))

    def _to_eval_device(self, batch):
        """Move ``batch`` to ``self.cuda_device`` unless evaluation runs on CPU."""
        if self.evaluate_on_cpu:
            return batch
        return nn_util.move_to_device(batch, self.cuda_device)

    def evaluation(self, ds):
        """Print Hits@1/3/10, mean rank and mean reciprocal rank over ``ds``.

        For every triple, the scores of all entities listed in ``er_vocab``
        for the (head, relation) pair are set to 0.0, except the score of the
        triple's own tail, before ranking.
        """
        max_hits_level = 10
        hits = [[] for _ in range(max_hits_level)]
        ranks = []

        with torch.no_grad():
            for batch in self._batches(ds):
                batch = self._to_eval_device(batch)
                # batch * all ent size
                preds = self.model.eval_all_entities(batch['head'], batch['relation'])

                batch_size = batch['head'].size(0)
                tails = [batch['tail'][j][0].int().item() for j in range(batch_size)]

                for j, tail in enumerate(tails):
                    head = batch['head'][j][0].int().item()
                    relation = batch['relation'][j][0].int().item()
                    filt = self.er_vocab[(head, relation)]
                    # Preserve the target's score across the filtering step.
                    target_value = preds[j, tail].item()
                    preds[j, filt] = 0.0
                    preds[j, tail] = target_value

                _, sort_idxs = torch.sort(preds, dim=1, descending=True)
                sort_idxs = sort_idxs.cpu().numpy()

                for j, tail in enumerate(tails):
                    # 0-based position of the true tail in the sorted scores.
                    rank = np.where(sort_idxs[j] == tail)[0][0]
                    ranks.append(rank + 1)
                    for hits_level in range(max_hits_level):
                        hits[hits_level].append(1.0 if rank <= hits_level else 0.0)

        print('\n ###### RESULTS ######')
        print('Hits @10: {0}'.format(np.mean(hits[9])))
        print('Hits @3: {0}'.format(np.mean(hits[2])))
        print('Hits @1: {0}'.format(np.mean(hits[0])))
        print('Mean rank: {0}'.format(np.mean(ranks)))
        print('Mean reciprocal rank: {0}'.format(np.mean(1. / np.array(ranks))))
        print('###### ###### ######')
        print('\n### EVALUATION FINISHED ###\n')

    def add_Embclass_2_model(self, E_numpy_weight):
        """Attach ``E_numpy_weight`` to the model as ``model.embed_for_model``.

        The weights are converted to a float tensor, moved to CUDA when it is
        available and evaluation is not forced onto the CPU, and wrapped with
        ``nn.Embedding.from_pretrained``.
        """
        embedding_tensor = torch.Tensor(E_numpy_weight).float()
        if self.is_cuda_available and not self.evaluate_on_cpu:
            embedding_tensor = embedding_tensor.cuda()
        # add class variable to model
        self.model.embed_for_model = nn.Embedding.from_pretrained(embedding_tensor)

    def get_E_numpy_from_alldataset(self, dslist):
        """Collect one vector per entity over all datasets and return them as a matrix.

        The first vector seen for an entity index wins; later occurrences are
        ignored. Heads of a batch are visited before its tails.
        """
        entity_symbolidx_2_KGemb_through_linear = {}
        for ds in dslist:
            batches = self._batches(ds)
            with torch.no_grad():
                for batch in batches:
                    batch = self._to_eval_device(batch)
                    # batch, batch * dim, batch, batch * dim
                    hidx, hvec, tidx, tvec = self._extract_head_or_tail_and_its_vectordata(batch)
                    for head_idx, head_ent_vect in zip(hidx, hvec):
                        entity_symbolidx_2_KGemb_through_linear.setdefault(head_idx, head_ent_vect)
                    for tail_idx, tail_ent_vect in zip(tidx, tvec):
                        entity_symbolidx_2_KGemb_through_linear.setdefault(tail_idx, tail_ent_vect)

        return self.entity_symbolidx_2_KGemb_through_linear__2__E(
            entity_symbolidx_2_KGemb_through_linear)

    def entity_symbolidx_2_KGemb_through_linear__2__E(
            self, entity_symbolidx_2_KGemb_through_linear):
        """Scatter ``{entity index: vector}`` into a float32 matrix.

        Returns:
            Array of shape ``(all_entity_num, entity_dim)``; rows of entities
            absent from the mapping stay zero.
        """
        KBemb = np.zeros((self.all_entity_num, self.entity_dim)).astype('float32')
        for ent_idx, vec in entity_symbolidx_2_KGemb_through_linear.items():
            KBemb[ent_idx] = vec
        print('converted emb', len(entity_symbolidx_2_KGemb_through_linear), '/',
              self.all_entity_num)
        return KBemb

    ### misc ###
    def idx2int_tensor(self, data):
        """Cast a tensor to int and return it as a numpy array on the CPU."""
        return data.int().cpu().detach().numpy()

    def vector2tensor(self, data):
        """Return a tensor as a numpy array on the CPU (despite the method's name)."""
        return data.cpu().detach().numpy()

    def get_W_numpy(self):
        """Return whatever ``model.get_W()`` returns."""
        return self.model.get_W()

    def get_R_numpy(self):
        """Return whatever ``model.get_R()`` returns."""
        return self.model.get_R()

    def tonp(self, tsr):
        """Detach a tensor, move it to CPU and convert it to a numpy array."""
        return tsr.detach().cpu().numpy()

    def _extract_head_or_tail_and_its_vectordata(self, batch) -> tuple:
        """Run the model on ``batch`` and return head/tail indices with their vectors.

        :param batch: Keyword arguments for the model's forward call.
        :return: ``(head_idx, head_vectors, tail_idx, tail_vectors)`` as numpy
            arrays. The index arrays are always 1-D.
        """
        out_dict = self.model(**batch)
        # reshape(-1) rather than squeeze(): squeeze() collapses a batch of
        # size one to a 0-d array, which the callers cannot iterate over.
        head_idx = self.idx2int_tensor(out_dict['heads']).reshape(-1)
        tail_idx = self.idx2int_tensor(out_dict['tails']).reshape(-1)
        head_sents = self.vector2tensor(out_dict['heads_sent_encoded2KGemb'])
        tail_sents = self.vector2tensor(out_dict['tails_sent_encoded2KGemb'])
        return head_idx, head_sents, tail_idx, tail_sents
class BaseDataReader(DatasetReader):
    """Dataset reader that eagerly loads train/dev/test splits and batches them.

    Subclasses must implement ``_read`` and ``text_to_instance``; the
    constructor calls ``self.read`` for all three splits.
    """

    def __init__(self, data_dir, batch_size: int, shuffle=False, small_data=False,
                 train_name='train.json', dev_name='dev.json', test_name='test.json'):
        """Read the three splits, build the vocabulary and create the iterator.

        Args:
            data_dir: Directory containing the split files.
            batch_size: Batch size for the iterator.
            shuffle: Whether training batches are shuffled.
            small_data: When true, only the first tenth of the training split
                is served by ``get_iterator_and_num_batches``.
            train_name: File name of the training split.
            dev_name: File name of the validation split.
            test_name: File name of the test split.
        """
        super().__init__()
        self.data_dir = data_dir

        train_path = os.path.join(data_dir, train_name)
        print('loading dataset: ' + train_path)
        self.train_dataset = self.read(train_path)

        dev_path = os.path.join(data_dir, dev_name)
        print('loading val dataset: ' + dev_path)
        self.validation_dataset = self.read(dev_path)

        # The vocabulary is built from the train and validation splits only.
        self.vocab = Vocabulary.from_instances(self.train_dataset + self.validation_dataset)

        test_path = os.path.join(data_dir, test_name)
        print('loading test dataset:' + test_path)
        self.test_dataset = self.read(test_path)

        self.batch_size = batch_size
        self.shuffle = shuffle
        self.small_data = small_data

        self.iterator = BasicIterator(batch_size=batch_size, cache_instances=True)
        self.iterator.index_with(self.vocab)

    def get_iterator_and_num_batches(self, data_type):
        """Return ``(batch generator, number of batches)`` for one split.

        Args:
            data_type: One of ``'train'``, ``'dev'`` or ``'test'``.

        Raises:
            KeyError: If ``data_type`` is not one of the three split names.
        """
        splits = {
            'train': self.train_dataset,
            'dev': self.validation_dataset,
            'test': self.test_dataset
        }
        dataset = splits[data_type]
        is_train = data_type == 'train'

        if self.small_data and is_train:
            tenth = len(dataset) // 10
            dataset = dataset[:tenth]

        # Only the training split is ever shuffled.
        shuffle = is_train and self.shuffle

        batches = self.iterator(dataset, shuffle=shuffle, num_epochs=1)
        return batches, self.iterator.get_num_batches(dataset)

    def _read(self, file_path: str) -> Iterable[Instance]:
        """Produce the instances stored at ``file_path``.

        Must be overridden by subclasses; this base implementation always
        raises ``NotImplementedError``.
        """
        raise NotImplementedError

    def text_to_instance(self, *inputs) -> Instance:
        """Convert raw inputs into an ``Instance``.

        Must be overridden by subclasses; this base implementation always
        raises ``NotImplementedError``.
        """
        raise NotImplementedError
class BasePredictionClass(object):
    """
        This (Abstract) class is devoted to extracting predictions,
        managing storage of predictions, and the optional
        visualization of predictions
    """

    def __init__(self, vocab, reader, visualize=False):
        """Build a fixed-size (32) batch iterator and fetch the reader's label indexer."""
        self._vocab = vocab
        self._iterator = BasicIterator(batch_size=32)
        self._iterator.index_with(self._vocab)
        self._reader = reader
        self._indexer = self._reader.get_label_indexer()
        self._visualize = visualize

    def _get_text_from_instance(self, instance: Instance) -> List[str]:
        """Helper function to extract the token texts from an instance."""
        return [token.text for token in instance.fields['tokens'].tokens]

    def get_segmentation_from_prediction(self, *args, **kwargs) -> List[str]:
        """Turn a prediction into segmentation labels; implemented by child classes."""
        raise NotImplementedError("The child class implements this")

    def visualize(self, *args, **kwargs):
        """Render predictions to a file; implemented by child classes."""
        raise NotImplementedError("The child class implements this")

    def _get_filtered_set(self):
        """
            The set of words/symbols to be filtered out
        """
        return set()

    def get_predictions(self,
                        instances: List[Instance],
                        model: Model,
                        cuda_device: int = -1,
                        prediction_file: Optional[str] = None,
                        visualization_file: Optional[str] = None,
                        verbose: bool = False) -> List[Dict]:
        """
            We use this function to get predictions.
            We use a basic iterator, since a bucket iterator shuffles
            data, even for shuffle=False.

            Arguments:
                instances (List[Instance]) : The list of instances for inference
                model (Model) : The model being used for predictions
                cuda_device (int) : The cuda device being used for processing
                prediction_file (Optional[str]) : When non-empty, the
                    predictions are dumped to this path as JSON
                visualization_file (Optional[str]) : When non-empty and the
                    object was created with ``visualize=True``, passed to
                    ``self.visualize`` together with the predictions
                verbose (bool) : Log per-tag and average accuracies

            Returns:
                predictions (List[Dict]) : One dict per instance with the keys
                    * text (List[str]): The tokens
                    * pred (List[[str, float]]): The predicted labels and
                      probs. May hold several labels, or none
                    * gold (List[str]): The gold labels
                    * attn (Dict[str, List[float]]) : Attention values per tag
                    * pred_labels (List[str]): Predicted labels for
                      segmentation, produced by the child class
                    * gold_labels : The gold labels for segmentation
        """
        iterator = self._iterator(instances,
                                  num_epochs=1,
                                  shuffle=False,
                                  cuda_device=cuda_device,
                                  for_training=False)
        model.eval()
        num_batches = self._iterator.get_num_batches(instances)
        inference_generator_tqdm = Tqdm.tqdm(iterator, total=num_batches)

        predictions = []
        # Position of the current instance; batches arrive in input order.
        index = 0
        # Per-tag confusion counts over instances.
        matrix = {
            self._indexer.ix2tags[ix]: {
                "tp": 0.,
                "fp": 0,
                "fn": 0.,
                "tn": 0.
            }
            for ix in range(len(self._indexer.ix2tags))
        }

        for batch in inference_generator_tqdm:
            # Currently I don't support multi-gpu data parallel
            output_dict = model.decode(model(**batch))
            for ix in range(len(output_dict["preds"])):
                instance = instances[index]
                text = self._get_text_from_instance(instance)
                pred = output_dict["preds"][ix]
                gold = [
                    self._indexer.get_tag(label)
                    for label in instance.fields['labels'].labels
                ]
                attn = output_dict["attentions"][ix]
                gold_labels = instance.fields['tags'].labels
                assert all(len(attn[x]) == len(text) for x in attn)
                gold_labels = self._indexer.extract_relevant(gold_labels)
                pred_labels = self.get_segmentation_from_prediction(
                    text=text, preds_probs=pred, attns=attn)
                assert len(pred_labels) == len(gold_labels) == len(text)

                gold_set = set(gold)
                # Built directly from the pairs so that an instance with no
                # predicted label yields an empty set instead of failing to
                # unpack an empty zip.
                pred_set = {label for label, _ in pred}

                for tag in matrix:
                    in_gold = tag in gold_set
                    in_pred = tag in pred_set
                    if in_gold and in_pred:
                        matrix[tag]["tp"] += 1
                    elif in_pred:
                        matrix[tag]["fp"] += 1
                    elif in_gold:
                        matrix[tag]["fn"] += 1.
                    else:
                        matrix[tag]["tn"] += 1.

                # Plain floats so the predictions are JSON-serialisable.
                preds = [[x[0], float(x[1])] for x in pred]
                predictions.append({
                    "text": text,
                    "pred": preds,
                    "gold": gold,
                    "attn": attn,
                    "pred_labels": pred_labels,
                    "gold_labels": gold_labels
                })
                index += 1

        if prediction_file is not None and prediction_file != "":
            with open(prediction_file, "w") as f:
                json.dump(predictions, f, ensure_ascii=True, indent=4)

        if visualization_file is not None and self._visualize and \
                visualization_file != "":
            self.visualize(predictions, visualization_file)

        if verbose:
            accs = []
            for tag in matrix:
                total = sum(matrix[tag].values())
                # No instances were counted: report 0 rather than divide by zero.
                acc = (matrix[tag]["tp"] + matrix[tag]["tn"]) / total * 100. if total else 0.
                logger.info(f"Tag: {tag}, Acc: {acc:.2f}")
                accs.append(acc)
            if accs:
                avg_acc = sum(accs) / len(accs)
                logger.info(f"Average ACC: {avg_acc:.2f}")
            # NOTE(review): the returned (p, r, f) are not used here; the call
            # is kept in case it reports the scores itself. Assumed to belong
            # to the verbose report -- confirm against the original layout.
            fscore_from_preds(predictions, False)

        return predictions
def evaluate(model, evaluation_dataset, batch_size, cuda):
    """
    Evaluate a model on an evaluation dataset.

    Args:
        model: Model called as ``model(passage, question)``; it must return
            start/end logits and their (log-)softmax versions.
        evaluation_dataset: Dataset to evaluate on.
        batch_size: Number of instances per batch.
        cuda: When truthy, batches are placed on device 0, otherwise on CPU.

    Returns:
        Tuple ``(mean batch loss, span-start accuracy, span-end accuracy,
        span accuracy, exact match, F1)``.

    Note:
        The model is left in train mode on return.
    """
    # Set model to evaluation mode (turns off dropout and such)
    model.eval()

    # Create objects for calculating metrics.
    span_start_accuracy = CategoricalAccuracy()
    span_end_accuracy = CategoricalAccuracy()
    span_accuracy = BooleanAccuracy()
    squad_metrics = SquadEmAndF1()

    # Build an iterator that batches the data in its original order.
    evaluation_iterator = BasicIterator(batch_size=batch_size)

    # Get a generator of evaluation batches.
    num_evaluation_batches = evaluation_iterator.get_num_batches(
        evaluation_dataset)
    evaluation_generator = tqdm(evaluation_iterator(
        evaluation_dataset, num_epochs=1, shuffle=False,
        cuda_device=0 if cuda else -1,
        for_training=False),
        total=num_evaluation_batches, leave=False)

    batch_losses = 0
    for batch in evaluation_generator:
        # Extract the relevant data from the batch.
        passage = batch["passage"]["tokens"]
        question = batch["question"]["tokens"]
        span_start = batch["span_start"]
        span_end = batch["span_end"]
        metadata = batch.get("metadata", {})

        # Run data through model to get start and end logits.
        output_dict = model(passage, question)
        start_logits = output_dict["start_logits"]
        end_logits = output_dict["end_logits"]
        softmax_start_logits = output_dict["softmax_start_logits"]
        softmax_end_logits = output_dict["softmax_end_logits"]

        # Calculate loss for start and end indices.
        loss = nll_loss(softmax_start_logits, span_start.view(-1))
        loss += nll_loss(softmax_end_logits, span_end.view(-1))
        # `.data[0]` fails on 0-dim tensors in newer PyTorch releases; use
        # `.item()` when it exists and fall back for releases that predate it.
        batch_losses += loss.item() if hasattr(loss, "item") else loss.data[0]

        # Calculate categorical span start and end accuracy.
        span_start_accuracy(start_logits, span_start.view(-1))
        span_end_accuracy(end_logits, span_end.view(-1))

        # Compute the best span, and calculate overall span accuracy.
        best_span = get_best_span(start_logits, end_logits)
        span_accuracy(best_span, torch.stack([span_start, span_end], -1))

        # Calculate EM and F1 scores
        calculate_em_f1(best_span, metadata, passage.size(0),
                        squad_metrics)

    # Set the model back to train mode.
    model.train()

    # Extract the values from the metrics objects
    average_span_start_accuracy = span_start_accuracy.get_metric()
    average_span_end_accuracy = span_end_accuracy.get_metric()
    average_span_accuracy = span_accuracy.get_metric()
    average_em, average_f1 = squad_metrics.get_metric()
    return (batch_losses / num_evaluation_batches,
            average_span_start_accuracy,
            average_span_end_accuracy,
            average_span_accuracy,
            average_em,
            average_f1)
def evaluate(model, evaluation_dataset, batch_size, vocab, cuda):
    """
    Evaluate a model on an evaluation dataset.

    Args:
        model: Model called as ``model(passage, question)``; it must return
            start/end logits and their (log-)softmax versions.
        evaluation_dataset: Dataset to evaluate on.
        batch_size: Number of instances per batch.
        vocab: Vocabulary the iterator is indexed with.
        cuda: When truthy, batches are moved to device 0, otherwise left on CPU.

    Returns:
        Tuple ``(mean batch loss, span-start accuracy, span-end accuracy,
        span accuracy, exact match, F1)``.

    Note:
        The model is left in train mode on return.
    """
    # Set model to evaluation mode (turns off dropout and such)
    model.eval()

    # Create objects for calculating metrics.
    span_start_accuracy = CategoricalAccuracy()
    span_end_accuracy = CategoricalAccuracy()
    span_accuracy = BooleanAccuracy()
    squad_metrics = SquadEmAndF1()

    # Build an iterator that batches the data in its original order.
    evaluation_iterator = BasicIterator(batch_size=batch_size)
    # Index the instances with the vocabulary.
    # This converts string tokens to numerical indices.
    evaluation_iterator.index_with(vocab)

    # Get a generator of evaluation batches.
    num_evaluation_batches = evaluation_iterator.get_num_batches(
        evaluation_dataset)
    evaluation_generator = tqdm(evaluation_iterator(evaluation_dataset,
                                                    num_epochs=1,
                                                    shuffle=False),
                                total=num_evaluation_batches, leave=False)
    device_index = 0 if cuda else -1

    batch_losses = 0
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every forward pass.
    with torch.no_grad():
        for batch in evaluation_generator:
            # move the data to cuda if available
            batch = move_to_device(batch, cuda_device=device_index)

            # Extract the relevant data from the batch.
            passage = batch["passage"]["tokens"]
            question = batch["question"]["tokens"]
            span_start = batch["span_start"]
            span_end = batch["span_end"]
            metadata = batch.get("metadata", {})

            # Run data through model to get start and end logits.
            output_dict = model(passage, question)
            start_logits = output_dict["start_logits"]
            end_logits = output_dict["end_logits"]
            softmax_start_logits = output_dict["softmax_start_logits"]
            softmax_end_logits = output_dict["softmax_end_logits"]

            # Calculate loss for start and end indices.
            loss = nll_loss(softmax_start_logits, span_start.view(-1))
            loss += nll_loss(softmax_end_logits, span_end.view(-1))
            batch_losses += loss.item()

            # Calculate categorical span start and end accuracy.
            span_start_accuracy(start_logits, span_start.view(-1))
            span_end_accuracy(end_logits, span_end.view(-1))

            # Compute the best span, and calculate overall span accuracy.
            best_span = get_best_span(start_logits, end_logits)
            span_accuracy(best_span, torch.cat([span_start, span_end], -1))

            # Calculate EM and F1 scores
            calculate_em_f1(best_span, metadata, passage.size(0),
                            squad_metrics)

    # Set the model back to train mode.
    model.train()

    # Extract the values from the metrics objects
    average_span_start_accuracy = span_start_accuracy.get_metric()
    average_span_end_accuracy = span_end_accuracy.get_metric()
    average_span_accuracy = span_accuracy.get_metric()
    average_em, average_f1 = squad_metrics.get_metric()
    return (batch_losses / num_evaluation_batches,
            average_span_start_accuracy,
            average_span_end_accuracy,
            average_span_accuracy,
            average_em,
            average_f1)
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def main():
    """Pick the top-k evidence sentences per test example and evaluate the predictor on them.

    Sentence probabilities come either from the evidence classifier (and are
    then saved to ``sentence_ev_probs.p``) or from the pickle given with
    ``--probs``. Test metrics are printed to stdout.
    """
    parser = argparse.ArgumentParser(description='Evidence sentence classifier')
    parser.add_argument('--k', type=int, default=1,
                        help='number of evidence paragraphs to pick from the classifier (default: 1)')
    parser.add_argument('--probs', type=str, default=None,
                        help='Pickled sentence probs file (default: None)')
    args = parser.parse_args()

    with torch.no_grad():
        bert_token_indexer = {'bert': PretrainedBertIndexer('scibert/vocab.txt', max_pieces=512)}

        pipeline_train = _load_pickle('data/train_instances.p')
        pipeline_val = _load_pickle('data/val_instances.p')
        pipeline_test = _load_pickle('data/test_instances.p')

        pipeline_reader = PipelineDatasetReader(bert_token_indexer)
        p_train = pipeline_reader.read(pipeline_train)
        p_val = pipeline_reader.read(pipeline_val)
        p_test = pipeline_reader.read(pipeline_test)

        p_vocab = Vocabulary.from_instances(p_train + p_val + p_test)

        bert_token_embedding = PretrainedBertEmbedder(
            'scibert/weights.tar.gz', requires_grad=False
        )

        word_embeddings = BasicTextFieldEmbedder(
            {"bert": bert_token_embedding},
            {"bert": ['bert']},
            allow_unmatched_keys=True
        )

        ev_classifier = Classifier(word_embeddings=word_embeddings,
                                   vocab=p_vocab,
                                   loss='bce',
                                   hinge_margin=0)
        predictor = Oracle(word_embeddings=word_embeddings,
                           vocab=p_vocab)

        cuda_device = 0
        if torch.cuda.is_available():
            ev_classifier = ev_classifier.cuda()
            predictor = predictor.cuda()
        else:
            cuda_device = -1

        # Load the weights onto the CPU first so that checkpoints saved on a
        # GPU can also be restored on a machine without one; load_state_dict
        # then copies them to wherever the model's parameters live.
        ev_classifier.load_state_dict(
            torch.load('model_checkpoints/f_evidence_sentence_classifier_para/best.th',
                       map_location='cpu'))
        predictor.load_state_dict(
            torch.load('model_checkpoints/f_oracle_full/best.th', map_location='cpu'))

        logger.info('Classifier and Predictor models loaded successfully')
        ev_classifier.eval()
        predictor.eval()

        iterator = BasicIterator(batch_size=256)
        iterator.index_with(p_vocab)

        if args.probs is None:
            iterator_obj = iterator(p_test, num_epochs=1, shuffle=False)
            generator_tqdm = Tqdm.tqdm(iterator_obj, total=iterator.get_num_batches(p_test))

            output_probs = []
            for batch in generator_tqdm:
                batch = nn_util.move_to_device(batch, cuda_device)
                probs = ev_classifier.predict_evidence_probs(**batch)
                # Flatten batch by batch into one list of per-sentence entries.
                output_probs.extend(probs.cpu().numpy())

            logger.info('Obtained all sentence evidence probabilities - total {}'.format(len(output_probs)))
            with open('sentence_ev_probs.p', 'wb') as probs_file:
                pickle.dump(output_probs, probs_file)
        else:
            # User-supplied pickle: must come from a trusted source.
            output_probs = _load_pickle(args.probs)

        top_k_sentences = []
        prob_counter = 0
        for example in pipeline_test:
            spans = example['sentence_span']
            # Each candidate is a window of three consecutive sentences.
            sentences = [
                ' '.join(spans[start][0] + spans[start + 1][0] + spans[start + 2][0])
                for start in range(len(spans) - 2)
            ]
            # The probabilities are stored flat, in the same order as the windows.
            probs = list(output_probs[prob_counter: prob_counter + len(sentences)])
            prob_counter += len(sentences)

            sorted_sentences = sorted(zip(sentences, probs), key=lambda x: x[1], reverse=True)
            top_k = [s[0] for s in sorted_sentences[:args.k]]
            top_k_sentences.append({'I': example['I'],
                                    'C': example['C'],
                                    'O': example['O'],
                                    'y_label': example['y'][0][0],
                                    'evidence': ' '.join(top_k)})

        logger.info('Obtained the top sentences from the evidence classifier')

        predictor_reader = EIDatasetReader(bert_token_indexer)
        predictor_test = predictor_reader.read(top_k_sentences)

        test_metrics = evaluate(predictor, predictor_test, iterator,
                                cuda_device=cuda_device,
                                batch_weight_key="")

        print('Test Data statistics:')
        for key, value in test_metrics.items():
            print(str(key) + ': ' + str(value))