def _make_instance(sentence_tokens, question_tokens, answer_span, idx):
    """Build an AllenNLP Instance from a (sentence, question, answer-span) triple.

    The raw_* fields strip the first/last (boundary) tokens so the metadata
    stays human-readable.
    """
    fields = {}
    fields["raw_sentence"] = MetadataField(" ".join(sentence_tokens[1:-1]))
    fields["raw_question"] = MetadataField(" ".join(question_tokens[1:-1]))
    if model_preprocessing_interface.model_flags["uses_pair_embedding"]:
        # Pair-embedding models consume a single joint sequence.
        joint = model_preprocessing_interface.boundary_token_fn(
            sentence_tokens, question_tokens
        )
        fields["inputs"] = sentence_to_text_field(joint, indexers)
    else:
        # Otherwise each side is wrapped with boundary tokens independently.
        fields["sentence"] = sentence_to_text_field(
            model_preprocessing_interface.boundary_token_fn(sentence_tokens), indexers
        )
        fields["question"] = sentence_to_text_field(
            model_preprocessing_interface.boundary_token_fn(question_tokens), indexers
        )
    fields["span_start"] = NumericField(
        answer_span[0], label_namespace="span_start_labels"
    )
    fields["span_end"] = NumericField(
        answer_span[1], label_namespace="span_end_labels"
    )
    fields["idx"] = LabelField(idx, label_namespace="idxs", skip_indexing=True)
    return Instance(fields)
def _make_instance(example):
    """Convert one preprocessed QA example dict into an AllenNLP Instance."""
    fields = {}
    # Human-readable copies of the raw token lists.
    fields["raw_passage"] = MetadataField(" ".join(example["passage"]))
    fields["raw_question"] = MetadataField(" ".join(example["question"]))
    if model_preprocessing_interface.model_flags["uses_pair_embedding"]:
        # Joint passage+question sequence; start_offset is returned by
        # boundary_token_fn (presumably where the passage tokens begin in the
        # joint sequence — confirm against boundary_token_fn).
        joint, start_offset, _ = model_preprocessing_interface.boundary_token_fn(
            example["passage"], example["question"], get_offset=True
        )
        fields["inputs"] = sentence_to_text_field(joint, indexers)
    else:
        fields["passage"] = sentence_to_text_field(
            model_preprocessing_interface.boundary_token_fn(example["passage"]), indexers
        )
        fields["question"] = sentence_to_text_field(
            model_preprocessing_interface.boundary_token_fn(example["question"]), indexers
        )
        # No joint sequence, so the answer span needs no shifting.
        start_offset = 0
    # Shift the gold span by whatever offset the boundary tokens introduced.
    fields["span_start"] = NumericField(
        example["answer_span"][0] + start_offset, label_namespace="span_start_labels"
    )
    fields["span_end"] = NumericField(
        example["answer_span"][1] + start_offset, label_namespace="span_end_labels"
    )
    fields["start_offset"] = MetadataField(start_offset)
    fields["passage_str"] = MetadataField(example["passage_str"])
    fields["answer_str"] = MetadataField(example["answer_str"])
    fields["space_processed_token_map"] = MetadataField(
        example["space_processed_token_map"]
    )
    return Instance(fields)
def setUp(self):
    """Create a temp workspace and a WiC task with one mocked, indexed
    validation Instance, plus the mock trainer args the tests need."""
    self.temp_dir = tempfile.mkdtemp()
    self.path = os.path.join(self.temp_dir, "temp_dataset.tsv")
    self.wic = tasks.WiCTask(self.temp_dir, 100, "wic", tokenizer_name="MosesTokenizer")
    indexers = {"bert_cased": SingleIdTokenIndexer("bert-xe-cased")}
    self.wic.val_data = [
        Instance(
            {
                "sent1_str": MetadataField("Room and board yo."),
                "sent2_str": MetadataField("He nailed boards"),
                "idx": LabelField(1, skip_indexing=True),
                "idx2": NumericField(2),
                "idx1": NumericField(3),
                "inputs": self.sentence_to_text_field(
                    ["[CLS]", "Room", "and", "board", "yo", "[SEP]", "He", "nailed", "boards"],
                    indexers,
                ),
                # Fix: pass a bool, not the int 1, for the boolean flag
                # (matches the skip_indexing=True usage above; truthiness is
                # identical so behavior is unchanged).
                "labels": LabelField(0, skip_indexing=True),
            }
        )
    ]
    self.vocab = vocabulary.Vocabulary.from_instances(self.wic.val_data)
    self.vocab.add_token_to_namespace("True", "wic_tags")
    for instance in self.wic.val_data:
        instance.index_fields(self.vocab)
    # Minimal trainer arguments the tests read.
    self.args = mock.Mock()
    self.args.batch_size = 4
    self.args.cuda = -1  # -1 = run on CPU
    self.args.run_dir = self.temp_dir
    self.args.exp_dir = ""
def setUp(self):
    """
    Since we're testing write_preds, we need to mock model predictions and the
    parts of the model, arguments, and trainer needed to write to predictions.
    Unlike in update_metrics tests, the actual contents of the examples in
    val_data is not the most important as long as it adheres to the API
    necessary for examples of that task.
    """
    self.temp_dir = tempfile.mkdtemp()
    self.path = os.path.join(self.temp_dir, "temp_dataset.tsv")
    # Two GLUE-style tasks whose predictions will be written out.
    self.stsb = tasks.STSBTask(self.temp_dir, 100, "sts-b", tokenizer_name="MosesTokenizer")
    self.wic = tasks.WiCTask(self.temp_dir, 100, "wic", tokenizer_name="MosesTokenizer")
    # Mocked STS-B predictions: regression labels/preds plus raw sentence pairs.
    stsb_val_preds = pd.DataFrame(data=[
        {
            "idx": 0,
            "labels": 1.00,
            "preds": 1.00,
            "sent1_str": "A man with a hard hat is dancing.",
            "sent2_str": "A man wearing a hard hat is dancing",
        },
        {
            "idx": 1,
            "labels": 0.950,
            "preds": 0.34,
            "sent1_str": "A young child is riding a horse.",
            "sent2_str": "A child is riding a horse.",
        },
    ])
    # Mocked WiC predictions: binary classification labels/preds.
    wic_val_preds = pd.DataFrame(data=[
        {
            "idx": 0,
            "sent1": "Room and board. ",
            "sent2": "He nailed boards across the windows.",
            "labels": 0,
            "preds": 0,
        },
        {
            "idx": 1,
            "sent1": "Hook a fish",
            "sent2": "He hooked a snake accidentally.",
            "labels": 1,
            "preds": 1,
        },
    ])
    indexers = {"bert_cased": SingleIdTokenIndexer("bert-xe-cased")}
    # Four WiC validation Instances matching the field API the writer expects.
    self.wic.val_data = [
        Instance({
            "sent1_str": MetadataField("Room and board."),
            "sent2_str": MetadataField("He nailed boards"),
            "idx": LabelField(0, skip_indexing=True),
            "idx2": NumericField(2),
            "idx1": NumericField(3),
            "inputs": self.sentence_to_text_field(
                [
                    "[CLS]",
                    "Room",
                    "and",
                    "Board",
                    ".",
                    "[SEP]",
                    "He",
                    "nailed",
                    "boards",
                    "[SEP]",
                ],
                indexers,
            ),
            "labels": LabelField(0, skip_indexing=1),
        }),
        Instance({
            "sent1_str": MetadataField("C ##ir ##culate a rumor ."),
            "sent2_str": MetadataField("This letter is being circulated"),
            "idx": LabelField(1, skip_indexing=True),
            "idx2": NumericField(2),
            "idx1": NumericField(3),
            "inputs": self.sentence_to_text_field(
                [
                    "[CLS]",
                    "C",
                    "##ir",
                    "##culate",
                    "a",
                    "rumor",
                    "[SEP]",
                    "This",
                    "##let",
                    "##ter",
                    "is",
                    "being",
                    "c",
                    "##ir",
                    "##culated",
                    "[SEP]",
                ],
                indexers,
            ),
            "labels": LabelField(0, skip_indexing=1),
        }),
        Instance({
            "sent1_str": MetadataField("Hook a fish'"),
            "sent2_str": MetadataField("He hooked a snake accidentally"),
            "idx": LabelField(2, skip_indexing=True),
            "idx2": NumericField(2),
            "idx1": NumericField(3),
            "inputs": self.sentence_to_text_field(
                [
                    "[CLS]",
                    "Hook",
                    "a",
                    "fish",
                    "[SEP]",
                    "He",
                    "hooked",
                    "a",
                    "snake",
                    "accidentally",
                    "[SEP]",
                ],
                indexers,
            ),
            "labels": LabelField(1, skip_indexing=1),
        }),
        Instance({
            "sent1_str": MetadataField("For recreation he wrote poetry."),
            "sent2_str": MetadataField("Drug abuse is often regarded as recreation ."),
            "idx": LabelField(3, skip_indexing=True),
            "idx2": NumericField(2),
            "idx1": NumericField(3),
            "inputs": self.sentence_to_text_field(
                [
                    "[CLS]",
                    "For",
                    "re",
                    "##creation",
                    "he",
                    "wrote",
                    "poetry",
                    "[SEP]",
                    "Drug",
                    "abuse",
                    "is",
                    "often",
                    "re",
                    "##garded",
                    "as",
                    "re",
                    "##creation",
                    "[SEP]",
                ],
                indexers,
            ),
            "labels": LabelField(1, skip_indexing=1),
        }),
    ]
    # Predictions keyed by task name, as write_preds expects.
    self.val_preds = {"sts-b": stsb_val_preds, "wic": wic_val_preds}
    self.vocab = vocabulary.Vocabulary.from_instances(self.wic.val_data)
    self.vocab.add_token_to_namespace("True", "wic_tags")
    for data in self.wic.val_data:
        data.index_fields(self.vocab)
    self.glue_tasks = [self.stsb, self.wic]
    # Minimal trainer arguments the writer reads.
    self.args = mock.Mock()
    self.args.batch_size = 4
    self.args.cuda = -1  # -1 = CPU
    self.args.run_dir = self.temp_dir
    self.args.exp_dir = ""