def _load_split(self, data_file):
    """ Load a multiple-choice JSONL split into parallel question/choice/target/id lists. """
    questions, choices, targs, id_str = [], [], [], []
    data = [json.loads(l) for l in open(data_file, encoding="utf-8")]
    for example in data:
        question = tokenize_and_truncate(
            self._tokenizer_name, "Q:" + example["question"]["stem"], self.max_seq_len
        )
        choices_dict = {
            a_choice["label"]: tokenize_and_truncate(
                self._tokenizer_name, "A:" + a_choice["text"], self.max_seq_len
            )
            for a_choice in example["question"]["choices"]
        }
        multiple_choices = [choices_dict[label] for label in self.choice_idx2label]
        # Unlabeled (test) examples carry no "answerKey"; default the target to 0.
        targ = self.label2choice_idx[example["answerKey"]] if "answerKey" in example else 0
        example_id = example["id"]
        questions.append(question)
        choices.append(multiple_choices)
        targs.append(targ)
        id_str.append(example_id)
    return [questions, choices, targs, id_str]

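# A hedged sketch of the JSONL record shape _load_split expects, inferred only from the
# field accesses above (real files may carry extra fields this loader ignores); the
# values below are placeholders, not real dataset content. Note the "label" values of
# the choices must cover every label in self.choice_idx2label.
_example_jsonl_record = {
    "id": "example-0",                                  # copied into id_str
    "question": {
        "stem": "Placeholder question text?",           # prefixed with "Q:" and tokenized
        "choices": [
            {"label": "A", "text": "first choice"},     # each prefixed with "A:" and tokenized
            {"label": "B", "text": "second choice"},
        ],
    },
    "answerKey": "B",                                   # mapped through self.label2choice_idx; absent at test time
}
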
def _load_csv(self, input_file):
    import csv

    with open(input_file, "r") as csv_file:
        reader = csv.DictReader(csv_file)
        records = [record for record in reader]

    contexts, choices, targs, id_str = [], [], [], []
    for record in records:
        question = record["question"]
        ans_choices = [record["answer" + str(i)] for i in range(self.n_choices)]
        qa_tok_choices = [
            tokenize_and_truncate(
                self._tokenizer_name, question + " " + ans_choices[i], self.max_seq_len
            )
            for i in range(len(ans_choices))
        ]
        # Reserve room for the longest question+answer pair when truncating the context.
        max_ans_len = max([len(tok) for tok in qa_tok_choices])
        context = tokenize_and_truncate(
            self._tokenizer_name, record["context"], self.max_seq_len - max_ans_len
        )
        targ = int(record["label"]) if "label" in record else 0
        idx = record["id"]
        contexts.append(context)
        choices.append(qa_tok_choices)
        targs.append(targ)
        id_str.append(idx)
    return [contexts, choices, targs, id_str]

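# A hedged sketch of the CSV layout _load_csv expects, inferred from the DictReader keys
# used above (shown here assuming self.n_choices == 2 for brevity). Only the header
# names matter to csv.DictReader, not the column order; values are placeholders.
_example_csv_header = ["id", "context", "question", "answer0", "answer1", "label"]
_example_csv_row = {
    "id": "example-0",
    "context": "Placeholder context paragraph.",   # truncated to leave room for the longest Q+A pair
    "question": "Placeholder question?",
    "answer0": "first candidate answer",
    "answer1": "second candidate answer",
    "label": "1",                                   # index of the correct answer; absent at test time
}
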
def load_data_for_path(self, path):
    """ Load data """
    with open(path, encoding="utf-8") as data_fh:
        examples = []
        for example in data_fh:
            ex = json.loads(example)
            assert (
                "version" in ex and ex["version"] == 1.1
            ), (
                "MultiRC version is invalid! Example indices are likely incorrect. "
                "Please re-download the data from super.gluebenchmark.com."
            )
            # Each example has a "passage" field -> (text, questions).
            # "text" is the passage, which requires some preprocessing.
            # "questions" is a list of questions with fields (question, sentences_used, answers).
            ex["passage"]["text"] = tokenize_and_truncate(
                self.tokenizer_name, ex["passage"]["text"], self.max_seq_len
            )
            for question in ex["passage"]["questions"]:
                question["question"] = tokenize_and_truncate(
                    self.tokenizer_name, question["question"], self.max_seq_len
                )
                for answer in question["answers"]:
                    answer["text"] = tokenize_and_truncate(
                        self.tokenizer_name, answer["text"], self.max_seq_len
                    )
            examples.append(ex)
    return examples

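# A hedged sketch of one MultiRC JSON line as consumed above, restricted to the fields
# the loader actually touches (real records carry additional fields, e.g. answer labels,
# which are left in place untouched); values are placeholders. The loader tokenizes the
# passage, question, and answer texts in place and keeps everything else as-is.
_example_multirc_record = {
    "version": 1.1,                                        # asserted before any processing
    "passage": {
        "text": "Placeholder passage text.",               # tokenized and truncated in place
        "questions": [
            {
                "question": "Placeholder question?",        # tokenized in place
                "answers": [{"text": "candidate answer"}],  # each answer text tokenized in place
            }
        ],
    },
}
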
def tokenize_preserve_placeholder(sent, max_ent_length):
    """ Tokenize questions while preserving @placeholder token """
    sent_parts = sent.split("@placeholder")
    assert len(sent_parts) == 2
    placeholder_loc = len(
        tokenize_and_truncate(
            self.tokenizer_name, sent_parts[0], self.max_seq_len - max_ent_length
        )
    )
    sent_tok = tokenize_and_truncate(
        self.tokenizer_name, sent, self.max_seq_len - max_ent_length
    )
    return sent_tok[:placeholder_loc] + ["@placeholder"] + sent_tok[placeholder_loc:]

def load_data_for_path(self, path, split):
    """ Load data """
    examples = []
    data = [json.loads(d) for d in open(path, encoding="utf-8")]
    for item in data:
        psg_id = item["idx"]
        psg = tokenize_and_truncate(
            self.tokenizer_name, item["passage"]["text"], self.max_seq_len
        )
        ent_idxs = item["passage"]["entities"]
        ents = [
            item["passage"]["text"][idx["start"] : idx["end"] + 1] for idx in ent_idxs
        ]
        qas = item["qas"]
        for qa in qas:
            qst = qa["query"]
            qst_id = qa["idx"]
            if "answers" in qa:
                anss = [a["text"] for a in qa["answers"]]
            else:
                anss = []
            ex = {
                "passage": psg,
                "ents": ents,
                "query": qst,
                "answers": anss,
                "psg_id": f"{split}-{psg_id}",
                "qst_id": qst_id,
            }
            examples.append(ex)
    return examples

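# A hedged sketch of one JSON line as read above, limited to the fields this loader
# accesses; entity spans appear to be character offsets with an inclusive "end", which is
# why the slice above adds 1. Values are placeholders.
_example_passage_record = {
    "idx": 0,
    "passage": {
        "text": "Placeholder passage mentioning EntityName.",
        "entities": [{"start": 31, "end": 40}],        # character span of "EntityName"
    },
    "qas": [
        {
            "idx": 0,
            "query": "@placeholder appears in the passage.",
            "answers": [{"text": "EntityName"}],        # omitted in unlabeled test splits
        }
    ],
}
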
def load_csv(data_file):
    # Column "1" holds tab-separated "label\tsentence" strings; split them apart.
    rows = pd.read_csv(data_file, encoding="utf-8")
    labels = rows["1"].apply(lambda x: str(x.split("\t")[0]))
    s1 = rows["1"].apply(lambda x: x.split("\t")[1])
    s1 = s1.apply(
        lambda x: tokenize_and_truncate(self._tokenizer_name, x, self.max_seq_len)
    )
    return s1.tolist(), [], labels.tolist(), list(range(len(rows)))

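# A hedged illustration of the layout load_csv assumes: a header column literally named
# "1" whose cells hold tab-separated "label\tsentence" strings; the label is the text
# before the first tab and the sentence is the text after it. The DataFrame below is a
# stand-in for what pd.read_csv would produce, not real data.
import pandas as pd

_demo = pd.DataFrame({"1": ["0\tPlaceholder sentence text."]})
_demo_labels = _demo["1"].apply(lambda x: str(x.split("\t")[0]))  # -> "0"
_demo_sents = _demo["1"].apply(lambda x: x.split("\t")[1])        # -> "Placeholder sentence text."
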
def load_csv(data_file):
    rows = pd.read_csv(data_file, encoding="utf-8")
    rows = rows.sample(frac=1, axis=0).reset_index(drop=True)
    rows["s1"] = rows["2"].apply(
        lambda x: tokenize_and_truncate(self._tokenizer_name, x, self.max_seq_len)
    )
    return rows["s1"].tolist(), [], rows["1"].tolist(), list(range(len(rows)))

def load_csv(data_file):
    rows = pd.read_csv(data_file, encoding="utf-8")
    rows["s1"] = rows["2"].apply(
        lambda x: tokenize_and_truncate(self._tokenizer_name, x, self.max_seq_len)
    )
    # Record this split's labels on self.labels in addition to returning them.
    self.labels.append(rows["1"].tolist())
    return rows["s1"].tolist(), [], rows["1"].tolist(), list(range(len(rows)))

def process_split(
    self, split, indexers, model_preprocessing_interface
) -> Iterable[Type[Instance]]:
    """ Process split text into a list of AllenNLP Instances. """

    def is_answer(x, ys):
        """ Given a list of answers, determine if x is an answer """
        return x in ys

    def insert_ent(ent, template):
        """ Replace ent into template (query with @placeholder) """
        assert "@placeholder" in template, "No placeholder detected!"
        split_idx = template.index("@placeholder")
        return template[:split_idx] + ent + template[split_idx + 1 :]

    def _make_instance(psg, qst, ans_str, label, psg_idx, qst_idx, ans_idx):
        """ Build an Instance for one (passage, query, candidate-entity) triple. """
        d = {}
        d["psg_str"] = MetadataField(" ".join(psg))
        d["qst_str"] = MetadataField(" ".join(qst))
        d["ans_str"] = MetadataField(ans_str)
        d["psg_idx"] = MetadataField(psg_idx)
        d["qst_idx"] = MetadataField(qst_idx)
        d["ans_idx"] = MetadataField(ans_idx)
        d["idx"] = MetadataField(ans_idx)  # required by evaluate()
        if model_preprocessing_interface.model_flags["uses_pair_embedding"]:
            inp = model_preprocessing_interface.boundary_token_fn(psg, qst)
            d["psg_qst_ans"] = sentence_to_text_field(inp, indexers)
        else:
            d["psg"] = sentence_to_text_field(
                model_preprocessing_interface.boundary_token_fn(psg), indexers
            )
            d["qst"] = sentence_to_text_field(
                model_preprocessing_interface.boundary_token_fn(qst), indexers
            )
        d["label"] = LabelField(label, label_namespace="labels", skip_indexing=True)
        return Instance(d)

    for example in split:
        psg = example["passage"]
        qst_template = example["query"]
        ent_strs = example["ents"]
        ents = [
            tokenize_and_truncate(self._tokenizer_name, ent, self.max_seq_len)
            for ent in ent_strs
        ]
        anss = example["answers"]
        par_idx = example["psg_id"]
        qst_idx = example["qst_id"]
        for ent_idx, (ent, ent_str) in enumerate(zip(ents, ent_strs)):
            label = is_answer(ent_str, anss)
            qst = insert_ent(ent, qst_template)
            yield _make_instance(psg, qst, ent_str, label, par_idx, qst_idx, ent_idx)

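# A small worked example of the insert_ent helper above (illustrative tokens): the single
# "@placeholder" token in the tokenized query template is replaced by the entity's
# tokens, which may occupy several positions. The logic is repeated here only because
# insert_ent itself is nested inside process_split.
def _demo_insert_ent(ent, template):
    split_idx = template.index("@placeholder")
    return template[:split_idx] + ent + template[split_idx + 1 :]

assert _demo_insert_ent(["Team", "Name"], ["@placeholder", "won", "the", "match", "."]) == [
    "Team", "Name", "won", "the", "match", ".",
]
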
def tokenize_preserve_placeholder(sent):
    """ Tokenize questions while preserving @placeholder token """
    sent_parts = sent.split("@placeholder")
    assert len(sent_parts) == 2
    sent_parts = [
        tokenize_and_truncate(self.tokenizer_name, s, self.max_seq_len)
        for s in sent_parts
    ]
    return sent_parts[0] + ["@placeholder"] + sent_parts[1]

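# A minimal sketch of the idea above, substituting plain whitespace splitting for
# tokenize_and_truncate (the real tokenizer may produce subword pieces and truncate):
# the text on either side of "@placeholder" is tokenized separately and the literal
# "@placeholder" token is re-inserted between the two halves.
def _demo_preserve_placeholder(sent):
    left, right = sent.split("@placeholder")
    return left.split() + ["@placeholder"] + right.split()

assert _demo_preserve_placeholder("the winner was @placeholder today") == [
    "the", "winner", "was", "@placeholder", "today",
]
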
def get_data_iter(self, path):
    """Load the data file and tokenize the text.

    Args:
        path: (str) data file path
    """
    with open(path) as txt_fh:
        for row in txt_fh:
            toks = row.strip()
            if not toks:
                continue
            yield tokenize_and_truncate(self._tokenizer_name, toks, self.max_seq_len)

def load_data_for_path(self, path, split):
    """ Load data """

    def tokenize_preserve_placeholder(sent, max_ent_length):
        """ Tokenize questions while preserving @placeholder token """
        sent_parts = sent.split("@placeholder")
        assert len(sent_parts) == 2
        placeholder_loc = len(
            tokenize_and_truncate(
                self.tokenizer_name, sent_parts[0], self.max_seq_len - max_ent_length
            )
        )
        sent_tok = tokenize_and_truncate(
            self.tokenizer_name, sent, self.max_seq_len - max_ent_length
        )
        return sent_tok[:placeholder_loc] + ["@placeholder"] + sent_tok[placeholder_loc:]

    examples = []
    data = [json.loads(d) for d in open(path, encoding="utf-8")]
    for item in data:
        psg_id = item["idx"]
        psg = tokenize_and_truncate(
            self.tokenizer_name, item["passage"]["text"], self.max_seq_len
        )
        ent_idxs = item["passage"]["entities"]
        ents = [
            item["passage"]["text"][idx["start"] : idx["end"] + 1] for idx in ent_idxs
        ]
        max_ent_length = max([idx["end"] - idx["start"] + 1 for idx in ent_idxs])
        qas = item["qas"]
        for qa in qas:
            qst = tokenize_preserve_placeholder(qa["query"], max_ent_length)
            qst_id = qa["idx"]
            if "answers" in qa:
                anss = [a["text"] for a in qa["answers"]]
            else:
                anss = []
            ex = {
                "passage": psg,
                "ents": ents,
                "query": qst,
                "answers": anss,
                "psg_id": f"{split}-{psg_id}",
                "qst_id": qst_id,
            }
            examples.append(ex)
    return examples

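# A hedged worked example of the truncation budget above (illustrative numbers): with
# max_seq_len = 20 and entity spans of character lengths 4, 7 and 11, max_ent_length is
# 11, so each query is tokenized with a budget of 20 - 11 = 9. Note the entity lengths
# are measured in characters (end - start + 1) while the budget is in tokens, so the
# reduction appears to act as a conservative bound rather than an exact token count.
_demo_ent_idxs = [{"start": 0, "end": 3}, {"start": 10, "end": 16}, {"start": 20, "end": 30}]
_demo_max_ent_length = max(idx["end"] - idx["start"] + 1 for idx in _demo_ent_idxs)
assert _demo_max_ent_length == 11
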
def _process_sentence(self, sent):
    return tokenize_and_truncate(
        tokenizer_name=self.tokenizer_name, sent=sent, max_seq_len=self.max_seq_len
    )