Example #1
 def _load_split(data_file):
     """ Read a JSONL split and tokenize question stems and answer choices. """
     questions, choices, targs, id_str = [], [], [], []
     data = [json.loads(l) for l in open(data_file, encoding="utf-8")]
     for example in data:
         question = tokenize_and_truncate(
             self._tokenizer_name, "Q:" + example["question"]["stem"],
             self.max_seq_len)
         choices_dict = {
             a_choice["label"]:
             tokenize_and_truncate(self._tokenizer_name,
                                   "A:" + a_choice["text"],
                                   self.max_seq_len)
             for a_choice in example["question"]["choices"]
         }
         multiple_choices = [
             choices_dict[label] for label in self.choice_idx2label
         ]
         targ = self.label2choice_idx[
             example["answerKey"]] if "answerKey" in example else 0
         example_id = example["id"]
         questions.append(question)
         choices.append(multiple_choices)
         targs.append(targ)
         id_str.append(example_id)
     return [questions, choices, targs, id_str]
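For orientation, the loader in example #1 reads one JSON object per line with a question stem, labeled answer choices, and an optional answerKey. A minimal sketch of such a record follows; only the keys come from the code above, the concrete values are made up for illustration.

    import json

    record = {
        "id": "q-0001",
        "question": {
            "stem": "Where would you keep a spare pen?",
            "choices": [
                {"label": "A", "text": "desk drawer"},
                {"label": "B", "text": "river"},
            ],
        },
        "answerKey": "A",
    }
    print(json.dumps(record))  # _load_split expects one such object per line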
Example #2
    def _load_csv(self, input_file):
        """ Read a CSV split; tokenize each question+answer choice and the context. """
        import csv

        with open(input_file, "r") as csv_file:
            reader = csv.DictReader(csv_file)
            records = [record for record in reader]

        contexts, choices, targs, id_str = [], [], [], []
        for record in records:
            question = record["question"]

            ans_choices = [
                record["answer" + str(i)] for i in range(self.n_choices)
            ]
            qa_tok_choices = [
                tokenize_and_truncate(self._tokenizer_name,
                                      question + " " + ans_choices[i],
                                      self.max_seq_len)
                for i in range(len(ans_choices))
            ]
            max_ans_len = max([len(tok) for tok in qa_tok_choices])
            context = tokenize_and_truncate(self._tokenizer_name,
                                            record["context"],
                                            self.max_seq_len - max_ans_len)
            targ = int(record["label"]) if "label" in record else 0
            idx = record["id"]
            contexts.append(context)
            choices.append(qa_tok_choices)
            targs.append(targ)
            id_str.append(idx)
        return [contexts, choices, targs, id_str]
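The length budget in example #2 reserves room for the longest tokenized question+answer before truncating the context, so any context + choice concatenation fits within max_seq_len. A small illustration with made-up numbers:

    max_seq_len = 128
    qa_tok_choices = [["tok"] * 32, ["tok"] * 40]  # hypothetical tokenized choices
    max_ans_len = max(len(toks) for toks in qa_tok_choices)
    context_budget = max_seq_len - max_ans_len
    assert context_budget == 88  # 128 - 40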
Example #3
    def load_data_for_path(self, path):
        """ Load data """

        with open(path, encoding="utf-8") as data_fh:
            examples = []
            for example in data_fh:
                ex = json.loads(example)

                assert (
                    "version" in ex and ex["version"] == 1.1
                ), (
                    "MultiRC version is invalid! Example indices are likely incorrect. "
                    "Please re-download the data from super.gluebenchmark.com ."
                )

                # each example has a passage field -> (text, questions)
                # text is the passage, which requires some preprocessing
                # questions is a list of questions, has fields (question, sentences_used, answers)
                ex["passage"]["text"] = tokenize_and_truncate(
                    self.tokenizer_name, ex["passage"]["text"],
                    self.max_seq_len)
                for question in ex["passage"]["questions"]:
                    question["question"] = tokenize_and_truncate(
                        self.tokenizer_name, question["question"],
                        self.max_seq_len)
                    for answer in question["answers"]:
                        answer["text"] = tokenize_and_truncate(
                            self.tokenizer_name, answer["text"],
                            self.max_seq_len)
                examples.append(ex)
        return examples
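The field accesses in example #3 imply a record shape like the following sketch; only the keys are taken from the code above, the values are invented for illustration.

    ex = {
        "version": 1.1,
        "passage": {
            "text": "The plant reopened in 1998. It now employs 240 people.",
            "questions": [
                {
                    "question": "When did the plant reopen?",
                    "sentences_used": [0],
                    "answers": [{"text": "1998"}, {"text": "In 1998"}],
                },
            ],
        },
    }
    assert "version" in ex and ex["version"] == 1.1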
Example #4
 def tokenize_preserve_placeholder(sent, max_ent_length):
     """ Tokenize questions while preserving @placeholder token """
     sent_parts = sent.split("@placeholder")
     assert len(sent_parts) == 2
     placeholder_loc = len(
         tokenize_and_truncate(self.tokenizer_name, sent_parts[0],
                               self.max_seq_len - max_ent_length))
     sent_tok = tokenize_and_truncate(self.tokenizer_name, sent,
                                      self.max_seq_len - max_ent_length)
     return (sent_tok[:placeholder_loc] + ["@placeholder"] +
             sent_tok[placeholder_loc:])
Example #5
    def load_data_for_path(self, path, split):
        """ Load data """

        examples = []
        data = [json.loads(d) for d in open(path, encoding="utf-8")]
        for item in data:
            psg_id = item["idx"]
            psg = tokenize_and_truncate(self.tokenizer_name,
                                        item["passage"]["text"],
                                        self.max_seq_len)
            ent_idxs = item["passage"]["entities"]
            ents = [
                item["passage"]["text"][idx["start"]:idx["end"] + 1]
                for idx in ent_idxs
            ]
            qas = item["qas"]
            for qa in qas:
                qst = qa["query"]
                qst_id = qa["idx"]
                if "answers" in qa:
                    anss = [a["text"] for a in qa["answers"]]
                else:
                    anss = []
                ex = {
                    "passage": psg,
                    "ents": ents,
                    "query": qst,
                    "answers": anss,
                    "psg_id": f"{split}-{psg_id}",
                    "qst_id": qst_id,
                }
                examples.append(ex)

        return examples
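Note that the entity offsets in examples #5 and #12 are character spans with an inclusive end, which is why the slice adds 1. A tiny self-contained illustration (the text and offsets are made up):

    text = "Barack Obama visited Berlin."
    entity = {"start": 0, "end": 11}
    print(text[entity["start"]:entity["end"] + 1])  # 'Barack Obama'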
Example #6
 def load_csv(data_file):
     rows = pd.read_csv(data_file, encoding="utf-8")
     # column "1" holds "<label>\t<sentence>"; split it into label and text
     labels = rows["1"].apply(lambda x: str(x.split("\t")[0]))
     s1 = rows["1"].apply(lambda x: x.split("\t")[1])
     s1 = s1.apply(lambda x: tokenize_and_truncate(
         self._tokenizer_name, x, self.max_seq_len))
     return s1.tolist(), [], labels.tolist(), list(range(len(rows)))
Example #7
 def load_csv(data_file):
     rows = pd.read_csv(data_file, encoding="utf-8")
     # shuffle the rows and reset the index so positions stay contiguous
     rows = rows.sample(frac=1, axis=0).reset_index(drop=True)
     rows["s1"] = rows["2"].apply(lambda x: tokenize_and_truncate(
         self._tokenizer_name, x, self.max_seq_len))
     return rows["s1"].tolist(), [], rows["1"].tolist(), list(
         range(len(rows)))
Example #8
 def load_csv(data_file):
     rows = pd.read_csv(data_file, encoding="utf-8")
     rows["s1"] = rows["2"].apply(lambda x: tokenize_and_truncate(
         self._tokenizer_name, x, self.max_seq_len))
     # record the gold labels on the task as a side effect
     self.labels.append(rows["1"].tolist())
     return rows["s1"].tolist(), [], rows["1"].tolist(), list(
         range(len(rows)))
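Examples #7 and #8 assume a CSV whose header names the columns literally "1" and "2", with the label in column "1" and the raw sentence in column "2" (example #6 instead packs "label<TAB>sentence" into column "1"). A runnable sketch with made-up rows:

    import io

    import pandas as pd

    csv_text = "1,2\n0,the movie was dull\n1,a sharp and funny script\n"
    rows = pd.read_csv(io.StringIO(csv_text))
    print(rows["1"].tolist())  # [0, 1]
    print(rows["2"].tolist())  # ['the movie was dull', 'a sharp and funny script']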
Example #9
    def process_split(
            self, split, indexers,
            model_preprocessing_interface) -> Iterable[Type[Instance]]:
        """ Process split text into a list of AllenNLP Instances. """
        def is_answer(x, ys):
            """ Given a list of answers, determine if x is an answer """
            return x in ys

        def insert_ent(ent, template):
            """ Replace ent into template (query with @placeholder) """
            assert "@placeholder" in template, "No placeholder detected!"
            split_idx = template.index("@placeholder")
            return template[:split_idx] + ent + template[split_idx + 1:]

        def _make_instance(psg, qst, ans_str, label, psg_idx, qst_idx,
                           ans_idx):
            """ pq_id: passage-question ID """
            d = {}
            d["psg_str"] = MetadataField(" ".join(psg))
            d["qst_str"] = MetadataField(" ".join(qst))
            d["ans_str"] = MetadataField(ans_str)
            d["psg_idx"] = MetadataField(par_idx)
            d["qst_idx"] = MetadataField(qst_idx)
            d["ans_idx"] = MetadataField(ans_idx)
            d["idx"] = MetadataField(ans_idx)  # required by evaluate()
            if model_preprocessing_interface.model_flags[
                    "uses_pair_embedding"]:
                inp = model_preprocessing_interface.boundary_token_fn(psg, qst)
                d["psg_qst_ans"] = sentence_to_text_field(inp, indexers)
            else:
                d["psg"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(psg),
                    indexers)
                d["qst"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(qst),
                    indexers)
            d["label"] = LabelField(label,
                                    label_namespace="labels",
                                    skip_indexing=True)

            return Instance(d)

        for example in split:
            psg = example["passage"]
            qst_template = example["query"]

            ent_strs = example["ents"]
            ents = [
                tokenize_and_truncate(self._tokenizer_name, ent,
                                      self.max_seq_len) for ent in ent_strs
            ]

            anss = example["answers"]
            par_idx = example["psg_id"]
            qst_idx = example["qst_id"]
            for ent_idx, (ent, ent_str) in enumerate(zip(ents, ent_strs)):
                label = is_answer(ent_str, anss)
                qst = insert_ent(ent, qst_template)
                yield _make_instance(psg, qst, ent_str, label, par_idx,
                                     qst_idx, ent_idx)
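insert_ent in example #9 operates on token lists: the query template contains a single "@placeholder" token, which is swapped for the entity's tokens. A short illustration with invented tokens:

    template = ["Smith", "said", "@placeholder", "would", "visit", "the", "plant"]
    ent = ["President", "Obama"]
    split_idx = template.index("@placeholder")
    print(template[:split_idx] + ent + template[split_idx + 1:])
    # ['Smith', 'said', 'President', 'Obama', 'would', 'visit', 'the', 'plant']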
Example #10
File: qa.py  Project: vamshirapolu/jiant
 def tokenize_preserve_placeholder(sent):
     """ Tokenize questions while preserving @placeholder token """
     sent_parts = sent.split("@placeholder")
     assert len(sent_parts) == 2
     sent_parts = [
         tokenize_and_truncate(self.tokenizer_name, s, self.max_seq_len)
         for s in sent_parts
     ]
     return sent_parts[0] + ["@placeholder"] + sent_parts[1]
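To see what the splice in example #10 produces, here is a self-contained sketch in which a plain whitespace split stands in for tokenize_and_truncate (an assumption made only to keep the sketch runnable; the real helper applies the task's tokenizer and truncates to max_seq_len):

    def _tokenize(text, max_len=20):
        # stand-in for tokenize_and_truncate
        return text.split()[:max_len]

    query = "Smith said @placeholder would visit the plant on Friday"
    left, right = query.split("@placeholder")
    print(_tokenize(left) + ["@placeholder"] + _tokenize(right))
    # ['Smith', 'said', '@placeholder', 'would', 'visit', 'the', 'plant', 'on', 'Friday']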
Example #11
 def get_data_iter(self, path):
     """Loading data file and tokenizing the text
     Args:
         path: (str) data file path
     """
     with open(path) as txt_fh:
         for row in txt_fh:
             toks = row.strip()
             if not toks:
                 continue
             yield tokenize_and_truncate(self._tokenizer_name, toks,
                                         self.max_seq_len)
Example #12
    def load_data_for_path(self, path, split):
        """ Load data """
        def tokenize_preserve_placeholder(sent, max_ent_length):
            """ Tokenize questions while preserving @placeholder token """
            sent_parts = sent.split("@placeholder")
            assert len(sent_parts) == 2
            placeholder_loc = len(
                tokenize_and_truncate(self.tokenizer_name, sent_parts[0],
                                      self.max_seq_len - max_ent_length))
            sent_tok = tokenize_and_truncate(self.tokenizer_name, sent,
                                             self.max_seq_len - max_ent_length)
            return (sent_tok[:placeholder_loc] + ["@placeholder"] +
                    sent_tok[placeholder_loc:])

        examples = []
        data = [json.loads(d) for d in open(path, encoding="utf-8")]
        for item in data:
            psg_id = item["idx"]
            psg = tokenize_and_truncate(self.tokenizer_name,
                                        item["passage"]["text"],
                                        self.max_seq_len)
            ent_idxs = item["passage"]["entities"]
            ents = [
                item["passage"]["text"][idx["start"]:idx["end"] + 1]
                for idx in ent_idxs
            ]
            max_ent_length = max(
                [idx["end"] - idx["start"] + 1 for idx in ent_idxs])
            qas = item["qas"]
            for qa in qas:
                qst = tokenize_preserve_placeholder(qa["query"],
                                                    max_ent_length)
                qst_id = qa["idx"]
                if "answers" in qa:
                    anss = [a["text"] for a in qa["answers"]]
                else:
                    anss = []
                ex = {
                    "passage": psg,
                    "ents": ents,
                    "query": qst,
                    "answers": anss,
                    "psg_id": f"{split}-{psg_id}",
                    "qst_id": qst_id,
                }
                examples.append(ex)

        return examples
Example #13
 def _process_sentence(self, sent):
     return tokenize_and_truncate(tokenizer_name=self.tokenizer_name,
                                  sent=sent,
                                  max_seq_len=self.max_seq_len)
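All of the snippets above rely on the shared helper tokenize_and_truncate(tokenizer_name, sent, max_seq_len), whose implementation is not shown on this page. As a rough mental model only (an assumption, not the project's actual code), it behaves roughly like the sketch below.

    def tokenize_and_truncate_sketch(tokenizer_name, sent, max_seq_len):
        # Assumption: the real helper looks up the tokenizer named by
        # tokenizer_name (e.g. a wordpiece tokenizer) and truncates the
        # token sequence to max_seq_len; a whitespace split is used here
        # only to keep the sketch self-contained.
        tokens = sent.split()
        return tokens[:max_seq_len]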