Exemplo n.º 1
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Build one ``InputExample`` per (passage, question, answer) triple.

        Every line of the file at ``path`` is parsed as a JSON object holding an
        ``idx`` and a ``passage`` with ``text`` and ``questions``; each question
        carries its own ``idx``, ``question`` text and a list of ``answers``.

        :param path: path of the UTF-8 encoded JSON-lines file to read
        :param set_type: split name, used as the prefix of every guid
        :return: the examples in file order; ``label`` is ``None`` for answers
            that have no ``label`` key
        """
        collected = []

        with open(path, encoding='utf8') as handle:
            for raw_line in handle:
                record = json.loads(raw_line)

                passage_idx = record['idx']
                passage = record['passage']
                passage_text = punctuation_standardization(passage['text'])

                for question_entry in passage['questions']:
                    question_text = punctuation_standardization(question_entry["question"])
                    question_idx = question_entry['idx']

                    for answer_entry in question_entry["answers"]:
                        answer_label = answer_entry.get("label")
                        answer_idx = answer_entry["idx"]
                        collected.append(InputExample(
                            guid=f'{set_type}-p{passage_idx}-q{question_idx}-a{answer_idx}',
                            text_a=passage_text,
                            text_b=question_text,
                            label=answer_label,
                            meta={
                                'passage_idx': passage_idx,
                                'question_idx': question_idx,
                                'answer_idx': answer_idx,
                                'answer': punctuation_standardization(answer_entry["text"]),
                            },
                            idx=[passage_idx, question_idx, answer_idx],
                        ))

        # Summary statistics are only logged, never returned.
        distinct_questions = {item.meta['question_idx'] for item in collected}
        label_counts = Counter(item.label for item in collected)
        print_rank_0(
            f"Returning {len(collected)} examples corresponding to {len(distinct_questions)} questions with label "
            f"distribution {list(label_counts.items())}")
        return collected
Exemplo n.º 2
0
 def create_examples(self, split):
     """Load the parallel ``.source`` / ``.target`` files of a split as examples.

     ``split`` must be ``"train"``, ``"dev"`` or ``"test"``; ``"dev"`` is read
     from the files named ``val.*``. Any other value raises
     ``NotImplementedError``. Each line is stripped, passed through
     ``punctuation_standardization`` and, for the ``gigaword`` and ``cnn_dm``
     tasks, through the task's detokenizer.

     Returns a list of ``InputExample`` whose ``text_a`` is the source line,
     ``text_b`` the target line, and whose ``meta["ref"]`` is the target line
     after an encode/decode round trip through ``self.tokenizer``.
     """
     if split not in ("train", "dev", "test"):
         raise NotImplementedError(split)
     filename = "val" if split == "dev" else split

     print_rank_0(
         f"Creating {self.task}-{split} dataset from {self.data_dir}")

     detokenizer = None
     if self.task == "gigaword":
         detokenizer = gigaword_detokenize
     elif self.task == "cnn_dm":
         detokenizer = cnndm_detokenize

     def _load_lines(suffix, is_target):
         # Read one side of the parallel corpus, cleaning every line.
         cleaned_lines = []
         file_path = os.path.join(self.data_dir, f"{filename}.{suffix}")
         with open(file_path, encoding='utf-8') as file:
             for raw in file:
                 cleaned = punctuation_standardization(raw.strip())
                 if detokenizer:
                     if is_target:
                         cleaned = detokenizer(cleaned, is_target=True)
                     else:
                         cleaned = detokenizer(cleaned)
                 cleaned_lines.append(cleaned)
         return cleaned_lines

     source_texts = _load_lines("source", is_target=False)
     target_texts = _load_lines("target", is_target=True)
     assert len(source_texts) == len(target_texts)

     example_list = []
     for idx, (source_text, target_text) in enumerate(zip(source_texts, target_texts)):
         processed = idx + 1
         if processed % 20000 == 0:
             print_rank_0(f"Complete {processed} examples")

         target_ids = self.tokenizer.EncodeAsIds(target_text).tokenization
         meta = {"ref": self.tokenizer.DecodeIds(target_ids)}
         example = InputExample(guid="%s-%s" % (split, idx),
                                text_a=source_text,
                                text_b=target_text,
                                meta=meta)

         # Log the first ten pairs as a quick sanity check of the preprocessing.
         if idx < 10:
             print_rank_0(
                 (source_text.encode('utf-8'), target_text.encode('utf-8'),
                  meta["ref"].encode('utf-8')))
         example_list.append(example)
     return example_list
Exemplo n.º 3
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Turn the rows of a TSV file into sentence-pair ``InputExample`` objects.

        The table returned by ``read_tsv(path)`` must provide ``sentence1`` and
        ``sentence2`` columns; ``gold_label`` is optional and yields a ``None``
        label when absent.

        :param path: path handed to ``read_tsv``
        :param set_type: split name, used as the guid prefix
        :return: one example per row, in row order
        """
        table = read_tsv(path)

        return [
            InputExample(
                guid=f"{set_type}-{row_idx}",
                text_a=punctuation_standardization(record['sentence1']),
                text_b=punctuation_standardization(record['sentence2']),
                label=record.get('gold_label', None),
            )
            for row_idx, record in table.iterrows()
        ]
Exemplo n.º 4
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Read a three-column CSV file and build one ``InputExample`` per row.

        Every row must unpack into exactly ``label, headline, body``; a row with
        a different number of fields raises ``ValueError``. Backslash characters
        in the headline and body are replaced by spaces before the text is
        passed through ``punctuation_standardization``.

        :param path: path of the comma-separated file to read (decoded as UTF-8)
        :param set_type: split name, used as the guid prefix
        :return: one example per row, in file order, with the raw CSV label
        """
        examples = []

        # Decode explicitly as UTF-8 rather than with the platform's
        # locale-dependent default, so the same file parses identically
        # on every machine.
        with open(path, encoding='utf8') as f:
            reader = csv.reader(f, delimiter=',')
            for idx, row in enumerate(reader):
                label, headline, body = row
                guid = "%s-%s" % (set_type, idx)
                text_a = punctuation_standardization(headline.replace('\\', ' '))
                text_b = punctuation_standardization(body.replace('\\', ' '))

                example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
                examples.append(example)

        return examples
Exemplo n.º 5
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Parse a JSON-lines file of passage/question records into examples.

        Each line must contain ``idx``, ``passage`` and ``question``. When a
        ``label`` key is present its value is stringified and lower-cased;
        otherwise the example's label is ``None``.

        :param path: path of the UTF-8 encoded JSON-lines file
        :param set_type: split name, used as the guid prefix
        :return: one example per line, in file order
        """
        results = []

        with open(path, encoding='utf8') as handle:
            for raw_line in handle:
                record = json.loads(raw_line)
                idx = record['idx']

                if 'label' in record:
                    label = str(record['label']).lower()
                else:
                    label = None

                results.append(InputExample(
                    guid="%s-%s" % (set_type, idx),
                    text_a=punctuation_standardization(record['passage']),
                    text_b=punctuation_standardization(record['question']),
                    label=label,
                    idx=idx,
                ))

        return results
Exemplo n.º 6
0
 def _create_examples(path: str, set_type: str) -> List[InputExample]:
     """Parse a JSON-lines file of sentence pairs that share a target word.

     Each line must contain ``idx``, ``sentence1``, ``sentence2`` and ``word``.
     A string ``idx`` is converted with ``int``. The label is the string
     ``"true"`` when the record has a truthy ``label`` value and ``"false"``
     otherwise, which includes records without a ``label`` key.

     :param path: path of the UTF-8 encoded JSON-lines file
     :param set_type: split name, used as the guid prefix
     :return: one example per line, in file order
     """
     results = []
     with open(path, encoding='utf8') as handle:
         for raw_line in handle:
             record = json.loads(raw_line)

             idx = record['idx']
             if isinstance(idx, str):
                 idx = int(idx)

             label = "false"
             if record.get('label'):
                 label = "true"

             results.append(InputExample(
                 guid="%s-%s" % (set_type, idx),
                 text_a=punctuation_standardization(record['sentence1']),
                 text_b=punctuation_standardization(record['sentence2']),
                 label=label,
                 idx=idx,
                 meta={'word': record['word']},
             ))
     return results
Exemplo n.º 7
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Read a four-column CSV file of question/answer rows into examples.

        Every row must unpack into ``label, question_title, question_body,
        answer``. In each text field the two-character sequence backslash + ``n``
        and any remaining backslash are replaced by a space. Title and body are
        joined with a single space to form ``text_a``; the answer is ``text_b``.

        :param path: path of the UTF-8 encoded comma-separated file
        :param set_type: split name, used as the guid prefix
        :return: one example per row, in file order, with the raw CSV label
        """

        def _strip_escapes(raw: str) -> str:
            # Literal "\n" sequences first, then any leftover backslashes.
            return raw.replace('\\n', ' ').replace('\\', ' ')

        results = []

        with open(path, encoding='utf8') as handle:
            for row_idx, fields in enumerate(csv.reader(handle, delimiter=',')):
                label, question_title, question_body, answer = fields

                question = ' '.join([_strip_escapes(question_title), _strip_escapes(question_body)])
                text_a = punctuation_standardization(question)
                text_b = punctuation_standardization(_strip_escapes(answer))

                results.append(InputExample(guid="%s-%s" % (set_type, row_idx), text_a=text_a,
                                            text_b=text_b, label=label))

        return results
Exemplo n.º 8
0
    def _create_examples(self, path: str) -> List[InputExample]:
        """Parse a JSON-lines file of question/comment pairs, filtered by language.

        Each line must contain ``label``, ``id``, ``question``, ``comment`` and
        ``language``. When ``self.language`` is not ``None`` only records whose
        ``language`` equals it are kept; otherwise every record is kept.

        :param path: path of the UTF-8 encoded JSON-lines file
        :return: the kept examples in file order, using the record ``id`` as guid
        """
        kept = []

        with open(path, encoding='utf8') as handle:
            for raw_line in handle:
                record = json.loads(raw_line)

                label = record['label']
                record_id = record['id']
                question = punctuation_standardization(record['question'])
                comment = punctuation_standardization(record['comment'])
                record_language = record['language']

                if self.language is None or record_language == self.language:
                    kept.append(InputExample(guid=record_id, text_a=question, text_b=comment, label=label))

        return kept
Exemplo n.º 9
0
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Build single-sentence examples from a TSV file.

        For the ``'test'`` split the file is read with its header row and the
        text is taken from the ``sentence`` column, with no label. For every
        other split the file is read with ``header=None``, the text comes from
        column ``3`` and the label from column ``1``.

        :param path: path handed to ``read_tsv``
        :param set_type: split name; selects the layout and prefixes the guid
        :return: one example per row, in row order
        """
        is_test = set_type == 'test'
        table = read_tsv(path) if is_test else read_tsv(path, header=None)

        results = []
        for row_idx, record in table.iterrows():
            guid = f"{set_type}-{row_idx}"
            if is_test:
                sentence = punctuation_standardization(record['sentence'])
                label = None
            else:
                sentence = punctuation_standardization(record[3])
                label = record[1]
            results.append(InputExample(guid=guid, text_a=sentence, label=label))

        return results
Exemplo n.º 10
0
    def _create_examples(self, path: str, set_type: str, hypothesis_name: str = "hypothesis",
                         premise_name: str = "premise") -> List[InputExample]:
        """Parse a JSON-lines file of premise/hypothesis pairs into examples.

        Each line must contain ``idx`` plus the two text fields named by
        ``premise_name`` and ``hypothesis_name``. A string ``idx`` is converted
        with ``int``; if that fails, the zero-based line number is used instead.
        The label is ``None`` when the record has no ``label`` key.

        :param path: path of the UTF-8 encoded JSON-lines file
        :param set_type: split name, used as the guid prefix
        :param hypothesis_name: key of the field used as ``text_b``
        :param premise_name: key of the field used as ``text_a``
        :return: one example per line, in file order
        """
        results = []

        with open(path, encoding='utf8') as handle:
            for line_number, raw_line in enumerate(handle):
                record = json.loads(raw_line)

                idx = record['idx']
                if isinstance(idx, str):
                    try:
                        idx = int(idx)
                    except ValueError:
                        # Non-numeric identifier: fall back to the position in the file.
                        idx = line_number

                label = record.get('label')
                results.append(InputExample(
                    guid="%s-%s" % (set_type, idx),
                    text_a=punctuation_standardization(record[premise_name]),
                    text_b=punctuation_standardization(record[hypothesis_name]),
                    label=label,
                    idx=idx,
                ))

        return results
Exemplo n.º 11
0
 def encode(self, example: InputExample, tokenizer, seq_length, args):
     """Encode a two-choice example as one sample with two candidate sequences.

     The premise (``example.text_a``) is extended with ``"because"`` when
     ``example.meta['question']`` is ``'cause'`` and with ``"so"`` otherwise.
     It is then paired with ``meta['choice1']`` and ``meta['choice2']`` in turn
     and passed to ``build_input_from_ids``. ``self.num_truncated`` is
     incremented for every pair whose token count plus special tokens exceeds
     ``seq_length``.

     With ``args.pretrained_bert`` set, the sample carries token types and
     paddings; otherwise it carries position ids and separator positions
     (passed as ``masks``). The label is the index of ``example.label`` in
     ``self.get_labels()``, or ``0`` when the example has no label.
     """
     use_bert = args.pretrained_bert
     ids_list = []
     types_list, paddings_list = [], []
     positions_list, sep_list = [], []

     connective = 'because' if example.meta['question'] == 'cause' else 'so'
     premise = punctuation_standardization(example.text_a) + " " + connective
     tokens_a = tokenizer.EncodeAsIds(premise).tokenization

     for meta_key in ("choice1", "choice2"):
         alternative = punctuation_standardization(example.meta[meta_key])
         tokens_b = tokenizer.EncodeAsIds(alternative).tokenization

         special_count = num_special_tokens_to_add(tokens_a, tokens_b, None, add_cls=True, add_sep=True,
                                                   add_piece=False)
         total_length = len(tokens_a) + len(tokens_b) + special_count
         if total_length > seq_length:
             self.num_truncated += 1

         ids, types, paddings, position_ids, sep, target_ids, loss_masks = build_input_from_ids(
             tokens_a, tokens_b, None, seq_length, tokenizer, args,
             add_cls=True, add_sep=True, add_piece=False)

         ids_list.append(ids)
         if use_bert:
             types_list.append(types)
             paddings_list.append(paddings)
         else:
             positions_list.append(position_ids)
             sep_list.append(sep)

     label = 0 if example.label is None else self.get_labels().index(example.label)

     if use_bert:
         return build_sample(ids_list, label=label, types=types_list, paddings=paddings_list,
                             unique_id=example.guid)
     return build_sample(ids_list, positions=positions_list, masks=sep_list, label=label,
                         unique_id=example.guid)
Exemplo n.º 12
0
    def _create_examples(path, set_type, seed=42, max_train_candidates_per_question: int = 10,
                         for_train=False) -> List[InputExample]:
        """Parse a JSON-lines file of entity-cloze records into ``InputExample`` objects.

        Each line must contain ``idx``, a ``passage`` with ``text`` and
        ``entities`` (character spans given by ``start``/``end``), and ``qas``,
        a list of questions with ``query``, ``idx`` and optional ``answers``.

        For training (``set_type == 'train'`` or ``for_train``) one example is
        created per correct answer. Its candidate list holds that answer first,
        followed by at most ``max_train_candidates_per_question - 1`` entities
        that are not answers, and its label is ``"0"``. Otherwise one example is
        created per question holding all entities and all answers, with label
        ``"1"``.

        Entities and answers are both de-duplicated and sorted, so answer
        indices, guids and the seeded candidate sampling are reproducible
        across processes.

        :param path: path of the UTF-8 encoded JSON-lines file
        :param set_type: split name, used as the guid prefix
        :param seed: seed of the private ``random.Random`` used to sample candidates
        :param max_train_candidates_per_question: upper bound on the candidates
            (answer included) of a training example
        :param for_train: build per-answer training examples regardless of ``set_type``
        :return: the examples in file order
        """
        examples = []

        entity_shuffler = random.Random(seed)
        build_train_examples = set_type == 'train' or for_train
        # One slot of every training example is taken by the correct answer.
        max_distractors = max_train_candidates_per_question - 1

        with open(path, encoding='utf8') as f:
            for line in f:
                example_json = json.loads(line)

                idx = example_json['idx']
                text = punctuation_standardization(example_json['passage']['text'])

                # ``end`` is treated as inclusive, hence the ``+ 1`` in the slice.
                # Spans are cut from the already standardized text.
                entities = sorted({
                    punctuation_standardization(text[entity_json['start']:entity_json['end'] + 1])
                    for entity_json in example_json['passage']['entities']
                })

                text = text.replace("@highlight\n", "- ")  # we follow the GPT-3 paper wrt @highlight annotations
                questions = example_json['qas']

                for question_json in questions:
                    question = punctuation_standardization(question_json['query'])
                    question_idx = question_json['idx']

                    answer_set = {
                        punctuation_standardization(answer_json['text'])
                        for answer_json in question_json.get('answers', [])
                    }
                    # Sorted because set iteration order depends on per-process string
                    # hashing; an unsorted list would make answer_idx, the guids and the
                    # pairing of shuffles with answers differ from run to run.
                    answers = sorted(answer_set)

                    if build_train_examples:
                        # Entities that are not correct answers; identical for every
                        # answer of this question, so compute them once.
                        distractors = [ent for ent in entities if ent not in answer_set]

                        # create a single example per *correct* answer
                        for answer_idx, answer in enumerate(answers):
                            # Work on a fresh copy so each answer draws its own sample.
                            candidates = list(distractors)
                            if len(candidates) > max_distractors:
                                entity_shuffler.shuffle(candidates)
                                candidates = candidates[:max_distractors]

                            guid = f'{set_type}-p{idx}-q{question_idx}-a{answer_idx}'
                            meta = {
                                'passage_idx': idx,
                                'question_idx': question_idx,
                                'candidates': [answer] + candidates,
                                'answers': [answer]
                            }
                            ex_idx = [idx, question_idx, answer_idx]
                            example = InputExample(guid=guid, text_a=text, text_b=question, label="0", meta=meta,
                                                   idx=ex_idx, num_choices=len(candidates) + 1)
                            examples.append(example)

                    else:
                        # create just one example with *all* correct answers and *all* answer candidates
                        guid = f'{set_type}-p{idx}-q{question_idx}'
                        meta = {
                            'passage_idx': idx,
                            'question_idx': question_idx,
                            'candidates': entities,
                            'answers': answers
                        }
                        example = InputExample(guid=guid, text_a=text, text_b=question, label="1", meta=meta,
                                               idx=question_idx, num_choices=len(entities))
                        examples.append(example)

        # Summary statistics are only logged, never returned.
        question_indices = {example.meta['question_idx'] for example in examples}
        label_distribution = Counter(example.label for example in examples)
        print_rank_0(
            f"Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label "
            f"distribution {list(label_distribution.items())}")
        return examples
Exemplo n.º 13
0
    def _create_examples(self, path: str, set_type: str, cloze_eval=True) -> List[InputExample]:
        """Parse a JSON-lines file of two-span coreference records into examples.

        Each line must contain ``idx``, ``text`` and a ``target`` with
        ``span1_text``, ``span2_text``, ``span1_index`` and ``span2_index``
        (indices into the whitespace-split text). ``label`` and ``candidates``
        are optional. Span indices that are off by one word are corrected, and a
        word that merely starts with ``span2_text`` is split so that the span
        becomes a word of its own.

        When ``self.args.task == 'wsc1'`` every record yields an example with
        ``span1_text`` as ``text_b``. For training records labelled ``'True'``
        it additionally yields one ``'False'`` example per candidate. Records
        without a ``candidates`` key contribute no such extra examples.

        Otherwise training records whose label is not ``'True'`` are skipped
        when ``cloze_eval`` is set, and training records with more than nine
        candidates are split into examples of exactly nine candidates each.

        :param path: path of the UTF-8 encoded JSON-lines file
        :param set_type: split name, used as the guid prefix
        :param cloze_eval: skip training records whose label is not ``'True'``
        :return: the examples in file order
        :raises AssertionError: if ``span2_text`` cannot be located at the
            (corrected) ``span2_index``
        """
        examples = []

        with open(path, encoding='utf8') as f:
            for line in f:
                example_json = json.loads(line)
                idx = example_json['idx']
                label = str(example_json['label']) if 'label' in example_json else None
                guid = "%s-%s" % (set_type, idx)
                text_a = punctuation_standardization(example_json['text'])
                meta = {
                    'span1_text': example_json['target']['span1_text'],
                    'span2_text': example_json['target']['span2_text'],
                    'span1_index': example_json['target']['span1_index'],
                    'span2_index': example_json['target']['span2_index']
                }

                # Reset for every record so that a record without 'candidates' can
                # neither hit an unbound name nor reuse the previous record's list.
                has_candidates = 'candidates' in example_json
                candidates = []
                if has_candidates:
                    # De-duplicate while keeping first-occurrence order.
                    for cand_json in example_json['candidates']:
                        cand = cand_json['text']
                        if cand not in candidates:
                            candidates.append(cand)

                # the indices in the dataset are wrong for some examples, so we manually fix them
                span1_index, span1_text = meta['span1_index'], meta['span1_text']
                span2_index, span2_text = meta['span2_index'], meta['span2_text']
                words_a = text_a.split()
                words_a_lower = text_a.lower().split()
                words_span1_text = span1_text.lower().split()
                span1_len = len(words_span1_text)

                if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
                    for offset in [-1, +1]:
                        if words_a_lower[span1_index + offset:span1_index + span1_len + offset] == words_span1_text:
                            span1_index += offset

                if words_a[span2_index] != span2_text:
                    for offset in [-1, +1]:
                        if words_a[span2_index + offset] == span2_text:
                            span2_index += offset

                    # The span may be glued to trailing characters (e.g. punctuation):
                    # split that word so the span text stands alone at span2_index.
                    if words_a[span2_index] != span2_text and words_a[span2_index].startswith(span2_text):
                        words_a = words_a[:span2_index] \
                                  + [words_a[span2_index][:len(span2_text)], words_a[span2_index][len(span2_text):]] \
                                  + words_a[span2_index + 1:]

                assert words_a[span2_index] == span2_text, \
                    f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for '{words_a}'"

                text_a = ' '.join(words_a)
                meta['span1_index'], meta['span2_index'] = span1_index, span2_index

                if self.args.task == 'wsc1':
                    example = InputExample(guid=guid, text_a=text_a, text_b=span1_text,
                                           label=label, meta=meta, idx=idx)
                    examples.append(example)
                    if set_type == 'train' and label == 'True':
                        # Every listed candidate becomes a negative example.
                        for cand in candidates:
                            example = InputExample(guid=guid, text_a=text_a, text_b=cand,
                                                   label='False', meta=meta, idx=idx)
                            examples.append(example)
                    continue

                if cloze_eval and set_type == 'train' and label != 'True':
                    continue
                if set_type == 'train' and has_candidates and len(candidates) > 9:
                    for i in range(0, len(candidates), 9):
                        _meta = copy.deepcopy(meta)
                        _meta['candidates'] = candidates[i:i + 9]
                        # Pad a short final chunk with candidates from the start of the list.
                        if len(_meta['candidates']) < 9:
                            _meta['candidates'] += candidates[:9 - len(_meta['candidates'])]
                        example = InputExample(guid=guid, text_a=text_a, label=label, meta=_meta, idx=idx)
                        examples.append(example)
                else:
                    if has_candidates:
                        meta['candidates'] = candidates
                    example = InputExample(guid=guid, text_a=text_a, label=label, meta=meta, idx=idx)
                    examples.append(example)

        return examples