Example #1
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        examples = []

        with open(path, encoding='utf8') as f:
            for line in f:
                example_json = json.loads(line)
                label = str(example_json['label']) if 'label' in example_json else None
                idx = example_json['idx']
                guid = "%s-%s" % (set_type, idx)
                text_a = example_json['premise']
                meta = {
                    'choice1': example_json['choice1'],
                    'choice2': example_json['choice2'],
                    'question': example_json['question']
                }
                example = InputExample(guid=guid, text_a=text_a, label=label, meta=meta, idx=idx)
                examples.append(example)

        if set_type == 'train' or set_type == 'unlabeled':
            mirror_examples = []
            for ex in examples:
                label = "1" if ex.label == "0" else "0"
                meta = {
                    'choice1': ex.meta['choice2'],
                    'choice2': ex.meta['choice1'],
                    'question': ex.meta['question']
                }
                mirror_example = InputExample(guid=ex.guid + 'm', text_a=ex.text_a, label=label, meta=meta)
                mirror_examples.append(mirror_example)
            examples += mirror_examples
            logger.info(f"Added {len(mirror_examples)} mirror examples, total size is {len(examples)}...")
        return examples
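
A minimal usage sketch of the loader above (the processor class name and file path are assumptions, not from the source): with mirroring, every train example gains a counterpart whose choices are swapped and whose label is flipped, which keeps the model from exploiting answer-position bias.

    # Hypothetical usage; CopaProcessor and the path are illustrative only.
    examples = CopaProcessor._create_examples('data/COPA/train.jsonl', 'train')
    # Mirrored examples reuse the original guid with an 'm' suffix, so the
    # augmented training set is exactly twice the original size.
    mirrored = [ex for ex in examples if ex.guid.endswith('m')]
    assert len(mirrored) * 2 == len(examples)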
Example #2
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        examples = []

        with open(path, encoding='utf8') as f:
            for line in f:
                example_json = json.loads(line)

                passage_idx = example_json['idx']
                text = example_json['passage']['text']
                questions = example_json['passage']['questions']
                for question_json in questions:
                    question = question_json["question"]
                    question_idx = question_json['idx']
                    answers = question_json["answers"]
                    for answer_json in answers:
                        label = str(answer_json["label"]) if 'label' in answer_json else None
                        answer_idx = answer_json["idx"]
                        guid = f'{set_type}-p{passage_idx}-q{question_idx}-a{answer_idx}'
                        meta = {
                            'passage_idx': passage_idx,
                            'question_idx': question_idx,
                            'answer_idx': answer_idx,
                            'answer': answer_json["text"]
                        }
                        idx = [passage_idx, question_idx, answer_idx]
                        example = InputExample(guid=guid, text_a=text, text_b=question, label=label, meta=meta, idx=idx)
                        examples.append(example)

        question_indices = list(set(example.meta['question_idx'] for example in examples))
        label_distribution = Counter(example.label for example in examples)
        logger.info(f"Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label "
                    f"distribution {list(label_distribution.items())}")
        return examples
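
The loader flattens the nested passage/question/answer structure into one example per (passage, question, answer) triple; a toy sketch of the composite guid it builds:

    # Illustrative values; the real indices come from the JSON file.
    set_type, passage_idx, question_idx, answer_idx = 'train', 3, 1, 0
    guid = f'{set_type}-p{passage_idx}-q{question_idx}-a{answer_idx}'
    print(guid)  # train-p3-q1-a0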
Example #3
    def get_parts(self, example: InputExample) -> FilledPattern:

        premise = self.remove_final_punc(self.shortenable(example.text_a))
        choice1 = self.remove_final_punc(
            self.lowercase_first(example.meta['choice1']))
        choice2 = self.remove_final_punc(
            self.lowercase_first(example.meta['choice2']))

        question = example.meta['question']
        assert question in ['cause', 'effect']

        example.meta['choice1'], example.meta['choice2'] = choice1, choice2
        num_masks = max(
            len(get_verbalization_ids(c, self.wrapper.tokenizer, False))
            for c in [choice1, choice2])

        if question == "cause":
            joiner = "because"
        else:
            joiner = "so"

        # patterns searched in the fully-supervised setting
        # string_list_a = [choice1, 'or', choice2, '?', 'the', premise, joiner, 'the', self.mask]
        # string_list_a = [choice1, 'or', choice2, '?', premise, joiner, 'the', self.mask * num_masks]
        # string_list_a = ['"', choice1, '" or "', choice2, '"?', 'the', premise, 'the', joiner, self.mask*num_masks]
        # string_list_a = ['"', choice1, '" or "', choice2, '"?', premise, joiner, 'the', self.mask*num_masks]

        # few-shot
        if self.pattern_id == 1:
            if question == "cause":

                string_list_a = [
                    choice1, 'or', choice2, '?', premise, 'because', 'the',
                    self.mask * num_masks, '.'
                ]
                string_list_b = []
                block_flag_a = [0, 0, 0, 0, 0, 0, 1, 0, 0]
                block_flag_b = []
                assert len(string_list_a) == len(block_flag_a)
                assert len(string_list_b) == len(block_flag_b)
                return string_list_a, string_list_b, block_flag_a, block_flag_b

            elif question == "effect":

                string_list_a = [
                    choice1, 'or', choice2, '?', premise, 'so', 'the',
                    self.mask * num_masks, '.'
                ]
                string_list_b = []
                block_flag_a = [0, 0, 0, 0, 0, 0, 1, 0, 0]
                block_flag_b = []
                assert len(string_list_a) == len(block_flag_a)
                assert len(string_list_b) == len(block_flag_b)
                return string_list_a, string_list_b, block_flag_a, block_flag_b

            else:
                raise ValueError(
                    "unsupported question type.")
        else:
            raise ValueError("unknown pattern_ids.")
Example #4
    def _create_examples(self,
                         path: str,
                         set_type: str,
                         hypothesis_name: str = "hypothesis",
                         premise_name: str = "premise") -> List[InputExample]:
        examples = []

        with open(path, encoding='utf8') as f:
            for line_idx, line in enumerate(f):
                example_json = json.loads(line)
                idx = example_json['idx']
                if isinstance(idx, str):
                    try:
                        idx = int(idx)
                    except ValueError:
                        idx = line_idx
                label = example_json.get('label')
                guid = "%s-%s" % (set_type, idx)
                text_a = example_json[premise_name]
                text_b = example_json[hypothesis_name]

                example = InputExample(guid=guid,
                                       text_a=text_a,
                                       text_b=text_b,
                                       label=label,
                                       idx=idx)
                examples.append(example)

        return examples
Example #5
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        examples = []

        with open(path, encoding='utf8') as f:
            for line in f:
                example_json = json.loads(line)
                idx = example_json['idx']
                label = str(example_json['label']) if 'label' in example_json else None
                guid = "%s-%s" % (set_type, idx)
                text_a = example_json['text']
                meta = {
                    'span1_text': example_json['target']['span1_text'],
                    'span2_text': example_json['target']['span2_text'],
                    'span1_index': example_json['target']['span1_index'],
                    'span2_index': example_json['target']['span2_index']
                }

                # the indices in the dataset are wrong for some examples, so we manually fix them
                span1_index, span1_text = meta['span1_index'], meta['span1_text']
                span2_index, span2_text = meta['span2_index'], meta['span2_text']
                words_a = text_a.split()
                words_a_lower = text_a.lower().split()
                words_span1_text = span1_text.lower().split()
                span1_len = len(words_span1_text)

                if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
                    for offset in [-1, +1]:
                        if words_a_lower[span1_index + offset:span1_index + span1_len + offset] == words_span1_text:
                            span1_index += offset

                if words_a_lower[span1_index:span1_index + span1_len] != words_span1_text:
                    logger.warning(f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected "
                                   f"'{words_span1_text}' at index {span1_index} for '{words_a}'")

                if words_a[span2_index] != span2_text:
                    for offset in [-1, +1]:
                        if words_a[span2_index + offset] == span2_text:
                            span2_index += offset

                    if words_a[span2_index] != span2_text and words_a[span2_index].startswith(span2_text):
                        words_a = words_a[:span2_index] \
                                  + [words_a[span2_index][:len(span2_text)], words_a[span2_index][len(span2_text):]] \
                                  + words_a[span2_index + 1:]

                assert words_a[span2_index] == span2_text, \
                    f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for '{words_a}'"

                text_a = ' '.join(words_a)
                meta['span1_index'], meta['span2_index'] = span1_index, span2_index

                example = InputExample(guid=guid, text_a=text_a, label=label, meta=meta, idx=idx)
                if set_type == 'train' and label != 'True':
                    continue
                examples.append(example)

        return examples
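
A self-contained sketch of the off-by-one repair performed above, using toy data and whitespace tokenization:

    # The (toy) dataset claims 'pete' starts at word index 1, but it is at
    # index 2; probing the neighboring offsets recovers the right position.
    words = "mark told pete many lies about himself".split()
    span_text, span_index, span_len = ['pete'], 1, 1
    if words[span_index:span_index + span_len] != span_text:
        for offset in (-1, +1):
            if words[span_index + offset:span_index + span_len + offset] == span_text:
                span_index += offset
    print(span_index)  # 2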
Example #6
File: tasks.py Project: yuweifamily/pet
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        examples = []

        with open(path, encoding="utf8") as f:
            for line in f:
                example_json = json.loads(line)
                label = str(
                    example_json["label"]) if "label" in example_json else None
                idx = example_json["idx"]
                guid = "%s-%s" % (set_type, idx)
                text_a = example_json["premise"]
                meta = {
                    "choice1": example_json["choice1"],
                    "choice2": example_json["choice2"],
                    "question": example_json["question"],
                }
                example = InputExample(guid=guid,
                                       text_a=text_a,
                                       label=label,
                                       meta=meta,
                                       idx=idx)
                examples.append(example)

        if set_type == "train" or set_type == "unlabeled":
            mirror_examples = []
            for ex in examples:
                label = "1" if ex.label == "0" else "0"
                meta = {
                    "choice1": ex.meta["choice2"],
                    "choice2": ex.meta["choice1"],
                    "question": ex.meta["question"]
                }
                mirror_example = InputExample(guid=ex.guid + "m",
                                              text_a=ex.text_a,
                                              label=label,
                                              meta=meta)
                mirror_examples.append(mirror_example)
            examples += mirror_examples
            logger.info(
                f"Added {len(mirror_examples)} mirror examples, total size is {len(examples)}..."
            )
        return examples
Example #7
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        examples = []

        with open(path) as f:
            reader = csv.reader(f, delimiter=',')
            for idx, row in enumerate(reader):
                label, body = row
                guid = "%s-%s" % (set_type, idx)
                text_a = body.replace('\\n', ' ').replace('\\', ' ')

                example = InputExample(guid=guid, text_a=text_a, label=label)
                examples.append(example)

        return examples
Example #8
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        examples = []

        with open(path, encoding='utf8') as f:
            for line in f:
                example_json = json.loads(line)
                idx = example_json['idx']
                label = str(example_json['label']) if 'label' in example_json else None
                guid = "%s-%s" % (set_type, idx)
                text_a = example_json['passage']
                text_b = example_json['question']
                example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, idx=idx)
                examples.append(example)

        return examples
Example #9
    def _create_examples(self, path, set_type, max_examples=-1, skip_first=0):
        """Creates examples for the training and dev sets."""
        examples = []

        with open(path) as f:
            # csv.reader only accepts a single-character delimiter, so the
            # multi-character separator ':->' is split manually instead.
            for idx, line in enumerate(f):
                row = line.rstrip('\n').split(':->')
                guid = "%s-%s" % (set_type, idx)
                label = row[MyTaskDataProcessor.LABEL_COLUMN]
                text_a = row[MyTaskDataProcessor.TEXT_A_COLUMN]
                text_b = row[MyTaskDataProcessor.TEXT_B_COLUMN] if MyTaskDataProcessor.TEXT_B_COLUMN >= 0 else None
                example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
                examples.append(example)

        return examples
Example #10
    def _create_examples(lines: List[List[str]], set_type: str) -> List[InputExample]:
        examples = []

        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, line[0])
            text_a = line[8]
            text_b = line[9]
            label = line[-1]

            example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
            examples.append(example)

        return examples
Example #11
 def _create_examples(path: str, set_type: str) -> List[InputExample]:
     examples = []
     with open(path, encoding='utf8') as f:
         for line in f:
             example_json = json.loads(line)
             idx = example_json['idx']
             if isinstance(idx, str):
                 idx = int(idx)
             label = "T" if example_json.get('label') else "F"
             guid = "%s-%s" % (set_type, idx)
             text_a = example_json['sentence1']
             text_b = example_json['sentence2']
             meta = {'word': example_json['word']}
             example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, idx=idx, meta=meta)
             examples.append(example)
     return examples
Example #12
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        examples = []

        with open(path, encoding='utf8') as f:
            reader = csv.reader(f, delimiter=',')
            for idx, row in enumerate(reader):
                label, question_title, question_body, answer = row
                guid = "%s-%s" % (set_type, idx)
                text_a = ' '.join([question_title.replace('\\n', ' ').replace('\\', ' '),
                                   question_body.replace('\\n', ' ').replace('\\', ' ')])
                text_b = answer.replace('\\n', ' ').replace('\\', ' ')

                example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
                examples.append(example)

        return examples
Example #13
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        examples = []

        df = pd.read_table(path)
        for idx, row in df.iterrows():
            label = str(row['prefix'])
            guid = "%s-%s" % (set_type, idx)
            text_a = str(row['input_text'])
            text_b = str(row['target_text'])
            example = InputExample(guid=guid,
                                   text_a=text_a,
                                   text_b=text_b,
                                   label=label,
                                   idx=idx)
            examples.append(example)

        return examples
Example #14
File: tasks.py Project: yuweifamily/pet
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        examples = []

        with open(path) as f:
            reader = csv.reader(f, delimiter=",")
            for idx, row in enumerate(reader):
                label, headline, body = row
                guid = "%s-%s" % (set_type, idx)
                text_a = headline.replace("\\", " ")
                text_b = body.replace("\\", " ")

                example = InputExample(guid=guid,
                                       text_a=text_a,
                                       text_b=text_b,
                                       label=label)
                examples.append(example)

        return examples
Example #15
    def _create_examples(self, path: str) -> List[InputExample]:
        examples = []

        with open(path, encoding='utf8') as f:
            for line in f:
                example_json = json.loads(line)
                label = example_json['label']
                id_ = example_json['id']
                text_a = example_json['question']
                text_b = example_json['comment']
                language = example_json['language']

                if self.language is not None and language != self.language:
                    continue

                example = InputExample(guid=id_, text_a=text_a, text_b=text_b, label=label)
                examples.append(example)

        return examples
Example #16
File: tasks.py Project: yuweifamily/pet
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        examples = []

        with open(path, encoding="utf8") as f:
            for line in f:
                example_json = json.loads(line)
                idx = example_json["idx"]
                label = str(
                    example_json["label"]) if "label" in example_json else None
                guid = "%s-%s" % (set_type, idx)
                text_a = example_json["passage"]
                text_b = example_json["question"]
                example = InputExample(guid=guid,
                                       text_a=text_a,
                                       text_b=text_b,
                                       label=label,
                                       idx=idx)
                examples.append(example)

        return examples
Example #17
File: tasks.py Project: yuweifamily/pet
 def _create_examples(path: str, set_type: str) -> List[InputExample]:
     examples = []
     with open(path, encoding="utf8") as f:
         for line in f:
             example_json = json.loads(line)
             idx = example_json["idx"]
             if isinstance(idx, str):
                 idx = int(idx)
             label = "T" if example_json.get("label") else "F"
             guid = "%s-%s" % (set_type, idx)
             text_a = example_json["sentence1"]
             text_b = example_json["sentence2"]
             meta = {"word": example_json["word"]}
             example = InputExample(guid=guid,
                                    text_a=text_a,
                                    text_b=text_b,
                                    label=label,
                                    idx=idx,
                                    meta=meta)
             examples.append(example)
     return examples
Example #18
    def _create_examples_unlabelled(self,
                                    path,
                                    set_type,
                                    max_examples=1,
                                    skip_first=0):
        """Creates examples for the unlabelled set."""
        examples = []

        with open(path, encoding="utf8") as f:
            reader = csv.reader(f, delimiter=":")
            for idx, row in enumerate(reader):
                guid = "%s-%s" % (set_type, idx)
                #label = row[MyTaskDataProcessor.LABEL_COLUMN]
                text_a = row[0]
                text_b = row[
                    MyTaskDataProcessor.
                    TEXT_B_COLUMN] if MyTaskDataProcessor.TEXT_B_COLUMN >= 0 else None
                example = InputExample(guid=guid, text_a=text_a, text_b=text_b)
                examples.append(example)

        return examples
Example #19
File: tasks.py Project: cccntu/pet
    def _create_examples(self, lines: List[dict],
                         set_type: str) -> List[InputExample]:
        examples = []

        id_to_labels = self.get_labels()
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = f"{set_type}-{line['idx']}"
            text_a = line['premise']
            text_b = line['hypothesis']
            label = id_to_labels[
                line['label']]  # labels must be strings; HF datasets stores ints

            example = InputExample(guid=guid,
                                   text_a=text_a,
                                   text_b=text_b,
                                   label=label)
            examples.append(example)

        return examples
Example #20
File: pvp.py Project: timoschick/pet
    def get_parts(self, example: InputExample) -> FilledPattern:

        premise = self.remove_final_punc(self.shortenable(example.text_a))
        choice1 = self.remove_final_punc(self.lowercase_first(example.meta['choice1']))
        choice2 = self.remove_final_punc(self.lowercase_first(example.meta['choice2']))

        question = example.meta['question']
        assert question in ['cause', 'effect']

        example.meta['choice1'], example.meta['choice2'] = choice1, choice2
        num_masks = max(len(get_verbalization_ids(c, self.wrapper.tokenizer, False)) for c in [choice1, choice2])

        if question == 'cause':
            if self.pattern_id == 0:
                return ['"', choice1, '" or "', choice2, '"?', premise, 'because', self.mask * num_masks, '.'], []
            elif self.pattern_id == 1:
                return [choice1, 'or', choice2, '?', premise, 'because', self.mask * num_masks, '.'], []
        else:
            if self.pattern_id == 0:
                return ['"', choice1, '" or "', choice2, '"?', premise, ', so', self.mask * num_masks, '.'], []
            elif self.pattern_id == 1:
                return [choice1, 'or', choice2, '?', premise, ', so', self.mask * num_masks, '.'], []
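
For reference, a toy rendering of what pattern 1 produces for a 'cause' question (texts, mask token, and mask count are invented for illustration):

    # Filled pattern: "<choice1> or <choice2> ? <premise> because <masks> ."
    parts = ['the sun was rising', 'or', 'the grass was cut', '?',
             'My body cast a shadow over the grass', 'because',
             '<mask>' * 4, '.']
    print(' '.join(parts))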
Example #21
    def get_parts(self, example: InputExample) -> FilledPattern:

        premise = self.remove_final_punc(self.shortenable(example.text_a))
        choice1 = self.remove_final_punc(
            self.lowercase_first(example.meta["choice1"]))
        choice2 = self.remove_final_punc(
            self.lowercase_first(example.meta["choice2"]))

        question = example.meta["question"]
        assert question in ["cause", "effect"]

        example.meta["choice1"], example.meta["choice2"] = choice1, choice2
        num_masks = max(
            len(get_verbalization_ids(c, self.wrapper.tokenizer, False))
            for c in [choice1, choice2])

        if question == "cause":
            if self.pattern_id == 0:
                return [
                    '"', choice1, '" or "', choice2, '"?', premise, "because",
                    self.mask * num_masks, "."
                ], []
            elif self.pattern_id == 1:
                return [
                    choice1, "or", choice2, "?", premise, "because",
                    self.mask * num_masks, "."
                ], []
        else:
            if self.pattern_id == 0:
                return [
                    '"', choice1, '" or "', choice2, '"?', premise, ", so",
                    self.mask * num_masks, "."
                ], []
            elif self.pattern_id == 1:
                return [
                    choice1, "or", choice2, "?", premise, ", so",
                    self.mask * num_masks, "."
                ], []
Example #22
    def _create_examples(
            path,
            set_type,
            seed=42,
            max_train_candidates_per_question: int = 10) -> List[InputExample]:
        examples = []

        entity_shuffler = random.Random(seed)

        with open(path, encoding='utf8') as f:
            for idx, line in enumerate(f):
                example_json = json.loads(line)

                idx = example_json['idx']
                text = example_json['passage']['text']
                entities = set()

                for entity_json in example_json['passage']['entities']:
                    start = entity_json['start']
                    end = entity_json['end']
                    entity = text[start:end + 1]
                    entities.add(entity)

                entities = list(entities)

                text = text.replace(
                    "@highlight\n", "- "
                )  # we follow the GPT-3 paper wrt @highlight annotations
                questions = example_json['qas']

                for question_json in questions:
                    question = question_json['query']
                    question_idx = question_json['idx']
                    answers = set()

                    for answer_json in question_json.get('answers', []):
                        answer = answer_json['text']
                        answers.add(answer)

                    answers = list(answers)

                    if set_type == 'train':
                        # create a single example per *correct* answer
                        for answer_idx, answer in enumerate(answers):
                            candidates = [ent for ent in entities if ent not in answers]
                            if len(candidates) > max_train_candidates_per_question - 1:
                                entity_shuffler.shuffle(candidates)
                                candidates = candidates[:max_train_candidates_per_question - 1]

                            guid = f'{set_type}-p{idx}-q{question_idx}-a{answer_idx}'
                            meta = {
                                'passage_idx': idx,
                                'question_idx': question_idx,
                                'candidates': [answer] + candidates,
                                'answers': [answer]
                            }
                            ex_idx = [idx, question_idx, answer_idx]
                            example = InputExample(guid=guid,
                                                   text_a=text,
                                                   text_b=question,
                                                   label="1",
                                                   meta=meta,
                                                   idx=ex_idx)
                            examples.append(example)

                    else:
                        # create just one example with *all* correct answers and *all* answer candidates
                        guid = f'{set_type}-p{idx}-q{question_idx}'
                        meta = {
                            'passage_idx': idx,
                            'question_idx': question_idx,
                            'candidates': entities,
                            'answers': answers
                        }
                        example = InputExample(guid=guid,
                                               text_a=text,
                                               text_b=question,
                                               label="1",
                                               meta=meta)
                        examples.append(example)

        question_indices = list(
            set(example.meta['question_idx'] for example in examples))
        label_distribution = Counter(example.label for example in examples)
        logger.info(
            f"Returning {len(examples)} examples corresponding to {len(question_indices)} questions with label "
            f"distribution {list(label_distribution.items())}")
        return examples
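
A condensed sketch of the training-time candidate construction above (toy entities; the real code draws distractors with the seeded random.Random shown):

    import random

    # One correct answer plus up to max_train_candidates_per_question - 1
    # shuffled distractors drawn from the remaining passage entities.
    entities = ['Paris', 'London', 'Berlin', 'Rome', 'Madrid']
    answers = ['Paris']
    max_train_candidates_per_question = 3
    rng = random.Random(42)
    distractors = [e for e in entities if e not in answers]
    rng.shuffle(distractors)
    candidates = answers[:1] + distractors[:max_train_candidates_per_question - 1]
    print(candidates)  # the correct answer first, then two random distractors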
Example #23
File: tasks.py Project: yuweifamily/pet
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        examples = []

        with open(path, encoding="utf8") as f:
            for line in f:
                example_json = json.loads(line)
                idx = example_json["idx"]
                label = str(
                    example_json["label"]) if "label" in example_json else None
                guid = "%s-%s" % (set_type, idx)
                text_a = example_json["text"]
                meta = {
                    "span1_text": example_json["target"]["span1_text"],
                    "span2_text": example_json["target"]["span2_text"],
                    "span1_index": example_json["target"]["span1_index"],
                    "span2_index": example_json["target"]["span2_index"],
                }

                # the indices in the dataset are wrong for some examples, so we manually fix them
                span1_index, span1_text = meta["span1_index"], meta[
                    "span1_text"]
                span2_index, span2_text = meta["span2_index"], meta[
                    "span2_text"]
                words_a = text_a.split()
                words_a_lower = text_a.lower().split()
                words_span1_text = span1_text.lower().split()
                span1_len = len(words_span1_text)

                if words_a_lower[span1_index:span1_index +
                                 span1_len] != words_span1_text:
                    for offset in [-1, +1]:
                        if words_a_lower[span1_index + offset:span1_index +
                                         span1_len +
                                         offset] == words_span1_text:
                            span1_index += offset

                if words_a_lower[span1_index:span1_index +
                                 span1_len] != words_span1_text:
                    logger.warning(
                        f"Got '{words_a_lower[span1_index:span1_index + span1_len]}' but expected "
                        f"'{words_span1_text}' at index {span1_index} for '{words_a}'"
                    )

                if words_a[span2_index] != span2_text:
                    for offset in [-1, +1]:
                        if words_a[span2_index + offset] == span2_text:
                            span2_index += offset

                    if words_a[span2_index] != span2_text and words_a[
                            span2_index].startswith(span2_text):
                        words_a = (words_a[:span2_index] + [
                            words_a[span2_index][:len(span2_text)],
                            words_a[span2_index][len(span2_text):]
                        ] + words_a[span2_index + 1:])

                assert (
                    words_a[span2_index] == span2_text
                ), f"Got '{words_a[span2_index]}' but expected '{span2_text}' at index {span2_index} for '{words_a}'"

                text_a = " ".join(words_a)
                meta["span1_index"], meta[
                    "span2_index"] = span1_index, span2_index

                example = InputExample(guid=guid,
                                       text_a=text_a,
                                       label=label,
                                       meta=meta,
                                       idx=idx)
                if set_type == "train" and label != "True":
                    continue
                examples.append(example)

        return examples
Example #24
def generate_ipet_train_sets(
    train_data: List[InputExample],
    unlabeled_data: List[InputExample],
    labels: List[str],
    logits_dir: str,
    output_dir: str,
    reduction: str,
    num_new_examples: int,
    logits_percentage: float,
    n_most_likely: int = -1,
    seed: int = 42,
    local_rank=-1,
):
    """
    Generate training sets for the next generation of iPET models.

    :param train_data: the training examples
    :param unlabeled_data: the unlabeled examples
    :param labels: the list of all possible labels
    :param logits_dir: the directory that contains the predictions of all models in the current generation for the
           unlabeled data.
    :param output_dir: the output directory
    :param reduction: the strategy for merging logits, either 'mean' or 'wmean'. For 'mean', all models contribute
           equally, for 'wmean', each model's contribution is proportional to its accuracy on the training set before
           training.
    :param num_new_examples: the number of new examples to create
    :param logits_percentage: the percentage of models to use for annotating training sets for the next generation
    :param n_most_likely: if >0, in the first generation the n_most_likely examples per label are chosen even
           if their predicted label is different
    :param seed: the random seed to use
    """
    subdirs = next(os.walk(logits_dir))[1]

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    logger.info("Found the following {} subdirectories: {}".format(
        len(subdirs), subdirs))

    if train_data:
        train_examples_per_label = [
            sum(1 for ex in train_data if ex.label == label)
            for label in labels
        ]
        multiplier = num_new_examples / len(train_data)
        examples_per_label = [
            int(epl * multiplier) for epl in train_examples_per_label
        ]
        logger.info(
            f"Example distribution in the original dataset: {train_examples_per_label}"
        )
    else:
        examples_per_label = eq_div(num_new_examples, len(labels))

    logger.info(
        f"Target distribution for the new dataset: {examples_per_label}")

    for example in unlabeled_data:
        example.label, example.logits = None, None

    logits_lists = {}

    rng = random.Random(seed)
    rng_np = np.random.RandomState(seed)

    for subdir in subdirs:
        results_file = os.path.join(logits_dir, subdir, "results.txt")
        logits_file = os.path.join(logits_dir, subdir, "logits.txt")
        logits = []

        if not os.path.exists(results_file) or not os.path.exists(logits_file):
            logger.warning(
                f"Skipping subdir '{subdir}' because 'results.txt' or 'logits.txt' not found"
            )
            continue

        if reduction == "mean":
            result_train = 1
        else:
            with open(results_file, "r") as fh:
                results = ast.literal_eval(fh.read())
                result_train = results["train_set_before_training"]

        with open(logits_file, "r") as fh:
            for line in fh.read().splitlines():
                example_logits = [float(x) for x in line.split()]
                logits.append(example_logits)

        logger.info("File {}: Score = {}, #Logits = {}, #Labels = {}".format(
            results_file, result_train, len(logits), len(logits[0])))

        loglist = LogitsList(score=result_train, logits=logits)
        logits_lists[subdir] = loglist

    for subdir in subdirs:
        other_logits_lists = [
            ll for sd, ll in logits_lists.items() if sd != subdir
        ]
        subdir_train_set = generate_ipet_train_set(
            other_logits_lists,
            labels=labels,
            original_data=unlabeled_data,
            examples_per_label=examples_per_label,
            logits_percentage=logits_percentage,
            reduction=reduction,
            n_most_likely=n_most_likely,
            rng=rng,
            rng_np=rng_np,
        )

        if local_rank in [-1, 0]:
            InputExample.save_examples(
                subdir_train_set,
                os.path.join(output_dir, subdir + "-train.bin"))
Example #25
def train_pet_ensemble(
    model_config: WrapperConfig,
    train_config: TrainConfig,
    eval_config: EvalConfig,
    pattern_ids: List[Union[str, int]],
    output_dir: str,
    ipet_data_dir: str = None,
    repetitions: int = 3,
    train_data: List[InputExample] = None,
    unlabeled_data: List[InputExample] = None,
    dev_data: List[InputExample] = None,
    test_data: List[InputExample] = None,
    do_train: bool = True,
    do_eval: bool = True,
    save_unlabeled_logits: bool = False,
    seed: int = 42,
    overwrite_dir: bool = False,
    save_model=False,
    local_rank=-1,
):
    """
    Train and evaluate an ensemble of PET models without knowledge distillation.

    :param model_config: the model configuration to use
    :param train_config: the training configuration to use
    :param eval_config: the evaluation configuration to use
    :param pattern_ids: the ids of all PVPs to use
    :param output_dir: the output directory
    :param ipet_data_dir: optional directory containing additional training data for iPET
    :param repetitions: the number of training repetitions
    :param train_data: the training examples to use
    :param unlabeled_data: the unlabeled examples to use
    :param dev_data: the development examples to use for evaluation
    :param test_data: the test examples to use for evaluation
    :param do_train: whether to perform training
    :param do_eval: whether to perform evaluation
    :param save_unlabeled_logits: whether logits for unlabeled examples should be saved in a file ``logits.txt``. This
           is required for both iPET and knowledge distillation.
    :param seed: the random seed to use
    """

    results = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    set_seed(seed)

    for pattern_id in pattern_ids:
        for iteration in range(repetitions):

            model_config.pattern_id = pattern_id
            results_dict = {}

            shots = 0 if train_data is None else len(train_data)
            pattern_iter_output_dir = "{}/{}shots-{}-i{}-seed{}".format(
                output_dir, shots, pattern_name(pattern_id), iteration, seed)

            if os.path.exists(pattern_iter_output_dir) and not overwrite_dir:
                logger.warning(
                    f"Path {pattern_iter_output_dir} already exists, skipping it..."
                )
                continue

            if not os.path.exists(pattern_iter_output_dir) and local_rank in [
                    -1, 0
            ]:
                os.makedirs(pattern_iter_output_dir)

            wrapper = init_model(model_config)

            # Training
            if do_train:
                if ipet_data_dir:
                    p = os.path.join(
                        ipet_data_dir,
                        "{}-i{}-train.bin".format(pattern_name(pattern_id),
                                                  iteration))
                    ipet_train_data = InputExample.load_examples(p)
                    for example in ipet_train_data:
                        example.logits = None
                else:
                    ipet_train_data = None

                results_dict.update(
                    train_single_model(
                        wrapper,
                        train_data,
                        train_config,
                        pattern_iter_output_dir,
                        dev_data,
                        eval_config,
                        ipet_train_data=ipet_train_data,
                        unlabeled_data=unlabeled_data,
                        return_train_set_results=False,
                        local_rank=local_rank,
                    ))

                with open(os.path.join(pattern_iter_output_dir, "results.txt"),
                          "w") as fh:
                    fh.write(str(results_dict))

                if local_rank in [-1, 0]:
                    logger.info("Saving trained model at {}...".format(
                        pattern_iter_output_dir))
                    train_config.save(
                        os.path.join(pattern_iter_output_dir,
                                     "train_config.json"))
                    eval_config.save(
                        os.path.join(pattern_iter_output_dir,
                                     "eval_config.json"))
                    logger.info("Saving complete")

                    if save_unlabeled_logits:
                        logits = evaluate(wrapper,
                                          unlabeled_data,
                                          eval_config,
                                          local_rank=local_rank)["logits"]
                        save_logits(
                            os.path.join(pattern_iter_output_dir,
                                         "logits.txt"), logits)

                if not do_eval:
                    wrapper.model = None
                    wrapper = None
                    torch.cuda.empty_cache()

            # Evaluation
            if do_eval:
                logger.info("Starting evaluation...")
                try:
                    wrapper = TransformerModelWrapper.from_pretrained(
                        pattern_iter_output_dir)
                except OSError:
                    warnings.warn(
                        "No saved model found; proceeding with the current model instead of the best one"
                    )

                for split, eval_data in {
                        "dev": dev_data,
                        "test": test_data
                }.items():
                    if eval_data is None:
                        continue
                    eval_result = evaluate(wrapper,
                                           eval_data,
                                           eval_config,
                                           priming_data=train_data,
                                           local_rank=local_rank)

                    if local_rank in [-1, 0]:
                        save_predictions(
                            os.path.join(pattern_iter_output_dir,
                                         "predictions.jsonl"), wrapper,
                            eval_result)
                        save_logits(
                            os.path.join(pattern_iter_output_dir,
                                         "eval_logits.txt"),
                            eval_result["logits"])

                    scores = eval_result["scores"]
                    logger.info(
                        "--- {} result (pattern_id={}, iteration={}) ---".
                        format(split, pattern_id, iteration))
                    logger.info(scores)

                    results_dict[f"{split}_set_after_training"] = scores
                    with open(
                            os.path.join(pattern_iter_output_dir,
                                         "results.json"), "w") as fh:
                        json.dump(results_dict, fh)

                    for metric, value in scores.items():
                        results[split][metric][pattern_id].append(value)

                wrapper.model = None
                wrapper = None
                torch.cuda.empty_cache()

    if do_eval:
        logger.info("=== OVERALL RESULTS ===")
        results_to_log = _write_results(
            os.path.join(output_dir, "result_test.txt"), results)
    else:
        logger.info("=== ENSEMBLE TRAINING COMPLETE ===")
        results_to_log = None

    if do_train and not save_model:
        outputs = os.listdir(pattern_iter_output_dir)
        for item in outputs:
            if item.endswith(".bin"):
                os.remove(os.path.join(pattern_iter_output_dir, item))

    return results_to_log
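
The nested results structure built above boils down to the following (a sketch with invented scores):

    from collections import defaultdict

    # results[split][metric][pattern_id] -> one score per repetition
    results = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    results['dev']['acc'][1] += [0.72, 0.75, 0.74]
    print(sum(results['dev']['acc'][1]) / 3)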
Example #26
File: modeling.py Project: dwright37/pet
def train_pet_ensemble(model_config: WrapperConfig,
                       train_config: TrainConfig,
                       eval_config: EvalConfig,
                       pattern_ids: List[int],
                       output_dir: str,
                       ipet_data_dir: str = None,
                       repetitions: int = 3,
                       train_data: List[InputExample] = None,
                       unlabeled_data: List[InputExample] = None,
                       eval_data: List[InputExample] = None,
                       do_train: bool = True,
                       do_eval: bool = True,
                       save_unlabeled_logits: bool = False,
                       seed: int = 42):
    """
    Train and evaluate an ensemble of PET models without knowledge distillation.

    :param model_config: the model configuration to use
    :param train_config: the training configuration to use
    :param eval_config: the evaluation configuration to use
    :param pattern_ids: the ids of all PVPs to use
    :param output_dir: the output directory
    :param ipet_data_dir: optional directory containing additional training data for iPET
    :param repetitions: the number of training repetitions
    :param train_data: the training examples to use
    :param unlabeled_data: the unlabeled examples to use
    :param eval_data: the evaluation examples to use
    :param do_train: whether to perform training
    :param do_eval: whether to perform evaluation
    :param save_unlabeled_logits: whether logits for unlabeled examples should be saved in a file ``logits.txt``. This
           is required for both iPET and knowledge distillation.
    :param seed: the random seed to use
    """

    results = defaultdict(lambda: defaultdict(list))
    set_seed(seed)

    for pattern_id in pattern_ids:
        for iteration in range(repetitions):

            model_config.pattern_id = pattern_id
            results_dict = {}

            pattern_iter_output_dir = "{}/p{}-i{}".format(
                output_dir, pattern_id, iteration)

            if os.path.exists(pattern_iter_output_dir):
                logger.warning(
                    f"Path {pattern_iter_output_dir} already exists, skipping it..."
                )
                continue

            if not os.path.exists(pattern_iter_output_dir):
                os.makedirs(pattern_iter_output_dir)

            wrapper = init_model(model_config)

            # Training
            if do_train:
                if ipet_data_dir:
                    p = os.path.join(
                        ipet_data_dir,
                        'p{}-i{}-train.bin'.format(pattern_id, iteration))
                    ipet_train_data = InputExample.load_examples(p)
                    for example in ipet_train_data:
                        example.logits = None
                else:
                    ipet_train_data = None

                results_dict.update(
                    train_single_model(wrapper,
                                       train_data,
                                       train_config,
                                       eval_config,
                                       ipet_train_data=ipet_train_data,
                                       unlabeled_data=unlabeled_data))

                with open(os.path.join(pattern_iter_output_dir, 'results.txt'),
                          'w') as fh:
                    fh.write(str(results_dict))

                logger.info("Saving trained model at {}...".format(
                    pattern_iter_output_dir))
                wrapper.save(pattern_iter_output_dir)
                train_config.save(
                    os.path.join(pattern_iter_output_dir, 'train_config.json'))
                eval_config.save(
                    os.path.join(pattern_iter_output_dir, 'eval_config.json'))
                logger.info("Saving complete")

                if save_unlabeled_logits:
                    logits = evaluate(wrapper, unlabeled_data,
                                      eval_config)['logits']
                    save_logits(
                        os.path.join(pattern_iter_output_dir, 'logits.txt'),
                        logits)

                if not do_eval:
                    wrapper.model = None
                    wrapper = None
                    torch.cuda.empty_cache()

            # Evaluation
            if do_eval:
                logger.info("Starting evaluation...")
                if not wrapper:
                    wrapper = TransformerModelWrapper.from_pretrained(
                        pattern_iter_output_dir)

                eval_result = evaluate(wrapper,
                                       eval_data,
                                       eval_config,
                                       priming_data=train_data)

                save_predictions(
                    os.path.join(pattern_iter_output_dir, 'predictions.jsonl'),
                    wrapper, eval_result)
                save_logits(
                    os.path.join(pattern_iter_output_dir, 'eval_logits.txt'),
                    eval_result['logits'])

                scores = eval_result['scores']
                logger.info(
                    "--- RESULT (pattern_id={}, iteration={}) ---".format(
                        pattern_id, iteration))
                logger.info(scores)

                results_dict['test_set_after_training'] = scores
                with open(
                        os.path.join(pattern_iter_output_dir, 'results.json'),
                        'w') as fh:
                    json.dump(results_dict, fh)

                for metric, value in scores.items():
                    results[metric][pattern_id].append(value)

                wrapper.model = None
                wrapper = None
                torch.cuda.empty_cache()

    if do_eval:
        logger.info("=== OVERALL RESULTS ===")
        _write_results(os.path.join(output_dir, 'result_test.txt'), results)
    else:
        logger.info("=== ENSEMBLE TRAINING COMPLETE ===")
Example #27
File: petal.py Project: puraminy/pet
def main():
    parser = argparse.ArgumentParser()

    # required parameters
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory. The verbalizers are written to a file 'verbalizer.json' in this directory.",
    )
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the data files for the task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="The model type",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name",
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(PROCESSORS.keys()),
    )

    # verbalizer search hyperparameters
    parser.add_argument(
        "--normalize",
        action="store_true",
        help=
        "Whether to normalize the loss as proposed in the paper. It is recommended to set this to 'true'.",
    )
    parser.add_argument(
        "--combine_patterns",
        action="store_true",
        help=
        "If set to true, a single joint verbalizer is searched for all patterns",
    )
    parser.add_argument(
        "--num_candidates",
        default=1000,
        type=int,
        help=
        "The number of candidate tokens to consider as verbalizers (see Section 4.1 of the paper)",
    )
    parser.add_argument(
        "--words_per_label",
        default=10,
        type=int,
        help="The number of verbalizer tokens to assign to each label",
    )
    parser.add_argument(
        "--score_fct",
        default="llr",
        choices=["llr", "ce", "random"],
        help=
        "The function used to score verbalizers. Choices are: the log-likelihood ratio loss proposed in the paper "
        "('llr'), cross-entropy loss ('ce') and 'random', which assigns random tokens to each label.",
    )

    # other optional parameters
    parser.add_argument(
        "--train_examples",
        default=50,
        type=int,
        help=
        "The total number of train examples to use, where -1 equals all examples.",
    )
    parser.add_argument(
        "--pattern_ids",
        default=[0],
        type=int,
        nargs="+",
        help="The ids of the PVPs to be used",
    )
    parser.add_argument(
        "--max_seq_length",
        default=256,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--words_file",
        default=None,
        type=str,
        help=
        "Path to a file containing (unlabeled) texts from the task's domain. This text is used to compute "
        "verbalization candidates by selecting the most frequent words.",
    )
    parser.add_argument(
        "--max_words",
        default=10000,
        type=int,
        help=
        "Only the 10,000 tokens that occur most frequently in the task’s unlabeled data (see --words_file) are "
        "considered as verbalization candidates",
    )
    parser.add_argument(
        "--additional_input_examples",
        type=str,
        help=
        "An optional path to an additional set of input examples (e.g., obtained using iPET)",
    )
    parser.add_argument("--seed",
                        default=42,
                        type=int,
                        help="random seed for initialization")

    args = parser.parse_args()
    random.seed(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    with open(os.path.join(args.output_dir, "config.txt"),
              "w",
              encoding="utf8") as fh:
        json.dump(args.__dict__, fh, indent=2)

    # setup gpu/cpu
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()

    # prepare task
    args.task_name = args.task_name.lower()
    if args.task_name not in PROCESSORS:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = PROCESSORS[args.task_name]()
    args.label_list = processor.get_labels()
    args.cache_dir = ""
    args.do_lower_case = False
    args.verbalizer_file = None
    args.wrapper_type = "mlm"

    # get training data
    train_examples_per_label = (eq_div(args.train_examples, len(
        args.label_list)) if args.train_examples != -1 else -1)
    train_data = load_examples(
        args.task_name,
        args.data_dir,
        set_type=TRAIN_SET,
        num_examples_per_label=train_examples_per_label,
    )
    if args.additional_input_examples:
        additional_data = InputExample.load_examples(
            args.additional_input_examples)
        train_data += additional_data
        logger.info(
            f"Loaded {len(additional_data)} additional examples from {args.additional_input_examples}, total"
            f"training set size is now {len(train_data)}")

    expected = {
        label: np.array([1 if x.label == label else 0 for x in train_data])
        for label in args.label_list
    }

    if args.words_file:
        with open(args.words_file, "r", encoding="utf8") as fh:
            word_counts = Counter(fh.read().split())
    else:
        word_counts = None

    tokenizer_class = MODEL_CLASSES[args.model_type]["tokenizer"]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    word2idx = get_word_to_id_map(tokenizer,
                                  word_counts=word_counts,
                                  max_words=args.max_words)

    logits = []

    for pattern_id in args.pattern_ids:
        logger.info(f"Processing examples with pattern id {pattern_id}...")
        args.pattern_id = pattern_id

        config = WrapperConfig(
            model_type=args.model_type,
            model_name_or_path=args.model_name_or_path,
            wrapper_type="mlm",
            task_name=args.task_name,
            max_seq_length=args.max_seq_length,
            label_list=args.label_list,
            pattern_id=args.pattern_id,
        )

        wrapper = TransformerModelWrapper(config)
        wrapper.model.to(device)
        # modify all patterns so that they return a single text segment instead of two segments
        get_parts = wrapper.preprocessor.pvp.get_parts
        wrapper.preprocessor.pvp.get_parts = lambda example: (
            get_parts(example)[0] + get_parts(example)[1],
            [],
        )
        wrapper.preprocessor.pvp.convert_mlm_logits_to_cls_logits = lambda mask, x, _=None: x[
            mask >= 0]

        pattern_logits = wrapper.eval(
            train_data,
            device,
            per_gpu_eval_batch_size=args.per_gpu_eval_batch_size,
            n_gpu=args.n_gpu,
        )["logits"]
        pattern_logits = pattern_logits - np.expand_dims(
            np.max(pattern_logits, axis=1), axis=1)
        logits.append(pattern_logits)

    logger.info("Starting verbalizer search...")

    if args.combine_patterns:
        avs = AutomaticVerbalizerSearch(word2idx, args.label_list, logits,
                                        expected)
        verbalizer = avs.find_verbalizer(
            num_candidates=args.num_candidates,
            words_per_label=args.words_per_label,
            normalize=args.normalize,
            score_fct=args.score_fct,
        )
        verbalizers = {
            pattern_id: verbalizer
            for pattern_id in args.pattern_ids
        }

    else:
        verbalizers = {}
        for idx, pattern_id in enumerate(args.pattern_ids):
            avs = AutomaticVerbalizerSearch(word2idx, args.label_list,
                                            [logits[idx]], expected)
            verbalizers[pattern_id] = avs.find_verbalizer(
                num_candidates=args.num_candidates,
                words_per_label=args.words_per_label,
                normalize=args.normalize,
                score_fct=args.score_fct,
            )

    print(json.dumps(verbalizers, indent=2))
    logger.info("Verbalizer search complete, writing output...")

    with open(os.path.join(args.output_dir, "verbalizers.json"),
              "w",
              encoding="utf8") as fh:
        json.dump(verbalizers, fh, indent=2)

    logger.info("Done")