def create_examples(self, datas, set_type):
    """Turn positional rows into single-sentence ``InputExample`` objects.

    For every row, index 0 becomes the guid and index 1 (stripped of
    surrounding whitespace) becomes ``text_a``. When ``set_type`` is
    ``'test'`` the label is ``None``; otherwise index 2 is normalised with
    ``str(int(...))``. ``text_b`` is always ``None``.
    """
    is_test = set_type == 'test'
    return [
        InputExample(guid=row[0],
                     text_a=row[1].strip(),
                     text_b=None,
                     label=None if is_test else str(int(row[2])))
        for row in datas
    ]
Exemplo n.º 2
0
    def similarity_with_concepts(self, text, concepts):
        """Score ``text`` against every entry of ``concepts`` in one batch.

        One ``InputExample`` (guid ``'0'``) is built per concept with ``text``
        as ``text_a`` and the concept as ``text_b``. The examples are encoded
        with ``glue_convert_examples_to_features`` in regression mode
        (``max_length=128``) and passed through the model without gradient
        tracking.

        Returns the first row of the transposed first model output, as a
        plain Python list.

        NOTE(review): ``tokenizer`` and ``model`` are resolved from the
        enclosing/module scope here, not from ``self`` -- confirm that is
        intended.
        """
        pairs = []
        for concept in concepts:
            pairs.append(InputExample(guid='0', text_a=text, text_b=concept))

        encoded = glue_convert_examples_to_features(examples=pairs,
                                                    tokenizer=tokenizer,
                                                    max_length=128,
                                                    output_mode='regression',
                                                    label_list=[None])

        ids = torch.tensor([item.input_ids for item in encoded])
        mask = torch.tensor([item.attention_mask for item in encoded])

        with torch.no_grad():
            first_output = model(input_ids=ids, attention_mask=mask)[0]
            scores = first_output.T.tolist()[0]

        return scores
Exemplo n.º 3
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training, dev and test sets."""
     examples = []
     text_index = 0
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % (set_type, i)
         text_a = line[text_index]
         label = line[1]
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     return examples
Exemplo n.º 4
0
    def _create_examples_adv(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            # if i == 0:
            #     continue
            guid = "%s-%s" % (set_type, i)
            text_a = line
            label = "0"  ## label here doesn't matter
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=None,
                             label=label))

        return examples
Exemplo n.º 5
0
    def _create_examples(self, lines_in, set_type):
        """Creates examples for the training, dev and test sets."""

        examples = []
        for i, line in enumerate(lines_in[1:]):

            guid = "%s-%s" % (set_type, i)
            text_a = line[1]
            text_b = ''
            label = None if set_type == "predict" else line[0]
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=text_b,
                             label=label))
        return examples
Exemplo n.º 6
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % (set_type, line[0])
         text_a = line[7]
         text_b = line[8]
         label = line[-1]
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Exemplo n.º 7
0
    def similarity_with_concept(self, text, concept):
        """Score a single ``(text, concept)`` pair with ``self.model``.

        The pair is wrapped in one ``InputExample`` (guid ``'0'``), encoded
        with ``glue_convert_examples_to_features`` in regression mode
        (``max_length=128``) using ``self.tokenizer``, and run through the
        model without gradient tracking.

        Returns the first model output converted with ``.item()``.
        """
        pair = InputExample(guid='0', text_a=text, text_b=concept)
        encoded = glue_convert_examples_to_features(examples=[pair],
                                                    tokenizer=self.tokenizer,
                                                    max_length=128,
                                                    output_mode='regression',
                                                    label_list=[None])
        only_feature = encoded[0]

        # Add a leading dimension of size 1 so the single example forms a batch.
        ids = torch.tensor(only_feature.input_ids).unsqueeze(0)
        mask = torch.tensor(only_feature.attention_mask).unsqueeze(0)

        with torch.no_grad():
            result = self.model(input_ids=ids, attention_mask=mask)

        score = result[0].item()
        return score
Exemplo n.º 8
0
def load_pr_curve_data(world, split):
    """Build one example per id pair returned by ``world.GetDPRCP``.

    ``GetDPRCP`` is called with ``test=True`` only when ``split`` is
    ``'test'``. Each id is resolved through ``world.id_to_text``; the guid is
    ``"<split>-<index>-pr-<qid1>-<qid2>"`` and every label is ``'0'``.

    Returns a tuple ``(examples, tags)`` where ``tags`` is passed through
    unchanged from ``GetDPRCP``.
    """
    pairs, tags = world.GetDPRCP(test=(split == 'test'))

    def _lookup(qid):
        # One QQP question is '' which breaks the RoBERTa tokenizer;
        # substitute a single space for any empty text.
        return world.id_to_text[qid] or ' '

    examples = []
    for idx, (qid1, qid2) in enumerate(pairs):
        first = _lookup(qid1)
        second = _lookup(qid2)
        guid = '{}-{}-pr-{}-{}'.format(split, idx, qid1, qid2)
        examples.append(
            InputExample(guid=guid, text_a=first, text_b=second, label='0'))
    return examples, tags
Exemplo n.º 9
0
 def get_train_examples(self, data_dir):
     """Load training examples from ``doordash_categorized.pkl`` in ``data_dir``.

     The pickle is expected to unpickle to an iterable of mappings. For
     each entry, ``entry['key']`` is the guid, ``entry['title']`` is
     ``text_a`` and ``entry['labels']['category']`` is the label. Entries
     whose label is in ``self.excluded_labels`` are dropped. The resolved
     file path is printed before loading.
     """
     pickle_path = os.path.join(data_dir, 'doordash_categorized.pkl')
     print(pickle_path)
     with open(pickle_path, 'rb') as handle:
         entries = pickle.load(handle)

     kept = []
     for record in entries:
         guid = record['key']
         title = record['title']
         category = record['labels']['category']
         if category not in self.excluded_labels:
             kept.append(
                 InputExample(guid=guid, text_a=title, label=category))
     return kept
Exemplo n.º 10
0
 def _create_examples(self, lines, set_type):
     examples = []
     for (_, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, line[0])
         try:
             text_a = line[1].lower()
             text_b = line[2].lower()
             label = line[3]
         except IndexError:
             print('cannot read the line: ' + line)
             continue
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Exemplo n.º 11
0
def create_examples(filename):
    """Read a CSV file and return unlabeled single-sentence examples.

    The file is loaded with ``pd.read_csv`` and converted to a list of
    rows. Column 1 of each row becomes the guid and column 2 (stripped of
    surrounding whitespace) becomes ``text_a``. ``text_b`` and ``label``
    are always ``None``.
    """
    rows = pd.read_csv(filename).values.tolist()
    return [
        InputExample(guid=row[1],
                     text_a=row[2].strip(),
                     text_b=None,
                     label=None)
        for row in rows
    ]
Exemplo n.º 12
0
 def _create_examples(self, lines: List[List[str]],
                      set_type: str) -> List[InputExample]:
     """Build sentence-pair examples from rows, ignoring the first row.

     The guid is ``"<set_type>-<row index>"``; columns 5 and 6 provide
     ``text_a`` and ``text_b``; the label is column 0 passed through
     ``self._preprocess_label``.
     """
     return [
         InputExample(guid="%s-%s" % (set_type, row_no),
                      text_a=row[5],
                      text_b=row[6],
                      label=self._preprocess_label(row[0]))
         for row_no, row in enumerate(lines)
         if row_no != 0  # row 0 is skipped (header)
     ]
Exemplo n.º 13
0
    def _get_test_or_dev_helper(self, data_dir, stage):
        if stage == "dev":
            language = self.dev_language
        elif stage == "test":
            language = self.test_language

        rows = self._read_xnli_tsv(
            os.path.join(data_dir, "XNLI-1.0", f"xnli.{stage}.tsv"))
        examples = [
            InputExample(
                guid=f"{stage}-{i}",
                text_a=row["sentence1"],
                text_b=row["sentence2"],
                label=row["gold_label"],
            ) for i, row in enumerate(rows) if row["language"] == language
        ]
        return examples
Exemplo n.º 14
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         text_a = line[0]
         text_b = line[1]
         if "imho" in text_a or "imo" in text_a:  #Gengyu: shoud I remove this two words from data
             label = "1"
         else:
             label = "0"
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Exemplo n.º 15
0
    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, line[0])
            text_a = line[1]
            text_b = line[2]
            label = self.get_labels()[0] if set_type == 'test' else line[-1]

            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=text_b,
                             label=label))
        return examples
Exemplo n.º 16
0
 def _create_examples(self, lines, set_type):
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % (set_type, i)
         if set_type == 'test':
             text_a = line[1]
             label = '-1'
         else:
             label = line[0]
             text_a = line[1]
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     return examples
Exemplo n.º 17
0
 def get_train_examples(self, data_dir):
     """Load the translated MultiNLI training rows for ``self.train_language``.

     Rows are read with ``self._read_xnli_tsv`` from
     ``<data_dir>/XNLI-MT-1.0/multinli/multinli.train.<language>.tsv``.
     ``row["premise"]`` and ``row["hypo"]`` become ``text_a`` / ``text_b``.
     The label ``"contradictory"`` is rewritten to ``"contradiction"``;
     every other label is kept as is. The guid is ``"train-<row index>"``.
     """
     train_path = os.path.join(
         data_dir,
         "XNLI-MT-1.0",
         "multinli",
         f"multinli.train.{self.train_language}.tsv",
     )
     examples = []
     for idx, row in enumerate(self._read_xnli_tsv(train_path)):
         label = row["label"]
         if label == "contradictory":
             label = "contradiction"
         examples.append(
             InputExample(
                 guid=f"train-{idx}",
                 text_a=row["premise"],
                 text_b=row["hypo"],
                 label=label,
             ))
     return examples
 def _create_examples(self, lines, set_type):
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % (set_type, line[0])
         text_a = line[1]
         text_b = line[2]
         if set_type == "test":
             label = -1
         else:
             label = line[0]
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Exemplo n.º 19
0
 def _create_examples(self, lines: List[List[str]],
                      set_type: str) -> List[InputExample]:
     """Build sentence-pair examples from rows, ignoring the first row.

     The guid is ``"<set_type>-<column 0>"``; columns 8 and 9 provide
     ``text_a`` and ``text_b``. When ``set_type`` starts with ``"test"``
     the label is ``None``; otherwise the last column is passed through
     ``self._preprocess_label``.
     """
     unlabeled = set_type.startswith("test")
     examples = []
     for row_no, row in enumerate(lines):
         if row_no == 0:
             # row 0 is skipped (header)
             continue
         guid = "%s-%s" % (set_type, row[0])
         first, second = row[8], row[9]
         if unlabeled:
             label = None
         else:
             label = self._preprocess_label(row[-1])
         examples.append(
             InputExample(guid=guid,
                          text_a=first,
                          text_b=second,
                          label=label))
     return examples
Exemplo n.º 20
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training, dev and test sets."""
     test_mode = set_type == "test"
     q1_index = 1 if test_mode else 3
     q2_index = 2 if test_mode else 4
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % (set_type, line[0])
         try:
             text_a = line[q1_index]
             text_b = line[q2_index]
             label = None if test_mode else line[5]
         except IndexError:
             continue
         examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
     return examples
Exemplo n.º 21
0
 def _create_examples(self, set_type):
     """Build sentence-pair examples from the ``'anli'`` dataset split ``set_type``.

     The dataset is fetched with ``nlp.load_dataset('anli')`` on every
     call. Each record's ``'premise'`` / ``'hypothesis'`` become
     ``text_a`` / ``text_b``, the guid is ``"<set_type>-<uid>"`` and the
     integer ``'label'`` is mapped to its name (0 entailment, 1 neutral,
     2 contradiction). Labels are mapped for every split, test included.
     """
     from nlp import load_dataset

     label_names = {
         0: "entailment",
         1: "neutral",
         2: "contradiction"
     }
     split = load_dataset('anli')[set_type]
     return [
         InputExample(guid="%s-%s" % (set_type, record['uid']),
                      text_a=record['premise'],
                      text_b=record['hypothesis'],
                      label=label_names[record['label']])
         for record in split
     ]
    def _rows2examples(self, rows):
        """Convert raw rows into examples of at most two Spanish sentences.

        Column 1 of each row holds the text serialised as the source form of
        a Python bytes literal; it is parsed, decoded and split into
        sentences with NLTK's Spanish punkt tokenizer. The first sentence
        becomes the example's second positional argument and the second
        sentence (or ``None`` when there is only one) the third. Column 0 is
        the label. The guid is ``"test-<row index>"``.

        Args:
            rows: iterable of indexable rows (label at 0, text at 1).

        Returns:
            A list of ``InputExample``.
        """
        # Local import: only this method needs it, and the file's import
        # block is not visible from here.
        import ast

        # Spanish sentence tokenizer
        tokenizer = nltk.data.load("tokenizers/punkt/PY3/spanish.pickle")
        examples = []
        logger.info("Reading examples")
        for i, row in enumerate(tqdm(rows)):
            # The text column was saved as a string with the Python syntax
            # for bytes literals, so it must be parsed back into bytes.
            # ast.literal_eval accepts literals only, so a crafted cell
            # cannot execute arbitrary code the way eval() would.
            text = ast.literal_eval(row[1]).decode()
            tokens = tokenizer.tokenize(text)
            # NOTE(review): tokens[0] raises IndexError if the tokenizer
            # yields no sentences for a row -- confirm inputs are non-empty.
            example = InputExample(
                f"test-{i}",
                tokens[0],
                tokens[1] if len(tokens) > 1 else None,
                label=row[0],
            )
            examples.append(example)

        return examples
 def _create_test_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples, rows = [], []
     for (_, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, line[0])
         try:
             text_a = line[1].lower()
             text_b = line[2].lower()
             label = line[3]
         except IndexError:
             print('cannot read the line: ' + line)
             continue
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
         rows.append(line)
     return examples, rows
Exemplo n.º 24
0
 def _load_data(self, dir, set_type):
     examples = []
     for filename in os.listdir(dir):
         if filename.endswith(".ann"):
             lines = self._read_tsv(os.path.join(dir, filename))
             #print(os.path.join(directory, filename))
             for (i, line) in enumerate(lines):
                 #line = line.split("\t")
                 if line[0].startswith("T"):
                     guid = "%s-%s" % (set_type, len(examples))
                     text_a = line[2].replace("CMV:", "").strip()
                 if line[0].startswith("A"):
                     label = line[1].split(" ")[2].strip()
                     examples.append(
                         InputExample(guid=guid,
                                      text_a=text_a,
                                      text_b=None,
                                      label=label))
     return examples
Exemplo n.º 25
0
    def _read_tfds_and_create_examples(self, mode):
        """Load one TFDS split and append its examples to ``self.examples[mode]``.

        The dataset ``self.dataset_name`` is loaded with ``tfds.load`` using
        ``mode`` as the split. For each entry, ``self.text_a`` is decoded as
        UTF-8; ``self.text_b`` is decoded the same way when it is one of the
        dataset's features, otherwise it is ``None``. The label is converted
        to its string name with the feature's ``int2str``. Every label seen
        is merged into ``self.labels``.

        The guid is the entry's ``self.guid`` feature when present;
        otherwise it falls back to the running example index. The fallback
        used to assign the builtin ``id`` function itself, so every such
        example shared the same non-identifying guid.

        Raises:
            RuntimeError: if ``tensorflow_datasets`` cannot be imported.
            ValueError: if ``tfds.load`` raises ``KeyError`` for the dataset
                name (the original error is chained as the cause).
        """
        if not is_tfds_available():
            raise RuntimeError(
                "The package tensorflow_datasets can't be imported")

        import tensorflow_datasets as tfds
        try:
            ds, dsinfo = tfds.load(self.dataset_name,
                                   split=mode,
                                   with_info=True)
        except KeyError as err:
            raise ValueError(
                "The dataset {} does not exists in the tensorflow_datasets catalog."
                .format(self.dataset_name)) from err

        # The feature schema does not change while iterating, so resolve
        # these lookups once instead of once per example.
        feature_names = list(dsinfo.features.keys())
        has_guid = self.guid in feature_names
        has_text_b = self.text_b in feature_names
        label_feature = dsinfo.features[self.label]

        seen_labels = set()

        for ex_index, entry in enumerate(ds):
            if ex_index % 10000 == 0:
                logger.info("Creating example %d", ex_index)

            if has_guid:
                guid = entry[self.guid].numpy()
            else:
                guid = ex_index

            if has_text_b:
                text_b = entry[self.text_b].numpy().decode("utf-8")
            else:
                text_b = None

            label = label_feature.int2str(entry[self.label].numpy())
            seen_labels.add(label)

            example = InputExample(guid,
                                   entry[self.text_a].numpy().decode("utf-8"),
                                   text_b, label)

            self.examples[mode].append(example)

        self.labels = list(set(self.labels).union(seen_labels))
Exemplo n.º 26
0
 def get_dev_examples(self, data_dir):
     """Read ``dev.jsonl`` from ``data_dir`` into sentence-pair examples.

     Each line is one JSON object with ``'q1'``, ``'q2'`` and ``'label'``
     keys. The label must be the string ``'1'`` or ``'0'`` and is mapped
     to ``"paraphrase"`` / ``"not-paraphrase"``. The guid is
     ``"<line index>-dev"``. The first ten examples are printed.
     """
     dev_path = os.path.join(data_dir, "dev.jsonl")
     examples = []
     with open(dev_path, "r") as f:
         for i, raw in enumerate(f):
             record = json.loads(raw.replace("\n", ""))
             guid = "%s-%s" % (i, "dev")
             text_a = record['q1']
             text_b = record['q2']
             label = record['label']
             assert label in ('1', '0')
             if label == "1":
                 label = "paraphrase"
             else:
                 label = "not-paraphrase"
             assert isinstance(text_a, str), f"Training input {text_a} is not a string"
             assert isinstance(text_b, str), f"Training input {text_b} is not a string"
             assert isinstance(label, str), f"Training label {label} is not a string"
             example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
             if i < 10:
                 # Echo the first few examples for a quick sanity check.
                 print(example)
             examples.append(example)
     return examples
Exemplo n.º 27
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         # if i == 0:
         #     continue
         guid = "%s-%s" % (set_type, i)
         if len(line) < 2:
             continue
         text_a = line[0]
         label = line[1]
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     # if set_type == "train":
     #     random.shuffle(examples)
     # print ("Examples are shuffled")
     return examples
Exemplo n.º 28
0
        def _create_examples(self, data, set_type):

            examples = []

            raw_texts = data.tweet.values.tolist()
            raw_labels = data.label.values.tolist()

            for i in range(0, len(raw_texts)):
                guid = "%s-%s" % (set_type, i)
                raw_text = raw_texts[i]
                raw_label = raw_labels[i]
                label = raw_label

                text = self._preprocess_text(raw_text)
                examples.append(
                    InputExample(guid=guid,
                                 text_a=text,
                                 text_b=None,
                                 label=label))

            return examples
Exemplo n.º 29
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         # guid = "%s-%s" % (set_type, i)
         guid = line[0]
         if set_type == 'test':
             # text_a = line[1]
             text_a = line[2]
             label = None
         else:
             text_a = line[2]
             label = line[1]
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     return examples
Exemplo n.º 30
0
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         try:
             if i == 0:
                 continue
             guid = "%s-%s" % (set_type, line[0])
             text_a = line[1]
             text_b = None
             label = line[0]
             examples.append(
                 InputExample(guid=guid,
                              text_a=text_a,
                              text_b=text_b,
                              label=label))
         except Exception as e:
             import pdb
             pdb.set_trace()
             print(e)
     return examples