Example #1
 def __call__(self, texts_a: List[str]):
     input_features = []
     tokens = []
     mask_idxs = []
     for text_a in texts_a:
         encoded_dict = self.tokenizer.encode_plus(
             text=text_a,
             add_special_tokens=True,
             max_length=self.max_seq_length,
             pad_to_max_length=True,
             return_attention_mask=True,
             return_tensors='pt')
         curr_features = InputFeatures(
             input_ids=encoded_dict['input_ids'],
             attention_mask=encoded_dict['attention_mask'],
             token_type_ids=encoded_dict['token_type_ids'],
             label=None)
         input_features.append(curr_features)
         curr_tokens = self.tokenizer.convert_ids_to_tokens(
             encoded_dict['input_ids'][0])
         if self.return_tokens:
             tokens.append(curr_tokens)
         # Remember the position of the (last) [MASK] subtoken in this text.
         mask_idx = 0
         for i, token in enumerate(curr_tokens):
             if token == '[MASK]':
                 mask_idx = i
         mask_idxs.append(mask_idx)
     if self.return_tokens:
         return input_features, tokens, mask_idxs
     else:
         return input_features, mask_idxs
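
The same mask-lookup logic, sketched standalone so it can be tried outside the class (the 'bert-base-uncased' checkpoint and the sample text are assumptions for illustration):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded = tokenizer.encode_plus('The capital of France is [MASK].',
                                add_special_tokens=True,
                                max_length=16,
                                padding='max_length',
                                truncation=True)
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'])
# Index of the last [MASK] subtoken, mirroring the loop above.
mask_idx = max(i for i, t in enumerate(tokens) if t == tokenizer.mask_token)
print(mask_idx, tokens[mask_idx])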
Example #2
def convert_one_example_to_features(examples, tokenizer, max_length=512, pad_token=0, pad_token_segment_id=0, mask_padding_with_zero=True):
    
    features = []
    for (ex_index, example) in enumerate(examples):
        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
            truncate_first_sequence=False  # keep the first sequence; truncate the second in priority
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
        features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              label=None))
    return features
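
A minimal usage sketch for the function above (the checkpoint and texts are made up for illustration; recent tokenizers ignore the stale truncate_first_sequence kwarg with a warning):

from transformers import BertTokenizer
from transformers.data.processors.utils import InputExample

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
examples = [InputExample(guid='ex-0',
                         text_a='How old is the universe?',
                         text_b='About 13.8 billion years.')]
features = convert_one_example_to_features(examples, tokenizer, max_length=32,
                                           pad_token=tokenizer.pad_token_id)
print(len(features[0].input_ids))  # 32 after padding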
Example #3
def convert_examples_to_features(examples,
                                 tokenizer,
                                 max_length=512,
                                 label2id=None):
    logger.info("正在创建 features")
    features = []
    for (ex_index, example) in tqdm(enumerate(examples)):
        inputs = tokenizer.encode_plus(example.text_a,
                                       add_special_tokens=True,
                                       max_length=max_length,
                                       pad_to_max_length=True,
                                       truncation="longest_first")
        input_ids, token_type_ids = inputs["input_ids"], inputs[
            "token_type_ids"]
        attention_mask = inputs['attention_mask']
        input_len, att_mask_len, token_type_len = len(input_ids), len(
            attention_mask), len(token_type_ids)
        assert input_len == max_length, "input_ids 长度错误 {} vs {}".format(
            input_len, max_length)
        assert att_mask_len == max_length, "att_mask 长度错误 {} vs {}".format(
            att_mask_len, max_length)
        assert token_type_len == max_length, "token_type_ids 长度错误 {} vs {}".format(
            token_type_len, max_length)

        label = label2id[example.label]
        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    return features
Example #4
def _glue_convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.max_len

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" %
                        (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    # batch_encoding = tokenizer(
    #     [(example.text_a, example.text_b) for example in examples],
    #     max_length=max_length,
    #     padding="max_length",
    #     truncation=True,
    # )

    features = []
    for i in range(len(examples)):
        # inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        inputs = tokenizer.encode_plus(
            text=examples[i].text_a.split(" "),
            text_pair=examples[i].text_b.split(" ") if examples[i].text_b else None,
            max_length=max_length,
            padding="max_length",
            truncation=True)

        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features
Example #5
def create_input_feature(tokenizer, output_mode, example, max_length,
                         mask_padding_with_zero, pad_on_left, pad_token,
                         pad_token_segment_id, label_map):
    example = InputExample(
        example['id'], example['sentence1'],
        example['sentence2'] if 'sentence2' in example else None,
        example['label'])

    inputs = tokenizer.encode_plus(
        example.text_a,
        example.text_b,
        add_special_tokens=True,
        max_length=max_length,
        truncation_strategy='only_first'  # we're truncating the first sequence in priority
    )
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding_length = max_length - len(input_ids)
    if pad_on_left:
        input_ids = ([pad_token] * padding_length) + input_ids
        attention_mask = ([0 if mask_padding_with_zero else 1] *
                          padding_length) + attention_mask
        token_type_ids = ([pad_token_segment_id] *
                          padding_length) + token_type_ids
    else:
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + (
            [0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] *
                                           padding_length)

    assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
    assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
    assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

    if output_mode == "classification":
        label = label_map[example.label]
    elif output_mode == "regression":
        label = float(example.label)
    else:
        raise KeyError(output_mode)

    return InputFeatures(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         label=label)
Example #6
    def _convert_examples_to_features(self, examples):

        max_length = self.max_length
        pad_token = self.pad_token
        pad_token_segment_id = self.pad_token_segment_id
        mask_padding_with_zero = self.mask_padding_with_zero

        label_map = self.get_label_map()
        features = []
        for (ex_index, example) in enumerate(examples):
            len_examples = len(examples)
            inputs = self.tokenizer.encode_plus(example.text_a,
                                                example.text_b,
                                                add_special_tokens=True,
                                                max_length=max_length)
            input_ids, token_type_ids = inputs["input_ids"], inputs[
                "token_type_ids"]

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            attention_mask = [1 if mask_padding_with_zero else 0
                              ] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_length - len(input_ids)
            if self.pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                attention_mask = ([0 if mask_padding_with_zero else 1] *
                                  padding_length) + attention_mask
                token_type_ids = ([pad_token_segment_id] *
                                  padding_length) + token_type_ids
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                attention_mask = attention_mask + \
                    ([0 if mask_padding_with_zero else 1] * padding_length)
                token_type_ids = token_type_ids + \
                    ([pad_token_segment_id] * padding_length)

            assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
            assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
            assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
            label = label_map[example.label]
            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              label=label))
        return features
Example #7
def convert_examples_to_features(examples,
                                 tokenizer,
                                 label_map,
                                 max_length=512,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True):

    features = []
    for (ex_index, example) in enumerate(examples):
        len_examples = len(examples)
        if ex_index % 100 == 0:
            logging.info("converting example %d/%d" % (ex_index, len_examples))
        inputs = tokenizer.encode_plus(example.text_a,
                                       example.text_b,
                                       add_special_tokens=True,
                                       max_length=max_length)
        input_ids, token_type_ids = inputs["input_ids"], inputs[
            "token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] *
                              padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + \
                ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + \
                ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
        label = label_map[example.label]
        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))
    return features
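
The manual pad-and-mask bookkeeping in the last few examples predates tokenizer-side padding; on transformers v3+ a single call should yield the same three aligned lists (the checkpoint is an assumption):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tokenizer.encode_plus('first sentence', 'second sentence',
                            add_special_tokens=True,
                            max_length=128,
                            padding='max_length',  # right-pads ids, mask and segment ids
                            truncation=True)
assert len(enc['input_ids']) == len(enc['attention_mask']) == len(enc['token_type_ids']) == 128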
Example #8
    def __call__(self, texts_a: List[str], texts_b: Optional[List[str]] = None) -> Union[
            List[InputFeatures], Tuple[List[InputFeatures], List[List[str]]]]:
        """Tokenize and create masks.

        texts_a and texts_b are separated by [SEP] token

        Args:
            texts_a: list of texts,
            texts_b: list of texts, it could be None, e.g. single sentence classification task

        Returns:
            batch of :class:`transformers.data.processors.utils.InputFeatures` with subtokens, subtoken ids, \
                subtoken mask, segment mask, or tuple of batch of InputFeatures and batch of subtokens
        """

        if texts_b is None:
            texts_b = [None] * len(texts_a)

        input_features = []
        tokens = []
        for text_a, text_b in zip(texts_a, texts_b):
            encoded_dict = self.tokenizer.encode_plus(
                text=text_a, text_pair=text_b,
                add_special_tokens=True,
                max_length=self.max_seq_length,
                truncation=True,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt')

            if 'token_type_ids' not in encoded_dict:
                if self.add_token_type_ids:
                    input_ids = encoded_dict['input_ids']
                    seq_len = input_ids.size(1)
                    sep = torch.where(input_ids == self.tokenizer.sep_token_id)[1][0].item()
                    len_a = min(sep + 1, seq_len)
                    len_b = seq_len - len_a
                    encoded_dict['token_type_ids'] = torch.cat((torch.zeros(1, len_a, dtype=int),
                                                                torch.ones(1, len_b, dtype=int)), dim=1)
                else:
                    encoded_dict['token_type_ids'] = torch.tensor([0])

            curr_features = InputFeatures(input_ids=encoded_dict['input_ids'],
                                          attention_mask=encoded_dict['attention_mask'],
                                          token_type_ids=encoded_dict['token_type_ids'],
                                          label=None)
            input_features.append(curr_features)
            if self.return_tokens:
                tokens.append(self.tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'][0]))

        if self.return_tokens:
            return input_features, tokens
        else:
            return input_features
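
The token_type_ids reconstruction above targets tokenizers that return no segment ids (e.g. DistilBERT or RoBERTa); a standalone sketch of the same arithmetic, run with a BERT tokenizer only so that sep_token_id is defined (the checkpoint is an assumption):

import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tokenizer.encode_plus(text='a b', text_pair='c d', add_special_tokens=True,
                            max_length=16, padding='max_length', truncation=True,
                            return_tensors='pt')
input_ids = enc['input_ids']
seq_len = input_ids.size(1)
sep = torch.where(input_ids == tokenizer.sep_token_id)[1][0].item()
len_a = min(sep + 1, seq_len)
# Everything after the first [SEP], padding included, gets segment id 1 here;
# BERT itself pads token_type_ids with 0, but those positions are ignored via
# the attention mask anyway.
token_type_ids = torch.cat((torch.zeros(1, len_a, dtype=torch.long),
                            torch.ones(1, seq_len - len_a, dtype=torch.long)), dim=1)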
Example #9
def convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.max_len

    processor = THUCNewsProcessor()
    if label_list is None:
        label_list = processor.get_labels()
        logger.info("Using label list %s for task %s" % (label_list, task))
    if output_mode is None:
        output_mode = "classification"
        logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    batch_encoding = tokenizer(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}

        # https://github.com/huggingface/transformers/blob/master/src/transformers/data/processors/utils.py#L56
        # InputFeatures holds four fields: input_ids, attention_mask, token_type_ids and label
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features
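
The {k: batch_encoding[k][i] for k in batch_encoding} idiom used here (and in several later examples) simply slices one row out of the tokenizer's columnar output; a small sketch of what it yields (texts and checkpoint are made up):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
batch_encoding = tokenizer([('a b', 'c'), ('d', 'e f')],
                           max_length=8, padding='max_length', truncation=True)
# BatchEncoding iterates over its keys, e.g. input_ids, token_type_ids, attention_mask.
row0 = {k: batch_encoding[k][0] for k in batch_encoding}
# row0 is a plain dict of three length-8 lists describing the first pair,
# exactly what InputFeatures(**row0, label=...) expects.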
Example #10
 def __getitem__(self, index):
     if self.cache_mode == 'pickle':
         return self.instances[index]
     elif self.cache_mode == 'memmap':
         instance_data = {}
         for k in self.mem_maps:
             if k != "labels":
                 instance_data[k] = list(self.mem_maps[k][index])
         inputs = {k: instance_data[k] for k in instance_data}
         feature = InputFeatures(**inputs, label=float(self.mem_maps["labels"][index]))
         return feature
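
The memmap branch assumes per-field numpy memmaps built ahead of time; one plausible way to create them (file names, dtype and shapes are assumptions):

import numpy as np

n_examples, max_len = 1000, 128
mem_maps = {
    'input_ids': np.memmap('input_ids.dat', dtype=np.int32, mode='w+',
                           shape=(n_examples, max_len)),
    'attention_mask': np.memmap('attention_mask.dat', dtype=np.int32, mode='w+',
                                shape=(n_examples, max_len)),
    'labels': np.memmap('labels.dat', dtype=np.float32, mode='w+',
                        shape=(n_examples,)),
}
# Rows written into these arrays are disk-backed, so __getitem__ above can
# index them lazily without loading the whole dataset into memory.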
Example #11
    def _cache_instances_pickle(self):
        """
        Loads tensors into memory or creates the dataset when it does not exist already.
        """        
        signature = "weakly_supervised_pointwise_set_{}_n_cand_docs_{}_ns_sampler_{}_seq_max_l_{}_sample_{}_for_{}_using_{}".\
            format(self.data_partition,
                   self.negative_sampler.num_candidates_samples,
                   self.negative_sampler.name,
                   self.max_seq_len,
                   self.sample_data,
                   self.task_type,
                   self.tokenizer.__class__.__name__)
        path = self.cache_path + "/" + signature

        if os.path.exists(path):
            with open(path, 'rb') as f:
                logging.info("Loading instances from {}".format(path))
                self.instances = pickle.load(f)
        else:            
            logging.info("Generating instances with signature {}".format(signature))

            labels = []
            examples = []
            for idx, row in enumerate(tqdm(self.data.itertuples(index=False), total=len(self.data))):
                query = row[0]
                relevant_documents = row[1]
                for relevant_document in relevant_documents:
                    examples.append((query, relevant_document))
                    labels.append(1.0)
                ns_candidates, ns_scores, _, _, _ = self.negative_sampler.sample(query, relevant_documents)                

                for i, ns in enumerate(ns_candidates):
                    examples.append((query, ns))
                    labels.append(ns_scores[i])

            logging.info("Encoding examples using tokenizer.batch_encode_plus().")
            batch_encoding = self.tokenizer(examples, max_length=self.max_seq_len,
                                                      padding="max_length", truncation=True)

            logging.info("Transforming examples to instances format.")
            self.instances = []
            for i in range(len(examples)):
                inputs = {k: batch_encoding[k][i] for k in batch_encoding}                
                feature = InputFeatures(**inputs, label=labels[i])
                self.instances.append(feature)

            for idx in range(3):
                logging.info("Set {} Instance {} query \n\n{}[...]\n".format(self.data_partition, idx, examples[idx][0][0:200]))
                logging.info("Set {} Instance {} document \n\n{}\n".format(self.data_partition, idx, examples[idx][1][0:200]))
                logging.info("Set {} Instance {} features \n\n{}\n".format(self.data_partition, idx, self.instances[idx]))
            with open(path, 'wb') as f:
                pickle.dump(self.instances, f)

        logging.info("Total of {} instances were cached.".format(len(self.instances)))
Example #12
def roberta_convert_examples_to_tf_dataset(examples, tokenizer, tagset,
                                           max_length):
    features = []  # -> will hold InputFeatures to be converted later

    for e in examples:
        tokens = e["tokens"]
        labels = e["tags"]
        label_map = {label: i
                     for i, label in enumerate(tagset)}  # Tags to indexes

        # Tokenize subwords and propagate labels
        split_tokens, split_labels, idx_map = tokenizer.subword_tokenize(
            tokens, labels)

        # Create features
        input_ids = tokenizer.convert_tokens_to_ids(split_tokens)
        attention_mask = [1] * len(input_ids)
        label_ids = [label_map[label] for label in split_labels]

        # NB: label_ids are padded with 0, which is also a real tag index;
        # a dedicated pad label (e.g. -100 in PyTorch) would avoid the collision.
        padding = [0] * (max_length - len(input_ids))
        input_ids += padding
        attention_mask += padding
        label_ids += padding

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          label=label_ids))

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({
            "input_ids": tf.int32,
            "attention_mask": tf.int32
        }, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
            },
            tf.TensorShape([None]),
        ),
    )
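
Note that tokenizer.subword_tokenize is not a stock transformers method, so the example assumes a custom tokenizer wrapper. A hedged usage sketch for the generator-backed dataset (batch size and epochs are placeholders):

dataset = roberta_convert_examples_to_tf_dataset(examples, tokenizer, tagset,
                                                 max_length=128)
dataset = dataset.shuffle(1000).batch(32)
# model.fit(dataset, epochs=3)  # any Keras token-classification model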
Example #13
def _glue_convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.max_len

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))
    # Map label strings to ids
    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)
    # Collect the labels of all examples
    labels = [label_from_example(example) for example in examples]
    # Encode all examples to ids, padded or truncated to max_length
    batch_encoding = tokenizer(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    # Pack input_ids, attention_mask, token_type_ids and label into one InputFeatures object
    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)
    # Log the first 5 examples
    logger.info("*** First 5 examples ***")
    for i, example in enumerate(examples[:5]):
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features
Example #14
def bert_attribute_accuracy(targets,
                            predictions,
                            classifier_model,
                            tokenizer,
                            device,
                            attributes_origin=None,
                            batch_size=32):
    batch_encoding = tokenizer.batch_encode_plus(predictions,
                                                 max_length=tokenizer.max_len,
                                                 pad_to_max_length=True)

    features = []
    for i in range(len(predictions)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}

        feature = InputFeatures(**inputs)
        features.append(feature)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)

    # Data on TPU
    all_input_ids = all_input_ids.to(device)
    all_attention_mask = all_attention_mask.to(device)
    all_token_type_ids = all_token_type_ids.to(device)

    classifier_model.eval()

    with torch.no_grad():
        inputs = {
            "input_ids": all_input_ids,
            "attention_mask": all_attention_mask,
            "token_type_ids": all_token_type_ids
        }
        prediction_labels = torch.round(
            torch.sigmoid(classifier_model(**inputs)[0].squeeze(1)))

    prediction_labels = prediction_labels.detach().cpu().numpy()

    attributes_origin = np.array(attributes_origin)

    # A transfer counts as correct when the predicted attribute differs from the original one.
    correct = (prediction_labels != attributes_origin).astype(float)

    attribute_accuracy = correct.sum() / len(correct)

    return {"attribute_accuracy": attribute_accuracy}
Example #15
    def __call__(self, batch: List[List[str]]) -> List[List[InputFeatures]]:
        """Tokenize and create masks.

        Args:
            batch: list of elements where the first element represents the batch with contexts
                and the rest of elements represent response candidates batches

        Returns:
            list of feature batches with subtokens, subtoken ids, subtoken mask, segment mask.
        """

        if isinstance(batch[0], str):
            batch = [batch]

        cont_resp_pairs = []
        if len(batch[0]) == 1:
            contexts = batch[0]
            responses_empt = [None] * len(batch)
            cont_resp_pairs.append(zip(contexts, responses_empt))
        else:
            contexts = [el[0] for el in batch]
            for i in range(1, len(batch[0])):
                responses = []
                for el in batch:
                    responses.append(el[i])
                cont_resp_pairs.append(zip(contexts, responses))

        input_features = []

        for s in cont_resp_pairs:
            sub_list_features = []
            for context, response in s:
                encoded_dict = self.tokenizer.encode_plus(
                    text=context,
                    text_pair=response,
                    add_special_tokens=True,
                    max_length=self.max_seq_length,
                    pad_to_max_length=True,
                    return_attention_mask=True,
                    return_tensors='pt')

                curr_features = InputFeatures(
                    input_ids=encoded_dict['input_ids'],
                    attention_mask=encoded_dict['attention_mask'],
                    token_type_ids=encoded_dict['token_type_ids'],
                    label=None)
                sub_list_features.append(curr_features)
            input_features.append(sub_list_features)

        return input_features
Example #16
 def __getitem__(self, item):
     tweet = self.tweets[item]
     tokens = self.tokenizer.encode_plus(
         tweet,
         add_special_tokens=True,
         max_length=self.max_len,
         return_token_type_ids=False,
         pad_to_max_length=True,
         return_attention_mask=True,
         return_tensors='pt',
         truncation=True
     )
     return InputFeatures(input_ids=tokens['input_ids'].flatten().long().numpy().tolist(),
                          attention_mask=tokens['attention_mask'].flatten().long().numpy().tolist())
Example #17
def encode_sentences(df, with_context, with_section_names):
    """
    Encodes a list of sentences into BERT tokens and returns a list of feature-label combinations.
    InputFeatures contains input_ids, attention_masks, token_type_ids and the label.

    :param df: a data frame of features and labels.
    :param with_context: True, if the context (pre and post sentence) should be considered.
    :param with_section_names: True, if the section name should be considered.
    :return: a list of InputFeatures where each element is a feature-label combination.
    """
    features = []
    for entry in df.iterrows():
        sentence = get_context(entry) if with_context else entry[1]['sentence']
        if with_section_names:
            sentence = entry[1]['section_name'] + " " + sentence
        inputs = encode_sentence(sentence)
        features.append(InputFeatures(**inputs, label=int(entry[1]['used'])))

    return features
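
encode_sentence is not defined in this snippet; a plausible implementation consistent with InputFeatures(**inputs, label=...) above (the module-level tokenizer and max length are assumptions):

from transformers import BertTokenizer

TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')  # assumption
MAX_LENGTH = 128  # assumption

def encode_sentence(sentence):
    # Returns a mapping with input_ids, token_type_ids and attention_mask,
    # padded to MAX_LENGTH, i.e. exactly the fields InputFeatures accepts.
    return TOKENIZER.encode_plus(sentence,
                                 add_special_tokens=True,
                                 max_length=MAX_LENGTH,
                                 padding='max_length',
                                 truncation=True)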
Example #18
def convert_single_example_to_features(example, tokenizer, max_length=512, 
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    

    inputs = tokenizer.encode_plus(
        example.text_a,
        example.text_b,
        add_special_tokens=True,
        max_length=max_length,
        truncate_first_sequence=True  # We're truncating the first sequence in priority
    )
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

    attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding_length = max_length - len(input_ids)

    input_ids = input_ids + ([pad_token] * padding_length)
    attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
    token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

    assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
    assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
    assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

    # logger.info("*** Example ***")
    # logger.info("guid: %s" % (example.guid))
    # logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
    # logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
    # logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))

    feature = InputFeatures(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            label=None)

    return feature
Example #19
def _input_fn(texts, labels):
    features = []
    for text, label in zip(texts, labels):
        # the docs say add_prefix_space should be used with byte-level (RoBERTa/GPT-2 style) tokenizers
        inputs = TOKENIZER.encode_plus(text,
                                       None,
                                       add_special_tokens=True,
                                       max_length=DEFAULT_MAX_LEN,
                                       add_prefix_space=True)

        input_ids = inputs["input_ids"]
        attention_mask = [1] * len(input_ids)

        input_ids = _pad_with(input_ids, PAD_TOKEN, DEFAULT_MAX_LEN)
        attention_mask = _pad_with(attention_mask, PAD_TOKEN, DEFAULT_MAX_LEN)

        assert (len(input_ids) == DEFAULT_MAX_LEN)
        assert (len(attention_mask) == DEFAULT_MAX_LEN)

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          label=label))

    def gen():
        for f in features:
            yield ({
                'input_ids': f.input_ids,
                'attention_mask': f.attention_mask
            }, f.label)

    return tf.data.Dataset.from_generator(gen, ({
        'input_ids': tf.int32,
        'attention_mask': tf.int32
    }, tf.float32), ({
        'input_ids': tf.TensorShape([None]),
        'attention_mask': tf.TensorShape([None])
    }, tf.TensorShape([])))
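
_pad_with is assumed by the example but not shown; a minimal definition consistent with how it is called:

def _pad_with(values, pad_value, target_length):
    # Right-pad a list of ids (or mask bits) with pad_value up to target_length.
    return values + [pad_value] * (target_length - len(values))

Note that the example pads the attention mask with PAD_TOKEN as well, which is only correct when PAD_TOKEN is 0.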
Example #20
    def __call__(self, texts_a: List[str], texts_b: Optional[List[str]] = None) -> Union[
            List[InputFeatures], Tuple[List[InputFeatures], List[List[str]]]]:
        """Tokenize and create masks.

        texts_a and texts_b are separated by [SEP] token

        Args:
            texts_a: list of texts,
            texts_b: list of texts, it could be None, e.g. single sentence classification task

        Returns:
            batch of :class:`transformers.data.processors.utils.InputFeatures` with subtokens, subtoken ids, \
                subtoken mask, segment mask, or tuple of batch of InputFeatures and batch of subtokens
        """

        if texts_b is None:
            texts_b = [None] * len(texts_a)

        input_features = []
        tokens = []
        for text_a, text_b in zip(texts_a, texts_b):
            encoded_dict = self.tokenizer.encode_plus(
                text=text_a, text_pair=text_b, add_special_tokens=True, max_length=self.max_seq_length,
                pad_to_max_length=True, return_attention_mask=True, return_tensors='pt')
            curr_features = InputFeatures(input_ids=encoded_dict['input_ids'],
                                          attention_mask=encoded_dict['attention_mask'],
                                          token_type_ids=encoded_dict['token_type_ids'],
                                          label=None)
            input_features.append(curr_features)
            if self.return_tokens:
                tokens.append(self.tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'][0]))

        if self.return_tokens:
            return input_features, tokens
        else:
            return input_features
Example #21
    def return_answers(self, question, search_name=None,
                       min_score=None, max_length=128):
        """
        Searches texts for sentences that answer a question.
        
        Texts from the results of a specified search (or the whole corpus if no
        `search_name` is given) are split into sentences, and each sentence is
        scored on how likely it is to answer the given `question`: the higher
        the score, the more likely the sentence contains an answer to the
        given question. Results are returned as tuples in the
        following format:
        
            (text id, sentence number, sentence text, score)
            
        Parameters
        ----------
            
            question: str
                Question against which sentences are scored
            search_name: str, optional
                Name of the search to take results from. If None, the whole
                text corpus in the `texts` attribute is used
            min_score: int, optional
                The minimum score a sentence must receive to be returned in the
                output
            max_length: int, default 128
                The length of a sentence in tokens used by the Bert model to
                set the fixed-length input to the model
            
        Returns
        -------
        
            list of tuples: (str: text_id, str: sentence no,
                    str: sentence text, float: score)
        
        """

        if search_name is not None:
            search_texts_ids = self.search_results[search_name].ids
        else:
            search_texts_ids = self.texts.keys()

        print('=' * 100)
        print(f"Checking {len(search_texts_ids)} search results "
              f"for answers to {question}")

        # collect texts that correspond with ids from search 
        # and create (sentence, text_id) tuples
        search_texts = [self.texts[text_id] for text_id in search_texts_ids]
        sentence_tuples = self._split_text_to_sentences(search_texts_ids,
                                                        search_texts)
        # create input examples with question 
        # and sentence (potential answer) pairs 
        input_examples = []
        for sentence_tuple in sentence_tuples:
            text_id, sentence_no, sentence = sentence_tuple
            input_example = InputExample(
                guid = str(text_id) + '_' + str(sentence_no),
                text_a = question,
                text_b = sentence
            )
            input_examples.append(input_example)
        print("Inputs converted to BERT InputExamples")

        # take input examples and convert to input features with padding
        input_features = []
        for idx, example in enumerate(input_examples):
            inputs = self.tokenizer.encode_plus(
                example.text_a,
                example.text_b,
                add_special_tokens=True,
                max_length=max_length
            )
            input_ids = inputs["input_ids"]
            token_type_ids = inputs["token_type_ids"]
    
            attention_mask = [1] * len(input_ids)
            padding_length = max_length - len(input_ids)
            pad_token = self.tokenizer.convert_tokens_to_ids(
                [self.tokenizer.pad_token])[0]
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)

            input_features.append(
                InputFeatures(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            label=None)
            )
    
        print("InputExamples converted to InputFeatures")
        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in input_features], 
                                 dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask 
                                           for f in input_features], 
                                          dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids 
                                           for f in input_features], 
                                           dtype=torch.long)
        tensor_dataset = TensorDataset(all_input_ids, 
                                       all_attention_mask, 
                                       all_token_type_ids)
        print("InputFeatures converted to TensorDataset")

        # create dataloader to feed batches to torch model
        sampler = SequentialSampler(tensor_dataset)
        dataloader = DataLoader(tensor_dataset, 
                                 sampler=sampler, 
                                 batch_size=100)
        print("TensorDataset converted to torch DataLoader")
        print(f"Ranking {len(sentence_tuples)} possible answers from "
              f"{len(search_texts)} texts:", flush=True)
        # feed data to model and output logits 
        # i.e. [likelihood not answer, likelihood answer]
        all_logits = []

        with torch.no_grad():
            for batch in tqdm(dataloader, total=len(dataloader)):
                model_input = tuple(tensor.to(self.device) for tensor in batch)
                inputs = {'input_ids':      model_input[0],
                          'attention_mask': model_input[1]}
                batch_logits = self.model(**inputs)[0]
                if len(all_logits):
                    all_logits = np.concatenate([all_logits, 
                                                 batch_logits.cpu()])
                else:
                    all_logits = np.array(batch_logits.cpu())

        answer_score = all_logits[:,1] - all_logits[:,0]
        ranked_answers = answer_score.argsort()[::-1]

        answer_tuples = []
        for answer_idx in ranked_answers:
            if min_score is not None:
                if answer_score[answer_idx] < min_score:
                    break
            text_id, sentence_no, sentence = sentence_tuples[answer_idx]
            answer_tuples.append((text_id, 
                                  sentence_no, 
                                  sentence, 
                                  answer_score[answer_idx]))
        return answer_tuples
Example #22
def run_test(args):
    data = pd.read_csv(args.test_path, sep='\t')
    question_bank = pd.read_csv("%s/question_bank.tsv" % args.data_dir, sep="\t")
    all_documents = list(question_bank["question"].values[1:])
    examples = []
    for tid in data['topic_id'].unique():
        query = data.loc[data['topic_id']==tid, 'initial_request'].tolist()[0]
        for doc in all_documents:
            examples.append((query, doc))
    
    tokenizer = BertTokenizer.from_pretrained("%s/vocab.txt" % args.log_dir)
    batch_encoding = tokenizer.batch_encode_plus(
        examples, max_length=args.max_seq_len, truncation=True, pad_to_max_length=True)
    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=0)
        features.append(feature)
    dataset = SimpleDataset(features)
    data_collator = DefaultDataCollator()
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, collate_fn=data_collator.collate_batch)
    
    # load fine-tuned model
    model = BertForSequenceClassification.from_pretrained(args.log_dir)
    ranker = transformer_ranker.TransformerRanker(
        model=model, train_loader=None, val_loader=None, test_loader=None,
        num_ns_eval=None, task_type="classification", tokenizer=tokenizer,
        validate_every_epochs=1, num_validation_instances=-1,
        num_epochs=args.num_epochs, lr=args.lr, sacred_ex=None)
    _, _, softmax_output = ranker.predict(dataloader)
    softmax_output_by_query = utils.acumulate_list(softmax_output[0], len(all_documents))

    # save output
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if "dev" in args.test_path:
        run_file_path = "%s/dev_ranked_q.txt" % args.output_dir
    else:
        run_file_path = "%s/test_ranked_q.txt" % args.output_dir
    all_doc_ids = np.array(question_bank["question_id"].values[1:])
    with open(run_file_path, 'w') as fo:
        for tid_idx, tid in enumerate(data['topic_id'].unique()):
            all_documents_scores = np.array(softmax_output_by_query[tid_idx])
            print("tid:", tid)
            
            top_30_scores_idx = (-all_documents_scores).argsort()[:30]  
            preds_score = list(all_documents_scores[top_30_scores_idx])
            preds = list(all_doc_ids[top_30_scores_idx])
            #print("softmax_score:", preds_score)
            #print("preds:", preds)
            #query = data.loc[data['topic_id']==tid, 'initial_request'].tolist()[0]
            #best_q = get_best_q(query, question_bank)
            #best_qid = random.choice([best_q, "Q00001"])
            
            if preds_score[0] < 0.962:
                best_qid = "Q00001"
                preds = preds[:-1]
                preds.insert(0, best_qid)
            else:
                last_qid = "Q00001"
                preds = preds[:-1]
                preds.append(last_qid)
            for i, qid in enumerate(preds):    
                fo.write('{} 0 {} {} {} BERT-based-v2\n'.format(tid, qid, i, len(preds)-i))
    print("saved results to [%s]" % run_file_path)
Example #23
def xdoc_convert_examples_to_features(processor,
                                      examples,
                                      tokenizer,
                                      max_length,
                                      label_list,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):

    if label_list is None: label_list = processor.get_labels()

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % ex_index)
        inputs = tokenizer.encode_plus(example.text_a,
                                       example.text_b,
                                       add_special_tokens=True,
                                       max_length=max_length)
        input_ids, token_type_ids = inputs["input_ids"], inputs[
            "token_type_ids"]

        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        padding_length = max_length - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + (
            [0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] *
                                           padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        label = label_map[example.label]
        if ex_index < 3:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x)
                                                    for x in input_ids]))
            logger.info("attention_mask: %s" %
                        " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" %
                        " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    return features
Example #24
def convert_examples_to_features(examples, tokenizer,
                                      max_length=512,
                                      label_list=None,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True,
                                      sample_negatives=False):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)
        sample_negatives: If set to ``True``, additionally emits features whose ``text_b`` candidates are drawn
            from other, randomly chosen examples

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    if examples[0].text_b is not None:
        k = len(examples[0].text_b)
    if sample_negatives:
        neg_indices = [np.random.choice(len(examples), size=len(examples), replace=False) for i in range(k)]
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)

        if type(example.text_a) is list:
            text_a = example.text_a
            text_b = [example.text_b]*len(text_a)
        elif type(example.text_b) is list:
            text_b = example.text_b
            if sample_negatives:
                label_idx = label_map[example.label]
                text_b_neg = [(examples[neg_indices[i][ex_index]]).text_b[label_idx] for i in range(k)]
                text_b_neg[label_idx] = text_b[label_idx]

            text_a = [example.text_a]*len(text_b)
        else:
            text_a = [example.text_a]
            text_b = [example.text_b]

        if 0:  # debug: inspect created negative samples
            print('Created negative samples')
            print('Original example: label:{} text_a: {} text_b1: {}, 2: {}, 3:{}'.format(example.label, text_a[0], text_b[0], text_b[1], text_b[2]))
            print('Converted example: text_a: {} text_b1: {}, 2: {}, 3:{}'.format(text_a[0], text_b_neg[0], text_b_neg[1], text_b_neg[2]))

        def get_indices(t1, t2):
            out = []
            for a,b in zip(t1, t2):
                inputs = tokenizer.encode_plus(
                    a,
                    b,
                    add_special_tokens=True,
                    max_length=max_length,
                )
                input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

                # The mask has 1 for real tokens and 0 for padding tokens. Only real
                # tokens are attended to.
                attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

                # Zero-pad up to the sequence length.
                padding_length = max_length - len(input_ids)
                if pad_on_left:
                    input_ids = ([pad_token] * padding_length) + input_ids
                    attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
                    token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
                else:
                    input_ids = input_ids + ([pad_token] * padding_length)
                    attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
                    token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

                assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
                assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
                assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
                out.append((input_ids, attention_mask, token_type_ids))

            if len(t1) == 1:
                input_ids, attention_mask, token_type_ids = out[0]
            else:
                input_ids, attention_mask, token_type_ids = zip(*out)
            return input_ids, attention_mask, token_type_ids
        
        input_ids, attention_mask, token_type_ids = get_indices(text_a, text_b)
        if sample_negatives:
            input_ids_n, attention_mask_n, token_type_ids_n = get_indices(text_a, text_b_neg)

        label = label_map[example.label]

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              label=label))

        if sample_negatives:
            features.append(
                    InputFeatures(input_ids=input_ids_n,
                                  attention_mask=attention_mask_n,
                                  token_type_ids=token_type_ids_n,
                                  label=label))

    if is_tf_available() and is_tf_dataset:
        def gen():
            for ex in features:
                yield  ({'input_ids': ex.input_ids,
                         'attention_mask': ex.attention_mask,
                         'token_type_ids': ex.token_type_ids},
                        ex.label)

        return tf.data.Dataset.from_generator(gen,
            ({'input_ids': tf.int32,
              'attention_mask': tf.int32,
              'token_type_ids': tf.int32},
             tf.int64),
            ({'input_ids': tf.TensorShape([None]),
              'attention_mask': tf.TensorShape([None]),
              'token_type_ids': tf.TensorShape([None])},
             tf.TensorShape([])))

    return features
Example #25
def convert_examples_to_features(examples,
                                 tokenizer,
                                 max_length=512,
                                 task=None,
                                 label_list=None,
                                 output_mode=None,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    if task is not None:
        processor = processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        else:
            try:
                processor.set_labels(label_list)
            except AttributeError:
                # Not every processor implements set_labels(); keep the caller-provided labels.
                pass
        if output_mode is None:
            output_mode = output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)
            example = processor.tfds_map(example)

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,  # truncate explicitly so padding_length below is never negative
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              label=label))

    if is_tf_available() and is_tf_dataset:
        def gen():
            for ex in features:
                yield  ({'input_ids': ex.input_ids,
                         'attention_mask': ex.attention_mask,
                         'token_type_ids': ex.token_type_ids},
                        ex.label)

        return tf.data.Dataset.from_generator(gen,
            ({'input_ids': tf.int32,
              'attention_mask': tf.int32,
              'token_type_ids': tf.int32},
             tf.int64),
            ({'input_ids': tf.TensorShape([None]),
              'attention_mask': tf.TensorShape([None]),
              'token_type_ids': tf.TensorShape([None])},
             tf.TensorShape([])))

    return features
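Usage sketch (not part of the original example): calling the function above on hand-built InputExample objects, with label_list and output_mode supplied explicitly so no task registry is needed. The checkpoint name is an assumption.

from transformers import BertTokenizer, InputExample

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
examples = [
    InputExample(guid="dev-1", text_a="The cat sat.", text_b="A cat was sitting.", label="1"),
    InputExample(guid="dev-2", text_a="The cat sat.", text_b="Stocks fell today.", label="0"),
]
features = convert_examples_to_features(
    examples, tokenizer, max_length=64,
    label_list=["0", "1"], output_mode="classification")
print(features[0].input_ids[:10])  # first ten padded token ids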
Example #26
    def convert_examples_to_features(cls,
                                     examples,
                                     tokenizer,
                                     max_length,
                                     pad_token_segment_id,
                                     pad_token,
                                     mask_padding_with_zero=True):

        features = []
        for ex_index, example in enumerate(examples):
            inputs = tokenizer.encode_plus(
                example.text_a,
                None,
                add_special_tokens=True,
                max_length=max_length,
                truncation=True,  # truncate explicitly so padding below never goes negative
            )
            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            padding_length = max_length - len(input_ids)
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

            assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
            assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
            assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

            if ex_index < 3:
                logger.info("*** Example ***")
                logger.info("guid: %s" % (example.guid))
                logger.info("input_ids: %s" %
                            " ".join([str(x) for x in input_ids]))
                logger.info("attention_mask: %s" %
                            " ".join([str(x) for x in attention_mask]))
                logger.info("token_type_ids: %s" %
                            " ".join([str(x) for x in token_type_ids]))

            features.append(
                InputFeatures(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    label=None,
                ))

        return features
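A hedged sketch of consuming the unlabeled features returned above at inference time, assuming PyTorch; "Predictor" stands in for whatever class owns this method and is purely hypothetical.

import torch

features = Predictor.convert_examples_to_features(  # 'Predictor' is a hypothetical owner class
    examples, tokenizer, max_length=128, pad_token_segment_id=0, pad_token=0)
input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)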
Example #27
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):

    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label]

    feature = InputFeatures(input_ids=input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,  # use the padding mask computed above
                            label=label_id)
    return feature
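_truncate_seq_pair is called above but not shown. For completeness, a sketch matching the canonical helper from the original BERT code, which pops one token at a time from the longer sequence so truncation stays proportionate:

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    # Truncates a sequence pair in place until the combined length fits the budget.
    # Trimming the longer sequence one token at a time keeps the truncation
    # proportionate when one sequence is much longer than the other.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()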
Example #28
def fever_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: FEVER task
        label_list: List of labels. Can be obtained from the processor using the
            ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or
            ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left
            rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is
            usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be
            filled by ``1`` for actual values and by ``0`` for padded values. If
            set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Yields:
        Task-specific ``InputFeatures`` which can be fed to the model. Note that
        this function is a generator; wrap it in ``list()`` to materialize the features.

    """
    if task is not None:
        processor = fever_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = fever_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    for (ex_index, example) in enumerate(examples):
        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,  # truncate explicitly so padding below never goes negative
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if output_mode == "classification":
            label_map = {label: i for i, label in enumerate(label_list)}
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        yield InputFeatures(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            label=label,
        )
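Since fever_convert_examples_to_features yields rather than returns, it must be materialized when a list is required. A brief sketch; the three-way label set shown is the standard FEVER one and may differ from this repository's processor.

features = list(fever_convert_examples_to_features(
    examples, tokenizer, max_length=128,
    label_list=["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"],
    output_mode="classification"))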
Example #29
 def __getitem__(self, idx):
     inputs = {k: self.batch_encoding[k][idx] for k in self.batch_encoding}
     return InputFeatures(**inputs, label=self.batch_labels[idx])
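A minimal sketch of the kind of dataset class this __getitem__ presumably lives in; everything beyond the two attributes visible above (class name, constructor signature, checkpoint) is an assumption.

from torch.utils.data import Dataset
from transformers import BertTokenizerFast, InputFeatures

class SimpleClassificationDataset(Dataset):  # hypothetical wrapper class
    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Tokenize everything up front; __getitem__ then just indexes into the batch encoding.
        self.batch_encoding = tokenizer(texts, max_length=max_length,
                                        padding="max_length", truncation=True)
        self.batch_labels = labels

    def __len__(self):
        return len(self.batch_labels)

    def __getitem__(self, idx):
        inputs = {k: self.batch_encoding[k][idx] for k in self.batch_encoding}
        return InputFeatures(**inputs, label=self.batch_labels[idx])

# Usage:
# tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")  # assumed checkpoint
# dataset = SimpleClassificationDataset(["good movie", "bad movie"], [1, 0], tokenizer)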
Example #30
    def _cache_instances_pickle(self):
        """
        WARNING: This is no longer being used as a cache method and memmap should be prefered
        Loads tensors into memory or creates the dataset when it does not exist already.        
        """        
        signature = "pointwise_set_{}_n_cand_docs_{}_ns_sampler_{}_seq_max_l_{}_sample_{}_for_{}_using_{}.pk".\
            format(self.data_partition,
                   self.negative_sampler.num_candidates_samples,
                   self.negative_sampler.name,
                   self.max_seq_len,
                   self.sample_data,
                   self.task_type,
                   self.tokenizer.__class__.__name__)
        path = self.cache_path + "/" + signature

        if os.path.exists(path):
            with open(path, 'rb') as f:
                logging.info("Loading instances from {}".format(path))
                self.instances = pickle.load(f)
        else:
            logging.info("Generating instances with signature {}".format(signature))

            # Creating labels (currently only binary relevance is supported)
            if self.task_type == "classification":
                relevant_label = 1
                not_relevant_label = 0
            elif self.task_type == "generation":
                relevant_label = "relevant </s>"
                not_relevant_label = "not_relevant  </s>"
            labels = []
            for r in self.data.itertuples(index=False):
                labels += [relevant_label] * len(r[1])  # relevant documents are grouped in the second column
                labels += [not_relevant_label] * self.negative_sampler.num_candidates_samples  # each query has N negative samples

            examples = []
            for idx, row in enumerate(tqdm(self.data.itertuples(index=False), total=len(self.data))):
                query = row[0]
                relevant_documents = row[1]
                for relevant_document in relevant_documents:
                    examples.append((query, relevant_document))
                ns_candidates, _, _, _, _ = self.negative_sampler.sample(query, relevant_documents)
                for ns in ns_candidates:
                    examples.append((query, ns))

            logging.info("Encoding examples using tokenizer.batch_encode_plus().")
            batch_encoding = self.tokenizer(examples, max_length=self.max_seq_len,
                                                      padding="max_length", truncation=True)

            if self.task_type == "generation": 
                target_encodings = self.tokenizer(labels,
                    max_length=10, padding="max_length", truncation=True)
                target_encodings = {
                        "target_ids": target_encodings["input_ids"],
                        "target_attention_mask": target_encodings["attention_mask"]
                    }

            logging.info("Transforming examples to instances format.")
            self.instances = []
            for i in range(len(examples)):
                inputs = {k: batch_encoding[k][i] for k in batch_encoding}
                if self.task_type == "generation":
                    targets = {k: target_encodings[k][i] for k in target_encodings}
                    feature = {**inputs, **targets}
                elif self.task_type == "classification":
                    feature = InputFeatures(**inputs, label=labels[i])
                self.instances.append(feature)
            for idx in range(3):
                logging.info("Set {} Instance {} query \n\n{}[...]\n".format(self.data_partition, idx, examples[idx][0][0:200]))
                logging.info("Set {} Instance {} document \n\n{}\n".format(self.data_partition, idx, examples[idx][1][0:200]))
                logging.info("Set {} Instance {} features \n\n{}\n".format(self.data_partition, idx, self.instances[idx]))
            with open(path, 'wb') as f:
                pickle.dump(self.instances, f)

        logging.info("Total of {} instances were cached.".format(len(self.instances)))