def batcher(params, batch):
    # Each sentence arrives as a list of tokens; map empty sentences to a
    # single '.' and join the tokens back into whitespace-separated strings.
    batch = [sent if sent != [] else ['.'] for sent in batch]
    batch = [' '.join(sent) for sent in batch]
    examples = []
    unique_id = 0
    for sent in batch:
        sent = sent.strip()
        text_b = None
        text_a = sent
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1

    features = convert_examples_to_features(examples,
                                            params['bert'].seq_length,
                                            tokenizer)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)

    # Forward pass: returns the hidden states of every encoder layer plus the
    # pooled [CLS] output (pytorch-pretrained-bert style interface).
    all_encoder_layers, _ = params['bert'](all_input_ids,
                                           token_type_ids=None,
                                           attention_mask=all_input_mask)

    # Get the z vectors: take the output of the previous encoder layer, i.e.
    # the input to layer `layer_no`, whose self-attention is applied below.
    prev_out = all_encoder_layers[params['bert'].layer_no - 1]

    extended_attention_mask = all_input_mask.cuda().unsqueeze(1).unsqueeze(2)
    extended_attention_mask = extended_attention_mask.to(
        dtype=next(params['bert'].parameters()).dtype)
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

    # Run layer `layer_no`'s self-attention module on prev_out to get its
    # attention output. params['bert'] appears to wrap the BertModel one level
    # down (e.g. via DataParallel), so its first child exposes .encoder.
    embeddings = next(params['bert'].children()).encoder.layer[
        params['bert'].layer_no].attention.self(prev_out,
                                                extended_attention_mask)
    # Mean-pool the self-attention output over the sequence dimension.
    embeddings = embeddings.detach().mean(1).cpu().numpy()
    # Optionally keep only one attention head's 64-dimensional slice of the
    # pooled output (or, for head_no == 'random', the pre-sampled indices in
    # randidx).
    if params['bert'].head_no is not None:
        if params['bert'].head_no == 'random':
            embeddings = embeddings[:, params['bert'].randidx]
        else:
            embeddings = embeddings[:, 64 * params['bert'].head_no:
                                    64 * (params['bert'].head_no + 1)]

    return embeddings
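This batcher follows SentEval's `batcher(params, batch)` convention, so it can be plugged into the toolkit's evaluation engine. Below is a minimal, hypothetical wiring sketch: it assumes the pytorch-pretrained-bert `BertModel`/`BertTokenizer`, assumes `tokenizer`, `InputExample`, and `convert_examples_to_features` are already defined as in the snippets on this page, and simply invents values for the `seq_length`, `layer_no`, and `head_no` attributes the batcher reads off `params['bert']`. The `DataParallel` wrapper is likewise an assumption, made so that `next(params['bert'].children())` resolves to the underlying `BertModel`.

import senteval
import torch
from pytorch_pretrained_bert import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Wrap the model so that next(model.children()) is the BertModel itself,
# which the batcher's .encoder access above relies on (assumption).
bert = torch.nn.DataParallel(BertModel.from_pretrained('bert-base-uncased')).cuda()
bert.eval()

# Attributes the batcher reads from params['bert'] (values are illustrative).
bert.seq_length = 128   # max sequence length for convert_examples_to_features
bert.layer_no = 11      # apply this encoder layer's self-attention (BERT-base: 0-11)
bert.head_no = None     # None keeps all heads; an int keeps one 64-dim slice

def prepare(params, samples):
    return  # nothing to precompute for this batcher

params = {'task_path': 'SentEval/data', 'usepytorch': True, 'kfold': 5,
          'bert': bert}
se = senteval.engine.SE(params, batcher, prepare)
results = se.eval(['STS12'])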
Example #2
    def get_examples(self, sents):
        """
        Read sentences.

        Args:
            sents (str): A list of sentences

        Return:
            A list of InputExample
        """
        examples = []
        unique_id = 0
        for sent in sents:
            line = tokenization.convert_to_unicode(sent)
            line = line.strip()
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line)
            if m is None:
                text_a = line
            else:
                text_a = m.group(1)
                text_b = m.group(2)
            examples.append(
                InputExample(unique_id=unique_id, text_a=text_a,
                             text_b=text_b))
            unique_id += 1
        return examples
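The regex above lets one input line carry either a single sentence or a sentence pair separated by ` ||| `. A quick standalone illustration of how the match feeds text_a/text_b (standard library only):

import re

line = "How old are you ? ||| I am 24 ."
m = re.match(r"^(.*) \|\|\| (.*)$", line)
text_a, text_b = (m.group(1), m.group(2)) if m else (line, None)
# text_a == "How old are you ?", text_b == "I am 24 ."
# A line without " ||| " leaves m as None, so the whole line becomes text_a.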
    def __getitem__(self, item):
        cur_id = self.sample_counter
        self.sample_counter += 1
        if not self.on_memory:
            # after one epoch we start again from beginning of file
            if cur_id != 0 and (cur_id % len(self) == 0):
                self.file.close()
                self.file = open(self.corpus_path, "r", encoding=self.encoding)

        t1, t2, is_next_label = self.random_sent(item)

        # tokenize
        tokens_a = self.tokenizer.tokenize(t1)
        tokens_b = self.tokenizer.tokenize(t2)

        # combine to one sample
        cur_example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b, is_next=is_next_label)

        # transform sample to features
        cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer)

        cur_tensors = (torch.tensor(cur_features.input_ids),
                       torch.tensor(cur_features.input_mask),
                       torch.tensor(cur_features.segment_ids),
                       torch.tensor(cur_features.lm_label_ids),
                       torch.tensor(cur_features.is_next))

        return cur_tensors
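Each call to this `__getitem__` yields the five tensors a BERT pretraining step consumes (token ids, attention mask, segment ids, masked-LM labels, next-sentence label). A minimal sketch of batching them, assuming the surrounding class is a `torch.utils.data.Dataset` and that `dataset` is an already-constructed instance (hypothetical here):

from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=32, shuffle=True)
for input_ids, input_mask, segment_ids, lm_label_ids, is_next in loader:
    # each tensor now carries a leading batch dimension of size 32
    pass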
Example #4
    def __convert_text_to_examples(self, text):
        unique_id = 0
        examples = []
        paragraphs = text.split('\n')
        for sentence in paragraphs:
            examples.append(
                InputExample(unique_id=unique_id,
                             text_a=self.__normalize_text(sentence),
                             text_b=None))
            unique_id += 1
        return examples
Example #5
def read_sequence(input_sentences):
    examples = []
    unique_id = 0
    for sentence in input_sentences:
        line = tokenization.convert_to_unicode(sentence)
        examples.append(InputExample(unique_id=unique_id, text_a=line, text_b=None))
        unique_id += 1
    return examples
def preprocess(text):
    text_a = text
    example = InputExample(unique_id=None, text_a=text_a, text_b=None)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)
    feature = convert_examples_to_features([example], max_token_len,
                                           tokenizer)[0]
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return {"inputs": {"input_ids": input_ids.tolist()}}
Example #7
def preprocess(text):
    text_a = text
    example = InputExample(unique_id=None, text_a=text_a, text_b=None)
    feature = convert_examples_to_features([example], max_token_len,
                                           tokenizer)[0]
    """4: 从上一步的信息可知,输入的key是input_ids,维度是(1,400),同时外包[]表示batch size 为1 """
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return input_ids
Example #8
def preprocess(text, flag):
    if flag == 'class':
        text_a = " ".join(tokenizer.tokenize(text))
        example = InputExample(unique_id=None, text_a=text_a, text_b=None)
        feature = convert_class_to_features(example, max_token_len, tokenizer)
    else:
        example = InputSeqExample(guid=None, text_token=text, token_label=None)
        feature = convert_seq_to_features(example, max_token_len, tokenizer,
                                          label_list)
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return {"inputs": {"input_ids": input_ids.tolist()}}
def preprocess(text):
    """
 function: preprocess text into input numpy array
    """
    vocab_file = os.environ.get("vocab_file", "./dependency/vocab.txt")
    max_token_len = os.environ.get("max_token_len", 128)
    text_a = text
    example = InputExample(unique_id=None, text_a=text_a, text_b=None)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    feature = convert_examples_to_features([example], max_token_len, tokenizer)[0]
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return {"inputs": {"input_ids": input_ids.tolist()}}
Example #10
    def _create_examples(self, texts_a, texts_b, labels, set_type):
        """
        Creates examples for the training and dev sets.

        Parameters
        ----------
        texts_a : list
            list of input texts_a (e.g. the first `sentence` for BERT)
        texts_b : list (or a list of None if not required)
            list of input texts_b (e.g. the second `sentence` for BERT)
        labels : list
            list of input labels
        set_type : str
            specifies whether the set is 'train' or 'dev'

        Returns
        -------
        list
            list of InputExample objects

        Raises
        ------
        ValueError
            if the length of the texts and length of the labels are
            incompatible
        """
        examples = []
        for i, (text_a, text_b,
                label) in enumerate(zip(texts_a, texts_b, labels)):
            unique_id = f"{self.data_name}-{set_type}-{i}"
            text_a = convert_to_unicode(text_a)
            if text_b is not None:
                text_b = convert_to_unicode(text_b)
            if label is not None:
                if isinstance(label, str) and '[' in label and ']' in label:
                    # If the label is a list encoded as a string, convert it
                    # back into an actual list, e.g.
                    # "['list', 'of', 'strings']" -> ['list', 'of', 'strings']
                    label = ast.literal_eval(label)
                label = convert_to_unicode(label)
            examples.append(
                InputExample(unique_id=unique_id,
                             text_a=text_a,
                             text_b=text_b,
                             label=label))
        return examples
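The stringified-label branch relies on `ast.literal_eval` to turn a quoted Python list back into an actual list. A tiny standalone illustration of that conversion:

import ast

label = "['list', 'of', 'strings']"
if isinstance(label, str) and '[' in label and ']' in label:
    label = ast.literal_eval(label)
# label is now the Python list ['list', 'of', 'strings']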
Example #11
def read_examples(str_io):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    while True:
        line = tokenization.convert_to_unicode(str_io.readline())
        if not line:
            break
        line = line.strip()
        text_a = None
        text_b = None
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        examples.append(InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1
    return examples
def convert_lines_to_examples(lines):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    for line in lines:
        line = tokenization.convert_to_unicode(line)
        if not line:
            continue
        line = line.strip()
        text_a = None
        text_b = None
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1
    return examples
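For completeness, a hypothetical end-to-end call of `convert_lines_to_examples`, assuming `InputExample` and the BERT `tokenization` module are importable as in the snippet above:

lines = [
    "single sentence example",
    "first sentence ||| second sentence",   # becomes a text_a / text_b pair
]
examples = convert_lines_to_examples(lines)
for ex in examples:
    print(ex.unique_id, ex.text_a, ex.text_b)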