def __call__(self, line): """Perform transformation for sequence pairs or single sequences. The transformation is processed in the following steps: - tokenize the input sequences - insert [CLS], [SEP] as necessary - generate type ids to indicate whether a token belongs to the first sequence or the second sequence. - generate valid length For sequence pairs, the input is a tuple of 3 strings: text_a, text_b and label. Inputs: text_a: 'is this jacksonville ?' text_b: 'no it is not' label: '0' Tokenization: text_a: 'is this jack ##son ##ville ?' text_b: 'no it is not .' Processed: tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]' type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 valid_length: 14 label: 0 For single sequences, the input is a tuple of 2 strings: text_a and label. Inputs: text_a: 'the dog is hairy .' label: '1' Tokenization: text_a: 'the dog is hairy .' Processed: text_a: '[CLS] the dog is hairy . [SEP]' type_ids: 0 0 0 0 0 0 0 valid_length: 7 label: 1 Parameters ---------- line: tuple of str Input strings. For sequence pairs, the input is a tuple of 3 strings: (text_a, text_b, label). For single sequences, the input is a tuple of 2 strings: (text_a, label). Returns ------- np.array: input token ids in 'int32', shape (batch_size, seq_length) np.array: valid length in 'int32', shape (batch_size,) np.array: input token type ids in 'int32', shape (batch_size, seq_length) np.array: label id in 'int32', shape (batch_size, 1) """ label = line[-1] label = convert_to_unicode(label) label_id = self._label_map[label] label_id = np.array([label_id], dtype='int32') input_ids, valid_length, segment_ids = self._bert_xform(line[:-1]) return input_ids, valid_length, segment_ids, label_id
def create_training_instances(input_files, tokenizer, max_seq_length, dupe_factor,
                              short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with io.open(input_file, 'r', encoding='UTF-8') as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()

                # Empty lines are used as document delimiters
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = tokenizer.vocab.idx_to_token
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, max_seq_length, short_seq_prob,
                    masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

    rng.shuffle(instances)
    return instances
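# --- Illustrative sketch (not part of the original script): the document grouping
# rule used by create_training_instances, with whitespace splitting standing in for
# `tokenizer.tokenize`. The helper name `_demo_group_documents` is an assumption.
def _demo_group_documents(lines):
    """Group one-sentence-per-line text into documents separated by blank lines."""
    documents = [[]]
    for raw in lines:
        line = raw.strip()
        if not line:
            # A blank line starts a new document, mirroring the loop above.
            documents.append([])
            continue
        documents[-1].append(line.split())  # stand-in for tokenizer.tokenize(line)
    return [doc for doc in documents if doc]  # drop empty documents

# Example: ['a b .', '', 'c d .'] -> two documents, each with one tokenized sentence.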
def __call__(self, line): """Perform transformation for sequence pairs or single sequences. The transformation is processed in the following steps: - tokenize the input sequences - insert [CLS], [SEP] as necessary - generate type ids to indicate whether a token belongs to the first sequence or the second sequence. - generate valid length For sequence pairs, the input is a tuple of 2 strings: text_a, text_b. Inputs: text_a: 'is this jacksonville ?' text_b: 'no it is not' Tokenization: text_a: 'is this jack ##son ##ville ?' text_b: 'no it is not .' Processed: tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]' type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 valid_length: 14 For single sequences, the input is a tuple of single string: text_a. Inputs: text_a: 'the dog is hairy .' Tokenization: text_a: 'the dog is hairy .' Processed: text_a: '[CLS] the dog is hairy . [SEP]' type_ids: 0 0 0 0 0 0 0 valid_length: 7 Parameters ---------- line: tuple of str Input strings. For sequence pairs, the input is a tuple of 3 strings: (text_a, text_b). For single sequences, the input is a tuple of single string: (text_a,). Returns ------- np.array: input token ids in 'int32', shape (batch_size, seq_length) np.array: valid length in 'int32', shape (batch_size,) np.array: input token type ids in 'int32', shape (batch_size, seq_length) """ # convert to unicode text_a = line[0] text_a = convert_to_unicode(text_a) if self._pair: assert len(line) == 2 text_b = line[1] text_b = convert_to_unicode(text_b) tokens_a = self._tokenizer.tokenize(text_a) tokens_b = None if self._pair: tokens_b = self._tokenizer.tokenize(text_b) if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, self._max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > self._max_seq_length - 2: tokens_a = tokens_a[0:(self._max_seq_length - 2)] # The embedding vectors for `type=0` and `type=1` were learned during # pre-training and are added to the wordpiece embedding vector # (and position vector). This is not *strictly* necessary since # the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # For classification tasks, the first vector (corresponding to [CLS]) is # used as as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = [] segment_ids = [] tokens.append('[CLS]') segment_ids.append(0) for token in tokens_a: tokens.append(token) segment_ids.append(0) tokens.append('[SEP]') segment_ids.append(0) if tokens_b: for token in tokens_b: tokens.append(token) segment_ids.append(1) tokens.append('[SEP]') segment_ids.append(1) input_ids = self._tokenizer.convert_tokens_to_ids(tokens) # The valid length of sentences. Only real tokens are attended to. valid_length = len(input_ids) if self._pad: # Zero-pad up to the sequence length. padding_length = self._max_seq_length - valid_length # use padding tokens for the rest input_ids.extend([self._tokenizer.vocab['[PAD]']] * padding_length) segment_ids.extend([self._tokenizer.vocab['[PAD]']] * padding_length) return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\ np.array(segment_ids, dtype='int32')