def _convert_example_to_record(self, example, max_seq_length, tokenizer, phase=None):
    """Converts a single `Example` into a single `Record` for sequence labeling."""
    # Tokens and labels are delimited by the \x02 control character
    # (the delimiter is invisible in most displays).
    tokens = tokenization.convert_to_unicode(example.text_a).split(u"\2")
    if phase != "predict":
        labels = tokenization.convert_to_unicode(example.label).split(u"\2")
        tokens, labels = self._reseg_token_label(
            tokens=tokens, labels=labels, tokenizer=tokenizer, phase=phase)
        if len(tokens) > max_seq_length - 2:
            tokens = tokens[0:(max_seq_length - 2)]
            labels = labels[0:(max_seq_length - 2)]

        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        position_ids = list(range(len(token_ids)))
        text_type_ids = [0] * len(token_ids)
        no_entity_id = len(self.label_map) - 1
        label_ids = [no_entity_id] + [
            self.label_map[label] for label in labels
        ] + [no_entity_id]

        Record = namedtuple(
            'Record',
            ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
        record = Record(
            token_ids=token_ids,
            text_type_ids=text_type_ids,
            position_ids=position_ids,
            label_ids=label_ids)
    else:
        tokens = self._reseg_token_label(
            tokens=tokens, tokenizer=tokenizer, phase=phase)
        if len(tokens) > max_seq_length - 2:
            tokens = tokens[0:(max_seq_length - 2)]
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        position_ids = list(range(len(token_ids)))
        text_type_ids = [0] * len(token_ids)
        Record = namedtuple('Record',
                            ['token_ids', 'text_type_ids', 'position_ids'])
        record = Record(
            token_ids=token_ids,
            text_type_ids=text_type_ids,
            position_ids=position_ids)
    return record
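# Illustrative sketch (not part of the reader): how `label_ids` ends up framed
# by the "no entity" id at the [CLS]/[SEP] positions. The toy label map below is
# an assumption; the real `label_map` comes from the dataset's label list, with
# the "O" (no-entity) tag conventionally placed last.
toy_label_map = {"B-PER": 0, "I-PER": 1, "B-ORG": 2, "I-ORG": 3, "O": 4}
toy_labels = ["B-PER", "I-PER", "O"]
no_entity_id = len(toy_label_map) - 1                     # -> 4, the "O" tag
label_ids = [no_entity_id] + [toy_label_map[l] for l in toy_labels] + [no_entity_id]
# label_ids == [4, 0, 1, 4, 4], aligned with ["[CLS]", tok1, tok2, tok3, "[SEP]"]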
def _convert_text_to_feature(self, text):
    """
    Convert raw text into the features (feed_vars) needed to run the program.
    """
    text_a = convert_to_unicode(text)
    tokens_a = self.tokenizer.tokenize(text_a)
    max_seq_len = 512

    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens_a) > max_seq_len - 2:
        tokens_a = tokens_a[0:(max_seq_len - 2)]

    tokens = []
    text_type_ids = []
    tokens.append("[CLS]")
    text_type_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        text_type_ids.append(0)
    tokens.append("[SEP]")
    text_type_ids.append(0)

    token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
    position_ids = list(range(len(token_ids)))
    task_ids = [0] * len(token_ids)

    padded_token_ids, input_mask = pad_batch_data(
        [token_ids],
        max_seq_len=max_seq_len,
        pad_idx=self.pad_id,
        return_input_mask=True)
    padded_text_type_ids = pad_batch_data(
        [text_type_ids], max_seq_len=max_seq_len, pad_idx=self.pad_id)
    padded_position_ids = pad_batch_data(
        [position_ids], max_seq_len=max_seq_len, pad_idx=self.pad_id)
    padded_task_ids = pad_batch_data(
        [task_ids], max_seq_len=max_seq_len, pad_idx=self.pad_id)

    feature = [
        padded_token_ids, padded_position_ids, padded_text_type_ids,
        input_mask, padded_task_ids
    ]
    return feature
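# Minimal single-sequence sketch of the pad-to-max-length behavior that
# `pad_batch_data` is used for above. This toy helper is an assumption, not the
# real `pad_batch_data` (which also stacks and reshapes into batch-major numpy
# arrays); it only illustrates padding plus a 1.0/0.0 mask over real vs. padded
# positions.
def _pad_one(ids, max_seq_len, pad_idx=0):
    mask = [1.0] * len(ids) + [0.0] * (max_seq_len - len(ids))
    return ids + [pad_idx] * (max_seq_len - len(ids)), mask

padded, mask = _pad_one([101, 2054, 102], max_seq_len=8)
# padded == [101, 2054, 102, 0, 0, 0, 0, 0]
# mask   == [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]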
def _convert_example_to_record(self, example, max_seq_length, tokenizer, phase=None): """Converts a single `Example` into a single `Record`.""" text_a = tokenization.convert_to_unicode(example.text_a) tokens_a = tokenizer.tokenize(text_a) tokens_b = None if example.text_b is not None: #if "text_b" in example._fields: text_b = tokenization.convert_to_unicode(example.text_b) tokens_b = tokenizer.tokenize(text_b) if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: tokens_a = tokens_a[0:(max_seq_length - 2)] tokens = [] text_type_ids = [] tokens.append("[CLS]") text_type_ids.append(0) for token in tokens_a: tokens.append(token) text_type_ids.append(0) tokens.append("[SEP]") text_type_ids.append(0) if tokens_b: for token in tokens_b: tokens.append(token) text_type_ids.append(1) tokens.append("[SEP]") text_type_ids.append(1) token_ids = tokenizer.convert_tokens_to_ids(tokens) position_ids = list(range(len(token_ids))) label_ids = [] for label in example.label: label_ids.append(int(label)) if phase != "predict": Record = namedtuple( 'Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_ids']) record = Record(token_ids=token_ids, text_type_ids=text_type_ids, position_ids=position_ids, label_ids=label_ids) else: Record = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids']) record = Record(token_ids=token_ids, text_type_ids=text_type_ids, position_ids=position_ids) return record
def _convert_example_to_record(self, example, max_seq_length, tokenizer, phase=None): """Converts a single `Example` into a single `Record`.""" text_a = tokenization.convert_to_unicode(example.text_a) tokens_a = tokenizer.tokenize(text_a) tokens_b = None if example.text_b is not None: #if "text_b" in example._fields: text_b = tokenization.convert_to_unicode(example.text_b) tokens_b = tokenizer.tokenize(text_b) if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: tokens_a = tokens_a[0:(max_seq_length - 2)] # The convention in BERT/ERNIE is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = [] text_type_ids = [] tokens.append("[CLS]") text_type_ids.append(0) for token in tokens_a: tokens.append(token) text_type_ids.append(0) tokens.append("[SEP]") text_type_ids.append(0) if tokens_b: for token in tokens_b: tokens.append(token) text_type_ids.append(1) tokens.append("[SEP]") text_type_ids.append(1) token_ids = tokenizer.convert_tokens_to_ids(tokens) position_ids = list(range(len(token_ids))) if self.label_map: if example.label not in self.label_map: raise KeyError("example.label = {%s} not in label" % example.label) label_id = self.label_map[example.label] else: label_id = example.label Record = namedtuple( 'Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_id']) if phase != "predict": Record = namedtuple( 'Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_id']) record = Record(token_ids=token_ids, text_type_ids=text_type_ids, position_ids=position_ids, label_id=label_id) else: Record = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids']) record = Record(token_ids=token_ids, text_type_ids=text_type_ids, position_ids=position_ids) return record