def _convert_y(self, y):
    target_ids = []
    sos_id = self.tokenizer.convert_tokens_to_ids(['<s>'])[0]
    eos_id = self.tokenizer.convert_tokens_to_ids(['</s>'])[0]

    for _y in y:
        if isinstance(_y, str):
            _target_ids = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(_y))
        elif isinstance(_y, list):
            assert isinstance(_y[0], str), (
                'Machine translation module only supports '
                'single sentence inputs.')
            _target_ids = self.tokenizer.convert_tokens_to_ids(_y)

        utils.truncate_segments(
            [_target_ids], self.target_max_seq_length - 2,
            truncate_method=self.truncate_method)
        _target_ids = [sos_id] + _target_ids + [eos_id]

        # padding
        if len(_target_ids) < self.target_max_seq_length:
            _target_ids.extend(
                [0] * (self.target_max_seq_length - len(_target_ids)))
        target_ids.append(_target_ids)

    return target_ids
def _convert_X(self, X_target, tokenized):

    # tokenize input texts
    segment_input_tokens = []
    for ex_id, example in enumerate(X_target):
        try:
            segment_input_tokens.append(
                self._convert_x(example, tokenized))
        except Exception:
            tf.logging.warning(
                'Wrong input format (line %d): \'%s\'. '
                % (ex_id, example))

    # If `max_seq_length` is not manually assigned,
    # the value will be set to the maximum length of `input_ids`.
    if not self.max_seq_length:
        max_seq_length = 0
        for segments in segment_input_tokens:
            # count one `[SEP]` per segment plus the trailing `[CLS]`
            seq_length = sum([len(seg) + 1 for seg in segments]) + 1
            max_seq_length = max(max_seq_length, seq_length)
        self.max_seq_length = max_seq_length
        tf.logging.info(
            'Adaptive max_seq_length: %d' % self.max_seq_length)

    input_ids = []
    input_mask = []
    segment_ids = []
    for ex_id, segments in enumerate(segment_input_tokens):
        _input_ids = []
        _input_mask = []
        _segment_ids = []

        utils.truncate_segments(
            segments, self.max_seq_length - len(segments) - 1,
            truncate_method=self.truncate_method)
        for s_id, segment in enumerate(segments):
            _segment_id = min(s_id, 1)
            _input_ids.extend(
                self.tokenizer.convert_tokens_to_ids(segment) + [SEP_ID])
            _input_mask.extend([1] * (len(segment) + 1))
            _segment_ids.extend([_segment_id] * (len(segment) + 1))

        _input_ids.append(CLS_ID)
        _input_mask.append(1)
        _segment_ids.append(SEG_ID_CLS)

        # padding on the left
        if len(_input_ids) < self.max_seq_length:
            delta_len = self.max_seq_length - len(_input_ids)
            _input_ids = [0] * delta_len + _input_ids
            _input_mask = [1] * delta_len + _input_mask  # intentionally 1
            _segment_ids = [SEG_ID_PAD] * delta_len + _segment_ids

        input_ids.append(_input_ids)
        input_mask.append(_input_mask)
        segment_ids.append(_segment_ids)

    return input_ids, input_mask, segment_ids
def _convert_X(self, X_target, tokenized):
    input_ids = []
    input_mask = []
    segment_ids = []

    # tokenize input texts
    for ex_id, example in enumerate(X_target):
        _input_tokens = self._convert_x(example, tokenized)
        utils.truncate_segments(
            [_input_tokens], self.max_seq_length,
            truncate_method=self.truncate_method)
        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)
        _input_mask = [0] * len(_input_tokens)
        _segment_ids = [0] * len(_input_tokens)

        # padding on the left; 1 marks padded positions in `input_mask`
        for _ in range(self.max_seq_length - len(_input_ids)):
            _input_ids.insert(0, 0)
            _input_mask.insert(0, 1)  # intentionally 1
            _segment_ids.insert(0, SEG_ID_PAD)

        input_ids.append(_input_ids)
        input_mask.append(_input_mask)
        segment_ids.append(_segment_ids)

    return input_ids, input_mask, segment_ids
def _convert_X(self, X_target, tokenized, is_training):
    input_ids = []
    input_mask = []
    label_ids = []

    dupe_factor = self._dupe_factor if is_training else 1
    for _ in range(dupe_factor):
        for ex_id, example in enumerate(X_target):
            try:
                _input_tokens = self._convert_x(example, tokenized)
            except Exception:
                tf.logging.warning(
                    'Wrong input format (line %d): \'%s\'. '
                    % (ex_id, example))
                continue  # skip malformed inputs

            _input_tokens = ['[CLS]'] + _input_tokens
            _input_ids = self.tokenizer.convert_tokens_to_ids(
                _input_tokens)

            utils.truncate_segments(
                [_input_ids], self.max_seq_length,
                truncate_method=self.truncate_method)

            nonpad_seq_length = len(_input_ids)
            _input_mask = [1] * nonpad_seq_length

            if nonpad_seq_length < self.max_seq_length:
                _input_ids.extend(
                    [0] * (self.max_seq_length - nonpad_seq_length))
                _input_mask.extend(
                    [0] * (self.max_seq_length - nonpad_seq_length))

            # dilate: interleave every token id with an empty slot
            _dilated_ids = []
            _dilated_mask = []
            _label_ids = []
            for i, _input_id in enumerate(_input_ids):
                _dilated_ids.extend([_input_id, 0])
                _dilated_mask.extend([_input_mask[i], _input_mask[i]])
                _label_ids.extend([_input_id, 0])

            # replace/add/subtract
            if is_training:
                max_replace = int(nonpad_seq_length * self._replace_prob)
                max_add = int(nonpad_seq_length * self._add_prob)
                max_subtract = int(
                    nonpad_seq_length * self._subtract_prob)

                sample_wrong_tokens(
                    _dilated_ids, _dilated_mask, _label_ids,
                    max_replace, max_add, max_subtract,
                    nonpad_seq_length=nonpad_seq_length,
                    vocab_size=len(self.tokenizer.vocab))

            input_ids.append(_dilated_ids)
            input_mask.append(_dilated_mask)
            label_ids.append(_label_ids)

    return input_ids, input_mask, label_ids
def _convert_X(self, X_target, tokenized):

    # tokenize input texts
    segment_input_tokens = []
    for ex_id, example in enumerate(X_target):
        try:
            segment_input_tokens.append(
                self._convert_x(example, tokenized))
        except Exception:
            raise ValueError(
                'Wrong input format (line %d): \'%s\'. '
                % (ex_id, example))

    input_ids = []
    input_mask = []
    segment_ids = []
    for ex_id, segments in enumerate(segment_input_tokens):
        _input_ids = []
        _input_mask = []
        _segment_ids = []

        utils.truncate_segments(
            segments, self.max_seq_length - len(segments) - 1,
            truncate_method=self.truncate_method)
        for s_id, segment in enumerate(segments):
            _segment_id = min(s_id, 1)
            _input_ids.extend(
                self.tokenizer.convert_tokens_to_ids(segment) + [SEP_ID])
            _input_mask.extend([1] * (len(segment) + 1))
            _segment_ids.extend([_segment_id] * (len(segment) + 1))

        _input_ids.append(CLS_ID)
        _input_mask.append(1)
        _segment_ids.append(SEG_ID_CLS)

        # padding on the left
        if len(_input_ids) < self.max_seq_length:
            delta_len = self.max_seq_length - len(_input_ids)
            _input_ids = [0] * delta_len + _input_ids
            _input_mask = [1] * delta_len + _input_mask  # intentionally 1
            _segment_ids = [SEG_ID_PAD] * delta_len + _segment_ids

        input_ids.append(_input_ids)
        input_mask.append(_input_mask)
        segment_ids.append(_segment_ids)

    return input_ids, input_mask, segment_ids
def _convert_X(self, X_target, tokenized):

    # tokenize input texts
    segment_input_tokens = []
    for ex_id, example in enumerate(X_target):
        try:
            segment_input_tokens.append(
                self._convert_x(example, tokenized))
        except Exception:
            raise ValueError(
                'Wrong input format (line %d): \'%s\'. '
                % (ex_id, example))

    input_ids = []
    input_mask = []
    segment_ids = []
    for ex_id, segments in enumerate(segment_input_tokens):
        _input_tokens = []
        _input_ids = []
        _input_mask = []
        _segment_ids = []

        utils.truncate_segments(
            segments, self.max_seq_length - len(segments),
            truncate_method=self.truncate_method)
        for s_id, segment in enumerate(segments):
            _segment_id = min(s_id, 1)
            _input_tokens.extend(segment + ['[SEP]'])
            _input_mask.extend([1] * (len(segment) + 1))
            _segment_ids.extend([_segment_id] * (len(segment) + 1))

        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

        # padding
        for _ in range(self.max_seq_length - len(_input_ids)):
            _input_ids.append(0)
            _input_mask.append(0)
            _segment_ids.append(0)

        input_ids.append(_input_ids)
        input_mask.append(_input_mask)
        segment_ids.append(_segment_ids)

    return input_ids, input_mask, segment_ids
def _convert_X(self, X_target, tokenized):
    input_ids = []
    for ex_id, example in enumerate(X_target):
        try:
            _input_tokens = self._convert_x(example, tokenized)
        except Exception:
            tf.logging.warning(
                'Wrong input format (line %d): \'%s\'. '
                % (ex_id, example))
            continue  # skip malformed inputs

        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

        utils.truncate_segments(
            [_input_ids], self.max_seq_length - 1,
            truncate_method=self.truncate_method)
        _input_ids.append(self._eos_id)

        # padding
        if len(_input_ids) < self.max_seq_length:
            _input_ids.extend(
                [0] * (self.max_seq_length - len(_input_ids)))
        input_ids.append(_input_ids)

    return input_ids
def _convert_X(self, X_target, tokenized):

    # tokenize input texts
    segment_input_values = []
    for ex_id, example in enumerate(X_target):
        try:
            segment_input_values.append(self._convert_x(example))
        except Exception:
            raise ValueError(
                'Wrong input format (line %d): \'%s\'. An example: '
                '`X_tokenized = [[[0.0023, -0.0001, 0.0015, ...], ...], '
                '...]`' % (ex_id, example))

    input_values = []
    input_mask = []
    for ex_id, segments in enumerate(segment_input_values):
        _input_values = []
        _input_mask = []

        utils.truncate_segments(
            [segments], self.max_seq_length - 1,
            truncate_method=self.truncate_method)
        for s_id, segment in enumerate(segments):
            assert len(segment) == self.max_unit_length, (
                '`max_unit_length` must be equal to the input length of '
                'each time step.')
            _input_values.append(segment)
            _input_mask.append(1)

        # padding
        _input_mask.append(1)
        for _ in range(self.max_seq_length - 1 - len(_input_values)):
            _input_values.append([0] * self.max_unit_length)
            _input_mask.append(0)

        input_values.append(_input_values)
        input_mask.append(_input_mask)

    return input_values, input_mask
def _convert_X(self, X_target, tokenized):
    source_ids = []
    for ex_id, example in enumerate(X_target):
        try:
            _source_tokens = self._convert_x(example, tokenized)
        except Exception:
            raise ValueError(
                'Wrong input format (line %d): \'%s\'. '
                % (ex_id, example))
        _source_ids = self.tokenizer.convert_tokens_to_ids(_source_tokens)

        utils.truncate_segments(
            [_source_ids], self.source_max_seq_length,
            truncate_method=self.truncate_method)

        # padding
        if len(_source_ids) < self.source_max_seq_length:
            _source_ids.extend(
                [0] * (self.source_max_seq_length - len(_source_ids)))
        source_ids.append(_source_ids)

    return source_ids
def _convert_X(self, X_target, tokenized):

    # tokenize input texts
    segment_input_tokens = []
    for ex_id, example in enumerate(X_target):
        try:
            segment_input_tokens.append(
                self._convert_x(example, tokenized))
        except Exception:
            raise ValueError(
                'Wrong input format (line %d): \'%s\'. '
                'An untokenized example: '
                '`X = [{\'doc\': \'...\', \'question\': \'...\', ...}, '
                '...]`' % (ex_id, example))

    # backup for answer mapping
    if self._on_predict:
        self._input_tokens = []

    input_ids = []
    input_mask = []
    sa_mask = []
    segment_ids = []
    doc_ids = []
    doc_text = []
    doc_start = []
    for ex_id, segments in enumerate(segment_input_tokens):
        _input_tokens = ['[CLS]']
        _input_ids = []
        _input_mask = [1]
        _segment_ids = [0]
        _sa_mask = np.zeros(
            (self.max_seq_length, self.max_seq_length), dtype=np.int32)
        _sa_mask[0, 0] = 1

        _doc_sent_tokens = segments.pop('doc')
        _doc_sent_len = len(_doc_sent_tokens)
        segments = list(segments.values()) + _doc_sent_tokens

        utils.truncate_segments(
            segments,
            self.max_seq_length - len(segments) - _doc_sent_len - 2,
            truncate_method=self.truncate_method)

        _doc_sent_tokens = segments[-_doc_sent_len:]
        segments = segments[:-_doc_sent_len]
        for s_id, segment in enumerate(segments):
            _segment_len = len(segment) + 1
            _start_pos = len(_input_tokens)
            _end_pos = _start_pos + len(segment)
            _sa_mask[_start_pos:_end_pos, _start_pos:_end_pos] = 1
            _sa_mask[_end_pos, _end_pos] = 1

            _input_tokens.extend(segment + ['[SEP]'])
            _input_mask.extend([1] * (len(segment) + 1))
            _segment_ids.extend([min(s_id, 1)] * (len(segment) + 1))

        _doc_start = len(_input_tokens)
        if not tokenized:
            _split_tokens = self.tokenizer.tokenize(self.split_sign)
        else:
            _split_tokens = []
        for s_id, segment in enumerate(_doc_sent_tokens):
            _segment_len = len(segment) + len(_split_tokens)
            _start_pos = len(_input_tokens)
            _end_pos = _start_pos + _segment_len
            _sa_mask[_start_pos:_end_pos, _start_pos:_end_pos] = 1

            _input_tokens.extend(segment + _split_tokens)
            _input_mask.extend([1] * _segment_len)
            _segment_ids.extend([1] * _segment_len)

        _input_tokens.append('[SEP]')
        _input_mask.append(1)
        _segment_ids.append(1)

        # backup for answer mapping
        if self._on_predict:
            self._input_tokens.append(_input_tokens)

        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)
        _doc_ids = _input_ids[_doc_start:-1]

        # padding
        for _ in range(self.max_seq_length - len(_input_ids)):
            _input_ids.append(0)
            _input_mask.append(0)
            _segment_ids.append(0)

        input_ids.append(_input_ids)
        input_mask.append(_input_mask)
        sa_mask.append(np.reshape(_sa_mask, [-1]).tolist())
        segment_ids.append(_segment_ids)
        doc_ids.append(_doc_ids)
        doc_text.append(X_target[ex_id]['doc'])
        doc_start.append(_doc_start)

    return (input_ids, input_mask, sa_mask, segment_ids,
            doc_ids, doc_text, doc_start)
def _convert_X(self, X_target, is_training, tokenized):

    # tokenize input texts
    segment_input_tokens = []
    for ex_id, example in enumerate(X_target):
        try:
            segment_input_tokens.append(
                self._convert_x(example, tokenized))
        except Exception:
            tf.logging.warning(
                'Wrong input format (line %d): \'%s\'. '
                % (ex_id, example))

    input_ids = []
    input_mask = []
    segment_ids = []
    masked_lm_positions = []
    masked_lm_ids = []
    masked_lm_weights = []

    # duplicate raw inputs
    if is_training and self.dupe_factor > 1:
        new_segment_input_tokens = []
        for _ in range(self.dupe_factor):
            new_segment_input_tokens.extend(
                copy.deepcopy(segment_input_tokens))
        segment_input_tokens = new_segment_input_tokens

    for ex_id, segments in enumerate(segment_input_tokens):
        _input_tokens = ['[CLS]']
        _input_ids = []
        _input_mask = [1]
        _segment_ids = [0]
        _masked_lm_positions = []
        _masked_lm_ids = []
        _masked_lm_weights = []

        utils.truncate_segments(
            segments, self.max_seq_length - len(segments) - 1,
            truncate_method=self.truncate_method)
        for s_id, segment in enumerate(segments):
            _segment_id = min(s_id, 1)
            _input_tokens.extend(segment + ['[SEP]'])
            _input_mask.extend([1] * (len(segment) + 1))
            _segment_ids.extend([_segment_id] * (len(segment) + 1))

        # random sampling of masked tokens
        if is_training:
            (_input_tokens, _masked_lm_positions, _masked_lm_labels) = \
                create_masked_lm_predictions(
                    tokens=_input_tokens,
                    masked_lm_prob=self.masked_lm_prob,
                    max_predictions_per_seq=self._max_predictions_per_seq,
                    vocab_words=list(self.tokenizer.vocab.keys()),
                    do_whole_word_mask=self.do_whole_word_mask)
            _masked_lm_ids = \
                self.tokenizer.convert_tokens_to_ids(_masked_lm_labels)
            _masked_lm_weights = [1.0] * len(_masked_lm_positions)

            # padding
            for _ in range(self._max_predictions_per_seq -
                           len(_masked_lm_positions)):
                _masked_lm_positions.append(0)
                _masked_lm_ids.append(0)
                _masked_lm_weights.append(0.0)
        else:
            # `masked_lm_positions` is required for both training
            # and inference of BERT language modeling.
            for i in range(len(_input_tokens)):
                if _input_tokens[i] == '[MASK]':
                    _masked_lm_positions.append(i)

            # padding
            for _ in range(self._max_predictions_per_seq -
                           len(_masked_lm_positions)):
                _masked_lm_positions.append(0)
            for _ in range(self._max_predictions_per_seq):
                _masked_lm_ids.append(0)

        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

        # padding
        for _ in range(self.max_seq_length - len(_input_ids)):
            _input_ids.append(0)
            _input_mask.append(0)
            _segment_ids.append(0)

        input_ids.append(_input_ids)
        input_mask.append(_input_mask)
        segment_ids.append(_segment_ids)
        masked_lm_positions.append(_masked_lm_positions)
        masked_lm_ids.append(_masked_lm_ids)
        masked_lm_weights.append(_masked_lm_weights)

    return (input_ids, input_mask, segment_ids, masked_lm_positions,
            masked_lm_ids, masked_lm_weights)
def _convert_X(self, X_target, tokenized):

    # tokenize input texts
    segment_input_tokens = []
    for ex_id, example in enumerate(X_target):
        try:
            segment_input_tokens.append(
                self._convert_x(example, tokenized))
        except Exception:
            raise ValueError(
                'Wrong input format (line %d): \'%s\'. '
                'An untokenized example: '
                '`X = [{\'doc\': \'...\', \'question\': \'...\', ...}, '
                '...]`' % (ex_id, example))

    # backup for answer mapping
    if self._on_predict:
        self._input_tokens = []

    input_ids = []
    input_mask = []
    query_mask = []
    segment_ids = []
    doc_ids = []
    doc_text = []
    doc_start = []
    for ex_id, segments in enumerate(segment_input_tokens):
        _input_tokens = ['[CLS]']
        _input_ids = []
        _input_mask = [1]
        _query_mask = [1]
        _segment_ids = [0]

        _doc_tokens = segments.pop('doc')
        segments = list(segments.values()) + [_doc_tokens]

        utils.truncate_segments(
            segments, self.max_seq_length - len(segments) - 1,
            truncate_method=self.truncate_method)
        _doc_tokens = segments[-1]

        for s_id, segment in enumerate(segments):
            _segment_id = min(s_id, 1)
            _input_tokens.extend(segment + ['[SEP]'])
            _input_mask.extend([1] * (len(segment) + 1))
            if s_id == 0:
                _query_mask.extend([1] * (len(segment) + 1))
            _segment_ids.extend([_segment_id] * (len(segment) + 1))

        _doc_start = len(_input_tokens) - len(_doc_tokens) - 1

        # backup for answer mapping
        if self._on_predict:
            self._input_tokens.append(_input_tokens)

        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)
        _doc_ids = _input_ids[_doc_start:-1]

        # padding
        for _ in range(self.max_seq_length - len(_input_ids)):
            _input_ids.append(0)
            _input_mask.append(0)
            _segment_ids.append(0)
        for _ in range(self.max_seq_length - len(_query_mask)):
            _query_mask.append(0)

        input_ids.append(_input_ids)
        input_mask.append(_input_mask)
        query_mask.append(_query_mask)
        segment_ids.append(_segment_ids)
        doc_ids.append(_doc_ids)
        doc_text.append(X_target[ex_id]['doc'])
        doc_start.append(_doc_start)

    return (input_ids, input_mask, query_mask, segment_ids,
            doc_ids, doc_text, doc_start)
def _convert_X(self, X_target, tokenized):

    # tokenize input texts
    segment_inputs = []
    for ex_id, example in enumerate(X_target):
        try:
            segment_inputs.append(
                {'Wide': example['Wide'],
                 'Deep': self._convert_x(example['Deep'], tokenized)})
        except Exception:
            raise ValueError(
                'Wrong input format (line %d): \'%s\'. An untokenized '
                'example: X = [{\'Wide\': [1, 5, \'positive\'], '
                '\'Deep\': \'I bet she will win.\'}, ...]'
                % (ex_id, example))

    if self.wide_features is None:
        self.wide_features = set()
        for segments in segment_inputs:
            for feature in segments['Wide']:
                self.wide_features.add(feature)
        self.wide_features = list(self.wide_features)
    elif not isinstance(self.wide_features, list):
        raise ValueError(
            '`wide_features` should be a list of possible values '
            '(integer or string). '
            'E.g. [1, \'Positive\', \'Subjective\'].')
    wide_features_map = {
        self.wide_features[i]: i + 1
        for i in range(len(self.wide_features))}

    input_ids = []
    input_mask = []
    segment_ids = []
    n_wide_features = []
    wide_features = []
    for ex_id, segments in enumerate(segment_inputs):
        _input_tokens = ['[CLS]']
        _input_ids = []
        _input_mask = [1]
        _segment_ids = [0]

        _wide_features = []
        for feature in segments['Wide']:
            try:
                _wide_features.append(wide_features_map[feature])
            except KeyError:
                tf.logging.warning(
                    'Unregistered wide feature: %s. Ignored.' % feature)
                continue
        _n_wide_features = len(_wide_features)

        segments = segments['Deep']
        utils.truncate_segments(
            segments, self.max_seq_length - len(segments) - 1,
            truncate_method=self.truncate_method)

        for s_id, segment in enumerate(segments):
            _segment_id = min(s_id, 1)
            _input_tokens.extend(segment + ['[SEP]'])
            _input_mask.extend([1] * (len(segment) + 1))
            _segment_ids.extend([_segment_id] * (len(segment) + 1))

        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

        # padding
        for _ in range(self.max_seq_length - len(_input_ids)):
            _input_ids.append(0)
            _input_mask.append(0)
            _segment_ids.append(0)
        for _ in range(len(self.wide_features) - _n_wide_features):
            _wide_features.append(0)

        input_ids.append(_input_ids)
        input_mask.append(_input_mask)
        segment_ids.append(_segment_ids)
        n_wide_features.append(_n_wide_features)
        wide_features.append(_wide_features)

    return (input_ids, input_mask, segment_ids,
            n_wide_features, wide_features)
def _convert_X(self, X_target, is_training, tokenized):

    # tokenize input texts
    segment_input_tokens = []
    for ex_id, example in enumerate(X_target):
        if self.mode in ('l2r', 'r2l'):
            info = '`l2r` or `r2l` only supports single sentence inputs.'
            if not tokenized:
                assert isinstance(example, str), info
            else:
                assert isinstance(example[0], str), info
        elif self.mode == 's2s':
            info = '`s2s` only supports 2-sentence inputs.'
            assert len(example) == 2, info
        try:
            segment_input_tokens.append(
                self._convert_x(example, tokenized))
        except Exception:
            raise ValueError(
                'Wrong input format (line %d): \'%s\'. '
                % (ex_id, example))

    input_ids = []
    input_mask = []
    segment_ids = []
    masked_lm_positions = []
    masked_lm_ids = []
    masked_lm_weights = []
    next_sentence_labels = []

    # random sampling of next sentence
    if is_training and self.mode == 'bi' and self.do_sample_next_sentence:
        new_segment_input_tokens = []
        for ex_id in range(len(segment_input_tokens)):
            instances = create_instances_from_document(
                all_documents=segment_input_tokens,
                document_index=ex_id,
                max_seq_length=self.max_seq_length - 3,
                masked_lm_prob=self.masked_lm_prob,
                max_predictions_per_seq=self._max_predictions_per_seq,
                short_seq_prob=self.short_seq_prob,
                vocab_words=list(self.tokenizer.vocab.keys()))
            for (segments, is_random_next) in instances:
                new_segment_input_tokens.append(segments)
                next_sentence_labels.append(is_random_next)
        segment_input_tokens = new_segment_input_tokens
    else:
        next_sentence_labels = [1] * len(segment_input_tokens)

    for ex_id, segments in enumerate(segment_input_tokens):
        _input_tokens = ['[CLS]']
        _input_ids = []
        _input_mask = [1]
        _segment_ids = [0]
        _masked_lm_positions = []
        _masked_lm_ids = []
        _masked_lm_weights = []

        utils.truncate_segments(
            segments, self.max_seq_length - len(segments) - 1,
            truncate_method=self.truncate_method)
        for s_id, segment in enumerate(segments):
            _segment_id = min(s_id, 1)
            _input_tokens.extend(segment + ['[SEP]'])
            _input_mask.extend([1] * (len(segment) + 1))
            _segment_ids.extend([_segment_id] * (len(segment) + 1))

        # special values for `_input_tokens` and `input_mask`
        if self.mode == 's2s':
            _input_tokens.pop()
            _input_tokens.append('[EOS]')
            _input_mask = [len(_input_ids)] * (len(segments[0]) + 2)
            for i in range(len(segments[1]) + 1):
                _input_mask.append(_input_mask[0] + i + 1)

        # random sampling of masked tokens
        if is_training:
            if (ex_id + 1) % 10000 == 0:
                tf.logging.info(
                    'Sampling masks of input %d' % (ex_id + 1))
            (_input_tokens, _masked_lm_positions, _masked_lm_labels) = \
                create_masked_lm_predictions(
                    tokens=_input_tokens,
                    masked_lm_prob=self.masked_lm_prob,
                    max_predictions_per_seq=self._max_predictions_per_seq,
                    vocab_words=list(self.tokenizer.vocab.keys()),
                    do_whole_word_mask=self.do_whole_word_mask)
            _masked_lm_ids = \
                self.tokenizer.convert_tokens_to_ids(_masked_lm_labels)
            _masked_lm_weights = [1.0] * len(_masked_lm_positions)

            # padding
            for _ in range(self._max_predictions_per_seq -
                           len(_masked_lm_positions)):
                _masked_lm_positions.append(0)
                _masked_lm_ids.append(0)
                _masked_lm_weights.append(0.0)
        else:
            # `masked_lm_positions` is required for both training
            # and inference of BERT language modeling.
            for i in range(len(_input_tokens)):
                if _input_tokens[i] == '[MASK]':
                    _masked_lm_positions.append(i)

            # padding
            for _ in range(self._max_predictions_per_seq -
                           len(_masked_lm_positions)):
                _masked_lm_positions.append(0)

        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

        # padding
        for _ in range(self.max_seq_length - len(_input_ids)):
            _input_ids.append(0)
            _input_mask.append(0)
            _segment_ids.append(0)

        input_ids.append(_input_ids)
        input_mask.append(_input_mask)
        segment_ids.append(_segment_ids)
        masked_lm_positions.append(_masked_lm_positions)
        masked_lm_ids.append(_masked_lm_ids)
        masked_lm_weights.append(_masked_lm_weights)

    return (input_ids, input_mask, segment_ids, masked_lm_positions,
            masked_lm_ids, masked_lm_weights, next_sentence_labels)
def _convert_X_reimp(self, X_target, y, tokenized):

    # tokenize input texts
    sup_ori_input_tokens = []
    aug_input_tokens = []
    is_supervised = []
    for ex_id, example in enumerate(X_target):
        try:
            label = y[ex_id]
            if label is None:
                assert len(example) == 2
                sup_ori_input_tokens.append(
                    self._convert_x(example[0], tokenized))
                aug_input_tokens.append(
                    self._convert_x(example[1], tokenized))
                is_supervised.append(0)
            else:
                sup_ori_input_tokens.append(
                    self._convert_x(example, tokenized))
                aug_input_tokens.append([])
                is_supervised.append(1)
        except AssertionError:
            raise AssertionError(
                'An unsupervised example must contain exactly two '
                'inputs: the original and the augmented text.')
        except Exception:
            raise ValueError(
                'Wrong input format (line %d): \'%s\'. '
                % (ex_id, example))

    input_ids = []
    input_mask = []
    segment_ids = []
    for ex_id, segments in enumerate(sup_ori_input_tokens):
        _input_tokens = ['[CLS]']
        _input_ids = []
        _input_mask = [1]
        _segment_ids = [0]

        utils.truncate_segments(
            segments, self.max_seq_length - len(segments) - 1,
            truncate_method=self.truncate_method)
        for s_id, segment in enumerate(segments):
            _segment_id = min(s_id, 1)
            _input_tokens.extend(segment + ['[SEP]'])
            _input_mask.extend([1] * (len(segment) + 1))
            _segment_ids.extend([_segment_id] * (len(segment) + 1))

        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

        # padding
        for _ in range(self.max_seq_length - len(_input_ids)):
            _input_ids.append(0)
            _input_mask.append(0)
            _segment_ids.append(0)

        input_ids.append(_input_ids)
        input_mask.append(_input_mask)
        segment_ids.append(_segment_ids)

    aug_input_ids = []
    aug_input_mask = []
    aug_segment_ids = []
    for ex_id, segments in enumerate(aug_input_tokens):
        _input_tokens = ['[CLS]']
        _input_ids = []
        _input_mask = [1]
        _segment_ids = [0]

        utils.truncate_segments(
            segments, self.max_seq_length - len(segments) - 1,
            truncate_method=self.truncate_method)
        for s_id, segment in enumerate(segments):
            _segment_id = min(s_id, 1)
            _input_tokens.extend(segment + ['[SEP]'])
            _input_mask.extend([1] * (len(segment) + 1))
            _segment_ids.extend([_segment_id] * (len(segment) + 1))

        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

        # padding
        for _ in range(self.max_seq_length - len(_input_ids)):
            _input_ids.append(0)
            _input_mask.append(0)
            _segment_ids.append(0)

        aug_input_ids.append(_input_ids)
        aug_input_mask.append(_input_mask)
        aug_segment_ids.append(_segment_ids)

    return (input_ids, input_mask, segment_ids,
            aug_input_ids, aug_input_mask, aug_segment_ids,
            is_supervised)
def _convert_X(self, X_target, is_training, tokenized):

    # tokenize input texts
    segment_input_tokens = []
    for ex_id, example in enumerate(X_target):
        try:
            segment_input_tokens.append(
                self._convert_x(example, tokenized))
        except Exception:
            raise ValueError(
                'Wrong input format (line %d): \'%s\'. '
                % (ex_id, example))

    input_ids = []
    input_mask = []
    segment_ids = []
    masked_lm_positions = []
    masked_lm_ids = []
    masked_lm_weights = []

    # random sampling of next sentence
    if is_training:
        new_segment_input_tokens = []
        for ex_id in range(len(segment_input_tokens)):
            instances = create_instances_from_document(
                all_documents=segment_input_tokens,
                document_index=ex_id,
                max_seq_length=self.max_seq_length - 2,
                masked_lm_prob=self.masked_lm_prob,
                max_predictions_per_seq=self._max_predictions_per_seq,
                vocab_words=list(self.tokenizer.vocab.keys()))
            for segments in instances:
                new_segment_input_tokens.append(segments)
        segment_input_tokens = new_segment_input_tokens

    for ex_id, segments in enumerate(segment_input_tokens):
        _input_tokens = ['[CLS]']
        _input_ids = []
        _input_mask = [1]
        _segment_ids = [0]
        _masked_lm_positions = []
        _masked_lm_ids = []
        _masked_lm_weights = []

        utils.truncate_segments(
            segments, self.max_seq_length - len(segments) - 1,
            truncate_method=self.truncate_method)
        for s_id, segment in enumerate(segments):
            _segment_id = min(s_id, 1)
            _input_tokens.extend(segment + ['[SEP]'])
            _input_mask.extend([1] * (len(segment) + 1))
            _segment_ids.extend([_segment_id] * (len(segment) + 1))

        # random sampling of masked tokens
        if is_training:
            if (ex_id + 1) % 10000 == 0:
                tf.logging.info(
                    'Sampling masks of input %d' % (ex_id + 1))
            (_input_tokens, _masked_lm_positions, _masked_lm_labels) = \
                create_masked_lm_predictions(
                    tokens=_input_tokens,
                    masked_lm_prob=self.masked_lm_prob,
                    max_predictions_per_seq=self._max_predictions_per_seq,
                    vocab_words=list(self.tokenizer.vocab.keys()),
                    do_whole_word_mask=self.do_whole_word_mask)
            _masked_lm_ids = \
                self.tokenizer.convert_tokens_to_ids(_masked_lm_labels)
            _masked_lm_weights = [1.0] * len(_masked_lm_positions)

            # padding
            for _ in range(self._max_predictions_per_seq -
                           len(_masked_lm_positions)):
                _masked_lm_positions.append(0)
                _masked_lm_ids.append(0)
                _masked_lm_weights.append(0.0)
        else:
            # `masked_lm_positions` is required for both training
            # and inference of BERT language modeling.
            for i in range(len(_input_tokens)):
                if _input_tokens[i] == '[MASK]':
                    _masked_lm_positions.append(i)

            # padding
            for _ in range(self._max_predictions_per_seq -
                           len(_masked_lm_positions)):
                _masked_lm_positions.append(0)

        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

        # padding
        for _ in range(self.max_seq_length - len(_input_ids)):
            _input_ids.append(0)
            _input_mask.append(0)
            _segment_ids.append(0)

        input_ids.append(_input_ids)
        input_mask.append(_input_mask)
        segment_ids.append(_segment_ids)
        masked_lm_positions.append(_masked_lm_positions)
        masked_lm_ids.append(_masked_lm_ids)
        masked_lm_weights.append(_masked_lm_weights)

    return (input_ids, input_mask, segment_ids, masked_lm_positions,
            masked_lm_ids, masked_lm_weights)
def _convert_X(self, X_target, tokenized, is_training):

    # backup for answer mapping
    if self._on_predict:
        self._input_tokens = []

    # tokenize input texts and scan over corpus
    tokenized_input_ids = []
    vocab_size = len(self.tokenizer.vocab)
    vocab_ind = list(range(vocab_size))
    vocab_p = [0] * vocab_size
    for ex_id, sample in enumerate(X_target):
        _input_tokens = self._convert_x(sample, tokenized)

        # skip noise training data
        if is_training:
            if len(_input_tokens) == 0 or \
                    len(_input_tokens) > self.max_seq_length:
                continue
        else:
            utils.truncate_segments(
                [_input_tokens], self.max_seq_length,
                truncate_method=self.truncate_method)

        # backup for answer mapping
        if self._on_predict:
            self._input_tokens.append(_input_tokens)

        # count token frequencies (used as the sampling distribution)
        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)
        if is_training:
            for _input_id in _input_ids:
                vocab_p[_input_id] += 1

        tokenized_input_ids.append(_input_ids)
    if is_training:
        vocab_p_sum = sum(vocab_p)
        vocab_p = [n / vocab_p_sum for n in vocab_p]

    input_ids = []
    add_label_ids = []
    del_label_ids = []
    for ex_id in range(len(tokenized_input_ids)):
        _input_ids = tokenized_input_ids[ex_id]
        nonpad_seq_length = len(_input_ids)
        for _ in range(self.max_seq_length - nonpad_seq_length):
            _input_ids.append(0)

        _add_label_ids = []
        _del_label_ids = []

        # add/del
        if is_training:
            if (ex_id + 1) % 10000 == 0:
                tf.logging.info(
                    'Sampling wrong tokens of input %d' % (ex_id + 1))

            _add_label_ids = [0] * self.max_seq_length
            _del_label_ids = [0] * self.max_seq_length

            max_add = np.sum(
                np.random.random(nonpad_seq_length) < self._add_prob)
            max_del = np.sum(
                np.random.random(nonpad_seq_length) < self._del_prob)

            sample_wrong_tokens(
                _input_ids, _add_label_ids, _del_label_ids,
                max_add=max_add, max_del=max_del,
                nonpad_seq_length=nonpad_seq_length,
                vocab_size=vocab_size,
                vocab_ind=vocab_ind,
                vocab_p=vocab_p)

        input_ids.append(_input_ids)
        add_label_ids.append(_add_label_ids)
        del_label_ids.append(_del_label_ids)

    return input_ids, add_label_ids, del_label_ids
def _convert_X(self, X_target, is_training, tokenized):

    # tokenize input texts
    segment_input_tokens = []
    for ex_id, example in enumerate(X_target):
        try:
            segment_input_tokens.append(
                self._convert_x(example, tokenized))
        except Exception:
            tf.logging.warning(
                'Wrong input format (line %d): \'%s\'. '
                % (ex_id, example))

    # If `max_seq_length` is not manually assigned,
    # the value will be set to the maximum length of `input_ids`.
    if not self.max_seq_length:
        max_seq_length = 0
        for segments in segment_input_tokens:
            # count one `[SEP]` per segment plus the `[CLS]`
            seq_length = sum([len(seg) + 1 for seg in segments]) + 1
            max_seq_length = max(max_seq_length, seq_length)
        self.max_seq_length = max_seq_length
        tf.logging.info(
            'Adaptive max_seq_length: %d' % self.max_seq_length)

    input_ids = []
    input_mask = []
    segment_ids = []
    masked_lm_positions = []
    masked_lm_ids = []
    masked_lm_weights = []
    sentence_order_labels = []

    # duplicate raw inputs
    if is_training and self._dupe_factor > 1:
        new_segment_input_tokens = []
        for _ in range(self._dupe_factor):
            new_segment_input_tokens.extend(
                copy.deepcopy(segment_input_tokens))
        segment_input_tokens = new_segment_input_tokens

    # random sampling of next sentence
    if is_training and self._do_sample_sentence:
        new_segment_input_tokens = []
        for ex_id in range(len(segment_input_tokens)):
            instances = create_instances_from_document(
                all_documents=segment_input_tokens,
                document_index=ex_id,
                max_seq_length=self.max_seq_length - 3,
                masked_lm_prob=self._masked_lm_prob,
                max_predictions_per_seq=self._max_predictions_per_seq,
                short_seq_prob=self._short_seq_prob,
                vocab_words=list(self.tokenizer.vocab.keys()))
            for (segments, is_random_next) in instances:
                new_segment_input_tokens.append(segments)
                sentence_order_labels.append(is_random_next)
        segment_input_tokens = new_segment_input_tokens

    for ex_id, segments in enumerate(segment_input_tokens):
        _input_tokens = ['[CLS]']
        _input_ids = []
        _input_mask = [1]
        _segment_ids = [0]
        _masked_lm_positions = []
        _masked_lm_ids = []
        _masked_lm_weights = []

        utils.truncate_segments(
            segments, self.max_seq_length - len(segments) - 1,
            truncate_method=self.truncate_method)
        for s_id, segment in enumerate(segments):
            _segment_id = min(s_id, 1)
            _input_tokens.extend(segment + ['[SEP]'])
            _input_mask.extend([1] * (len(segment) + 1))
            _segment_ids.extend([_segment_id] * (len(segment) + 1))

        # random sampling of masked tokens
        if is_training:
            (_input_tokens, _masked_lm_positions, _masked_lm_labels) = \
                create_masked_lm_predictions(
                    tokens=_input_tokens,
                    masked_lm_prob=self._masked_lm_prob,
                    max_predictions_per_seq=self._max_predictions_per_seq,
                    vocab_words=list(self.tokenizer.vocab.keys()),
                    ngram=self._ngram,
                    favor_shorter_ngram=self._favor_shorter_ngram,
                    do_permutation=self._do_permutation,
                    do_whole_word_mask=self._do_whole_word_mask)
            _masked_lm_ids = \
                self.tokenizer.convert_tokens_to_ids(_masked_lm_labels)
            _masked_lm_weights = [1.0] * len(_masked_lm_positions)

            # padding
            for _ in range(self._max_predictions_per_seq *
                           (1 + self._do_permutation) -
                           len(_masked_lm_positions)):
                _masked_lm_positions.append(0)
                _masked_lm_ids.append(0)
                _masked_lm_weights.append(0.0)
        else:
            # `masked_lm_positions` is required for both training
            # and inference of BERT language modeling.
            for i in range(len(_input_tokens)):
                if _input_tokens[i] == '[MASK]':
                    _masked_lm_positions.append(i)

            # padding
            for _ in range(self._max_predictions_per_seq -
                           len(_masked_lm_positions)):
                _masked_lm_positions.append(0)

        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

        # padding
        for _ in range(self.max_seq_length - len(_input_ids)):
            _input_ids.append(0)
            _input_mask.append(0)
            _segment_ids.append(0)

        input_ids.append(_input_ids)
        input_mask.append(_input_mask)
        segment_ids.append(_segment_ids)
        masked_lm_positions.append(_masked_lm_positions)
        masked_lm_ids.append(_masked_lm_ids)
        masked_lm_weights.append(_masked_lm_weights)

    return (input_ids, input_mask, segment_ids, masked_lm_positions,
            masked_lm_ids, masked_lm_weights, sentence_order_labels)
def _convert_X(self, X_target, tokenized):

    # tokenize input texts
    segment_inputs = []
    for ex_id, example in enumerate(X_target):
        try:
            assert len(example['Text']) == len(example['Sem'])
            if isinstance(example['Text'][0], list):
                for i in range(len(example['Text'])):
                    assert len(example['Text'][i]) == \
                        len(example['Sem'][i])
            sem = copy.deepcopy(example['Sem'])
            if not isinstance(sem[0], list):
                sem = [sem]
            segment_inputs.append(
                {'Sem': sem,
                 'Text': self._convert_x(example['Text'], tokenized)})
        except Exception:
            raise ValueError(
                'Wrong input format (line %d): %s. An example: '
                'X_tokenized = [{\'Sem\': [\'n\', \'v\', \'n\'], '
                '\'Text\': [\'I\', \'love\', \'you\']}, ...]'
                % (ex_id, example))

    if self.sem_features is None:
        self.sem_features = set()
        for segments in segment_inputs:
            for segment in segments['Sem']:
                for feature in segment:
                    self.sem_features.add(feature)
        self.sem_features = list(self.sem_features)
    elif not isinstance(self.sem_features, list):
        raise ValueError(
            '`sem_features` should be a list of possible values '
            '(integer or string). E.g. [\'n\', \'v\', \'adj\'].')
    sem_features_map = {
        self.sem_features[i]: i + 3
        for i in range(len(self.sem_features))}

    input_ids = []
    input_mask = []
    segment_ids = []
    sem_features = []
    for ex_id, segments in enumerate(segment_inputs):
        _input_tokens = ['[CLS]']
        _input_ids = []
        _input_mask = [1]
        _segment_ids = [0]
        _sem_features = [1]  # same as [CLS]

        utils.truncate_segments(
            segments['Text'],
            self.max_seq_length - len(segments['Text']) - 1,
            truncate_method=self.truncate_method)

        for s_id, segment in enumerate(segments['Text']):
            _segment_id = min(s_id, 1)
            _input_tokens.extend(segment + ['[SEP]'])
            _input_mask.extend([1] * (len(segment) + 1))
            _segment_ids.extend([_segment_id] * (len(segment) + 1))

        _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

        for i in range(len(segments['Sem'])):
            segment = segments['Sem'][i]
            n = len(segments['Text'][i])
            for feature in segment[:n]:
                try:
                    _sem_features.append(sem_features_map[feature])
                except KeyError:
                    tf.logging.warning(
                        'Unregistered semantic feature: %s. Ignored.'
                        % feature)
                    continue
            _sem_features.append(2)  # same as [SEP]

        # padding
        for _ in range(self.max_seq_length - len(_input_ids)):
            _input_ids.append(0)
            _input_mask.append(0)
            _segment_ids.append(0)
            _sem_features.append(0)

        input_ids.append(_input_ids)
        input_mask.append(_input_mask)
        segment_ids.append(_segment_ids)
        sem_features.append(_sem_features)

    return (input_ids, input_mask, segment_ids, sem_features)