def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = line[0]
    # guid = "%s-%s" % (set_type, line[0])
    if set_type != "test":
      try:
        text_a = tokenization.preprocess_text(
            line[3], lower=FLAGS.do_lower_case)
        text_b = tokenization.preprocess_text(
            line[4], lower=FLAGS.do_lower_case)
        label = tokenization.preprocess_text(
            line[5], lower=FLAGS.do_lower_case)
      except IndexError:
        continue
    else:
      text_a = tokenization.preprocess_text(
          line[1], lower=FLAGS.do_lower_case)
      text_b = tokenization.preprocess_text(
          line[2], lower=FLAGS.do_lower_case)
      label = "0"
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples

def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    if set_type != "test":
      guid = "%s-%s" % (set_type, i)
      text_a = tokenization.preprocess_text(
          line[0], lower=FLAGS.do_lower_case)
      label = tokenization.preprocess_text(
          line[1], lower=FLAGS.do_lower_case)
    else:
      guid = tokenization.preprocess_text(line[0], lower=FLAGS.do_lower_case)
      # guid = "%s-%s" % (set_type, line[0])
      text_a = tokenization.preprocess_text(
          line[1], lower=FLAGS.do_lower_case)
      label = "0"
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
  return examples

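# A hypothetical, self-contained illustration of the pattern shared by the two
# `_create_examples` variants above: skip the header row, then branch on
# `set_type` to decide where the guid, text, and label come from. The tiny
# `Example` tuple and the toy rows are stand-ins, not the real InputExample or
# TSV data.
import collections

Example = collections.namedtuple("Example", ["guid", "text_a", "label"])


def make_examples(rows, set_type):
  examples = []
  for i, row in enumerate(rows):
    if i == 0:  # header row
      continue
    if set_type != "test":
      examples.append(Example("%s-%d" % (set_type, i), row[0], row[1]))
    else:  # test rows carry their own id and no gold label
      examples.append(Example(row[0], row[1], "0"))
  return examples


# print(make_examples([("sentence", "label"), ("a good movie", "1")], "train"))
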
def process_text(self, text):
  if self.use_spm:
    return tokenization.preprocess_text(
        text, lower=self.do_lower_case, uncased=self.do_uncased)
  else:
    return tokenization.convert_to_unicode(text)

def _lcs_match(max_dist, n=n, m=m):
  """Longest-common-substring algorithm."""
  f.fill(0)
  g.clear()

  ### longest common sub sequence
  # f[i, j] = max(f[i - 1, j], f[i, j - 1], f[i - 1, j - 1] + match(i, j))
  for i in range(n):
    # note(zhiliny):
    # unlike standard LCS, this is specifically optimized for the setting
    # because the mismatch between sentence pieces and original text will
    # be small
    for j in range(i - max_dist, i + max_dist):
      if j >= m or j < 0:
        continue

      if i > 0:
        g[(i, j)] = 0
        f[i, j] = f[i - 1, j]

      if j > 0 and f[i, j - 1] > f[i, j]:
        g[(i, j)] = 1
        f[i, j] = f[i, j - 1]

      f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0
      if (tokenization.preprocess_text(
          paragraph_text[i], lower=do_lower_case,
          remove_space=False) == tok_cat_text[j] and f_prev + 1 > f[i, j]):
        g[(i, j)] = 2
        f[i, j] = f_prev + 1

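# A minimal, self-contained sketch of the banded LCS-style alignment used in
# `_lcs_match` above, assuming plain character equality instead of
# tokenization.preprocess_text. The names `source`, `target`, and
# `banded_lcs_length` are illustrative only; the real code also records a
# traceback table `g` to recover the character alignment.
import numpy as np


def banded_lcs_length(source, target, max_dist):
  """Returns the LCS length, only exploring |i - j| < max_dist (a sketch)."""
  n, m = len(source), len(target)
  if n == 0 or m == 0:
    return 0
  f = np.zeros((n, m), dtype=np.float32)
  for i in range(n):
    for j in range(i - max_dist, i + max_dist):
      if j < 0 or j >= m:
        continue
      if i > 0:
        f[i, j] = f[i - 1, j]
      if j > 0 and f[i, j - 1] > f[i, j]:
        f[i, j] = f[i, j - 1]
      f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0
      if source[i] == target[j] and f_prev + 1 > f[i, j]:
        f[i, j] = f_prev + 1
  return f[n - 1, m - 1]


# Example: the two strings differ only by a small local edit, so a narrow band
# already recovers essentially the full match length.
# print(banded_lcs_length("sentence piece", "sentence-piece", max_dist=5))
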
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
  """Create `TrainingInstance`s from raw text."""
  all_documents = [[]]

  # Input file format:
  # (1) One sentence per line. These should ideally be actual sentences, not
  # entire paragraphs or arbitrary spans of text. (Because we use the
  # sentence boundaries for the "next sentence prediction" task).
  # (2) Blank lines between documents. Document boundaries are needed so
  # that the "next sentence prediction" task doesn't span between documents.
  for input_file in input_files:
    with tf.gfile.GFile(input_file, FLAGS.input_file_mode) as reader:
      while True:
        line = reader.readline()
        if not FLAGS.spm_model_file:
          line = tokenization.convert_to_unicode(line)
        if not line:
          break
        if FLAGS.spm_model_file:
          line = tokenization.preprocess_text(line, lower=FLAGS.do_lower_case)
        else:
          line = line.strip()

        # Empty lines are used as document delimiters
        if not line:
          if not len(all_documents) % 10000:
            print(f"{basename(input_file)}: {len(all_documents):7d}")
          all_documents.append([])
        tokens = tokenizer.tokenize(line)
        if tokens:
          all_documents[-1].append(tokens)
    print(f"{basename(input_file)} Done reading\n")

  # Remove empty documents
  all_documents = [x for x in all_documents if x]
  rng.shuffle(all_documents)

  vocab_words = list(tokenizer.vocab.keys())
  instances = []
  for i in range(dupe_factor):
    for document_index in range(len(all_documents)):
      if not document_index % 1000:
        print("%s doc %d %5d/%d" % (basename(input_file), i,
                                    document_index + 1, len(all_documents)))
      instances.extend(
          create_instances_from_document(
              all_documents, document_index, max_seq_length, short_seq_prob,
              masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

  rng.shuffle(instances)
  tf.logging.info("Done instances")
  return instances

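# A hedged sketch of how `create_training_instances` above might be driven,
# assuming the usual ALBERT/BERT-style FullTokenizer and pretraining-data
# flags; the seed and flag names here are the conventional ones, not taken
# from this file.
import random

rng = random.Random(12345)
tokenizer = tokenization.FullTokenizer(
    vocab_file=FLAGS.vocab_file,
    do_lower_case=FLAGS.do_lower_case,
    spm_model_file=FLAGS.spm_model_file)
input_files = tf.gfile.Glob(FLAGS.input_file)
instances = create_training_instances(
    input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
    FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
    FLAGS.max_predictions_per_seq, rng)
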
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    # Note(mingdachen): We will rely on this guid for GLUE submission.
    guid = tokenization.preprocess_text(line[0], lower=FLAGS.do_lower_case)
    text_a = tokenization.preprocess_text(line[8], lower=FLAGS.do_lower_case)
    text_b = tokenization.preprocess_text(line[9], lower=FLAGS.do_lower_case)
    if set_type == "test":
      label = "contradiction"
    else:
      label = tokenization.preprocess_text(line[-1])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples

def get_train_examples(self, data_dir):
  """See base class."""
  lines = self._read_tsv(os.path.join(data_dir, "train.tsv"))
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "train-%d" % (i)
    text_a = tokenization.preprocess_text(line[0])
    text_b = tokenization.preprocess_text(line[1])
    label = tokenization.preprocess_text(line[2])
    if label == tokenization.preprocess_text("contradictory"):
      label = tokenization.preprocess_text("contradiction")
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples

def get_dev_examples(self, data_dir):
  """See base class."""
  lines = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "dev-%d" % (i)
    language = tokenization.preprocess_text(line[0])
    if language != tokenization.preprocess_text(self.language):
      continue
    text_a = tokenization.preprocess_text(line[6])
    text_b = tokenization.preprocess_text(line[7])
    label = tokenization.preprocess_text(line[1])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples

def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    # Only the test set has a header
    if set_type == "test" and i == 0:
      continue
    guid = "%s-%s" % (set_type, i)
    if set_type == "test":
      guid = line[0]
      text_a = tokenization.preprocess_text(
          line[1], lower=FLAGS.do_lower_case)
      label = "0"
    else:
      text_a = tokenization.preprocess_text(
          line[3], lower=FLAGS.do_lower_case)
      label = tokenization.preprocess_text(line[1])
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
  return examples

def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  labels = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = "%s-%s" % (set_type, i)
    text_a = tokenization.preprocess_text(line[2])
    text_b = tokenization.preprocess_text(line[3])
    if set_type == "test":
      label = "0"
    else:
      label = tokenization.preprocess_text(line[1])
    labels.append(label)
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples, labels

def create_training_instances(input_files, tokenizer, max_seq_length,
                              short_seq_prob, rng):
  """Create `TrainingInstance`s from raw text."""
  all_documents = [[]]

  # Input file format:
  # (1) One sentence per line. These should ideally be actual sentences, not
  # entire paragraphs or arbitrary spans of text. (Because we use the
  # sentence boundaries for the "next sentence prediction" task).
  # (2) Blank lines between documents. Document boundaries are needed so
  # that the "next sentence prediction" task doesn't span between documents.
  for input_file in input_files:
    line_num = 0
    with tf.gfile.GFile(input_file, FLAGS.input_file_mode) as reader:
      while True:
        print('Reading line ' + str(line_num))
        line = reader.readline()
        if not FLAGS.spm_model_file:
          line = tokenization.convert_to_unicode(line)
        if not line:
          break
        if FLAGS.spm_model_file:
          line = tokenization.preprocess_text(line, lower=FLAGS.do_lower_case)
        else:
          line = line.strip()

        # Empty lines are used as document delimiters
        if not line:
          all_documents.append([])
        tokens = tokenizer.tokenize(line)
        if tokens:
          all_documents[-1].append(tokens)
        line_num = line_num + 1

  # Remove empty documents
  all_documents = [x for x in all_documents if x]
  rng.shuffle(all_documents)
  print('all_documents length = ' + str(len(all_documents)))

  vocab_words = list(tokenizer.vocab.keys())
  instances = []
  for document_index in range(len(all_documents)):
    print('Creating instance for doc ' + str(document_index))
    instances.extend(
        create_instances_from_document(all_documents, document_index,
                                       max_seq_length, short_seq_prob,
                                       vocab_words, rng))

  rng.shuffle(instances)
  return instances

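# A tiny illustration of the input format both `create_training_instances`
# variants above expect: one sentence per line, with a blank line marking a
# document boundary. The text itself is made up.
SAMPLE_PRETRAINING_INPUT = """\
The first document has two sentences.
This is its second sentence.

The second document starts after the blank line.
"""
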
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    if i == 0:
      continue
    guid = tokenization.preprocess_text(line[0], lower=FLAGS.do_lower_case)
    # guid = "%s-%s" % (set_type, line[0])
    text_a = tokenization.preprocess_text(line[1], lower=FLAGS.do_lower_case)
    text_b = tokenization.preprocess_text(line[2], lower=FLAGS.do_lower_case)
    if set_type == "test_matched":
      label = "entailment"
    else:
      label = tokenization.preprocess_text(line[-1], lower=FLAGS.do_lower_case)
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
  return examples

def get_docs(input_files, num_docs):
  docs = [[]]
  for file in input_files:
    tf.logging.info("Reading file:{}".format(file))
    with open(file, 'r', encoding='utf-8') as f:
      for line in f:
        if FLAGS.spm_model_file:
          line = tokenization.preprocess_text(line, lower=FLAGS.do_lower_case)
        else:
          line = tokenization.convert_to_unicode(line).strip()
        if line and not line.startswith('#'):
          # tokens = tokenizer.tokenize(line)
          docs[-1].append(line)
        elif docs[-1]:
          if num_docs != 0 and len(docs) == num_docs:
            yield docs
            docs = [[]]
          else:
            docs.append([])
  yield docs
  docs = [[]]

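# A simplified, self-contained sketch of the grouping logic in `get_docs`
# above: lines are accumulated into the current document, a blank line (or a
# '#' comment) closes the document, and documents are yielded in batches of
# `num_docs`. No FLAGS or SentencePiece handling here; the function name and
# toy input are illustrative only.
def group_into_docs(lines, num_docs):
  docs = [[]]
  for line in lines:
    line = line.strip()
    if line and not line.startswith('#'):
      docs[-1].append(line)
    elif docs[-1]:
      if num_docs != 0 and len(docs) == num_docs:
        yield docs
        docs = [[]]
      else:
        docs.append([])
  if docs and docs[-1]:
    yield docs


# for batch in group_into_docs(["a b c", "", "d e", "", "f"], num_docs=2):
#   print(batch)
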
def preprocess_text(self, text):
  if self.use_spm:
    return tokenization.preprocess_text(text, lower=self.do_lower_case)
  else:
    return tokenization.preprocess_text(text)

def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length, is_training,
                                 output_fn, do_lower_case):
  """Loads a data file into a list of `InputBatch`s."""

  cnt_pos, cnt_neg = 0, 0
  unique_id = 1000000000
  max_n, max_m = 1024, 1024
  f = np.zeros((max_n, max_m), dtype=np.float32)

  for (example_index, example) in enumerate(examples):
    if example_index % 100 == 0:
      tf.logging.info("Converting {}/{} pos {} neg {}".format(
          example_index, len(examples), cnt_pos, cnt_neg))

    query_tokens = tokenization.encode_ids(
        tokenizer.sp_model,
        tokenization.preprocess_text(
            example.question_text, lower=do_lower_case))

    if len(query_tokens) > max_query_length:
      query_tokens = query_tokens[0:max_query_length]

    paragraph_text = example.paragraph_text
    para_tokens = tokenization.encode_pieces(
        tokenizer.sp_model,
        tokenization.preprocess_text(
            example.paragraph_text, lower=do_lower_case),
        return_unicode=False)

    chartok_to_tok_index = []
    tok_start_to_chartok_index = []
    tok_end_to_chartok_index = []
    char_cnt = 0
    para_tokens = [six.ensure_text(token, "utf-8") for token in para_tokens]
    for i, token in enumerate(para_tokens):
      # print("paragraphs token:", para_tokens)
      # print("****token:", token)
      # print("**len token:", len(token))
      new_token = six.ensure_text(token).replace(
          tokenization.SPIECE_UNDERLINE.decode("utf-8"), " ")
      # print("**new_token", new_token)
      chartok_to_tok_index.extend([i] * len(new_token))
      # print("**chartok_to_tok_index:", chartok_to_tok_index)
      tok_start_to_chartok_index.append(char_cnt)
      # print("**len tok_start_to_chartok_index:", len(tok_start_to_chartok_index))
      # print("**tok_start_to_chartok_index:", tok_start_to_chartok_index)
      # print("**char_cnt", char_cnt)
      char_cnt += len(new_token)
      tok_end_to_chartok_index.append(char_cnt - 1)
      # print("**tok_end_to_chartok_index", tok_end_to_chartok_index)

    tok_cat_text = "".join(para_tokens).replace(
        tokenization.SPIECE_UNDERLINE.decode("utf-8"), " ")
    # print("tok_cat_text:", tok_cat_text)
    n, m = len(paragraph_text), len(tok_cat_text)
    # print("n:", n)
    # print("m:", m)

    if n > max_n or m > max_m:
      max_n = max(n, max_n)
      max_m = max(m, max_m)
      f = np.zeros((max_n, max_m), dtype=np.float32)
      # print("f:", f)

    g = {}

    def _lcs_match(max_dist, n=n, m=m):
      """Longest-common-substring algorithm."""
      f.fill(0)
      g.clear()

      ### longest common sub sequence
      # f[i, j] = max(f[i - 1, j], f[i, j - 1], f[i - 1, j - 1] + match(i, j))
      for i in range(n):
        # note(zhiliny):
        # unlike standard LCS, this is specifically optimized for the setting
        # because the mismatch between sentence pieces and original text will
        # be small
        for j in range(i - max_dist, i + max_dist):
          if j >= m or j < 0:
            continue

          if i > 0:
            g[(i, j)] = 0
            f[i, j] = f[i - 1, j]

          if j > 0 and f[i, j - 1] > f[i, j]:
            g[(i, j)] = 1
            f[i, j] = f[i, j - 1]

          f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0
          if (tokenization.preprocess_text(
              paragraph_text[i], lower=do_lower_case,
              remove_space=False) == tok_cat_text[j] and f_prev + 1 > f[i, j]):
            g[(i, j)] = 2
            f[i, j] = f_prev + 1

    max_dist = abs(n - m) + 5
    for _ in range(2):
      _lcs_match(max_dist)
      if f[n - 1, m - 1] > 0.8 * n:
        break
      max_dist *= 2

    orig_to_chartok_index = [None] * n
    chartok_to_orig_index = [None] * m
    i, j = n - 1, m - 1
    while i >= 0 and j >= 0:
      if (i, j) not in g:
        break
      if g[(i, j)] == 2:
        orig_to_chartok_index[i] = j
        chartok_to_orig_index[j] = i
        i, j = i - 1, j - 1
      elif g[(i, j)] == 1:
        j = j - 1
      else:
        i = i - 1

    if (all(v is None for v in orig_to_chartok_index) or
        f[n - 1, m - 1] < 0.8 * n):
      tf.logging.info("MISMATCH DETECTED!")
      continue

    tok_start_to_orig_index = []
    tok_end_to_orig_index = []
    # print("para token:", para_tokens)
    for i in range(len(para_tokens)):
      start_chartok_pos = tok_start_to_chartok_index[i]
      end_chartok_pos = tok_end_to_chartok_index[i]
      start_orig_pos = _convert_index(chartok_to_orig_index, start_chartok_pos,
                                      n, is_start=True)
      end_orig_pos = _convert_index(chartok_to_orig_index, end_chartok_pos,
                                    n, is_start=False)
      # print("start_orig_pos:", start_orig_pos)
      # print("end_orig_pos:", end_orig_pos)
      tok_start_to_orig_index.append(start_orig_pos)
      tok_end_to_orig_index.append(end_orig_pos)

    if not is_training:
      tok_start_position = tok_end_position = None

    if is_training and example.is_impossible:
      tok_start_position = 0
      tok_end_position = 0

    if is_training and not example.is_impossible:
      start_position = example.start_position
      end_position = start_position + len(example.orig_answer_text) - 1

      start_chartok_pos = _convert_index(orig_to_chartok_index, start_position,
                                         is_start=True)
      tok_start_position = chartok_to_tok_index[start_chartok_pos]
      # print("tok_start_position:", tok_start_position)

      end_chartok_pos = _convert_index(orig_to_chartok_index, end_position,
                                       is_start=False)
      tok_end_position = chartok_to_tok_index[end_chartok_pos]
      assert tok_start_position <= tok_end_position

    def _piece_to_id(x):
      if six.PY2 and isinstance(x, six.text_type):
        x = six.ensure_binary(x, "utf-8")
      return tokenizer.sp_model.PieceToId(x)

    all_doc_tokens = list(map(_piece_to_id, para_tokens))
    # print("all_doc_tokens:", all_doc_tokens)

    # The -3 accounts for [CLS], [SEP] and [SEP]
    max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

    # We can have documents that are longer than the maximum sequence length.
    # To deal with this we do a sliding window approach, where we take chunks
    # of the up to our max length with a stride of `doc_stride`.
    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
        "DocSpan", ["start", "length"])
    doc_spans = []
    start_offset = 0
    while start_offset < len(all_doc_tokens):
      length = len(all_doc_tokens) - start_offset
      if length > max_tokens_for_doc:
        length = max_tokens_for_doc
      doc_spans.append(_DocSpan(start=start_offset, length=length))
      if start_offset + length == len(all_doc_tokens):
        break
      start_offset += min(length, doc_stride)

    for (doc_span_index, doc_span) in enumerate(doc_spans):
      tokens = []
      token_is_max_context = {}
      segment_ids = []
      p_mask = []
      cur_tok_start_to_orig_index = []
      cur_tok_end_to_orig_index = []

      tokens.append(tokenizer.sp_model.PieceToId("[CLS]"))
      segment_ids.append(0)
      p_mask.append(0)
      for token in query_tokens:
        tokens.append(token)
        segment_ids.append(0)
        p_mask.append(1)
      # print("tokens:", tokens)
      # print("segment_ids:", segment_ids)
      tokens.append(tokenizer.sp_model.PieceToId("[SEP]"))
      segment_ids.append(0)
      p_mask.append(1)

      for i in range(doc_span.length):
        split_token_index = doc_span.start + i
        cur_tok_start_to_orig_index.append(
            tok_start_to_orig_index[split_token_index])
        cur_tok_end_to_orig_index.append(
            tok_end_to_orig_index[split_token_index])

        is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                               split_token_index)
        # print("i:", i)
        # print("is_max_context:", is_max_context)
        # print("tokens:", tokens)
        # print("split_token_index", split_token_index)
        token_is_max_context[len(tokens)] = is_max_context
        tokens.append(all_doc_tokens[split_token_index])
        segment_ids.append(1)
        p_mask.append(0)
      tokens.append(tokenizer.sp_model.PieceToId("[SEP]"))
      segment_ids.append(1)
      p_mask.append(1)

      paragraph_len = len(tokens)
      input_ids = tokens

      # The mask has 1 for real tokens and 0 for padding tokens. Only real
      # tokens are attended to.
      input_mask = [1] * len(input_ids)

      # Zero-pad up to the sequence length.
      while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        p_mask.append(1)

      assert len(input_ids) == max_seq_length
      assert len(input_mask) == max_seq_length
      assert len(segment_ids) == max_seq_length

      span_is_impossible = example.is_impossible
      start_position = None
      end_position = None
      if is_training and not span_is_impossible:
        # For training, if our document chunk does not contain an annotation
        # we throw it out, since there is nothing to predict.
        doc_start = doc_span.start
        doc_end = doc_span.start + doc_span.length - 1
        out_of_span = False
        if not (tok_start_position >= doc_start and
                tok_end_position <= doc_end):
          out_of_span = True
          print("out_of_span:", out_of_span)
        if out_of_span:
          # continue
          print("chet chet")
          start_position = 0
          end_position = 0
          span_is_impossible = True
        else:
          doc_offset = len(query_tokens) + 2
          start_position = tok_start_position - doc_start + doc_offset
          end_position = tok_end_position - doc_start + doc_offset

      if is_training and span_is_impossible:
        start_position = 0
        end_position = 0

      if example_index < 20:
        tf.logging.info("*** Example ***")
        tf.logging.info("unique_id: %s" % (unique_id))
        tf.logging.info("example_index: %s" % (example_index))
        tf.logging.info("doc_span_index: %s" % (doc_span_index))
        tf.logging.info("tok_start_to_orig_index: %s" % " ".join(
            [str(x) for x in cur_tok_start_to_orig_index]))
        tf.logging.info("tok_end_to_orig_index: %s" % " ".join(
            [str(x) for x in cur_tok_end_to_orig_index]))
        tf.logging.info("token_is_max_context: %s" % " ".join([
            "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context)
        ]))
        tf.logging.info("input_pieces: %s" % " ".join(
            [tokenizer.sp_model.IdToPiece(x) for x in tokens]))
        tf.logging.info(
            "input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info(
            "input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info(
            "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))

        if is_training and span_is_impossible:
          tf.logging.info("impossible example span")

        if is_training and not span_is_impossible:
          pieces = [
              tokenizer.sp_model.IdToPiece(token)
              for token in tokens[start_position:(end_position + 1)]
          ]
          answer_text = tokenizer.sp_model.DecodePieces(pieces)
          tf.logging.info("start_position: %d" % (start_position))
          tf.logging.info("end_position: %d" % (end_position))
          tf.logging.info(
              "answer: %s" % (tokenization.printable_text(answer_text)))

      # note(zhiliny): With multi processing,
      # the example_index is actually the index within the current process
      # therefore we use example_index=None to avoid being used in the future.
      # The current code does not use example_index of training data.
      if is_training:
        feat_example_index = None
      else:
        feat_example_index = example_index

      feature = InputFeatures(
          unique_id=unique_id,
          example_index=feat_example_index,
          doc_span_index=doc_span_index,
          tok_start_to_orig_index=cur_tok_start_to_orig_index,
          tok_end_to_orig_index=cur_tok_end_to_orig_index,
          token_is_max_context=token_is_max_context,
          tokens=[tokenizer.sp_model.IdToPiece(x) for x in tokens],
          input_ids=input_ids,
          input_mask=input_mask,
          segment_ids=segment_ids,
          paragraph_len=paragraph_len,
          start_position=start_position,
          end_position=end_position,
          is_impossible=span_is_impossible,
          p_mask=p_mask)

      # Run callback
      output_fn(feature)

      unique_id += 1
      if span_is_impossible:
        cnt_neg += 1
      else:
        cnt_pos += 1

  tf.logging.info("Total number of instances: {} = pos {} neg {}".format(
      cnt_pos + cnt_neg, cnt_pos, cnt_neg))

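# A hedged usage sketch for the `output_fn` callback that
# `convert_examples_to_features` invokes once per feature. Appending to a
# plain list is the simplest collector; in the real pipeline this is usually
# a TFRecord writer (not shown). `eval_examples` and `tokenizer` are assumed
# to exist, and the numeric values are common SQuAD defaults, not taken from
# this file.
collected_features = []
convert_examples_to_features(
    examples=eval_examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    output_fn=collected_features.append,
    do_lower_case=True)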