def get_text(revision, strip=True):
  """Extract the body of the <text>...</text> element from a revision.

  Args:
    revision: a string holding one wiki revision's XML.
    strip: a boolean; if True, pass the extracted body through strip_text.

  Returns:
    a unicode string (empty when no closing </text> tag is found).
  """
  # The opening tag may carry attributes, e.g. '<text xml:space="preserve">',
  # so locate "<text" and then the first ">" that follows it.
  start_pos = revision.find("<text")
  assert start_pos != -1
  end_tag_pos = revision.find(">", start_pos)
  assert end_tag_pos != -1
  end_tag_pos += len(">")
  # Bug fix: search for the closing tag starting AFTER the opening tag.
  # Searching from position 0 (as before) could match a stray "</text>"
  # occurring earlier in the string and produce a bogus negative slice.
  end_pos = revision.find("</text>", end_tag_pos)
  if end_pos == -1:
    ret = ""
  else:
    ret = revision[end_tag_pos:end_pos]
  if strip:
    ret = strip_text(ret)
  ret = text_encoder.to_unicode_utf8(ret)
  return ret
def example_generator(self, filename):
  """Yield {"inputs": [sent_a, sent_b], "label": int} examples from a TSV.

  The file is headered; the first row is skipped. Each data row has four
  tab-separated fields, of which the first is unused.
  """
  reader = tf.gfile.Open(filename, "rb")
  for row_num, raw in enumerate(reader):
    if row_num == 0:
      continue  # skip header
    fields = text_encoder.to_unicode_utf8(raw.strip())
    _, sent_a, sent_b, label = fields.split("\t")
    yield {"inputs": [sent_a, sent_b], "label": int(label)}
def example_generator(self, filename):
  """Yield single-sentence classification examples from a TSV (no header).

  Each row has four tab-separated fields; only the second (label) and
  fourth (sentence) are used.
  """
  for raw in tf.gfile.Open(filename, "rb"):
    decoded = text_encoder.to_unicode_utf8(raw.strip())
    _, label_str, _, sentence = decoded.split("\t")
    yield {
        "inputs": sentence,
        "label": int(label_str)
    }
def example_generator(self, filename):
  """Yield sentence-pair examples, mapping string labels to class indices.

  The file has no header. The first two columns are the sentence pair; the
  third is the label string looked up in self.class_labels.
  """
  labels = self.class_labels(data_dir=None)
  for raw in tf.gfile.Open(filename, "rb"):
    fields = text_encoder.to_unicode_utf8(raw.strip()).split("\t")
    sent_a, sent_b = fields[:2]
    yield {"inputs": [sent_a, sent_b], "label": labels.index(fields[2])}
def example_generator(self, filename):
  """Yield sentence-pair examples from a headered TSV with string labels.

  Each data row has four tab-separated fields (first unused); the label
  string is converted to its index in self.class_labels.
  """
  labels = self.class_labels(data_dir=None)
  reader = tf.gfile.Open(filename, "rb")
  for row_num, raw in enumerate(reader):
    if row_num == 0:
      continue  # skip header
    decoded = text_encoder.to_unicode_utf8(raw.strip())
    _, sent_a, sent_b, label_str = decoded.split("\t")
    yield {"inputs": [sent_a, sent_b], "label": labels.index(label_str)}
def example_generator(self, filename):
  """Yield sentence-pair examples from a headered TSV with wide rows.

  Columns 8 and 9 hold the sentence pair; the gold label is the last
  column, which works for both splits even though dev has some extra
  human labels.
  """
  labels = self.class_labels(data_dir=None)
  for row_num, raw in enumerate(tf.gfile.Open(filename, "rb")):
    if row_num == 0:
      continue  # skip header
    fields = text_encoder.to_unicode_utf8(raw.strip()).split("\t")
    sent_a, sent_b = fields[8:10]
    yield {"inputs": [sent_a, sent_b], "label": labels.index(fields[-1])}
def example_generator(self, filename, dev_ids, dataset_split):
  """Yield sentence-pair examples, routing rows to train or eval via dev_ids.

  Each kept row is emitted twice, once per sentence ordering.

  Args:
    filename: path to a headered TSV file.
    dev_ids: a collection of [id1, id2] pairs belonging to the dev set.
    dataset_split: a problem.DatasetSplit value selecting which rows to keep.

  Yields:
    dicts with "inputs" (a two-element sentence list) and integer "label".
  """
  for row_num, raw in enumerate(tf.gfile.Open(filename, "rb")):
    if row_num == 0:
      continue  # skip header
    decoded = text_encoder.to_unicode_utf8(raw.strip())
    label, ident_a, ident_b, sent_a, sent_b = decoded.split("\t")
    in_dev = [ident_a, ident_b] in dev_ids
    # TRAIN keeps only non-dev rows; EVAL keeps only dev rows.
    if dataset_split == problem.DatasetSplit.TRAIN and in_dev:
      continue
    if dataset_split == problem.DatasetSplit.EVAL and not in_dev:
      continue
    for pair in ([sent_a, sent_b], [sent_b, sent_a]):
      yield {"inputs": pair, "label": int(label)}
def get_title(page):
  """Extract the text between <title> and </title> in a page.

  Args:
    page: a string

  Returns:
    a unicode string with the page title
  """
  open_tag = "<title>"
  close_tag = "</title>"
  begin = page.find(open_tag)
  finish = page.find(close_tag)
  assert begin != -1
  assert finish != -1
  title = page[begin + len(open_tag):finish]
  return text_encoder.to_unicode_utf8(title)
def example_generator(self, filename):
  """Yield sentence-pair examples, skipping malformed rows.

  Rows with fewer than six tab-separated fields are skipped (with a running
  count logged). Each valid row is emitted twice, once per sentence order.
  """
  num_skipped = 0
  for row_num, raw in enumerate(tf.gfile.Open(filename, "rb")):
    if row_num == 0:
      continue  # skip header
    fields = text_encoder.to_unicode_utf8(raw.strip()).split("\t")
    if len(fields) < 6:
      num_skipped += 1
      tf.logging.info("Skipping %d" % num_skipped)
      continue
    sent_a, sent_b, label = fields[3:]
    # A neat data augmentation trick from Radford et al. (2018)
    # https://blog.openai.com/language-unsupervised/
    for pair in ([sent_a, sent_b], [sent_b, sent_a]):
      yield {"inputs": pair, "label": int(label)}
def _example_generator(filename):
  """Generate mnli examples.

  Args:
    filename: a string

  Yields:
    dictionaries containing "premise", "hypothesis" and "label" strings
  """
  reader = tf.gfile.Open(filename, "rb")
  for row_num, raw in enumerate(reader):
    if row_num == 0:
      continue  # skip header
    fields = text_encoder.to_unicode_utf8(raw.strip()).split("\t")
    # Works for both splits even though dev has some extra human labels.
    yield {
        "premise": fields[8],
        "hypothesis": fields[9],
        "label": fields[-1],
    }
def example_generator(all_files, urls_path, sum_token):
  """Generate one joined story+summary string per usable story file."""

  def _terminate(sentence):
    # Append a period to lines that do not already end in a terminal token;
    # "@highlight" markers and empty lines pass through untouched.
    if u"@highlight" in sentence:
      return sentence
    if not sentence:
      return sentence
    if sentence[-1] in END_TOKENS:
      return sentence
    return sentence + u"."

  story_files = example_splits(urls_path, all_files)
  separator = u" <summary> " if sum_token else " "
  for path in story_files:
    article_lines = []
    highlight_lines = []
    in_highlights = False
    for raw in tf.gfile.Open(path, "rb"):
      text = _terminate(text_encoder.to_unicode_utf8(raw.strip()))
      if not text:
        continue
      if text.startswith(u"@highlight"):
        if not article_lines:
          break  # No article text.
        in_highlights = True
      elif in_highlights:
        highlight_lines.append(text)
      else:
        article_lines.append(text)
    # Emit only stories that have both article text and highlights.
    if article_lines and highlight_lines:
      yield " ".join(article_lines) + separator + " ".join(highlight_lines)