Example #1
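These snippets appear to come from the tensor2tensor data generators. A hedged sketch of the module-level imports they all assume (strip_text, class_labels, example_splits, and END_TOKENS are defined elsewhere in that library and are not reproduced here):

import tensorflow as tf
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_encoder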
def get_text(revision, strip=True):
  """Extract the text from a revision.

  Args:
    revision: a string
    strip: a boolean

  Returns:
    a string
  """
  # The text start tag looks like "<text ..otherstuff>".
  start_pos = revision.find("<text")
  assert start_pos != -1
  end_tag_pos = revision.find(">", start_pos)
  assert end_tag_pos != -1
  end_tag_pos += len(">")
  end_pos = revision.find("</text>")
  if end_pos == -1:
    ret = ""
  else:
    ret = revision[end_tag_pos:end_pos]
  if strip:
    ret = strip_text(ret)
  ret = text_encoder.to_unicode_utf8(ret)
  return ret
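A minimal sketch of the tag-slicing logic above, leaving out strip_text and the text_encoder conversion so it runs standalone:

revision = '<revision><text xml:space="preserve">Hello, wiki!</text></revision>'
start_pos = revision.find("<text")                      # start of the opening tag
end_tag_pos = revision.find(">", start_pos) + len(">")  # first char after the tag
end_pos = revision.find("</text>")                      # start of the closing tag
print(revision[end_tag_pos:end_pos])                    # -> Hello, wiki!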
Example #2
def example_generator(self, filename):
  for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
    if idx == 0: continue  # skip header
    line = text_encoder.to_unicode_utf8(line.strip())
    _, s1, s2, l = line.split("\t")
    inputs = [s1, s2]
    yield {"inputs": inputs, "label": int(l)}
Example #3
def example_generator(self, filename):
  for line in tf.gfile.Open(filename, "rb"):
    line = text_encoder.to_unicode_utf8(line.strip())
    _, label, _, sent = line.split("\t")
    yield {
        "inputs": sent,
        "label": int(label)
    }
Example #4
def example_generator(self, filename):
  label_list = self.class_labels(data_dir=None)
  for line in tf.gfile.Open(filename, "rb"):
    line = text_encoder.to_unicode_utf8(line.strip())
    split_line = line.split("\t")
    s1, s2 = split_line[:2]
    l = label_list.index(split_line[2])
    inputs = [s1, s2]
    yield {"inputs": inputs, "label": l}
Example #5
def example_generator(self, filename):
  label_list = self.class_labels(data_dir=None)
  for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
    if idx == 0: continue  # skip header
    line = text_encoder.to_unicode_utf8(line.strip())
    _, s1, s2, l = line.split("\t")
    inputs = [s1, s2]
    l = label_list.index(l)
    yield {"inputs": inputs, "label": l}
Example #6
def example_generator(self, filename):
  label_list = self.class_labels(data_dir=None)
  for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
    if idx == 0: continue  # skip header
    line = text_encoder.to_unicode_utf8(line.strip())
    split_line = line.split("\t")
    # Works for both splits even though dev has some extra human labels.
    s1, s2 = split_line[8:10]
    l = label_list.index(split_line[-1])
    inputs = [s1, s2]
    yield {"inputs": inputs, "label": l}
Example #7
def example_generator(self, filename, dev_ids, dataset_split):
  for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
    if idx == 0: continue  # skip header
    line = text_encoder.to_unicode_utf8(line.strip())
    l, id1, id2, s1, s2 = line.split("\t")
    is_dev = [id1, id2] in dev_ids
    if dataset_split == problem.DatasetSplit.TRAIN and is_dev:
      continue
    if dataset_split == problem.DatasetSplit.EVAL and not is_dev:
      continue
    inputs = [[s1, s2], [s2, s1]]
    for inp in inputs:
      yield {"inputs": inp, "label": int(l)}
Example #8
def get_title(page):
  """Extract the title from a page.

  Args:
    page: a string

  Returns:
    a string
  """
  start_pos = page.find("<title>")
  end_pos = page.find("</title>")
  assert start_pos != -1
  assert end_pos != -1
  start_pos += len("<title>")
  return text_encoder.to_unicode_utf8(page[start_pos:end_pos])
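The same slicing logic in isolation, without the text_encoder conversion:

page = "<page><title>Neural machine translation</title><text>...</text></page>"
start_pos = page.find("<title>") + len("<title>")
end_pos = page.find("</title>")
print(page[start_pos:end_pos])  # -> Neural machine translation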
Example #9
def example_generator(self, filename):
  skipped = 0
  for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
    if idx == 0: continue  # skip header
    line = text_encoder.to_unicode_utf8(line.strip())
    split_line = line.split("\t")
    if len(split_line) < 6:
      skipped += 1
      tf.logging.info("Skipping %d" % skipped)
      continue
    s1, s2, l = split_line[3:]
    # A neat data augmentation trick from Radford et al. (2018)
    # https://blog.openai.com/language-unsupervised/
    inputs = [[s1, s2], [s2, s1]]
    for inp in inputs:
      yield {"inputs": inp, "label": int(l)}
Example #10
def _example_generator(filename):
  """Generate mnli examples.

  Args:
    filename: a string
  Yields:
    dictionaries containing "premise", "hypothesis" and "label" strings
  """
  for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
    if idx == 0: continue  # skip header
    line = text_encoder.to_unicode_utf8(line.strip())
    split_line = line.split("\t")
    # Works for both splits even though dev has some extra human labels.
    yield {
        "premise": split_line[8],
        "hypothesis": split_line[9],
        "label": split_line[-1]
    }
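The hard-coded indices reflect the MNLI TSV layout, where columns 8 and 9 hold the premise and hypothesis and the gold label comes last. A toy row with hypothetical values:

row = u"\t".join([u"col%d" % i for i in range(8)] +
                 [u"A dog runs.", u"An animal moves.", u"entailment"])
split_line = row.split(u"\t")
print({"premise": split_line[8],
       "hypothesis": split_line[9],
       "label": split_line[-1]})
# -> {'premise': 'A dog runs.', 'hypothesis': 'An animal moves.',
#     'label': 'entailment'}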
Example #11
def example_generator(all_files, urls_path, sum_token):
  """Generate examples."""

  def fix_run_on_sents(line):
    if u"@highlight" in line:
      return line
    if not line:
      return line
    if line[-1] in END_TOKENS:
      return line
    return line + u"."

  filelist = example_splits(urls_path, all_files)
  story_summary_split_token = u" <summary> " if sum_token else " "

  for story_file in filelist:
    story = []
    summary = []
    reading_highlights = False
    for line in tf.gfile.Open(story_file, "rb"):
      line = text_encoder.to_unicode_utf8(line.strip())
      line = fix_run_on_sents(line)
      if not line:
        continue
      elif line.startswith(u"@highlight"):
        if not story:
          break  # No article text.
        reading_highlights = True
      elif reading_highlights:
        summary.append(line)
      else:
        story.append(line)

    if (not story) or not summary:
      continue

    yield " ".join(story) + story_summary_split_token + " ".join(summary)