Exemplo n.º 1
0
  def filepath_to_unicode_strings(self, filepath):
    """Read text out of an input file.

    The default just reads the whole file, converts the contents to unicode
    (via text_encoder.to_unicode_ignore_errors) and yields one unicode string.

    Subclasses can override this function in order to preprocess, and can
    yield any number of strings.

    Args:
      filepath: a string
    Yields:
      unicode strings.
    """
    # Read inside a context manager so the file handle is released even if
    # read() raises, and before the generator suspends at the yield.
    with tf.gfile.Open(filepath) as f:
      b = f.read()
    yield text_encoder.to_unicode_ignore_errors(b)
Exemplo n.º 2
0
  def filepath_to_unicode_strings(self, filepath):
    """Read text out of an input file.

    The default just reads the whole file, converts the contents to unicode
    (via text_encoder.to_unicode_ignore_errors) and yields one unicode string.

    Subclasses can override this function in order to preprocess, and can
    yield any number of strings.

    Args:
      filepath: a string
    Yields:
      unicode strings.
    """
    # Read inside a context manager so the file handle is released even if
    # read() raises, and before the generator suspends at the yield.
    with tf.gfile.Open(filepath) as f:
      b = f.read()
    yield text_encoder.to_unicode_ignore_errors(b)
Exemplo n.º 3
0
 def filepath_to_unicode_strings(self, filepath):
   """Overrides the base class to clean up the xml dump before tokenizing.

   The dump is split into pages. Pages whose title contains a colon, or whose
   cleaned text is 140 characters or shorter, are dropped. Each remaining page
   is rendered as a 'title: "<title>" length: <n>' header line followed by its
   cleaned text, and all rendered pages are concatenated.

   Args:
     filepath: a string
   Yields:
     a single unicode string holding every retained page.
   """
   # Close the file handle deterministically instead of leaking it.
   with tf.gfile.Open(filepath) as f:
     dump = text_encoder.to_unicode_ignore_errors(f.read())
   # Collect the rendered pages and join once at the end; repeated string
   # concatenation in the loop can be quadratic in the size of the dump.
   pieces = []
   for p in _dump_to_pages(dump):
     title = _page_to_title(p)
     if u":" in title:
       # not a regular article; skip it before paying for the text cleanup.
       continue
     text = _page_to_text(p)
     text = _remove_triple_quotes(
         _remove_double_brackets(_remove_references(text)))
     if len(text) <= 140:
       # Probably a redirect or something like that.  Skip it.
       continue
     pieces.append(
         u"title: \"%s\" length: %d\n%s\n" % (title, len(text), text))
   yield u"".join(pieces)
Exemplo n.º 4
0
 def filepath_to_unicode_strings(self, filepath):
   """Overrides the base class to clean up the xml dump before tokenizing.

   The dump is split into pages. Pages whose title contains a colon, or whose
   cleaned text is 140 characters or shorter, are dropped. Each remaining page
   is rendered as a 'title: "<title>" length: <n>' header line followed by its
   cleaned text, and all rendered pages are concatenated.

   Args:
     filepath: a string
   Yields:
     a single unicode string holding every retained page.
   """
   # Close the file handle deterministically instead of leaking it.
   with tf.gfile.Open(filepath) as f:
     dump = text_encoder.to_unicode_ignore_errors(f.read())
   # Collect the rendered pages and join once at the end; repeated string
   # concatenation in the loop can be quadratic in the size of the dump.
   pieces = []
   for p in _dump_to_pages(dump):
     title = _page_to_title(p)
     if u":" in title:
       # not a regular article; skip it before paying for the text cleanup.
       continue
     text = _page_to_text(p)
     text = _remove_triple_quotes(
         _remove_double_brackets(_remove_references(text)))
     if len(text) <= 140:
       # Probably a redirect or something like that.  Skip it.
       continue
     pieces.append(
         u"title: \"%s\" length: %d\n%s\n" % (title, len(text), text))
   yield u"".join(pieces)