Пример #1
0
 def _next_passage(self):
     passage = None
     if self._split_iter is None:
         try:
             file = next(self._files_iter)
         except StopIteration:  # Finished iteration
             raise
         if isinstance(file, Passage):  # Not really a file, but a Passage
             passage = file
         else:  # A file
             attempts = self.attempts
             while not os.path.exists(file):
                 with external_write_mode(file=sys.stderr):
                     if attempts == 0:
                         print("File not found: %s" % file, file=sys.stderr)
                         return None
                     print("Failed reading %s, trying %d more times..." %
                           (file, attempts),
                           file=sys.stderr)
                 time.sleep(self.delay)
                 attempts -= 1
             try:
                 passage = file2passage(file)  # XML or binary format
             except (IOError,
                     ParseError) as e:  # Failed to read as passage file
                 base, ext = os.path.splitext(os.path.basename(file))
                 converter = self.converters.get(ext.lstrip("."))
                 if converter is None:
                     raise IOError(
                         "Could not read %s file. See error message above. "
                         "If this file's format is not %s, try adding '.txt' suffix to read as plain text:"
                         " '%s'" % (ext, ext, file)) from e
                 self._file_handle = open(file, encoding="utf-8")
                 self._split_iter = iter(
                     converter(chain(self._file_handle, [""]),
                               passage_id=base,
                               lang=self.lang))
         if self.split:
             if self._split_iter is None:
                 self._split_iter = (passage, )
             self._split_iter = iter(
                 s for p in self._split_iter for s in split2segments(
                     p, is_sentences=self.sentences, lang=self.lang))
     if self._split_iter is not None:  # Either set before or initialized now
         try:
             passage = next(self._split_iter)
         except StopIteration:  # Finished this converter
             self._split_iter = None
             if self._file_handle is not None:
                 self._file_handle.close()
                 self._file_handle = None
             return None
     return passage
Пример #2
0
 def _next_passage(self):
     passage = None
     if self._split_iter is None:
         try:
             file = next(self._files_iter)
         except StopIteration:  # Finished iteration
             raise
         if isinstance(file, Passage):  # Not really a file, but a Passage
             passage = file
         else:  # A file
             attempts = 3
             while not os.path.exists(file):
                 if attempts == 0:
                     print("File not found: %s" % file, file=sys.stderr)
                     return next(self)
                 print("Failed reading %s, trying %d more times..." %
                       (file, attempts),
                       file=sys.stderr)
                 time.sleep(5)
                 attempts -= 1
             try:
                 passage = file2passage(file)  # XML or binary format
             except (IOError, ParseError):  # Failed to read as passage file
                 base, ext = os.path.splitext(os.path.basename(file))
                 converter = self.converters[ext.lstrip(".")]
                 self._file_handle = open(file, encoding="utf-8")
                 self._split_iter = iter(
                     converter(self._file_handle, passage_id=base))
         if self.split:
             if self._split_iter is None:
                 self._split_iter = (passage, )
             self._split_iter = iter(
                 s for p in self._split_iter
                 for s in split2segments(p, is_sentences=self.sentences))
     if self._split_iter is not None:  # Either set before or initialized now
         try:
             # noinspection PyTypeChecker
             passage = next(self._split_iter)
         except StopIteration:  # Finished this converter
             self._split_iter = None
             if self._file_handle is not None:
                 self._file_handle.close()
                 self._file_handle = None
             return next(self)
     return passage
Пример #3
0
 def _next_passage(self):
     passage = None
     if self._split_iter is None:
         try:
             file = next(self._files_iter)
         except StopIteration:  # Finished iteration
             raise
         if isinstance(file, Passage):  # Not really a file, but a Passage
             passage = file
         else:  # A file
             attempts = self.attempts
             while not os.path.exists(file):
                 with external_write_mode(file=sys.stderr):
                     if attempts == 0:
                         print("File not found: %s" % file, file=sys.stderr)
                         return None
                     print("Failed reading %s, trying %d more times..." % (file, attempts), file=sys.stderr)
                 time.sleep(self.delay)
                 attempts -= 1
             try:
                 passage = file2passage(file)  # XML or binary format
             except (IOError, ParseError) as e:  # Failed to read as passage file
                 base, ext = os.path.splitext(os.path.basename(file))
                 converter = self.converters.get(ext.lstrip("."))
                 if converter is None:
                     raise IOError("Could not read %s file. See error message above. "
                                   "If this file's format is not %s, try adding '.txt' suffix to read as plain text:"
                                   " '%s'" % (ext, ext, file)) from e
                 self._file_handle = open(file, encoding="utf-8")
                 self._split_iter = iter(converter(chain(self._file_handle, [""]), passage_id=base, lang=self.lang))
         if self.split:
             if self._split_iter is None:
                 self._split_iter = (passage,)
             self._split_iter = iter(s for p in self._split_iter for s in
                                     split2segments(p, is_sentences=self.sentences, lang=self.lang))
     if self._split_iter is not None:  # Either set before or initialized now
         try:
             passage = next(self._split_iter)
         except StopIteration:  # Finished this converter
             self._split_iter = None
             if self._file_handle is not None:
                 self._file_handle.close()
                 self._file_handle = None
             return None
     return passage
Пример #4
0
 def _next_passage(self):
     passage = None
     if self._split_iter is None:
         try:
             file = next(self._files_iter)
         except StopIteration:  # Finished iteration
             raise
         if isinstance(file, Passage):  # Not really a file, but a Passage
             passage = file
         else:  # A file
             attempts = 3
             while not os.path.exists(file):
                 if attempts == 0:
                     print("File not found: %s" % file, file=sys.stderr)
                     return next(self)
                 print("Failed reading %s, trying %d more times..." % (file, attempts), file=sys.stderr)
                 time.sleep(5)
                 attempts -= 1
             try:
                 passage = file2passage(file)  # XML or binary format
             except (IOError, ParseError):  # Failed to read as passage file
                 base, ext = os.path.splitext(os.path.basename(file))
                 converter = self.converters[ext.lstrip(".")]
                 self._file_handle = open(file, encoding="utf-8")
                 self._split_iter = iter(converter(self._file_handle, passage_id=base))
         if self.split:
             if self._split_iter is None:
                 self._split_iter = (passage,)
             self._split_iter = iter(s for p in self._split_iter for s in
                                     split2segments(p, is_sentences=self.sentences))
     if self._split_iter is not None:  # Either set before or initialized now
         try:
             # noinspection PyTypeChecker
             passage = next(self._split_iter)
         except StopIteration:  # Finished this converter
             self._split_iter = None
             if self._file_handle is not None:
                 self._file_handle.close()
                 self._file_handle = None
             return next(self)
     return passage
Пример #5
0
def read_passages(files):
    """
    Lazily read passages from a mixed sequence of paths and Passage objects.

    Each path is first read with ioutil.file2passage (XML or binary format). If that fails,
    the file is opened as UTF-8 text and passed to the converter registered for its extension
    in convert.FROM_FORMAT, falling back to convert.from_text for unknown extensions.
    When Config().split is set, passages are split to segments before being yielded.

    :param files: iterable of files or Passage objects
    :return: generator of passages from all files given
    :raises IOError: if an entry is neither a Passage nor an existing file
    """
    for file in files:
        if isinstance(file, core.Passage):  # Not really a file, but a Passage
            passage = file
        elif os.path.exists(file):  # A file
            try:
                passage = ioutil.file2passage(file)  # XML or binary format
            except (IOError, ParseError):  # Failed to read as passage file
                base, ext = os.path.splitext(os.path.basename(file))
                converter = convert.FROM_FORMAT.get(ext.lstrip("."), convert.from_text)
                # Explicit encoding: without it, decoding depends on the platform locale
                with open(file, encoding="utf-8") as f:
                    yield from converter(f, passage_id=base, split=Config().split)
                continue  # The converter handled splitting itself
        else:
            raise IOError("File not found: %s" % file)
        if Config().split:
            yield from convert.split2segments(passage, is_sentences=Config().sentences)
        else:
            yield passage