def _next_passage(self):
    """
    Advance the reader by one step and return the passage produced, if any.

    When no sub-iterator is active (self._split_iter is None), the next item is taken from
    self._files_iter. A Passage instance is used directly. Anything else is treated as a path:
    the method waits for it to exist (retrying self.attempts times, self.delay seconds apart),
    loads it with file2passage, and on IOError/ParseError falls back to the converter found in
    self.converters under the file extension, reading the file as UTF-8 text. If self.split is
    set, the result is wrapped so that the segments produced by split2segments are returned one
    per call.

    The file handle opened for a converter is kept in self._file_handle while the sub-iterator is
    active. It is closed as soon as the sub-iterator is exhausted or raises, so a failing
    conversion does not leave the file open.

    :return: the next passage, or None if nothing was produced on this call: either the file was
             still missing after all retries, or the active sub-iterator has just run out
    :raise StopIteration: if self._files_iter has no more items
    :raise IOError: if the file cannot be read as a passage and no converter matches its extension
    """
    passage = None
    if self._split_iter is None:
        file = next(self._files_iter)  # StopIteration propagates: finished iteration
        if isinstance(file, Passage):  # Not really a file, but a Passage
            passage = file
        else:  # A file
            attempts = self.attempts
            while not os.path.exists(file):
                # NOTE(review): external_write_mode presumably keeps these messages from clashing
                # with other output on stderr - confirm against its definition
                with external_write_mode(file=sys.stderr):
                    if attempts == 0:
                        print("File not found: %s" % file, file=sys.stderr)
                        return None
                    print("Failed reading %s, trying %d more times..." % (file, attempts), file=sys.stderr)
                time.sleep(self.delay)
                attempts -= 1
            try:
                passage = file2passage(file)  # XML or binary format
            except (IOError, ParseError) as e:  # Failed to read as passage file
                base, ext = os.path.splitext(os.path.basename(file))
                converter = self.converters.get(ext.lstrip("."))
                if converter is None:
                    raise IOError("Could not read %s file. See error message above. "
                                  "If this file's format is not %s, try adding '.txt' suffix to read as plain text:"
                                  " '%s'" % (ext, ext, file)) from e
                self._file_handle = open(file, encoding="utf-8")
                # An extra empty line is appended after the file's own lines before conversion
                self._split_iter = iter(converter(chain(self._file_handle, [""]), passage_id=base, lang=self.lang))
        if self.split:
            if self._split_iter is None:  # A single passage: treat it as a one-element sequence
                self._split_iter = (passage,)
            self._split_iter = iter(s for p in self._split_iter for s in
                                    split2segments(p, is_sentences=self.sentences, lang=self.lang))
    if self._split_iter is not None:  # Either set before or initialized now
        produced = False
        try:
            passage = next(self._split_iter)
            produced = True
        except StopIteration:  # Finished this converter
            return None
        finally:
            if not produced:
                # Exhausted or failed: either way drop the iterator and release the file,
                # so that an error while converting does not leak the open handle
                self._split_iter = None
                if self._file_handle is not None:
                    self._file_handle.close()
                    self._file_handle = None
    return passage
def _next_passage(self):
    """
    Return the next passage, moving on to the following input whenever the current one is used up.

    When no sub-iterator is active (self._split_iter is None), the next item is taken from
    self._files_iter. A Passage instance is used directly. Anything else is treated as a path:
    the method waits for it to exist (3 retries, 5 seconds apart), loads it with file2passage,
    and on IOError/ParseError falls back to the converter stored in self.converters under the
    file extension, reading the file as UTF-8 text. If self.split is set, the result is wrapped
    so that the segments produced by split2segments are returned one per call.

    The file handle opened for a converter is kept in self._file_handle while the sub-iterator is
    active. It is closed as soon as the sub-iterator is exhausted or raises, so a failing
    conversion does not leave the file open.

    :return: the next passage; when a file is still missing after all retries, or the active
             sub-iterator has run out, the result of next(self) is returned instead
    :raise StopIteration: if self._files_iter has no more items
    """
    passage = None
    if self._split_iter is None:
        file = next(self._files_iter)  # StopIteration propagates: finished iteration
        if isinstance(file, Passage):  # Not really a file, but a Passage
            passage = file
        else:  # A file
            attempts = 3
            while not os.path.exists(file):
                if attempts == 0:
                    print("File not found: %s" % file, file=sys.stderr)
                    return next(self)  # Give up on this file and continue with the next input
                print("Failed reading %s, trying %d more times..." % (file, attempts), file=sys.stderr)
                time.sleep(5)
                attempts -= 1
            try:
                passage = file2passage(file)  # XML or binary format
            except (IOError, ParseError):  # Failed to read as passage file
                base, ext = os.path.splitext(os.path.basename(file))
                # NOTE(review): plain indexing - what happens for an unknown extension depends on
                # the type of self.converters, which is not visible here
                converter = self.converters[ext.lstrip(".")]
                self._file_handle = open(file, encoding="utf-8")
                self._split_iter = iter(converter(self._file_handle, passage_id=base))
        if self.split:
            if self._split_iter is None:  # A single passage: treat it as a one-element sequence
                self._split_iter = (passage,)
            self._split_iter = iter(s for p in self._split_iter for s in
                                    split2segments(p, is_sentences=self.sentences))
    if self._split_iter is not None:  # Either set before or initialized now
        produced = False
        try:
            # noinspection PyTypeChecker
            passage = next(self._split_iter)
            produced = True
        except StopIteration:  # Finished this converter; handled below, after cleanup
            pass
        finally:
            if not produced:
                # Exhausted or failed: either way drop the iterator and release the file,
                # so that an error while converting does not leak the open handle
                self._split_iter = None
                if self._file_handle is not None:
                    self._file_handle.close()
                    self._file_handle = None
        if not produced:
            # State must already be reset here, otherwise next(self) would see the spent iterator
            return next(self)
    return passage
def _next_passage(self):
    """
    Take one step through the inputs and return the passage that step yields, if any.

    With no active sub-iterator (self._split_iter is None) the next item of self._files_iter is
    fetched. Passage instances pass through unchanged. Any other item is taken to be a path: the
    method polls for its existence (self.attempts retries with self.delay seconds between them),
    reads it with file2passage, and if that raises IOError or ParseError, opens it as UTF-8 text
    and feeds it to the converter looked up in self.converters by file extension. With self.split
    set, the passages are further broken up by split2segments and handed out one per call.

    self._file_handle holds the file opened for a converter while self._split_iter is active. Both
    are reset when the sub-iterator runs out and also when it raises, so an error during
    conversion does not leave the file open.

    :return: the next passage, or None when this call produced nothing: the file never appeared
             within the allowed retries, or the active sub-iterator has just been exhausted
    :raise StopIteration: if self._files_iter has no more items
    :raise IOError: if the file cannot be read as a passage and no converter matches its extension
    """
    passage = None
    if self._split_iter is None:
        file = next(self._files_iter)  # StopIteration propagates: finished iteration
        if isinstance(file, Passage):  # Not really a file, but a Passage
            passage = file
        else:  # A file
            attempts = self.attempts
            while not os.path.exists(file):
                # NOTE(review): external_write_mode presumably coordinates writes to stderr with
                # other output - confirm against its definition
                with external_write_mode(file=sys.stderr):
                    if attempts == 0:
                        print("File not found: %s" % file, file=sys.stderr)
                        return None
                    print("Failed reading %s, trying %d more times..." % (file, attempts), file=sys.stderr)
                time.sleep(self.delay)
                attempts -= 1
            try:
                passage = file2passage(file)  # XML or binary format
            except (IOError, ParseError) as e:  # Failed to read as passage file
                base, ext = os.path.splitext(os.path.basename(file))
                converter = self.converters.get(ext.lstrip("."))
                if converter is None:
                    raise IOError("Could not read %s file. See error message above. "
                                  "If this file's format is not %s, try adding '.txt' suffix to read as plain text:"
                                  " '%s'" % (ext, ext, file)) from e
                self._file_handle = open(file, encoding="utf-8")
                # The converter receives the file's lines followed by one extra empty line
                lines = chain(self._file_handle, [""])
                self._split_iter = iter(converter(lines, passage_id=base, lang=self.lang))
        if self.split:
            if self._split_iter is None:  # Only a single passage so far: make it iterable
                self._split_iter = (passage,)
            self._split_iter = iter(s for p in self._split_iter for s in
                                    split2segments(p, is_sentences=self.sentences, lang=self.lang))
    if self._split_iter is not None:  # Either set before or initialized now
        produced = False
        try:
            passage = next(self._split_iter)
            produced = True
        except StopIteration:  # Finished this converter
            return None
        finally:
            if not produced:
                # Reached on exhaustion and on any error alike: forget the iterator and close
                # the file so that a failed conversion does not leak the handle
                self._split_iter = None
                if self._file_handle is not None:
                    self._file_handle.close()
                    self._file_handle = None
    return passage
def _next_passage(self):
    """
    Return the next passage, continuing with the following input when the current one runs out.

    With no active sub-iterator (self._split_iter is None) the next item of self._files_iter is
    fetched. Passage instances pass through unchanged. Any other item is taken to be a path: the
    method polls for its existence (3 retries, 5 seconds apart), reads it with file2passage, and
    if that raises IOError or ParseError, opens it as UTF-8 text and feeds it to the converter
    stored in self.converters under the file extension. With self.split set, the passages are
    further broken up by split2segments and handed out one per call.

    self._file_handle holds the file opened for a converter while self._split_iter is active. Both
    are reset when the sub-iterator runs out and also when it raises, so an error during
    conversion does not leave the file open.

    :return: the next passage; if a file never appears within the retries, or the active
             sub-iterator has been exhausted, whatever next(self) returns
    :raise StopIteration: if self._files_iter has no more items
    """
    passage = None
    if self._split_iter is None:
        file = next(self._files_iter)  # StopIteration propagates: finished iteration
        if isinstance(file, Passage):  # Not really a file, but a Passage
            passage = file
        else:  # A file
            attempts = 3
            while not os.path.exists(file):
                if attempts == 0:
                    print("File not found: %s" % file, file=sys.stderr)
                    return next(self)  # Skip the missing file and carry on with the next input
                print("Failed reading %s, trying %d more times..." % (file, attempts), file=sys.stderr)
                time.sleep(5)
                attempts -= 1
            try:
                passage = file2passage(file)  # XML or binary format
            except (IOError, ParseError):  # Failed to read as passage file
                base, ext = os.path.splitext(os.path.basename(file))
                # NOTE(review): behaviour for an extension without a converter depends on the
                # type of self.converters, which is defined outside this block
                converter = self.converters[ext.lstrip(".")]
                self._file_handle = open(file, encoding="utf-8")
                self._split_iter = iter(converter(self._file_handle, passage_id=base))
        if self.split:
            if self._split_iter is None:  # Only a single passage so far: make it iterable
                self._split_iter = (passage,)
            self._split_iter = iter(s for p in self._split_iter for s in
                                    split2segments(p, is_sentences=self.sentences))
    if self._split_iter is not None:  # Either set before or initialized now
        produced = False
        try:
            # noinspection PyTypeChecker
            passage = next(self._split_iter)
            produced = True
        except StopIteration:  # Finished this converter; the hand-over happens after cleanup
            pass
        finally:
            if not produced:
                # Reached on exhaustion and on any error alike: forget the iterator and close
                # the file so that a failed conversion does not leak the handle
                self._split_iter = None
                if self._file_handle is not None:
                    self._file_handle.close()
                    self._file_handle = None
        if not produced:
            # Delegating only after the reset keeps next(self) from revisiting the spent iterator
            return next(self)
    return passage
def read_passages(files):
    """
    Lazily read passages from a mix of Passage objects and file paths.

    An item that is a core.Passage is used as is. Any other item is treated as a path: it is first
    loaded with ioutil.file2passage; if that raises IOError or ParseError, the file is opened as
    UTF-8 text and handed to the converter registered in convert.FROM_FORMAT for its extension
    (convert.from_text when the extension is not registered), and everything the converter yields
    is passed on. Passages that were not produced by a converter are split with
    convert.split2segments when Config().split is set.

    :param files: iterable of files or Passage objects
    :return: generator of passages from all files given
    :raise IOError: if an item is not a Passage and no file exists at that path
    """
    for file in files:
        if isinstance(file, core.Passage):  # Not really a file, but a Passage
            passage = file
        elif os.path.exists(file):  # A file
            try:
                passage = ioutil.file2passage(file)  # XML or binary format
            except (IOError, ParseError):  # Failed to read as passage file
                base, ext = os.path.splitext(os.path.basename(file))
                converter = convert.FROM_FORMAT.get(ext.lstrip("."), convert.from_text)
                # Explicit encoding so that reading does not depend on the platform's default
                with open(file, encoding="utf-8") as f:
                    # The converter is given the split flag itself, so its output is passed on as is
                    yield from converter(f, passage_id=base, split=Config().split)
                continue
        else:
            raise IOError("File not found: %s" % file)
        if Config().split:
            yield from convert.split2segments(passage, is_sentences=Config().sentences)
        else:
            yield passage