def iterparas(self, fileids=None, categories=None):
    """
    Return an iterator over paragraphs.

    With no ``fileids`` and no ``categories`` the 'paragraph' elements
    produced by ``xml_utils.iterparse`` for ``self.filename`` are each
    wrapped in ``Paragraph``.  Otherwise the paragraphs of every
    document returned by ``self.iterdocuments(fileids, categories)``
    are chained together.
    """
    if fileids is None and categories is None:
        xml_paras = xml_utils.iterparse(self.filename, 'paragraph')
        return compat.imap(Paragraph, xml_paras)

    docs = self.iterdocuments(fileids, categories)
    # chain.from_iterable pulls documents one at a time; chain(*...) would
    # unpack the generator and load every document before yielding anything.
    return itertools.chain.from_iterable(doc.iterparas() for doc in docs)
def iter_documents(self, fileids=None, categories=None, _destroy=False):
    """
    Yield corpus documents one at a time.

    The ids come from ``self._filter_ids(fileids, categories)`` and each
    one is turned into a document with ``self.get_document``.  When
    ``_destroy`` is true, ``destroy()`` is called on a document once the
    consumer resumes the generator after receiving it.
    """
    selected_ids = self._filter_ids(fileids, categories)
    documents = imap(self.get_document, selected_ids)
    for document in documents:
        yield document
        if not _destroy:
            continue
        document.destroy()
def itersents(self, fileids=None, categories=None):
    """
    Return an iterator over sentences.

    With no ``fileids`` and no ``categories`` the 'sentence' elements
    produced by ``xml_utils.iterparse`` for ``self.filename`` are each
    wrapped in ``Sentence``.  Otherwise the sentences of every document
    returned by ``self.iterdocuments(fileids, categories)`` are chained
    together.
    """
    if fileids is None and categories is None:
        xml_sents = xml_utils.iterparse(self.filename, 'sentence')
        return compat.imap(Sentence, xml_sents)

    docs = self.iterdocuments(fileids, categories)
    # chain.from_iterable pulls documents one at a time; chain(*...) would
    # unpack the generator and load every document before yielding anything.
    return itertools.chain.from_iterable(doc.itersents() for doc in docs)
def iterdocuments(self, fileids=None, categories=None):
    """
    Return an iterator over corpus documents.

    If either ``fileids`` or ``categories`` is given, the ids selected by
    ``self._filter_ids`` are resolved one by one with
    ``self.get_document``.  Otherwise every 'text' element produced by
    ``xml_utils.iterparse`` for ``self.filename`` is wrapped in
    ``Document``.
    """
    unfiltered = fileids is None and categories is None
    if not unfiltered:
        # The ids are computed right away; only the document lookups
        # are deferred until the result is iterated.
        wanted_ids = self._filter_ids(fileids, categories)
        return (self.get_document(wanted) for wanted in wanted_ids)

    text_elements = xml_utils.iterparse(self.filename, 'text')
    return compat.imap(Document, text_elements)
def _filter_ids(self, fileids=None, categories=None):
    """
    Select document ids, optionally restricted by category.

    ``fileids`` is normalised with ``make_iterable``, which also receives
    the keys of the mapping returned by ``self._get_meta()`` (presumably
    the fallback used when ``fileids`` is None -- confirm against
    ``make_iterable``).

    Without ``categories`` every id is passed through ``str``.  With
    ``categories`` the ids are yielded unchanged, keeping only those for
    which ``some_items_match`` accepts the document's ``categories``
    against the given patterns.
    """
    corpus_meta = self._get_meta()
    candidate_ids = make_iterable(fileids, corpus_meta.keys())

    if categories is not None:
        patterns = make_iterable(categories)
        return (
            doc_id for doc_id in candidate_ids
            if some_items_match(corpus_meta[doc_id].categories, patterns)
        )

    return imap(str, candidate_ids)
def iter_parsed_sents(self):
    """
    Return an iterator that applies ``_sentence_parsed_words`` to each
    item produced by ``self._xml_sents()``.
    """
    xml_sentences = self._xml_sents()
    return imap(_sentence_parsed_words, xml_sentences)
def iter_tagged_sents(self):
    """
    Return an iterator that applies ``_sentence_tagged_words`` to each
    item produced by ``self._xml_sents()``.
    """
    xml_sentences = self._xml_sents()
    return imap(_sentence_tagged_words, xml_sentences)
def iter_raw_sents(self):
    """
    Return an iterator that applies ``_sentence_source`` to each item
    produced by ``self._xml_sents()``.
    """
    xml_sentences = self._xml_sents()
    return imap(_sentence_source, xml_sentences)
def itersents(self):
    """
    Return an iterator of ``Sentence`` objects, one for each element
    that ``self.root.findall('paragraphs//sentence')`` returns.
    """
    sentence_nodes = self.root.findall('paragraphs//sentence')
    return compat.imap(Sentence, sentence_nodes)
def iterparas(self):
    """
    Return an iterator of ``Paragraph`` objects, one for each element
    that ``self.root.findall('paragraphs/paragraph')`` returns.
    """
    paragraph_nodes = self.root.findall('paragraphs/paragraph')
    return compat.imap(Paragraph, paragraph_nodes)
def itersents(self):
    """
    Return an iterator of ``Sentence`` objects, one for each element
    that ``self.root.findall('sentence')`` returns.
    """
    sentence_nodes = self.root.findall('sentence')
    return compat.imap(Sentence, sentence_nodes)