Пример #1
0
    def iterparas(self, fileids=None, categories=None):
        """
        Return an iterator over paragraphs.

        When neither ``fileids`` nor ``categories`` is given, every
        'paragraph' element produced by ``xml_utils.iterparse`` for
        ``self.filename`` is wrapped in ``Paragraph``.

        Otherwise the documents selected by
        ``self.iterdocuments(fileids, categories)`` are visited in order and
        the results of their ``iterparas()`` calls are chained together.
        """
        if fileids is None and categories is None:
            xml_paras = xml_utils.iterparse(self.filename, 'paragraph')
            return compat.imap(Paragraph, xml_paras)

        docs = self.iterdocuments(fileids, categories)
        # chain.from_iterable consumes ``docs`` one document at a time.
        # Star-unpacking into chain() would exhaust the generator up front,
        # building every selected document before the first paragraph is
        # returned.
        return itertools.chain.from_iterable(doc.iterparas() for doc in docs)
Пример #2
0
 def iter_documents(self, fileids=None, categories=None, _destroy=False):
     """
     Yield corpus documents one at a time.

     Document ids come from ``self._filter_ids(fileids, categories)`` and
     each one is turned into a document with ``self.get_document``.

     If ``_destroy`` is true, ``destroy()`` is called on a document as soon
     as the consumer asks for the next one, i.e. after the caller has
     finished with the yielded object.
     """
     for doc_id in self._filter_ids(fileids, categories):
         document = self.get_document(doc_id)
         yield document
         # Runs only when the generator is resumed after the yield.
         if _destroy:
             document.destroy()
Пример #3
0
    def itersents(self, fileids=None, categories=None):
        """
        Return an iterator over sentences.

        When neither ``fileids`` nor ``categories`` is given, every
        'sentence' element produced by ``xml_utils.iterparse`` for
        ``self.filename`` is wrapped in ``Sentence``.

        Otherwise the documents selected by
        ``self.iterdocuments(fileids, categories)`` are visited in order and
        the results of their ``itersents()`` calls are chained together.
        """
        if fileids is None and categories is None:
            xml_sents = xml_utils.iterparse(self.filename, 'sentence')
            return compat.imap(Sentence, xml_sents)

        docs = self.iterdocuments(fileids, categories)
        # chain.from_iterable consumes ``docs`` one document at a time.
        # Star-unpacking into chain() would exhaust the generator up front,
        # building every selected document before the first sentence is
        # returned.
        return itertools.chain.from_iterable(doc.itersents() for doc in docs)
Пример #4
0
    def iterdocuments(self, fileids=None, categories=None):
        """
        Return an iterator over corpus documents.

        With a ``fileids`` and/or ``categories`` filter, ids are resolved
        through ``self._filter_ids`` and each document is fetched with
        ``self.get_document`` as the result is iterated.

        Without any filter, each 'text' element produced by
        ``xml_utils.iterparse`` for ``self.filename`` is wrapped in
        ``Document``.
        """
        whole_corpus = fileids is None and categories is None
        if not whole_corpus:
            selected_ids = self._filter_ids(fileids, categories)
            return (self.get_document(ident) for ident in selected_ids)

        xml_docs = xml_utils.iterparse(self.filename, 'text')
        return compat.imap(Document, xml_docs)
Пример #5
0
    def _filter_ids(self, fileids=None, categories=None):
        """
        Return an iterable of document ids narrowed by the given filters.

        Candidate ids are ``make_iterable(fileids, meta.keys())`` where
        ``meta`` is the result of ``self._get_meta()``.

        With ``categories`` given, an id is kept only if
        ``some_items_match`` accepts its ``meta[doc_id].categories`` against
        the category patterns.  Without ``categories``, every candidate id
        is passed through ``str``.
        """
        meta = self._get_meta()
        fileids = make_iterable(fileids, meta.keys())

        if categories is not None:
            patterns = make_iterable(categories)
            # NOTE(review): ids are used as-is for the ``meta`` lookup here,
            # whereas the branch below converts them with ``str`` -- confirm
            # callers always pass ids in the form ``meta`` is keyed by.
            return (
                ident for ident in fileids
                if some_items_match(meta[ident].categories, patterns)
            )

        return imap(str, fileids)
Пример #6
0
 def iter_parsed_sents(self):
     """
     Return the result of mapping ``_sentence_parsed_words`` over the items
     of ``self._xml_sents()`` with ``imap``.
     """
     xml_sents = self._xml_sents()
     return imap(_sentence_parsed_words, xml_sents)
Пример #7
0
 def iter_tagged_sents(self):
     """
     Return the result of mapping ``_sentence_tagged_words`` over the items
     of ``self._xml_sents()`` with ``imap``.
     """
     xml_sents = self._xml_sents()
     return imap(_sentence_tagged_words, xml_sents)
Пример #8
0
 def iter_raw_sents(self):
     """
     Return the result of mapping ``_sentence_source`` over the items of
     ``self._xml_sents()`` with ``imap``.
     """
     xml_sents = self._xml_sents()
     return imap(_sentence_source, xml_sents)
Пример #9
0
 def itersents(self):
     """
     Wrap in ``Sentence`` every element that ``self.root.findall`` returns
     for the path 'paragraphs//sentence'.
     """
     sentence_nodes = self.root.findall('paragraphs//sentence')
     return compat.imap(Sentence, sentence_nodes)
Пример #10
0
 def iterparas(self):
     """
     Wrap in ``Paragraph`` every element that ``self.root.findall`` returns
     for the path 'paragraphs/paragraph'.
     """
     paragraph_nodes = self.root.findall('paragraphs/paragraph')
     return compat.imap(Paragraph, paragraph_nodes)
Пример #11
0
 def itersents(self):
     """
     Wrap in ``Sentence`` every element that ``self.root.findall`` returns
     for the path 'sentence'.
     """
     sentence_nodes = self.root.findall('sentence')
     return compat.imap(Sentence, sentence_nodes)