Python Corpus.add 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: nltk_ext.corpus.corpus

클래스/타입: Corpus

메소드/함수: add

hotexamples.com에서의 예제들: 2

Python Corpus.add - 2개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python nltk_ext.corpus.corpus.Corpus.add의 실제 사용 예제 중 가장 높은 평가를 받은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

add(2)

df(1)

generate_doc_lens(1)

generate_neighbor_list(1)

idf(1)

neighbors(1)

ranked_terms(1)

tf(1)

tf_idf(1)

vocabulary(1)

예제 #1

파일 보기

파일: add_to_corpus.py 프로젝트: jgerrish/nltk_ext

class AddToCorpus(PipelineModule):
    """Pipeline module that collects the documents it processes in a Corpus.

    The accumulated corpus is handed back by ``post_process`` and can be
    written to ``output`` as JSON with ``write``.
    """

    def __init__(self, output=None, corpus=None):
        """
        Create an AddToCorpus module.

        output is the path ``write`` serializes the corpus to; when it is
        None, ``write`` does nothing.
        corpus is an existing corpus to add to; when it is None a new,
        empty Corpus is created.
        """
        self.output = output
        # Identity test, so a corpus object that overrides __eq__ can never
        # be mistaken for "no corpus given".
        self.corpus = Corpus() if corpus is None else corpus
        self.module_type = enumModuleType(enumModuleType.Document)
        self.module_processing_type = \
            enumModuleProcessingType(enumModuleProcessingType.PostProcess)

    def process(self, data):
        """
        Add every document in data to the corpus.

        data is an iterable of documents.  Each document is printed (debug
        trace) and then added individually.
        """
        for document in data:
            # Add the current document, not the whole iterable: the previous
            # code passed `data` itself to Corpus.add on every iteration.
            print(document)
            self.corpus.add(document)

    def post_process(self):
        """Return the corpus built up by ``process``."""
        return self.corpus

    def as_json(self):
        """
        Return the corpus serialized as an indented JSON string with sorted
        keys.

        NOTE(review): this relies on the corpus object being serializable by
        json.dumps -- confirm against the Corpus implementation.
        """
        return json.dumps(self.corpus, sort_keys=True, indent=4,
                          separators=(',', ': '))

    def write(self):
        """Write the JSON form of the corpus to ``self.output``, if set."""
        if self.output is None:
            return
        # Serialize before opening the file so a serialization failure does
        # not leave a truncated output file behind.
        serialized = self.as_json()
        with open(self.output, 'w') as f:
            f.write(serialized)

예제 #2

파일 보기

파일: category_to_corpus.py 프로젝트: jgerrish/nltk_ext

class CategoryToCorpus(PipelineModule):
    """Pipeline module that groups tagged documents by category.

    In "combined" mode the documents of a category are merged into a single
    document of ``self.corpus``; in any other mode they are kept as separate
    documents in the ``self.corpora`` dictionary, keyed by category.
    """

    def __init__(self, output=None, corpus=None,
                 attribute="categories",
                 categories=None, mode="combined"):
        """
        Create a CategoryToCorpus module, which loads a corpus with tagged
        documents.
        If corpus is passed in, it adds to an existing corpus.
        attribute is the key looked up in each document to find its
        categories.
        categories, when given, restricts processing to those categories.
        mode is the corpus loading method to use.  If set to "combined", all
        documents in a category are concatenated to a single document.
        Otherwise each document is loaded separately.
        output is the path ``write`` serializes the result to; when it is
        None, ``write`` does nothing.
        """
        self.output = output
        # non-combined mode: category -> list of documents
        self.corpora = {}
        # combined mode has a single corpus
        self.corpus = Corpus() if corpus is None else corpus
        self.module_type = enumModuleType(enumModuleType.Document)
        self.module_processing_type = \
            enumModuleProcessingType(enumModuleProcessingType.PostProcess)
        self.attribute = attribute
        self.categories = categories
        self.mode = mode
        self.pp = pprint.PrettyPrinter(indent=4)

    def add_document(self, category, document):
        """
        File document under category.

        In non-combined mode the document is appended to the list kept for
        that category in ``self.corpora``.
        In combined mode the first document seen for a category gets the
        category as its doc id and is added to the corpus; the text of every
        later document is appended, separated by a space, to that existing
        corpus document.
        """
        if self.mode != "combined":
            self.corpora.setdefault(category, []).append(document)
            return

        if category in self.corpus:
            merged = self.corpus[category]
            merged.update_text(unicode(merged) + " " + unicode(document))
        else:
            # NOTE(review): this changes the doc id of the caller's document
            # object; a document carrying several categories is re-labelled
            # on each call -- confirm that is intended.
            document.set_doc_id(category)
            self.corpus.add(document)

    def process(self, data):
        """
        Process the documents.  The code looks at the attribute
        attribute of each document and builds a set of corpora from the
        categories in that attribute.  Only list-valued attributes are
        handled; documents without the attribute, or with a non-list value,
        are passed through untouched.
        If categories is set, only documents tagged with one of those
        categories are added, once per matching category.

        This is a generator: every input document is yielded unchanged, and
        documents are only added as the caller consumes the generator.
        """
        for doc in data:
            if self.attribute in doc.document:
                d = doc.document[self.attribute]
                if isinstance(d, list):
                    if self.categories is None:
                        for v in d:
                            self.add_document(v, doc)
                    else:
                        for category in self.categories:
                            if category in d:
                                self.add_document(category, doc)
            yield doc

    def post_process(self):
        """
        method that gets run after all data has been processed
        Returns the per-category dictionary in non-combined mode and the
        combined corpus otherwise.
        TODO: look into optimizing this, seems inefficient, written in derp-mode
        """
        if self.mode != "combined":
            return self.corpora
        return self.corpus

    def as_json(self):
        """
        Return the result for the current mode (the corpora dictionary or
        the combined corpus) as an indented JSON string with sorted keys.

        NOTE(review): this relies on the corpus and document objects being
        serializable by json.dumps -- confirm against their implementation.
        """
        c = self.corpora if self.mode != "combined" else self.corpus
        return json.dumps(c, sort_keys=True, indent=4,
                          separators=(',', ': '))

    def write(self):
        """Write the JSON form of the result to ``self.output``, if set."""
        if self.output is None:
            return
        # Serialize before opening the file so a serialization failure does
        # not leave a truncated output file behind.
        serialized = self.as_json()
        with open(self.output, 'w') as f:
            f.write(serialized)

    def top_categories(self, n=10):
        """
        Print each configured category followed by its n top ranked terms,
        as returned by the corpus' ranked_terms.

        Iterates ``self.categories``, so it raises TypeError when the module
        was created without an explicit categories argument.
        """
        for doc_id in self.categories:
            print(str(doc_id))
            rt = self.corpus.ranked_terms(doc_id, n)
            print("  " + str(rt))