def similar_to(self, word, n=10, should_enrich_with_web=False):
    """Return the ``n`` words most similar to ``word``.

    The query word is normalized through WordTransformer, looked up in
    the vector corpus, optionally enriched with web-derived counts,
    tf-idf weighted, and run through the similarity index.

    :param word: query word (normalized before lookup).
    :param n: number of best matches to return.
    :param should_enrich_with_web: when True, augment the word's vector
        via ``self._enrich_vec_with_web`` before the similarity query.
    :return: list of ``(word, score)`` tuples, best matches first.
    """
    # NOTE(review): the original defined two inner helpers (_max_dim,
    # _unvec) that were never called; they have been removed as dead code.
    wt = WordTransformer()
    word = wt.transform(word)
    # limit the similarity index to the n best matches
    self.sim.numBest = n
    vec = self.vector_corpus[word]
    if should_enrich_with_web:
        vec = self._enrich_vec_with_web(vec, word)
    vec = self.tfidf[vec]
    # map internal document ids back to their words
    return [(self.docid2word[docid], score)
            for docid, score in self.sim[vec]]
def index_database(corpus_file_or_folder, index_file, remove_once=True):
    """Index every word occurrence in a corpus into an on-disk index.

    Walks ``corpus_file_or_folder``, records for each distinct word per
    line the (word, file, byte-offset) context, and renders a progress
    bar with an ETA while doing so.

    :param corpus_file_or_folder: file or directory tree to index.
    :param index_file: path of the index database to create.
    :param remove_once: if True, words that appear in the corpus only
        once are filtered out at the end.
    """
    from datetime import datetime
    from terminal import ProgressBar

    wt = WordTransformer()
    # single-argument parenthesized print: identical output on Python 2,
    # and also valid Python 3 syntax
    print("Initing database (%s)..." % index_file)
    conn = init_index(index_file)
    word_ids = dict()

    print("Calculating total corpus size...")
    filenames = list(walk(corpus_file_or_folder))
    total_offset = 0
    total_bytes = sum(float(os.stat(f).st_size) for f in filenames)

    pb = ProgressBar(width=20, color='green')
    start = datetime.now()
    print("Beginning indexing...")
    for fileid, filename in enumerate(filenames):
        offset = 0
        with open(filename) as f:
            add_file(conn, filename, fileid)
            for j, line in enumerate(f):
                line_bytes = len(line)
                if line.strip() == '.START':
                    # special token in the wsj corpus file
                    offset += line_bytes
                    continue
                words = wt.tokenize(line)
                # set membership is O(1); the original used a list,
                # making the duplicate check O(n) per word
                processed = set()
                for word in words:
                    if word in processed:
                        # no need to record when a word appears twice
                        # that'll fall out later
                        continue
                    processed.add(word)
                    wordid = add_word(conn, word, word_ids)
                    add_context(conn, wordid, fileid, offset)
                offset += line_bytes
                total_offset += line_bytes
                if j % 2500 == 0:
                    # progress + ETA extrapolated from bytes processed so far
                    pct = float(total_offset) / total_bytes
                    eta = ((datetime.now() - start) / total_offset) * int(total_bytes - total_offset)
                    msg = "indexing - ETA %s" % (str(eta)[:10])
                    pb.render(pct, msg)
    msg = "completed in %s" % (datetime.now() - start)
    pb.render(1, msg)
    if remove_once:
        print("filtering words appearing only once...")
        remove_singletons(conn)
    print("syncing to disk... (almost done!)")
    close_index(conn)