Example #1
    def similar_to(self, word, n=10, should_enrich_with_web=False):
        """Return the n words most similar to `word`, as (word, score)
        pairs ranked by the similarity index."""
        def _max_dim(vec):
            # dimension with the largest weight in a sparse (id, weight) vector
            return max(vec, key=lambda x: x[1])

        def _unvec(vec):
            # map a sparse (id, weight) vector back to {token: weight}
            return dict((self.dictionary[a], b) for a, b in vec)

        wt = WordTransformer()
        word = wt.transform(word)
        self.sim.numBest = n  # ask the similarity index for the top n matches
        vec = self.vector_corpus[word]
        if should_enrich_with_web:
            vec = self._enrich_vec_with_web(vec, word)
        vec = self.tfidf[vec]  # weight the raw count vector with the tf-idf model

        retval = []
        for docid, score in self.sim[vec]:
            # translate internal document ids back into the words they stand for
            docword = self.docid2word[docid]
            retval.append((docword, score))

        return retval
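
A minimal usage sketch for the method above. The enclosing class and its constructor are not shown here, so `SimilarityModel` and its argument are hypothetical placeholders; only the `similar_to` call itself comes from the code above:

    model = SimilarityModel('corpus.mm')   # hypothetical: whatever sets up dictionary/tfidf/sim
    for word, score in model.similar_to('bank', n=5):
        print word, score                  # nearest neighbours and their similarity scores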
def index_database(corpus_file_or_folder, index_file, remove_once=True):
    """Index every file under corpus_file_or_folder into index_file.
    If remove_once is True, words that appear in the corpus only once
    are removed from the index at the end."""

    wt = WordTransformer()

    print "Initializing database (%s)..." % index_file
    conn = init_index(index_file)
    word_ids = dict()
    
    print "Calculating total corpus size..."
    filenames = list(walk(corpus_file_or_folder))
    total_offset = 0
    # total corpus size in bytes; used to drive the progress bar and the ETA
    total_bytes = sum(float(os.stat(f).st_size) for f in filenames)

    from datetime import datetime
    from terminal import ProgressBar
    pb = ProgressBar(width=20, color='green')
    start = datetime.now()

    print "Beginning indexing..."
    for fileid, filename in enumerate(filenames):
        offset = 0
        with open(filename) as f:
            add_file(conn, filename, fileid)
            for j, line in enumerate(f):
                line_bytes = len(line)  # bytes consumed by this line (including the newline)

                if line.strip() == '.START':
                    # '.START' is a special marker line in the WSJ corpus files; skip it
                    offset += line_bytes
                    continue

                words = wt.tokenize(line)
                processed = set()
                for word in words:
                    if word in processed:
                        # no need to record a word twice for the same line;
                        # the duplicate would fall out later anyway
                        continue
                    processed.add(word)
                    wordid = add_word(conn, word, word_ids)
                    add_context(conn, wordid, fileid, offset)

                offset += line_bytes
                total_offset += line_bytes
                if j % 2500 == 0:
                    # every 2500 lines, refresh the progress bar with a rough ETA:
                    # time elapsed per byte processed, scaled by the bytes remaining
                    pct = float(total_offset) / total_bytes
                    eta = ((datetime.now() - start) / total_offset) * int(total_bytes - total_offset)
                    msg = "indexing  -  ETA %s" % (str(eta)[:10])
                    pb.render(pct, msg)
        
    msg = "completed in %s" % (datetime.now() - start)
    pb.render(1, msg)
    if remove_once:
        print "filtering words appearing only once..."
        remove_singletons(conn)

    print "syncing to disk... (almost done!)"
    close_index(conn)
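
A minimal sketch of how the indexer might be invoked; the paths are placeholders, and only `index_database` itself comes from the code above:

    # build an index over a folder of corpus files, dropping words seen only once
    index_database('wsj_corpus/', 'wsj.index', remove_once=True)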