Example #1
    def __iter__(self):
        from datetime import datetime
        from terminal import ProgressBar
        pb = ProgressBar('green', width=20)
        self.passes += 1
        
        start = datetime.now()

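        # len(self) presumably counts the vectors this pass will yield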
        num_docs = float(len(self))
        for i, word in enumerate(self._words_iter()):
            self.docid2word[i] = word
            self.word2docid[word] = i
            if (i+1) % 10 == 0:
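                # ETA: elapsed time scaled by items remaining over items done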
                eta = (datetime.now() - start) * int(num_docs - i) // (i + 1)
                pct = i/num_docs
                msg = "corpus pass #%d / ETA %s" % (self.passes, eta)
                pb.render(pct, msg)

            yield self._get_word_context_vector(word)

        runtime = datetime.now() - start
        msg = "completed pass #%d in %s" % (self.passes, runtime)
        pb.render(1, msg)
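
A minimal sketch of how this iterator might be consumed; the WordContextCorpus name and its constructor argument are assumptions for illustration, not part of the snippet:

# Hypothetical driver code; only __iter__ above comes from the original.
corpus = WordContextCorpus('wsj.index')  # assumed wrapper class
for vector in corpus:                    # runs one full pass over the corpus
    pass  # each item is the context vector for one vocabulary word
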
def index_database(corpus_file_or_folder, index_file, remove_once=True):
    # If remove_once is True, words that appear in the corpus only
    # once will not be indexed.
    import os

    wt = WordTransformer()

    print "Initing database (%s)..." % index_file
    conn = init_index(index_file)
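    # word -> id cache; presumably lets add_word reuse known ids without
    # re-querying the index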
    word_ids = dict()
    
    print "Calculating total corpus size..."
    filenames = list(walk(corpus_file_or_folder))
    total_offset = 0
    total_bytes = sum(float(os.stat(f).st_size) for f in filenames)

    from datetime import datetime
    from terminal import ProgressBar
    pb = ProgressBar(width=20, color='green')
    start = datetime.now()

    print "Beginning indexing..."
    for fileid, filename in enumerate(filenames):
        offset = 0
        with open(filename) as f:
            add_file(conn, filename, fileid)
            for j, line in enumerate(f):
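                # a Python 2 str's len() is its size in bytes, so the offsets
                # below are byte positions within the file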
                line_bytes = len(line)

                if line.strip() == '.START':
                    # document-start marker in the WSJ corpus files; skip it,
                    # but still advance both byte counters so the progress
                    # estimate stays accurate
                    offset += line_bytes
                    total_offset += line_bytes
                    continue

                words = wt.tokenize(line)
                processed = set()
                for word in words:
                    if word in processed:
                        # no need to record a word twice for the same line;
                        # duplicates fall out later
                        continue
                    processed.add(word)
                    wordid = add_word(conn, word, word_ids)
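                    # record this occurrence: (word, file, byte offset of line)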
                    add_context(conn, wordid, fileid, offset)

                offset += line_bytes
                total_offset += line_bytes
                if j % 2500 == 0:
                    pct = float(total_offset) / total_bytes
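                    # ETA: average time per byte so far, times bytes remaining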
                    eta = ((datetime.now() - start) / total_offset) * int(total_bytes - total_offset)
                    msg = "indexing  -  ETA %s" % (str(eta)[:10])
                    pb.render(pct, msg)
        
    msg = "completed in %s" % (datetime.now() - start)
    pb.render(1, msg)
    if remove_once:
        print "filtering words appearing only once..."
        remove_singletons(conn)

    print "syncing to disk... (almost done!)"
    close_index(conn)
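
A usage sketch; both paths below are placeholders, not from the original:

if __name__ == '__main__':
    # Index a WSJ-style corpus directory into an on-disk index.
    index_database('corpora/wsj/', 'wsj.index', remove_once=True)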