# Example #1
# 0
def run():
    """Tally Bitcoin script-type frequencies over a batch of transactions.

    Fetches up to ``num_hashes`` transaction hashes from PostgreSQL, decodes
    every input and output script via ``decode_script``, and accumulates the
    counts in a ``Counts`` object, writing snapshots to ``stats.txt``.

    Side effects: opens a DB connection; creates/overwrites ``stats.txt``.
    """
    num_hashes = 1000000
    #conn_str = "dbname=abe2 user=yuewang"
    conn_str = "dbname=postgres"
    script_counts = Counts(70)
    # NOTE(review): the original opened stats.txt with a bare open() and only
    # closed it on the success path, leaking the handle on any exception.
    # The with-block guarantees the file is closed either way.
    with open("stats.txt", "w") as out:
        with psycopg2.connect(conn_str, cursor_factory=NamedTupleCursor) as conn:
            from Hashes import fetch_hashes
            hashes = fetch_hashes(num_hashes, conn)
            for i, hash in enumerate(hashes):
                abe_txn = Txn(hash, conn)
                script_counts.add('txn_count')
                # Count the decoded script type of every input and output.
                for txin in abe_txn.tx_in:
                    script_counts.add(decode_script.decode_script(txin.script_bytea))
                for txout in abe_txn.tx_out:
                    script_counts.add(decode_script.decode_script(txout.script_bytea))
                # Periodic checkpoint so partial results survive a crash.
                if i > 0 and i % 10000 == 0:
                    out.write(script_counts.mkString())
                    out.flush()
        # Final snapshot after the connection block exits, as in the original.
        out.write(script_counts.mkString())
langLines = f.readlines()
f.close()
print "Read file"

#CLEAN AND FACTORIZE THE LINES
cleanLines = []
for line, langLine, soundLine in zip(lines, langLines, soundexLines):
    cleanedLine = cleanLine(line)
    cleanedLangLine = cleanLine(langLine)
    cleanedSoundLine = cleanLine(soundLine)
    cleanLines.append(factorize(cleanedLine, cleanedLangLine,
                                cleanedSoundLine))
print "cleaned lines"

#GET THE COUNTS FROM THE SENTENCES
mc = Counts()
mcLang = Counts()
mcSoundex = Counts()
for line in cleanLines:
    for token in line:
        mc.incrementWord(token.getFactor(WORD_FACTOR))
        #mcLang.incrementWord(token.getFactor(LANGAUGE_FACTOR))
        #mcSoundex.incrementWord(token.getFactor(SOUNDEX_FACTOR))
mc.fixRanks()
mcLang.fixRanks()
mcSoundex.fixRanks()
print "Counted the words"
print "Number of words in the dataset: ", len(mc)
print "Number of languages in the dataset: ", len(mcLang)
print "Number of sounds in the dataset: ", len(mcSoundex)