def test():
    """Print bigram statistics for the files named on the command line.

    Every argument in ``sys.argv[1:]`` is treated as a glob pattern.  Each
    matching file is read line by line: a line is stripped and lower-cased,
    empty lines are skipped, and the rest is passed to ``getWordList``.  The
    resulting word lists are accumulated into bigram and token counts with
    ``getBigrams`` and ``getTokens``.

    Once all files are processed the totals are printed, followed by a
    header and one tab-separated row per entry of ``sortNgrams(bigrams)``:
    bigram, frequency, ``rF`` of that frequency, the ``MI`` value and the
    ``RE`` value (labelled "Relative Frequency", "Mutual Information" and
    "Relative Entropy" in the header).

    A file that raises ``IOError`` while being opened or read is reported
    on stderr and skipped; counts gathered from it before the error are
    kept.

    Returns:
        None.  All results are written to stdout.
    """
    bigrams = {}      # bigram as key, frequency as value
    tokens = {}       # token as key, frequency as value
    tokencount = 0    # number of tokens
    bigramcount = 0   # number of bigrams

    for pattern in sys.argv[1:]:
        for path in glob.glob(os.path.normcase(pattern)):
            try:
                # ``with`` closes the handle on every path.  The earlier code
                # called file.close() inside the IOError handler, which itself
                # failed whenever open() was the call that raised (the name
                # was unbound, or still pointed at the previous file).
                with open(path, "r") as handle:
                    for line in handle:
                        line = line.strip().lower()
                        if line == "":
                            continue
                        wordlist = getWordList(line)
                        bigrams, bigramcount = getBigrams(wordlist, bigrams, bigramcount)
                        tokens, tokencount = getTokens(wordlist, tokens, tokencount)
            except IOError as err:
                # Best effort: keep going with the remaining files, but say so.
                sys.stderr.write("Skipping " + path + ": " + str(err) + "\n")

    print("Got total:\nBigrams: " + str(bigramcount) + "\nTokens: " + str(tokencount))
    print("Bigram\tFrequency\tRelative Frequency\tMutual Information\tRelative Entropy")

    for entry in sortNgrams(bigrams):
        bigram, frequency = entry[0], entry[1]
        tokenlist = bigram.split()
        # Computed once per row and reused below.
        # NOTE(review): assumes rF has no side effects -- confirm.
        rel_freq = rF(frequency, bigramcount)
        # RE is given the probability of the second token before that of the
        # first; this argument order is kept exactly as it was.
        rel_entropy = RE(
            rel_freq,
            P(tokenlist[1], tokens, tokencount),
            P(tokenlist[0], tokens, tokencount),
        )
        mutual_info = MI(bigram, rel_freq, tokens, tokencount)
        print("\t".join([
            bigram,
            str(frequency),
            str(rel_freq),
            str(mutual_info),
            str(rel_entropy),
        ]))
# caculate(filename, freq=100): compute bigram statistics for one file and
# write the bigrams whose MI value exceeds `freq` to "mi.txt" and "dict.txt".
#
# Parameters:
#   filename -- path of the text file to read (opened with mode "r").
#   freq     -- threshold compared against the MI value of each bigram
#               (`if mi > freq`); despite its name it is not compared against
#               a frequency.  Defaults to 100.
#
# Steps, in order:
#   1. Read `filename` line by line; each line is stripped and lower-cased,
#      empty lines are skipped, and the rest goes through getWordList,
#      getBigrams and getTokens to accumulate the bigram/token dictionaries
#      and their running counts.
#   2. Remove any existing "mi.txt" / "dict.txt", then open both for writing.
#   3. Print the bigram and token totals to stdout.
#   4. Write the header "T1, T2, Frequency, Relative Frequency, Mutual
#      Information, Relative Entropy" to "mi.txt".
#   5. For every entry of sortNgrams(bigrams): split the bigram string, compute
#      RE and MI, and if MI > freq write a ", "-separated row to "mi.txt" and a
#      space-separated token pair to "dict.txt".  In both files the second
#      token of the split bigram is written before the first.
#   6. Close both output files and store merge("dict.txt", "data.basket") in
#      `ret`.
#
# NOTE(review): if open(filename) itself raises IOError, the handler's
#   file.close() runs with `file` unbound and fails in turn.
# NOTE(review): `alphabet` is assigned but never read in this function.
# NOTE(review): `ret` is not used in the part of the function visible here,
#   and the trailing ''' opens a triple-quoted string that is not closed
#   within this chunk -- check what follows before restructuring this code.
def caculate(filename, freq=100): bigrams = {} # bigram as key, frequency as value tokens = {} # token as key, frequency as value tokencount = 0 # number of tokens bigramcount = 0 # number of bigrams alphabet = "" # all characters used try: file = open(filename, "r") for i in file.readlines(): i = i.strip().lower() if i == "": continue wordlist = getWordList(i) bigrams, bigramcount = getBigrams(wordlist, bigrams, bigramcount) tokens, tokencount = getTokens(wordlist, tokens, tokencount) file.close() except IOError: file.close() if os.path.exists("mi.txt"): os.remove('mi.txt') if os.path.exists("dict.txt"): os.remove('dict.txt') f = open("mi.txt", "w") fl = open("dict.txt", "w") print("Got total:\nBigrams: " + str(bigramcount) + "\nTokens: " + str(tokencount)) #print("Bigram\tFrequency\tRelative Frequency\tMutual Information\tRelative Entropy") f.write("T1, T2, Frequency, Relative Frequency, Mutual Information, Relative Entropy\n") sep = ", " for i in sortNgrams(bigrams): tokenlist = list(i)[0].split() re = RE(rF(i[1], bigramcount), P(tokenlist[1], tokens, tokencount), P(tokenlist[0], tokens, tokencount)) mi = MI(i[0], rF(i[1], bigramcount), tokens, tokencount) if mi > freq: f.write(tokenlist[1] + sep + tokenlist[0] + sep + str(i[1]) + sep + str(rF(i[1], bigramcount)) + sep + str(mi) + sep + str(re) + "\n") fl.write(tokenlist[1] + " " + tokenlist[0]+ "\n") f.close() fl.close() ret = merge("dict.txt", "data.basket") '''
return bigramprob * math.log(bigramprob/(px * py) , 2) if __name__ == "__main__": bigrams = {} # bigram as key, frequency as value tokens = {} # token as key, frequency as value tokencount = 0 # number of tokens bigramcount = 0 # number of bigrams alphabet = "" # all characters used for i in sys.argv[1:]: for x in glob.glob(os.path.normcase(i)): try: file = open(x, "r") for i in file.readlines(): i = string.lower(string.strip(i)) if i == "": continue wordlist = getWordList(i) bigrams, bigramcount = getBigrams(wordlist, bigrams, bigramcount) tokens, tokencount = getTokens(wordlist, tokens, tokencount) file.close() except IOError: file.close() print "Got total:\nBigrams: " + str(bigramcount) + "\nTokens: " + str(tokencount) print "Bigram\tFrequency\tRelative Frequency\tMutual Information" myTokens = string.split(i[0]) for i in sortNgrams(bigrams): print i[0] + "\t" + str(i[1]) + "\t" + str(rF(i[1], bigramcount)) + "\t" + str(MI(i[0], rF(i[1], bigramcount), tokens, tokencount)) + "\t" + str(RE(i[0], rF(i[1], bigramcount), rF(myTokens[1], tokencount), rF(myTokens[0], tokencount)))
tokencount = 0 # number of tokens bigramcount = 0 # number of bigrams for i in sys.argv[1:]: for x in glob.glob(os.path.normcase(i)): try: file = open(x, "r") for i in file.readlines(): i = string.lower(string.strip(i)) if i == "": continue bigrams, bigramcount = getNGrams(i, bigrams, bigramcount, 2) tokens, tokencount = getNGrams(i, tokens, tokencount, 1) file.close() except IOError: file.close() print "Got total:\nBigrams: " + str(bigramcount) + "\nTokens: " + str(tokencount) print "Bigram\tFrequency" for i in sortNgrams(bigrams): print i[0] + "\t" + str(i[1]) print "Token\tFrequency" for i in sortNgrams(tokens): print i[0] + "\t" + str(i[1]) print "Bigram\tRelative Frequency" for i in sortNgrams(bigrams): print i[0] + "\t" + str(float(i[1])/float(bigramcount)) print "Token\tRelative Frequency" for i in sortNgrams(tokens): print i[0] + "\t" + str(float(i[1])/float(tokencount))
global bigrams, tokens, bigramcount, tokencount pxy = float(bigrams[bigram])/float(bigramcount) px = float(tokens[token2])/float(tokencount) py = float(tokens[token1])/float(tokencount) return py * math.log(py/(pxy/px), 2) if __name__ == "__main__": for i in sys.argv[1:]: for x in glob.glob(os.path.normcase(i)): try: file = open(x, "r") for i in file.readlines(): i = string.lower(string.strip(i)) if i == "": continue wordlist = getTWordList(i) bigrams, bigramcount = getTBigrams(wordlist, bigrams, bigramcount, TOKEN, TOKEN) tokens, tokencount = getTTokens(wordlist, tokens, tokencount, TOKEN) bigramsleft, bigramsright = getTLRBigrams(wordlist, bigramsleft, bigramsright, TOKEN, TOKEN) file.close() except IOError: file.close() myTokens = sortNgrams(tokens) print "Left RE\tToken\tRight RE\tFrequency\tRelative Frequency" for x in range(min(len(myTokens), PRINTWORDS)): rre, lre = PRE(myTokens[x][0]) print str(lre) + "\t" + myTokens[x][0] + "\t" + str(rre) + "\t" + str(myTokens[x][1]) + "\t" + str(float(myTokens[x][1])/float(tokencount))