#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""frqanalysis.py

Read "pg873.txt", tokenize it, and print a tab-separated frequency profile.

Every output line holds a token, the value stored for it in the profile,
and that value divided by the sum of all values in the profile.
"""

from corpus import getTextFromFile, tokenize, makeFrequencyProfile, removeJunk

# Characters handed to removeJunk() when cleaning the profile.
junk = " ,;:-+=()[]'\"?!$%.<>"

# Load the text into memory, split it into tokens, and build the profile.
mytokens = tokenize(getTextFromFile("pg873.txt"))
mydict = makeFrequencyProfile(mytokens)

removeJunk(mydict, junk)
# Drop the empty-string entry if one is present after cleaning.
mydict.pop("", None)

# Emit token, value and value/total, one token per line.
total = sum(mydict.values())
for token in mydict:
    print(token, mydict[token], mydict[token] / total, sep="\t")
# NOTE(review): the next line is an entire script collapsed onto one physical
# line. Because the line begins with '#', Python treats all of it as a comment
# and nothing on it runs. Read as code, it appears to:
#   - build one profile from "pg873.txt" and one from "sports.txt" via
#     makeFrequencyProfile(tokenize(getTextFromFile(...))), then pass each to
#     relativizeFP() (presumably converts counts to relative frequencies --
#     TODO confirm in the corpus module);
#   - tokenize a short sample sentence and, for every token, add
#     log(profile.get(token, 0.00000000000001)) to a running score per
#     profile, i.e. 0.00000000000001 is the fallback for tokens that are
#     absent from a profile;
#   - print "This text is probably House of Pomeg." when the pg873.txt score
#     is greater than the sports.txt score.
# The trailing `else:` has no body in this view, so the script is truncated.
# Restore the line breaks and the missing branch before trying to run it.
#!/usr/bin/env python3 # -*- coding: UTF-8 -*- from math import log #import corpus from corpus import getTextFromFile, makeFrequencyProfile, tokenize, relativizeFP mydict = makeFrequencyProfile( tokenize( getTextFromFile("pg873.txt") ) ) relativizeFP(mydict) #for key in mydict: # print(key, mydict[key], sep="\t") mysportsdict = makeFrequencyProfile( tokenize( getTextFromFile("sports.txt") ) ) relativizeFP(mysportsdict) unktokens = tokenize(""" The young King was eating pomegranates and talking about his soul and other emotional issues. """) probpomeg = 0.0 probsports = 0.0 for token in unktokens: probpomeg += log(mydict.get(token, 0.00000000000001)) probsports += log(mysportsdict.get(token, 0.00000000000001)) if probpomeg > probsports: print("This text is probably House of Pomeg.") else:
# NOTE(review): the next line is an entire script collapsed onto one physical
# line. Because the line begins with '#', Python treats all of it as a comment
# and nothing on it runs. Read as code, it appears to:
#   - call loadSpam.split_data(x, 5, spamPath) for x in range(1, 6);
#   - for each entry in spamList, tokenize the text returned by
#     getTextFromFile(), build a profile with makeFrequencyProfile(), pass it
#     to removeJunk() together with the junk character string, delete the ""
#     key if present, and hand the profile to prettyPrintFRP().
# Open questions to resolve when restoring the line breaks:
#   - loadSpam, spamPath and spamList are neither imported nor defined on this
#     line; they must come from elsewhere or the script raises NameError.
#   - The original indentation is lost, so it is unclear whether the spamList
#     loop is nested inside the range loop, and whether prettyPrintFRP() runs
#     once per file or once after the loop -- TODO confirm with the author.
#!/usr/bin/env python3 from corpus import getTextFromFile, tokenize, makeFrequencyProfile, removeJunk, prettyPrintFRP for x in range (1,6): loadSpam.split_data( x , 5, spamPath) for file in spamList: mytokens = tokenize(getTextFromFile(file) ) mydict = makeFrequencyProfile(mytokens) junk = " ,;:-+=()[]'\"?!%.<>" removeJunk(mydict, junk) if "" in mydict: del mydict[""] prettyPrintFRP (mydict)