def lidstoneProbDist(olddf):
    """Tokenize each document's 'body' text, build a Lidstone-smoothed
    frequency distribution per row, and dump the token table to CSV.

    Parameters
    ----------
    olddf : pandas.DataFrame
        Must contain a 'body' column of raw text; other columns are dropped.

    Side effects
    ------------
    Writes ../stumbled_upon/data/lidstone.csv and prints progress to stdout.

    References:
      http://www.inf.ed.ac.uk/teaching/courses/icl/nltk/probability.pdf
      https://github.com/tuzzeg/detect_insults/blob/master/README.md
    """
    print("Creating LidStone Probdist...", nltk.__version__)
    tutto = []
    # Keep only the text column; index is preserved.
    olddf = pd.DataFrame(olddf['body'])
    print(type(olddf))
    for ind in olddf.index:
        print(ind)
        # .ix was removed from pandas; .loc is the label-based equivalent.
        text = olddf.loc[ind, 'body']
        tokens = word_tokenize(text)
        t_fd = FreqDist(tokens)
        # Lidstone smoothing with gamma=0.1 (add-0.1 smoothing).
        pdist = LidstoneProbDist(t_fd, 0.1)
        print(pdist.samples())
        # BUGFIX: the original built row = [ind] and then clobbered it with
        # `row = tokens`, so set_index(0) below consumed the first token
        # instead of the dataframe index. Prepend the index explicitly.
        # (Also removed the per-row raw_input("HITKEY") debug pause, which
        # blocked batch processing.)
        tutto.append([ind] + tokens)
    newdf = pd.DataFrame(tutto).set_index(0)
    # NOTE(review): `taglist` is defined elsewhere in this file; its length
    # must equal the widest token row for this assignment to succeed -- confirm.
    newdf.columns = taglist
    print(newdf.head(20))
    print(newdf.describe())
    newdf.to_csv("../stumbled_upon/data/lidstone.csv")
def lidstoneProbDist(olddf):
    """Build a Lidstone-smoothed frequency distribution for each row's
    'body' text and write the resulting token table to CSV.

    Parameters
    ----------
    olddf : pandas.DataFrame
        Expected to hold a 'body' column of raw text; all other columns
        are discarded.

    Side effects
    ------------
    Writes ../stumbled_upon/data/lidstone.csv and prints progress to stdout.

    References:
      http://www.inf.ed.ac.uk/teaching/courses/icl/nltk/probability.pdf
      https://github.com/tuzzeg/detect_insults/blob/master/README.md
    """
    print("Creating LidStone Probdist...", nltk.__version__)
    tutto = []
    # Reduce to the text column only; the original index is kept.
    olddf = pd.DataFrame(olddf['body'])
    print(type(olddf))
    for ind in olddf.index:
        print(ind)
        # BUGFIX: df.ix has been removed from pandas; use label-based .loc.
        text = olddf.loc[ind, 'body']
        tokens = word_tokenize(text)
        t_fd = FreqDist(tokens)
        # gamma=0.1, i.e. add-0.1 (Lidstone) smoothing over the token counts.
        pdist = LidstoneProbDist(t_fd, 0.1)
        print(pdist.samples())
        # BUGFIX: the original assembled row = [ind] then overwrote it with
        # `row = tokens`, so set_index(0) used the first token as the index.
        # Prepend the real index instead. The interactive input("HITKEY")
        # pause -- a debugging leftover that stalled every iteration -- is
        # removed so the function can run unattended.
        tutto.append([ind] + tokens)
    newdf = pd.DataFrame(tutto).set_index(0)
    # NOTE(review): `taglist` comes from elsewhere in this file; it must be
    # as long as the widest token row or this raises -- confirm upstream.
    newdf.columns = taglist
    print(newdf.head(20))
    print(newdf.describe())
    newdf.to_csv("../stumbled_upon/data/lidstone.csv")