import csv

import sms2  # provides ngramCount() and histogram()
# smartTokenizer is assumed to be defined or imported elsewhere in this project.


def tokensFilter(tokenlines):
    """Replace tokens that occur fewer than 3 times in the corpus with '<unk>'."""
    lexicons = sms2.ngramCount(tokenlines, 1)  # unigram counts over all token lines
    filteredLines = []
    for line in tokenlines:
        tokenline = []
        for token in line:
            # ngramCount keys are tuples, hence the (token,) lookup
            if lexicons[(token,)] < 3:
                tokenline.append('<unk>')
            else:
                tokenline.append(token)
        filteredLines.append(tokenline)
    return filteredLines
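# Example usage (a sketch; the token lines below are illustrative, not from the corpus):
#   tokensFilter([['hi', 'there'], ['hi', 'again'], ['hi', 'there'], ['hi', 'there']])
# keeps 'hi' (4 occurrences) and 'there' (3 occurrences), and maps 'again' (1 occurrence)
# to '<unk>'.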
def main():
    trainfile = open("../smstrain.txt")
    # Count 4-grams over the tokenized training data
    lexicon = sms2.ngramCount(smartTokenizer(trainfile), 4)
    # Build (count, ngram) pairs and sort by frequency, most frequent first
    lexicon_list = []
    for lex in lexicon.keys():
        lexicon_list.append((lexicon[lex], lex))
    lexicon_list.sort(reverse=True)
    print lexicon_list
    print len(lexicon_list)
    histo = sms2.histogram(lexicon)
    print histo
    # Write the sorted 4-gram counts to disk
    writer = csv.writer(open("quadricounts.csv", "wb"))
    writer.writerows(lexicon_list)
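# Entry-point guard (an addition not in the original script) so main() runs only
# when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()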