示例#1
0
文件: token.py 项目: fvdsn/NLP
def tokensFilter(tokenlines):
	"""Replace rare tokens with the '<unk>' marker.

	A token is considered rare when its unigram count over the whole
	corpus (as computed by sms2.ngramCount) is below 3.

	tokenlines -- iterable of token lists, one list per line.
	Returns a new list of token lists; the input is left untouched.
	"""
	# Unigram frequency table; keys are 1-tuples, e.g. ('word',).
	unigram_counts = sms2.ngramCount(tokenlines, 1)

	return [
		[tok if unigram_counts[(tok,)] >= 3 else '<unk>' for tok in line]
		for line in tokenlines
	]
示例#2
0
文件: token.py 项目: fvdsn/NLP
def main():
	"""Build a quadrigram lexicon from the SMS training corpus.

	Reads ../smstrain.txt, counts 4-grams via sms2.ngramCount, prints the
	counts sorted by frequency (descending) plus a histogram, and writes
	the (count, ngram) pairs to quadricounts.csv.
	"""
	# 'with' guarantees the training file is closed even on error
	# (the original leaked both this handle and the csv output file).
	with open("../smstrain.txt") as trainfile:
		lexicon = sms2.ngramCount(smartTokenizer(trainfile), 4)

	# (count, ngram) pairs so that sort(reverse=True) orders by frequency.
	lexicon_list = [(count, ngram) for ngram, count in lexicon.items()]
	lexicon_list.sort(reverse=True)

	# Parenthesized single-argument print is valid in both Python 2 and 3.
	print(lexicon_list)
	print(len(lexicon_list))

	histo = sms2.histogram(lexicon)
	print(histo)

	# "wb" is the Python 2 csv convention (avoids blank lines on Windows);
	# kept for compatibility with the rest of this Python 2 project.
	with open("quadricounts.csv", "wb") as outfile:
		csv.writer(outfile).writerows(lexicon_list)