def nGrams(string, corpus, number, clean=True):
    """Extract the best bigram and trigram collocations from ``string``.

    The text is tokenized with ``WordPunctTokenizer`` and lower-cased, then
    scored with NLTK's bigram and trigram collocation finders using the
    likelihood-ratio measure. Candidate n-grams containing a token that is
    shorter than two characters or consists only of digits are discarded.

    Side effects:
        * Appends one row to the keywords CSV file in the form
          ``"<bigrams>";"<trigrams>"`` followed by a newline, where the
          n-grams inside each cell are comma-separated and the words of an
          n-gram are space-separated.
        * Prints the bigram list and then the trigram list.

    Args:
        string: Text to analyse.
        corpus: Unused; kept so existing callers keep working.
        number: Maximum number of n-grams to keep per kind (passed to
            ``nbest``).
        clean: Kept for interface compatibility. The original code lower-cased
            the tokens for both ``True`` and ``False``, so the flag does not
            alter the result.

    Returns:
        None.
    """
    # NOTE(review): this file defines nGrams a second time right after this
    # definition; the later definition is the one in effect after import.

    def is_noise(token):
        # Word filter handed to the collocation finders.
        return len(token) < 2 or token.isdigit()

    words = [token.lower() for token in WordPunctTokenizer().tokenize(string)]

    bigram_finder = BigramCollocationFinder.from_words(words)
    bigram_finder.apply_word_filter(is_noise)
    biList = [
        " ".join(gram)
        for gram in bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, number)
    ]

    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigram_finder.apply_word_filter(is_noise)
    triList = [
        " ".join(gram)
        for gram in trigram_finder.nbest(TrigramAssocMeasures.likelihood_ratio, number)
    ]

    # Build the whole row first so the file receives it in a single write.
    # NOTE(review): embedded double quotes in an n-gram are not escaped.
    row = '"' + ','.join(biList) + '";"' + ','.join(triList) + '"\n'

    # The backslash is escaped explicitly; the path value is unchanged
    # (Windows-style separator, relative to the working directory).
    with open('db\\cyttron-keywords.csv', 'a') as out:
        out.write(row)

    # Parenthesised single-argument form prints identically under Python 2.
    print(biList)
    print(triList)
def nGrams(string, corpus, number, clean=True):
    """Find the top bigram and trigram collocations of ``string`` and log them.

    Tokens come from ``WordPunctTokenizer`` and are lower-cased. For bigrams
    and then trigrams, an NLTK collocation finder is built from the tokens,
    n-grams containing a token of fewer than two characters or a purely
    numeric token are filtered out, and the ``number`` best n-grams by
    likelihood ratio are kept.

    Side effects:
        * One row, ``"<bigrams>";"<trigrams>"`` plus a newline, is appended to
          the keywords CSV file. N-grams within a cell are joined with commas
          and the words of each n-gram with single spaces.
        * The bigram list and the trigram list are printed, in that order.

    Args:
        string: Text to analyse.
        corpus: Not read by this function; retained for existing callers.
        number: Upper bound on the n-grams kept per kind (given to ``nbest``).
        clean: Retained for compatibility only. Both branches of the original
            code lower-cased the tokens, so the value makes no difference.

    Returns:
        None.
    """
    # NOTE(review): this is the second definition of nGrams in this file; it
    # replaces the one defined directly above it. One of the two can go.

    def rejected(token):
        # True for tokens that disqualify any n-gram they appear in.
        return len(token) < 2 or token.isdigit()

    def best_phrases(finder_class, measures_class):
        # Rank collocations of one order and render each as a phrase.
        finder = finder_class.from_words(words)
        finder.apply_word_filter(rejected)
        ranked = finder.nbest(measures_class.likelihood_ratio, number)
        return [" ".join(gram) for gram in ranked]

    words = [token.lower() for token in WordPunctTokenizer().tokenize(string)]

    biList = best_phrases(BigramCollocationFinder, BigramAssocMeasures)
    triList = best_phrases(TrigramCollocationFinder, TrigramAssocMeasures)

    # NOTE(review): cells are quoted by hand; a double quote inside an n-gram
    # is written as-is and is not escaped.
    cells = ['"' + ','.join(phrases) + '"' for phrases in (biList, triList)]

    # Escaped backslash keeps the same path value as before (Windows-style
    # separator, relative to the working directory). The context manager
    # closes the file even if the write fails.
    with open('db\\cyttron-keywords.csv', 'a') as out:
        out.write(';'.join(cells) + '\n')

    # Single-argument parenthesised print behaves the same under Python 2.
    print(biList)
    print(triList)