예제 #1
0
def nGrams(string, corpus, number, clean=True):
    global wordList
    biList = []
    triList = []
    words = WordPunctTokenizer().tokenize(string)
    stopset = set(stopwords.words('english'))
    if clean == True:
        words = [word.lower() for word in words]
    if clean == False:
        words = [word.lower() for word in words]
    filter = lambda words: len(words) < 2 or words.isdigit()

    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(filter)
    biResult = bcf.nbest(BigramAssocMeasures.likelihood_ratio, number)

    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(filter)
    triResult = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, number)

    for i in range(len(biResult)):
        if len(biResult) > 0:
            biPrint = " ".join(biResult[i])
            biList.append(biPrint)
        else:
            biList = []
    csv = open('db\cyttron-keywords.csv', 'a')
    if len(biList) > 1:
        csv.write('"' + ','.join(biList[:-1]) + ',' + biList[-1] + '";')
    else:
        csv.write('"' + ''.join(biList) + '";')
    csv.close()

    for i in range(len(triResult)):
        if len(triResult) > 0:
            triPrint = " ".join(triResult[i])
            triList.append(triPrint)
        else:
            triList = []
    csv = open('db\cyttron-keywords.csv', 'a')
    if len(triList) > 1:
        csv.write('"' + ','.join(triList[:-1]) + ',' + triList[-1] + '"\n')
    else:
        csv.write('"' + ''.join(triList) + '"\n')
    csv.close()
    print biList
    print triList
예제 #2
0
def nGrams(string,corpus,number,clean=True):
    global wordList
    biList=[]
    triList=[]
    words = WordPunctTokenizer().tokenize(string)
    stopset = set(stopwords.words('english'))
    if clean == True:
        words = [word.lower() for word in words]
    if clean == False:
        words = [word.lower() for word in words]
    filter = lambda words: len(words) < 2 or words.isdigit()
    
    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(filter)
    biResult = bcf.nbest(BigramAssocMeasures.likelihood_ratio, number)

    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(filter)
    triResult = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, number)

    for i in range(len(biResult)):
        if len(biResult) > 0:
            biPrint = " ".join(biResult[i])
            biList.append(biPrint)
        else:
            biList=[]
    csv = open('db\cyttron-keywords.csv','a')            
    if len(biList) > 1:
        csv.write('"' + ','.join(biList[:-1]) + ',' + biList[-1] + '";')
    else:
        csv.write('"' + ''.join(biList) + '";')
    csv.close()
    
    for i in range(len(triResult)):
        if len(triResult) > 0:
            triPrint = " ".join(triResult[i])
            triList.append(triPrint)
        else:
            triList=[]
    csv = open('db\cyttron-keywords.csv','a')
    if len(triList) > 1:
        csv.write('"' + ','.join(triList[:-1]) + ',' + triList[-1] + '"\n')
    else:
        csv.write('"' + ''.join(triList) + '"\n')
    csv.close()
    print biList
    print triList