Exemplo n.º 1
0
def postProcess(start1, query):
    """Run a Linggle query and aggregate its n-grams into (phrase, count) pairs.

    Each result returned by ``linggleit(query)`` is read as a mapping with a
    ``'phrase'`` entry (a sequence of word strings, possibly wrapped in
    ``<strong>`` tags) and a ``'count_str'`` entry (a number string that may
    contain thousands separators such as ``"1,234"``).

    For every result the words from position ``start1`` onwards are kept, the
    first kept word is replaced by ``sqlite.search_lemma`` of its lower-cased
    form, and the words are joined with single spaces.  Results that end up
    with the same phrase have their counts summed.

    Args:
        start1: Index of the first word of each n-gram to keep.
        query: Query string passed unchanged to ``linggleit``.

    Returns:
        A list of ``(phrase, total_count)`` tuples sorted by count in
        descending order; phrases with equal counts stay in ascending
        phrase order.
    """
    res = linggleit(query)

    phrases = []
    counts = []
    for ngram in res:
        words = [w.replace('<strong>', '').replace('</strong>', '')
                 for w in ngram['phrase'][start1:]]
        if not words:
            # Nothing is left after dropping the first `start1` words, so
            # there is no phrase to count (the original raised IndexError).
            continue
        # Only the first kept word is lemmatised; the rest stay as returned.
        words[0] = sqlite.search_lemma(words[0].strip().lower())
        phrases.append(' '.join([w.strip() for w in words]))
        counts.append(int(ngram['count_str'].replace(',', '')))

    # sorted() instead of zip(...).sort(): zip() returns an iterator on
    # Python 3, which has no .sort().  groupby() needs its input ordered by
    # the grouping key, hence the sort on the phrase.
    pairs = sorted(zip(phrases, counts), key=lambda pair: pair[0])
    ngramCounts = [(phrase, sum([pair[1] for pair in group]))
                   for phrase, group in groupby(pairs, key=lambda pair: pair[0])]
    ngramCounts.sort(key=lambda pair: pair[1], reverse=True)
    return ngramCounts
Exemplo n.º 2
0
def vpCollocation(headword):
    """Collect collocations of ``headword`` and merge inflected variants.

    Builds the query ``'pron. <headword> prep. ?n.'``, runs it through
    ``postProcess`` (dropping the first word of every n-gram) and then folds
    phrases that differ only in the inflection of their last word into one
    entry, e.g. "pay attention to detail" and "pay attention to details"
    become "pay attention to detail/details" with the counts added together.

    A phrase is merged only when the rest of the phrase is identical and its
    last word, once lemmatised with ``sqlite.search_lemma``, equals the last
    word of the other phrase.

    Args:
        headword: Word or phrase substituted into the query template.

    Returns:
        A list of ``(phrase, count)`` pairs sorted by count in descending
        order.  Merged entries are two-element lists and untouched entries
        are the tuples returned by ``postProcess`` (the original mixed both
        types in the same way).
    """
    template1 = 'pron. %s prep. ?n.'
    start1 = 1
    query = template1 % headword
    post = postProcess(start1, query)

    # Index of the first entry for each phrase so the base form of an
    # inflected phrase can be found without rescanning the whole list.
    phraseIndex = {}
    for idx, entry in enumerate(post):
        phraseIndex.setdefault(entry[0], idx)

    baseOrder = []    # base-phrase indices, in order of first merge
    variants = {}     # base index -> inflected last words merged into it
    totals = {}       # base index -> summed count
    consumed = set()  # indices of entries replaced by a merged entry

    for i, entry in enumerate(post):
        words = entry[0].split(' ')
        lastWord = words[-1]
        lemma = sqlite.search_lemma(lastWord.strip().lower())
        # str() comparison kept from the original; the return type of
        # search_lemma is not visible here.
        if str(lastWord) == str(lemma):
            continue
        # Same leading words, last word replaced by its lemma.
        basePhrase = ' '.join(words[:-1] + [str(lemma)])
        j = phraseIndex.get(basePhrase)
        if j is None or j == i:
            continue
        if j not in totals:
            baseOrder.append(j)
            variants[j] = []
            totals[j] = int(post[j][1])
        variants[j].append(lastWord)
        totals[j] += int(entry[1])
        consumed.add(i)
        consumed.add(j)

    tmp = [[post[j][0] + '/' + '/'.join(variants[j]), totals[j]]
           for j in baseOrder]
    # Filter in one pass instead of removing by index from a shrinking list,
    # which shifted later indices and dropped the wrong entries.
    remaining = [entry for idx, entry in enumerate(post) if idx not in consumed]

    res = tmp + remaining
    res.sort(key=lambda x: x[1], reverse=True)
    return res