示例#1
0
def parallel():
    """Build n-grams for fetched reviews in a 4-process pool and print the top 5.

    Documents are fetched through ``PyM.fetchFromDb`` from the 'reviews'
    collection (the trailing 3000 is the fetch limit, per the parameter
    note below).  ``createNGram`` is applied to each document in a worker
    process; the per-document results are flattened, counted with
    ``nltk.FreqDist`` and the five most frequent n-grams are printed as
    ``"<tokens joined by spaces>:<count>"``, followed by the elapsed time.

    The timed interval starts after the fetch and ends once the flattened
    n-gram list is built; it includes pool start-up but not pool shutdown.

    NOTE(review): ``query``/``projection`` look like [field, value] pairs
    (stars == 5, include "text") -- confirm against ``PyM.fetchFromDb``.
    """
    query = ["stars", 5]
    projection = ["text", 1]
    # Params: query, collection name, projection, limit
    documents = PyM.fetchFromDb(query, 'reviews', projection, 3000)
    starttime = timer()

    pool = mp.Pool(processes=4)
    try:
        # Submit one task per document, then block until each has finished.
        pending = [
            pool.apply_async(createNGram, args=(doc, )) for doc in documents
        ]
        output = [task.get() for task in pending]

        flat_ngram_list = [item for sublist in output for item in sublist]

        # Stop the clock before teardown so the measurement matches the
        # work actually being compared against the single-threaded run.
        endtime = timer()
    finally:
        # Always release the worker processes, even if a task raised;
        # otherwise the pool's children are leaked.
        pool.close()
        pool.join()

    fdist = nltk.FreqDist(flat_ngram_list)
    sorted_x = sorted(fdist.items(), key=operator.itemgetter(1), reverse=True)
    for k in sorted_x[:5]:
        # k is (ngram, count); the n-gram is an iterable of string tokens.
        print(" ".join(k[0]) + ":" + str(k[1]))

    print("Elapsed time " + str(endtime - starttime))
示例#2
0
def singleTh():
    """Build n-grams for fetched reviews sequentially and print the top 5.

    Documents come from ``PyM.fetchFromDb`` on the 'reviews' collection
    (the trailing 3000 is the fetch limit, per the parameter note below).
    ``createNGram`` is called on each document in turn and its results are
    accumulated into one list, which is counted with ``nltk.FreqDist``.
    The five most frequent n-grams are printed as
    ``"<tokens joined by spaces>:<count>"``, followed by the elapsed time
    of the n-gram building loop (the fetch itself is not timed).
    """
    # Params: query, collection name, projection, limit
    documents = PyM.fetchFromDb(["stars", 5], 'reviews', ["text", 1], 3000)

    starttime = timer()
    all_ngrams = []
    for document in documents:
        all_ngrams.extend(createNGram(document))
    endtime = timer()

    # Rank (ngram, count) pairs by count, highest first.
    ranked = list(nltk.FreqDist(all_ngrams).items())
    ranked.sort(key=operator.itemgetter(1), reverse=True)

    for ngram, count in ranked[:5]:
        print(" ".join(ngram) + ":" + str(count))

    print("Elapsed time " + str(endtime - starttime))