def parallel():
    """Build n-gram frequencies from review documents using a process pool.

    Fetches documents with ``PyM.fetchFromDb``, submits one ``createNGram``
    task per document to a 4-process ``multiprocessing`` pool, flattens the
    per-document results into a single list, and feeds that list to
    ``nltk.FreqDist``.  The five most frequent entries and the elapsed time
    are printed.

    The timed section covers pool creation, task execution and flattening;
    it excludes the database fetch and the frequency counting.

    Returns:
        None.  Results are written to standard output.
    """
    # NOTE(review): presumably selects 5-star reviews and projects only the
    # "text" field -- confirm against PyM.fetchFromDb.
    query = ["stars", 5]
    projection = ["text", 1]
    # Params: query, collection name, projection, limit
    documents = PyM.fetchFromDb(query, 'reviews', projection, 3000)

    starttime = timer()
    pool = mp.Pool(processes=4)
    try:
        results = [
            pool.apply_async(createNGram, args=(doc, ))
            for doc in documents
        ]
        # get() blocks until each task finishes and re-raises worker errors.
        output = [p.get() for p in results]
        flat_ngram_list = [item for sublist in output for item in sublist]
        endtime = timer()
    finally:
        # Always release the worker processes, even if a task raised.
        # try/finally is used (rather than ``with``) so this also works on
        # Python 2, where Pool is not a context manager.
        pool.terminate()
        pool.join()

    fdist = nltk.FreqDist(flat_ngram_list)
    sorted_x = sorted(fdist.items(), key=operator.itemgetter(1), reverse=True)
    for k in sorted_x[:5]:
        # k is an (entry, count) pair; the entry is joined with spaces, so it
        # is expected to be a sequence of strings.
        print(" ".join(k[0]) + ":" + str(k[1]))
    print("Elapsed time " + str(endtime - starttime))
def singleTh():
    """Build n-gram frequencies from review documents in a single process.

    Fetches documents with ``PyM.fetchFromDb``, calls ``createNGram`` on each
    one in turn, and collects every returned item into one list that is
    handed to ``nltk.FreqDist``.  The five most frequent entries and the
    elapsed time are printed.

    Only the n-gram generation loop is timed; the database fetch and the
    frequency counting fall outside the measured interval.

    Returns:
        None.  Results are written to standard output.
    """
    star_query = ["stars", 5]
    text_projection = ["text", 1]
    # fetchFromDb arguments: query, collection name, projection, limit
    documents = PyM.fetchFromDb(star_query, 'reviews', text_projection, 3000)

    starttime = timer()
    gram = [ngram for doc in documents for ngram in createNGram(doc)]
    endtime = timer()

    fdist = nltk.FreqDist(gram)
    # Rank (entry, count) pairs from most to least frequent.
    ranked = list(fdist.items())
    ranked.sort(key=operator.itemgetter(1), reverse=True)

    for entry, count in ranked[:5]:
        print(" ".join(entry) + ":" + str(count))

    elapsed = endtime - starttime
    print("Elapsed time " + str(elapsed))