def mean_headline(x):
    """Return the mean word vector of a record's headline and lead paragraph.

    The text is read from ``x['headline']['main']`` and
    ``x['lead_paragraph']``.

    Returns None when:
      * either field is missing or empty,
      * the record or its 'headline' value cannot be indexed that way
        (e.g. 'headline' is None, a list, or a plain string),
      * the headline lemma-tokenizes to fewer than 5 words.

    Otherwise returns ``np.nanmean(..., axis=0)`` over
    ``text_nltk.vectors(w)`` for every headline and lead-paragraph token.
    """
    # EAFP lookup: a missing key and a malformed 'headline' value are both
    # treated as "no usable text" rather than raising inside the caller.
    try:
        headline_text = x['headline']['main']
        lead_text = x['lead_paragraph']
    except (KeyError, TypeError):
        return None
    if not headline_text or not lead_text:
        return None

    words = text_nltk.lemma_tokenize(headline_text)
    # The length threshold applies to the headline alone, before the
    # lead-paragraph tokens are appended.
    if len(words) < 5:
        return None
    words += text_nltk.lemma_tokenize(lead_text)
    return np.nanmean([text_nltk.vectors(w) for w in words], axis=0)
# Field of interest in the Amazon review: either 'review/summary' or 'review/text'.
text_field_name = 'review/summary'

# Start timing the run.
begin_time = time.time()

#---------------------------------------------------------
# BEGIN the main processing
#--------------------------------------------------------
# Fetch the data file, parse each line as JSON, and keep only the records
# that carry every field used below.
data = sc.textFile(data_file).map(json.loads).cache()
data = data.filter(lambda x: 'review/score' in x
                   and 'product/productId' in x
                   and 'review/time' in x
                   and text_field_name in x)
# Both the text and the score must be non-None: the text is passed to len()
# and the score to float() further down, and either call fails on None.
data = data.filter(lambda x: x[text_field_name] is not None
                   and x['review/score'] is not None)
# Filter out the records without any review words.
data = data.filter(lambda x: len(x[text_field_name]) > 0)

# Pick the review text - tokenize and lemmatize it - keyed by
# (productId, score, time).
article_vector = data.map(lambda x: (
    (x['product/productId'], float(x['review/score']), x['review/time']),
    text_nltk.lemma_tokenize(x[text_field_name])))
# Drop the reviews that produced no tokens.
article_vector = article_vector.filter(lambda x: len(x[1]) > 0)

# Calculate the mean vector of the review text words (NaN entries ignored).
article_vector = article_vector.map(
    lambda x: (x[0], np.nanmean([vectors(w) for w in x[1]], axis=0)))

# Calculate the average emotion scores of all reviews for a given review
# score among 1,2,3,4,5: re-key each record by its score, sum element-wise
# per score, sort by score, then reformat.
# NOTE(review): the 1.0 appended to each vector is presumably a per-review
# count that reformat_scores divides by - confirm against reformat_scores.
avg_all_review = (article_vector
                  .map(empathy_vec)
                  .map(lambda x: (x[0][1], np.array(x[1] + [1.0])))
                  .reduceByKey(lambda a, b: a + b)
                  .sortByKey()
                  .map(reformat_scores)
                  .collect())
avg_all_review = np.array(avg_all_review)

run_time = time.time() - begin_time
# Parenthesised single-argument form prints the same under the Python 2
# print statement and the Python 3 print function.
print('total run_time: ')
print(run_time)
#---------------------------------------------------------