Example #1
import json, time
import numpy as np

# text_nltk (lemma_tokenize, vectors), sc (SparkContext) and data_file are
# assumed to be provided by the surrounding driver script.

def mean_headline(x):
    """Mean word vector of an article's headline plus lead paragraph, or None
    if either field is missing, empty, or the headline has fewer than 5 tokens."""
    if 'headline' not in x or 'main' not in x['headline'] or not x['headline']['main']:
        return None
    if 'lead_paragraph' not in x or not x['lead_paragraph']:
        return None
    words = text_nltk.lemma_tokenize(x['headline']['main'])
    if len(words) < 5:
        return None
    words += text_nltk.lemma_tokenize(x['lead_paragraph'])
    return np.nanmean([text_nltk.vectors(w) for w in words], axis=0)
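# Hypothetical usage sketch (not part of the original pipeline): mean_headline
# expects article dicts with 'headline'/'main' and 'lead_paragraph' fields, as
# checked above; the file name and RDD names below are assumptions.
#
#   articles = sc.textFile(nyt_file).map(json.loads)
#   headline_vecs = articles.map(mean_headline).filter(lambda v: v is not None)
#   print headline_vecs.count()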
# field of interest in each Amazon review: either 'review/summary' or 'review/text'
text_field_name = 'review/summary'

# track the total run time
begin_time = time.time()
#---------------------------------------------------------
# BEGIN the main processing
#--------------------------------------------------------
# load the data file, parse each line as JSON, and drop records that are missing required fields or have no review text
data = sc.textFile(data_file).map(json.loads).cache()
data = data.filter(lambda x: 'review/score' in x and 'product/productId' in x and 'review/time' in x and text_field_name in x)
data = data.filter(lambda x: x[text_field_name] is not None and x['review/score'] is not None)
data = data.filter(lambda x: len(x[text_field_name]) > 0)
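# A record that passes the filters above would look roughly like this (field
# values are illustrative only; the schema follows the Amazon/SNAP review dumps
# implied by the field names used in the code):
#
#   {"product/productId": "B000123456", "review/score": "5.0",
#    "review/time": "1234567890", "review/summary": "Great coffee",
#    "review/text": "Best coffee I have ever had..."}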

# key each review by (productId, review score, review time); tokenize and lemmatize the review text
article_vector = data.map(lambda x: ((x['product/productId'], float(x['review/score']), x['review/time']), text_nltk.lemma_tokenize(x[text_field_name])))
article_vector = article_vector.filter(lambda x: len(x[1])>0)
# calculate the mean vector of the review text words
article_vector = article_vector.map(lambda x: (x[0], np.nanmean([text_nltk.vectors(w) for w in x[1]], axis=0)))


# compute the average emotion scores over all reviews, grouped by review score (1-5)
avg_all_review = (article_vector.map(empathy_vec)
                  .map(lambda x: (x[0][1], np.array(x[1] + [1.0])))  # key by score, append a count of 1
                  .reduceByKey(lambda a, b: a + b).sortByKey()
                  .map(reformat_scores).collect())
avg_all_review = np.array(avg_all_review)
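# empathy_vec and reformat_scores are defined elsewhere in the project; the
# sketches below are only assumptions about their shape, inferred from how the
# pipeline above uses them (emotion_lexicon is a hypothetical list of emotion
# seed vectors):
#
#   def empathy_vec(kv):
#       (pid, score, ts), mean_vec = kv
#       # e.g. similarity of the review vector against each emotion seed vector
#       return ((pid, score, ts), [float(np.dot(mean_vec, e)) for e in emotion_lexicon])
#
#   def reformat_scores(kv):
#       score, summed = kv           # summed = [per-emotion score sums..., review count]
#       return [score] + list(summed[:-1] / summed[-1])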


run_time = time.time() - begin_time
print 'total run time: %.2f seconds' % run_time

#---------------------------------------------------------