def mean_headline(x):
    # Mean word vector of an article's headline plus lead paragraph,
    # or None if either field is missing or the headline is too short.
    if 'headline' not in x or 'main' not in x['headline'] or not x['headline']['main']:
        return None
    if 'lead_paragraph' not in x or not x['lead_paragraph']:
        return None
    words = text_nltk.lemma_tokenize(x['headline']['main'])
    if len(words) < 5:
        return None
    words += text_nltk.lemma_tokenize(x['lead_paragraph'])
    return np.nanmean([text_nltk.vectors(w) for w in words], axis=0)
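
# --- Hedged sketch (assumption, not the project's text_nltk module) -----------------
# mean_headline above assumes text_nltk exposes lemma_tokenize(text) -> [word] and
# vectors(word) -> ndarray.  The stand-in below shows one plausible shape for that
# interface, using NLTK for tokenization/lemmatization and a gensim word2vec model
# for vectors; the class name, model path argument, and the all-NaN fallback for
# unknown words are guesses for illustration only.
class TextNltkSketch(object):

    def __init__(self, model_path):
        from nltk import word_tokenize
        from nltk.stem import WordNetLemmatizer
        from gensim.models import KeyedVectors
        self._tokenize = word_tokenize
        self._lemmatizer = WordNetLemmatizer()
        self._model = KeyedVectors.load_word2vec_format(model_path, binary=True)

    def lemma_tokenize(self, text):
        # Lowercase, tokenize, and lemmatize free text into a list of words.
        return [self._lemmatizer.lemmatize(w) for w in self._tokenize(text.lower())]

    def vectors(self, word):
        # Vector for `word`; all-NaN for out-of-vocabulary words so np.nanmean skips them.
        if word in self._model:
            return self._model[word]
        return np.full(self._model.vector_size, np.nan)
# -------------------------------------------------------------------------------------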
def top_articles(source, emotion):
    # Score each article vector from `source` against the emotion word's vector,
    # then return the 36 highest-scoring articles in display format.
    return article_vector[source] \
        .map(_fv(lambda x: np.dot(text_nltk.vectors(emotion), x))) \
        .map(fk_(lambda k: k[1])) \
        .map(swap) \
        .sortByKey(False) \
        .map(result_format) \
        .take(36)
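
# --- Hedged sketch: pair-RDD helpers assumed above (guesses from usage, not project code) ---
# top_articles and the test code below rely on small combinators (_fv, _fv_skip, swap,
# add_, add_skip, fv, v, dict_kv) defined elsewhere in the project.  The *_sketch
# definitions here are one reading consistent with how those helpers are used in this
# file; the real definitions may differ.

def _fv_sketch(f):
    # (k, v) -> (k, f(v)): map the value, keep the key.
    return lambda kv: (kv[0], f(kv[1]))

def _fv_skip_sketch(f):
    # flatMap version of _fv: drop the pair entirely when f(v) is None.
    def go(kv):
        y = f(kv[1])
        return [] if y is None else [(kv[0], y)]
    return go

def swap_sketch(kv):
    # (k, v) -> (v, k): used just before sortByKey to sort by the value.
    return (kv[1], kv[0])

def add_sketch(f):
    # x -> (f(x), x): key each element by f(x).
    return lambda x: (f(x), x)

def add_skip_sketch(f):
    # flatMap version of add_: key each element by f(x), dropping elements where f(x) is None.
    def go(x):
        k = f(x)
        return [] if k is None else [(k, x)]
    return go

def fv_sketch(f):
    # (k, v) -> f(v): map the value and drop the key (the caller re-keys with add_).
    return lambda kv: f(kv[1])

def v_sketch(kv):
    # (k, v) -> v: keep only the value.
    return kv[1]

def dict_kv_sketch(d, key, value):
    # Copy of dict d with d[key] = value; the original dict is left untouched.
    out = dict(d)
    out[key] = value
    return out
# --------------------------------------------------------------------------------------------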
print happiness
assert happiness[0] >= happiness[1], 'pleased < delighted: %f %f' % happiness

'''
print test \
    .flatMap(k_skip(lambda x: x['query'] if 'query' in x else None)) \
    .flatMap(_fv_skip(mean_headline)) \
    .map(_fv(empathy)) \
    .map(_fv(lambda x: x['happy'])) \
    .collect()
#map(k_(empathy)).map(k_(max_emotion)).collect()
'''

# Key each test article that carries a 'query' field by (query, pub_date).
articles = test.flatMap(
    add_skip(lambda x: (x['query'], x['pub_date']) if 'query' in x else None))

# Happiness score per article: dot product of its mean headline vector with 'happy'.
scores = articles \
    .flatMap(_fv_skip(mean_headline)) \
    .map(_fv(lambda x: np.dot(x, text_nltk.vectors('happy'))))

# Attach the score to each article dict and sort ascending, least happy first.
join = articles \
    .join(scores) \
    .map(fv(lambda x: dict_kv(x[0], 'score', x[1]))) \
    .map(add_(lambda x: x['score'])) \
    .sortByKey(True) \
    .map(v) \
    .collect()
assert join[0]['query'] == 'John Biggs', 'John Biggs is not the least happy after join.'

print >>sys.stderr, 'TEST OK'

'''
    .join(test2) \
    .map(v(lambda x: dict_kv(x[1], 'score', x[0]))) \
    .map(lambda x: dict_kv(x, 'image', '/static/images/%s.png' % x['query'])) \
'''