import json
import sys

import numpy as np
import scipy.stats

import spark      # project-local Spark helpers (spark.data, spark.dict_kv); assumed importable
import text_nltk  # project-local embedding/empathy helpers; assumed importable


def confusion(model=None):
    """Correlate gold SemEval emotion ratings with predicted empathy scores."""
    emotions = ['joy', 'sadness', 'disgust', 'anger', 'surprise', 'fear']
    # Predict an emotion-score dict for each text from its mean word vector.
    data = spark.data('semeval', cores=16) \
        .map(lambda x: spark.dict_kv(
            x, 'emotion',
            text_nltk.empathy(
                text_nltk.mean_vector(x['text'], model=model),
                model=model))) \
        .collect()
    sample = data[0]
    print(sample, file=sys.stderr)
    results = []
    # Pearson correlation for every (gold emotion, predicted emotion) pair.
    for emotion in emotions:
        for prediction in sample['emotion'].keys():
            x = np.asarray([float(row[emotion]) for row in data])
            y = np.asarray([row['emotion'][prediction] for row in data])
            # Keep only rows where neither series is NaN.
            ind = np.where(~(np.isnan(x) | np.isnan(y)))
            r, p = scipy.stats.pearsonr(x[ind], y[ind])
            results.append(
                {'emotion': emotion, 'vector': prediction, 'r': r, 'p': p})
    return results
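# Hypothetical usage sketch (not from the repo): rank the (gold emotion,
# predicted emotion) pairs by correlation strength. Assumes a running
# cluster and that the 'semeval' data and 'vectors_25d' model are available.
def print_confusion():
    for row in sorted(confusion(model='vectors_25d'),
                      key=lambda r: -abs(r['r'])):
        print('%(emotion)s ~ %(vector)s: r=%(r).3f (p=%(p).3g)' % row,
              file=sys.stderr)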
def all_articles(source, model=None):
    """Score each article's precomputed headline vector with the empathy model."""
    if not model:
        model = 'vectors_25d'
    results = spark.data(source, cores=16).map(lambda x: dict_kv(
        x, 'emotion',
        text_nltk.empathy(x['headline_main_%s' % model], model=model)))
    return results.collect()
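# `dict_kv` (used as `spark.dict_kv` in confusion() and bare in
# all_articles() above) is not defined in this section. A minimal sketch,
# assuming it returns the record with one key set; the real helper may
# instead mutate the record in place.
def dict_kv(d, key, value):
    out = dict(d)  # shallow copy so the input record is left untouched
    out[key] = value
    return out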
def all_identities(source, model=None):
    """Average empathy scores per (query, month) over all matching headlines."""
    if not model:
        model = 'vectors_25d'
    # Key each headline by (query, first-of-month date), sum the emotion
    # dicts per key, then divide the sums back into monthly means.
    return spark.data(source, cores=4) \
        .map(lambda x: ((x['query'], '%s-01' % x['pub_date'][:7]),
                        text_nltk.empathy(x['headline_main_%s' % model],
                                          model=model))) \
        .reduceByKey(add_emotion) \
        .map(lambda x: {'query': x[0][0],
                        'pub_date': x[0][1],
                        'emotion': div_emotion(x[1])}) \
        .collect()
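# `add_emotion` and `div_emotion` are referenced by the reduceByKey above
# but defined elsewhere in the repo. A minimal sketch, assuming each value
# is a label -> score dict and that a reserved '_n' key (an assumption made
# here, not confirmed by the source) carries the merge count so the monthly
# sums can be turned back into means:
def add_emotion(a, b):
    out = {k: a.get(k, 0.0) + b.get(k, 0.0)
           for k in set(a) | set(b) if k != '_n'}
    out['_n'] = a.get('_n', 1) + b.get('_n', 1)
    return out


def div_emotion(a):
    a = dict(a)
    n = float(a.pop('_n', 1))
    return {k: v / n for k, v in a.items()}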
def nyt(source, target):
    """Annotate raw NYT articles and write them out as JSON lines."""
    # annotate_skip (defined elsewhere in the repo) yields zero or more
    # annotated records per input article.
    spark.data(source, cores=16) \
        .flatMap(annotate_skip) \
        .map(json.dumps) \
        .saveAsTextFile(target)
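# Hypothetical end-to-end driver; the dataset names below are placeholders
# for illustration, not values from the repo.
if __name__ == '__main__':
    nyt('nyt_raw', 'nyt_annotated')  # annotate the raw articles once
    for row in all_identities('nyt_annotated')[:5]:
        print(row['query'], row['pub_date'], row['emotion'], file=sys.stderr)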