Example no. 1
import sys

import numpy as np
import scipy.stats

# `spark` and `text_nltk` are project-local modules assumed to be in scope.


def confusion(model=None):
    """Correlate gold SemEval emotion scores with predicted empathy scores."""
    emotions = ['joy', 'sadness', 'disgust', 'anger', 'surprise', 'fear']
    # Attach a predicted 'emotion' dict to every record, computed from the
    # mean word vector of its text.
    data = spark.data('semeval', cores=16) \
        .map(lambda x: spark.dict_kv(
            x, 'emotion',
            text_nltk.empathy(
                text_nltk.mean_vector(x['text'], model=model),
                model=model))) \
        .collect()
    sample = data[0]
    print(sample, file=sys.stderr)
    results = []
    for emotion in emotions:
        for prediction in sample['emotion'].keys():
            x = np.asarray([float(row[emotion]) for row in data])
            y = np.asarray([row['emotion'][prediction] for row in data])
            # Keep only pairs where both gold and predicted scores are defined.
            ind = np.where(~(np.isnan(x) | np.isnan(y)))
            r, p = scipy.stats.pearsonr(x[ind], y[ind])
            results.append(
                {'emotion': emotion, 'vector': prediction, 'r': r, 'p': p})
    return results
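
This example and the next rely on a `dict_kv` helper whose definition is not shown. A minimal sketch of what the usage implies, returning a copy of a record with one key set; the body is an assumption inferred from the call sites, not the confirmed implementation:

def dict_kv(d, key, value):
    # Assumed behavior, inferred from usage: copy the record and set one
    # key, leaving the input dict untouched. The real spark.dict_kv may differ.
    out = dict(d)
    out[key] = value
    return out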
Example no. 2
def all_articles(source, model=None):
    """Attach an emotion-score dict to every article in `source`."""
    if not model:
        model = 'vectors_25d'
    # `spark.dict_kv` is the helper used in Example no. 1; the original
    # called an unqualified `dict_kv`, presumably the same function.
    results = spark.data(source, cores=16).map(lambda x: spark.dict_kv(
        x, 'emotion',
        text_nltk.empathy(x['headline_main_%s' % model], model=model)))
    return results.collect()
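
For orientation, a hedged usage sketch; the 'nyt_articles' source name is illustrative, not taken from the source:

# Hypothetical call: score every headline in a collection with the
# default 25-dimensional vectors.
articles = all_articles('nyt_articles')
print(articles[0]['emotion'])  # a dict of per-emotion scores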
Example no. 3
def all_identities(source, model=None):
    """Average the emotion scores for each (query, month) pair."""
    if not model:
        model = 'vectors_25d'
    # Key on (query, first day of the publication month) so that scores
    # aggregate per identity per month.
    return spark.data(source, cores=4) \
        .map(lambda x: ((x['query'], '%s-01' % x['pub_date'][:7]),
                        text_nltk.empathy(x['headline_main_%s' % model],
                                          model=model))) \
        .reduceByKey(add_emotion) \
        .map(lambda x: {'query': x[0][0],
                        'pub_date': x[0][1],
                        'emotion': div_emotion(x[1])}) \
        .collect()
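
`add_emotion` and `div_emotion` are not defined here either. The reduce-then-divide pattern suggests a component-wise sum that tracks a document count, which the final map divides out to get a mean. A plausible sketch under that assumption; both bodies are guesses, only the names come from the source:

def add_emotion(a, b):
    # Assumed: component-wise sum of two emotion dicts, with a running
    # document count kept under the reserved key 'n'.
    out = {k: a.get(k, 0.0) + b.get(k, 0.0)
           for k in (set(a) | set(b)) - {'n'}}
    out['n'] = a.get('n', 1) + b.get('n', 1)
    return out


def div_emotion(e):
    # Assumed: divide each summed score by the document count to get a mean.
    n = e.get('n', 1)
    return {k: v / n for k, v in e.items() if k != 'n'}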
Example no. 4
import json


def nyt(source, target):
    """Annotate every record in `source` and write the results as JSON lines."""
    spark.data(source, cores=16) \
        .flatMap(annotate_skip) \
        .map(json.dumps) \
        .saveAsTextFile(target)
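
`annotate_skip` is not shown in the source; the flatMap call only tells us it returns a list per record, so an empty list drops a record. A hypothetical sketch under that assumption; the annotation body is invented for illustration:

def annotate_skip(record):
    # Hypothetical: flatMap flattens per-record lists, so returning []
    # drops a record and [record] keeps it. The real annotation is unknown.
    key = 'headline_main_vectors_25d'  # field name assumed from Examples 2-3
    if key not in record:
        return []  # skip records that cannot be annotated
    record['emotion'] = text_nltk.empathy(record[key], model='vectors_25d')
    return [record]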