Example #1
File: regression.py Project: wtgme/ohsn
def msepath(X, y):
    print X.shape, y.shape
    # Compute paths
    print("Computing regularization path using the coordinate descent lasso...")
    model = LassoCV(cv=10, max_iter=3000).fit(X, y)

    # Display results
    m_log_alphas = -np.log10(model.alphas_)

    plt.figure()
    plt.plot(m_log_alphas, model.mse_path_, ':')
    plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha: CV estimate')

    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean square error')
    plt.title('Mean square error on each fold: coordinate descent')
    plt.axis('tight')
    plt.show()

    fields = iot.read_fields()
    for i in xrange(len(fields)):
        print str(fields[i]) +'\t'+ str(model.coef_[i])
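For reference, LassoCV's mse_path_ stores one mean-squared-error curve per cross-validation fold (shape: n_alphas x n_folds), which is why the snippet plots both the individual fold curves and their mean. A minimal self-contained sketch of the same pattern on synthetic data (assumption: scikit-learn and matplotlib are available; none of the project's helpers are needed):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import LassoCV

X, y = make_regression(n_samples=200, n_features=30, noise=10.0, random_state=0)
model = LassoCV(cv=10, max_iter=3000).fit(X, y)

m_log_alphas = -np.log10(model.alphas_)
plt.plot(m_log_alphas, model.mse_path_, ':')  # one MSE curve per fold
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha: CV estimate')
plt.legend()
plt.show()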
Example #2
File: regression.py Project: wtgme/ohsn
def parameter_select(X, y):
    print X.shape, y.shape
    ##############################################################################
    # LassoLarsIC: least angle regression with BIC/AIC criterion
    # model_bic = LassoLarsIC(criterion='bic')
    # model_bic.fit(X, y)
    # alpha_bic_ = model_bic.alpha_
    model_aic = LassoLarsIC(criterion='aic', max_iter=100000000)
    model_aic.fit(X, y)
    alpha_aic_ = model_aic.alpha_
    print alpha_aic_

    def plot_ic_criterion(model, name, color):
        alpha_ = model.alpha_
        alphas_ = model.alphas_
        criterion_ = model.criterion_
        plt.plot(-np.log10(alphas_), criterion_, '--', color=color,
                 linewidth=3, label='%s criterion' % name)
        plt.axvline(-np.log10(alpha_), color=color, linewidth=3,
                    label='alpha: %s estimate' % name)
        plt.xlabel('-log(alpha)')
        plt.ylabel('criterion')

    plt.figure()
    plot_ic_criterion(model_aic, 'AIC', 'b')
    # plot_ic_criterion(model_bic, 'BIC', 'r')
    plt.legend()
    plt.title('Information-criterion for model selection')
    plt.show()

    fields = iot.read_fields()
    for i in xrange(len(fields)):
        print str(fields[i]) +'\t'+ str(model_aic.coef_[i])
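Unlike the cross-validated LassoCV of Example #1, LassoLarsIC fits the LARS path once on the full data and picks alpha by an information criterion, so it is much cheaper but relies on the AIC/BIC assumptions. A minimal sketch on synthetic data (assumption: scikit-learn is available):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LassoLarsIC

X, y = make_regression(n_samples=200, n_features=30, noise=10.0, random_state=0)
model = LassoLarsIC(criterion='aic').fit(X, y)
print(model.alpha_)           # alpha minimizing the AIC
print(model.criterion_[:5])   # AIC values along the regularization path
print(-np.log10(model.alphas_[:5]))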
Example #3
File: statistics.py Project: wtgme/ohsn
def avg_liwc(dbname):
    fields = iot.read_fields()
    for field in fields:
        filters = {field: {'$exists': True}}
        results = list()
        N = 5
        for i in range(1, N+1):
            result = iot.get_values_one_field(dbname, dbname+'com_t'+str(i), field, filters)
            result = central_values(result)
            results.append(result)
        ax = plt.gca()
        ind = [y+1 for y in range(len(results))]
        means = [np.mean(result) for result in results]
        stds = [np.std(result) for result in results]
        ax.errorbar(ind, means, stds, fmt='--o', capthick=3)  # '--o--' is not a valid matplotlib format string
        ax.violinplot(results, showmeans=False, showextrema=True)
        ax.set_xticks(ind)
        # for i in ind:
        #     ax.text(i, means[i-1]+0.5,
        #         str(round(means[i-1], 2))+ '$\pm$'+ str(round(stds[i-1], 2)),
        #         ha='center', va='bottom', )
        ax.set_xticklabels(('Before 2012', '2012', '2013', '2014', 'After 2014'))
        ax.set_xlabel('Time Series')
        tokens = field.split('.')
        if tokens[-1] == 'value':
            ax.set_ylabel(tokens[-2].upper())
        else:
            ax.set_ylabel(tokens[-1])
        ax.grid(True)
        plt.savefig('data/'+field+'.pdf')
        plt.clf()
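central_values() is a project helper that is not shown here; given the 2.5/97.5 percentile trimming used elsewhere in these examples, a plausible sketch is (this implementation is an assumption, not the project's code):

import numpy as np

def central_values(values, lower=2.5, upper=97.5):
    # Keep only values inside the central percentile band, trimming outliers.
    values = np.asarray(values)
    lo, hi = np.percentile(values, [lower, upper])
    return values[(values >= lo) & (values <= hi)]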
Example #4
def classification_subfeature(train, test, outclss):
    fields = iot.read_fields()
    print len(fields)
    foi = ['liwc_anal.result.i',
           'liwc_anal.result.we',
           'liwc_anal.result.affect',
           'liwc_anal.result.posemo',
           'liwc_anal.result.negemo',
           'liwc_anal.result.bio',
           'liwc_anal.result.body',
           'liwc_anal.result.health',
           'liwc_anal.result.ingest']
    indeces = [np.where(fields==f)[0][0] for f in foi]
    print fields[indeces]

    '''Load Training data'''
    X_train, y_train = load_svmlight_file(train)
    X_train = X_train.toarray()[:, indeces]
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    print X_train.shape
    '''Load Test data'''
    X_test, y_test = load_svmlight_file(test)
    X_test = X_test.toarray()[:, indeces]
    X_test = scaler.transform(X_test)
    print X_test.shape

    svc_lin = SVC(kernel='linear', class_weight='balanced')
    y_lin = svc_lin.fit(X_train, y_train).predict(X_test)
    # pickle.dump(y_test, open(outid, 'w'))
    pickle.dump(y_lin, open(outclss, 'w'))
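The lookup indeces = [np.where(fields==f)[0][0] for f in foi] works only if read_fields() returns a NumPy array; with a plain list, fields == f degrades to a single boolean and the indexing fails. A defensive variant (hypothetical helper, not part of the project):

import numpy as np

def field_indices(fields, wanted):
    # Map field names to column indices, failing loudly on a missing field.
    fields = np.asarray(fields)
    indices = []
    for f in wanted:
        hits = np.where(fields == f)[0]
        if hits.size == 0:
            raise KeyError('field not found: %s' % f)
        indices.append(int(hits[0]))
    return indices

Note also that the snippet fits StandardScaler on the training split only and reuses it on the test split, which is the leakage-safe order of operations.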
Example #5
def common_features():
    '''No scoring metrics needed'''
    LIWC = iot.read_fields()
    LIWC = [line.strip().split('.')[-1] for line in LIWC]
    X1, y1 = load_scale_data('data/ed-random.data')
    X2, y2 = load_scale_data('data/ed-young.data')

    '''Feature rankings'''
    ref1 = ref(X1, y1)
    support1, ranking1 = ref1.support_, ref1.ranking_
    convert_fields(LIWC, ranking1)

    ref2 = ref(X2, y2)
    support2, ranking2 = ref2.support_, ref2.ranking_
    convert_fields(LIWC, ranking2)
    # # X3, y3 = load_scale_data('data/ed-all-liwc.data')
    # # ref3 = ref(X3, y3, 69)
    # # support3, ranking3 = ref3.support_, ref3.ranking_
    # # convert_fields(LIWC, ranking3)

    comm = np.logical_and(support1, support2)
    convert_fields(LIWC, comm)
    pickle.dump(comm, open('data/ed-random-young-common.pick', 'w'))
    # svm_cv(X1[:, support1], y1)
    # svm_cv(X2[:, support2], y2)
    # # svm_cv(X3[:, support3], y3)
    # svm_cv(X1[:, comm], y1)
    # svm_cv(X2[:, comm], y2)
    # svm_cv(X3[:, comm], y3)

    '''Classify with common features'''
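ref() is a project helper whose fitted result exposes support_ and ranking_, and whose optional third argument is the number of features to keep (see ref(X, y, 1) in a later example). That matches scikit-learn's recursive feature elimination, so a plausible sketch is (assumption: an RFE wrapper around a linear SVM; the actual estimator may differ):

from sklearn.feature_selection import RFE
from sklearn.svm import SVC

def ref(X, y, n_features=10):
    # Recursive feature elimination; the fitted selector carries
    # support_ (boolean mask) and ranking_ (1 = selected).
    selector = RFE(SVC(kernel='linear'), n_features_to_select=n_features, step=1)
    return selector.fit(X, y)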
Example #6
def msepath(X, y):
    print X.shape, y.shape
    # Compute paths
    print(
        "Computing regularization path using the coordinate descent lasso...")
    model = LassoCV(cv=10, max_iter=3000).fit(X, y)

    # Display results
    m_log_alphas = -np.log10(model.alphas_)

    plt.figure()
    plt.plot(m_log_alphas, model.mse_path_, ':')
    plt.plot(m_log_alphas,
             model.mse_path_.mean(axis=-1),
             'k',
             label='Average across the folds',
             linewidth=2)
    plt.axvline(-np.log10(model.alpha_),
                linestyle='--',
                color='k',
                label='alpha: CV estimate')

    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean square error')
    plt.title('Mean square error on each fold: coordinate descent')
    plt.axis('tight')
    plt.show()

    fields = iot.read_fields()
    for i in xrange(len(fields)):
        print str(fields[i]) + '\t' + str(model.coef_[i])
Example #7
def read_user_time(filename):
    fields = iot.read_fields()
    trimed_fields = [field.split('.')[-1] for field in fields]
    groups = [
         ('ED', 'fed', 'com', {'liwc_anal.result.WC': {'$exists': True}, 'level': 1}),
         ('RD', 'random', 'scom', {'liwc_anal.result.WC': {'$exists': True}}),
         ('YG', 'younger', 'scom', {'liwc_anal.result.WC': {'$exists': True}})
    ]

    data = []
    for tag, dbname, comname, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)

        for user in com.find(filter_values, no_cursor_timeout=True):
            if 'status' in user:
                created_at = datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                scraped_at = user['scrape_timeline_at']
                last_post = datetime.strptime(user['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                life_time = diff_day(last_post, created_at)
                average_time = float(life_time)/max(1, user['statuses_count'])  # max(), not min(): guard against zero statuses while dividing by the real tweet count
                longest_tweet_intervalb = user['longest_tweet_interval']

                observation_interval = diff_day(scraped_at, last_post)
                if (observation_interval-longest_tweet_intervalb) > 30:
                    death = 1
                else:
                    death = 0
                values = iot.get_fields_one_doc(user, fields)
                data.append([user['id_str'], created_at, last_post, scraped_at, average_time,
                             longest_tweet_intervalb, observation_interval, tag, death] + values)

    df = pd.DataFrame(data, columns=['uid', 'created_at', 'last_post', 'scraped_at', 'average_time',
                                     'longest_time_interval', 'observation_interval', 'group',
                                     'event'] + trimed_fields)
    df.to_csv(filename)
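The event column is a censoring indicator: a user is marked dead (death = 1) only if the silence between their last post and the scrape exceeds their own longest posting gap by more than 30 days. diff_day() is not shown; a plausible sketch (assumption: it returns the signed difference in whole days):

def diff_day(d1, d2):
    # Assumed helper: whole days between two datetime objects (d1 - d2).
    return (d1 - d2).days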
Example #8
def avg_liwc(dbname):
    fields = iot.read_fields()
    for field in fields:
        filters = {field: {'$exists': True}}
        results = list()
        N = 5
        for i in range(1, N + 1):
            result = iot.get_values_one_field(dbname,
                                              dbname + 'com_t' + str(i), field,
                                              filters)
            result = central_values(result)
            results.append(result)
        ax = plt.gca()
        ind = [y + 1 for y in range(len(results))]
        means = [np.mean(result) for result in results]
        stds = [np.std(result) for result in results]
        ax.errorbar(ind, means, stds, fmt='--o', capthick=3)  # '--o--' is not a valid matplotlib format string
        ax.violinplot(results, showmeans=False, showextrema=True)
        ax.set_xticks(ind)
        # for i in ind:
        #     ax.text(i, means[i-1]+0.5,
        #         str(round(means[i-1], 2))+ '$\pm$'+ str(round(stds[i-1], 2)),
        #         ha='center', va='bottom', )
        ax.set_xticklabels(
            ('Before 2012', '2012', '2013', '2014', 'After 2014'))
        ax.set_xlabel('Time Series')
        tokens = field.split('.')
        if tokens[-1] == 'value':
            ax.set_ylabel(tokens[-2].upper())
        else:
            ax.set_ylabel(tokens[-1])
        ax.grid(True)
        plt.savefig('data/' + field + '.pdf')
        plt.clf()
Example #9
def liwc_color_bar(fieldname):
    X, y = load_scale_data('data/ygcolor.data', True)
    group = 10
    # print X.shape
    y = np.array(y).ravel()
    LIWC = iot.read_fields()
    T = X[:, np.argwhere(LIWC == fieldname).ravel()]
    T = np.repeat(T, 3)
    # fig, ax = plt.subplots()
    print T.shape
    print y.shape
    yhist, ybin_edges = np.histogram(y, [1, 2, 3, 4])
    # print yhist
    xhist, xbin_edges = np.histogram(T, group, range=(np.percentile(T, 2.5), np.percentile(T, 97.5)))
    H = np.histogram2d(T, y, bins=[xbin_edges, ybin_edges])
    print xbin_edges
    ind = np.arange(group)  # the x locations for the groups
    width = 0.35  # the width of the bars: can also be len(x) sequence
    # print H[0][:, 0]
    # print H[0][:, 1]
    # print H[0][:, 2]
    # print H[0][:, 0]+H[0][:, 1]+H[0][:, 2]
    # print xhist
    # print H[0][:, 0]/xhist
    p1 = plt.bar(ind, H[0][:, 0]/xhist, width, color='r', hatch="\\\\")
    p2 = plt.bar(ind, H[0][:, 1]/xhist, width, color='g', hatch="//", bottom=H[0][:, 0]/xhist)
    p3 = plt.bar(ind, H[0][:, 2]/xhist, width, color='b', hatch="--", bottom=(H[0][:, 0] + H[0][:, 1])/xhist)
    plt.xticks(ind+width/2., np.around(0.5*(xbin_edges[1:] + xbin_edges[:-1]), decimals=4))
    # [::3] choose one every three items
    # plt.xticks(ind + width / 2., ind)
    plt.ylabel('Ratio')
    plt.xlabel('Value')
    plt.title('Sentiment class counts of colors by LIWC field ' + fieldname)
    plt.legend((p1[0], p2[0], p3[0]), ('Positive', 'Neutral', 'Negative'))
    plt.show()
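Each stacked bar is a conditional distribution: np.histogram2d counts (feature-bin, sentiment-class) pairs, and dividing each column of counts by the per-bin totals xhist turns counts into within-bin class ratios. The same normalization in isolation (sketch on synthetic data):

import numpy as np

rng = np.random.RandomState(0)
t = rng.randn(300)                       # continuous feature values
c = rng.randint(1, 4, size=300)          # class labels 1..3
counts, xedges, yedges = np.histogram2d(t, c, bins=[10, [1, 2, 3, 4]])
ratios = counts / counts.sum(axis=1, keepdims=True)  # within-bin class ratios (rows sum to 1 where bins are non-empty)
print(ratios)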
Example #10
def liwc_color_sig():
    X, y = load_scale_data('data/ygcolor.data', True)
    LIWC = iot.read_fields()
    flags = list()
    for yi in y:
        if yi[0]==yi[1] and yi[1]==yi[2]:
            flags.append(True)
        else:
            flags.append(False)
    y = np.array([(b, c, d) for (b, c, d) in y])
    flags = np.array(flags)
    y = y[flags][:, 0]
    yhist, ybin_edges = np.histogram(y, [1, 2, 3, 4])
    print yhist
    y[np.where(y < 3)] = +1
    y[np.where(y==3)] = -1
    print y.shape
    print len(y[np.where(y==1)])
    print len(y[np.where(y==-1)])
    X = X[flags, :]
    print X.shape

    # rfecv1 = rfecv(X, y)
    # pickle.dump(rfecv1, open('data/allrfcv.p', 'w'))
    # rfecv1 = pickle.load(open('data/allrfcv.p', 'r'))
    # scores = list()
    # scores.append(rfecv1)
    # plot_rfecvs(scores, ['All Negative or All Non-negative'])

    ref2 = ref(X, y, 1)
    support2, ranking2 = ref2.support_, ref2.ranking_
    print ranking2
    convert_fields(LIWC, ranking2)
Example #11
def liwc_color_bar(fieldname):
    X, y = load_scale_data('data/ygcolor.data', True)
    group = 10
    # print X.shape
    y = np.array(y).ravel()
    LIWC = iot.read_fields()
    T = X[:, np.argwhere(LIWC == fieldname).ravel()]
    T = np.repeat(T, 3)
    # fig, ax = plt.subplots()
    print T.shape
    print y.shape
    yhist, ybin_edges = np.histogram(y, [1, 2, 3, 4])
    # print yhist
    xhist, xbin_edges = np.histogram(T, group, range=(np.percentile(T, 2.5), np.percentile(T, 97.5)))
    H = np.histogram2d(T, y, bins=[xbin_edges, ybin_edges])
    print xbin_edges
    ind = np.arange(group)  # the x locations for the groups
    width = 0.35  # the width of the bars: can also be len(x) sequence
    # print H[0][:, 0]
    # print H[0][:, 1]
    # print H[0][:, 2]
    # print H[0][:, 0]+H[0][:, 1]+H[0][:, 2]
    # print xhist
    # print H[0][:, 0]/xhist
    p1 = plt.bar(ind, H[0][:, 0]/xhist, width, color='r', hatch="\\\\")
    p2 = plt.bar(ind, H[0][:, 1]/xhist, width, color='g', hatch="//", bottom=H[0][:, 0]/xhist)
    p3 = plt.bar(ind, H[0][:, 2]/xhist, width, color='b', hatch="--", bottom=(H[0][:, 0] + H[0][:, 1])/xhist)
    plt.xticks(ind+width/2., np.around(0.5*(xbin_edges[1:] + xbin_edges[:-1]), decimals=4))
    # [::3] choose one every three items
    # plt.xticks(ind + width / 2., ind)
    plt.ylabel('Ratio')
    plt.xlabel('Value')
    plt.title('Sentiment class counts of colors by LIWC field ' + fieldname)
    plt.legend((p1[0], p2[0], p3[0]), ('Positive', 'Neutral', 'Negative'))
    plt.show()
Example #12
def common_features():
    '''No scoring metrics needed'''
    LIWC = iot.read_fields()
    LIWC = [line.strip().split('.')[-1] for line in LIWC]
    X1, y1 = load_scale_data('data/ed-random.data')
    X2, y2 = load_scale_data('data/ed-young.data')

    '''Feature rankings'''
    ref1 = ref(X1, y1)
    support1, ranking1 = ref1.support_, ref1.ranking_
    convert_fields(LIWC, ranking1)

    ref2 = ref(X2, y2)
    support2, ranking2 = ref2.support_, ref2.ranking_
    convert_fields(LIWC, ranking2)
    # # X3, y3 = load_scale_data('data/ed-all-liwc.data')
    # # ref3 = ref(X3, y3, 69)
    # # support3, ranking3 = ref3.support_, ref3.ranking_
    # # convert_fields(LIWC, ranking3)

    comm = np.logical_and(support1, support2)
    convert_fields(LIWC, comm)
    pickle.dump(comm, open('data/ed-random-young-common.pick', 'w'))
    # svm_cv(X1[:, support1], y1)
    # svm_cv(X2[:, support2], y2)
    # # svm_cv(X3[:, support3], y3)
    # svm_cv(X1[:, comm], y1)
    # svm_cv(X2[:, comm], y2)
    # svm_cv(X3[:, comm], y3)

    '''Classify with common features'''
Example #13
def liwc_color_sig():
    X, y = load_scale_data('data/ygcolor.data', True)
    LIWC = iot.read_fields()
    flags = list()
    for yi in y:
        if yi[0]==yi[1] and yi[1]==yi[2]:
            flags.append(True)
        else:
            flags.append(False)
    y = np.array([(b, c, d) for (b, c, d) in y])
    flags = np.array(flags)
    y = y[flags][:, 0]
    yhist, ybin_edges = np.histogram(y, [1, 2, 3, 4])
    print yhist
    y[np.where(y < 3)] = +1
    y[np.where(y==3)] = -1
    print y.shape
    print len(y[np.where(y==1)])
    print len(y[np.where(y==-1)])
    X = X[flags, :]
    print X.shape

    # rfecv1 = rfecv(X, y)
    # pickle.dump(rfecv1, open('data/allrfcv.p', 'w'))
    # rfecv1 = pickle.load(open('data/allrfcv.p', 'r'))
    # scores = list()
    # scores.append(rfecv1)
    # plot_rfecvs(scores, ['All Negative or All Non-negative'])

    ref2 = ref(X, y, 1)
    support2, ranking2 = ref2.support_, ref2.ranking_
    print ranking2
    convert_fields(LIWC, ranking2)
Example #14
File: tag_network.py Project: wtgme/ohsn
def friend_network_hashtag_weight(dbname, netname):
    '''
    Weight friendship-network edges by hashtag-profile similarity for
    later community detection (user_hash_profile is loaded from a pickle,
    not passed as a parameter)
    :param dbname:
    :param netname:
    :return:
    '''
    user_hash_profile = pickle.load(open('data/user-hash-profile.pick', 'r'))
    net = gt.load_network(dbname, netname)
    fields = iot.read_fields()
    com = dbt.db_connect_col(dbname, 'scom')
    for edge in net.es:
        source_vertex_id = edge.source
        target_vertex_id = edge.target
        source_uid = int(net.vs[source_vertex_id]['name'])
        target_uid = int(net.vs[target_vertex_id]['name'])
        source_user = com.find_one({'id':source_uid})
        target_user = com.find_one({'id':target_uid})
        source_user_liwc = iot.get_fields_one_doc(source_user, fields)
        target_user_liwc = iot.get_fields_one_doc(target_user, fields)
        source_user_liwc.extend(user_hash_profile[source_uid])
        target_user_liwc.extend(user_hash_profile[target_uid])
        print len(target_user_liwc)
        dis = spatial.distance.euclidean(source_user_liwc, target_user_liwc)
        edge['weight'] = 1.0/(1.0 + dis)
    net.write_graphml('ed_weighted_follow.graphml')
Example #15
def friend_network_hashtag_weight(dbname, netname):
    '''
    Weight friendship-network edges by hashtag-profile similarity for
    later community detection (user_hash_profile is loaded from a pickle,
    not passed as a parameter)
    :param dbname:
    :param netname:
    :return:
    '''
    user_hash_profile = pickle.load(open('data/user-hash-profile.pick', 'r'))
    net = gt.load_network(dbname, netname)
    fields = iot.read_fields()
    com = dbt.db_connect_col(dbname, 'scom')
    for edge in net.es:
        source_vertex_id = edge.source
        target_vertex_id = edge.target
        source_uid = int(net.vs[source_vertex_id]['name'])
        target_uid = int(net.vs[target_vertex_id]['name'])
        source_user = com.find_one({'id': source_uid})
        target_user = com.find_one({'id': target_uid})
        source_user_liwc = iot.get_fields_one_doc(source_user, fields)
        target_user_liwc = iot.get_fields_one_doc(target_user, fields)
        source_user_liwc.extend(user_hash_profile[source_uid])
        target_user_liwc.extend(user_hash_profile[target_uid])
        print len(target_user_liwc)
        dis = spatial.distance.euclidean(source_user_liwc, target_user_liwc)
        edge['weight'] = 1.0 / (1.0 + dis)
    net.write_graphml('ed_weighted_follow.graphml')
Example #16
def plot_distribution(dbname='fed', comname='scom'):
    # Plot difference between retweeted and liked tweets
    fields = iot.read_fields()
    for field in fields:
        tokens = field.split('.')
        retweet_key = field.replace('liwc_anal', 'retweet_liwc')
        like_key = field.replace('liwc_anal', 'like_liwc')
        retweets = iot.get_values_one_field(dbname, comname, retweet_key)
        likes = iot.get_values_one_field(dbname, comname, like_key)
        pt.plot_config()
        sns.distplot(retweets,
                     hist=False,
                     kde_kws={
                         "color": "r",
                         "lw": 2,
                         "marker": 'o'
                     },
                     label=r'RT ($\mu=%0.2f \pm %0.2f$)' %
                     (np.mean(retweets), np.std(retweets)))
        sns.distplot(likes,
                     hist=False,
                     kde_kws={
                         "color": "g",
                         "lw": 2,
                         "marker": 's'
                     },
                     label=r'Like ($\mu=%0.2f \pm %0.2f$)' %
                     (np.mean(likes), np.std(likes)))
        plt.legend(loc="best")
        plt.xlabel(tokens[-1])
        plt.ylabel('P')
        plt.savefig('data/' + tokens[-1] + '.pdf', bbox_inches='tight')
        plt.clf()
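sns.distplot with hist=False draws only a kernel density estimate; on seaborn 0.11+ distplot is deprecated and the equivalent call is sns.kdeplot. A minimal sketch of the modern form (assumption: seaborn >= 0.11):

import numpy as np
import seaborn as sns

values = np.random.randn(100)
sns.kdeplot(values, color='r', lw=2,
            label=r'RT ($\mu=%0.2f \pm %0.2f$)' % (np.mean(values), np.std(values)))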
Example #17
def classification_subfeature(train, test, outclss):
    fields = iot.read_fields()
    print len(fields)
    foi = ['liwc_anal.result.i',
           'liwc_anal.result.we',
           'liwc_anal.result.affect',
           'liwc_anal.result.posemo',
           'liwc_anal.result.negemo',
           'liwc_anal.result.bio',
           'liwc_anal.result.body',
           'liwc_anal.result.health',
           'liwc_anal.result.ingest']
    indeces = [np.where(fields==f)[0][0] for f in foi]
    print fields[indeces]

    '''Load Training data'''
    X_train, y_train = load_svmlight_file(train)
    X_train = X_train.toarray()[:, indeces]
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    print X_train.shape
    '''Load Test data'''
    X_test, y_test = load_svmlight_file(test)
    X_test = X_test.toarray()[:, indeces]
    X_test = scaler.transform(X_test)
    print X_test.shape

    svc_lin = SVC(kernel='linear', class_weight='balanced')
    y_lin = svc_lin.fit(X_train, y_train).predict(X_test)
    # pickle.dump(y_test, open(outid, 'w'))
    pickle.dump(y_lin, open(outclss, 'w'))
Example #18
def feature_rank(file_path):
    # Ranking feature usefulness
    LIWC = iot.read_fields()[17:]
    LIWC = [line.strip().split('.')[-1] for line in LIWC]
    X1, y1 = load_scale_data(file_path)
    ref1 = ref(X1, y1)
    support1, ranking1 = ref1.support_, ref1.ranking_
    convert_fields(LIWC, ranking1)
Example #19
def feature_stat(dumped=False):
    fields = io.read_fields()
    print len(fields)
    assert isinstance(fields, object)  # trivially true for any Python object; effectively a no-op check
    for field in fields:
        keys = field.split('.')
        filter = {field: {'$exists': True}}
        eds = io.get_values_one_field('fed', 'scom', field, filter)
        randoms = io.get_values_one_field('random', 'scom', field, filter)
        youngs = io.get_values_one_field('young', 'scom', field, filter)
        compore_distribution(keys[-1], eds, randoms, youngs)
Example #20
File: k_core.py Project: wtgme/ohsn
def coreness_features(g):
    """Correlation of K-core and feature values"""
    g = g.as_undirected(mode="collapse")
    all_coreness = g.shell_index(mode='ALL')
    g.vs['core'] = all_coreness
    fields = iot.read_fields()
    for field in fields:
        gt.add_attribute(g, 'pof', 'fed', 'com', field)
        # -1000000000.0 marks vertices whose users lack this field; exclude them
        vlist = g.vs.select(pof_ne=-1000000000.0)['core']
        flist = g.vs.select(pof_ne=-1000000000.0)['pof']
        pt.correlation(vlist, flist, 'K-Core', 'Feature', 'data/corerel/'+field+'.pdf')
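shell_index() is python-igraph's name for coreness, i.e. the k-core number of each vertex. In isolation (sketch; the random graph is only for illustration):

import igraph as ig

g = ig.Graph.Erdos_Renyi(n=100, m=300)
cores = g.shell_index(mode='ALL')  # k-core number per vertex, ignoring edge direction
print(max(cores))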
Example #21
File: k_core.py Project: abiraja2004/ohsn
def coreness_features(g):
    """Correlation of K-core and feature values"""
    g = g.as_undirected(mode="collapse")
    all_coreness = g.shell_index(mode='ALL')
    g.vs['core'] = all_coreness
    fields = iot.read_fields()
    for field in fields:
        gt.add_attribute(g, 'pof', 'fed', 'com', field)
        # -1000000000.0 marks vertices whose users lack this field; exclude them
        vlist = g.vs.select(pof_ne=-1000000000.0)['core']
        flist = g.vs.select(pof_ne=-1000000000.0)['pof']
        pt.correlation(vlist, flist, 'K-Core', 'Feature',
                       'data/corerel/' + field + '.pdf')
Example #22
def network_stats(dbname, com, fnet, bnet):
    fields = iot.read_fields()
    # print ('Feature, #Nodes, #Edges, %Nodes, %Edges, D_assort, F_assort, F_assort, Mean, STD, z_score, p_value')
    print(
        'Network_Feature \t #Nodes \t #Edges \t X_Min \t X_Max \t X_P2.5 \t X_P97.5 \t Y_Min \t Y_Max \t Y_P2.5 \t Y_P97.5 \t Tau_coef \t p_value'
    )
    print 'Following'
    fnetwork = gt.load_network(dbname, fnet)
    '''Output file for Gephi'''
    # fnetwork.write_dot('friendship.DOT')

    gt.net_stat(fnetwork)
    # outputs = feature_assort_friend(fnetwork, dbname, com, fields, directed=True)
    outputs = rank_feature(fnetwork, dbname, com, fields, directed=True)
Example #23
File: net_stat.py Project: wtgme/ohsn
def network_stats(dbname, com, fnet, bnet):
    fields = iot.read_fields()
    # print ('Feature, #Nodes, #Edges, %Nodes, %Edges, D_assort, F_assort, F_assort, Mean, STD, z_score, p_value')
    print (
        "Network_Feature \t #Nodes \t #Edges \t X_Min \t X_Max \t X_P2.5 \t X_P97.5 \t Y_Min \t Y_Max \t Y_P2.5 \t Y_P97.5 \t Tau_coef \t p_value"
    )
    print "Following"
    fnetwork = gt.load_network(dbname, fnet)

    """Out put file for Gephi"""
    # fnetwork.write_dot('friendship.DOT')

    gt.net_stat(fnetwork)
    # outputs = feature_assort_friend(fnetwork, dbname, com, fields, directed=True)
    outputs = rank_feature(fnetwork, dbname, com, fields, directed=True)
Example #24
def fs_svm(X, y):
    # feature selection with SVM model
    lsvc = LinearSVC(C=0.001, penalty="l1", dual=False).fit(X, y)
    model = SelectFromModel(lsvc, prefit=True)
    X_new = model.transform(X)

    LIWC = iot.read_fields()
    print 'Original feature size', X.shape
    print 'New feature size', X_new.shape
    sample_X = X[0]
    sample_X_new = X_new[0]
    print 'Original feature length of sample', len(set(sample_X))
    print 'New feature length of sample', len(set(sample_X_new))
    for i in xrange(len(sample_X)):
        if sample_X[i] in sample_X_new:
            print i+1, LIWC[i]
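The membership test sample_X[i] in sample_X_new recovers the selected columns only when feature values happen to be unique; SelectFromModel exposes the selection mask directly, which is unambiguous. A self-contained sketch of the robust approach (synthetic data; assumption: scikit-learn is available):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
selected = np.flatnonzero(model.get_support())  # indices of kept features
print(selected)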
Example #25
def feature_stat(dumped=False):
    fields = io.read_fields()
    print len(fields)
    assert isinstance(fields, object)  # trivially true for any Python object; effectively a no-op check
    for field in fields:
        keys = field.split('.')
        # filter = {field: {'$exists': True}}
        # eds = io.get_values_one_field('fed', 'scom', field, filter)
        # randoms = io.get_values_one_field('random', 'scom', field, filter)
        # youngs = io.get_values_one_field('young', 'scom', field, filter)
        # compore_distribution(keys[-1], eds, randoms, youngs)

        positive = io.get_values_one_field('depression', 'com', field, {field: {'$exists': True}, 'checked': True})
        negative = io.get_values_one_field('depression', 'neg_com', field, {field: {'$exists': True}})
        # print len(positive), len(negative)
        compore_distribution(keys[-1], positive, negative)
Example #26
def fs_svm(X, y):
    # feature selection with SVM model
    lsvc = LinearSVC(C=0.001, penalty="l1", dual=False).fit(X, y)
    model = SelectFromModel(lsvc, prefit=True)
    X_new = model.transform(X)

    LIWC = iot.read_fields()
    print 'Original feature size', X.shape
    print 'New feature size', X_new.shape
    sample_X = X[0]
    sample_X_new = X_new[0]
    print 'Original feature length of sample', len(set(sample_X))
    print 'New feature length of sample', len(set(sample_X_new))
    for i in xrange(len(sample_X)):
        if sample_X[i] in sample_X_new:
            print i+1, LIWC[i]
Example #27
def out_data():
    control = dbt.db_connect_col('fed', 'control_com')
    treat = dbt.db_connect_col('fed', 'treat_com')
    control_user = iot.get_values_one_field('fed', 'control_com', 'id', {'prior_liwc.result.WC':{'$exists': True},
                                                                'post_liwc.result.WC':{'$exists': True}})
    treat_user = iot.get_values_one_field('fed', 'treat_com', 'id', {'prior_liwc.result.WC':{'$exists': True},
                                                                'post_liwc.result.WC':{'$exists': True}})
    data = []
    fields = iot.read_fields()
    prefix = ['prior_liwc', 'post_liwc']
    for i in xrange(2):
        uids = [control_user, treat_user][i]
        for uid in uids:
            user = [control, treat][i].find_one({'id': uid})
            for j in xrange(2):
                fields_new = ['id_str']+[field.replace('liwc_anal', prefix[j]) for field in fields]
                values = iot.get_fields_one_doc(user, fields_new)
                data.append(values+[i, j])

    df = pd.DataFrame(data, columns=['id']+[field.split('.')[-1] for field in fields]+['treated', 'time'])

    df.to_csv('treatment.csv')
Example #28
def parameter_select(X, y):
    print X.shape, y.shape
    ##############################################################################
    # LassoLarsIC: least angle regression with BIC/AIC criterion
    # model_bic = LassoLarsIC(criterion='bic')
    # model_bic.fit(X, y)
    # alpha_bic_ = model_bic.alpha_
    model_aic = LassoLarsIC(criterion='aic', max_iter=100000000)
    model_aic.fit(X, y)
    alpha_aic_ = model_aic.alpha_
    print alpha_aic_

    def plot_ic_criterion(model, name, color):
        alpha_ = model.alpha_
        alphas_ = model.alphas_
        criterion_ = model.criterion_
        plt.plot(-np.log10(alphas_),
                 criterion_,
                 '--',
                 color=color,
                 linewidth=3,
                 label='%s criterion' % name)
        plt.axvline(-np.log10(alpha_),
                    color=color,
                    linewidth=3,
                    label='alpha: %s estimate' % name)
        plt.xlabel('-log(alpha)')
        plt.ylabel('criterion')

    plt.figure()
    plot_ic_criterion(model_aic, 'AIC', 'b')
    # plot_ic_criterion(model_bic, 'BIC', 'r')
    plt.legend()
    plt.title('Information-criterion for model selection')
    plt.show()

    fields = iot.read_fields()
    for i in xrange(len(fields)):
        print str(fields[i]) + '\t' + str(model_aic.coef_[i])
Example #29
def bmi_regreesion(dbname, colname, filename):
    # regress BMI on features
    fields = iot.read_fields()
    poi_fields = fields[-9:-1]
    print poi_fields
    trimed_fields = [(field.split('.')[-1]) for field in fields]
    trimed_fields[-10:] = [
        'sentiment', 'age', 'gender', 'height', 'cw', 'gw', 'cbmi', 'gbmi',
        'edword', 'level'
    ]

    com = dbutil.db_connect_col(dbname, colname)
    data = []
    # for user in com.find({'$or': [{'text_anal.cbmi.value': {'$exists': True}},
    #                               {'text_anal.gbmi.value': {'$exists': True}}],
    #                       'liwc_anal.result.WC': {'$exists': True}}, no_cursor_timeout=True):
    com2 = dbutil.db_connect_col('fed2', colname)
    com3 = dbutil.db_connect_col('fed3', colname)
    for user in com.find({'liwc_anal.result.WC': {
            '$exists': True
    }},
                         no_cursor_timeout=True):
        values = iot.get_fields_one_doc(user, fields)
        user2 = com2.find_one({'id': user['id']})
        if user2:
            values.extend(iot.get_fields_one_doc(user2, poi_fields))
        else:
            values.extend([0] * len(poi_fields))
        user3 = com3.find_one({'id': user['id']})
        if user3:
            values.extend(iot.get_fields_one_doc(user3, poi_fields))
        else:
            values.extend([0] * len(poi_fields))
        data.append(values)
    df = pd.DataFrame(data,
                      columns=trimed_fields + [(field.split('.')[-2] + '_p2')
                                               for field in poi_fields] +
                      [(field.split('.')[-2] + '_p3') for field in poi_fields])
    df.to_csv(filename)
Example #30
    # ed_bio_sta('fed', 'scom')
    # fields = [
    #               # 'text_anal.gw.value',
    #               # 'text_anal.cw.value',
    #               # 'text_anal.edword_count.value',
    #               # 'text_anal.h.value',
    #               # 'text_anal.a.value',
    #               # 'text_anal.bmi.value',
    #               'text_anal.cbmi.value',
    #               'text_anal.gbmi.value',
    #               # 'text_anal.lw.value',
    #               # 'text_anal.hw.value'
    # ]
    # plot_bio('fed', 'scom', fields, ['CBMI', 'GBMI'])

    # bmi_regreesion('fed', 'com', 'data/bmi_reg.csv')

    fields = iot.read_fields()
    poi_fields = fields[-9:-1]
    print poi_fields
    trimed_fields = [(field.split('.')[-1]) for field in fields]
    trimed_fields[-10:] = [
        'sentiment', 'age', 'gender', 'height', 'cw', 'gw', 'cbmi', 'gbmi',
        'edword', 'level'
    ]
    df = pd.read_csv('data/bmi_reg.csv', index_col=0)
    df.columns = trimed_fields + [
        (field.split('.')[-2] + '_p2') for field in poi_fields
    ] + [(field.split('.')[-2] + '_p3') for field in poi_fields]
    df.to_csv('data/bmi_reg.csv')
Example #31
def liwc_feature():
    fields = iot.read_fields()
    for field in fields:
        values = iot.get_values_one_field('depression', 'users1', field)
        print field, np.mean(values), np.std(values)
Example #32
def network_assort():
    # test network assortativity
    gs = ['edfollow', 'follow', 'retweet', 'communication']
    fields = iot.read_fields()
    # print len(fields)
    for gf in gs[1:]:
        g = gt.Graph.Read_GraphML('data/' + gf + '_net.graphml')
        # g = gt.giant_component(g)
        # gt.net_stat(g)
        sigs = []
        for filed in fields:
            g = gt.add_attribute(g, 'foi', 'depression', 'com', filed)
            raw_values = np.array(g.vs['foi'])
            values = drop_initials(raw_values)
            if len(values) > 100:
                output = gf + ',' + filed.split('.')[-1] + ','
                # maxv, minv = np.percentile(values, 97.5), np.percentile(values, 2.5)
                maxv, minv = max(values), min(values)
                vs = g.vs.select(foi_ge=minv, foi_le=maxv)
                sg = g.subgraph(vs)
                raw_assort = sg.assortativity('foi', 'foi', directed=True)
                ass_list = []
                for i in xrange(1000):
                    np.random.shuffle(raw_values)
                    g.vs["foi"] = raw_values
                    vs = g.vs.select(foi_ge=minv, foi_le=maxv)
                    sg = g.subgraph(vs)
                    ass_list.append(
                        sg.assortativity('foi', 'foi', directed=True))

                ass_list = np.array(ass_list)
                amean, astd = np.mean(ass_list), np.std(ass_list)
                absobserved = abs(raw_assort)
                pval = (np.sum(ass_list >= absobserved) +
                        np.sum(ass_list <= -absobserved)) / float(
                            len(ass_list))
                zscore = (raw_assort - amean) / astd
                output += format(raw_assort, '.2f') + ',' + format(amean, '.2f') + ',' + \
                          format(astd, '.2f') + ',' + format(zscore, '.2f') + ',' + format(pval, '.3f') + ','
                if pval < 0.001:
                    output += '***'
                    if raw_assort > 0:
                        sigs.append('***')
                    print output
                    continue
                if pval < 0.01:
                    output += '**'
                    if raw_assort > 0:
                        sigs.append('**')
                    print output
                    continue
                if pval < 0.05:
                    output += '*'
                    if raw_assort > 0:
                        sigs.append('*')
                    print output
                    continue
                else:
                    sigs.append('N')
                    print output
                    continue
        c = Counter(sigs)
        print c
        for sig, cou in c.items():
            print sig, 1.0 * cou / len(fields)
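The significance test in this example is a permutation test: shuffle the vertex attribute, recompute assortativity on the same subgraph selection, and locate the observed coefficient in the null distribution. The two-sided p-value used above, in isolation (sketch):

import numpy as np

def permutation_pvalue(observed, null_samples):
    # Fraction of null samples at least as extreme as |observed| (two-sided).
    null_samples = np.asarray(null_samples)
    extreme = (np.sum(null_samples >= abs(observed)) +
               np.sum(null_samples <= -abs(observed)))
    return extreme / float(len(null_samples))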
Example #33
    # ygimage = pickle.load(open('data/ygimage.pick', 'r'))
    # print len(ygimage)
    # labels = map_color_label(ygimage)
    # pickle.dump(labels, open('data/yglabels.pick', 'w'))
    # labels = pickle.load(open('data/yglabels.pick', 'r'))
    # print labels
    # senti = map_label_senti(labels)
    # pickle.dump(senti, open('data/ygsentis.pick', 'w'))
    # senti = pickle.load(open('data/ygsentis.pick', 'r'))
    # LIWC = io.read_fields()
    # print len(LIWC)
    # print len(senti)
    # print senti
    # color_classify(senti, LIWC, 'data/ygcolor', 'young')
    """Generate Data for user classification"""
    fields = io.read_fields()
    print len(fields)
    # common = pickle.load(open('data/common.pick', 'r'))
    # fields = LIWC[common]
    # print len(LIWC[common])
    # print fields
    #
    # # common users in random and young = set([4319191638L, 2627223434L, 2976822286L, 4788248335L, 3289264086L, 520847919, 439647015, 947539758, 617442479, 2481703728L, 2913311029L, 3760687289L, 2303011905L, 1712561862, 2882255303L, 261549132, 982895821, 2849269327L, 312684498, 160044558, 774072534, 330611545, 430569947, 1275228253, 3399616094L, 2924322143L, 457692129, 3006221026L, 2837359399L, 18942418, 2848241137L, 273768180, 235857269, 3315086840L])
    # # fed, random, young
    # users = potential_users('fed', 'com')

    # triangle = pickle.load(open('data/triangle.pick', 'r'))
    # print triangle

    # feature_output(fields, 'data/random-younger', 'younger', '-1', False, [])
Example #34
def roc_plot_feature(datafile):
    X, y = load_scale_data(datafile)
    fields = iot.read_fields()
    trim_files = [f.split('.')[-1] for f in fields]
    print len(trim_files)
    select_f = [
        'friend_count',
        'status_count',
        'follower_count',
        'friends_day',
        'statuses_day',
        'followers_day',
        'retweet_pro',
        'dmention_pro',
        'reply_pro',
        # 'hashtag_pro',
        # 'url_pro',
        'retweet_div',
        'mention_div',
        'reply_div',
        'i',
        'we',
        'swear',
        'negate',
        'body',
        'health',
        'ingest',
        'social',
        'posemo',
        'negemo'
    ]

    indecs = [trim_files.index(f) for f in select_f]
    print indecs
    X = X[:, indecs]
    # '''Calculate positive emotion ratio'''
    # # print X.shape
    # X[:,-2] /= (X[:,-2] + X[:, -1])
    # X = X[:, :-1]
    # X[:, -1][~np.isfinite(X[:, -1])] = 0

    # min_max_scaler = preprocessing.MinMaxScaler()
    # X = min_max_scaler.fit_transform(X)

    X = preprocessing.scale(X)

    print X.shape, y.shape
    # Z = np.append(X, y.reshape((len(y), 1)), axis=1)
    # df = pd.DataFrame(Z, columns=select_f + ['label'])
    # affair_mod = logit("label ~ " + '+'.join(select_f[:-1]), df).fit()
    # print(affair_mod.summary())
    # df.to_csv('scaling-clsuter-feature.csv', index=False)

    print X.shape
    plu.plot_config()
    ax = plt.gca()
    ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))

    mean_fpr, mean_tpr, mean_auc = cross_val_roc(X[:, 0:12], y)
    ax.plot(mean_fpr[0:100:5],
            mean_tpr[0:100:5],
            'r--^',
            label='Soc. (AUC = %0.2f)' % mean_auc,
            lw=3,
            ms=10)
    mean_fpr, mean_tpr, mean_auc = cross_val_roc(X[:, 12:22], y)
    ax.plot(mean_fpr[0:100:5],
            mean_tpr[0:100:5],
            'g--d',
            label='Lin. (AUC = %0.2f)' % mean_auc,
            lw=3,
            ms=10)

    mean_fpr, mean_tpr, mean_auc = cross_val_roc(X, y)
    ax.plot(mean_fpr[0:100:5],
            mean_tpr[0:100:5],
            'b--o',
            label='All. (AUC = %0.2f)' % mean_auc,
            lw=3,
            ms=10)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(loc="lower right")
    ax.grid(True)
    plt.show()

    data = []
    result = svm_cv(X[:, 0:12], y)
    for i, v in enumerate(result):
        data.append(['Social Activities', i, v])
    result = svm_cv(X[:, 12:22], y)
    for i, v in enumerate(result):
        data.append(['Linguistic Constructs', i, v])
    result = svm_cv(X, y)
    for i, v in enumerate(result):
        data.append(['All', i, v])
    df = pd.DataFrame(data, columns=['Feature', 'Metric', 'Value'])
    plu.plot_config()
    g = sns.factorplot(x="Metric",
                       y="Value",
                       hue="Feature",
                       data=df,
                       kind="bar",
                       legend=False,
                       palette={
                           "Social Activities": "#e9a3c9",
                           "Linguistic Constructs": "#91bfdb",
                           'All': '#a1d76a'
                       })
    g.set_xticklabels(["Accuracy", "Micro-F1", 'Macro-F1'])
    g.set_ylabels('Index')
    g.set_xlabels('Metric')
    annots = df['Value']
    print annots
    hatches = ['/', '/', '/', '', '', '', '\\', '\\', '\\']

    ax = g.ax  #annotate axis = seaborn axis
    for i, p in enumerate(ax.patches):
        ax.annotate("%.2f" % (annots[i]),
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center',
                    va='center',
                    fontsize=25,
                    color='black',
                    rotation=0,
                    xytext=(0, 20),
                    textcoords='offset points')
        p.set_hatch(hatches[i])
    plt.legend(bbox_to_anchor=(1, 1.2), ncol=6)
    plt.ylim(0.5, 1)
    plt.show()
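cross_val_roc() and svm_cv() are project helpers. Given that the plots sample mean_fpr[0:100:5], cross_val_roc plausibly interpolates each fold's ROC curve onto a common 100-point FPR grid and averages, as in the standard scikit-learn cross-validated ROC recipe (this sketch is an assumption, not the project's code; it expects binary labels):

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn.svm import SVC

def cross_val_roc(X, y, n_splits=10):
    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    for train, test in StratifiedKFold(n_splits=n_splits).split(X, y):
        clf = SVC(kernel='linear', probability=True).fit(X[train], y[train])
        scores = clf.predict_proba(X[test])[:, 1]
        fpr, tpr, _ = roc_curve(y[test], scores)
        tprs.append(np.interp(mean_fpr, fpr, tpr))  # resample onto the common grid
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[0], mean_tpr[-1] = 0.0, 1.0
    return mean_fpr, mean_tpr, auc(mean_fpr, mean_tpr)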
Example #35
def user_profiles(dbname, comname, userfile='data/actor.uid'):
    # # get profile info for regression
    uids = pickle.load(open(userfile))
    print len(uids)
    com = dbt.db_connect_col(dbname, comname)
    newcom = dbt.db_connect_col(dbname, 'pro_mention_miss_com')

    # newcom.create_index("id", unique=True)
    # # Collect miss data
    # missuids, taguids = [], []
    # for uid in uids:
    #     user = com.find_one({'id': int(uid)})
    #     if user is None:
    #         missuids.append(int(uid))
    #     else:
    #         taguids.append(int(uid))
    # list_size = len(missuids)
    # print '%d users to process' %list_size
    # length = int(math.ceil(list_size/100.0))
    # for index in xrange(length):
    #     index_begin = index*100
    #     index_end = min(list_size, index_begin+100)
    #     userlook.lookup_user_list(missuids[index_begin:index_end], newcom, 1, 'N')

    # # Collect tweets for missing users
    # converstream = dbt.db_connect_col(dbname, 'pro_mention_timeline')
    # most_recenty = converstream.find().sort([('id', -1)]).limit(1)
    # oldest = converstream.find().sort([('id', 1)]).limit(1)
    # max_id = most_recenty[0]['id']
    # since_id = oldest[0]['id']
    # print most_recenty[0]
    # print oldest[0]
    # com = dbt.db_connect_col(dbname, 'pro_mention_miss_com')
    # timeline = dbt.db_connect_col(dbname, 'pro_mention_miss_timeline')

    # com.create_index([('timeline_scraped_times', pymongo.ASCENDING)])
    # timeline.create_index([('user.id', pymongo.ASCENDING),
    #                       ('id', pymongo.DESCENDING)])
    # timeline.create_index([('id', pymongo.ASCENDING)], unique=True)

    # print datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "\t" + 'Connect Twitter.com'
    # timelines.retrieve_timeline(com, timeline, max_id)
    # print datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'finish timeline for sample users'

    data = []
    fields = iot.read_fields()
    miss_count = 0
    print fields
    for uid in uids:
        user = com.find_one({'id': int(uid)})
        if user is not None:
            row = iot.get_fields_one_doc(user, fields)
            data.append(row)
        else:
            user = newcom.find_one({'id': int(uid)})
            if user is not None:
                row = iot.get_fields_one_doc(user, fields)
                data.append(row)
            else:
                miss_count += 1
    print miss_count, miss_count * 1.0 / len(uids)
    df = pd.DataFrame(data=data, columns=['uid', 'posemo', 'negemo', 'senti'])
    df.to_csv('data/emotions.csv')
Example #36
def data_split(dbname='TwitterProAna', colname='tweets'):
    # # https://stackoverflow.com/questions/8136652/query-mongodb-on-month-day-year-of-a-datetime
    # # Label tweets with dates
    # tweets = dbt.db_connect_col(dbname, colname)
    # # basedate = datetime(1970, 1, 1)
    # # tweets.create_index([('date_week', pymongo.ASCENDING)])
    # # for tweet in tweets.find({}, no_cursor_timeout=True):
    # #     creat = tweet['created_at']
    # #     detal = creat - basedate
    # #     datestr = detal.days // 7 + 1
    # #     tweets.update_one({'id': tweet['id']}, {'$set': {"date_week": datestr}}, upsert=False)
    #
    # # # Indexing tweets with dates
    # date_index = {}
    # for tweet in tweets.find({}, ['id', 'date_week'], no_cursor_timeout=True):
    #     tid, date = tweet['id'], tweet['date_week']
    #     tlist = date_index.get(date, [])
    #     tlist.append(tid)
    #     date_index[date] = tlist
    # pickle.dump(date_index, open('date_tid_list_week.pick', 'w'))
    #
    # # Bunch with tweets in give dates to produce LIWC results
    # # tweets = dbt.db_connect_col(dbname, colname)
    # # date_index = pickle.load(open('date_tid_list_week.pick', 'r'))
    # timeseries = dbt.db_connect_col(dbname, 'weekseries')
    # for key in date_index.keys():
    #     tlist = date_index[key]
    #     textmass = ''
    #     for tid in tlist:
    #         tweet = tweets.find_one({'id': tid})
    #         text = tweet['text'].encode('utf8')
    #         # replace RT, @, # and Http://
    #         match = rtgrex.search(text)
    #         if match is None:
    #             text = mgrex.sub('', text)
    #             text = hgrex.sub('', text)
    #             text = ugrex.sub('', text)
    #             text = text.strip()
    #             if not(text.endswith('.') or text.endswith('?') or text.endswith('!')):
    #                 text += '.'
    #             textmass += " " + text.lower()
    #     words = textmass.split()
    #     # Any text with fewer than 50 words should be looked at with a certain degree of skepticism.
    #     if len(words) > 50:
    #         liwc_result = liwc.summarize_document(' '.join(words))
    #         timeseries.insert({'date': key, 'liwc':liwc_result})

    timeseries = dbt.db_connect_col(dbname, 'weekseries')
    fields = iot.read_fields()
    fields_trim = [f.replace('liwc_anal.result.', '') for f in fields]
    fields = [f.replace('_anal.result', '') for f in fields]

    print len(fields)
    data = []
    basedate = datetime(1970, 1, 1)
    for entry in timeseries.find():
        time = entry['date']
        # date = datetime.strptime(time, '%Y-%m')
        # date = datetime.date(year=int(time[0]), month=int(time[1]))
        # detal = creat - basedate
    # #     datestr = detal.days // 7 + 1
        days = (time - 1) * 7
        date = basedate + timedelta(days=days)  # assumes 'from datetime import datetime, timedelta'; datetime.timedelta would raise AttributeError here
        features = iot.get_fields_one_doc(entry, fields)
        data.append([date] + features)
    df = pd.DataFrame(data=data, columns=['date'] + fields_trim)
    df.to_csv('ian-liwc-tweets-week.csv')
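The date_week index counts seven-day blocks since the Unix epoch (days // 7 + 1 in the commented labeling pass), so the inverse mapping used above is basedate + timedelta(days=(week - 1) * 7). A minimal round-trip check (sketch):

from datetime import datetime, timedelta

basedate = datetime(1970, 1, 1)

def week_index(d):
    # Week number of a datetime, counting from the Unix epoch (week 1 starts 1970-01-01).
    return (d - basedate).days // 7 + 1

def week_start(index):
    # First day of the given week number.
    return basedate + timedelta(days=(index - 1) * 7)

assert week_index(week_start(2500)) == 2500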