예제 #1
0
def plot_rfecvs(rfecvs, labels):
    """Plot cross-validation score against number of features.

    One curve is drawn per selector, together with a vertical line and an
    arrow annotation at the selector's chosen number of features.  The figure
    is written to ``refcv.pdf`` and then displayed.

    Args:
        rfecvs: Sequence of fitted selectors exposing ``grid_scores_`` and
            ``n_features_`` (presumably scikit-learn ``RFECV`` objects --
            confirm against the caller).
        labels: Per-selector names.  Currently unused because the ``label=``
            argument of the curves is disabled below.
    """
    plu.plot_config()
    markers = itertools.cycle((',', 'x', 'o', '.', '*'))
    plt.xlabel("#Features")
    plt.ylabel("Cross validation ROC_AUC")
    for i, rfecv in enumerate(rfecvs):
        scores = rfecv.grid_scores_
        best_n = rfecv.n_features_
        # NOTE(review): indexing with best_n - 1 assumes scores[k] belongs to
        # k + 1 features -- confirm the selectors are fitted with step 1.
        best_score = scores[best_n - 1]

        plt.plot(range(1, len(scores) + 1), scores,
                 # label=labels[i],
                 marker=next(markers), lw=2)
        plt.axvline(best_n, linestyle='dashdot', lw=4)
        # Each annotation is pushed 30 points further down so that the
        # texts of successive selectors do not overlap.
        plt.annotate('Best: (' + str(best_n) + ', ' + str(round(best_score * 100, 2)) + '%)',
                     xy=(best_n, best_score), xycoords='data',
                     xytext=(-30, -30 * (i + 1)), textcoords='offset points',
                     fontsize=20, arrowprops=dict(arrowstyle="->"))
    plt.legend(loc="best")
    plt.grid(True)
    # Save before show(): once a blocking show() returns the figure may
    # already be torn down, and savefig() would then write an empty page.
    plt.savefig('refcv.pdf')
    plt.show()
    plt.clf()
예제 #2
0
파일: activity.py 프로젝트: wtgme/ohsn
def lifetime(dbname, comname, timename):
    """Plot the distribution of account lifetimes in days.

    For every user in ``comname`` whose ``timeline_count`` is positive, the
    lifetime is the number of days from the account's ``created_at`` date to
    the ``created_at`` date of the user's tweet with the greatest ``id`` in
    ``timename``, plus one.  The lifetimes are shown as a density plot with
    the mean marked and as a box plot below it.

    Args:
        dbname: Database name passed to ``dbt.db_connect_no_auth``.
        comname: Name of the collection holding the user documents.
        timename: Name of the collection holding the tweet documents.
    """
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    timeline = db[timename]
    date_format = '%a %b %d %H:%M:%S +0000 %Y'
    during = []
    for user in com.find({"timeline_count": {'$gt': 0}}):
        # find_one returns None when the user has no tweet in the timeline
        # collection, instead of raising IndexError like cursor[0] does, and
        # it leaves no open server-side cursor behind.
        newtweet = timeline.find_one({'user.id': user['id']}, sort=[('id', -1)])
        if newtweet is None:
            continue
        last = datetime.strptime(newtweet['created_at'], date_format)
        account = datetime.strptime(user['created_at'], date_format)
        days = (last.date() - account.date()).days + 1
        print('%s %s %s %s' % (user['id'], last, account, days))
        during.append(days)

    mean_days = np.mean(during)
    pt.plot_config()
    plt.figure(1)

    # Upper panel: density of the lifetimes with the mean marked.
    plt.subplot(211)
    pt.sns.distplot(during)
    print('%s %s' % (mean_days, np.std(during)))
    plt.axvline(mean_days, linestyle='--', color='k',
                label='Mean')
    plt.ylabel('PDF')
    plt.xlim(0, 2700)
    plt.legend()

    # Lower panel: box plot on the same x range.
    plt.subplot(212)
    pt.sns.boxplot(x=during)
    plt.ylabel('Quartile')
    plt.xlabel('Day')
    plt.xlim(0, 2700)
    plt.show()
예제 #3
0
def lifetime(dbname, comname, timename):
    """Plot the distribution of account lifetimes in days.

    For every user in ``comname`` whose ``timeline_count`` is positive, the
    lifetime is the number of days from the account's ``created_at`` date to
    the ``created_at`` date of the user's tweet with the greatest ``id`` in
    ``timename``, plus one.  The lifetimes are shown as a density plot with
    the mean marked and as a box plot below it.

    Args:
        dbname: Database name passed to ``dbt.db_connect_no_auth``.
        comname: Name of the collection holding the user documents.
        timename: Name of the collection holding the tweet documents.
    """
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    timeline = db[timename]
    date_format = '%a %b %d %H:%M:%S +0000 %Y'
    during = []
    for user in com.find({"timeline_count": {'$gt': 0}}):
        # find_one returns None when the user has no tweet in the timeline
        # collection, instead of raising IndexError like cursor[0] does, and
        # it leaves no open server-side cursor behind.
        newtweet = timeline.find_one({'user.id': user['id']}, sort=[('id', -1)])
        if newtweet is None:
            continue
        last = datetime.strptime(newtweet['created_at'], date_format)
        account = datetime.strptime(user['created_at'], date_format)
        days = (last.date() - account.date()).days + 1
        print('%s %s %s %s' % (user['id'], last, account, days))
        during.append(days)

    mean_days = np.mean(during)
    pt.plot_config()
    plt.figure(1)

    # Upper panel: density of the lifetimes with the mean marked.
    plt.subplot(211)
    pt.sns.distplot(during)
    print('%s %s' % (mean_days, np.std(during)))
    plt.axvline(mean_days, linestyle='--', color='k',
                label='Mean')
    plt.ylabel('PDF')
    plt.xlim(0, 2700)
    plt.legend()

    # Lower panel: box plot on the same x range.
    plt.subplot(212)
    pt.sns.boxplot(x=during)
    plt.ylabel('Quartile')
    plt.xlabel('Day')
    plt.xlim(0, 2700)
    plt.show()
예제 #4
0
def plot_distribution(dbname='fed', comname='scom'):
    """Plot, per field, the value densities of retweeted versus liked tweets.

    For every field returned by ``iot.read_fields()`` the ``liwc_anal`` part
    of the field name is replaced by ``retweet_liwc`` and ``like_liwc``, the
    two value sets are fetched, and their densities are drawn on one figure
    that is saved as ``data/<last field component>.pdf``.

    Args:
        dbname: Database name passed to ``iot.get_values_one_field``.
        comname: Collection name passed to ``iot.get_values_one_field``.
    """
    for field in iot.read_fields():
        short_name = field.split('.')[-1]
        retwets = iot.get_values_one_field(
            dbname, comname, field.replace('liwc_anal', 'retweet_liwc'))
        likes = iot.get_values_one_field(
            dbname, comname, field.replace('liwc_anal', 'like_liwc'))
        pt.plot_config()
        # Raw strings keep the backslashes of the mathtext commands (\mu,
        # \pm) literal without relying on invalid escape sequences.
        sns.distplot(retwets,
                     hist=False,
                     kde_kws={"color": "r", "lw": 2, "marker": 'o'},
                     label=r'RT ($\mu=%0.2f \pm %0.2f$)' %
                     (np.mean(retwets), np.std(retwets)))
        sns.distplot(likes,
                     hist=False,
                     kde_kws={"color": "g", "lw": 2, "marker": 's'},
                     label=r'Like ($\mu=%0.2f \pm %0.2f$)' %
                     (np.mean(likes), np.std(likes)))
        plt.legend(loc="best")
        plt.xlabel(short_name)
        plt.ylabel('P')
        plt.savefig('data/' + short_name + '.pdf', bbox_inches='tight')
        # Clear the figure so the next field starts from an empty canvas.
        plt.clf()
예제 #5
0
def distribution_change(dbname, colname):
    """Compare LIWC feature values of pro-recovery and pro-ED users.

    Two pickled collections of user ids are loaded from ``data/``.  For each
    listed feature the values of both groups are fetched, their mean (std)
    and the two-sample Kolmogorov-Smirnov statistic (p-value) are printed,
    and finally all values are shown as grouped box plots.

    Args:
        dbname: Database name passed to ``iot.get_values_one_field``.
        colname: Collection name passed to ``iot.get_values_one_field``.
    """
    # Context managers close the pickle files deterministically.  The mode
    # is left as 'r' to match however these files are written elsewhere.
    with open('data/pro-recovery.pick', 'r') as handle:
        rec_users1 = pickle.load(handle)
    with open('data/pro_ed.pick', 'r') as handle:
        pro_ed = pickle.load(handle)
    print(len(rec_users1))
    print(len(pro_ed))
    features = [
        'liwc_anal.result.i',
        'liwc_anal.result.we',
        'liwc_anal.result.bio',
        'liwc_anal.result.body',
        'liwc_anal.result.health',
        'liwc_anal.result.posemo',
        'liwc_anal.result.negemo',
        'liwc_anal.result.ingest',
        'liwc_anal.result.anx',
        'liwc_anal.result.anger',
        'liwc_anal.result.sad',
    ]
    names = ['I', 'We', 'Bio', 'Body', 'Health', 'Posemo', 'Negemo', 'Ingest', 'Anx', 'Anger', 'Sad']
    pltt.plot_config()
    # Collect the per-group frames and concatenate once at the end instead of
    # growing a DataFrame with repeated append() calls.
    frames = []
    for name, feature in zip(names, features):
        old_values = iot.get_values_one_field(dbname, colname, feature, {'id': {'$in': rec_users1}})
        frames.append(pd.DataFrame({'Feature': name, 'Group': 'Pro-Recovery', 'Values': old_values}))
        new_values = iot.get_values_one_field(dbname, colname, feature, {'id': {'$in': pro_ed}})
        frames.append(pd.DataFrame({'Feature': name, 'Group': 'Pro-ED', 'Values': new_values}))
        d, p = stats.ks_2samp(old_values, new_values)
        print(name + ', %.3f(%.3f), %.3f(%.3f), %.3f(%.3f)' % (
            np.mean(old_values), np.std(old_values),
            np.mean(new_values), np.std(new_values), d, p))
    df = pd.concat(frames)
    sns.set(style="whitegrid", palette="pastel", color_codes=True)
    sns.boxplot(x="Feature", y="Values", hue="Group", data=df, palette="PRGn")
    sns.despine(offset=10, trim=True)
    plt.show()
예제 #6
0
파일: edrelatedcom.py 프로젝트: wtgme/ohsn
def distribution_change(dbname, colname):
    """Compare LIWC feature values of pro-recovery and pro-ED users.

    Two pickled collections of user ids are loaded from ``data/``.  For each
    listed feature the values of both groups are fetched, their mean (std)
    and the two-sample Kolmogorov-Smirnov statistic (p-value) are printed,
    and finally all values are shown as grouped box plots.

    Args:
        dbname: Database name passed to ``iot.get_values_one_field``.
        colname: Collection name passed to ``iot.get_values_one_field``.
    """
    # Context managers close the pickle files deterministically.  The mode
    # is left as 'r' to match however these files are written elsewhere.
    with open('data/pro-recovery.pick', 'r') as handle:
        rec_users1 = pickle.load(handle)
    with open('data/pro_ed.pick', 'r') as handle:
        pro_ed = pickle.load(handle)
    print(len(rec_users1))
    print(len(pro_ed))
    features = [
        'liwc_anal.result.i',
        'liwc_anal.result.we',
        'liwc_anal.result.bio',
        'liwc_anal.result.body',
        'liwc_anal.result.health',
        'liwc_anal.result.posemo',
        'liwc_anal.result.negemo',
        'liwc_anal.result.ingest',
        'liwc_anal.result.anx',
        'liwc_anal.result.anger',
        'liwc_anal.result.sad',
    ]
    names = ['I', 'We', 'Bio', 'Body', 'Health', 'Posemo', 'Negemo', 'Ingest', 'Anx', 'Anger', 'Sad']
    pltt.plot_config()
    # Collect the per-group frames and concatenate once at the end instead of
    # growing a DataFrame with repeated append() calls.
    frames = []
    for name, feature in zip(names, features):
        old_values = iot.get_values_one_field(dbname, colname, feature, {'id': {'$in': rec_users1}})
        frames.append(pd.DataFrame({'Feature': name, 'Group': 'Pro-Recovery', 'Values': old_values}))
        new_values = iot.get_values_one_field(dbname, colname, feature, {'id': {'$in': pro_ed}})
        frames.append(pd.DataFrame({'Feature': name, 'Group': 'Pro-ED', 'Values': new_values}))
        d, p = stats.ks_2samp(old_values, new_values)
        print(name + ', %.3f(%.3f), %.3f(%.3f), %.3f(%.3f)' % (
            np.mean(old_values), np.std(old_values),
            np.mean(new_values), np.std(new_values), d, p))
    df = pd.concat(frames)
    sns.set(style="whitegrid", palette="pastel", color_codes=True)
    sns.boxplot(x="Feature", y="Values", hue="Group", data=df, palette="PRGn")
    sns.despine(offset=10, trim=True)
    plt.show()
예제 #7
0
def plot_boxplot(filename='user-kmeans-hashtag.csv'):
    """Show a box plot of ``silhouette_avg`` grouped by ``cluster``.

    Args:
        filename: CSV file read with its first column as the index; it must
            provide the ``cluster`` and ``silhouette_avg`` columns.
    """
    import ohsn.util.plot_util as plu
    plu.plot_config()
    silhouettes = pd.read_csv(filename, index_col=0)
    box_options = dict(x="cluster", y="silhouette_avg", color="lightblue")
    sns.boxplot(data=silhouettes, **box_options)
    sns.despine(offset=10, trim=True)
    # Axis captions and a fixed y-range for the silhouette values.
    plt.xlabel('K')
    plt.ylabel('Average Silhouette')
    plt.ylim(0.38, 0.81)
    plt.show()
예제 #8
0
파일: com_det.py 프로젝트: wtgme/ohsn
def communtiy_feature(dbname, typename):
    """Compare LIWC feature distributions across the large communities.

    The network is loaded, communities are detected and cached in a pickle
    under ``data/``, and for every listed feature one density curve per
    community holding more than 10% of the vertices is drawn.  Each figure is
    saved as ``<feature><typename>_com.pdf``.

    Args:
        dbname: Database name passed to ``ntt.loadnet`` and
            ``iot.get_values_one_field``.
        typename: Network type passed to ``ntt.loadnet``; also part of the
            output file names.
    """
    fg = ntt.loadnet(dbname, typename)

    cache_path = 'data/' + dbname + typename + 'com.pick'
    fcoms = gt.fast_community(fg)
    # Close the file after writing so the data is flushed before it is read
    # back; binary mode is used on both sides of the round trip.
    with open(cache_path, 'wb') as handle:
        pickle.dump(fcoms, handle)
    with open(cache_path, 'rb') as handle:
        fcoms = pickle.load(handle)
    fclus = fcoms.as_clustering()
    gt.summary(fclus)

    features = [
        'liwc_anal.result.i',
        'liwc_anal.result.we',
        'liwc_anal.result.bio',
        'liwc_anal.result.body',
        'liwc_anal.result.health',
        'liwc_anal.result.posemo',
        'liwc_anal.result.negemo',
        'liwc_anal.result.ingest',
        'liwc_anal.result.anx',
        'liwc_anal.result.anger',
        'liwc_anal.result.sad',
    ]
    therh = 0.1 * fg.vcount()
    # Cluster membership does not depend on the feature, so the user-id list
    # of every sufficiently large cluster is built once up front.
    big_clusters = [
        list(set(int(fg.vs[v]['name']) for v in clu))
        for clu in fclus if len(clu) > therh
    ]
    for feature in features:
        data = [
            iot.get_values_one_field(dbname, 'com', feature, {'id': {'$in': ulist}})
            for ulist in big_clusters
        ]

        plot.plot_config()
        for i, values in enumerate(data):
            # Legend entry: cluster position and number of values.
            sns.distplot(values, hist=False, label=str(i) + ':' + str(len(values)))
        plt.xlabel(feature)
        plt.ylabel('PDF')
        plt.savefig(feature + typename + '_com.pdf')
        plt.clf()
예제 #9
0
파일: com_det.py 프로젝트: abiraja2004/ohsn
def communtiy_feature(dbname, typename):
    """Compare LIWC feature distributions across the large communities.

    The network is loaded, communities are detected and cached in a pickle
    under ``data/``, and for every listed feature one density curve per
    community holding more than 10% of the vertices is drawn.  Each figure is
    saved as ``<feature><typename>_com.pdf``.

    Args:
        dbname: Database name passed to ``ntt.loadnet`` and
            ``iot.get_values_one_field``.
        typename: Network type passed to ``ntt.loadnet``; also part of the
            output file names.
    """
    fg = ntt.loadnet(dbname, typename)

    cache_path = 'data/' + dbname + typename + 'com.pick'
    fcoms = gt.fast_community(fg)
    # Close the file after writing so the data is flushed before it is read
    # back; binary mode is used on both sides of the round trip.
    with open(cache_path, 'wb') as handle:
        pickle.dump(fcoms, handle)
    with open(cache_path, 'rb') as handle:
        fcoms = pickle.load(handle)
    fclus = fcoms.as_clustering()
    gt.summary(fclus)

    features = [
        'liwc_anal.result.i', 'liwc_anal.result.we', 'liwc_anal.result.bio',
        'liwc_anal.result.body', 'liwc_anal.result.health',
        'liwc_anal.result.posemo', 'liwc_anal.result.negemo',
        'liwc_anal.result.ingest', 'liwc_anal.result.anx',
        'liwc_anal.result.anger', 'liwc_anal.result.sad'
    ]
    therh = 0.1 * fg.vcount()
    # Cluster membership does not depend on the feature, so the user-id list
    # of every sufficiently large cluster is built once up front.
    big_clusters = [
        list(set(int(fg.vs[v]['name']) for v in clu))
        for clu in fclus if len(clu) > therh
    ]
    for feature in features:
        data = [
            iot.get_values_one_field(dbname, 'com', feature,
                                     {'id': {'$in': ulist}})
            for ulist in big_clusters
        ]

        plot.plot_config()
        for i, values in enumerate(data):
            # Legend entry: cluster position and number of values.
            sns.distplot(values,
                         hist=False,
                         label=str(i) + ':' + str(len(values)))
        plt.xlabel(feature)
        plt.ylabel('PDF')
        plt.savefig(feature + typename + '_com.pdf')
        plt.clf()
예제 #10
0
def profile_change(dbname, colname, timename):
    """Summarise per-user profile changes stored in ``data/df.pick``.

    The pickled DataFrame is loaded, per-day rates are added for followees,
    followers and tweets, the summary statistics are printed, the table is
    written to ``profiles.csv`` and the absolute changes are shown as box
    plots.

    Args:
        dbname: Unused while the rebuild code below stays commented out.
        colname: Unused while the rebuild code below stays commented out.
        timename: Unused while the rebuild code below stays commented out.
    """
    # The commented block documents how data/df.pick was produced.
    # db = dbt.db_connect_no_auth(dbname)
    # com = db[colname]
    # time = db[timename]
    #
    # followee, follower, tweets, users, olddate, newdate, during = [], [], [], [], [], [], []
    # filter = {'liwc_anal.result.i':{'$exists':True}, 'new_liwc_anal.result.i':{'$exists':True}}
    #
    # for user in com.find(filter):
    #     newtweet = time.find({'user.id': user['id']}, no_cursor_timeout=True).sort([('id', -1)]).limit(1)[0]
    #     oldtweet = time.find({'user.id': user['id']}, no_cursor_timeout=True).sort([('id', 1)]).limit(1)[0]
    #     print user['id'], oldtweet['created_at'], newtweet['created_at'], \
    #         (newtweet['created_at'].date() - oldtweet['created_at'].date()).days+1
    #     users.append(user['id'])
    #     olddate.append(oldtweet['created_at'])
    #     newdate.append(newtweet['created_at'])
    #     during.append((newtweet['created_at'].date() - oldtweet['created_at'].date()).days + 1)
    #     follower.append(newtweet['user']['followers_count'] - oldtweet['user']['followers_count'])
    #     followee.append(newtweet['user']['friends_count']- oldtweet['user']['friends_count'])
    #     tweets.append(newtweet['user']['statuses_count']- oldtweet['user']['statuses_count'])
    # df =  pd.DataFrame({'User': users,
    #                     'OldDate': olddate,
    #                     'NewDate': newdate,
    #                     'Follower': follower,
    #                     'Followee': followee,
    #                     'Tweet': tweets,
    #                     'ActiveTime': during})
    # pickle.dump(df, open('data/df.pick', 'w'))

    # The context manager closes the pickle file deterministically.  The
    # mode is left as 'r' to match the writer shown above.
    with open('data/df.pick', 'r') as handle:
        df = pickle.load(handle)
    pt.plot_config()
    df['Followee/Day'] = (df.Followee / df.ActiveTime)
    df['Follower/Day'] = (df.Follower / df.ActiveTime)
    df['Tweet/Day'] = (df.Tweet / df.ActiveTime)
    print(df.describe())
    df.to_csv('profiles.csv')
    sns.boxplot(data=df.loc[:,
                            ['Followee', 'Follower', 'Tweet', 'ActiveTime']])
    # sns.boxplot(data=df.loc[:, ['Followee/Day', 'Follower/Day', 'Tweet/Day']])
    plt.ylim(-300, 400)
    plt.show()
예제 #11
0
파일: split_data.py 프로젝트: wtgme/ohsn
def profile_change(dbname, colname, timename):
    """Summarise per-user profile changes stored in ``data/df.pick``.

    The pickled DataFrame is loaded, per-day rates are added for followees,
    followers and tweets, the summary statistics are printed, the table is
    written to ``profiles.csv`` and the absolute changes are shown as box
    plots.

    Args:
        dbname: Unused while the rebuild code below stays commented out.
        colname: Unused while the rebuild code below stays commented out.
        timename: Unused while the rebuild code below stays commented out.
    """
    # The commented block documents how data/df.pick was produced.
    # db = dbt.db_connect_no_auth(dbname)
    # com = db[colname]
    # time = db[timename]
    #
    # followee, follower, tweets, users, olddate, newdate, during = [], [], [], [], [], [], []
    # filter = {'liwc_anal.result.i':{'$exists':True}, 'new_liwc_anal.result.i':{'$exists':True}}
    #
    # for user in com.find(filter):
    #     newtweet = time.find({'user.id': user['id']}, no_cursor_timeout=True).sort([('id', -1)]).limit(1)[0]
    #     oldtweet = time.find({'user.id': user['id']}, no_cursor_timeout=True).sort([('id', 1)]).limit(1)[0]
    #     print user['id'], oldtweet['created_at'], newtweet['created_at'], \
    #         (newtweet['created_at'].date() - oldtweet['created_at'].date()).days+1
    #     users.append(user['id'])
    #     olddate.append(oldtweet['created_at'])
    #     newdate.append(newtweet['created_at'])
    #     during.append((newtweet['created_at'].date() - oldtweet['created_at'].date()).days + 1)
    #     follower.append(newtweet['user']['followers_count'] - oldtweet['user']['followers_count'])
    #     followee.append(newtweet['user']['friends_count']- oldtweet['user']['friends_count'])
    #     tweets.append(newtweet['user']['statuses_count']- oldtweet['user']['statuses_count'])
    # df =  pd.DataFrame({'User': users,
    #                     'OldDate': olddate,
    #                     'NewDate': newdate,
    #                     'Follower': follower,
    #                     'Followee': followee,
    #                     'Tweet': tweets,
    #                     'ActiveTime': during})
    # pickle.dump(df, open('data/df.pick', 'w'))

    # The context manager closes the pickle file deterministically.  The
    # mode is left as 'r' to match the writer shown above.
    with open('data/df.pick', 'r') as handle:
        df = pickle.load(handle)
    pt.plot_config()
    df['Followee/Day'] = (df.Followee / df.ActiveTime)
    df['Follower/Day'] = (df.Follower / df.ActiveTime)
    df['Tweet/Day'] = (df.Tweet / df.ActiveTime)
    print(df.describe())
    df.to_csv('profiles.csv')
    sns.boxplot(data=df.loc[:, ['Followee', 'Follower', 'Tweet', 'ActiveTime']])
    # sns.boxplot(data=df.loc[:, ['Followee/Day', 'Follower/Day', 'Tweet/Day']])
    plt.ylim(-300, 400)
    plt.show()
예제 #12
0
def _monthly_counts(dates, column, months):
    """Count how many of *dates* fall into each calendar month of *months*.

    Args:
        dates: Datetime values accepted by ``pd.DataFrame`` as a single
            column.
        column: Name given to that column.
        months: Iterable of timestamps exposing ``year`` and ``month``.

    Returns:
        A list with one float per element of *months*; 0.0 where no date
        falls into that month.
    """
    frame = pd.DataFrame(dates, columns=[column])
    frame['year'] = frame[column].dt.year
    frame['month'] = frame[column].dt.month
    # Count the date column explicitly so the result does not depend on
    # which other columns survive the groupby.
    per_month = frame.groupby(['year', 'month'])[column].count().to_dict()
    return [float(per_month.get((stamp.year, stamp.month), 0))
            for stamp in months]


def compare_post_time():
    """Plot and correlate monthly tweet counts of the two user groups.

    The pickled pair ``(prec, ped)`` of tweet dates is loaded from
    ``tweets_dates.pick``.  Tweets are counted per month for both groups,
    drawn as two line series, and the Kendall tau and Spearman rank
    correlations between the two count series are printed.

    Returns:
        Tuple ``(rec_cs, ped_cs)`` of the monthly counts (lists of floats)
        for the pro-recovery and the pro-ED dates.
    """
    # prec = tsplit.timeline('fed', 'prorec_tag_refine')
    # ped = tsplit.timeline('fed', 'proed_tag_refine')
    # pickle.dump((prec, ped), open('tweets_dates.pick', 'w'))
    with open('tweets_dates.pick', 'r') as handle:
        prec, ped = pickle.load(handle)
    print('%s %s' % (len(prec), len(ped)))

    # Common month index spanning both groups.
    mind = min(min(prec), min(ped))
    maxd = max(max(prec), max(ped))
    print('%s %s' % (mind, maxd))
    # NOTE(review): freq='M' yields month-end stamps, so a trailing partial
    # month (maxd before its month end) gets no entry -- confirm intended.
    indeces = pd.date_range(mind, maxd, freq='M')

    plu.plot_config()
    _fig, ax = plt.subplots()

    rec_cs = _monthly_counts(prec, 'Recovery', indeces)
    rec_s = pd.Series(rec_cs, index=indeces, name='Pro-recovery')
    rec_s.plot(kind="line", marker='s', ax=ax)

    ped_cs = _monthly_counts(ped, 'Pro-ED', indeces)
    ped_s = pd.Series(ped_cs, index=indeces, name='Pro-ED')
    ped_s.plot(kind="line", marker='o', ax=ax)
    # One legend call after both series are drawn covers both entries.
    ax.legend(loc='best')

    ax.set_ylabel('Number of tweets')
    ax.set_xlabel('Date')
    print('%s %s %s' % (len(rec_cs), len(ped_cs), len(indeces)))
    s, p = stats.kendalltau(rec_cs, ped_cs)
    print('%s %s' % (s, p))
    print('kendalltau test: %.2f, p-value: %.5f' % (s, p))
    s, p = stats.spearmanr(rec_cs, ped_cs)
    print('%s %s' % (s, p))
    print('spearmanr test: %.2f, p-value: %.5f' % (s, p))
    plt.show()

    return rec_cs, ped_cs
예제 #13
0
def roc_plot(datafile, savename, pca_num=10):
    """Draw the mean cross-validated ROC curve of each feature group.

    The data is loaded with ``load_scale_data``; one curve is drawn for the
    columns 0-5, 6-16, 17 onwards and for all columns, and the figure is
    saved to ``savename``.

    Args:
        datafile: Passed to ``load_scale_data``.
        savename: Output path passed to ``plt.savefig``.
        pca_num: Unused; the PCA variant of the plot is disabled.
    """
    X, y = load_scale_data(datafile)
    print(X.shape)
    plu.plot_config()
    axes = plt.gca()
    # Grey diagonal from (0, 0) to (1, 1) as reference line.
    axes.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))

    # (feature columns, line format, legend template), drawn in this order.
    curves = (
        (X[:, 0:6], 'r--^', 'Soc. (AUC = %0.2f)'),   # social status features
        (X[:, 6:17], 'g--d', 'Beh. (AUC = %0.2f)'),  # behavioral pattern features
        (X[:, 17:], 'b--o', 'Psy. (AUC = %0.2f)'),   # LIWC features
        (X, 'k--*', 'All. (AUC = %0.2f)'),           # all features
    )
    for columns, line_format, template in curves:
        mean_fpr, mean_tpr, mean_auc = cross_val_roc(columns, y)
        # Only every fifth of the first 100 curve points is drawn.
        axes.plot(mean_fpr[0:100:5],
                  mean_tpr[0:100:5],
                  line_format,
                  label=template % mean_auc,
                  lw=3,
                  ms=10)

    axes.set_xlim([0, 1])
    axes.set_ylim([0, 1])
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.legend(loc="lower right")
    axes.grid(True)
    plt.savefig(savename)
    plt.clf()
예제 #14
0
def roc_plot_feature(datafile):
    """Plot ROC curves and SVM scores for a hand-picked feature subset.

    The selected columns are located by the last component of the field
    names from ``iot.read_fields()``, standardised with
    ``preprocessing.scale`` and split into the first 12 and the following 10
    columns.  First the mean ROC curves are shown, then a bar chart of the
    values returned by ``svm_cv`` for each group.

    Args:
        datafile: Passed to ``load_scale_data``.
    """
    X, y = load_scale_data(datafile)
    short_names = [name.split('.')[-1] for name in iot.read_fields()]
    print(len(short_names))
    select_f = [
        'friend_count', 'status_count', 'follower_count',
        'friends_day', 'statuses_day', 'followers_day',
        'retweet_pro', 'dmention_pro', 'reply_pro',
        # 'hashtag_pro',
        # 'url_pro',
        'retweet_div', 'mention_div', 'reply_div',
        'i', 'we', 'swear', 'negate', 'body',
        'health', 'ingest', 'social', 'posemo', 'negemo',
    ]

    positions = [short_names.index(name) for name in select_f]
    print(positions)
    X = preprocessing.scale(X[:, positions])

    print('%s %s' % (X.shape, y.shape))
    print(X.shape)

    # ---- ROC curves ------------------------------------------------------
    plu.plot_config()
    ax = plt.gca()
    # Grey diagonal from (0, 0) to (1, 1) as reference line.
    ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))

    roc_groups = (
        (X[:, 0:12], 'r--^', 'Soc. (AUC = %0.2f)'),
        (X[:, 12:22], 'g--d', 'Lin. (AUC = %0.2f)'),
        (X, 'b--o', 'All. (AUC = %0.2f)'),
    )
    for columns, line_format, template in roc_groups:
        mean_fpr, mean_tpr, mean_auc = cross_val_roc(columns, y)
        # Only every fifth of the first 100 curve points is drawn.
        ax.plot(mean_fpr[0:100:5],
                mean_tpr[0:100:5],
                line_format,
                label=template % mean_auc,
                lw=3,
                ms=10)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(loc="lower right")
    ax.grid(True)
    plt.show()

    # ---- SVM scores as grouped bars ---------------------------------------
    svm_groups = (
        ('Social Activities', X[:, 0:12]),
        ('Linguistic Constructs', X[:, 12:22]),
        ('All', X),
    )
    data = []
    for group_name, columns in svm_groups:
        data.extend([group_name, position, value]
                    for position, value in enumerate(svm_cv(columns, y)))
    df = pd.DataFrame(data, columns=['Feature', 'Metric', 'Value'])
    plu.plot_config()
    bar_colors = {
        "Social Activities": "#e9a3c9",
        "Linguistic Constructs": "#91bfdb",
        'All': '#a1d76a'
    }
    g = sns.factorplot(x="Metric", y="Value", hue="Feature", data=df,
                       kind="bar", legend=False, palette=bar_colors)
    # NOTE(review): presumably svm_cv returns these three scores in this
    # order -- confirm against svm_cv.
    g.set_xticklabels(["Accuracy", "Micro-F1", 'Macro-F1'])
    g.set_ylabels('Index')
    g.set_xlabels('Metric')
    annots = df['Value']
    print(annots)
    hatches = ['/', '/', '/', '', '', '', '\\', '\\', '\\']

    ax = g.ax  # annotate on the seaborn axis
    for i, bar in enumerate(ax.patches):
        # Value caption centred 20 points above the bar.
        top_centre = (bar.get_x() + bar.get_width() / 2., bar.get_height())
        ax.annotate("%.2f" % (annots[i]),
                    top_centre,
                    ha='center',
                    va='center',
                    fontsize=25,
                    color='black',
                    rotation=0,
                    xytext=(0, 20),
                    textcoords='offset points')
        bar.set_hatch(hatches[i])
    plt.legend(bbox_to_anchor=(1, 1.2), ncol=6)
    plt.ylim(0.5, 1)
    plt.show()