Example #1
import os
from argparse import ArgumentParser

import pandas as pd
import matplotlib.pyplot as plt

# project-specific helpers, assumed importable from the surrounding package:
# get_growth_words, get_growth_decline_words_and_params,
# get_logistic_decline_words, get_piecewise_decline_words, plot_time_series


def main():
    parser = ArgumentParser()
    parser.add_argument('--out_dir', default='../../output/results/')
    parser.add_argument('--tf_file', default='../../data/frequency/2013_2016_tf_norm_log.tsv')
    parser.add_argument('--growth_score_file', default='../../data/frequency/growth_scores.tsv')
    args = parser.parse_args()
    out_dir = args.out_dir
    tf_file = args.tf_file
    growth_score_file = args.growth_score_file
    
    ## load data
    tf = pd.read_csv(tf_file, sep='\t', index_col=0)
    growth_params = pd.read_csv(growth_score_file, sep='\t', index_col=0)
    growth_words = get_growth_words()
    decline_words, decline_params = get_growth_decline_words_and_params()
    logistic_decline_words, logistic_params = get_logistic_decline_words()
    piecewise_decline_words, piecewise_params = get_piecewise_decline_words()
    logistic_decline_words = list(set(logistic_decline_words) & set(decline_words) - set(growth_words))
    piecewise_decline_words = list(set(piecewise_decline_words) & set(decline_words) - set(growth_words))
    
    ## sort scores
    growth_scores = growth_params.loc[growth_words, 'spearman'].sort_values(ascending=False)
    decline_logistic_scores = logistic_params.loc[logistic_decline_words, 'R2'].sort_values(ascending=False)
    decline_piecewise_scores = piecewise_params.loc[piecewise_decline_words, 'R2'].sort_values(ascending=False)
    
    ## get example words
    top_k = 5
    example_growth_words = growth_scores.index.tolist()[:top_k]
    example_logistic_words = decline_logistic_scores.index.tolist()[:top_k]
    example_piecewise_words = decline_piecewise_scores.index.tolist()[:top_k]
    
    ## plot!! and write to file
    word_categories = ['growth', 'logistic_decline', 'piecewise_decline']
    word_lists = [example_growth_words, example_logistic_words, example_piecewise_words]
    for word_category, word_list in zip(word_categories, word_lists):
        plot_time_series(tf, sorted(word_list))
        out_file = os.path.join(out_dir, '%s_best_fit.pdf' % (word_category))
        # save to file, then close so the next category starts from a fresh figure
        plt.savefig(out_file)
        plt.close()
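
`plot_time_series` comes from the surrounding project and is not shown here. A minimal sketch of what the call above appears to expect, assuming `tf` is a word-by-month DataFrame of log-normalized frequencies (the body is an illustration, not the project's implementation):

import matplotlib.pyplot as plt

def plot_time_series(tf, words):
    # hypothetical sketch: one line per word, months on the x-axis
    for word in words:
        series = tf.loc[word]
        plt.plot(range(len(series)), series.values, label=word)
    plt.xlabel('month')
    plt.ylabel('log frequency')
    plt.legend()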
Example #2
import os
from argparse import ArgumentParser

import numpy as np
import pandas as pd

# project-specific helpers, assumed importable from the surrounding package:
# get_growth_words, get_growth_decline_words_and_params, predict_LR


def main():
    parser = ArgumentParser()
    parser.add_argument(
        '--tf_file', default='../../data/frequency/2013_2016_tf_norm_log.tsv')
    parser.add_argument(
        '--POS_file',
        default='../../data/frequency/2013_2016_tag_estimates.tsv')
    parser.add_argument('--k', default=1, type=int)
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    tf_file = args.tf_file
    POS_file = args.POS_file
    k = args.k
    out_dir = args.out_dir

    ## load data
    k_range = np.arange(1, k + 1)
    tf = pd.read_csv(tf_file, sep='\t', index_col=0).iloc[:, k_range - 1]
    tf.columns = ['f_%d' % x for x in k_range]
    POS_tags = pd.read_csv(POS_file, sep='\t', index_col=0).iloc[:, 0]
    # convert to dummy vars
    POS_tags = POS_tags.str.get_dummies()
    POS_tag_list = list(POS_tags.columns)
    # combine
    shared_vocab = tf.index.intersection(POS_tags.index)
    data = pd.concat([tf.loc[shared_vocab], POS_tags.loc[shared_vocab]],
                     axis=1)

    ## restrict to success/fail words
    success_words = get_growth_words()
    fail_words, _ = get_growth_decline_words_and_params()
    #     success_words = get_success_words_final()
    #     fail_words = get_fail_words_final()
    fail_words = list(set(fail_words))
    # restrict to shared vocab
    success_words = list(set(success_words) & set(shared_vocab))
    fail_words = list(set(fail_words) & set(shared_vocab))
    change_words = success_words + fail_words
    data = data.loc[change_words, :]
    # add success condition
    y_var = 'success'
    success_set = set(success_words)
    data.loc[:, y_var] = [int(w in success_set) for w in data.index]

    ## organize
    data_sets = [
        data.loc[:, [y_var] + POS_tag_list],  # just POS
        data,  # f+POS
    ]
    data_set_names = ['POS', 'f+POS']
    results = []
    n_folds = 10
    for data_set, data_set_name in zip(data_sets, data_set_names):
        for k_ in k_range:
            # use only the first k_ frequency columns for this window
            # (presumably the intent of the loop over k; POS dummies are static)
            f_cols = ['f_%d' % i for i in range(1, k_ + 1)]
            keep_cols = [c for c in data_set.columns
                         if not c.startswith('f_') or c in f_cols]
            feat_results = predict_LR(data_set.loc[:, keep_cols], y_var, n_folds)
            feat_results.loc[:, 'k'] = k_
            feat_results.loc[:, 'feat_names'] = data_set_name
            results.append(feat_results)
    results = pd.concat(results, ignore_index=True)

    ## write to file!!
    k_range_str = '%d_%d' % (min(k_range), max(k_range))
    out_file = os.path.join(out_dir,
                            'success_%s_window_POS.tsv' % (k_range_str))
    results.to_csv(out_file, sep='\t', index=False)
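
`predict_LR` is another project helper; as called here it takes a feature DataFrame with a binary label column, the label name, and a fold count, and returns one row of results per fold. A minimal sketch under those assumptions, using scikit-learn (my choice for illustration, not necessarily the project's implementation):

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def predict_LR(data, y_var, n_folds):
    # hypothetical sketch: n_folds-fold cross-validated logistic regression,
    # one row of metrics per fold
    X = data.drop(columns=[y_var]).values
    y = data[y_var].values
    rows = []
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=123)
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        model = LogisticRegression(max_iter=1000)
        model.fit(X[train_idx], y[train_idx])
        acc = accuracy_score(y[test_idx], model.predict(X[test_idx]))
        rows.append({'fold': fold, 'accuracy': acc})
    return pd.DataFrame(rows)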
Example #3
import os
from argparse import ArgumentParser

import numpy as np
import pandas as pd

# project-specific helpers, assumed importable from the surrounding package:
# get_growth_words, get_growth_decline_words_and_params, predict_LR


def main():
    parser = ArgumentParser()
    parser.add_argument(
        '--tf_file', default='../../data/frequency/2013_2016_tf_norm_log.tsv')
    parser.add_argument(
        '--DL_file',
        default='../../data/frequency/2013_2016_3gram_residuals.tsv')
    parser.add_argument(
        '--word_category_file',
        default='../../data/frequency/word_lists/2013_2016_word_categories.csv'
    )
    parser.add_argument('--k', default=1, type=int)
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    tf_file = args.tf_file
    DL_file = args.DL_file
    word_category_file = args.word_category_file
    k = args.k
    out_dir = args.out_dir

    ## load data
    k_range = np.arange(1, k + 1)
    tf = pd.read_csv(tf_file, sep='\t', index_col=0).iloc[:, k_range - 1]
    tf.columns = ['f_%d' % x for x in k_range]
    DL = pd.read_csv(DL_file, sep='\t', index_col=0).iloc[:, k_range - 1]
    # name the dissemination columns distinctly; reusing 'f_%d' here would
    # duplicate the frequency column labels in the combined frame below
    DL.columns = ['DL_%d' % x for x in k_range]
    word_categories = pd.read_csv(word_category_file, sep=',', index_col=0)
    word_categories = word_categories.loc[:, 'category']
    # eliminate partials
    word_categories = word_categories.apply(lambda x: x.split('/')[0])
    # convert to dummy vars
    word_categories = word_categories.str.get_dummies()
    category_list = list(word_categories.columns)
    # combine
    shared_vocab = list(set(tf.index) & set(word_categories.index))
    data = pd.concat([
        tf.loc[shared_vocab], word_categories.loc[shared_vocab],
        DL.loc[shared_vocab]
    ],
                     axis=1)

    ## restrict to growth/decline words
    growth_words = get_growth_words()
    decline_words, _ = get_growth_decline_words_and_params()
    decline_words = list(set(decline_words))
    # restrict to shared vocab
    growth_words = list(set(growth_words) & set(shared_vocab))
    decline_words = list(set(decline_words) & set(shared_vocab))
    change_words = growth_words + decline_words
    data = data.loc[change_words, :]
    # add growth condition
    y_var = 'growth'
    growth_set = set(growth_words)
    data.loc[:, y_var] = [int(w in growth_set) for w in data.index]

    ## organize
    data_sets = [
        data.loc[:, [y_var] + category_list],  # just categories
        data.loc[:,
                 [y_var] + category_list + list(tf.columns)],  # f+categories
        data,  # f+categories+DL
    ]
    data_set_names = ['CAT', 'f+CAT', 'f+DL+CAT']
    results = []
    n_folds = 10
    for data_set, data_set_name in zip(data_sets, data_set_names):
        for k_ in k_range:
            # use only the first k_ frequency/DL columns for this window
            # (presumably the intent of the loop over k; category dummies are static)
            keep_cols = [c for c in data_set.columns
                         if not (c.startswith('f_') or c.startswith('DL_'))
                         or int(c.split('_')[1]) <= k_]
            feat_results = predict_LR(data_set.loc[:, keep_cols], y_var, n_folds)
            feat_results.loc[:, 'k'] = k_
            feat_results.loc[:, 'feat_names'] = data_set_name
            results.append(feat_results)
    results = pd.concat(results, ignore_index=True)

    ## write to file!!
    k_range_str = '%d_%d' % (min(k_range), max(k_range))
    out_file = os.path.join(out_dir,
                            'growth_%s_window_CAT.tsv' % (k_range_str))
    results.to_csv(out_file, sep='\t', index=False)
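
The category preprocessing above (dropping the '/'-partials, then `str.get_dummies()`) is compact; a small standalone demonstration of what it produces (the category values here are made up for illustration):

import pandas as pd

word_categories = pd.Series(['slang', 'slang/partial', 'acronym'],
                            index=['w1', 'w2', 'w3'])
# keep only the part before any '/'
word_categories = word_categories.apply(lambda x: x.split('/')[0])
# one indicator column per category
print(word_categories.str.get_dummies())
#     acronym  slang
# w1        0      1
# w2        0      1
# w3        1      0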
Example #4
import os
from argparse import ArgumentParser
from functools import reduce

import pandas as pd

# project-specific helpers, assumed importable from the surrounding package:
# get_growth_words, get_growth_decline_words_and_params, predict_LR


def main():
    parser = ArgumentParser()
    parser.add_argument('--tf_file', default='../../data/frequency/2013_2016_tf_norm_log.tsv')
    parser.add_argument('--DL_file', default='../../data/frequency/2013_2016_3gram_residuals.tsv')
    parser.add_argument('--DU_file', default='../../data/frequency/2013_2016_user_diffusion.tsv')
    parser.add_argument('--DS_file', default='../../data/frequency/2013_2016_subreddit_diffusion.tsv')
    parser.add_argument('--DT_file', default='../../data/frequency/2013_2016_thread_diffusion.tsv')
    parser.add_argument('--k', default=12, type=int)
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    tf_file = args.tf_file
    DL_file = args.DL_file
    DU_file = args.DU_file
    DS_file = args.DS_file
    DT_file = args.DT_file
    k = args.k
    out_dir = args.out_dir
    
    ## load data
    tf = pd.read_csv(tf_file, sep='\t', index_col=0)
    DL = pd.read_csv(DL_file, sep='\t', index_col=0)
    DU = pd.read_csv(DU_file, sep='\t', index_col=0)
    DS = pd.read_csv(DS_file, sep='\t', index_col=0)
    DT = pd.read_csv(DT_file, sep='\t', index_col=0)
    all_stats = [tf, DL, DU, DS, DT]
    all_stats = [s.fillna(0, inplace=False) for s in all_stats]
    shared_vocab = list(reduce(lambda x, y: x.intersection(y), [s.index for s in all_stats]))
    k_start = 0
    k_range = range(k_start+1,k_start+k+1)
    n_folds = 10
    all_time_steps = tf.columns.tolist()
    
    ## restrict to success/fail words
    success_words = get_growth_words()
    fail_words, _ = get_growth_decline_words_and_params()
    # restrict to shared vocab
    success_words = list(set(success_words) & set(shared_vocab))
    fail_words = list(set(fail_words) & set(shared_vocab))
    change_words = success_words + fail_words
    
    all_stats = [s.loc[change_words, :] for s in all_stats]
    # add success condition
    y_var = 'success'
    success_set = set(success_words)
    for s in all_stats:
        s.loc[:, y_var] = [int(w in success_set) for w in s.index]
    
    ## organize
    feat_sets = [
        [all_stats[0]], 
        [all_stats[0], all_stats[1]], 
        [all_stats[0], all_stats[2], all_stats[3], all_stats[4]], 
        all_stats
    ]
    feat_name_lists = [
        ['f'], 
        ['f','DL'], 
        ['f','DU','DS','DT'], 
        ['f','DL','DU','DS','DT']
    ]
    feat_set_names = ['f', 'f+L', 'f+S', 'f+L+S']
    results = []
    use_mean = False
    for feat_set, feat_set_name, feat_name_list in zip(feat_sets, feat_set_names, feat_name_lists):
        for k_ in k_range:
            time_steps = all_time_steps[k_start:k_]
            feat_results = predict_LR(feat_set, feat_name_list, y_var, time_steps, n_folds, use_mean=use_mean)
            feat_results.loc[:, 'k'] = k_
            feat_results.loc[:, 'feat_names'] = feat_set_name
            results.append(feat_results)
    results = pd.concat(results, ignore_index=True)
    
    ## write to file!!
    k_range_str = '%s_%s'%(min(k_range), max(k_range))
    if use_mean:
        out_file = os.path.join(out_dir, 'success_%s_window_mean.tsv' % (k_range_str))
    else:
        out_file = os.path.join(out_dir, 'success_%s_window.tsv' % (k_range_str))
    results.to_csv(out_file, sep='\t', index=False)
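
This variant of `predict_LR` receives a list of statistic DataFrames plus the time steps to use, rather than a single pre-assembled frame. A hedged sketch of how such a design matrix might be assembled under the `use_mean` flag (the helper name `build_features` and its column scheme are mine, for illustration only):

import pandas as pd

def build_features(feat_set, feat_name_list, time_steps, use_mean=False):
    # hypothetical sketch: one block of columns per statistic, restricted to
    # the window of time_steps; optionally collapse the window to its mean
    blocks = []
    for stats, name in zip(feat_set, feat_name_list):
        block = stats.loc[:, time_steps]
        if use_mean:
            block = block.mean(axis=1).to_frame('%s_mean' % name)
        else:
            block = block.rename(columns=lambda t: '%s_%s' % (name, t))
        blocks.append(block)
    return pd.concat(blocks, axis=1)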
Example #5
import os
from argparse import ArgumentParser
from datetime import datetime
from math import ceil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dateutil.relativedelta import relativedelta

# project-specific helpers, assumed importable from the surrounding package:
# get_growth_words, get_growth_decline_words_and_params


def main():
    parser = ArgumentParser()
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    out_dir = args.out_dir
    
    # load data
    growth_decline_words, split_points = get_growth_decline_words_and_params()
    split_points = split_points.apply(lambda x: int(ceil(x)))
    # drop bad split points
    T = 36
    split_points = split_points[(split_points > 0) & (split_points < T)]
    growth_words = get_growth_words()
    GD = len(growth_decline_words)
    G = len(growth_words)
    N = G + GD
    # per-month death counts from the split points, then surviving words over time
    deaths = split_points.value_counts().reindex(np.arange(T), fill_value=0)
    survivors = N - deaths.cumsum()
    timesteps = np.arange(T)
    t_0 = '2013-06'
    t_0 = datetime.strptime(t_0, '%Y-%m')
    time_labels = [datetime.strftime(t_0 + relativedelta(months=+d), '%Y-%m') for d in range(T)]
    time_interval = 8
    # zip() returns an iterator in Python 3, so materialize before slicing
    time_ticks, time_labels = zip(*list(zip(timesteps, time_labels))[::time_interval])

    # make curve
    x_buffer = 0.5
    y_buffer = 50
    xlabel = 'Date'
    ylabel = 'Survivors'
    label_size = 20
    tick_size = 14
    survivor_marker_size = 10
    survivor_color = 'k'
    survivor_linestyle = '-'
    fill_hatch = '//'
    # fill_color = 'b'
    # use light-blue as fill color
    fill_color = (117, 117, 255)
    # divide by a float so the RGB triple lands in [0, 1]
    # (Python 2 integer division would zero it out)
    fill_color = tuple(c / 255. for c in fill_color)
    xlim = [min(timesteps)-x_buffer, max(timesteps)+x_buffer]
    # cutoff at y=0
    # ylim = [min(survivors) - y_buffer, max(survivors)+y_buffer]
    ylim = [0, max(survivors)+y_buffer]
    plt.plot(timesteps, survivors, color=survivor_color, linestyle=survivor_linestyle, zorder=2)
    # add markers
    plt.scatter(timesteps, survivors, color=survivor_color, s=survivor_marker_size, zorder=3)
    # add dotted line at lower bound
    lower_bound_x = [0 - x_buffer, max(timesteps) + x_buffer]
    lower_bound_y = [G, G]
    plt.plot(lower_bound_x, lower_bound_y, color='k',  linestyle='--')
    # fill between survivor curve and lower bound
    plt.fill_between(timesteps, survivors, facecolor='none', hatch=fill_hatch, edgecolor=fill_color, linewidth=0.0)
#     plt.fill_between(timesteps, survivors, hatch='X', edgecolor='none', facecolor=fill_color, zorder=1)
    # fix ticks
    plt.xticks(time_ticks, time_labels, fontsize=tick_size)
    plt.yticks(fontsize=tick_size)
    plt.xlabel(xlabel, fontsize=label_size)
    plt.ylabel(ylabel, fontsize=label_size)
    plt.xlim(xlim)
    plt.ylim(ylim)
    
    # add bracket annotation for growth/failure
    def bracket_text(x, y1, y2, text, fraction=0.2, text_x_offset=2., text_y_offset=20):
        connection_style = 'bar, fraction=%.2f'%(fraction)
        plt.annotate('', xy=(x,y1), xycoords='data', xytext=(x,y2), textcoords='data', arrowprops=dict(arrowstyle='-', connectionstyle=connection_style))
        text_x = x + text_x_offset
        text_y = (y1 + y2) / 2. + text_y_offset
        plt.text(text_x, text_y, text, rotation=270.)
    growth_bracket_x = max(timesteps) + .5
    # growth bracket
    growth_bracket_y1 = G * .75
    growth_bracket_y2 = G * .25
    growth_text = 'growth'
    text_y_offset = 110
    bracket_text(growth_bracket_x, growth_bracket_y1, growth_bracket_y2, growth_text, text_y_offset=text_y_offset)
    # failure bracket
    failure_bracket_x = max(timesteps) + .5
    failure_bracket_y1 = N * .95
    failure_bracket_y2 = G * 1.05
    failure_text = 'decline'
    text_y_offset = 35
    bracket_text(failure_bracket_x, failure_bracket_y1, failure_bracket_y2, failure_text, fraction=0.3, text_y_offset=text_y_offset)
    # squeeze layout
    plt.tight_layout()
    # write to file
    out_file = os.path.join(out_dir, 'split_point_survivor_curve.pdf')
    plt.savefig(out_file, bbox_inches='tight')
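
The survivor arithmetic above is easy to sanity-check on toy data (values here are made up): with N = 5 words and two split points, the curve steps down at months 2 and 4 and flattens at G = 3, matching the dashed lower bound drawn in the plot.

import numpy as np
import pandas as pd

T = 6
G, GD = 3, 2  # growth words survive; decline words die at their split point
N = G + GD
split_points = pd.Series([2, 4], index=['w1', 'w2'])
deaths = split_points.value_counts().reindex(np.arange(T), fill_value=0)
survivors = N - deaths.cumsum()
print(survivors.tolist())  # [5, 5, 4, 4, 3, 3]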
Example #6
import os
from argparse import ArgumentParser
from math import ceil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

# project-specific helpers, assumed importable from the surrounding package:
# get_growth_words, get_growth_decline_words_and_params, get_default_vocab,
# match_word_diffs_all_pairs, compare_boxplots


def main():
    parser = ArgumentParser()
    parser.add_argument('--data_dir', default='../../data/frequency')
    parser.add_argument(
        '--match_stat',
        default='../../data/frequency/2013_2016_tf_norm_log.tsv')
    parser.add_argument(
        '--plot_stat',
        default='../../data/frequency/2013_2016_3gram_residuals.tsv')
    parser.add_argument('--tag_pcts',
                        default='../../data/frequency/2013_2016_tag_pcts.tsv')
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    data_dir = args.data_dir
    match_stat_file = args.match_stat
    plot_stat_file = args.plot_stat
    tag_pct_file = args.tag_pcts
    out_dir = args.out_dir
    growth_words = get_growth_words()
    decline_words, split_points = get_growth_decline_words_and_params()
    split_points = split_points.apply(lambda x: int(ceil(x)))

    vocab = get_default_vocab()
    # match_stat = pd.read_csv(os.path.join(data_dir, '2013_2016_tf_norm.tsv'), sep='\t', index_col=0).loc[vocab, :]
    match_stat = pd.read_csv(match_stat_file, sep='\t', index_col=0)
    DL = pd.read_csv(plot_stat_file, sep='\t', index_col=0)
    min_diff_pct = 0
    # match on split point
    #     k = 1
    #     match_diffs = match_words_split_points(decline_words, growth_words, match_stat, split_points, k, min_diff_pct, replace=False)
    # match on first k months of data
    k = 12
    match_diffs = match_word_diffs_all_pairs(decline_words,
                                             growth_words,
                                             match_stat,
                                             k,
                                             min_diff_pct=min_diff_pct)

    # tag_estimates = pd.read_csv(os.path.join(data_dir, '2013_2016_tag_pcts.tsv'), sep='\t', index_col=0).apply(lambda x: x.argmax(), axis=1)
    # use tag estimates without proper nouns; idxmax(axis=1) picks the
    # highest-probability tag per word (row-wise apply of argmax is deprecated)
    tag_estimates = pd.read_csv(tag_pct_file, sep='\t', index_col=0).drop(
        '^', axis=1).idxmax(axis=1)
    decline_words_matched = match_diffs.loc[:, 'word'].tolist()
    growth_words_matched = match_diffs.loc[:, 'match'].tolist()
    split_points_ordered = split_points.loc[decline_words_matched]
    split_points_growth = pd.Series(split_points_ordered)
    split_points_growth.index = growth_words_matched
    combined_words = decline_words_matched + growth_words_matched
    tag_estimates_combined = tag_estimates.loc[combined_words]
    tag_list = []
    growth_vals = []
    decline_vals = []
    ttest_rows = []
    min_count = 5
    DL_k = DL.iloc[:, 1:k]  # DL over months 2..k of the window
    for t, group in tag_estimates_combined.groupby(tag_estimates_combined):
        decline_relevant = list(set(group.index) & set(decline_words_matched))
        growth_relevant = list(set(group.index) & set(growth_words_matched))
        if ((len(decline_relevant) >= min_count)
                and (len(growth_relevant) >= min_count)):
            tag_list.append(t)
            # now! get DL values
            # get mean DL values
            decline_DL = DL_k.loc[decline_relevant, :].mean(axis=1)
            growth_DL = DL_k.loc[growth_relevant, :].mean(axis=1)
            decline_vals.append(decline_DL)
            growth_vals.append(growth_DL)

            # Welch's t-test for significance; halving p assumes the observed
            # difference is in the hypothesized direction (growth > decline)
            tval, pval = ttest_ind(growth_DL, decline_DL, equal_var=False)
            pval /= 2  # divide by two because one-sided
            # track means, t-val, p-val
            ttest_rows.append(pd.Series({
                'POS_tag': t,
                'growth_DL_mean': growth_DL.mean(),
                'growth_DL_sd': growth_DL.std(),
                'growth_DL_N': len(growth_DL),
                'decline_DL_mean': decline_DL.mean(),
                'decline_DL_sd': decline_DL.std(),
                'decline_DL_N': len(decline_DL),
                't': tval,
                'p': pval,
            }))
    ttest_results = pd.DataFrame(ttest_rows)
    name_1 = 'growth'
    name_2 = 'decline'
    xlabel = 'POS tag'
    ylabel = '$D^{L}$'
    ylim = (-1., 0.5)
    # TACL size
    tick_size = 15
    # NWAV size
    #     tick_size = 18
    # save ttest to file first
    ttest_out_file = os.path.join(
        out_dir,
        '%s_vs_%s_matched_pos_DL_distribution_1_%d.tsv' % (name_1, name_2, k))
    ttest_results.to_csv(ttest_out_file, sep='\t', index=False)
    out_file = os.path.join(
        out_dir,
        '%s_vs_%s_matched_pos_DL_distribution_1_%d.pdf' % (name_1, name_2, k))
    # convert tag list to meanings
    tag_meanings = pd.read_csv(
        '../../data/metadata/tag_meaning.tsv', sep='\t', index_col=0).applymap(
            lambda x: x.split('/')[0].replace(' ', '\n'))
    tag_list = [tag_meanings.loc[t, 'meaning'] for t in tag_list]
    # plot boxes
    color_1 = 'b'
    color_2 = 'r'
    linestyle_1 = '--'
    linestyle_2 = '-'
    # TACL size
    #     label_size = 18
    # NWAV size
    label_size = 28
    compare_boxplots(growth_vals,
                     decline_vals,
                     tag_list,
                     xlabel,
                     ylabel,
                     name_1,
                     name_2,
                     color_1=color_1,
                     color_2=color_2,
                     linestyle_1=linestyle_1,
                     linestyle_2=linestyle_2,
                     label_size=label_size,
                     tick_size=tick_size,
                     ylim=ylim)

    # add xticks
    x_offset = 0.25
    x_positions = np.arange(len(tag_list)) + x_offset
    plt.xticks(x_positions, tag_list, fontsize=tick_size)

    # add significance stars
    # new: add as brackets between boxes
    def bracket_text(x1_bracket,
                     x2_bracket,
                     y_bracket,
                     x_txt,
                     y_txt,
                     text,
                     fraction=0.2,
                     textsize=12,
                     bracket_color='black'):
        connection_style = 'bar, fraction=%.2f' % (fraction)
        arrowprops = dict(arrowstyle='-',
                          ec=bracket_color,
                          connectionstyle=connection_style)
        plt.annotate('',
                     xy=(x1_bracket, y_bracket),
                     xycoords='data',
                     xytext=(x2_bracket, y_bracket),
                     textcoords='data',
                     arrowprops=arrowprops)
        plt.text(x_txt, y_txt, text, rotation=0., size=textsize, weight='bold')

    pval_upper = 0.05
    # ttest_results rows line up with tag_list / x_positions by construction
    x_positions_significant = [
        x_positions[i] for i in range(len(x_positions))
        if ttest_results['p'].iloc[i] < pval_upper
    ]
    bracket_y = max(max(map(max, growth_vals)), max(map(max, decline_vals)))
    bracket_x_offset = 0.25
    text_x_offset = -0.025
    text_y_offset = 0.1
    fraction = 0.3
    annotate_txt = '*'
    annotate_txt_size = 15
    for x_position in x_positions_significant:
        bracket_x1 = x_position - bracket_x_offset
        bracket_x2 = x_position + bracket_x_offset
        x_txt = (bracket_x1 + bracket_x2) / 2. + text_x_offset
        y_txt = bracket_y + text_y_offset
        bracket_text(bracket_x1,
                     bracket_x2,
                     bracket_y,
                     x_txt,
                     y_txt,
                     annotate_txt,
                     fraction=fraction,
                     textsize=annotate_txt_size)

    # update xlim to fit labels and boxes
    xmin = x_positions.min() - x_offset * 2.
    xmax = x_positions.max() + x_offset * 2.
    plt.xlim(xmin, xmax)

    plt.tight_layout()
    # remove border but keep axes
    plt.axis('on')
    # plt.box(on=False)
    plt.savefig(out_file)
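
`compare_boxplots` is also project code; as called here it draws paired boxplots for the two groups at each tag position. A minimal sketch under those assumptions (illustrative, not the project's implementation; the extra keyword arguments from the call above are accepted but only partly used):

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

def compare_boxplots(vals_1, vals_2, labels, xlabel, ylabel, name_1, name_2,
                     color_1='b', color_2='r', linestyle_1='--', linestyle_2='-',
                     label_size=18, tick_size=14, ylim=None):
    # hypothetical sketch: two offset boxplots per label, colored by group
    positions = np.arange(len(labels))
    width = 0.2
    b1 = plt.boxplot(vals_1, positions=positions - width, widths=width * 1.5,
                     patch_artist=True)
    b2 = plt.boxplot(vals_2, positions=positions + width, widths=width * 1.5,
                     patch_artist=True)
    for box in b1['boxes']:
        box.set_facecolor(color_1)
    for box in b2['boxes']:
        box.set_facecolor(color_2)
    plt.xlabel(xlabel, fontsize=label_size)
    plt.ylabel(ylabel, fontsize=label_size)
    if ylim is not None:
        plt.ylim(ylim)
    # legend via proxy patches, one per group
    plt.legend(handles=[Patch(facecolor=color_1, label=name_1),
                        Patch(facecolor=color_2, label=name_2)])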