Example #1
def main():
    parser = ArgumentParser()
    parser.add_argument('tf_file')
    parser.add_argument('UP3_file')
    parser.add_argument('--out_dir', default='../../data/frequency/')
    args = parser.parse_args()
    tf_file = args.tf_file
    UP3_file = args.UP3_file
    out_dir = args.out_dir

    # load data
    vocab = get_default_vocab()
    print('got vocab')
    tf = pd.read_csv(tf_file, sep='\t',
                     index_col=0)  #, na_values=[' '], dtype=int)
    print('f loaded')
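    # restrict to the default vocabulary, fill missing counts with 0,
    # apply additive smoothing, and convert to log10 frequencies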
    tf = pd.np.log10(smooth_stats(tf.loc[vocab, :].fillna(0, inplace=False)))
    UP3 = pd.read_csv(UP3_file, sep='\t',
                      index_col=0).loc[vocab].fillna(0, inplace=False)
    print('UP3 loaded')
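    # extract the timeframe (e.g. '2013_2016') from the input file name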
    timeframe = re.findall('201[0-9]_201[0-9]', tf_file)[0]

    # fit regression for each month
    all_months = sorted(tf.columns)
    UP3_resids = []
    for d in all_months:
        tf_d = tf.loc[:, d]
        UP3_d = UP3.loc[:, d]
        N = tf_d.shape[0]
        print('bout to regress over data N=%d' % (N))
        m, b, r, p, err = linregress(tf_d, UP3_d)
        print('d=%s, UP3=%.3E*f + %.3E (R=%.3f, p=%.3E)' % (d, m, b, r, p))
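        # residual = observed UP3 minus the value predicted from log frequency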
        UP3_pred = m * tf_d + b
        UP3_resids_d = UP3_d - UP3_pred
        UP3_resids_d = pd.DataFrame(UP3_resids_d, columns=[d])
        print(UP3_resids_d.shape)
        UP3_resids.append(UP3_resids_d)

    # write to file
    UP3_resids = pd.concat(UP3_resids, axis=1)
    print(UP3_resids.shape)
    out_file = os.path.join(out_dir, '%s_P3.tsv' % (timeframe))
    UP3_resids.to_csv(out_file, sep='\t')
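    # NOTE: smooth_stats is defined elsewhere in the project. A minimal sketch of
    # what it plausibly does, assuming the same additive smoothing used in the
    # normalization script below (add the smallest nonzero value so that log10
    # is finite everywhere):
    # def smooth_stats(df):
    #     return df + df[df > 0].min().min()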
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--raw_tf',
                        default='../../data/frequency/2013_2016_tf.tsv')
    parser.add_argument('--out_file', default=None)
    parser.add_argument('--vocab', default='ALL')
    args = parser.parse_args()
    raw_tf_file = args.raw_tf
    raw_tf = pd.read_csv(raw_tf_file, sep='\t', index_col=0)
    vocab = args.vocab
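    # any value other than 'ALL' restricts the counts to the default vocabulary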
    if (vocab != 'ALL'):
        vocab = get_default_vocab()
        raw_tf = raw_tf.loc[vocab]
    sums = raw_tf.sum(axis=0)
    norm_tf = raw_tf / sums
    # smooth and log
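    # add the smallest nonzero relative frequency as a pseudo-count so log10(0) never occurs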
    norm_tf += norm_tf[norm_tf > 0].min().min()
    log_tf = pd.np.log10(norm_tf)
    if (args.out_file is None):
        out_dir = os.path.dirname(raw_tf_file)
        new_name = os.path.basename(raw_tf_file).replace('tf', 'tf_norm_log')
        out_file = os.path.join(out_dir, new_name)
    else:
        out_file = args.out_file
    log_tf.to_csv(out_file, sep='\t')
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default='../../data/frequency/')
    parser.add_argument(
        '--social_var',
        # default='user')
        default='subreddit')
    # default='thread')
    parser.add_argument('--tf',
                        default='../../data/frequency/2015_2016_tf_norm.tsv')
    parser.add_argument('--all_dates', nargs='+', default=None)
    args = parser.parse_args()
    data_dir = args.data_dir
    social_var = args.social_var
    tf_file = args.tf
    all_dates = args.all_dates
    vocab = get_default_vocab()
    social_var_count_file = os.path.join(
        data_dir, '2015_2016_%s_unique.tsv' % (social_var))
    social_var_counts = pd.read_csv(social_var_count_file,
                                    sep='\t',
                                    index_col=0)
    vocab = list(set(vocab) & set(social_var_counts.index.tolist()))
    print('got %d final vocab' % (len(vocab)))
    social_word_count_file = os.path.join(
        data_dir, '2015_2016_%s_words.tsv' % (social_var))
    social_word_counts = pd.read_csv(social_word_count_file,
                                     sep='\t',
                                     index_col=0)
    if (all_dates is None):
        all_dates = sorted(social_word_counts.columns)
    tf = pd.read_csv(tf_file, sep='\t', index_col=0)
    all_social_vals = social_word_counts.index.tolist()
    # all_diffusion_vals = defaultdict(Counter)
    cutoff = 200
    for d in all_dates:
        #print('relevant social var counts %s'%
        #       (social_var_counts[d]))
        # all_sums = (1 - math.e ** (tf[d] * social_var_counts[d]))
        relevant_social_var_counts = social_var_counts[d]
        relevant_social_word_counts = social_word_counts[d]
        all_diffusion_vals = defaultdict(Counter)
        # vectorizing!
        vocab_tf = tf[d]
        vocab_social_counts = relevant_social_var_counts.loc[vocab]
        # diffusion = diffusion_exact(vocab_social_counts, relevant_social_word_counts, vocab_tf)
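        # approximate diffusion: observed unique social units per word divided by
        # the expected number, sum_c (1 - e^(-f_w * n_c)), as in the loop below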
        diffusion = diffusion_approx(vocab_social_counts,
                                     relevant_social_word_counts, vocab_tf)
        diffusion = pd.DataFrame(diffusion, index=vocab, columns=[d])
        print('got diffusion %s' % (diffusion))
        # replace inf and NaN values?

        # unvectorized
        # for i, v in enumerate(vocab):
        #     v_tf = tf[d].loc[v]
        #     if(v_tf > 0):
        #         v_social_count = relevant_social_var_counts.loc[v]
        #         # compute social val sum
        #         denom = (1 - math.e**(-v_tf * relevant_social_word_counts)).sum()
        #         diffusion = v_social_count / denom
        #     else:
        #         diffusion = 0
        #     all_diffusion_vals[d][v] = diffusion
        #     if(i % 100 == 0):
        #         print('processed %d vocab'%(i))
        # if(i >= cutoff):
        #     break
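        # NOTE: diffusion_approx is defined elsewhere in the project. A minimal
        # vectorized sketch of what it plausibly computes, reconstructed from the
        # commented-out loop above (hypothetical, for reference only; the
        # zero-frequency guard from the loop is omitted here):
        # def diffusion_approx(social_counts, social_word_counts, word_tf):
        #     expected = word_tf.apply(
        #         lambda f: (1 - pd.np.exp(-f * social_word_counts)).sum())
        #     return social_counts / expected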
        # write to file!
        out_fname = os.path.join(data_dir,
                                 '%s_%s_diffusion.tsv' % (d, social_var))
        diffusion.to_csv(out_fname, sep='\t')
Example #4
def main():
    parser = ArgumentParser()
    parser.add_argument('--data_dir', default='../../data/frequency')
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    data_dir = args.data_dir
    out_dir = args.out_dir
    # collect data
    vocab = get_default_vocab()
    tf = pd.read_csv(os.path.join(data_dir, '2013_2016_tf_norm_log.tsv'),
                     sep='\t',
                     index_col=0)
    D_L = pd.read_csv(os.path.join(data_dir, '2013_2016_3gram_residuals.tsv'),
                      sep='\t',
                      index_col=0).loc[vocab, :].fillna(0, inplace=False)
    D_U = pd.read_csv(os.path.join(data_dir,
                                   '2013_2016_user_diffusion_log.tsv'),
                      sep='\t',
                      index_col=0).loc[vocab, :].fillna(0, inplace=False)
    D_S = pd.read_csv(os.path.join(data_dir,
                                   '2013_2016_subreddit_diffusion_log.tsv'),
                      sep='\t',
                      index_col=0).loc[vocab, :].fillna(0, inplace=False)
    D_T = pd.read_csv(os.path.join(data_dir,
                                   '2013_2016_thread_diffusion_log.tsv'),
                      sep='\t',
                      index_col=0).loc[vocab, :].fillna(0, inplace=False)
    #     growth_words = get_growth_words()
    #     growth_decline_words, split_points = get_growth_decline_words_and_params()
    success_words, fail_words, split_points = get_success_fail_words()

    split_points = split_points.apply(lambda x: int(ceil(x)))
    # organize into survival df
    combined_words = fail_words + success_words
    V = len(combined_words)
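    # event indicator: words that failed (declined) are deaths (1),
    # successful words are right-censored (0)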
    deaths = pd.Series(pd.np.zeros(V), index=combined_words)
    deaths.loc[fail_words] = 1
    N = tf.shape[1]
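    # observation times: failing words end at their split point,
    # successful (censored) words span all N months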
    split_points_combined = pd.concat([
        split_points,
        pd.Series([
            N,
        ] * len(success_words), index=success_words)
    ],
                                      axis=0)
    covariates = [tf, D_L, D_U, D_S, D_T]
    covariate_names = ['f', 'D_L', 'D_U', 'D_S', 'D_T']
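    # covariates: f = log normalized frequency, D_L = 3-gram residuals,
    # D_U / D_S / D_T = user / subreddit / thread diffusion (loaded above)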
    survival_df = build_survival_df(fail_words, success_words,
                                    split_points_combined, covariates,
                                    covariate_names)
    survival_df_nan = survival_df[survival_df.isnull().any(axis=1)]

    # full timeframe test
    # fit regression using all covariates and all data up to and including time of death
    scaler = StandardScaler()
    survival_df_norm = survival_df.copy()
    survival_df_norm[covariate_names] = scaler.fit_transform(
        survival_df_norm[covariate_names])
    cox_model = CoxPHFitter()
    event_var = 'death'
    time_var = 't'
    cox_model.fit(survival_df_norm, time_var, event_col=event_var)
    regression_output_file = os.path.join(out_dir,
                                          'cox_regression_all_data.txt')
    orig_stdout = sys.stdout
    with open(regression_output_file, 'w') as regression_output:
        sys.stdout = regression_output
        cox_model.print_summary()
        sys.stdout = orig_stdout

    # fixed timeframe test
    # fit regression using all covariates and only data up to first m months
    m = 3
    death_words = list(fail_words)
    right_censored_words = list(success_words)
    combined_words = death_words + right_censored_words
    fixed_death_times = pd.Series(pd.np.repeat(m, len(combined_words)),
                                  index=combined_words)
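    # a fixed horizon of m months restricts the covariates to the first m months;
    # the actual death/censorship times are substituted back in below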
    covariates = [tf, D_L, D_U, D_S, D_T]
    covariate_names = ['f', 'D_L', 'D_U', 'D_S', 'D_T']
    survival_df = build_survival_df(death_words, right_censored_words,
                                    fixed_death_times, covariates,
                                    covariate_names)
    # now provide the actual death/censorship times
    N = tf.shape[1]
    death_times = pd.concat([
        split_points.loc[death_words],
        pd.Series([
            N,
        ] * len(right_censored_words),
                  index=right_censored_words)
    ],
                            axis=0)
    survival_df['t'] = death_times
    cox_model = CoxPHFitter()
    survival_df.loc[:, covariate_names] = scaler.fit_transform(
        survival_df.loc[:, covariate_names])
    cox_model.fit(survival_df, time_var, event_col=event_var)
    regression_output_file = os.path.join(out_dir,
                                          'cox_regression_first_%d.txt' % (m))
    orig_stdout = sys.stdout
    with open(regression_output_file, 'w') as regression_output:
        sys.stdout = regression_output
        cox_model.print_summary()
        sys.stdout = orig_stdout

    # concordance values
    # set up multiple models with different feature sets
    # then run 10-fold cross-validation to generate concordance scores
    # and plot distributions
    cv = 10
    feature_sets = []

    covariate_sets = [['f'], ['f', 'D_L'], ['f', 'D_U', 'D_S', 'D_T'],
                      ['f', 'D_L', 'D_U', 'D_S', 'D_T']]
    covariate_set_names = ['f', 'f+L', 'f+S', 'f+L+S']
    covariate_set_scores = {}
    for covariate_set, covariate_set_name in izip(covariate_sets,
                                                  covariate_set_names):
        survival_df_relevant = survival_df.loc[:, covariate_set +
                                               [time_var, event_var]]
        cox_model = CoxPHFitter()
        scores = k_fold_cross_validation(cox_model,
                                         survival_df_relevant,
                                         time_var,
                                         event_col=event_var,
                                         k=cv)
        covariate_set_scores[covariate_set_name] = scores
    covariate_set_scores = pd.DataFrame(covariate_set_scores).transpose()
    score_names = ['score_%d' % (i) for i in range(cv)]
    covariate_set_scores.columns = score_names
    # significance test between f and f+L, f+S, f+L+S concordance scores
    pval_thresh = 0.05
    baseline_scores = covariate_set_scores.loc['f', score_names]
    covariate_test_names = ['f+L', 'f+S', 'f+L+S']
    # bonferroni correction = alpha / 3
    pval_corrected = pval_thresh / len(covariate_test_names)
    covariate_set_scores.loc[:, 'pval_thresh'] = pval_corrected
    for covariate_test_name in covariate_test_names:
        covariate_test_scores = covariate_set_scores.loc[covariate_test_name,
                                                         score_names]
        t_stat, pval = ttest_ind(covariate_test_scores,
                                 baseline_scores,
                                 equal_var=False)
        covariate_set_scores.loc[covariate_test_name, 't_test'] = t_stat
        covariate_set_scores.loc[covariate_test_name, 'pval'] = pval
    # write to file
    out_file = os.path.join(
        out_dir, 'cox_regression_concordance_%d_fold_scores.tsv' % (cv))
    covariate_set_scores.to_csv(out_file, sep='\t', index=True)
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--comment_files', nargs='+', default=None)
    parser.add_argument('--out_dir', default='../../data/frequency/')
    parser.add_argument(
        '--social_vars',
        nargs='+',
        # default=['user', 'thread', 'subreddit'])
        # default=['user'])
        # default=['thread'])
        default=['subreddit'])
    args = parser.parse_args()
    comment_files = args.comment_files
    out_dir = args.out_dir
    social_vars = args.social_vars
    if (comment_files is None):
        data_dir = '/mnt/new_hg190/corpora/reddit_comment_data/monthly_submission/'
        years = ['2015', '2016']
        comment_files = get_all_comment_files(data_dir, years)
        print('comment files %s' % (str(comment_files)))
        # we actually want the normalized versions of the comment files
        comment_files = [
            f.replace('.bz2', '_normalized.bz2') for f in comment_files
        ]
    meta_files = [f.replace('.bz2', '_meta.bz2') for f in comment_files]
    # print('got meta files %s'%(meta_files))
    # TODO: start small, eventually move to rest of files
    # comment_files = comment_files[3:]
    # comment_files = comment_files[1:]

    # for testing
    # social_vars = social_vars[:1]

    vocab = get_default_vocab()
    # chunk_size = 1000
    # chunk_size = 5000
    # chunk_size = len(vocab)
    # chunks = int(len(vocab) / chunk_size)
    # vocab_chunks = [vocab[i*chunk_size:i*chunk_size+chunk_size]
    #                 for i in xrange(chunks)]
    # start small
    # top_vocab = 1000
    top_vocab = 100000
    stopwords = get_default_stopwords()
    # already whitespace separated, so just need whitespace tokenizer
    tokenizer = WhitespaceTokenizer()
    ngram_range = (1, 1)
    min_df = 1
    cv = CountVectorizer(
        encoding='utf-8',
        lowercase=True,
        tokenizer=tokenizer.tokenize,
        stop_words=stopwords,
        ngram_range=ngram_range,
        min_df=min_df,
        # max_features=top_vocab,
        vocabulary=vocab,
        # binarize to save space b/c we only care about cooccurrence
        binary=True)
    out_dir = args.out_dir
    # min number of comments within social value
    # to make it count
    # social_comment_thresh = 10
    social_comment_thresh = 1
    for comment_file, meta_file in izip(comment_files, meta_files):
        print('processing comment file %s and meta file %s' %
              (comment_file, meta_file))
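        # extract the month string (e.g. '2015-06') from the comment file name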
        date_str = re.findall(r'201[0-9]-[0-9]+', comment_file)[0]
        for social_var in social_vars:
            # use for full dtm
            # out_fname = os.path.join(out_dir, '%s_%s_dtm'%(date_str, social_var))
            out_fname = os.path.join(
                out_dir, '%s_%s_unique.tsv' % (date_str, social_var))
            # for each vocab chunk in list, get unique social counts!
            # for vocab in vocab_chunks:
            print('got vocab size %d' % (len(vocab)))
            social_word_counts = get_social_word_counts(
                social_var,
                vocab,
                comment_file,
                meta_file,
                comment_thresh=social_comment_thresh)
            # write to file
            social_word_counts = pd.DataFrame(social_word_counts, index=vocab)
            social_word_counts.to_csv(out_fname, sep='\t', header=False)
Example #6
def main():
    parser = ArgumentParser()
    parser.add_argument('--data_dir', default='../../data/frequency')
    parser.add_argument(
        '--match_stat',
        default='../../data/frequency/2013_2016_tf_norm_log.tsv')
    parser.add_argument(
        '--plot_stat',
        default='../../data/frequency/2013_2016_3gram_residuals.tsv')
    parser.add_argument('--tag_pcts',
                        default='../../data/frequency/2013_2016_tag_pcts.tsv')
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    data_dir = args.data_dir
    match_stat_file = args.match_stat
    plot_stat_file = args.plot_stat
    tag_pct_file = args.tag_pcts
    out_dir = args.out_dir
    growth_words = get_growth_words()
    decline_words, split_points = get_growth_decline_words_and_params()
    split_points = split_points.apply(lambda x: int(ceil(x)))

    vocab = get_default_vocab()
    # match_stat = pd.read_csv(os.path.join(data_dir, '2013_2016_tf_norm.tsv'), sep='\t', index_col=0).loc[vocab, :]
    match_stat = pd.read_csv(match_stat_file, sep='\t', index_col=0)
    DL = pd.read_csv(plot_stat_file, sep='\t', index_col=0)
    min_diff_pct = 0
    # match on split point
    #     k = 1
    #     match_diffs = match_words_split_points(decline_words, growth_words, match_stat, split_points, k, min_diff_pct, replace=False)
    # match on first k months of data
    k = 12
    match_diffs = match_word_diffs_all_pairs(decline_words,
                                             growth_words,
                                             match_stat,
                                             k,
                                             min_diff_pct=min_diff_pct)

    # tag_estimates = pd.read_csv(os.path.join(data_dir, '2013_2016_tag_pcts.tsv'), sep='\t', index_col=0).apply(lambda x: x.argmax(), axis=1)
    # use tag estimates without proper nouns
    tag_estimates = pd.read_csv(tag_pct_file, sep='\t', index_col=0).drop(
        '^', inplace=False, axis=1).apply(lambda x: x.argmax(), axis=1)
    decline_words_matched = match_diffs.loc[:, 'word'].tolist()
    growth_words_matched = match_diffs.loc[:, 'match'].tolist()
    split_points_ordered = split_points.loc[decline_words_matched]
    split_points_growth = pd.Series(split_points_ordered)
    split_points_growth.index = growth_words_matched
    combined_words = decline_words_matched + growth_words_matched
    tag_estimates_combined = tag_estimates.loc[combined_words]
    tag_list = []
    growth_vals = []
    decline_vals = []
    ttest_results = pd.DataFrame()
    min_count = 5
    DL_k = DL.iloc[:, 1:k]
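    # for each POS tag with at least min_count words in both groups,
    # compare mean D^L over the matching window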
    for t, group in tag_estimates_combined.groupby(tag_estimates_combined):
        decline_relevant = list(group.index & set(decline_words_matched))
        growth_relevant = list(group.index & set(growth_words_matched))
        if ((len(decline_relevant) >= min_count)
                and (len(growth_relevant) >= min_count)):
            tag_list.append(t)
            # now! get DL values
            # get mean DL values
            decline_DL = DL_k.loc[decline_relevant, :].mean(axis=1)
            growth_DL = DL_k.loc[growth_relevant, :].mean(axis=1)
            decline_vals.append(decline_DL)
            growth_vals.append(growth_DL)

            # t-test for significance
            tval, pval = ttest_ind(growth_DL, decline_DL, equal_var=False)
            pval /= 2  # divide by two because one-sided
            # track means, t-val, p-val
            ttest_results_ = pd.Series({
                'POS_tag': t,
                'growth_DL_mean': growth_DL.mean(),
                'growth_DL_sd': growth_DL.std(),
                'growth_DL_N': len(growth_DL),
                'decline_DL_mean': decline_DL.mean(),
                'decline_DL_sd': decline_DL.std(),
                'decline_DL_N': len(decline_DL),
                't': tval,
                'p': pval,
            })
            ttest_results = ttest_results.append(ttest_results_,
                                                 ignore_index=True)


#             ttest_results.append((t, pval))
    name_1 = 'growth'
    name_2 = 'decline'
    xlabel = 'POS tag'
    ylabel = '$D^{L}$'
    ylim = (-1., 0.5)
    # TACL size
    tick_size = 15
    # NWAV size
    #     tick_size = 18
    # save ttest to file first
    ttest_out_file = os.path.join(
        out_dir,
        '%s_vs_%s_matched_pos_DL_distribution_1_%d.tsv' % (name_1, name_2, k))
    ttest_results.to_csv(ttest_out_file, sep='\t', index=False)
    out_file = os.path.join(
        out_dir,
        '%s_vs_%s_matched_pos_DL_distribution_1_%d.pdf' % (name_1, name_2, k))
    # convert tag list to meanings
    tag_meanings = pd.read_csv(
        '../../data/metadata/tag_meaning.tsv', sep='\t', index_col=0).applymap(
            lambda x: x.split('/')[0].replace(' ', '\n'))  #replace('/', '\n'))
    tag_list = [tag_meanings.loc[t, 'meaning'] for t in tag_list]
    # plot boxes
    color_1 = 'b'
    color_2 = 'r'
    linestyle_1 = '--'
    linestyle_2 = '-'
    # TACL size
    #     label_size = 18
    # NWAV size
    label_size = 28
    compare_boxplots(growth_vals,
                     decline_vals,
                     tag_list,
                     xlabel,
                     ylabel,
                     name_1,
                     name_2,
                     color_1=color_1,
                     color_2=color_2,
                     linestyle_1=linestyle_1,
                     linestyle_2=linestyle_2,
                     label_size=label_size,
                     tick_size=tick_size,
                     ylim=ylim)

    # add xticks
    x_offset = 0.25
    x_positions = pd.np.arange(len(tag_list)) + x_offset
    plt.xticks(x_positions, tag_list, fontsize=tick_size)

    # add significance stars
    # new: add as brackets between boxes
    def bracket_text(x1_bracket,
                     x2_bracket,
                     y_bracket,
                     x_txt,
                     y_txt,
                     text,
                     fraction=0.2,
                     textsize=12,
                     bracket_color='black'):
        connection_style = 'bar, fraction=%.2f' % (fraction)
        arrowprops = dict(arrowstyle='-',
                          ec=bracket_color,
                          connectionstyle=connection_style)
        plt.annotate('',
                     xy=(x1_bracket, y_bracket),
                     xycoords='data',
                     xytext=(x2_bracket, y_bracket),
                     textcoords='data',
                     arrowprops=arrowprops)
        plt.text(x_txt, y_txt, text, rotation=0., size=textsize, weight='bold')

    pval_upper = 0.05
    # ttest_results is a data frame
    x_positions_significant = [
        x_positions[i] for i in range(len(x_positions))
        if ttest_results.iloc[i, :].loc['p'] < pval_upper
    ]
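    # place the significance brackets just above the tallest box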
    bracket_y = max(max(map(max, growth_vals)), max(map(max, decline_vals)))
    bracket_x_offset = 0.25
    text_x_offset = -0.025
    text_y_offset = 0.1
    fraction = 0.3
    annotate_txt = '*'
    annotate_txt_size = 15
    for x_position in x_positions_significant:
        bracket_x1 = x_position - bracket_x_offset
        bracket_x2 = x_position + bracket_x_offset
        x_txt = (bracket_x1 + bracket_x2) / 2. + text_x_offset
        y_txt = bracket_y + text_y_offset
        bracket_text(bracket_x1,
                     bracket_x2,
                     bracket_y,
                     x_txt,
                     y_txt,
                     annotate_txt,
                     fraction=fraction,
                     textsize=annotate_txt_size)

    # update xlim to fit labels and boxes
    xmin = x_positions.min() - x_offset * 2.
    xmax = x_positions.max() + x_offset * 2.
    plt.xlim(xmin, xmax)

    plt.tight_layout()
    # remove border but keep axes
    plt.axis('on')
    # plt.box(on=False)
    plt.savefig(out_file)