def main():
    parser = ArgumentParser()
    parser.add_argument('tf_file')
    parser.add_argument('UP3_file')
    parser.add_argument('--out_dir', default='../../data/frequency/')
    args = parser.parse_args()
    tf_file = args.tf_file
    UP3_file = args.UP3_file
    out_dir = args.out_dir
    # load data
    vocab = get_default_vocab()
    print('got vocab')
    tf = pd.read_csv(tf_file, sep='\t', index_col=0)  # , na_values=[' '], dtype=int)
    print('tf loaded')
    # restrict to vocab, smooth, and convert to log frequency
    tf = pd.np.log10(smooth_stats(tf.loc[vocab, :].fillna(0, inplace=False)))
    UP3 = pd.read_csv(UP3_file, sep='\t', index_col=0).loc[vocab].fillna(0, inplace=False)
    print('UP3 loaded')
    timeframe = re.findall('201[0-9]_201[0-9]', tf_file)[0]
    # fit regression for each month
    all_months = sorted(tf.columns)
    UP3_resids = []
    for d in all_months:
        tf_d = tf.loc[:, d]
        UP3_d = UP3.loc[:, d]
        N = tf_d.shape[0]
        print('about to regress over data N=%d' % (N))
        m, b, r, p, err = linregress(tf_d, UP3_d)
        print('d=%s, UP3=%.3E*f + %.3E (R=%.3f, p=%.3E)' % (d, m, b, r, p))
        # residual = observed UP3 minus the value predicted from frequency
        UP3_pred = m * tf_d + b
        UP3_resids_d = UP3_d - UP3_pred
        UP3_resids_d = pd.DataFrame(UP3_resids_d, columns=[d])
        print(UP3_resids_d.shape)
        UP3_resids.append(UP3_resids_d)
    # write to file
    UP3_resids = pd.concat(UP3_resids, axis=1)
    print(UP3_resids.shape)
    out_file = os.path.join(out_dir, '%s_P3.tsv' % (timeframe))
    UP3_resids.to_csv(out_file, sep='\t')
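# Illustration (not from the original script): the frequency-correction step above is
# an ordinary least-squares fit via scipy.stats.linregress, with the residual taken as
# the observed value minus the prediction (m * f + b). A minimal sketch with synthetic
# data and hypothetical names:
def _example_frequency_residuals():
    import numpy as np
    from scipy.stats import linregress
    f = np.random.uniform(-6., -2., size=1000)              # synthetic log10 frequencies
    stat = 0.5 * f + np.random.normal(0., 0.1, size=1000)   # statistic correlated with f
    m, b, r, p, err = linregress(f, stat)
    return stat - (m * f + b)                                # frequency-corrected residuals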
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--raw_tf', default='../../data/frequency/2013_2016_tf.tsv')
    parser.add_argument('--out_file', default=None)
    parser.add_argument('--vocab', default='ALL')
    args = parser.parse_args()
    raw_tf_file = args.raw_tf
    raw_tf = pd.read_csv(raw_tf_file, sep='\t', index_col=0)
    vocab = args.vocab
    if (vocab != 'ALL'):
        # any value other than 'ALL' restricts counts to the default vocab
        vocab = get_default_vocab()
        raw_tf = raw_tf.loc[vocab]
    # normalize counts by monthly totals
    sums = raw_tf.sum(axis=0)
    norm_tf = raw_tf / sums
    # smooth and log
    norm_tf += norm_tf[norm_tf > 0].min().min()
    log_tf = pd.np.log10(norm_tf)
    if (args.out_file is None):
        out_dir = os.path.dirname(raw_tf_file)
        new_name = os.path.basename(raw_tf_file).replace('tf', 'tf_norm_log')
        out_file = os.path.join(out_dir, new_name)
    else:
        out_file = args.out_file
    log_tf.to_csv(out_file, sep='\t')
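# Quick illustration (toy numbers, not part of the original script) of the smoothing
# above: adding the smallest nonzero normalized frequency before taking log10 keeps
# words with zero counts finite instead of mapping them to -inf.
def _smoothing_example():
    import numpy as np
    import pandas as pd
    raw = pd.DataFrame({'2013-01': [2, 0, 8], '2013-02': [1, 3, 6]},
                       index=['w1', 'w2', 'w3'])
    norm = raw / raw.sum(axis=0)          # normalize each month by its total count
    norm += norm[norm > 0].min().min()    # add the global minimum nonzero frequency
    return np.log10(norm)                 # no -inf entries for the zero count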
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default='../../data/frequency/')
    parser.add_argument('--social_var',
                        # default='user')
                        # default='thread')
                        default='subreddit')
    parser.add_argument('--tf', default='../../data/frequency/2015_2016_tf_norm.tsv')
    parser.add_argument('--all_dates', nargs='+', default=None)
    args = parser.parse_args()
    data_dir = args.data_dir
    social_var = args.social_var
    tf_file = args.tf
    all_dates = args.all_dates
    vocab = get_default_vocab()
    social_var_count_file = os.path.join(
        data_dir, '2015_2016_%s_unique.tsv' % (social_var))
    social_var_counts = pd.read_csv(social_var_count_file, sep='\t', index_col=0)
    vocab = list(set(vocab) & set(social_var_counts.index.tolist()))
    print('got %d final vocab' % (len(vocab)))
    social_word_count_file = os.path.join(
        data_dir, '2015_2016_%s_words.tsv' % (social_var))
    social_word_counts = pd.read_csv(social_word_count_file, sep='\t', index_col=0)
    if (all_dates is None):
        all_dates = sorted(social_word_counts.columns)
    tf = pd.read_csv(tf_file, sep='\t', index_col=0)
    all_social_vals = social_word_counts.index.tolist()
    # all_diffusion_vals = defaultdict(Counter)
    cutoff = 200
    for d in all_dates:
        # print('relevant social var counts %s' % (social_var_counts[d]))
        # all_sums = (1 - math.e ** (tf[d] * social_var_counts[d]))
        relevant_social_var_counts = social_var_counts[d]
        relevant_social_word_counts = social_word_counts[d]
        all_diffusion_vals = defaultdict(Counter)
        # vectorizing!
        vocab_tf = tf[d]
        vocab_social_counts = relevant_social_var_counts.loc[vocab]
        # diffusion = diffusion_exact(vocab_social_counts, relevant_social_word_counts, vocab_tf)
        diffusion = diffusion_approx(vocab_social_counts, relevant_social_word_counts, vocab_tf)
        diffusion = pd.DataFrame(diffusion, index=vocab, columns=[d])
        print('got diffusion %s' % (diffusion))
        # replace inf and NaN values?
        # unvectorized version, kept for reference
        # for i, v in enumerate(vocab):
        #     v_tf = tf[d].loc[v]
        #     if (v_tf > 0):
        #         v_social_count = relevant_social_var_counts.loc[v]
        #         # compute social val sum
        #         denom = (1 - math.e**(-v_tf * relevant_social_word_counts)).sum()
        #         diffusion = v_social_count / denom
        #     else:
        #         diffusion = 0
        #     all_diffusion_vals[d][v] = diffusion
        #     if (i % 100 == 0):
        #         print('processed %d vocab' % (i))
        #     if (i >= cutoff):
        #         break
        # write to file!
        out_fname = os.path.join(data_dir, '%s_%s_diffusion.tsv' % (d, social_var))
        diffusion.to_csv(out_fname, sep='\t')
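# Sketch (an assumption, not the original diffusion_exact helper): the commented-out
# loop above implies that diffusion for word w in month d is U_w / sum_s (1 - exp(-f_w * n_s)),
# where U_w is the number of distinct social units (users/subreddits/threads) using w,
# f_w its normalized frequency, and n_s the word count of social unit s. A vectorized
# version of that exact form could look like this:
def _diffusion_exact_sketch(social_counts, social_word_counts, tf):
    import numpy as np
    import pandas as pd
    # assumes social_counts and tf are Series aligned on the same vocab index
    # expected number of social units that would contain each word under random mixing
    denom = (1. - np.exp(-np.outer(tf.values, social_word_counts.values))).sum(axis=1)
    diffusion = np.zeros(len(tf))
    pos = tf.values > 0
    diffusion[pos] = social_counts.values[pos] / denom[pos]
    return pd.Series(diffusion, index=tf.index)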
def main():
    parser = ArgumentParser()
    parser.add_argument('--data_dir', default='../../data/frequency')
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    data_dir = args.data_dir
    out_dir = args.out_dir
    # collect data
    vocab = get_default_vocab()
    tf = pd.read_csv(os.path.join(data_dir, '2013_2016_tf_norm_log.tsv'),
                     sep='\t', index_col=0)
    D_L = pd.read_csv(os.path.join(data_dir, '2013_2016_3gram_residuals.tsv'),
                      sep='\t', index_col=0).loc[vocab, :].fillna(0, inplace=False)
    D_U = pd.read_csv(os.path.join(data_dir, '2013_2016_user_diffusion_log.tsv'),
                      sep='\t', index_col=0).loc[vocab, :].fillna(0, inplace=False)
    D_S = pd.read_csv(os.path.join(data_dir, '2013_2016_subreddit_diffusion_log.tsv'),
                      sep='\t', index_col=0).loc[vocab, :].fillna(0, inplace=False)
    D_T = pd.read_csv(os.path.join(data_dir, '2013_2016_thread_diffusion_log.tsv'),
                      sep='\t', index_col=0).loc[vocab, :].fillna(0, inplace=False)
    # growth_words = get_growth_words()
    # growth_decline_words, split_points = get_growth_decline_words_and_params()
    success_words, fail_words, split_points = get_success_fail_words()
    split_points = split_points.apply(lambda x: int(ceil(x)))
    # organize into survival df
    combined_words = fail_words + success_words
    V = len(combined_words)
    deaths = pd.Series(pd.np.zeros(V), index=combined_words)
    deaths.loc[fail_words] = 1
    N = tf.shape[1]
    split_points_combined = pd.concat([
        split_points,
        pd.Series([N, ] * len(success_words), index=success_words)
    ], axis=0)
    covariates = [tf, D_L, D_U, D_S, D_T]
    covariate_names = ['f', 'D_L', 'D_U', 'D_S', 'D_T']
    survival_df = build_survival_df(fail_words, success_words,
                                    split_points_combined, covariates,
                                    covariate_names)
    survival_df_nan = survival_df[survival_df.isnull().any(axis=1)]
    # full timeframe test:
    # fit regression using all covariates and all data up to and including time of death
    scaler = StandardScaler()
    survival_df_norm = survival_df.copy()
    survival_df_norm[covariate_names] = scaler.fit_transform(
        survival_df_norm[covariate_names])
    cox_model = CoxPHFitter()
    event_var = 'death'
    time_var = 't'
    cox_model.fit(survival_df_norm, time_var, event_col=event_var)
    regression_output_file = os.path.join(out_dir, 'cox_regression_all_data.txt')
    orig_stdout = sys.stdout
    with open(regression_output_file, 'w') as regression_output:
        sys.stdout = regression_output
        cox_model.print_summary()
    sys.stdout = orig_stdout
    # fixed timeframe test:
    # fit regression using all covariates and only data up to the first m months
    m = 3
    death_words = list(fail_words)
    right_censored_words = list(success_words)
    combined_words = death_words + right_censored_words
    fixed_death_times = pd.Series(pd.np.repeat(m, len(combined_words)),
                                  index=combined_words)
    covariates = [tf, D_L, D_U, D_S, D_T]
    covariate_names = ['f', 'D_L', 'D_U', 'D_S', 'D_T']
    survival_df = build_survival_df(death_words, right_censored_words,
                                    fixed_death_times, covariates,
                                    covariate_names)
    # now provide the actual death/censorship times
    N = tf.shape[1]
    death_times = pd.concat([
        split_points.loc[death_words],
        pd.Series([N, ] * len(right_censored_words), index=right_censored_words)
    ], axis=0)
    survival_df['t'] = death_times
    cox_model = CoxPHFitter()
    survival_df.loc[:, covariate_names] = scaler.fit_transform(
        survival_df.loc[:, covariate_names])
    cox_model.fit(survival_df, time_var, event_col=event_var)
    regression_output_file = os.path.join(out_dir, 'cox_regression_first_%d.txt' % (m))
    orig_stdout = sys.stdout
    with open(regression_output_file, 'w') as regression_output:
        sys.stdout = regression_output
        cox_model.print_summary()
    sys.stdout = orig_stdout
    # concordance values:
    # set up multiple models with different feature sets,
    # then run 10-fold cross-validation to generate concordance scores
    # and plot distributions
    cv = 10
    covariate_sets = [['f'], ['f', 'D_L'], ['f', 'D_U', 'D_S', 'D_T'],
                      ['f', 'D_L', 'D_U', 'D_S', 'D_T']]
    covariate_set_names = ['f', 'f+L', 'f+S', 'f+L+S']
    covariate_set_scores = {}
    for covariate_set, covariate_set_name in izip(covariate_sets, covariate_set_names):
        survival_df_relevant = survival_df.loc[:, covariate_set + [time_var, event_var]]
        cox_model = CoxPHFitter()
        scores = k_fold_cross_validation(cox_model, survival_df_relevant,
                                         time_var, event_col=event_var, k=cv)
        covariate_set_scores[covariate_set_name] = scores
    covariate_set_scores = pd.DataFrame(covariate_set_scores).transpose()
    score_names = ['score_%d' % (i) for i in range(cv)]
    covariate_set_scores.columns = score_names
    # significance test between f and f+L, f+S, f+L+S concordance scores
    pval_thresh = 0.05
    baseline_scores = covariate_set_scores.loc['f', score_names]
    covariate_test_names = ['f+L', 'f+S', 'f+L+S']
    # Bonferroni correction: alpha / 3
    pval_corrected = pval_thresh / len(covariate_test_names)
    covariate_set_scores.loc[:, 'pval_thresh'] = pval_corrected
    for covariate_test_name in covariate_test_names:
        covariate_test_scores = covariate_set_scores.loc[covariate_test_name, score_names]
        t_stat, pval = ttest_ind(covariate_test_scores, baseline_scores, equal_var=False)
        covariate_set_scores.loc[covariate_test_name, 't_test'] = t_stat
        covariate_set_scores.loc[covariate_test_name, 'pval'] = pval
    # write to file
    out_file = os.path.join(
        out_dir, 'cox_regression_concordance_%d_fold_scores.tsv' % (cv))
    covariate_set_scores.to_csv(out_file, sep='\t', index=True)
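# Sketch (an assumption about the helper): build_survival_df is used above as if it
# returns one row per word with the covariate columns ('f', 'D_L', 'D_U', 'D_S', 'D_T'),
# a duration column 't' (split point for failed words, full timeframe N for
# right-censored ones), and an event column 'death' (1 = failed, 0 = censored).
# How the covariates are aggregated over time is not visible above; this sketch
# assumes a mean over the months up to each word's event time.
def _build_survival_df_sketch(death_words, censored_words, event_times,
                              covariates, covariate_names):
    import pandas as pd
    words = list(death_words) + list(censored_words)
    cols = {}
    for name, cov in zip(covariate_names, covariates):
        # average each covariate over the months preceding the event (assumption)
        cols[name] = pd.Series({w: cov.loc[w].iloc[:int(event_times.loc[w])].mean()
                                for w in words})
    df = pd.DataFrame(cols).loc[words]
    df['t'] = event_times.loc[words]
    df['death'] = pd.Series([1] * len(death_words) + [0] * len(censored_words),
                            index=words)
    return df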
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--comment_files', nargs='+', default=None)
    parser.add_argument('--out_dir', default='../../data/frequency/')
    parser.add_argument('--social_vars', nargs='+',
                        # default=['user', 'thread', 'subreddit'])
                        # default=['user'])
                        # default=['thread'])
                        default=['subreddit'])
    args = parser.parse_args()
    comment_files = args.comment_files
    out_dir = args.out_dir
    social_vars = args.social_vars
    if (comment_files is None):
        data_dir = '/mnt/new_hg190/corpora/reddit_comment_data/monthly_submission/'
        years = ['2015', '2016']
        comment_files = get_all_comment_files(data_dir, years)
        print('comment files %s' % (str(comment_files)))
    # use the clean, normalized versions of the comment files
    comment_files = [f.replace('.bz2', '_normalized.bz2') for f in comment_files]
    meta_files = [f.replace('.bz2', '_meta.bz2') for f in comment_files]
    # print('got meta files %s' % (meta_files))
    # TODO: start small, eventually move to rest of files
    # comment_files = comment_files[3:]
    # comment_files = comment_files[1:]
    # for testing
    # social_vars = social_vars[:1]
    vocab = get_default_vocab()
    # chunk_size = 1000
    # chunk_size = 5000
    # chunk_size = len(vocab)
    # chunks = int(len(vocab) / chunk_size)
    # vocab_chunks = [vocab[i*chunk_size:i*chunk_size+chunk_size]
    #                 for i in xrange(chunks)]
    # start small
    # top_vocab = 1000
    top_vocab = 100000
    stopwords = get_default_stopwords()
    # text is already whitespace-separated, so a whitespace tokenizer suffices
    tokenizer = WhitespaceTokenizer()
    ngram_range = (1, 1)
    min_df = 1
    cv = CountVectorizer(encoding='utf-8',
                         lowercase=True,
                         tokenizer=tokenizer.tokenize,
                         stop_words=stopwords,
                         ngram_range=ngram_range,
                         min_df=min_df,
                         # max_features=top_vocab,
                         vocabulary=vocab,
                         # binarize to save space b/c we only care about cooccurrence
                         binary=True)
    out_dir = args.out_dir
    # min number of comments within a social value for it to count
    # social_comment_thresh = 10
    social_comment_thresh = 1
    for comment_file, meta_file in izip(comment_files, meta_files):
        print('processing comment file %s and meta file %s' % (comment_file, meta_file))
        date_str = re.findall(r'201[0-9]-[0-9]+', comment_file)[0]
        for social_var in social_vars:
            # use for full dtm
            # out_fname = os.path.join(out_dir, '%s_%s_dtm' % (date_str, social_var))
            out_fname = os.path.join(out_dir, '%s_%s_unique.tsv' % (date_str, social_var))
            # for each vocab chunk in list, get unique social counts!
            # for vocab in vocab_chunks:
            print('got vocab size %d' % (len(vocab)))
            social_word_counts = get_social_word_counts(social_var,
                                                        vocab,
                                                        comment_file,
                                                        meta_file,
                                                        comment_thresh=social_comment_thresh)
            # write to file
            social_word_counts = pd.DataFrame(social_word_counts, index=vocab)
            social_word_counts.to_csv(out_fname, sep='\t', header=False)
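# Rough sketch (an assumption, not the original helper): based on the '_unique.tsv'
# output and how the counts are consumed later as per-word unique social counts,
# get_social_word_counts is taken to return, for each vocab word, the number of
# distinct values of social_var (user / thread / subreddit) whose comments contain
# the word, counting only social values with at least comment_thresh comments.
# A toy in-memory version over a (text, social value) comment table:
def _unique_social_counts_sketch(comments, vocab, social_var, comment_thresh=1):
    import pandas as pd
    # comments: DataFrame with one row per comment and columns 'text' and social_var
    comment_counts = comments.groupby(social_var).size()
    active = set(comment_counts[comment_counts >= comment_thresh].index)
    relevant = comments[comments[social_var].isin(active)]
    vocab_set = set(vocab)
    counts = {w: 0 for w in vocab}
    for social_val, group in relevant.groupby(social_var):
        words_used = set(' '.join(group['text']).split()) & vocab_set
        for w in words_used:
            counts[w] += 1
    return pd.Series(counts)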
def main():
    parser = ArgumentParser()
    parser.add_argument('--data_dir', default='../../data/frequency')
    parser.add_argument('--match_stat',
                        default='../../data/frequency/2013_2016_tf_norm_log.tsv')
    parser.add_argument('--plot_stat',
                        default='../../data/frequency/2013_2016_3gram_residuals.tsv')
    parser.add_argument('--tag_pcts',
                        default='../../data/frequency/2013_2016_tag_pcts.tsv')
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    data_dir = args.data_dir
    match_stat_file = args.match_stat
    plot_stat_file = args.plot_stat
    tag_pct_file = args.tag_pcts
    out_dir = args.out_dir
    growth_words = get_growth_words()
    decline_words, split_points = get_growth_decline_words_and_params()
    split_points = split_points.apply(lambda x: int(ceil(x)))
    vocab = get_default_vocab()
    # match_stat = pd.read_csv(os.path.join(data_dir, '2013_2016_tf_norm.tsv'), sep='\t', index_col=0).loc[vocab, :]
    match_stat = pd.read_csv(match_stat_file, sep='\t', index_col=0)
    DL = pd.read_csv(plot_stat_file, sep='\t', index_col=0)
    min_diff_pct = 0
    # match on split point
    # k = 1
    # match_diffs = match_words_split_points(decline_words, growth_words, match_stat, split_points, k, min_diff_pct, replace=False)
    # match on first k months of data
    k = 12
    match_diffs = match_word_diffs_all_pairs(decline_words, growth_words, match_stat,
                                             k, min_diff_pct=min_diff_pct)
    # tag_estimates = pd.read_csv(os.path.join(data_dir, '2013_2016_tag_pcts.tsv'), sep='\t', index_col=0).apply(lambda x: x.argmax(), axis=1)
    # use tag estimates without proper nouns
    tag_estimates = pd.read_csv(tag_pct_file, sep='\t', index_col=0).drop(
        '^', inplace=False, axis=1).apply(lambda x: x.argmax(), axis=1)
    decline_words_matched = match_diffs.loc[:, 'word'].tolist()
    growth_words_matched = match_diffs.loc[:, 'match'].tolist()
    split_points_ordered = split_points.loc[decline_words_matched]
    split_points_growth = pd.Series(split_points_ordered)
    split_points_growth.index = growth_words_matched
    combined_words = decline_words_matched + growth_words_matched
    tag_estimates_combined = tag_estimates.loc[combined_words]
    tag_list = []
    growth_vals = []
    decline_vals = []
    ttest_results = pd.DataFrame()
    min_count = 5
    DL_k = DL.iloc[:, 1:k]
    for t, group in tag_estimates_combined.groupby(tag_estimates_combined):
        decline_relevant = list(group.index & set(decline_words_matched))
        growth_relevant = list(group.index & set(growth_words_matched))
        if ((len(decline_relevant) >= min_count) and (len(growth_relevant) >= min_count)):
            tag_list.append(t)
            # get mean DL values per word over the first k months
            decline_DL = DL_k.loc[decline_relevant, :].mean(axis=1)
            growth_DL = DL_k.loc[growth_relevant, :].mean(axis=1)
            decline_vals.append(decline_DL)
            growth_vals.append(growth_DL)
            # t-test for significance
            tval, pval = ttest_ind(growth_DL, decline_DL, equal_var=False)
            pval /= 2  # divide by two because one-sided
            # track means, t-val, p-val
            ttest_results_ = pd.Series({
                'POS_tag': t,
                'growth_DL_mean': growth_DL.mean(),
                'growth_DL_sd': growth_DL.std(),
                'growth_DL_N': len(growth_DL),
                'decline_DL_mean': decline_DL.mean(),
                'decline_DL_sd': decline_DL.std(),
                'decline_DL_N': len(decline_DL),
                't': tval,
                'p': pval,
            })
            ttest_results = ttest_results.append(ttest_results_, ignore_index=True)
    name_1 = 'growth'
    name_2 = 'decline'
    xlabel = 'POS tag'
    ylabel = '$D^{L}$'
    ylim = (-1., 0.5)
    # TACL size
    tick_size = 15
    # NWAV size
    # tick_size = 18
    # save t-test results to file first
    ttest_out_file = os.path.join(
        out_dir, '%s_vs_%s_matched_pos_DL_distribution_1_%d.tsv' % (name_1, name_2, k))
    ttest_results.to_csv(ttest_out_file, sep='\t', index=False)
    out_file = os.path.join(
        out_dir, '%s_vs_%s_matched_pos_DL_distribution_1_%d.pdf' % (name_1, name_2, k))
    # convert tag list to human-readable meanings
    tag_meanings = pd.read_csv('../../data/metadata/tag_meaning.tsv',
                               sep='\t', index_col=0).applymap(
                                   lambda x: x.split('/')[0].replace(' ', '\n'))
    tag_list = [tag_meanings.loc[t, 'meaning'] for t in tag_list]
    # plot boxes
    color_1 = 'b'
    color_2 = 'r'
    linestyle_1 = '--'
    linestyle_2 = '-'
    # TACL size
    # label_size = 18
    # NWAV size
    label_size = 28
    compare_boxplots(growth_vals, decline_vals, tag_list, xlabel, ylabel,
                     name_1, name_2, color_1=color_1, color_2=color_2,
                     linestyle_1=linestyle_1, linestyle_2=linestyle_2,
                     label_size=label_size, tick_size=tick_size, ylim=ylim)
    # add xticks
    x_offset = 0.25
    x_positions = pd.np.arange(len(tag_list)) + x_offset
    plt.xticks(x_positions, tag_list, fontsize=tick_size)

    # add significance stars as brackets between boxes
    def bracket_text(x1_bracket, x2_bracket, y_bracket, x_txt, y_txt, text,
                     fraction=0.2, textsize=12, bracket_color='black'):
        connection_style = 'bar, fraction=%.2f' % (fraction)
        arrowprops = dict(arrowstyle='-', ec=bracket_color,
                          connectionstyle=connection_style)
        plt.annotate('', xy=(x1_bracket, y_bracket), xycoords='data',
                     xytext=(x2_bracket, y_bracket), textcoords='data',
                     arrowprops=arrowprops)
        plt.text(x_txt, y_txt, text, rotation=0., size=textsize, weight='bold')

    pval_upper = 0.05
    # ttest_results is a data frame with one row per plotted tag
    x_positions_significant = [
        x_positions[i] for i in range(len(x_positions))
        if ttest_results.iloc[i, :].loc['p'] < pval_upper
    ]
    bracket_y = max(max(map(max, growth_vals)), max(map(max, decline_vals)))
    bracket_x_offset = 0.25
    text_x_offset = -0.025
    text_y_offset = 0.1
    fraction = 0.3
    annotate_txt = '*'
    annotate_txt_size = 15
    for x_position in x_positions_significant:
        bracket_x1 = x_position - bracket_x_offset
        bracket_x2 = x_position + bracket_x_offset
        x_txt = (bracket_x1 + bracket_x2) / 2. + text_x_offset
        y_txt = bracket_y + text_y_offset
        bracket_text(bracket_x1, bracket_x2, bracket_y, x_txt, y_txt, annotate_txt,
                     fraction=fraction, textsize=annotate_txt_size)
    # update xlim to fit labels and boxes
    xmin = x_positions.min() - x_offset * 2.
    xmax = x_positions.max() + x_offset * 2.
    plt.xlim(xmin, xmax)
    plt.tight_layout()
    # remove border but keep axes
    plt.axis('on')
    # plt.box(on=False)
    plt.savefig(out_file)