def test_parse_input(self):
    """Check parse.parse_input on both fixture files.

    parse_input returns a 5-tuple: (samp_ids, var_names, df, n_var, n_samp).

    Fixed: the old version compared element-by-element with
    ``for i in range(len(results[0])): assert results[0][i] == [...][i]``,
    which rebuilt the literal list on every iteration and — because the loop
    body never runs for an empty/short result — silently PASSED when the
    parser returned fewer entries than expected.  Whole-list equality also
    pins the length.
    """
    expected_ids2 = [
        '100716FG.C.1.RL', '100804MB.C.1.RL', '100907LG.C.1.RL',
        '101007PC.C.1.RL', '101018WG.N.1.RL', '101019AB.N.1.RL',
        '101026RM.C.1.RL', '101109JD.N.1.RL', '101206DM.N.1.RL',
        '110222MG.C.1.RL', '110228CJ.N.1.RL', '110308DK.C.1.RL',
        '110314CS.N.1.RL', '110330DS.C.1.RL', '110406MB.C.1.RL',
        '110412ET.C.1.RL', '110418ML.C.1.RL', '110420JR.C.1.RL',
        '110502BC.N.1.RL', '110523CB.C.1.RL', '110601OG.C.1.RL',
        '110720BB.C.1.RL', '110727MK.C.1.RL', '110801EH.C.1.RL',
        '110808JB.N.1.RL', '110921AR.C.1.RL', '111003JG.C.1.RL',
        '111115WK.C.1.RL']

    results = parse.parse_input(self.f2type, self.samp_var2_fp,
                                self.startcol2, self.endcol2,
                                self.delimiter2, self.skip2)
    # samp_ids, var_names, n_var, n_samp must all match exactly
    assert list(results[0]) == expected_ids2
    assert list(results[1]) == ['glutamic_acid', 'glycine']
    assert results[3] == 2
    assert results[4] == 28

    expected_ids1 = [
        '101019AB.N.1.RL', '110228CJ.N.1.RL', '110314CS.N.1.RL',
        '110502BC.N.1.RL', '110808JB.N.1.RL', '101018WG.N.1.RL',
        '101109JD.N.1.RL', '101206DM.N.1.RL', '100907LG.C.1.RL',
        '110308DK.C.1.RL', '110412ET.C.1.RL', '110418ML.C.1.RL',
        '110601OG.C.1.RL', '110720BB.C.1.RL', '110727MK.C.1.RL',
        '110801EH.C.1.RL', '110921AR.C.1.RL', '111003JG.C.1.RL',
        '111115WK.C.1.RL', '100804MB.C.1.RL', '100716FG.C.1.RL',
        '101007PC.C.1.RL', '101026RM.C.1.RL', '110222MG.C.1.RL',
        '110330DS.C.1.RL', '110406MB.C.1.RL', '110420JR.C.1.RL',
        '110523CB.C.1.RL']
    expected_vars1 = [
        'k__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__',
        'k__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobacterium',
        'k__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter',
        'k__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanothermobacter',
        'k__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanocellales;f__Methanocellaceae;g__Methanocella']

    results = parse.parse_input(self.f1type, self.samp_var1_fp,
                                self.startcol1, self.endcol1,
                                self.delimiter1, self.skip1)
    assert list(results[0]) == expected_ids1
    assert list(results[1]) == expected_vars1
    assert results[3] == 5
    assert results[4] == 28
def test_process_df(self):
    """Verify parse.process_df output against the stored reference results.

    For each fixture file, every matrix returned by process_df must agree
    (to within assert_almost_equal tolerance) with the corresponding entry
    in self.process_df_results.
    """
    samp_ids, var_names, df, n_var, n_samp = parse.parse_input(
        self.f1type, self.samp_var1_fp, self.startcol1, self.endcol1,
        self.delimiter1, self.skip1)
    for idx, actual in enumerate(parse.process_df(df, samp_ids)):
        assert_almost_equal(self.process_df_results['1'][idx], actual)

    samp_ids, var_names, df, n_var, n_samp = parse.parse_input(
        self.f2type, self.samp_var2_fp, self.startcol2, self.endcol2,
        self.delimiter2, self.skip2)
    # looser tolerance (decimal=-5) for the second dataset, as before
    for idx, actual in enumerate(parse.process_df(df, samp_ids)):
        assert_almost_equal(self.process_df_results['2'][idx], actual,
                            decimal=-5)
def calculate_cutie(input_config_fp):
    """
    Computes pairwise correlations between each variable pair and
    for the significant correlations, recomputes correlation for each pair
    after iteratively excluding n observations, differentiating true and
    false correlations on the basis of whether the correlation remains
    significant when each individual observation is dropped.

    Parameters
    ----------
    input_config_fp : str
        Path to the config file consumed by parse.parse_config(); all
        run settings (file paths, statistic, thresholds, output dirs)
        come from there.
    """
    # unpack config variables
    (samp_var1_fp, delimiter1, samp_var2_fp, delimiter2, f1type, f2type,
        working_dir, skip1, skip2, startcol1, endcol1, startcol2, endcol2,
        param, statistic, corr_compare, resample_k, paired, overwrite, alpha,
        multi_corr, fold, fold_value, graph_bound,
        fix_axis) = parse.parse_config(input_config_fp)

    # create subfolder to hold data analysis files; refuse to clobber an
    # existing run unless overwrite is set
    if os.path.exists(working_dir) is not True:
        os.makedirs(working_dir)
    elif overwrite is not True:
        print('Working directory already exists, exiting.')
        sys.exit()

    if os.path.exists(working_dir + 'data_processing') is not True:
        os.makedirs(working_dir + 'data_processing')
    elif overwrite is not True:
        print('Data_processing directory already exists, exiting.')
        sys.exit()

    # initialize and write log file
    start_time = time.process_time()
    log_fp = output.init_log(working_dir, input_config_fp)

    ###
    # Parsing and Pre-processing
    ###

    # define possible stats; the 'r'-prefixed names select reverse-CUtIe
    forward_stats = ['pearson', 'spearman', 'kendall']
    reverse_stats = ['rpearson', 'rspearman', 'rkendall']
    all_stats = forward_stats + reverse_stats
    pearson_stats = ['pearson', 'rpearson']
    spearman_stats = ['spearman', 'rspearman']
    kendall_stats = ['kendall', 'rkendall']

    if statistic not in all_stats:
        raise ValueError('Invalid statistic: %s chosen' % statistic)

    if corr_compare and resample_k != 1:
        raise ValueError('Resample_k must be 1 for pointwise stats')

    # file handling and parsing decisions
    # file 1 is the 'dominant' file type and should always contain the OTU file
    # we let the dominant fil 'override' the sample_id list ordering
    samp_ids2, var2_names, samp_var2_df, n_var2, n_samp = parse.parse_input(
        f2type, samp_var2_fp, startcol2, endcol2, delimiter2, skip2)
    output.write_log('The length of variables for file 2 is '
                     + str(n_var2), log_fp)
    output.write_log('The number of samples for file 2 is '
                     + str(n_samp), log_fp)
    output.write_log('The md5 of samp_var2 was ' + \
        str(parse.md5_checksum(samp_var2_fp)), log_fp)

    samp_ids1, var1_names, samp_var1_df, n_var1, n_samp = parse.parse_input(
        f1type, samp_var1_fp, startcol1, endcol1, delimiter1, skip1)
    output.write_log('The length of variables for file 1 is '
                     + str(n_var1), log_fp)
    output.write_log('The number of samples for file 1 is '
                     + str(n_samp), log_fp)
    output.write_log('The md5 of samp_var1 was ' + \
        str(parse.md5_checksum(samp_var1_fp)), log_fp)

    # if the samp_ids differ, only take common elements
    # (order follows file 1, the dominant file)
    samp_ids = [value for value in samp_ids1 if value in samp_ids2]
    n_samp = len(samp_ids)

    # subset dataframe, obtain avg and variance
    samp_var1 = parse.process_df(samp_var1_df, samp_ids)
    samp_var2 = parse.process_df(samp_var2_df, samp_ids)

    # printing of samp and var names for reference
    output.write_log('There are ' + str(len(samp_ids)) + ' samples', log_fp)
    output.write_log('The first 3 samples are ' + str(samp_ids[0:3]), log_fp)
    if len(var1_names) >= 3:
        output.write_log('The first 3 var1 are ' + str(var1_names[0:3]),
                         log_fp)
    else:
        output.write_log('Var1 was ' + str(var1_names), log_fp)
    if len(var2_names) >= 3:
        output.write_log('The first 3 var2 are ' + str(var2_names[0:3]),
                         log_fp)
    else:
        output.write_log('Var2 was ' + str(var2_names), log_fp)

    ###
    # Pearson, Spearman, Kendall
    ###

    # initial output: matrices of p-values, correlations, r^2 values
    pvalues, corrs, r2vals = statistics.assign_statistics(
        samp_var1, samp_var2, statistic, pearson_stats, spearman_stats,
        kendall_stats, paired)

    # determine parameter (either r or p)
    output.write_log('The parameter chosen was ' + param, log_fp)

    # determine significance threshold and number of correlations
    if param == 'p':
        output.write_log('The type of mc correction used was '
                         + multi_corr, log_fp)
    threshold, n_corr, minp = statistics.set_threshold(pvalues, param, alpha,
                                                       multi_corr, paired)
    output.write_log('The threshold value was ' + str(threshold), log_fp)

    # calculate initial sig candidates
    initial_corr, all_pairs = statistics.get_initial_corr(
        n_var1, n_var2, pvalues, corrs, threshold, param, paired)

    # change initial_corr if doing rCUtIe: start from the complement
    # (the initially non-significant pairs)
    if statistic in reverse_stats:
        initial_corr = set(all_pairs).difference(initial_corr)

    output.write_log('The length of initial_corr is '
                     + str(len(initial_corr)), log_fp)

    # if interested in evaluating dffits, dsr, etc.
    # NOTE(review): region_sets starts as a list but is replaced by a dict
    # keyed by str(region) when corr_compare is True
    region_sets = []
    if corr_compare:
        infln_metrics = ['cutie_1pc', 'cookd', 'dffits', 'dsr']
        infln_mapping = {
            'cutie_1pc': statistics.resample1_cutie_pc,
            'cookd': statistics.cookd,
            'dffits': statistics.dffits,
            'dsr': statistics.dsr
        }
        (FP_infln_sets, region_combs,
         region_sets) = statistics.pointwise_comparison(
            infln_metrics, infln_mapping, samp_var1, samp_var2, initial_corr,
            threshold, fold_value, fold, param)

        for region in region_combs:
            output.write_log('The amount of unique elements in set '
                             + str(region) + ' is '
                             + str(len(region_sets[str(region)])), log_fp)

        # report results
        for metric in infln_metrics:
            metric_FP = FP_infln_sets[metric]
            output.write_log('The number of false correlations according to '
                             + metric + ' is ' + str(len(metric_FP)), log_fp)
            output.write_log('The number of true correlations according to '
                             + metric + ' is '
                             + str(len(initial_corr) - len(metric_FP)),
                             log_fp)

    # return sets of interest; some of these will be empty dicts depending
    # on the statistic
    (true_corr, true_corr_to_rev, false_corr_to_rev, corr_extrema_p,
        corr_extrema_r, samp_counter, var1_counter, var2_counter,
        exceeds_points, rev_points) = statistics.update_cutiek_true_corr(
        initial_corr, samp_var1, samp_var2, pvalues, corrs, threshold,
        statistic, forward_stats, reverse_stats, resample_k, fold,
        fold_value, param)

    ###
    # Determine indicator matrices
    ###

    # element i,j is -1 if flagged by CUtIe as FP, 1 if TP,
    # and 0 if insig originally
    true_indicators = utils.return_indicators(
        n_var1, n_var2, initial_corr, true_corr, resample_k)

    true_rev_indicators = utils.return_indicators(
        n_var1, n_var2, initial_corr, true_corr_to_rev, resample_k)

    false_rev_indicators = utils.return_indicators(
        n_var1, n_var2, initial_corr, false_corr_to_rev, resample_k)

    if corr_compare:
        metric_set_to_indicator = {}
        keys = []  # NOTE(review): unused; kept for byte-compatibility
        for region in region_sets:
            temp_dict = {}
            # pairs NOT flagged by this region's metric combination
            region_truths = set(initial_corr).difference(region_sets[region])
            temp_dict['1'] = region_truths
            metric_set_to_indicator[region] = utils.return_indicators(
                n_var1, n_var2, initial_corr, temp_dict, 1)['1']

    ###
    # Report statistics
    ###

    # one summary per resampling depth (keys are '1'..str(resample_k))
    for k in range(resample_k):
        resample_key = str(k+1)

        # for Spearman and MIC, R2 value stored is same as rho or MIC
        # respectively
        p_ratio = np.divide(corr_extrema_p[resample_key], pvalues)
        r2_ratio = np.divide(corr_extrema_r[resample_key], r2vals)
        variables = [pvalues, corrs, r2vals, true_indicators[resample_key],
                     true_rev_indicators[resample_key],
                     false_rev_indicators[resample_key],
                     corr_extrema_p[resample_key],
                     corr_extrema_r[resample_key], p_ratio, r2_ratio]

        # forward vs reverse runs label the indicator columns differently
        if statistic in forward_stats:
            variable_names = ['pvalues', 'correlations', 'r2vals',
                              'indicators', 'TP_rev_indicators',
                              'FP_rev_indicators', 'extreme_p', 'extreme_r',
                              'p_ratio', 'r2_ratio']
        elif statistic in reverse_stats:
            variable_names = ['pvalues', 'correlations', 'r2vals',
                              'indicators', 'FN_rev_indicators',
                              'TN_rev_indicators', 'extreme_p', 'extreme_r',
                              'p_ratio', 'r2_ratio']

        # for pointwise
        if corr_compare:
            variable_names.extend(region_sets)
            for region in region_sets:
                variables.append(metric_set_to_indicator[region])

        # Output results, write summary df
        # NOTE(review): both branches make the identical call
        if statistic in forward_stats:
            summary_df = output.print_summary_df(
                n_var1, n_var2, variable_names, variables, working_dir,
                resample_key, n_corr, paired)
        elif statistic in reverse_stats:
            summary_df = output.print_summary_df(
                n_var1, n_var2, variable_names, variables, working_dir,
                resample_key, n_corr, paired)

        output.report_results(initial_corr, true_corr, true_corr_to_rev,
                              false_corr_to_rev, resample_key, log_fp)

    ###
    # Graphing
    ###

    # create subfolder to hold graphing files
    if os.path.exists(working_dir + 'graphs') is not True:
        os.makedirs(working_dir + 'graphs')

    output.graph_subsets(working_dir, var1_names, var2_names, f1type, f2type,
                         summary_df, statistic, forward_stats, resample_k,
                         initial_corr, true_corr, true_corr_to_rev,
                         false_corr_to_rev, graph_bound, samp_var1, samp_var2,
                         all_pairs, region_sets, corr_compare, exceeds_points,
                         rev_points, fix_axis)

    output.diag_plots(samp_counter, var1_counter, var2_counter, resample_k,
                      working_dir, paired)

    # write log file
    output.write_log('The runtime was '
                     + str(time.process_time() - start_time), log_fp)
    now = datetime.datetime.now()
    output.write_log('Ended logging at ' + str(now.isoformat()), log_fp)

    return
def create_json(label, samp_var1_fp, delimiter1, samp_var2_fp, delimiter2,
                f1type, f2type, working_dir, skip, startcol, endcol,
                statistic, resample_k, rm_zero, paired, alpha, mc,
                stat_names, stat_files, log_transform1, log_transform2):
    """Parse both input files and emit JSON matrices comparing which point
    pairs each influence statistic flags as false positives.

    Fixes:
    - ``time.clock()`` was removed in Python 3.8; replaced with
      ``time.process_time()`` (the counter used elsewhere in this file).
    - ``print time.clock() - start_time`` was a Python-2 print statement
      (a SyntaxError under Python 3); now a function call.
    - manual ``counter`` index loop replaced with ``zip``.

    NOTE(review): label, resample_k, f1type/f2type asymmetry and several
    unpacked matrices are currently unused — presumably retained for the
    legacy interface; verify against callers before pruning.
    """
    start_time = time.process_time()

    ###
    # Parsing and Pre-processing
    ###

    # create subfolder to hold data analysis files
    if os.path.exists(working_dir + 'data_processing') is not True:
        os.makedirs(working_dir + 'data_processing')

    # file handling and parsing decisions
    # file 1 is the 'dominant' file type and should always contain the OTU
    # file; we let the dominant file 'override' the sample_id list ordering
    samp_ids, var2_names, samp_to_var2, n_var2, n_samp = \
        parse.parse_input(f2type, samp_var2_fp, startcol, endcol,
                          delimiter2, skip)
    samp_ids, var1_names, samp_to_var1, n_var1, n_samp = \
        parse.parse_input(f1type, samp_var1_fp, startcol, endcol,
                          delimiter1, skip)

    # convert dictionaries to matrices
    samp_var1, avg_var1, norm_avg_var1, var_var1, norm_var_var1, skew_var1 = \
        parse.dict_to_matrix(samp_to_var1, samp_ids)
    samp_var2, avg_var2, norm_avg_var2, var_var2, norm_var_var2, skew_var2 = \
        parse.dict_to_matrix(samp_to_var2, samp_ids)

    ###
    # Simple Linear Regression: Spearman and Pearson
    ###

    pearson_stats = ['kpc', 'jkp', 'bsp', 'rpc', 'rjkp', 'rbsp']
    spearman_stats = ['ksc', 'jks', 'bss', 'rsc', 'rjks', 'rbss']
    linear_stats = pearson_stats + spearman_stats

    if statistic in linear_stats:
        # statistic-specific initial output
        stat_to_matrix = statistics.assign_statistics(samp_var1, samp_var2,
                                                      statistic, rm_zero)

        # unpack statistic matrices
        pvalues = stat_to_matrix['pvalues']
        corrs = stat_to_matrix['correlations']
        logpvals = stat_to_matrix['logpvals']
        r2vals = stat_to_matrix['r2vals']

        # determine significance threshold and number of correlations
        threshold, n_corr = statistics.set_threshold(pvalues, alpha, mc,
                                                     paired)

        # calculate initial sig candidates
        initial_sig, all_pairs = statistics.initial_sig_SLR(
            n_var1, n_var2, pvalues, threshold, paired)

        # def
        output.files_to_sets(stat_names, stat_files)
        # return infln_metrics, FP_infln_sets
        infln_metrics = [str(x) for x in stat_names.split(',')]
        stat_files = [str(x) for x in stat_files.split(',')]
        infln_files = {}
        FP_infln_sets = {}
        # each stats file holds tab-separated (var1, var2) index pairs,
        # one header line, one FP point per subsequent line
        for metric, stat_fp in zip(infln_metrics, stat_files):
            infln_files[metric] = stat_fp
            FP_infln_sets[metric] = set()
            with open(stat_fp) as f:
                f.readline()  # skip header
                for line in f:
                    if line:
                        parts = line.strip().split('\t')
                        point = (int(float(parts[0])), int(float(parts[1])))
                        FP_infln_sets[metric].add(point)

        # this is to test what is picked up by different statistics
        initial_sig = all_pairs

        output.print_json_matrix(n_var1, n_var2, n_corr, infln_metrics,
                                 FP_infln_sets, initial_sig, working_dir,
                                 paired, point=False)
        output.print_json_matrix(n_var1, n_var2, n_corr, infln_metrics,
                                 FP_infln_sets, initial_sig, working_dir,
                                 paired, point=True)

        # log transform of data (if log_transform1 or log_transform2 are true)
        if log_transform1 and statistic != 'prop':
            samp_var1 = statistics.log_transform(samp_var1, working_dir, 1)
        if log_transform2 and statistic != 'prop':
            samp_var2 = statistics.log_transform(samp_var2, working_dir, 2)

        # do set operations and determine which is unique to each grouping
        # e.g. comparing jkp3, jkpl, jkpn
        # and comparing jkp3, bsp3, and kpc

    print(time.process_time() - start_time)
    return
def _dataset(fp1, fp2=None, delim='\\t', ftype1='tidy', ftype2=None,
             skip1='0', skip2=None, startcol1='-1', endcol1='-1',
             startcol2=None, endcol2=None, paired='True', alpha='0.05',
             njobs=None):
    """Build one dataset parameter dict for gen_commands_configs.

    Every file-2 setting (fp2, ftype2, skip2, startcol2, endcol2) defaults
    to the corresponding file-1 value, which matches the original table
    where most datasets are correlated against themselves.  'njobs' is only
    present when explicitly given, preserving the original dict shape.
    Values are strings because they are written verbatim into config files.
    """
    params = {
        'samp_var1_fp': fp1,
        'samp_var2_fp': fp1 if fp2 is None else fp2,
        'delimiter1': delim,
        'delimiter2': delim,
        'f1type': ftype1,
        'f2type': ftype1 if ftype2 is None else ftype2,
        'skip1': skip1,
        'skip2': skip1 if skip2 is None else skip2,
        'startcol1': startcol1,
        'endcol1': endcol1,
        'startcol2': startcol1 if startcol2 is None else startcol2,
        'endcol2': endcol1 if endcol2 is None else endcol2,
        'paired': paired,
        'alpha': alpha,
    }
    if njobs is not None:
        params['njobs'] = njobs
    return params


def gen_commands_configs(fold_value, statistic, multi_corr, param, datasets,
                         corr_compare, cutie_fp, working_dir, output_dir):
    """Generate per-dataset (and per-job-chunk) CUtIe config and command
    files under output_dir, with working directories under working_dir.

    Fixes relative to the original:
    - config line ``delimiter1:`` was written from
      ``param_to_str['delimiter2']`` (wrong key; harmless only because the
      current table always uses equal delimiters).
    - bare ``try: os.makedirs(...) except: pass`` swallowed ALL errors;
      now ``os.makedirs(..., exist_ok=True)``.
    - ``try/except`` lookup of 'njobs' replaced with ``dict.get``.
    - the (0, 0)-placeholder col_tuples construction replaced with a direct
      cumulative-offset loop (same output windows).
    """
    # common path prefixes for the dataset table
    _IN = '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/'
    _DATA = '/sc/arion/projects/clemej05a/kevin/data/'

    data_to_params = {
        'hdac': _dataset(
            _DATA + 'HDAC_data/GSE15222_series_matrix_x100_del62.txt',
            ftype1='untidy'),
        'lungtx': _dataset(
            _DATA + 'lungtx_data/otu_table_L6_filt1e3.txt',
            fp2=_DATA + 'lungtx_data/Genes.KEGG.L3.add_counts.txt',
            ftype1='untidy', paired='False'),
        'lungc': _dataset(
            _DATA + 'pre_sparcc_MSQ/otu_table.MSQ34_L6.txt',
            ftype1='untidy', skip1='1'),
        'lungpt': _dataset(
            _DATA + 'lungpt_data/otu_table_MultiO_merged___L6.txt',
            fp2=_DATA + 'lungpt_data/'
                'Mapping.Pneumotype.Multiomics.RL.NYU.'
                'w_metabolites.w_inflamm.txt',
            ftype1='untidy', ftype2='tidy', skip1='1', skip2='0',
            startcol2='16', endcol2='99', paired='False'),
        'who': _dataset(_DATA + 'MINE_data/WHOfix.txt',
                        startcol1='2', endcol1='356'),
        'whonous': _dataset(_DATA + 'MINE_data/WHOnous.txt',
                            startcol1='2', endcol1='356'),
        'covidlong0': _dataset(_IN + 'covid_long_df_early.txt'),
        'covidlong1': _dataset(_IN + 'covid_long_df_moderate.txt'),
        'covidlong2': _dataset(_IN + 'covid_long_df_severe.txt'),
        'covidlongfull': _dataset(_IN + 'covid_long_df_full.txt'),
        'covidmod': _dataset(_IN + 'covid_moderate.txt'),
        'covidsev': _dataset(_IN + 'covid_severe.txt'),
        'baseball': _dataset(_IN + 'MINE/Baseballfix.txt'),
        'ad0': _dataset(_IN + 'mennonites/atopic_dz0.csv', delim=','),
        'ad1': _dataset(_IN + 'mennonites/atopic_dz1.csv', delim=','),
        'oom': _dataset(_IN + 'mennonites/OOM.csv', delim=','),
        'roc': _dataset(_IN + 'mennonites/ROC.csv', delim=','),
        'mennonites': _dataset(_IN + 'mennonites/df_mennonites.txt'),
        'covid': _dataset(_IN + 'df_covid.txt'),
        'airplane': _dataset(_DATA + 'MINE_data/2008_data.txt'),
        'ici': _dataset(_IN + 'df_pre.txt', fp2=_IN + 'df_ici.txt',
                        paired='False'),
        'liverf': _dataset(_IN + 'liver/df_liver_female_500.txt',
                           ftype1='untidy'),
        'liverm': _dataset(_IN + 'liver/df_liver_male_500.txt',
                           ftype1='untidy'),
        'micro': _dataset(_IN + 'MINE/Microbiome_fix_500.csv', delim=','),
        'hgoral': _dataset(_IN + 'df_oral.txt'),
        'hgstool': _dataset(_IN + 'df_stool.txt'),
        'crc': _dataset(_IN + 'df_CRC_otu.txt', fp2=_IN + 'df_CRC_cyto.txt',
                        paired='False'),
        'ibd': _dataset(_IN + 'df_nat_otu.txt', fp2=_IN + 'df_nat_meta.txt',
                        paired='False', alpha='0.0047'),
        'cell': _dataset(_IN + 'df_cell_Al.txt'),
        'nc': _dataset(_IN + 'natcom_fix.txt'),
        'plos': _dataset(_IN + 'plos_fungi.txt', fp2=_IN + 'plos_bact.txt',
                         paired='False'),
        'ca': _dataset(_IN + 'ca_otu.csv', fp2=_IN + 'ca_plasma.csv',
                       delim=',', paired='False'),
        'statin': _dataset(_IN + 'bmis/df_combined.csv', delim=',',
                           alpha='0.01128'),
        'spatial': _dataset(_IN + 'spatial/df_f1.csv', delim=','),
        'livermfull': _dataset(_IN + 'liver/df_liver_male.txt',
                               ftype1='untidy', paired='False', njobs=1000),
        'liverffull': _dataset(_IN + 'liver/df_liver_female.txt',
                               ftype1='untidy', paired='False', njobs=1000),
        'microfull': _dataset(_IN + 'MINE/Microbiome_fix.csv', delim=',',
                              paired='False', njobs=1000),
        'hdacfull': _dataset(
            _DATA + 'HDAC_data/GSE15222_series_matrix_full_del62.txt',
            ftype1='untidy', paired='False', njobs=10000),
    }

    fv = fold_value
    for data in datasets.split(','):
        param_to_str = data_to_params[data]
        # example fid: p_nomc_1_mine_False_lungtx
        f_id = '_'.join([param, multi_corr, fv, statistic, corr_compare,
                         data])

        # parse file 1 to learn how many columns are available for chunking
        ftype = param_to_str['f1type']
        samp_var_fp = param_to_str['samp_var1_fp']
        startcol = int(param_to_str['startcol1'])
        endcol = int(param_to_str['endcol1'])
        delimiter = param_to_str['delimiter1']
        skip = int(param_to_str['skip1'])
        samp_ids, var_names, samp_var_df, n_var, n_samp = parse.parse_input(
            ftype, samp_var_fp, startcol, endcol, delimiter, skip)

        njobs = param_to_str.get('njobs', 1)

        # create column windows, one per job
        if njobs > 1:
            # samp_var_df is always in tidy format and has already been
            # iloc'd; split into njobs roughly-equal column chunks and
            # turn the chunk widths into contiguous [start, end) windows
            chunks = np.array_split(samp_var_df, njobs, axis=1)
            col_tuples = []
            offset = 0
            for chunk in chunks:
                width = chunk.shape[1]
                col_tuples.append([offset, offset + width])
                offset += width
        else:
            col_tuples = [[param_to_str['startcol2'],
                           param_to_str['endcol2']]]

        for i in range(njobs):
            fid = f_id + '_' + str(i) if njobs > 1 else f_id

            out_dir = output_dir + fid + '/'
            os.makedirs(out_dir, exist_ok=True)
            working_outdir = working_dir + fid + '/'
            os.makedirs(working_outdir, exist_ok=True)

            # bug fix: 'delimiter1' previously copied param_to_str['delimiter2']
            config_lines = [
                '[input]',
                'samp_var1_fp: ' + param_to_str['samp_var1_fp'],
                'delimiter1: ' + param_to_str['delimiter1'],
                'samp_var2_fp: ' + param_to_str['samp_var2_fp'],
                'delimiter2: ' + param_to_str['delimiter2'],
                'f1type: ' + param_to_str['f1type'],
                'f2type: ' + param_to_str['f2type'],
                'skip1: ' + param_to_str['skip1'],
                'skip2: ' + param_to_str['skip2'],
                'startcol1: ' + param_to_str['startcol1'],
                'endcol1: ' + param_to_str['endcol1'],
                'startcol2: ' + str(col_tuples[i][0]),
                'endcol2: ' + str(col_tuples[i][1]),
                'paired: ' + param_to_str['paired'],
                '',
                '[output]',
                'working_dir: ' + working_outdir,
                'overwrite: True',
                '',
                '[stats]',
                'param: ' + param,
                'statistic: ' + statistic,
                'resample_k: 1',
                'alpha: ' + param_to_str['alpha'],
                'mc: ' + multi_corr,
                'fold: True',
                'fold_value: ' + fv,
                'corr_compare: ' + corr_compare,
                '',
                '[graph]',
                'graph_bound: 30',
                'fix_axis: False',
            ]
            with open(out_dir + 'config_' + fid + '.txt', 'w') as f:
                f.write('\n'.join(config_lines))

            with open(out_dir + 'commands_' + fid + '.txt', 'w') as f:
                f.write('export PYTHONPATH=$PYTHONPATH:/hpc/users/buk02/'
                        'tools/sandbox/lib/python3.7/site-packages/ && '
                        'python ' + cutie_fp + ' -i ' + out_dir + 'config_'
                        + fid + '.txt')