def main(args):
    # Parse input gcts
    external_gct = parse.parse(args.external_gct_path)
    internal_gct = parse.parse(args.internal_gct_path)
    bg_gct = parse.parse(args.bg_gct_path)

    # Meat of the script
    (sim_gct, conn_gct) = do_steep_and_sip(
        external_gct, internal_gct, bg_gct, args.similarity_metric,
        args.connectivity_metric,
        args.fields_to_aggregate_for_external_profiles,
        args.fields_to_aggregate_for_internal_profiles)

    # Write output gcts
    wg.write(sim_gct, args.out_steep_name, data_null="NaN",
             metadata_null="NaN", filler_null="NaN")
    wg.write(conn_gct, args.out_sip_name, data_null="NaN",
             filler_null="NaN", metadata_null="NaN")
def main(args):
    # Parse gct file
    gct = parse.parse(args.path_to_gct)

    # Parse mapping tsv file
    mapping = pd.read_csv(args.path_to_mapping_tsv, sep="\t", index_col=0)

    # Make sure the ids from the mapping file are unique
    duplicated_bool_array = mapping.index.duplicated()
    assert sum(duplicated_bool_array) == 0, (
        "ids in mapping file must be unique. duplicated ids in mapping:\n{}".format(
            mapping.index[duplicated_bool_array]))

    for col in mapping.columns:
        if args.row_and_or_col in ("both", "row"):
            annotate_meta_df(gct.row_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)
        if args.row_and_or_col in ("both", "col"):
            annotate_meta_df(gct.col_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)

    wg.write(gct, args.out_name, filler_null="NA", data_null="NaN",
             metadata_null="NA")
def main():
    # Get args
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)

    # Read the input gct
    in_gct = parse.parse(args.in_gct_path)

    # Read in each of the command line arguments
    rid = _read_arg(args.rid)
    cid = _read_arg(args.cid)
    exclude_rid = _read_arg(args.exclude_rid)
    exclude_cid = _read_arg(args.exclude_cid)

    # Slice the gct
    out_gct = sg.slice_gctoo(in_gct, rid=rid, cid=cid,
                             exclude_rid=exclude_rid,
                             exclude_cid=exclude_cid)
    assert out_gct.data_df.size > 0, "Slicing yielded an empty gct!"

    # Write the output gct
    if args.use_gctx:
        wgx.write(out_gct, args.out_name)
    else:
        wg.write(out_gct, args.out_name, data_null="NaN",
                 metadata_null="NA", filler_null="NA")
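# A minimal sketch of the _read_arg helper used above, assuming it accepts
# None, a list of ids, or a one-element list holding the path to a
# newline-delimited id file. This is hypothetical; the real helper in the
# codebase may behave differently. Relies on the module-level os import
# used throughout this file.
def _read_arg_sketch(arg):
    # None means no filtering was requested
    if arg is None:
        return None
    # A single existing file path: read one id per line
    if len(arg) == 1 and os.path.isfile(arg[0]):
        with open(arg[0]) as f:
            return [line.strip() for line in f if line.strip()]
    # Otherwise treat the argument as a literal list of ids
    return arg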
def write_output_gct(out_gct, out_gct_name, data_null, filler_null):
    wg.write(out_gct, out_gct_name, data_null=data_null,
             filler_null=filler_null, data_float_format=None)
def main(args):
    # Import data
    assert os.path.exists(args.in_gct_path), (
        "in_gct_path could not be found: {}".format(args.in_gct_path))
    in_gct = parse(args.in_gct_path)

    # First, check if any rows are all NaN; if so, remove them
    dropped_df = in_gct.data_df.dropna(how="all")
    bools_of_remaining = in_gct.data_df.index.isin(dropped_df.index.values)
    in_gct = sg.slice_gctoo(in_gct, row_bool=bools_of_remaining)

    # Replace remaining NaNs with zero, or with each probe's median or mean
    if args.replace_with == "zero":
        in_gct.data_df.fillna(0, inplace=True)

    elif args.replace_with == "median":
        probe_medians = in_gct.data_df.median(axis=1)
        for row_idx in range(in_gct.data_df.shape[0]):
            this_row = in_gct.data_df.iloc[row_idx, :]
            this_row[this_row.isnull()] = probe_medians[row_idx]
            in_gct.data_df.iloc[row_idx, :] = this_row

    elif args.replace_with == "mean":
        probe_means = in_gct.data_df.mean(axis=1)
        for row_idx in range(in_gct.data_df.shape[0]):
            this_row = in_gct.data_df.iloc[row_idx, :]
            this_row[this_row.isnull()] = probe_means[row_idx]
            in_gct.data_df.iloc[row_idx, :] = this_row

    wg.write(in_gct, args.out_name, filler_null="NA")
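# Note: the median/mean branches above are equivalent to a vectorized
# per-row fill; a sketch of the same operation (not a drop-in from this
# codebase):
#
#   in_gct.data_df = in_gct.data_df.apply(
#       lambda row: row.fillna(row.median()), axis=1)
#
# pandas computes row.median() over non-NaN entries only, which matches the
# probe_medians used in the loop.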
def test_main(self):
    out_name = os.path.join(FUNCTIONAL_TESTS_PATH, "test_main_out.gct")
    gctoo = GCToo.GCToo(data_df=self.data_df,
                        row_metadata_df=self.row_metadata_df,
                        col_metadata_df=self.col_metadata_df)
    wg.write(gctoo, out_name, data_null="NaN", metadata_null="-666",
             filler_null="-666")

    # Read in the gct and verify that it's the same as gctoo
    new_gct = pg.parse(out_name)
    pd.util.testing.assert_frame_equal(new_gct.data_df, gctoo.data_df)
    pd.util.testing.assert_frame_equal(new_gct.row_metadata_df,
                                       gctoo.row_metadata_df)
    pd.util.testing.assert_frame_equal(new_gct.col_metadata_df,
                                       gctoo.col_metadata_df)

    # Also check that missing values were written to the file as expected
    in_df = pd.read_csv(out_name, sep="\t", skiprows=2,
                        keep_default_na=False)
    self.assertEqual(in_df.iloc[0, 1], "-666")
    self.assertEqual(in_df.iloc[5, 6], "NaN")

    # Cleanup
    os.remove(out_name)
def main(args):
    """ The main method. """
    # Import gct
    in_gct = parse.parse(args.in_gct_path)

    # Create the separated gcts
    (out_gcts, out_gct_prefixes) = separate(in_gct, args.separate_field,
                                            args.row_or_col)

    # Save the returned gcts
    for gct, name in zip(out_gcts, out_gct_prefixes):
        full_out_name = os.path.join(
            args.out_dir,
            args.out_name_prefix + str(name) + args.out_name_suffix)

        # Write to GCT or GCTX depending on extension
        extension = os.path.splitext(full_out_name)[1].lower()
        if extension == ".gct":
            wg.write(gct, full_out_name, data_null="NaN",
                     metadata_null="NA", filler_null="NA")
        elif extension == ".gctx":
            wgx.write(gct, full_out_name)
        else:
            raise Exception(
                "out_name_suffix must end in either .gct or .gctx. "
                "out_name_suffix: {}".format(args.out_name_suffix))
def reader_writer(input_file, output_file, function, check_size=False):
    plate_failure = False

    # Read in input file
    gctoo = pe.parse(input_file)

    # Call normalizing function on gctoo
    new_gctoo = function(gctoo)
    new_gctoo = drop_nans(new_gctoo)

    if new_gctoo == 'empty_plate':
        logger.debug("{} has no usable data and has not been written.".format(
            os.path.basename(output_file)))
        plate_failure = True
        return plate_failure

    # If told to, check size of new_gctoo and flag if too small
    if new_gctoo.data_df.shape[1] <= 349 and check_size:
        logger.debug('{} Plate Failure With {} Failed Wells'.format(
            os.path.basename(os.path.dirname(input_file)),
            384 - new_gctoo.data_df.shape[1]))
        plate_failure = True

    # Write out new gctoo
    wgx.write(new_gctoo, out_fname=output_file)
    logger.debug("{} file written.".format(output_file))

    return plate_failure
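# A minimal sketch of the drop_nans contract assumed by reader_writer above:
# drop all-NaN columns and return the 'empty_plate' sentinel the caller
# checks for when nothing survives. Hypothetical; the real helper may differ.
def drop_nans_sketch(gctoo):
    kept = gctoo.data_df.dropna(axis=1, how="all")
    if kept.shape[1] == 0:
        return "empty_plate"
    return GCToo.GCToo(data_df=kept,
                       row_metadata_df=gctoo.row_metadata_df,
                       col_metadata_df=gctoo.col_metadata_df.loc[kept.columns])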
def main():
    # Get args
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)
    logger.debug("args: {}".format(args))

    # Get files directly
    if args.input_filepaths is not None:
        files = args.input_filepaths
    # Or find them
    else:
        files = get_file_list(args.file_wildcard)

    # No files found
    if len(files) == 0:
        msg = "No files were found. args.file_wildcard: {}".format(
            args.file_wildcard)
        logger.error(msg)
        raise Exception(msg)

    # Only 1 file found
    if len(files) == 1:
        logger.warning(
            "Only 1 file found. No concatenation needs to be done, exiting")
        return

    # More than 1 file found
    else:
        # Parse each file and append to a list
        gctoos = []
        for f in files:
            gctoos.append(parse(f))

        # Create concatenated gctoo object
        if args.concat_direction == "horiz":
            out_gctoo = hstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)
        elif args.concat_direction == "vert":
            out_gctoo = vstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

    # Write out_gctoo to file
    logger.info("Writing to output file args.out_name: {}".format(
        args.out_name))

    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)
    elif args.out_type == "gct":
        write_gct.write(out_gctoo, args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
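# A plausible implementation of the get_file_list helper used above,
# assuming the wildcard is an ordinary glob pattern (hypothetical sketch;
# the real helper may differ):
def get_file_list_sketch(file_wildcard):
    return glob.glob(os.path.expanduser(file_wildcard))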
def main(args):
    # Read in the first gct
    gct1 = parse(args.in_gct_path)

    # If second gct provided, compute similarity between 2 gcts
    if args.in_gct2_path is not None:
        logger.info("in_gct2_path was provided. Will compute pairwise "
                    "similarities between the columns of in_gct and in_gct2.")

        # Read in the second gct
        gct2 = parse(args.in_gct2_path)

        # Compute similarities between gct1 and gct2
        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               args.similarity_metric)

        # Row metadata is from gct1, column metadata is from gct2
        row_metadata_df = gct1.col_metadata_df
        col_metadata_df = gct2.col_metadata_df

        # Append column to both metadata_dfs indicating which
        # similarity_metric was used
        row_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, row_metadata_df, col_metadata_df)

    # If only 1 gct provided, compute similarities between the columns of gct1
    else:
        out_df = compute_similarity_within_df(gct1.data_df,
                                              args.similarity_metric)

        # Row and column metadata are both from gct1
        metadata_df = gct1.col_metadata_df

        # Append column to metadata_df indicating which similarity_metric
        # was used
        metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, metadata_df, metadata_df)

    # Write output gct
    if os.path.splitext(args.out_name)[1] == ".gct":
        wg.write(out_gct, args.out_name, data_null="NaN",
                 metadata_null="NA", filler_null="NA")
    elif os.path.splitext(args.out_name)[1] == ".gctx":
        wgx.write(out_gct, args.out_name)
    else:
        raise Exception("out_name must end in .gct or .gctx. "
                        "out_name: {}".format(args.out_name))
def concat_main(args):
    """ Separate method from main() in order to make testing easier and to
    enable command-line access. """
    # Get files directly
    if args.input_filepaths is not None:
        files = args.input_filepaths
    # Or find them
    else:
        files = get_file_list(args.file_wildcard)

    # No files found
    if len(files) == 0:
        msg = "No files were found. args.file_wildcard: {}".format(
            args.file_wildcard)
        logger.error(msg)
        raise Exception(msg)

    # Only 1 file found
    if len(files) == 1:
        logger.warning(
            "Only 1 file found. No concatenation needs to be done, exiting")
        return

    # More than 1 file found
    else:
        # Parse each file and append to a list
        gctoos = []
        for f in files:
            gctoos.append(parse.parse(f))

        # Create concatenated gctoo object
        if args.concat_direction == "horiz":
            out_gctoo = hstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)
        elif args.concat_direction == "vert":
            out_gctoo = vstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

    # Write out_gctoo to file
    logger.info("Writing to output file args.out_name: {}".format(
        args.out_name))

    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)
    elif args.out_type == "gct":
        write_gct.write(out_gctoo, args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
def main(args):
    gct = parse.parse(args.in_gct_path)

    (_, conn_gct) = do_steep_and_sip(
        gct, args.similarity_metric, args.connectivity_metric,
        args.fields_to_aggregate)

    # Write output gct
    wg.write(conn_gct, args.out_sip_name, data_null="NaN",
             filler_null="NaN", metadata_null="NaN")
def save_data(adj_ds, adj_list):
    """ Write batch-adjusted data to files """
    wg.write(adj_ds, 'batch_adjusted_values.gct')
    for ctr, this_ds in enumerate(adj_list):
        if this_ds.src is not None:
            out_file = '{}.COMBAT.gct'.format(
                os.path.splitext(os.path.basename(this_ds.src))[0])
        else:
            out_file = 'batch_adjusted_values_X{}.gct'.format(ctr)
        wg.write(this_ds, out_file)
def main():
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)

    in_gctoo = parse_gctx.parse(args.filename, convert_neg_666=False)

    if args.output_filepath is None:
        basename = os.path.basename(args.filename)
        out_name = ".".join(basename.split(".")[:-1])
    else:
        out_name = args.output_filepath

    write_gct.write(in_gctoo, out_name)
def gctx2gct_main(args):
    """ Separate from main() in order to make command-line tool. """
    in_gctoo = parse_gctx.parse(args.filename, convert_neg_666=False)

    if args.output_filepath is None:
        basename = os.path.basename(args.filename)
        out_name = os.path.splitext(basename)[0] + ".gct"
    else:
        out_name = args.output_filepath

    write_gct.write(in_gctoo, out_name)
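# Example call, assuming an argparse-style namespace with the two expected
# attributes (hypothetical path):
#
#   args = argparse.Namespace(filename="my_data.gctx", output_filepath=None)
#   gctx2gct_main(args)   # writes my_data.gct to the working directory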
def main(args):
    # Find files
    full_path_wildcard = args.in_dir + args.file_wildcard
    gct_paths = glob.glob(full_path_wildcard)
    assert len(gct_paths) > 1, "full_path_wildcard: {}".format(
        full_path_wildcard)

    # Extract prefixes in order to use them later for saving
    prefixes = [(os.path.basename(path)).split(args.prefix_separator)[0]
                for path in gct_paths]
    for path, prefix in zip(gct_paths, prefixes):
        print("path: {}".format(path))
        print("prefix: {}".format(prefix))

    # Import gcts
    gctoos = [parse(x) for x in gct_paths]
    assert len(gctoos) > 1, "gct_paths: {}".format(gct_paths)

    # Compute & save ranks
    for g, prefix in zip(gctoos, prefixes):
        # Extract data_df
        score_df = g.data_df

        # Must be square
        assert score_df.shape[0] == score_df.shape[1], (
            "Input dataframe must be square.")

        # Set diagonal to NaN
        np.fill_diagonal(score_df.values, np.nan)

        # Rank the matrix (percentile score or not)
        if args.do_percentile_rank:
            rank_df = score_df.rank(ascending=False, pct=True) * 100
        else:
            rank_df = score_df.rank(ascending=False)

        # Make a GCToo
        rank_gctoo = GCToo.GCToo(data_df=rank_df,
                                 row_metadata_df=g.row_metadata_df,
                                 col_metadata_df=g.col_metadata_df)

        # Save the rank_df to file
        out_name = args.out_dir + prefix + args.output_suffix
        wg.write(rank_gctoo, out_name, filler_null="NaN", data_null="NaN",
                 metadata_null="NaN")
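# Quick illustration of the ranking convention used above: with
# ascending=False the highest score gets rank 1, and with pct=True
# multiplied by 100 it gets the lowest percentile, 100/n for n scores.
#
#   s = pd.Series([0.9, 0.1, 0.5])
#   s.rank(ascending=False)                  # 1.0, 3.0, 2.0
#   s.rank(ascending=False, pct=True) * 100  # 33.3, 100.0, 66.7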
def main(args):
    # Read in the first gct
    gct1 = parse(args.in_gct_path, convert_neg_666=False,
                 make_multiindex=True)

    # If second gct provided, compute similarity between 2 gcts
    if args.in_gct2_path is not None:
        logger.info("in_gct2_path was provided. Will compute pairwise "
                    "similarities between the columns of in_gct and in_gct2.")

        # Read in the second gct
        gct2 = parse(args.in_gct2_path, convert_neg_666=False,
                     make_multiindex=True)

        # Compute similarities between gct1 and gct2
        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               args.similarity_metric)

        # Row metadata is from gct1, column metadata is from gct2
        row_metadata_df = gct1.col_metadata_df
        col_metadata_df = gct2.col_metadata_df

        # Append column to both metadata_dfs indicating which
        # similarity_metric was used
        row_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, row_metadata_df, col_metadata_df)

    # If only 1 gct provided, compute similarities between the columns of gct1
    else:
        out_df = compute_similarity_within_df(gct1.data_df,
                                              args.similarity_metric)

        # Row and column metadata are both from gct1
        metadata_df = gct1.col_metadata_df

        # Append column to metadata_df indicating which similarity_metric
        # was used
        metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, metadata_df, metadata_df)

    # Write output gct
    wg.write(out_gct, args.out_name, data_null="NaN", metadata_null="NA",
             filler_null="NA")
def write_outputs(top_level_dir, weights, cb_weights, modZ_GCT, cb_modZ_GCT,
                  cc_q75_df, cb_cc_q75_df, rep_set, input_type):
    if not os.path.exists(top_level_dir):
        os.mkdir(top_level_dir)

    base_type = input_type.split('.')[0]

    # Write norm and raw weights. NOTE: the ComBat-adjusted weights are
    # written with a .COMBAT suffix (assumed naming, chosen to mirror the
    # MODZ outputs below); without a distinct suffix the adjusted files
    # would overwrite the unadjusted ones.
    weights[0].to_csv(os.path.join(
        top_level_dir, rep_set + '_' + input_type + '_norm_weights.txt'),
        sep='\t')
    cb_weights[0].to_csv(os.path.join(
        top_level_dir, rep_set + '_' + input_type + '_norm_weights.COMBAT.txt'),
        sep='\t')
    weights[1].to_csv(os.path.join(
        top_level_dir, rep_set + '_' + input_type + '_raw_weights.txt'),
        sep='\t')
    cb_weights[1].to_csv(os.path.join(
        top_level_dir, rep_set + '_' + input_type + '_raw_weights.COMBAT.txt'),
        sep='\t')

    wg.write(modZ_GCT, os.path.join(
        top_level_dir, rep_set + '_MODZ.{}'.format(base_type)))
    wg.write(cb_modZ_GCT, os.path.join(
        top_level_dir, rep_set + '_MODZ.{}.COMBAT'.format(base_type)))

    cc_q75_df.to_csv(os.path.join(
        top_level_dir, rep_set + '_MODZ.{}_cc_q75.txt'.format(base_type)),
        sep='\t')
    cb_cc_q75_df.to_csv(os.path.join(
        top_level_dir, rep_set + '_MODZ.' + input_type + '.COMBAT_cc_q75.txt'),
        sep='\t')
def weave(proj_dir, replicate_set_name, args, input_type='ZSPC',
          nprofile_drop=True):
    gct_list = define_replicate_set_files_and_parse(proj_dir, input_type,
                                                    replicate_set_name)
    if gct_list is False:
        return

    if args.aggregate_output:
        top_level_dir = os.path.join(proj_dir, "weave")
    else:
        top_level_dir = os.path.join(proj_dir, 'weave', replicate_set_name)

    reload(distil)

    group_by_list = args.group_by.split(',')

    # Perform ComBat adjustment; optionally adjust by davepool first
    if args.davepool_combat:
        all_ds, pre_list = batch_adjust.combat_by_group(
            gct_list, col_group=group_by_list, batch_field='davepool_id')
        all_ds, combat_adjusted_gct_list = batch_adjust.combat_by_group(
            pre_list, col_group=group_by_list, batch_field='pool_id')
    else:
        all_ds, combat_adjusted_gct_list = batch_adjust.combat_by_group(
            gct_list, col_group=group_by_list, batch_field='pool_id')

    logger.debug("sample combat adjusted gct shape {}".format(
        combat_adjusted_gct_list[0].data_df.shape))

    # Write out ComBat adjusted GCTs
    for combat_adjusted_gct in combat_adjusted_gct_list:
        replicate_name = combat_adjusted_gct.col_metadata_df[
            'prism_replicate'].unique()[0]
        wg.write(combat_adjusted_gct,
                 os.path.join(proj_dir, 'card', replicate_name,
                              replicate_name + '_' + input_type + '.COMBAT.gct'))

    if args.skip is not None:
        modZ_GCT, cc_q75_df, weights = distil.calculate_modz(
            gct_list, group_by=group_by_list, skip=json.loads(args.skip))
        cb_modZ_GCT, cb_cc_q75_df, cb_weights = distil.calculate_modz(
            combat_adjusted_gct_list, group_by=group_by_list,
            skip=json.loads(args.skip))
    else:
        modZ_GCT, cc_q75_df, weights = distil.calculate_modz(
            gct_list, group_by=group_by_list)
        cb_modZ_GCT, cb_cc_q75_df, cb_weights = distil.calculate_modz(
            combat_adjusted_gct_list, group_by=group_by_list)

    # Filter out signatures where nprofile = 1
    if nprofile_drop:
        (modZ_GCT, cc_q75_df, cb_modZ_GCT, cb_cc_q75_df) = \
            drop_less_than_2_replicates(modZ_GCT, cc_q75_df, cb_modZ_GCT,
                                        cb_cc_q75_df)

    write_outputs(top_level_dir, weights, cb_weights, modZ_GCT, cb_modZ_GCT,
                  cc_q75_df, cb_cc_q75_df, replicate_set_name, input_type)
def main(args):
    # Import data
    in_gct = parse(args.in_gct_path)

    # Compute distance df
    dist_df = 1 - in_gct.data_df

    # Create distance gct
    dist_gct = GCToo.GCToo(dist_df, in_gct.row_metadata_df,
                           in_gct.col_metadata_df)

    # Write dist_gct to file
    wg.write(dist_gct, args.out_name, filler_null="NA")
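# The conversion above assumes in_gct holds similarity values (e.g.
# correlations in [-1, 1]), so a self-similarity of 1 maps to distance 0
# and distances can range up to 2. For instance:
#
#   sim = pd.DataFrame([[1.0, 0.25], [0.25, 1.0]])
#   1 - sim   # [[0.0, 0.75], [0.75, 0.0]]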
def contin_renorm_main(args):
    """ Separate method from main() in order to make testing easier and to
    enable command-line access. """
    gct = parse.parse(args.in_gct_path)
    data_df = gct.data_df.copy(deep=True)
    row_metadata_df = gct.row_metadata_df.copy(deep=True)
    col_metadata_df = gct.col_metadata_df.copy(deep=True)

    # Hack to remove rows that are all NA values
    data_df = data_df.loc[
        (data_df.isnull().apply(np.sum, axis=1) < data_df.shape[1]), :]

    # Pull out the enrichment score
    es = col_metadata_df.loc[:, "det_well_enrichment_score"].copy(deep=True)

    # Calculate lim as x approaches 1 for non-median normalized data
    pep_y_offsets = calc_y_offsets(data_df, es)

    # Calculate the fit_params
    fit_params = calc_fit(data_df, es, pep_y_offsets)

    # Annotate which rows need to be renormalized
    row_metadata_df["is_log_renormed"] = is_log_renormed(
        fit_params.loc[:, "deg1"].apply(get_slope), args.slope_cutoff)

    # Calculate the offset matrix
    offset_mat = calc_pep_samp_offsets(data_df, row_metadata_df, es,
                                       fit_params, pep_y_offsets)

    # Calculate the output data
    out_data_df = calc_out_mat(data_df, offset_mat)

    # Add the metadata field
    col_metadata_df["renorm_correction"] = calc_tot_samp_offsets(offset_mat)

    # Write the file
    write_gct.write(
        GCToo.GCToo(data_df=out_data_df,
                    col_metadata_df=col_metadata_df,
                    row_metadata_df=row_metadata_df),
        args.out_name)
def continuous_renormalization(args):
    # Read in GCT, if path provided, and make deep copies of all DataFrames
    if args.in_gct_path:
        gct = parse.parse(args.in_gct_path)
    else:
        gct = args.in_gct
    data_df = gct.data_df.copy(deep=True)
    row_metadata_df = gct.row_metadata_df.copy(deep=True)
    col_metadata_df = gct.col_metadata_df.copy(deep=True)

    # Remove rows that are all NA values
    data_df = data_df.loc[
        (data_df.isnull().apply(np.sum, axis=1) < data_df.shape[1]), :]

    # Pull out enrichment scores from column metadata dataframe
    enrichment_scores = col_metadata_df.loc[
        :, "det_well_enrichment_score"].copy(deep=True)

    # Calculate limit as x approaches 1 for non-median normalized data
    pep_y_offsets = calculate_y_offsets(data_df, enrichment_scores)

    # Calculate the fit parameters
    fit_parameters = calculate_fit(data_df, enrichment_scores, pep_y_offsets)

    # Annotate which rows will be renormalized based on slope_cutoff
    # argument (default 0.2)
    row_metadata_df["is_log_renormed"] = is_log_renormed(
        fit_parameters.loc[:, "deg1"].apply(get_slope), args.slope_cutoff)

    # Calculate the offset matrix
    offset_mat = calculate_peptide_sample_offsets(
        data_df, row_metadata_df, enrichment_scores, fit_parameters,
        pep_y_offsets)

    # Calculate the output DataFrame
    out_data_df = calculate_out_matrix(data_df, offset_mat)

    # Add the 'renorm_correction' metadata field with total sample offsets
    col_metadata_df["renorm_correction"] = calculate_total_sample_offsets(
        offset_mat)

    # Output
    if args.write_gct:
        write_gct.write(GCToo.GCToo(data_df=out_data_df,
                                    col_metadata_df=col_metadata_df,
                                    row_metadata_df=row_metadata_df),
                        args.out_name)
    else:
        return GCToo.GCToo(data_df=out_data_df,
                           col_metadata_df=col_metadata_df,
                           row_metadata_df=row_metadata_df)
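# Example call with an in-memory GCToo rather than a file path, assuming an
# argparse-style namespace (attribute names taken from the function body;
# values here are hypothetical):
#
#   args = argparse.Namespace(in_gct_path=None, in_gct=my_gct,
#                             slope_cutoff=0.2, write_gct=False,
#                             out_name=None)
#   renormed_gct = continuous_renormalization(args)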
def test_p100_functional(self):
    p100_in_path = os.path.join(FUNCTIONAL_TESTS_PATH, "test_p100.gct")
    p100_out_path = os.path.join(FUNCTIONAL_TESTS_PATH,
                                 "test_p100_writing.gct")

    # Read in original gct file
    p100_in_gct = pg.parse(p100_in_path)

    # Write it back out, then read in the new gct file
    wg.write(p100_in_gct, p100_out_path)
    p100_out_gct = pg.parse(p100_out_path)

    self.assertTrue(p100_in_gct.data_df.equals(p100_out_gct.data_df))
    self.assertTrue(
        p100_in_gct.row_metadata_df.equals(p100_out_gct.row_metadata_df))
    self.assertTrue(
        p100_in_gct.col_metadata_df.equals(p100_out_gct.col_metadata_df))

    # Clean up
    os.remove(p100_out_path)
def main(prism_replicate_name, outfile, all_perturbagens,
         davepool_data_objects, prism_cell_list):
    # Build one-to-many mapping between davepool ID and the multiple PRISM
    # cell lines that are within that davepool
    davepool_id_to_cells_map = build_davepool_id_to_cells_map(prism_cell_list)

    # Put all the data in gct-able form
    (all_median_data_by_cell, all_count_data_by_cell) = process_data(
        davepool_data_objects, davepool_id_to_cells_map)

    # Create full outfile, build the gct, and write it out!
    median_outfile = os.path.join(outfile, "assemble", prism_replicate_name,
                                  prism_replicate_name + "_MEDIAN.gct")
    median_gctoo = build_gctoo(prism_replicate_name, all_perturbagens,
                               all_median_data_by_cell)
    write_gct.write(median_gctoo, median_outfile, data_null=_NaN,
                    filler_null=_null)

    # Write inst info file
    instinfo_outfile = os.path.join(outfile, "assemble",
                                    prism_replicate_name,
                                    prism_replicate_name + "_inst_info.txt")
    inst = median_gctoo.col_metadata_df

    # Cast pert_dose field to str
    logger.info("Formatting instinfo pert_dose")
    inst['pert_dose'] = inst['pert_dose'].apply(process_pert_doses)

    if 'pert_idose' in inst.columns:
        logger.info("Formatting instinfo pert_idose")
        inst['pert_idose'] = inst['pert_idose'].apply(process_pert_idoses)

    inst.to_csv(instinfo_outfile, sep='\t')
    logger.info("Instinfo has been written to {}".format(instinfo_outfile))

    count_outfile = os.path.join(outfile, "assemble", prism_replicate_name,
                                 prism_replicate_name + "_COUNT.gct")
    count_gctoo = build_gctoo(prism_replicate_name, all_perturbagens,
                              all_count_data_by_cell)
    write_gct.write(count_gctoo, count_outfile, data_null=_NaN,
                    filler_null=_null)
def mk_cell_metadata(args, failed_plates):
    if args.aggregate_out:
        paths = glob.glob(os.path.join(args.proj_dir, args.search_pattern,
                                       'card', '*', '*NORM.gct'))
        mfi_paths = glob.glob(os.path.join(args.proj_dir,
                                           args.search_pattern, 'assemble',
                                           '*', '*MEDIAN.gct'))
    else:
        paths = glob.glob(os.path.join(args.proj_dir, 'card',
                                       args.search_pattern, '*NORM.gct'))
        mfi_paths = glob.glob(os.path.join(args.proj_dir, 'assemble',
                                           args.search_pattern,
                                           '*MEDIAN.gct'))

    cell_temp = pe.parse(mfi_paths[0])
    cell_temp.row_metadata_df.to_csv(
        os.path.join(args.build_folder, args.cohort_name + '_cell_info.txt'),
        sep='\t')

    # Calculate SSMD matrix using paths that were just grabbed and write out
    ssmd_mat = ssmd.ssmd_matrix(cut_to_l2.cut_l1(paths))
    ssmd_gct = GCToo.GCToo(
        data_df=ssmd_mat,
        col_metadata_df=pd.DataFrame(index=ssmd_mat.columns),
        row_metadata_df=pd.DataFrame(index=ssmd_mat.index))
    wg.write(ssmd_gct, os.path.join(
        args.build_folder,
        args.cohort_name + '_ssmd_matrix_n{}_{}.gct'.format(
            ssmd_gct.data_df.shape[1], ssmd_gct.data_df.shape[0])))

    # Flag plates whose median SSMD falls below 2
    ssmd_failures = ssmd_gct.data_df.median()[
        ssmd_gct.data_df.median() < 2].index.tolist()
    fails_dict = {'dropout_failures': failed_plates,
                  'ssmd_failures': ssmd_failures}
    fails_df = pd.DataFrame({k: pd.Series(v)
                             for k, v in fails_dict.items()})
    fails_df.to_csv(os.path.join(args.build_folder, 'failed_plates.txt'),
                    sep='\t', index=False)
def main(args):
    """ The main method. """
    # Import gct
    in_gct = parse(args.in_gct_path)

    # Create the separated gcts
    (out_gcts, out_gct_prefixes) = separate(in_gct, args.separate_field,
                                            args.row_or_col)

    # Save the returned gcts
    for gct, name in zip(out_gcts, out_gct_prefixes):
        full_out_name = os.path.join(
            args.out_dir,
            args.out_name_prefix + str(name) + args.out_name_suffix)
        wg.write(gct, full_out_name, data_null="NaN", metadata_null="NA",
                 filler_null="NA")
def write_output_gct(gct, out_dir, out_gct_name, data_null, filler_null):
    """Write output gct file.

    Args:
        gct (GCToo object)
        out_dir (string): path to save directory
        out_gct_name (string): name of output gct
        data_null (string): string with which to represent NaN in data
        filler_null (string): string with which to fill the empty top-left
            quadrant in the output gct

    Returns:
        None

    """
    out_fname = os.path.join(out_dir, out_gct_name)
    wg.write(gct, out_fname, data_null=data_null, filler_null=filler_null,
             data_float_format=None)
def main(args):
    """ The main method. """
    # Read test gct
    test_gct = parse(args.test_gct_path, convert_neg_666=False,
                     make_multiindex=True)

    # Read bg_gct
    bg_gct = parse(args.bg_gct_path, convert_neg_666=False,
                   make_multiindex=True)

    # Create an aggregated metadata field for index and columns of both gcts
    # and sort by that field
    (test_df, bg_df) = prepare_multi_index_dfs(
        test_gct.multi_index_df, bg_gct.multi_index_df,
        args.fields_to_aggregate_in_test_gct_queries,
        args.fields_to_aggregate_in_test_gct_targets,
        args.fields_to_aggregate_in_bg_gct,
        QUERY_FIELD_NAME, TARGET_FIELD_NAME, args.separator)

    # Check symmetry
    (is_test_df_sym, _) = check_symmetry(test_gct.multi_index_df,
                                         bg_gct.multi_index_df)

    # Compute connectivity
    (conn_mi_df, signed_conn_mi_df) = compute_connectivities(
        test_df, bg_df, QUERY_FIELD_NAME, TARGET_FIELD_NAME,
        TARGET_FIELD_NAME, args.connectivity_metric, is_test_df_sym)

    # Convert multi-index to component dfs in order to write output gct
    (signed_data_df, signed_row_metadata_df, signed_col_metadata_df) = (
        GCToo.multi_index_df_to_component_dfs(
            signed_conn_mi_df, rid=TARGET_FIELD_NAME, cid=QUERY_FIELD_NAME))

    # Append to queries a new column saying what connectivity metric was used
    add_connectivity_metric_to_metadata(signed_col_metadata_df,
                                        args.connectivity_metric,
                                        CONNECTIVITY_METRIC_FIELD)
    add_connectivity_metric_to_metadata(signed_row_metadata_df,
                                        args.connectivity_metric,
                                        CONNECTIVITY_METRIC_FIELD)

    # Create gct and write it to file
    conn_gct = GCToo.GCToo(data_df=signed_data_df,
                           row_metadata_df=signed_row_metadata_df,
                           col_metadata_df=signed_col_metadata_df)
    wg.write(conn_gct, args.out_name, data_null="NaN", filler_null="NaN",
             metadata_null="NaN")
def main(args):
    """ The main method. """
    # Read test gct
    test_gct = parse(args.test_gct_path)

    # Read bg_gct
    bg_gct = parse(args.bg_gct_path)

    # Check symmetry
    (is_test_df_sym, _) = check_symmetry(test_gct.data_df, bg_gct.data_df)

    # Create an aggregated metadata field in test and background GCTs
    # that will be used to aggregate replicates
    (test_gct, bg_gct) = create_aggregated_fields_in_GCTs(
        test_gct, bg_gct,
        args.fields_to_aggregate_in_test_gct_queries,
        args.fields_to_aggregate_in_test_gct_targets,
        args.fields_to_aggregate_in_bg_gct,
        QUERY_FIELD_NAME, TARGET_FIELD_NAME, args.separator)

    # Compute connectivity
    (conn_gct, signed_conn_gct) = compute_connectivities(
        test_gct, bg_gct, QUERY_FIELD_NAME, TARGET_FIELD_NAME,
        TARGET_FIELD_NAME, args.connectivity_metric, is_test_df_sym,
        args.separator)

    # Append to queries a new column saying what connectivity metric was used
    add_connectivity_metric_to_metadata(signed_conn_gct.col_metadata_df,
                                        args.connectivity_metric,
                                        CONNECTIVITY_METRIC_FIELD)
    add_connectivity_metric_to_metadata(signed_conn_gct.row_metadata_df,
                                        args.connectivity_metric,
                                        CONNECTIVITY_METRIC_FIELD)

    # Write signed result to file
    wg.write(signed_conn_gct, args.out_name, data_null="NaN",
             filler_null="NaN", metadata_null="NaN")
def subset_main(args):
    """ Separate method from main() in order to make testing easier and to
    enable command-line access. """
    # Read in each of the command line arguments
    rid = _read_arg(args.rid)
    cid = _read_arg(args.cid)
    exclude_rid = _read_arg(args.exclude_rid)
    exclude_cid = _read_arg(args.exclude_cid)

    # If GCT, use subset_gctoo
    if args.in_path.endswith(".gct"):
        in_gct = parse_gct.parse(args.in_path)
        out_gct = sg.subset_gctoo(in_gct, rid=rid, cid=cid,
                                  exclude_rid=exclude_rid,
                                  exclude_cid=exclude_cid)

    # If GCTx, use parse_gctx
    else:
        if (exclude_rid is not None) or (exclude_cid is not None):
            msg = "exclude_{rid,cid} args not currently supported for parse_gctx."
            raise Exception(msg)

        logger.info("Using hyperslab selection functionality of parse_gctx...")
        out_gct = parse_gctx.parse(args.in_path, rid=rid, cid=cid)

    # Write the output gct
    if args.out_type == "gctx":
        wgx.write(out_gct, args.out_name)
    else:
        wg.write(out_gct, args.out_name, data_null="NaN",
                 metadata_null="NA", filler_null="NA")
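# Example: subset a GCTx by column ids using the hyperslab path, assuming an
# argparse-style namespace (attribute names from the function body; values
# hypothetical). cid may also be a file path if the _read_arg helper
# supports reading ids from a file.
#
#   args = argparse.Namespace(in_path="data.gctx", rid=None,
#                             cid=["sample_1", "sample_2"],
#                             exclude_rid=None, exclude_cid=None,
#                             out_type="gctx", out_name="out.gctx")
#   subset_main(args)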