def test_main2(self):
    input_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_introspect_main.gct")
    output_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_introspect_main_out2.gct")
    expected_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_introspect_main_expected2.gct")

    args_string = "-i {} -o {} -fa moa".format(input_gct_path, output_gct_path)
    args = introspect.build_parser().parse_args(args_string.split())
    introspect.main(args)

    # Read in output and expected gcts and confirm that they're equal
    output_gct = parse.parse(output_gct_path)
    expected_gct = parse.parse(expected_gct_path)

    pd.util.testing.assert_almost_equal(expected_gct.data_df, output_gct.data_df,
                                        check_less_precise=True)
    pd.util.testing.assert_frame_equal(expected_gct.row_metadata_df, output_gct.row_metadata_df)
    pd.util.testing.assert_frame_equal(expected_gct.col_metadata_df, output_gct.col_metadata_df)

    # Clean up
    os.remove(output_gct_path)
def main(args):
    # Parse input gcts
    external_gct = parse.parse(args.external_gct_path)
    internal_gct = parse.parse(args.internal_gct_path)
    bg_gct = parse.parse(args.bg_gct_path)

    # Meat of the script
    (sim_gct, conn_gct) = do_steep_and_sip(
        external_gct, internal_gct, bg_gct,
        args.similarity_metric, args.connectivity_metric,
        args.fields_to_aggregate_for_external_profiles,
        args.fields_to_aggregate_for_internal_profiles)

    # Write output gcts
    wg.write(sim_gct, args.out_steep_name,
             data_null="NaN", metadata_null="NaN", filler_null="NaN")
    wg.write(conn_gct, args.out_sip_name,
             data_null="NaN", metadata_null="NaN", filler_null="NaN")
def test_main(self):
    gct_path = os.path.join(functional_tests_dir, "test_annotate_gct_from_mapping_in.gct")
    mapping_path = os.path.join(functional_tests_dir, "test_annotate_gct_from_mapping.tsv")
    expected_gct_path = os.path.join(functional_tests_dir, "test_annotate_gct_from_mapping_expected.gct")
    out_path = os.path.join(functional_tests_dir, "test_annotate_gct_from_mapping_out.gct")

    args_string = "-i {} -m {} -o {} -f {}".format(gct_path, mapping_path, out_path, "pert_iname")
    args = agfm.build_parser().parse_args(args_string.split())
    agfm.main(args)

    # Read in expected and actual outputs
    e_gct = parse.parse(expected_gct_path)
    out_gct = parse.parse(out_path)

    pd.util.testing.assert_frame_equal(e_gct.data_df, out_gct.data_df)
    pd.util.testing.assert_frame_equal(e_gct.row_metadata_df, out_gct.row_metadata_df)
    pd.util.testing.assert_frame_equal(e_gct.col_metadata_df, out_gct.col_metadata_df)

    # Clean up
    os.remove(out_path)
def read_gctx(fname, col_meta=True, row_meta=True, ignore_data_df=False):
    print(" Parsing GCTX file.")
    if ignore_data_df:
        # Load only the metadata
        print(" IGNORING EXPRESSION SIGNATURES; ONLY LOADING METADATA.")
        print(" If you did not intend to do this, re-run with different arguments.")
        print(" Loading row metadata.")
        rm = parse(fname, row_meta_only=True)
        fix_mangled_byte_literals(rm)
        print(" Loading column metadata.")
        cm = parse(fname, col_meta_only=True)
        fix_mangled_byte_literals(cm)
        cm['sig_num'] = list(range(cm.shape[0]))
        return (None, cm, rm)
    else:
        # Load everything
        tmp = parse(fname)
        data_df = tmp.data_df
        print(" Fixing mangled byte literals.")
        fix_mangled_byte_literals(data_df)
        # Initialize so the return below cannot raise a NameError when
        # col_meta or row_meta is False
        cm = None
        rm = None
        if col_meta:
            print(" Loading column metadata.")
            cm = tmp.col_metadata_df
            fix_mangled_byte_literals(cm)
            cm['sig_num'] = list(range(cm.shape[0]))
        if row_meta:
            print(" Loading row metadata.")
            rm = tmp.row_metadata_df
            fix_mangled_byte_literals(rm)
        return (data_df, cm, rm)
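# Usage sketch (the path is hypothetical): load only the metadata of a GCTX
# file with read_gctx above. In this mode the expression matrix comes back
# as None and only the two metadata frames are read.
data_df, cm, rm = read_gctx("my_sigs.gctx", ignore_data_df=True)
assert data_df is None
print(cm.shape, rm.shape)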
def main(): """Parse args and reads and write expression files for paired metadata""" args_dict = main_parse_args() # get list of probes probeset_df = pd.read_table(args_dict['probeset_infile'], sep='\t') probeset = np.array(map(str, probeset_df['pr_gene_id'].values)) # get list of experiments expid_df = pd.read_table(args_dict['expid_infile'], sep='\t', header=None) myexpids = np.array(map(str, expid_df[0].values)) # get info about gctx file col_metadata = parse.parse(args_dict['gctx_infile'], col_meta_only=True) geoexpset = set(col_metadata.index.values) # keep only ids in gctx file print("Filtering exp ids for chunk " + str(chunk) + "...") validexp_ids = np.array(list(set(myexpids) & geoexpset)) # fetch data from gctx print("Fetching chunk " + str(chunk) + "...") allexps_gct = parse.parse(args_dict['gctx_infile'], rid=probeset, cid=validexp_ids) #returns rows and columns in different order print("Gene Ids Order: " + str(list(allexps_gct.data_df.index))) # merge and write outfile print("Writing outfile: "+ args_dict['outfile']) write_gctx.write(allexps_gct, args_dict['outfile'])
def test_gct_parsing(self):
    # Parse in gct, no other arguments
    mg1 = mini_gctoo_for_testing.make()
    mg2 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct")
    pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df)
    pandas_testing.assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df)
    pandas_testing.assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df)

    # Check that convert_neg_666 worked correctly
    self.assertTrue(mg2.col_metadata_df["mfc_plate_id"].isnull().all())

    # Parse w/o convert_neg_666
    mg2_alt = parse.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct",
        convert_neg_666=False)
    self.assertCountEqual(
        mg2_alt.col_metadata_df["mfc_plate_id"].values.tolist(), [-666] * 6)

    # Parse in gct with subsetting
    my_rid = "LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33"
    mg3 = parse.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct",
        cidx=[0, 2], rid=[my_rid])
    self.assertEqual(mg3.data_df.shape, (1, 2))
    self.assertCountEqual(mg3.data_df.values.flatten().tolist(), [1., 3.])
    self.assertEqual(mg3.row_metadata_df.index[0], my_rid)
def test_subset_main(self):
    in_gct_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_in.gct")
    rid_grp_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_rid.grp")
    out_name = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_out.gct")
    expected_out_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_expected.gct")

    args_string = "-i {} --rid {} -ec {} -o {}".format(in_gct_path, rid_grp_path, "f", out_name)
    args = sg.build_parser().parse_args(args_string.split())

    # Run main method
    sg.subset_main(args)

    # Compare output to expected
    out_gct = parse.parse(out_name)
    expected_gct = parse.parse(expected_out_path)
    pd.util.testing.assert_frame_equal(out_gct.data_df, expected_gct.data_df)
    pd.util.testing.assert_frame_equal(out_gct.row_metadata_df, expected_gct.row_metadata_df)
    pd.util.testing.assert_frame_equal(out_gct.col_metadata_df, expected_gct.col_metadata_df)

    # Clean up
    os.remove(out_name)

    # gctx with exclude_rid should fail
    args_string2 = "-i {} --rid {} -ec {} -o {}".format("FAKE.gctx", rid_grp_path, "f", out_name)
    args2 = sg.build_parser().parse_args(args_string2.split())
    with self.assertRaises(Exception) as e:
        sg.subset_main(args2)
    self.assertIn("exclude_{rid,cid} args not currently supported", str(e.exception))
def main(args):
    # Read in the first gct
    gct1 = parse.parse(args.in_gct_path)

    # If a second gct is provided, compute similarity between the 2 gcts
    if args.in_gct2_path is not None:
        logger.info("in_gct2_path was provided. Will compute pairwise similarities "
                    "between the columns of in_gct and in_gct2.")

        # Read in the second gct
        gct2 = parse.parse(args.in_gct2_path)

        # Compute similarities between gct1 and gct2
        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df, args.similarity_metric)

        # Row metadata is from gct1, column metadata is from gct2
        row_metadata_df = gct1.col_metadata_df
        col_metadata_df = gct2.col_metadata_df

        # Append a column to both metadata_dfs indicating which similarity_metric was used
        row_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, row_metadata_df, col_metadata_df)

    # If only 1 gct is provided, compute similarities between the columns of gct1
    else:
        out_df = compute_similarity_within_df(gct1.data_df, args.similarity_metric)

        # Row and column metadata are both from gct1
        metadata_df = gct1.col_metadata_df

        # Append a column to metadata_df indicating which similarity_metric was used
        metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, metadata_df, metadata_df)

    # Write output gct
    if os.path.splitext(args.out_name)[1] == ".gct":
        wg.write(out_gct, args.out_name,
                 data_null="NaN", metadata_null="NA", filler_null="NA")
    elif os.path.splitext(args.out_name)[1] == ".gctx":
        wgx.write(out_gct, args.out_name)
    else:
        raise Exception("out_name must end in .gct or .gctx. out_name: {}".format(args.out_name))
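# Hypothetical sketch of compute_similarity_within_df (the real helper is
# defined elsewhere in this codebase and is not shown here), assuming
# similarity_metric is a correlation method pandas supports natively:
def compute_similarity_within_df_sketch(data_df, similarity_metric):
    assert similarity_metric in ("pearson", "spearman", "kendall")
    # DataFrame.corr computes pairwise column-vs-column correlations,
    # skipping NaNs, which matches the square output that main expects
    return data_df.corr(method=similarity_metric)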
def setUpClass(cls):
    external_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_external_query_external.gct")
    cls.external_gct = parse.parse(external_gct_path)

    internal_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_external_query_internal.gct")
    cls.internal_gct = parse.parse(internal_gct_path)

    bg_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_external_query_bg.gct")
    cls.bg_gct = parse.parse(bg_gct_path)
def main(args):
    data = pe.parse(args.gct)
    meta = pd.read_table(args.meta, index_col=args.index_col)
    if args.sense is not None:
        wtks(data, meta, args.out, args.sense, group_col=args.prefix_name)
    else:
        wtks(data, meta, args.out, group_col=args.prefix_name)
def main():
    # Get args
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)

    # Read the input gct
    in_gct = parse.parse(args.in_gct_path)

    # Read in each of the command line arguments
    rid = _read_arg(args.rid)
    cid = _read_arg(args.cid)
    exclude_rid = _read_arg(args.exclude_rid)
    exclude_cid = _read_arg(args.exclude_cid)

    # Slice the gct
    out_gct = sg.slice_gctoo(in_gct, rid=rid, cid=cid,
                             exclude_rid=exclude_rid, exclude_cid=exclude_cid)
    assert out_gct.data_df.size > 0, "Slicing yielded an empty gct!"

    # Write the output gct
    if args.use_gctx:
        wgx.write(out_gct, args.out_name)
    else:
        wg.write(out_gct, args.out_name,
                 data_null="NaN", metadata_null="NA", filler_null="NA")
def convert_gct_to_config(assay_type, gct_path, output_path):
    """ Use custom parameters embedded within a GCT to create a config file for processing

    Args:
        assay_type (string) - assay used to specify parameters in config
        gct_path - where to read in GCT for custom parameters
        output_path - where to write output config
    Returns:
        Nothing - writes output to output_path
    """
    gct = parse.parse(gct_path)

    # All rows embed the same parameters, so take the value from the first row
    try:
        custom_params = gct.row_metadata_df.loc[:, "pr_processing_params"].iloc[0]
    except Exception:
        print("GCT does not contain pr_processing_params field")
        return None

    if custom_params == "{}":
        print("GCT contains pr_processing_params field, but it is empty")
        return None

    custom_params = create_dict_from_pseudojson(custom_params)
    differential_parameters = check_custom_parameters_against_defaults(assay_type, custom_params, json=True)

    if differential_parameters is not None:
        write_config(differential_parameters, output_path)

    return differential_parameters
def load_expr_data(self, phase):
    """ Load differential gene expression profiles (in a dataframe) from one of
    the two phases of the L1000 dataset """
    assert phase in ["phase1", "phase2"]
    if phase == "phase1":
        df_path = os.path.join(self.raw_dir, "dataframe_phase1.pkl")
        file_name = self.raw_file_names[5]
    else:
        df_path = os.path.join(self.raw_dir, "dataframe_phase2.pkl")
        file_name = self.raw_file_names[0]

    if os.path.isfile(df_path):
        with open(df_path, "rb") as pickle_in:
            expr_data = pickle.load(pickle_in)
    else:
        # If the data has not been saved yet, parse the original file and save the dataframe
        print("Parsing original data, only happens the first time...")
        from cmapPy.pandasGEXpress.parse import parse
        expr_data = parse(os.path.join(self.raw_dir, file_name),
                          rid=self.landmark_gene_list).data_df.T
        # Ensure that the order of columns corresponds to landmark_gene_list
        expr_data = expr_data[self.landmark_gene_list]
        # Remove rows that are not in sig_info
        expr_data = expr_data[expr_data.index.isin(self.sig_info.index)]
        # Save data
        with open(df_path, "wb") as pickle_out:
            pickle.dump(expr_data, pickle_out, protocol=2)
    return expr_data
def main(args):
    # Parse gct file
    gct = parse.parse(args.path_to_gct)

    # Parse mapping tsv file
    mapping = pd.read_csv(args.path_to_mapping_tsv, sep="\t", index_col=0)

    # Make sure the ids from the mapping file are unique
    duplicated_bool_array = mapping.index.duplicated()
    assert sum(duplicated_bool_array) == 0, (
        "ids in mapping file must be unique. duplicated ids in mapping:\n{}".format(
            mapping.index[duplicated_bool_array]))

    # Annotate row and/or column metadata with each mapping column
    for col in mapping.columns:
        if args.row_and_or_col in ("both", "row"):
            annotate_meta_df(gct.row_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)
        if args.row_and_or_col in ("both", "col"):
            annotate_meta_df(gct.col_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)

    wg.write(gct, args.out_name, filler_null="NA", data_null="NaN", metadata_null="NA")
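# Hypothetical sketch of annotate_meta_df (the real helper is defined
# elsewhere and its signature is assumed here): map one mapping column onto
# a metadata df by looking up gct_from_field, filling unmapped entries with
# missing_entry.
def annotate_meta_df_sketch(meta_df, mapping_series, gct_from_field, missing_entry):
    # Look up each metadata entry's gct_from_field value in the mapping index
    meta_df[mapping_series.name] = (
        meta_df[gct_from_field].map(mapping_series).fillna(missing_entry))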
def get_CCLE_Exp_gct_from_selected_genes(selected_gene_list, result_addr=None):
    ccle_GCToo = parse(
        "/Users/woochanghwang/PycharmProjects/LifeArc/General/data/CCLE/CCLE_RNAseq_genes_rpkm_20180929.gct")
    # print(ccle_GCToo.row_metadata_df[:10])
    # print(ccle_GCToo.col_metadata_df[:10])
    # print(ccle_GCToo.data_df.head())

    # Find the row id for each selected gene
    selected_gene_rid_list = []
    for target_gene in selected_gene_list:
        target_gene_rid = find_gene_rid(ccle_GCToo, target_gene)
        selected_gene_rid_list.append(target_gene_rid)

    selected_gene_expression_df = ccle_GCToo.data_df.loc[selected_gene_rid_list, :]
    selected_gene_expression_df_T = selected_gene_expression_df.T
    print(selected_gene_expression_df_T)

    # Rename the rids back to gene symbols
    gene_map_dict = dict()
    for gene, rid in zip(selected_gene_list, selected_gene_rid_list):
        gene_map_dict[rid] = gene
    selected_gene_expression_df_T = selected_gene_expression_df_T.rename(
        index=str, columns=gene_map_dict)
    selected_gene_expression_df = selected_gene_expression_df_T.T

    if result_addr is not None:
        selected_gene_expression_df.to_csv(result_addr, sep='\t',
                                           quoting=csv.QUOTE_NONE, index=False)

    return selected_gene_expression_df
def read_gct_and_config_file(gct_path, config_path):
    """Read gct and config file.

    The config file has three sections: io, metadata, and parameters.
    These are returned as dictionaries.

    Args:
        gct_path (string): filepath to gct file
        config_path (string): filepath to config file

    Returns:
        gct (GCToo object)
        config_io (dictionary)
        config_metadata (dictionary)
        config_parameters (dictionary)
    """
    assert os.path.exists(os.path.expanduser(config_path))

    # Read config file
    config_parser = ConfigParser.RawConfigParser()
    config_parser.read(os.path.expanduser(config_path))

    # Return config fields as dictionaries
    config_io = dict(config_parser.items("io"))
    config_metadata = dict(config_parser.items("metadata"))
    config_parameters = dict(config_parser.items("parameters"))

    # Parse the gct file and return GCToo object
    gct = parse.parse(gct_path)

    return gct, config_io, config_metadata, config_parameters
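# Illustrative config file layout for read_gct_and_config_file. Only the
# three section headers are required by the code above; the field names
# shown here are hypothetical.
EXAMPLE_CONFIG = """
[io]
out_dir = /path/to/output

[metadata]
pert_field = pert_iname

[parameters]
num_permutations = 1000
"""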
def main(args): """ The main method. """ # Import gct in_gct = parse.parse(args.in_gct_path) # Create the separated gcts (out_gcts, out_gct_prefixes) = separate(in_gct, args.separate_field, args.row_or_col) # Save the returned gcts for gct, name in zip(out_gcts, out_gct_prefixes): full_out_name = os.path.join( args.out_dir, args.out_name_prefix + str(name) + args.out_name_suffix) # Write to GCT or GCTX depending on extension if str.lower(os.path.splitext(full_out_name)[1]) == ".gct": wg.write(gct, full_out_name, data_null="NaN", metadata_null="NA", filler_null="NA") elif str.lower(os.path.splitext(full_out_name)[1]) == ".gctx": wgx.write(gct, full_out_name) else: raise (Exception( "out_name_suffix must end in either .gct or .gctx. out_name_suffix: {}" .format((args.out_name_suffix))))
def reader_writer(input_file, output_file, function, check_size=False):
    plate_failure = False

    # Read in input file
    gctoo = pe.parse(input_file)

    # Call normalizing function on gctoo
    new_gctoo = function(gctoo)
    new_gctoo = drop_nans(new_gctoo)
    if new_gctoo == 'empty_plate':
        logger.debug("{} has no usable data and has not been written.".format(
            os.path.basename(output_file)))
        plate_failure = True
        return plate_failure

    # If told to, check size of new_gctoo and flag if too small
    if check_size and new_gctoo.data_df.shape[1] <= 349:
        logger.debug('{} Plate Failure With {} Failed Wells'.format(
            os.path.basename(os.path.dirname(input_file)),
            384 - new_gctoo.data_df.shape[1]))
        plate_failure = True

    # Write out new gctoo
    wgx.write(new_gctoo, out_fname=output_file)
    logger.debug("{} file written.".format(output_file))

    return plate_failure
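# Hypothetical sketch of drop_nans (the real helper is defined elsewhere):
# drop wells whose values are all NaN and return the 'empty_plate' sentinel
# string that reader_writer above checks for when nothing is left. Assumes
# cmapPy's subset_gctoo is importable as sg.
def drop_nans_sketch(gctoo):
    kept = gctoo.data_df.dropna(axis=1, how='all')
    if kept.shape[1] == 0:
        return 'empty_plate'
    return sg.subset_gctoo(gctoo, cid=list(kept.columns))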
def main(args):
    # Import data
    assert os.path.exists(args.in_gct_path), (
        "in_gct_path could not be found: {}".format(args.in_gct_path))
    in_gct = parse.parse(args.in_gct_path)

    # First, check if any rows are all NaN; if so, remove them
    dropped_df = in_gct.data_df.dropna(how="all")
    bools_of_remaining = in_gct.data_df.index.isin(dropped_df.index.values)
    in_gct = sg.subset_gctoo(in_gct, row_bool=bools_of_remaining)

    if args.replace_with == "zero":
        in_gct.data_df.fillna(0, inplace=True)

    elif args.replace_with == "median":
        probe_medians = in_gct.data_df.median(axis=1)
        for row_idx in range(in_gct.data_df.shape[0]):
            this_row = in_gct.data_df.iloc[row_idx, :]
            this_row[this_row.isnull()] = probe_medians.iloc[row_idx]
            in_gct.data_df.iloc[row_idx, :] = this_row

    elif args.replace_with == "mean":
        probe_means = in_gct.data_df.mean(axis=1)
        for row_idx in range(in_gct.data_df.shape[0]):
            this_row = in_gct.data_df.iloc[row_idx, :]
            this_row[this_row.isnull()] = probe_means.iloc[row_idx]
            in_gct.data_df.iloc[row_idx, :] = this_row

    wg.write(in_gct, args.out_name, filler_null="NA")
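# The per-row loops above can be collapsed into a single vectorized fill.
# An equivalent sketch (same behavior, not the original code):
def fill_nans_sketch(data_df, replace_with):
    if replace_with == "zero":
        return data_df.fillna(0)
    # For "median"/"mean", fill each row's NaNs with that row's statistic
    stat = data_df.median(axis=1) if replace_with == "median" else data_df.mean(axis=1)
    # Transpose so fillna can align the per-row stats on the column axis
    return data_df.T.fillna(stat).T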
def check_ssmds(norm_path, plate_failure):
    norm_gct = pe.parse(norm_path)
    ssmds = ssmd_analysis.get_ssmd(norm_gct, unlog=True)
    ssmd_failures = ssmds[ssmds < 2].count()
    # Flag the plate if more than a third of the SSMDs fail
    if ssmd_failures > len(ssmds) / 3:
        plate_failure = True
    return plate_failure
def gctx_to_pandas(filename, columnlist):
    gctToo = parse(filename, make_multiindex=True)
    df = gctToo.data_df
    # Transpose so that samples are rows and probes are columns
    df = df.T
    if len(columnlist) > 0:
        df = df[columnlist]
    return df
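# Usage sketch (the file path and probe ids are hypothetical): load a GCTX
# file as a samples-by-probes DataFrame and keep two probes of interest.
df = gctx_to_pandas("level5_data.gctx", ["probe_1", "probe_2"])
print(df.shape)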
def load_data(self):
    self.df = parse(self.data_path).data_df.T

    # Map gene names from Ensembl ids to HUGO symbols
    eh_map = ensg_to_hugo_map()

    # Drop columns whose gene is not covered by the map
    columns_to_drop = [i for i in self.df.columns
                       if str(i)[str(i).find('ENS'):].split('.')[0] not in eh_map.keys()]
    self.df = self.df.drop(columns_to_drop, axis=1)

    # Rename columns to HUGO symbols
    self.df.columns = [eh_map[str(i)[str(i).find('ENS'):].split('.')[0]]
                       for i in self.df.columns]
def main():
    # Get args
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)
    logger.debug("args: {}".format(args))

    # Get files directly
    if args.input_filepaths is not None:
        files = args.input_filepaths
    # Or find them
    else:
        files = get_file_list(args.file_wildcard)

    # No files found
    if len(files) == 0:
        msg = "No files were found. args.file_wildcard: {}".format(args.file_wildcard)
        logger.error(msg)
        raise Exception(msg)

    # Only 1 file found
    if len(files) == 1:
        logger.warning("Only 1 file found. No concatenation needs to be done, exiting")
        return

    # More than 1 file found
    else:
        # Parse each file and append to a list
        gctoos = []
        for f in files:
            gctoos.append(parse.parse(f))

        # Create concatenated gctoo object
        if args.concat_direction == "horiz":
            out_gctoo = hstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)
        elif args.concat_direction == "vert":
            out_gctoo = vstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

    # Write out_gctoo to file
    logger.info("Writing to output file args.out_name: {}".format(args.out_name))
    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)
    elif args.out_type == "gct":
        write_gct.write(out_gctoo, args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
def mk_report(proj_folder, out_folder):
    mar_sense = pd.read_table(
        os.path.join(out_folder, 'sense/expected_sensitivity_ranks.txt'),
        index_col='det_plate')
    mar_sense = mar_sense / 384
    mar_sense = mar_sense * 100

    # Collapse replicate columns to one median column per plate prefix
    for x in pd.Series([y.split('_')[0] for y in mar_sense.columns]).unique():
        mar_sense[x] = mar_sense[[p for p in mar_sense.columns if p.startswith(x)]].median(axis=1)
    mar_sense = mar_sense[pd.Series([y.split('_')[0] for y in mar_sense.columns]).unique()]

    # Concatenate the NORM and MEDIAN gcts for the project
    gct_list = [pe.parse(y) for y in glob.glob(os.path.join(proj_folder, 'card/*/*NORM*'))]
    norm_gct = concat.hstack(gct_list)
    gct_list = [pe.parse(y) for y in glob.glob(os.path.join(proj_folder, 'assemble/*/*MEDIAN*'))]
    mfi_gct = concat.hstack(gct_list)

    n_recovered = []
    invs = []
    beadsets = []
    plate = []
    med_rank = []
    dropouts = []
    for det_plate in mar_sense.index:
        temp = norm_gct.data_df[[x for x in norm_gct.data_df.columns if x.startswith(det_plate)]]
        dropouts.append(384 - temp.shape[1])

        sigs_recovered = mar_sense.loc[det_plate].dropna()[mar_sense.loc[det_plate].dropna() < 50].count()
        median_rank = mar_sense.loc[det_plate].median()

        temp = mfi_gct.data_df[[x for x in mfi_gct.data_df.columns if x.startswith(det_plate)]]
        median_inv = temp.loc[['c-661', 'c-662', 'c-663', 'c-664']].median(axis=1).median()
        beadset = det_plate.split('_')[-1].split(':')[0]

        n_recovered.append(sigs_recovered)
        invs.append(median_inv)
        beadsets.append(beadset)
        plate.append(det_plate)
        med_rank.append(median_rank)

    mar_df = pd.concat([pd.Series(plate).rename('det_plate'),
                        pd.Series(n_recovered).rename('sigs_recovered_core'),
                        pd.Series(med_rank).rename('median_rank_core'),
                        pd.Series(invs).rename('median_inv'),
                        pd.Series(dropouts).rename('n_dropouts'),
                        pd.Series(beadsets).rename('beadset')], axis=1)
    mar_df.set_index('det_plate', inplace=True)

    mar_ssmd = ssmd_an.ssmd_matrix(
        norm_paths=glob.glob(os.path.join(proj_folder, 'card/*/*NORM*')))
    mar_df = mar_df.join(mar_ssmd[mar_ssmd < 2].count().rename('ssmd_failures'))

    return mar_df
def load_data(gct_files):
    """ Read a list of GCT files and return them as a list of GCToo objects """
    gct_list = []
    for gct_path in gct_files:
        LOGGER.info('Reading {}'.format(gct_path))
        gct = pe.parse(gct_path)
        gct_list.append(gct)
    return gct_list
def main(args):
    # Read GCTs into a list
    gctoo_list = [parse.parse(gct) for gct in args.list_of_gcts]

    # Create superset of all probes in GCTs
    probe_superset = create_probe_superset(gctoo_list)

    # Create pdf in which each page is a probe of the superset
    create_output_pdf(probe_superset, gctoo_list, args.metadata_field, args.output_name)
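# Hypothetical sketch of create_probe_superset (the real helper is defined
# elsewhere): the union of row ids across all parsed GCTs.
def create_probe_superset_sketch(gctoo_list):
    probes = set()
    for g in gctoo_list:
        probes.update(g.data_df.index)
    return sorted(probes)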
def concat_main(args):
    """ Separate method from main() in order to make testing easier and
    to enable command-line access. """
    # Get files directly
    if args.input_filepaths is not None:
        files = args.input_filepaths
    # Or find them
    else:
        files = get_file_list(args.file_wildcard)

    # No files found
    if len(files) == 0:
        msg = "No files were found. args.file_wildcard: {}".format(args.file_wildcard)
        logger.error(msg)
        raise Exception(msg)

    # Only 1 file found
    if len(files) == 1:
        logger.warning("Only 1 file found. No concatenation needs to be done, exiting")
        return

    # More than 1 file found
    else:
        # Parse each file and append to a list
        gctoos = []
        for f in files:
            gctoos.append(parse.parse(f))

        # Create concatenated gctoo object
        if args.concat_direction == "horiz":
            out_gctoo = hstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)
        elif args.concat_direction == "vert":
            out_gctoo = vstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

    # Write out_gctoo to file
    logger.info("Writing to output file args.out_name: {}".format(args.out_name))
    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)
    elif args.out_type == "gct":
        write_gct.write(out_gctoo, args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
def run_sensitivities(proj_folder, gmt_path, out_folder):
    gct_list = [pe.parse(y) for y in glob.glob(os.path.join(proj_folder, 'card/*/*ZSPC.gct'))]
    fail_gct = concat.hstack(gct_list)
    if not os.path.exists(os.path.join(out_folder, 'sense')):
        os.mkdir(os.path.join(out_folder, 'sense'))
    sense.wtks(gct=fail_gct, metadata=fail_gct.col_metadata_df,
               outfolder=os.path.join(out_folder, 'sense'),
               group_col='prism_replicate', gmt_path=gmt_path)
def main(args):
    gct = parse.parse(args.in_gct_path)

    (_, conn_gct) = do_steep_and_sip(
        gct, args.similarity_metric, args.connectivity_metric,
        args.fields_to_aggregate)

    # Write output gct
    wg.write(conn_gct, args.out_sip_name,
             data_null="NaN", filler_null="NaN", metadata_null="NaN")
def test_main(self):
    test_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_in_test.gct")
    bg_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_in_bg.gct")
    out_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_main_out.gct")

    args_string = "-t {} -b {} -o {} -tfq {} -tft {} -bf {} -s {}".format(
        test_gct_path, bg_gct_path, out_path,
        "pert_iname", "pert_iname", "pert_iname", "|")
    args = sip.build_parser().parse_args(args_string.split())

    # Run main method
    sip.main(args)

    # Compare the output of main with the expected output
    e_out_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_expected_conn.gct")
    e_out_gct = parse.parse(e_out_path)
    out_gct = parse.parse(out_path)

    logger.debug("e_out_gct.data_df:\n{}".format(e_out_gct.data_df))
    logger.debug("out_gct.data_df:\n{}".format(out_gct.data_df))
    pd.util.testing.assert_frame_equal(e_out_gct.data_df, out_gct.data_df,
                                       check_less_precise=3)

    logger.debug("e_out_gct.row_metadata_df:\n{}".format(e_out_gct.row_metadata_df))
    logger.debug("out_gct.row_metadata_df:\n{}".format(out_gct.row_metadata_df))
    pd.util.testing.assert_frame_equal(e_out_gct.row_metadata_df, out_gct.row_metadata_df)

    logger.debug("e_out_gct.col_metadata_df:\n{}".format(e_out_gct.col_metadata_df))
    logger.debug("out_gct.col_metadata_df:\n{}".format(out_gct.col_metadata_df))
    pd.util.testing.assert_frame_equal(e_out_gct.col_metadata_df, out_gct.col_metadata_df)

    # Remove the created file
    os.remove(out_path)