def test_main(self):
    in_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_slice_in.gct")
    rid_grp_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_slice_rid.grp")
    out_name = os.path.join(FUNCTIONAL_TESTS_DIR, "test_slice_out.gct")
    expected_out_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_slice_expected.gct")

    args_string = "-i {} --rid {} -ec {} -o {}".format(
        in_gct_path, rid_grp_path, "f", out_name)
    args = slice_gct.build_parser().parse_args(args_string.split())

    # Run main method
    slice_gct.main(args)

    # Compare output to expected
    out_gct = pg.parse(out_name)
    expected_gct = pg.parse(expected_out_path)

    pd.util.testing.assert_frame_equal(out_gct.data_df, expected_gct.data_df)
    pd.util.testing.assert_frame_equal(out_gct.row_metadata_df, expected_gct.row_metadata_df)
    pd.util.testing.assert_frame_equal(out_gct.col_metadata_df, expected_gct.col_metadata_df)

    # Clean up
    os.remove(out_name)
def main(args):
    # Get files directly
    if args.list_of_gct_paths is not None:
        files = args.list_of_gct_paths

    # Or find them
    else:
        files = get_file_list(args.file_wildcard)
        assert len(files) > 0, "No files were found. args.file_wildcard: {}".format(
            args.file_wildcard)

    # Parse each file and append to a list
    gctoos = []
    for f in files:
        gctoos.append(parse_gctoo.parse(f))

    # Create concatenated gctoo object
    out_gctoo = hstack(gctoos, args.fields_to_remove, args.reset_sample_ids,
                       args.sort_headers)

    # Write out_gctoo to file
    logger.info("Write to file...")
    write_gctoo.write(out_gctoo, args.full_out_name,
                      filler_null=args.filler_null,
                      metadata_null=args.metadata_null,
                      data_null=args.data_null)
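
# A minimal usage sketch (not part of the original module): concatenating two
# already-parsed GCToo objects directly with hstack, following the positional
# pattern used in test_left_right below (fields_to_remove=None,
# reset_sample_ids=False, sort_headers=False). The file paths are hypothetical.
def example_hstack_two_gcts(left_path, right_path, out_path):
    gctoos = [parse_gctoo.parse(left_path), parse_gctoo.parse(right_path)]
    out_gctoo = hstack(gctoos, None, False, False)
    write_gctoo.write(out_gctoo, out_path)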
def test_parse(self):
    # L1000 gct
    l1000_file_path = os.path.join(FUNCTIONAL_TESTS_PATH, "test_l1000.gct")
    l1000_gct = pg.parse(l1000_file_path)

    # Check a few values
    correct_val = 11.3819
    self.assertTrue(l1000_gct.data_df.iloc[0, 0] == correct_val,
                    ("The first value in the data matrix should be " +
                     "{} not {}").format(str(correct_val), l1000_gct.data_df.iloc[0, 0]))

    correct_val = 58
    self.assertTrue(l1000_gct.col_metadata_df.iloc[0, 0] == correct_val,
                    ("The first value in the column metadata should be " +
                     "{} not {}").format(str(correct_val), l1000_gct.col_metadata_df.iloc[0, 0]))

    correct_val = "Analyte 11"
    self.assertTrue(l1000_gct.row_metadata_df.iloc[0, 0] == correct_val,
                    ("The first value in the row metadata should be " +
                     "{} not {}").format(str(correct_val), l1000_gct.row_metadata_df.iloc[0, 0]))

    # P100 gct
    p100_file_path = os.path.join(FUNCTIONAL_TESTS_PATH, "test_p100.gct")
    p100_gct = pg.parse(p100_file_path)

    # Check a few values
    correct_val = 0.918157217057044
    self.assertTrue(p100_gct.data_df.iloc[0, 0] == correct_val,
                    ("The first value in the data matrix should be " +
                     "{} not {}").format(str(correct_val), p100_gct.data_df.iloc[0, 0]))

    correct_val = "MCF7"
    self.assertTrue(p100_gct.col_metadata_df.iloc[0, 0] == correct_val,
                    ("The first value in the column metadata should be " +
                     "{} not {}").format(str(correct_val), p100_gct.col_metadata_df.iloc[0, 0]))

    correct_val = 1859
    self.assertTrue(p100_gct.row_metadata_df.iloc[0, 0] == correct_val,
                    ("The first value in the row metadata should be " +
                     "{} not {}").format(str(correct_val), p100_gct.row_metadata_df.iloc[0, 0]))
def test_left_right(self):
    # Verify that concatenation replicates the expected output file
    left_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_merge_left.gct")
    right_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_merge_right.gct")
    expected_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_merged_left_right.gct")

    left_gct = pg.parse(left_gct_path)
    right_gct = pg.parse(right_gct_path)
    expected_gct = pg.parse(expected_gct_path)

    # Merge left and right
    concated_gct = cg.hstack([left_gct, right_gct], None, False, False)

    self.assertTrue(expected_gct.data_df.equals(concated_gct.data_df),
                    "\nconcated_gct.data_df:\n{}\nexpected_gct.data_df:\n{}".format(
                        concated_gct.data_df, expected_gct.data_df))
    self.assertTrue(expected_gct.row_metadata_df.equals(concated_gct.row_metadata_df))
    self.assertTrue(expected_gct.col_metadata_df.equals(concated_gct.col_metadata_df))
def test_p100_functional(self):
    p100_in_path = os.path.join(FUNCTIONAL_TESTS_PATH, "test_p100.gct")
    p100_out_path = os.path.join(FUNCTIONAL_TESTS_PATH, "test_p100_writing.gct")

    # Read in original gct file
    p100_in_gct = parse_gctoo.parse(p100_in_path)

    # Write it out, then read in the new gct file
    wg.write(p100_in_gct, p100_out_path)
    p100_out_gct = parse_gctoo.parse(p100_out_path)

    self.assertTrue(p100_in_gct.data_df.equals(p100_out_gct.data_df))
    self.assertTrue(p100_in_gct.row_metadata_df.equals(p100_out_gct.row_metadata_df))
    self.assertTrue(p100_in_gct.col_metadata_df.equals(p100_out_gct.col_metadata_df))

    # Clean up
    os.remove(p100_out_path)
def test_with_both_metadata_fields(self):
    # path to files
    gctoo_path = FUNCTIONAL_TESTS_PATH + "/both_metadata_example_n1476x978.gct"
    gctoox_path = FUNCTIONAL_TESTS_PATH + "/both_metadata_example_n1476x978.gctx"

    # parse files
    c1_gctoo = parse_gctoo.parse(gctoo_path)
    c1_gctoox = parse_gctoox.parse(gctoox_path)

    # check rows and columns: data_df
    self.assertTrue(set(list(c1_gctoo.data_df.index)) == set(list(c1_gctoox.data_df.index)),
                    "Mismatch between data_df index values of gct vs gctx: {} vs {}".format(
                        c1_gctoo.data_df.index, c1_gctoox.data_df.index))
    self.assertTrue(set(list(c1_gctoo.data_df.columns)) == set(list(c1_gctoox.data_df.columns)),
                    "Mismatch between data_df column values of gct vs gctx: {} vs {}".format(
                        c1_gctoo.data_df.columns, c1_gctoox.data_df.columns))
    logger.debug("c1 gctoo data_df columns equal to gctoox data_df columns? {}".format(
        set(c1_gctoo.data_df.columns) == set(c1_gctoox.data_df.columns)))

    for c in list(c1_gctoo.data_df.columns):
        # logger.debug("Comparing data values in Column: {}".format(c))
        self.assertTrue(len(list(c1_gctoo.data_df[c])) == len(list(c1_gctoox.data_df[c])),
                        "Lengths of column {} differ between gct and gctx".format(c))
        # assert_frame_equal(pandas.DataFrame(c1_gctoo.data_df[c]), pandas.DataFrame(c1_gctoox.data_df[c]))
        assert_series_equal(c1_gctoo.data_df[c], c1_gctoox.data_df[c])

    # check rows and columns: row_metadata_df
    self.assertTrue(set(list(c1_gctoo.row_metadata_df.index)) == set(list(c1_gctoox.row_metadata_df.index)),
                    "Mismatch between row_metadata_df index values of gct vs gctx: {} vs {}".format(
                        c1_gctoo.row_metadata_df.index, c1_gctoox.row_metadata_df.index))
    self.assertTrue(set(list(c1_gctoo.row_metadata_df.columns)) == set(list(c1_gctoox.row_metadata_df.columns)),
                    "Mismatch between row_metadata_df column values of gct vs gctx: difference is {}".format(
                        set(c1_gctoo.row_metadata_df.columns).symmetric_difference(set(c1_gctoox.row_metadata_df.columns))))
    logger.debug("c1 gctoo row_metadata_df columns equal to gctoox row_metadata_df columns? {}".format(
        set(c1_gctoo.row_metadata_df.columns) == set(c1_gctoox.row_metadata_df.columns)))
    logger.debug("c1 gctoo dtypes: {}".format(c1_gctoo.row_metadata_df.dtypes))
    logger.debug("c1 gctoox dtypes: {}".format(c1_gctoox.row_metadata_df.dtypes))

    for c in list(c1_gctoo.row_metadata_df.columns):
        self.assertTrue(len(list(c1_gctoo.row_metadata_df[c])) == len(list(c1_gctoox.row_metadata_df[c])),
                        "Lengths of column {} differ between gct and gctx".format(c))
        logger.debug("first couple elems of {} in gctoo: {}".format(c, list(c1_gctoo.row_metadata_df[c])[0:3]))
        self.assertTrue(c1_gctoo.row_metadata_df[c].dtype == c1_gctoox.row_metadata_df[c].dtype,
                        "Dtype mismatch for {} between parsed gct & gctx: {} vs {}".format(
                            c, c1_gctoo.row_metadata_df[c].dtype, c1_gctoox.row_metadata_df[c].dtype))
        assert_series_equal(c1_gctoo.row_metadata_df[c], c1_gctoox.row_metadata_df[c])

    # check rows and columns: col_metadata_df
    self.assertTrue(set(list(c1_gctoo.col_metadata_df.index)) == set(list(c1_gctoox.col_metadata_df.index)),
                    "Mismatch between col_metadata_df index values of gct vs gctx: {} vs {}".format(
                        c1_gctoo.col_metadata_df.index, c1_gctoox.col_metadata_df.index))
    self.assertTrue(set(list(c1_gctoo.col_metadata_df.columns)) == set(list(c1_gctoox.col_metadata_df.columns)),
                    "Mismatch between col_metadata_df column values of gct vs gctx: {} vs {}".format(
                        c1_gctoo.col_metadata_df.columns, c1_gctoox.col_metadata_df.columns))
    logger.debug("c1 gctoo col_metadata_df columns equal to gctoox col_metadata_df columns? {}".format(
        set(c1_gctoo.col_metadata_df.columns) == set(c1_gctoox.col_metadata_df.columns)))

    for c in list(c1_gctoo.col_metadata_df.columns):
        self.assertTrue(len(list(c1_gctoo.col_metadata_df[c])) == len(list(c1_gctoox.col_metadata_df[c])),
                        "Lengths of column {} differ between gct and gctx".format(c))
        self.assertTrue(c1_gctoo.col_metadata_df[c].dtype == c1_gctoox.col_metadata_df[c].dtype,
                        "Dtype mismatch between parsed gct & gctx: {} vs {}".format(
                            c1_gctoo.col_metadata_df[c].dtype, c1_gctoox.col_metadata_df[c].dtype))
        assert_series_equal(c1_gctoo.col_metadata_df[c], c1_gctoox.col_metadata_df[c])
def main(args):
    in_gctoo = parse_gctoo.parse(args.filename, convert_neg_666=False)

    logger.debug("Original out name: {}".format(in_gctoo.src))

    if args.outname is None:
        out_name = str.split(in_gctoo.src, "/")[-1].split(".")[0]
    else:
        out_name = args.outname

    if args.outpath is not None:
        out_name = args.outpath + out_name

    write_gctoox.write(in_gctoo, out_name)
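
# A minimal usage sketch (not part of the original module): invoking the
# gct-to-gctx conversion entry point defined directly above by building an
# argparse.Namespace with the three attributes main() reads. The input path
# is hypothetical.
def example_convert_gct_to_gctx():
    import argparse
    example_args = argparse.Namespace(
        filename="my_data.gct",  # hypothetical input .gct path
        outname=None,            # None: derive the name from in_gctoo.src
        outpath=None)            # None: leave out_name without a directory prefix
    main(example_args)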
def main(args):
    # Read the input gct
    in_gct = pg.parse(args.in_gct_path)

    # Read in each of the command line arguments
    rid = _read_arg(args.rid)
    cid = _read_arg(args.cid)
    exclude_rid = _read_arg(args.exclude_rid)
    exclude_cid = _read_arg(args.exclude_cid)

    # Slice the gct
    out_gct = slice_gctoo(in_gct, rid=rid, cid=cid,
                          exclude_rid=exclude_rid, exclude_cid=exclude_cid)
    assert out_gct.data_df.size > 0, "Slicing yielded an empty gct!"

    # Write the output gct
    wg.write(out_gct, args.out_name,
             data_null="NaN", metadata_null="NA", filler_null="NA")
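
# A minimal usage sketch (not part of the original module), mirroring the
# command-line pattern exercised in test_main above: -i for the input gct,
# --rid for a .grp file of row ids to keep, -o for the output path. The file
# names are hypothetical; build_parser() is the module's parser builder used
# in test_main, and main() is the slicing entry point defined directly above.
def example_slice_by_rid():
    args_string = "-i my_input.gct --rid my_row_ids.grp -o my_sliced.gct"
    args = build_parser().parse_args(args_string.split())
    main(args)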
def test_with_only_row_metadata(self):
    # path to files
    gctoo_path = FUNCTIONAL_TESTS_PATH + "/row_meta_only_example_n2x1203.gct"
    gctoox_path = FUNCTIONAL_TESTS_PATH + "/row_meta_only_example_n2x1203.gctx"

    # parse files
    c2_gctoo = parse_gctoo.parse(gctoo_path)
    c2_gctoox = parse_gctoox.parse(gctoox_path)

    # check rows and columns: data_df
    self.assertTrue(set(list(c2_gctoo.data_df.index)) == set(list(c2_gctoox.data_df.index)),
                    "Mismatch between data_df index values of gct vs gctx: {} vs {}".format(
                        c2_gctoo.data_df.index, c2_gctoox.data_df.index))
    self.assertTrue(set(list(c2_gctoo.data_df.columns)) == set(list(c2_gctoox.data_df.columns)),
                    "Mismatch between data_df column values of gct vs gctx: {} vs {}".format(
                        c2_gctoo.data_df.columns, c2_gctoox.data_df.columns))
    logger.debug("c2 gctoo data_df columns equal to gctoox data_df columns? {}".format(
        set(c2_gctoo.data_df.columns) == set(c2_gctoox.data_df.columns)))

    for c in list(c2_gctoo.data_df.columns):
        self.assertTrue(len(list(c2_gctoo.data_df[c])) == len(list(c2_gctoox.data_df[c])),
                        "Lengths of column {} differ between gct and gctx".format(c))
        assert_series_equal(c2_gctoo.data_df[c], c2_gctoox.data_df[c])

    # check rows and columns: row_metadata_df
    self.assertTrue(set(list(c2_gctoo.row_metadata_df.index)) == set(list(c2_gctoox.row_metadata_df.index)),
                    "Mismatch between row_metadata_df index values of gct vs gctx: {} vs {}".format(
                        c2_gctoo.row_metadata_df.index, c2_gctoox.row_metadata_df.index))
    self.assertTrue(set(list(c2_gctoo.row_metadata_df.columns)) == set(list(c2_gctoox.row_metadata_df.columns)),
                    "Mismatch between row_metadata_df column values of gct vs gctx: {} vs {}".format(
                        c2_gctoo.row_metadata_df.columns, c2_gctoox.row_metadata_df.columns))
    logger.debug("c2 gctoo row_metadata_df columns equal to gctoox row_metadata_df columns? {}".format(
        set(c2_gctoo.row_metadata_df.columns) == set(c2_gctoox.row_metadata_df.columns)))

    for c in list(c2_gctoo.row_metadata_df.columns):
        self.assertTrue(len(list(c2_gctoo.row_metadata_df[c])) == len(list(c2_gctoox.row_metadata_df[c])),
                        "Lengths of column {} differ between gct and gctx".format(c))
        self.assertTrue(c2_gctoo.row_metadata_df[c].dtype == c2_gctoox.row_metadata_df[c].dtype,
                        "Dtype mismatch between parsed gct & gctx: {} vs {}".format(
                            c2_gctoo.row_metadata_df[c].dtype, c2_gctoox.row_metadata_df[c].dtype))
        logger.debug("first couple elems of {} in gctoo: {}".format(c, list(c2_gctoo.row_metadata_df[c])[0:3]))
        assert_series_equal(c2_gctoo.row_metadata_df[c], c2_gctoox.row_metadata_df[c])

    # check rows and columns: col_metadata_df
    self.assertTrue(set(list(c2_gctoo.col_metadata_df.index)) == set(list(c2_gctoox.col_metadata_df.index)),
                    "Mismatch between col_metadata_df index values of gct vs gctx: {} vs {}".format(
                        c2_gctoo.col_metadata_df.index, c2_gctoox.col_metadata_df.index))
    self.assertTrue(set(list(c2_gctoo.col_metadata_df.columns)) == set(list(c2_gctoox.col_metadata_df.columns)),
                    "Mismatch between col_metadata_df column values of gct vs gctx: {} vs {}".format(
                        c2_gctoo.col_metadata_df.columns, c2_gctoox.col_metadata_df.columns))
    logger.debug("c2 gctoo col_metadata_df columns equal to gctoox col_metadata_df columns? {}".format(
        set(c2_gctoo.col_metadata_df.columns) == set(c2_gctoox.col_metadata_df.columns)))

    for c in list(c2_gctoo.col_metadata_df.columns):
        self.assertTrue(len(list(c2_gctoo.col_metadata_df[c])) == len(list(c2_gctoox.col_metadata_df[c])),
                        "Lengths of column {} differ between gct and gctx".format(c))
        self.assertTrue(c2_gctoo.col_metadata_df[c].dtype == c2_gctoox.col_metadata_df[c].dtype,
                        "Dtype mismatch between parsed gct & gctx: {} vs {}".format(
                            c2_gctoo.col_metadata_df[c].dtype, c2_gctoox.col_metadata_df[c].dtype))
        assert_series_equal(c2_gctoo.col_metadata_df[c], c2_gctoox.col_metadata_df[c])
def parse(file_path, convert_neg_666=True, rid=None, cid=None, nan_values=None, meta_only=None):
    """
    Identifies whether file_path corresponds to a .gct or .gctx file and calls
    the corresponding parse method.

    Input:
        Mandatory:
        - file_path (str): full path to the gct(x) file you want to parse.

        Optional:
        - convert_neg_666 (bool): whether to convert -666 values to numpy.nan
            (see Note below for more details). Default = True.
        - rid (list of strings): list of row ids to specifically keep from gctx. Default = None.
        - cid (list of strings): list of col ids to specifically keep from gctx. Default = None.

    Output:
        - myGCToo (GCToo)

    Note: why does convert_neg_666 exist?
        In CMap--for somewhat obscure historical reasons--we use "-666" as our
        null value for metadata. However (so that users can take full advantage
        of pandas' methods, including those for filtering nan's, etc.), we
        provide the option of converting these into numpy.nan values, the
        pandas default.
    """
    if file_path.endswith(".gct"):
        curr = parse_gctoo.parse(file_path, convert_neg_666, rid, cid)
    elif file_path.endswith(".gctx"):
        curr = parse_gctoox.parse(file_path, convert_neg_666, rid, cid, meta_only)
    else:
        logger.error("File to parse must be .gct or .gctx!")
        curr = None
    return curr
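
# A minimal usage sketch (not part of the original module): the file extension
# decides which parser is dispatched to, so callers handle .gct and .gctx
# paths identically. The file names and rid values are hypothetical; meta_only
# is only forwarded to the gctx parser.
def example_parse_either_format():
    my_gct = parse("my_data.gct")                      # dispatched to parse_gctoo
    my_gctx = parse("my_data.gctx", rid=["r1", "r2"])  # dispatched to parse_gctoox
    return my_gct, my_gctx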