def main(args=None):
    """Concatenate several GCT/GCTX files and write the combined result.

    Args:
        args: Optional namespace of already-parsed command-line arguments.
            When None (the default), the arguments are parsed from sys.argv
            and the logger is configured from args.verbose. A caller that
            passes args (e.g. a __main__ guard that parses the arguments
            before calling main(args)) is expected to have configured
            logging itself. The attributes read are: input_filepaths,
            file_wildcard, concat_direction, fields_to_remove, reset_ids,
            out_name, out_type and, for gct output, filler_null,
            metadata_null and data_null.

    Returns:
        None. With exactly one input file nothing is written.

    Raises:
        Exception: if no input files are found.
        ValueError: if args.concat_direction is neither "horiz" nor "vert".
    """
    if args is None:
        args = build_parser().parse_args(sys.argv[1:])
        setup_logger.setup(verbose=args.verbose)

    # Explicitly supplied file paths take precedence over the wildcard search
    if args.input_filepaths is not None:
        files = args.input_filepaths
    else:
        files = get_file_list(args.file_wildcard)

    # No files found
    if len(files) == 0:
        msg = "No files were found. args.file_wildcard: {}".format(
            args.file_wildcard)
        logger.error(msg)
        raise Exception(msg)

    # Only 1 file found
    if len(files) == 1:
        logger.warning(
            "Only 1 file found. No concatenation needs to be done, exiting")
        return

    # Pick the stacking function before doing any expensive parsing, so that
    # an unsupported direction fails immediately instead of surfacing later
    # as an unbound-variable error.
    if args.concat_direction == "horiz":
        stack = hstack
    elif args.concat_direction == "vert":
        stack = vstack
    else:
        raise ValueError(
            "args.concat_direction must be 'horiz' or 'vert'. "
            "args.concat_direction: {}".format(args.concat_direction))

    # Parse each file, then create the concatenated gctoo object
    gctoos = [parse.parse(f) for f in files]
    out_gctoo = stack(gctoos, args.fields_to_remove, args.reset_ids)

    # Write out_gctoo to file
    logger.info("Writing to output file args.out_name: {}".format(
        args.out_name))

    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)
    elif args.out_type == "gct":
        write_gct.write(out_gctoo, args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
    else:
        # Still no exception here (as before), but make the skipped write
        # visible instead of silently producing no output.
        logger.error(
            "Unrecognized args.out_type: {}. No file was written.".format(
                args.out_type))
(data_df, row_df, col_df) = GCToo.multi_index_df_to_component_dfs(mi_df) self.assertTrue(col_df.equals(e_col_metadata_df)) self.assertTrue(row_df.equals(e_row_metadata_df)) self.assertTrue(data_df.equals(e_data_df)) # edge case: if the index (or column) of the multi-index has only one # level, it becomes a regular index mi_df_index_plain = pd.MultiIndex.from_arrays([["D", "E"]], names=["rid"]) mi_df2 = pd.DataFrame([[1, 3, 5], [7, 11, 13]], index=mi_df_index_plain, columns=mi_df_columns) # row df should be empty e_row_df2 = pd.DataFrame(index=["D", "E"]) (data_df2, row_df2, col_df2) = GCToo.multi_index_df_to_component_dfs(mi_df2) self.assertTrue(row_df2.equals(e_row_df2)) self.assertTrue(col_df2.equals(e_col_metadata_df)) self.assertTrue(data_df2.equals(e_data_df)) if __name__ == "__main__": setup_GCToo_logger.setup(verbose=True) unittest.main()
""" import logging import setup_GCToo_logger as setup_logger import os import numpy as np import pandas as pd import h5py import GCToo __author__ = "Oana Enache" __email__ = "*****@*****.**" #instantiate logger logger = logging.getLogger(setup_logger.LOGGER_NAME) # when not in debug mode, probably best to set verbose=False setup_logger.setup(verbose=False) version_node = "version" rid_node = "/0/META/ROW/id" cid_node = "/0/META/COL/id" data_node = "/0/DATA/0/matrix" row_meta_group_node = "/0/META/ROW" col_meta_group_node = "/0/META/COL" def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None): """ Primary method of script. Reads in path to a gctx file and parses into GCToo object. Input: Mandatory:
"rids in concatenated_meta_df do not agree with rids in data_df.") # Reset rids in concatenated_meta_df reset_ids_in_meta_df(concatenated_meta_df) # Replace rids in data_df with the new ones from concatenated_meta_df # (just an array of unique integers, zero-indexed) data_df.index = pd.Index(concatenated_meta_df.index.values) def reset_ids_in_meta_df(meta_df): """ Meta_df is modified inplace. """ # Record original index name, and then change it so that the column that it # becomes will be appropriately named original_index_name = meta_df.index.name meta_df.index.name = "old_id" # Reset index meta_df.reset_index(inplace=True) # Change the index name back to what it was meta_df.index.name = original_index_name if __name__ == "__main__": args = build_parser().parse_args(sys.argv[1:]) setup_logger.setup(verbose=args.verbose) main(args)
set(mini_gctoo.row_metadata_df[c]))) self.assertTrue( set(mini_gctoo.row_metadata_df[c]) == set( mini_gctoo_row_metadata[c]), "Values in column {} differ between expected metadata and written row metadata!" .format(c)) # check col metadata self.assertTrue( set(mini_gctoo.col_metadata_df.columns) == set( mini_gctoo_col_metadata.columns), "Mismatch between expected col metadata columns {} and column values written to file: {}" .format(mini_gctoo.col_metadata_df.columns, mini_gctoo_col_metadata.columns)) self.assertTrue( set(mini_gctoo.col_metadata_df.index) == set( mini_gctoo.col_metadata_df.index), "Mismatch between expect col metadata index {} and index values written to file: {}" .format(mini_gctoo.col_metadata_df.index, mini_gctoo_col_metadata.index)) for c in list(mini_gctoo.col_metadata_df.columns): self.assertTrue( set(mini_gctoo.col_metadata_df[c]) == set( mini_gctoo_col_metadata[c]), "Values in column {} differ between expected metadata and written col metadata!" .format(c)) if __name__ == "__main__": setup_logger.setup(verbose=True) unittest.main()