def join(args, outs, chunk_defs, chunk_outs):
    if args.filtered_matrix is None:
        outs.tsne = None
        outs.tsne_summary = {}
        return

    if not os.path.exists(outs.tsne):
        os.mkdir(outs.tsne)

    outs.tsne_summary = {'h5': {}, 'csv': {}}
    for method in args.factorization:
        # get all tsnes for a given method
        chunk_h5s = [os.path.join(chunk_out.tsne, method + '_tsne.h5')
                     for chunk_def, chunk_out in zip(chunk_defs, chunk_outs)
                     if chunk_def.method == method]
        chunk_csv_dirs = [os.path.join(chunk_out.tsne, method + '_tsne_csv')
                          for chunk_def, chunk_out in zip(chunk_defs, chunk_outs)
                          if chunk_def.method == method]

        analysis_io.combine_h5_files(chunk_h5s,
                                     os.path.join(outs.tsne, method + "_tsne.h5"),
                                     [analysis_constants.ANALYSIS_H5_TSNE_GROUP])
        for csv_dir in chunk_csv_dirs:
            cr_io.copytree(csv_dir, os.path.join(outs.tsne, method + "_tsne_csv"), allow_existing=True)

        outs.tsne_summary['h5'][method] = os.path.join(outs.tsne, method + "_tsne.h5")
        outs.tsne_summary['csv'][method] = os.path.join(outs.tsne, method + "_tsne_csv")
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return

    chunk_h5s = [chunk_out.tsne_h5 for chunk_out in chunk_outs]
    chunk_csv_dirs = [chunk_out.tsne_csv for chunk_out in chunk_outs]

    analysis_io.combine_h5_files(chunk_h5s, outs.tsne_h5,
                                 [analysis_constants.ANALYSIS_H5_TSNE_GROUP])
    for csv_dir in chunk_csv_dirs:
        cr_io.copytree(csv_dir, outs.tsne_csv, allow_existing=True)
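# Sketch only: the join stages in this file lean on analysis_io.combine_h5_files(in_files, out_file, groups).
# Its behavior is inferred here from the call sites -- merge the named top-level HDF5 groups from each
# per-chunk file into one output file. This is an illustrative stand-in written with h5py, not the
# library's actual implementation; the function name is an assumption.
import h5py

def combine_h5_files_sketch(in_files, out_file, groups):
    """Merge the members of the named groups from every input file into a single output file."""
    with h5py.File(out_file, 'w') as fout:
        for group in groups:
            fout.require_group(group)
        for in_file in in_files:
            with h5py.File(in_file, 'r') as fin:
                for group in groups:
                    if group not in fin:
                        continue
                    for key in fin[group]:
                        # copy each member (e.g. one tsne or clustering entry per chunk)
                        fin[group].copy(key, fout[group])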
def main(args, outs):
    if args.skip:
        return

    analysis_io.combine_h5_files([args.kmeans_h5, args.graphclust_h5],
                                 outs.clustering_h5,
                                 [analysis_constants.ANALYSIS_H5_KMEANS_GROUP,
                                  analysis_constants.ANALYSIS_H5_CLUSTERING_GROUP])

    csv_path = outs.clustering_csv
    cr_io.makedirs(csv_path, allow_existing=True)
    copy_subdirs(args.kmeans_csv, csv_path)
    copy_subdirs(args.graphclust_csv, csv_path)
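# copy_subdirs is called above but not defined in this snippet. A minimal sketch of such a helper,
# assuming it copies every immediate subdirectory of src_dir into dest_dir and tolerates existing
# destinations; the body below is an assumption, not the verified implementation.
def copy_subdirs(src_dir, dest_dir):
    for subdir in os.listdir(src_dir):
        src = os.path.join(src_dir, subdir)
        if os.path.isdir(src):
            cr_io.copytree(src, os.path.join(dest_dir, subdir), allow_existing=True)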
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip or args.is_multi_genome:
        return

    chunk_h5s = [chunk_out.diffexp_h5 for chunk_out in chunk_outs]
    chunk_csv_dirs = [chunk_out.diffexp_csv for chunk_out in chunk_outs]

    cr_io.combine_h5_files(chunk_h5s, outs.diffexp_h5,
                           [cr_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP,
                            cr_constants.ANALYSIS_H5_KMEANS_DIFFERENTIAL_EXPRESSION_GROUP])

    for csv_dir in chunk_csv_dirs:
        cr_utils.copytree(csv_dir, outs.diffexp_csv, allow_existing=True)
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip or args.is_multi_genome:
        return

    chunk_h5s = [chunk_out.kmeans_h5 for chunk_out in chunk_outs]
    chunk_csv_dirs = [chunk_out.kmeans_csv for chunk_out in chunk_outs]

    cr_io.combine_h5_files(chunk_h5s, outs.kmeans_h5,
                           [cr_constants.ANALYSIS_H5_CLUSTERING_GROUP,
                            cr_constants.ANALYSIS_H5_KMEANS_GROUP])

    for csv_dir in chunk_csv_dirs:
        cr_utils.copytree(csv_dir, outs.kmeans_csv, allow_existing=True)
def join(args, outs, chunk_defs, chunk_outs):
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        outs.enrichment_analysis = None
        outs.enrichment_analysis_summary = {}
        return

    peak_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_peak_bc_matrix)
    tf_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_tf_bc_matrix) \
        if args.filtered_tf_bc_matrix is not None else None

    outs.enrichment_analysis_summary = {'h5': {}, 'csv': {}}
    # for each method, we merge h5 files and copy csv directories to one place
    cr_io.mkdir(outs.enrichment_analysis, allow_existing=True)
    for method in args.factorization:
        method_dir = os.path.join(outs.enrichment_analysis, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        _h5 = os.path.join(method_dir, '{}_enrichment_h5.h5'.format(method))
        outs.enrichment_analysis_summary['h5'][method] = _h5
        chunk_h5s = []

        _csv = os.path.join(method_dir, '{}_enrichment_csv'.format(method))
        outs.enrichment_analysis_summary['csv'][method] = _csv

        diffexp_prefixes = [(fr.id, fr.name) for fr in peak_matrix_features.feature_defs]
        if args.filtered_tf_bc_matrix is not None:
            diffexp_prefixes += [(fr.id, fr.name) for fr in tf_matrix_features.feature_defs]

        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):
            chunk_outs_def_method_clustering = sorted([[chunk_out, chunk_def]
                                                       for chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
                                                       if chunk_def.clustering_key == key],
                                                      key=lambda x: x[1].cluster)
            chunk_outs_method_clustering = [c[0] for c in chunk_outs_def_method_clustering]

            # load 1 vs rest tests in sorted order of chunks and combine into one output per clustering
            diffexp = cr_diffexp.DIFFERENTIAL_EXPRESSION(
                np.hstack([np.loadtxt(com.tmp_diffexp, delimiter=',')[:, 0:3]
                           for com in chunk_outs_method_clustering]))

            # write out h5
            chunk_h5 = martian.make_path('{}_enrichment_h5.h5'.format(key))
            with analysis_io.open_h5_for_writing(chunk_h5) as f:
                cr_diffexp.save_differential_expression_h5(f, key, diffexp)
            chunk_h5s += [chunk_h5]

            # write out csv
            cr_diffexp.save_differential_expression_csv_from_features(key, diffexp, diffexp_prefixes, _csv)

        analysis_io.combine_h5_files(chunk_h5s, _h5,
                                     [analysis_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP,
                                      analysis_constants.ANALYSIS_H5_MAP_DE[method]])
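# Illustrative sketch of the merge performed above, under stated assumptions rather than the verified
# cr_diffexp API: each chunk's tmp_diffexp CSV is assumed to carry three columns for one cluster's
# "1 vs rest" test (e.g. mean counts, log2 fold change, adjusted p-value), and DIFFERENTIAL_EXPRESSION
# is assumed to be a thin namedtuple wrapper around the stacked matrix, so the hstack yields a
# features x (3 * n_clusters) array per clustering key. All names suffixed _SKETCH are hypothetical.
import collections
import numpy as np

DIFFERENTIAL_EXPRESSION_SKETCH = collections.namedtuple('DIFFERENTIAL_EXPRESSION_SKETCH', ['data'])

def combine_chunk_diffexp_sketch(chunk_csv_paths):
    # one CSV per cluster chunk, already sorted by cluster index
    per_cluster = [np.loadtxt(path, delimiter=',')[:, 0:3] for path in chunk_csv_paths]
    return DIFFERENTIAL_EXPRESSION_SKETCH(np.hstack(per_cluster))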
def main(args, outs):
    outs.clustering_summary = {}
    if args.filtered_matrix is None:
        outs.clustering = None
        return

    if not os.path.exists(outs.clustering):
        cr_io.mkdir(outs.clustering)

    # NOTE: both graph clustering and normal clustering should have run for given method
    assert args.clustering_summary['h5'].keys() == args.graph_clustering_summary['h5'].keys()

    outs.clustering_summary = {'h5': {}, 'csv': {}}
    for method in args.clustering_summary['h5'].keys():
        if method not in ALLOWED_FACTORIZATIONS:
            raise ValueError("invalid method: {}".format(method))

        merge_h5 = [args.clustering_summary['h5'][method],
                    args.graph_clustering_summary['h5'][method]]
        groups = [analysis_constants.ANALYSIS_H5_MAP_CLUSTERING[method],
                  analysis_constants.ANALYSIS_H5_CLUSTERING_GROUP]

        out_method_dir = os.path.join(outs.clustering, method)
        cr_io.mkdir(out_method_dir, allow_existing=True)

        out_clustering_h5 = os.path.join(out_method_dir, "{}_clustering.h5".format(method))
        outs.clustering_summary['h5'][method] = out_clustering_h5
        analysis_io.combine_h5_files(merge_h5, out_clustering_h5, groups)

        _csv1 = os.path.join(args.clustered_data, method, CLUSTER_FILE_HEAD[method] + "_csv")
        _csv2 = os.path.join(args.knn_clusters, method, "clusters_csv")
        out_csv = os.path.join(out_method_dir, method + "_csv")
        cr_io.copytree(_csv1, out_csv, allow_existing=True)
        cr_io.copytree(_csv2, out_csv, allow_existing=True)
        outs.clustering_summary['csv'][method] = out_csv
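# The stage above (and the join below) rely on two module-level lookups that are not shown in this
# snippet: ALLOWED_FACTORIZATIONS, the set of valid factorization method names, and CLUSTER_FILE_HEAD,
# a mapping from method to the file-name prefix used for that method's clustering outputs. The values
# below are hypothetical placeholders that only illustrate the expected shape; the real definitions
# come from the pipeline's constants module.
ALLOWED_FACTORIZATIONS_EXAMPLE = {'pca', 'lsa', 'plsa'}          # hypothetical membership
CLUSTER_FILE_HEAD_EXAMPLE = {
    'pca': 'kmeans',        # hypothetical prefixes; actual values live in the constants module
    'lsa': 'lsa_kmeans',
    'plsa': 'plsa_kmeans',
}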
def join(args, outs, chunk_defs, chunk_outs):
    if args.filtered_matrix is None:
        outs.clustered_data = None
        outs.clustering_summary = {}
        return

    if not os.path.exists(outs.clustered_data):
        cr_io.mkdir(outs.clustered_data)

    outs.clustering_summary = {'h5': {}, 'csv': {}}
    for method in args.factorization:
        chunk_h5s = [os.path.join(chunk_out.clustered_data, method, CLUSTER_FILE_HEAD[method] + ".h5")
                     for chunk_out in chunk_outs]
        chunk_csv_dirs = [os.path.join(chunk_out.clustered_data, method, CLUSTER_FILE_HEAD[method] + "_csv")
                          for chunk_out in chunk_outs]

        method_dir = os.path.join(outs.clustered_data, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        analysis_io.combine_h5_files(chunk_h5s,
                                     os.path.join(method_dir, CLUSTER_FILE_HEAD[method] + ".h5"),
                                     [analysis_constants.ANALYSIS_H5_CLUSTERING_GROUP,
                                      analysis_constants.ANALYSIS_H5_MAP_CLUSTERING[method]])
        for csv_dir in chunk_csv_dirs:
            cr_io.copytree(csv_dir,
                           os.path.join(method_dir, CLUSTER_FILE_HEAD[method] + "_csv"),
                           allow_existing=True)

        outs.clustering_summary['h5'][method] = os.path.join(method_dir, CLUSTER_FILE_HEAD[method] + ".h5")
        outs.clustering_summary['csv'][method] = os.path.join(method_dir, CLUSTER_FILE_HEAD[method] + "_csv")