def run(args):
    """Select delta values from permuted data and emit them as JSON.

    ``args.perm_type`` picks the delegate: ``get_deltas_for_heat`` ("heat"),
    ``get_deltas_for_mutations`` ("mutations") or ``get_deltas_for_network``
    ("network").  The JSON document (parameters, heat parameters, deltas) is
    written to ``args.output_file`` when that is set, otherwise to stdout.

    Side effect: ``args.sizes`` is filled with a default when it is empty.

    Raises:
        ValueError: if ``args.classic`` is false and the test statistic is not
            MAX_CC_SIZE, or if ``args.perm_type`` is not a supported value.
    """
    # if l not specified, set default based on test statistic
    if not args.sizes:
        args.sizes = [5, 10, 15, 20] if args.test_statistic == MAX_CC_SIZE else [3]

    # disallow finding delta by # of CCs of size >= l for HotNet2, since this is not currently
    # implemented correctly (and is non-trivial to implement)
    if not args.classic and args.test_statistic != MAX_CC_SIZE:
        raise ValueError("For HotNet2, the largest CC size test statistic must be used.")

    infmat_index = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    if args.perm_type == "heat":
        infmat = hnio.load_infmat(args.infmat_file, args.infmat_name)
        addtl_genes = None
        if args.permutation_genes_file:
            addtl_genes = hnio.load_genes(args.permutation_genes_file)
        deltas = get_deltas_for_heat(infmat, infmat_index, heat, addtl_genes,
                                     args.num_permutations, args.test_statistic,
                                     args.sizes, args.classic, args.num_cores)
    elif args.perm_type == "mutations":
        infmat = hnio.load_infmat(args.infmat_file, args.infmat_name)
        deltas = get_deltas_for_mutations(args, infmat, infmat_index, heat_params)
    elif args.perm_type == "network":
        deltas = get_deltas_for_network(args.permuted_networks_path, heat, args.infmat_name,
                                        infmat_index, args.test_statistic, args.sizes,
                                        args.classic, args.num_permutations, args.num_cores)
    else:
        raise ValueError("Invalid mutation permutation type: %s" % args.perm_type)

    payload = {"parameters": vars(args), "heat_parameters": heat_params, "deltas": deltas}
    if args.output_file:
        # the context manager closes the file even when serialization fails
        with open(args.output_file, 'w') as output_file:
            json.dump(payload, output_file, indent=4)
    else:
        json.dump(payload, sys.stdout, indent=4)
def run(args):
    """Build a similarity matrix from heat scores and hand it to createDendrogram.

    The heat file is read as JSON (its ``'heat'`` entry) when its extension is
    ``.json`` (case-insensitive), otherwise through ``hnio.load_heat_tsv``.
    Heat is then restricted to genes in the index and passed through
    ``hnheat.filter_heat`` before the similarity matrix is computed.
    Progress messages other than the loaded-gene count are printed only when
    ``args.verbose`` is set.
    """
    # Load the input data
    if args.verbose:
        print('* Loading infmat and heat files...')
    infmat = hnio.load_infmat(args.infmat_file, args.infmat_name)
    full_index2gene = hnio.load_index(args.infmat_index_file)

    using_json_heat = os.path.splitext(args.heat_file.lower())[1] == '.json'
    if using_json_heat:
        # close the heat file deterministically instead of leaking the handle
        with open(args.heat_file) as heat_json:
            heat = json.load(heat_json)['heat']
    else:
        heat = hnio.load_heat_tsv(args.heat_file)
    print("* Loaded heat scores for %s genes" % len(heat))

    # filter out genes not in the network
    heat = hnheat.filter_heat_to_network_genes(heat, set(full_index2gene.values()))

    # genes with score 0 cannot be in output components, but are eligible for heat in permutations
    # (the second return value, the filtered-out genes, is not needed here)
    heat, _ = hnheat.filter_heat(heat, None, False, 'There are ## genes with heat score 0')

    if args.verbose:
        print('* Creating similarity matrix...')
    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, True)

    # Create and output the dendrogram
    createDendrogram(sim, list(index2gene.values()), args.output_directory,
                     vars(args), args.verbose)
def run(args):
    """Compute deltas from permuted networks and emit them as JSON.

    Loads the influence-matrix index and the heat JSON, delegates to
    ``get_deltas_for_network`` and writes parameters, heat parameters and
    deltas to ``args.output_file`` when it is set, otherwise to stdout.
    """
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    deltas = get_deltas_for_network(args.permuted_networks_path, heat, args.infmat_name,
                                    infmat_index, args.sizes, args.num_permutations,
                                    args.parallel)

    payload = {"parameters": vars(args), "heat_parameters": heat_params, "deltas": deltas}
    if args.output_file:
        # the context manager closes the file even when serialization fails
        with open(args.output_file, 'w') as output_file:
            json.dump(payload, output_file, indent=4)
    else:
        json.dump(payload, sys.stdout, indent=4)
def run(args):
    """Build the diffusion matrix for an edge list and write the re-indexed network files.

    Steps: read integer edges from ``args.edgelist_file``, drop self-loops and
    isolated nodes, keep the largest connected component, compute
    ``expm_eig(-args.time * L)`` for the graph Laplacian ``L`` and save it as
    HDF5 or ``.npy`` depending on ``args.format`` (any other format value
    saves nothing).  Finally the index-to-gene mapping and the edge list are
    written with indices renumbered from ``args.start_index``.
    """
    # Load input graph
    print("* Loading input graph...")
    with open(args.edgelist_file) as infile:
        G = nx.Graph()
        # the first two whitespace-separated columns of each line are the endpoints
        G.add_edges_from([tuple(int(tok) for tok in line.split()[:2]) for line in infile])
    print("\t{} nodes with {} edges".format(len(G.nodes()), len(G.edges())))

    # Remove self-loops and zero degree nodes, and
    # restrict to the largest connected component
    # NOTE: the double space after "and" is the exact text this step has always printed.
    print("* Removing self-loops, zero degree nodes, and  "
          "restricting to the largest connected component")
    G.remove_edges_from([(u, v) for u, v in G.edges() if u == v])
    G.remove_nodes_from([n for n in G.nodes() if G.degree(n) == 0])
    G = G.subgraph(sorted(nx.connected_components(G), key=len, reverse=True)[0])
    print("\t{} nodes with {} edges remaining".format(len(G.nodes()), len(G.edges())))

    # Load gene index
    indexToGene = hnio.load_index(args.gene_index_file)

    # Make sure the output directory exists, then compute the Laplacian
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    print("* Computing Laplacian...")
    L = nx.laplacian_matrix(G)

    # Exponentiate the Laplacian for the given time and save it
    print("* Computing diffusion matrix...")
    Li = expm_eig(-args.time * L.todense())
    output_prefix = "{}/{}_inf_{}".format(args.output_dir, args.prefix, args.time)
    if args.format == 'hdf5':
        hnio.save_hdf5(output_prefix + ".h5", dict(Li=Li))
    elif args.format == 'npy':
        np.save(output_prefix + ".npy", Li)

    # Save the index to gene mapping
    indexOutputFile = "{}/{}_index_genes".format(args.output_dir, args.prefix)
    nodes = list(G.nodes())
    # one dict lookup per node replaces a linear nodes.index() scan per edge endpoint
    newIndex = {node: i + args.start_index for i, node in enumerate(nodes)}
    geneIndexOutput = ["{} {}".format(newIndex[node], indexToGene[node]) for node in nodes]
    hnio.write_file(indexOutputFile, "\n".join(geneIndexOutput))

    # Create edge list with revised indices
    edgeIndices = [sorted([newIndex[u], newIndex[v]]) for u, v in G.edges()]
    edgeOutputFile = "{}/{}_edge_list".format(args.output_dir, args.prefix)
    edgeOutput = ["{} {} 1".format(u, v) for u, v in edgeIndices]
    hnio.write_file(edgeOutputFile, "\n".join(edgeOutput))
def get_deltas_for_mutations(args, infmat, index2gene, heat_params):
    """Generate mutation-permutation heat and derive deltas from it.

    Args:
        args: parsed command-line namespace (permutation, BMR and gene-file
            options are read from it).
        infmat: influence matrix forwarded to get_deltas_from_heat_permutations.
        index2gene: index-to-gene mapping; when None it is loaded from
            ``args.infmat_index_file``.
        heat_params: parameters stored with the heat JSON (heat function name,
            sample/gene/SNV/CNA files, filter threshold, minimum frequency).

    Returns:
        Whatever get_deltas_from_heat_permutations returns for the generated
        permutations.
    """
    print("* Performing permuted mutation data delta selection...")
    # Honour the mapping supplied by the caller; only hit the disk when none was given.
    if index2gene is None:
        index2gene = hnio.load_index(args.infmat_index_file)

    heat_permutations = permutations.generate_mutation_permutation_heat(
        heat_params["heat_fn"], heat_params["sample_file"], heat_params["gene_file"],
        index2gene.values(), heat_params["snv_file"], args.gene_length_file, args.bmr,
        args.bmr_file, heat_params["cna_file"], args.gene_order_file,
        heat_params["cna_filter_threshold"], heat_params["min_freq"],
        args.num_permutations, args.num_cores)
    return get_deltas_from_heat_permutations(infmat, index2gene, heat_permutations,
                                             args.test_statistic, args.sizes,
                                             args.classic, args.num_cores)
def run(args):
    """Compute and save the Laplacian and its matrix exponential for an edge list.

    Steps: read integer edges from ``args.edgelist_file``, drop self-loops and
    isolated nodes, keep the largest connected component, save the Laplacian
    and ``expm(-args.time * L)`` as ``.mat`` files under ``args.output_dir``,
    then write the index-to-gene mapping and the edge list with indices
    renumbered from ``args.start_index``.
    """
    # Load input graph
    print("* Loading input graph...")
    G = nx.Graph()
    # read inside a context manager so the edge-list handle is always closed
    with open(args.edgelist_file) as edgelist:
        G.add_edges_from([tuple(int(tok) for tok in line.split()[:2]) for line in edgelist])
    print("\t{} nodes with {} edges".format(len(G.nodes()), len(G.edges())))

    # Remove self-loops and zero degree nodes, and
    # restrict to the largest connected component
    # NOTE: the double space after "and" is the exact text this step has always printed.
    print("* Removing self-loops, zero degree nodes, and  "
          "restricting to the largest connected component")
    G.remove_edges_from([(u, v) for u, v in G.edges() if u == v])
    G.remove_nodes_from([n for n in G.nodes() if G.degree(n) == 0])
    G = G.subgraph(sorted(nx.connected_components(G), key=len, reverse=True)[0])
    print("\t{} nodes with {} edges remaining".format(len(G.nodes()), len(G.edges())))

    # Load gene index
    index2gene = hnio.load_index(args.gene_index_file)

    # Compute and save Laplacian
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    print("* Computing Laplacian...")
    L = nx.laplacian_matrix(G)
    scipy.io.savemat("{}/{}_laplacian.mat".format(args.output_dir, args.prefix),
                     dict(L=L), oned_as='column')

    # Exponentiate the Laplacian for the given time and save it
    from scipy.linalg import expm
    Li = expm(-args.time * L)
    scipy.io.savemat("{}/{}_inf_{}.mat".format(args.output_dir, args.prefix, args.time),
                     dict(Li=Li), oned_as='column')

    # Save the index to gene mapping
    index_output_file = "{}/{}_index_genes".format(args.output_dir, args.prefix)
    nodes = list(G.nodes())
    # one dict lookup per node replaces a linear nodes.index() scan per edge endpoint
    new_index = {node: i + args.start_index for i, node in enumerate(nodes)}
    gene_index_output = ["{} {}".format(new_index[node], index2gene[node]) for node in nodes]
    hnio.write_file(index_output_file, "\n".join(gene_index_output))

    # Create edge list with revised indices
    edge_indices = [sorted([new_index[u], new_index[v]]) for u, v in G.edges()]
    edge_output_file = "{}/{}_edge_list".format(args.output_dir, args.prefix)
    edge_output = ["{} {} 1".format(u, v) for u, v in edge_indices]
    hnio.write_file(edge_output_file, "\n".join(edge_output))
def run(args):
    """Select delta values from permuted data and emit them as JSON.

    Dispatches on ``args.perm_type`` to ``get_deltas_for_heat``,
    ``get_deltas_for_mutations`` or ``get_deltas_for_network`` and writes the
    parameters, heat parameters and resulting deltas to ``args.output_file``
    (stdout when no output file is given).

    Side effect: an empty ``args.sizes`` is replaced by a default list.

    Raises:
        ValueError: if ``args.classic`` is false and the test statistic is not
            MAX_CC_SIZE, or if ``args.perm_type`` is not a supported value.
    """
    # if l not specified, set default based on test statistic
    if not args.sizes:
        args.sizes = [5, 10, 15, 20] if args.test_statistic == MAX_CC_SIZE else [3]

    # disallow finding delta by # of CCs of size >= l for HotNet2, since this is not currently
    # implemented correctly (and is non-trivial to implement)
    if not args.classic and args.test_statistic != MAX_CC_SIZE:
        raise ValueError("For HotNet2, the largest CC size test statistic must be used.")

    infmat_index = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    if args.perm_type == "heat":
        infmat = hnio.load_infmat(args.infmat_file, args.infmat_name)
        addtl_genes = None
        if args.permutation_genes_file:
            addtl_genes = hnio.load_genes(args.permutation_genes_file)
        deltas = get_deltas_for_heat(
            infmat,
            infmat_index,
            heat,
            addtl_genes,
            args.num_permutations,
            args.test_statistic,
            args.sizes,
            args.classic,
            args.num_cores,
        )
    elif args.perm_type == "mutations":
        infmat = hnio.load_infmat(args.infmat_file, args.infmat_name)
        deltas = get_deltas_for_mutations(args, infmat, infmat_index, heat_params)
    elif args.perm_type == "network":
        deltas = get_deltas_for_network(
            args.permuted_networks_path,
            heat,
            args.infmat_name,
            infmat_index,
            args.test_statistic,
            args.sizes,
            args.classic,
            args.num_permutations,
            args.num_cores,
        )
    else:
        raise ValueError("Invalid mutation permutation type: %s" % args.perm_type)

    report = {"parameters": vars(args), "heat_parameters": heat_params, "deltas": deltas}
    if args.output_file:
        # the context manager closes the file even when serialization fails
        with open(args.output_file, "w") as output_file:
            json.dump(report, output_file, indent=4)
    else:
        json.dump(report, sys.stdout, indent=4)
def get_deltas_for_mutations(args, infmat, index2gene, heat_params):
    """Generate mutation-permutation heat and derive deltas from it.

    Args:
        args: parsed command-line namespace (permutation, BMR and gene-file
            options are read from it).
        infmat: influence matrix forwarded to get_deltas_from_heat_permutations.
        index2gene: index-to-gene mapping; when None it is loaded from
            ``args.infmat_index_file``.
        heat_params: parameters stored with the heat JSON (heat function name,
            sample/gene/SNV/CNA files, filter threshold, minimum frequency).

    Returns:
        Whatever get_deltas_from_heat_permutations returns for the generated
        permutations.
    """
    print("* Performing permuted mutation data delta selection...")
    # Honour the mapping supplied by the caller; only hit the disk when none was given.
    if index2gene is None:
        index2gene = hnio.load_index(args.infmat_index_file)

    heat_permutations = permutations.generate_mutation_permutation_heat(
        heat_params["heat_fn"],
        heat_params["sample_file"],
        heat_params["gene_file"],
        index2gene.values(),
        heat_params["snv_file"],
        args.gene_length_file,
        args.bmr,
        args.bmr_file,
        heat_params["cna_file"],
        args.gene_order_file,
        heat_params["cna_filter_threshold"],
        heat_params["min_freq"],
        args.num_permutations,
        args.num_cores,
    )
    return get_deltas_from_heat_permutations(
        infmat, index2gene, heat_permutations, args.test_statistic, args.sizes,
        args.classic, args.num_cores
    )
def run(args):
    """Build a similarity matrix from heat scores and hand it to createDendrogram.

    The heat file is read as JSON (its ``'heat'`` entry) when its extension is
    ``.json`` (case-insensitive), otherwise through ``hnio.load_heat_tsv``.
    Heat is then restricted to genes in the index and passed through
    ``hnheat.filter_heat`` before the similarity matrix is computed.
    Progress messages other than the loaded-gene count are printed only when
    ``args.verbose`` is set.
    """
    # Load the input data
    if args.verbose:
        print('* Loading infmat and heat files...')
    infmat = hnio.load_infmat(args.infmat_file, args.infmat_name)
    full_index2gene = hnio.load_index(args.infmat_index_file)

    using_json_heat = os.path.splitext(args.heat_file.lower())[1] == '.json'
    if using_json_heat:
        # close the heat file deterministically instead of leaking the handle
        with open(args.heat_file) as heat_json:
            heat = json.load(heat_json)['heat']
    else:
        heat = hnio.load_heat_tsv(args.heat_file)
    print("* Loaded heat scores for %s genes" % len(heat))

    # filter out genes not in the network
    heat = hnheat.filter_heat_to_network_genes(heat, set(full_index2gene.values()))

    # genes with score 0 cannot be in output components, but are eligible for heat in permutations
    # (the second return value, the filtered-out genes, is not needed here)
    heat, _ = hnheat.filter_heat(heat, None, False, 'There are ## genes with heat score 0')

    if args.verbose:
        print('* Creating similarity matrix...')
    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, True)

    # Create and output the dendrogram
    createDendrogram(sim, index2gene.values(), args.output_directory, vars(args), args.verbose)
def run(args):
    """Run HotNet for every delta in ``args.deltas`` and write per-delta results.

    For each delta a ``delta_<value>`` directory is created under
    ``args.output_directory`` holding the components TSV, the JSON summary
    and, unless ``args.permutation_type`` is "none", the significance TSV.
    Permuted heat data sets are generated (or loaded, for "precomputed") once
    and shared by all deltas.

    Side effect: ``args.delta`` is set to the delta being processed so that it
    appears in the JSON parameters.

    Raises:
        RuntimeError: if mutation permutations are requested but the heat
            scores were not produced by ``load_mutation_heat``.
        ValueError: if ``args.permutation_type`` is not recognized.
    """
    # create output directory if doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    # load data
    infmat = hnio.load_infmat(args.infmat_file, args.infmat_name)
    full_index2gene = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    # compute similarity matrix
    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, not args.classic)

    # only calculate permuted data sets for significance testing once
    if args.permutation_type != "none":
        if args.permutation_type == "heat":
            print("* Generating heat permutations for statistical significance testing")
            extra_genes = None
            if args.permutation_genes_file:
                extra_genes = hnio.load_genes(args.permutation_genes_file)
            heat_permutations = p.permute_heat(heat, full_index2gene.values(),
                                               args.num_permutations, extra_genes,
                                               args.num_cores)
        elif args.permutation_type == "mutations":
            if heat_params["heat_fn"] != "load_mutation_heat":
                # adjacent literals keep source indentation out of the message
                raise RuntimeError(
                    "Heat scores must be based on mutation data to perform "
                    "significance testing based on mutation data permutation.")
            print("* Generating mutation permutations for statistical significance testing")
            heat_permutations = p.generate_mutation_permutation_heat(
                heat_params["heat_fn"], heat_params["sample_file"],
                heat_params["gene_file"], full_index2gene.values(),
                heat_params["snv_file"], args.gene_length_file, args.bmr,
                args.bmr_file, heat_params["cna_file"], args.gene_order_file,
                heat_params["cna_filter_threshold"], heat_params["min_freq"],
                args.num_permutations, args.num_cores)
        elif args.permutation_type == "network":
            pass    # nothing to do right now
        elif args.permutation_type == "precomputed":
            heat_file_paths = [
                args.datasets_path.replace(ITERATION_REPLACEMENT_TOKEN, str(i))
                for i in range(1, args.num_permutations + 1)
            ]
            heat_permutations = [hnio.load_heat_tsv(heat_file) for heat_file in heat_file_paths]
        else:
            raise ValueError("Unrecognized permutation type %s" % (args.permutation_type))

    for delta in args.deltas:
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        abs_out_dir = os.path.abspath(delta_out_dir)

        G = hn.weighted_graph(sim, index2gene, delta, not args.classic)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance
        if args.permutation_type != "none":
            if args.permutation_type == "network":
                sizes2stats = calculate_significance_network(
                    args, args.permuted_networks_path, full_index2gene, G, heat, delta,
                    args.num_permutations)
            else:
                sizes2stats = calculate_significance(args, infmat, full_index2gene, G, delta,
                                                     heat_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically, and
        # components are sorted first by length, then alphanumerically by name of the first
        # gene in the component
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        hnio.write_components_as_tsv(abs_out_dir + "/" + COMPONENTS_TSV, ccs)

        args.delta = delta  # include delta in parameters section of output JSON
        output_dict = {"parameters": vars(args), "heat_parameters": heat_params,
                       "sizes": hn.component_sizes(ccs), "components": ccs}
        if args.permutation_type != "none":
            output_dict["statistics"] = sizes2stats
            hnio.write_significance_as_tsv(abs_out_dir + "/" + SIGNIFICANCE_TSV, sizes2stats)

        # the context manager closes the file even when serialization fails
        with open(abs_out_dir + "/" + JSON_OUTPUT, 'w') as json_out:
            json.dump(output_dict, json_out, indent=4)
def run(args):
    """Collect HotNet results files into ``subnetworks.json`` for the results website.

    For every file in ``args.results_files`` the components, statistics and
    (when the heat came from ``load_mutation_heat``) oncoprint data are added
    under that run's delta.  The combined document is written to
    ``<output_directory>/subnetworks.json`` and the static page named by
    VIZ_SUBNETWORKS is copied next to it as VIZ_INDEX.
    """
    subnetworks_file = '%s/viz_files/%s' % (str(hotnet2.__file__).rsplit('/', 1)[0],
                                            VIZ_SUBNETWORKS)

    # create output directory if doesn't exist; warn if it exists and is not empty
    outdir = args.output_directory
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    if len(os.listdir(outdir)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    ks = set()
    output = dict(deltas=[], subnetworks=dict(), mutation_matrices=dict(), stats=dict())
    for results_file in args.results_files:
        with open(results_file) as results_in:
            results = json.load(results_in)
        ccs = results['components']

        with open(results['parameters']['heat_file']) as heat_in:
            heat_file = json.load(heat_in)
        gene2heat = heat_file['heat']
        heat_parameters = heat_file['parameters']

        d_score = (hnio.load_display_score_tsv(args.display_score_file)
                   if args.display_score_file else None)
        d_name = (hnio.load_display_name_tsv(args.display_name_file)
                  if args.display_name_file else dict())
        edges = hnio.load_ppi_edges(args.edge_file,
                                    hnio.load_index(results['parameters']['infmat_index_file']))

        delta = format(results['parameters']['delta'], 'g')
        output['deltas'].append(delta)
        output['subnetworks'][delta] = [
            viz.get_component_json(cc, gene2heat, edges, args.network_name, d_score, d_name)
            for cc in ccs
        ]

        # make oncoprints if heat file was generated from mutation data
        if 'heat_fn' in heat_parameters and heat_parameters['heat_fn'] == 'load_mutation_heat':
            samples = (hnio.load_samples(heat_parameters['sample_file'])
                       if heat_parameters['sample_file'] else None)
            genes = (hnio.load_genes(heat_parameters['gene_file'])
                     if heat_parameters['gene_file'] else None)
            snvs = (hnio.load_snvs(heat_parameters['snv_file'], genes, samples)
                    if heat_parameters['snv_file'] else [])
            cnas = (hnio.load_cnas(heat_parameters['cna_file'], genes, samples)
                    if heat_parameters['cna_file'] else [])

            output['mutation_matrices'][delta] = [
                viz.get_oncoprint_json(cc, snvs, cnas, d_name) for cc in ccs
            ]

            if heat_parameters.get('sample_type_file'):
                with open(heat_parameters['sample_type_file']) as f:
                    output['sampleToTypes'] = dict(l.rstrip().split() for l in f
                                                   if not l.startswith("#"))
                output['typeToSamples'] = dict(
                    (t, []) for t in set(output['sampleToTypes'].values()))
                for s, ty in output['sampleToTypes'].items():
                    output['typeToSamples'][ty].append(s)
            else:
                # Without a sample file there is no sample list to iterate over, so
                # fall back to the samples named by the mutations themselves
                # (assumes mutation records expose a `.sample` attribute -- TODO confirm).
                if not samples:
                    samples = set(m.sample for m in snvs) | set(m.sample for m in cnas)
                output['sampleToTypes'] = dict((s, "Cancer") for s in samples)
                output['typeToSamples'] = dict(Cancer=list(samples))

        output['stats'][delta] = results['statistics']
        # statistics are keyed by component size k (stored as strings in the JSON)
        ks |= set(map(int, results['statistics'].keys()))

    # list() keeps the value JSON-serializable where range() is lazy
    output['ks'] = list(range(min(ks), max(ks) + 1))
    with open('%s/subnetworks.json' % outdir, 'w') as out:
        json.dump(output, out, indent=4)

    shutil.copy(subnetworks_file, '%s/%s' % (outdir, VIZ_INDEX))
def run(args):
    """Compute and save the Laplacian and its matrix exponential for an edge list.

    Reads integer edges from ``args.edgelist_file``, removes self-loops and
    isolated nodes, restricts the graph to its largest connected component,
    saves the Laplacian and ``expm(-args.time * L)`` as ``.mat`` files under
    ``args.output_dir``, and writes the index-to-gene mapping and edge list
    with indices renumbered from ``args.start_index``.
    """
    # Load input graph
    print("* Loading input graph...")
    G = nx.Graph()
    # read inside a context manager so the edge-list handle is always closed
    with open(args.edgelist_file) as edgelist:
        G.add_edges_from(
            [tuple(int(tok) for tok in line.split()[:2]) for line in edgelist])
    print("\t{} nodes with {} edges".format(len(G.nodes()), len(G.edges())))

    # Remove self-loops and zero degree nodes, and
    # restrict to the largest connected component
    # NOTE: the double space after "and" is the exact text this step has always printed.
    print("* Removing self-loops, zero degree nodes, and  "
          "restricting to the largest connected component")
    G.remove_edges_from([(u, v) for u, v in G.edges() if u == v])
    G.remove_nodes_from([n for n in G.nodes() if G.degree(n) == 0])
    G = G.subgraph(
        sorted(nx.connected_components(G), key=len, reverse=True)[0])
    print("\t{} nodes with {} edges remaining".format(len(G.nodes()), len(G.edges())))

    # Load gene index
    index2gene = hnio.load_index(args.gene_index_file)

    # Compute and save Laplacian
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    print("* Computing Laplacian...")
    L = nx.laplacian_matrix(G)
    scipy.io.savemat("{}/{}_laplacian.mat".format(args.output_dir, args.prefix),
                     dict(L=L),
                     oned_as='column')

    # Exponentiate the Laplacian for the given time and save it
    from scipy.linalg import expm
    Li = expm(-args.time * L)
    scipy.io.savemat("{}/{}_inf_{}.mat".format(args.output_dir, args.prefix, args.time),
                     dict(Li=Li),
                     oned_as='column')

    # Save the index to gene mapping
    index_output_file = "{}/{}_index_genes".format(args.output_dir, args.prefix)
    nodes = list(G.nodes())
    # one dict lookup per node replaces a linear nodes.index() scan per edge endpoint
    renumbered = {node: i + args.start_index for i, node in enumerate(nodes)}
    gene_index_output = [
        "{} {}".format(renumbered[node], index2gene[node]) for node in nodes
    ]
    hnio.write_file(index_output_file, "\n".join(gene_index_output))

    # Create edge list with revised indices
    edge_indices = [sorted([renumbered[u], renumbered[v]]) for u, v in G.edges()]
    edge_output_file = "{}/{}_edge_list".format(args.output_dir, args.prefix)
    edge_output = ["{} {} 1".format(u, v) for u, v in edge_indices]
    hnio.write_file(edge_output_file, "\n".join(edge_output))
def run(args):
    """Run HotNet for every delta in ``args.deltas`` and write per-delta results.

    The influence matrix is read from the ``.mat`` file ``args.infmat_file``
    under the key ``args.infmat_name``.  For each delta a ``delta_<value>``
    directory is created under ``args.output_directory`` holding the
    components TSV, the JSON summary and, unless ``args.permutation_type`` is
    "none", the significance TSV.  Permuted heat data sets are generated (or
    loaded, for "precomputed") once and shared by all deltas.

    Side effect: ``args.delta`` is set to the delta being processed so that it
    appears in the JSON parameters.

    Raises:
        RuntimeError: if mutation permutations are requested but the heat
            scores were not produced by ``load_mutation_heat``.
        ValueError: if ``args.permutation_type`` is not recognized.
    """
    # create output directory if doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    # load data
    infmat = np.array(scipy.io.loadmat(args.infmat_file)[args.infmat_name])
    full_index2gene = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    # compute similarity matrix
    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, not args.classic)

    # only calculate permuted data sets for significance testing once
    permutation_type = args.permutation_type
    if permutation_type != "none":
        if permutation_type == "heat":
            print("* Generating heat permutations for statistical significance testing")
            extra_genes = None
            if args.permutation_genes_file:
                extra_genes = hnio.load_genes(args.permutation_genes_file)
            heat_permutations = p.permute_heat(heat, full_index2gene.values(),
                                               args.num_permutations, extra_genes,
                                               args.num_cores)
        elif permutation_type == "mutations":
            if heat_params["heat_fn"] != "load_mutation_heat":
                # adjacent literals keep source indentation out of the message
                raise RuntimeError(
                    "Heat scores must be based on mutation data to perform "
                    "significance testing based on mutation data permutation.")
            print("* Generating mutation permutations for statistical significance testing")
            heat_permutations = p.generate_mutation_permutation_heat(
                heat_params["heat_fn"], heat_params["sample_file"],
                heat_params["gene_file"], full_index2gene.values(),
                heat_params["snv_file"], args.gene_length_file, args.bmr,
                args.bmr_file, heat_params["cna_file"], args.gene_order_file,
                heat_params["cna_filter_threshold"], heat_params["min_freq"],
                args.num_permutations, args.num_cores)
        elif permutation_type == "network":
            pass    # nothing to do right now
        elif permutation_type == "precomputed":
            heat_file_paths = [args.datasets_path.replace(ITERATION_REPLACEMENT_TOKEN, str(i))
                               for i in range(1, args.num_permutations + 1)]
            heat_permutations = [hnio.load_heat_tsv(heat_file) for heat_file in heat_file_paths]
        else:
            raise ValueError("Unrecognized permutation type %s" % (args.permutation_type))

    for delta in args.deltas:
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        abs_out_dir = os.path.abspath(delta_out_dir)

        G = hn.weighted_graph(sim, index2gene, delta, not args.classic)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance
        if permutation_type != "none":
            if permutation_type == "network":
                sizes2stats = calculate_significance_network(
                    args, args.permuted_networks_path, full_index2gene, G, heat, delta,
                    args.num_permutations)
            else:
                sizes2stats = calculate_significance(args, infmat, full_index2gene, G, delta,
                                                     heat_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically, and
        # components are sorted first by length, then alphanumerically by name of the first
        # gene in the component
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        hnio.write_components_as_tsv(abs_out_dir + "/" + COMPONENTS_TSV, ccs)

        args.delta = delta  # include delta in parameters section of output JSON
        output_dict = {"parameters": vars(args), "heat_parameters": heat_params,
                       "sizes": hn.component_sizes(ccs), "components": ccs}
        if permutation_type != "none":
            output_dict["statistics"] = sizes2stats
            hnio.write_significance_as_tsv(abs_out_dir + "/" + SIGNIFICANCE_TSV, sizes2stats)

        # the context manager closes the file even when serialization fails
        with open(abs_out_dir + "/" + JSON_OUTPUT, 'w') as json_out:
            json.dump(output_dict, json_out, indent=4)
def run(args):
    """Pick deltas from permuted networks, run HotNet for each, and write all outputs.

    Deltas are the medians of the per-size delta lists returned by
    ``ft.get_deltas_for_network``.  For every delta a ``delta_<value>``
    directory receives the heat JSON, significance TSV, results JSON and
    components TSV; when ``args.edge_file`` is given, visualization data is
    also written under ``<output_directory>/viz``.

    Side effects on ``args``: ``min_heat_score`` is replaced by the value
    returned from ``hnheat.filter_heat``; ``heat_file`` and ``delta`` are
    overwritten for each delta so they appear in the results JSON.
    """
    # create output directory if doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    infmat = scipy.io.loadmat(args.infmat_file)[INFMAT_NAME]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat = hnio.load_heat_tsv(args.heat_file)
    # args.heat_file is repointed at each delta's heat JSON below; remember the
    # file the heat actually came from so every heat JSON records the true source
    source_heat_file = args.heat_file

    # filter out genes with heat score less than min_heat_score
    heat, addtl_genes, args.min_heat_score = hnheat.filter_heat(heat, args.min_heat_score)

    # find smallest delta
    deltas = ft.get_deltas_for_network(args.permuted_networks_path, heat, INFMAT_NAME,
                                       infmat_index, MAX_CC_SIZES, args.num_permutations,
                                       args.parallel)

    # and run HotNet with the median delta for each size
    run_deltas = [np.median(deltas[size]) for size in deltas]

    M, gene_index = hn.induce_infmat(infmat, infmat_index, sorted(heat.keys()))
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    # load interaction network edges and determine location of static HTML files for
    # visualization
    edges = hnio.load_ppi_edges(args.edge_file) if args.edge_file else None
    viz_files_dir = hotnet2.__file__.rsplit('/', 1)[0]
    index_file = '%s/viz_files/%s' % (viz_files_dir, VIZ_INDEX)
    subnetworks_file = '%s/viz_files/%s' % (viz_files_dir, VIZ_SUBNETWORKS)
    gene2index = {gene: index for index, gene in infmat_index.items()}

    for delta in run_deltas:
        # create output directory
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        abs_out_dir = os.path.abspath(delta_out_dir)

        # find connected components
        G = hn.weighted_graph(sim, gene_index, delta)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        print("* Performing permuted heat statistical significance...")
        heat_permutations = p.permute_heat(heat, args.num_permutations, addtl_genes,
                                           args.parallel)
        sizes = list(range(2, 11))
        print("\t- Using no. of components >= k (k \\in [%s, %s]) as statistic"
              % (min(sizes), max(sizes)))
        sizes2counts = stats.calculate_permuted_cc_counts(infmat, infmat_index,
                                                          heat_permutations, delta, sizes,
                                                          args.parallel)
        real_counts = stats.num_components_min_size(G, sizes)
        size2real_counts = dict(zip(sizes, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.num_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically, and
        # components are sorted first by length, then alphanumerically by name of the first
        # gene in the component
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        heat_dict = {"heat": heat, "parameters": {"heat_file": source_heat_file}}
        with open(abs_out_dir + "/" + HEAT_JSON, 'w') as heat_out:
            json.dump(heat_dict, heat_out, indent=4)

        # the results JSON refers to the per-delta heat JSON rather than the TSV
        args.heat_file = abs_out_dir + "/" + HEAT_JSON
        args.delta = delta
        output_dict = {"parameters": vars(args), "sizes": hn.component_sizes(ccs),
                       "components": ccs, "statistics": sizes2stats}
        hnio.write_significance_as_tsv(abs_out_dir + "/" + SIGNIFICANCE_TSV, sizes2stats)

        with open(abs_out_dir + "/" + JSON_OUTPUT, 'w') as json_out:
            json.dump(output_dict, json_out, indent=4)

        hnio.write_components_as_tsv(abs_out_dir + "/" + COMPONENTS_TSV, ccs)

        # write visualization output if edge file given
        if args.edge_file:
            viz_data = {"delta": delta,
                        'subnetworks': [viz.get_component_json(cc, heat, edges, gene2index,
                                                               args.network_name)
                                        for cc in ccs]}

            delta_viz_dir = '%s/viz/delta%s' % (args.output_directory, delta)
            if not os.path.isdir(delta_viz_dir):
                os.makedirs(delta_viz_dir)
            with open('%s/subnetworks.json' % delta_viz_dir, 'w') as viz_out:
                json.dump(viz_data, viz_out, indent=4)

            shutil.copy(subnetworks_file, delta_viz_dir)

    # the index page lists every delta, so it is written once after the loop
    if args.edge_file:
        viz.write_index_file(index_file, '%s/viz/%s' % (args.output_directory, VIZ_INDEX),
                             run_deltas)
def run(args):
    """Collect HotNet results files into ``subnetworks.json`` for the results website.

    For every file in ``args.results_files`` the components, statistics and
    (when the heat came from ``load_mutation_heat``) oncoprint data are added
    under that run's delta.  The heat scores of the last results file are
    stored under ``geneToHeat``; a warning goes to stderr when the results
    files did not all share the same heat scores.  The combined document is
    written to ``<output_directory>/subnetworks.json`` and the static page
    named by VIZ_SUBNETWORKS is copied next to it as VIZ_INDEX.
    """
    subnetworks_file = '%s/viz_files/%s' % (str(hotnet2.__file__).rsplit('/', 1)[0],
                                            VIZ_SUBNETWORKS)

    # create output directory if doesn't exist; warn if it exists and is not empty
    outdir = args.output_directory
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    if len(os.listdir(outdir)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    ks = set()
    output = dict(deltas=[], subnetworks=dict(), mutation_matrices=dict(), stats=dict())
    predictions = set()
    multipleHeatFiles = False
    for results_file in args.results_files:
        with open(results_file, 'r') as IN:
            results = json.load(IN)
        ccs = results['components']

        with open(results['parameters']['heat_file']) as heat_in:
            heat_file = json.load(heat_in)
        gene2heat = heat_file['heat']
        heat_parameters = heat_file['parameters']

        d_score = (hnio.load_display_score_tsv(args.display_score_file)
                   if args.display_score_file else None)
        d_name = (hnio.load_display_name_tsv(args.display_name_file)
                  if args.display_name_file else dict())
        edges = hnio.load_ppi_edges(args.edge_file,
                                    hnio.load_index(results['parameters']['infmat_index_file']))

        delta = format(results['parameters']['delta'], 'g')
        output['deltas'].append(delta)
        predictions |= set(g for cc in ccs for g in cc)
        output['subnetworks'][delta] = [
            viz.get_component_json(cc, gene2heat, edges, args.network_name, d_score, d_name)
            for cc in ccs
        ]

        # Record the heat scores.  Comparing the whole dicts detects both a changed
        # score and a changed gene set, and cannot fail on a gene that only the
        # newer heat file contains.
        if 'geneToHeat' in output and output['geneToHeat'] != gene2heat:
            multipleHeatFiles = True
        output['geneToHeat'] = gene2heat

        # make oncoprints if heat file was generated from mutation data
        if 'heat_fn' in heat_parameters and heat_parameters['heat_fn'] == 'load_mutation_heat':
            samples = (hnio.load_samples(heat_parameters['sample_file'])
                       if heat_parameters['sample_file'] else None)
            genes = (hnio.load_genes(heat_parameters['gene_file'])
                     if heat_parameters['gene_file'] else None)
            snvs = (hnio.load_snvs(heat_parameters['snv_file'], genes, samples)
                    if heat_parameters['snv_file'] else [])
            cnas = (hnio.load_cnas(heat_parameters['cna_file'], genes, samples)
                    if heat_parameters['cna_file'] else [])

            # Get the samples from the mutations directly if they weren't provided
            if not samples:
                samples = set(m.sample for m in snvs) | set(m.sample for m in cnas)

            output['mutation_matrices'][delta] = [
                viz.get_oncoprint_json(cc, snvs, cnas, d_name) for cc in ccs
            ]

            if heat_parameters.get('sample_type_file'):
                with open(heat_parameters['sample_type_file']) as f:
                    output['sampleToTypes'] = dict(l.rstrip().split() for l in f
                                                   if not l.startswith("#"))
                output['typeToSamples'] = dict(
                    (t, []) for t in set(output['sampleToTypes'].values()))
                for s, ty in output['sampleToTypes'].items():
                    output['typeToSamples'][ty].append(s)
            else:
                output['sampleToTypes'] = dict((s, "Cancer") for s in samples)
                output['typeToSamples'] = dict(Cancer=list(samples))

        output['stats'][delta] = results['statistics']
        # statistics are keyed by component size k (stored as strings in the JSON)
        ks |= set(map(int, results['statistics'].keys()))

    # Print a warning if there were multiple heat files referenced by
    # the results files
    if multipleHeatFiles:
        sys.stderr.write('Warning: results files used multiple heat files. Only the last heat file will be used to tabulate scores.\n')

    # Output to file
    output['predictions'] = sorted(predictions)  # list of nodes found in any run
    # list() keeps the value JSON-serializable where range() is lazy
    output['ks'] = list(range(min(ks), max(ks) + 1))
    with open('%s/subnetworks.json' % outdir, 'w') as out:
        json.dump(output, out, indent=4)

    shutil.copy(subnetworks_file, '%s/%s' % (outdir, VIZ_INDEX))
def run(args):
    """Bundle HotNet2 results files into the JSON read by the visualization.

    For every path in ``args.results_files`` the results JSON and the heat
    JSON it references (``parameters.heat_file``) are loaded.  Each component
    is converted with ``viz.get_component_json`` and, when the heat file's
    parameters name ``load_mutation_heat`` as ``heat_fn``, with
    ``viz.get_oncoprint_json`` as well.  The collected data is written to
    ``subnetworks.json`` inside ``args.output_directory`` and the
    ``VIZ_SUBNETWORKS`` file from the package's ``viz_files`` directory is
    copied there as ``VIZ_INDEX``.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``output_directory``, ``results_files``, ``display_score_file``,
            ``display_name_file``, ``edge_file`` and ``network_name``.
    """
    # Locate viz_files next to the hotnet2 module without assuming '/' is the
    # path separator or that __file__ contains a directory part.
    subnetworks_file = os.path.join(os.path.dirname(str(hotnet2.__file__)),
                                    'viz_files', VIZ_SUBNETWORKS)

    # create output directory if doesn't exist; warn if it exists and is not empty
    outdir = args.output_directory
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    if os.listdir(outdir):
        print(
            "WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
            "(Ctrl-c to cancel).")

    # The display files depend only on the arguments, so load them once
    # instead of once per results file.
    d_score = hnio.load_display_score_tsv(
        args.display_score_file) if args.display_score_file else None
    d_name = hnio.load_display_name_tsv(
        args.display_name_file) if args.display_name_file else dict()

    ks = set()
    output = dict(deltas=[], subnetworks=dict(), mutation_matrices=dict(), stats=dict())
    for results_file in args.results_files:
        # Read both JSON inputs through context managers so the handles are
        # closed even if parsing fails.
        with open(results_file) as results_in:
            results = json.load(results_in)
        ccs = results['components']
        parameters = results['parameters']

        with open(parameters['heat_file']) as heat_in:
            heat_file = json.load(heat_in)
        gene2heat = heat_file['heat']
        heat_parameters = heat_file['parameters']

        edges = hnio.load_ppi_edges(
            args.edge_file, hnio.load_index(parameters['infmat_index_file']))
        delta = format(parameters['delta'], 'g')
        output['deltas'].append(delta)
        output['subnetworks'][delta] = [
            viz.get_component_json(cc, gene2heat, edges, args.network_name, d_score, d_name)
            for cc in ccs
        ]

        # make oncoprints if heat file was generated from mutation data
        if heat_parameters.get('heat_fn') == 'load_mutation_heat':
            samples = hnio.load_samples(
                heat_parameters['sample_file']) if heat_parameters['sample_file'] else None
            genes = hnio.load_genes(
                heat_parameters['gene_file']) if heat_parameters['gene_file'] else None
            snvs = hnio.load_snvs(
                heat_parameters['snv_file'], genes, samples) if heat_parameters['snv_file'] else []
            cnas = hnio.load_cnas(
                heat_parameters['cna_file'], genes, samples) if heat_parameters['cna_file'] else []

            # Get the samples from the mutations directly if they weren't provided
            if not samples:
                samples = set(m.sample for m in snvs) | set(m.sample for m in cnas)

            output['mutation_matrices'][delta] = [
                viz.get_oncoprint_json(cc, snvs, cnas, d_name) for cc in ccs
            ]

            if heat_parameters.get('sample_type_file'):
                with open(heat_parameters['sample_type_file']) as f:
                    # One "sample type" pair per line; '#' lines are comments
                    # and blank lines are skipped rather than breaking dict().
                    output['sampleToTypes'] = dict(
                        l.rstrip().split() for l in f
                        if l.strip() and not l.startswith("#"))
                output['typeToSamples'] = dict(
                    (t, []) for t in set(output['sampleToTypes'].values()))
                for s, ty in output['sampleToTypes'].items():
                    output['typeToSamples'][ty].append(s)
            else:
                # No type annotation available: file every sample under "Cancer".
                output['sampleToTypes'] = dict((s, "Cancer") for s in samples)
                output['typeToSamples'] = dict(Cancer=list(samples))

        # A results file may carry no "statistics" entry; treat that as an
        # empty table instead of raising KeyError.
        stats = results.get('statistics', {})
        output['stats'][delta] = stats
        ks |= set(int(k) for k in stats)

    # list(...) keeps the value JSON-serializable where range() is lazy, and
    # the guard avoids min()/max() on an empty set.
    output['ks'] = list(range(min(ks), max(ks) + 1)) if ks else []
    with open('%s/subnetworks.json' % outdir, 'w') as out:
        json.dump(output, out, indent=4)

    shutil.copy(subnetworks_file, '%s/%s' % (outdir, VIZ_INDEX))
def run(args):
    """Run HotNet for every delta in ``args.deltas`` and write the results.

    The influence matrix, its index and the heat scores are loaded, a
    similarity matrix is built from them, and for each delta the connected
    components of the resulting weighted graph are extracted.  Each delta gets
    its own ``delta_<delta>`` sub-directory of ``args.output_directory``
    holding the components TSV, the JSON summary and, unless
    ``args.permutation_type`` is ``"none"``, the significance TSV.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``output_directory``, ``infmat_file``, ``infmat_name``,
            ``infmat_index_file``, ``heat_file``, ``permutation_type``,
            ``permutation_genes_file``, ``num_permutations``, ``parallel``,
            ``datasets_path``, ``deltas`` and ``min_cc_size``.  ``args.delta``
            is set on each iteration (see below).

    Raises:
        ValueError: if ``args.permutation_type`` is not ``"none"``,
            ``"heat"`` or ``"precomputed"``.
    """
    # create output directory if doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if os.listdir(args.output_directory):
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    # load data
    infmat = scipy.io.loadmat(args.infmat_file)[args.infmat_name]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat, heat_params = hnio.load_heat_json(args.heat_file)

    # compute similarity matrix and extract connected components
    M, gene_index = hn.induce_infmat(infmat, infmat_index, sorted(heat.keys()))
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    # only calculate permuted data sets for significance testing once
    test_significance = args.permutation_type != "none"
    if test_significance:
        if args.permutation_type == "heat":
            # Function-call form so the module also parses under Python 3.
            print("* Generating heat permutations for statistical significance testing")
            extra_genes = hnio.load_genes(args.permutation_genes_file) if args.permutation_genes_file \
                            else None
            heat_permutations = permutations.permute_heat(heat, args.num_permutations, extra_genes,
                                                          args.parallel)
        elif args.permutation_type == "precomputed":
            heat_file_paths = [args.datasets_path.replace(ITERATION_REPLACEMENT_TOKEN, str(i))
                               for i in range(1, args.num_permutations + 1)]
            heat_permutations = [hnio.load_heat_tsv(heat_file) for heat_file in heat_file_paths]
        else:
            raise ValueError("Unrecognized permutation type %s" % (args.permutation_type))

    for delta in args.deltas:
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        abs_out_dir = os.path.abspath(delta_out_dir)

        G = hn.weighted_graph(sim, gene_index, delta)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance
        if test_significance:
            sizes2stats = calculate_significance(args, infmat, infmat_index, G, delta,
                                                 heat_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically, and
        # components are sorted first by length (largest first), then alphanumerically by name
        # of the first gene in the component
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: (-len(comp), comp[0]))

        # write output
        hnio.write_components_as_tsv(abs_out_dir + "/" + COMPONENTS_TSV, ccs)
        # vars(args) below is the "parameters" entry of the JSON output, so
        # setting the attribute here records this iteration's delta in it.
        args.delta = delta
        output_dict = {"parameters": vars(args), "heat_parameters": heat_params,
                       "sizes": hn.component_sizes(ccs), "components": ccs}
        if test_significance:
            output_dict["statistics"] = sizes2stats
            hnio.write_significance_as_tsv(abs_out_dir + "/" + SIGNIFICANCE_TSV, sizes2stats)

        # Context manager closes the file even if serialization raises.
        with open(abs_out_dir + "/" + JSON_OUTPUT, 'w') as json_out:
            json.dump(output_dict, json_out, indent=4)