def get_deltas_for_heat(infmat, index2gene, gene2heat, addtl_genes, num_permutations, parallel):
    print("* Performing permuted heat delta selection...")
    heat_permutations = permutations.permute_heat(gene2heat, num_permutations,
                                                  addtl_genes, parallel)
    return get_deltas_from_heat_permutations(infmat, index2gene, heat_permutations, parallel)
def consensus_with_stats(args, networks, heats, verbose=0):
    """Change here: additionally return the single_My_results (single_permuted_sub_score_delta) output."""
    # Run with the input heat
    single_runs, single_My_results, consensus, linkers, auto_deltas = consensus_run(
        args, networks, heats, verbose)

    # Generate permuted heats
    np = args.consensus_permutations
    permuted_single_runs = defaultdict(list)
    for (infmat, indexToGene, G, nname, pnp), (heat, hname) in product(networks, heats):
        # 1) Filter the heat scores
        # 1a) Remove genes not in the network
        heat = filter_heat_to_network_genes(heat, set(indexToGene.values()), verbose)
        # 1b) Genes with score 0 cannot be in output components, but are eligible
        #     for heat in permutations
        heat, addtl_genes = filter_heat(heat, None, False,
                                        'There are ## genes with heat score 0')
        for permutation in permute_heat(heat, indexToGene.values(), np, addtl_genes,
                                        args.num_cores):
            # pass the permuted heat (not the observed heat) to each permuted run
            result, Myresult = run_helper(args, infmat, indexToGene, G, nname, pnp,
                                          permutation, hname, addtl_genes,
                                          get_deltas_hotnet2, HN2_INFMAT_NAME,
                                          HN2_MAX_CC_SIZES, verbose=verbose)
            permuted_single_runs[(hname, nname)].append(result)

    # Run consensus on each set of permuted results to compute the empirical distribution
    network_heat_pairs = permuted_single_runs.keys()
    permuted_counts = []
    for i in range(np):  # one consensus per consensus permutation
        runs = [(n, h, permuted_single_runs[(n, h)][i]) for n, h in network_heat_pairs]
        permuted_consensus, _, _ = identify_consensus(runs, verbose=verbose)
        permuted_counts.append(count_consensus(permuted_consensus))

    # Summarize stats: observed count, expected (mean permuted) count, empirical P-value
    consensus_stats = dict()
    for k, count in count_consensus(consensus).iteritems():
        empirical = [permuted_count[k] for permuted_count in permuted_counts]
        if np == 0:
            pval = 1.
            expected = 0.
        else:
            expected = numpy.mean(empirical)
            pval = sum(1. for p in empirical if p >= count) / np
        consensus_stats[k] = dict(observed=count, expected=expected, pval=pval)

    return single_runs, single_My_results, consensus, linkers, auto_deltas, consensus_stats
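# A minimal, self-contained sketch of the empirical statistics computed at the end of
# consensus_with_stats above (function and argument names here are illustrative, not
# part of the codebase): the expected count is the mean over permutations, and the
# P-value is the fraction of permuted counts at least as large as the observed count.
def empirical_stats(observed_count, permuted_counts):
    n = len(permuted_counts)
    if n == 0:
        return dict(observed=observed_count, expected=0., pval=1.)
    expected = sum(permuted_counts) / float(n)
    pval = sum(1. for p in permuted_counts if p >= observed_count) / n
    return dict(observed=observed_count, expected=expected, pval=pval)

# Example: 3 of 4 permuted counts meet or exceed the observed count of 2.
# empirical_stats(2, [3, 1, 2, 5]) -> {'observed': 2, 'expected': 2.75, 'pval': 0.75}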
def heat_permutation_significance(args, heat, infmat, infmat_index, G):
    print("* Performing permuted heat statistical significance...")
    addtl_genes = hnio.load_genes(args.permutation_genes_file) \
        if args.permutation_genes_file else None
    heat_permutations = permutations.permute_heat(heat, args.num_permutations,
                                                  addtl_genes, args.parallel)
    return calculate_significance(args, infmat, infmat_index, G, heat_permutations)
def run_helper(args, infmat, full_index2gene, G, nname, pnp, heat, hname, addtl_genes,
               get_deltas_fn, infmat_name="PPR", max_cc_sizes=[5, 10, 15, 20], verbose=0):
    """Helper shared by runHotNet2 and runClassicHotNet."""
    # Perform delta selection (if necessary)
    if args.deltas:
        deltas = args.deltas
    else:
        deltas = get_deltas_fn(full_index2gene, heat, args.network_permutations,
                               args.num_cores, infmat, addtl_genes, pnp, infmat_name,
                               max_cc_sizes, verbose)

    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, True,
                                           verbose=verbose)

    results = []
    for delta in deltas:
        # find connected components
        G = hn.weighted_graph(sim, index2gene, delta, directed=True)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        if verbose > 4:
            print "* Performing permuted heat statistical significance..."
            print "\t- Using no. of components >= k (k \\in",
            print "[%s, %s]) as statistic" % (min(HN2_STATS_SIZES), max(HN2_STATS_SIZES))
        heat_permutations = p.permute_heat(heat, full_index2gene.values(),
                                           args.heat_permutations, addtl_genes,
                                           args.num_cores)
        sizes2counts = stats.calculate_permuted_cc_counts(infmat, full_index2gene,
                                                          heat_permutations, delta,
                                                          HN2_STATS_SIZES, True,
                                                          args.num_cores)
        real_counts = stats.num_components_min_size(G, HN2_STATS_SIZES)
        size2real_counts = dict(zip(HN2_STATS_SIZES, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.heat_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically,
        # and components are sorted first by length, then alphanumerically by the name
        # of the first gene in the component
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # Record the results for this delta
        results.append((ccs, sizes2stats, delta))

    return results
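# A minimal sketch (hypothetical gene names, not from the codebase) of the component
# sorting convention used above: Python's sort is stable, so sorting by first gene and
# then by length (descending) yields components ordered by size, with ties broken
# alphanumerically by their first gene.
ccs = [["TP53", "ATM"], ["KRAS"], ["BRAF", "NRAS", "MAP2K1"]]
ccs = [sorted(cc) for cc in ccs]      # genes within each component, alphanumeric
ccs.sort(key=lambda comp: comp[0])    # secondary key: first gene name
ccs.sort(key=len, reverse=True)       # primary key: component size, largest first
# -> [['BRAF', 'MAP2K1', 'NRAS'], ['ATM', 'TP53'], ['KRAS']]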
def consensus_with_stats(args, networks, heats, verbose=0):
    # Run with the input heat
    single_runs, consensus, linkers, auto_deltas = consensus_run(
        args, networks, heats, verbose)

    # Generate permuted heats
    np = args.consensus_permutations
    permuted_single_runs = defaultdict(list)
    for (infmat, indexToGene, G, nname, pnp), (heat, hname) in product(networks, heats):
        # 1) Filter the heat scores
        # 1a) Remove genes not in the network
        heat = filter_heat_to_network_genes(heat, set(indexToGene.values()), verbose)
        # 1b) Genes with score 0 cannot be in output components, but are eligible
        #     for heat in permutations
        heat, addtl_genes = filter_heat(heat, None, False,
                                        'There are ## genes with heat score 0')
        for permutation in permute_heat(heat, indexToGene.values(), np, addtl_genes,
                                        args.num_cores):
            # pass the permuted heat (not the observed heat) to each permuted run
            result = run_helper(args, infmat, indexToGene, G, nname, pnp, permutation,
                                hname, addtl_genes, get_deltas_hotnet2, HN2_INFMAT_NAME,
                                HN2_MAX_CC_SIZES, verbose=verbose)
            permuted_single_runs[(hname, nname)].append(result)

    # Run consensus on each set of permuted results to compute the empirical distribution
    network_heat_pairs = permuted_single_runs.keys()
    permuted_counts = []
    for i in range(np):
        runs = [(n, h, permuted_single_runs[(n, h)][i]) for n, h in network_heat_pairs]
        permuted_consensus, _, _ = identify_consensus(runs, verbose=verbose)
        permuted_counts.append(count_consensus(permuted_consensus))

    # Summarize stats
    consensus_stats = dict()
    for k, count in count_consensus(consensus).iteritems():
        empirical = [permuted_count[k] for permuted_count in permuted_counts]
        if np == 0:
            pval = 1.
            expected = 0.
        else:
            expected = numpy.mean(empirical)
            pval = sum(1. for p in empirical if p >= count) / np
        consensus_stats[k] = dict(observed=count, expected=expected, pval=pval)

    return single_runs, consensus, linkers, auto_deltas, consensus_stats
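# A hedged sketch (assuming the dict-based gene -> score heat representation used
# throughout; split_heat is an illustrative name) of the two filtering steps above:
# keep only genes present in the network, then split out zero-score genes, which are
# excluded from output components but remain eligible to receive heat in permutations.
def split_heat(heat, network_genes):
    heat = {g: s for g, s in heat.items() if g in network_genes}  # 1a) network genes only
    zero_genes = set(g for g, s in heat.items() if s == 0)        # 1b) eligible for permuted heat
    heat = {g: s for g, s in heat.items() if s != 0}              #     but never in components
    return heat, zero_genes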
def get_deltas_for_heat(infmat, index2gene, gene2heat, addtl_genes, num_permutations, parallel):
    print "* Performing permuted heat delta selection..."
    heat_permutations = permutations.permute_heat(gene2heat, num_permutations,
                                                  addtl_genes, parallel)
    return get_deltas_from_heat_permutations(infmat, index2gene, heat_permutations, parallel)
def run_helper(args, infmat_name, get_deltas_fn, extra_delta_args):
    """Helper shared by simpleRun and simpleRunClassic."""
    # create output directory if it doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be "
              "overwritten. (Ctrl-c to cancel).")

    infmat = hnio.load_infmat(args.infmat_file, infmat_name)
    full_index2gene = hnio.load_index(args.infmat_index_file)

    using_json_heat = os.path.splitext(args.heat_file.lower())[1] == '.json'
    if using_json_heat:
        heat = json.load(open(args.heat_file))['heat']
    else:
        heat = hnio.load_heat_tsv(args.heat_file)
    print "* Loaded heat scores for %s genes" % len(heat)

    # filter out genes not in the network
    heat = hnheat.filter_heat_to_network_genes(heat, set(full_index2gene.values()))

    # genes with score 0 cannot be in output components, but are eligible for heat
    # in permutations
    heat, addtl_genes = hnheat.filter_heat(heat, None, False,
                                           'There are ## genes with heat score 0')

    deltas = get_deltas_fn(full_index2gene, heat, args.delta_permutations, args.num_cores,
                           infmat, addtl_genes, *extra_delta_args)

    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, True)

    results_files = []
    for delta in deltas:
        # create output directory
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)

        # find connected components
        G = hn.weighted_graph(sim, index2gene, delta, directed=True)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        print "* Performing permuted heat statistical significance..."
        heat_permutations = p.permute_heat(heat, full_index2gene.values(),
                                           args.significance_permutations, addtl_genes,
                                           args.num_cores)
        sizes = range(2, 11)
        print "\t- Using no. of components >= k (k \\in",
        print "[%s, %s]) as statistic" % (min(sizes), max(sizes))
        sizes2counts = stats.calculate_permuted_cc_counts(infmat, full_index2gene,
                                                          heat_permutations, delta, sizes,
                                                          True, args.num_cores)
        real_counts = stats.num_components_min_size(G, sizes)
        size2real_counts = dict(zip(sizes, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.significance_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically,
        # and components are sorted first by length, then alphanumerically by the name
        # of the first gene in the component
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        if not using_json_heat:
            heat_dict = {"heat": heat, "parameters": {"heat_file": args.heat_file}}
            heat_out = open(os.path.abspath(delta_out_dir) + "/" + HEAT_JSON, 'w')
            json.dump(heat_dict, heat_out, indent=4)
            heat_out.close()
            args.heat_file = os.path.abspath(delta_out_dir) + "/" + HEAT_JSON

        args.delta = delta  # include delta in parameters section of output JSON
        output_dict = {"parameters": vars(args), "sizes": hn.component_sizes(ccs),
                       "components": ccs, "statistics": sizes2stats}
        hnio.write_significance_as_tsv(os.path.abspath(delta_out_dir) + "/" + SIGNIFICANCE_TSV,
                                       sizes2stats)
        json_out = open(os.path.abspath(delta_out_dir) + "/" + JSON_OUTPUT, 'w')
        json.dump(output_dict, json_out, indent=4)
        json_out.close()
        results_files.append(os.path.abspath(delta_out_dir) + "/" + JSON_OUTPUT)

        hnio.write_components_as_tsv(os.path.abspath(delta_out_dir) + "/" + COMPONENTS_TSV, ccs)

    # create the hierarchy if necessary
    if args.output_hierarchy:
        from bin import createDendrogram as CD
        hierarchy_out_dir = '{}/hierarchy/'.format(args.output_directory)
        if not os.path.isdir(hierarchy_out_dir):
            os.mkdir(hierarchy_out_dir)
        params = vars(args)
        CD.createDendrogram(sim, index2gene.values(), hierarchy_out_dir, params, verbose=False)
        hierarchyFile = '{}/viz_files/{}'.format(str(hn.__file__).rsplit('/', 1)[0],
                                                 HIERARCHY_WEB_FILE)
        shutil.copy(hierarchyFile, '{}/index.html'.format(hierarchy_out_dir))

    # write visualization output if edge file given
    if args.edge_file:
        from bin import makeResultsWebsite as MRW
        viz_args = ["-r"] + results_files
        viz_args += ["-ef", args.edge_file, "-o", args.output_directory + "/viz"]
        if args.network_name:
            viz_args += ["-nn", args.network_name]
        if args.display_score_file:
            viz_args += ["-dsf", args.display_score_file]
        if args.display_name_file:
            viz_args += ["-dnf", args.display_name_file]
        MRW.run(MRW.get_parser().parse_args(viz_args))
def run_helper(args, infmat_name, get_deltas_fn, extra_delta_args):
    """Helper shared by simpleRun and simpleRunClassic."""
    # create output directory if it doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be "
              "overwritten. (Ctrl-c to cancel).")

    infmat = scipy.io.loadmat(args.infmat_file)[infmat_name]
    full_index2gene = hnio.load_index(args.infmat_index_file)

    using_json_heat = os.path.splitext(args.heat_file.lower())[1] == '.json'
    if using_json_heat:
        heat = json.load(open(args.heat_file))['heat']
    else:
        heat = hnio.load_heat_tsv(args.heat_file)
    print "* Loaded heat scores for %s genes" % len(heat)

    # filter out genes not in the network
    heat = hnheat.filter_heat_to_network_genes(heat, set(full_index2gene.values()))

    # genes with score 0 cannot be in output components, but are eligible for heat
    # in permutations
    heat, addtl_genes = hnheat.filter_heat(heat, None, False,
                                           'There are ## genes with heat score 0')

    deltas = get_deltas_fn(full_index2gene, heat, args.delta_permutations, args.num_cores,
                           infmat, addtl_genes, *extra_delta_args)

    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, True)

    results_files = []
    for delta in deltas:
        # create output directory
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)

        # find connected components
        G = hn.weighted_graph(sim, index2gene, delta, directed=True)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        print "* Performing permuted heat statistical significance..."
        heat_permutations = p.permute_heat(heat, full_index2gene.values(),
                                           args.significance_permutations, addtl_genes,
                                           args.num_cores)
        sizes = range(2, 11)
        print "\t- Using no. of components >= k (k \\in",
        print "[%s, %s]) as statistic" % (min(sizes), max(sizes))
        sizes2counts = stats.calculate_permuted_cc_counts(infmat, full_index2gene,
                                                          heat_permutations, delta, sizes,
                                                          True, args.num_cores)
        real_counts = stats.num_components_min_size(G, sizes)
        size2real_counts = dict(zip(sizes, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.significance_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically,
        # and components are sorted first by length, then alphanumerically by the name
        # of the first gene in the component
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        if not using_json_heat:
            heat_dict = {"heat": heat, "parameters": {"heat_file": args.heat_file}}
            heat_out = open(os.path.abspath(delta_out_dir) + "/" + HEAT_JSON, 'w')
            json.dump(heat_dict, heat_out, indent=4)
            heat_out.close()
            args.heat_file = os.path.abspath(delta_out_dir) + "/" + HEAT_JSON

        args.delta = delta  # include delta in parameters section of output JSON
        output_dict = {"parameters": vars(args), "sizes": hn.component_sizes(ccs),
                       "components": ccs, "statistics": sizes2stats}
        hnio.write_significance_as_tsv(os.path.abspath(delta_out_dir) + "/" + SIGNIFICANCE_TSV,
                                       sizes2stats)
        json_out = open(os.path.abspath(delta_out_dir) + "/" + JSON_OUTPUT, 'w')
        json.dump(output_dict, json_out, indent=4)
        json_out.close()
        results_files.append(os.path.abspath(delta_out_dir) + "/" + JSON_OUTPUT)

        hnio.write_components_as_tsv(os.path.abspath(delta_out_dir) + "/" + COMPONENTS_TSV, ccs)

    # write visualization output if edge file given
    if args.edge_file:
        from bin import makeResultsWebsite as MRW
        viz_args = ["-r"] + results_files
        viz_args += ["-ef", args.edge_file, "-o", args.output_directory + "/viz"]
        if args.network_name:
            viz_args += ["-nn", args.network_name]
        if args.display_score_file:
            viz_args += ["-dsf", args.display_score_file]
        MRW.run(MRW.get_parser().parse_args(viz_args))
def run_helper(args, infmat, full_index2gene, G, nname, pnp, heat, hname, addtl_genes,
               get_deltas_fn, infmat_name="PPR", max_cc_sizes=[5, 10, 15, 20], verbose=0):
    """Helper shared by runHotNet2 and runClassicHotNet."""
    # Perform delta selection (if necessary)
    if args.deltas:
        deltas = args.deltas
    else:
        deltas = get_deltas_fn(full_index2gene, heat, args.network_permutations,
                               args.num_cores, infmat, addtl_genes, pnp, infmat_name,
                               max_cc_sizes, verbose)

    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, True,
                                           verbose=verbose)

    results = []
    # Change from here: collect the additional per-delta subnetwork-score statistics
    My_results = []
    # end here
    for delta in deltas:
        # Change from here: name the similarity graph simG so the interaction network
        # G (a parameter) is not shadowed
        simG = hn.weighted_graph(sim, index2gene, delta, directed=True)
        ccs = hn.connected_components(simG, args.min_cc_size)
        # end here

        # calculate significance (using all genes with heat scores)
        if verbose > 4:
            print "* Performing permuted heat statistical significance..."
            print "\t- Using no. of components >= k (k \\in",
            print "[%s, %s]) as statistic" % (min(HN2_STATS_SIZES), max(HN2_STATS_SIZES))
        heat_permutations = p.permute_heat(heat, full_index2gene.values(),
                                           args.heat_permutations, addtl_genes,
                                           args.num_cores)

        # Change from here: score the permuted subnetworks and calculate the degree of
        # each gene in each permuted subnetwork, along with the degree- and
        # centrality-weighted scores
        (permuted_sub_score, permuted_gene_degree, permuted_degree_weighted_score,
         permuted_centality_weighted_score) = my.calculate_permuted_subnetScore(
            infmat, full_index2gene, heat_permutations, delta, verbose,
            args.min_cc_size, G, args.num_cores)
        # end here

        sizes2counts = stats.calculate_permuted_cc_counts(infmat, full_index2gene,
                                                          heat_permutations, delta,
                                                          HN2_STATS_SIZES, True,
                                                          args.num_cores)
        real_counts = stats.num_components_min_size(simG, HN2_STATS_SIZES)
        size2real_counts = dict(zip(HN2_STATS_SIZES, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.heat_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically,
        # and components are sorted first by length, then alphanumerically by the name
        # of the first gene in the component
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # Change from here
        # calculate the score of every gene in the gene list
        genelist, scoreVec = my.score_vec(infmat, full_index2gene, heat)
        # per-subnetwork gene scores (output subnetworks with scores only)
        subnet_geneScore = my.Matchscore_Permuted_fun(genelist, scoreVec, ccs)
        # gene degree within each subnetwork
        gene_degree = my.Gene_Degree_func(G, ccs)
        # gene score weighted by degree
        degree_weighted_score = my.Product_func(subnet_geneScore, gene_degree)
        # gene score weighted by centrality
        gene_centrality = my.Gene_Centrality_func(G, ccs)
        cent_weighted_score = my.Product_func(subnet_geneScore, gene_centrality)

        # significance tests for each scoring scheme
        # Sum and size of subnetwork
        Alpha_sig, Subnetscore_sig, Conponet_sig, count, sig_conponet = \
            my.MyStatistics_func(subnet_geneScore, gene_degree, permuted_sub_score,
                                 permuted_gene_degree, ccs, 1)
        # Degree-weighted sum and size of subnetwork
        Alpha_sig2, Subnetscore_sig_dwss, Conponet_sig_dwss, count_dwss, sig_conponet_dwss = \
            my.MyStatistics_func(degree_weighted_score, gene_degree,
                                 permuted_degree_weighted_score, permuted_gene_degree, ccs, 1)
        # Centrality-weighted sum and size of subnetwork
        Alpha_sig3, Subnetscore_sig_cwss, Conponet_sig_cwss, count_cwss, sig_conponet_cwss = \
            my.MyStatistics_func(cent_weighted_score, gene_degree,
                                 permuted_centality_weighted_score, permuted_gene_degree, ccs, 1)
        # Sum and sum of degrees of genes within subnetwork
        Alpha_sig1, Subnetscore_sig_sd, Conponet_sig_sd, count_sd, sig_conponet_sd = \
            my.MyStatistics_func(subnet_geneScore, gene_degree, permuted_sub_score,
                                 permuted_gene_degree, ccs, 2)
        # Degree-weighted sum and sum of degrees of genes within subnetwork
        Alpha_sig4, Subnetscore_sig_dwsd, Conponet_sig_dwsd, count_dwsd, sig_conponet_dwsd = \
            my.MyStatistics_func(degree_weighted_score, gene_degree,
                                 permuted_degree_weighted_score, permuted_gene_degree, ccs, 2)
        # Centrality-weighted sum and sum of degrees of genes within subnetwork
        Alpha_sig5, Subnetscore_sig_cwsd, Conponet_sig_cwsd, count_cwsd, sig_conponet_cwsd = \
            my.MyStatistics_func(cent_weighted_score, gene_degree,
                                 permuted_centality_weighted_score, permuted_gene_degree, ccs, 2)

        # store all the results for this delta (these are My results)
        My_results.append((permuted_sub_score, subnet_geneScore, Alpha_sig, Subnetscore_sig,
                           Conponet_sig, count, sig_conponet, gene_degree,
                           permuted_gene_degree, cent_weighted_score, degree_weighted_score,
                           permuted_centality_weighted_score, permuted_degree_weighted_score,
                           Subnetscore_sig_sd, Conponet_sig_sd, count_sd, sig_conponet_sd,
                           Subnetscore_sig_dwss, Conponet_sig_dwss, count_dwss,
                           sig_conponet_dwss, Subnetscore_sig_cwss, Conponet_sig_cwss,
                           count_cwss, sig_conponet_cwss, Subnetscore_sig_dwsd,
                           Conponet_sig_dwsd, count_dwsd, sig_conponet_dwsd,
                           Subnetscore_sig_cwsd, Conponet_sig_cwsd, count_cwsd,
                           sig_conponet_cwsd, delta))
        # end here

        results.append((ccs, sizes2stats, delta))

    return results, My_results
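# A hedged sketch of the degree-weighting idea above; my.Gene_Degree_func and
# my.Product_func are not shown in this file, so this illustrative helper assumes the
# graphs are networkx graphs (as elsewhere in HotNet) and a dict-based heat
# representation: restrict the network to each component, take each gene's degree
# within that component, and weight its heat score by that degree.
import networkx as nx

def degree_weighted_scores(G, heat, ccs):
    weighted = []
    for cc in ccs:
        sub = G.subgraph(cc)  # the component induced on the interaction network
        # score * within-component degree for every gene in the component
        weighted.append({g: heat.get(g, 0.0) * sub.degree(g) for g in cc})
    return weighted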
def run(args):
    # create output directory if it doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. ")
        print("(Ctrl-c to cancel).")

    infmat = scipy.io.loadmat(args.infmat_file)[INFMAT_NAME]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat = hnio.load_heat_tsv(args.heat_file)

    # filter out genes with heat score less than min_heat_score
    heat, addtl_genes, args.min_heat_score = hnheat.filter_heat(heat, args.min_heat_score)

    # find the delta that maximizes the number of CCs of size >= MIN_CC_SIZE for each
    # permuted data set
    deltas = ft.get_deltas_for_heat(infmat, infmat_index, heat, addtl_genes,
                                    args.num_permutations, args.parallel)

    # find the multiple of the median delta s.t. the size of the largest CC in the
    # real data is <= MAX_CC_SIZE
    medianDelta = np.median(deltas[MIN_CC_SIZE])

    M, gene_index = hn.induce_infmat(infmat, infmat_index, sorted(heat.keys()), quiet=False)
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    for i in range(1, 11):
        G = hn.weighted_graph(sim, gene_index, i * medianDelta)
        max_cc_size = max([len(cc) for cc in hn.connected_components(G)])
        if max_cc_size <= MAX_CC_SIZE:
            break

    # load interaction network edges and determine location of static HTML files
    # for visualization
    edges = hnio.load_ppi_edges(args.edge_file) if args.edge_file else None
    index_file = '%s/viz_files/%s' % (os.path.realpath(__file__).rsplit('/', 1)[0], VIZ_INDEX)
    subnetworks_file = '%s/viz_files/%s' % (os.path.realpath(__file__).rsplit('/', 1)[0],
                                            VIZ_SUBNETWORKS)
    gene2index = dict([(gene, index) for index, gene in list(infmat_index.items())])

    # and run HotNet with that multiple and the next 4 multiples
    run_deltas = [i * medianDelta for i in range(i, i + 5)]
    for delta in run_deltas:
        # create output directory
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)

        # find connected components
        G = hn.weighted_graph(sim, gene_index, delta)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        print("* Performing permuted heat statistical significance...")
        heat_permutations = p.permute_heat(heat, args.num_permutations, addtl_genes,
                                           args.parallel)
        sizes = list(range(2, 11))
        print("\t- Using no. of components >= k (k \\in [%s, %s]) as statistic"
              % (min(sizes), max(sizes)))
        sizes2counts = stats.calculate_permuted_cc_counts(infmat, infmat_index,
                                                          heat_permutations, delta, sizes,
                                                          args.parallel)
        real_counts = stats.num_components_min_size(G, sizes)
        size2real_counts = dict(list(zip(sizes, real_counts)))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.num_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically,
        # and components are sorted first by length, then alphanumerically by the name
        # of the first gene in the component
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        heat_dict = {"heat": heat, "parameters": {"heat_file": args.heat_file}}
        heat_out = open(os.path.abspath(delta_out_dir) + "/" + HEAT_JSON, 'w')
        json.dump(heat_dict, heat_out, indent=4)
        heat_out.close()

        args.heat_file = os.path.abspath(delta_out_dir) + "/" + HEAT_JSON
        args.delta = delta
        output_dict = {"parameters": vars(args), "sizes": hn.component_sizes(ccs),
                       "components": ccs, "statistics": sizes2stats}
        hnio.write_significance_as_tsv(os.path.abspath(delta_out_dir) + "/" + SIGNIFICANCE_TSV,
                                       sizes2stats)
        json_out = open(os.path.abspath(delta_out_dir) + "/" + JSON_OUTPUT, 'w')
        json.dump(output_dict, json_out, indent=4)
        json_out.close()
        hnio.write_components_as_tsv(os.path.abspath(delta_out_dir) + "/" + COMPONENTS_TSV, ccs)

        # write visualization output if edge file given
        if args.edge_file:
            viz_data = {"delta": delta, 'subnetworks': list()}
            for cc in ccs:
                viz_data['subnetworks'].append(viz.get_component_json(cc, heat, edges,
                                                                      gene2index,
                                                                      args.network_name))
            delta_viz_dir = '%s/viz/delta%s' % (args.output_directory, delta)
            if not os.path.isdir(delta_viz_dir):
                os.makedirs(delta_viz_dir)
            viz_out = open('%s/subnetworks.json' % delta_viz_dir, 'w')
            json.dump(viz_data, viz_out, indent=4)
            viz_out.close()
            shutil.copy(subnetworks_file, delta_viz_dir)

    if args.edge_file:
        viz.write_index_file(index_file, '%s/viz/%s' % (args.output_directory, VIZ_INDEX),
                             run_deltas)
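# A minimal sketch of the delta scan above (choose_run_deltas and the largest_cc_at
# callback are illustrative names, not part of the codebase): take the median of the
# permuted deltas, find the smallest multiple for which the largest connected
# component is at most max_cc_size, then run on that multiple and the next four.
def choose_run_deltas(median_delta, largest_cc_at, max_cc_size, n_deltas=5, max_multiple=10):
    # largest_cc_at(delta) -> size of the largest CC at that edge-weight threshold
    for i in range(1, max_multiple + 1):
        if largest_cc_at(i * median_delta) <= max_cc_size:
            break
    return [m * median_delta for m in range(i, i + n_deltas)]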
def run(args):
    # create output directory if it doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be "
              "overwritten. (Ctrl-c to cancel).")

    infmat = scipy.io.loadmat(args.infmat_file)[INFMAT_NAME]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat = hnio.load_heat_tsv(args.heat_file)

    # filter out genes with heat score less than min_heat_score
    heat, addtl_genes, args.min_heat_score = hnheat.filter_heat(heat, args.min_heat_score)

    # find the delta that maximizes the number of CCs of size >= MIN_CC_SIZE for each
    # permuted data set
    deltas = ft.get_deltas_for_heat(infmat, infmat_index, heat, addtl_genes,
                                    args.num_permutations, args.parallel)

    # find the multiple of the median delta s.t. the size of the largest CC in the
    # real data is <= MAX_CC_SIZE
    medianDelta = np.median(deltas[MIN_CC_SIZE])

    M, gene_index = hn.induce_infmat(infmat, infmat_index, sorted(heat.keys()), quiet=False)
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    for i in range(1, 11):
        G = hn.weighted_graph(sim, gene_index, i * medianDelta)
        max_cc_size = max([len(cc) for cc in hn.connected_components(G)])
        if max_cc_size <= MAX_CC_SIZE:
            break

    # and run HotNet with that multiple and the next 4 multiples
    run_deltas = [i * medianDelta for i in range(i, i + 5)]
    for delta in run_deltas:
        # create output directory
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)

        # find connected components
        G = hn.weighted_graph(sim, gene_index, delta)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        print "* Performing permuted heat statistical significance..."
        heat_permutations = p.permute_heat(heat, args.num_permutations, addtl_genes,
                                           args.parallel)
        sizes = range(2, 11)
        print "\t- Using no. of components >= k (k \\in",
        print "[%s, %s]) as statistic" % (min(sizes), max(sizes))
        sizes2counts = stats.calculate_permuted_cc_counts(infmat, infmat_index,
                                                          heat_permutations, delta, sizes,
                                                          args.parallel)
        real_counts = stats.num_components_min_size(G, sizes)
        size2real_counts = dict(zip(sizes, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.num_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically,
        # and components are sorted first by length, then alphanumerically by the name
        # of the first gene in the component
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        output_dict = {"parameters": vars(args), "sizes": hn.component_sizes(ccs),
                       "components": ccs, "statistics": sizes2stats}
        hnio.write_significance_as_tsv(os.path.abspath(delta_out_dir) + "/" + SIGNIFICANCE_TSV,
                                       sizes2stats)
        json_out = open(os.path.abspath(delta_out_dir) + "/" + JSON_OUTPUT, 'w')
        json.dump(output_dict, json_out, indent=4)
        json_out.close()
        hnio.write_components_as_tsv(os.path.abspath(delta_out_dir) + "/" + COMPONENTS_TSV, ccs)
def run(args):
    # create output directory if it doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if len(os.listdir(args.output_directory)) > 0:
        print("WARNING: Output directory is not empty. Any conflicting files will be "
              "overwritten. (Ctrl-c to cancel).")

    infmat = scipy.io.loadmat(args.infmat_file)[INFMAT_NAME]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat = hnio.load_heat_tsv(args.heat_file)

    # filter out genes with heat score less than min_heat_score
    heat, addtl_genes, args.min_heat_score = hnheat.filter_heat(heat, args.min_heat_score)

    # find the delta that maximizes the number of CCs of size >= MIN_CC_SIZE for each
    # permuted data set
    deltas = ft.get_deltas_for_heat(infmat, infmat_index, heat, addtl_genes,
                                    args.num_permutations, args.parallel)

    # find the multiple of the median delta s.t. the size of the largest CC in the
    # real data is <= MAX_CC_SIZE
    medianDelta = np.median(deltas[MIN_CC_SIZE])

    M, gene_index = hn.induce_infmat(infmat, infmat_index, sorted(heat.keys()), quiet=False)
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    for i in range(1, 11):
        G = hn.weighted_graph(sim, gene_index, i * medianDelta)
        max_cc_size = max([len(cc) for cc in hn.connected_components(G)])
        if max_cc_size <= MAX_CC_SIZE:
            break

    # load interaction network edges and determine location of static HTML files
    # for visualization
    edges = hnio.load_ppi_edges(args.edge_file) if args.edge_file else None
    index_file = '%s/viz_files/%s' % (os.path.realpath(__file__).rsplit('/', 1)[0], VIZ_INDEX)
    subnetworks_file = '%s/viz_files/%s' % (os.path.realpath(__file__).rsplit('/', 1)[0],
                                            VIZ_SUBNETWORKS)
    gene2index = dict([(gene, index) for index, gene in infmat_index.iteritems()])

    # and run HotNet with that multiple and the next 4 multiples
    run_deltas = [i * medianDelta for i in range(i, i + 5)]
    for delta in run_deltas:
        # create output directory
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)

        # find connected components
        G = hn.weighted_graph(sim, gene_index, delta)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        print "* Performing permuted heat statistical significance..."
        heat_permutations = p.permute_heat(heat, args.num_permutations, addtl_genes,
                                           args.parallel)
        sizes = range(2, 11)
        print "\t- Using no. of components >= k (k \\in",
        print "[%s, %s]) as statistic" % (min(sizes), max(sizes))
        sizes2counts = stats.calculate_permuted_cc_counts(infmat, infmat_index,
                                                          heat_permutations, delta, sizes,
                                                          args.parallel)
        real_counts = stats.num_components_min_size(G, sizes)
        size2real_counts = dict(zip(sizes, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.num_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically,
        # and components are sorted first by length, then alphanumerically by the name
        # of the first gene in the component
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        heat_dict = {"heat": heat, "parameters": {"heat_file": args.heat_file}}
        heat_out = open(os.path.abspath(delta_out_dir) + "/" + HEAT_JSON, 'w')
        json.dump(heat_dict, heat_out, indent=4)
        heat_out.close()

        args.heat_file = os.path.abspath(delta_out_dir) + "/" + HEAT_JSON
        args.delta = delta
        output_dict = {"parameters": vars(args), "sizes": hn.component_sizes(ccs),
                       "components": ccs, "statistics": sizes2stats}
        hnio.write_significance_as_tsv(os.path.abspath(delta_out_dir) + "/" + SIGNIFICANCE_TSV,
                                       sizes2stats)
        json_out = open(os.path.abspath(delta_out_dir) + "/" + JSON_OUTPUT, 'w')
        json.dump(output_dict, json_out, indent=4)
        json_out.close()
        hnio.write_components_as_tsv(os.path.abspath(delta_out_dir) + "/" + COMPONENTS_TSV, ccs)

        # write visualization output if edge file given
        if args.edge_file:
            viz_data = {"delta": delta, 'subnetworks': list()}
            for cc in ccs:
                viz_data['subnetworks'].append(viz.get_component_json(cc, heat, edges,
                                                                      gene2index,
                                                                      args.network_name))
            delta_viz_dir = '%s/viz/delta%s' % (args.output_directory, delta)
            if not os.path.isdir(delta_viz_dir):
                os.makedirs(delta_viz_dir)
            viz_out = open('%s/subnetworks.json' % delta_viz_dir, 'w')
            json.dump(viz_data, viz_out, indent=4)
            viz_out.close()
            shutil.copy(subnetworks_file, delta_viz_dir)

    if args.edge_file:
        viz.write_index_file(index_file, '%s/viz/%s' % (args.output_directory, VIZ_INDEX),
                             run_deltas)
def heat_permutation_significance(args, heat, infmat, infmat_index, G):
    print "* Performing permuted heat statistical significance..."
    addtl_genes = hnio.load_genes(args.permutation_genes_file) \
        if args.permutation_genes_file else None
    heat_permutations = permutations.permute_heat(heat, args.num_permutations,
                                                  addtl_genes, args.parallel)
    return calculate_significance(args, infmat, infmat_index, G, heat_permutations)