Exemplo n.º 1
0
def calculate_significance(args, infmat, infmat_index, G, heat_permutations):
    """Compute significance statistics for connected-component counts.

    The statistic is the number of components of size >= k, evaluated for
    every k from ``args.cc_start_size`` through ``args.cc_stop_size``
    (inclusive).

    Args:
        args: parsed options; reads ``cc_start_size``, ``cc_stop_size``,
            ``delta``, ``parallel`` and ``num_permutations``.
        infmat: forwarded unchanged to ``stats.calculate_permuted_cc_counts``.
        infmat_index: forwarded unchanged to
            ``stats.calculate_permuted_cc_counts``.
        G: graph whose components are counted for the real data.
        heat_permutations: permuted heat data sets, forwarded to
            ``stats.calculate_permuted_cc_counts``.

    Returns:
        The value returned by ``stats.compute_statistics`` for the real and
        permuted counts.

    Raises:
        ValueError: if the size range is empty (``min``/``max`` of an empty
            list).
    """
    # Materialize the range so the same list type is handed to ``stats`` on
    # both Python 2 and Python 3.
    sizes = list(range(args.cc_start_size, args.cc_stop_size + 1))

    # A single parenthesized argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("\t- Using no. of components >= k (k \\in [%s, %s]) as statistic"
          % (min(sizes), max(sizes)))

    # sizes2counts is dict(size -> (list of counts, 1 per permutation))
    sizes2counts = stats.calculate_permuted_cc_counts(
        infmat, infmat_index, heat_permutations, args.delta, sizes,
        args.parallel)
    real_counts = stats.num_components_min_size(G, sizes)
    size2real_counts = dict(zip(sizes, real_counts))
    return stats.compute_statistics(size2real_counts, sizes2counts,
                                    args.num_permutations)
Exemplo n.º 2
0
def calculate_significance(args, infmat, infmat_index, G, heat_permutations):
    """Compute significance statistics for connected-component counts.

    The statistic is the number of components of size >= k, evaluated for
    every k from ``args.cc_start_size`` through ``args.cc_stop_size``
    (inclusive).

    Args:
        args: parsed options; reads ``cc_start_size``, ``cc_stop_size``,
            ``delta``, ``parallel`` and ``num_permutations``.
        infmat: forwarded unchanged to ``stats.calculate_permuted_cc_counts``.
        infmat_index: forwarded unchanged to
            ``stats.calculate_permuted_cc_counts``.
        G: graph whose components are counted for the real data.
        heat_permutations: permuted heat data sets, forwarded to
            ``stats.calculate_permuted_cc_counts``.

    Returns:
        The value returned by ``stats.compute_statistics`` for the real and
        permuted counts.

    Raises:
        ValueError: if the size range is empty (``min``/``max`` of an empty
            list).
    """
    sizes = list(range(args.cc_start_size, args.cc_stop_size + 1))

    # Emit the status message as ONE line; two separate print() calls would
    # break it in the middle of the "k \in [a, b]" expression.
    print("\t- Using no. of components >= k (k \\in [%s, %s]) as statistic"
          % (min(sizes), max(sizes)))

    # sizes2counts is dict(size -> (list of counts, 1 per permutation))
    sizes2counts = stats.calculate_permuted_cc_counts(
        infmat, infmat_index, heat_permutations, args.delta, sizes,
        args.parallel)
    real_counts = stats.num_components_min_size(G, sizes)
    size2real_counts = dict(zip(sizes, real_counts))
    return stats.compute_statistics(size2real_counts, sizes2counts,
                                    args.num_permutations)
Exemplo n.º 3
0
def run_helper(args, infmat, full_index2gene, G, nname, pnp, heat, hname, addtl_genes, get_deltas_fn, infmat_name="PPR", max_cc_sizes=[5, 10, 15, 20], verbose=0):
    """Helper shared by runHotNet2 and runClassicHotNet.

    For each delta, builds the weighted similarity graph, extracts its
    connected components and computes permutation statistics on the number
    of components of each size in ``HN2_STATS_SIZES``.

    Args:
        args: parsed options; reads ``deltas``, ``network_permutations``,
            ``num_cores``, ``min_cc_size`` and ``heat_permutations``.
        infmat: forwarded to ``get_deltas_fn``, ``hn.similarity_matrix`` and
            ``stats.calculate_permuted_cc_counts``.
        full_index2gene: mapping whose values are the gene names.
        G, nname, hname: accepted for interface compatibility; not read here.
        pnp: forwarded to ``get_deltas_fn``.
        heat: heat scores, forwarded to the similarity and permutation code.
        addtl_genes: forwarded to ``get_deltas_fn`` and ``p.permute_heat``.
        get_deltas_fn: callable used to choose deltas when ``args.deltas``
            is empty.
        infmat_name: forwarded to ``get_deltas_fn``.
        max_cc_sizes: forwarded to ``get_deltas_fn`` (never mutated here).
        verbose: verbosity level; status messages are printed above 4.

    Returns:
        A list with one ``(ccs, sizes2stats, delta)`` tuple per delta.
    """
    # Perform delta selection (if necessary)
    if args.deltas:
        deltas = args.deltas
    else:
        deltas = get_deltas_fn(full_index2gene, heat, args.network_permutations,
                               args.num_cores, infmat, addtl_genes, pnp,
                               infmat_name, max_cc_sizes, verbose)

    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, True,
                                           verbose=verbose)

    results = []
    for delta in deltas:
        # find connected components (kept in its own local so the ``G``
        # argument is not silently overwritten)
        sim_graph = hn.weighted_graph(sim, index2gene, delta, directed=True)
        ccs = hn.connected_components(sim_graph, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        if verbose > 4:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("* Performing permuted heat statistical significance...")
            print("\t- Using no. of components >= k (k \\in [%s, %s]) as statistic"
                  % (min(HN2_STATS_SIZES), max(HN2_STATS_SIZES)))

        heat_permutations = p.permute_heat(heat,
                                           list(full_index2gene.values()),
                                           args.heat_permutations, addtl_genes,
                                           args.num_cores)
        sizes2counts = stats.calculate_permuted_cc_counts(
            infmat, full_index2gene, heat_permutations, delta, HN2_STATS_SIZES,
            True, args.num_cores)
        real_counts = stats.num_components_min_size(sim_graph, HN2_STATS_SIZES)
        size2real_counts = dict(zip(HN2_STATS_SIZES, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.heat_permutations)

        # sort ccs list such that genes within components are sorted
        # alphanumerically, and components are sorted first by length, then
        # alphanumerically by name of the first gene in the component
        # (two stable sorts: secondary key first, primary key last)
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # Record the results for this delta
        results.append((ccs, sizes2stats, delta))

    return results
Exemplo n.º 4
0
def run_helper(args, infmat_name, get_deltas_fn, extra_delta_args):
    """Helper shared by simpleRun and simpleRunClassic.

    Loads the influence matrix and heat scores, selects deltas, and for each
    delta writes components, significance statistics and a JSON summary into
    ``<output_directory>/delta_<delta>``. Optionally writes a hierarchy and
    the visualization website.

    Args:
        args: parsed options. Read here: ``output_directory``,
            ``infmat_file``, ``infmat_index_file``, ``heat_file``,
            ``delta_permutations``, ``significance_permutations``,
            ``num_cores``, ``min_cc_size``, ``output_hierarchy``,
            ``edge_file``, ``network_name``, ``display_score_file`` and
            ``display_name_file``. ``args.delta`` and (for non-JSON heat)
            ``args.heat_file`` are overwritten as a side effect.
        infmat_name: name passed to ``hnio.load_infmat``.
        get_deltas_fn: callable that returns the deltas to run.
        extra_delta_args: extra positional arguments for ``get_deltas_fn``.
    """
    # create output directory if doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if os.listdir(args.output_directory):
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    infmat = hnio.load_infmat(args.infmat_file, infmat_name)
    full_index2gene = hnio.load_index(args.infmat_index_file)

    using_json_heat = os.path.splitext(args.heat_file.lower())[1] == '.json'
    if using_json_heat:
        # Context manager so the input file handle is closed deterministically.
        with open(args.heat_file) as heat_in:
            heat = json.load(heat_in)['heat']
    else:
        heat = hnio.load_heat_tsv(args.heat_file)
    # Single-argument print() behaves the same on Python 2 and 3.
    print("* Loaded heat scores for %s genes" % len(heat))

    # filter out genes not in the network
    heat = hnheat.filter_heat_to_network_genes(heat, set(full_index2gene.values()))

    # genes with score 0 cannot be in output components, but are eligible for heat in permutations
    heat, addtl_genes = hnheat.filter_heat(heat, None, False, 'There are ## genes with heat score 0')

    deltas = get_deltas_fn(full_index2gene, heat, args.delta_permutations, args.num_cores,
                           infmat, addtl_genes, *extra_delta_args)

    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, True)

    results_files = []
    for delta in deltas:
        # create output directory
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        abs_out_dir = os.path.abspath(delta_out_dir)

        # find connected components
        G = hn.weighted_graph(sim, index2gene, delta, directed=True)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        print("* Performing permuted heat statistical significance...")
        heat_permutations = p.permute_heat(heat, list(full_index2gene.values()),
                                           args.significance_permutations, addtl_genes,
                                           args.num_cores)
        sizes = list(range(2, 11))
        print("\t- Using no. of components >= k (k \\in [%s, %s]) as statistic"
              % (min(sizes), max(sizes)))
        sizes2counts = stats.calculate_permuted_cc_counts(infmat, full_index2gene,
                                                          heat_permutations, delta, sizes, True,
                                                          args.num_cores)
        real_counts = stats.num_components_min_size(G, sizes)
        size2real_counts = dict(zip(sizes, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.significance_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically, and components
        # are sorted first by length, then alphanumerically by name of the first gene in the component
        # (two stable sorts: secondary key first, primary key last)
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        if not using_json_heat:
            # NOTE(review): args.heat_file is replaced below, so from the
            # second delta on, "heat_file" here names the previous delta's
            # JSON rather than the user's original file - confirm intended.
            heat_dict = {"heat": heat, "parameters": {"heat_file": args.heat_file}}
            heat_json_path = abs_out_dir + "/" + HEAT_JSON
            with open(heat_json_path, 'w') as heat_out:
                json.dump(heat_dict, heat_out, indent=4)
            args.heat_file = heat_json_path

        args.delta = delta  # include delta in parameters section of output JSON
        output_dict = {"parameters": vars(args), "sizes": hn.component_sizes(ccs),
                       "components": ccs, "statistics": sizes2stats}
        hnio.write_significance_as_tsv(abs_out_dir + "/" + SIGNIFICANCE_TSV, sizes2stats)

        json_path = abs_out_dir + "/" + JSON_OUTPUT
        with open(json_path, 'w') as json_out:
            json.dump(output_dict, json_out, indent=4)
        results_files.append(json_path)

        hnio.write_components_as_tsv(abs_out_dir + "/" + COMPONENTS_TSV, ccs)

    # create the hierarchy if necessary
    if args.output_hierarchy:
        from bin import createDendrogram as CD

        hierarchy_out_dir = '{}/hierarchy/'.format(args.output_directory)
        if not os.path.isdir(hierarchy_out_dir):
            os.mkdir(hierarchy_out_dir)

        params = vars(args)
        CD.createDendrogram(sim, list(index2gene.values()), hierarchy_out_dir, params,
                            verbose=False)
        hierarchyFile = '{}/viz_files/{}'.format(str(hn.__file__).rsplit('/', 1)[0],
                                                 HIERARCHY_WEB_FILE)
        shutil.copy(hierarchyFile, '{}/index.html'.format(hierarchy_out_dir))

    # write visualization output if edge file given
    if args.edge_file:
        from bin import makeResultsWebsite as MRW
        viz_args = ["-r"] + results_files
        viz_args += ["-ef", args.edge_file, "-o", args.output_directory + "/viz"]
        if args.network_name:
            viz_args += ["-nn", args.network_name]
        if args.display_score_file:
            viz_args += ["-dsf", args.display_score_file]
        if args.display_name_file:
            viz_args += ["-dnf", args.display_name_file]
        MRW.run(MRW.get_parser().parse_args(viz_args))
Exemplo n.º 5
0
def run_helper(args, infmat_name, get_deltas_fn, extra_delta_args):
    """Helper shared by simpleRun and simpleRunClassic.

    Loads the influence matrix and heat scores, selects deltas, and for each
    delta writes components, significance statistics and a JSON summary into
    ``<output_directory>/delta_<delta>``. Optionally writes the
    visualization website.

    Args:
        args: parsed options. Read here: ``output_directory``,
            ``infmat_file``, ``infmat_index_file``, ``heat_file``,
            ``delta_permutations``, ``significance_permutations``,
            ``num_cores``, ``min_cc_size``, ``edge_file``, ``network_name``
            and ``display_score_file``. ``args.delta`` and (for non-JSON
            heat) ``args.heat_file`` are overwritten as a side effect.
        infmat_name: key of the matrix inside the file read by
            ``scipy.io.loadmat``.
        get_deltas_fn: callable that returns the deltas to run.
        extra_delta_args: extra positional arguments for ``get_deltas_fn``.
    """
    # create output directory if doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if os.listdir(args.output_directory):
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    infmat = scipy.io.loadmat(args.infmat_file)[infmat_name]
    full_index2gene = hnio.load_index(args.infmat_index_file)

    using_json_heat = os.path.splitext(args.heat_file.lower())[1] == '.json'
    if using_json_heat:
        # Context manager so the input file handle is closed deterministically.
        with open(args.heat_file) as heat_in:
            heat = json.load(heat_in)['heat']
    else:
        heat = hnio.load_heat_tsv(args.heat_file)
    # Single-argument print() behaves the same on Python 2 and 3.
    print("* Loaded heat scores for %s genes" % len(heat))

    # filter out genes not in the network
    heat = hnheat.filter_heat_to_network_genes(heat, set(full_index2gene.values()))

    # genes with score 0 cannot be in output components, but are eligible for heat in permutations
    heat, addtl_genes = hnheat.filter_heat(heat, None, False, 'There are ## genes with heat score 0')

    deltas = get_deltas_fn(full_index2gene, heat, args.delta_permutations, args.num_cores,
                           infmat, addtl_genes, *extra_delta_args)

    sim, index2gene = hn.similarity_matrix(infmat, full_index2gene, heat, True)

    results_files = []
    for delta in deltas:
        # create output directory
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        abs_out_dir = os.path.abspath(delta_out_dir)

        # find connected components
        G = hn.weighted_graph(sim, index2gene, delta, directed=True)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        print("* Performing permuted heat statistical significance...")
        heat_permutations = p.permute_heat(heat, list(full_index2gene.values()),
                                           args.significance_permutations, addtl_genes,
                                           args.num_cores)
        sizes = list(range(2, 11))
        print("\t- Using no. of components >= k (k \\in [%s, %s]) as statistic"
              % (min(sizes), max(sizes)))
        sizes2counts = stats.calculate_permuted_cc_counts(infmat, full_index2gene,
                                                          heat_permutations, delta, sizes, True,
                                                          args.num_cores)
        real_counts = stats.num_components_min_size(G, sizes)
        size2real_counts = dict(zip(sizes, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.significance_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically, and components
        # are sorted first by length, then alphanumerically by name of the first gene in the component
        # (two stable sorts: secondary key first, primary key last)
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        if not using_json_heat:
            # NOTE(review): args.heat_file is replaced below, so from the
            # second delta on, "heat_file" here names the previous delta's
            # JSON rather than the user's original file - confirm intended.
            heat_dict = {"heat": heat, "parameters": {"heat_file": args.heat_file}}
            heat_json_path = abs_out_dir + "/" + HEAT_JSON
            with open(heat_json_path, 'w') as heat_out:
                json.dump(heat_dict, heat_out, indent=4)
            args.heat_file = heat_json_path

        args.delta = delta  # include delta in parameters section of output JSON
        output_dict = {"parameters": vars(args), "sizes": hn.component_sizes(ccs),
                       "components": ccs, "statistics": sizes2stats}
        hnio.write_significance_as_tsv(abs_out_dir + "/" + SIGNIFICANCE_TSV, sizes2stats)

        json_path = abs_out_dir + "/" + JSON_OUTPUT
        with open(json_path, 'w') as json_out:
            json.dump(output_dict, json_out, indent=4)
        results_files.append(json_path)

        hnio.write_components_as_tsv(abs_out_dir + "/" + COMPONENTS_TSV, ccs)

    # write visualization output if edge file given
    if args.edge_file:
        from bin import makeResultsWebsite as MRW
        viz_args = ["-r"] + results_files
        viz_args += ["-ef", args.edge_file, "-o", args.output_directory + "/viz"]
        if args.network_name:
            viz_args += ["-nn", args.network_name]
        if args.display_score_file:
            viz_args += ["-dsf", args.display_score_file]
        MRW.run(MRW.get_parser().parse_args(viz_args))
Exemplo n.º 6
0
def run_helper(args,
               infmat,
               full_index2gene,
               G,
               nname,
               pnp,
               heat,
               hname,
               addtl_genes,
               get_deltas_fn,
               infmat_name="PPR",
               max_cc_sizes=[5, 10, 15, 20],
               verbose=0):
    """Helper shared by runHotNet2 and runClassicHotNet.

    Besides the standard per-delta component statistics, this variant uses
    the ``my`` module to score the genes of every subnetwork (plain, degree
    weighted and centrality weighted) and to run six significance tests
    against the permuted data.

    Args:
        args: parsed options; reads ``deltas``, ``network_permutations``,
            ``num_cores``, ``min_cc_size`` and ``heat_permutations``.
        infmat: forwarded to the delta selection, similarity, permutation
            statistics and ``my`` scoring code.
        full_index2gene: mapping whose values are the gene names.
        G: network passed to the ``my`` degree / centrality / permutation
            functions (it is NOT the per-delta similarity graph).
        nname, hname: accepted for interface compatibility; not read here.
        pnp: forwarded to ``get_deltas_fn``.
        heat: heat scores.
        addtl_genes: forwarded to ``get_deltas_fn`` and ``p.permute_heat``.
        get_deltas_fn: callable used to choose deltas when ``args.deltas``
            is empty.
        infmat_name: forwarded to ``get_deltas_fn``.
        max_cc_sizes: forwarded to ``get_deltas_fn`` (never mutated here).
        verbose: verbosity level; status messages are printed above 4.

    Returns:
        ``(results, My_results)``. ``results`` holds one
        ``(ccs, sizes2stats, delta)`` tuple per delta; ``My_results`` holds
        one 34-element tuple per delta whose field order is fixed by the
        ``My_results.append`` call below.
    """
    # Perform delta selection (if necessary)
    if args.deltas:
        deltas = args.deltas
    else:
        deltas = get_deltas_fn(full_index2gene, heat,
                               args.network_permutations, args.num_cores,
                               infmat, addtl_genes, pnp, infmat_name,
                               max_cc_sizes, verbose)

    sim, index2gene = hn.similarity_matrix(infmat,
                                           full_index2gene,
                                           heat,
                                           True,
                                           verbose=verbose)

    results = []
    My_results = []

    for delta in deltas:
        # find connected components of the similarity graph for this delta
        simG = hn.weighted_graph(sim, index2gene, delta, directed=True)
        ccs = hn.connected_components(simG, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        if verbose > 4:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("* Performing permuted heat statistical significance...")
            print("\t- Using no. of components >= k (k \\in [%s, %s]) as statistic"
                  % (min(HN2_STATS_SIZES), max(HN2_STATS_SIZES)))

        heat_permutations = p.permute_heat(heat,
                                           list(full_index2gene.values()),
                                           args.heat_permutations, addtl_genes,
                                           args.num_cores)

        # scores, degrees and weighted scores of the permuted subnetworks
        (permuted_sub_score, permuted_gene_degree,
         permuted_degree_weighted_score,
         permuted_centality_weighted_score) = my.calculate_permuted_subnetScore(
             infmat, full_index2gene, heat_permutations, delta, verbose,
             args.min_cc_size, G, args.num_cores)

        sizes2counts = stats.calculate_permuted_cc_counts(
            infmat, full_index2gene, heat_permutations, delta, HN2_STATS_SIZES,
            True, args.num_cores)

        real_counts = stats.num_components_min_size(simG, HN2_STATS_SIZES)
        size2real_counts = dict(zip(HN2_STATS_SIZES, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.heat_permutations)

        # sort ccs list such that genes within components are sorted
        # alphanumerically, and components are sorted first by length, then
        # alphanumerically by name of the first gene in the component
        # (two stable sorts: secondary key first, primary key last)
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # calculate the score of every gene, then match scores to subnetworks
        genelist, scoreVec = my.score_vec(infmat, full_index2gene, heat)
        subnet_geneScore = my.Matchscore_Permuted_fun(genelist, scoreVec, ccs)

        # gene degree within each subnetwork
        gene_degree = my.Gene_Degree_func(G, ccs)

        # score weighted by degree
        degree_weighted_score = my.Product_func(subnet_geneScore, gene_degree)

        # score weighted by centrality
        gene_centrality = my.Gene_Centrality_func(G, ccs)
        cent_weighted_score = my.Product_func(subnet_geneScore,
                                              gene_centrality)

        # Significance tests. The last argument selects the second statistic:
        # 1 = size of subnetwork, 2 = sum of gene degrees within the
        # subnetwork (per the original section labels). Only the first call's
        # leading return value (Alpha_sig) is kept; the others are unused.

        # sum and size of subnetwork
        (Alpha_sig, Subnetscore_sig, Conponet_sig, count,
         sig_conponet) = my.MyStatistics_func(
             subnet_geneScore, gene_degree, permuted_sub_score,
             permuted_gene_degree, ccs, 1)
        # degree weighted sum and size of subnetwork
        (_, Subnetscore_sig_dwss, Conponet_sig_dwss, count_dwss,
         sig_conponet_dwss) = my.MyStatistics_func(
             degree_weighted_score, gene_degree,
             permuted_degree_weighted_score, permuted_gene_degree, ccs, 1)
        # centrality weighted sum and size of subnetwork
        (_, Subnetscore_sig_cwss, Conponet_sig_cwss, count_cwss,
         sig_conponet_cwss) = my.MyStatistics_func(
             cent_weighted_score, gene_degree,
             permuted_centality_weighted_score, permuted_gene_degree, ccs, 1)
        # sum and sum of degree of gene within subnetwork
        (_, Subnetscore_sig_sd, Conponet_sig_sd, count_sd,
         sig_conponet_sd) = my.MyStatistics_func(
             subnet_geneScore, gene_degree, permuted_sub_score,
             permuted_gene_degree, ccs, 2)
        # degree weighted sum and sum of degree of gene within subnetwork
        (_, Subnetscore_sig_dwsd, Conponet_sig_dwsd, count_dwsd,
         sig_conponet_dwsd) = my.MyStatistics_func(
             degree_weighted_score, gene_degree,
             permuted_degree_weighted_score, permuted_gene_degree, ccs, 2)
        # centrality weighted sum and sum of degree of gene within subnetwork
        (_, Subnetscore_sig_cwsd, Conponet_sig_cwsd, count_cwsd,
         sig_conponet_cwsd) = my.MyStatistics_func(
             cent_weighted_score, gene_degree,
             permuted_centality_weighted_score, permuted_gene_degree, ccs, 2)

        # store all the extended results; callers index this tuple by
        # position, so the field order must not change
        My_results.append(
            (permuted_sub_score, subnet_geneScore, Alpha_sig, Subnetscore_sig,
             Conponet_sig, count, sig_conponet, gene_degree,
             permuted_gene_degree, cent_weighted_score, degree_weighted_score,
             permuted_centality_weighted_score, permuted_degree_weighted_score,
             Subnetscore_sig_sd, Conponet_sig_sd, count_sd, sig_conponet_sd,
             Subnetscore_sig_dwss, Conponet_sig_dwss, count_dwss,
             sig_conponet_dwss, Subnetscore_sig_cwss, Conponet_sig_cwss,
             count_cwss, sig_conponet_cwss, Subnetscore_sig_dwsd,
             Conponet_sig_dwsd, count_dwsd, sig_conponet_dwsd,
             Subnetscore_sig_cwsd, Conponet_sig_cwsd, count_cwsd,
             sig_conponet_cwsd, delta))

        # standard results for this delta
        results.append((ccs, sizes2stats, delta))

    return results, My_results
Exemplo n.º 7
0
def run(args):
    """Run HotNet over five multiples of the median permuted delta.

    Loads the influence matrix and heat scores, derives a median delta from
    permuted data, finds the first multiple (1..10) of it whose largest
    connected component is at most ``MAX_CC_SIZE``, then runs that multiple
    and the next four. For each delta, components, significance statistics
    and JSON output are written under
    ``<output_directory>/delta_<delta>``; visualization files are written
    when ``args.edge_file`` is set.

    Args:
        args: parsed options. Read here: ``output_directory``,
            ``infmat_file``, ``infmat_index_file``, ``heat_file``,
            ``min_heat_score``, ``num_permutations``, ``parallel``,
            ``min_cc_size``, ``edge_file`` and ``network_name``.
            ``args.min_heat_score``, ``args.heat_file`` and ``args.delta``
            are overwritten as a side effect.
    """
    # create output directory if doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if os.listdir(args.output_directory):
        # one message, one line
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    infmat = scipy.io.loadmat(args.infmat_file)[INFMAT_NAME]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat = hnio.load_heat_tsv(args.heat_file)

    # filter out genes with heat score less than min_heat_score
    heat, addtl_genes, args.min_heat_score = hnheat.filter_heat(heat, args.min_heat_score)

    # find delta that maximizes # CCs of size >= MIN_SIZE for each permuted data set
    deltas = ft.get_deltas_for_heat(infmat, infmat_index, heat, addtl_genes, args.num_permutations,
                                    args.parallel)

    # find the multiple of the median delta s.t. the size of the largest CC in the real data
    # is <= MAX_CC_SIZE
    medianDelta = np.median(deltas[MIN_CC_SIZE])
    M, gene_index = hn.induce_infmat(infmat, infmat_index, sorted(heat.keys()), quiet=False)
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    for i in range(1, 11):
        G = hn.weighted_graph(sim, gene_index, i*medianDelta)
        max_cc_size = max([len(cc) for cc in hn.connected_components(G)])
        if max_cc_size <= MAX_CC_SIZE:
            break
    # If no multiple satisfied the bound, the loop ends with i == 10 and that
    # multiple is used as the starting point.
    first_multiple = i

    # load interaction network edges and determine location of static HTML files for visualization
    edges = hnio.load_ppi_edges(args.edge_file) if args.edge_file else None
    script_dir = os.path.realpath(__file__).rsplit('/', 1)[0]
    index_file = '%s/viz_files/%s' % (script_dir, VIZ_INDEX)
    subnetworks_file = '%s/viz_files/%s' % (script_dir, VIZ_SUBNETWORKS)
    gene2index = {gene: index for index, gene in infmat_index.items()}

    # and run HotNet with that multiple and the next 4 multiples
    run_deltas = [multiple * medianDelta
                  for multiple in range(first_multiple, first_multiple + 5)]
    for delta in run_deltas:
        # create output directory
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        abs_out_dir = os.path.abspath(delta_out_dir)

        # find connected components
        G = hn.weighted_graph(sim, gene_index, delta)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        print("* Performing permuted heat statistical significance...")
        heat_permutations = p.permute_heat(heat, args.num_permutations, addtl_genes, args.parallel)
        sizes = list(range(2, 11))
        # keep the "k \in [a, b]" expression on a single output line
        print("\t- Using no. of components >= k (k \\in [%s, %s]) as statistic"
              % (min(sizes), max(sizes)))
        sizes2counts = stats.calculate_permuted_cc_counts(infmat, infmat_index, heat_permutations,
                                                          delta, sizes, args.parallel)
        real_counts = stats.num_components_min_size(G, sizes)
        size2real_counts = dict(zip(sizes, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts, args.num_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically, and components
        # are sorted first by length, then alphanumerically by name of the first gene in the component
        # (two stable sorts: secondary key first, primary key last)
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        # NOTE(review): args.heat_file is replaced below, so from the second
        # delta on, "heat_file" here names the previous delta's JSON rather
        # than the user's original file - confirm intended.
        heat_dict = {"heat": heat, "parameters": {"heat_file": args.heat_file}}
        heat_json_path = abs_out_dir + "/" + HEAT_JSON
        with open(heat_json_path, 'w') as heat_out:
            json.dump(heat_dict, heat_out, indent=4)

        args.heat_file = heat_json_path
        args.delta = delta
        output_dict = {"parameters": vars(args), "sizes": hn.component_sizes(ccs),
                       "components": ccs, "statistics": sizes2stats}
        hnio.write_significance_as_tsv(abs_out_dir + "/" + SIGNIFICANCE_TSV, sizes2stats)

        with open(abs_out_dir + "/" + JSON_OUTPUT, 'w') as json_out:
            json.dump(output_dict, json_out, indent=4)

        hnio.write_components_as_tsv(abs_out_dir + "/" + COMPONENTS_TSV, ccs)

        # write visualization output if edge file given
        if args.edge_file:
            viz_data = {
                "delta": delta,
                'subnetworks': [
                    viz.get_component_json(cc, heat, edges, gene2index, args.network_name)
                    for cc in ccs
                ],
            }

            delta_viz_dir = '%s/viz/delta%s' % (args.output_directory, delta)
            if not os.path.isdir(delta_viz_dir):
                os.makedirs(delta_viz_dir)
            with open('%s/subnetworks.json' % delta_viz_dir, 'w') as viz_out:
                json.dump(viz_data, viz_out, indent=4)

            shutil.copy(subnetworks_file, delta_viz_dir)

    if args.edge_file:
        viz.write_index_file(index_file, '%s/viz/%s' % (args.output_directory, VIZ_INDEX), run_deltas)
Exemplo n.º 8
0
def run_helper(args,
               infmat,
               full_index2gene,
               G,
               nname,
               pnp,
               heat,
               hname,
               addtl_genes,
               get_deltas_fn,
               infmat_name="PPR",
               max_cc_sizes=[5, 10, 15, 20],
               verbose=0):
    """Helper shared by runHotNet2 and runClassicHotNet.

    For each delta, builds the weighted similarity graph, extracts its
    connected components and computes permutation statistics on the number
    of components of each size in ``HN2_STATS_SIZES``.

    Args:
        args: parsed options; reads ``deltas``, ``network_permutations``,
            ``num_cores``, ``min_cc_size`` and ``heat_permutations``.
        infmat: forwarded to ``get_deltas_fn``, ``hn.similarity_matrix`` and
            ``stats.calculate_permuted_cc_counts``.
        full_index2gene: mapping whose values are the gene names.
        G, nname, hname: accepted for interface compatibility; not read here.
        pnp: forwarded to ``get_deltas_fn``.
        heat: heat scores, forwarded to the similarity and permutation code.
        addtl_genes: forwarded to ``get_deltas_fn`` and ``p.permute_heat``.
        get_deltas_fn: callable used to choose deltas when ``args.deltas``
            is empty.
        infmat_name: forwarded to ``get_deltas_fn``.
        max_cc_sizes: forwarded to ``get_deltas_fn`` (never mutated here).
        verbose: verbosity level; status messages are printed above 4.

    Returns:
        A list with one ``(ccs, sizes2stats, delta)`` tuple per delta.
    """
    # Perform delta selection (if necessary)
    if args.deltas:
        deltas = args.deltas
    else:
        deltas = get_deltas_fn(full_index2gene, heat,
                               args.network_permutations, args.num_cores,
                               infmat, addtl_genes, pnp, infmat_name,
                               max_cc_sizes, verbose)

    sim, index2gene = hn.similarity_matrix(infmat,
                                           full_index2gene,
                                           heat,
                                           True,
                                           verbose=verbose)

    results = []
    for delta in deltas:
        # find connected components (kept in its own local so the ``G``
        # argument is not silently overwritten)
        sim_graph = hn.weighted_graph(sim, index2gene, delta, directed=True)
        ccs = hn.connected_components(sim_graph, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        if verbose > 4:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("* Performing permuted heat statistical significance...")
            print("\t- Using no. of components >= k (k \\in [%s, %s]) as statistic"
                  % (min(HN2_STATS_SIZES), max(HN2_STATS_SIZES)))

        heat_permutations = p.permute_heat(heat,
                                           list(full_index2gene.values()),
                                           args.heat_permutations, addtl_genes,
                                           args.num_cores)
        sizes2counts = stats.calculate_permuted_cc_counts(
            infmat, full_index2gene, heat_permutations, delta, HN2_STATS_SIZES,
            True, args.num_cores)
        real_counts = stats.num_components_min_size(sim_graph, HN2_STATS_SIZES)
        size2real_counts = dict(zip(HN2_STATS_SIZES, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.heat_permutations)

        # sort ccs list such that genes within components are sorted
        # alphanumerically, and components are sorted first by length, then
        # alphanumerically by name of the first gene in the component
        # (two stable sorts: secondary key first, primary key last)
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # Record the results for this delta
        results.append((ccs, sizes2stats, delta))

    return results
Exemplo n.º 9
0
def run(args):
    """Run HotNet for five automatically chosen deltas and write the results.

    The median of ``deltas[MIN_CC_SIZE]`` (as returned by
    ``ft.get_deltas_for_heat``) is multiplied by the smallest integer in
    1..10 for which the largest connected component of the real data has at
    most ``MAX_CC_SIZE`` genes. That multiple and the next four are then run.
    If no multiple in 1..10 satisfies the bound, multiples 10 through 14 are
    used.

    For every delta a ``delta_<delta>`` directory is created under
    ``args.output_directory`` and receives the significance TSV, the JSON
    summary and the components TSV.

    Args:
        args: object whose attributes hold the run parameters. Reads
            ``output_directory``, ``infmat_file``, ``infmat_index_file``,
            ``heat_file``, ``min_heat_score``, ``num_permutations``,
            ``parallel`` and ``min_cc_size``; ``vars(args)`` is written to
            the JSON summary. ``args.min_heat_score`` is overwritten with
            the value returned by ``hnheat.filter_heat``.
    """
    # create output directory if doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if os.listdir(args.output_directory):
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    infmat = scipy.io.loadmat(args.infmat_file)[INFMAT_NAME]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat = hnio.load_heat_tsv(args.heat_file)

    # filter out genes with heat score less than min_heat_score
    heat, addtl_genes, args.min_heat_score = hnheat.filter_heat(heat, args.min_heat_score)

    # find delta that maximizes # CCs of size >= MIN_SIZE for each permuted data set
    deltas = ft.get_deltas_for_heat(infmat, infmat_index, heat, addtl_genes,
                                    args.num_permutations, args.parallel)

    # find the multiple of the median delta s.t. the size of the largest CC in the real data
    # is <= MAX_CC_SIZE
    median_delta = np.median(deltas[MIN_CC_SIZE])
    M, gene_index = hn.induce_infmat(infmat, infmat_index, sorted(heat.keys()), quiet=False)
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    # If the bound is never met, `multiple` is left at 10 and the run proceeds anyway.
    for multiple in range(1, 11):
        G = hn.weighted_graph(sim, gene_index, multiple * median_delta)
        max_cc_size = max(len(cc) for cc in hn.connected_components(G))
        if max_cc_size <= MAX_CC_SIZE:
            break

    # and run HotNet with that multiple and the next 4 multiples
    run_deltas = [k * median_delta for k in range(multiple, multiple + 5)]
    for delta in run_deltas:
        # create output directory
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        abs_out_dir = os.path.abspath(delta_out_dir)

        # find connected components
        G = hn.weighted_graph(sim, gene_index, delta)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        print("* Performing permuted heat statistical significance...")
        heat_permutations = p.permute_heat(heat, args.num_permutations, addtl_genes,
                                           args.parallel)
        sizes = list(range(2, 11))
        print("\t- Using no. of components >= k (k \\in [%s, %s]) as statistic"
              % (min(sizes), max(sizes)))
        sizes2counts = stats.calculate_permuted_cc_counts(infmat, infmat_index,
                                                          heat_permutations, delta, sizes,
                                                          args.parallel)
        real_counts = stats.num_components_min_size(G, sizes)
        size2real_counts = dict(zip(sizes, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.num_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically, and
        # components are sorted first by length, then alphanumerically by name of the first
        # gene in the component (two stable sorts: secondary key first, primary key last)
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        output_dict = {"parameters": vars(args), "sizes": hn.component_sizes(ccs),
                       "components": ccs, "statistics": sizes2stats}
        hnio.write_significance_as_tsv(abs_out_dir + "/" + SIGNIFICANCE_TSV, sizes2stats)

        # `with` guarantees the file is closed even if serialization fails
        with open(abs_out_dir + "/" + JSON_OUTPUT, 'w') as json_out:
            json.dump(output_dict, json_out, indent=4)

        hnio.write_components_as_tsv(abs_out_dir + "/" + COMPONENTS_TSV, ccs)
Exemplo n.º 10
0
def run(args):
    """Run HotNet for five automatically chosen deltas, writing results and viz data.

    The median of ``deltas[MIN_CC_SIZE]`` (as returned by
    ``ft.get_deltas_for_heat``) is multiplied by the smallest integer in
    1..10 for which the largest connected component of the real data has at
    most ``MAX_CC_SIZE`` genes. That multiple and the next four are then run.
    If no multiple in 1..10 satisfies the bound, multiples 10 through 14 are
    used.

    For every delta a ``delta_<delta>`` directory is created under
    ``args.output_directory`` and receives the heat JSON, the significance
    TSV, the JSON summary and the components TSV. When ``args.edge_file`` is
    set, a ``viz/delta<delta>`` directory with ``subnetworks.json`` and a copy
    of the static subnetworks file is also written per delta, followed by one
    index file for all deltas.

    Args:
        args: object whose attributes hold the run parameters. Reads
            ``output_directory``, ``infmat_file``, ``infmat_index_file``,
            ``heat_file``, ``min_heat_score``, ``num_permutations``,
            ``parallel``, ``min_cc_size``, ``edge_file`` and
            ``network_name``; ``vars(args)`` is written to the JSON summary.
            Mutated in place: ``min_heat_score`` is overwritten with the
            value returned by ``hnheat.filter_heat``, and for each delta
            ``heat_file`` is pointed at that delta's heat JSON and ``delta``
            is set, so the summary records them.
    """
    # create output directory if doesn't exist; warn if it exists and is not empty
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if os.listdir(args.output_directory):
        print("WARNING: Output directory is not empty. Any conflicting files will be overwritten. "
              "(Ctrl-c to cancel).")

    infmat = scipy.io.loadmat(args.infmat_file)[INFMAT_NAME]
    infmat_index = hnio.load_index(args.infmat_index_file)
    heat = hnio.load_heat_tsv(args.heat_file)

    # Remember the user-supplied heat file: args.heat_file is overwritten for every
    # delta below, and each heat JSON must record the original input, not the
    # previous delta's output.
    source_heat_file = args.heat_file

    # filter out genes with heat score less than min_heat_score
    heat, addtl_genes, args.min_heat_score = hnheat.filter_heat(heat, args.min_heat_score)

    # find delta that maximizes # CCs of size >= MIN_SIZE for each permuted data set
    deltas = ft.get_deltas_for_heat(infmat, infmat_index, heat, addtl_genes,
                                    args.num_permutations, args.parallel)

    # find the multiple of the median delta s.t. the size of the largest CC in the real data
    # is <= MAX_CC_SIZE
    median_delta = np.median(deltas[MIN_CC_SIZE])
    M, gene_index = hn.induce_infmat(infmat, infmat_index, sorted(heat.keys()), quiet=False)
    h = hn.heat_vec(heat, gene_index)
    sim = hn.similarity_matrix(M, h)

    # If the bound is never met, `multiple` is left at 10 and the run proceeds anyway.
    for multiple in range(1, 11):
        G = hn.weighted_graph(sim, gene_index, multiple * median_delta)
        max_cc_size = max(len(cc) for cc in hn.connected_components(G))
        if max_cc_size <= MAX_CC_SIZE:
            break

    # load interaction network edges and determine location of static HTML files for visualization
    edges = hnio.load_ppi_edges(args.edge_file) if args.edge_file else None
    script_dir = os.path.dirname(os.path.realpath(__file__))
    index_file = '%s/viz_files/%s' % (script_dir, VIZ_INDEX)
    subnetworks_file = '%s/viz_files/%s' % (script_dir, VIZ_SUBNETWORKS)
    gene2index = {gene: index for index, gene in infmat_index.items()}

    # and run HotNet with that multiple and the next 4 multiples
    run_deltas = [k * median_delta for k in range(multiple, multiple + 5)]
    for delta in run_deltas:
        # create output directory
        delta_out_dir = args.output_directory + "/delta_" + str(delta)
        if not os.path.isdir(delta_out_dir):
            os.mkdir(delta_out_dir)
        abs_out_dir = os.path.abspath(delta_out_dir)

        # find connected components
        G = hn.weighted_graph(sim, gene_index, delta)
        ccs = hn.connected_components(G, args.min_cc_size)

        # calculate significance (using all genes with heat scores)
        print("* Performing permuted heat statistical significance...")
        heat_permutations = p.permute_heat(heat, args.num_permutations, addtl_genes,
                                           args.parallel)
        sizes = list(range(2, 11))
        print("\t- Using no. of components >= k (k \\in [%s, %s]) as statistic"
              % (min(sizes), max(sizes)))
        sizes2counts = stats.calculate_permuted_cc_counts(infmat, infmat_index,
                                                          heat_permutations, delta, sizes,
                                                          args.parallel)
        real_counts = stats.num_components_min_size(G, sizes)
        size2real_counts = dict(zip(sizes, real_counts))
        sizes2stats = stats.compute_statistics(size2real_counts, sizes2counts,
                                               args.num_permutations)

        # sort ccs list such that genes within components are sorted alphanumerically, and
        # components are sorted first by length, then alphanumerically by name of the first
        # gene in the component (two stable sorts: secondary key first, primary key last)
        ccs = [sorted(cc) for cc in ccs]
        ccs.sort(key=lambda comp: comp[0])
        ccs.sort(key=len, reverse=True)

        # write output
        heat_json_path = abs_out_dir + "/" + HEAT_JSON
        heat_dict = {"heat": heat, "parameters": {"heat_file": source_heat_file}}
        with open(heat_json_path, 'w') as heat_out:
            json.dump(heat_dict, heat_out, indent=4)

        # The JSON summary references this delta's heat JSON and the delta itself.
        args.heat_file = heat_json_path
        args.delta = delta
        output_dict = {"parameters": vars(args), "sizes": hn.component_sizes(ccs),
                       "components": ccs, "statistics": sizes2stats}
        hnio.write_significance_as_tsv(abs_out_dir + "/" + SIGNIFICANCE_TSV, sizes2stats)

        with open(abs_out_dir + "/" + JSON_OUTPUT, 'w') as json_out:
            json.dump(output_dict, json_out, indent=4)

        hnio.write_components_as_tsv(abs_out_dir + "/" + COMPONENTS_TSV, ccs)

        # write visualization output if edge file given
        if args.edge_file:
            subnetworks = [viz.get_component_json(cc, heat, edges, gene2index,
                                                  args.network_name)
                           for cc in ccs]
            viz_data = {"delta": delta, 'subnetworks': subnetworks}

            delta_viz_dir = '%s/viz/delta%s' % (args.output_directory, delta)
            if not os.path.isdir(delta_viz_dir):
                os.makedirs(delta_viz_dir)
            with open('%s/subnetworks.json' % delta_viz_dir, 'w') as viz_out:
                json.dump(viz_data, viz_out, indent=4)

            shutil.copy(subnetworks_file, delta_viz_dir)

    if args.edge_file:
        viz.write_index_file(index_file, '%s/viz/%s' % (args.output_directory, VIZ_INDEX),
                             run_deltas)