print "Number of nodes in network:%d" % network.number_of_nodes()

    '''
    # Remove unannotated gene from network
    for node in network.nodes():
        if not node in gene_annotation:
            network.remove_node(node)
    print "Number of nodes in network after removing unannotated genes:%d" % network.number_of_nodes()
    '''

    # Remove isolated nodes by keeping only the largest connected component
    network = nx.connected_component_subgraphs(network)[0]
    print "Number of nodes in network after removing individual genes:%d" % network.number_of_nodes()
    print "Number of edges in network after removing individual genes:%d" % network.number_of_edges()

    network_annotated_gene = {} # cross validation set of annotated genes
    for gene in gene_annotation:
        if gene in network.nodes():
            network_annotated_gene[gene] = gene_annotation[gene]
    print "Number of annotated genes in network:%d" % len(network_annotated_gene)

    sim_terms = utils.filtered_most_sim()
    sim_cache = utils.read_sim(config.simcache_fpath)
    wang_sim_cache = utils.read_sim(config.folder + "filtered_wang_sim.csv")

    #remove_predict()

    for GONUMBER in range(1, 11):
        cross_validation()

Exemplo n.º 2
0
                sim = compute_gene_sim_total(nterms, terms)
                #sim = compute_gene_sim_max(nterms, terms)
                non_sim_avg += sim
        non_sim_avg /= count
        print "%s, %f, %f" % (gene, sim_avg, non_sim_avg)


if __name__ == "__main__":
    # Script entry point: load the GO graph, annotations and interaction
    # network into module-level names, then run the average-similarity report.
    dag = DAG(config.go_fpath)

    # The third argument is the id of the DAG root term.
    # NOTE(review): how get_annotation uses it is not visible here -- confirm.
    gene_annotation = utils.get_annotation(
        config.annotation_fpath,
        config.filtered_annotation_fpath,
        dag.get_root().id,
    )

    term_ic = utils.calculate_ic(gene_annotation, dag, config.ic_fpath)

    network = utils.create_network(config.network_fpath)

    # Remove unannotated genes from the network. Iterate over a snapshot of
    # the node list: removing nodes while iterating a live node view raises
    # "dictionary changed size during iteration" on NetworkX versions where
    # nodes() is not a plain list.
    for node in list(network.nodes()):
        if node not in gene_annotation:
            network.remove_node(node)

    # Keep only the largest connected component. Selecting it with max()
    # works whether connected_component_subgraphs() returns a list or a
    # generator, and does not rely on the components arriving sorted by size.
    network = max(nx.connected_component_subgraphs(network), key=len)

    sim_cache = utils.read_sim(config.simcache_fpath)

    #compute_term_in_neighbour_ratio()
    #compute_avg_term_num()

    compute_avg_sim()


Exemplo n.º 3
0
def _score_candidate_term(network, gene, cterm, annotated_genes,
                          predicted_genes, sim_cache):
    """Return the weighted neighbour support for assigning *cterm* to *gene*.

    Each neighbour that has terms (curated annotations take precedence over
    predictions) contributes ``weight * best_sim``, where ``best_sim`` is the
    highest ``sim_cache[cterm][nterm]`` over the neighbour's terms and
    ``weight`` is ``compute_gene_sim`` between the gene's current prediction
    (or ``[cterm]`` if it has none yet) and the neighbour's terms.

    Returns ``None`` when *gene* has no neighbours at all, so the caller can
    leave the term unscored.
    """
    own_terms = predicted_genes.get(gene)
    # An existing (even empty) prediction is used as-is; only a gene that has
    # never been predicted falls back to the candidate term itself.
    query_terms = [cterm] if own_terms is None else own_terms

    score = None
    for neighbour in network.neighbors(gene):
        if score is None:
            score = 0.0

        if neighbour in annotated_genes:
            neighbour_terms = annotated_genes[neighbour]
        elif neighbour in predicted_genes:
            neighbour_terms = predicted_genes[neighbour]
        else:
            # Neighbour has neither annotations nor predictions yet.
            continue

        best_sim = -1.0
        for nterm in neighbour_terms:
            term_sim = sim_cache[cterm][nterm]
            if term_sim > best_sim:
                best_sim = term_sim

        weight = compute_gene_sim(query_terms, neighbour_terms, sim_cache)
        score += 1.0 * weight * best_sim

    return score


def iterate_weighted_mv(network, annotated_genes, go_num, sim_cache=None,
                        max_iterations=20):
    """Iteratively predict GO terms for unannotated genes by weighted voting.

    On every sweep, each gene of *network* that is not in *annotated_genes*
    has its candidate terms (from ``get_candidate_terms``) scored against its
    neighbours, and the *go_num* best-scoring terms become its prediction.
    Predictions are updated in place during a sweep, so genes visited later
    see the fresh predictions of genes visited earlier.

    Sweeps stop after *max_iterations*, or as soon as the integer part of
    ``compute_total_sim`` is the same as after the previous sweep.

    Args:
        network: graph object providing ``nodes()`` and ``neighbors(gene)``.
        annotated_genes: mapping of gene -> iterable of its known terms.
        go_num: maximum number of terms to keep per predicted gene.
        sim_cache: term-similarity lookup indexed as ``sim_cache[a][b]``.
            When omitted it is read from
            ``pfalciparum_data/modified_wang_sim.csv``, as before.
        max_iterations: upper bound on the number of sweeps (default 20).

    Returns:
        dict mapping each predicted gene to its list of predicted terms,
        ordered from highest to lowest score.
    """
    if sim_cache is None:
        sim_cache = utils.read_sim("pfalciparum_data/modified_wang_sim.csv")

    predicted_genes = {}
    last_sum = -1.0

    for _ in range(max_iterations):
        for gene in network.nodes():
            if gene in annotated_genes:
                continue

            candidate_terms = get_candidate_terms(
                network, gene, annotated_genes, predicted_genes)
            if len(candidate_terms) == 0:
                # Nothing to vote on; keep any earlier prediction untouched.
                continue

            cterm_sim_sum = {}
            for cterm in candidate_terms:
                score = _score_candidate_term(
                    network, gene, cterm, annotated_genes, predicted_genes,
                    sim_cache)
                if score is not None:
                    cterm_sim_sum[cterm] = score

            # Select the top go_num terms as predicted GO terms for the gene.
            top_terms = heapq.nlargest(
                go_num, cterm_sim_sum.items(), key=itemgetter(1))
            predicted_genes[gene] = [term for term, _ in top_terms]

        total_sum = compute_total_sim(
            network, annotated_genes, predicted_genes, sim_cache)
        # Convergence is judged on the integer part of the total only.
        if int(total_sum) == int(last_sum):
            break
        last_sum = total_sum

    return predicted_genes