def get_random_mean_edges(G, gene_set, sims):
    """Build a random-expectation sample of ``get_edges_size`` values.

    For each of ``sims`` rounds, a random node set with as many members as
    there are seed genes present in the network is drawn, and
    ``get_edges_size`` is evaluated on it.  A bar chart of how often each
    value occurred is written to ``edges_final.pdf``.

    Args:
        G: The network to sample from.  It is copied before self-links are
            removed, so the caller's graph is left untouched.
        gene_set: Set of seed genes (must support ``&`` with a set); only its
            overlap with the nodes of ``G`` determines the sample size.
        sims: Number of random draws.

    Returns:
        list: One ``get_edges_size`` result per simulation, in draw order.
        Despite the function name, no mean is computed here.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    # Use the graph that was passed in rather than re-parsing a network file;
    # operate on a copy so that stripping self-links stays a local effect.
    graph = G.copy()
    tools.remove_self_links(graph)

    # random.sample needs a real sequence; a node view is not one.
    node_pool = list(graph.nodes())
    number_of_seed_genes = len(gene_set & set(node_pool))

    edges_list = []
    print("")
    for i in range(1, sims + 1):
        if i % 100 == 0:
            sys.stdout.write("> random simulation [%s of %s]\r" % (i, sims))
            sys.stdout.flush()
        rand_seeds = set(random.sample(node_pool, number_of_seed_genes))
        edges_list.append(get_edges_size(graph, rand_seeds))

    # Occurrences of each observed value, counted in a single pass.
    frequency = Counter(edges_list)
    x = list(frequency.keys())
    y = list(frequency.values())

    # NOTE(review): the axis label and title say "percentage", but the bars
    # show raw occurrence counts -- confirm which one is intended.
    plt.xlabel('edges')
    plt.ylabel('Frequency_percentage')
    plt.bar(x, y)
    plt.title('The histogram of the edges-Frequency_percentage')
    plt.savefig('edges_final.pdf')
    plt.close()

    return edges_list
def main(interactome, tsv, gene_file, only_crosses):
    """Rank all pairs of protein gene sets by their network separation.

    The network is loaded, the protein -> gene-id tables are read, and for
    every pair of proteins (including a protein with itself) ``d_AB`` and
    ``s_AB = d_AB - (d_A + d_B) / 2`` are computed.  The pairs are printed
    from lowest to highest ``s_AB`` and, if requested, also written to a TSV
    file.

    Args:
        interactome: Value handed to ``read_network`` to load the graph.
        tsv: Path of the ranking table to write; falsy to skip the file.
        gene_file: Sequence of ``(data_file, title)`` pairs.  Each data file
            holds tab-separated ``protein<TAB>id1;id2;...`` lines; lines
            starting with ``#`` and blank lines are ignored.
        only_crosses: When true, pairs for which ``_title_from_protein``
            gives the same result for both proteins are skipped.

    Raises:
        Exception: If ``only_crosses`` is set with fewer than two gene files.
        ValueError: If a data line does not have exactly two columns.
    """
    if only_crosses and len(gene_file) < 2:
        raise Exception('Must have at least 2 gene files for --only-crosses')

    # Load network
    G = read_network(interactome)
    all_genes_in_network = set(G.nodes())
    remove_self_links(G)

    if tsv:
        # Truncate and write the header up front so an unwritable path fails
        # before any expensive computation.  Text mode: we write str.
        with open(tsv, 'w') as f:
            f.write('#\trank\tprotein_a\tprotein_b\td_AB\ts_AB\n')
    print('')

    # Read gene files and group the gene ids by protein
    genes = defaultdict(list)
    tag_with_title = len(gene_file) > 1
    for data_file, title in gene_file:
        with open(data_file) as f:
            for line in f.read().splitlines():
                if not line.strip() or line.startswith('#'):
                    continue
                protein, gene_ids = line.split('\t')
                if title and tag_with_title:
                    # Disambiguate equal protein names from different files.
                    protein = '%s [%s]' % (protein, title)
                genes[protein].extend(gene_ids.split(';'))

    # Returns d_AB and s_AB of a protein pair
    def analyze_proteins(protein_a, protein_b):
        # NOTE(review): `cache` is not defined in this function and nothing
        # here stores into it; presumably it is a module-level mapping of
        # precomputed results -- confirm it exists, otherwise this raises
        # NameError.
        if (protein_a, protein_b) in cache:
            return cache[(protein_a, protein_b)]

        genes_A = set(genes[protein_a]) & all_genes_in_network
        genes_B = set(genes[protein_b]) & all_genes_in_network

        d_A = calc_single_set_distance(G, genes_A)
        d_B = calc_single_set_distance(G, genes_B)
        d_AB = calc_set_pair_distances(G, genes_A, genes_B)
        s_AB = d_AB - (d_A + d_B) / 2.

        return d_AB, s_AB

    # Print information about each protein set as debugging information
    print('PROTEINS')
    for rank, protein in enumerate(sorted(genes), 1):
        print('%s. %s; S = %s' % (rank, protein,
                                  get_lcc_size(G, genes[protein])))
    print('')

    # Analyze each protein combination and sort from lowest s_AB to highest
    analyses = []
    for protein_a, protein_b in combinations_with_replacement(
            sorted(genes), 2):
        if only_crosses and (_title_from_protein(protein_a)
                             == _title_from_protein(protein_b)):
            continue
        d_AB, s_AB = analyze_proteins(protein_a, protein_b)
        analyses.append((protein_a, protein_b, d_AB, s_AB))
    analyses.sort(key=lambda entry: entry[3])

    # Append the ranked rows with a single open instead of one per row
    if tsv:
        with open(tsv, 'a') as f:
            for rank, (protein_a, protein_b, d_AB, s_AB) in enumerate(
                    analyses, 1):
                f.write('\t'.join((str(rank), protein_a, protein_b,
                                   str(d_AB), str(s_AB))) + '\n')

    # Print every analysis in rank order
    for rank, (protein_a, protein_b, d_AB, s_AB) in enumerate(analyses, 1):
        print('%s. Proteins: %s and %s' % (rank, protein_a, protein_b))
        print('d_AB = ' + str(d_AB))
        print('s_AB = ' + str(s_AB))
        print('')
# Example #4
# 0
def main(interactome, tsv, gene_file, only_crosses):
    """Rank all pairs of protein gene sets by their network separation.

    The network is loaded, the protein -> gene-id tables are read, and for
    every pair of proteins (including a protein with itself) ``d_AB`` and
    ``s_AB = d_AB - (d_A + d_B) / 2`` are computed.  The pairs are printed
    from lowest to highest ``s_AB`` and, if requested, also written to a TSV
    file.

    Args:
        interactome: Value handed to ``read_network`` to load the graph.
        tsv: Path of the ranking table to write; falsy to skip the file.
        gene_file: Sequence of ``(data_file, title)`` pairs.  Each data file
            holds tab-separated ``protein<TAB>id1;id2;...`` lines; lines
            starting with ``#`` and blank lines are ignored.
        only_crosses: When true, pairs for which ``_title_from_protein``
            gives the same result for both proteins are skipped.

    Raises:
        Exception: If ``only_crosses`` is set with fewer than two gene files.
        ValueError: If a data line does not have exactly two columns.
    """
    if only_crosses and len(gene_file) < 2:
        raise Exception('Must have at least 2 gene files for --only-crosses')

    # Load network
    G = read_network(interactome)
    all_genes_in_network = set(G.nodes())
    remove_self_links(G)

    if tsv:
        # Truncate and write the header up front so an unwritable path fails
        # before any expensive computation.  Text mode: we write str.
        with open(tsv, 'w') as f:
            f.write('#\trank\tprotein_a\tprotein_b\td_AB\ts_AB\n')
    print('')

    # Read gene files and group the gene ids by protein
    genes = defaultdict(list)
    tag_with_title = len(gene_file) > 1
    for data_file, title in gene_file:
        with open(data_file) as f:
            for line in f.read().splitlines():
                if not line.strip() or line.startswith('#'):
                    continue
                protein, gene_ids = line.split('\t')
                if title and tag_with_title:
                    # Disambiguate equal protein names from different files.
                    protein = '%s [%s]' % (protein, title)
                genes[protein].extend(gene_ids.split(';'))

    # Returns d_AB and s_AB of a protein pair
    def analyze_proteins(protein_a, protein_b):
        # NOTE(review): `cache` is not defined in this function and nothing
        # here stores into it; presumably it is a module-level mapping of
        # precomputed results -- confirm it exists, otherwise this raises
        # NameError.
        if (protein_a, protein_b) in cache:
            return cache[(protein_a, protein_b)]

        genes_A = set(genes[protein_a]) & all_genes_in_network
        genes_B = set(genes[protein_b]) & all_genes_in_network

        d_A = calc_single_set_distance(G, genes_A)
        d_B = calc_single_set_distance(G, genes_B)
        d_AB = calc_set_pair_distances(G, genes_A, genes_B)
        s_AB = d_AB - (d_A + d_B) / 2.

        return d_AB, s_AB

    # Print information about each protein set as debugging information
    print('PROTEINS')
    for rank, protein in enumerate(sorted(genes), 1):
        print('%s. %s; S = %s' % (rank, protein,
                                  get_lcc_size(G, genes[protein])))
    print('')

    # Analyze each protein combination and sort from lowest s_AB to highest
    analyses = []
    for protein_a, protein_b in combinations_with_replacement(
            sorted(genes), 2):
        if only_crosses and (_title_from_protein(protein_a)
                             == _title_from_protein(protein_b)):
            continue
        d_AB, s_AB = analyze_proteins(protein_a, protein_b)
        analyses.append((protein_a, protein_b, d_AB, s_AB))
    analyses.sort(key=lambda entry: entry[3])

    # Append the ranked rows with a single open instead of one per row
    if tsv:
        with open(tsv, 'a') as f:
            for rank, (protein_a, protein_b, d_AB, s_AB) in enumerate(
                    analyses, 1):
                f.write('\t'.join((str(rank), protein_a, protein_b,
                                   str(d_AB), str(s_AB))) + '\n')

    # Print every analysis in rank order
    for rank, (protein_a, protein_b, d_AB, s_AB) in enumerate(analyses, 1):
        print('%s. Proteins: %s and %s' % (rank, protein_a, protein_b))
        print('d_AB = ' + str(d_AB))
        print('s_AB = ' + str(s_AB))
        print('')
def main(interactome, gene_file):
    """Rank protein gene-set pairs by separation and list the closest genes.

    For every pair of proteins (including a protein with itself) this
    computes ``d_AB`` as the mean, over all genes of both sets, of the
    distance to the nearest gene of the other set, and
    ``s_AB = d_AB - (d_A + d_B) / 2``.  Pairs are printed from lowest to
    highest ``s_AB`` together with the individual nearest-gene distances.

    Args:
        interactome: Value handed to ``read_network`` to load the graph.
        gene_file: Sequence of ``(data_file, title)`` pairs.  Each data file
            holds tab-separated ``protein<TAB>id1;id2;...`` lines; lines
            starting with ``#`` and blank lines are ignored.

    Raises:
        ValueError: If a data line does not have exactly two columns.
    """
    # Load network; anything printed while loading is swallowed.  The
    # previous stream is restored even if loading fails, and whatever
    # redirection the caller had in place is preserved.
    saved_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        G = read_network(interactome)
        all_genes_in_network = set(G.nodes())
        remove_self_links(G)
    finally:
        sys.stdout = saved_stdout

    # Read gene files and group the gene ids by protein
    genes = defaultdict(list)
    tag_with_title = len(gene_file) > 1
    for data_file, title in gene_file:
        with open(data_file) as f:
            for line in f.read().splitlines():
                if not line.strip() or line.startswith('#'):
                    continue
                protein, gene_ids = line.split('\t')
                if title and tag_with_title:
                    # Disambiguate equal protein names from different files.
                    protein = '%s [%s]' % (protein, title)
                genes[protein].extend(gene_ids.split(';'))

    # Returns d_AB, s_AB, and ((gene_1, gene_2, dist)...) of a protein pair
    def analyze_proteins(protein_a, protein_b):
        genes_A = set(genes[protein_a]) & all_genes_in_network
        genes_B = set(genes[protein_b]) & all_genes_in_network

        all_path_lengths = get_pathlengths_for_two_sets(G, genes_A, genes_B)

        def closest_partners(sources, targets):
            # For each source gene, pick its nearest target as
            # (source, target, dist); sources with no candidate are dropped.
            nearest = []
            for source in sources:
                candidates = []
                for target in targets:
                    if source == target:
                        candidates.append((source, target, 0))
                        continue
                    # The lookup below is keyed by the ordered pair.
                    low, high = min(source, target), max(source, target)
                    try:
                        dist = all_path_lengths[low][high]
                    except KeyError:
                        # No path length recorded for this pair, so it is
                        # simply not a candidate.
                        continue
                    candidates.append((source, target, dist))
                if candidates:
                    nearest.append(min(candidates, key=lambda c: c[2]))
            return nearest

        # Perform calculations
        d_A = calc_single_set_distance(G, genes_A)
        d_B = calc_single_set_distance(G, genes_B)

        all_distances = (closest_partners(genes_A, genes_B)
                         + closest_partners(genes_B, genes_A))

        # Build a real list: np.mean cannot consume a lazy map object.
        if all_distances:
            d_AB = np.mean([dist for _, _, dist in all_distances])
        else:
            d_AB = float('nan')
        s_AB = d_AB - (d_A + d_B) / 2.

        return d_AB, s_AB, sorted(all_distances, key=lambda c: c[2])

    # Print information about each protein set as debugging information
    print('PROTEINS')
    for rank, protein in enumerate(sorted(genes), 1):
        print('%s. %s; S = %s' % (rank, protein,
                                  get_lcc_size(G, genes[protein])))
    print('')

    # Analyze each protein combination and sort from lowest s_AB to highest
    analyses = []
    for protein_a, protein_b in combinations_with_replacement(
            sorted(genes), 2):
        d_AB, s_AB, distances = analyze_proteins(protein_a, protein_b)
        analyses.append((protein_a, protein_b, d_AB, s_AB, distances))
    analyses.sort(key=lambda entry: entry[3])

    # Print analyses in order
    for rank, analysis in enumerate(analyses, 1):
        protein_a, protein_b, d_AB, s_AB, distances = analysis
        print('%s. Proteins: %s and %s' % (rank, protein_a, protein_b))
        print('d_AB = ' + str(d_AB))
        print('s_AB = ' + str(s_AB))
        for gene_1, gene_2, dist in distances:
            print(gene_1 + ' -> ' + gene_2 + ' = ' + str(dist))
        print('')
    if network_file == 'interactome.tsv':
        print '> default network from "interactome.tsv" will be used'


    # --------------------------------------------------------
    #
    # LOADING NETWORK and DISEASE GENES
    #
    # --------------------------------------------------------

    # read network
    G  = tools.read_network(network_file)
    # get all genes ad remove self links
    all_genes_in_network = set(G.nodes())
    tools.remove_self_links(G)

    # read gene set
    gene_set_full = tools.read_gene_list(gene_file)
    # removing genes that are not in the network:
    gene_set = gene_set_full & all_genes_in_network
    if len(gene_set_full) != len(gene_set):
        print "> ignoring %s genes that are not in the network" %(
            len(gene_set_full - all_genes_in_network))
        print "> remaining number of genes: %s" %(len(gene_set))


    # --------------------------------------------------------
    #
    # CALCULATE NETWORK QUANTITIES
    #