def get_random_mean_edges(G, gene_set, sims):
    """Measure randomly drawn gene sets and plot how often each result occurs.

    For each of ``sims`` simulations, as many nodes are drawn at random from
    the network as there are members of ``gene_set`` present in it.  The drawn
    set is passed to ``get_edges_size`` and the returned value is recorded.
    A bar chart of the number of occurrences of each recorded value is written
    to ``edges_final.pdf``.

    Parameters
    ----------
    G : graph
        Network to sample from.  It is not modified; a copy is used internally.
    gene_set : set
        Seed genes; only those that are nodes of ``G`` determine the sample
        size.
    sims : int
        Number of random simulations to run.

    Returns
    -------
    list
        One ``get_edges_size`` result per simulation, in simulation order.
    """
    from collections import Counter

    # The original ignored the ``G`` argument and rebuilt the graph from a
    # global ``network_file`` (through a file handle it never closed).  Use the
    # graph the caller passed in instead.  Self-links are stripped on a private
    # copy so the caller's graph is left untouched.
    # NOTE(review): presumably remove_self_links mutates its argument in
    # place, as the original discarded its return value -- confirm.
    graph = G.copy()
    tools.remove_self_links(graph)

    # random.sample needs a real sequence (a node view is rejected on
    # Python 3.11+), and the list only has to be built once.
    node_pool = list(graph.nodes())
    number_of_seed_genes = len(gene_set & set(node_pool))

    edges_list = []
    print("")
    for i in range(1, sims + 1):
        # Progress indicator, refreshed in place every 100 simulations.
        if i % 100 == 0:
            sys.stdout.write("> random simulation [%s of %s]\r" % (i, sims))
            sys.stdout.flush()

        rand_seeds = set(random.sample(node_pool, number_of_seed_genes))
        edges_list.append(get_edges_size(graph, rand_seeds))

    # Occurrences of each distinct result (single pass instead of one
    # list.count() scan per distinct value).
    occurrences = Counter(edges_list)

    # NOTE(review): the y-axis label says 'Frequency_percentage' but the bars
    # are raw occurrence counts; the label text is kept unchanged.
    plt.xlabel('edges')
    plt.ylabel('Frequency_percentage')
    plt.bar(list(occurrences.keys()), list(occurrences.values()))
    plt.title('The histogram of the edges-Frequency_percentage')
    plt.savefig('edges_final.pdf')
    plt.close()

    return edges_list
def main(interactome, tsv, gene_file, only_crosses):
    """Rank every pair of protein gene sets by s_AB and report the result.

    Each gene file is read as tab-separated lines of the form
    ``protein<TAB>gene;gene;...``; lines starting with ``#`` are ignored.
    For every pair of proteins (a protein paired with itself included) the
    values d_AB and ``s_AB = d_AB - (d_A + d_B) / 2`` are computed.  The pairs
    are printed in order of increasing s_AB and optionally written to a TSV
    file.

    Parameters
    ----------
    interactome
        Passed to ``read_network`` to load the network.
    tsv
        Path of the TSV file to write, or a falsy value to skip writing.
    gene_file
        Sequence of ``(data_file, title)`` pairs.  When a title is given and
        there is more than one gene file, proteins are labelled
        ``'protein [title]'``.
    only_crosses
        If true, pairs whose two proteins have the same
        ``_title_from_protein`` value are skipped.

    Raises
    ------
    Exception
        If ``only_crosses`` is set and fewer than two gene files are given.
    """
    if only_crosses and len(gene_file) < 2:
        raise Exception('Must have at least 2 gene files for --only-crosses')

    # Load network
    G = read_network(interactome)
    all_genes_in_network = set(G.nodes())
    remove_self_links(G)

    if tsv:
        # Start a fresh file with the header row.  Text mode, so that writing
        # str works on Python 3 as well.
        with open(tsv, 'w') as f:
            f.write('#\trank\tprotein_a\tprotein_b\td_AB\ts_AB\n')
    print('')

    # Read gene files and separate by protein
    genes = defaultdict(list)
    several_files = len(gene_file) > 1
    for data_file, title in gene_file:
        with open(data_file) as f:
            for line in f.read().splitlines():
                # Skip comments, and blank lines (which would otherwise break
                # the two-column unpacking below).
                if line.startswith('#') or not line.strip():
                    continue
                protein, gene_ids = line.split('\t')
                if title and several_files:
                    protein = '%s [%s]' % (protein, title)
                genes[protein].extend(gene_ids.split(';'))

    # The original consulted a ``cache`` mapping that the visible code never
    # defined or filled; keep the memo local so the lookup cannot fail and
    # results really are reused.
    cache = {}

    def analyze_proteins(protein_a, protein_b):
        """Return ``(d_AB, s_AB)`` for one protein pair."""
        key = (protein_a, protein_b)
        if key not in cache:
            # Only genes that are present in the network take part.
            genes_A = set(genes[protein_a]) & all_genes_in_network
            genes_B = set(genes[protein_b]) & all_genes_in_network

            d_A = calc_single_set_distance(G, genes_A)
            d_B = calc_single_set_distance(G, genes_B)
            d_AB = calc_set_pair_distances(G, genes_A, genes_B)
            s_AB = d_AB - (d_A + d_B) / 2.
            cache[key] = (d_AB, s_AB)
        return cache[key]

    # Print information about each protein set as debugging information
    # NOTE(review): the original '. ' literal carried a stray line-break
    # character after the space; it is omitted here -- confirm that it was
    # unintended.
    print('PROTEINS')
    for i, protein in enumerate(sorted(genes)):
        print(str(i + 1) + '. ' + protein + '; S = ' +
              str(get_lcc_size(G, genes[protein])))
    print('')

    # Analyze each protein combination and sort from lowest s_AB to highest
    analyses = []
    for protein_a, protein_b in combinations_with_replacement(sorted(genes), 2):
        if only_crosses and \
                _title_from_protein(protein_a) == _title_from_protein(protein_b):
            continue
        d_AB, s_AB = analyze_proteins(protein_a, protein_b)
        analyses.append((protein_a, protein_b, d_AB, s_AB))
    analyses.sort(key=lambda x: x[3])

    if tsv:
        # Append all ranked rows in one go instead of reopening the file for
        # every single row.
        with open(tsv, 'a') as f:
            f.writelines(
                '\t'.join((str(rank), protein_a, protein_b,
                           str(d_AB), str(s_AB))) + '\n'
                for rank, (protein_a, protein_b, d_AB, s_AB)
                in enumerate(analyses, 1))

    # Print analyses in order
    for rank, (protein_a, protein_b, d_AB, s_AB) in enumerate(analyses, 1):
        print(str(rank) + '. Proteins: ' + protein_a + ' and ' + protein_b)
        print('d_AB = ' + str(d_AB))
        print('s_AB = ' + str(s_AB))
        print('')
sys.exit(0) if network_file == 'interactome.tsv': print('> default network from "interactome.tsv" will be used') # -------------------------------------------------------- # # LOADING NETWORK and DISEASE GENES # # -------------------------------------------------------- # read network G = tools.read_network(network_file) # get all genes ad remove self links all_genes_in_network = set(G.nodes()) tools.remove_self_links(G) # read gene set gene_set_full = tools.read_gene_list(gene_file) # removing genes that are not in the network: gene_set = gene_set_full & all_genes_in_network if len(gene_set_full) != len(gene_set): print("> ignoring %s genes that are not in the network" % ( len(gene_set_full - all_genes_in_network))) print("> remaining number of genes: %s" % (len(gene_set))) # -------------------------------------------------------- # # CALCULATE NETWORK QUANTITIES # # --------------------------------------------------------
def main(interactome, tsv, gene_file, only_crosses):
    """Compute d_AB and s_AB for all protein pairs and report them ranked.

    Gene files hold tab-separated lines ``protein<TAB>gene;gene;...``
    (``#`` lines are comments).  All pairs of proteins, including each protein
    with itself, are analysed; ``s_AB`` is ``d_AB - (d_A + d_B) / 2``.  The
    results are sorted by ascending s_AB, printed, and, when ``tsv`` is given,
    also written to that file.

    Parameters
    ----------
    interactome
        Passed to ``read_network`` to load the network.
    tsv
        Output TSV path, or a falsy value for no file output.
    gene_file
        Sequence of ``(data_file, title)`` pairs.  With more than one gene
        file, a non-empty title is appended to each protein name as
        ``'protein [title]'``.
    only_crosses
        If true, skip pairs for which ``_title_from_protein`` returns the same
        value for both proteins.

    Raises
    ------
    Exception
        If ``only_crosses`` is set and fewer than two gene files are given.
    """
    if only_crosses and len(gene_file) < 2:
        raise Exception('Must have at least 2 gene files for --only-crosses')

    # Load network
    G = read_network(interactome)
    all_genes_in_network = set(G.nodes())
    remove_self_links(G)

    if tsv:
        # Truncate any earlier output and write the header.  Text mode keeps
        # the str write valid on Python 3 too.
        with open(tsv, 'w') as f:
            f.write('#\trank\tprotein_a\tprotein_b\td_AB\ts_AB\n')
    print('')

    # Read gene files and separate by protein
    genes = defaultdict(list)
    label_with_title = len(gene_file) > 1
    for data_file, title in gene_file:
        with open(data_file) as f:
            for line in f.read().splitlines():
                # Comments are ignored; so are blank lines, which would
                # otherwise fail the two-column unpacking below.
                if line.startswith('#') or not line.strip():
                    continue
                protein, gene_ids = line.split('\t')
                if title and label_with_title:
                    protein = '%s [%s]' % (protein, title)
                genes[protein].extend(gene_ids.split(';'))

    # The visible original looked results up in a ``cache`` that was never
    # defined or populated; a local memo makes the lookup safe and effective.
    cache = {}

    def analyze_proteins(protein_a, protein_b):
        """Return ``(d_AB, s_AB)`` for one protein pair."""
        pair = (protein_a, protein_b)
        if pair in cache:
            return cache[pair]

        # Restrict both gene lists to genes that exist in the network.
        genes_A = set(genes[protein_a]) & all_genes_in_network
        genes_B = set(genes[protein_b]) & all_genes_in_network

        d_A = calc_single_set_distance(G, genes_A)
        d_B = calc_single_set_distance(G, genes_B)
        d_AB = calc_set_pair_distances(G, genes_A, genes_B)
        s_AB = d_AB - (d_A + d_B) / 2.

        cache[pair] = (d_AB, s_AB)
        return cache[pair]

    # Print information about each protein set as debugging information
    # NOTE(review): a stray line-break character followed the space in the
    # original '. ' literal; it is left out here -- confirm it was unintended.
    print('PROTEINS')
    for i, protein in enumerate(sorted(genes)):
        print(str(i + 1) + '. ' + protein + '; S = ' +
              str(get_lcc_size(G, genes[protein])))
    print('')

    # Analyze each protein combination and sort from lowest s_AB to highest
    analyses = []
    for protein_a, protein_b in combinations_with_replacement(sorted(genes), 2):
        same_title = (_title_from_protein(protein_a) ==
                      _title_from_protein(protein_b)) if only_crosses else False
        if same_title:
            continue
        d_AB, s_AB = analyze_proteins(protein_a, protein_b)
        analyses.append((protein_a, protein_b, d_AB, s_AB))
    analyses.sort(key=lambda x: x[3])

    if tsv:
        # One append for all rows rather than one open() per row.
        rows = []
        for rank, analysis in enumerate(analyses, 1):
            rows.append('\t'.join((str(rank), analysis[0], analysis[1],
                                   str(analysis[2]), str(analysis[3]))) + '\n')
        with open(tsv, 'a') as f:
            f.writelines(rows)

    # Print analyses in order
    for rank, analysis in enumerate(analyses, 1):
        print(str(rank) + '. Proteins: ' + analysis[0] + ' and ' + analysis[1])
        print('d_AB = ' + str(analysis[2]))
        print('s_AB = ' + str(analysis[3]))
        print('')
def main(interactome, gene_file):
    """Report d_AB, s_AB and the closest gene pairs for all protein pairs.

    Gene files hold tab-separated lines ``protein<TAB>gene;gene;...``
    (``#`` lines are comments).  For every pair of proteins, including each
    protein with itself, the closest partner of every gene in the other set is
    determined from ``get_pathlengths_for_two_sets``.  d_AB is the mean of
    those closest distances and ``s_AB = d_AB - (d_A + d_B) / 2``.  Pairs are
    printed in order of increasing s_AB, each followed by its gene-to-gene
    distances in increasing order.

    Parameters
    ----------
    interactome
        Passed to ``read_network`` to load the network.
    gene_file
        Sequence of ``(data_file, title)`` pairs.  With more than one gene
        file, a non-empty title is appended to each protein name as
        ``'protein [title]'``.
    """
    # Load network, discarding anything printed while doing so.  The previous
    # stream is restored even if loading fails, and it is the stream that was
    # active before (not unconditionally sys.__stdout__).
    previous_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        G = read_network(interactome)
        all_genes_in_network = set(G.nodes())
        remove_self_links(G)
    finally:
        sys.stdout = previous_stdout

    # Read gene files and separate by protein
    genes = defaultdict(list)
    label_with_title = len(gene_file) > 1
    for data_file, title in gene_file:
        with open(data_file) as f:
            for line in f.read().splitlines():
                # Comments are ignored; so are blank lines, which would
                # otherwise fail the two-column unpacking below.
                if line.startswith('#') or not line.strip():
                    continue
                protein, gene_ids = line.split('\t')
                if title and label_with_title:
                    protein = '%s [%s]' % (protein, title)
                genes[protein].extend(gene_ids.split(';'))

    def closest_partners(sources, targets, path_lengths):
        """Return one ``(source, target, dist)`` per source gene.

        The tuple is the nearest target for that source.  Sources for which no
        distance to any target is available are left out.
        """
        nearest = []
        for source in sources:
            candidates = []
            for target in targets:
                if source == target:
                    candidates.append((source, target, 0))
                    continue
                try:
                    # The lookup is keyed with the smaller gene first.
                    dist = path_lengths[min(source, target)][max(source, target)]
                except KeyError:
                    # No recorded path length for this pair: skip it.
                    continue
                candidates.append((source, target, dist))
            if candidates:
                nearest.append(min(candidates, key=lambda x: x[2]))
        return nearest

    # Returns d_AB, s_AB, and ((gene_1, gene_2, dist)...) of a protein pair
    def analyze_proteins(protein_a, protein_b):
        # Only genes that are present in the network take part.
        genes_A = set(genes[protein_a]) & all_genes_in_network
        genes_B = set(genes[protein_b]) & all_genes_in_network
        all_path_lengths = get_pathlengths_for_two_sets(G, genes_A, genes_B)

        # Perform calculations
        d_A = calc_single_set_distance(G, genes_A)
        d_B = calc_single_set_distance(G, genes_B)

        # Closest partner in B for every gene of A, then the reverse.
        all_distances = closest_partners(genes_A, genes_B, all_path_lengths)
        all_distances += closest_partners(genes_B, genes_A, all_path_lengths)

        # A real list is required here: np.mean cannot average a lazy map().
        d_AB = np.mean([entry[2] for entry in all_distances])
        s_AB = d_AB - (d_A + d_B) / 2.
        return d_AB, s_AB, sorted(all_distances, key=lambda x: x[2])

    # Print information about each protein set as debugging information
    # NOTE(review): a stray line-break character followed the space in the
    # original '. ' literal; it is left out here -- confirm it was unintended.
    print('PROTEINS')
    for i, protein in enumerate(sorted(genes)):
        print(str(i + 1) + '. ' + protein + '; S = ' +
              str(get_lcc_size(G, genes[protein])))
    print('')

    # Analyze each protein combination and sort from lowest s_AB to highest
    analyses = []
    for protein_a, protein_b in combinations_with_replacement(sorted(genes), 2):
        d_AB, s_AB, distances = analyze_proteins(protein_a, protein_b)
        analyses.append((protein_a, protein_b, d_AB, s_AB, distances))
    analyses.sort(key=lambda x: x[3])

    # Print analyses in order
    for rank, analysis in enumerate(analyses, 1):
        protein_a, protein_b, d_AB, s_AB, distances = analysis
        print(str(rank) + '. Proteins: ' + protein_a + ' and ' + protein_b)
        print('d_AB = ' + str(d_AB))
        print('s_AB = ' + str(s_AB))
        for gene_1, gene_2, dist in distances:
            print(gene_1 + ' -> ' + gene_2 + ' = ' + str(dist))
        print('')
if network_file == 'interactome.tsv':
    print('> default network from "interactome.tsv" will be used')

# --------------------------------------------------------
#
# LOADING NETWORK and DISEASE GENES
#
# --------------------------------------------------------

# read network
G = tools.read_network(network_file)
# get all genes and remove self links
all_genes_in_network = set(G.nodes())
tools.remove_self_links(G)

# read gene set
gene_set_full = tools.read_gene_list(gene_file)
# removing genes that are not in the network:
gene_set = gene_set_full & all_genes_in_network
if len(gene_set_full) != len(gene_set):
    # Single-argument print(...) calls behave the same as the former print
    # statements and also run on Python 3.
    print("> ignoring %s genes that are not in the network" % (
        len(gene_set_full - all_genes_in_network)))
    print("> remaining number of genes: %s" % (len(gene_set)))

# --------------------------------------------------------
#
# CALCULATE NETWORK QUANTITIES
#