def run_in_out_entropy_comparision(dataset, graphml_file, folder):
    """Compare per-protein average entropy inside vs. outside the network.

    For each protein alignment in *folder*, computes positional entropies and
    splits the average into overall / in-network / out-of-network values, using
    the residues present in the graphml network.

    Returns three dicts keyed by protein name (plus a 'dataset' entry):
    (overall averages, in-network averages, out-of-network averages).
    """
    graph = util.read_graphml(graphml_file)
    network_residues = get_residues_from_graph(graph)
    overall_avgs = {'dataset': dataset}
    in_network_avgs = {'dataset': dataset}
    out_network_avgs = {'dataset': dataset}
    for protein in ['ha', 'na', 'm1', 'm2', 'np', 'pb1', 'pb2', 'pa', 'ns1', 'ns2']:
        alignment_path = folder + os.sep + protein + '.afasta'
        alignment = AlignIO.read(alignment_path, 'fasta')
        sequences = [record.seq for record in alignment]
        entropies = util.entropy_all_positions(sequences)
        avg_all, avg_in, avg_out = in_out_entropies(
            entropies, network_residues[protein], folder, protein)
        overall_avgs[protein] = avg_all
        in_network_avgs[protein] = avg_in
        out_network_avgs[protein] = avg_out
        print('averages for ', dataset, ' ', protein, ' ', avg_in, avg_out)
    return overall_avgs, in_network_avgs, out_network_avgs
def get_cooccurences(graphml_file, fasta_folder):
    """Collect residue co-occurrence counts for the 50 strongest and 50 weakest edges.

    Each graph node is labelled '<protein>_<1-based residue>'; the label is
    parsed and the matching '.afasta' alignments in *fasta_folder* are compared
    with perform_residue_analysis.

    Returns (top_50_counts, bottom_50_counts): two lists of the per-edge
    co-occurrence dicts, in edge order.
    """
    def _edge_cooccurrence(edge):
        # Parse '<protein>_<residue>' labels; residues are converted to 0-based.
        protein1 = edge[0].split('_')[0]
        residue1 = int(edge[0].split('_')[1]) - 1
        protein2 = edge[1].split('_')[0]
        residue2 = int(edge[1].split('_')[1]) - 1
        file1 = fasta_folder + os.sep + protein1 + '.afasta'
        file2 = fasta_folder + os.sep + protein2 + '.afasta'
        return perform_residue_analysis(file1, file2, residue1, residue2)

    g = util.read_graphml(graphml_file)
    top_n_edges = util.get_edges(g, start=0, end=50, reverse=True)
    bottom_n_edges = util.get_edges(g, start=0, end=50, reverse=False)

    top_50_edges_cooccurence_counts = []
    for edge in top_n_edges:
        rcc = _edge_cooccurrence(edge)
        print(edge)
        print(sorted(rcc.items(), key=operator.itemgetter(1), reverse=True))
        top_50_edges_cooccurence_counts.append(rcc)
    print('=====================')
    bottom_50_edges_cooccurence_counts = []
    for edge in bottom_n_edges:
        rcc = _edge_cooccurrence(edge)
        print(sorted(rcc.items(), key=operator.itemgetter(1), reverse=True))
        bottom_50_edges_cooccurence_counts.append(rcc)
    return top_50_edges_cooccurence_counts, bottom_50_edges_cooccurence_counts
def run_in_out_acc_comparision(dataset, graphml_file, folder):
    """Compute average solvent accessibility inside vs. outside the network.

    Runs the per-protein accessibility comparison for ha/m1/na/np/ns1 and
    collects the averages.

    Returns (avg_in_acc_dict, avg_out_acc_dict): dicts keyed by protein name
    (plus a 'dataset' entry) holding in-network and out-of-network averages.
    """
    avg_in_acc_dict = {'dataset': dataset}
    avg_out_acc_dict = {'dataset': dataset}
    g = util.read_graphml(graphml_file)
    in_residues = get_residues_from_graph(g)
    dssp = DSSPData()
    # Dispatch table: one code path instead of five copy-pasted call blocks.
    runners = [
        ('ha', run_in_out_acc_ha),
        ('m1', run_in_out_acc_m1),
        ('na', run_in_out_acc_na),
        ('np', run_in_out_acc_np),
        ('ns1', run_in_out_acc_ns1),
    ]
    for protein, runner in runners:
        avg_in, avg_out = runner(in_residues, dssp, g, folder)
        avg_in_acc_dict[protein] = avg_in
        avg_out_acc_dict[protein] = avg_out
    return avg_in_acc_dict, avg_out_acc_dict
def get_cooccurences(graphml_file, fasta_folder):
    """Gather co-occurrence counts for the 50 highest- and 50 lowest-weight edges.

    Node labels have the form '<protein>_<1-based residue>'.  Returns two lists
    of per-edge co-occurrence dicts: (top 50, bottom 50).
    """
    g = util.read_graphml(graphml_file)
    strongest = util.get_edges(g, start=0, end=50, reverse=True)
    weakest = util.get_edges(g, start=0, end=50, reverse=False)
    top_counts = []
    for edge in strongest:
        name1 = edge[0].split('_')[0]
        pos1 = int(edge[0].split('_')[1]) - 1
        name2 = edge[1].split('_')[0]
        pos2 = int(edge[1].split('_')[1]) - 1
        rcc = perform_residue_analysis(
            fasta_folder + os.sep + name1 + '.afasta',
            fasta_folder + os.sep + name2 + '.afasta',
            pos1, pos2)
        print(edge)
        print(sorted(rcc.items(), key=operator.itemgetter(1), reverse=True))
        top_counts.append(rcc)
    print('=====================')
    bottom_counts = []
    for edge in weakest:
        name1 = edge[0].split('_')[0]
        pos1 = int(edge[0].split('_')[1]) - 1
        name2 = edge[1].split('_')[0]
        pos2 = int(edge[1].split('_')[1]) - 1
        rcc = perform_residue_analysis(
            fasta_folder + os.sep + name1 + '.afasta',
            fasta_folder + os.sep + name2 + '.afasta',
            pos1, pos2)
        print(sorted(rcc.items(), key=operator.itemgetter(1), reverse=True))
        bottom_counts.append(rcc)
    return top_counts, bottom_counts
def run(infilename):
    """Build the protein-level graph from *infilename*, write it back, and plot it.

    The plot is saved next to the input, with the extension replaced by '.png'.
    """
    source_graph = read_graphml(infilename)
    protein_graph = create_protein_graph(source_graph)
    write_graphml(protein_graph, infilename)
    png_name = infilename.split('.')[0] + '.png'
    create_plot(protein_graph, png_name)

#run('u800d_05_B01b.graphml')
#compare_two_graphs('all_05_01.graphml', '800d_05_01.graphml')
def compare_two_graphs(file1, file2):
    """Print node-overlap statistics for two graphml files.

    Prints: node count of each graph, the number of shared nodes, and the
    number of nodes unique to each graph.
    """
    g1 = read_graphml(file1)
    g2 = read_graphml(file2)
    nodes1 = g1.nodes()
    nodes2 = g2.nodes()
    # Set membership makes the overlap test O(1) per node instead of the
    # original O(n) list scan (O(n^2) overall).
    set1 = set(nodes1)
    set2 = set(nodes2)
    matches = [node for node in nodes1 if node in set2]
    in1only = [node for node in nodes1 if node not in set2]
    in2only = [node for node in nodes2 if node not in set1]
    print(len(nodes1), len(nodes2), len(matches), len(in1only), len(in2only))
def run_in_out_acc_comparision():
    """CLI entry point: run all per-protein accessibility comparisons.

    Expects sys.argv[1] = graphml file, sys.argv[2] = fasta folder.
    """
    graphml_file = sys.argv[1]
    folder = sys.argv[2]
    graph = util.read_graphml(graphml_file)
    residues = get_residues_from_graph(graph)
    dssp = DSSPData()
    for runner in (run_in_out_acc_ha, run_in_out_acc_m1, run_in_out_acc_na,
                   run_in_out_acc_np, run_in_out_acc_ns1):
        runner(residues, dssp, graph, folder)
def run(folder, infilename, title): ingraph = read_graphml(folder + os.sep + infilename) try: #avg_cluster_coeff = nx.average_clustering(ingraph) #print('average clustering for ' + title + " = " + str(avg_cluster_coeff)) avg_deg_coeff = nx.average_degree_connectivity(ingraph) print('average degree for ' + title + ' = ' + str(avg_deg_coeff)) except Exception as e: print (e.__str__()) '''
def run_in_out_entropy_comparision():
    """CLI entry point: print per-protein in/out-of-network average entropies.

    Expects sys.argv[1] = graphml file, sys.argv[2] = fasta folder.
    """
    graphml_file = sys.argv[1]
    folder = sys.argv[2]
    graph = util.read_graphml(graphml_file)
    network_residues = get_residues_from_graph(graph)
    for protein in ['ha', 'na', 'm1', 'm2', 'np', 'pb1', 'pb2', 'pa', 'ns1', 'ns2']:
        alignment_path = folder + os.sep + protein + '.afasta'
        records = AlignIO.read(alignment_path, 'fasta')
        seqs = [rec.seq for rec in records]
        position_entropies = util.entropy_all_positions(seqs)
        avg_in, avg_out = in_out_entropies(
            position_entropies, network_residues[protein], folder, protein)
        print('averages for ', protein, ' ', avg_in, avg_out)
def run_pairwise_comparision():
    """CLI entry point: analyze residue pairs for the graph's ten best edges.

    Expects sys.argv[1] = graphml file, sys.argv[2] = fasta folder.  Node
    labels have the form '<protein>_<1-based residue>'.
    """
    graphml_file = sys.argv[1]
    folder = sys.argv[2]
    g = util.read_graphml(graphml_file)
    for edge in get_best_edges_from_graph(g, 10):
        protein1 = edge[0].split('_')[0]
        residue1 = int(edge[0].split('_')[1]) - 1
        protein2 = edge[1].split('_')[0]
        residue2 = int(edge[1].split('_')[1]) - 1
        print(protein1, protein2, residue1, residue2)
        perform_residue_analysis(
            folder + os.sep + protein1 + '.afasta',
            folder + os.sep + protein2 + '.afasta',
            residue1, residue2)
def run_in_out_entropy_comparision(dataset, graphml_file, folder):
    """Split each protein's average entropy into overall / in-network / out-of-network.

    Returns three dicts keyed by protein name plus a 'dataset' entry:
    (all-position averages, in-network averages, out-of-network averages).
    """
    graph = util.read_graphml(graphml_file)
    residues_in_network = get_residues_from_graph(graph)
    all_avgs = {'dataset': dataset}
    in_avgs = {'dataset': dataset}
    out_avgs = {'dataset': dataset}
    protein_names = ['ha', 'na', 'm1', 'm2', 'np', 'pb1', 'pb2', 'pa', 'ns1', 'ns2']
    for name in protein_names:
        aln = AlignIO.read(folder + os.sep + name + '.afasta', 'fasta')
        entropies = util.entropy_all_positions([rec.seq for rec in aln])
        avg_total, avg_in, avg_out = in_out_entropies(
            entropies, residues_in_network[name], folder, name)
        all_avgs[name] = avg_total
        in_avgs[name] = avg_in
        out_avgs[name] = avg_out
        print('averages for ', dataset, ' ', name, ' ', avg_in, avg_out)
    return all_avgs, in_avgs, out_avgs
def run(folder, infilename, title):
    """Create clustering and degree plots for the graph at folder/infilename.

    The plots are written into *folder* with *title* used for labelling.
    """
    import os  # local import keeps the portability fix self-contained
    # os.path.join instead of a hard-coded '\\': the original separator was
    # Windows-only, while sibling code in this project uses os.sep.
    ingraph = read_graphml(os.path.join(folder, infilename))
    create_clustering_plot(ingraph, folder=folder, title=title)
    create_degree_plot(ingraph, folder=folder, title=title)
def run(folder, infilename, title):
    """Build and persist the protein-level (macro) graph, then plot it.

    Reads folder/infilename, writes the macro graphml back into *folder*, and
    saves a plot named after the input file with a '.png' extension.
    """
    source = read_graphml(folder + os.sep + infilename)
    macro = create_protein_graph(source)
    write_macro_graphml(macro, folder, infilename)
    png_name = infilename.split('.')[0] + '.png'
    create_plot(macro, folder, png_name, title)