# Standard-library and third-party imports used by the functions below.
import argparse
import glob
import os
import random

import numpy as np
import pandas as pd

# The functions additionally rely on project-specific helpers that are assumed
# to be importable from the surrounding package (exact module paths depend on
# the package layout): io (alignment IO, amino acid index tables), pdb
# (distance maps), raw (binary raw coupling files), plots, bu, ali, ali_ut,
# weighting, counts, and AlignmentFeatures.


def generate_data(contact_threshold, sequence_separation, pdb_dir, psicov_dir):

    number_contacts = {}
    for contact_thr in contact_threshold:
        number_contacts[contact_thr] = {}
        for seqsep in sequence_separation:
            number_contacts[contact_thr][seqsep] = {'L': [], 'number of contacts': []}

    alignment_files = glob.glob(psicov_dir + "/*psc")
    for alignment_file in alignment_files:

        pdb_file = pdb_dir + "/" + os.path.basename(alignment_file).split(".")[0] + ".pdb"
        if not os.path.exists(pdb_file):
            continue

        print os.path.basename(alignment_file).split(".")[0]

        # protein length = length of the first (query) sequence in the alignment
        L = len(open(alignment_file).readline().rstrip())
        distance_map = pdb.distance_map(pdb_file, L)

        for contact_thr in contact_threshold:
            residue_i, residue_j = np.where(distance_map < contact_thr)
            for seqsep in sequence_separation:
                indices_seq_sep = np.where(residue_j - residue_i > seqsep)
                number_contacts[contact_thr][seqsep]["L"].append(L)
                number_contacts[contact_thr][seqsep]["number of contacts"].append(len(indices_seq_sep[0]))

    return number_contacts
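# Hedged usage sketch for generate_data(); the directory paths and the
# threshold/separation lists below are illustrative placeholders, not values
# taken from the original scripts. It flattens the nested result dict into a
# tidy DataFrame with one row per protein and setting.
contact_thresholds = [6, 8, 10]
sequence_separations = [6, 12, 24]
number_contacts = generate_data(contact_thresholds, sequence_separations,
                                pdb_dir="/path/to/pdb", psicov_dir="/path/to/psicov")

rows = []
for thr, per_seqsep in number_contacts.items():
    for seqsep, data in per_seqsep.items():
        for L, n in zip(data['L'], data['number of contacts']):
            rows.append({'contact_threshold': thr, 'seqsep': seqsep,
                         'L': L, 'number of contacts': n})
df = pd.DataFrame(rows)
print df.groupby(['contact_threshold', 'seqsep'])['number of contacts'].mean()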
def collect_data(braw_dirs, alignment_dir, pdb_dir, bin_size, ab):

    # define distance bins
    bins = [0, 5, 8, 12, 15, 20, np.inf]
    max_nr_couplings_per_protein = 500
    methods = braw_dirs.keys()

    couplings_per_bin = {}
    for method in methods:
        couplings_per_bin[method] = {}
        for bin in range(len(bins) - 1):
            bin_name = str(bin + 1) + ": " + str(bins[bin]) + "-" + str(bins[bin + 1])
            couplings_per_bin[method][bin_name] = []

    # iterate over proteins
    psc_files = glob.glob(alignment_dir + "/*psc")
    for psc_file in psc_files:
        # psc_file = psc_files[0]
        protein = os.path.basename(psc_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        # check if ALL braw files exist
        braw_files = {}
        for method in methods:
            braw_files[method] = braw_dirs[method] + "/" + protein + ".filt.braw.gz"
        if any([not os.path.exists(braw_files[method]) for method in methods]):
            print("Skip this protein (not all braw files exist).")
            continue

        alignment = io.read_alignment(psc_file, format="psicov")
        distance_map = pdb.distance_map(pdb_file, alignment.shape[1])

        diversity = np.sqrt(alignment.shape[0]) / alignment.shape[1]
        if diversity < 0.3:
            print("Skip this protein (low diversity = {0}).".format(diversity))
            continue

        # read braw files
        braw = {}
        for method in methods:
            if ab == 'all':
                braw[method] = bu.compute_l2norm_from_brawfile(braw_files[method], apc=True)
            else:
                braw[method] = raw.parse_msgpack(braw_files[method])

        # mask highly gapped positions
        gaps = ali.compute_gaps_per_position(alignment)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # iterate over pairs for bins
        for bin in range(len(bins) - 1):
            cb_lower = bins[bin]
            cb_upper = bins[bin + 1]
            bin_name = sorted(couplings_per_bin[methods[0]].keys())[bin]

            residue_indices = np.where((distance_map > cb_lower) & (distance_map < cb_upper))

            # shuffle indices to remove positioning bias
            c = list(zip(residue_indices[0], residue_indices[1]))
            if len(c) == 0:
                # no residue pairs fall into this distance bin for this protein
                continue
            random.shuffle(c)
            residue_indices = zip(*c)

            for method in methods:
                if len(couplings_per_bin[method][bin_name]) < bin_size:
                    if ab == 'all':
                        ab_coupling = braw[method][residue_indices[0], residue_indices[1]].tolist()[:max_nr_couplings_per_protein]
                    else:
                        ab_coupling = braw[method].x_pair[residue_indices[0], residue_indices[1],
                                                          io.AMINO_INDICES[ab[0]], io.AMINO_INDICES[ab[2]]].tolist()[:max_nr_couplings_per_protein]
                    couplings_per_bin[method][bin_name].extend(ab_coupling)

            print("\nprotein {0} bin: {1:<8} size: {2}".format(
                protein, bin_name, len(couplings_per_bin[methods[0]][bin_name])))

        # stop condition: all bins are full
        if all([len(v) >= bin_size for v in couplings_per_bin[methods[0]].values()]):
            break

    return couplings_per_bin
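# Hedged usage sketch for the binned, multi-method collect_data(); the method
# names and directory paths are illustrative placeholders. Passing ab='all'
# collects APC-corrected coupling norms instead of a single w_ij(a,b) entry.
braw_dirs = {
    'pseudo-likelihood': '/path/to/pll/braw/',
    'contrastive-divergence': '/path/to/cd/braw/',
}
couplings_per_bin = collect_data(braw_dirs, '/path/to/psicov/', '/path/to/pdb/',
                                 bin_size=10000, ab='E-R')
for method, bins_per_method in couplings_per_bin.items():
    for bin_name in sorted(bins_per_method.keys()):
        print("{0:<25} {1:<12} {2} couplings".format(
            method, bin_name, len(bins_per_method[bin_name])))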
def collect_data(pdb_dir, alignment_dir, distance_definition, size):

    pdb_files = os.listdir(pdb_dir + "/")
    sequence_separations = [1, 6, 12, 24]

    distances_ab = {}
    for seq_sep in sequence_separations:
        distances_ab[seq_sep] = {}
        for a in io.AMINO_ACIDS[:20]:
            for b in io.AMINO_ACIDS[:20]:
                distances_ab[seq_sep][a + "-" + b] = []

    for pdb_file in pdb_files[:size]:
        # pdb_file = pdb_files[0]
        protein = os.path.basename(pdb_file).split(".")[0]
        print protein

        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(alignment_file):
            continue

        alignment = io.read_alignment(alignment_file)
        L = alignment.shape[1]
        query_sequence = alignment[0]

        dist_matrix = pdb.distance_map(pdb_dir + "/" + pdb_file, L, distance_definition)

        for seq_sep in sequence_separations:
            indices_upper_tri_i, indices_upper_tri_j = np.triu_indices(L, k=seq_sep)
            if len(indices_upper_tri_i) == 0:
                continue

            distances_ab_seqsep = dist_matrix[indices_upper_tri_i, indices_upper_tri_j]
            AA_a = query_sequence[indices_upper_tri_i]
            AA_b = query_sequence[indices_upper_tri_j]

            for pair in range(len(indices_upper_tri_i)):
                ab = io.AMINO_ACIDS[AA_a[pair]] + "-" + io.AMINO_ACIDS[AA_b[pair]]
                if AA_a[pair] == 20 or AA_b[pair] == 20:
                    continue
                distances_ab[seq_sep][ab].extend(
                    list(distances_ab_seqsep[pair][~np.isnan(distances_ab_seqsep[pair])]))

        # if ab == 'all':
        #     indices_a = range(L)
        #     indices_b = range(L)
        # else:
        #     query_sequence = alignment[0]
        #     indices_a = np.where(query_sequence == io.AMINO_INDICES[a])[0]
        #     indices_b = np.where(query_sequence == io.AMINO_INDICES[b])[0]
        # grid_indices_ab_pairs = [(x, y) for x in indices_a for y in indices_b]
        #
        # if len(grid_indices_ab_pairs) == 0:
        #     continue
        #
        # dist_matrix = pdb.distance_map(pdb_dir + "/" + pdb_file, L, distance_definition)
        #
        # for seq_sep in sequence_separations:
        #
        #     if len(distances_ab[seq_sep]) < size:
        #         indices_upper_tri_i, indices_upper_tri_j = np.triu_indices(L, k=seq_sep)
        #
        #         if len(indices_upper_tri_i) == 0:
        #             continue
        #
        #         indices_seqsep = list(set(zip(indices_upper_tri_i, indices_upper_tri_j)).intersection(grid_indices_ab_pairs))
        #         if len(indices_seqsep) == 0:
        #             continue
        #
        #         indices_a_seqsep, indices_b_seqsep = zip(*indices_seqsep)
        #         distances_ab_seqsep = dist_matrix[indices_a_seqsep, indices_b_seqsep]
        #         distances_ab[seq_sep].extend(distances_ab_seqsep[~np.isnan(distances_ab_seqsep)])
        #
        # for seq_sep in sequence_separations:
        #     print(protein + " seq sep " + str(seq_sep) + ": " + str(len(distances_ab[seq_sep])))
        #
        # if all([len(distances_ab[seq_sep]) >= size for seq_sep in sequence_separations]):
        #     break

    for seq_sep in distances_ab.keys():
        distances_ab[seq_sep]['all'] = np.concatenate(distances_ab[seq_sep].values())

    return distances_ab
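# Hedged usage sketch; the directory paths and the distance definition value
# are illustrative placeholders. Summarises the collected distance
# distributions as a median distance per amino-acid pair at one sequence
# separation.
distances_ab = collect_data('/path/to/pdb/', '/path/to/psicov/',
                            distance_definition='cb_distance', size=500)
seq_sep = 12
medians = {pair: np.median(values)
           for pair, values in distances_ab[seq_sep].items()
           if pair != 'all' and len(values) > 0}
for pair in sorted(medians, key=medians.get)[:10]:
    print("{0}: median distance = {1:.2f}".format(pair, medians[pair]))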
def main():

    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir", type=str, help="path to binary raw files")
    parser.add_argument("pdb_dir", type=str, help="path to pdb files")
    parser.add_argument("alignment_dir", type=str, help="path to alignment files")
    parser.add_argument("nr_couplings", type=int, default=10000, help="number of couplings")
    parser.add_argument("plot_out", type=str, help="path to plot file")
    parser.add_argument("max_per_protein", type=int, default=100, help="maximum number of couplings per protein")

    args = parser.parse_args()

    braw_dir = args.braw_dir
    pdb_dir = args.pdb_dir
    alignment_dir = args.alignment_dir
    nr_couplings = args.nr_couplings
    plot_out = args.plot_out
    max_per_protein = args.max_per_protein

    # debugging: hard-coded settings below override the parsed arguments
    braw_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/ccmpredpy_cd/braw/"
    pdb_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/pdb_renum_combs/"
    alignment_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/psicov/"
    nr_couplings = 20000
    plot_out = '/home/vorberg/'
    max_per_protein = 100

    if not os.path.exists(braw_dir):
        raise IOError("Braw Path {0} does not exist.".format(braw_dir))

    coupling_df = pd.DataFrame(columns=range(400) + ['Neff'])

    braw_files = glob.glob(braw_dir + "/*braw*")
    for braw_file in braw_files:

        if len(coupling_df) > nr_couplings:
            break

        protein = os.path.basename(braw_file).split(".")[0]
        print protein

        #------------- get couplings and metadata -------------
        braw = raw.parse_msgpack(braw_file)
        meta = braw.meta
        neff = meta['workflow'][0]['parameters']['msafile']['neff']
        L = meta['workflow'][0]['parameters']['msafile']['ncol']
        N = meta['workflow'][0]['parameters']['msafile']['nrow']
        diversity = np.sqrt(N) / L

        #------------- filter contacts -------------
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        dist_matrix = pdb.distance_map(pdb_file)

        # contact map: True where the Cbeta distance is below 8 Angstrom
        contact_map = dist_matrix < 8

        # select all residue pairs within the contact threshold
        indices_contact = list(np.where(np.triu(contact_map, k=1)))

        #------------- filter gap columns -------------
        psicov_file = alignment_dir + "/" + protein + ".filt.psc"
        psicov = io.read_alignment(psicov_file)

        percent_gaps_per_column = [float(psicov[:, l].tolist().count(0)) / N for l in range(L)]
        columns_with_many_gaps = [i for i, j in enumerate(percent_gaps_per_column) if j > 0.2]

        index_delete_contact_i = [index for index in range(len(indices_contact[0]))
                                  if indices_contact[0][index] in columns_with_many_gaps]
        index_delete_contact_j = [index for index in range(len(indices_contact[1]))
                                  if indices_contact[1][index] in columns_with_many_gaps]

        # delete column pairs from indices_contact
        indices_contact[0] = np.delete(indices_contact[0], np.unique(index_delete_contact_i + index_delete_contact_j))
        indices_contact[1] = np.delete(indices_contact[1], np.unique(index_delete_contact_i + index_delete_contact_j))

        nr_contacts = len(indices_contact[0])
        if nr_contacts == 0:
            continue

        random_sample = np.random.choice(range(nr_contacts), replace=False,
                                         size=np.min([max_per_protein, nr_contacts]))

        couplings = braw.x_pair[indices_contact[0][random_sample],
                                indices_contact[1][random_sample], :20, :20].reshape(len(random_sample), 400)

        df = pd.DataFrame(couplings)
        df['L'] = L
        df['Neff'] = neff
        df['Diversity'] = diversity
        df['sum_wij'] = couplings.sum(1)
        df['ratio_0.2L_Neff'] = 0.2 * L / neff

        coupling_df = coupling_df.append(df)
        print "nr of couplings: {0}".format(len(coupling_df))

    plot_file = plot_out + "/coupling_matrix_neff_" + str(nr_couplings) + ".html"
    plots.plot_coupling_vs_neff(coupling_df, 'Neff', plot_file)

    plot_file = plot_out + "/coupling_matrix_diversity_" + str(nr_couplings) + ".html"
    plots.plot_coupling_vs_neff(coupling_df, 'Diversity', plot_file)

    plot_file = plot_out + "/coupling_matrix_L_" + str(nr_couplings) + ".html"
    plots.plot_coupling_vs_neff(coupling_df, 'L', plot_file)

    plot_file = plot_out + "/coupling_matrix_ratio_0.2L_Neff_" + str(nr_couplings) + ".html"
    plots.plot_coupling_vs_neff(coupling_df, 'ratio_0.2L_Neff', plot_file)
def main():

    parser = argparse.ArgumentParser(
        description="Filter out proteins whose alignment or structure does not meet the given criteria.")
    parser.add_argument("-a", "--alignment", dest="ali", help="path to alignment files")
    parser.add_argument("-p", "--pdb", dest="pdb", help="path to pdb files")
    parser.add_argument("-o", "--output", dest="output", help="path to filter directory")
    parser.add_argument("--min-N", dest="minN", default=10, type=int, help="Minimum number of sequences")
    parser.add_argument("--max-gap-percentage", dest="maxGap", default=0.8, type=float, help="Maximum percentage of gaps in alignment")
    parser.add_argument("--max-L", dest="maxL", default=600, type=float, help="Maximum length of protein")
    parser.add_argument("--min-L", dest="minL", default=20, type=float, help="Minimum length of protein")
    parser.add_argument("--min-contacts", dest="mincontacts", default=1, type=int, help="Minimum number of contacts")
    parser.add_argument("--contact-threshold", dest="contact_threshold", default=8, type=int,
                        help="Contact defined as distance between Cbeta atoms < threshold")
    parser.add_argument("--sequence-separation", dest="seqsep", default=12, type=int,
                        help="Consider only residues separated by this many positions in sequence.")

    args = parser.parse_args()

    alignment_dir = args.ali
    pdb_dir = args.pdb
    output_dir = args.output
    minL = args.minL
    maxL = args.maxL
    minN = args.minN
    maxgappercentage = args.maxGap
    mincontacts = args.mincontacts
    contact_threshold = args.contact_threshold
    seqsep = args.seqsep

    aln_files = glob.glob(alignment_dir + "/*")

    for alignment_file in aln_files:

        protein = os.path.basename(alignment_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip protein.".format(pdb_file))
            continue

        alignment = io.read_alignment(alignment_file, format="psicov")
        N = alignment.shape[0]
        L = alignment.shape[1]
        percent_gaps = np.mean(ali_ut.compute_gaps_per_position(alignment))

        distance_map = pdb.distance_map(pdb_file, L)
        nr_contacts = np.sum((distance_map[np.triu_indices(L, k=seqsep)] < contact_threshold) * 1)

        filter = False
        if N < minN:
            print("Alignment size {0} is smaller than filter threshold of {1}".format(N, minN))
            filter = True

        if L < minL:
            print("Protein length {0} is smaller than filter threshold of {1}".format(L, minL))
            filter = True

        if L > maxL:
            print("Protein length {0} is bigger than filter threshold of {1}".format(L, maxL))
            filter = True

        if percent_gaps > maxgappercentage:
            print("Percentage of gaps in alignment ({0}) is larger than filter threshold of {1}".format(percent_gaps, maxgappercentage))
            filter = True

        if nr_contacts < mincontacts:
            print("Number of contacts (contact_thr = {0}, sequence separation = {1}) in protein structure ({2}) is less than {3}".format(
                contact_threshold, seqsep, nr_contacts, mincontacts))
            filter = True

        if filter:
            dest_alignment_file = output_dir + "/" + os.path.basename(alignment_file)
            os.rename(alignment_file, dest_alignment_file)
            print("Successfully moved {0} to {1}".format(alignment_file, dest_alignment_file))
def collect_data(braw_dir, alignment_dir, pdb_dir, ab, cd, cb_lower, cb_upper):

    couplings = {ab: [], cd: []}

    max_nr_couplings_per_protein = 500
    sequence_separation = 10
    evidence_threshold = 80
    max_nr_couplings = 5000
    diversity_thr = 0.3

    a = ab[0]
    b = ab[2]
    c = cd[0]
    d = cd[2]

    # iterate over proteins
    braw_files = glob.glob(braw_dir + "/*braw.gz")
    for braw_file in braw_files:
        # braw_file = braw_files[0]

        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(pdb_file))
            continue

        if not os.path.exists(braw_file):
            print("Braw file {0} does not exist. Skip this protein.".format(braw_file))
            continue

        if not os.path.exists(alignment_file):
            print("Alignment file {0} does not exist. Skip this protein.".format(alignment_file))
            continue

        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)
        diversity = np.sqrt(AF.N) / AF.L
        if diversity < diversity_thr:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        # mask highly gapped positions
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # residue pairs within the specified Cbeta distance range
        residue_i, residue_j = np.where((distance_map > cb_lower) & (distance_map < cb_upper))

        Nij = AF.Nij[residue_i, residue_j]
        q_i_a = AF.single_frequencies[residue_i, io.AMINO_INDICES[a]]
        q_j_b = AF.single_frequencies[residue_j, io.AMINO_INDICES[b]]
        q_i_c = AF.single_frequencies[residue_i, io.AMINO_INDICES[c]]
        q_j_d = AF.single_frequencies[residue_j, io.AMINO_INDICES[d]]

        evidence_ab = Nij * q_i_a * q_j_b
        evidence_cd = Nij * q_i_c * q_j_d

        # keep only residue pairs with sufficient evidence for both amino acid pairs
        residue_i = residue_i[(evidence_ab > evidence_threshold) & (evidence_cd > evidence_threshold)]
        residue_j = residue_j[(evidence_ab > evidence_threshold) & (evidence_cd > evidence_threshold)]

        if len(residue_i) == 0:
            continue

        ab_coupling = braw.x_pair[residue_i, residue_j, io.AMINO_INDICES[a], io.AMINO_INDICES[b]].tolist()[:max_nr_couplings_per_protein]
        cd_coupling = braw.x_pair[residue_i, residue_j, io.AMINO_INDICES[c], io.AMINO_INDICES[d]].tolist()[:max_nr_couplings_per_protein]

        couplings[ab].extend(ab_coupling)
        couplings[cd].extend(cd_coupling)

        print("\nprotein {0} size: {1}".format(protein, len(couplings[ab])))

        # stop condition: enough couplings collected
        if len(couplings[ab]) >= max_nr_couplings:
            break

    return couplings
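# Hedged usage sketch; directory paths are illustrative placeholders. It
# compares w_ij(a,b) against w_ij(c,d) for residue pairs whose distance lies
# between cb_lower and cb_upper and whose evidence Nij*q_i(a)*q_j(b) passes
# the threshold set inside collect_data.
couplings = collect_data('/path/to/braw/', '/path/to/psicov/', '/path/to/pdb/',
                         ab='R-E', cd='E-R', cb_lower=0, cb_upper=8)
n = min(len(couplings['R-E']), len(couplings['E-R']))
if n > 1:
    correlation = np.corrcoef(couplings['R-E'][:n], couplings['E-R'][:n])[0, 1]
    print("Pearson correlation of R-E vs E-R couplings: {0:.3f}".format(correlation))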
def collect_data(braw_dir, alignment_dir, pdb_dir, ab):

    # define distance bins
    couplings_per_bin = {
        'bin1': {'couplings': [], 'lower': 0,  'upper': 8},
        'bin2': {'couplings': [], 'lower': 5,  'upper': 10},
        'bin3': {'couplings': [], 'lower': 8,  'upper': 12},
        'bin4': {'couplings': [], 'lower': 10, 'upper': 15},
        'bin5': {'couplings': [], 'lower': 20, 'upper': 50}
    }

    max_nr_couplings_per_protein = 500
    sequence_separation = 10
    evidence_threshold = 100
    max_couplings_per_bin = 10000

    a = ab[0]
    b = ab[2]

    # iterate over proteins
    braw_files = glob.glob(braw_dir + "/*braw.gz")
    for braw_file in braw_files:
        # braw_file = braw_files[0]

        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(pdb_file))
            continue

        if not os.path.exists(braw_file):
            print("Braw file {0} does not exist. Skip this protein.".format(braw_file))
            continue

        if not os.path.exists(alignment_file):
            print("Alignment file {0} does not exist. Skip this protein.".format(alignment_file))
            continue

        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)
        diversity = np.sqrt(AF.N) / AF.L
        if diversity < 0.3:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        # mask highly gapped positions
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # iterate over pairs for bins
        for bin_name in sorted(couplings_per_bin.keys(), reverse=True):

            if len(couplings_per_bin[bin_name]['couplings']) >= max_couplings_per_bin:
                continue

            cb_lower = couplings_per_bin[bin_name]['lower']
            cb_upper = couplings_per_bin[bin_name]['upper']

            residue_i, residue_j = np.where((distance_map > cb_lower) & (distance_map < cb_upper))

            # number of effective sequences without gaps at positions i and j
            Nij = AF.Nij[residue_i, residue_j]
            q_i_a = AF.single_frequencies[residue_i, io.AMINO_INDICES[a]]
            q_j_b = AF.single_frequencies[residue_j, io.AMINO_INDICES[b]]
            evidence = Nij * q_i_a * q_j_b

            residue_i = residue_i[evidence > evidence_threshold]
            residue_j = residue_j[evidence > evidence_threshold]

            if len(residue_i) == 0:
                continue

            ab_coupling = braw.x_pair[residue_i, residue_j, io.AMINO_INDICES[a], io.AMINO_INDICES[b]].tolist()[:max_nr_couplings_per_protein]
            couplings_per_bin[bin_name]['couplings'].extend(ab_coupling)

        for bin_name in sorted(couplings_per_bin.keys(), reverse=True):
            print("\nprotein {0} {1:<8} size: {2}".format(
                protein, bin_name, len(couplings_per_bin[bin_name]['couplings'])))

        # stop condition: all bins are full
        if all([len(bindict['couplings']) >= max_couplings_per_bin for bindict in couplings_per_bin.values()]):
            break

    return couplings_per_bin
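# Hedged usage sketch; paths are illustrative placeholders. Flattens the
# per-bin coupling lists into a tidy DataFrame, e.g. for a density plot of
# w_ij(a,b) stratified by distance bin.
couplings_per_bin = collect_data('/path/to/braw/', '/path/to/psicov/',
                                 '/path/to/pdb/', ab='E-R')
df = pd.concat([
    pd.DataFrame({'bin': bin_name,
                  'lower': bindict['lower'],
                  'upper': bindict['upper'],
                  'coupling': bindict['couplings']})
    for bin_name, bindict in sorted(couplings_per_bin.items())
])
print df.groupby('bin')['coupling'].describe()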
def main():

    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')
    parser.add_argument("braw_dir", type=str, help="path to binary raw files")
    parser.add_argument("alignment_dir", type=str, help="path to alignment files")
    parser.add_argument("pdb_dir", type=str, help="path to pdb files")
    parser.add_argument("ab", type=str, help="amino acid pair ab, e.g. R-E")
    parser.add_argument("cd", type=str, help="amino acid pair cd, e.g. E-R")
    parser.add_argument("dist_lower", type=int, default=0, help="Lower Cbeta distance threshold")
    parser.add_argument("dist_upper", type=int, default=8, help="Upper Cbeta distance threshold")
    parser.add_argument("Nij_threshold", type=int, default=100,
                        help="Minimum number of non-gapped sequences at positions i and j")
    parser.add_argument("size", type=int, help="number of pairs ij")
    parser.add_argument("plot_dir", type=str, help="where to save the plot")

    args = parser.parse_args()

    braw_dir = args.braw_dir
    pdb_dir = args.pdb_dir
    alignment_dir = args.alignment_dir
    ab = args.ab
    cd = args.cd
    dist_lower = args.dist_lower
    dist_upper = args.dist_upper
    Nij_threshold = args.Nij_threshold
    size = args.size
    plot_dir = args.plot_dir

    # debugging
    # pdb_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/pdb_renum_combs/"
    # braw_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/ccmpred-pll-centerv/braw/"
    # alignment_dir = "/home/vorberg/work/data/benchmarkset_cathV4.1/psicov/"
    # ab='R-E'
    # cd='E-R'
    # dist_lower = 0
    # dist_upper = 8
    # Nij_threshold = 100
    # size = 10000
    # plot_dir='/home/vorberg/'

    braw_files = glob.glob(braw_dir + "/*braw.gz")

    couplings = {}
    couplings[ab] = []
    couplings[cd] = []

    for braw_file in braw_files:

        if len(couplings[ab]) > size:
            break

        if not os.path.exists(braw_file):
            print("Braw File " + str(braw_file) + " cannot be found.")
            continue

        braw = raw.parse_msgpack(braw_file)
        L = braw.ncol
        protein = os.path.basename(braw_file).split(".")[0]

        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(alignment_file):
            print("Alignment File " + str(alignment_file) + " cannot be found.")
            continue

        pdb_file = pdb_dir + "/" + protein.replace("_", "") + ".pdb"
        if not os.path.exists(pdb_file):
            print("PDB File " + str(pdb_file) + " cannot be found.")
            continue

        print protein

        indices_upper_tri = np.triu_indices(L, k=1)

        # filter pair indices that have the specified Cb distances
        dist_matrix = pdb.distance_map(pdb_file, L)
        indices_dist_true = np.where((dist_matrix[indices_upper_tri] > dist_lower) &
                                     (dist_matrix[indices_upper_tri] < dist_upper))[0]

        # filter pair indices that have more than Nij_threshold ungapped sequences
        alignment = io.read_alignment(alignment_file)
        weights = weighting.calculate_weights_simple(alignment, 0.8, True)
        pairwise_counts = counts.pair_counts(alignment, weights)
        Nij = pairwise_counts[:, :, :20, :20].sum(3).sum(2)
        indices_Nij_true = np.where(Nij[indices_upper_tri] > Nij_threshold)[0]

        # get pair indices that fulfill both requirements
        indices_merge = list(set(indices_dist_true).intersection(indices_Nij_true))

        # get couplings for filtered pairs
        braw_reshaped = braw.x_pair[:, :, :20, :20].reshape(L, L, 400)
        couplings[ab].extend(braw_reshaped[indices_upper_tri][indices_merge][:, io.AB_INDICES[ab]])
        couplings[cd].extend(braw_reshaped[indices_upper_tri][indices_merge][:, io.AB_INDICES[cd]])

        print "Nr of couplings: {0}".format(len(couplings[ab]))

    plot_file = (plot_dir + "/pairwise_couplings_" + ab + "_" + cd +
                 "_Nijthreshold" + str(Nij_threshold) +
                 "_Cbdistance_" + str(dist_lower) + "_" + str(dist_upper) + ".html")
    title = "Couplings {0} vs {1} <br> Nij threshold: {2}, {3} <= Cb_ij <= {4}".format(
        ab, cd, Nij_threshold, dist_lower, dist_upper)
    plots.plot_pairwise_couplings_density(couplings, title, plot_out=plot_file)
def collect_data(braw_dir, alignment_dir, pdb_dir, pairs, lower_cb_distance, upper_cb_distance):

    couplings_per_pair = {}
    for pair in pairs:
        couplings_per_pair[pair] = []

    max_nr_couplings_per_protein = 500
    sequence_separation = 8
    evidence_threshold = 100
    max_couplings_per_bin = 1000

    # iterate over proteins
    braw_files = glob.glob(braw_dir + "/*braw.gz")
    for braw_file in braw_files:
        # braw_file = braw_files[0]

        protein = os.path.basename(braw_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"
        alignment_file = alignment_dir + "/" + protein + ".filt.psc"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip this protein.".format(pdb_file))
            continue

        if not os.path.exists(braw_file):
            print("Braw file {0} does not exist. Skip this protein.".format(braw_file))
            continue

        if not os.path.exists(alignment_file):
            print("Alignment file {0} does not exist. Skip this protein.".format(alignment_file))
            continue

        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)
        diversity = np.sqrt(AF.N) / AF.L
        if diversity < 0.3:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        # mask highly gapped positions
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.3)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # iterate over amino acid pairs
        for pair in pairs:

            if len(couplings_per_pair[pair]) >= max_couplings_per_bin:
                continue

            residue_i, residue_j = np.where((distance_map > lower_cb_distance) & (distance_map < upper_cb_distance))

            if len(residue_i) == 0:
                continue

            a = pair[0]
            b = pair[2]

            # number of effective sequences without gaps at positions i and j
            Nij = AF.Nij[residue_i, residue_j]
            q_i_a = AF.single_frequencies[residue_i, io.AMINO_INDICES[a]]
            q_j_b = AF.single_frequencies[residue_j, io.AMINO_INDICES[b]]
            q_ij_ab = AF.pairwise_frequencies[residue_i, residue_j, io.AMINO_INDICES[a], io.AMINO_INDICES[b]]

            # element-wise maximum of the two evidence estimates per residue pair
            evidence = np.maximum(Nij * q_i_a * q_j_b, Nij * q_ij_ab)

            residue_i = residue_i[evidence > evidence_threshold]
            residue_j = residue_j[evidence > evidence_threshold]

            if len(residue_i) == 0:
                continue

            ab_coupling = braw.x_pair[residue_i, residue_j, io.AMINO_INDICES[a], io.AMINO_INDICES[b]].tolist()[:max_nr_couplings_per_protein]
            couplings_per_pair[pair].extend(ab_coupling)

        summary = "\n\nprotein {0}".format(protein)
        for pair in sorted(couplings_per_pair.keys()):
            summary += "\n{0:<8} : {1}".format(pair, len(couplings_per_pair[pair]))
        print summary

        # stop condition: all pairs have enough couplings
        if all([len(couplings_per_pair[pair]) >= max_couplings_per_bin for pair in pairs]):
            break

    return couplings_per_pair
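# Hedged usage sketch; paths and the list of amino-acid pairs are illustrative
# placeholders. Collects w_ij(a,b) for several amino-acid pairs at residue
# pairs with 0 < distance < 8 and prints a per-pair summary.
pairs = ['E-R', 'R-E', 'E-E', 'R-R']
couplings_per_pair = collect_data('/path/to/braw/', '/path/to/psicov/',
                                  '/path/to/pdb/', pairs,
                                  lower_cb_distance=0, upper_cb_distance=8)
for pair in pairs:
    values = np.array(couplings_per_pair[pair])
    if len(values) > 0:
        print("{0}: n={1}, mean coupling = {2:.4f}".format(pair, len(values), values.mean()))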