def plot_contact_map(mat, seqsep, contact_threshold, title, plot_file=None, alignment_file=None, pdb_file=None):

    L = len(mat)
    indices_upper_tri = np.triu_indices(L, seqsep)

    ### if alignment file is specified, compute per-column gap statistics
    if alignment_file:
        alignment = io.read_alignment(alignment_file)
        gaps_percentage_plot = aligncov.plot_percentage_gaps_per_position(alignment, plot_file=None)
    else:
        gaps_percentage_plot = None

    plot_matrix = pd.DataFrame()

    ### compute distance map from pdb file
    if pdb_file:
        observed_distances = pdb.distance_map(pdb_file, L)
        plot_matrix['distance'] = observed_distances[indices_upper_tri]
        plot_matrix['contact'] = ((plot_matrix.distance < contact_threshold) * 1).tolist()

    # add scores
    plot_matrix['residue_i'] = indices_upper_tri[0] + 1
    plot_matrix['residue_j'] = indices_upper_tri[1] + 1
    plot_matrix['confidence'] = mat[indices_upper_tri]

    ### Plot Contact Map
    plot.plot_contact_map_someScore_plotly(plot_matrix, title, seqsep, gaps_percentage_plot, plot_file=plot_file)
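# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how plot_contact_map might be called, assuming a symmetric L x L
# score matrix stored as plain text; all file paths and parameter values below are
# hypothetical, only the function signature above is taken from the source.
def example_plot_contact_map():
    mat = np.loadtxt("/path/to/example.mat")           # hypothetical L x L contact score matrix
    plot_contact_map(
        mat,
        seqsep=6,                                      # ignore pairs closer than 6 positions in sequence
        contact_threshold=8,                           # Cbeta-Cbeta distance (Angstrom) defining a contact
        title="example contact map",
        plot_file="/path/to/example.contact_map.html",
        alignment_file="/path/to/example.aln",         # optional: adds per-column gap statistics
        pdb_file="/path/to/example.pdb"                # optional: adds observed distances / true contacts
    )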
def main():

    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting amino acid distribution over the alignment.')
    parser.add_argument("alignment_file", type=str, help="path to alignment file")
    parser.add_argument("plot_file", type=str, help="path to plot file")

    args = parser.parse_args()
    alignment_file = str(args.alignment_file)
    plot_file = str(args.plot_file)

    #protein='2cuaA'
    #alignment_file="/home/vorberg/work/data/ccmgen/psicov/alignments/" + protein + ".aln"
    #alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_incmr/" + protein + ".star.aln"
    #plot_file = "/home/vorberg/alignment_"+protein+".cheating_12_incmr.html"

    alignment = io.read_alignment(alignment_file)
    protein = os.path.basename(alignment_file).split(".")[0]
    N = float(len(alignment))
    L = len(alignment[0])

    title = "Distribution of Amino Acids per position in alignment of " + str(protein) + \
            "<br> N=" + str(N) + ", L=" + str(L)

    # compute amino acid counts only once
    aa_counts_single, aa_counts_pair = au.compute_counts(alignment, compute_weights=False)

    plot_amino_acid_distribution_per_position(aa_counts_single, title, plot_file, freq=True)
    plot_amino_acid_distribution_per_position(aa_counts_single, title, plot_file, freq=False)
def plot_alignment_entropy(alignment_file, plot_dir=None):

    # read alignment
    protein = os.path.basename(alignment_file).split(".")[0]
    alignment = io.read_alignment(alignment_file)
    N = float(len(alignment))
    L = len(alignment[0])
    alignment = alignment.transpose()

    # determine amino acid frequencies (without any pseudocounts)
    aa_freq_per_pos = np.zeros((21, L))
    for position in range(L):
        aa_counts = Counter(alignment[position])
        for aa, counts in aa_counts.items():
            freq = counts / N
            aa_freq_per_pos[aa, position] = freq

    aa_freq_per_pos = aa_freq_per_pos[1:]  # remove gaps (state 0)
    aa_freq_per_pos = aa_freq_per_pos.transpose()

    entropy_per_position = [entropy(aa_freq_per_pos[pos], base=2) for pos in range(L)]

    # create plot
    data = []
    data.append(
        go.Scatter(
            x=[x for x in range(L)],
            y=entropy_per_position,
            name="entropy per position",
            mode="lines"
        )
    )

    layout = {
        'title': "Entropy (base 2) in alignment of " + str(protein) + "<br> N=" + str(N) + ", L=" + str(L),
        'xaxis': {'title': "Alignment Position"},
        'yaxis': {'title': "Entropy"},
        'font': {'size': 18}
    }

    plot = {'data': data, 'layout': layout}

    if plot_dir is None:
        return plot
    else:
        plot_file = plot_dir + "/alignment_entropy_" + protein + ".html"
        plotly_plot(plot, filename=plot_file, auto_open=False)
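# --- Illustrative sketch (not part of the original module) ---
# How the per-column entropy used above behaves on toy columns: a fully conserved column has
# entropy 0, while a column that is uniform over the 20 amino acids has entropy log2(20) ~ 4.32.
# This only assumes scipy.stats.entropy and numpy, which plot_alignment_entropy already uses.
def example_column_entropy():
    conserved_column = np.array([1.0] + [0.0] * 19)   # one amino acid only
    uniform_column = np.full(20, 1.0 / 20)            # all 20 amino acids equally likely
    print(entropy(conserved_column, base=2))          # -> 0.0
    print(entropy(uniform_column, base=2))            # -> ~4.32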
def __init__(self, alignment_file, seq_separation=8, contact_threshold=8, non_contact_threshold=25):

    self.alignment_file = alignment_file
    self.protein = os.path.basename(self.alignment_file).split(".")[0]
    self.msa = io.read_alignment(alignment_file)

    self.seq_separation = seq_separation
    self.contact_threshold = contact_threshold
    self.non_contact_threshold = non_contact_threshold
    self.max_gap_percentage = 0.9

    self.L = self.msa.shape[1]
    self.N = self.msa.shape[0]
    self.weights = None
    self.neff = None

    # indices of upper triangle without diagonal
    self.ij_ind_upper = np.triu_indices(self.L, k=self.seq_separation)

    # with gap and without pseudocounts!
    self.single_counts = None
    self.pairwise_counts = None

    # without gap and with pseudocounts!
    self.single_frequencies = None
    self.pairwise_frequencies = None

    self.Ni = None
    self.Nij = None

    self.features = {'global': {}, 'single': {}, 'pair': {}}

    self.compute_frequencies(pseudocounts='background')
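# --- Illustrative usage sketch (not part of the original module) ---
# The enclosing class is not shown in this excerpt; "AlignmentFeatureContainer" below is a
# placeholder name and the alignment path is hypothetical. The constructor reads the MSA,
# stores N, L and the upper-triangle pair indices, and immediately computes frequencies
# with background pseudocounts.
#
# container = AlignmentFeatureContainer("/path/to/example.aln",
#                                       seq_separation=8,
#                                       contact_threshold=8,
#                                       non_contact_threshold=25)
# print(container.protein, container.N, container.L)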
def main():

    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting sequence similarity matrix.')
    parser.add_argument("alignment_file", type=str, help="path to alignment file")
    parser.add_argument("plot_dir", type=str, help="path to plot dir")

    args = parser.parse_args()
    alignment_file = str(args.alignment_file)
    plot_dir = str(args.plot_dir)

    plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/seq_identity_matrices_alignments/"
    protein = '1dqgA'  # '1i5gA' # '1dqgA' # '1ag6A' # '1ej0A' # '1g2rA'

    topology = ""
    topology = ".star"
    topology = ".binary"

    # alignment_file="/home/vorberg/" + protein +topology+".mr50.aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/alignments/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12/" + protein +topology+".aln"
    alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_incmr_4/" + protein + topology + ".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr100/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr10/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr1/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_lfactor1e-3_cheating_12_mr100/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_lfactor1e-3_cheating_12_mr10/" + protein +topology+".aln"
    # alignment_file="/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_lfactor1e-3_cheating_12_mr1/" + protein +topology+".aln"

    alignment = io.read_alignment(alignment_file)
    protein = os.path.basename(alignment_file).split(".")[0]

    # compute pairwise sequence identities only once
    similarity_matrix = compute_seq_identities(alignment)
    #similarity_matrix = hamming_distance_matrix(alignment)

    print(np.mean(similarity_matrix[-100, :-100]))
    print(np.min(similarity_matrix))
    print(np.mean(similarity_matrix))

    # plot seq similarity matrix
    plot_file = plot_dir + "/sequence_similarity_matrix_" + protein + topology + ".html"
    plot_seq_id_matrix(similarity_matrix, plot_file=plot_file)

    # plot dendrogram with similarity matrix - use hamming distance matrix
    plot_file = plot_dir + "/sequence_similarity_matrix_dendrogram_" + protein + topology + ".html"
    plot_seq_id_matrix_with_dendrogram(alignment, similarity_matrix, plot_file=plot_file)

    # plot boxplot of pairwise sequence identities for one protein and different methods
    plot_file = plot_dir + "/boxplot_sequence_similarities_" + topology + ".html"
    alignment_dir_list = [
        "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr1/",
        "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr3/",
        "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr10/",
        "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_mr100/"
    ]
    plot_seq_id_boxplot(alignment_dir_list, topology, plot_file=plot_file, protein=None)

    plot_file = plot_dir + "/boxplot_sequence_similarities" + topology + "." + protein + ".html"
    plot_seq_id_boxplot(alignment_dir_list, topology, plot_file, protein=protein)
def main():

    parser = argparse.ArgumentParser(description="Generate SEQATOM sequences from deprecated database or recompute")
    parser.add_argument("-a", "--alignment", dest="ali", help="path to alignment files")
    parser.add_argument("-p", "--pdb", dest="pdb", help="path to pdb files")
    parser.add_argument("-o", "--output", dest="output", help="path to filter directory")
    parser.add_argument("--min-N", dest="minN", default=10, type=int, help="Minimum number of sequences")
    parser.add_argument("--max-gap-percentage", dest="maxGap", default=0.8, type=float, help="Maximum percentage of gaps in alignment")
    parser.add_argument("--max-L", dest="maxL", default=600, type=float, help="Maximum length of protein")
    parser.add_argument("--min-L", dest="minL", default=20, type=float, help="Minimum length of protein")
    parser.add_argument("--min-contacts", dest="mincontacts", default=1, type=int, help="Minimum number of contacts")
    parser.add_argument("--contact-threshold", dest="contact_threshold", default=8, type=int, help="Contact defined as distance between Cbeta atoms < threshold")
    parser.add_argument("--sequence-separation", dest="seqsep", default=12, type=int, help="Consider only residues separated by this many positions in sequence.")

    args = parser.parse_args()

    alignment_dir = args.ali
    pdb_dir = args.pdb
    output_dir = args.output
    minL = args.minL
    maxL = args.maxL
    minN = args.minN
    maxgappercentage = args.maxGap
    mincontacts = args.mincontacts
    contact_threshold = args.contact_threshold
    seqsep = args.seqsep

    aln_files = glob.glob(alignment_dir + "/*")

    for alignment_file in aln_files:

        protein = os.path.basename(alignment_file).split(".")[0]
        pdb_file = pdb_dir + "/" + protein + ".pdb"

        if not os.path.exists(pdb_file):
            print("PDB file {0} does not exist. Skip protein.".format(pdb_file))
            continue

        alignment = io.read_alignment(alignment_file, format="psicov")
        N = alignment.shape[0]
        L = alignment.shape[1]
        percent_gaps = np.mean(ali_ut.compute_gaps_per_position(alignment))

        distance_map = pdb.distance_map(pdb_file, L)
        nr_contacts = np.sum((distance_map[np.triu_indices(L, k=seqsep)] < contact_threshold) * 1)

        filter = False
        if N < minN:
            print("Alignment size {0} is smaller than filter threshold of {1}".format(N, minN))
            filter = True

        if L < minL:
            print("Protein length {0} is smaller than filter threshold of {1}".format(L, minL))
            filter = True

        if L > maxL:
            print("Protein length {0} is bigger than filter threshold of {1}".format(L, maxL))
            filter = True

        if percent_gaps > maxgappercentage:
            print("Percentage of gaps in alignment ({0}) is larger than filter threshold of {1}".format(percent_gaps, maxgappercentage))
            filter = True

        if nr_contacts < mincontacts:
            print("Number of contacts (contact_thr = {0}, sequence separation = {1}) in protein structure ({2}) is less than {3}".format(
                contact_threshold, seqsep, nr_contacts, mincontacts))
            filter = True

        if filter:
            dest_alignment_file = output_dir + "/" + os.path.basename(alignment_file)
            os.rename(alignment_file, dest_alignment_file)
            print("Successfully moved {0} to {1}".format(alignment_file, dest_alignment_file))
def plot_seq_id_boxplot(alignment_dir_list, topology, plot_file, protein=None):

    data = []

    for alignment_dir in alignment_dir_list:

        method = os.path.basename(os.path.abspath(alignment_dir))
        print(method)

        box_data = []
        if protein is not None:
            alignment_file = alignment_dir + "/" + protein + topology + ".aln"
            alignment = io.read_alignment(alignment_file)
            similarity_matrix = compute_seq_identities(alignment)
            box_data = similarity_matrix[np.triu_indices(similarity_matrix.shape[0], k=1)]
        else:
            alignment_files = glob.glob(alignment_dir + "/*" + topology + ".aln")
            for alignment_file in alignment_files:
                alignment = io.read_alignment(alignment_file)
                # do not overwrite the 'protein' argument: it is checked again below for the plot title
                protein_name = os.path.basename(alignment_file).split(".")[0]
                similarity_matrix = compute_seq_identities(alignment)
                mean_seq_id = np.mean(similarity_matrix[np.triu_indices(similarity_matrix.shape[0], k=1)])
                median_seq_id = np.median(similarity_matrix[np.triu_indices(similarity_matrix.shape[0], k=1)])
                box_data.append(mean_seq_id)

        box = go.Box(
            y=box_data,
            boxmean='sd',
            boxpoints='outliers',
            name=method,
            marker=dict(opacity=1),
            orientation='v',
            showlegend=False
        )
        data.append(box)

    plot = {
        "data": data,
        "layout": go.Layout(
            yaxis=dict(exponentformat='e', showexponent='all', range=[0, 1]),
            font=dict(size=18)
        )
    }

    if protein is None:
        plot['layout']['title'] = "Mean sequence Ids for all proteins"
        plot['layout']['yaxis']['title'] = "mean sequence id"
    else:
        plot['layout']['title'] = "Pairwise sequence identities for protein {0}".format(protein)
        plot['layout']['yaxis']['title'] = "pairwise sequence id"

    plotly_plot(plot, filename=plot_file, auto_open=False)
def __create_evaluation_file(self, protein, pdb_file, aln_file, seqsep):
    """
    Create evaluation file for a protein that contains information about:
        - cb_distance
        - i
        - j

    :param protein: protein identifier
    :param pdb_file: path to pdb file for protein
    :param aln_file: path to alignment file for protein
    :param seqsep: minimal assumed sequence separation
    :return:
    """

    if not os.path.exists(pdb_file):
        raise IOError("PDB File " + str(pdb_file) + " does not exist.")

    if not os.path.exists(aln_file):
        raise IOError("Alignment File " + str(aln_file) + " does not exist.")

    # determine indices that are resolved in PDB and have minimal required seq sep
    distance_matrix = pdb.distance_map(pdb_file)

    # get residue pairs that are resolved and (j-i) > seqsep
    indices_pairs_resolved = list(zip(*np.where(~np.isnan(distance_matrix))))
    indices_pairs_seqsep = list(zip(*np.triu_indices(len(distance_matrix), seqsep)))
    ij_indices = list(set(indices_pairs_resolved).intersection(indices_pairs_seqsep))

    # Create the evaluation file
    eval_df = pd.DataFrame(
        {
            'i': [i for i, j in ij_indices],
            'j': [j for i, j in ij_indices],
            'cb_distance': distance_matrix[[i for i, j in ij_indices], [j for i, j in ij_indices]],
        }
    )
    eval_df.sort_values(by=['i', 'j'], inplace=True)

    # read alignment
    alignment = io.read_alignment(aln_file)

    # compute percentage of gaps
    percent_gaps = np.mean(ali_ut.compute_gaps_per_position(alignment))

    # compute effective number of sequences
    weights = weighting.calculate_weights_simple(alignment, 0.8, False)
    neff = np.sum(weights)

    meta_protein = {
        'name': protein,
        'L': alignment.shape[1],
        'N': alignment.shape[0],
        'diversity': np.sqrt(alignment.shape[0]) / alignment.shape[1],
        'gap_percentage': percent_gaps,
        'neff': neff
    }

    # write evaluation data to file
    evaluation_file = self.eval_dir + "/" + protein + ".protein"
    io.write_matfile(eval_df.values, evaluation_file, meta_protein)

    # add to proteins in evaluation suite
    if protein not in self.proteins:
        self.proteins.append(protein)
def collect_data(braw_dir, alignment_dir, pdb_dir, size, diversity_thr, contact_threshold, noncontact_threshold, Nij_threshold):

    braw_files = glob.glob(braw_dir + "/*braw.gz")

    couplings_df = pd.DataFrame()
    nr_contacts = 0
    nr_noncontacts = 0
    sequence_separation = 10

    for braw_file in braw_files:
        #braw_file = braw_files[0]

        if nr_contacts >= size and nr_noncontacts >= size:
            break

        if not os.path.exists(braw_file):
            print("Braw File " + str(braw_file) + " cannot be found.")
            continue

        braw = raw.parse_msgpack(braw_file)
        L = braw.ncol
        if 'msafile' in braw.meta['workflow'][0]:
            N = braw.meta['workflow'][0]['msafile']['nrow']
        else:
            N = braw.meta['workflow'][0]['parameters']['msafile']['nrow']

        diversity = np.sqrt(N) / L
        if diversity < diversity_thr:
            continue

        protein = os.path.basename(braw_file).split(".")[0]

        alignment_file = alignment_dir + "/" + protein + ".filt.psc"
        if not os.path.exists(alignment_file):
            print("Alignment File " + str(alignment_file) + " cannot be found.")
            continue

        pdb_file = pdb_dir + "/" + protein.replace("_", "") + ".pdb"
        if not os.path.exists(pdb_file):
            print("PDB File " + str(pdb_file) + " cannot be found.")
            continue

        print(protein, "N =", N, "L =", L, "diversity =", diversity)

        indices_upper_tri = np.triu_indices(L, k=sequence_separation)

        # filter pair indices that have specified Cb distances
        dist_matrix = pdb.distance_map(pdb_file, L)
        indices_contact = np.where((dist_matrix[indices_upper_tri] < contact_threshold))[0]
        indices_noncontact = np.where((dist_matrix[indices_upper_tri] > noncontact_threshold))[0]

        # filter pair indices that have more than Nij_threshold ungapped sequences
        alignment = io.read_alignment(alignment_file)
        weights = weighting.calculate_weights_simple(alignment, 0.8, True)
        pairwise_counts = counts.pair_counts(alignment, weights)
        Nij = pairwise_counts[:, :, :20, :20].sum(3).sum(2)
        indices_Nij_true = np.where(Nij[indices_upper_tri] > Nij_threshold)[0]

        # keep pair indices that fulfill both requirements
        indices_contact = list(set(indices_contact).intersection(indices_Nij_true))
        indices_noncontact = list(set(indices_noncontact).intersection(indices_Nij_true))

        # get couplings for filtered pairs
        braw_reshaped = braw.x_pair[:, :, :20, :20].reshape(L, L, 400)

        if nr_contacts < size:
            couplings_contact = pd.DataFrame(braw_reshaped[indices_upper_tri][indices_contact])
            couplings_contact['distance'] = dist_matrix[indices_upper_tri][indices_contact]
            couplings_df = couplings_df.append(couplings_contact)
            nr_contacts += len(indices_contact)

        if nr_noncontacts < size:
            couplings_noncontact = pd.DataFrame(braw_reshaped[indices_upper_tri][indices_noncontact])
            couplings_noncontact['distance'] = dist_matrix[indices_upper_tri][indices_noncontact]
            couplings_df = couplings_df.append(couplings_noncontact)
            nr_noncontacts += len(indices_noncontact)

        print("Nr of couplings contact: {0} and non-contact: {1}".format(nr_contacts, nr_noncontacts))

    couplings_df['class'] = (couplings_df['distance'] < contact_threshold) * 1

    return couplings_df
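# --- Illustrative usage sketch (not part of the original module) ---
# Example invocation of collect_data above, with hypothetical directories and illustrative
# parameter values; the 8 A contact and 25 A non-contact thresholds mirror the defaults used
# elsewhere in this code base.
def example_collect_couplings():
    couplings_df = collect_data(
        braw_dir="/path/to/braw_files",
        alignment_dir="/path/to/alignments",
        pdb_dir="/path/to/pdb_files",
        size=1000,                  # collect up to 1000 contact and 1000 non-contact pairs
        diversity_thr=0.3,          # skip alignments with sqrt(N)/L below this value
        contact_threshold=8,
        noncontact_threshold=25,
        Nij_threshold=100           # require more than 100 (weighted) ungapped sequence pairs per column pair
    )
    print(couplings_df['class'].value_counts())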
def collect_data(self, protein_set=[]):
    """
    Setup a list of residue pairs that will be used for training
        - get the same amount of contacts/non-contacts
        - according to some filtering criteria (seqsep, diversity, etc)

    :param protein_set: list of protein identifiers or None;
                        if None, protein list will be parsed from braw files
    :return:
    """

    if len(protein_set) == 0:
        braw_files = glob.glob(self.braw_dir + "/*braw*")
        for braw in braw_files:
            protein_set.append(os.path.basename(braw).split(".")[0])

    # shuffle rows WITH seed for reproducibility ! ! !
    random.seed(self.seed)
    random.shuffle(protein_set)

    print('\nNumber of available proteins: {0}. Selecting {1} contacts and {2} non-contacts...'.format(
        len(protein_set), self.number_of_pairs, self.number_of_pairs * self.balance))

    nr_pairs_contact_crossval = 0
    nr_pairs_noncontact_crossval = 0
    nr_pairs_contacts = 0
    nr_pairs_bg = 0

    # Iterate over protein files
    for p in protein_set:
        # p = protein_set[0]

        # set up file names
        psicov_file = self.psicov_dir + "/" + p + ".filt.psc"
        braw_file_gz = self.braw_dir + "/" + p + ".filt.braw.gz"
        qijabfile = self.qijab_dir + "/" + p + ".filt.bqij.gz"
        pdb_file = self.pdb_dir + "/" + p + ".pdb"

        # p_short = p.replace("_", "")
        # psicov_file = self.psicov_dir + "/" + p + ".psc"
        # braw_file_gz = self.braw_dir + "/" + p + ".braw.gz"
        # qijabfile = self.qijab_dir + "/" + p + ".bqijab.gz"
        # pdb_file = self.pdb_dir + "/" + p_short + "_ren.pdb"

        # check if braw file exists, otherwise continue
        if not os.path.isfile(braw_file_gz):
            print("Binary raw file {0} for protein {1} could not be found!".format(braw_file_gz, p))
            continue

        if not os.path.isfile(psicov_file):
            print("Alignment file {0} for protein {1} could not be found!".format(psicov_file, p))
            continue

        if not os.path.isfile(qijabfile):
            print("qij file {0} for protein {1} could not be found!".format(qijabfile, p))
            continue

        if not os.path.isfile(pdb_file):
            print("PDB file {0} for protein {1} could not be found!".format(pdb_file, p))
            continue

        psicov = io.read_alignment(psicov_file)
        N = len(psicov)
        L = len(psicov[0])
        diversity = np.sqrt(N) / L

        # skip proteins with low diversities
        if diversity < self.diversity_thr:
            continue

        indices_contact, indices_non_contact = self.get_residue_pairs_from_protein(
            braw_file_gz, qijabfile, pdb_file, psicov)

        # if no data
        if len(indices_contact[0]) == 0 and len(indices_non_contact[0]) == 0:
            continue

        protein_data = {}
        protein_data['N'] = N
        protein_data['L'] = L
        protein_data['diversity'] = diversity
        protein_data['braw_file_path'] = braw_file_gz
        protein_data['msafilename'] = psicov_file
        protein_data['qijabfilename'] = qijabfile
        protein_data['residue_i'] = []
        protein_data['residue_j'] = []
        protein_data['contact'] = []

        # shuffle indices, so as not to introduce any bias when choosing only the first X pairs from each protein
        # (the fixed random function keeps the residue_i and residue_j lists in sync)
        random.seed(self.seed)
        random.shuffle(indices_contact[0], lambda: 0.1)
        random.shuffle(indices_contact[1], lambda: 0.1)
        random.shuffle(indices_non_contact[0], lambda: 0.1)
        random.shuffle(indices_non_contact[1], lambda: 0.1)

        if len(indices_contact[0]) > 0 and (nr_pairs_contacts < self.number_of_pairs):
            protein_data['residue_i'].extend(indices_contact[0][:self.maxcontacts_per_protein])
            protein_data['residue_j'].extend(indices_contact[1][:self.maxcontacts_per_protein])
            protein_data['contact'].extend([1] * len(indices_contact[0][:self.maxcontacts_per_protein]))
            nr_pairs_contacts += len(indices_contact[0][:self.maxcontacts_per_protein])
            self.training_data[p] = protein_data

        if len(indices_non_contact[0]) > 0 and nr_pairs_bg < (self.number_of_pairs * self.balance):
            protein_data['residue_i'].extend(indices_non_contact[0][:self.maxnoncontacts_per_protein])
            protein_data['residue_j'].extend(indices_non_contact[1][:self.maxnoncontacts_per_protein])
            protein_data['contact'].extend([0] * len(indices_non_contact[0][:self.maxnoncontacts_per_protein]))
            nr_pairs_bg += len(indices_non_contact[0][:self.maxnoncontacts_per_protein])
            self.training_data[p] = protein_data

        if p not in self.training_data:

            if len(indices_contact[0]) > 0 and nr_pairs_contact_crossval < self.nr_crossval_pairs:
                protein_data['residue_i'].extend(indices_contact[0][:self.maxcontacts_per_protein])
                protein_data['residue_j'].extend(indices_contact[1][:self.maxcontacts_per_protein])
                protein_data['contact'].extend([1] * len(indices_contact[0][:self.maxcontacts_per_protein]))
                nr_pairs_contact_crossval += len(indices_contact[0][:self.maxcontacts_per_protein])
                self.test_data[p] = protein_data

            if len(indices_non_contact[0]) > 0 and nr_pairs_noncontact_crossval < (self.nr_crossval_pairs * self.balance):
                protein_data['residue_i'].extend(indices_non_contact[0][:self.maxnoncontacts_per_protein])
                protein_data['residue_j'].extend(indices_non_contact[1][:self.maxnoncontacts_per_protein])
                protein_data['contact'].extend([0] * len(indices_non_contact[0][:self.maxnoncontacts_per_protein]))
                nr_pairs_noncontact_crossval += len(indices_non_contact[0][:self.maxnoncontacts_per_protein])
                self.test_data[p] = protein_data

        print("{0}, #pairs in training set: contact={1} bg={2}, #pairs in testset: contact={3} bg={4}".format(
            p, nr_pairs_contacts, nr_pairs_bg, nr_pairs_contact_crossval, nr_pairs_noncontact_crossval))

        # stop condition
        condition_training = [nr_pairs_contacts >= self.number_of_pairs,
                              nr_pairs_bg >= (self.number_of_pairs * self.balance)]
        condition_test = [nr_pairs_contact_crossval >= self.nr_crossval_pairs,
                          nr_pairs_noncontact_crossval >= self.nr_crossval_pairs]

        if all(condition_training) and all(condition_test):
            break

    self.nr_pairs_contact = nr_pairs_contacts
    self.nr_pairs_noncontact = nr_pairs_bg
    self.nr_pairs_contact_cross_val = nr_pairs_contact_crossval
    self.nr_pairs_noncontact_cross_val = nr_pairs_noncontact_crossval
def main():

    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting empirical vs model alignment statistics.')
    parser.add_argument("observed_alignments", type=str, help="path to original alignment files")
    parser.add_argument("sampled_alignments_pll", type=str, help="path to sampled alignment files (using PLL)")
    parser.add_argument("sampled_alignments_pcd", type=str, help="path to sampled alignment files (using PCD)")
    parser.add_argument("plot_dir", type=str, help="path to output directory for plots")

    args = parser.parse_args()
    observed_alignments_path = args.observed_alignments
    sampled_alignments_paths_pll = args.sampled_alignments_pll
    sampled_alignments_paths_pcd = args.sampled_alignments_pcd
    plot_dir = args.plot_dir

    log = False
    max_gap_pos = 50

    #debug
    # observed_alignments_path = "/home/vorberg/work/data/ccmgen/psicov/alignments/"
    # sampled_alignments_paths_pll = "/home/vorberg/work/data/ccmgen/psicov/sampled_pll/"
    # sampled_alignments_paths_pcd = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/pll_vs_pcd_comparison/alignment_statistics_correlation/"

    data_dict = {
        'pseudo-likelihood': {'x': [], 'y': []},
        'contrastive divergence': {'x': [], 'y': []}
    }

    observed_alignments = glob.glob(observed_alignments_path + "/*aln")
    for obs_aln_file in observed_alignments:

        protein = os.path.basename(obs_aln_file).split(".")[0]
        sampled_aln_file_pll = glob.glob(sampled_alignments_paths_pll + "/" + protein + "*.ind.aln")
        sampled_aln_file_pcd = glob.glob(sampled_alignments_paths_pcd + "/" + protein + "*.ind.aln")

        if len(sampled_aln_file_pll) == 0 or not os.path.exists(sampled_aln_file_pll[0]):
            print("Sampled alignment file (PLL) for protein {0} does not exist!".format(protein))
            continue

        if len(sampled_aln_file_pcd) == 0 or not os.path.exists(sampled_aln_file_pcd[0]):
            print("Sampled alignment file (PCD) for protein {0} does not exist!".format(protein))
            continue

        print(protein)

        # read in alignments and remove columns with >50% gaps
        alignment_o = io.read_alignment(obs_aln_file, max_gap_pos=100, max_gap_seq=100)
        L_original = alignment_o.shape[1]
        alignment_o, gapped_positions = io.remove_gapped_positions(alignment_o, max_gap_percentage=max_gap_pos)
        non_gapped_positions = [i for i in range(L_original) if i not in gapped_positions]

        alignment_s_pll = io.read_alignment(sampled_aln_file_pll[0], max_gap_pos=100, max_gap_seq=100)
        alignment_s_pll = np.ascontiguousarray(alignment_s_pll[:, non_gapped_positions])

        alignment_s_pcd = io.read_alignment(sampled_aln_file_pcd[0], max_gap_pos=100, max_gap_seq=100)
        alignment_s_pcd = np.ascontiguousarray(alignment_s_pcd[:, non_gapped_positions])

        # compute amino acid frequencies
        single_freq_observed, pairwise_freq_observed = au.calculate_frequencies(alignment_o, au.uniform_pseudocounts)
        single_freq_sampled_pll, pairwise_freq_sampled_pll = au.calculate_frequencies(alignment_s_pll, au.uniform_pseudocounts)
        single_freq_sampled_pcd, pairwise_freq_sampled_pcd = au.calculate_frequencies(alignment_s_pcd, au.uniform_pseudocounts)

        # degap the frequencies (ignore gap frequencies)
        single_freq_observed = au.degap(single_freq_observed, False)
        single_freq_sampled_pll = au.degap(single_freq_sampled_pll, False)
        single_freq_sampled_pcd = au.degap(single_freq_sampled_pcd, False)
        pairwise_freq_observed = au.degap(pairwise_freq_observed, False)
        pairwise_freq_sampled_pll = au.degap(pairwise_freq_sampled_pll, False)
        pairwise_freq_sampled_pcd = au.degap(pairwise_freq_sampled_pcd, False)

        # reshape frequencies
        L = alignment_o.shape[1]
        indices_upper_triangle = np.triu_indices(L, k=1)

        x_single = single_freq_observed.flatten().tolist()
        y_single_pll = single_freq_sampled_pll.flatten().tolist()
        y_single_pcd = single_freq_sampled_pcd.flatten().tolist()

        pair_freq_observed = pairwise_freq_observed[
            indices_upper_triangle[0], indices_upper_triangle[1], :, :].flatten().tolist()
        pair_freq_sampled_pll = pairwise_freq_sampled_pll[
            indices_upper_triangle[0], indices_upper_triangle[1], :, :].flatten().tolist()
        pair_freq_sampled_pcd = pairwise_freq_sampled_pcd[
            indices_upper_triangle[0], indices_upper_triangle[1], :, :].flatten().tolist()

        cov_observed = [
            pairwise_freq_observed[i, j, a, b] - (single_freq_observed[i, a] * single_freq_observed[j, b])
            for i in range(L - 1) for j in range(i + 1, L) for a in range(20) for b in range(20)
        ]
        cov_sampled_pll = [
            pairwise_freq_sampled_pll[i, j, a, b] - (single_freq_sampled_pll[i, a] * single_freq_sampled_pll[j, b])
            for i in range(L - 1) for j in range(i + 1, L) for a in range(20) for b in range(20)
        ]
        cov_sampled_pcd = [
            pairwise_freq_sampled_pcd[i, j, a, b] - (single_freq_sampled_pcd[i, a] * single_freq_sampled_pcd[j, b])
            for i in range(L - 1) for j in range(i + 1, L) for a in range(20) for b in range(20)
        ]

        if log:
            x_single = np.log(x_single)
            y_single_pll = np.log(y_single_pll)
            y_single_pcd = np.log(y_single_pcd)
            pair_freq_observed = np.log(pair_freq_observed)
            pair_freq_sampled_pll = np.log(pair_freq_sampled_pll)
            pair_freq_sampled_pcd = np.log(pair_freq_sampled_pcd)

        # compute pearson correlation coefficients
        data_dict['pseudo-likelihood']['x'].append(np.corrcoef(x_single, y_single_pll)[0, 1])
        data_dict['pseudo-likelihood']['y'].append('single site frequencies')
        data_dict['pseudo-likelihood']['x'].append(np.corrcoef(pair_freq_observed, pair_freq_sampled_pll)[0, 1])
        data_dict['pseudo-likelihood']['y'].append('pairwise frequencies')
        data_dict['pseudo-likelihood']['x'].append(np.corrcoef(cov_observed, cov_sampled_pll)[0, 1])
        data_dict['pseudo-likelihood']['y'].append('Covariances')

        data_dict['contrastive divergence']['x'].append(np.corrcoef(x_single, y_single_pcd)[0, 1])
        data_dict['contrastive divergence']['y'].append('single site frequencies')
        data_dict['contrastive divergence']['x'].append(np.corrcoef(pair_freq_observed, pair_freq_sampled_pcd)[0, 1])
        data_dict['contrastive divergence']['y'].append('pairwise frequencies')
        data_dict['contrastive divergence']['x'].append(np.corrcoef(cov_observed, cov_sampled_pcd)[0, 1])
        data_dict['contrastive divergence']['y'].append('Covariances')

    # plot boxplot
    plot_boxplot_correlation_alignment_statistics_pll_vs_pcd(data_dict, plot_dir)
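# --- Illustrative sketch (not part of the original module) ---
# The covariance list comprehensions above loop over all i < j and all 20 x 20 amino acid
# pairs in pure Python. A vectorized equivalent, assuming degapped single frequencies of
# shape (L, 20) and pairwise frequencies of shape (L, L, 20, 20) as used above, could look
# like this sketch:
def example_vectorized_covariances(single_freq, pairwise_freq):
    L = single_freq.shape[0]
    cov = pairwise_freq[:, :, :20, :20] - \
          single_freq[:, np.newaxis, :20, np.newaxis] * single_freq[np.newaxis, :, np.newaxis, :20]
    i, j = np.triu_indices(L, k=1)
    # same ordering as the list comprehension: i < j outer, amino acids a, b inner
    return cov[i, j, :, :].flatten()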
def main():

    args = parse_args()
    alignment_dir = args.alignment_dir
    sampled_alignment_dir = args.sampled_alignment_dir
    filter = args.filter
    plot_dir = args.plot_dir

    #debug
    #alignment_dir = "/home/vorberg/work/data/ccmgen/psicov/alignments/"
    #sampled_alignment_dir = "/home/vorberg/work/data/ccmgen/psicov/sampled_pll/"
    #sampled_alignment_dir = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/"
    #plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/pca/"
    #filter = "ind."
    #filter = "ind-rand."
    #filter = "ind-rand-gap."
    #filter = "star."
    #filter = "binary."

    alignment_files = glob.glob(alignment_dir + "/*aln")

    for alignment_file in alignment_files:
        #alignment_file='/home/vorberg/work/data/ccmgen/psicov/alignments/1gmxA.aln'
        #alignment_file='/home/vorberg/work/data/ccmgen/psicov/alignments/1bkrA.aln'

        # read alignment
        alignment = io.read_alignment(alignment_file, max_gap_pos=100, max_gap_seq=100)
        L_original = alignment.shape[1]
        alignment, gapped_positions = io.remove_gapped_positions(alignment, max_gap_percentage=50)
        non_gapped_positions = [i for i in range(L_original) if i not in gapped_positions]
        name = os.path.basename(alignment_file).split(".")[0]
        print("{0}: N={1}, L={2}".format(name, alignment.shape[0], alignment.shape[1]))

        # one hot encoding: transform alignment into binary dummy variables
        enc = OneHotEncoder(n_values=21)
        enc.fit(alignment)
        alignment_one_hot = enc.transform(alignment).toarray()

        pca = PCA(n_components=2)
        pca.fit(alignment_one_hot)
        alignment_transformed = pca.transform(alignment_one_hot)
        print("N={0}, L={1}".format(alignment_transformed.shape[0], alignment_transformed.shape[1]))
        print('explained variance ratio (first two components): %s' % str(pca.explained_variance_ratio_))

        plot_dict = {}
        plot_dict['name'] = name
        plot_dict['data'] = [
            {
                'name': 'Pfam',
                'x': alignment_transformed[:, 0],
                'y': alignment_transformed[:, 1],
                'seq': alignment,
                'N': alignment.shape[0],
                'L': alignment.shape[1],
                'neff(weights)': au.compute_neff(alignment),
                'neff(entropy)': au.compute_neff_hhblits(alignment)
            }
        ]

        plot_projection_on_two_components_gapstructure(
            plot_dict,
            plot_out=plot_dir + "/" + name + ".original.PCA_projection.gapstructure.html")

        # read in sampled alignment
        sampled_alignment_file = glob.glob(sampled_alignment_dir + "/" + name + "*" + filter + "*aln")
        #sampled_alignment_file=["/home/vorberg/1bkrA.binary.5.aln"]
        #sampled_alignment_file=["/home/vorberg/1bkrA.star.5.aln"]

        if len(sampled_alignment_file) > 0:
            sampled_alignment = io.read_alignment(sampled_alignment_file[0], max_gap_pos=100, max_gap_seq=100)
            sampled_alignment = np.ascontiguousarray(sampled_alignment[:, non_gapped_positions])
            method = os.path.dirname(sampled_alignment_file[0]).split("/")[-1]

            # one hot encoding
            enc = OneHotEncoder(n_values=21)
            enc.fit(sampled_alignment)
            sampled_alignment_one_hot = enc.transform(sampled_alignment).toarray()
            sampled_alignment_transformed = pca.transform(sampled_alignment_one_hot)
            print("N={0}, L={1}".format(sampled_alignment_transformed.shape[0], sampled_alignment_transformed.shape[1]))

            plot_dict['data'].append(
                {
                    #'name': method + "." + filter,
                    'name': "MCMC PCD",
                    'x': sampled_alignment_transformed[:, 0],
                    'y': sampled_alignment_transformed[:, 1],
                    'seq': sampled_alignment,
                    'N': sampled_alignment.shape[0],
                    'L': sampled_alignment.shape[1],
                    'neff(weights)': au.compute_neff(sampled_alignment),
                    'neff(entropy)': au.compute_neff_hhblits(sampled_alignment)
                }
            )

            title = ""
            # title = "Projection onto first 2 PC for protein " + plot_dict['name']
            # for plot_data in plot_dict['data']:
            #     title += "<br>{0}: N={1}, L={2}, Neff(weights)={3}, Neff(entropy)={4}".format(
            #         plot_data['name'],
            #         plot_data['N'], plot_data['L'],
            #         np.round(plot_data['neff(weights)'], decimals=3),
            #         np.round(plot_data['neff(entropy)'], decimals=3)
            #     )

            plot_projection_on_two_components(
                plot_dict,
                title=title,
                plot_out=plot_dir + "/" + name + "." + method + "." + filter + "PCA_projection.html"
            )

    for alignment_file in alignment_files[5:]:
        #alignment_file=alignment_files[0]

        # read alignment
        alignment = io.read_alignment(alignment_file, max_gap_pos=100, max_gap_seq=100)
        L_original = alignment.shape[1]
        alignment, gapped_positions = io.remove_gapped_positions(alignment, max_gap_percentage=50)
        non_gapped_positions = [i for i in range(L_original) if i not in gapped_positions]
        name = os.path.basename(alignment_file).split(".")[0]
        print("{0}: N={1}, L={2}".format(name, alignment.shape[0], alignment.shape[1]))

        # multiple correspondence analysis (MCA) for categorical data
        # MCA is "essentially PCA for categorical variables"
        # MCA can also be viewed as a PCA applied to the complete disjunctive table (CDT aka indicator matrix):
        # unstandardized PCA applied to the transformed CDT leads to the results of MCA,
        # i.e. MCA is defined as the application of weighted PCA to the indicator matrix G
        df = pd.DataFrame(alignment)
        df.columns = ['col' + str(i) for i in range(1, alignment.shape[1] + 1)]
        mca_df = mca.MCA(df, benzecri=False)

        # fs_r is much slower than fs_r_sup...
        #alignment_transformed = mca_df.fs_r(N=2)
        alignment_transformed = -mca_df.fs_r_sup(df, N=2)

        plot_dict = {}
        plot_dict['name'] = name
        plot_dict['N'] = alignment.shape[0]
        plot_dict['L'] = alignment.shape[1]
        plot_dict['neff(weights)'] = au.compute_neff(alignment)
        plot_dict['neff(entropy)'] = au.compute_neff_hhblits(alignment)
        plot_dict['data'] = [
            {
                'name': 'original',
                'x': alignment_transformed[:, 0],
                'y': alignment_transformed[:, 1],
                'N': alignment.shape[0],
                'L': alignment.shape[1],
                'neff(weights)': au.compute_neff(alignment),
                'neff(entropy)': au.compute_neff_hhblits(alignment)
            }
        ]

        # read in sampled alignment
        sampled_alignment_file = glob.glob(sampled_alignment_dir + "/" + name + "*" + filter + "*.aln")
        #sampled_alignment_file=["/home/vorberg/1gmxA.star.2.aln"]

        if len(sampled_alignment_file) > 0:
            sampled_alignment = io.read_alignment(sampled_alignment_file[0], max_gap_pos=100, max_gap_seq=100)
            sampled_alignment = np.ascontiguousarray(sampled_alignment[:, non_gapped_positions])
            method = os.path.dirname(sampled_alignment_file[0]).split("/")[-1]

            df = pd.DataFrame(sampled_alignment)
            df.columns = ['col' + str(i) for i in range(1, alignment.shape[1] + 1)]
            sampled_alignment_transformed = -mca_df.fs_r_sup(df, N=2)
            print("N={0}, L={1}".format(sampled_alignment_transformed.shape[0], sampled_alignment_transformed.shape[1]))

            plot_dict['data'].append(
                {
                    'name': method + "." + filter,
                    'x': sampled_alignment_transformed[:, 0],
                    'y': sampled_alignment_transformed[:, 1],
                    'N': sampled_alignment.shape[0],
                    'L': sampled_alignment.shape[1],
                    'neff(weights)': au.compute_neff(sampled_alignment),
                    'neff(entropy)': au.compute_neff_hhblits(sampled_alignment)
                }
            )

            plot_projection_on_two_components(
                plot_dict,
                plot_out=plot_dir + "/" + name + "." + method + "." + filter + ".MCA_projection.html"
            )
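# --- Illustrative sketch (not part of the original module) ---
# OneHotEncoder(n_values=21), as used above, relies on older scikit-learn releases; the
# n_values argument was later removed. A roughly equivalent setup in newer scikit-learn
# versions, assuming alignments encoded as integers 0..20, might look like this sketch:
def example_one_hot_encode(alignment):
    enc = OneHotEncoder(categories=[np.arange(21)] * alignment.shape[1])
    return enc.fit_transform(alignment).toarray()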
def main():

    args = parse_args()
    braw_dir = args.braw_dir
    alignment_dir = args.alignment_dir
    plot_dir = args.plot_dir

    #debug
    # braw_dir = "/home/vorberg//work/data/ccmgen/psicov/predictions_pcd/"
    # alignment_dir = "/home/vorberg//work/data/ccmgen/psicov/alignments/"
    # plot_dir = "/home/vorberg//work/plots/ccmgen/psicov/scatter_apc_vs_ec/pcd/"

    pearson_r_list = []
    proteins = []
    for braw_file in glob.glob(braw_dir + "/*braw.gz"):

        protein_name = os.path.basename(braw_file).split('.')[0]
        proteins.append(protein_name)
        print(protein_name)

        # read braw file
        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta
        neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
        lambda_w = np.round(u.find_dict_key("lambda_pair", meta_info), decimals=3)
        L = braw.ncol

        # read alignment file
        alignment_file = alignment_dir + "/" + protein_name + ".aln"
        alignment = io.read_alignment(alignment_file)
        single_freq, pair_freq = au.calculate_frequencies(alignment, au.uniform_pseudocounts)

        # get the highly gapped positions that need to be excluded from the analysis
        alignment_ungapped, gapped_positions = io.remove_gapped_positions(alignment, max_gap_percentage=50)
        non_gapped_positions = [i for i in range(L) if i not in gapped_positions]
        indices_i, indices_j = np.triu_indices(len(non_gapped_positions), k=1)

        # compute entropy correction (EC) term
        uij, scaling_factor = bu.compute_entropy_correction(
            single_freq, neff, lambda_w, braw.x_pair,
            entropy=True, squared=False, nr_states=20)
        ec_term = scaling_factor * np.sqrt(np.sum(uij, axis=(3, 2)))
        ec_term_ungapped = ec_term[non_gapped_positions, :]
        ec_term_ungapped = ec_term_ungapped[:, non_gapped_positions]

        # compute joint EC instead of geometric mean of per-column entropies
        # uij, scaling_factor = bu.compute_joint_entropy_correction(pair_freq, neff, lambda_w, braw.x_pair, nr_states=20)
        # ec_term = scaling_factor * uij
        # ec_term_ungapped = ec_term[non_gapped_positions, :]
        # ec_term_ungapped = ec_term_ungapped[:, non_gapped_positions]

        # compute contact matrix for ungapped positions
        cmat = bu.compute_l2norm_from_braw(braw.x_pair, apc=False, squared=False)

        # compute average product correction (APC) term
        cmat_ungapped = cmat[non_gapped_positions, :]
        cmat_ungapped = cmat_ungapped[:, non_gapped_positions]
        mean = np.mean(cmat_ungapped, axis=0)
        apc_term_ungapped = mean[:, np.newaxis] * mean[np.newaxis, :] / np.mean(cmat_ungapped)

        # scatter plot of APC term vs EC term
        plot_file = plot_dir + "/" + protein_name + "_apc_vs_ec.html"
        plot_scatter(
            apc_term_ungapped[indices_i, indices_j],
            ec_term_ungapped[indices_i, indices_j],
            ["i: " + str(i) + "<br>j: " + str(j) for i, j in zip(indices_i, indices_j)],
            plot_file)

        # compute pearson correlation coefficient between the two correction terms
        pearson_r_list.append(pearsonr(
            apc_term_ungapped[indices_i, indices_j],
            ec_term_ungapped[indices_i, indices_j])[0])

    # plot boxplot with jitter
    plot_file = plot_dir + "/boxplot_pearsonr_apc_vs_ec.html"
    plot_boxplot_correlation(pearson_r_list, proteins, plot_file)
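# The average product correction term computed above rescales a contact score matrix by
# the product of its column means divided by the overall mean:
#     apc[i, j] = mean_i(score) * mean_j(score) / mean(score)
# A minimal, self-contained sketch for a symmetric L x L score matrix is given below;
# the function name `apc_correction` is hypothetical and only illustrates this formula.
import numpy as np


def apc_correction(score_matrix):
    # column means and overall mean of the raw contact scores
    mean_per_column = np.mean(score_matrix, axis=0)
    apc_term = mean_per_column[:, np.newaxis] * mean_per_column[np.newaxis, :] / np.mean(score_matrix)
    # subtracting the APC term yields the corrected contact scores
    return score_matrix - apc_term


# usage: corrected = apc_correction(cmat_ungapped)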
def main():

    # ===============================================================================
    ### Parse arguments
    # ===============================================================================
    parser = argparse.ArgumentParser(description='plot statistics about dataset.')
    parser.add_argument("-d", "--dataset_files", type=str, help="path to directory with dataset description files")
    parser.add_argument("-a", "--alignments", type=str, help="path to directory with alignment files")
    parser.add_argument("-o", "--plot_out", type=str, help="path to directory where to put plot")

    args = parser.parse_args()

    plot_out = args.plot_out
    alignment_path = args.alignments
    dataset_files = args.dataset_files

    print("--------------------------------------------------------")
    print("plot_out: \t" + str(plot_out))
    print("path to alignment files: \t" + str(alignment_path))
    print("path to dataset files: \t" + str(dataset_files))
    print("--------------------------------------------------------")

    #plot_out = "/home/vorberg/work/plots/bayesian_framework/dataset_statistics/dataset_cath4.1/"
    #alignment_path = "/home/vorberg/work/data/benchmarkset_cathV4.1/psicov/"
    #dataset_files = "/home/vorberg/work/data/benchmarkset_cathV4.1/dataset/dataset_properties/"

    stats = {
        'protein': [],
        'diversity': [],
        'N': [],
        'L': [],
        'percent_gaps': []
    }

    if dataset_files is not None:

        dataset_folds = {}
        for file in glob.glob(dataset_files + "/*n5e01*"):
            id = os.path.basename(file).split("_")[2]
            dataset_folds[id] = pd.read_table(file, skipinitialspace=True)
            dataset_folds[id].columns = ['domain', 'resolution', 'CATH', 'L', 'N']

        stats['dataset'] = []
        stats['cath_topology'] = []

        cath_classes = {
            1: 'CATH class 1 (mainly alpha)',
            2: 'CATH class 2 (mainly beta)',
            3: 'CATH class 3 (alpha beta)'
        }

        for fold in dataset_folds.keys():
            for index, row in dataset_folds[fold].iterrows():
                protein = row['domain']
                cath = row['CATH']
                psicov_file = alignment_path + "/" + protein + ".filt.psc"

                # if it does not exist, it has been filtered due to
                # combs ambiguity or alignment filter
                if os.path.exists(psicov_file):
                    alignment = io.read_alignment(psicov_file)
                    L = len(alignment[0])
                    N = len(alignment)
                    percent_gaps = ali_ut.compute_gaps_per_position(alignment)
                    percent_gaps_alignment = np.mean(percent_gaps)

                    stats['protein'].append(protein)
                    stats['diversity'].append(np.sqrt(N) / L)
                    stats['N'].append(N)
                    stats['L'].append(L)
                    stats['percent_gaps'].append(percent_gaps_alignment)
                    stats['dataset'].append(fold)
                    stats['cath_topology'].append(cath_classes[int(cath.split(".")[0])])

        stats_df = pd.DataFrame(stats)

    if dataset_files is None:

        psicov_files = glob.glob(alignment_path + "/*")
        for psicov_file in psicov_files:
            protein = os.path.basename(psicov_file).split(".")[0]
            print(protein)

            alignment = io.read_alignment(psicov_file)
            L = len(alignment[0])
            N = len(alignment)
            percent_gaps = ali_ut.compute_gaps_per_position(alignment)
            percent_gaps_alignment = np.mean(percent_gaps)

            stats['protein'].append(protein)
            stats['diversity'].append(np.sqrt(N) / L)
            stats['N'].append(N)
            stats['L'].append(L)
            stats['percent_gaps'].append(percent_gaps_alignment)

        stats_df = pd.DataFrame(stats)

    # ===============================================================================
    ### Plot
    # ===============================================================================
    plot_boxplot_all_stats(stats_df, plot_out=plot_out + "/dataset_stats.html")

    plot_boxplot_for_statistic(
        stats_df, 'diversity', 'Distribution of Diversity (sqrt(N)/L)',
        jitter_pos=2, plot_out=plot_out + "/diversity_dataset_boxplot.html")

    plot_boxplot_for_statistic(
        stats_df, 'diversity', '',
        jitter_pos=2, plot_out=plot_out + "/diversity_dataset_boxplot_notitle.html")

    plot_boxplot_for_statistic(
        stats_df, 'N', 'Distribution of MSA size (# sequences)',
        jitter_pos=2, plot_out=plot_out + "/msa_size_dataset_boxplot.html")

    plot_boxplot_for_statistic(
        stats_df, 'N', '',
        jitter_pos=2, plot_out=plot_out + "/msa_size_dataset_boxplot_notitle.html")

    plot_boxplot_for_statistic(
        stats_df, 'L', 'Distribution of protein lengths',
        jitter_pos=2, plot_out=plot_out + "/protein_length_dataset_boxplot.html")

    plot_boxplot_for_statistic(
        stats_df, 'L', '',
        jitter_pos=2, plot_out=plot_out + "/protein_length_dataset_boxplot_notitle.html")

    plot_boxplot_for_statistic(
        stats_df, 'percent_gaps', 'Distribution of gap percentage',
        jitter_pos=2, plot_out=plot_out + "/gap_percentage_boxplot.html")

    plot_boxplot_for_statistic(
        stats_df, 'percent_gaps', '',
        jitter_pos=2, plot_out=plot_out + "/gap_percentage_boxplot_notitle.html")

    plot_stacked_barchart_cath(
        stats_df, 'Proportion of CATH classes in all datasets',
        plot_out=plot_out + "/cath_topologies_stacked_relative.html")

    plot_stacked_barchart_cath(
        stats_df, '',
        plot_out=plot_out + "/cath_topologies_stacked_relative_notitle.html")
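# The per-alignment statistics collected above are simple functions of the alignment
# matrix: diversity is sqrt(N)/L and the gap percentage is the mean fraction of gap
# characters per column. The sketch below is a minimal illustration, assuming the
# alignment is an N x L integer matrix with gaps encoded as 0 (as elsewhere in these
# scripts); the helper name `alignment_statistics` is hypothetical and stands in for
# ali_ut.compute_gaps_per_position plus the diversity formula.
import numpy as np


def alignment_statistics(alignment):
    N, L = alignment.shape
    diversity = np.sqrt(N) / L
    gaps_per_position = np.mean(alignment == 0, axis=0)  # fraction of gaps per column
    return diversity, np.mean(gaps_per_position)


# usage: diversity, mean_gap_fraction = alignment_statistics(io.read_alignment(psicov_file))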
def main():

    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting a contact map.')

    group_append = parser.add_mutually_exclusive_group(required=True)
    group_append.add_argument('-m', '--mat_file', type=str, dest='mat_file', help='path to mat file')
    group_append.add_argument('-b', '--braw_file', type=str, dest='braw_file', help='path to braw file')

    parser.add_argument("-o", "--plot-out", dest="plot_out", type=str, help="directory for plot")
    parser.add_argument("--seqsep", type=int, default=6, help="sequence separation")
    parser.add_argument("--contact_threshold", type=int, default=8,
                        help="contact definition; C_beta distance between residue pairs")
    parser.add_argument("--pdb_file", type=str, help="path to pdb file [optional] - plotting true contacts")
    parser.add_argument("--alignment_file", type=str, help="path to alignment file [optional] - plotting coverage")
    parser.add_argument("--apc", action="store_true", default=False, help="Apply average product correction")
    parser.add_argument("--entropy_correction", action="store_true", default=False, help="Apply entropy correction")

    args = parser.parse_args()

    if args.mat_file is None and args.braw_file is None:
        print("Either mat_file or braw_file needs to be set.")

    plot_out = args.plot_out
    seqsep = args.seqsep
    contact_threshold = args.contact_threshold
    apc = args.apc
    entropy_correction = args.entropy_correction
    alignment_file = args.alignment_file
    pdb_file = args.pdb_file

    ##### debugging
    # protein = "2hs1A"
    # topology = "binary"
    # topology = "star"
    # alignment_file = "/home/vorberg/work/data/ccmgen/psicov/alignments/" + protein + ".aln"
    # alignment_file = None
    # alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_incmr_pc100/" + protein + "." + topology + ".aln"
    # alignment_format = "psicov"
    # braw_file = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/ccmpred-pll-centerv/braw/" + protein + ".filt.braw.gz"
    # braw_file = "/home/vorberg/" + protein + ".gx.gz"
    # braw_file = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/count_correction/braw_ec_correction/" + protein + ".braw.ec.gz"
    # braw_file = "/home/vorberg/work/data/benchmarkset_cathV4.1/contact_prediction/count_correction/braw/" + protein + ".filt.braw.gz"
    # mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pll/" + protein + ".frobenius.apc.mat"
    # mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd/" + protein + ".frobenius.mat"
    # mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd_cheating_12_pc100/" + protein + ".frobenius.mat"
    # mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd_cheating_12_pc100/" + protein + ".frobenius.apc.mat"
    # mat_file = "/home/vorberg/work/data/ccmgen/psicov/predictions_pcd_cheating_12_pc100/" + protein + ".frobenius.ec.20.log2.mat"
    # mat_file = "/home/vorberg/work/data/ccmgen/psicov/recover_pcd_cheating_12_incmr_pc100/" + protein + "." + topology + ".frobenius.apc.mat"
    # mat_file = "/home/vorberg/work/data/ccmgen/psicov/recover_pcd_cheating_12_incmr_pc100/" + protein + "." + topology + ".frobenius.ec.20.log2.mat"
    # pdb_file = "/home/vorberg/work/data/ccmgen/psicov/pdb/" + protein + ".pdb"
    # pdb_file = None
    # seqsep = 4
    # seqsep = 1
    # contact_threshold = 8
    # plot_out = "/home/vorberg/work/plots/ccmgen/psicov/contact_maps/"
    # apc = True
    # entropy_correction = True

    ### Compute l2norm score from braw
    if args.braw_file is not None:
        braw_file = args.braw_file
        protein_name = '.'.join(os.path.basename(braw_file).split('.')[:-1])
        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta
        neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
        lambda_w = np.round(u.find_dict_key("lambda_pair", meta_info), decimals=3)

        if entropy_correction:
            alignment = io.read_alignment(alignment_file)
            single_freq, pair_freq = au.calculate_frequencies(alignment, au.uniform_pseudocounts)
            mat = bu.compute_corrected_mat_entropy(
                braw.x_pair, single_freq, neff, lambda_w,
                entropy=True, squared=False, nr_states=20)
        else:
            mat = bu.compute_l2norm_from_braw(braw, apc)

    ### Read score from mat
    if args.mat_file is not None:
        mat_file = args.mat_file
        mat = io.read_matfile(mat_file)
        if apc:
            mat = bu.compute_apc_corrected_matrix(mat)
        meta_info = io.read_json_from_mat(mat_file)
        protein_name = os.path.basename(mat_file).split('.')[0]

    correction = ""
    if apc:
        correction = "_apc"
    if entropy_correction:
        correction = "_ec"

    plot_file = plot_out + protein_name + "_seqsep" + str(seqsep) + \
                "_contacthr" + str(contact_threshold) + correction + ".html"

    neff = np.round(u.find_dict_key("neff", meta_info), decimals=3)
    N = u.find_dict_key("nrow", meta_info)
    L = u.find_dict_key("ncol", meta_info)
    title = protein_name + "<br>L: " + str(L) + " N: " + str(N) + " Neff: " + str(neff) + \
            " diversity: " + str(np.round(np.sqrt(N) / L, decimals=3))

    plot_contact_map(mat, seqsep, contact_threshold, title, plot_file,
                     alignment_file=alignment_file, pdb_file=pdb_file)
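# The score computed from a braw file above is a Frobenius norm (l2norm) of each 20 x 20
# coupling block, score[i, j] = sqrt(sum_ab w_ij(a, b)^2), optionally followed by APC or
# the entropy correction. A minimal sketch under the assumption that the couplings are
# stored as an L x L x 21 x 21 array of which only the first 20 amino acid states are
# used; the function name `frobenius_score` is hypothetical and only illustrates the idea.
import numpy as np


def frobenius_score(x_pair):
    # L2 norm over the 20 x 20 amino acid block of every residue pair
    return np.sqrt(np.sum(x_pair[:, :, :20, :20] ** 2, axis=(2, 3)))


# usage: mat = frobenius_score(braw.x_pair)
# (the APC variant would then subtract the apc_correction term sketched earlier)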
def main():

    ### Parse arguments
    parser = argparse.ArgumentParser(description='Plotting empirical vs model alignment statistics.')
    parser.add_argument("observed_alignment", type=str, help="path to original alignment file")
    parser.add_argument("sampled_alignment", type=str, help="path to sampled alignment file")
    parser.add_argument("plot_dir", type=str, help="path to output directory for plots")

    args = parser.parse_args()

    observed_alignment_file = args.observed_alignment
    sampled_alignment_file = args.sampled_alignment
    plot_dir = args.plot_dir
    max_gap_pos = 50

    ###### debugging
    # protein = "1bkrA"
    # observed_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/alignments/" + protein + ".aln"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/" + protein + ".star.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pcd_ccmgen_star/"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/" + protein + ".binary.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pcd_ccmgen_binary/"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd/" + protein + ".ind.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pcd/"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pll/" + protein + ".ind.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pll/"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_lfactor1e-3_cheating_12/" + protein + ".star.aln"
    # plot_dir = "/home/vorberg/work/plots/ccmgen/psicov/sampled_pcd_1e-3_cheating_12/"
    #
    # sampled_alignment_file = "/home/vorberg/work/data/ccmgen/psicov/sampled_pcd_cheating_12_incmr/" + protein + ".binary.aln"
    # plot_dir = "/home/vorberg/"
    #
    # sampled_alignment_file = "/home/vorberg/" + protein + ".binary.5.aln"
    # sampled_alignment_file = "/home/vorberg/" + protein + ".star.5.aln"
    # plot_dir = "/home/vorberg/"

    # read both alignments
    alignment_o = io.read_alignment(observed_alignment_file, max_gap_pos=100, max_gap_seq=100)
    L_original = alignment_o.shape[1]
    alignment_o, gapped_positions = io.remove_gapped_positions(alignment_o, max_gap_percentage=max_gap_pos)
    non_gapped_positions = [i for i in range(L_original) if i not in gapped_positions]

    alignment_s = io.read_alignment(sampled_alignment_file, max_gap_pos=100, max_gap_seq=100)
    alignment_s = np.ascontiguousarray(alignment_s[:, non_gapped_positions])

    print(alignment_o.shape, alignment_s.shape)

    # alignment dimensions
    N_o = alignment_o.shape[0]
    N_s = alignment_s.shape[0]
    L = alignment_o.shape[1]
    div = np.round(np.sqrt(N_o) / L, decimals=3)
    neff_weights_o = np.round(au.compute_neff(alignment_o), decimals=3)
    neff_weights_s = np.round(au.compute_neff(alignment_s), decimals=3)
    neff_entropy_o = np.round(au.compute_neff_hhblits(alignment_o), decimals=3)
    neff_entropy_s = np.round(au.compute_neff_hhblits(alignment_s), decimals=3)

    # compute amino acid counts only once
    single_freq_observed, pairwise_freq_observed = au.calculate_frequencies(alignment_o, au.uniform_pseudocounts)
    single_freq_sampled, pairwise_freq_sampled = au.calculate_frequencies(alignment_s, au.uniform_pseudocounts)

    # degap the frequencies (ignore gap frequencies)
    single_freq_observed = au.degap(single_freq_observed, False)
    single_freq_sampled = au.degap(single_freq_sampled, False)
    pairwise_freq_observed = au.degap(pairwise_freq_observed, False)
    pairwise_freq_sampled = au.degap(pairwise_freq_sampled, False)

    # prepare plot properties
    protein = os.path.basename(observed_alignment_file).split(".")[0]
    method = os.path.basename(sampled_alignment_file).split(".")[1]

    title = "Observed and model alignment statistics for {0}".format(protein)
    title += "<br>original: N={0}, L={1}, div={2}, neff(weights)={3}, neff(entropy)={4}".format(
        N_o, L, div, neff_weights_o, neff_entropy_o)
    title += "<br>sampled: N={0}, L={1}, neff(weights)={2}, neff(entropy)={3}".format(
        N_s, L, neff_weights_s, neff_entropy_s)
    #title = ""

    # plot in normal and in log space
    plot_out = plot_dir + "/" + protein + ".empirical_vs_model_alignment_stats_" + method + ".html"
    plot_empirical_vs_model_statistics(
        single_freq_observed, single_freq_sampled,
        pairwise_freq_observed, pairwise_freq_sampled,
        title=title, plot_out=plot_out, log=False, width=1200)

    plot_out = plot_dir + "/" + protein + ".empirical_vs_model_alignment_stats_" + method + "_log.html"
    plot_empirical_vs_model_statistics(
        single_freq_observed, single_freq_sampled,
        pairwise_freq_observed, pairwise_freq_sampled,
        title=title, plot_out=plot_out, log=True)
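# Two effective-sequence-number variants appear in the plot title above: a weight-based
# Neff (au.compute_neff) and an entropy-based one (au.compute_neff_hhblits). The sketch
# below only illustrates the common weight-based definition, where each sequence receives
# weight 1 / (number of sequences with >= 80% identity to it) and Neff is the sum of the
# weights. This is an assumption about the standard convention, not the exact
# implementation behind au.compute_neff; the function name `neff_from_weights` is
# hypothetical.
import numpy as np


def neff_from_weights(alignment, identity_cutoff=0.8):
    N, L = alignment.shape
    # pairwise fraction of identical columns between all sequence pairs (O(N^2 * L))
    pairwise_identity = np.array([
        np.mean(alignment == alignment[n], axis=1) for n in range(N)])
    # cluster size per sequence (each sequence is always similar to itself)
    n_similar = np.sum(pairwise_identity >= identity_cutoff, axis=1)
    return float(np.sum(1.0 / n_similar))


# usage: neff = neff_from_weights(alignment_o)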