def intialise_potentials(self):
    """Initialise ``self.x_single`` and ``self.x_pair``.

    If ``self.init_raw_file`` is set, the potentials are loaded from that
    binary raw file via ``raw.parse_msgpack``. When ``self.gapped_positions``
    is also set, the parameters belonging to those positions are removed.
    Otherwise the single potentials are taken from
    ``self.regularization.center_x_single`` and the pair potentials are an
    all-zero array of shape ``(self.L, self.L, 21, 21)``.

    In both cases ``self.single_potential_init`` and
    ``self.pair_potential_init`` record where the values came from.

    Exits the process with status 1 if the raw file cannot be parsed.
    """
    if self.init_raw_file is None:
        # Default initialisation of parameters:
        # single potentials come from the regularization prior ...
        self.x_single = self.regularization.center_x_single
        # ... and pair potentials start at zero.
        self.x_pair = np.zeros((self.L, self.L, 21, 21))

        # Save settings for meta data.
        self.single_potential_init = self.reg_type
        self.pair_potential_init = "zero"
        return

    try:
        raw_potentials = raw.parse_msgpack(self.init_raw_file)
    except Exception:
        # ``except Exception`` (not a bare ``except``) so that Ctrl-C and
        # SystemExit are not swallowed and misreported as a read error.
        print("Unexpected error whil reading binary raw file {0}: {1}".
              format(self.init_raw_file, sys.exc_info()[0]))
        # A failed load is an error: report it with a non-zero exit status.
        sys.exit(1)

    print("\nSuccessfully loaded model parameters from {0}.".format(
        self.init_raw_file))
    self.x_single, self.x_pair = raw_potentials.x_single, raw_potentials.x_pair

    # In case positions with many gaps should be removed.
    if self.gapped_positions is not None:
        # Build the set once so each membership test is O(1).
        gapped = set(self.gapped_positions)
        indices = [i for i in range(raw_potentials.ncol) if i not in gapped]

        self.x_single = self.x_single[indices, :]
        # Filter the first and the second position axis in two separate
        # steps; indexing both axes with the list at once would (for numpy
        # arrays) pair the indices element-wise instead of selecting the
        # sub-grid.
        self.x_pair = self.x_pair[indices, :, :, :]
        self.x_pair = self.x_pair[:, indices, :, :]
        print("Removed parameters for positions with >{0}% gaps.".format(
            self.max_gap_pos))

    # Save settings for meta data.
    self.single_potential_init = self.init_raw_file
    self.pair_potential_init = self.init_raw_file
def main():
    """Command line entry point: compute a contact score matrix and plot it.

    Exactly one of ``--mat-file`` / ``--braw-file`` must be given (enforced by
    a required, mutually exclusive argparse group). For a binary raw file the
    Frobenius score of the couplings is computed and optionally entropy- or
    APC-corrected; for a mat file the matrix is read and optionally
    APC-corrected. The resulting matrix is handed to ``plot_contact_map``
    together with the output path
    ``<plot_out>/contact_map_seqsep<seqsep>_contacthr<threshold>.html``.
    """
    parser = argparse.ArgumentParser(description='Plotting a contact map.')

    group_append = parser.add_mutually_exclusive_group(required=True)
    group_append.add_argument('-m', '--mat-file', dest='mat_file', type=str,
                              help='path to mat file')
    group_append.add_argument('-b', '--braw-file', dest='braw_file', type=str,
                              help='path to binary raw coupling file')

    # The output directory is concatenated into the plot path below, so it
    # has to be present. Requiring it here fails fast with a usage message
    # instead of a TypeError after all the scoring work has been done.
    parser.add_argument('-o', '--plot-out', dest='plot_out', type=str, required=True,
                        help='Output directory for plot')

    parser.add_argument('--seq-sep', dest='seqsep', type=int, default=6,
                        help='Minimal sequence separation')
    parser.add_argument('--contact-threshold', dest='contact_threshold', type=int, default=8,
                        help='Contact definition as maximal C_beta distance between residue pairs.')
    parser.add_argument('--pdb-file', dest='pdb_file', type=str,
                        help='Optional PDB file (renumbered starting from 1) for distance matrix.')
    parser.add_argument('--alignment-file', dest='alignment_file', type=str,
                        help='Optional alignment file for gap percentage and entropy subplot.')
    parser.add_argument("--aln-format", dest="aln_format", default="psicov",
                        help="File format for MSAs [default: \"%(default)s\"]")
    parser.add_argument("--apc", action="store_true", default=False,
                        help="Apply average product correction")
    parser.add_argument("--entropy-correction", dest='entropy_correction', action="store_true",
                        default=False, help="Apply entropy correction")

    # No manual "mat_file or braw_file" check is needed: the required
    # mutually exclusive group above already rejects a call with neither.
    args = parser.parse_args()

    mat_file = args.mat_file
    braw_file = args.braw_file
    alignment_file = args.alignment_file
    aln_format = args.aln_format
    pdb_file = args.pdb_file
    plot_out = args.plot_out
    seqsep = args.seqsep
    contact_threshold = args.contact_threshold

    apc = args.apc
    entropy_correction = args.entropy_correction

    alignment = None
    if alignment_file is not None:
        alignment = read_msa(alignment_file, aln_format)

        # compute sequence weights
        weighting = SequenceWeights(False, 0.8)
        weights = weighting.weights_simple(alignment)

        # compute frequencies
        pseudocounts = PseudoCounts(alignment, weights)
        pseudocounts.calculate_frequencies(
            'uniform_pseudocounts', 1, 1, remove_gaps=False
        )

    if braw_file is not None:
        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        # compute frobenius score from couplings
        mat = io.frobenius_score(braw.x_pair)

        if entropy_correction:
            if alignment is None:
                # Without an alignment there are no frequencies to correct
                # with; the uncorrected Frobenius score is plotted instead.
                print("Alignment file is necessary to compute entropy correction!")
            else:
                scaling_factor_eta, mat = io.compute_local_correction(
                    pseudocounts.freqs[0],
                    braw.x_pair,
                    meta_info['workflow'][0]['msafile']['neff'],
                    meta_info['workflow'][0]['regularization']['lambda_pair'],
                    mat,
                    squared=False, entropy=True
                )
        elif apc:
            mat = io.apc(mat)

    if mat_file is not None:
        mat, meta_info = io.read_matrix(mat_file)

        if entropy_correction:
            # Entropy correction needs the raw couplings, which a mat file
            # does not provide; the matrix is plotted as read.
            print("Binary Raw file is necessary to compute entropy correction!")
        elif apc:
            mat = io.apc(mat)

    plot_file = plot_out + "/contact_map_seqsep{0}_contacthr{1}.html".format(seqsep, contact_threshold)
    plot_contact_map(mat, seqsep, contact_threshold, plot_file, "", alignment=alignment, pdb_file=pdb_file)
def plot_contact_map(alignment_file, aln_format, braw_file, mat_file, pdb_file, plot_file,
                     entropy_correction, apc, seqsep, contact_threshold):
    """Build the contact score table for one protein and plot it.

    The score matrix comes from ``braw_file`` (Frobenius score of the
    couplings, optionally entropy- or APC-corrected) and/or from ``mat_file``
    (optionally APC-corrected); if both are given the mat file is read last
    and its matrix is the one plotted. An optional alignment provides the
    per-position gap subplot and the frequencies needed for the entropy
    correction; an optional PDB file adds observed distances and a binary
    contact column (``distance < contact_threshold``).

    Only residue pairs ``(i, j)`` with ``j - i >= seqsep`` are included;
    residue numbers in the table are 1-based.

    Exits the process with status 1 when entropy correction is requested
    without both an alignment and a binary raw file, or when neither
    ``braw_file`` nor ``mat_file`` is given.
    """
    pseudocounts = None
    mat = None
    gaps_percentage_plot = None
    protein = None

    if entropy_correction and (alignment_file is None or braw_file is None):
        print("Entropy correction requires specification of alignment file and binary raw couplign file!")
        sys.exit(1)

    # Without either input there is no score matrix; stop with a clear
    # message instead of failing later on ``len(None)``.
    if braw_file is None and mat_file is None:
        print("Either a binary raw coupling file or a mat file needs to be specified!")
        sys.exit(1)

    if alignment_file is not None:
        protein = os.path.basename(alignment_file).split(".")[0]
        alignment = io.read_msa(alignment_file, aln_format)

        # compute sequence weights
        weights = ccmpred.weighting.weights_simple(alignment, 0.8)

        # compute frequencies
        pseudocounts = PseudoCounts(alignment, weights)
        pseudocounts.calculate_frequencies(
            'uniform_pseudocounts', 1, 1, remove_gaps=False
        )

        gaps_percentage_plot = plot.plot_percentage_gaps_per_position(pseudocounts.counts[0], plot_file=None)

    if braw_file is not None:
        protein = os.path.basename(braw_file).split(".")[0]

        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        # compute frobenius score from couplings
        mat = io_cm.frobenius_score(braw.x_pair)

        if entropy_correction:
            # The guard at the top guarantees an alignment was read, so
            # ``pseudocounts`` is populated here.
            scaling_factor_eta, mat = io_cm.compute_local_correction(
                pseudocounts.freqs[0],
                braw.x_pair,
                meta_info['workflow'][0]['msafile']['neff'],
                meta_info['workflow'][0]['regularization']['lambda_pair'],
                mat,
                entropy=True
            )
        elif apc:
            mat = io_cm.apc(mat)

    if mat_file is not None:
        protein = os.path.basename(mat_file).split(".")[0]

        mat, meta_info = io_cm.read_matrix(mat_file)

        if apc:
            mat = io_cm.apc(mat)

    L = len(mat)
    # Upper triangle starting ``seqsep`` diagonals above the main diagonal.
    indices_upper_tri_i, indices_upper_tri_j = np.triu_indices(L, seqsep)

    plot_matrix = pd.DataFrame()
    plot_matrix['residue_i'] = indices_upper_tri_i + 1
    plot_matrix['residue_j'] = indices_upper_tri_j + 1
    plot_matrix['confidence'] = mat[indices_upper_tri_i, indices_upper_tri_j]

    if pdb_file is not None:
        # compute distance map from pdb file
        observed_distances = io.distance_map(pdb_file, L)
        plot_matrix['distance'] = observed_distances[indices_upper_tri_i, indices_upper_tri_j]
        # ``* 1`` turns the boolean contact flag into 0/1.
        plot_matrix['contact'] = ((plot_matrix.distance < contact_threshold) * 1).tolist()

    plot_title = "Contact Map for protein {0}".format(protein)

    # Plot Contact Map
    plot.plot_contact_map_someScore_plotly(plot_matrix, plot_title, seqsep, gaps_percentage_plot, plot_file)
def collect_data(braw_dir, psicov_dir, pdb_dir, sequence_separation, cb_lower, cb_upper,
                 nr_residue_pairs, diversity_threshold, Nij_threshold, l2normapc_threshold):
    """Collect coupling values of selected residue pairs over many proteins.

    Every ``*braw.gz`` file in ``braw_dir`` is processed; the protein name is
    the file's basename up to the first dot, and ``<protein>.aln`` /
    ``<protein>.pdb`` are looked up in ``psicov_dir`` / ``pdb_dir``. Proteins
    with a missing file or with ``sqrt(N) / L`` below ``diversity_threshold``
    are skipped.

    Residue pairs ``i < j`` at least ``sequence_separation`` apart are kept
    when, in this order, they
      * have a distance strictly between ``cb_lower`` and ``cb_upper``
        (positions whose gap fraction exceeds 0.5 are masked out),
      * have pairwise counts ``Nij > Nij_threshold``,
      * have an APC-corrected l2norm score ``> l2normapc_threshold``.

    For each surviving pair the ``[:20, :20]`` block of the pair couplings is
    flattened into one row of 400 columns named by ``io.AB``. Collection stops
    after the first protein that pushes the row count above
    ``nr_residue_pairs``.

    Returns a DataFrame with a fresh 0..n-1 index; it is empty when no pair
    qualified.
    """
    braw_files = glob.glob(braw_dir + "/*braw.gz")

    # Frames are gathered in a list and concatenated once at the end:
    # ``DataFrame.append`` no longer exists in pandas >= 2.0 and copied the
    # whole dataset on every iteration.
    protein_frames = []
    dataset_size = 0

    for braw_file in braw_files:
        protein = os.path.basename(braw_file).split(".")[0]
        print(protein)

        # ``os.path.join`` so the directories work with or without a
        # trailing path separator.
        alignment_file = os.path.join(psicov_dir, protein + '.aln')
        if not os.path.exists(alignment_file):
            print("Alignment File {0} does not exist.".format(alignment_file))
            continue

        pdb_file = os.path.join(pdb_dir, protein + '.pdb')
        if not os.path.exists(pdb_file):
            print("PDB File {0} does not exist.".format(pdb_file))
            continue

        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < diversity_threshold:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        # Mask highly gapped positions: NaN never satisfies the distance
        # comparisons below, so these pairs drop out.
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.5)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # get all residue pairs i<j
        residue_i, residue_j = np.triu_indices(AF.L, k=sequence_separation)

        # get residue pairs within Cb range
        dist_ij_pairs = distance_map[residue_i, residue_j]
        in_cb_range = (dist_ij_pairs > cb_lower) & (dist_ij_pairs < cb_upper)
        residue_i = residue_i[in_cb_range]
        residue_j = residue_j[in_cb_range]

        if len(residue_i) == 0:
            print("No residues left after applying distance constraints.")
            continue

        # apply Nij threshold
        enough_counts = AF.Nij[residue_i, residue_j] > Nij_threshold
        residue_i = residue_i[enough_counts]
        residue_j = residue_j[enough_counts]

        if len(residue_i) == 0:
            print("No residues left after applying pairwise counts constraints.")
            continue

        # compute l2norm_apc score that has mean=0
        l2norm_apc = bu.compute_l2norm_from_braw(braw.x_pair, apc=True)
        above_apc_threshold = l2norm_apc[residue_i, residue_j] > l2normapc_threshold
        residue_i = residue_i[above_apc_threshold]
        residue_j = residue_j[above_apc_threshold]

        if len(residue_i) == 0:
            print("No residues left after applying APC threshold constraints.")
            continue

        protein_coupling_df = pd.DataFrame(
            braw.x_pair[residue_i, residue_j, :20, :20].reshape(len(residue_i), 400),
            columns=io.AB)

        # NOTE: for reproducibility an earlier variant set all values between
        # -0.005 and 0.01 to zero at this point; that step is disabled.

        protein_frames.append(protein_coupling_df)
        dataset_size += len(protein_coupling_df)
        print("Dataset size: " + str(dataset_size))
        sys.stdout.flush()

        if dataset_size > nr_residue_pairs:
            break

    print("final dataset size: " + str(dataset_size))

    if not protein_frames:
        # ``pd.concat`` rejects an empty list; hand back an empty frame.
        return pd.DataFrame()

    return pd.concat(protein_frames, ignore_index=True)