def perturb_fragment(self, sample_index, random_fragment):
    """
    Insert a fragment into a copy of the current protein.

    A fresh Pose is assigned from self.prev_protein.pose and wrapped in a new
    Protein, which is stored on self.next_protein. The (phi, psi) pairs of
    random_fragment are then written to consecutive positions of that copy,
    beginning at sample_index.
    ---------
    Params:
        - sample_index (int): position that receives the first (phi, psi) pair
        - random_fragment: iterable of (phi, psi) pairs, one per position
    Returns:
        - None; the perturbed copy is left in self.next_protein
    """
    current = self.prev_protein

    # Duplicate the pose so the torsion edits are applied to a separate object
    pose_copy = Pose()
    pose_copy.assign(current.pose)
    self.next_protein = Protein(pose=pose_copy)

    # Log every torsion of the protein before the insertion
    debug('Previous Protein:', self.iter, 5)
    for pos in range(1, current.length+1):
        debug('\tPosition: {}\tAngle: {}'.format(pos, current.get_torsion(pos)), self.iter, 5)

    # Overwrite one position per (phi, psi) pair, walking forward from sample_index
    debug('random fragment: {}\nPosition: {}'.format(random_fragment, sample_index), self.iter, 5)
    for offset, (phi, psi) in enumerate(random_fragment):
        self.next_protein.set_torsion(sample_index + offset, phi, psi)

    # Log every torsion of the copy after the insertion
    debug('New Protein:', self.iter, 5)
    for pos in range(1, current.length+1):
        debug('\tPosition: {}\tAngle: {}'.format(pos, self.next_protein.get_torsion(pos)), self.iter, 5)
def perturb_fragment(self, protein, position):  # you may want to add more arguments
    """
    Pick a not-yet-tried candidate fragment for a position and apply its
    torsion angles to a new Protein built from the input protein's pose.
    ---------
    Params:
        - protein = an input protein object that you want to copy and perturb
        - position = a position (1-indexed) in the input protein at which
          you would like to start the perturbation
    Returns:
        - perturbed fragment (a protein object)
    """
    # Fragment indices still available at this position during the current step
    untried = set(range(self.nfrags)) - self.sampled_fragments[position]

    # Draw one of them at random and remember that it has now been used
    pick = random.choice(list(untried))
    self.sampled_fragments[position].add(pick)

    # Write the fragment's angle pairs onto consecutive positions
    angle_pairs = self.candidate_frag_list[position][pick]
    result = Protein(pose=protein.pose)
    for offset in range(len(angle_pairs)):
        first_angle, second_angle = angle_pairs[offset][0], angle_pairs[offset][1]
        result.set_torsion(position + offset, first_angle, second_angle)
    return result
def AddSeq(self, pseq):
    """
    Add a sequence to the database unless it is invalid or already stored.

    Params:
        - pseq: raw sequence handed to the Protein constructor
    Returns:
        - True when the sequence was stored under the next 1-based integer
          key, False when it is invalid or duplicates a stored sequence
    """
    candidate = Protein(pseq)
    if not candidate.isvalid():
        return False

    # Reject sequences that are already present in the database
    if any(candidate.seq == stored.seq for stored in self.dseqs.values()):
        return False

    self.dseqs[len(self.dseqs)+1] = candidate  # 1-based integers
    return True
def getBasicProteins(self, splittedLine):
    """
    Create one Protein per even-indexed entry of splittedLine.

    Each protein gets the accession number extracted from its entry by
    Helper.retrieveAccessionNumber and has this object set as its
    orthologGroup.

    Returns the list of created proteins, in input order.
    """
    def make_entry(token_index):
        entry = Protein()
        entry.accession = Helper.retrieveAccessionNumber(splittedLine[token_index])
        entry.orthologGroup = self
        return entry

    # Only every second element (indices 0, 2, 4, ...) is used
    return [make_entry(k) for k in range(0, len(splittedLine), 2)]
def __init__(self, params):
    """
    Load the ligand and receptor structures named in params.

    Reads params.ligand_file_name and params.receptor_file_name; when
    params.energy_type is "vdw" the "CA"/"CB" atom indexes of both
    structures are looked up through self.get_index.
    """
    # Ligand first, then receptor, each imported from its PDB file
    self.ligand = Protein()
    self.ligand.import_pdb(params.ligand_file_name)

    self.receptor = Protein()
    self.receptor.import_pdb(params.receptor_file_name)

    self.cg_atoms = []

    if params.energy_type != "vdw":
        return
    self.index_ligand, self.index_receptor = self.get_index(["CA", "CB"])
def __init__(self, sequence, steps=10000, temp_min=0.15, temp_max=1.0,
             temp_delta=0.05, save_interval=100):
    """
    Store the run settings and build the protein for the given sequence.

    The current temperature (self.temp) starts at temp_max and no best
    result is recorded yet (self.best is None).
    """
    self.protein = Protein(sequence)

    # Run length
    self.steps = steps

    # Temperature settings; the current temperature starts at the maximum
    self.temp_min = temp_min
    self.temp_max = temp_max
    self.temp_delta = temp_delta
    self.temp = temp_max

    # Bookkeeping
    self.save_interval = save_interval
    self.best = None
def user_proteins(self):
    """
    Build a Protein for every name entry, paired by index with the matching
    sequence entry; both values are read through the entries' get() method.

    Returns the proteins as a list, in entry order.
    """
    # Index-based on purpose: the number of name entries drives the loop
    return [
        Protein(self._protein_names[idx].get(), self._protein_sequences[idx].get())
        for idx in range(len(self._protein_names))
    ]
def test_values(self):
    """distance() must raise ValueError when the other protein holds a bad value."""
    reference = Protein("SelfProt", 2, 0, 1, 1, 0, 0, 2, 0.15, 1, 0.076)

    # Make sure errors are raised when necessary: the second constructor
    # argument is, in turn, a negative number, a boolean and a string.
    for bad_value in (-3, True, "twenty"):
        faulty = Protein("IncorrectProt", bad_value, 0, 0, 1, 0, 3, 2, 0.24, 2, 0.016)
        self.assertRaises(ValueError, reference.distance, faulty)
def LoadFile(self, filename, filetype="text", sep=None):
    """
    Load protein sequences from a file into self.dseqs.

    Params:
        - filename: path of the file to read
        - filetype: "fasta" (records introduced by ">" header lines) or
          "text" (one sequence per record); any other value loads nothing
        - sep: optional record separator; when given, the file content is
          split on it instead of on line breaks
    Returns:
        - False as soon as a "text" record is not a valid sequence (records
          stored before it are kept), True in every other case
    """
    # Read everything inside a context manager so the handle is always closed.
    with open(filename, "r") as handle:
        if sep is None:
            lines = handle.readlines()
        else:
            # readlines() returns a list, which has no split(); split the
            # raw text on the separator instead.
            lines = handle.read().split(sep)

    def store(sequence):
        """Store *sequence* under the next 1-based key if it is valid."""
        pt = Protein(sequence)
        # isvalid must be *called* (as AddSeq does); the bare attribute is
        # always truthy and would let every sequence through.
        if pt.isvalid():
            self.dseqs[len(self.dseqs)+1] = pt
            return True
        return False

    if filetype == "fasta":
        seen_header = False
        temp = []
        for line in lines:
            if line.startswith(">"):
                # A header closes the previous record, except for the very
                # first header, which has no record before it.
                if seen_header:
                    store("".join(temp))  # only adds valid seqs
                    temp = []
                seen_header = True
            else:
                temp.append(line.strip("\n"))
        if len(temp) != 0:
            store("".join(temp))  # only adds valid seqs
        self.history.enqueue(("LoadFile", filename, "fasta", sep))
    elif filetype == "text":
        for line in lines:
            sequence = line.strip("\n")
            if not sequence:
                # blank records (e.g. a trailing empty line) carry no sequence
                continue
            # Store the Protein object, like every other entry of dseqs
            if not store(sequence):
                return False
    return True
def readPDBFromStream(stream: Base.IOStream):
    """
    Read one molecule in PDB format from *stream* and wrap it in a Protein.

    The molecule is passed through sanitize_mol (with
    makeHydrogenComplete=True) before the Protein is created.
    """
    # Imported here, at call time, rather than at module level
    from Protein import Protein
    from MoleculeTools import sanitize_mol

    pdb_reader = Biomol.PDBMoleculeReader(stream)
    molecule = Chem.BasicMolecule()
    pdb_reader.read(molecule)

    sanitize_mol(molecule, makeHydrogenComplete=True)
    return Protein(molecule)
def __init__(self, location, inputFile, outputDir=None, cns=False, reject=None,
             angleOnly=False, ppm=False, progressBar=None, writePgm=True):
    """
    Run the whole DANGLE pipeline for one input file.

    Sets up the Reference from the directory of *location*, reads the query
    shifts from *inputFile*, searches the shift database and finally stores
    the predictions in self.predictions.
    """
    self.input = inputFile
    self.progressBar = progressBar

    # Single-argument print calls behave the same as the print statement
    print('DANGLE (version 1.1)')
    print(DANGLE_CITE)

    # 1. read config file for location of reference information
    reference = Reference(os.path.dirname(location))
    self.reference = reference

    reference.outDir = outputDir or OUTDIR
    if not os.path.isdir(reference.outDir):
        os.makedirs(reference.outDir)

    reference.cns = cns
    reference.ppm = ppm
    reference.angleOnly = angleOnly
    if reject is not None:
        reference.rejectThresh = reject

    # 2. read shifts of query protein (input) and calculate secondary shifts
    query = Protein(reference)
    self.query = query
    query.readShiftsFromXml(inputFile)

    # 3. compare with DB
    print('STEP1: Shift search')
    self.topMatches = self.compareWithShiftDB()

    # 4. make predictions from scorograms
    print('STEP2: GLE generation')
    self.predictor = Predictor(query, self.topMatches, reference, writePgm)
    self.predictions = self.predictor.predictPhiPsiFromDatabaseMatches(
        progressBar=self.progressBar)
def step(self):
    """
    Take a single MCMC step.

    1. draw a start position between 1 and (protein length - k), inclusive
    2. fetch the self.N lowest-RMSD candidate fragments for that position
    3. repeatedly draw a candidate at random, insert it into a copy of the
       protein (self.perturb_fragment) and score the copy
    4. apply the Metropolis criterion:
        - if accepted: anneal the temperature, adopt the copy as the new
          current protein, update the best pose/score and stop
        - if rejected: draw again, until every candidate index has been
          drawn at least once
    """
    # Sample an eligible position within the original protein
    sample_index = random.randint(1, (int(self.prev_protein.length) - self.k))

    # Candidate fragments with lowest rmsd values
    debug('\nn value: {}, sample index: {}'.format(self.N, sample_index), self.iter, 5)
    candidate_fragments = self.fragset.get_lowRMS_fragments(sample_index, self.N)
    debug('\ncandidate_fragments: {}\n'.format(candidate_fragments), self.iter, 5)

    n_candidates = len(candidate_fragments)
    drawn = set()  # candidate indices drawn so far (draws may repeat)
    while len(drawn) < n_candidates:
        choice = random.randint(0, (n_candidates-1))
        drawn.add(choice)

        # Insert the chosen fragment into a copy and score that copy
        self.perturb_fragment(sample_index, candidate_fragments[choice])
        self.new_energy = self.compute_energy(self.next_protein)

        if not self.metropolis_accept():
            debug('Failed Metropolis...\nProbability: {}'.format(self.prob_accept), self.iter, 5)
            continue

        debug('Passed Metropolis!!\nProbability: {}'.format(self.prob_accept), self.iter, 5)
        self.anneal_temp()

        # Adopt an independent copy of the accepted conformation
        accepted_pose = Pose()
        accepted_pose.assign(self.next_protein.pose)
        self.prev_protein = Protein(pose=accepted_pose)
        self.old_energy = self.new_energy

        # Track the best (lowest-energy) conformation seen so far
        if self.new_energy < self.best_score:
            self.best_score = self.new_energy
            self.best_pose = Pose()
            self.best_pose.assign(self.next_protein.pose)
        return
def perturb_fragment(
    self, pos: int, mer: str = "9mers", protein: Union[Protein, None] = None
) -> Tuple[Protein, int]:  # you may want to add more arguments
    """
    Sample from possible fragments for a position, and replace torsion angles
    of that fragment in the protein.

    The candidates are taken from self.candidate_frag[mer][pos], which is
    expected to have been filled beforehand (call get_lowRMS just once).

    :param pos: position to change
    :param mer: mode of function, either "3mers" or "9mers"
    :param protein: optional parameter, if None, use self.protein
    :return: tuple of (new Protein with updated angles, index of the chosen
        fragment as reported by findFragIndex)
    """
    # Compare against None explicitly: a truthiness test would silently fall
    # back to self.protein for any Protein object that evaluates as falsy.
    source = self.protein if protein is None else protein
    new_protein = Protein(pose=source.pose)

    # sample candidate fragment
    candidates = self.candidate_frag[mer][pos]
    random_index = random.randint(0, len(candidates) - 1)
    frag_chosen = candidates[random_index]
    frag_index = self.fragment_set[mer].findFragIndex(pos, frag_chosen)

    # insert this fragment and return
    frag_length = 9 if mer == "9mers" else 3
    for i in range(frag_length):
        new_protein.set_torsion(pos + i, frag_chosen[i][0], frag_chosen[i][1])
    return new_protein, frag_index
def test_protein_move_shoul_can_be_reverted(self):
    """A move changes the protein and undo_move restores the previous state."""
    p = Protein('HHHPPPHHH')
    p_copy = deepcopy(p)  # snapshot of the state before the move

    p.move()
    self.assertNotEqual(p, p_copy)

    p.undo_move()
    # assertEqual, not the deprecated assertEquals alias (removed in Python 3.12)
    self.assertEqual(p, p_copy)
def make_proteins():
    """Append the six predefined proteins to the module-level `proteins` list."""
    definitions = (
        ("food_perception", "ST"),
        ("poison_perception", "AI"),
        ("red_tail+", "QR"),
        ("red_tail-", "RR"),
        ("green_tail+", "PR"),
        ("green_tail-", "LR"),
    )
    for label, code in definitions:
        proteins.append(Protein(label, code))
class Data:
    """Holds the ligand and receptor structures and their atom indexes."""

    # Class-level defaults; __init__ rebinds cg_atoms per instance, and the
    # two index lists are rebound only when the energy type is "vdw".
    index_ligand = []
    index_receptor = []
    cg_atoms = []

    def __init__(self, params):
        """
        Load ligand and receptor from the PDB files named in params.

        Reads params.ligand_file_name, params.receptor_file_name and
        params.energy_type.
        """
        self.ligand = Protein()
        self.ligand.import_pdb(params.ligand_file_name)

        self.receptor = Protein()
        self.receptor.import_pdb(params.receptor_file_name)

        self.cg_atoms = []

        if params.energy_type == "vdw":
            [self.index_ligand, self.index_receptor] = self.get_index(["CA", "CB"])

    def get_index(self, atoms=None):
        """
        Return the indexes of the named atoms in ligand and receptor.

        Params:
            - atoms: iterable of atom names; defaults to ["CA", "CB"]
        Returns:
            - [ligand_index, receptor_index], two flat lists holding the
              indexes selected for each atom name, in the order of *atoms*
        """
        # None instead of a list literal: a mutable default would be one
        # object shared by every call.
        if atoms is None:
            atoms = ["CA", "CB"]

        # generate a dummy assembly and extract the indexes where atoms of interest are located
        assembly = A.Assembly(self.ligand, self.receptor)
        assembly.place_ligand(np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))

        ligand_index = []
        receptor_index = []
        for aname in atoms:
            # append indexes of an element in atoms list for ligand
            [m, index] = assembly.atomselect_ligand("*", "*", aname, True)
            ligand_index.extend(index)

            # append indexes of an element in atoms list for receptor
            [m, index] = assembly.atomselect_receptor("*", "*", aname, True)
            receptor_index.extend(index)

        return [ligand_index, receptor_index]
def __init__(self, num, modelPath, partial):
    """
    Create an empty fragment record.

    The base name is "Frag" followed by *num* formatted with the "04n"
    format spec; the base path is that name joined onto *modelPath*.
    """
    fragment_name = "Frag{:04n}".format(num)

    # Identity and location
    self.num = num
    self.basename = fragment_name
    self.basepath = path.join(modelPath, fragment_name)

    # Containers that start out empty
    self.values = FragValues()
    self.protein = Protein()
    self.center = IndexedCoM()
    self.resCenters = []
    self.stat = FragStatistics()
    self.atomCount = 0
    self.residues = []
    self.diffMat = []
    self.sdf = []

    self.partial = partial
class Structure():
    """A monomeric unit: its PDB file, coordinates and optional flexibility data."""

    def __init__(self):
        # Every field starts as the placeholder string "NA" until it is loaded
        for field in ("monomer",           # the monomeric unit
                      "pdb_file_name",
                      "index_CA_monomer",
                      "flexibility",
                      "init_coords"):
            setattr(self, field, "NA")

    def read_pdb(self, pdb):
        """Load the monomer from the PDB file *pdb* and cache its coordinates."""
        self.pdb_file_name = pdb
        loaded = Protein()
        self.monomer = loaded
        loaded.import_pdb(pdb)
        self.init_coords = loaded.get_xyz()

    def compute_PCA(self, topology, trajectory, align, ratio, mode, proj_file):
        """Create a Flexibility_PCA object and compute its eigenvectors."""
        pca = F.Flexibility_PCA()
        self.flexibility = pca
        pca.compute_eigenvectors(topology, trajectory, align, ratio, mode, proj_file)

    def setCoords(self):
        """Refresh init_coords from the monomer's current coordinates."""
        self.init_coords = self.monomer.get_xyz()
def __init__(self, data_path, prot_len_file_name, with_overlap,
             with_redundant, with_gap, interpro_local_format):
    """
    Preprocess class init

    Parameters
    ----------
    data_path : str
        full data path
    prot_len_file_name : str
        file name containing protein length information
    with_overlap : bool
        output overlapping domain annotation (True), otherwise not
        overlapping domain annotation will be created (False)
    with_redundant : bool
        if with_overlap is False then create non overlapping (but possibly
        redundant) domains (True), otherwise create non overlapping and non
        redundant domain annotation (False)
    with_gap : bool
        add GAP domain for each protein subsequence >30 amino acids without
        domain hit (True), otherwise don't add GAP domain (False)
    interpro_local_format : bool
        preprocess output format produced by local interproscan run (True),
        otherwise preprocess Interpro downloaded protein2ipr format (False)

    Returns
    -------
    None
    """
    # Input locations
    self.data_path = data_path
    self.prot_len_file_name = prot_len_file_name

    # Domain-annotation switches
    self.with_overlap = with_overlap
    self.with_redundant = with_redundant
    self.with_gap = with_gap

    # Placeholder protein built from the switches only, plus the batch of
    # proteins collected so far
    self.last_protein = Protein(with_overlap, with_redundant, with_gap)
    self.proteins = []

    self.interpro_local_format = interpro_local_format
    self.num_prot_with_no_interpro = 0
def __init__(self, parent=Protein(), name="", id=0, atoms=None,
             N=(0, 0, 0), C_alpha=(0, 0, 0), C_dash=(0, 0, 0),
             coordinates=None, SS=""):
    """
    Store the attributes of one residue-like record.

    Params:
        - parent: owning Protein. NOTE(review): the default Protein() is
          evaluated once, when the function is defined, so every instance
          created without an explicit parent shares that one object.
        - name, id: identifier values stored as given
        - atoms: list of atoms; a new empty list per instance when omitted
        - N, C_alpha, C_dash: 3-tuples stored as given
        - coordinates: list of coordinates; a new empty list per instance
          when omitted
        - SS: stored as given (empty string by default)
    """
    self.parent = parent
    self.name = name
    self.id = id
    # A list literal as default would be shared by all instances, so that
    # appending to one instance's atoms would change every other instance.
    self.atoms = [] if atoms is None else atoms
    self.N = N
    self.C_alpha = C_alpha
    self.C_dash = C_dash
    self.coordinates = [] if coordinates is None else coordinates
    self.SS = SS
def parseFile(self, way):
    """
    Parse a FASTA-style file into a list of Protein objects.

    The file is opened and registered through self.setFile, then read back
    through self.getFile. A line starting with '>' is a header: it becomes
    the name (leading '>' included) of the record formed by the lines that
    follow it. Records with an empty sequence are skipped.

    Params:
        - way: path of the file to read
    Returns:
        - list of Protein(name, sequence) objects in file order
    """
    # The handle is handed to the object via setFile and, as before, is not
    # closed here.
    self.setFile(open(way, "r"))

    records = []
    name = ""
    sequence = ""
    for line in self.getFile().readlines():
        # Strip only the line break; slicing off the last character would
        # cut a real symbol from a final line that has no trailing newline.
        text = line.rstrip("\n")
        if line.startswith('>'):
            if sequence != "":
                records.append(Protein(name, sequence))
                sequence = ""
            name = text
        else:
            sequence += text

    # The last record has no following header to trigger its creation
    if sequence != "":
        records.append(Protein(name, sequence))
    return records
def test_distance(self):
    """distance() returns the expected value for two well-formed proteins."""
    # Test for all dimensions.
    # Protein(name, C2H2, C2WH2, GATA3, CCHC, ZN2C6, zinc, prot_len, pos, num_chain, hys_cys)
    reference = Protein("SelfProt", 2, 0, 1, 1, 0, 0, 2, 0.15, 1, 0.076)

    cases = (
        # protein 2 is correct
        (("CorrectProt", 1, 0, 0, 1, 0, 3, 2, 0.24, 2, 0.016), 6.15),
        # protein 2 is all zeros
        (("CorrectProt", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 7.226),
    )
    for constructor_args, expected in cases:
        other = Protein(*constructor_args)
        self.assertAlmostEqual(reference.distance(other), expected)
def __init__(self,params):
    """
    Create a Structure for every monomer listed in params.

    Reads from params: monomer_file_name, assembly_style, trajectory,
    topology, align, ratio, mode and proj_file. Exits the process with
    status 1 if an ImportError is raised while handling a trajectory.
    Written for Python 2 (print statement, "except X, e" syntax).
    """
    self.structure_hash = {}
    self.structure_list = []
    # Maps len(s.monomer.get_xyz()) -> [structure, nickname].
    # NOTE(review): two structures with the same length share a key, so the
    # later one replaces the earlier one -- confirm whether this is intended.
    volume_structure_hash = {} # this is used to get the biggest structure

    # GIORGIO_CODE create structure instances of the rigid monomers
    for nickName,pdb_file in params.monomer_file_name:
        # create instance of the structure class
        s = Structure()
        s.read_pdb (pdb_file)
        volume_structure_hash[len(s.monomer.get_xyz())] = [s, nickName]

    # create structure instance for the flexible monomers
    if params.assembly_style=="flexible":
        print ">> flexible docking requested for structures, launching PCA..."

        for nickName, traj_file in params.trajectory:
            try:
                # get the topology file:
                # NOTE(review): if no nickname matches, top_file keeps the
                # value of the last topology entry -- confirm this is acceptable.
                for nickName2, top_file in params.topology:
                    if nickName2 == nickName:
                        break

                # create the structure and compute the PCA
                s = Structure()
                s.compute_PCA(top_file, traj_file, params.align, params.ratio, params.mode, params.proj_file)
                s.read_pdb("protein.pdb")
                volume_structure_hash[len(s.monomer.get_xyz())] = [s, nickName]

            except ImportError, e:
                sys.exit(1)

            # TODO: work on the deform mode, but ask Matteo before
            # NOTE(review): the nesting of this block is assumed (the source
            # layout was lost); self.ligand is not assigned anywhere in this
            # method, so the last call below needs it to be set elsewhere.
            if params.mode=="deform":
                self.structure_ligand=Protein()
                self.structure_ligand.import_pdb("protein.pdb")
                self.ligand.import_pdb("CA.pdb")
def FindInDB(self, dbID):
    """
    Show a stored sequence and interactively offer to update it.

    Prints the sequence size and its first 1000 symbols, optionally replaces
    the sequence (the key is kept), prints the external database IDs and
    optionally adds a new one. All questions are asked through input().

    Params:
        - dbID: key of the entry in self.dseqs
    Returns:
        - False when dbID is not in the database; None otherwise
    """
    if dbID not in self.dseqs:
        print("Invalid ID")
        return False

    sequence = self.dseqs[dbID].seq
    print("Sequence size:", len(sequence))
    # Slice rather than index: seq[min(1000, len)] picked a single symbol
    # and raised IndexError for every sequence shorter than 1000.
    print("Sequence (First 1k bp)):", sequence[:1000])

    if input("Update Sequence? [y/n]: ") == "y":
        new_seq = input("New sequence (key number maintained): ")
        pt = Protein(new_seq)
        # isvalid must be called; the bare method object is always truthy
        if pt.isvalid():
            self.dseqs[dbID] = pt
        else:
            print("Invalid sequence")

    # Look the entry up again: it may just have been replaced
    entry = self.dseqs[dbID]
    if len(entry.otherDBs.values()) > 0:
        print(entry.otherDBs)
    else:
        print("No external DB IDs")

    if input("Update external DB IDs? [y/n]: ") == "y":
        new_db = input("Database: ")
        new_id = input("Database ID: ")
        # Both answers must be longer than two characters to be accepted
        if len(new_db) > 2 and len(new_id) > 2:
            entry.addotherDBs(new_db, new_id)
        else:
            print("Invalid input")
def relax(pdb, native, scorefxn=scorefxn_fa):
    """
    Performs energy minimization using Rosetta FastRelax protocol,
    superimpose onto native structure, and calculate RMSD
    --------
    Params
        - pdb (str): path to input structure in PDB format
        - native (str): path to native structure in PDB format
        - scorefxn (ScoreFunction): energy function to use in scoring. Either
          centroid ('score3') or full-atom ('fa_standard'), default full-atom
    Returns
        - Protein object representing relaxed structure
        - RMSD between input and native (float)
        - Score after minimization (float)
    """
    # Output names are derived by dropping the last four characters
    # (the ".pdb" extension) of each input path.
    relaxed_pdb_path = "%s_fast_relax.pdb" % (pdb[:-4])
    native_stem = native[:-4]

    pose = pose_from_pdb(pdb)
    print('initial score', score_pose(pose, scorefxn))

    # Switch the pose to the full-atom residue type set before relaxing
    SwitchResidueTypeSetMover('fa_standard').apply(pose)

    fast_relax = FastRelax()  # ClassicRelax()
    fast_relax.set_scorefxn(scorefxn)
    fast_relax.apply(pose)

    score = score_pose(pose, scorefxn)
    print('final score', score)
    pose.dump_pdb(relaxed_pdb_path)

    # The native structure is relaxed and written out as well
    native_pose = pose_from_pdb(native)
    fast_relax.apply(native_pose)
    native_pose.dump_pdb("%s_fast_relax.pdb" % native_stem)

    # RMSD is measured between the relaxed input and the native file as given
    rmsd = superimpose_rmsd(relaxed_pdb_path, "%s.pdb" % native_stem)
    print('RMSD to native', rmsd)

    return Protein(pose=pose), rmsd, score
def storeSim(self, best_pdb: Protein, log: dict, sim_index: int) -> Tuple[str, str, int]:
    """
    Store best pdb and log text file to log

    :param best_pdb: the structure to store as "best.pdb"
    :param log: log dict to store
    :param sim_index: int of simulation number
    :return path to sim folder, path to log folder, sim
    """
    def ensure_folder(folder: str) -> str:
        # Create the folder only when missing, to avoid a path-exists error
        if not os.path.exists(folder):
            os.mkdir(folder)
        return folder

    # Log folder: the configured one, or "<protein name>_log" in the cwd
    if self.logdir:
        log_folder_path = self.logdir
    else:
        log_folder_path = os.path.join(os.getcwd(), self.protein_name + "_log")
    ensure_folder(log_folder_path)

    # One sub-folder per simulation
    sim_folder_name = "sim_" + self.__toStr__(sim_index)
    sim_folder_path = ensure_folder(os.path.join(log_folder_path, sim_folder_name))

    def in_sim_folder(file_name: str) -> str:
        return os.path.join(sim_folder_path, file_name)

    # Structures: initial, target and best
    self.initial_protein.save_pdb(in_sim_folder("initial.pdb"))
    Protein(pose=self.target_pose).save_pdb(in_sim_folder("target.pdb"))
    best_pdb.save_pdb(in_sim_folder("best.pdb"))

    # Text log
    self.savelog(log, in_sim_folder(sim_folder_name + "_log.txt"))

    return sim_folder_path, log_folder_path, sim_index
class MCMCSampler(object):
    """Fragment-insertion MCMC sampler with a multiplicative annealing schedule."""

    def __init__(self, seq_protein, k, T_start, T_end, fragset, N, anneal_rate):
        """
        Store the sampler settings and score the starting protein.

        The score function is the Rosetta centroid score function ('score3').
        """
        self.scorefxn = create_score_function('score3')

        # Current protein plus an independent copy of its pose as "best so far"
        self.prev_protein = seq_protein
        self.best_pose = Pose()
        self.best_pose.assign(self.prev_protein.pose)

        # Fragment settings
        self.fragset = fragset
        self.k = k
        self.N = N
        self.anneal_rate = anneal_rate

        # Best and current energy both start at the energy of the input protein
        self.best_score = self.compute_energy(seq_protein)
        self.old_energy = self.compute_energy(seq_protein)

        # Temperature range and bookkeeping
        self.T = T_start
        self.T_end = T_end
        self.prob_accept = 0
        self.iter = 0

        # For graphing
        self.lst_energy = []

    def compute_energy(self, protein):
        """
        Compute energy of protein.
        --------
        Params:
            - protein (Protein object): protein to score
        Return:
            - energy of conformation (float)
        """
        energy = self.scorefxn(protein.pose)
        return energy

    def perturb_fragment(self, sample_index, random_fragment):
        """
        Insert a fragment into a copy of the current protein.
        ---------
        Params:
            - sample_index: position that receives the first (phi, psi) pair
            - random_fragment: iterable of (phi, psi) pairs, one per position
        Returns:
            - None; the perturbed copy is stored in self.next_protein
        """
        current = self.prev_protein

        # Duplicate the pose so the edits are applied to a separate object
        pose_copy = Pose()
        pose_copy.assign(current.pose)
        self.next_protein = Protein(pose=pose_copy)

        debug('Previous Protein:', self.iter, 5)
        for pos in range(1, current.length+1):
            debug('\tPosition: {}\tAngle: {}'.format(pos, current.get_torsion(pos)), self.iter, 5)

        # One position per (phi, psi) pair, walking forward from sample_index
        debug('random fragment: {}\nPosition: {}'.format(random_fragment, sample_index), self.iter, 5)
        for offset, (phi, psi) in enumerate(random_fragment):
            self.next_protein.set_torsion(sample_index + offset, phi, psi)

        debug('New Protein:', self.iter, 5)
        for pos in range(1, current.length+1):
            debug('\tPosition: {}\tAngle: {}'.format(pos, self.next_protein.get_torsion(pos)), self.iter, 5)

    def metropolis_accept(self):  # you may want to add more arguments
        """
        Decide whether to accept the proposed move (Metropolis criterion).

        Compares self.new_energy with self.old_energy and stores the
        acceptance probability in self.prob_accept.
        --------
        Returns:
            - True if the move is accepted, False otherwise
        """
        delta_e = self.new_energy - self.old_energy

        # The random number is drawn up front, whether or not it is needed
        rand_num = np.random.rand()

        # A move that does not raise the energy is always accepted
        if delta_e <= 0:
            self.prob_accept = 1
            return True

        # Otherwise accept with probability exp(-delta_e / T)
        self.prob_accept = np.exp(-delta_e / self.T)
        return bool(self.prob_accept > rand_num)

    def anneal_temp(self):
        """
        Anneal temperature using exponential annealing schedule: the
        temperature is multiplied by self.anneal_rate. kT is treated as a
        single variable (i.e. the Boltzmann constant is ignored).
        """
        self.T = self.anneal_rate * self.T

    def step(self):
        """
        Take a single MCMC step.

        1. draw a start position between 1 and (protein length - k), inclusive
        2. fetch the self.N lowest-RMSD candidate fragments for that position
        3. repeatedly draw a candidate at random, insert it into a copy of
           the protein and score the copy
        4. apply the Metropolis criterion:
            - if accepted: anneal the temperature, adopt the copy, update
              the best pose/score and stop
            - if rejected: draw again, until every candidate index has been
              drawn at least once
        """
        # Sample an eligible position within the original protein
        sample_index = random.randint(1, (int(self.prev_protein.length) - self.k))

        # Candidate fragments with lowest rmsd values
        debug('\nn value: {}, sample index: {}'.format(self.N, sample_index), self.iter, 5)
        candidate_fragments = self.fragset.get_lowRMS_fragments(sample_index, self.N)
        debug('\ncandidate_fragments: {}\n'.format(candidate_fragments), self.iter, 5)

        n_candidates = len(candidate_fragments)
        drawn = set()  # candidate indices drawn so far (draws may repeat)
        while len(drawn) < n_candidates:
            choice = random.randint(0, (n_candidates-1))
            drawn.add(choice)

            # Insert the chosen fragment into a copy and score that copy
            self.perturb_fragment(sample_index, candidate_fragments[choice])
            self.new_energy = self.compute_energy(self.next_protein)

            if not self.metropolis_accept():
                debug('Failed Metropolis...\nProbability: {}'.format(self.prob_accept), self.iter, 5)
                continue

            debug('Passed Metropolis!!\nProbability: {}'.format(self.prob_accept), self.iter, 5)
            self.anneal_temp()

            # Adopt an independent copy of the accepted conformation
            accepted_pose = Pose()
            accepted_pose.assign(self.next_protein.pose)
            self.prev_protein = Protein(pose=accepted_pose)
            self.old_energy = self.new_energy

            # Track the best (lowest-energy) conformation seen so far
            if self.new_energy < self.best_score:
                self.best_score = self.new_energy
                self.best_pose = Pose()
                self.best_pose.assign(self.next_protein.pose)
            return

    def simulate(self):
        """
        Run full MCMC simulation from the start temperature down to T_end.

        After every step the iteration number, temperature and current
        energy are written to 'kmer_stats.txt', and the energy of the last
        proposal is appended to self.lst_energy. The best structure is kept
        in self.best_pose / self.best_score by step().
        """
        stats_file = 'kmer_stats.txt'
        outfile(stats_file, 'iter: \ttemp: \t\t\t\tenergy:\n')

        # Take as many steps as necessary until we reach the target temp
        while not self.T < self.T_end:
            self.step()
            row = '{} \t\t{} \t{}\n'.format(self.iter, self.T, self.old_energy)
            outfile(stats_file, row)
            self.lst_energy.append(self.new_energy)
            self.iter += 1
class Data:
    """
    Loads every monomer structure named in params and orders them by size.

    Written for Python 2 (print statement, "except X, e" syntax, list-returning
    dict.keys()).
    """

    # Class-level lists; nothing in the visible part of this class rebinds them.
    index_ligand=[]
    index_receptor=[]
    cg_atoms=[]

    def __init__(self,params):
        """
        Build self.structure_list (largest structure first) and
        self.structure_hash (nickname -> position in that list).

        Reads from params: monomer_file_name, assembly_style, trajectory,
        topology, align, ratio, mode, proj_file, energy_type, map_dock_OnOff
        and density_map. Exits the process with status 1 if an ImportError is
        raised while handling a trajectory.
        """
        self.structure_hash = {}
        self.structure_list = []
        # Maps len(s.monomer.get_xyz()) -> [structure, nickname].
        # NOTE(review): two structures with the same length share a key, so
        # the later one replaces the earlier one -- confirm this is intended.
        volume_structure_hash = {} # this is used to get the biggest structure

        # GIORGIO_CODE create structure instances of the rigid monomers
        for nickName,pdb_file in params.monomer_file_name:
            # create instance of the structure class
            s = Structure()
            s.read_pdb (pdb_file)
            volume_structure_hash[len(s.monomer.get_xyz())] = [s, nickName]

        # create structure instance for the flexible monomers
        if params.assembly_style=="flexible":
            print ">> flexible docking requested for structures, launching PCA..."

            for nickName, traj_file in params.trajectory:
                try:
                    # get the topology file:
                    # NOTE(review): if no nickname matches, top_file keeps the
                    # value of the last topology entry -- confirm.
                    for nickName2, top_file in params.topology:
                        if nickName2 == nickName:
                            break

                    # create the structure and compute the PCA
                    s = Structure()
                    s.compute_PCA(top_file, traj_file, params.align, params.ratio, params.mode, params.proj_file)
                    s.read_pdb("protein.pdb")
                    volume_structure_hash[len(s.monomer.get_xyz())] = [s, nickName]

                except ImportError, e:
                    sys.exit(1)

                # TODO: work on the deform mode, but ask Matteo before
                # NOTE(review): the nesting of this block is assumed (the
                # source layout was lost); self.ligand is not assigned in the
                # visible code (its assignment below is commented out).
                if params.mode=="deform":
                    self.structure_ligand=Protein()
                    self.structure_ligand.import_pdb("protein.pdb")
                    self.ligand.import_pdb("CA.pdb")

        # getting the biggest structure and putting at the beginning so that it is fixed
        sorted_volumes = volume_structure_hash.keys()
        sorted_volumes.sort()
        sorted_volumes.reverse()
        for i in sorted_volumes:
            # insert the elements in a list
            self.structure_list.append( volume_structure_hash[i][0] ) # insert the structure
            # remember, per nickname, where its structure sits in the list
            self.structure_hash[volume_structure_hash[i][1]] = self.structure_list.index(volume_structure_hash[i][0])

        self.structure_list_and_name = [self.structure_list, self.structure_hash]
        print self.structure_list_and_name

        # Earlier single-ligand implementation, kept commented out:
        #LIGAND STRUCTURE
        #self.ligand = Protein()
        # if params.assembly_style=="flexible":
        #     print ">> flexible docking requested for ligand, launching PCA..."
        #     try:
        #         self.flex_ligand=F.Flexibility_PCA()
        #         self.flex_ligand.compute_eigenvectors(params.ligand_topology,params.ligand_trajectory,params.ligand_align,params.ligand_ratio,params.mode,params.ligand_proj_file)
        #         self.ligand.import_pdb("protein.pdb") # importing the middle structure
        #     except ImportError, e:
        #         sys.exit(1)
        #
        #     if params.mode=="deform":
        #         self.structure_ligand=Protein()
        #         self.structure_ligand.import_pdb("protein.pdb")
        #         self.ligand.import_pdb("CA.pdb")
        #else:
        #load monomeric structure (the pdb file)
        #self.ligand.import_pdb(params.ligand_file_name)

        # get_index is not defined in the visible part of this class
        if params.energy_type=="vdw":
            self.CA_index_of_structures = self.get_index(["CA"])
            #[self.index_ligand,self.index_receptor]=self.get_index(["CA","CB"])

        # if the density map docking is on load the structure into data:
        if params.map_dock_OnOff:
            self.density_map_fileName = params.density_map
def read_pdb(self, pdb):
    """
    Load the monomer from the PDB file *pdb*.

    Records the file name, stores the imported Protein as self.monomer and
    caches its coordinates (get_xyz) in self.init_coords.
    """
    self.pdb_file_name = pdb

    loaded = Protein()
    self.monomer = loaded
    loaded.import_pdb(pdb)

    self.init_coords = loaded.get_xyz()
def parse_prot2in(self, file_in_name, batch_num_lines, batch_num_prot):
    """
    Parse protein domain hits to create tabular formatted file relating each
    protein to its domains

    Parameters
    ----------
    file_in_name : str
        input file name (gzip-compressed, read in text mode)
    batch_num_lines : int
        number of lines to be parsed per batch
    batch_num_prot : int
        number of proteins to be processed per batch

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        if a hit line does not have the expected number of tab-separated
        fields (at least 11 for the local interproscan format, exactly 6
        otherwise)
    """
    file_out_name = self.create_file_out_name()
    out_path = os.path.join(self.data_path, file_out_name)
    total_out_prot = 0
    line_count = 0

    has_prot_file = self.prot_len_file_name != ""
    if has_prot_file:
        prot_file = open(
            os.path.join(self.data_path, self.prot_len_file_name), 'r')
    else:
        prot_file = ""

    def check_format(hit_line):
        # Raised explicitly (not via `assert`) so the check also runs under -O
        num_fields = len(hit_line.split("\t"))
        if self.interpro_local_format:
            if num_fields < 11:
                raise AssertionError("AssertionError: " + hit_line +
                                     " has less than 11 tabs.")
        elif num_fields != 6:
            raise AssertionError("AssertionError: " + hit_line +
                                 " does not have exactly 6 tabs.")

    def start_protein(hit_line):
        # Create a protein from its first hit and make it the current one
        protein = Protein(self.with_overlap, self.with_redundant,
                          self.with_gap, hit_line, prot_file,
                          self.interpro_local_format)
        self.last_protein = protein
        self.proteins.append(protein)

    def flush(file_out):
        # Write the collected proteins, update the counters, empty the batch
        self.update_output(file_out)
        num_written = len(self.proteins)
        self.update_no_intepro()
        del self.proteins[:]
        return num_written

    try:
        # check if output tabular file already exists, if yes then don't add header
        output_exists_already = os.path.isfile(out_path)

        with gzip.open(os.path.join(self.data_path, file_in_name),
                       'rt') as file_in, open(out_path, 'a') as file_out:
            if not output_exists_already:
                # write the header of the output file
                file_out.write("uniprot_id\tinterpro_ids\tevidence_db_ids\n")

            for batch in batch_iterator(file_in, batch_num_lines):
                for hit_line in batch:
                    hit_line = hit_line.strip()
                    check_format(hit_line)

                    if self.last_protein.uniprot_id == "":
                        # initialize protein list
                        start_protein(hit_line)
                    elif Protein.get_prot_id(
                            hit_line) == self.last_protein.uniprot_id:
                        # update last created protein
                        self.last_protein.add_domain(hit_line)
                    else:
                        # write to file complete proteins
                        if len(self.proteins) == batch_num_prot:
                            total_out_prot = total_out_prot + flush(file_out)
                        # create new protein and append it to proteins
                        start_protein(hit_line)
                    line_count = line_count + 1

            # save last proteins
            total_out_prot = total_out_prot + flush(file_out)
    finally:
        # Close the protein-length file even when parsing fails part-way
        if has_prot_file:
            prot_file.close()

    print("Successfully parsed {} lines.".format(line_count))
    print("Successfully created {} proteins.".format(total_out_prot))
    print("Number of proteins without any interpro annotation: {}.".format(
        self.num_prot_with_no_interpro))
def translate_with_fs(self, frameshifts=None):
    """Translate the transcript with one specific frameshift combination.

    The coding sequence (``self.sequence`` from ``self.cds[0]`` onwards) is
    rebuilt with every given frameshift spliced in, translated, and the
    remaining (non-frameshift) variants of ``self.variantsets`` are
    translated at their shifted positions and attached to the result.

    :param frameshifts: the frameshifts to apply. NOT a VariantSet: this
        translates with a particular FS combination and does not calculate
        possible combinations. The loops below unpack every element as
        ``((start, stop), variant)``.
        NOTE(review): the original comment calls this "a dict in
        {pos: Variant} form", but sorting/iterating a dict yields only its
        keys -- presumably callers pass the items; confirm.
    :returns: the translated Protein, with ``variantsets`` set and the
        frameshifts that actually took effect logged under the
        ``'frameshifts'`` metadata key.
    :raises AssertionError: if the protein already carries non-empty
        ``'frameshifts'`` metadata.
    """
    if frameshifts is None:
        frameshifts = []
    else:
        frameshifts = sorted(frameshifts)  # should be already sorted, but...

    # the number of bases gained or lost by each frameshift.
    # Positive: gain, negative: lost
    fs_shifts = [(fpos, fpos[0] - fpos[1] + len(fsvar))
                 for fpos, fsvar in frameshifts]

    def reposition(orig_pos):
        """Shift a (start, stop) position by all frameshifts located before it."""
        start, stop = orig_pos
        new_start, new_stop = start, stop
        for (fs_start, fs_stop), fs_shift in fs_shifts:
            if fs_start <= start < fs_stop or fs_start < stop <= fs_stop:
                warnings.warn('Watch out, variant inside frameshift! We\'re not ready to handle '
                              'that yet. %s, (%d-%d)' % (self.id, fs_start, fs_stop))
            if start >= fs_stop:
                # frameshift happened before variant, so variant shifts
                new_start += fs_shift
                new_stop += fs_shift
        return new_start, new_stop

    fs_positions = []
    new_seq = Seq('', generic_nucleotide)
    original_seq = self.sequence[self.cds[0]:]
    next_start = 0
    for (fs_start, fs_stop), fs_var in frameshifts:
        new_seq += original_seq[next_start:fs_start]
        # register first AA position that current FS affects
        # (explicit floor division: an index, also on Python 3)
        fs_positions.append(len(new_seq) // 3)
        new_seq += fs_var.sequence
        next_start = fs_stop
    # the tail is always appended (the loop above never breaks)
    new_seq += original_seq[next_start:]

    protein = Protein(new_seq.translate(), self)

    # now with the new sequence created it's time to translate non-FS
    # variants. Since the frameshifts moved their relative positions around,
    # we have to use their updated locations.
    repositioned = {reposition(vpos): vset
                    for vpos, vset in self.variantsets.items()}
    new_variantsets = {}
    for (start, stop), vset in repositioned.items():
        cstart = start - (start % 3)  # codon start
        cstop = (stop + 2) // 3 * 3  # codon stop
        new_vset = VariantSet(vset.genomic_pos, set([]))
        # TODO: this may introduce superfluous AA-s, that is 'Q'->'QP' when a
        # ''->'P' would be enough. Need to look into it. -- 99% SOLVED.
        for v in vset:
            if v.variant_type not in ('FSI', 'FSD'):
                aa_seq = (new_seq[cstart:start] + v.sequence
                          + new_seq[stop:cstop]).translate()
                translated_variant = Variant(v.genomic_pos, v.variant_type,
                                             aa_seq, 'AA', v.sample_id)
                # TODO: should we carry over metadata? I think we really should!
                # for now, let's just keep a simple reference to the original variant
                translated_variant.log_metadata('origin', v)
                new_vset.add_variant(translated_variant)
        # TODO: maybe origin should be a first-class attribue not metadata?
        new_vset.log_metadata('origin', vset)
        if len(new_vset) > 0:
            # frameshift VariantSets would create empty new_vsets, disregard them
            new_variantsets[(cstart // 3, cstop // 3)] = new_vset

    protein.variantsets = new_variantsets
    protein._trim_after_stop()

    # now let's see which frameshifts were actually kept. As induced stop
    # codons may have terminated the translated sequence, there's a chance
    # that later frameshifts are irrelevant.
    # <= instead of < as the stop codon (Biopython '*') is trimmed away and if
    # a FS induces that as its first affected AA position it DID play a role
    # in what the sequence has become although '*' is not part of the protein
    # sequence itself.
    # Materialised as lists so len() and later reads work on Python 3 as well.
    fs_positions = [pos for pos in fs_positions if pos <= len(protein)]
    used_frameshifts = list(zip(
        fs_positions,
        (fs for _, fs in frameshifts[:len(fs_positions)])))

    assert protein.get_metadata('frameshifts') == [], ("Someone has tweaked with the 'frameshift'"
        " field of protein metadata before. May have come from inherited transcript metadata."
        " Use a different field name in your custom functions.")
    protein.log_metadata('frameshifts', used_frameshifts)
    return protein
def rmsd_pdb(pdb_file1, pdb_file2):
    """Load two PDB files and compare them with ``Analysis.rmsd_proteins``.

    Each path is read into its own freshly created Protein (first file
    first), and the result of ``Analysis.rmsd_proteins`` on the pair is
    returned unchanged.
    """
    loaded = []
    for pdb_path in (pdb_file1, pdb_file2):
        structure = Protein()
        structure.read_pdb(pdb_path)
        loaded.append(structure)
    first, second = loaded
    return Analysis.rmsd_proteins(first, second)
def Main(): #Initializes the protein class which contains all main functions for manipulation and storage of protein data P = Protein() # Gets user input for the name of the desired output file. Output graphic displays in the subfolder of completed trees name_of_run = nameOfRun() P.run_name = name_of_run # timer that determines how much pause is placed between calls to NCBI servers, reccomend 0.5 however 0 works when running NOT during peak hours time = timer() P.timer = time #Set input/output paths input_file_path = os.path.join('Input', 'ProteinInput') output_file_path = os.path.join('Output') #Clears previous CDS and genomic output from last run clearPreviousOutput(output_file_path) #Functions from the Protein class that use the input protein accession numbers to determine the corresponding protein ID, CDS, and genomic data P.Entrez_Protein_ID_Fetch(input_file_path) P.Entrez_Genome_Fetch() # Writes the fasta seq for the genome corresponding the protein. Establishing a parallel list. Ouput file path: PhyloRewrite/Output/Genome for i in range(len(P.gene)): data_type = 'gene' writeOutput(data_type, P.gene[i]) print 'Genomic Sequences written to Output File... ' P.Entrez_CDS_Fetch() #Writes the fasta seq for the CDS corresponding the protein. Establishing a parallel list. Ouput file path: PhyloRewrite/Output/CDS for i in range(len(P.retrieved_full_cds)): data_type = 'CDS' writeOutput(data_type, P.retrieved_full_cds[i]) print 'CDS Sequences written to Output File... ' #Checks as to whether the lists are in parallel, if they are not error will rise print '\n Checking the output for errors... \n' if len(P.gene) != len(P.retrieved_full_cds): repair_script() sys.exit() print 'No errors found, continuing... 
\n' #Function of the protein class that determines the intron phase and location of intron/exon boundry P.intronCalculator() #Contained within the lists is each print 'Intron Phases: ' + str(P.intron_phase) print 'Length of the Exons: ' + str(P.exon_lengths) #Uses clustal X linux executible to format a multiple sequencing alignment for the input protein sequences. Files found in execs/tmp P.multiple_sequencing_alignment() #Takes the multiple sequencing alignment output from clustal X and inputs into fasttree to generate an unrooted tree, then piped into ete2 to root. Files found in execs/tmp P.rootedTreeConstruction() #Builds the tree graphic P.renderingTreeImage()
def convertXmlToProtein(self, xml):
    """Turns raw XML from Uniprot into a proper Protein object.

    Every recognised child of ``uniprot/entry`` is stored on the protein
    through ``addAttribute(name, "uniprot", value)``.

    xmltodict represents a single child element as one mapping and repeated
    children as a list, so ``comment`` and the gene/organism ``name``
    children are normalised to lists before they are walked.

    :param xml: An XML string to be parsed.
    :rtype: A Protein object.
    """
    proteinObject = Protein()
    entry = xmltodict.parse(xml)["uniprot"]["entry"]

    for element, value in entry.items():
        if element == "@accession":
            proteinObject.addAttribute("id", "uniprot", value)
        elif element == "name":
            proteinObject.addAttribute("proteinShortName", "uniprot", value)
        elif element == "protein":
            fullname = value["recommendedName"]["fullName"]
            proteinObject.addAttribute("proteinFullName", "uniprot", fullname)
        elif element == "@created":
            year, month, day = value.split("-")
            proteinObject.addAttribute(
                "creationDate", "uniprot",
                self.convertDateToNative(day, month, year))
        elif element == "@modified":
            year, month, day = value.split("-")
            proteinObject.addAttribute(
                "modifiedDate", "uniprot",
                self.convertDateToNative(day, month, year))
        elif element == "comment":
            comments = value if isinstance(value, list) else [value]
            for comment in comments:
                if "text" in comment:
                    text = comment["text"]
                    # text with XML attributes is a mapping whose content
                    # sits under "#text" (dict also covers OrderedDict)
                    if isinstance(text, dict):
                        text = text["#text"]
                    proteinObject.addAttribute(
                        comment["@type"], "uniprot", text)
        elif element == "gene":
            names = value["name"]
            if not isinstance(names, list):
                names = [names]
            genes = [gene["#text"] for gene in names
                     if isinstance(gene, dict) and "#text" in gene]
            proteinObject.addAttribute("geneName", "uniprot", genes)
        elif element == "organism":
            names = value["name"]
            if isinstance(names, list):
                # previously this list was built but never stored
                organisms = [organism["#text"] for organism in names]
                proteinObject.addAttribute("organism", "uniprot", organisms)
            else:
                proteinObject.addAttribute(
                    "organism", "uniprot", names["#text"])
        elif element == "sequence":
            proteinObject.addAttribute(
                "sequence", "uniprot", value["#text"].replace("\n", ""))
            proteinObject.addAttribute(
                "sequencelength", "uniprot",
                value["@length"].replace("\n", ""))

    return proteinObject
class Sym:
    """Temperature-sweep simulation driver for a single Protein.

    Starting at ``temp_max`` the temperature is lowered by ``temp_delta``
    until it drops below ``temp_min``. At every temperature ``steps`` moves
    are attempted; a move is kept when the protein stays valid and either
    its energy did not increase or ``accept_higher_energy`` says so.
    Statistics and structures are written below ``output/<sequence>/``.
    """

    def __init__(self, sequence, steps=10000, temp_min=0.15, temp_max=1.0,
                 temp_delta=0.05, save_interval=100):
        """Create the Protein for ``sequence`` and store the run settings.

        ``save_interval`` is the number of global steps between two frames
        appended to ``trajectory.pdb``.
        """
        self.protein = Protein(sequence)
        self.steps = steps
        self.temp_min = temp_min
        self.temp_max = temp_max
        self.temp_delta = temp_delta
        self.temp = temp_max
        self.save_interval = save_interval
        self.best = None

    def accept_higher_energy(self):
        """Randomly decide whether the last (energy-raising) move is kept.

        Returns True when a uniform random number is below
        ``exp(-E / T) / exp(-E_last / T)`` (with kb fixed to 1.0).
        """
        def pi(j):
            kb = 1.0
            return exp(-j / (kb * self.temp))

        return random() < (pi(self.protein.energy) /
                           pi(self.protein.last_energy))

    def run(self):
        """Run the whole temperature sweep and write all output files."""
        out_dir = 'output/' + self.protein.sequence
        try:
            os.makedirs(out_dir)
        except OSError:
            # an already existing directory is fine, anything else
            # (permissions, a file in the way, ...) is a real error
            if not os.path.isdir(out_dir):
                raise

        self.temp = self.temp_max
        global_steps_done = 0

        # clear file
        open(out_dir + '/trajectory.pdb', 'w').close()

        # context managers: the stats files are closed even if a step fails
        with open(out_dir + '/heat.csv', 'w') as heat_stats_file, \
                open(out_dir + '/contacts.csv', 'w') as contacts_stats_file, \
                open(out_dir + '/inertia.csv', 'w') as inertia_stats_file:
            while self.temp >= self.temp_min:
                E2 = 0.0
                E = 0.0
                inertia_sum = 0.0
                best_for_temperature = None
                str_temp = str(self.temp)
                contacts_stats_file.write(str_temp)
                inertia_stats_file.write(str_temp)

                for _ in range(self.steps):
                    global_steps_done += 1
                    self.protein.move()
                    if not self.protein.is_valid():
                        self.protein.undo_move()
                    elif (self.protein.energy > self.protein.last_energy
                          and not self.accept_higher_energy()):
                        self.protein.undo_move()

                    # saving best model
                    if (self.best is None
                            or self.protein.energy < self.best.energy):
                        self.best = deepcopy(self.protein)

                    # best for actual temperature
                    if (best_for_temperature is None
                            or self.protein.energy
                            < best_for_temperature.energy):
                        best_for_temperature = deepcopy(self.protein)

                    # stats
                    contacts_stats_file.write(
                        ';' + str(-self.protein.energy))
                    inertia_sum += self.protein.calculate_moment_of_inertia()
                    E2 += self.protein.energy**2
                    E += self.protein.energy
                    self.save_trajectory(global_steps_done)

                self.save_best_for_actual_temp(best_for_temperature)
                inertia_stats_file.write(
                    ';' + str(inertia_sum / self.steps) + '\n')
                E /= self.steps
                E2 /= self.steps
                heat_stats_file.write(
                    str_temp + ';' + str((E2 - E**2) / self.temp**2) + '\n')
                contacts_stats_file.write('\n')
                self.temp -= self.temp_delta

        self.save_best()

    def save_best_for_actual_temp(self, best_for_temperature):
        """Write the best structure of the current temperature to its own PDB."""
        temp = str(self.temp)
        # pad e.g. '0.5' to '0.50' so the file names line up
        if len(temp) == 3:
            temp += '0'
        with open(
                'output/' + self.protein.sequence + '/best_for_' + temp +
                '.pdb', 'w') as f:
            f.write(best_for_temperature.to_pdb(self.temp))

    def save_best(self):
        """Write the overall best structure to ``best.pdb``."""
        with open('output/' + self.protein.sequence + '/best.pdb', 'w') as f:
            f.write(self.best.to_pdb(0))

    def save_trajectory(self, all_steps):
        """Append a trajectory frame every ``save_interval`` global steps."""
        if all_steps % self.save_interval == 0:
            with open('output/' + self.protein.sequence + '/trajectory.pdb',
                      'a') as f:
                # floor division keeps the frame number an integer
                f.write(self.protein.to_pdb(all_steps // self.save_interval))
class Preprocess:
    """
    Class to preprocess file from interpro database, found at:
    https://www.ebi.ac.uk/interpro/beta/download/protein2ipr.dat.gz
    """

    def __init__(self, data_path, prot_len_file_name, with_overlap,
                 with_redundant, with_gap, interpro_local_format):
        """
        Preprocess class init

        Parameters
        ----------
        data_path : str
            full data path
        prot_len_file_name : str
            file name containing protein length information
            ("" when there is no such file)
        with_overlap : bool
            output overlapping domain annotation (True), otherwise not
            overlapping domain annotation will be created (False)
        with_redundant : bool
            if with_overlap is False then create non overlapping (but
            possibly redundant) domains (True), otherwise create non
            overlapping and non redundant domain annotation (False)
        with_gap : bool
            add GAP domain for each protein subsequence >30 amino acids
            without domain hit (True), otherwise don't add GAP domain (False)
        interpro_local_format : bool
            preprocess output format produced by local interproscan run
            (True), otherwise preprocess Interpro downloaded protein2ipr
            format (False)

        Returns
        -------
        None
        """
        self.data_path = data_path
        self.prot_len_file_name = prot_len_file_name
        self.with_overlap = with_overlap
        self.with_redundant = with_redundant
        self.with_gap = with_gap
        # placeholder protein; parse_prot2in tests its uniprot_id against ""
        # to detect the very first hit line
        self.last_protein = Protein(self.with_overlap, self.with_redundant,
                                    self.with_gap)
        self.proteins = []
        self.interpro_local_format = interpro_local_format
        self.num_prot_with_no_interpro = 0

    def update_no_intepro(self):
        """
        Update statistic count for proteins without interpro domain

        Parameters
        ----------

        Returns
        -------
        None
        """
        # check how many interpro ids exist for domains of proteins
        for protein in self.proteins:
            if sum(protein.interpro_exist_all_domains) == 0:
                self.num_prot_with_no_interpro += 1

    def update_output(self, file_out):
        """
        Update output tabular file

        Parameters
        ----------
        file_out : file object
            output file, opened for writing; one ``to_tabs()`` record is
            written per collected protein

        Returns
        -------
        None
        """
        for protein in self.proteins:
            file_out.write(protein.to_tabs())

    def create_file_out_name(self):
        """
        Create output file name based on the type of domain annotation that
        was selected

        Parameters
        ----------

        Returns
        -------
        str
            created output file name
        """
        file_out_name = "id_domains"
        # NOTE(review): going by the __init__ docstring the two non-overlap
        # suffixes look swapped (with_redundant False -> "_no_overlap").
        # Left untouched because existing output files carry these names.
        if self.with_overlap:
            file_out_name = file_out_name + "_overlap"
        elif self.with_redundant is False:
            file_out_name = file_out_name + "_no_overlap"
        else:
            file_out_name = file_out_name + "_no_redundant"

        if self.with_gap:
            file_out_name = file_out_name + "_gap"
        else:
            file_out_name = file_out_name + "_no_gap"
        return file_out_name + ".tab"

    def parse_prot2in(self, file_in_name, batch_num_lines, batch_num_prot):
        """
        Parse protein domain hits to create tabular formatted file relating
        each protein to its domains

        Consecutive hit lines sharing a protein id are folded into one
        Protein; completed proteins are appended to the output file whenever
        ``batch_num_prot`` of them have been collected.

        Parameters
        ----------
        file_in_name : str
            input file name (gzip compressed)
        batch_num_lines : int
            number of lines to be parsed per batch
        batch_num_prot : int
            number of proteins to be processed per batch

        Returns
        -------
        None

        Raises
        ------
        AssertionError
            if a hit line does not have the expected number of
            tab-separated fields for the selected input format
        """
        file_out_name = self.create_file_out_name()
        out_path = os.path.join(self.data_path, file_out_name)
        total_out_prot = 0
        line_count = 0

        has_len_file = self.prot_len_file_name != ""
        if has_len_file:
            prot_file = open(
                os.path.join(self.data_path, self.prot_len_file_name), 'r')
        else:
            prot_file = ""

        # check if output tabular file already exists, if yes then don't add
        # header
        output_exists_already = os.path.isfile(out_path)

        try:
            with gzip.open(os.path.join(self.data_path, file_in_name),
                           'rt') as file_in, open(out_path,
                                                  'a') as file_out:
                if not output_exists_already:
                    # write the header of the output file
                    file_out.write(
                        "uniprot_id\tinterpro_ids\tevidence_db_ids\n")

                for batch in batch_iterator(file_in, batch_num_lines):
                    for hit_line in batch:
                        hit_line = hit_line.strip()
                        hit_tabs = hit_line.split("\t")
                        if self.interpro_local_format:
                            assert len(hit_tabs) >= 11, (
                                "AssertionError: " + hit_line +
                                " has less than 11 tab-separated fields.")
                        else:
                            assert len(hit_tabs) == 6, (
                                "AssertionError: " + hit_line +
                                " does not have exactly 6 tab-separated"
                                " fields.")

                        if self.last_protein.uniprot_id == "":
                            # initialize protein list
                            protein = Protein(self.with_overlap,
                                              self.with_redundant,
                                              self.with_gap, hit_line,
                                              prot_file,
                                              self.interpro_local_format)
                            self.last_protein = protein
                            self.proteins.append(protein)
                        elif Protein.get_prot_id(
                                hit_line) == self.last_protein.uniprot_id:
                            # update last created protein
                            self.last_protein.add_domain(hit_line)
                        else:
                            # write to file complete proteins
                            if len(self.proteins) == batch_num_prot:
                                self.update_output(file_out)
                                total_out_prot = total_out_prot + len(
                                    self.proteins)
                                self.update_no_intepro()
                                del self.proteins[:]
                            # create new protein and append it to proteins
                            protein = Protein(self.with_overlap,
                                              self.with_redundant,
                                              self.with_gap, hit_line,
                                              prot_file,
                                              self.interpro_local_format)
                            self.last_protein = protein
                            self.proteins.append(protein)
                        line_count = line_count + 1

                # save last proteins
                self.update_output(file_out)
                total_out_prot = total_out_prot + len(self.proteins)
                self.update_no_intepro()
                del self.proteins[:]
        finally:
            # close the length file even when parsing fails half-way
            if has_len_file:
                prot_file.close()

        print("Successfully parsed {} lines.".format(line_count))
        print("Successfully created {} proteins.".format(total_out_prot))
        print("Number of proteins without any interpro annotation: {}.".format(
            self.num_prot_with_no_interpro))

    def create_domains_corpus(self, file_in_name, file_out_name,
                              batch_num_lines):
        """
        Create domain corpus from protein domains tabular file

        The second column of every non-header line is appended to the
        output file, one protein per line.

        Parameters
        ----------
        file_in_name : str
            input file name
        file_out_name : str
            output file name
        batch_num_lines : int
            number of lines to be processed per batch

        Returns
        -------
        None
        """
        total_out_lines = 0
        with open(os.path.join(self.data_path, file_in_name),
                  'r') as file_in, open(
                      os.path.join(self.data_path, file_out_name),
                      'a') as file_out:
            for batch in batch_iterator(file_in, batch_num_lines):
                for line in batch:
                    line_tabs = line.split("\t")
                    assert len(
                        line_tabs
                    ) == 3, "AssertionError: line should have only three tabs."
                    protein_domains = line_tabs[1]
                    # skip the header line
                    if protein_domains.strip() != "interpro_ids":
                        file_out.write(protein_domains + "\n")
                        total_out_lines = total_out_lines + 1
        print("Successfully written {} proteins in domains representation.".
              format(total_out_lines))

    def fasta2default_domains(self, fasta_name, data_id_format):
        """
        Convert a fasta file containing proteins without any interproscan
        domain hit (mainly for prediction tasks)

        Parameters
        ----------
        fasta_name : str
            input fasta name
        data_id_format : int
            data set contains id format in following types: protein ids (0),
            protein ids but remove ending ";" (1), protein ids can be
            extracted by splitting at "|" (2)

        Returns
        -------
        None

        Raises
        ------
        ValueError
            if a record is reached with a data_id_format other than 0, 1, 2
        """
        file_out_name = "default_domains.tab"
        with open(os.path.join(self.data_path, fasta_name),
                  "r") as fasta_file, open(
                      os.path.join(self.data_path, file_out_name),
                      "w") as file_out:
            file_out.write("uniprot_id\tinterpro_ids\tevidence_db_ids\n")
            for protein in SeqIO.parse(fasta_file, "fasta"):
                if data_id_format == 0:  # DeepLoc
                    base_id = protein.id
                elif data_id_format == 1:  # for targetP remove ending ;
                    base_id = protein.id.strip(";")
                elif data_id_format == 2:  # Toxin
                    base_id = protein.id.split("|")[1]
                else:
                    # previously an unbound (or stale) annotation was used
                    raise ValueError(
                        "Unsupported data_id_format: {!r}".format(
                            data_id_format))
                domain_annot = base_id + "_unk_dom"
                evid_annot = base_id + "_unk_evid"
                file_out.write(
                    "\t".join([protein.id, domain_annot, evid_annot]) + "\n")
''' Created on Apr 26, 2013 @author: cforker ''' from itertools import count from Protein import Protein from time import time def leadingZeroes(binstring,length): return '0'*(length-len(binstring[2:])) + binstring[2:] if __name__ == '__main__': t0 = time() N = 8 # binary representation of proteins. 1=H,0=P for prot in count(): if (prot == 2**N): break sprot = leadingZeroes(bin(prot),N) #print sprot ptest = Protein('01011111') ptest.setFolding([0,1,1,-1,-1,1]) ptest.foldingDimensions() ptest.buildGrid() ptest.countHBonds() ptest.printEverything() t1 = time() print "Execution Time",round((t1 - t0)*1000),"ms"
from Protein import Protein
from Atom import Atom

# Build the Trp-cage molecule from the whitespace separated coordinate file:
# column 3 is the residue name, column 5 the residue number, columns 6-8 the
# x/y/z coordinates and column 11 the value handed to Atom first.
prot = Protein('Trp-cage')
residue_names = []
countaa = 0
with open("1l2y.coords") as protfile:
    for record in protfile:
        columns = record.rstrip().split()
        aa = columns[3]
        aanum = columns[5]
        # Create the atom and add it to the molecule object
        prot.addatom(
            Atom(columns[11], float(columns[6]), float(columns[7]),
                 float(columns[8])),
            aa, aanum)
        # a residue number above the running count adds one more residue
        if int(aanum) > countaa:
            residue_names.append(aa)
            countaa += 1

# every residue name is followed by a single space
seq = "".join(name + " " for name in residue_names)
prot.addsequence(seq)

print(prot)  # Print the molecule object details
print("\nProtein Sequece:")
prot.getsequence()
def run(self):
    """Cluster the accepted solutions and write one assembly per cluster.

    Solutions filtered by ``select_solutions`` are clustered greedily: the
    first unassigned solution becomes a centroid, and every still
    unassigned solution whose ligand (C alpha) RMSD to it is below
    ``params.cluster_threshold`` joins that cluster. For every centroid an
    ``assembly<N>.pdb`` is written to ``self.OUTPUT_DIRECTORY`` and one
    line (centroid values | constraint values | fitness) is appended to
    ``<params.output_folder>/solutions.dat``.

    Returns None; returns early when no solution passes the filter.
    """
    import importlib

    # load the user-supplied constraint module by name (this replaces the
    # former `exec "import ... as constraint"` string)
    constraint = importlib.import_module(self.constraint)

    # create output directory for generated PDB
    self.OUTPUT_DIRECTORY = "result"
    if not os.path.isdir(self.OUTPUT_DIRECTORY):
        os.mkdir(self.OUTPUT_DIRECTORY)

    # the context manager closes the log on the early return and on errors
    with open("%s/solutions.dat" % self.params.output_folder,
              "w") as clusters_file:
        # use superclass method to filter acceptable solutions
        self.log = self.select_solutions(self.params)
        print(">> %s solutions filtered" % len(self.log))
        if len(self.log) == 0:
            return

        # generate a dummy multimer and extract the indexes of C alpha
        multimer = A.Assembly(self.data.ligand, self.data.receptor,
                              self.data.cg_atoms)
        multimer.place_ligand(np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))
        _, index = multimer.atomselect_ligand("*", "*", "CA", True)

        # load the monomeric structure positions
        s = Protein()
        s.import_pdb(self.params.ligand_file_name)
        coords = s.get_xyz()

        print(">> clustering best solutions...")
        P = self.log[:, 0:len(self.log[0, :])]  # points list
        V = self.log[:, -1]  # values of best hits
        C = []  # centroids array
        P_C = np.zeros(len(P))  # points-to-cluster mapping (0 = unassigned)
        C_V = []  # centroids values
        cnt = 0  # centroids counter

        # cluster accepted solutions
        while True:
            # check if new clustering loop is needed
            k = np.nonzero(P_C == 0)[0]
            if len(k) == 0:
                break
            cnt = cnt + 1
            P_C[k[0]] = cnt
            centroid = P[k[0]]
            C.append(centroid)

            # create multimer
            pos = np.array(centroid)[0:6].astype(float)
            multimer1 = A.Assembly(self.data.ligand, self.data.receptor,
                                   self.data.cg_atoms)
            multimer1.place_ligand(pos)

            # write multimer
            multimer1.write_PDB(
                "%s/assembly%s.pdb" % (self.OUTPUT_DIRECTORY, cnt))

            # clustering loop
            m1 = multimer1.get_ligand_xyz()[index]
            cnt2 = 1
            for candidate in k:
                # reset the ligand before placing the next candidate
                self.data.ligand.set_xyz(coords)
                multimer2 = A.Assembly(self.data.ligand, self.data.receptor,
                                       self.data.cg_atoms)
                multimer2.place_ligand(
                    np.array([P[candidate][j] for j in range(6)]))
                m2 = multimer2.get_ligand_xyz()[index]

                rmsd = self.align(m1, m2)
                if rmsd < self.params.cluster_threshold:
                    cnt2 += 1
                    P_C[candidate] = cnt

            print(">>> clustered %s solutions on multimer %s" % (cnt2, cnt))

            # set centroid score with score of closes neighbor in set
            q = np.nonzero(P_C == cnt)[0]
            distance = 10000
            targ = 0
            for member in q:
                delta = centroid - P[member]
                d = np.sqrt(np.dot(delta, delta))
                if d < distance:
                    distance = d
                    targ = member
            C_V.append(V[targ])

            # extract constraint values calculated for selected centroid
            measure = constraint.constraint_check(multimer1)

            # generate output log (prepare data and formatting line, then
            # dump in output file)
            l = []
            f = []
            # centroid values without the trailing score column
            for item in centroid[0:len(centroid) - 1]:
                l.append(item)
                f.append("%8.3f ")
            # write constraint values
            f.append("| ")
            for item in measure:
                l.append(item)
                f.append("%8.3f ")
            # write fitness
            f.append("| %8.3f\n")
            l.append(C_V[cnt - 1])

            formatting = "".join(f)
            clusters_file.write(formatting % tuple(l))
def parse_proteins(directory):
    """Build one Protein feature record per structure file in ``directory``.

    For every ``.pdb``/``.ent``/``.cif`` file the ZN ``HETNAM`` lines are
    counted, the SEQRES chains are read with Biopython, and motif
    occurrences plus residue statistics are accumulated over all chains.

    Returns the list of Protein objects in directory-listing order.
    """
    proteins = list()

    # First we parse the structure.
    for filename in os.listdir(directory):
        if not (filename.endswith(".pdb") or filename.endswith(".ent")
                or filename.endswith(".cif")):
            continue

        full_path = os.path.join(directory, filename)

        # Check the Zinc ions
        # NOTE(review): split(" ") yields empty strings between consecutive
        # blanks -- confirm this matches the spacing of the HETNAM records.
        ZN_num = 0
        with open(full_path, "r") as pdb:
            for line in pdb:
                if line.startswith("HETNAM") and line.split(" ")[1] == "ZN":
                    ZN_num += 1

        # Biopython parser.
        protein = SeqIO.to_dict(SeqIO.parse(full_path, "pdb-seqres"))

        # Number of chains
        chain_num = len(protein)

        length = 0
        Hys_Cys = 0
        ARG_LYS_HIS = 0
        C2H2_occur = 0
        C2WH2_occur = 0
        GATA3_occur = 0
        CCHC_occur = 0
        ZN2C6_occur = 0

        for record in protein.values():
            chain_seq = str(record.seq)

            # Motif occurrences: C2H2, C2WH2, GATA3, CCHC, ZN2C6
            C2H2_occur += len(
                re.findall("[C].{2,4}[C].{9,13}[H].{3,5}[H]", chain_seq))
            C2WH2_occur += len(
                re.findall("[C].[W].{1,4}[C].{2,13}[H].{3,5}[H]", chain_seq))
            GATA3_occur += len(
                re.findall("[Y].[K].[H].{1,3}[R][P]", chain_seq))
            CCHC_occur += len(
                re.findall("[C]..[C].{3,4}[H].{5,7}[C]", chain_seq))
            ZN2C6_occur += len(
                re.findall("[C]..[C]...[KR].[KR][C].{5,7}[C]..[C].{5,7}[C]",
                           chain_seq))

            # Length of total protein
            length += len(record.seq)

            # Number of Hystidines + Cysteine
            his_count = chain_seq.count("H")
            Hys_Cys += his_count + chain_seq.count("C")

            # Number of positive residues in the protein
            ARG_LYS_HIS += (his_count + chain_seq.count("R")
                            + chain_seq.count("K"))

        # Bucket the total length into four classes
        if 0 < length < 200:
            length_factor = 0
        elif 200 <= length <= 400:
            length_factor = 1
        elif 400 < length <= 600:
            length_factor = 2
        else:
            length_factor = 3

        # NOTE(review): a file without any SEQRES residue has length 0 and
        # makes the two ratios below raise ZeroDivisionError.
        proteins.append(
            Protein(name=filename[:-4],
                    C2H2=C2H2_occur,
                    C2WH2=C2WH2_occur,
                    GATA3=GATA3_occur,
                    CCHC=CCHC_occur,
                    ZN2C6=ZN2C6_occur,
                    zinc=ZN_num,
                    prot_len=length_factor,
                    pos=ARG_LYS_HIS / length,
                    num_chain=chain_num,
                    hys_cys=Hys_Cys / length))

    return proteins
if __name__ == '__main__':
    # Dispatch on the menu choice returned by the welcome prompt
    response = welcomeStatement()
    if response == '1':
        # Function that runs the entire program
        Main()
    elif response == '2':
        # Use following code if you have the Protein, CDS, and genomic
        # sequences. Code produces the Phylogenetic tree and intron mapping
        P = Protein()
        P.multiple_sequencing_alignment()
        P.intronCalculator()
        P.rootedTreeConstruction()
        P.renderingTreeImage()
    elif response == '3':
        repair_script()
    elif response == '4':
        duplicate_management()
    elif response == '5':
        # Enter experimental code here:
        # (a branch holding only a comment is a syntax error, hence `pass`)
        pass
class Fragment:
    """Class describing fragments"""

    def __init__(self, num, modelPath, partial):
        """Set up an empty fragment number ``num`` stored below ``modelPath``.

        The base name is ``Frag`` followed by the number formatted with
        ``{:04n}``; ``basepath`` is that name joined onto ``modelPath``.
        """
        self.num = num
        self.basename = "Frag{:04n}".format(num)
        self.basepath = path.join(modelPath, self.basename)
        self.partial = partial

        # helper objects
        self.values = FragValues()
        self.protein = Protein()
        self.center = IndexedCoM()
        self.stat = FragStatistics()

        # plain containers and counters
        self.atomCount = 0
        self.resCenters = []
        self.residues = []
        self.diffMat = []
        self.sdf = []

    def calcValues(self, viscosity=0.0, HarmMe=0.0, radius=0.0):
        """Forward the three parameters to ``self.values.calcValues``."""
        self.values.calcValues(viscosity=viscosity, HarmMe=HarmMe,
                               radius=radius)

    def addAtom(self, atomName, resNum, resName, point):
        """Register one atom with the centre of mass and the protein.

        Residue numbers are remembered in first-seen order. For residues
        after the first, the position of the ``N`` and ``H`` atoms is also
        handed to the protein.
        """
        self.center.addPoint(resNum, atomWeight(atomName), point=point)
        self.protein.addResidue(resNum, resName)
        self.atomCount += 1

        if resNum not in self.residues:
            self.residues.append(resNum)

        if resNum <= 1:
            return
        if atomName.strip() == "N":
            self.protein.setNpos(resNum, point)
        if atomName.strip() == "H":
            self.protein.setHpos(resNum, point)

    def getWeight(self):
        """Return the weight reported by the protein."""
        return self.protein.getWeight()

    def getProtons(self):
        """Return the protons reported by the protein."""
        return self.protein.getProtons()

    def hasResidue(self, num):
        """Return True if residue number ``num`` was added, else False."""
        return num in self.residues

    def getCenter(self):
        """Return the centre held by the IndexedCoM helper."""
        return self.center.getCenter()

    def getEta(self):
        """Return ``getEta()`` of the fragment values."""
        return self.values.getEta()

    def getR(self, corr):
        """Return ``getR(corr)`` of the fragment values."""
        return self.values.getR(corr)

    def getHM(self, corr):
        """Return ``getHM(corr)`` of the fragment values."""
        return self.values.getHM(corr)

    def getPDB(self):
        """Return the path of the fragment's ``.pdb`` file."""
        return self.basepath + '.pdb'

    def getDat(self):
        """Return the path of the fragment's ``.dat`` file."""
        return self.basepath + '.dat'

    def doneParsing(self):
        """Tell the protein that parsing is finished."""
        # This function simply exists to free memory
        self.protein.done()