def main():
    """Predict an output image for one input image using a pickled model.

    Command-line arguments (read from ``sys.argv`` / ``getarg``):

    1. path of the input image (required);
    2. radius ``r`` (default 0);
    3. path of the pickled model (default ``data/r<r>_lin.p``).

    The prediction is reshaped to ``(h - 2r, w - 2r)``, passed through
    ``denormalize``, cast to ``uint8`` and saved under the input image's
    base name in a folder derived from the model path (trailing ``.p``
    removed, ``data/`` replaced by ``out/``).
    """
    f = sys.argv[1]
    r = int(getarg(2, 0))
    inname = getarg(3, "data/r%d_lin.p" % (r))
    print("Test with radius=%d and file %s and model %s" % (r, f, inname))

    # Close the model file deterministically instead of leaking the handle.
    # NOTE: unpickling can execute arbitrary code -- only load model files
    # that come from a trusted source.
    with open(inname, "rb") as model_file:
        model = pickle.load(model_file)

    w, h = Image.open(f).size
    X = extract_x([f], r)
    # The prediction must contain exactly one value per pixel of the image
    # shrunk by r on every side, otherwise the reshape below fails.
    dim = (h - 2 * r, w - 2 * r)
    y = model.predict(X)
    y = np.reshape(y, dim)
    y = denormalize(y)
    if np.any(np.isnan(y)):
        print("result of", f, "has nan values after denormalizing")
    img = Image.fromarray(y.astype(np.uint8))

    # Drop only the trailing ".p" extension.  Replacing every ".p" in the
    # path would also mangle names such as "my.project/data/r3_lin.p".
    if inname.endswith(".p"):
        model_stem = inname[:-len(".p")]
    else:
        model_stem = inname
    outfold = model_stem.replace("data/", "out/")
    outf = path.join(outfold, path.basename(f))
    mkdir_p(outfold)
    img.save(outf)
def run(cls, fragment_file, ligand_chain, ligand_sequence, pdbid):
    """Convert a Rosetta fragment file into one PDB file per fragment.

    The window table built by ``shared.create_windows`` is written next to
    ``fragment_file`` as ``<pdbid>_data.csv``.  Every fragment that belongs
    to a position listed in that table is written to
    ``<dir of fragment_file>/<position>/frag_NNN.pdb``.  If the last kept
    position is not of the form ``6k + 1``, its fragments are re-saved
    without their leading residues in a directory named after the window's
    ``res_start`` and the original directory is removed.

    :param fragment_file: path of the Rosetta fragment file to split
    :param ligand_chain: chain identifier written into every PDB record
    :param ligand_sequence: path of a FASTA file with the ligand sequence
    :param pdbid: identifier used to name the window CSV file
    :raises MakePdbError: if no position of the fragment file is kept, or
        if a written fragment does not hold exactly one model and one chain
    """
    base_dir = os.path.dirname(fragment_file)
    ligand_sequence = SeqIO.read(ligand_sequence, "fasta")
    windowdf = pd.DataFrame(shared.create_windows(len(ligand_sequence)))
    pos_list = windowdf['position'].drop_duplicates().tolist()
    windowdf.to_csv(os.path.join(base_dir, "{0}_data.csv".format(pdbid)),
                    index=False)
    print(ligand_sequence)

    ### Convert Rosetta format to PDB format
    pos_file_dict = dict()
    with open(fragment_file, "r") as ih:
        position = None
        position_path = None
        # Start in the "skip" state so that blank or data lines appearing
        # before the first "position" header are ignored instead of
        # failing with a NameError.
        keep_position = False
        src_pdbid = None
        rows = list()
        for line in ih:
            parts = line.split()
            if line[:8] == "position":
                # Header line: the second field is the position number
                position = parts[1]
                position_path = os.path.join(base_dir, position)
                position = int(position)
                keep_position = position in pos_list
                if keep_position:
                    pos_file_dict[position] = list()
                    if not os.path.isdir(position_path):
                        os.mkdir(position_path)
                    logging.debug("Rosetta to CA for %s", position_path)
                index = 1
            # Parts is an empty list if line is blank
            elif not parts:
                # A blank line terminates the fragment collected so far
                if keep_position and rows:
                    assert src_pdbid
                    filepath = os.path.join(position_path,
                                            "frag_%03d.pdb" % index)
                    pos_file_dict[position].append(filepath)
                    # Existing fragment files are left untouched
                    if shared.missing(filepath):
                        with open(filepath, "w") as oh:
                            oh.writelines(rows)
                    rows = list()
                    index += 1
                    src_pdbid = None
            elif keep_position:
                # Data line: exactly eleven whitespace-separated fields
                (pdbcode, pdbchain, resi, resn, ss, phi, psi, omega,
                 x, y, z) = parts
                new_pdbid = pdbcode + pdbchain
                if not rows:
                    # First residue of a fragment: emit the header and
                    # restart residue numbering at the window position
                    src_pdbid = new_pdbid
                    rows.append(cls.header_fmt % src_pdbid)
                    res_id = position
                fmt_list = list(cls.pdb_default)
                query_idx = res_id - 1
                assert query_idx >= 0
                try:
                    query_resn = ligand_sequence[query_idx]
                except IndexError:
                    print("%s %s %s" % (position, index, res_id))
                    raise
                real_res_id = res_id
                # Residue name comes from the ligand sequence, not from
                # the source fragment; only the coordinates are reused.
                fmt_list[1] = real_res_id
                fmt_list[4] = shared.one_to_three[query_resn]
                fmt_list[5] = ligand_chain
                fmt_list[6] = real_res_id
                fmt_list[8] = x
                fmt_list[9] = y
                fmt_list[10] = z
                rows.append(cls.pdb_fmt % tuple(fmt_list))
                res_id += 1
    # NOTE(review): a final fragment that is not followed by a blank line
    # is never written -- confirm the input always ends with a blank line.

    if not pos_file_dict:
        # Fail with a domain error rather than an IndexError further down
        raise MakePdbError(
            "No fragment positions kept from %s" % fragment_file)
    last_pos = max(pos_file_dict, key=int)

    # Truncate last pos if necessary
    # 1, 7, 13, 19, 25, ... are starts
    if last_pos % 6 != 1:
        parser = PDB.PDBParser(QUIET=True)
        io = PDB.PDBIO()
        # Get computed position from database
        new_start = windowdf[windowdf['position'] == last_pos][
            'res_start'].tolist()[0]
        assert new_start % 6 == 1
        last_pos_dir = os.path.dirname(pos_file_dict[last_pos][0])
        new_dir = os.path.join(
            os.path.dirname(os.path.normpath(last_pos_dir)),
            "{0:.0f}".format(new_start))
        logging.debug("Changing position %s to start at %s", last_pos,
                      new_start)
        shared.mkdir_p(new_dir)
        # ADD NEW DIR TO DICT
        pos_file_dict[new_start] = list()
        # Leading residues that lie before the new window start
        residue_remove_slice = slice(new_start - last_pos)
        for fn in pos_file_dict.pop(last_pos):
            structure = parser.get_structure("fragment", fn)
            if len(structure.child_list) != 1:
                raise MakePdbError("More than one model in %s" % fn)
            model = structure.child_list[0]
            if len(model.child_list) != 1:
                raise MakePdbError("More than one chain in %s" % fn)
            chain = model[ligand_chain]
            for del_res in chain.get_list()[residue_remove_slice]:
                chain.detach_child(del_res.id)
            basename = os.path.basename(fn)
            outfile = os.path.join(new_dir, basename)
            io.set_structure(structure)
            io.save(outfile)
            pos_file_dict[new_start].append(outfile)
        shutil.rmtree(last_pos_dir)
def select_paths(self, complexname, receptor_chain, ligand_chain, nwindows,
                 ct, dest=None):
    """Score cluster medoid paths and combine the ``ct`` best ones.

    Medoid rows are read from ``path_<pdbid>_all.db`` and left-joined with
    ``<pdbid><nwindows>_receptor_occupancy.csv``.  Each score listed in
    ``neco_scores`` is turned into a z-score column (``<name>z``), the
    per-row minimum of those is stored as ``best_score`` and blended with
    their weighted sum into ``weighted_score``.  The ``ct`` rows with the
    lowest ``weighted_score`` are written to ``path_scores.csv`` in
    ``dest`` and handed to ``self.combine_paths``.

    :param complexname: complex identifier, first part of the pdbid
    :param receptor_chain: receptor chain identifier(s)
    :param ligand_chain: ligand chain identifier
    :param nwindows: number of windows; selects the tables and columns read
    :param ct: number of top-scoring paths to keep
    :param dest: output directory; relative values are placed under
        ``self.working_dir``; defaults to ``<pdbid><nwindows>`` there
    :raises SelectPathsError: if the occupancy file is missing or any
        score is null
    """
    pdb_kwargs = {
        "complexname": complexname,
        "receptor_chain": receptor_chain,
        "ligand_chain": ligand_chain,
        "nwindows": nwindows,
    }
    pdbid = "{complexname}{receptor_chain}{ligand_chain}".format(
        **pdb_kwargs)
    pdbwindowid = "{0}{nwindows}".format(pdbid, **pdb_kwargs)

    # Resolve the top directory for this pdbid
    # (equivalent to the directory argument of the constructor)
    if dest is None:
        # Default dest is pdbid and window number
        dest = os.path.join(self.working_dir, pdbwindowid)
    elif not os.path.isabs(dest):
        # Place non-absolute dest relative to model db filedir
        dest = os.path.join(self.working_dir, dest)
    # charmm balks at mixed case
    dest = dest.lower()
    shared.mkdir_p(dest)

    path_db = os.path.join(self.working_dir,
                           "path_{0}_all.db".format(pdbid))
    windows = ["window%s" % i for i in range(nwindows)]
    center_q = """SELECT pathsid, nodescore, edgescores, clustersize, {windows} FROM clusters{nwindows} JOIN paths{nwindows} USING (pathsid) WHERE is_medoid=1 """.format(
        nwindows=nwindows, windows=", ".join(windows))
    center_df = shared.db_to_pandas(center_q, path_db)

    occupancy_file = os.path.join(
        self.working_dir, "{0}_receptor_occupancy.csv".format(pdbwindowid))
    if shared.missing(occupancy_file):
        logging.warning("%s missing", occupancy_file)
        raise SelectPathsError("No occupancy score")

    # Load the occupancy score and attach it to the other scores
    occ_data = pd.read_csv(occupancy_file)
    occ_data.rename(columns=dict(pathid="pathsid"), inplace=True)
    center_df = center_df.merge(occ_data, how="left")

    incomplete = center_df[center_df.isnull().any(axis=1)]
    if not incomplete.empty:
        print(incomplete)
        raise SelectPathsError("Null scores")

    for scorename, ascending in neco_scores:
        if any(pd.isnull(center_df[scorename])):
            logging.error("%s %s", pdbid, scorename)
            raise SelectPathsError("Null values")
        # Flip scores flagged as not ascending before computing Z-scores
        sign = 1 if ascending else -1
        center_df[scorename + "z"] = self.zscore(
            center_df[scorename] * sign)

    def best_of_row(row):
        return min(row[name + "z"] for name, _asc in neco_scores)

    center_df["best_score"] = center_df.apply(best_of_row, axis=1)

    # Weighted score: b_weight on the best z-score, the remainder on the
    # weighted sum of all z-scores
    notb_weight = 1 - b_weight
    notb_scores = [(weight, name)
                   for weight, (name, _asc) in zip(neco_weights,
                                                   neco_scores)]

    def score_row(r):
        weighted_sum = sum(weight * r[name + "z"]
                           for weight, name in notb_scores)
        return b_weight * r['best_score'] + notb_weight * weighted_sum

    center_df['weighted_score'] = center_df.apply(score_row, axis=1)

    # Take the top n (lowest weighted score first)
    ranked = center_df.sort_values('weighted_score')
    top_n = ranked.head(ct)
    score_columns = [
        'pathsid', 'nodescorez', 'edgescoresz', 'clustersizez',
        'occupancyscorez', 'best_score', 'weighted_score'
    ]
    top_n[score_columns].to_csv(os.path.join(dest, "path_scores.csv"),
                                index=False)

    paths = top_n.loc[:, ['pathsid'] + windows]
    model_db_file = os.path.join(self.working_dir,
                                 "scores_{0}.db".format(pdbid))
    self.combine_paths(paths=paths,
                       model_db_file=model_db_file,
                       dest=dest,
                       **pdb_kwargs)
def merge_path(s):
    """Create the subdirectory and combined PDB file for one path.

    Relies on names from the enclosing scope (``top_dir``, ``struct_name``,
    ``window_vars``, ``window_starts``, ``get_structure``, ``model_id``,
    ``ligand_chain``, ``receptor_chain``, ``receptor_model``, ``io``).

    :param s: row-like mapping with a ``pathsid`` entry and one structure
        file per name in ``window_vars``
    :returns: path of the combined PDB file (reused if it already exists)
    :raises SelectPathsError: if a residue number between the first and
        last window residue is covered by zero or more than two windows
    """
    # Create subdirectories for pathsid
    pathsid = s['pathsid']
    subdir = os.path.join(top_dir, str(pathsid))
    shared.mkdir_p(subdir)
    outfile = os.path.join(subdir, "%s.pdb" % struct_name)
    if not shared.missing(outfile):
        return outfile

    files = [s[w] for w in window_vars]
    structures = [get_structure(f) for f in files]
    chains = [struc[model_id][ligand_chain] for struc in structures]

    for s_start, c in zip(window_starts, chains):
        # Collect all residues (not modifying chain)
        r_list = list(c)
        # Remove and re-number all residues: each one is shifted by the
        # window start so that numbering is continuous across windows
        for r in r_list:
            c.detach_child(r.id)
            cur_id = list(r.id)
            cur_id[1] += s_start - 1
            r.id = tuple(cur_id)
        # Re-add residues to empty chain
        for r in r_list:
            c.add(r)

    starts = [c.child_list[0].id[1] for c in chains]
    ends = [c.child_list[-1].id[1] for c in chains]

    sb = PDB.StructureBuilder.StructureBuilder()
    sb.init_structure(struct_name)
    sb.init_model(model_id)
    sb.init_seg(' ')
    # Create empty ligand chain
    sb.init_chain(ligand_chain)
    new_struct = sb.get_structure()

    # Add receptor chains
    for ch in receptor_chain:
        new_struct[model_id].add(receptor_model[ch])

    new_chain = new_struct[model_id][ligand_chain]
    for x in xrange(min(starts), max(ends) + 1):
        # Retrieve all residues with id 'x'
        residues = [c[x] for c in chains if x in c]
        n_res = len(residues)
        if n_res == 1:
            # Unpack single item
            res, = residues
            new_chain.add(res)
        elif n_res == 2:
            # Combined gets averaged position of two residues
            res1, res2 = residues
            new_res = res1.copy()
            for atom1 in res1:
                atomname = atom1.name
                atom2 = res2[atomname]
                new_atom = new_res[atomname]
                avg_coord = (atom1.coord + atom2.coord) / 2.0
                new_atom.set_coord(avg_coord)
            new_chain.add(new_res)
        else:
            # Interpolate the message here: exceptions, unlike logging
            # calls, do not format extra positional arguments.
            raise SelectPathsError("%s residues at %s" % (n_res, x))

    io.set_structure(new_struct)
    io.save(outfile)
    return outfile
def cluster_models(self, modelcoord_dict, chain, groupid, cutoff=None,
                   wd=None, cleanup=True):
    """
    Cluster models.

    Each model is written as a CA-only PDB file, the list of files is
    passed to the external clustering binary (``self.clust_bin``) and its
    output is parsed with ``self.parse_cluster_out``.

    Side effect: the process working directory is changed to ``wd`` and
    is not restored.

    :param modelcoord_dict: coordinates to cluster keyed by pathsid; each
        value is an iterable of ``(x, y, z)`` triples
    :type modelcoord_dict: dict
    :param chain: chain to keep in model (not used by this method)
    :param groupid: group identifier, used to name the work files
    :type groupid: int
    :param cutoff: value passed to the binary's ``-c`` option; defaults to
        ``self.default_cutoff``
    :param wd: working directory; defaults to ``self.default_wd``
    :param cleanup: whether to remove merged files
    :type cleanup: bool
    :returns: list of dict, each with a ``pathsid`` entry added
    :raises ClusterPdbError: if the clustering binary exits non-zero
    """
    if cutoff is None:
        cutoff = self.default_cutoff
    if wd is None:
        wd = self.default_wd
    # NOTE(review): cwd is changed for the rest of the process; callers
    # may rely on it, so it is deliberately left as is.
    os.chdir(wd)

    ligand_list = os.path.join(wd, "ligand_list_%s.txt" % groupid)
    cluster_err = os.path.join(wd, "clusters_%s_out.txt" % groupid)

    # PDB ATOM records are fixed-column; layout used for pdb_line below:
    #          1         2         3         4         5         6         7         8
    # 12345678901234567890123456789012345678901234567890123456789012345678901234567890
    # ATOM    145  N   VAL A  25      32.433  16.336  57.540  1.00 11.92      A1   N
    # ATOM    146  CA  VAL A  25      31.132  16.439  58.160  1.00 11.85      A1   C
    pdb_line = ("ATOM  {index:5d}  CA  UNK A{index:4d}    "
                "{x:8.3f}{y:8.3f}{z:8.3f}  1.00  0.00           C  \n")

    outdir = os.path.join(wd, "{0}merged".format(groupid))
    shared.mkdir_p(outdir)

    file_list = list()
    lig_file_dict = dict()
    for pathsid, coords in modelcoord_dict.iteritems():
        ligandfile = os.path.join(outdir, "path_{0}.pdb".format(pathsid))
        # One CA pseudo-residue per coordinate, numbered from 1
        outlines = [
            pdb_line.format(index=i + 1, x=x, y=y, z=z)
            for i, (x, y, z) in enumerate(coords)
        ]
        with open(ligandfile, "w") as oh:
            oh.writelines(outlines)
        # Remember which path each file belongs to, so the clustering
        # output can be mapped back to a pathsid
        lig_file_dict[ligandfile] = pathsid
        file_list.append(ligandfile)

    with open(ligand_list, "w") as oh:
        oh.writelines(fn + "\n" for fn in file_list)

    logging.debug("Clustering...")
    clst_cmd = [
        self.clust_bin, "-L", ligand_list, "-c",
        str(cutoff), "-r", "0.1"
    ]
    proc = subprocess.Popen(clst_cmd,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    out, err = proc.communicate()
    ret = proc.returncode
    if ret:
        # Keep the tool's own diagnostics instead of discarding them
        if err:
            logging.error("Clustering stderr: %s", err)
        raise ClusterPdbError("Clustering exited %s" % ret)

    data_rows, err_lines = self.parse_cluster_out(out)
    with open(cluster_err, "w") as eh:
        eh.writelines(err_lines)

    for row in data_rows:
        row['pathsid'] = lig_file_dict[row['model']]

    if cleanup:
        shutil.rmtree(outdir, ignore_errors=True)
        shared.silent_remove(ligand_list)

    return data_rows
def run(cls, complexname, ligand_chain, ligand_sequence, psipred_path,
        porter_path, jpred_path, sspro_path, directory=None, nfrag=None,
        **kwargs):
    """Prepare the Rosetta input tree and run the fragment picker.

    Nothing is done when both the fragment file and the score file
    already exist in ``<directory>/output_files``.

    :param complexname: complex identifier; with ``ligand_chain`` it forms
        the pdbid used in all file names
    :param ligand_chain: ligand chain identifier
    :param ligand_sequence: path of the ligand FASTA file
    :param psipred_path: secondary-structure prediction file (psipred)
    :param porter_path: secondary-structure prediction file (porter)
    :param jpred_path: secondary-structure prediction file (jpred)
    :param sspro_path: secondary-structure prediction file (sspro)
    :param directory: working directory; defaults to
        ``<script_dir>/quota<complexname>``
    :param nfrag: number of fragments; defaults to ``cls.default_nfrag``
    :param kwargs: accepted and ignored
    :raises RunRosettaError: if a prediction file is missing, the fragment
        picker exits non-zero, or its output files are absent afterwards
    :raises subprocess.CalledProcessError: if blast or the checkpoint
        conversion fails
    """
    config = shared.load_config()
    make_fragments_pl = os.path.join(
        config['rosetta_path'], "tools/fragment_tools/make_fragments.pl")
    fragment_picker_exe = os.path.join(
        config['rosetta_path'],
        "main/source/bin/fragment_picker.linuxgccrelease")

    if directory is None:
        directory = os.path.join(script_dir,
                                 "quota{0}".format(complexname))
    if nfrag is None:
        nfrag = cls.default_nfrag
    directory = os.path.abspath(directory)
    logging.info("DIRECTORY: %s", directory)

    # Check, prepare, run Rosetta
    pdbid = "{0}{1}".format(complexname, ligand_chain)
    output_dir = os.path.join(directory, "output_files")
    fragment_name = "{0}.{1}.9mers".format(pdbid, nfrag)
    fragment_file = os.path.join(output_dir, fragment_name)
    score_name = "{0}.fsc.{1}.9mers".format(pdbid, nfrag)
    score_file = os.path.join(output_dir, score_name)
    input_dir = os.path.join(directory, "input_files")
    path_id = os.path.join(input_dir, pdbid)
    fastain = path_id + ".fasta"

    if shared.missing(fragment_file) or shared.missing(score_file):
        flag_kwargs = dict(pdbid=pdbid,
                           nfrag=nfrag,
                           rosetta_path=config['rosetta_path'])
        template_dir = os.path.join(script_dir, "rosetta_templates")

        # Create Rosetta tree
        shared.mkdir_p(output_dir)
        shared.mkdir_p(input_dir)

        # Check if ss files exist.  The explicit mapping replaces a
        # locals() lookup: same keys, same KeyError for an unknown
        # method, but no dependence on whatever locals happen to exist.
        ss_paths = dict(psipred_path=psipred_path,
                        porter_path=porter_path,
                        jpred_path=jpred_path,
                        sspro_path=sspro_path)
        for method in cls.ss_methods:
            method_key = "{0}_path".format(method)
            f = ss_paths[method_key]
            if shared.missing(f):
                raise RunRosettaError("File %s not found" % f)
            flag_kwargs[method_key] = f

        flag_kwargs['native_line'] = ""
        flag_kwargs['protocol_type'] = ""

        # Copy fasta file
        try:
            shutil.copy(ligand_sequence, fastain)
        except shutil.Error:
            # same file error
            pass

        with shared.CHDIR(input_dir):
            checkpoint_file = "{0}.checkpoint".format(pdbid)
            if shared.missing(checkpoint_file):
                chk_file = "{0}.chk".format(pdbid)
                if shared.missing(chk_file):
                    # Run blast
                    cmd = cls.blastcmdfmt.format(id=path_id, **config)
                    cmd = cmd.split()
                    subprocess.check_call(cmd)
                # Convert to Rosetta checkpoint format
                subprocess.check_call([
                    cls.convert_blast, make_fragments_pl, fastain, chk_file
                ])

        # Copy quota sizes
        shutil.copy(os.path.join(template_dir, "quota.def"), input_dir)
        # Copy score weights
        weights_name = "quota-protocol.wghts"
        shutil.copy(os.path.join(template_dir, weights_name), input_dir)

        # Create homolog file
        homolog_file = os.path.join(input_dir,
                                    "{0}.homolog_vall".format(pdbid))
        with open(homolog_file, "w") as oh:
            oh.write("{0}\n".format(pdbid))

        # Create flag file
        with open(
                os.path.join(template_dir,
                             "quota-protocol.flags.template")) as ih:
            flags_template = ih.read()
        with open(os.path.join(directory, "quota-protocol.flags"),
                  "w") as oh:
            oh.write(flags_template.format(**flag_kwargs))

        # Run rosetta; stdout and stderr both go to <pdbid>.out
        with shared.CHDIR(directory):
            cmd = [fragment_picker_exe, "@quota-protocol.flags"]
            with open("{0}.out".format(pdbid), "w") as oh:
                returncode = subprocess.call(cmd, stdout=oh, stderr=oh)
            if returncode:
                raise RunRosettaError("Rosetta exited %s" % returncode)

        # Check if it's actually done
        if shared.missing(fragment_file) or shared.missing(score_file):
            raise RunRosettaError("Rosetta did not finish but exited 0")