def create_modeldist_tables(self, pdbid, windowid, windowindex_list):
    """
    Create database and tables
    """
    db_name = self.modeldist_sql_data['db_name_fmt'].format(
        pdbid=pdbid, windowid=windowid)
    if not shared.missing(db_name):
        return
    logging.debug("Creating new DB for %s", windowid)
    allowed_id = windowid - 1
    allowed_sql = self.modeldist_sql(cur_window=windowid,
                                     prev_window=allowed_id,
                                     mode="allowed",
                                     **self.modeldist_sql_data)
    allowed_schema = allowed_sql.pop('schema')
    window_dict = {windowindex_list[allowed_id]: allowed_sql}
    with shared.new_conn(db_name) as conn:
        cursor = conn.cursor()
        cursor.execute(allowed_schema)
        for prev_window in range(windowid - 1):
            disallowed_sql = self.modeldist_sql(cur_window=windowid,
                                                prev_window=prev_window,
                                                mode="disallowed",
                                                **self.modeldist_sql_data)
            cursor.execute(disallowed_sql.pop('schema'))
            window_dict[windowindex_list[prev_window]] = disallowed_sql
    return dict(db_name=db_name, window_dict=window_dict)
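# A minimal sketch of the `shared.new_conn` helper assumed above: a context
# manager that yields a sqlite3 connection, commits on success, and always
# closes. This is a hypothetical reconstruction based on how it is called,
# not the project's actual implementation.
import contextlib
import sqlite3

@contextlib.contextmanager
def new_conn(db_name):
    # Connect, hand the connection to the caller, commit if no error raised
    conn = sqlite3.connect(db_name)
    try:
        yield conn
        conn.commit()
    finally:
        conn.close()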
def main(directory, pdbid, r_ch, l_ch, input):
    wd = os.path.abspath(directory)
    subdir = os.path.join(wd, pdbid)
    complexid = "{0}{1}{2}".format(pdbid, r_ch, l_ch)
    complexdb = os.path.join(wd, "scores_{0}.db".format(complexid))
    if not shared.missing(complexdb):
        return
    # Initialize window data
    window_data = pd.read_csv(input)
    windows = list()
    fragments = list()
    # Windows are in 2nd-level subdirectories
    for window_dir in glob.iglob(os.path.join(subdir, "*")):
        # Skip 2nd-level files
        if not os.path.isdir(window_dir):
            continue
        subdir, windowindex = os.path.split(window_dir)
        windowindex = windowindex.lower().replace(pdbid.lower(), "")
        try:
            windowindex = int(windowindex)
        except ValueError:
            raise CreateDatabaseError(
                "Expected window directory format $PDBID$WINDOWINDEX (e.g. 1bfg1)")
        window_row = dict(windowindex=windowindex, window_wd=window_dir)
        windows.append(window_row)
        # Fragments are in 3rd-level subdirectories
        for fragment_dir in glob.iglob(os.path.join(window_dir, "*")):
            # Skip 3rd-level files
            if not os.path.isdir(fragment_dir):
                continue
            window_dir, fragmentindex = os.path.split(fragment_dir)
            fragment_row = dict(windowindex=windowindex,
                                fragmentindex=fragmentindex)
            fragments.append(fragment_row)
    window_df = pd.merge(window_data, pd.DataFrame(windows), on="windowindex")
    # Create fragment database
    with shared.new_conn(complexdb) as windowconn:
        cursor = windowconn.cursor()
        # Insert windows into database
        cursor.execute(window_schema)
        w_insert = shared.create_insert_statement("window", window_df.columns)
        cursor.executemany(w_insert, window_df.to_dict("records"))
        # Insert fragments into database
        cursor.execute(fragment_schema)
        insert = shared.create_insert_statement(
            "fragment", ["windowindex", "fragmentindex"])
        cursor.executemany(insert, fragments)
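# A minimal sketch of the `shared.create_insert_statement` helper used above,
# assuming it builds a named-parameter INSERT statement compatible with
# `cursor.executemany(stmt, list_of_dicts)` under sqlite3's named paramstyle.
# Hypothetical reconstruction, not the project's actual code.
def create_insert_statement(table, columns):
    columns = list(columns)
    col_csv = ", ".join(columns)
    param_csv = ", ".join(":{0}".format(c) for c in columns)
    return "INSERT INTO {0} ({1}) VALUES ({2})".format(table, col_csv, param_csv)

# Example:
#   create_insert_statement("fragment", ["windowindex", "fragmentindex"])
#   -> "INSERT INTO fragment (windowindex, fragmentindex)
#       VALUES (:windowindex, :fragmentindex)"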
def run(cls, fragment_file, ligand_chain, ligand_sequence, pdbid):
    base_dir = os.path.dirname(fragment_file)
    ligand_sequence = SeqIO.read(ligand_sequence, "fasta")
    windowdf = pd.DataFrame(shared.create_windows(len(ligand_sequence)))
    pos_list = windowdf['position'].drop_duplicates().tolist()
    windowdf.to_csv(os.path.join(base_dir, "{0}_data.csv".format(pdbid)),
                    index=False)
    print ligand_sequence
    # Convert Rosetta format to PDB format
    pos_file_dict = dict()
    with open(fragment_file, "r") as ih:
        position = None
        src_pdbid = None
        keep_position = False
        rows = list()
        for line in ih:
            parts = line.split()
            if line[:8] == "position":
                position = parts[1]
                position_path = os.path.join(base_dir, position)
                position = int(position)
                keep_position = position in pos_list
                if keep_position:
                    pos_file_dict[position] = list()
                    if not os.path.isdir(position_path):
                        os.mkdir(position_path)
                    logging.debug("Rosetta to CA for %s", position_path)
                index = 1
            # parts is an empty list if line is blank
            elif not parts:
                if keep_position and rows:
                    assert src_pdbid
                    filepath = os.path.join(position_path,
                                            "frag_%03d.pdb" % index)
                    pos_file_dict[position].append(filepath)
                    if shared.missing(filepath):
                        with open(filepath, "w") as oh:
                            oh.writelines(rows)
                    rows = list()
                    index += 1
                    src_pdbid = None
            elif keep_position:
                (pdbcode, pdbchain, resi, resn, ss,
                 phi, psi, omega, x, y, z) = parts
                new_pdbid = pdbcode + pdbchain
                if not rows:
                    src_pdbid = new_pdbid
                    rows.append(cls.header_fmt % src_pdbid)
                    res_id = position
                fmt_list = list(cls.pdb_default)
                query_idx = res_id - 1
                assert query_idx >= 0
                try:
                    query_resn = ligand_sequence[query_idx]
                except IndexError:
                    print position, index, res_id
                    raise
                real_res_id = res_id
                fmt_list[1] = real_res_id
                fmt_list[4] = shared.one_to_three[query_resn]
                fmt_list[5] = ligand_chain
                fmt_list[6] = real_res_id
                fmt_list[8] = x
                fmt_list[9] = y
                fmt_list[10] = z
                rows.append(cls.pdb_fmt % tuple(fmt_list))
                res_id += 1
    all_pos = sorted(pos_file_dict.keys(), key=int)
    last_pos = all_pos[-1]
    # Truncate last pos if necessary
    # 1, 7, 13, 19, 25, ... are starts
    if last_pos % 6 != 1:
        parser = PDB.PDBParser(QUIET=True)
        io = PDB.PDBIO()
        # Get computed position from database
        new_start = windowdf[windowdf['position'] == last_pos]['res_start'].tolist()[0]
        assert new_start % 6 == 1
        last_pos_dir = os.path.dirname(pos_file_dict[last_pos][0])
        new_dir = os.path.join(
            os.path.dirname(os.path.normpath(last_pos_dir)),
            "{0:.0f}".format(new_start))
        logging.debug("Changing position %s to start at %s",
                      last_pos, new_start)
        shared.mkdir_p(new_dir)
        # Add new dir to dict
        pos_file_dict[new_start] = list()
        residue_remove_slice = slice(new_start - last_pos)
        for fn in pos_file_dict.pop(last_pos):
            structure = parser.get_structure("fragment", fn)
            if len(structure.child_list) != 1:
                raise MakePdbError("More than one model in %s" % fn)
            model = structure.child_list[0]
            if len(model.child_list) != 1:
                raise MakePdbError("More than one chain in %s" % fn)
            chain = model[ligand_chain]
            for del_res in chain.get_list()[residue_remove_slice]:
                chain.detach_child(del_res.id)
            basename = os.path.basename(fn)
            outfile = os.path.join(new_dir, basename)
            io.set_structure(structure)
            io.save(outfile)
            pos_file_dict[new_start].append(outfile)
        shutil.rmtree(last_pos_dir)
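# A minimal sketch of the `shared.create_windows` helper assumed above.
# Based on the "1, 7, 13, 19, 25, ... are starts" comment and the `res_start`
# lookup used for truncation, windows appear to be 9-mers whose starts step
# by 6, with a final unaligned fragment position mapped to the next aligned
# start. This is a hypothetical reconstruction, not the project's code.
def create_windows(seq_len, window_size=9, step=6):
    # Aligned window starts: 1, 7, 13, ... while a full window still fits
    rows = [dict(position=p, res_start=p)
            for p in range(1, seq_len - window_size + 2, step)]
    # If the last fragment position is unaligned, record it and point its
    # res_start at the next aligned start so downstream code can truncate
    last_fragment = seq_len - window_size + 1
    if rows and rows[-1]['position'] != last_fragment:
        next_aligned = rows[-1]['position'] + step
        rows.append(dict(position=last_fragment, res_start=next_aligned))
    return rows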
def select_paths(self, complexname, receptor_chain, ligand_chain,
                 nwindows, ct, dest=None):
    pdb_kwargs = dict(complexname=complexname,
                      receptor_chain=receptor_chain,
                      ligand_chain=ligand_chain,
                      nwindows=nwindows)
    pdbid = "{complexname}{receptor_chain}{ligand_chain}".format(**pdb_kwargs)
    pdbwindowid = "{0}{nwindows}".format(pdbid, **pdb_kwargs)
    # Create top directory for pdbid
    # Equivalent to directory argument to constructor
    if dest is None:
        # Default dest is pdbid and window number
        dest = os.path.join(self.working_dir, pdbwindowid)
    elif not os.path.isabs(dest):
        # Place non-absolute dest relative to model db filedir
        dest = os.path.join(self.working_dir, dest)
    # CHARMM balks at mixed case
    dest = dest.lower()
    shared.mkdir_p(dest)
    path_db = "path_{0}_all.db".format(pdbid)
    path_db = os.path.join(self.working_dir, path_db)
    windows = ["window%s" % x for x in range(nwindows)]
    center_q = """SELECT pathsid, nodescore, edgescores, clustersize, {windows}
                  FROM clusters{nwindows}
                  JOIN paths{nwindows} USING (pathsid)
                  WHERE is_medoid=1
               """.format(nwindows=nwindows, windows=", ".join(windows))
    center_df = shared.db_to_pandas(center_q, path_db)
    occupancy_csv = "{0}_receptor_occupancy.csv".format(pdbwindowid)
    occupancy_file = os.path.join(self.working_dir, occupancy_csv)
    if shared.missing(occupancy_file):
        logging.warning("%s missing", occupancy_file)
        raise SelectPathsError("No occupancy score")
    # Load occupancy score
    occ_data = pd.read_csv(occupancy_file)
    # Combine occupancy score and other scores
    occ_data.rename(columns=dict(pathid="pathsid"), inplace=True)
    center_df = center_df.merge(occ_data, how="left")
    missing = center_df[center_df.isnull().any(axis=1)]
    if not missing.empty:
        print missing
        raise SelectPathsError("Null scores")
    for x, (scorename, ascending) in enumerate(neco_scores):
        multiplier = 1 if ascending else -1
        if any(pd.isnull(center_df[scorename])):
            logging.error("%s %s", pdbid, scorename)
            raise SelectPathsError("Null values")
        # Compute Z-scores
        center_df[scorename + "z"] = self.zscore(
            center_df[scorename] * multiplier)
    center_df["best_score"] = center_df.apply(
        lambda x: min(x[s + "z"] for s, __ in neco_scores), axis=1)
    # Compute weighted score
    notb_weight = 1 - b_weight
    notb_scores = [(wght, scrnm)
                   for wght, (scrnm, __) in zip(neco_weights, neco_scores)]

    def score_row(r):
        return b_weight * r['best_score'] + notb_weight * sum(
            wght * r[scrnm + "z"] for wght, scrnm in notb_scores)

    center_df['weighted_score'] = center_df.apply(score_row, axis=1)
    # Take top n paths by weighted score (lower is better)
    sorted_df = center_df.sort_values('weighted_score')
    top_n = sorted_df.head(ct)
    top_n[['pathsid', 'nodescorez', 'edgescoresz', 'clustersizez',
           'occupancyscorez', 'best_score', 'weighted_score'
           ]].to_csv(os.path.join(dest, "path_scores.csv"), index=False)
    paths = top_n.loc[:, ['pathsid'] + windows]
    model_db_file = "scores_{0}.db".format(pdbid)
    model_db_file = os.path.join(self.working_dir, model_db_file)
    self.combine_paths(paths=paths, model_db_file=model_db_file,
                       dest=dest, **pdb_kwargs)
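# `self.zscore` is not defined in this excerpt; a standard z-score over a
# pandas Series, consistent with how it is used above, would be (sketch):
def zscore(series):
    # Center on the mean and scale by the standard deviation
    return (series - series.mean()) / series.std(ddof=0)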
def merge_path(s):
    """
    Create subdirectory and combined.pdb for each path
    """
    # Create subdirectories for pathsid
    pathsid = s['pathsid']
    subdir = os.path.join(top_dir, str(pathsid))
    shared.mkdir_p(subdir)
    outfile = os.path.join(subdir, "%s.pdb" % struct_name)
    if not shared.missing(outfile):
        return outfile
    files = [s[w] for w in window_vars]
    structures = [get_structure(f) for f in files]
    chains = [struc[model_id][ligand_chain] for struc in structures]
    for s_start, c in zip(window_starts, chains):
        # Collect all residues (not modifying chain)
        r_list = [r for r in c]
        # Remove and re-number all residues
        for r in r_list:
            c.detach_child(r.id)
            cur_id = list(r.id)
            cur_id[1] += s_start - 1
            r.id = tuple(cur_id)
        # Re-add residues to empty chain
        for r in r_list:
            c.add(r)
    starts = [c.child_list[0].id[1] for c in chains]
    ends = [c.child_list[-1].id[1] for c in chains]
    sb = PDB.StructureBuilder.StructureBuilder()
    sb.init_structure(struct_name)
    sb.init_model(model_id)
    sb.init_seg(' ')
    # Create empty ligand chain
    sb.init_chain(ligand_chain)
    new_struct = sb.get_structure()
    # Add receptor chains
    for ch in receptor_chain:
        new_struct[model_id].add(receptor_model[ch])
    new_chain = new_struct[model_id][ligand_chain]
    for x in xrange(min(starts), max(ends) + 1):
        # Retrieve all residues with id 'x'
        residues = [c[x] for c in chains if x in c]
        n_res = len(residues)
        if n_res == 1:
            # Unpack single item
            res, = residues
            new_chain.add(res)
        elif n_res == 2:
            # Combined residue gets averaged position of the two residues
            res1, res2 = residues
            new_res = res1.copy()
            for atom1 in res1:
                atomname = atom1.name
                atom2 = res2[atomname]
                new_atom = new_res[atomname]
                avg_coord = (atom1.coord + atom2.coord) / 2.0
                new_atom.set_coord(avg_coord)
            new_chain.add(new_res)
        else:
            raise SelectPathsError("%s residues at %s" % (n_res, x))
    io.set_structure(new_struct)
    io.save(outfile)
    return outfile
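# Usage sketch (hypothetical; the enclosing function and its `paths`
# DataFrame are not shown here). `merge_path` reads one row by its
# 'pathsid' and window columns, so it would be applied row-wise:
#
#     combined_files = paths.apply(merge_path, axis=1)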
def __init__(self, complexid, nwindows, directory=None, limit=None):
    config = shared.load_config()
    self.clust_bin = os.path.join(config['lzerd_path'], "LB3Dclust")
    if directory is None:
        directory = script_dir
    path_db_file = os.path.join(directory,
                                "path_{0}_all.db".format(complexid))
    if shared.missing(path_db_file):
        raise ClusterPdbError("DB file %s not found" % path_db_file)
    model_db_file = os.path.join(directory,
                                 "scores_{0}.db".format(complexid))
    if shared.missing(model_db_file):
        raise ClusterPdbError("DB file %s not found" % model_db_file)
    logging.debug("\n%s", model_db_file)
    sql_dict = self.make_sql(complexid=complexid,
                             nwindows=nwindows,
                             limit=limit)
    pconn = sqlite3.connect(path_db_file, isolation_level="EXCLUSIVE")
    pcurs = pconn.cursor()
    # Check whether clustering is already done
    try:
        cluster_result = pcurs.execute(sql_dict['cluster_count'])
    except sqlite3.OperationalError:
        # Cluster table does not exist yet
        cluster_count = 0
    else:
        cluster_count = cluster_result.next()[0]
    n = pcurs.execute(sql_dict['path_count']).next()[0]
    done = (n and (cluster_count == n))
    if not done:
        if cluster_count:
            logging.debug("n paths: %s", n)
            logging.debug("n clusters: %s", cluster_count)
            sys.exit(1)
        path_q = sql_dict['path_select']
        row_gen = pcurs.execute(path_q)
        # Convert result tuples to dict of list
        modelid_dict = {int(row[0]): row[1:] for row in row_gen}
        # Start heuristic clustering
        cluster_gen = self.partial_cluster(modelid_dict=modelid_dict,
                                           complexid=complexid,
                                           nwindows=nwindows,
                                           model_db_file=model_db_file)
        # Create cluster table
        for stmt in sql_dict['cluster_schemas']:
            pcurs.execute(stmt)
        # Insert cluster rows
        insert = sql_dict['cluster_insert']
        # Write to disk in batches of 10000
        for path_chunk in itertools.izip_longest(*[iter(cluster_gen)] * 10000,
                                                 fillvalue=None):
            pcurs.executemany(insert,
                              (row for row in path_chunk if row is not None))
            pconn.commit()
    else:
        logging.debug("Clustering done.")
    pconn.close()
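# The `izip_longest(*[iter(cluster_gen)] * 10000, fillvalue=None)` call above
# is the standard itertools "grouper" recipe: both references share a single
# iterator, so each output tuple drains the next 10000 items, and the final
# chunk is padded with None (which the generator expression then filters out).
# A small self-contained illustration:
from itertools import izip_longest  # itertools.zip_longest on Python 3

def grouper(iterable, n, fillvalue=None):
    # n references to one iterator -> consecutive, non-overlapping chunks
    args = [iter(iterable)] * n
    return izip_longest(*args, fillvalue=fillvalue)

# list(grouper("ABCDEFG", 3))
# -> [('A', 'B', 'C'), ('D', 'E', 'F'), ('G', None, None)]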
def run(cls, complexname, ligand_chain, ligand_sequence, psipred_path,
        porter_path, jpred_path, sspro_path, directory=None, nfrag=None,
        **kwargs):
    config = shared.load_config()
    make_fragments_pl = os.path.join(
        config['rosetta_path'], "tools/fragment_tools/make_fragments.pl")
    fragment_picker_exe = os.path.join(
        config['rosetta_path'],
        "main/source/bin/fragment_picker.linuxgccrelease")
    if directory is None:
        directory = os.path.join(script_dir, "quota{0}".format(complexname))
    if nfrag is None:
        nfrag = cls.default_nfrag
    directory = os.path.abspath(directory)
    logging.info("DIRECTORY: %s", directory)
    # Check, prepare, run Rosetta
    pdbid = "{0}{1}".format(complexname, ligand_chain)
    output_dir = os.path.join(directory, "output_files")
    fragment_name = "{0}.{1}.9mers".format(pdbid, nfrag)
    fragment_file = os.path.join(output_dir, fragment_name)
    score_name = "{0}.fsc.{1}.9mers".format(pdbid, nfrag)
    score_file = os.path.join(output_dir, score_name)
    input_dir = os.path.join(directory, "input_files")
    path_id = os.path.join(input_dir, pdbid)
    fastain = path_id + ".fasta"
    if shared.missing(fragment_file) or shared.missing(score_file):
        flag_kwargs = dict(pdbid=pdbid,
                           nfrag=nfrag,
                           rosetta_path=config['rosetta_path'])
        template_dir = os.path.join(script_dir, "rosetta_templates")
        # Create Rosetta tree
        shared.mkdir_p(output_dir)
        shared.mkdir_p(input_dir)
        # Check that secondary structure prediction files exist
        for method in cls.ss_methods:
            method_key = "{0}_path".format(method)
            f = locals()[method_key]
            if shared.missing(f):
                raise RunRosettaError("File %s not found" % f)
            else:
                flag_kwargs[method_key] = f
        flag_kwargs['native_line'] = ""
        flag_kwargs['protocol_type'] = ""
        # Copy fasta file
        try:
            shutil.copy(ligand_sequence, fastain)
        except shutil.Error:
            # Source and destination are the same file
            pass
        with shared.CHDIR(input_dir):
            checkpoint_file = "{0}.checkpoint".format(pdbid)
            if shared.missing(checkpoint_file):
                chk_file = "{0}.chk".format(pdbid)
                if shared.missing(chk_file):
                    # Run blast
                    cmd = cls.blastcmdfmt.format(id=path_id, **config)
                    cmd = cmd.split()
                    subprocess.check_call(cmd)
                # Convert to Rosetta checkpoint format
                subprocess.check_call(
                    [cls.convert_blast, make_fragments_pl, fastain, chk_file])
        # Copy quota sizes
        shutil.copy(os.path.join(template_dir, "quota.def"), input_dir)
        # Copy score weights
        weights_name = "quota-protocol.wghts"
        shutil.copy(os.path.join(template_dir, weights_name), input_dir)
        # Create homolog file
        homolog_file = os.path.join(input_dir,
                                    "{0}.homolog_vall".format(pdbid))
        with open(homolog_file, "w") as oh:
            oh.write("{0}\n".format(pdbid))
        # Create flag file
        with open(os.path.join(template_dir,
                               "quota-protocol.flags.template")) as ih:
            flags_template = ih.read()
        with open(os.path.join(directory, "quota-protocol.flags"), "w") as oh:
            oh.write(flags_template.format(**flag_kwargs))
        # Run rosetta
        with shared.CHDIR(directory):
            cmd = [fragment_picker_exe, "@quota-protocol.flags"]
            with open("{0}.out".format(pdbid), "w") as oh:
                proc = subprocess.Popen(cmd, stdout=oh, stderr=oh)
                returncode = proc.wait()
            if returncode:
                raise RunRosettaError("Rosetta exited %s" % returncode)
    # Check if it's actually done
    if shared.missing(fragment_file) or shared.missing(score_file):
        raise RunRosettaError("Rosetta did not finish but exited 0")
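# A minimal sketch of the `shared.CHDIR` helper assumed above: a context
# manager that changes into a directory and restores the previous working
# directory on exit. Hypothetical reconstruction based on how it is used.
import os

class CHDIR(object):
    def __init__(self, path):
        self.path = path

    def __enter__(self):
        self.prev = os.getcwd()
        os.chdir(self.path)

    def __exit__(self, exc_type, exc_value, traceback):
        # Always restore, even if the block raised
        os.chdir(self.prev)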
def count_receptor_contacts(cls, paths, complexname, receptor_chain,
                            ligand_chain, nwindows, dbf, model_db_file,
                            query_dict):
    """
    Count number of paths contacting each receptor residue
    """
    wd = os.path.dirname(model_db_file)
    pdb_kwargs = dict(complexname=complexname,
                      receptor_chain=receptor_chain,
                      ligand_chain=ligand_chain,
                      nwindows=nwindows)
    pdbwindowid = ("{complexname}{receptor_chain}{ligand_chain}{nwindows}"
                   .format(**pdb_kwargs))
    outfile = os.path.join(wd, "{0}_path_contacts.pdb".format(pdbwindowid))
    path_score_file = os.path.join(
        wd, "{0}_receptor_occupancy.csv".format(pdbwindowid))
    if not shared.missing(outfile) and not shared.missing(path_score_file):
        logging.debug("%s exists", outfile)
        return
    cutoff = 5.0
    residue_fmt = "{chain}_{resname}{resid[1]}"

    def make_key(residue):
        __, __, chainid, residueid = residue.get_full_id()
        return residue_fmt.format(chain=chainid,
                                  resname=residue.get_resname(),
                                  resid=residueid)

    # Drop window (modelid) columns
    orig_window_vars = [x for x in paths.columns.values.tolist()
                        if x.startswith("window")]
    for window_var in orig_window_vars:
        paths = paths.drop(window_var, axis=1)
    # Get model filepaths for paths
    filepaths = cls.get_paths(paths[['pathsid']],
                              dbf=dbf,
                              model_db_file=model_db_file,
                              query_dict=query_dict)
    window_vars = [x for x in filepaths.columns.values.tolist()
                   if x.startswith("window")]
    get_files = lambda row: [row[w] for w in window_vars]
    parser = PDB.PDBParser(QUIET=True)
    # Parse each structure after removing hydrogens
    get_structure = lambda x: parser.get_structure(
        os.path.splitext(os.path.basename(x))[0], shared.strip_h(x))
    modelid = 0
    receptor_contacts = collections.defaultdict(set)
    for x, row in filepaths.iterrows():
        pathsid = row['pathsid']
        path_files = get_files(row)
        for fn in path_files:
            structure = get_structure(fn)
            atoms = [atom
                     for chain in structure[modelid]
                     for residue in chain
                     for atom in residue]
            if not atoms:
                raise PlotPathsError("No atoms in %s" % fn)
            ns = PDB.NeighborSearch(atoms)
            search = ns.search_all(radius=cutoff, level="R")
            for res1, res2 in search:
                __, __, c1, r1 = res1.get_full_id()
                __, __, c2, r2 = res2.get_full_id()
                # Skip if chains are both ligand or both receptor
                if (c1 == ligand_chain) == (c2 == ligand_chain):
                    continue
                if c1 in receptor_chain:
                    key = make_key(res1)
                elif c2 in receptor_chain:
                    key = make_key(res2)
                else:
                    raise PlotPathsError("Neither %s nor %s is receptor" %
                                         (c1, c2))
                receptor_contacts[key].add(pathsid)
    # Convert from defaultdict to normal dict
    receptor_contacts = dict(receptor_contacts)
    # Count paths contacting each receptor residue
    emptyset = set()
    # Chains have been combined
    r_ch = receptor_chain[0]
    # Deliberately using last structure from loop
    for residue in structure[modelid][r_ch]:
        key = make_key(residue)
        mypaths = receptor_contacts.get(key, emptyset)
        count = len(mypaths)
        for atom in residue:
            atom.set_bfactor(count)
    # Write out structure with b-factor
    #structure[modelid].detach_child(ligand_chain)
    #io = PDB.PDBIO()
    #io.set_structure(structure)
    #io.save(outfile)
    # Count receptor contacts for each path
    path_score_dict = collections.defaultdict(int)
    for contacts in receptor_contacts.itervalues():
        n_contacts = len(contacts)
        for pathid in contacts:
            path_score_dict[pathid] += n_contacts
    path_score_df = pd.DataFrame(path_score_dict.items(),
                                 columns=["pathid", "occupancyscore"])
    path_score_df.to_csv(path_score_file, index=False)
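# `shared.strip_h` is assumed to return something `PDBParser.get_structure`
# accepts as its file argument (Bio.PDB takes a filename or a file-like
# handle) with hydrogens removed. A hypothetical sketch that drops hydrogen
# ATOM/HETATM records and returns an in-memory handle:
from StringIO import StringIO  # io.StringIO on Python 3

def strip_h(pdb_path):
    kept = []
    with open(pdb_path) as fh:
        for line in fh:
            # The element symbol occupies columns 77-78 of coordinate records
            if (line.startswith(("ATOM", "HETATM"))
                    and line[76:78].strip() == "H"):
                continue
            kept.append(line)
    return StringIO("".join(kept))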