Example #1
    def create_modeldist_tables(self, pdbid, windowid, windowindex_list):
        """
        Create database and tables
        """
        db_name = self.modeldist_sql_data['db_name_fmt'].format(
            pdbid=pdbid, windowid=windowid)
        if not shared.missing(db_name):
            return
        logging.debug("Creating new DB for %s", windowid)
        allowed_id = windowid - 1
        allowed_sql = self.modeldist_sql(cur_window=windowid,
                                         prev_window=allowed_id,
                                         mode="allowed",
                                         **self.modeldist_sql_data)
        allowed_schema = allowed_sql.pop('schema')
        window_dict = {windowindex_list[allowed_id]: allowed_sql}
        with shared.new_conn(db_name) as conn:
            cursor = conn.cursor()
            cursor.execute(allowed_schema)
            for prev_window in range(windowid - 1):
                disallowed_sql = self.modeldist_sql(cur_window=windowid,
                                                    prev_window=prev_window,
                                                    mode="disallowed",
                                                    **self.modeldist_sql_data)
                cursor.execute(disallowed_sql.pop('schema'))
                window_dict[windowindex_list[prev_window]] = disallowed_sql
        return dict(db_name=db_name, window_dict=window_dict)
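Example #1 (and several of the examples below) relies on a small `shared` utility module that is not part of these excerpts. A minimal sketch of the two helpers used here, assuming `shared.missing` treats nonexistent or empty files as missing and `shared.new_conn` wraps sqlite3 (both names and behaviors are inferred from usage, not confirmed):

    import os
    import sqlite3

    def missing(path):
        # Assumed semantics: a nonexistent or zero-byte file counts as missing
        return not os.path.isfile(path) or os.path.getsize(path) == 0

    def new_conn(db_name):
        # sqlite3 connections are context managers: commit on success,
        # rollback on exception
        return sqlite3.connect(db_name)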
Example #2
def main(directory, pdbid, r_ch, l_ch, input_file):

    wd = os.path.abspath(directory)
    subdir = os.path.join(wd, pdbid)
    complexid = "{0}{1}{2}".format(pdbid, r_ch, l_ch)
    complexdb = os.path.join(wd, "scores_{0}.db".format(complexid))

    if not shared.missing(complexdb):
        return

    # Initialize window data
    window_data = pd.read_csv(input_file)
    windows = list()
    fragments = list()
    # Windows are in 2nd-level subdirectories
    for window_dir in glob.iglob(os.path.join(subdir, "*")):
        # Skip 2nd-level files
        if not os.path.isdir(window_dir):
            continue
        windowindex = os.path.basename(window_dir)
        windowindex = windowindex.lower().replace(pdbid.lower(), "")
        try:
            windowindex = int(windowindex)
        except ValueError:
            raise CreateDatabaseError(
                "Expected window directory format $PDBID$WINDOWINDEX (e.g. 1bfg1)")
        window_row = dict(windowindex=windowindex, window_wd=window_dir)
        windows.append(window_row)
        # Fragments are in 3rd-level subdirectories
        for fragment_dir in glob.iglob(os.path.join(window_dir, "*")):
            # Skip 3rd-level files
            if not os.path.isdir(fragment_dir):
                continue
            fragmentindex = os.path.basename(fragment_dir)
            fragment_row = dict(windowindex=windowindex, fragmentindex=fragmentindex)
            fragments.append(fragment_row)

    window_df = pd.merge(window_data, pd.DataFrame(windows), on="windowindex")

    # Create fragment database
    with shared.new_conn(complexdb) as windowconn:
        cursor = windowconn.cursor()
        # Insert windows into database
        cursor.execute(window_schema)
        w_insert = shared.create_insert_statement("window", window_df.columns)
        cursor.executemany(w_insert, window_df.to_dict("records"))
        # Insert fragments into database
        cursor.execute(fragment_schema)
        insert = shared.create_insert_statement("fragment", ["windowindex", "fragmentindex"])
        cursor.executemany(insert, fragments)
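`shared.create_insert_statement` is called with a table name and a column list, and its result is passed to `executemany` together with dict records. A plausible sketch using named SQLite parameters (hypothetical; the real helper is not shown in these excerpts):

    def create_insert_statement(table, columns):
        # Builds e.g. "INSERT INTO window (a, b) VALUES (:a, :b)" so that
        # executemany can consume dicts such as DataFrame.to_dict("records")
        cols = ", ".join(columns)
        params = ", ".join(":{0}".format(c) for c in columns)
        return "INSERT INTO {0} ({1}) VALUES ({2})".format(table, cols, params)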
Example #3
    def run(cls, fragment_file, ligand_chain, ligand_sequence, pdbid):
        base_dir = os.path.dirname(fragment_file)

        ligand_sequence = SeqIO.read(ligand_sequence, "fasta")

        windowdf = pd.DataFrame(shared.create_windows(len(ligand_sequence)))
        pos_list = windowdf['position'].drop_duplicates().tolist()
        windowdf.to_csv(os.path.join(base_dir, "{0}_data.csv".format(pdbid)),
                        index=False)

        print(ligand_sequence)

        ### Convert Rosetta format to PDB format
        pos_file_dict = dict()
        with open(fragment_file, "r") as ih:
            position = None
            src_pdbid = None
            keep_position = False
            rows = list()
            for line in ih:
                parts = line.split()
                if line.startswith("position"):
                    position = parts[1]
                    position_path = os.path.join(base_dir, position)
                    position = int(position)
                    keep_position = position in pos_list
                    if keep_position:
                        pos_file_dict[position] = list()
                        if not os.path.isdir(position_path):
                            os.mkdir(position_path)
                        logging.debug("Rosetta to CA for %s", position_path)
                        index = 1
                # Parts is an empty list if line is blank
                elif not parts:
                    if keep_position and rows:
                        assert src_pdbid
                        filepath = os.path.join(position_path,
                                                "frag_%03d.pdb" % index)
                        pos_file_dict[position].append(filepath)
                        if shared.missing(filepath):
                            with open(filepath, "w") as oh:
                                oh.writelines(rows)
                        rows = list()
                        index += 1
                    src_pdbid = None
                elif keep_position:
                    pdbcode, pdbchain, resi, resn, ss, phi, psi, omega, x, y, z = parts
                    new_pdbid = pdbcode + pdbchain
                    if not rows:
                        src_pdbid = new_pdbid
                        rows.append(cls.header_fmt % src_pdbid)
                        res_id = position
                    fmt_list = list(cls.pdb_default)
                    query_idx = res_id - 1
                    assert query_idx >= 0
                    try:
                        query_resn = ligand_sequence[query_idx]
                    except IndexError:
                        print(position, index, res_id)
                        raise
                    fmt_list[1] = res_id
                    fmt_list[4] = shared.one_to_three[query_resn]
                    fmt_list[5] = ligand_chain
                    fmt_list[6] = res_id
                    fmt_list[8] = x
                    fmt_list[9] = y
                    fmt_list[10] = z
                    rows.append(cls.pdb_fmt % tuple(fmt_list))
                    res_id += 1

        all_pos = sorted(pos_file_dict.keys(), key=int)
        last_pos = all_pos[-1]

        # Truncate last pos if necessary
        # 1, 7, 13, 19, 25, ... are starts
        if last_pos % 6 != 1:
            parser = PDB.PDBParser(QUIET=True)
            io = PDB.PDBIO()
            # Get computed position from database
            new_start = windowdf[windowdf['position'] ==
                                 last_pos]['res_start'].tolist()[0]
            assert new_start % 6 == 1
            last_pos_dir = os.path.dirname(pos_file_dict[last_pos][0])
            new_dir = os.path.join(
                os.path.dirname(os.path.normpath(last_pos_dir)),
                "{0:.0f}".format(new_start))
            logging.debug("Changing position %s to start at %s", last_pos,
                          new_start)
            shared.mkdir_p(new_dir)
            # ADD NEW DIR TO DICT
            pos_file_dict[new_start] = list()
            residue_remove_slice = slice(new_start - last_pos)
            for fn in pos_file_dict.pop(last_pos):
                structure = parser.get_structure("fragment", fn)
                if len(structure.child_list) != 1:
                    raise MakePdbError("Expected exactly one model in %s" % fn)
                model = structure.child_list[0]
                if len(model.child_list) != 1:
                    raise MakePdbError("Expected exactly one chain in %s" % fn)
                chain = model[ligand_chain]
                for del_res in chain.get_list()[residue_remove_slice]:
                    chain.detach_child(del_res.id)
                basename = os.path.basename(fn)
                outfile = os.path.join(new_dir, basename)
                io.set_structure(structure)
                io.save(outfile)
                pos_file_dict[new_start].append(outfile)
            shutil.rmtree(last_pos_dir)
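`shared.mkdir_p` appears in most of the remaining examples. A conventional implementation, assuming it mirrors `mkdir -p` (create intermediate directories and ignore an already-existing target):

    import errno
    import os

    def mkdir_p(path):
        try:
            os.makedirs(path)
        except OSError as e:
            # Ignore "already exists"; re-raise anything else
            if e.errno != errno.EEXIST:
                raise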
Example #4
    def select_paths(self,
                     complexname,
                     receptor_chain,
                     ligand_chain,
                     nwindows,
                     ct,
                     dest=None):

        pdb_kwargs = dict(complexname=complexname,
                          receptor_chain=receptor_chain,
                          ligand_chain=ligand_chain,
                          nwindows=nwindows)
        pdbid = "{complexname}{receptor_chain}{ligand_chain}".format(
            **pdb_kwargs)
        pdbwindowid = "{0}{nwindows}".format(pdbid, **pdb_kwargs)

        # Create top directory for pdbid
        # Equivalent to directory argument to constructor
        if dest is None:
            # Default dest is pdbid and window number
            dest = os.path.join(self.working_dir, pdbwindowid)
        else:
            # Place non-absolute dest relative to model db filedir
            if not os.path.isabs(dest):
                dest = os.path.join(self.working_dir, dest)

        # charmm balks at mixed case
        dest = dest.lower()
        shared.mkdir_p(dest)

        path_db = "path_{0}_all.db".format(pdbid)
        path_db = os.path.join(self.working_dir, path_db)
        windows = ["window%s" % x for x in range(nwindows)]
        center_q = """SELECT
        pathsid, nodescore, edgescores, clustersize,
        {windows}
        FROM clusters{nwindows}
        JOIN paths{nwindows} USING (pathsid)
        WHERE is_medoid=1
        """.format(nwindows=nwindows, windows=", ".join(windows))

        center_df = shared.db_to_pandas(center_q, path_db)

        occupancy_csv = "{0}_receptor_occupancy.csv".format(pdbwindowid)
        occupancy_file = os.path.join(self.working_dir, occupancy_csv)
        if shared.missing(occupancy_file):
            logging.warning("%s missing", occupancy_file)
            raise SelectPathsError("No occupancy score")
        # Load occupancy score
        occ_data = pd.read_csv(occupancy_file)
        # Combine occupancy score and other scores
        occ_data.rename(columns=dict(pathid="pathsid"), inplace=True)
        center_df = center_df.merge(occ_data, how="left")
        missing = center_df[center_df.isnull().any(axis=1)]
        if not missing.empty:
            print(missing)
            raise SelectPathsError("Null scores")

        for scorename, ascending in neco_scores:
            multiplier = 1
            if not ascending:
                multiplier = -1

            if center_df[scorename].isnull().any():
                logging.error("%s %s", pdbid, scorename)
                raise SelectPathsError("Null values")
            # compute Z-scores
            center_df[scorename + "z"] = self.zscore(center_df[scorename] *
                                                     multiplier)
        center_df["best_score"] = center_df.apply(
            lambda x: min(x[s + "z"] for s, __ in neco_scores), axis=1)

        # compute weighted score
        notb_weight = 1 - b_weight
        notb_scores = [(wght, scrnm)
                       for wght, (scrnm, __) in zip(neco_weights, neco_scores)]

        def score_row(r):
            return b_weight * r['best_score'] + notb_weight * sum(
                wght * r[scrnm + "z"] for wght, scrnm in notb_scores)

        center_df['weighted_score'] = center_df.apply(score_row, axis=1)
        # take top n
        ranked = center_df.sort_values('weighted_score')
        top_n = ranked.head(ct)
        top_n[[
            'pathsid', 'nodescorez', 'edgescoresz', 'clustersizez',
            'occupancyscorez', 'best_score', 'weighted_score'
        ]].to_csv(os.path.join(dest, "path_scores.csv"), index=False)
        paths = top_n.loc[:, ['pathsid'] + windows]

        model_db_file = "scores_{0}.db".format(pdbid)
        model_db_file = os.path.join(self.working_dir, model_db_file)

        self.combine_paths(paths=paths,
                           model_db_file=model_db_file,
                           dest=dest,
                           **pdb_kwargs)
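Example #4 also assumes `shared.db_to_pandas` and a `zscore` method. Minimal sketches consistent with how they are called; `zscore` is invoked as a method above but shown here as a plain function (both are assumptions, not the confirmed definitions):

    import sqlite3
    import pandas as pd

    def db_to_pandas(query, db_file):
        # Run a query against a SQLite file and return the result as a DataFrame
        conn = sqlite3.connect(db_file)
        try:
            return pd.read_sql_query(query, conn)
        finally:
            conn.close()

    def zscore(series):
        # Standard score: center on the mean, scale by the standard deviation
        return (series - series.mean()) / series.std()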
Example #5
        def merge_path(s):
            """
            Create subdirectory and combined.pdb for each path
            """
            # Create subdirectories for pathsid
            pathsid = s['pathsid']
            subdir = os.path.join(top_dir, str(pathsid))
            shared.mkdir_p(subdir)
            outfile = os.path.join(subdir, "%s.pdb" % struct_name)
            if not shared.missing(outfile):
                return outfile
            files = [s[w] for w in window_vars]
            structures = [get_structure(f) for f in files]
            chains = [struc[model_id][ligand_chain] for struc in structures]

            for s_start, c in zip(window_starts, chains):
                # Collect all residues (not modifying chain)
                r_list = [r for r in c]
                # Remove and re-number all residues
                for r in r_list:
                    c.detach_child(r.id)
                    cur_id = list(r.id)
                    cur_id[1] += s_start - 1
                    r.id = tuple(cur_id)
                # Re-add residues to empty chain
                for r in r_list:
                    c.add(r)

            starts = [c.child_list[0].id[1] for c in chains]
            ends = [c.child_list[-1].id[1] for c in chains]

            sb = PDB.StructureBuilder.StructureBuilder()
            sb.init_structure(struct_name)
            sb.init_model(model_id)
            sb.init_seg('    ')
            # Create empty ligand chain
            sb.init_chain(ligand_chain)
            new_struct = sb.get_structure()
            # Add receptor chains
            for ch in receptor_chain:
                new_struct[model_id].add(receptor_model[ch])

            new_chain = new_struct[model_id][ligand_chain]

            for x in range(min(starts), max(ends) + 1):
                # Retrieve all residues with id 'x'
                residues = [c[x] for c in chains if x in c]
                # Running total of segment IDs
                n_res = len(residues)
                if n_res == 1:
                    # Unpack single item
                    res, = residues
                    new_chain.add(res)
                elif n_res == 2:
                    # Combined gets averaged position of two residues
                    res1, res2 = residues
                    new_res = res1.copy()
                    for atom1 in res1:
                        atomname = atom1.name
                        atom2 = res2[atomname]
                        new_atom = new_res[atomname]
                        coord1 = atom1.coord
                        coord2 = atom2.coord
                        avg_coord = (coord1 + coord2) / 2.0
                        new_atom.set_coord(avg_coord)
                    new_chain.add(new_res)
                else:
                    raise SelectPathsError("%s residues at %s" % (n_res, x))

            io.set_structure(new_struct)
            io.save(outfile)
            return outfile
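`merge_path` is a nested function: `top_dir`, `struct_name`, `window_vars`, `window_starts`, `model_id`, `receptor_model`, `io`, and `get_structure` come from the enclosing scope. A plausible sketch of `get_structure`, mirroring the helper in Example #8 minus the hydrogen stripping (an assumption, not the confirmed definition):

    parser = PDB.PDBParser(QUIET=True)

    def get_structure(path):
        # Name the structure after the file so fragment IDs stay readable
        name = os.path.splitext(os.path.basename(path))[0]
        return parser.get_structure(name, path)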
Example #6
    def __init__(self, complexid, nwindows, directory=None, limit=None):

        config = shared.load_config()
        self.clust_bin = os.path.join(config['lzerd_path'], "LB3Dclust")

        if directory is None:
            directory = script_dir

        path_db_file = os.path.join(directory,
                                    "path_{0}_all.db".format(complexid))
        if shared.missing(path_db_file):
            raise ClusterPdbError("DB file %s not found" % path_db_file)
        model_db_file = os.path.join(directory,
                                     "scores_{0}.db".format(complexid))
        if shared.missing(model_db_file):
            raise ClusterPdbError("DB file %s not found" % model_db_file)
        logging.debug("\n%s", model_db_file)

        sql_dict = self.make_sql(complexid=complexid,
                                 nwindows=nwindows,
                                 limit=limit)

        pconn = sqlite3.connect(path_db_file, isolation_level="EXCLUSIVE")
        pcurs = pconn.cursor()
        # Check done
        try:
            cluster_result = pcurs.execute(sql_dict['cluster_count'])
        except sqlite3.OperationalError:
            cluster_count = 0
        else:
            cluster_count = cluster_result.fetchone()[0]
        n = pcurs.execute(sql_dict['path_count']).fetchone()[0]
        done = (n and (cluster_count == n))
        if not done:
            if cluster_count:
                logging.debug("n paths: %s", n)
                logging.debug("n clusters: %s", cluster_count)
                sys.exit(1)
            path_q = sql_dict['path_select']
            row_gen = pcurs.execute(path_q)
            # Convert result tuples to dict of list
            modelid_dict = {int(row[0]): row[1:] for row in row_gen}
            # Start heuristic clustering
            cluster_gen = self.partial_cluster(modelid_dict=modelid_dict,
                                               complexid=complexid,
                                               nwindows=nwindows,
                                               model_db_file=model_db_file)
            # Create cluster table
            for stmt in sql_dict['cluster_schemas']:
                pcurs.execute(stmt)
            # Insert cluster rows
            insert = sql_dict['cluster_insert']
            # Write to disk in batches
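            # Grouper idiom: zipping 10000 references to one iterator yields
            # fixed-size chunks; the final chunk is padded with None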
            for path_chunk in itertools.zip_longest(
                    *[iter(cluster_gen)] * 10000, fillvalue=None):
                pcurs.executemany(insert,
                                  (row
                                   for row in path_chunk if row is not None))
            pconn.commit()
        else:
            logging.debug("Clustering done.")
        pconn.close()
Example #7
    def run(cls,
            complexname,
            ligand_chain,
            ligand_sequence,
            psipred_path,
            porter_path,
            jpred_path,
            sspro_path,
            directory=None,
            nfrag=None,
            **kwargs):

        config = shared.load_config()
        make_fragments_pl = os.path.join(
            config['rosetta_path'], "tools/fragment_tools/make_fragments.pl")
        fragment_picker_exe = os.path.join(
            config['rosetta_path'],
            "main/source/bin/fragment_picker.linuxgccrelease")


        if directory is None:
            directory = os.path.join(script_dir,
                                     "quota{0}".format(complexname))
        if nfrag is None:
            nfrag = cls.default_nfrag

        directory = os.path.abspath(directory)
        logging.info("DIRECTORY: %s", directory)

        # Check, prepare, run Rosetta
        pdbid = "{0}{1}".format(complexname, ligand_chain)
        output_dir = os.path.join(directory, "output_files")
        fragment_name = "{0}.{1}.9mers".format(pdbid, nfrag)
        fragment_file = os.path.join(output_dir, fragment_name)
        score_name = "{0}.fsc.{1}.9mers".format(pdbid, nfrag)
        score_file = os.path.join(output_dir, score_name)
        input_dir = os.path.join(directory, "input_files")
        path_id = os.path.join(input_dir, pdbid)
        fastain = path_id + ".fasta"
        if shared.missing(fragment_file) or shared.missing(score_file):
            flag_kwargs = dict(pdbid=pdbid,
                               nfrag=nfrag,
                               rosetta_path=config['rosetta_path'])
            template_dir = os.path.join(script_dir, "rosetta_templates")
            # Create Rosetta tree
            shared.mkdir_p(output_dir)
            shared.mkdir_p(input_dir)
            # Check if ss files exist
            for method in cls.ss_methods:
                method_key = "{0}_path".format(method)
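                # Look up the matching keyword argument (e.g. psipred_path)
                # by name; fragile, but keeps the loop table-driven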
                f = locals()[method_key]
                if shared.missing(f):
                    raise RunRosettaError("File %s not found" % f)
                flag_kwargs[method_key] = f
            native_line = ""
            protocol_type = ""
            flag_kwargs['native_line'] = native_line
            flag_kwargs['protocol_type'] = protocol_type
            # Copy fasta file
            try:
                shutil.copy(ligand_sequence, fastain)
            except shutil.Error:
                # same file error
                pass
            with shared.CHDIR(input_dir):
                checkpoint_file = "{0}.checkpoint".format(pdbid)
                if shared.missing(checkpoint_file):
                    chk_file = "{0}.chk".format(pdbid)
                    if shared.missing(chk_file):
                        # Run blast
                        cmd = cls.blastcmdfmt.format(id=path_id, **config)
                        cmd = cmd.split()
                        subprocess.check_call(cmd)
                    # Convert to Rosetta checkpoint format
                    subprocess.check_call([
                        cls.convert_blast, make_fragments_pl, fastain, chk_file
                    ])
            # Copy quota sizes
            shutil.copy(os.path.join(template_dir, "quota.def"), input_dir)
            # Copy score weights
            weights_name = "quota-protocol.wghts"
            shutil.copy(os.path.join(template_dir, weights_name), input_dir)
            # Create homolog file
            homolog_file = os.path.join(input_dir,
                                        "{0}.homolog_vall".format(pdbid))
            with open(homolog_file, "w") as oh:
                oh.write("{0}\n".format(pdbid))
            # Create flag file
            with open(
                    os.path.join(template_dir,
                                 "quota-protocol.flags.template")) as ih:
                flags_template = ih.read()
            with open(os.path.join(directory, "quota-protocol.flags"),
                      "w") as oh:
                oh.write(flags_template.format(**flag_kwargs))

            # Run rosetta
            with shared.CHDIR(directory):
                cmd = [fragment_picker_exe, "@quota-protocol.flags"]
                with open("{0}.out".format(pdbid), "w") as oh:
                    proc = subprocess.Popen(cmd, stdout=oh, stderr=oh)
                    returncode = proc.wait()
                    if returncode:
                        raise RunRosettaError("Rosetta exited %s" % returncode)

            # Check if it's actually done
            if shared.missing(fragment_file) or shared.missing(score_file):
                raise RunRosettaError("Rosetta did not finish but exited 0")
Example #8
    def count_receptor_contacts(cls, paths, complexname, receptor_chain,
                                ligand_chain, nwindows, dbf, model_db_file,
                                query_dict):
        """
        Count number of paths contacting each receptor
        """

        wd = os.path.dirname(model_db_file)
        pdb_kwargs = dict(complexname=complexname,
                          receptor_chain=receptor_chain,
                          ligand_chain=ligand_chain,
                          nwindows=nwindows)
        pdbwindowid = "{complexname}{receptor_chain}{ligand_chain}{nwindows}".format(
            **pdb_kwargs)
        outfile = os.path.join(wd, "{0}_path_contacts.pdb".format(pdbwindowid))
        path_score_file = os.path.join(
            wd, "{0}_receptor_occupancy.csv".format(pdbwindowid))
        if not shared.missing(outfile) and not shared.missing(path_score_file):
            logging.debug("%s exists", outfile)
            return

        cutoff = 5.0

        residue_fmt = "{chain}_{resname}{resid[1]}"

        def make_key(residue):
            __, __, chainid, residueid = residue.get_full_id()
            return residue_fmt.format(chain=chainid,
                                      resname=residue.get_resname(),
                                      resid=residueid)

        # Drop the window (modelid) columns; fresh filepaths are fetched below
        orig_window_vars = [
            x for x in paths.columns.values.tolist() if x.startswith("window")
        ]
        for window_var in orig_window_vars:
            paths = paths.drop(window_var, axis=1)
        # Get model filepaths for paths
        filepaths = cls.get_paths(paths[['pathsid']],
                                  dbf=dbf,
                                  model_db_file=model_db_file,
                                  query_dict=query_dict)
        window_vars = [
            x for x in filepaths.columns.values.tolist()
            if x.startswith("window")
        ]
        def get_files(row):
            return [row[w] for w in window_vars]

        parser = PDB.PDBParser(QUIET=True)

        def get_structure(path):
            # Parse with hydrogens removed; name the structure after the file
            return parser.get_structure(
                os.path.splitext(os.path.basename(path))[0],
                shared.strip_h(path))
        modelid = 0

        receptor_contacts = collections.defaultdict(set)
        for x, row in filepaths.iterrows():
            pathsid = row['pathsid']
            path_files = get_files(row)
            for fn in path_files:
                structure = get_structure(fn)
                atoms = [
                    atom for chain in structure[modelid] for residue in chain
                    for atom in residue
                ]
                if not atoms:
                    raise PlotPathsError("No atoms in %s" % fn)
                ns = PDB.NeighborSearch(atoms)
                search = ns.search_all(radius=cutoff, level="R")
                for res1, res2 in search:
                    __, __, c1, r1 = res1.get_full_id()
                    __, __, c2, r2 = res2.get_full_id()
                    # Skip if chains are both ligand or both receptor
                    if (c1 == ligand_chain) == (c2 == ligand_chain):
                        continue
                    if c1 in receptor_chain:
                        key = make_key(res1)
                    elif c2 in receptor_chain:
                        key = make_key(res2)
                    else:
                        raise PlotPathsError("Neither %s nor %s is receptor" %
                                             (c1, c2))
                    receptor_contacts[key].add(pathsid)
        # Convert from defaultdict to normal dict
        receptor_contacts = dict(receptor_contacts)

        # Count paths contacting each receptor residue
        emptyset = set()
        # Chains have been combined
        r_ch = receptor_chain[0]
        # Deliberately using last structure from loop
        for residue in structure[modelid][r_ch]:
            key = make_key(residue)
            mypaths = receptor_contacts.get(key, emptyset)
            count = len(mypaths)
            for atom in residue:
                atom.set_bfactor(count)

        # Write out structure with b-factor
        structure[modelid].detach_child(ligand_chain)
        io = PDB.PDBIO()
        io.set_structure(structure)
        io.save(outfile)

        # Count receptor contacts for each path
        path_score_dict = collections.defaultdict(int)
        for contacts in receptor_contacts.values():
            n_contacts = len(contacts)
            for pathid in contacts:
                path_score_dict[pathid] += n_contacts
        path_score_df = pd.DataFrame(list(path_score_dict.items()),
                                     columns=["pathid", "occupancyscore"])
        path_score_df.to_csv(path_score_file, index=False)
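`shared.strip_h` must return something `PDBParser.get_structure` accepts (a path or a file handle). A plausible sketch that filters hydrogen records into an in-memory handle, assuming the standard PDB element columns 77-78 (hypothetical; the real helper is not shown):

    from io import StringIO

    def strip_h(path):
        with open(path) as fh:
            kept = [line for line in fh
                    if not (line.startswith(("ATOM", "HETATM"))
                            and line[76:78].strip() == "H")]
        return StringIO("".join(kept))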