    def prepare_sql(self, complexdb):
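        """
        Create the model table if it does not exist, and return the SQL
        statements used by choose(): the model insert, the per-window
        count and data query templates, and the allmodel row count query.
        """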

        sql_kwargs = dict(
            model_tablename="allmodel",
            model_choose_tablename="model",
            model_choose_columns=["modelid", "di", "coordinates"],
        )

        model_choose_schema = """
        CREATE TABLE IF NOT EXISTS {model_choose_tablename}
        (
        {model_choose_columns[0]} INTEGER PRIMARY KEY NOT NULL,
        {model_choose_columns[1]} REAL NOT NULL,
        {model_choose_columns[2]} TEXT NOT NULL,
        FOREIGN KEY({model_choose_columns[0]}) REFERENCES {model_tablename}({model_choose_columns[0]})
        )""".format(**sql_kwargs)

        model_choose_insert = shared.create_insert_statement(
            sql_kwargs['model_choose_tablename'],
            sql_kwargs['model_choose_columns'])

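        # {{windowindex}} is doubled so it survives this .format() call
        # and is substituted later, once per window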
        n_model_q_fmt = """SELECT count(*) AS ct
        FROM {model_choose_tablename}
        JOIN {model_tablename} USING({model_choose_columns[0]})
        JOIN fragment f USING(fragmentindex, windowindex)
        WHERE windowindex={{windowindex}}
        """.format(**sql_kwargs)

        window_data_q_fmt = """SELECT {model_choose_columns[0]}, modelindex, fragmentindex,
        windowindex, m.dfire as dfire, itscore
        FROM {model_tablename} m JOIN fragment f USING(fragmentindex, windowindex)
        WHERE windowindex={{windowindex}} AND m.dfire IS NOT NULL
        """.format(**sql_kwargs)

        n_paths_q = "SELECT count(*) FROM {model_tablename}".format(
            **sql_kwargs)

        with shared.write_conn(complexdb) as conn:
            curs = conn.cursor()
            curs.execute(model_choose_schema)

        return dict(
            model_insert=model_choose_insert,
            window_data_q_fmt=window_data_q_fmt,
            n_model_q_fmt=n_model_q_fmt,
            n_paths=n_paths_q,
        )

    @classmethod
    def count_paths(cls, complexid, nwindows, db_dir):
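        """
        Return the number of path and cluster rows already stored for
        complexid at nwindows; both counts are zero when the output
        database or its tables do not exist yet.
        """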
        out_db_file = cls.out_db_fmt.format(db_dir=db_dir, complexid=complexid)
        out_tablename = cls.out_tablename_fmt.format(nwindows=nwindows)
        cluster_tablename = cls.cluster_tablename_fmt.format(nwindows=nwindows)

        path_count = 0
        cluster_count = 0
        if os.path.isfile(out_db_file):
            # Need write ability to resolve journal
            with shared.write_conn(out_db_file) as out_conn:
                out_conn.cursor().execute("SELECT 1")
            with shared.ro_conn(out_db_file) as out_conn:
                try:
                    path_count = list(out_conn.cursor().execute(
                        "SELECT count(*) FROM %s" % out_tablename))[0][0]
                except apsw.SQLError:
                    pass
                try:
                    cluster_count = list(out_conn.cursor().execute(
                        "SELECT count(*) FROM %s" % cluster_tablename))[0][0]
                except apsw.SQLError:
                    pass
        return dict(path_count=path_count, cluster_count=cluster_count)

    def load(self, complexdb, complexname, receptor_chain, ligand_chain,
             **kwargs):
        """
        Load model scores.
        """

        fragmentselect = """
        SELECT windowindex, fragmentindex, window_wd
        FROM fragment
        JOIN window USING(windowindex)"""

        model_ct_select = "SELECT COUNT(*) FROM allmodel WHERE fragmentindex=:fragmentindex AND windowindex=:windowindex"
        modelcolumns = [
            "modelindex", "dfire", "itscore", "fragmentindex", "windowindex"
        ]
        modelinsert = shared.create_insert_statement("allmodel", modelcolumns)

        pdbid = "{0}{1}{2}".format(complexname, receptor_chain, ligand_chain)
        logging.debug(pdbid)

        receptor_chain = receptor_chain.lower()[:1]
        ligand_chain = ligand_chain.lower()

        with shared.write_conn(complexdb) as conn:
            curs = conn.cursor()
            curs.execute(model_schema)
            ## INSERT MODELS
            fragment_rows = shared.conn_to_pandas(fragmentselect, conn)
            fragment_rows = fragment_rows.to_dict("records")
            for frag_row in fragment_rows:
                windowindex = frag_row['windowindex']
                fragmentindex = frag_row['fragmentindex']
                frag_wd = os.path.join(frag_row['window_wd'],
                                       str(fragmentindex))

                itscorefile = os.path.join(frag_wd, shared.itscore_file)
                model_itscores = shared.read_itscore(itscorefile, kind="model")

                nmodels = len(model_itscores)
                nmodelrows = curs.execute(
                    model_ct_select,
                    dict(fragmentindex=fragmentindex,
                         windowindex=windowindex)).fetchone()[0]
                if nmodelrows < nmodels:
                    # Drop partial table
                    if nmodelrows:
                        logging.debug("Dropping partial model table")
                        curs.execute("DELETE FROM model")

                    nitscore = len(model_itscores)

                    dfirefile = os.path.join(frag_wd, shared.goap_file)
                    model_dfires = shared.read_dfire_from_goap(dfirefile,
                                                               kind="model")

                    modelrows = pd.merge(model_itscores,
                                         model_dfires,
                                         on="modelindex",
                                         how="left")
                    ndi = len(modelrows)

                    if nitscore != ndi:
                        logging.error("ITScores: %s", nitscore)
                        logging.error("IT lj dfire: %s", ndi)
                        raise LoadModelScoresError("Score number mismatch")

                    modelrows['fragmentindex'] = fragmentindex
                    modelrows['windowindex'] = windowindex
                    curs.executemany(modelinsert, modelrows.to_dict("records"))

    def load_modeldist(self, complexdb, pdbid, window_rows, backbone_atoms, cb,
                       stored_atoms, n_atoms, ca_index, atom_overlap,
                       res_overlap, bb_threshold, max_clash, sticky_max_all,
                       sticky_max_any, min_cosine, min_ifourdist,
                       max_ifourdist, min_isixdist, max_isixdist,
                       min_itwelvedist, **kwargs):
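        """
        Compute allowed/disallowed model pairs between windows by
        filtering on midpoint distance, backbone clash, overlap-atom
        ("sticky") distance and vector cosine, then store the rows in
        the per-window modeldist databases.
        """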

        model_q = """
        SELECT windowindex, modelid, coordinates
        FROM model
        JOIN allmodel USING(modelid)
        JOIN fragment USING(windowindex, fragmentindex)
        ORDER BY windowindex, modelindex
        """
        models = shared.db_to_pandas(model_q, complexdb).to_dict("records")

        if not models:
            raise LoadModelScoresError("No models")

        modeldistcoords = {
            row['modelid']: json.loads(row['coordinates'])
            for row in models
        }

        # Store consecutive pairs of window indices
        window_indices = [row['windowindex'] for row in window_rows]

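        # Separation of two windows, in window positions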
        def window_dist(w1, w2):
            return abs(window_indices.index(w2) - window_indices.index(w1))

        w_models = dict()
        for w in window_indices:
            w_models[w] = [
                row for row in models if int(row['windowindex']) == w
            ]

        for key, val in w_models.items():
            logging.debug("%s %s", key, len(val))

        # Step forward by 4 residues and forward to CA atom
        mp_idx = n_atoms * (res_overlap + 1) + ca_index
        # Step backwards by 4 residues and forward to CA atom
        i_idx = -n_atoms * (res_overlap + 1) + ca_index
        # Step forward by 3 residues and forward to CA atom
        i4_idx = n_atoms * res_overlap + ca_index

        def get_mp_dist(separation):
            if separation < 1:
                raise LoadModelScoresError("Window separation %s", separation)
            elif separation == 1:
                min_dist = min_isixdist
            else:
                min_dist = min_itwelvedist
            max_dist = separation * max_isixdist
            return min_dist, max_dist
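        # For example, adjacent windows (separation 1) must satisfy
        # min_isixdist < d < max_isixdist, while windows two apart allow
        # min_itwelvedist < d < 2 * max_isixdist.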

        # Pre-create np arrays of midpoint coordinates
        mp_coord_dict = dict()
        for x, window in enumerate(window_indices):
            window_length = window_rows[x]['length']
            if window_length == 4:
                # CA of 4th residue
                window_mp_idx = -4
            elif window_length > 4:
                window_mp_idx = mp_idx
            else:
                raise LoadModelScoresError(
                    "Window {windowindex} has invalid length {length}".format(
                        **window_rows[x]))
            mp_coord_dict[window] = np.array([
                modeldistcoords[model_row['modelid']][window_mp_idx]
                for model_row in w_models[window]
            ])

        # Calculate modeldists
        def calculate_modeldist(w1, w2):
            logging.debug("Starting %s %s", w1, w2)
            separation = window_dist(w1, w2)
            neighbors = separation == 1
            mp_min, mp_max = get_mp_dist(separation)

            # Calculate all midpoint distances simultaneously
            w1_array = mp_coord_dict[w1]
            w2_array = mp_coord_dict[w2]
            pairwise_mp_dist = spatial.distance.cdist(w1_array, w2_array)
            allowed = (pairwise_mp_dist > mp_min) & (pairwise_mp_dist < mp_max)
            it = np.nditer(allowed, flags=['multi_index'])
            while not it.finished:
                # Convert from array indices to local lists
                mp_index = it.multi_index
                mp_dist = pairwise_mp_dist[mp_index]
                i, j = mp_index
                w1model = w_models[w1][i]
                w2model = w_models[w2][j]

                id1 = w1model['modelid']
                id2 = w2model['modelid']

                row = dict(modela=id1, modelb=id2, mpdist=mp_dist)
                # If mp_dist constraint is met, check overlap etc.
                if it[0]:
                    skip = False
                    w1_coords = modeldistcoords[id1]
                    w2_coords = modeldistcoords[id2]

                    if neighbors:
                        # CHECK i to i+4 CA DISTANCE
                        i_ca = w1_coords[i_idx]
                        i4_ca = w2_coords[i4_idx]
                        # spatial can handle lists
                        ifourdist = spatial.distance.euclidean(i_ca, i4_ca)

                        if ifourdist < min_ifourdist or ifourdist > max_ifourdist:
                            skip = True
                        else:
                            row['ifourdist'] = ifourdist

                    # CHECK FOR BACKBONE CLASH
                    if not skip and mp_dist <= max_isixdist:
                        w2_nooverlap = w2_coords
                        if neighbors:
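                            # Neighbors share atom_overlap atoms (the
                            # "sticky" region), so exclude the shared
                            # atoms from the clash check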
                            w2_nooverlap = w2_nooverlap[atom_overlap:]
                        pairwise_overlap = spatial.distance.cdist(
                            w1_coords, w2_nooverlap)
                        clash = pairwise_overlap <= bb_threshold
                        # Number of atoms in clash
                        nclash = np.count_nonzero(clash)

                        if nclash > max_clash:
                            skip = True
                    else:
                        nclash = 0

                    if not skip and neighbors:
                        # PRECOMPUTE EDGESCORE
                        w1_sticky = w1_coords[-atom_overlap:]
                        w2_sticky = w2_coords[:atom_overlap]
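                        # Per-atom distances between corresponding overlap
                        # atoms: the diagonal of the pairwise matrix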
                        distances = np.diag(
                            spatial.distance.cdist(w1_sticky, w2_sticky))
                        if any(distances > sticky_max_any):
                            skip = True
                        elif all(distances > sticky_max_all):
                            skip = True
                        else:
                            row['edgescore'] = np.mean(np.square(distances))

                    if not skip and neighbors:
                        # CALCULATE COSINE

                        # N of third to last residue
                        w1_start = np.array(w1_coords[-12])
                        # C of last residue
                        w1_end = np.array(w1_coords[-2])
                        w1_vec = w1_end - w1_start
                        w1_mag = np.linalg.norm(w1_vec)

                        # N of first residue
                        w2_start = np.array(w2_coords[1])
                        # C of third residue
                        w2_end = np.array(w2_coords[11])
                        w2_vec = w2_end - w2_start
                        w2_mag = np.linalg.norm(w2_vec)

                        # $cos(\theta) = \frac{ a \cdot b }{ \| a \| \| b \| }$
                        vector_cosine = np.dot(w1_vec,
                                               w2_vec) / w1_mag / w2_mag

                        if vector_cosine < min_cosine:
                            skip = True
                        else:
                            row['cosine'] = vector_cosine

                    # NEIGHBORS: insert ALLOWED
                    if neighbors and not skip:
                        yield row
                # If mp_dist was not met or clash, row is disallowed
                # NON-NEIGHBORS: insert DISALLOWED
                if not neighbors and (not it[0] or skip):
                    yield row
                it.iternext()
            logging.debug("Finished %s %s", w1, w2)

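        # Pair each window, from the second onward, with its predecessors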
        for windowid in range(1, len(window_indices)):
            window_index = window_indices[windowid]
            window_sql = self.create_modeldist_tables(
                pdbid=pdbid,
                windowid=windowid,
                windowindex_list=window_indices)
            if window_sql is None:
                continue
            with shared.write_conn(window_sql['db_name']) as conn:
                cursor = conn.cursor()
                for prev_window_index, sql in window_sql[
                        'window_dict'].items():
                    row_gen = calculate_modeldist(prev_window_index,
                                                  window_index)
                    try:
                        first = next(row_gen)
                    except StopIteration:
                        raise LoadModelScoresError(
                            "No allowed pairs for %s %s" %
                            (window_index, prev_window_index))
                    else:
                        cursor.execute(sql['insert'], first)
                        cursor.executemany(sql['insert'], row_gen)
                    cursor.execute(sql['index'])

    def choose(self, complexdb, complexname, receptor_chain, ligand_chain,
               **kwargs):
        """
        Choose models and load coordinates.
        """
        models_per_window = 4500

        parser = PDB.PDBParser(QUIET=True)

        scores = [
            dict(columns="itscore", ascending=True),
            dict(columns="dfire", ascending=True)
        ]

        sql_dict = self.prepare_sql(complexdb)
        n_model_q_fmt = sql_dict['n_model_q_fmt']
        window_data_q_fmt = sql_dict['window_data_q_fmt']
        model_insert = sql_dict['model_insert']

        window_q = "SELECT windowindex, window_wd, res_end - res_start + 1 AS length FROM window ORDER BY res_start"
        window_df = shared.db_to_pandas(window_q, complexdb)
        window_rows = window_df.to_dict("records")

        for window_row in window_rows:
            window_wd = window_row['window_wd']
            length = window_row['length']
            n_chosen_q = n_model_q_fmt.format(**window_row)
            with shared.ro_conn(complexdb) as conn:
                n_chosen = list(conn.cursor().execute(n_chosen_q))[0][0]
            if n_chosen == models_per_window:
                continue
            get_coords = functools.partial(self.get_coords,
                                           parser=parser,
                                           chain=ligand_chain,
                                           length=length)
            window_data_q = window_data_q_fmt.format(**window_row)
            windowrows = shared.db_to_pandas(window_data_q, complexdb)
            if windowrows.empty:
                raise LoadModelScoresError(
                    "No rows for windowindex {windowindex}".format(
                        **window_row))
            # Scale scores and create di column
            windowrows = self.scale_scores(windowrows, scores)
            # Sort by di
            windowrows = windowrows.sort_values("di")
            # Get top N rows by di
            windowrows = windowrows.head(models_per_window)
            # Add path
            path_fmt = os.path.join(window_wd, "{fragmentindex:.0f}", "decoys",
                                    "model{modelindex:.0f}.pdb")
            windowrows['path'] = windowrows.apply(
                lambda x: path_fmt.format(**x), 1)
            # Prepare for insertion
            insert_rows = windowrows.apply(get_coords, 1)
            with shared.write_conn(complexdb) as conn:
                conn.cursor().executemany(model_insert,
                                          insert_rows.to_dict("records"))

        pdbid = "{0}{1}{2}".format(complexname, receptor_chain, ligand_chain)
        self.load_modeldist(complexdb=complexdb,
                            pdbid=pdbid,
                            window_rows=window_rows,
                            **self.params)

    def __init__(self, complexid, nwindows, batchsize=None, directory=None):
        """CONSTRUCTOR"""

        if directory is None:
            db_dir = script_dir
        else:
            db_dir = directory
        self.in_db_fmt = os.path.join(
            db_dir, "%s_modeldist{cur_window}.db" % complexid)
        self.score_db_file = os.path.join(
            db_dir, "scores_{complexid}.db".format(complexid=complexid))
        self.out_db_file = self.out_db_fmt.format(db_dir=db_dir,
                                                  complexid=complexid)
        logging.debug(self.out_db_file)

        if batchsize is None:
            batchsize = self.default_batchsize

        make_path_times = list()
        cluster_times = list()
        clustersize_times = list()
        for n in range(2, nwindows + 1):
            ct = self.count_paths(complexid=complexid,
                                  nwindows=n,
                                  db_dir=db_dir)
            n_paths = ct['path_count']
            n_clusters = ct['cluster_count']
            start = time.time()
            if n_paths:
                logging.debug("%s %s has %s rows", complexid, n, n_paths)
            else:
                self.make_paths(complexid=complexid,
                                nwindows=n,
                                batchsize=batchsize)
            pathtime = time.time()
            make_path_times.append(pathtime - start)
            if not n_clusters:
                ClusterPdb(complexid=complexid, nwindows=n, directory=db_dir)
            clustertime = time.time()
            cluster_times.append(clustertime - pathtime)
            # Update cluster sizes
            cluster_tablename = self.cluster_tablename_fmt.format(nwindows=n)
            columns = ["pathsid", "cid", "is_medoid"]
            q = "SELECT {columns} FROM {cluster_tablename}".format(
                columns=", ".join(columns),
                cluster_tablename=cluster_tablename)
            # NB shared.db_to_pandas raises factually incorrect error
            with shared.ro_conn(self.out_db_file) as out_conn:
                rows = list(out_conn.cursor().execute(q))
                cluster_rows = pd.DataFrame(rows, columns=columns)
            cluster_sizes = cluster_rows.groupby('cid').size()
            cluster_sizes = cluster_sizes.to_frame("clustersize")
            cluster_rows = cluster_rows.merge(cluster_sizes,
                                              left_on="cid",
                                              right_index=True)
            center_rows = cluster_rows.loc[cluster_rows['is_medoid'] == 1]
            cluster_update = shared.create_update_statement(
                tablename=cluster_tablename,
                columns=self.cluster_update_columns,
                where=self.cluster_id_columns)
            with shared.write_conn(self.out_db_file) as conn:
                conn.cursor().executemany(cluster_update,
                                          center_rows.to_dict("records"))
            clustersizetime = time.time()
            clustersize_times.append(clustersizetime - clustertime)
            logging.info("Ending n=%s after %s", n, clustersizetime - start)

        logging.info(make_path_times)
        logging.info(cluster_times)
        logging.info(clustersize_times)
        make_path_total = sum(make_path_times)
        cluster_total = sum(cluster_times)
        clustersize_total = sum(clustersize_times)
        grand_total = sum([make_path_total, cluster_total, clustersize_total])
        logging.info("Make path: %s", make_path_total / grand_total)
        logging.info("Cluster: %s", cluster_total / grand_total)
        logging.info("Clustersize: %s", clustersize_total / grand_total)