def prepare_sql(self, complexdb):
    sql_kwargs = dict(
        model_tablename="allmodel",
        model_choose_tablename="model",
        model_choose_columns=["modelid", "di", "coordinates"],
    )
    model_choose_schema = """
        CREATE TABLE IF NOT EXISTS {model_choose_tablename} (
            {model_choose_columns[0]} INTEGER PRIMARY KEY NOT NULL,
            {model_choose_columns[1]} REAL NOT NULL,
            {model_choose_columns[2]} TEXT NOT NULL,
            FOREIGN KEY({model_choose_columns[0]})
                REFERENCES {model_tablename}({model_choose_columns[0]})
        )""".format(**sql_kwargs)
    model_choose_insert = shared.create_insert_statement(
        sql_kwargs['model_choose_tablename'],
        sql_kwargs['model_choose_columns'])
    n_model_q_fmt = """
        SELECT count(*) AS ct
        FROM {model_choose_tablename}
        JOIN {model_tablename} USING({model_choose_columns[0]})
        JOIN fragment f USING(fragmentindex, windowindex)
        WHERE windowindex={{windowindex}}
    """.format(**sql_kwargs)
    window_data_q_fmt = """
        SELECT {model_choose_columns[0]}, modelindex, fragmentindex,
               windowindex, m.dfire as dfire, itscore
        FROM {model_tablename} m
        JOIN fragment f USING(fragmentindex, windowindex)
        WHERE windowindex={{windowindex}} AND m.dfire IS NOT NULL
    """.format(**sql_kwargs)
    n_paths_q = "SELECT count(*) FROM {model_tablename}".format(**sql_kwargs)
    with shared.write_conn(complexdb) as conn:
        curs = conn.cursor()
        curs.execute(model_choose_schema)
    return dict(
        model_insert=model_choose_insert,
        window_data_q_fmt=window_data_q_fmt,
        n_model_q_fmt=n_model_q_fmt,
        n_paths=n_paths_q,
    )
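
# Note (illustrative, not from the original code): the *_q_fmt strings built in
# prepare_sql() use doubled braces ({{windowindex}}) so that the first
# .format(**sql_kwargs) pass leaves a literal {windowindex} placeholder behind.
# choose() below fills that placeholder once per window, roughly:
#   sql_dict = self.prepare_sql(complexdb)
#   n_chosen_q = sql_dict['n_model_q_fmt'].format(windowindex=3)  # 3 is a made-up example value
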
def count_paths(cls, complexid, nwindows, db_dir):
    out_db_file = cls.out_db_fmt.format(db_dir=db_dir, complexid=complexid)
    out_tablename = cls.out_tablename_fmt.format(nwindows=nwindows)
    cluster_tablename = cls.cluster_tablename_fmt.format(nwindows=nwindows)
    path_count = 0
    cluster_count = 0
    if os.path.isfile(out_db_file):
        # Need write ability to resolve journal
        with shared.write_conn(out_db_file) as out_conn:
            out_conn.cursor().execute("SELECT 1")
        with shared.ro_conn(out_db_file) as out_conn:
            try:
                path_count = list(out_conn.cursor().execute(
                    "SELECT count(*) FROM %s" % out_tablename))[0][0]
            except apsw.SQLError:
                pass
            try:
                cluster_count = list(out_conn.cursor().execute(
                    "SELECT count(*) FROM %s" % cluster_tablename))[0][0]
            except apsw.SQLError:
                pass
    return dict(path_count=path_count, cluster_count=cluster_count)
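
# Usage sketch (hypothetical values, for orientation only): __init__ below calls
# count_paths() once per window count to decide whether work can be skipped:
#   ct = self.count_paths(complexid="1abc", nwindows=3, db_dir=db_dir)
#   if not ct['path_count']:
#       ...rebuild paths...
# "1abc" is a placeholder complex id; the write_conn "SELECT 1" above exists only
# to force any leftover journal to be resolved before the read-only queries run.
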
def load(self, complexdb, complexname, receptor_chain, ligand_chain, **kwargs):
    """
    Load model scores.
    """
    fragmentselect = """
        SELECT windowindex, fragmentindex, window_wd
        FROM fragment
        JOIN window USING(windowindex)"""
    model_ct_select = ("SELECT COUNT(*) FROM allmodel"
                       " WHERE fragmentindex=:fragmentindex"
                       " AND windowindex=:windowindex")
    modelcolumns = [
        "modelindex", "dfire", "itscore", "fragmentindex", "windowindex"
    ]
    modelinsert = shared.create_insert_statement("allmodel", modelcolumns)

    pdbid = "{0}{1}{2}".format(complexname, receptor_chain, ligand_chain)
    logging.debug(pdbid)
    receptor_chain = receptor_chain.lower()[:1]
    ligand_chain = ligand_chain.lower()

    with shared.write_conn(complexdb) as conn:
        curs = conn.cursor()
        curs.execute(model_schema)
        ## INSERT MODELS
        fragment_rows = shared.conn_to_pandas(fragmentselect, conn)
        fragment_rows = fragment_rows.to_dict("records")
        for frag_row in fragment_rows:
            windowindex = frag_row['windowindex']
            fragmentindex = frag_row['fragmentindex']
            frag_wd = os.path.join(frag_row['window_wd'], str(fragmentindex))
            itscorefile = os.path.join(frag_wd, shared.itscore_file)
            model_itscores = shared.read_itscore(itscorefile, kind="model")
            nmodels = len(model_itscores)
            nmodelrows = curs.execute(
                model_ct_select,
                dict(fragmentindex=fragmentindex,
                     windowindex=windowindex)).fetchone()[0]
            if nmodelrows < nmodels:
                # Drop partial table
                if nmodelrows:
                    logging.debug("Dropping partial model table")
                    curs.execute("DELETE FROM model")
                nitscore = len(model_itscores)
                dfirefile = os.path.join(frag_wd, shared.goap_file)
                model_dfires = shared.read_dfire_from_goap(dfirefile,
                                                           kind="model")
                modelrows = pd.merge(model_itscores, model_dfires,
                                     on="modelindex", how="left")
                ndi = len(modelrows)
                if len(set((nitscore, ndi))) != 1:
                    logging.error("ITScores: %s", nitscore)
                    logging.error("IT lj dfire: %s", ndi)
                    raise LoadModelScoresError("Score number mismatch")
                modelrows['fragmentindex'] = fragmentindex
                modelrows['windowindex'] = windowindex
                curs.executemany(modelinsert, modelrows.to_dict("records"))
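
# Assumption: model_schema, shared.itscore_file, and shared.goap_file are defined
# elsewhere in this module / in shared; load() only consumes them. Illustrative
# note on the merge above: pd.merge(..., on="modelindex", how="left") keeps every
# itscore row and leaves dfire as NaN where the GOAP output lacks a model; those
# rows are later excluded by the "m.dfire IS NOT NULL" filter in prepare_sql().
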
def load_modeldist(self, complexdb, pdbid, window_rows, backbone_atoms, cb,
                   stored_atoms, n_atoms, ca_index, atom_overlap, res_overlap,
                   bb_threshold, max_clash, sticky_max_all, sticky_max_any,
                   min_cosine, min_ifourdist, max_ifourdist, min_isixdist,
                   max_isixdist, min_itwelvedist, **kwargs):
    model_q = """
        SELECT windowindex, modelid, coordinates
        FROM model
        JOIN allmodel USING(modelid)
        JOIN fragment USING(windowindex, fragmentindex)
        ORDER BY windowindex, modelindex
    """
    models = shared.db_to_pandas(model_q, complexdb).to_dict("records")
    if not models:
        raise LoadModelScoresError("No models")
    modeldistcoords = {
        row['modelid']: json.loads(row['coordinates'])
        for row in models
    }
    # Store consecutive pairs of window indices
    window_indices = [row['windowindex'] for row in window_rows]

    def window_dist(w1, w2):
        return abs(window_indices.index(w2) - window_indices.index(w1))

    w_models = dict()
    for w in window_indices:
        w_models[w] = [
            row for row in models if int(row['windowindex']) == w
        ]
    for key, val in w_models.iteritems():
        logging.debug("Window %s: %s models", key, len(val))

    # Step forward by 4 residues and forward to CA atom
    mp_idx = n_atoms * (res_overlap + 1) + ca_index
    # Step backwards by 4 residues and forward to CA atom
    i_idx = -n_atoms * (res_overlap + 1) + ca_index
    # Step forward by 3 residues and forward to CA atom
    i4_idx = n_atoms * res_overlap + ca_index

    def get_mp_dist(separation):
        if separation < 1:
            raise LoadModelScoresError("Window separation %s" % separation)
        elif separation == 1:
            min_dist = min_isixdist
        else:
            min_dist = min_itwelvedist
        max_dist = separation * max_isixdist
        return min_dist, max_dist

    # Pre-create np arrays of midpoint coordinates
    mp_coord_dict = dict()
    for x, window in enumerate(window_indices):
        window_length = window_rows[x]['length']
        if window_length == 4:
            # CA of 4th residue
            window_mp_idx = -4
        elif window_length > 4:
            window_mp_idx = mp_idx
        else:
            raise LoadModelScoresError(
                "Window {windowindex} has invalid length {length}".format(
                    **window_rows[x]))
        mp_coord_dict[window] = np.array([
            modeldistcoords[row['modelid']][window_mp_idx]
            for row in w_models[window]
        ])

    # Calculate modeldists
    def calculate_modeldist(w1, w2):
        logging.debug("Starting %s %s", w1, w2)
        separation = window_dist(w1, w2)
        neighbors = separation == 1
        mp_min, mp_max = get_mp_dist(separation)
        # Calculate all midpoint distances simultaneously
        w1_array = mp_coord_dict[w1]
        w2_array = mp_coord_dict[w2]
        pairwise_mp_dist = spatial.distance.cdist(w1_array, w2_array)
        allowed = (pairwise_mp_dist > mp_min) & (pairwise_mp_dist < mp_max)
        it = np.nditer(allowed, flags=['multi_index'])
        while not it.finished:
            # Convert from array indices to local lists
            mp_index = it.multi_index
            mp_dist = pairwise_mp_dist[mp_index]
            i, j = mp_index
            w1model = w_models[w1][i]
            w2model = w_models[w2][j]
            id1 = w1model['modelid']
            id2 = w2model['modelid']
            row = dict(modela=id1, modelb=id2, mpdist=mp_dist)
            # If mp_dist constraint is met, check overlap etc.
            if it[0]:
                skip = False
                w1_coords = modeldistcoords[id1]
                w2_coords = modeldistcoords[id2]
                if neighbors:
                    # CHECK i to i+4 CA DISTANCE
                    i_ca = w1_coords[i_idx]
                    i4_ca = w2_coords[i4_idx]
                    # spatial can handle lists
                    ifourdist = spatial.distance.euclidean(i_ca, i4_ca)
                    if ifourdist < min_ifourdist or ifourdist > max_ifourdist:
                        skip = True
                    else:
                        row['ifourdist'] = ifourdist
                # CHECK FOR BACKBONE CLASH
                if not skip and mp_dist <= max_isixdist:
                    w2_nooverlap = w2_coords
                    if neighbors:
                        w2_nooverlap = w2_nooverlap[atom_overlap:]
                    pairwise_overlap = spatial.distance.cdist(
                        w1_coords, w2_nooverlap)
                    clash = pairwise_overlap <= bb_threshold
                    # Number of atoms in clash
                    nclash = np.count_nonzero(clash)
                    if nclash > max_clash:
                        skip = True
                else:
                    nclash = 0
                if not skip and neighbors:
                    # PRECOMPUTE EDGESCORE
                    w1_sticky = w1_coords[-atom_overlap:]
                    w2_sticky = w2_coords[:atom_overlap]
                    distances = np.diag(
                        spatial.distance.cdist(w1_sticky, w2_sticky))
                    if any(distances > sticky_max_any):
                        skip = True
                    elif all(distances > sticky_max_all):
                        skip = True
                    else:
                        row['edgescore'] = np.mean(np.square(distances))
                if not skip and neighbors:
                    # CALCULATE COSINE
                    # N of third to last residue
                    w1_start = np.array(w1_coords[-12])
                    # C of last residue
                    w1_end = np.array(w1_coords[-2])
                    w1_vec = w1_end - w1_start
                    w1_mag = np.linalg.norm(w1_vec)
                    # N of first residue
                    w2_start = np.array(w2_coords[1])
                    # C of third residue
                    w2_end = np.array(w2_coords[11])
                    w2_vec = w2_end - w2_start
                    w2_mag = np.linalg.norm(w2_vec)
                    # $cos(\theta) = \frac{ a \cdot b }{ \| a \| \| b \| }$
                    vector_cosine = np.dot(w1_vec, w2_vec) / w1_mag / w2_mag
                    if vector_cosine < min_cosine:
                        skip = True
                    else:
                        row['cosine'] = vector_cosine
                # NEIGHBORS: insert ALLOWED
                if neighbors and not skip:
                    yield row
            # If mp_dist was not met or clash, row is disallowed
            # NON-NEIGHBORS: insert DISALLOWED
            if not neighbors and (not it[0] or skip):
                yield row
            it.iternext()
        logging.debug("Finished %s %s", w1, w2)

    for windowid in range(1, len(window_indices)):
        window_index = window_indices[windowid]
        window_sql = self.create_modeldist_tables(
            pdbid=pdbid, windowid=windowid,
            windowindex_list=window_indices)
        if window_sql is None:
            continue
        with shared.write_conn(window_sql['db_name']) as conn:
            cursor = conn.cursor()
            for prev_window_index, sql in window_sql['window_dict'].iteritems():
                row_gen = calculate_modeldist(prev_window_index, window_index)
                try:
                    first = next(row_gen)
                except StopIteration:
                    raise LoadModelScoresError(
                        "No allowed pairs for %s %s" %
                        (window_index, prev_window_index))
                else:
                    cursor.execute(sql['insert'], first)
                    cursor.executemany(sql['insert'], row_gen)
                cursor.execute(sql['index'])
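
# Worked example for get_mp_dist (hypothetical parameter values, not from the
# original source): with min_isixdist=4.0, min_itwelvedist=8.0 and
# max_isixdist=12.0,
#   get_mp_dist(1) -> (4.0, 12.0)   # adjacent windows
#   get_mp_dist(3) -> (8.0, 36.0)   # three windows apart
# i.e. the lower bound switches once the windows are non-adjacent, while the
# upper bound grows linearly as separation * max_isixdist.
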
def choose(self, complexdb, complexname, receptor_chain, ligand_chain,
           **kwargs):
    """
    Choose models and load coordinates.
    """
    models_per_window = 4500
    parser = PDB.PDBParser(QUIET=True)
    scores = [
        dict(columns="itscore", ascending=True),
        dict(columns="dfire", ascending=True)
    ]
    sql_dict = self.prepare_sql(complexdb)
    n_model_q_fmt = sql_dict['n_model_q_fmt']
    window_data_q_fmt = sql_dict['window_data_q_fmt']
    model_insert = sql_dict['model_insert']
    window_q = ("SELECT windowindex, window_wd,"
                " res_end - res_start + 1 AS length"
                " FROM window ORDER BY res_start")
    window_df = shared.db_to_pandas(window_q, complexdb)
    window_rows = window_df.to_dict("records")
    for x, window_row in enumerate(window_rows):
        window_wd = window_row['window_wd']
        length = window_row['length']
        n_chosen_q = n_model_q_fmt.format(**window_row)
        with shared.ro_conn(complexdb) as conn:
            n_chosen = list(conn.cursor().execute(n_chosen_q))[0][0]
        if n_chosen == models_per_window:
            continue
        get_coords = functools.partial(self.get_coords,
                                       parser=parser,
                                       chain=ligand_chain,
                                       length=length)
        window_data_q = window_data_q_fmt.format(**window_row)
        windowrows = shared.db_to_pandas(window_data_q, complexdb)
        if windowrows.empty:
            raise LoadModelScoresError(
                "No rows for windowindex {windowindex}".format(**window_row))
        # Scale scores and create di column
        windowrows = self.scale_scores(windowrows, scores)
        # Sort by di
        windowrows = windowrows.sort_values("di")
        # Get top N rows by di
        windowrows = windowrows.head(models_per_window)
        # Add path
        path_fmt = os.path.join(window_wd, "{fragmentindex:.0f}", "decoys",
                                "model{modelindex:.0f}.pdb")
        windowrows['path'] = windowrows.apply(
            lambda x: path_fmt.format(**x), 1)
        # Prepare for insertion
        insert_rows = windowrows.apply(get_coords, 1)
        with shared.write_conn(complexdb) as conn:
            conn.cursor().executemany(model_insert,
                                      insert_rows.to_dict("records"))
    pdbid = "{0}{1}{2}".format(complexname, receptor_chain, ligand_chain)
    self.load_modeldist(complexdb=complexdb,
                        pdbid=pdbid,
                        window_rows=window_rows,
                        **self.params)
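
# Selection sketch for choose(): per window, scale_scores() presumably rescales
# the itscore and dfire columns (both ranked ascending) into the combined "di"
# column; the exact scaling lives in self.scale_scores and is not reproduced
# here. The rest of the pipeline follows directly from the code above:
# sort_values("di").head(4500) keeps the best-scoring models, and get_coords()
# attaches the ligand-chain coordinates parsed from each model PDB before the
# rows are inserted into the "model" table.
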
def __init__(self, complexid, nwindows, batchsize=None, directory=None):
    """CONSTRUCTOR"""
    if directory is None:
        db_dir = script_dir
    else:
        db_dir = directory
    self.in_db_fmt = os.path.join(
        db_dir, "%s_modeldist{cur_window}.db" % complexid)
    self.score_db_file = os.path.join(
        db_dir, "scores_{complexid}.db".format(complexid=complexid))
    self.out_db_file = self.out_db_fmt.format(db_dir=db_dir,
                                              complexid=complexid)
    logging.debug(self.out_db_file)
    if batchsize is None:
        batchsize = self.default_batchsize

    make_path_times = list()
    cluster_times = list()
    clustersize_times = list()
    for n in range(2, nwindows + 1):
        ct = self.count_paths(complexid=complexid, nwindows=n, db_dir=db_dir)
        n_paths = ct['path_count']
        n_clusters = ct['cluster_count']
        start = time.time()
        if n_paths:
            logging.debug("%s %s has %s rows", complexid, n, n_paths)
        else:
            self.make_paths(complexid=complexid, nwindows=n,
                            batchsize=batchsize)
        pathtime = time.time()
        make_path_times.append(pathtime - start)
        if not n_clusters:
            ClusterPdb(complexid=complexid, nwindows=n, directory=db_dir)
        clustertime = time.time()
        cluster_times.append(clustertime - pathtime)
        # Update cluster sizes
        cluster_tablename = self.cluster_tablename_fmt.format(nwindows=n)
        columns = ["pathsid", "cid", "is_medoid"]
        q = "SELECT {columns} FROM {cluster_tablename}".format(
            columns=", ".join(columns),
            cluster_tablename=cluster_tablename)
        # NB shared.db_to_pandas raises factually incorrect error
        with shared.ro_conn(self.out_db_file) as out_conn:
            rows = list(out_conn.cursor().execute(q))
        cluster_rows = pd.DataFrame(rows, columns=columns)
        cluster_sizes = cluster_rows.groupby('cid').size()
        cluster_sizes = cluster_sizes.to_frame("clustersize")
        cluster_rows = cluster_rows.merge(cluster_sizes,
                                          left_on="cid",
                                          right_index=True)
        center_rows = cluster_rows.loc[cluster_rows.loc[:, 'is_medoid'] == 1]
        cluster_update = shared.create_update_statement(
            tablename=cluster_tablename,
            columns=self.cluster_update_columns,
            where=self.cluster_id_columns)
        with shared.write_conn(self.out_db_file) as conn:
            conn.cursor().executemany(cluster_update,
                                      center_rows.to_dict("records"))
        clustersizetime = time.time()
        clustersize_times.append(clustersizetime - clustertime)
        logging.info("Ending n=%s after %s", n, clustersizetime - start)
    logging.info(make_path_times)
    logging.info(cluster_times)
    logging.info(clustersize_times)
    make_path_total = sum(make_path_times)
    cluster_total = sum(cluster_times)
    clustersize_total = sum(clustersize_times)
    grand_total = sum([make_path_total, cluster_total, clustersize_total])
    logging.info("Make path: %s", make_path_total / grand_total)
    logging.info("Cluster: %s", cluster_total / grand_total)
    logging.info("Clustersize: %s", clustersize_total / grand_total)
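
# Usage sketch (hypothetical argument values; the enclosing class name is not
# shown in this excerpt): constructing the class roughly as
#   cls(complexid="1abc", nwindows=6, directory="/path/to/dbs")
# builds or reuses the per-n path tables, clusters them via ClusterPdb, and then
# writes the medoid cluster sizes back through create_update_statement.
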