Exemplo n.º 1
0
 def _run(
     self,
     predicates: List["ReactionComponentPredicate"],
     cursor: psycopg2.extensions.cursor,
     limit: Optional[int] = None,
 ) -> List[Result]:
     """Runs the query for a set of predicates."""
     if not predicates:
         return []
     self._setup(predicates, cursor)
     predicate_components = []
     args = []
     for predicate in predicates:
         components = [
             sql.SQL("""
             SELECT DISTINCT dataset_id, reaction_id, serialized
             FROM reactions """)
         ]
         components.extend(self._get_tables())
         components.append(sql.SQL("""
             WHERE """))
         predicate_sql, predicate_args = predicate.get()
         components.append(predicate_sql)
         args.extend(predicate_args)
         predicate_components.append(sql.Composed(components))
     components = [sql.Composed(predicate_components).join(" INTERSECT ")]
     if limit:
         components.append(sql.SQL(" LIMIT %s"))
         args.append(limit)
     query = sql.Composed(components).join("")
     logger.info("Running SQL command:%s",
                 cursor.mogrify(query, args).decode())
     cursor.execute(query, args)
     return fetch_results(cursor)
Exemplo n.º 2
0
    def run(self,
            cursor: psycopg2.extensions.cursor,
            limit: Optional[int] = None) -> List[Result]:
        """Selects the reactions whose DOI matches one of self._dois.

        Args:
            cursor: psycopg2 cursor.
            limit: Integer maximum number of matches. A falsy value (None,
                the default, or 0) adds no LIMIT clause.

        Returns:
            The value of fetch_results(cursor) after the query has been
            executed (annotated as a list of Result instances).
        """
        select = sql.SQL("""
            SELECT DISTINCT dataset_id, reaction_id, serialized
            FROM reactions
            WHERE doi = ANY (%s)""")
        pieces = [select]
        params = [self._dois]
        if limit:
            pieces += [sql.SQL(" LIMIT %s")]
            params += [limit]
        query = sql.Composed(pieces).join("")
        rendered = cursor.mogrify(query, params).decode()
        logger.info("Running SQL command:%s", rendered)
        cursor.execute(query, params)
        return fetch_results(cursor)
Exemplo n.º 3
0
def generate_insert_queries(
    curs: psycopg2.extensions.cursor,
    insert_table: str,
    df: pd.DataFrame,
    *,
    chunksize: int = 10000,
) -> Coroutine:
    """Yield INSERT statements that load ``df`` into ``insert_table``.

    Generator that helps insert_pandas_into. The column names of ``df`` are
    used as the column list of the INSERT, so they must match the schema of
    the insert table. Because this is a generator function, nothing
    (argument validation included) runs until the first statement is
    requested.

    NOTE(review): the return annotation says ``Coroutine`` although the
    function is a generator of ``str``; it is left unchanged here so that
    no new import is required.

    Args:
        curs (psycopg2.extensions.cursor): Cursor whose ``mogrify`` renders
            each row of values.
        insert_table (str): Target table in database. It is interpolated
            into the statement verbatim, so it must come from a trusted
            source.
        df (pd.DataFrame): Pandas dataframe that will be inserted.
        chunksize (int, optional): How many rows to write per insert.
            Defaults to 10000.

    Yields:
        str: One complete INSERT statement per chunk of at most
        ``chunksize`` rows. Nothing is yielded for a dataframe without rows.

    Raises:
        TypeError: If ``insert_table``, ``df`` or ``chunksize`` has the
            wrong type.
        ValueError: If ``chunksize`` is not positive.
    """
    # TODO: assert cursor here
    if not isinstance(insert_table, str):
        raise TypeError("insert_table must be a str")
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")
    if not isinstance(chunksize, int):
        raise TypeError("chunksize must be an int")
    if chunksize <= 0:
        # A negative step used to produce an empty range and silently drop
        # every row; zero already raised ValueError from range().
        raise ValueError("chunksize must be a positive integer")

    colnames = df.columns.tolist()
    ncol = len(colnames)
    # Quote each identifier and double any embedded double quote so that a
    # column name cannot terminate the quoted identifier early.
    quoted_colnames = [
        '"' + str(col).replace('"', '""') + '"' for col in colnames
    ]

    # Everything up to and including the blank line after VALUES. The row
    # values are appended by concatenation rather than str.format so that
    # braces in the table or column names are taken literally.
    insert_prefix = "\n".join(
        [
            f"INSERT INTO {insert_table} ",
            "(",
            # indent the first colname
            "  " + ",\n  ".join(quoted_colnames),
            ")",
            "VALUES\n",
            "",
        ]
    )
    all_values = df.values.tolist()

    formatting = ", ".join(["%s"] * ncol)  # e.g. '%s, %s, %s'
    row_template = f"({formatting})"
    for start in range(0, len(all_values), chunksize):
        subset_values = all_values[start : start + chunksize]
        # as of 2018 Dec 7, you can only use mogrify with a cursor object
        query_values = "  " + ",\n  ".join(
            curs.mogrify(row_template, row).decode() for row in subset_values
        )
        # cleanup values
        # NOTE: these replacements act on the whole rendered chunk, so a
        # text value containing one of these exact substrings is turned
        # into NULL as well.
        query_values = query_values.replace("'NaT'::timestamp", "NULL")
        query_values = query_values.replace("'NaN'::float", "NULL")
        query_values = query_values.replace("'None'", "NULL")
        yield insert_prefix + query_values
Exemplo n.º 4
0
    def _setup(self, predicates: List["ReactionComponentPredicate"],
               cursor: psycopg2.extensions.cursor) -> None:
        """Applies the RDKit session settings used by a query.

        Sets rdkit.do_chiral_sss from self._do_chiral_sss, then sets
        rdkit.tanimoto_threshold from self._tanimoto_threshold, except that
        the threshold is forced to 1.0 when any predicate uses the EXACT
        match mode.

        Args:
            predicates: Predicates included in this query.
            cursor: psycopg.cursor instance.
        """

        def apply_setting(statement: str, value) -> None:
            # Log the fully rendered command, then run it.
            command = sql.SQL(statement)
            params = [value]
            logger.info("Running SQL command: %s",
                        cursor.mogrify(command, params).decode())
            cursor.execute(command, params)

        apply_setting("SET rdkit.do_chiral_sss=%s", self._do_chiral_sss)
        threshold = self._tanimoto_threshold
        if any(predicate.mode == ReactionComponentPredicate.MatchMode.EXACT
               for predicate in predicates):
            threshold = 1.0
        apply_setting("SET rdkit.tanimoto_threshold=%s", threshold)
Exemplo n.º 5
0
    def run(self,
            cursor: psycopg2.extensions.cursor,
            limit: Optional[int] = None) -> List[Result]:
        """Selects the reactions whose reaction_id is in self._reaction_ids.

        Args:
            cursor: psycopg.cursor instance.
            limit: Not used; present for compatibility.

        Returns:
            The value of fetch_results(cursor) after the query has been
            executed (annotated as a list of Result instances).
        """
        del limit  # Unused.
        params = [self._reaction_ids]
        statement = sql.SQL("""
            SELECT DISTINCT dataset_id, reaction_id, serialized
            FROM reactions
            WHERE reaction_id = ANY (%s)""")
        rendered = cursor.mogrify(statement, params).decode()
        logger.info("Running SQL command:%s", rendered)
        cursor.execute(statement, params)
        return fetch_results(cursor)
Exemplo n.º 6
0
    def run(self,
            cursor: psycopg2.extensions.cursor,
            limit: Optional[int] = None) -> List[Result]:
        """Samples rows from the reactions table.

        The query uses TABLESAMPLE SYSTEM_ROWS with self._num_rows as its
        argument, so the sample size is controlled by that attribute and
        not by ``limit``.

        Args:
            cursor: psycopg.cursor instance.
            limit: Not used; present for compatibility.

        Returns:
            The value of fetch_results(cursor) after the query has been
            executed (annotated as a list of Result instances).
        """
        del limit  # Unused.
        query = sql.SQL("""
            SELECT DISTINCT dataset_id, reaction_id, serialized
            FROM reactions
            TABLESAMPLE SYSTEM_ROWS (%s)""")
        args = [self._num_rows]
        logger.info("Running SQL command:%s",
                    cursor.mogrify(query, args).decode())
        cursor.execute(query, args)
        return fetch_results(cursor)