Example #1
def insert_events(data: pandas.DataFrame,
                  mapping: dict,
                  db_cursor: DictCursor,
                  batch_size: int = 1000) -> pandas.DataFrame:
    """
    Inserts sampling events, returning the given dataframe with updated
    `pid`'s from the database.
    """
    base_query, field_mapping = get_base_query(mapping['event'])

    total = len(data.values)
    start = 0
    end = min(total, batch_size)

    pids = []
    while start < total:
        logging.info("   * inserting %s to %s", start, end)
        values = format_values(data, field_mapping, start, end)

        query = f"{base_query} VALUES {', '.join(values)} RETURNING pid;"

        try:
            db_cursor.execute(query)
            pids += [r['pid'] for r in db_cursor.fetchall()]
        except psycopg2.Error as err:
            logging.error(err)
            logging.error("No data were imported.")
            sys.exit(1)

        start = end
        end = min(total, end + batch_size)

    # assign pids to data for future joins
    return data.assign(pid=pids)
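A minimal usage sketch (not from the source) of how this helper might be driven; the connection string, `event_data`, and the `mapping` contents are assumptions:

import psycopg2
from psycopg2.extras import DictCursor

# Hypothetical wiring: connection string and input data are assumptions.
connection = psycopg2.connect("dbname=asvdb user=importer")
with connection, connection.cursor(cursor_factory=DictCursor) as cursor:
    events = insert_events(event_data, mapping, cursor, batch_size=500)
# `events` now carries a `pid` column for joining dependent tables.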
Example #2
def insert_common(data: pandas.DataFrame,
                  mapping: dict,
                  db_cursor: DictCursor,
                  batch_size: int = 1000):
    """
    Inserts `data` into the database based on the given `mapping`.
    """
    base_query, field_mapping = get_base_query(mapping)

    total = len(data.values)
    start = 0
    end = min(total, batch_size)

    while start < total:
        logging.info("   * inserting %s to %s", start, end)
        values = format_values(data, field_mapping, start, end)

        query = f"{base_query} VALUES {', '.join(values)};"

        try:
            db_cursor.execute(query)
        except psycopg2.Error as err:
            logging.error(err)
            logging.error("No data were imported.")
            sys.exit(1)

        start = end
        end = min(total, end + batch_size)
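Both insert helpers above delegate value quoting to `format_values` and statement construction to `get_base_query`, neither of which appears in this listing. A minimal sketch of what `format_values` could look like, assuming `field_mapping` is an ordered collection of dataframe column names:

from psycopg2.extensions import adapt

def format_values(data, field_mapping, start, end):
    # Hypothetical reconstruction: render one "(v1, v2, ...)" tuple per row
    # using psycopg2's adapter, which quotes strings and maps None to NULL.
    rows = []
    for _, row in data.iloc[start:end].iterrows():
        quoted = []
        for column in field_mapping:
            value = row[column]
            if hasattr(value, 'item'):  # unwrap numpy scalars for adapt()
                value = value.item()
            quoted.append(adapt(value).getquoted().decode())
        rows.append('(%s)' % ', '.join(quoted))
    return rows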
Example #3
def list_datasets_in_bioatlas(cursor: DictCursor) -> list:
    """
    Returns a list of all datasets available in the database where
    `in_bioatlas` is `true`.
    """
    query = "SELECT pid, dataset_id FROM public.dataset WHERE in_bioatlas;"

    cursor.execute(query)
    return [dict(row) for row in cursor.fetchall()]
Example #4
def create_input_fasta(cursor: DictCursor, filename: str = 'blast_input'):
    """
    Uses the given database `cursor` to extract sequences to be placed in
    `filename`.
    """
    logging.info("Creating input fasta file: %s.fasta", filename)

    with open(f'{filename}.fasta', 'w') as fasta:
        cursor.execute("SELECT * FROM api.app_asvs_for_blastdb;")
        for asv_id, taxonomy, sequence in cursor.fetchall():
            fasta.write('>%s-%s\n%s\n' % (asv_id, taxonomy, sequence))
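A hypothetical downstream step (the connection string is an assumption): dump the sequences and build a BLAST database from them. `makeblastdb` is part of NCBI BLAST+, and `-in`/`-dbtype` are its standard options.

import subprocess
import psycopg2
from psycopg2.extras import DictCursor

conn = psycopg2.connect("dbname=asvdb")  # connection string is an assumption
with conn, conn.cursor(cursor_factory=DictCursor) as cursor:
    create_input_fasta(cursor, 'blast_input')
# Build a nucleotide BLAST database from the generated fasta file.
subprocess.run(['makeblastdb', '-in', 'blast_input.fasta', '-dbtype', 'nucl'],
               check=True)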
Example #5
def create_tables(cur: DictCursor, conn: DictConnection):
    """
    Function that creates all tables from `CREATE_TABLE_QUERIES`

    Parameters
    ----------
    cur
        Database cursor used to execute each query.
    conn
        Open database connection, used to commit after each statement.
    """
    for query in CREATE_TABLE_QUERIES:
        cur.execute(query)
        conn.commit()
Example #6
def load_staging_tables(cur: DictCursor, conn: DictConnection):
    """
    Function that loads all data from the source S3 bucket into the staging
    Redshift tables by using the `COPY_TABLE_QUERIES` variable

    Parameters
    ----------
    cur
        Database cursor used to execute each COPY query.
    conn
        Open database connection, used to commit after each load.
    """
    for query in COPY_TABLE_QUERIES:
        cur.execute(query)
        conn.commit()
Example #7
def insert_tables(cur: DictCursor, conn: DictConnection):
    """
    Function that inserts data from the staging Redshift tables into the
    Redshift analytical tables by using the `INSERT_TABLE_QUERIES` variable

    Parameters
    ----------
    cur
        Database cursor used to execute each INSERT query.
    conn
        Open database connection, used to commit after each insert.
    """
    for query in INSERT_TABLE_QUERIES:
        cur.execute(query)
        conn.commit()
Example #8
def drop_tables(cur: DictCursor, conn: DictConnection):
    """
    Function that drops all tables from `DROP_TABLES` by using the
    `DROP_TABLE_FORMAT` string format

    Parameters
    ----------
    cur
        Database cursor used to execute each DROP statement.
    conn
        Open database connection, used to commit after each drop.
    """
    for table in DROP_TABLES:
        cur.execute(DROP_TABLE_FORMAT.format(table=table))
        conn.commit()
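Examples #5 through #8 share the same cursor/commit pattern; a hypothetical driver (host, credentials, and the query lists are assumptions) would run them in dependency order:

import psycopg2
from psycopg2.extras import DictCursor

# Hypothetical entry point; Redshift speaks the PostgreSQL wire protocol,
# so a plain psycopg2 connection works.
conn = psycopg2.connect(host='example-cluster.redshift.amazonaws.com',
                        port=5439, dbname='warehouse',
                        user='etl', password='...')
cur = conn.cursor(cursor_factory=DictCursor)

drop_tables(cur, conn)           # start from a clean slate
create_tables(cur, conn)         # staging + analytical schemas
load_staging_tables(cur, conn)   # COPY from S3 into staging
insert_tables(cur, conn)         # transform staging -> analytical

conn.close()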
Example #9
def insert_dataset(data: pandas.DataFrame, mapping: dict,
                   db_cursor: DictCursor) -> int:
    """
    Inserts a single dataset into the database, and returns the database `pid`.
    """
    base_query, field_mapping = get_base_query(mapping['dataset'])

    if len(data.values) != 1:
        logging.error("There must be exactly one dataset to insert")
        sys.exit(1)

    values = format_values(data, field_mapping, 0, 1)
    query = f"{base_query} VALUES {', '.join(values)} RETURNING pid;"

    try:
        db_cursor.execute(query)
    except psycopg2.Error as err:
        logging.error(err)
        logging.error("No data were imported.")
        sys.exit(1)

    return db_cursor.fetchall()[0]['pid']
Example #10
def insert_asvs(data: pandas.DataFrame,
                mapping: dict,
                db_cursor: DictCursor,
                batch_size: int = 1000) -> tuple[pandas.DataFrame, int]:
    """
    Inserts ASVs into the database, returning the database `pid`'s. Unlike the
    other categories, a conflicting ASV returns the `pid` of the previously
    registered entry.
    """
    base_query, field_mapping = get_base_query(mapping['asv'])

    total = len(data.values)
    start = 0
    end = min(total, batch_size)

    # get max pid before insert (this helps us figure out which ASVs were
    # already in the database).
    db_cursor.execute("SELECT MAX(pid) FROM asv;")
    old_max_pid = db_cursor.fetchone()[0]

    pids = []
    while start < total:
        logging.info("   * inserting %s to %s", start, end)
        values = format_values(data, field_mapping, start, end)

        query = f"{base_query} VALUES {', '.join(values)} " + \
                "ON CONFLICT (asv_sequence) DO UPDATE SET pid = asv.pid " + \
                "RETURNING pid;"

        try:
            db_cursor.execute(query)
            pids += [r['pid'] for r in db_cursor.fetchall()]
        except psycopg2.Error as err:
            logging.error(err)
            logging.error("No data were imported.")
            sys.exit(1)

        start = end
        end = min(total, end + batch_size)

    # assign pids to data for future joins
    return data.assign(pid=pids), old_max_pid or 0
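The no-op `DO UPDATE SET pid = asv.pid` exists because `ON CONFLICT DO NOTHING` would make `RETURNING` skip conflicting rows entirely. An illustration (not from the source; `asv_data` is a placeholder) of why `old_max_pid` is returned: since sequence-assigned `pid`'s are monotonic, comparing against the pre-insert maximum separates genuinely new ASVs from ones that were already registered:

asvs, old_max_pid = insert_asvs(asv_data, mapping, cursor)
new_asvs = asvs[asvs['pid'] > old_max_pid]     # rows inserted by this run
known_asvs = asvs[asvs['pid'] <= old_max_pid]  # rows that already existed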
Example #11
    def next(self):
        res = _cursor.next(self)
        self.log(res)
        return res
Example #12
    def fetchall(self):
        res = _cursor.fetchall(self)
        for row in res:
            self.log(row)
        return res
Example #13
    def fetchone(self):
        res = _cursor.fetchone(self)
        self.log(res)
        return res
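Examples #11 through #13 read like methods of a logging cursor wrapper. A guess at the enclosing class, assuming `_cursor` is the wrapped psycopg2 base cursor class and `log` is a user-supplied hook:

import logging
from psycopg2.extensions import cursor as _cursor

class LoggingCursor(_cursor):
    # Hypothetical reconstruction of the surrounding class.
    def log(self, msg):
        logging.debug("cursor result: %r", msg)

    def fetchone(self):
        res = _cursor.fetchone(self)
        self.log(res)
        return res

Such a class would be plugged in via `connection.cursor(cursor_factory=LoggingCursor)`; note that psycopg2.extras also ships a ready-made `LoggingConnection`/`LoggingCursor` pair.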
Example #14
    def __execute(self,
                  cursor: DictCursor,
                  query_args: tuple,
                  double_percentage_sign_marker: str,
                  print_warnings: bool) -> None:
        """Execute statement, set up cursor to results."""

        # MC_REWRITE_TO_PYTHON: 'query_args' should be decoded from 'bytes' at this point

        if len(query_args) == 0:
            raise McDatabaseResultException('No query or its parameters.')
        if len(query_args[0]) == 0:
            raise McDatabaseResultException('Query is empty or undefined.')

        try:

            if len(query_args) == 1:
                # If only a query without any parameters (tuple or dictionary) are passed, psycopg2 is happy to operate
                # on a single literal '%' because it doesn't even try to do its own interpolation. However, with some
                # parameters present (e.g. a dictionary) psycopg2 then tries to do the interpolation and expects literal
                # '%' to be duplicated ('%%'). To unify the behavior, we always pass a parameter (even if it's empty)
                # to execute().
                query_args = (query_args[0], {},)

            query = query_args[0]

            # Duplicate '%' everywhere except for psycopg2 parameter placeholders ('%s' and '%(...)s')
            query = re.sub(r'%(?!(s|\(.*?\)s?))', '%%', query)

            # Replace percentage signs coming from quote()d strings with double percentage signs
            query = query.replace(double_percentage_sign_marker, '%%')

            query_args_list = list(query_args)
            query_args_list[0] = query
            query_args = tuple(query_args_list)

            log.debug("Running query: %s" % str(query_args))

            t = time.time()

            cursor.execute(*query_args)

            query_time = time.time() - t
            if query_time >= 1:
                query_text = textwrap.shorten(str(query_args[0]), width=80)
                query_params = textwrap.shorten(str(query_args[1:]), width=80)
                log.info("Slow query (%d seconds): %s, %s" % (query_time, query_text, query_params))

        except psycopg2.Warning as ex:
            if print_warnings:
                log.warning('Warning while running query: %s' % str(ex))
            else:
                log.debug('Warning while running query: %s' % str(ex))

        except psycopg2.ProgrammingError as ex:
            raise McDatabaseResultException(
                'Invalid query: %(exception)s; query: %(query)s' % {
                    'exception': str(ex),
                    'query': str(query_args),
                })

        except psycopg2.Error as ex:

            try:
                mogrified_query = cursor.mogrify(*query_args)
            except Exception as ex:
                # Can't mogrify
                raise McDatabaseResultException(
                    'Query failed: %(exception)s; query: %(query)s' % {
                        'exception': str(ex),
                        'query': str(query_args),
                    })
            else:
                raise McDatabaseResultException(
                    'Query failed: %(exception)s; query: %(query)s; mogrified query: %(mogrified_query)s' % {
                        'exception': str(ex),
                        'query': str(query_args),
                        'mogrified_query': str(mogrified_query),
                    })

        except Exception as ex:
            raise McDatabaseResultException(
                'Invalid query (DBD::Pg -> psycopg2 query conversion?): %(exception)s; query: %(query)s' % {
                    'exception': str(ex),
                    'query': str(query_args),
                })

        self.__cursor = cursor  # Cursor now holds results
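The percent-doubling step is the subtle part of this method; a standalone check (illustrative only) of the regular expression it uses:

import re

query = "SELECT 'a%b', %s, %(name)s"
# Literal '%' is doubled; '%s' and '%(name)s' placeholders survive intact.
print(re.sub(r'%(?!(s|\(.*?\)s?))', '%%', query))
# SELECT 'a%%b', %s, %(name)s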
Example #15
    def __execute(self,
                  cursor: DictCursor,
                  query_args: tuple,
                  double_percentage_sign_marker: str,
                  print_warnings: bool) -> None:
        """Execute statement, set up cursor to results."""

        # MC_REWRITE_TO_PYTHON: 'query_args' should be decoded from 'bytes' at this point

        if len(query_args) == 0:
            raise McDatabaseResultException('No query or its parameters.')
        if len(query_args[0]) == 0:
            raise McDatabaseResultException('Query is empty or undefined.')

        try:

            if len(query_args) == 1:
                # If only a query without any parameters (tuple or dictionary) are passed, psycopg2 is happy to operate
                # on a single literal '%' because it doesn't even try to do its own interpolation. However, with some
                # parameters present (e.g. a dictionary) psycopg2 then tries to do the interpolation and expects literal
                # '%' to be duplicated ('%%'). To unify the behavior, we always pass a parameter (even if it's empty)
                # to execute().
                query_args = (query_args[0], {},)

            query = query_args[0]

            # Duplicate '%' everywhere except for psycopg2 parameter placeholders ('%s' and '%(...)s')
            query = re.sub(r'%(?!(s|\(.*?\)s?))', '%%', query)

            # Replace percentage signs coming from quote()d strings with double percentage signs
            query = query.replace(double_percentage_sign_marker, '%%')

            query_args_list = list(query_args)
            query_args_list[0] = query
            query_args = tuple(query_args_list)

            log.debug("Running query: %s" % str(query_args))

            t = time.time()

            cursor.execute(*query_args)

            query_time = time.time() - t
            if query_time >= 1:
                query_text = textwrap.shorten(str(query_args[0]), width=80)
                query_params = textwrap.shorten(str(query_args[1:]), width=80)
                log.info("Slow query (%d seconds): %s, %s" % (query_time, query_text, query_params))

        except psycopg2.Warning as ex:
            if print_warnings:
                log.warning('Warning while running query: %s' % str(ex))
            else:
                log.debug('Warning while running query: %s' % str(ex))

        except psycopg2.ProgrammingError as ex:
            message = (query_args[0][0:1024], query_args[1:])
            raise McDatabaseResultException(
                'Invalid query: %(exception)s; query: %(query)s' % {
                    'exception': str(ex),
                    'query': str(message),
                })

        except psycopg2.Error as ex:

            ex_class = McDatabaseResultException
            if isinstance(ex, TransactionRollbackError):
                ex_class = McTransactionRollbackError
                if 'tuple to be locked was already moved to another partition due to concurrent update' in str(ex):
                    ex_class = McTupleAlreadyMovedError

            try:
                mogrified_query = cursor.mogrify(*query_args)
            except Exception as ex:
                # Can't mogrify
                raise ex_class(
                    'Query failed: %(exception)s; query: %(query)s' % {
                        'exception': str(ex),
                        'query': str(query_args),
                    })
            else:
                raise ex_class(
                    'Query failed: %(exception)s; query: %(query)s; mogrified query: %(mogrified_query)s' % {
                        'exception': str(ex),
                        'query': str(query_args),
                        'mogrified_query': str(mogrified_query),
                    })

        except Exception as ex:
            message = (query_args[0][0:1024], query_args[1:])
            raise McDatabaseResultException(
                'Invalid query (DBD::Pg -> psycopg2 query conversion?): %(exception)s; query: %(query)s' % {
                    'exception': str(ex),
                    'query': str(message),
                })

        self.__cursor = cursor  # Cursor now holds results
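This variant's distinction between `McTransactionRollbackError` and other failures matters to callers: deadlocks and serialization failures are safe to retry, while other errors are not. A hypothetical caller-side sketch (`db.query` is an assumed wrapper built on `__execute`):

# Hypothetical retry loop; names are assumptions, not from the source.
for attempt in range(3):
    try:
        db.query("UPDATE story SET processed = 't' WHERE stories_id = %(id)s",
                 {'id': 42})
        break
    except McTransactionRollbackError:
        continue  # deadlock / concurrent update: safe to retry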
Example #16
    def __execute(self, cursor: DictCursor, query_args: tuple,
                  double_percentage_sign_marker: str,
                  print_warnings: bool) -> None:
        """Execute statement, set up cursor to results."""

        # MC_REWRITE_TO_PYTHON: 'query_args' should be decoded from 'bytes' at this point

        if len(query_args) == 0:
            raise McDatabaseResultException('No query or its parameters.')
        if len(query_args[0]) == 0:
            raise McDatabaseResultException('Query is empty or undefined.')

        try:

            if len(query_args) == 1:
                # If only a query without any parameters (tuple or dictionary) are passed, psycopg2 is happy to operate
                # on a single literal '%' because it doesn't even try to do its own interpolation. However, with some
                # parameters present (e.g. a dictionary) psycopg2 then tries to do the interpolation and expects literal
                # '%' to be duplicated ('%%'). To unify the behavior, we always pass a parameter (even if it's empty)
                # to execute().
                query_args = (
                    query_args[0],
                    {},
                )

            query = query_args[0]

            # Duplicate '%' everywhere except for psycopg2 parameter placeholders ('%s' and '%(...)s')
            query = re.sub(r'%(?!(s|\(.*?\)s?))', '%%', query)

            # Replace percentage signs coming from quote()d strings with double percentage signs
            query = query.replace(double_percentage_sign_marker, '%%')

            query_args_list = list(query_args)
            query_args_list[0] = query
            query_args = tuple(query_args_list)

            log.debug("Running query: %s" % str(query_args))

            cursor.execute(*query_args)

        except psycopg2.Warning as ex:
            if print_warnings:
                log.warning('Warning while running query: %s' % str(ex))
            else:
                log.debug('Warning while running query: %s' % str(ex))

        except psycopg2.ProgrammingError as ex:
            raise McDatabaseResultException(
                'Invalid query: %(exception)s; query: %(query)s' % {
                    'exception': str(ex),
                    'query': str(query_args),
                })

        except psycopg2.Error as ex:

            try:
                mogrified_query = cursor.mogrify(*query_args)
            except Exception as ex:
                # Can't mogrify
                raise McDatabaseResultException(
                    'Query failed: %(exception)s; query: %(query)s' % {
                        'exception': str(ex),
                        'query': str(query_args),
                    })
            else:
                raise McDatabaseResultException(
                    'Query failed: %(exception)s; query: %(query)s; mogrified query: %(mogrified_query)s'
                    % {
                        'exception': str(ex),
                        'query': str(query_args),
                        'mogrified_query': str(mogrified_query),
                    })

        except Exception as ex:
            raise McDatabaseResultException(
                'Invalid query (DBD::Pg -> psycopg2 query conversion?): %(exception)s; query: %(query)s'
                % {
                    'exception': str(ex),
                    'query': str(query_args),
                })

        self.__cursor = cursor  # Cursor now holds results
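The error paths in all three variants rely on `cursor.mogrify()`, which renders the query exactly as psycopg2 would send it to the server. A quick illustration, assuming `cursor` comes from an open connection:

# mogrify() returns the final bytes after parameter interpolation, which is
# what makes the "mogrified query" diagnostics above readable.
print(cursor.mogrify("SELECT * FROM asv WHERE pid = %s", (42,)))
# b'SELECT * FROM asv WHERE pid = 42'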
Example #17
    def fetchmany(self, size=None):
        res = _cursor.fetchmany(self, size=size)
        return res
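For completeness, the usual paging pattern this wrapper supports (a sketch; `cursor` and `process` are placeholders):

cursor.execute("SELECT * FROM occurrence;")
while True:
    rows = cursor.fetchmany(500)  # stream in pages instead of fetchall()
    if not rows:
        break
    for row in rows:
        process(row)  # placeholder for real per-row work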