Example #1
def insert_annotation_data(self, chunk: List[int], mat_metadata: dict):
    """Insert annotation data into database

    Args:
        chunk (List[int]): chunk of annotation ids
        mat_metadata (dict): materialized metadata
    Returns:
        bool: True if data was inserted
    """
    aligned_volume = mat_metadata["aligned_volume"]
    analysis_version = mat_metadata["analysis_version"]
    annotation_table_name = mat_metadata["annotation_table_name"]
    datastack = mat_metadata["datastack"]

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    # build table models
    AnnotationModel = create_annotation_model(mat_metadata,
                                              with_crud_columns=False)
    SegmentationModel = create_segmentation_model(mat_metadata)
    analysis_table = get_analysis_table(aligned_volume, datastack,
                                        annotation_table_name,
                                        analysis_version)

    query_columns = []
    for col in AnnotationModel.__table__.columns:
        query_columns.append(col)
    for col in SegmentationModel.__table__.columns:
        if not col.name == "id":
            query_columns.append(col)

    chunked_id_query = query_id_range(AnnotationModel.id, chunk[0], chunk[1])

    anno_ids = (session.query(
        AnnotationModel.id).filter(chunked_id_query).filter(
            AnnotationModel.valid == True))
    query = (session.query(*query_columns).join(SegmentationModel).filter(
        SegmentationModel.id == AnnotationModel.id).filter(
            SegmentationModel.id.in_(anno_ids)))
    data = query.all()
    mat_df = pd.DataFrame(data)
    mat_df = mat_df.to_dict(orient="records")
    SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
    analysis_sql_uri = create_analysis_sql_uri(SQL_URI_CONFIG, datastack,
                                               analysis_version)
    analysis_session, analysis_engine = create_session(analysis_sql_uri)

    try:
        analysis_engine.execute(analysis_table.insert(), mat_df)
    except Exception as e:
        celery_logger.error(e)
        analysis_session.rollback()
    finally:
        analysis_session.close()
        analysis_engine.dispose()
        session.close()
        engine.dispose()
    return True
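
A sketch of how this task might be invoked; the mat_metadata keys mirror the ones read above, and every value is a hypothetical placeholder.

# Hypothetical inputs for illustration only; the real mat_metadata is built by
# the materialization workflow and contains additional keys.
mat_metadata = {
    "aligned_volume": "my_aligned_volume",      # placeholder database name
    "analysis_version": 1,
    "annotation_table_name": "my_anno_table",   # placeholder table name
    "datastack": "my_datastack",                # placeholder datastack name
}
chunk = [0, 10_000]  # [start_id, end_id] range passed to query_id_range

# If registered as a bound Celery task (as the self parameter suggests), it
# would normally be queued rather than called directly:
# insert_annotation_data.si(chunk, mat_metadata).apply_async()
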
Example #2
def add_indices(self, mat_metadata: dict):
    """Find missing indices for a given table contained
    in the mat_metadata dict. Spawns a chain of celery
    tasks that run synchronously that add an index per task.

    Args:
        mat_metadata (dict): datastack info for the aligned_volume derived from the infoservice

    Returns:
        chain: chain of celery tasks
    """
    add_indices = mat_metadata.get("add_indices", False)
    if add_indices:
        analysis_version = mat_metadata.get("analysis_version")
        datastack = mat_metadata["datastack"]
        analysis_database = mat_metadata["analysis_database"]
        SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
        analysis_sql_uri = create_analysis_sql_uri(SQL_URI_CONFIG, datastack,
                                                   analysis_version)

        analysis_session, analysis_engine = create_session(analysis_sql_uri)

        annotation_table_name = mat_metadata.get("annotation_table_name")
        schema = mat_metadata.get("schema")

        table_metadata = None
        if mat_metadata.get("reference_table"):
            table_metadata = {
                "reference_table": mat_metadata.get("reference_table")
            }

        model = make_flat_model(
            table_name=annotation_table_name,
            schema_type=schema,
            segmentation_source=None,
            table_metadata=table_metadata,
        )

        commands = index_cache.add_indices_sql_commands(
            annotation_table_name, model, analysis_engine)
        analysis_session.close()
        analysis_engine.dispose()

        add_index_tasks = chain(
            [add_index.si(analysis_database, command) for command in commands])

        return self.replace(add_index_tasks)
    return "Indices already exist"
Example #3
def get(self, aligned_volume_name, version, tablename):
    check_aligned_volume(aligned_volume_name)
    SQL_URI_CONFIG = current_app.config["SQLALCHEMY_DATABASE_URI"]
    sql_base_uri = SQL_URI_CONFIG.rpartition("/")[0]
    sql_uri = make_url(f"{sql_base_uri}/{aligned_volume_name}")
    session, engine = create_session(sql_uri)
    metadata = MetaData()
    try:
        annotation_table = Table(tablename,
                                 metadata,
                                 autoload=True,
                                 autoload_with=engine)
    except NoSuchTableError as e:
        logging.error(f"No table exists {e}")
        return abort(404)
    response = session.query(annotation_table).limit(10).all()
    annotations = [r._asdict() for r in response]
    if annotations:
        return annotations, 200
    else:
        return abort(404)
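
Table reflection as used in this handler can be demonstrated against a throwaway SQLite database; the sketch below assumes the SQLAlchemy 1.x API that the handler itself uses.

from sqlalchemy import MetaData, Table, create_engine

# Throwaway in-memory database standing in for the aligned-volume Postgres db.
engine = create_engine("sqlite://")
engine.execute("CREATE TABLE example (id INTEGER PRIMARY KEY, label TEXT)")
engine.execute("INSERT INTO example (label) VALUES ('a'), ('b')")

metadata = MetaData()
# Reflect the table definition from the live database instead of declaring it.
example = Table("example", metadata, autoload=True, autoload_with=engine)
for row in engine.execute(example.select().limit(10)):
    print(tuple(row))
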
Example #4
def drop_indices(self, mat_metadata: dict):
    """Drop all indices of a given table.

    Args:
        mat_metadata (dict): datastack info for the aligned_volume derived from the infoservice

    Returns:
        str: message stating whether indices were dropped.
    """
    add_indices = mat_metadata.get("add_indices", False)
    if add_indices:
        analysis_version = mat_metadata.get("analysis_version", None)
        datastack = mat_metadata["datastack"]
        temp_mat_table_name = mat_metadata["temp_mat_table_name"]
        SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
        analysis_sql_uri = create_analysis_sql_uri(SQL_URI_CONFIG, datastack,
                                                   analysis_version)

        analysis_session, analysis_engine = create_session(analysis_sql_uri)
        index_cache.drop_table_indices(temp_mat_table_name, analysis_engine)
        analysis_session.close()
        analysis_engine.dispose()
        return "Indices DROPPED"
    return "No indices dropped"
Example #5
def remove_expired_databases(delete_threshold: int = 5) -> list:
    """
    Remove materialized databases that have expired as of the time this method
    is called. Databases are only dropped for a datastack once it has more than
    delete_threshold materialized databases.
    """
    aligned_volume_databases = get_aligned_volumes_databases()
    datastacks = get_config_param("DATASTACKS")
    current_time = datetime.utcnow()
    remove_db_cron_info = []

    for datastack in datastacks:
        datastack_info = get_datastack_info(datastack)
        aligned_volume = datastack_info["aligned_volume"]["name"]
        if aligned_volume in aligned_volume_databases:
            SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
            sql_base_uri = SQL_URI_CONFIG.rpartition("/")[0]
            sql_uri = make_url(f"{sql_base_uri}/{aligned_volume}")
            session, engine = create_session(sql_uri)
            session.expire_on_commit = False
            # get number of expired dbs that are ready for deletion
            try:
                expired_results = (session.query(AnalysisVersion).filter(
                    AnalysisVersion.expires_on <= current_time).all())
                expired_versions = [
                    str(expired_db) for expired_db in expired_results
                ]

            except Exception as sql_error:
                celery_logger.error(f"Error: {sql_error}")
                continue

            # get databases that currently exist, filtered to this datastack's materialized dbs
            result = engine.execute(
                "SELECT datname FROM pg_database;").fetchall()
            database_list = list(itertools.chain.from_iterable(result))
            databases = [
                database for database in database_list
                if database.startswith(datastack)
            ]

            # get databases to delete that are currently present
            databases_to_delete = [
                database for database in databases
                if database in expired_versions
            ]

            dropped_dbs_info = {
                "aligned_volume": aligned_volume,
                "materialized_databases": (databases, f"count={len(databases)}"),
                "expired_databases": (
                    expired_versions,
                    f"count={len(expired_versions)}",
                ),
                "delete_threshold": delete_threshold,
            }
            dropped_dbs = []

            if len(databases) > delete_threshold:
                with engine.connect() as conn:
                    conn.execution_options(isolation_level="AUTOCOMMIT")
                    for database in databases_to_delete:
                        try:
                            sql = (
                                "SELECT 1 FROM pg_database WHERE datname='%s'"
                                % database)
                            result_proxy = conn.execute(sql)
                            result = result_proxy.scalar()
                            if result:
                                drop_connections = f"""
                                SELECT 
                                    pg_terminate_backend(pid) 
                                FROM 
                                    pg_stat_activity
                                WHERE 
                                    datname = '{database}'
                                AND pid <> pg_backend_pid()
                                """

                                conn.execute(drop_connections)
                                celery_logger.info(
                                    f"Dropped connections to: {database}")
                                sql = "DROP DATABASE %s" % database
                                result_proxy = conn.execute(sql)
                                celery_logger.info(
                                    f"Database: {database} removed")

                                # strip version from database string
                                database_version = database.rsplit("__mat")[-1]

                                expired_database = (
                                    session.query(AnalysisVersion).filter(
                                        AnalysisVersion.version ==
                                        database_version).one())
                                expired_database.valid = False
                                session.commit()
                                celery_logger.info(
                                    f"Database '{expired_database}' dropped")
                                dropped_dbs.append(expired_database)
                                dropped_dbs_info[
                                    "dropped_databases"] = dropped_dbs
                        except Exception as e:
                            celery_logger.error(
                                f"ERROR: {e}: {database} does not exist")
            remove_db_cron_info.append(dropped_dbs_info)
            session.close()
    return remove_db_cron_info
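
The terminate-connections-then-drop pattern used above, extracted into a standalone sketch; the base URI and database name are placeholders, and AUTOCOMMIT is set on the engine because DROP DATABASE cannot run inside a transaction.

from sqlalchemy import create_engine

def drop_database_sketch(sql_base_uri: str, database: str) -> None:
    # AUTOCOMMIT because DROP DATABASE cannot run inside a transaction block.
    engine = create_engine(f"{sql_base_uri}/postgres",
                           isolation_level="AUTOCOMMIT")
    with engine.connect() as conn:
        # Kick out any sessions still attached to the target database.
        conn.execute(
            "SELECT pg_terminate_backend(pid) FROM pg_stat_activity "
            f"WHERE datname = '{database}' AND pid <> pg_backend_pid()"
        )
        conn.execute(f'DROP DATABASE IF EXISTS "{database}"')
    engine.dispose()

# drop_database_sketch("postgresql://localhost", "my_datastack__mat1")  # placeholders
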
Example #6
def merge_tables(self, mat_metadata: dict):
    """Merge all the annotation and segmentation rows into a new table that are
    flagged as valid. Drop the original split tables after inserting all the rows
    into the new table.

    Args:
        mat_metadata (dict): datastack info for the aligned_volume from the infoservice
        analysis_version (int): materialized version number

    Raises:
        e: error during table merging operation

    Returns:
        str: number of rows copied
    """
    analysis_version = mat_metadata["analysis_version"]
    annotation_table_name = mat_metadata["annotation_table_name"]
    segmentation_table_name = mat_metadata["segmentation_table_name"]
    temp_table_name = mat_metadata["temp_mat_table_name"]
    schema = mat_metadata["schema"]
    datastack = mat_metadata["datastack"]

    # create dynamic sql_uri
    SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
    analysis_sql_uri = create_analysis_sql_uri(SQL_URI_CONFIG, datastack,
                                               analysis_version)

    # get schema and match column order for sql query
    anno_schema = get_schema(schema)
    flat_schema = create_flattened_schema(anno_schema)

    ordered_model_columns = create_table_dict(
        table_name=annotation_table_name,
        Schema=flat_schema,
        segmentation_source=None,
        table_metadata=None,
        with_crud_columns=False,
    )

    AnnotationModel = create_annotation_model(mat_metadata,
                                              with_crud_columns=False)
    SegmentationModel = create_segmentation_model(mat_metadata)

    query_columns = {}
    crud_columns = ["created", "deleted", "superceded_id"]
    for col in AnnotationModel.__table__.columns:
        if col.name not in crud_columns:
            query_columns[col.name] = col
    for col in SegmentationModel.__table__.columns:
        if not col.name == "id":
            query_columns[col.name] = col

    sorted_columns = OrderedDict([(key, query_columns[key])
                                  for key in ordered_model_columns
                                  if key in query_columns.keys()])
    sorted_columns_list = list(sorted_columns.values())
    columns = [f'"{col.table}".{col.name}' for col in sorted_columns_list]

    mat_session, mat_engine = create_session(analysis_sql_uri)

    query = f"""
        SELECT 
            {', '.join(columns)}
        FROM 
            {AnnotationModel.__table__.name}
        JOIN 
            "{SegmentationModel.__table__.name}"
            ON {AnnotationModel.id} = "{SegmentationModel.__table__.name}".id
        WHERE
            {AnnotationModel.id} = "{SegmentationModel.__table__.name}".id
        AND {AnnotationModel.valid} = true

    """

    try:
        mat_db_connection = mat_engine.connect()
        with mat_db_connection.begin():
            insert_query = mat_db_connection.execute(
                f"CREATE TABLE {temp_table_name} AS ({query});")
            row_count = insert_query.rowcount
            drop_query = mat_db_connection.execute(
                f'DROP TABLE {annotation_table_name}, "{segmentation_table_name}" CASCADE;'
            )
            alter_query = mat_db_connection.execute(
                f"ALTER TABLE {temp_table_name} RENAME TO {annotation_table_name};"
            )
        mat_session.close()
        mat_engine.dispose()

        return f"Number of rows copied: {row_count}"
    except Exception as e:
        celery_logger.error(e)
        raise (e)
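
The create-table-as-select, drop, rename swap at the heart of this task can be written out on its own; the URI, table names, and the supervoxel_id column below are placeholders rather than the project's real schema.

from sqlalchemy import create_engine

engine = create_engine("postgresql://localhost/analysis_db")  # placeholder URI
swap_statements = [
    # 1. Materialize the joined, valid-only rows into a temporary table.
    """CREATE TABLE temp_table AS (
           SELECT anno.*, seg.supervoxel_id
           FROM anno_table anno
           JOIN seg_table seg ON anno.id = seg.id
           WHERE anno.valid = true
       );""",
    # 2. Remove the original split tables.
    "DROP TABLE anno_table, seg_table CASCADE;",
    # 3. Give the merged table the annotation table's name.
    "ALTER TABLE temp_table RENAME TO anno_table;",
]
with engine.connect() as conn:
    with conn.begin():  # run the swap in one transaction, as the task above does
        for statement in swap_statements:
            conn.execute(statement)
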
Example #7
def create_materialized_metadata(
    self,
    datastack_info: dict,
    analysis_version: int,
    materialization_time_stamp: datetime.datetime,
):
    """Creates a metadata table in a materialized database. Reads row counts
    from annotation tables copied to the materialized database. Inserts row count
    and table info into the metadata table.

    Args:
        datastack_info (dict): datastack info for the aligned_volume from the infoservice
        analysis_version (int): materialized version number
        materialization_time_stamp (datetime.datetime): timestamp of the materialization

    Raises:
       database_error:  sqlalchemy connection error

    Returns:
        bool: True if the metadata table was created and table info was inserted.
    """
    aligned_volume = datastack_info["aligned_volume"]["name"]
    datastack = datastack_info["datastack"]
    SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
    sql_base_uri = SQL_URI_CONFIG.rpartition("/")[0]
    sql_uri = make_url(f"{sql_base_uri}/{aligned_volume}")
    analysis_sql_uri = create_analysis_sql_uri(SQL_URI_CONFIG, datastack,
                                               analysis_version)

    session, engine = create_session(sql_uri)
    analysis_session, analysis_engine = create_session(analysis_sql_uri)

    try:
        mat_table = MaterializedMetadata()
        mat_table.__table__.create(bind=analysis_engine)  # pylint: disable=maybe-no-member
    except Exception as e:
        celery_logger.error(f"Materialized Metadata table creation failed {e}")

    mat_client = dynamic_annotation_cache.get_db(
        f"{datastack}__mat{analysis_version}")

    tables = session.query(AnnoMetadata).all()
    try:
        for table in tables:
            # only create table if marked as valid in the metadata table
            if table.valid:
                table_name = table.table_name
                schema_type = (session.query(AnnoMetadata.schema_type).filter(
                    AnnoMetadata.table_name == table_name).one())

                valid_row_count = mat_client._get_table_row_count(
                    table_name, filter_valid=True)
                celery_logger.info(f"Row count {valid_row_count}")
                if valid_row_count == 0:
                    continue

                mat_metadata = MaterializedMetadata(
                    schema=schema_type[0],
                    table_name=table_name,
                    row_count=valid_row_count,
                    materialized_timestamp=materialization_time_stamp,
                )
                analysis_session.add(mat_metadata)
                analysis_session.commit()
    except Exception as database_error:
        analysis_session.rollback()
        session.rollback()
        celery_logger.error(database_error)
    finally:
        session.close()
        engine.dispose()
        mat_client.cached_session.close()
        analysis_session.close()
        analysis_engine.dispose()
    return True
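
MaterializedMetadata is project code not shown here; below is a minimal sketch of the single-table create-and-insert pattern used above, with a stand-in declarative model whose columns are inferred from the constructor call in the task.

import datetime

from sqlalchemy import Column, DateTime, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

class MaterializedMetadataSketch(Base):
    """Stand-in for the real MaterializedMetadata model."""
    __tablename__ = "materializedmetadata"
    id = Column(Integer, primary_key=True)
    schema = Column(String)
    table_name = Column(String)
    row_count = Column(Integer)
    materialized_timestamp = Column(DateTime)

engine = create_engine("sqlite://")  # throwaway database for the sketch
# Create just this one table, as the task does with MaterializedMetadata.
MaterializedMetadataSketch.__table__.create(bind=engine, checkfirst=True)

Session = sessionmaker(bind=engine)
session = Session()
session.add(MaterializedMetadataSketch(
    schema="synapse",              # placeholder schema type
    table_name="my_anno_table",    # placeholder table name
    row_count=42,
    materialized_timestamp=datetime.datetime.utcnow(),
))
session.commit()
session.close()
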
Example #8
def create_new_version(
    datastack_info: dict,
    materialization_time_stamp: datetime.datetime,
    days_to_expire: int = None,
):
    """Create new versioned database row in the analysis_version table.
    Sets the expiration date for the database.

    Args:
        datastack_info (dict): datastack info from infoservice
        materialization_time_stamp (datetime.datetime): UTC timestamp of the root_id lookup
        days_to_expire (int, optional): Number of days until the database is flagged
            as expired. Defaults to None (no expiration).

    Returns:
        int: version number of the materialized database
    """
    aligned_volume = datastack_info["aligned_volume"]["name"]
    datastack = datastack_info.get("datastack")

    table_objects = [
        AnalysisVersion.__tablename__,
        AnalysisTable.__tablename__,
    ]
    SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
    sql_base_uri = SQL_URI_CONFIG.rpartition("/")[0]
    sql_uri = make_url(f"{sql_base_uri}/{aligned_volume}")

    session, engine = create_session(sql_uri)

    # create analysis metadata table if not exists
    for table in table_objects:
        if not engine.dialect.has_table(engine, table):
            Base.metadata.tables[table].create(bind=engine)

    top_version = session.query(func.max(AnalysisVersion.version)).scalar()

    if top_version is None:
        new_version_number = 1
    else:
        new_version_number = top_version + 1
    if days_to_expire and days_to_expire > 0:
        expiration_date = materialization_time_stamp + datetime.timedelta(
            days=days_to_expire)
    else:
        expiration_date = None

    analysisversion = AnalysisVersion(
        datastack=datastack,
        time_stamp=materialization_time_stamp,
        version=new_version_number,
        valid=False,
        expires_on=expiration_date,
    )
    try:
        session.add(analysisversion)
        session.commit()
    except Exception as e:
        session.rollback()
        celery_logger.error(e)
    finally:
        session.close()
        engine.dispose()
    return new_version_number
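
A hypothetical invocation of the function above; the datastack_info keys mirror the ones read in the body, and the values are placeholders.

import datetime

# Placeholder info; the real dict comes from the infoservice.
datastack_info = {
    "aligned_volume": {"name": "my_aligned_volume"},
    "datastack": "my_datastack",
}
new_version = create_new_version(
    datastack_info=datastack_info,
    materialization_time_stamp=datetime.datetime.utcnow(),
    days_to_expire=5,
)
print(f"created analysis version {new_version}")
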