def insert_annotation_data(self, chunk: List[int], mat_metadata: dict):
    """Insert annotation data into database

    Args:
        chunk (List[int]): chunk of annotation ids
        mat_metadata (dict): materialized metadata

    Returns:
        bool: True if data was inserted
    """
    aligned_volume = mat_metadata["aligned_volume"]
    analysis_version = mat_metadata["analysis_version"]
    annotation_table_name = mat_metadata["annotation_table_name"]
    datastack = mat_metadata["datastack"]

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    # build table models
    AnnotationModel = create_annotation_model(mat_metadata, with_crud_columns=False)
    SegmentationModel = create_segmentation_model(mat_metadata)
    analysis_table = get_analysis_table(
        aligned_volume, datastack, annotation_table_name, analysis_version
    )

    query_columns = []
    for col in AnnotationModel.__table__.columns:
        query_columns.append(col)
    for col in SegmentationModel.__table__.columns:
        if not col.name == "id":
            query_columns.append(col)

    chunked_id_query = query_id_range(AnnotationModel.id, chunk[0], chunk[1])

    anno_ids = (
        session.query(AnnotationModel.id)
        .filter(chunked_id_query)
        .filter(AnnotationModel.valid == True)
    )
    query = (
        session.query(*query_columns)
        .join(SegmentationModel)
        .filter(SegmentationModel.id == AnnotationModel.id)
        .filter(SegmentationModel.id.in_(anno_ids))
    )
    data = query.all()
    mat_df = pd.DataFrame(data)
    mat_df = mat_df.to_dict(orient="records")
    SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
    analysis_sql_uri = create_analysis_sql_uri(
        SQL_URI_CONFIG, datastack, analysis_version
    )
    analysis_session, analysis_engine = create_session(analysis_sql_uri)

    try:
        analysis_engine.execute(analysis_table.insert(), [data for data in mat_df])
    except Exception as e:
        celery_logger.error(e)
        analysis_session.rollback()
    finally:
        analysis_session.close()
        analysis_engine.dispose()
        session.close()
        engine.dispose()
    return True


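# The chunked anno_ids query above relies on query_id_range() to turn a
# (start_id, end_id) chunk into a SQLAlchemy filter clause. A minimal sketch of
# such a helper is below, assuming a half-open [start_id, end_id) interval; the
# real helper's interval semantics may differ, and _query_id_range_sketch is an
# illustrative name only.
from sqlalchemy import and_


def _query_id_range_sketch(id_column, start_id: int, end_id: int):
    """Sketch only: build a filter for ids in [start_id, end_id)."""
    return and_(id_column >= start_id, id_column < end_id)

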
def add_indices(self, mat_metadata: dict):
    """Find missing indices for a given table contained in the mat_metadata dict.

    Spawns a chain of celery tasks that run sequentially, each adding one index.

    Args:
        mat_metadata (dict): datastack info for the aligned_volume derived from the infoservice

    Returns:
        chain: chain of celery tasks
    """
    add_indices = mat_metadata.get("add_indices", False)
    if add_indices:
        analysis_version = mat_metadata.get("analysis_version")
        datastack = mat_metadata["datastack"]
        analysis_database = mat_metadata["analysis_database"]
        SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
        analysis_sql_uri = create_analysis_sql_uri(
            SQL_URI_CONFIG, datastack, analysis_version
        )
        analysis_session, analysis_engine = create_session(analysis_sql_uri)

        annotation_table_name = mat_metadata.get("annotation_table_name")
        schema = mat_metadata.get("schema")

        table_metadata = None
        if mat_metadata.get("reference_table"):
            table_metadata = {"reference_table": mat_metadata.get("reference_table")}

        model = make_flat_model(
            table_name=annotation_table_name,
            schema_type=schema,
            segmentation_source=None,
            table_metadata=table_metadata,
        )

        commands = index_cache.add_indices_sql_commands(
            annotation_table_name, model, analysis_engine
        )
        analysis_session.close()
        analysis_engine.dispose()

        add_index_tasks = chain(
            [add_index.si(analysis_database, command) for command in commands]
        )

        return self.replace(add_index_tasks)
    return "Indices already exist"


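# add_indices() hands back a chain of immutable signatures (.si), so each index
# command runs in its own worker task and the tasks execute one after another,
# ignoring each other's return values. A minimal sketch of building such a
# chain with the same add_index task; the database name and command strings any
# caller passes in are placeholders, not real values.
def _build_add_index_chain_sketch(analysis_database: str, commands: list):
    # .si(...) makes each signature independent of the previous task's result;
    # chaining them only enforces sequential execution order.
    return chain([add_index.si(analysis_database, command) for command in commands])

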
def get(self, aligned_volume_name, version, tablename):
    check_aligned_volume(aligned_volume_name)
    SQL_URI_CONFIG = current_app.config["SQLALCHEMY_DATABASE_URI"]
    sql_base_uri = SQL_URI_CONFIG.rpartition("/")[0]
    sql_uri = make_url(f"{sql_base_uri}/{aligned_volume_name}")
    session, engine = create_session(sql_uri)
    metadata = MetaData()
    try:
        annotation_table = Table(
            tablename, metadata, autoload=True, autoload_with=engine
        )
    except NoSuchTableError as e:
        logging.error(f"No table exists {e}")
        return abort(404)
    response = session.query(annotation_table).limit(10).all()
    annotations = [r._asdict() for r in response]
    if annotations:
        return annotations, 200
    else:
        return abort(404)


def drop_indices(self, mat_metadata: dict):
    """Drop all indices of a given table.

    Args:
        mat_metadata (dict): datastack info for the aligned_volume derived from the infoservice

    Returns:
        str: message stating whether or not indices were dropped
    """
    add_indices = mat_metadata.get("add_indices", False)
    if add_indices:
        analysis_version = mat_metadata.get("analysis_version", None)
        datastack = mat_metadata["datastack"]
        temp_mat_table_name = mat_metadata["temp_mat_table_name"]
        SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
        analysis_sql_uri = create_analysis_sql_uri(
            SQL_URI_CONFIG, datastack, analysis_version
        )
        analysis_session, analysis_engine = create_session(analysis_sql_uri)
        index_cache.drop_table_indices(temp_mat_table_name, analysis_engine)
        analysis_session.close()
        analysis_engine.dispose()
        return "Indices DROPPED"
    return "No indices dropped"


def remove_expired_databases(delete_threshold: int = 5) -> list:
    """Remove databases that have expired as of the time this task is called."""
    aligned_volume_databases = get_aligned_volumes_databases()
    datastacks = get_config_param("DATASTACKS")
    current_time = datetime.utcnow()
    remove_db_cron_info = []

    for datastack in datastacks:
        datastack_info = get_datastack_info(datastack)
        aligned_volume = datastack_info["aligned_volume"]["name"]
        if aligned_volume in aligned_volume_databases:
            SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
            sql_base_uri = SQL_URI_CONFIG.rpartition("/")[0]
            sql_uri = make_url(f"{sql_base_uri}/{aligned_volume}")
            session, engine = create_session(sql_uri)
            session.expire_on_commit = False

            # get expired dbs that are ready for deletion
            try:
                expired_results = (
                    session.query(AnalysisVersion)
                    .filter(AnalysisVersion.expires_on <= current_time)
                    .all()
                )
                expired_versions = [str(expired_db) for expired_db in expired_results]
            except Exception as sql_error:
                celery_logger.error(f"Error: {sql_error}")
                continue

            # get databases that currently exist, filtered to materialized dbs
            result = engine.execute("SELECT datname FROM pg_database;").fetchall()
            database_list = list(itertools.chain.from_iterable(result))
            databases = [
                database for database in database_list
                if database.startswith(datastack)
            ]

            # get databases to delete that are currently present
            databases_to_delete = [
                database for database in databases if database in expired_versions
            ]

            dropped_dbs_info = {
                "aligned_volume": aligned_volume,
                "materialized_databases": (databases, f"count={len(databases)}"),
                "expired_databases": (
                    expired_versions,
                    f"count={len(expired_versions)}",
                ),
                "delete_threshold": delete_threshold,
            }
            dropped_dbs = []

            if len(databases) > delete_threshold:
                with engine.connect() as conn:
                    conn.execution_options(isolation_level="AUTOCOMMIT")
                    for database in databases_to_delete:
                        try:
                            sql = (
                                "SELECT 1 FROM pg_database WHERE datname='%s'"
                                % database
                            )
                            result_proxy = conn.execute(sql)
                            result = result_proxy.scalar()
                            if result:
                                drop_connections = f"""
                                SELECT pg_terminate_backend(pid)
                                FROM pg_stat_activity
                                WHERE datname = '{database}'
                                AND pid <> pg_backend_pid()
                                """
                                conn.execute(drop_connections)
                                celery_logger.info(
                                    f"Dropped connections to: {database}"
                                )

                                sql = "DROP DATABASE %s" % database
                                result_proxy = conn.execute(sql)
                                celery_logger.info(f"Database: {database} removed")

                                # strip version from database string
                                database_version = database.rsplit("__mat")[-1]

                                expired_database = (
                                    session.query(AnalysisVersion)
                                    .filter(AnalysisVersion.version == database_version)
                                    .one()
                                )
                                expired_database.valid = False
                                session.commit()
                                celery_logger.info(
                                    f"Database '{expired_database}' dropped"
                                )
                                dropped_dbs.append(expired_database)
                                dropped_dbs_info["dropped_databases"] = dropped_dbs
                        except Exception as e:
                            celery_logger.error(
                                f"ERROR: {e}: {database} does not exist"
                            )
            remove_db_cron_info.append(dropped_dbs_info)
            session.close()
    return remove_db_cron_info


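# The cleanup above assumes the materialized-database naming convention used
# elsewhere in this codebase, "{datastack}__mat{version}" (see the
# dynamic_annotation_cache.get_db call in create_materialized_metadata). A
# minimal sketch of how that convention round-trips; the helper names are
# illustrative only.
def _mat_database_name_sketch(datastack: str, version: int) -> str:
    # e.g. ("my_datastack", 3) -> "my_datastack__mat3"
    return f"{datastack}__mat{version}"


def _mat_database_version_sketch(database_name: str) -> str:
    # mirrors the database.rsplit("__mat")[-1] parsing used above
    return database_name.rsplit("__mat")[-1]

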
def merge_tables(self, mat_metadata: dict):
    """Merge the valid annotation and segmentation rows into a new flat table.
    Drop the original split tables after inserting all the rows into the new table.

    Args:
        mat_metadata (dict): datastack info for the aligned_volume from the infoservice

    Raises:
        e: error during table merging operation

    Returns:
        str: number of rows copied
    """
    analysis_version = mat_metadata["analysis_version"]
    annotation_table_name = mat_metadata["annotation_table_name"]
    segmentation_table_name = mat_metadata["segmentation_table_name"]
    temp_table_name = mat_metadata["temp_mat_table_name"]
    schema = mat_metadata["schema"]
    datastack = mat_metadata["datastack"]

    # create dynamic sql_uri
    SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
    analysis_sql_uri = create_analysis_sql_uri(
        SQL_URI_CONFIG, datastack, analysis_version
    )

    # get schema and match column order for sql query
    anno_schema = get_schema(schema)
    flat_schema = create_flattened_schema(anno_schema)

    ordered_model_columns = create_table_dict(
        table_name=annotation_table_name,
        Schema=flat_schema,
        segmentation_source=None,
        table_metadata=None,
        with_crud_columns=False,
    )

    AnnotationModel = create_annotation_model(mat_metadata, with_crud_columns=False)
    SegmentationModel = create_segmentation_model(mat_metadata)

    query_columns = {}
    crud_columns = ["created", "deleted", "superceded_id"]
    for col in AnnotationModel.__table__.columns:
        if col.name not in crud_columns:
            query_columns[col.name] = col
    for col in SegmentationModel.__table__.columns:
        if not col.name == "id":
            query_columns[col.name] = col

    sorted_columns = OrderedDict(
        [
            (key, query_columns[key])
            for key in ordered_model_columns
            if key in query_columns.keys()
        ]
    )
    sorted_columns_list = list(sorted_columns.values())
    columns = [f'"{col.table}".{col.name}' for col in sorted_columns_list]

    mat_session, mat_engine = create_session(analysis_sql_uri)

    query = f"""
        SELECT {', '.join(columns)}
        FROM {AnnotationModel.__table__.name}
        JOIN "{SegmentationModel.__table__.name}"
            ON {AnnotationModel.id} = "{SegmentationModel.__table__.name}".id
        WHERE {AnnotationModel.id} = "{SegmentationModel.__table__.name}".id
        AND {AnnotationModel.valid} = true
    """

    try:
        mat_db_connection = mat_engine.connect()
        with mat_db_connection.begin():
            insert_query = mat_db_connection.execute(
                f"CREATE TABLE {temp_table_name} AS ({query});"
            )
            row_count = insert_query.rowcount
            drop_query = mat_db_connection.execute(
                f'DROP TABLE {annotation_table_name}, "{segmentation_table_name}" CASCADE;'
            )
            alter_query = mat_db_connection.execute(
                f"ALTER TABLE {temp_table_name} RENAME TO {annotation_table_name};"
            )
        mat_session.close()
        mat_engine.dispose()
        return f"Number of rows copied: {row_count}"
    except Exception as e:
        celery_logger.error(e)
        raise e


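# Design note on merge_tables(): the create/drop/rename statements run inside a
# single mat_db_connection.begin() block, so the swap from the split
# annotation/segmentation tables to the flat table is atomic in PostgreSQL.
# Sketched as plain SQL (table names are placeholders, not the generated names):
#
#   CREATE TABLE temp_table AS (SELECT ... FROM annotation_table JOIN segmentation_table ...);
#   DROP TABLE annotation_table, "segmentation_table" CASCADE;
#   ALTER TABLE temp_table RENAME TO annotation_table;

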
def create_materialized_metadata(
    self,
    datastack_info: dict,
    analysis_version: int,
    materialization_time_stamp: datetime.datetime,
):
    """Create a metadata table in the materialized database.
    Reads row counts from annotation tables copied to the materialized database
    and inserts the row count and table info into the metadata table.

    Args:
        datastack_info (dict): datastack info from the infoservice
        analysis_version (int): materialized version number
        materialization_time_stamp (datetime.datetime): UTC timestamp of the root_id lookup

    Raises:
        database_error: sqlalchemy connection error

    Returns:
        bool: True if the metadata table was created and table info was inserted.
    """
    aligned_volume = datastack_info["aligned_volume"]["name"]
    datastack = datastack_info["datastack"]
    SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
    sql_base_uri = SQL_URI_CONFIG.rpartition("/")[0]
    sql_uri = make_url(f"{sql_base_uri}/{aligned_volume}")
    analysis_sql_uri = create_analysis_sql_uri(
        SQL_URI_CONFIG, datastack, analysis_version
    )

    session, engine = create_session(sql_uri)
    analysis_session, analysis_engine = create_session(analysis_sql_uri)

    try:
        mat_table = MaterializedMetadata()
        mat_table.__table__.create(bind=analysis_engine)  # pylint: disable=maybe-no-member
    except Exception as e:
        celery_logger.error(f"Materialized Metadata table creation failed {e}")

    mat_client = dynamic_annotation_cache.get_db(f"{datastack}__mat{analysis_version}")

    tables = session.query(AnnoMetadata).all()
    try:
        for table in tables:
            # only record tables marked as valid in the metadata table
            if table.valid:
                table_name = table.table_name
                schema_type = (
                    session.query(AnnoMetadata.schema_type)
                    .filter(AnnoMetadata.table_name == table_name)
                    .one()
                )
                valid_row_count = mat_client._get_table_row_count(
                    table_name, filter_valid=True
                )
                celery_logger.info(f"Row count {valid_row_count}")
                if valid_row_count == 0:
                    continue

                mat_metadata = MaterializedMetadata(
                    schema=schema_type[0],
                    table_name=table_name,
                    row_count=valid_row_count,
                    materialized_timestamp=materialization_time_stamp,
                )
                analysis_session.add(mat_metadata)
                analysis_session.commit()
    except Exception as database_error:
        analysis_session.rollback()
        session.rollback()
        celery_logger.error(database_error)
    finally:
        session.close()
        engine.dispose()
        mat_client.cached_session.close()
        analysis_session.close()
        analysis_engine.dispose()
    return True


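# Once populated, the per-table metadata written above can be read back from
# the materialized database. A minimal sketch of such a query, assuming a
# session bound to the analysis database; the helper name is illustrative and
# the columns follow the MaterializedMetadata model used above.
def _list_materialized_tables_sketch(analysis_session):
    # returns (table_name, schema, row_count) tuples for the materialized tables
    return analysis_session.query(
        MaterializedMetadata.table_name,
        MaterializedMetadata.schema,
        MaterializedMetadata.row_count,
    ).all()

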
def create_new_version(
    datastack_info: dict,
    materialization_time_stamp: datetime.datetime,
    days_to_expire: int = 5,
):
    """Create a new versioned database row in the analysis_version table
    and set the expiration date for the database.

    Args:
        datastack_info (dict): datastack info from the infoservice
        materialization_time_stamp (datetime.datetime): UTC timestamp of the root_id lookup
        days_to_expire (int, optional): Number of days until the db is flagged as expired. Defaults to 5.

    Returns:
        int: version number of the materialized database
    """
    aligned_volume = datastack_info["aligned_volume"]["name"]
    datastack = datastack_info.get("datastack")

    table_objects = [
        AnalysisVersion.__tablename__,
        AnalysisTable.__tablename__,
    ]
    SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
    sql_base_uri = SQL_URI_CONFIG.rpartition("/")[0]
    sql_uri = make_url(f"{sql_base_uri}/{aligned_volume}")

    session, engine = create_session(sql_uri)

    # create analysis metadata tables if they do not exist
    for table in table_objects:
        if not engine.dialect.has_table(engine, table):
            Base.metadata.tables[table].create(bind=engine)

    top_version = session.query(func.max(AnalysisVersion.version)).scalar()

    if top_version is None:
        new_version_number = 1
    else:
        new_version_number = top_version + 1

    if days_to_expire and days_to_expire > 0:
        expiration_date = materialization_time_stamp + datetime.timedelta(
            days=days_to_expire
        )
    else:
        expiration_date = None

    analysisversion = AnalysisVersion(
        datastack=datastack,
        time_stamp=materialization_time_stamp,
        version=new_version_number,
        valid=False,
        expires_on=expiration_date,
    )

    try:
        session.add(analysisversion)
        session.commit()
    except Exception as e:
        session.rollback()
        celery_logger.error(e)
    finally:
        session.close()
        engine.dispose()
    return new_version_number


def setup_method(self, database_uri):
    self.session, self.engine = create_session(database_uri)


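# A minimal sketch of how a test class might build on this setup, assuming
# create_session() is importable in the test module; the URI, method names, and
# assertion are illustrative only (the real tests may wire database_uri
# differently than the pytest setup_method hook shown here).
class TestSessionSetupSketch:
    database_uri = "postgresql://postgres:postgres@localhost:5432/test_db"  # placeholder

    def setup_method(self, method):
        # pytest calls this before each test; reuse the same session helper
        self.session, self.engine = create_session(self.database_uri)

    def teardown_method(self, method):
        self.session.close()
        self.engine.dispose()

    def test_session_created(self):
        assert self.session is not None
        assert self.engine is not None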