def upload_data(self, data: List, bulk_upload_info: dict):
    """Insert annotation and segmentation rows for a bulk upload chunk.

    Args:
        data (List): two-item list; data[0] holds annotation rows, data[1] holds segmentation rows
        bulk_upload_info (dict): bulk upload metadata

    Returns:
        bool: True if data was inserted
    """
    aligned_volume = bulk_upload_info["aligned_volume"]
    model_data = {
        "annotation_table_name": bulk_upload_info["annotation_table_name"],
        "schema": bulk_upload_info["schema"],
        "pcg_table_name": bulk_upload_info["pcg_table_name"],
    }

    AnnotationModel = create_annotation_model(model_data)
    SegmentationModel = create_segmentation_model(model_data)

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    try:
        with engine.begin() as connection:
            connection.execute(AnnotationModel.__table__.insert(), data[0])
            connection.execute(SegmentationModel.__table__.insert(), data[1])
    except Exception as e:
        celery_logger.error(f"ERROR: {e}")
        raise self.retry(exc=e, countdown=3)
    finally:
        session.close()
        engine.dispose()
    return True

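# Illustrative sketch (not part of the task itself): ``data`` is a two-item list of
# row dicts, one list per table. The column names below are assumed for a hypothetical
# point-annotation schema and will differ per schema; ``upload_data`` is assumed to be
# registered as a bound celery task, hence the ``.si(...)`` call style.
#
#   data = [
#       [{"id": 1, "pt_position": [121, 123, 201], "valid": True}],          # annotation rows
#       [{"id": 1, "pt_supervoxel_id": 93071, "pt_root_id": 864691135}],     # segmentation rows
#   ]
#   upload_data.si(data, bulk_upload_info).apply_async()
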
def generate_chunked_model_ids(
    mat_metadata: dict, use_segmentation_model=False
) -> List[List]:
    """Creates list of chunks with start:end index for chunking queries for materialization.

    Parameters
    ----------
    mat_metadata : dict
        Materialization metadata
    use_segmentation_model : bool, optional
        chunk over the segmentation model instead of the annotation model, by default False

    Returns
    -------
    List[List]
        list of lists containing start and end indices
    """
    celery_logger.info("Chunking supervoxel ids")
    if use_segmentation_model:
        Model = create_segmentation_model(mat_metadata)
    else:
        Model = create_annotation_model(mat_metadata)

    chunk_size = mat_metadata.get("chunk_size")
    if not chunk_size:
        ROW_CHUNK_SIZE = get_config_param("MATERIALIZATION_ROW_CHUNK_SIZE")
        chunk_size = ROW_CHUNK_SIZE

    chunked_ids = chunk_ids(mat_metadata, Model.id, chunk_size)
    return [chunk for chunk in chunked_ids]

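# Illustrative sketch: each chunk is assumed to be a [start, end] id pair, and the
# number of pairs depends on MATERIALIZATION_ROW_CHUNK_SIZE (the values below are
# made up). The pairs are passed as the ``chunk`` argument of the per-chunk tasks
# defined later in this module.
#
#   chunks = generate_chunked_model_ids(mat_metadata)
#   # e.g. [[1, 501], [501, 1001], [1001, 1501]]
#   first_chunk = chunks[0]
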
def insert_annotation_data(self, chunk: List[int], mat_metadata: dict):
    """Insert annotation data into database

    Args:
        chunk (List[int]): chunk of annotation ids
        mat_metadata (dict): materialized metadata

    Returns:
        bool: True if data was inserted
    """
    aligned_volume = mat_metadata["aligned_volume"]
    analysis_version = mat_metadata["analysis_version"]
    annotation_table_name = mat_metadata["annotation_table_name"]
    datastack = mat_metadata["datastack"]

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    # build table models
    AnnotationModel = create_annotation_model(mat_metadata, with_crud_columns=False)
    SegmentationModel = create_segmentation_model(mat_metadata)
    analysis_table = get_analysis_table(
        aligned_volume, datastack, annotation_table_name, analysis_version
    )

    query_columns = []
    for col in AnnotationModel.__table__.columns:
        query_columns.append(col)
    for col in SegmentationModel.__table__.columns:
        if not col.name == "id":
            query_columns.append(col)

    chunked_id_query = query_id_range(AnnotationModel.id, chunk[0], chunk[1])

    anno_ids = (
        session.query(AnnotationModel.id)
        .filter(chunked_id_query)
        .filter(AnnotationModel.valid == True)
    )
    query = (
        session.query(*query_columns)
        .join(SegmentationModel)
        .filter(SegmentationModel.id == AnnotationModel.id)
        .filter(SegmentationModel.id.in_(anno_ids))
    )
    data = query.all()
    mat_df = pd.DataFrame(data)
    mat_df = mat_df.to_dict(orient="records")

    SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
    analysis_sql_uri = create_analysis_sql_uri(
        SQL_URI_CONFIG, datastack, analysis_version
    )
    analysis_session, analysis_engine = create_session(analysis_sql_uri)

    try:
        analysis_engine.execute(analysis_table.insert(), mat_df)
    except Exception as e:
        celery_logger.error(e)
        analysis_session.rollback()
    finally:
        analysis_session.close()
        analysis_engine.dispose()
        session.close()
        engine.dispose()
    return True

def insert_segmentation_data(materialization_data: dict, mat_metadata: dict) -> dict:
    """Insert supervoxel and root id data into segmentation table.

    Args:
        materialization_data (dict): supervoxel and/or root id data
        mat_metadata (dict): materialization metadata

    Returns:
        dict: description of the number of rows inserted
    """
    if not materialization_data:
        return {"status": "empty"}

    SegmentationModel = create_segmentation_model(mat_metadata)
    aligned_volume = mat_metadata.get("aligned_volume")

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    try:
        with engine.begin() as connection:
            connection.execute(
                SegmentationModel.__table__.insert(), materialization_data
            )
    except SQLAlchemyError as e:
        session.rollback()
        celery_logger.error(e)
    finally:
        session.close()
    return {"Segmentation data inserted": len(materialization_data)}

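# Illustrative sketch: ``materialization_data`` is passed to ``connection.execute`` as an
# executemany payload, i.e. a list of per-row dicts keyed by the segmentation model's
# columns. The column names below are assumed for a hypothetical "pt" point field.
#
#   rows = [
#       {"id": 1, "pt_supervoxel_id": 93071554640, "pt_root_id": 864691135000},
#       {"id": 2, "pt_supervoxel_id": 93071554641, "pt_root_id": None},
#   ]
#   insert_segmentation_data(rows, mat_metadata)
#   # -> {"Segmentation data inserted": 2}
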
def get_new_roots(self, supervoxel_chunk: list, mat_metadata: dict):
    """Get new roots from supervoxel ids of expired roots

    Args:
        supervoxel_chunk (list): chunk of supervoxel and expired root id rows
        mat_metadata (dict): materialization metadata

    Returns:
        str: number of rows updated
    """
    pcg_table_name = mat_metadata.get("pcg_table_name")

    materialization_time_stamp = mat_metadata["materialization_time_stamp"]
    try:
        formatted_mat_ts = datetime.datetime.strptime(
            materialization_time_stamp, "%Y-%m-%dT%H:%M:%S.%f"
        )
    except ValueError:
        formatted_mat_ts = datetime.datetime.strptime(
            materialization_time_stamp, "%Y-%m-%d %H:%M:%S.%f"
        )

    root_ids_df = pd.DataFrame(supervoxel_chunk, dtype=object)

    supervoxel_col_name = list(
        root_ids_df.loc[:, root_ids_df.columns.str.endswith("supervoxel_id")]
    )
    root_id_col_name = list(
        root_ids_df.loc[:, root_ids_df.columns.str.endswith("root_id")]
    )
    supervoxel_df = root_ids_df.loc[:, supervoxel_col_name[0]]
    supervoxel_data = supervoxel_df.to_list()

    root_id_array = lookup_new_root_ids(
        pcg_table_name, supervoxel_data, formatted_mat_ts
    )

    del supervoxel_data

    root_ids_df.loc[supervoxel_df.index, root_id_col_name[0]] = root_id_array
    root_ids_df = root_ids_df.drop(columns=[supervoxel_col_name[0]])

    SegmentationModel = create_segmentation_model(mat_metadata)
    aligned_volume = mat_metadata.get("aligned_volume")
    session = sqlalchemy_cache.get(aligned_volume)

    data = root_ids_df.to_dict(orient="records")
    try:
        session.bulk_update_mappings(SegmentationModel, data)
        session.commit()
    except Exception as e:
        session.rollback()
        celery_logger.error(f"ERROR: {e}")
        raise self.retry(exc=e, countdown=3)
    finally:
        session.close()
    return f"Number of rows updated: {len(data)}"

def get_sql_supervoxel_ids(chunks: List[int], mat_metadata: dict) -> dict:
    """Iterates over columns with 'supervoxel_id' present in the name and
    returns supervoxel ids between start and stop ids.

    Parameters
    ----------
    chunks : List[int]
        start and end ids to query between, or a single id
    mat_metadata : dict
        Materialization metadata

    Returns
    -------
    dict
        mapping of column names ('id' and '*_supervoxel_id') to lists of values
    """
    SegmentationModel = create_segmentation_model(mat_metadata)
    aligned_volume = mat_metadata.get("aligned_volume")
    session = sqlalchemy_cache.get(aligned_volume)

    columns = [
        model_column.name for model_column in SegmentationModel.__table__.columns
    ]
    supervoxel_id_columns = [
        model_column for model_column in columns if "supervoxel_id" in model_column
    ]
    mapped_columns = [
        getattr(SegmentationModel, supervoxel_id_column)
        for supervoxel_id_column in supervoxel_id_columns
    ]
    try:
        filter_query = session.query(SegmentationModel.id, *mapped_columns)
        if len(chunks) > 1:
            query = filter_query.filter(
                SegmentationModel.id.between(int(chunks[0]), int(chunks[1]))
            )
        elif len(chunks) == 1:
            query = filter_query.filter(SegmentationModel.id == chunks[0])

        data = query.all()
        df = pd.DataFrame(data)
        return df.to_dict(orient="list")
    except Exception as e:
        celery_logger.error(e)
        session.rollback()
    finally:
        session.close()

def get_supervoxel_ids(root_id_chunk: list, mat_metadata: dict):
    """Get supervoxel ids associated with expired root ids

    Args:
        root_id_chunk (list): chunk of expired root ids
        mat_metadata (dict): materialization metadata

    Returns:
        dict: supervoxels of a group of expired root ids
        None: no supervoxel ids exist for the expired root ids
    """
    aligned_volume = mat_metadata.get("aligned_volume")
    SegmentationModel = create_segmentation_model(mat_metadata)

    session = sqlalchemy_cache.get(aligned_volume)
    columns = [column.name for column in SegmentationModel.__table__.columns]
    root_id_columns = [column for column in columns if "root_id" in column]
    expired_root_id_data = {}
    try:
        for root_id_column in root_id_columns:
            prefix = root_id_column.rsplit("_", 2)[0]
            supervoxel_name = f"{prefix}_supervoxel_id"

            supervoxels = [
                data
                for data in session.query(
                    SegmentationModel.id,
                    getattr(SegmentationModel, root_id_column),
                    getattr(SegmentationModel, supervoxel_name),
                ).filter(
                    getattr(SegmentationModel, root_id_column).in_(root_id_chunk)
                )
            ]
            if supervoxels:
                expired_root_id_data[root_id_column] = pd.DataFrame(
                    supervoxels
                ).to_dict(orient="records")
    except Exception as e:
        raise e
    finally:
        session.close()

    if expired_root_id_data:
        return expired_root_id_data
    else:
        return None

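# Illustrative sketch of the expired-root update path (root id and column values are
# made up): look up the supervoxels that belong to a chunk of expired roots, then
# resolve and write the new roots per root_id column. ``get_new_roots`` above appears
# to be a bound celery task, so the real workflow would dispatch it rather than call
# it directly; a direct call is shown only for readability.
#
#   expired = get_supervoxel_ids([864691135123, 864691135456], mat_metadata)
#   # e.g. {"pt_root_id": [{"id": 1, "pt_root_id": 864691135123, "pt_supervoxel_id": 93071}, ...]}
#   if expired:
#       for root_column, rows in expired.items():
#           get_new_roots(rows, mat_metadata)
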
def create_missing_segmentation_table(self, mat_metadata: dict) -> dict:
    """Create the missing segmentation table associated with an annotation table
    if it does not already exist.

    Parameters
    ----------
    mat_metadata : dict
        Materialization metadata

    Returns
    -------
    dict
        Materialization metadata
    """
    segmentation_table_name = mat_metadata.get("segmentation_table_name")
    aligned_volume = mat_metadata.get("aligned_volume")

    SegmentationModel = create_segmentation_model(mat_metadata)

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    if (
        not session.query(SegmentationMetadata)
        .filter(SegmentationMetadata.table_name == segmentation_table_name)
        .scalar()
    ):
        SegmentationModel.__table__.create(bind=engine, checkfirst=True)
        creation_time = datetime.datetime.utcnow()

        metadata_dict = {
            "annotation_table": mat_metadata.get("annotation_table_name"),
            "schema_type": mat_metadata.get("schema"),
            "table_name": segmentation_table_name,
            "valid": True,
            "created": creation_time,
            "pcg_table_name": mat_metadata.get("pcg_table_name"),
        }

        seg_metadata = SegmentationMetadata(**metadata_dict)
        try:
            session.add(seg_metadata)
            session.commit()
        except Exception as e:
            celery_logger.error(f"SQL ERROR: {e}")
            session.rollback()
        finally:
            session.close()
    return mat_metadata

def update_segmentation_data(materialization_data: dict, mat_metadata: dict) -> dict:
    """Update rows in the segmentation table with new supervoxel and root id data.

    Args:
        materialization_data (dict): per-row supervoxel and/or root id data keyed by id
        mat_metadata (dict): materialization metadata

    Returns:
        str: number of rows updated
    """
    if not materialization_data:
        return {"status": "empty"}

    SegmentationModel = create_segmentation_model(mat_metadata)
    aligned_volume = mat_metadata.get("aligned_volume")

    session = sqlalchemy_cache.get(aligned_volume)

    try:
        session.bulk_update_mappings(SegmentationModel, materialization_data)
        session.commit()
    except Exception as e:
        session.rollback()
        celery_logger.error(f"ERROR: {e}")
        raise e
    finally:
        session.close()
    return f"Number of rows updated: {len(materialization_data)}"

def get_ids_with_missing_roots(mat_metadata: dict) -> List:
    """Get a chunk generator of the primary key ids for rows that contain
    at least one missing root id. Finds the min and max primary key id values
    globally across the table where a missing root id is present in a column.

    Args:
        mat_metadata (dict): materialization metadata

    Returns:
        List: generator of chunked primary key ids.
    """
    SegmentationModel = create_segmentation_model(mat_metadata)
    aligned_volume = mat_metadata.get("aligned_volume")
    session = sqlalchemy_cache.get(aligned_volume)

    columns = [seg_column.name for seg_column in SegmentationModel.__table__.columns]
    root_id_columns = [
        root_column for root_column in columns if "root_id" in root_column
    ]
    query_columns = [
        getattr(SegmentationModel, root_id_column).is_(None)
        for root_id_column in root_id_columns
    ]
    max_id = (
        session.query(func.max(SegmentationModel.id))
        .filter(or_(*query_columns))
        .scalar()
    )
    min_id = (
        session.query(func.min(SegmentationModel.id))
        .filter(or_(*query_columns))
        .scalar()
    )
    if min_id and max_id:
        if min_id < max_id:
            id_range = range(min_id, max_id + 1)
            return create_chunks(id_range, 500)
        elif min_id == max_id:
            return [min_id]
    else:
        celery_logger.info(
            f"No missing root_ids found in '{SegmentationModel.__table__.name}'"
        )
        return None

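# Illustrative sketch (chunk boundaries are made up, and each chunk is assumed to be a
# [start, end] id pair as with generate_chunked_model_ids): the ranges returned above
# can be fed to the per-chunk supervoxel lookup defined earlier in this module.
#
#   chunks = get_ids_with_missing_roots(mat_metadata)
#   if chunks is not None:
#       for chunk in chunks:  # e.g. [1, 500], [501, 1000], ...
#           supervoxel_data = get_sql_supervoxel_ids(chunk, mat_metadata)
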
def create_tables(self, bulk_upload_params: dict):
    """Create the annotation and segmentation tables (and their metadata rows)
    for a bulk upload, then drop their indices to speed up bulk inserts.

    Args:
        bulk_upload_params (dict): bulk upload metadata

    Returns:
        str: names of the created tables
    """
    table_name = bulk_upload_params["annotation_table_name"]
    aligned_volume = bulk_upload_params["aligned_volume"]
    pcg_table_name = bulk_upload_params["pcg_table_name"]
    last_updated = bulk_upload_params["last_updated"]
    seg_table_name = bulk_upload_params["seg_table_name"]
    upload_creation_time = bulk_upload_params["upload_creation_time"]

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    AnnotationModel = create_annotation_model(bulk_upload_params)
    SegmentationModel = create_segmentation_model(bulk_upload_params)

    if (
        not session.query(AnnoMetadata)
        .filter(AnnoMetadata.table_name == table_name)
        .scalar()
    ):
        AnnotationModel.__table__.create(bind=engine, checkfirst=True)
        anno_metadata_dict = {
            "table_name": table_name,
            "schema_type": bulk_upload_params.get("schema"),
            "valid": True,
            "created": upload_creation_time,
            "user_id": bulk_upload_params.get("user_id", "*****@*****.**"),
            "description": bulk_upload_params["description"],
            "reference_table": bulk_upload_params.get("reference_table"),
            "flat_segmentation_source": bulk_upload_params.get(
                "flat_segmentation_source"
            ),
        }
        anno_metadata = AnnoMetadata(**anno_metadata_dict)
        session.add(anno_metadata)

    if (
        not session.query(SegmentationMetadata)
        .filter(SegmentationMetadata.table_name == seg_table_name)
        .scalar()
    ):
        SegmentationModel.__table__.create(bind=engine, checkfirst=True)
        seg_metadata_dict = {
            "annotation_table": table_name,
            "schema_type": bulk_upload_params.get("schema"),
            "table_name": seg_table_name,
            "valid": True,
            "created": upload_creation_time,
            "pcg_table_name": pcg_table_name,
            "last_updated": last_updated,
        }
        seg_metadata = SegmentationMetadata(**seg_metadata_dict)
        session.add(seg_metadata)

    try:
        session.flush()
        session.commit()
    except Exception as e:
        celery_logger.error(f"SQL ERROR: {e}")
        session.rollback()
        raise e
    finally:
        drop_seg_indexes = index_cache.drop_table_indices(
            SegmentationModel.__table__.name, engine
        )
        # wait for indexes to drop
        time.sleep(10)
        drop_anno_indexes = index_cache.drop_table_indices(
            AnnotationModel.__table__.name, engine
        )
        celery_logger.info(
            f"Table {AnnotationModel.__table__.name} indices have been dropped {drop_anno_indexes}."
        )
        celery_logger.info(
            f"Table {SegmentationModel.__table__.name} indices have been dropped {drop_seg_indexes}."
        )
        session.close()

    return f"Tables {table_name}, {seg_table_name} created."

def merge_tables(self, mat_metadata: dict):
    """Merge all annotation and segmentation rows that are flagged as valid into
    a new table. Drop the original split tables after inserting all the rows
    into the new table.

    Args:
        mat_metadata (dict): datastack info for the aligned_volume from the infoservice

    Raises:
        e: error during table merging operation

    Returns:
        str: number of rows copied
    """
    analysis_version = mat_metadata["analysis_version"]
    annotation_table_name = mat_metadata["annotation_table_name"]
    segmentation_table_name = mat_metadata["segmentation_table_name"]
    temp_table_name = mat_metadata["temp_mat_table_name"]
    schema = mat_metadata["schema"]
    datastack = mat_metadata["datastack"]

    # create dynamic sql_uri
    SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
    analysis_sql_uri = create_analysis_sql_uri(
        SQL_URI_CONFIG, datastack, analysis_version
    )

    # get schema and match column order for sql query
    anno_schema = get_schema(schema)
    flat_schema = create_flattened_schema(anno_schema)

    ordered_model_columns = create_table_dict(
        table_name=annotation_table_name,
        Schema=flat_schema,
        segmentation_source=None,
        table_metadata=None,
        with_crud_columns=False,
    )

    AnnotationModel = create_annotation_model(mat_metadata, with_crud_columns=False)
    SegmentationModel = create_segmentation_model(mat_metadata)

    query_columns = {}
    crud_columns = ["created", "deleted", "superceded_id"]
    for col in AnnotationModel.__table__.columns:
        if col.name not in crud_columns:
            query_columns[col.name] = col
    for col in SegmentationModel.__table__.columns:
        if not col.name == "id":
            query_columns[col.name] = col

    sorted_columns = OrderedDict(
        [
            (key, query_columns[key])
            for key in ordered_model_columns
            if key in query_columns.keys()
        ]
    )
    sorted_columns_list = list(sorted_columns.values())
    columns = [f'"{col.table}".{col.name}' for col in sorted_columns_list]

    mat_session, mat_engine = create_session(analysis_sql_uri)

    query = f"""
        SELECT {', '.join(columns)}
        FROM {AnnotationModel.__table__.name}
        JOIN "{SegmentationModel.__table__.name}"
            ON {AnnotationModel.id} = "{SegmentationModel.__table__.name}".id
        WHERE {AnnotationModel.id} = "{SegmentationModel.__table__.name}".id
        AND {AnnotationModel.valid} = true
    """

    try:
        mat_db_connection = mat_engine.connect()
        with mat_db_connection.begin():
            insert_query = mat_db_connection.execute(
                f"CREATE TABLE {temp_table_name} AS ({query});"
            )
            row_count = insert_query.rowcount
            drop_query = mat_db_connection.execute(
                f'DROP TABLE {annotation_table_name}, "{segmentation_table_name}" CASCADE;'
            )
            alter_query = mat_db_connection.execute(
                f"ALTER TABLE {temp_table_name} RENAME TO {annotation_table_name};"
            )
        mat_session.close()
        mat_engine.dispose()
        return f"Number of rows copied: {row_count}"
    except Exception as e:
        celery_logger.error(e)
        raise e

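# For illustration, with hypothetical table names ``synapses`` (annotation),
# ``synapses__seg`` (segmentation) and ``temp_synapses`` (temp table), the statements
# executed above take roughly this form; the selected column list depends on the schema.
#
#   CREATE TABLE temp_synapses AS (
#       SELECT "synapses".id, "synapses".pre_pt_position, ..., "synapses__seg".pre_pt_root_id
#       FROM synapses
#       JOIN "synapses__seg" ON synapses.id = "synapses__seg".id
#       WHERE synapses.id = "synapses__seg".id
#       AND synapses.valid = true
#   );
#   DROP TABLE synapses, "synapses__seg" CASCADE;
#   ALTER TABLE temp_synapses RENAME TO synapses;
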
def get_new_root_ids(materialization_data: dict, mat_metadata: dict) -> dict:
    """Look up current root ids for the supplied supervoxel rows, filling in any
    root ids that are missing.

    Args:
        materialization_data (dict): supervoxel data for root_id lookup
        mat_metadata (dict): Materialization metadata

    Returns:
        dict: root_ids to be inserted into db
    """
    pcg_table_name = mat_metadata.get("pcg_table_name")
    aligned_volume = mat_metadata.get("aligned_volume")
    try:
        materialization_time_stamp = datetime.datetime.strptime(
            mat_metadata.get("materialization_time_stamp"), "%Y-%m-%d %H:%M:%S.%f"
        )
    except ValueError:
        materialization_time_stamp = datetime.datetime.strptime(
            mat_metadata.get("materialization_time_stamp"), "%Y-%m-%dT%H:%M:%S.%f"
        )

    supervoxel_df = pd.DataFrame(materialization_data, dtype=object)
    drop_col_names = list(
        supervoxel_df.loc[:, supervoxel_df.columns.str.endswith("position")]
    )
    supervoxel_df = supervoxel_df.drop(columns=drop_col_names)

    AnnotationModel = create_annotation_model(mat_metadata)
    SegmentationModel = create_segmentation_model(mat_metadata)

    __, seg_model_cols, __ = get_query_columns_by_suffix(
        AnnotationModel, SegmentationModel, "root_id"
    )
    anno_ids = supervoxel_df["id"].to_list()

    # get current root ids from database
    session = sqlalchemy_cache.get(aligned_volume)

    try:
        current_root_ids = [
            data
            for data in session.query(*seg_model_cols).filter(
                SegmentationModel.id.in_(anno_ids)
            )
        ]
    except SQLAlchemyError as e:
        session.rollback()
        current_root_ids = []
        celery_logger.error(e)
    finally:
        session.close()

    supervoxel_col_names = list(
        supervoxel_df.loc[:, supervoxel_df.columns.str.endswith("supervoxel_id")]
    )

    if current_root_ids:
        # merge root_id df with supervoxel df
        df = pd.DataFrame(current_root_ids, dtype=object)
        root_ids_df = pd.merge(supervoxel_df, df)
    else:
        # create empty dataframe with root_id columns
        root_id_columns = [
            col_name.replace("supervoxel_id", "root_id")
            for col_name in supervoxel_col_names
            if "supervoxel_id" in col_name
        ]
        df = pd.DataFrame(columns=root_id_columns, dtype=object).fillna(value=np.nan)
        root_ids_df = pd.concat((supervoxel_df, df), axis=1)

    cols = [x for x in root_ids_df.columns if "root_id" in x]

    cg = chunkedgraph_cache.init_pcg(pcg_table_name)

    # filter missing root_ids and lookup root_ids if missing
    mask = np.logical_and.reduce([root_ids_df[col].isna() for col in cols])
    missing_root_rows = root_ids_df.loc[mask]
    if not missing_root_rows.empty:
        supervoxel_data = missing_root_rows.loc[:, supervoxel_col_names]
        for col_name in supervoxel_data:
            if "supervoxel_id" in col_name:
                root_id_name = col_name.replace("supervoxel_id", "root_id")
                data = missing_root_rows.loc[:, col_name]
                root_id_array = get_root_ids(cg, data, materialization_time_stamp)
                root_ids_df.loc[data.index, root_id_name] = root_id_array

    return root_ids_df.to_dict(orient="records")

def get_annotations_with_missing_supervoxel_ids(
    mat_metadata: dict, chunk: List[int]
) -> dict:
    """Get list of valid annotations and their ids to lookup existing supervoxel ids.
    If there are missing supervoxels they will be set as None for cloudvolume lookup.

    Parameters
    ----------
    mat_metadata : dict
        Materialization metadata
    chunk : list
        chunked id range for the sql query

    Returns
    -------
    dict
        dict of annotation and segmentation data
    """
    aligned_volume = mat_metadata.get("aligned_volume")
    SegmentationModel = create_segmentation_model(mat_metadata)
    AnnotationModel = create_annotation_model(mat_metadata)

    session = sqlalchemy_cache.get(aligned_volume)
    anno_model_cols, __, supervoxel_columns = get_query_columns_by_suffix(
        AnnotationModel, SegmentationModel, "supervoxel_id"
    )

    query = session.query(*anno_model_cols)
    chunked_id_query = query_id_range(AnnotationModel.id, chunk[0], chunk[1])

    annotation_data = [
        data
        for data in query.filter(chunked_id_query)
        .order_by(AnnotationModel.id)
        .filter(AnnotationModel.valid == True)
        .join(SegmentationModel, isouter=True)
        .filter(SegmentationModel.id == None)
    ]

    annotation_dataframe = pd.DataFrame(annotation_data, dtype=object)
    if not annotation_dataframe.empty:
        wkb_data = annotation_dataframe.loc[
            :, annotation_dataframe.columns.str.endswith("position")
        ]

        annotation_dict = {}
        for column, wkb_points in wkb_data.items():
            annotation_dict[column] = [
                get_geom_from_wkb(wkb_point) for wkb_point in wkb_points
            ]
        for key, value in annotation_dict.items():
            annotation_dataframe.loc[:, key] = value

        segmentation_dataframe = pd.DataFrame(columns=supervoxel_columns, dtype=object)
        segmentation_dataframe = segmentation_dataframe.fillna(value=np.nan)
        mat_df = pd.concat((segmentation_dataframe, annotation_dataframe), axis=1)
        materialization_data = mat_df.to_dict(orient="list")
    else:
        materialization_data = None

    session.close()

    return materialization_data

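# Illustrative sketch of the per-chunk ingest path (direct calls shown for readability;
# in the real workflow these run as celery tasks): for each chunk of new annotations,
# find rows without segmentation data, resolve root ids, and write the rows into the
# segmentation table.
#
#   for chunk in generate_chunked_model_ids(mat_metadata):
#       missing = get_annotations_with_missing_supervoxel_ids(mat_metadata, chunk)
#       if missing:
#           # a cloudvolume supervoxel lookup (not part of this excerpt) is assumed to
#           # fill in the missing supervoxel ids here before root ids are resolved
#           rows_with_roots = get_new_root_ids(missing, mat_metadata)
#           insert_segmentation_data(rows_with_roots, mat_metadata)
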