def upload_data(self, data: List, bulk_upload_info: dict):

    aligned_volume = bulk_upload_info["aligned_volume"]

    model_data = {
        "annotation_table_name": bulk_upload_info["annotation_table_name"],
        "schema": bulk_upload_info["schema"],
        "pcg_table_name": bulk_upload_info["pcg_table_name"],
    }

    AnnotationModel = create_annotation_model(model_data)
    SegmentationModel = create_segmentation_model(model_data)

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    try:
        with engine.begin() as connection:
            connection.execute(AnnotationModel.__table__.insert(), data[0])
            connection.execute(SegmentationModel.__table__.insert(), data[1])
    except Exception as e:
        celery_logger.error(f"ERROR: {e}")
        raise self.retry(exc=e, countdown=3)
    finally:
        session.close()
        engine.dispose()
    return True
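
# A minimal usage sketch (all names and values below are hypothetical, not
# part of the original API): `data` is expected to be a two-element list of
# [annotation_rows, segmentation_rows], each a list of per-row dicts keyed by
# column name, while `bulk_upload_info` carries the table/schema metadata.
def _example_upload_payload():
    bulk_upload_info = {
        "aligned_volume": "example_aligned_volume",
        "annotation_table_name": "example_synapse_table",
        "schema": "synapse",
        "pcg_table_name": "example_pcg_table",
    }
    annotation_rows = [{"id": 1, "valid": True}]
    segmentation_rows = [{"id": 1, "pre_pt_supervoxel_id": 101}]
    return [annotation_rows, segmentation_rows], bulk_upload_info
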
def generate_chunked_model_ids(
    mat_metadata: dict, use_segmentation_model: bool = False
) -> List[List]:
    """Creates a list of chunks with start:end indices for chunking queries
    for materialization.

    Parameters
    ----------
    mat_metadata : dict
        Materialization metadata
    use_segmentation_model : bool, optional
        If True, chunk ids from the segmentation model instead of the
        annotation model. Defaults to False.

    Returns
    -------
    List[List]
        list of lists containing start and end indices
    """
    celery_logger.info("Chunking supervoxel ids")
    if use_segmentation_model:
        Model = create_segmentation_model(mat_metadata)
    else:
        Model = create_annotation_model(mat_metadata)

    chunk_size = mat_metadata.get("chunk_size")
    if not chunk_size:
        chunk_size = get_config_param("MATERIALIZATION_ROW_CHUNK_SIZE")

    chunked_ids = chunk_ids(mat_metadata, Model.id, chunk_size)

    return list(chunked_ids)
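
# Hedged sketch of the id-range chunking this task relies on: the real
# `chunk_ids` helper is imported from elsewhere, so this illustrative version
# (an assumption, not the library code) only shows the intended shape of the
# output, fixed-size [start, end) primary key id windows.
def _example_chunk_id_ranges(min_id: int, max_id: int, chunk_size: int):
    """Yield [start, end) id pairs covering min_id..max_id."""
    for start in range(min_id, max_id + 1, chunk_size):
        yield [start, min(start + chunk_size, max_id + 1)]
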
def insert_annotation_data(self, chunk: List[int], mat_metadata: dict):
    """Insert annotation data into database

    Args:
        chunk (List[int]): chunk of annotation ids
        mat_metadata (dict): materialized metadata
    Returns:
        bool: True if data was inserted
    """
    aligned_volume = mat_metadata["aligned_volume"]
    analysis_version = mat_metadata["analysis_version"]
    annotation_table_name = mat_metadata["annotation_table_name"]
    datastack = mat_metadata["datastack"]

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    # build table models
    AnnotationModel = create_annotation_model(mat_metadata,
                                              with_crud_columns=False)
    SegmentationModel = create_segmentation_model(mat_metadata)
    analysis_table = get_analysis_table(aligned_volume, datastack,
                                        annotation_table_name,
                                        analysis_version)

    query_columns = []
    for col in AnnotationModel.__table__.columns:
        query_columns.append(col)
    for col in SegmentationModel.__table__.columns:
        if not col.name == "id":
            query_columns.append(col)

    chunked_id_query = query_id_range(AnnotationModel.id, chunk[0], chunk[1])

    anno_ids = (session.query(
        AnnotationModel.id).filter(chunked_id_query).filter(
            AnnotationModel.valid == True))
    query = (session.query(*query_columns).join(SegmentationModel).filter(
        SegmentationModel.id == AnnotationModel.id).filter(
            SegmentationModel.id.in_(anno_ids)))
    data = query.all()
    mat_records = pd.DataFrame(data).to_dict(orient="records")
    SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
    analysis_sql_uri = create_analysis_sql_uri(SQL_URI_CONFIG, datastack,
                                               analysis_version)
    analysis_session, analysis_engine = create_session(analysis_sql_uri)

    try:
        analysis_engine.execute(analysis_table.insert(), mat_records)
    except Exception as e:
        celery_logger.error(e)
        analysis_session.rollback()
    finally:
        analysis_session.close()
        analysis_engine.dispose()
        session.close()
        engine.dispose()
    return True
def insert_segmentation_data(materialization_data: dict,
                             mat_metadata: dict) -> dict:
    """Insert supervoxel and root id data into segmentation table.

    Args:
        materialization_data (dict): supervoxel and/or root id data
        mat_metadata (dict): materialization metadata

    Returns:
        dict: returns description of number of rows inserted
    """
    if not materialization_data:
        return {"status": "empty"}

    SegmentationModel = create_segmentation_model(mat_metadata)
    aligned_volume = mat_metadata.get("aligned_volume")

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    try:
        with engine.begin() as connection:
            connection.execute(SegmentationModel.__table__.insert(),
                               materialization_data)
    except SQLAlchemyError as e:
        session.rollback()
        celery_logger.error(e)
    finally:
        session.close()
    return {"Segmentation data inserted": len(materialization_data)}
def get_new_roots(self, supervoxel_chunk: list, mat_metadata: dict):
    """Get new roots from supervoxels ids of expired roots

    Args:
        supervoxel_chunk (list): rows of supervoxel id data for expired root ids
        mat_metadata (dict): materialization metadata

    Returns:
        dict: dicts of new root_ids
    """
    pcg_table_name = mat_metadata.get("pcg_table_name")

    materialization_time_stamp = mat_metadata["materialization_time_stamp"]
    try:
        formatted_mat_ts = datetime.datetime.strptime(
            materialization_time_stamp, "%Y-%m-%dT%H:%M:%S.%f")
    except ValueError:
        formatted_mat_ts = datetime.datetime.strptime(
            materialization_time_stamp, "%Y-%m-%d %H:%M:%S.%f")
    root_ids_df = pd.DataFrame(supervoxel_chunk, dtype=object)

    supervoxel_col_name = list(
        root_ids_df.loc[:,
                        root_ids_df.columns.str.endswith("supervoxel_id")])
    root_id_col_name = list(
        root_ids_df.loc[:, root_ids_df.columns.str.endswith("root_id")])
    supervoxel_df = root_ids_df.loc[:, supervoxel_col_name[0]]
    supervoxel_data = supervoxel_df.to_list()

    root_id_array = lookup_new_root_ids(pcg_table_name, supervoxel_data,
                                        formatted_mat_ts)

    del supervoxel_data

    root_ids_df.loc[supervoxel_df.index, root_id_col_name[0]] = root_id_array
    root_ids_df = root_ids_df.drop(columns=[supervoxel_col_name[0]])

    SegmentationModel = create_segmentation_model(mat_metadata)
    aligned_volume = mat_metadata.get("aligned_volume")
    session = sqlalchemy_cache.get(aligned_volume)
    data = root_ids_df.to_dict(orient="records")
    try:
        session.bulk_update_mappings(SegmentationModel, data)
        session.commit()
    except Exception as e:
        session.rollback()
        celery_logger.error(f"ERROR: {e}")
        raise self.retry(exc=e, countdown=3)
    finally:
        session.close()
    return f"Number of rows updated: {len(data)}"
def get_sql_supervoxel_ids(chunks: List[int], mat_metadata: dict) -> dict:
    """Iterates over columns with 'supervoxel_id' present in the name and
    returns supervoxel ids between start and stop ids.

    Parameters
    ----------
    chunks : List[int]
        start and end ids defining the chunk to query (a single-element
        list queries one id)
    mat_metadata : dict
        Materialization metadata

    Returns
    -------
    dict
        mapping of column name ('id' and each supervoxel_id column) to the
        list of values for rows between the start and end ids
    """
    SegmentationModel = create_segmentation_model(mat_metadata)
    aligned_volume = mat_metadata.get("aligned_volume")
    session = sqlalchemy_cache.get(aligned_volume)

    columns = [
        model_column.name
        for model_column in SegmentationModel.__table__.columns
    ]
    supervoxel_id_columns = [
        model_column for model_column in columns
        if "supervoxel_id" in model_column
    ]
    mapped_columns = [
        getattr(SegmentationModel, supervoxel_id_column)
        for supervoxel_id_column in supervoxel_id_columns
    ]
    try:
        filter_query = session.query(SegmentationModel.id, *mapped_columns)
        if len(chunks) > 1:
            query = filter_query.filter(
                SegmentationModel.id.between(int(chunks[0]), int(chunks[1])))
        elif len(chunks) == 1:
            query = filter_query.filter(SegmentationModel.id == chunks[0])

        data = query.all()
        df = pd.DataFrame(data)
        return df.to_dict(orient="list")
    except Exception as e:
        celery_logger.error(e)
        session.rollback()
    finally:
        session.close()
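
# Hedged usage sketch (table and column names are hypothetical): `chunks` is
# either a [start_id, end_id] pair or a single-element list, and the return
# value is a column-name -> list mapping, e.g.
# {"id": [1, 2], "pre_pt_supervoxel_id": [101, 102]}.
def _example_supervoxel_id_lookup(mat_metadata: dict) -> dict:
    return get_sql_supervoxel_ids([1, 500], mat_metadata)
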
def get_supervoxel_ids(root_id_chunk: list, mat_metadata: dict):
    """Get supervoxel ids associated with expired root ids

    Args:
        root_id_chunk (list): chunk of expired root ids
        mat_metadata (dict): materialization metadata

    Returns:
        dict: supervoxels of a group of expired root ids
        None: no supervoxel ids exist for the expired root id
    """
    aligned_volume = mat_metadata.get("aligned_volume")
    SegmentationModel = create_segmentation_model(mat_metadata)

    session = sqlalchemy_cache.get(aligned_volume)
    columns = [column.name for column in SegmentationModel.__table__.columns]
    root_id_columns = [column for column in columns if "root_id" in column]
    expired_root_id_data = {}
    try:
        for root_id_column in root_id_columns:
            prefix = root_id_column.rsplit("_", 2)[0]
            supervoxel_name = f"{prefix}_supervoxel_id"

            supervoxels = [
                data for data in session.query(
                    SegmentationModel.id,
                    getattr(SegmentationModel, root_id_column),
                    getattr(SegmentationModel, supervoxel_name),
                ).filter(
                    getattr(SegmentationModel, root_id_column).in_(
                        root_id_chunk))
            ]
            if supervoxels:
                expired_root_id_data[root_id_column] = pd.DataFrame(
                    supervoxels).to_dict(orient="records")

    except Exception as e:
        raise e
    finally:
        session.close()
    if expired_root_id_data:
        return expired_root_id_data
    else:
        return None
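
# Hedged sketch of the column-name pairing used above: a root id column such
# as "pre_pt_root_id" is mapped to its supervoxel column by stripping the
# trailing two "_"-separated parts and appending "_supervoxel_id".
def _example_supervoxel_column_for(root_id_column: str) -> str:
    prefix = root_id_column.rsplit("_", 2)[0]
    # e.g. "pre_pt_root_id" -> "pre_pt_supervoxel_id"
    return f"{prefix}_supervoxel_id"
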
def create_missing_segmentation_table(self, mat_metadata: dict) -> dict:
    """Create the segmentation table associated with an annotation table if it
    does not already exist.

    Parameters
    ----------
    mat_metadata : dict
        Materialization metadata

    Returns:
        dict: Materialization metadata
    """
    segmentation_table_name = mat_metadata.get("segmentation_table_name")
    aligned_volume = mat_metadata.get("aligned_volume")

    SegmentationModel = create_segmentation_model(mat_metadata)

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    if (not session.query(SegmentationMetadata).filter(
            SegmentationMetadata.table_name ==
            segmentation_table_name).scalar()):
        SegmentationModel.__table__.create(bind=engine, checkfirst=True)
        creation_time = datetime.datetime.utcnow()
        metadata_dict = {
            "annotation_table": mat_metadata.get("annotation_table_name"),
            "schema_type": mat_metadata.get("schema"),
            "table_name": segmentation_table_name,
            "valid": True,
            "created": creation_time,
            "pcg_table_name": mat_metadata.get("pcg_table_name"),
        }

        seg_metadata = SegmentationMetadata(**metadata_dict)
        try:
            session.add(seg_metadata)
            session.commit()
        except Exception as e:
            celery_logger.error(f"SQL ERROR: {e}")
            session.rollback()
        finally:
            session.close()
    else:
        session.close()
    return mat_metadata
def update_segmentation_data(materialization_data: dict,
                             mat_metadata: dict) -> dict:
    """Bulk update rows in the segmentation table with new root id data.

    Args:
        materialization_data (dict): rows of segmentation data to update,
            keyed by column name and including the primary key 'id'
        mat_metadata (dict): materialization metadata

    Returns:
        str: description of the number of rows updated
    """
    if not materialization_data:
        return {"status": "empty"}

    SegmentationModel = create_segmentation_model(mat_metadata)
    aligned_volume = mat_metadata.get("aligned_volume")

    session = sqlalchemy_cache.get(aligned_volume)

    try:
        session.bulk_update_mappings(SegmentationModel, materialization_data)
        session.commit()
    except Exception as e:
        session.rollback()
        celery_logger.error(f"ERROR: {e}")
        raise e
    finally:
        session.close()
    return f"Number of rows updated: {len(materialization_data)}"
def get_ids_with_missing_roots(mat_metadata: dict) -> List:
    """Get a chunk generator of the primary key ids for rows that contain
    at least one missing root id. Finds the min and max primary key id values
    globally across the table where a missing root id is present in a column.

    Args:
        mat_metadata (dict): materialization metadata

    Returns:
        List: generator of chunked primary key ids.
    """
    SegmentationModel = create_segmentation_model(mat_metadata)
    aligned_volume = mat_metadata.get("aligned_volume")
    session = sqlalchemy_cache.get(aligned_volume)

    columns = [
        seg_column.name for seg_column in SegmentationModel.__table__.columns
    ]
    root_id_columns = [
        root_column for root_column in columns if "root_id" in root_column
    ]
    query_columns = [
        getattr(SegmentationModel, root_id_column).is_(None)
        for root_id_column in root_id_columns
    ]
    max_id = (session.query(func.max(SegmentationModel.id)).filter(
        or_(*query_columns)).scalar())
    min_id = (session.query(func.min(SegmentationModel.id)).filter(
        or_(*query_columns)).scalar())
    if min_id and max_id:
        if min_id < max_id:
            id_range = range(min_id, max_id + 1)
            return create_chunks(id_range, 500)
        elif min_id == max_id:
            return [min_id]
    else:
        celery_logger.info(
            f"No missing root_ids found in '{SegmentationModel.__table__.name}'"
        )
        return None
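
# Hedged sketch of the chunk generation relied on above: the real
# `create_chunks` helper lives elsewhere, so this illustrative stand-in (an
# assumption about its behavior) just slices the id range into fixed-size
# [start, end] windows.
def _example_create_chunks(id_range: range, chunk_size: int):
    ids = list(id_range)
    for i in range(0, len(ids), chunk_size):
        window = ids[i:i + chunk_size]
        yield [window[0], window[-1]]
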
def create_tables(self, bulk_upload_params: dict):
    """Create the annotation and segmentation tables (and their metadata rows)
    for a bulk upload, then drop their indices to speed up the data load.

    Args:
        bulk_upload_params (dict): bulk upload metadata

    Returns:
        str: confirmation message naming the created tables
    """
    table_name = bulk_upload_params["annotation_table_name"]
    aligned_volume = bulk_upload_params["aligned_volume"]
    pcg_table_name = bulk_upload_params["pcg_table_name"]
    last_updated = bulk_upload_params["last_updated"]
    seg_table_name = bulk_upload_params["seg_table_name"]
    upload_creation_time = bulk_upload_params["upload_creation_time"]
    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    AnnotationModel = create_annotation_model(bulk_upload_params)
    SegmentationModel = create_segmentation_model(bulk_upload_params)

    if (not session.query(AnnoMetadata).filter(
            AnnoMetadata.table_name == table_name).scalar()):
        AnnotationModel.__table__.create(bind=engine, checkfirst=True)
        anno_metadata_dict = {
            "table_name": table_name,
            "schema_type": bulk_upload_params.get("schema"),
            "valid": True,
            "created": upload_creation_time,
            "user_id": bulk_upload_params.get("user_id", "*****@*****.**"),
            "description": bulk_upload_params["description"],
            "reference_table": bulk_upload_params.get("reference_table"),
            "flat_segmentation_source": bulk_upload_params.get(
                "flat_segmentation_source"),
        }
        anno_metadata = AnnoMetadata(**anno_metadata_dict)
        session.add(anno_metadata)

    if (not session.query(SegmentationMetadata).filter(
            SegmentationMetadata.table_name == seg_table_name).scalar()):
        SegmentationModel.__table__.create(bind=engine, checkfirst=True)
        seg_metadata_dict = {
            "annotation_table": table_name,
            "schema_type": bulk_upload_params.get("schema"),
            "table_name": seg_table_name,
            "valid": True,
            "created": upload_creation_time,
            "pcg_table_name": pcg_table_name,
            "last_updated": last_updated,
        }

        seg_metadata = SegmentationMetadata(**seg_metadata_dict)
        session.add(seg_metadata)

    try:
        session.flush()
        session.commit()
    except Exception as e:
        celery_logger.error(f"SQL ERROR: {e}")
        session.rollback()
        raise e
    finally:
        drop_seg_indexes = index_cache.drop_table_indices(
            SegmentationModel.__table__.name, engine)
        # wait for indexes to drop
        time.sleep(10)
        drop_anno_indexes = index_cache.drop_table_indices(
            AnnotationModel.__table__.name, engine)
        celery_logger.info(
            f"Table {AnnotationModel.__table__.name} indices have been dropped {drop_anno_indexes}."
        )
        celery_logger.info(
            f"Table {SegmentationModel.__table__.name} indices have been dropped {drop_seg_indexes}."
        )

        session.close()

    return f"Tables {table_name}, {seg_table_name} created."
def merge_tables(self, mat_metadata: dict):
    """Merge all the annotation and segmentation rows into a new table that are
    flagged as valid. Drop the original split tables after inserting all the rows
    into the new table.

    Args:
        mat_metadata (dict): datastack info for the aligned_volume from the infoservice
        analysis_version (int): materialized version number

    Raises:
        e: error during table merging operation

    Returns:
        str: number of rows copied
    """
    analysis_version = mat_metadata["analysis_version"]
    annotation_table_name = mat_metadata["annotation_table_name"]
    segmentation_table_name = mat_metadata["segmentation_table_name"]
    temp_table_name = mat_metadata["temp_mat_table_name"]
    schema = mat_metadata["schema"]
    datastack = mat_metadata["datastack"]

    # create dynamic sql_uri
    SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
    analysis_sql_uri = create_analysis_sql_uri(SQL_URI_CONFIG, datastack,
                                               analysis_version)

    # get schema and match column order for sql query
    anno_schema = get_schema(schema)
    flat_schema = create_flattened_schema(anno_schema)

    ordered_model_columns = create_table_dict(
        table_name=annotation_table_name,
        Schema=flat_schema,
        segmentation_source=None,
        table_metadata=None,
        with_crud_columns=False,
    )

    AnnotationModel = create_annotation_model(mat_metadata,
                                              with_crud_columns=False)
    SegmentationModel = create_segmentation_model(mat_metadata)

    query_columns = {}
    crud_columns = ["created", "deleted", "superceded_id"]
    for col in AnnotationModel.__table__.columns:
        if col.name not in crud_columns:
            query_columns[col.name] = col
    for col in SegmentationModel.__table__.columns:
        if not col.name == "id":
            query_columns[col.name] = col

    sorted_columns = OrderedDict([(key, query_columns[key])
                                  for key in ordered_model_columns
                                  if key in query_columns.keys()])
    sorted_columns_list = list(sorted_columns.values())
    columns = [f'"{col.table}".{col.name}' for col in sorted_columns_list]

    mat_session, mat_engine = create_session(analysis_sql_uri)

    query = f"""
        SELECT 
            {', '.join(columns)}
        FROM 
            {AnnotationModel.__table__.name}
        JOIN 
            "{SegmentationModel.__table__.name}"
            ON {AnnotationModel.id} = "{SegmentationModel.__table__.name}".id
        WHERE
            {AnnotationModel.id} = "{SegmentationModel.__table__.name}".id
        AND {AnnotationModel.valid} = true

    """

    try:
        mat_db_connection = mat_engine.connect()
        with mat_db_connection.begin():
            insert_query = mat_db_connection.execute(
                f"CREATE TABLE {temp_table_name} AS ({query});")
            row_count = insert_query.rowcount
            drop_query = mat_db_connection.execute(
                f'DROP TABLE {annotation_table_name}, "{segmentation_table_name}" CASCADE;'
            )
            alter_query = mat_db_connection.execute(
                f"ALTER TABLE {temp_table_name} RENAME TO {annotation_table_name};"
            )
        mat_session.close()
        mat_engine.dispose()

        return f"Number of rows copied: {row_count}"
    except Exception as e:
        celery_logger.error(e)
        raise e
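
# For orientation, with hypothetical table names the statements generated
# above are roughly as follows (quoting mirrors the f-strings in merge_tables,
# and "synapses__pcg" is an assumed segmentation table name):
#
#   CREATE TABLE temp_mat_synapses AS (
#       SELECT "synapses".id, ..., "synapses__pcg".pre_pt_root_id, ...
#       FROM synapses
#       JOIN "synapses__pcg" ON synapses.id = "synapses__pcg".id
#       WHERE synapses.id = "synapses__pcg".id AND synapses.valid = true
#   );
#   DROP TABLE synapses, "synapses__pcg" CASCADE;
#   ALTER TABLE temp_mat_synapses RENAME TO synapses;
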
def get_new_root_ids(materialization_data: dict, mat_metadata: dict) -> dict:
    """Get root ids

    Args:
        materialization_data (dict): supervoxel data for root_id lookup
        mat_metadata (dict): Materialization metadata

    Returns:
        dict: root_ids to be inserted into db
    """
    pcg_table_name = mat_metadata.get("pcg_table_name")
    aligned_volume = mat_metadata.get("aligned_volume")
    try:
        materialization_time_stamp = datetime.datetime.strptime(
            mat_metadata.get("materialization_time_stamp"),
            "%Y-%m-%d %H:%M:%S.%f")
    except ValueError:
        materialization_time_stamp = datetime.datetime.strptime(
            mat_metadata.get("materialization_time_stamp"),
            "%Y-%m-%dT%H:%M:%S.%f")
    supervoxel_df = pd.DataFrame(materialization_data, dtype=object)
    drop_col_names = list(
        supervoxel_df.loc[:, supervoxel_df.columns.str.endswith("position")])
    supervoxel_df = supervoxel_df.drop(columns=drop_col_names)

    AnnotationModel = create_annotation_model(mat_metadata)
    SegmentationModel = create_segmentation_model(mat_metadata)

    __, seg_model_cols, __ = get_query_columns_by_suffix(
        AnnotationModel, SegmentationModel, "root_id")
    anno_ids = supervoxel_df["id"].to_list()

    # get current root ids from database
    session = sqlalchemy_cache.get(aligned_volume)

    try:
        current_root_ids = [
            data for data in session.query(*seg_model_cols).filter(
                SegmentationModel.id.in_(anno_ids))
        ]
    except SQLAlchemyError as e:
        session.rollback()
        current_root_ids = []
        celery_logger.error(e)
    finally:
        session.close()

    supervoxel_col_names = list(
        supervoxel_df.loc[:,
                          supervoxel_df.columns.str.endswith("supervoxel_id")])

    if current_root_ids:
        # merge root_id df with supervoxel df
        df = pd.DataFrame(current_root_ids, dtype=object)
        root_ids_df = pd.merge(supervoxel_df, df)

    else:
        # create empty dataframe with root_id columns
        root_id_columns = [
            col_name.replace("supervoxel_id", "root_id")
            for col_name in supervoxel_col_names if "supervoxel_id" in col_name
        ]
        df = pd.DataFrame(columns=root_id_columns,
                          dtype=object).fillna(value=np.nan)
        root_ids_df = pd.concat((supervoxel_df, df), axis=1)

    cols = [x for x in root_ids_df.columns if "root_id" in x]

    cg = chunkedgraph_cache.init_pcg(pcg_table_name)

    # filter missing root_ids and lookup root_ids if missing
    mask = np.logical_and.reduce([root_ids_df[col].isna() for col in cols])
    missing_root_rows = root_ids_df.loc[mask]
    if not missing_root_rows.empty:
        supervoxel_data = missing_root_rows.loc[:, supervoxel_col_names]
        for col_name in supervoxel_data:
            if "supervoxel_id" in col_name:
                root_id_name = col_name.replace("supervoxel_id", "root_id")
                data = missing_root_rows.loc[:, col_name]
                root_id_array = get_root_ids(cg, data,
                                             materialization_time_stamp)
                root_ids_df.loc[data.index, root_id_name] = root_id_array

    return root_ids_df.to_dict(orient="records")
def get_annotations_with_missing_supervoxel_ids(mat_metadata: dict,
                                                chunk: List[int]) -> dict:
    """Get list of valid annotation and their ids to lookup existing supervoxel ids. If there
    are missing supervoxels they will be set as None for cloudvolume lookup.

    Parameters
    ----------
    mat_metadata : dict
        Materialization metadata
    chunk : list
        chunked range to for sql id query

    Returns
    -------
    dict
        dict of annotation and segmentation data
    """

    aligned_volume = mat_metadata.get("aligned_volume")
    SegmentationModel = create_segmentation_model(mat_metadata)
    AnnotationModel = create_annotation_model(mat_metadata)

    session = sqlalchemy_cache.get(aligned_volume)

    anno_model_cols, __, supervoxel_columns = get_query_columns_by_suffix(
        AnnotationModel, SegmentationModel, "supervoxel_id")

    query = session.query(*anno_model_cols)

    chunked_id_query = query_id_range(AnnotationModel.id, chunk[0], chunk[1])
    annotation_data = [
        data for data in query.filter(chunked_id_query).order_by(
            AnnotationModel.id).filter(AnnotationModel.valid == True).join(
                SegmentationModel, isouter=True).filter(
                    SegmentationModel.id == None)
    ]

    annotation_dataframe = pd.DataFrame(annotation_data, dtype=object)
    if not annotation_dataframe.empty:
        wkb_data = annotation_dataframe.loc[
            :, annotation_dataframe.columns.str.endswith("position")]

        annotation_dict = {}
        for column, wkb_points in wkb_data.items():
            annotation_dict[column] = [
                get_geom_from_wkb(wkb_point) for wkb_point in wkb_points
            ]

        for key, value in annotation_dict.items():
            annotation_dataframe.loc[:, key] = value

        segmentation_dataframe = pd.DataFrame(columns=supervoxel_columns,
                                              dtype=object)
        segmentation_dataframe = segmentation_dataframe.fillna(value=np.nan)
        mat_df = pd.concat((segmentation_dataframe, annotation_dataframe),
                           axis=1)
        materialization_data = mat_df.to_dict(orient="list")
    else:
        materialization_data = None

    session.close()

    return materialization_data
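
# Hedged sketch of the id-range filter used by the chunked queries above: the
# real `query_id_range` helper is imported from elsewhere, so this stand-in is
# only an assumption about its shape, combining a start/end pair into a single
# SQLAlchemy boolean clause.
def _example_query_id_range(id_column, start_id: int, end_id: int):
    from sqlalchemy import and_  # local import to keep the sketch self-contained

    if end_id is not None:
        return and_(id_column >= start_id, id_column < end_id)
    return id_column >= start_id
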