def refresh_product(index: Index, product: DatasetType):
    engine: Engine = alchemy_engine(index)
    insert_count = _populate_missing_dataset_extents(engine, product)

    # If we inserted data...
    if insert_count:
        # And it's a non-spatial product...
        if get_dataset_extent_alchemy_expression(product.metadata_type) is None:
            # And it has WRS path/rows...
            if "sat_path" in product.metadata_type.dataset_fields:
                # We can synthesize the polygons!
                _LOG.debug(
                    "spatial_synthesizing.start",
                    product_name=product.name,
                )
                shapes = _get_path_row_shapes()
                rows = [
                    row
                    for row in index.datasets.search_returning(
                        ("id", "sat_path", "sat_row"), product=product.name
                    )
                    if row.sat_path.lower is not None
                ]
                if rows:
                    engine.execute(
                        DATASET_SPATIAL.update()
                        .where(DATASET_SPATIAL.c.id == bindparam("dataset_id"))
                        .values(footprint=bindparam("footprint")),
                        [
                            dict(
                                dataset_id=id_,
                                footprint=from_shape(
                                    shapely.ops.unary_union(
                                        [
                                            shapes[(int(sat_path.lower), row)]
                                            for row in range(
                                                int(sat_row.lower),
                                                int(sat_row.upper) + 1,
                                            )
                                        ]
                                    ),
                                    srid=4326,
                                    extended=True,
                                ),
                            )
                            for id_, sat_path, sat_row in rows
                        ],
                    )
                _LOG.debug(
                    "spatial_synthesizing.done",
                    product_name=product.name,
                )

    return insert_count
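# --- Illustrative sketch (hypothetical helper; not called anywhere in this module) ---
# The footprint synthesis above unions one polygon per WRS-2 (path, row) covered by a
# dataset's sat_row range. This toy version shows the same shape arithmetic, with
# box() geometries standing in for the real path/row outlines loaded by
# _get_path_row_shapes(); the coordinates are made up for the example.
def _demo_synthesize_footprint():
    import shapely.ops
    from shapely.geometry import box

    # Hypothetical path/row polygons (the real ones are lon/lat scene outlines).
    shapes = {
        (90, 84): box(146.0, -34.0, 148.0, -32.0),
        (90, 85): box(146.0, -36.0, 148.0, -34.0),
    }
    sat_path_lower, sat_row_lower, sat_row_upper = 90, 84, 85
    return shapely.ops.unary_union(
        [
            shapes[(sat_path_lower, row)]
            for row in range(sat_row_lower, sat_row_upper + 1)
        ]
    )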
def _populate_missing_dataset_extents(
    engine: Engine, product: DatasetType, force_update_all=False
):
    columns = {c.name: c for c in _select_dataset_extent_columns(product)}

    if force_update_all:
        query = (
            DATASET_SPATIAL.update()
            .values(**columns)
            .where(DATASET_SPATIAL.c.id == columns["id"])
            .where(
                DATASET.c.dataset_type_ref
                == bindparam("product_ref", product.id, type_=SmallInteger)
            )
            .where(DATASET.c.archived.is_(None))
        )
    else:
        query = (
            postgres.insert(DATASET_SPATIAL)
            .from_select(
                columns.keys(),
                select(columns.values())
                .where(
                    DATASET.c.dataset_type_ref
                    == bindparam("product_ref", product.id, type_=SmallInteger)
                )
                .where(DATASET.c.archived.is_(None))
                .order_by(columns["center_time"]),
            )
            .on_conflict_do_nothing(index_elements=["id"])
        )

    # print(as_sql(query))
    _LOG.debug(
        "spatial_insert_query.start",
        product_name=product.name,
        force_update_all=force_update_all,
    )
    changed = engine.execute(query).rowcount
    _LOG.debug(
        "spatial_insert_query.end", product_name=product.name, change_count=changed
    )
    return changed
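# --- Illustrative sketch (hypothetical helper; not called anywhere in this module) ---
# A minimal, self-contained version of the "insert only the missing rows" pattern used
# above: PostgreSQL INSERT ... SELECT with ON CONFLICT DO NOTHING, so ids that already
# exist in the destination table are silently skipped. The "src"/"dst" table names are
# hypothetical; the real query selects ODC dataset columns into DATASET_SPATIAL.
def _demo_insert_from_select_upsert():
    from sqlalchemy import Column, Integer, MetaData, Table, select
    from sqlalchemy.dialects import postgresql

    metadata = MetaData()
    src = Table("src", metadata, Column("id", Integer, primary_key=True))
    dst = Table("dst", metadata, Column("id", Integer, primary_key=True))

    # SQLAlchemy 1.x-style select(), matching the style used in this module.
    return (
        postgresql.insert(dst)
        .from_select(["id"], select([src.c.id]))
        .on_conflict_do_nothing(index_elements=["id"])
    )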
def refresh_spatial_extents(
    index: Index,
    product: DatasetType,
    clean_up_deleted=False,
    assume_after_date: datetime = None,
):
    """
    Update the spatial extents to match any changes upstream in ODC.

    :param assume_after_date: Only scan datasets that have changed after the given
        (db server) time. If None, all datasets will be regenerated.
    :param clean_up_deleted: Scan for any manually deleted rows too. Slow.
    """
    engine: Engine = alchemy_engine(index)
    log = _LOG.bind(product_name=product.name, after_date=assume_after_date)

    # First, remove any archived datasets from our spatial table.
    datasets_to_delete = (
        select([DATASET.c.id])
        .where(DATASET.c.archived.isnot(None))
        .where(DATASET.c.dataset_type_ref == product.id)
    )
    if assume_after_date is not None:
        # Note that we use "dataset_changed_expression" to scan the datasets,
        # rather than "where archived > date", because the latter has no index!
        # (.... and we're using dataset_changed_expression's index everywhere else,
        # so it's probably still in memory and super fast!)
        datasets_to_delete = datasets_to_delete.where(
            dataset_changed_expression() > assume_after_date
        )
    log.info("spatial_archival")
    changed = engine.execute(
        DATASET_SPATIAL.delete().where(DATASET_SPATIAL.c.id.in_(datasets_to_delete))
    ).rowcount
    log.info(
        "spatial_archival.end",
        change_count=changed,
    )

    # Forcing? Check every other dataset for removal, so we catch manually-deleted
    # rows from the table.
    if clean_up_deleted:
        log.warning("spatial_deletion_full_scan")
        changed += engine.execute(
            DATASET_SPATIAL.delete()
            .where(
                DATASET_SPATIAL.c.dataset_type_ref == product.id,
            )
            # Where it doesn't exist in the ODC dataset table.
            .where(
                ~DATASET_SPATIAL.c.id.in_(
                    select([DATASET.c.id]).where(
                        DATASET.c.dataset_type_ref == product.id,
                    )
                )
            )
        ).rowcount
        log.info(
            "spatial_deletion_scan.end",
            change_count=changed,
        )

    # We'll update first, then insert new records.
    # -> We do it in this order so that inserted records aren't immediately updated.
    # (Note: why don't we do this in one upsert? Because we get our sqlalchemy expressions
    #  through ODC's APIs and can't choose alternative table aliases to make sub-queries.
    #  Maybe you can figure out a workaround, though?)
    column_values = {c.name: c for c in _select_dataset_extent_columns(product)}
    only_where = [
        DATASET.c.dataset_type_ref
        == bindparam("product_ref", product.id, type_=SmallInteger),
        DATASET.c.archived.is_(None),
    ]
    if assume_after_date is not None:
        only_where.append(dataset_changed_expression() > assume_after_date)
    else:
        log.warning("spatial_update.recreating_everything")

    # Update any changed datasets
    log.info(
        "spatial_update",
        product_name=product.name,
        after_date=assume_after_date,
    )
    changed += engine.execute(
        DATASET_SPATIAL.update()
        .values(**column_values)
        .where(DATASET_SPATIAL.c.id == column_values["id"])
        .where(and_(*only_where))
    ).rowcount
    log.info("spatial_update.end", product_name=product.name, change_count=changed)

    # ... and insert new ones.
    log.info(
        "spatial_insert",
        product_name=product.name,
        after_date=assume_after_date,
    )
    changed += engine.execute(
        postgres.insert(DATASET_SPATIAL)
        .from_select(
            column_values.keys(),
            select(column_values.values())
            .where(and_(*only_where))
            .order_by(column_values["center_time"]),
        )
        .on_conflict_do_nothing(index_elements=["id"])
    ).rowcount
    log.info("spatial_insert.end", product_name=product.name, change_count=changed)

    # If we changed data...
    if changed:
        # And it's a non-spatial product...
        if get_dataset_extent_alchemy_expression(product.metadata_type) is None:
            # And it has WRS path/rows...
            if "sat_path" in product.metadata_type.dataset_fields:
                # We can synthesize the polygons!
                log.info("spatial_synthesizing")
                shapes = _get_path_row_shapes()
                rows = [
                    row
                    for row in index.datasets.search_returning(
                        ("id", "sat_path", "sat_row"), product=product.name
                    )
                    if row.sat_path.lower is not None
                ]
                if rows:
                    engine.execute(
                        DATASET_SPATIAL.update()
                        .where(DATASET_SPATIAL.c.id == bindparam("dataset_id"))
                        .values(footprint=bindparam("footprint")),
                        [
                            dict(
                                dataset_id=id_,
                                footprint=from_shape(
                                    shapely.ops.unary_union(
                                        [
                                            shapes[(int(sat_path.lower), row)]
                                            for row in range(
                                                int(sat_row.lower),
                                                int(sat_row.upper) + 1,
                                            )
                                        ]
                                    ),
                                    srid=4326,
                                    extended=True,
                                ),
                            )
                            for id_, sat_path, sat_row in rows
                        ],
                    )
                log.info("spatial_synthesizing.end")

    return changed