def _join_inverse_relation(self, relation_name: str, attributes: list, arguments: dict):
    parent = self.relation_parents[relation_name]
    parent_info = self._get_relation_info(parent)

    relation_name_snake = to_snake(relation_name).split('_')
    assert relation_name_snake[0] == 'inv'

    relation_attr_name = '_'.join(relation_name_snake[1:-2])
    dst_catalog_name = relation_name_snake[-2]
    dst_collection_name = relation_name_snake[-1]
    dst_model_name = self.model.get_table_name(dst_catalog_name, dst_collection_name)

    dst_info = self._collect_relation_info(relation_name, f'{dst_model_name}')

    json_attrs = self._json_build_attrs(attributes, dst_info['alias'])
    json_attrs = f"{json_attrs}, '_catalog', '{dst_catalog_name}', '_collection', '{dst_collection_name}'"

    alias = f"_inv_{relation_attr_name}_{dst_info['catalog_name']}_{dst_info['collection_name']}"

    relation_name = get_relation_name(
        self.model,
        dst_info['catalog_name'],
        dst_info['collection_name'],
        relation_attr_name
    )

    self._add_relation_joins(parent_info, dst_info, relation_name, arguments, is_inverse=True)
    self.select_expressions.append(f"json_build_object({json_attrs}) {alias}")

def query_reference_entities(catalog, collection, reference_name, src_id):
    assert _Base
    _session = get_session()

    gob_model = GOBModel()
    rel_catalog_name = 'rel'
    rel_collection_name = get_relation_name(gob_model, catalog, collection, reference_name)

    rel_table, rel_model = get_table_and_model(rel_catalog_name, rel_collection_name)

    dst_catalog_name, dst_collection_name = gob_model.get_collection(
        catalog, collection)['references'][reference_name]['ref'].split(':')

    # Destination table and model
    dst_table, dst_model = get_table_and_model(dst_catalog_name, dst_collection_name)

    query = _session.query(dst_table) \
        .join(rel_table, dst_table._id == rel_table.dst_id) \
        .filter(rel_table.src_id == src_id)

    # Exclude all records with date_deleted
    all_entities = filter_deleted(query, dst_table)

    # The default result is where expiration date is in the future or empty
    all_entities = filter_active(all_entities, dst_table)

    entity_convert = _get_convert_for_model(dst_catalog_name, dst_collection_name, dst_model)

    return all_entities, entity_convert

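# Usage sketch (illustrative only; the catalog, collection and reference names passed in are
# hypothetical). query_reference_entities returns a filtered SQLAlchemy query on the destination
# table together with a converter for the destination model; the converter is assumed to be
# applied per entity.
def _example_resolve_reference(src_id):
    entities, entity_convert = query_reference_entities(
        'some_catalog', 'some_collection', 'some_reference', src_id)
    return [entity_convert(entity) for entity in entities.all()]
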
def _join_relation(self, relation_name: str, attributes: list, arguments: dict):
    parent = self.relation_parents[relation_name]
    parent_info = self._get_relation_info(parent)

    relation_attr_name = to_snake(self.relation_aliases[relation_name])

    dst_catalog_name, dst_collection_name = self.model.get_catalog_collection_names_from_ref(
        parent_info['collection']['attributes'][relation_attr_name]['ref']
    )

    dst_info = self._collect_relation_info(relation_name, f'{dst_catalog_name}_{dst_collection_name}')
    alias = f"_{to_snake(relation_name)}"

    json_attrs = self._json_build_attrs(attributes, dst_info['alias'])
    json_attrs = f"{json_attrs}, '_catalog', '{dst_catalog_name}', '_collection', '{dst_collection_name}'"

    relation_name = get_relation_name(
        self.model,
        parent_info['catalog_name'],
        parent_info['collection_name'],
        relation_attr_name
    )

    self._add_relation_joins(parent_info, dst_info, relation_name, arguments,
                             self._is_srcvalue_requested(attributes), relation_attr_name,
                             self._is_many(parent_info['collection']['attributes'][relation_attr_name]['type']))

    self.select_expressions.append(f"json_build_object({json_attrs}) {alias}")

def clear_test_dbs():
    """
    Clear the GOB test databases

    :return:
    """
    model = GOBModel()

    # Test data is contained in the test_catalog and relation catalog
    test_catalog = "test_catalogue"
    rel_catalog = "rel"

    # Collect names of all test tables and entities
    tables = []
    test_entities = []
    rel_entities = []
    for collection_name in model.get_collections(test_catalog):
        collection = model.get_collection(test_catalog, collection_name)

        tables.append(model.get_table_name(test_catalog, collection_name))
        test_entities.append(collection_name)

        refs = {
            **collection['references'],
            **collection['very_many_references']
        }
        for ref in refs:
            ref_name = get_relation_name(model, test_catalog, collection_name, ref)
            tables.append(model.get_table_name(rel_catalog, ref_name))
            rel_entities.append(ref_name)

    # Nicely format the SQL statement
    indent = ",\n" + ' ' * 17
    table_length = max([len(table) for table in tables])

    # Provide for SQL statements
    truncate_tables = ";\n".join([f"TRUNCATE TABLE {table:{table_length}} CASCADE" for table in tables])
    test_entity_list = indent.join([f"'{e}'" for e in test_entities])
    rel_entity_list = indent.join([f"'{e}'" for e in rel_entities])

    # Construct SQL statement
    statement = f"""
-- Truncate test tables
{truncate_tables};

-- Delete test entity events
DELETE
FROM   events
WHERE  catalogue = '{test_catalog}' AND
       entity IN ({test_entity_list});

-- Delete test relation events
DELETE
FROM   events
WHERE  catalogue = '{rel_catalog}' AND
       entity IN ({rel_entity_list});

-- Commit all changes
COMMIT;
"""
    exec_statement(statement)

def _add_relations(query, catalog_name, collection_name):
    gob_model = GOBModel()
    collection = gob_model.get_collection(catalog_name, collection_name)
    has_states = collection.get('has_states', False)
    src_table, _ = get_table_and_model(catalog_name, collection_name)

    for reference in collection['references']:
        relation_name = get_relation_name(gob_model, catalog_name, collection_name, reference)

        if not relation_name:
            continue

        rel_table, _ = get_table_and_model('rel', relation_name)

        select_attrs = [
            getattr(rel_table, 'src_id'),
            getattr(rel_table, 'src_volgnummer'),
        ] if has_states else [
            getattr(rel_table, 'src_id'),
        ]

        # Aggregate the source values (bronwaarde and dst_id) per source object for this reference,
        # excluding deleted and expired relation rows
        subselect = session \
            .query(
                *select_attrs,
                func.json_agg(
                    func.json_build_object(
                        FIELD.SOURCE_VALUE, getattr(rel_table, FIELD.SOURCE_VALUE),
                        FIELD.REFERENCE_ID, getattr(rel_table, 'dst_id')
                    )
                ).label('source_values')
            ).filter(
                and_(
                    getattr(rel_table, FIELD.DATE_DELETED).is_(None),
                    or_(
                        getattr(rel_table, FIELD.EXPIRATION_DATE).is_(None),
                        getattr(rel_table, FIELD.EXPIRATION_DATE) > func.now()
                    )
                )
            ).group_by(
                *select_attrs
            ).subquery()

        join_clause = [
            getattr(src_table, FIELD.ID) == getattr(subselect.c, 'src_id'),
            getattr(src_table, FIELD.SEQNR) == getattr(subselect.c, 'src_volgnummer')
        ] if has_states else [
            getattr(src_table, FIELD.ID) == getattr(subselect.c, 'src_id'),
        ]

        # Left join the aggregated source values onto the base query as an extra column
        query = query.join(subselect, and_(*join_clause), isouter=True) \
            .add_columns(
                getattr(subselect.c, 'source_values').label(f"ref:{reference}")
            )

    return query

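# Usage sketch (illustrative; assumes the module-level session used by _add_relations above).
# Extends a base query on the source table so that every row also carries the aggregated
# source values for each reference, labelled "ref:<reference>".
def _example_query_with_relations(catalog_name, collection_name):
    src_table, _ = get_table_and_model(catalog_name, collection_name)
    base_query = session.query(src_table)
    return _add_relations(base_query, catalog_name, collection_name)
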
def prepare_relate(msg):
    """
    The starting point for the relate process. A relate job will be split into individual relate
    jobs at attribute level. If there is only a catalog in the message, all collections of that
    catalog will be related.

    When a job that has already been split is received, the relation name is added and the job is
    forwarded to the next step of the relate process, where the relations are built.

    :param msg: a message from the broker containing the catalog and collections (optional)
    :return: the result message of the relate preparation step
    """
    header = msg.get('header', {})
    catalog_name = header.get('catalogue')
    collection_name = header.get('collection')
    attribute_name = header.get('attribute')

    application = "GOBRelate"
    msg["header"] = {
        **msg.get("header", {}),
        "version": "0.1",
        "source": "GOB",
        "application": application,
        "entity": collection_name
    }

    timestamp = datetime.datetime.utcnow().isoformat()
    msg["header"].update({
        "timestamp": timestamp,
    })

    logger.configure(msg, "RELATE")

    if not catalog_name or not collection_name or not attribute_name:
        # The job is split when catalog, collection or attribute is not provided
        logger.info("Splitting relate job")
        _split_job(msg)
        msg['header']['is_split'] = True

        return publish_result(msg, [])
    else:
        # If the job has all attributes, add the relation name and forward to the next step in the relate process
        logger.info(f"** Relate {catalog_name} {collection_name} {attribute_name}")

        relation_name = get_relation_name(GOBModel(), catalog_name, collection_name, attribute_name)

        msg["header"].update({
            "catalogue": "rel",
            "collection": relation_name,
            "entity": relation_name,
            "original_catalogue": catalog_name,
            "original_collection": collection_name,
            "original_attribute": attribute_name,
        })
        return msg

def get(self, catalog_name, collection_name, attribute):
    """Returns the definition of the materialized view for the given relation.

    :param catalog_name:
    :param collection_name:
    :param attribute:
    :return:
    """
    relation_name = model_relations.get_relation_name(self.model, catalog_name, collection_name, attribute)

    return self.get_by_relation_name(relation_name)

def _get_relation_model(self):
    relation_owner = (self.src_object if self.src_side == 'src' else self.dst_model)

    # Get the source catalogue and collection from the source object
    owner_table_name = getattr(relation_owner, '__tablename__')
    owner_catalog_name, owner_collection_name = _get_catalog_collection_name_from_table_name(owner_table_name)

    relation_name = get_relation_name(gobmodel, owner_catalog_name, owner_collection_name, self.attribute_name)
    relation_table_name = f"rel_{relation_name}"

    return models[relation_table_name]

def process_relate(msg: dict):
    """
    This function starts the actual relate process. The message is checked for completeness and the
    Relater builds the new or updated relations. The result is returned, to be compared as if it
    were the result of an import job.

    :param msg: a message from the broker containing the catalog and collections (optional)
    :return: the result message of the relate process
    """
    logger.configure(msg, "RELATE SRC")

    _check_message(msg)
    header = msg.get('header')

    logger.info("Relate table started")

    full_update = header.get('mode', "update") == "full"

    if full_update:
        logger.info("Full relate requested")

    updater = Relater(header[CATALOG_KEY], header[COLLECTION_KEY], header[ATTRIBUTE_KEY])

    filename, confirms = updater.update(full_update)

    logger.info("Relate table completed")

    relation_name = get_relation_name(GOBModel(), header[CATALOG_KEY], header[COLLECTION_KEY],
                                      header[ATTRIBUTE_KEY])

    result_msg = {
        "header": {
            **msg["header"],
            "catalogue": "rel",
            "collection": relation_name,
            "entity": relation_name,
            "source": "GOB",
            "application": "GOB",
            "version": RELATE_VERSION,
            "timestamp": msg.get("timestamp", datetime.datetime.utcnow().isoformat()),
        },
        "summary": logger.get_summary(),
        "contents_ref": filename,
        "confirms": confirms,
    }

    return result_msg

def _dump_relations(catalog_name, collection_name, config):
    """Dumps relations for catalog_name, collection_name

    """
    config['schema'] = catalog_name
    _, model = get_table_and_model(catalog_name, collection_name)

    for relation in [k for k in model['references'].keys()]:
        relation_name = get_relation_name(GOBModel(), catalog_name, collection_name, relation)

        if not relation_name or relation_name in SKIP_RELATIONS:
            # relation_name is None when relation does not exist (yet)
            yield f"Skipping {catalog_name} {collection_name} {relation}\n"
            continue

        yield f"Export {catalog_name} {collection_name} {relation}\n"
        rel_dumper = DbDumper('rel', relation_name, config)
        yield from rel_dumper.dump_to_db(full_dump=config.get('force_full', False))
        rel_dumper.disconnect()

def test_relation_name(self, mock_name_compressor):
    model = mock.MagicMock()
    src = {
        "catalog": {
            'abbreviation': 'cat'
        },
        "catalog_name": "catalog",
        "collection": {
            'abbreviation': 'col',
            'attributes': {
                'reference': {
                    'ref': 'src:dst'
                }
            }
        },
        "collection_name": "collection"
    }
    dst = {
        "catalog": {
            'abbreviation': 'dst_cat'
        },
        "catalog_name": "catalog",
        "collection": {
            'abbreviation': 'dst_col'
        },
        "collection_name": "collection"
    }

    # Assert that NameCompressor is used
    mock_name_compressor.compress_name.side_effect = lambda s: s

    name = _get_relation_name(src, dst, "reference")
    expect = 'cat_col_dst_cat_dst_col_reference'
    self.assertEqual(name, expect)
    mock_name_compressor.compress_name.assert_called_with(expect)

    model.get_catalog.return_value = src['catalog']
    model.get_collection.return_value = src['collection']
    name = get_relation_name(model, "catalog", "collection", "reference")
    expect = 'cat_col_cat_col_reference'
    self.assertEqual(name, expect)

def test_relation_name(self):
    model = mock.MagicMock()
    src = {
        "catalog": {
            'abbreviation': 'cat'
        },
        "catalog_name": "catalog",
        "collection": {
            'abbreviation': 'col',
            'attributes': {
                'reference': {
                    'ref': 'src:dst'
                }
            }
        },
        "collection_name": "collection"
    }
    dst = {
        "catalog": {
            'abbreviation': 'dst_cat'
        },
        "catalog_name": "catalog",
        "collection": {
            'abbreviation': 'dst_col'
        },
        "collection_name": "collection"
    }

    name = _get_relation_name(src, dst, "reference")
    expect = 'cat_col_dst_cat_dst_col_reference'
    self.assertEqual(name, expect)

    model.get_catalog.return_value = src['catalog']
    model.get_collection.return_value = src['collection']
    name = get_relation_name(model, "catalog", "collection", "reference")
    expect = 'cat_col_cat_col_reference'
    self.assertEqual(name, expect)

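# Illustrative sketch: with the real GOBModel the relation name is derived from the catalog and
# collection abbreviations plus the attribute name (compressed by NameCompressor, see the tests
# above), and the relation data lives in the corresponding table of the 'rel' catalog.
# Exact names depend on the registered model.
def _example_relation_table_name(catalog_name, collection_name, attribute_name):
    model = GOBModel()
    relation_name = get_relation_name(model, catalog_name, collection_name, attribute_name)
    return model.get_table_name('rel', relation_name) if relation_name else None
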
def check_very_many_relations(src_catalog_name, src_collection_name, src_field_name):
    """
    Check very many relations for any dangling relations

    A relation can be dangling because it exists without any bronwaarde, or because the bronwaarde
    cannot be matched with any referenced entity. This can be checked in the relation table instead
    of the json attribute itself.

    :param src_catalog_name:
    :param src_collection_name:
    :param src_field_name:
    :return: None
    """
    # Get the source catalog, collection and field for the given names
    model = GOBModel()
    src_table_name = model.get_table_name(src_catalog_name, src_collection_name)
    src_has_states = model.has_states(src_catalog_name, src_collection_name)

    relation_table_name = "rel_" + get_relation_name(
        model, src_catalog_name, src_collection_name, src_field_name)

    select = ["src._id as id", "rel.bronwaarde as bronwaarde"]
    group_by = ["src._id", "rel.bronwaarde"]
    if src_has_states:
        state_select = [
            "src.volgnummer",
            "src.begin_geldigheid",
            "src.eind_geldigheid"
        ]
        select.extend(state_select)
        group_by.extend(state_select)
    select = ",\n    ".join(select)
    group_by = ",\n    ".join(group_by)

    join_on = ['src._id = rel.src_id']
    if src_has_states:
        join_on.extend(['src._volgnummer = rel.src_volgnummer'])
    join_on = " AND\n    ".join(join_on)

    name = f"{src_collection_name} {src_field_name}"

    bronwaarden = f"""
SELECT
    {select}
FROM {src_table_name} src
LEFT OUTER JOIN {relation_table_name} rel
ON
    {join_on}
WHERE
    src._date_deleted IS NULL AND
    rel.bronwaarde IS NULL
GROUP BY
    {group_by}
"""
    _query_missing(bronwaarden, QA_CHECK.Sourcevalue_exists, name)

    dangling = f"""
SELECT
    {select}
FROM {src_table_name} src
LEFT OUTER JOIN {relation_table_name} rel
ON
    {join_on}
WHERE
    src._date_deleted IS NULL AND
    rel.bronwaarde IS NOT NULL AND
    rel.dst_id IS NULL
GROUP BY
    {group_by}
"""
    _query_missing(dangling, QA_CHECK.Reference_exists, name)

def _get_relation_check_query(query_type, src_catalog_name, src_collection_name,
                              src_field_name, filter_applications: list):
    assert query_type in ["dangling", "missing"], \
        "Relation check query expects type to be dangling or missing"

    model = GOBModel()
    src_collection = model.get_collection(src_catalog_name, src_collection_name)
    src_table_name = model.get_table_name(src_catalog_name, src_collection_name)
    src_field = src_collection['all_fields'].get(src_field_name)
    src_has_states = model.has_states(src_catalog_name, src_collection_name)
    is_many = src_field['type'] == "GOB.ManyReference"

    relation_table_name = "rel_" + get_relation_name(
        model, src_catalog_name, src_collection_name, src_field_name)

    main_select = [f"src.{FIELD.ID} as id", f"src.{FIELD.EXPIRATION_DATE}"]
    main_select.extend([
        f"rel.{FIELD.SOURCE_VALUE}"
    ] if query_type == "dangling" else [
        f"src.{src_field_name}->>'{FIELD.SOURCE_VALUE}' as {FIELD.SOURCE_VALUE}"
    ])

    select = [
        FIELD.ID,
        FIELD.EXPIRATION_DATE,
        FIELD.DATE_DELETED,
        f"jsonb_array_elements({src_field_name}) as {src_field_name}"
    ]

    if src_has_states:
        state_select = [FIELD.SEQNR, FIELD.START_VALIDITY, FIELD.END_VALIDITY]
        select.extend(state_select)
        main_select.extend([f"src.{field}" for field in state_select])

    select = ",\n    ".join(select)
    main_select = ",\n    ".join(main_select)

    join_on = ['src._id = rel.src_id']
    if src_has_states:
        join_on.extend(['src.volgnummer = rel.src_volgnummer'])
    join_on = " AND ".join(join_on)

    src = f"""
(
    SELECT
        {select}
    FROM {src_table_name}
) AS src
""" if is_many and query_type == "missing" else f"{src_table_name} src"

    where = [f"src.{FIELD.DATE_DELETED} IS NULL"]

    # For missing relations, check if bronwaarde is empty
    where.extend([f"{src_field_name}->>'bronwaarde' IS NULL"] if query_type == "missing" else [])

    # For dangling relations, check if destination is empty
    where.extend(["rel.dst_id IS NULL", f"rel.{FIELD.DATE_DELETED} IS NULL"] if query_type == "dangling" else [])

    if filter_applications:
        ors = [f"src.{FIELD.APPLICATION} = '{application}'" for application in filter_applications]
        where.append(f"({' OR '.join(ors)})")

    where = " AND ".join(where)

    query = f"""
SELECT
    {main_select}
FROM {src}"""

    query += f"""
JOIN {relation_table_name} rel ON {join_on}
""" if query_type == "dangling" else ""

    query += f"""
WHERE {where}
"""
    return query

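# Illustrative sketch: build both check queries for a single relation attribute without
# application filtering (the empty list skips the _application filter above).
def _example_check_queries(catalog_name, collection_name, field_name):
    return {
        query_type: _get_relation_check_query(query_type, catalog_name, collection_name, field_name, [])
        for query_type in ("dangling", "missing")
    }
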
def create_utility_view(self):
    """Creates view with utility columns for relating without relation table

    View contains all columns from the main table, plus the RELATION_id, RELATION_VOLGNUMMER,
    RELATION_ref and RELATION_bronwaarde columns for each RELATION (for example ligt_in_buurt_id,
    ligt_in_buurt_ref and ligt_in_buurt_bronwaarde)

    :return:
    """
    yield "Creating view\n"

    main_alias = self.model['abbreviation'].lower()
    src_has_states = self.model.get('has_states', False)

    # Collect all necessary joins and select statements
    joins = []
    selects = [f'{main_alias}.*']

    for relation in self.model['references'].keys():
        # Add a join and selects for each relation
        relation_name = get_relation_name(GOBModel(), self.catalog_name, self.collection_name, relation)

        if not relation_name:
            # Undefined relation
            continue

        if not self._table_exists(relation_name):
            yield f"Excluding relation {relation_name} from view because table does not exist\n"
            continue

        relation_table = f'{self.catalog_name}.{relation_name}'

        # Determine if ManyReference and if destination has states
        src_field = self.model['all_fields'].get(relation)
        dst_catalog_name, dst_collection_name = GOBModel().split_ref(src_field['ref'])
        dst_has_states = GOBModel().has_states(dst_catalog_name, dst_collection_name)
        is_many = src_field['type'] == fully_qualified_type_name(GOB.ManyReference)

        on = f'{relation}.src_id = {main_alias}.{FIELD.ID}' + (
            f' and {relation}.src_volgnummer = {main_alias}.{FIELD.SEQNR}' if src_has_states else ''
        )

        if is_many:
            # For a ManyReference, we need to aggregate the values in an array
            join = f"""
left join (
    -- Aggregates id, volgnummer and ref for {relation} per src object. bronwaarde is already in the src table
    select
        rel.src_id,
        {'rel.src_volgnummer,' if src_has_states else ''}
        array_agg(rel.dst_id) dst_id,
        {'array_agg(rel.dst_volgnummer) dst_volgnummer,' if dst_has_states else ''}
        array_agg({self._ref('rel', dst_has_states)}) "ref"
    from {relation_table} rel
    group by rel.src_id{', rel.src_volgnummer' if src_has_states else ''}
) {relation} on {on}
"""
            selects.append(f'{relation}.ref {relation}_ref')
        else:
            # For a single Reference we expect one row from the relation table
            join = f"left join {relation_table} {relation} on {on}"
            selects.append(f'{self._ref(relation, dst_has_states)} {relation}_ref')

        joins.append(join)
        selects += [
            f'{relation}.dst_id {relation}_id',
        ]

        if dst_has_states:
            selects += [f'{relation}.dst_volgnummer {relation}_volgnummer']

    # Build query based on collected joins and selects
    NEWLINE = '\n'
    query = f"""
select
    {f',{NEWLINE}    '.join(selects)}
from {self.catalog_name}.{self.collection_name} {main_alias}
{f'{NEWLINE}'.join(joins)}
"""

    # Create the view
    viewname = f'{self.catalog_name}.v_{self.collection_name}'
    self._execute(f"drop view if exists {viewname}")
    self._execute(f"create view {viewname} as {query}")

    yield f"Utility view {viewname} created\n"