def hash(self):
    return hash_string_min((
        self.collection.hash,
        self.with_filters_recursive(
            lambda condition: hash_string_min((condition['children'], condition['type'])),
            lambda filter_func: filter_func['filter_function'].hash)
    ))
def _get_combined_entity_type_selections_sql(properties, is_source):
    sqls = []

    # Get the properties needed for the source or target per entity-type selection
    for ets_id, ets_properties in properties.items():
        joins, matching_fields = [], []

        # Then, per matching function, get all properties required from this entity-type selection
        for ets_index, (property_label, ets_matching_func_props) in enumerate(ets_properties.items()):
            matching_method = ets_matching_func_props['matching_method']
            ets_method_properties = ets_matching_func_props['properties']

            MatchingSql._matching_methods_sql(ets_id, matching_method, ets_method_properties,
                                              is_source, joins, matching_fields, ets_index)

        sqls.append(
            sql.SQL(cleandoc("""
                SELECT {collection} AS collection, target.uri, {matching_fields}
                FROM (SELECT DISTINCT uri FROM {res}) AS target
                {joins}
            """)).format(
                collection=sql.Literal(int(ets_id)),
                matching_fields=sql.SQL(',\n ').join(matching_fields),
                res=sql.Identifier(hash_string_min(ets_id)),
                joins=get_sql_empty(sql.SQL('\n').join(joins)),
            ))

    return sql.SQL('\nUNION ALL\n').join(sqls)
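# Hedged illustration (not produced by running the code above; all names and values are
# made up): for two entity-type selections, each with one matching method, the combined
# statement is expected to take roughly this shape, where the source tables are
# hash_string_min() hashes of the selection ids and each matching method contributes
# one LEFT JOIN plus one or two matching fields.
#
#   SELECT 1 AS collection, target.uri, target0.name AS name
#   FROM (SELECT DISTINCT uri FROM "a1b2c3d4") AS target
#   LEFT JOIN (...) AS target0 ON target.uri = target0.uri AND target0.name IS NOT NULL
#   UNION ALL
#   SELECT 2 AS collection, target.uri, target0.name AS name
#   FROM (SELECT DISTINCT uri FROM "e5f6a7b8") AS target
#   LEFT JOIN (...) AS target0 ON target.uri = target0.uri AND target0.name IS NOT NULL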
def __init__(self, data, entity_type_selection=None, collection=None, transformers=None):
    self._data = data if isinstance(data, list) else [data]

    self._collection = entity_type_selection.collection if entity_type_selection else collection
    if not self._collection:
        raise Exception('Property field should have either an entity-type selection or a collection')

    self._transformers = transformers if transformers else []
    self._alias = entity_type_selection.alias if entity_type_selection \
        else hash_string_min(self._collection.table_name)

    self._extend = True
    self._hash = None
    self._prop_path = None
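# Minimal usage sketch, assuming the enclosing class is the property-field class of this
# module (here called PropertyField) and that 'my_collection' is a collection object with
# a 'table_name' attribute; both names are illustrative, not taken from the codebase:
#
#   field = PropertyField('schema_birthDate', collection=my_collection,
#                         transformers=[{'name': 'lowercase', 'parameters': {}}])
#
# Passing a single value wraps it in a one-element list; omitting both the entity-type
# selection and the collection raises the exception above.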
def _get_field_transformers(self, normalized=False):
    transformers_info = get_transformers()

    field_transformers = self._transformers.copy()
    for transformer in field_transformers:
        if transformer['name'] in transformers_info:
            transformer['sql_template'] = transformers_info[transformer['name']]['sql_template']
            if transformer['name'] == 'stopwords':
                transformer['parameters']['key'] = hash_string_min(
                    (transformer['parameters']['dictionary'], transformer['parameters']['additional']))
        else:
            raise NameError('Transformer %s is not defined' % transformer['name'])

    if not self._field_type_info['type']:
        field_transformers.insert(0, {
            'sql_template': transformers_info['lowercase']['sql_template'],
            'parameters': {}
        })

    if self._field_type_info['type'] == 'number':
        field_transformers.append({
            'sql_template': transformers_info['to_numeric_immutable']['sql_template'],
            'parameters': {}
        })
    elif self._field_type_info['type'] == 'date':
        field_transformers.append({
            'sql_template': transformers_info['to_date_immutable']['sql_template'],
            'parameters': {'format': self._field_type_info['parameters']['format']}
        })

    if normalized:
        field_transformers.append({
            'sql_template': self._norm_template,
            'parameters': self._norm_properties
        })

    return field_transformers
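# Hedged example of the kind of list _get_field_transformers returns for a date-typed
# field with one user-supplied 'stopwords' transformer. The dictionary name and format
# are made up, and the actual 'sql_template' strings come from get_transformers() and
# are not reproduced here:
#
#   [
#       {'name': 'stopwords',
#        'sql_template': '<stopwords template from get_transformers()>',
#        'parameters': {'dictionary': 'dutch', 'additional': [], 'key': '<hash_string_min(...)>'}},
#       {'sql_template': '<to_date_immutable template>',
#        'parameters': {'format': 'YYYY-MM-DD'}},
#   ]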
def prepare_sql(self):
    prepare_sqls = [
        sql.SQL('SELECT init_dictionary({key}, {dictionary}, {additional});').format(
            key=sql.Literal(hash_string_min(
                (transformer['parameters']['dictionary'], transformer['parameters']['additional']))),
            dictionary=sql.Literal(transformer['parameters']['dictionary']),
            additional=sql.SQL('ARRAY[{}]::text[]').format(
                sql.SQL(', ').join(sql.Literal(additional)
                                   for additional in transformer['parameters']['additional'])),
        )
        for transformer in self._transformers
        if transformer['name'] == 'stopwords'
    ]

    if prepare_sqls:
        return sql.SQL('\n').join(prepare_sqls)

    return None
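# Hedged illustration of one generated statement (the dictionary name and the extra
# stopwords are made up; the key is the hash_string_min() value computed above):
#
#   SELECT init_dictionary('<key>', 'dutch', ARRAY['van', 'der']::text[]);
#
# One such statement is emitted per 'stopwords' transformer; with none configured,
# prepare_sql returns None.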
def _intermediate_property_path(self):
    if not self._prop_path:
        self._prop_path = []

        path = self._collection.table_name
        prev_collection = self._collection

        data = [(self._data[i], self._data[i + 1],
                 self._data[i + 3] if i + 3 < len(self._data) else None)
                for i in range(0, len(self._data) - 2, 2)]

        for (prop, collection_id, next_collection_id) in data:
            collection = prev_collection.get_collection_by_id(collection_id)
            next_collection = collection.get_collection_by_id(next_collection_id) if next_collection_id else None

            path += f'[{collection.table_name}_{prop}_{next_collection.table_name}]' if next_collection \
                else f'[{collection.table_name}_{prop}]'

            self._prop_path.append({
                'from_collection': prev_collection,
                'to_collection': collection,
                'alias': hash_string_min(path),
                'property': column_name_hash(prop)
            })

            prev_collection = collection

    return self._prop_path
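# Hedged assumption about the shape of self._data, inferred from the indexing above:
# it alternates property labels and collection ids along the path and ends with the
# final property, which this method does not consume itself. An illustrative value
# (names are made up) and the resulting path entries:
#
#   self._data = ['hasPerson', 'persons_collection_id', 'birthPlace']
#
#   # yields one intermediate step: from the starting collection, via 'hasPerson',
#   # to the collection with id 'persons_collection_id'; its alias is the
#   # hash_string_min() of the accumulated path string.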
def hash(self):
    if not self._hash:
        self._hash = hash_string_min(
            (self.resource_label, self.prop_label, self._extend, self.is_list, self._transformers))

    return self._hash
def hash(self):
    if self.prop_normalized:
        return hash_string_min((self.prop_original.hash, self.prop_normalized.hash))

    return self.prop_original.hash
def alias(self):
    return hash_string_min(self.table_name)
def config_hash(self):
    return hash_string_min(self._data)
def target_intermediates_hash(self):
    return hash_string_min(
        (self.config_hash, sorted([prop.hash for prop in self.target_intermediates_props])))
def sources_hash(self):
    return hash_string_min(
        (self.config_hash, sorted([prop.hash for prop in self.sources_props])))
def hash(self):
    return hash_string_min((self.property_field.hash, self.function_name, self.parameters))
def alias(self):
    return hash_string_min(self.id)
def get_extended_property_alias(resource, prop):
    return hash_string_min(resource + '.' + prop) + '_extended'
def _matching_methods_sql(ets_id, matching_method, properties, is_source,
                          joins, matching_fields, ets_index):
    target = 'target' + str(ets_index)

    field_name_org = matching_method.field_name
    field_name_norm = field_name_org + '_norm'

    props_org = [prop.prop_original for prop in properties]
    props_norm = [prop.prop_normalized for prop in properties if prop.prop_normalized]

    # In case of list matching, combine all values into a field
    if matching_method.is_list_match:
        field_norm = sql.SQL('')
        if props_norm:
            field_norm = sql.SQL(cleandoc('''
                , ARRAY(
                    SELECT {field_name_norm}
                    FROM unnest({fields_org}, {fields_norm}) AS x ({field_name_org}, {field_name_norm})
                    WHERE {field_name_org} IS NOT NULL
                    GROUP BY {field_name_org}, {field_name_norm}
                ) AS {field_name_norm}
            ''')).format(
                field_name_org=sql.Identifier(field_name_org),
                field_name_norm=sql.Identifier(field_name_norm),
                fields_org=sql.SQL(' || ').join(
                    [sql.SQL('array_agg({})').format(sql.Identifier(prop.hash)) for prop in props_org]),
                fields_norm=sql.SQL(' || ').join(
                    [sql.SQL('array_agg({})').format(sql.Identifier(prop.hash)) for prop in props_norm]),
            )

        joins.append(
            sql.SQL(cleandoc('''
                LEFT JOIN (
                    SELECT uri, ARRAY(
                        SELECT {field_name_org}
                        FROM unnest({fields_org}) AS {field_name_org}
                        WHERE {field_name_org} IS NOT NULL
                        GROUP BY {field_name_org}
                    ) AS {field_name_org}
                    {field_norm}
                    FROM {res}
                    GROUP BY uri
                ) AS {target} ON target.uri = {target}.uri
            ''')).format(
                fields_org=sql.SQL(' || ').join(
                    [sql.SQL('array_agg({})').format(sql.Identifier(prop.hash)) for prop in props_org]),
                field_name_org=sql.Identifier(field_name_org),
                field_norm=field_norm,
                res=sql.Identifier(hash_string_min(ets_id)),
                target=sql.Identifier(target)))

    # Otherwise combine all values into a new field to use as a join
    else:
        if len(props_org) == 1:
            field_template = '{field_org} AS {field_name_org}'
            if props_norm:
                field_template += ', {field_norm} AS {field_name_norm}'

            fields_sql = sql.SQL(field_template).format(
                field_org=sql.Identifier(props_org[0].hash),
                field_name_org=sql.Identifier(field_name_org),
                field_norm=sql.Identifier(props_norm[0].hash) if props_norm else sql.SQL(''),
                field_name_norm=sql.Identifier(field_name_norm) if props_norm else sql.SQL(''),
            )

            lateral_sql = sql.SQL('')
        else:
            field_template = '{field_name_org}' if not props_norm else '{field_name_org}, {field_name_norm}'
            fields_sql = sql.SQL(field_template).format(
                field_name_org=sql.Identifier(field_name_org),
                field_name_norm=sql.Identifier(field_name_norm) if props_norm else sql.SQL(''),
            )

            join_template = ', LATERAL unnest(ARRAY[{fields_org}]) AS {field_name_org}' if not props_norm else \
                ', LATERAL unnest(ARRAY[{fields_org}], ARRAY[{fields_norm}]) AS x ({field_name_org}, {field_name_norm})'

            lateral_sql = sql.SQL(join_template).format(
                fields_org=sql.SQL(', ').join([sql.Identifier(prop.hash) for prop in props_org]),
                field_name_org=sql.Identifier(field_name_org),
                fields_norm=sql.SQL(', ').join(
                    [sql.Identifier(prop.hash) for prop in props_norm]) if props_norm else sql.SQL(''),
                field_name_norm=sql.Identifier(field_name_norm) if props_norm else sql.SQL(''),
            )

        joins.append(
            sql.SQL(cleandoc('''
                LEFT JOIN (
                    SELECT DISTINCT uri, {fields}
                    FROM {res}{lateral}
                ) AS {target}
                ON target.uri = {target}.uri AND {target}.{field_name_org} IS NOT NULL
            ''')).format(
                fields=fields_sql,
                res=sql.Identifier(hash_string_min(ets_id)),
                lateral=lateral_sql,
                target=sql.Identifier(target),
                field_name_org=sql.Identifier(field_name_org),
            ))

    # Now that we have determined the target fields, add them to the list of matching fields
    matching_fields.append(
        sql.SQL('{target}.{field} AS {field}').format(
            target=sql.Identifier(target),
            field=sql.Identifier(field_name_org)))

    if props_norm:
        matching_fields.append(
            sql.SQL('{target}.{field} AS {field}').format(
                target=sql.Identifier(target),
                field=sql.Identifier(field_name_norm)))

    # Add properties to do the intermediate dataset matching
    if matching_method.is_intermediate:
        for intermediate_ets, intermediate_ets_props in matching_method.intermediates.items():
            intermediate_res = hash_string_min(intermediate_ets)
            intermediate_target = 'intermediate' + str(ets_index)
            intermediate_fields = intermediate_ets_props['source' if is_source else 'target']

            intermediate_match_sqls = [
                sql.SQL('{target}.{field_name} = {intermediate_target}.{intermediate_field}').format(
                    target=sql.Identifier(target),
                    field_name=sql.Identifier(field_name_org),
                    intermediate_target=sql.Identifier(intermediate_target),
                    intermediate_field=sql.Identifier(intermediate_field.prop_original.hash))
                for intermediate_field in intermediate_fields
            ]

            joins.append(
                sql.SQL(cleandoc('''
                    LEFT JOIN {intermediate_res} AS {intermediate_target}
                    ON {match_sqls}
                ''')).format(
                    intermediate_res=sql.Identifier(intermediate_res),
                    intermediate_target=sql.Identifier(intermediate_target),
                    match_sqls=sql.SQL(' OR ').join(intermediate_match_sqls)))

            matching_fields.append(
                sql.SQL('{join_name}.uri AS {field_name}').format(
                    join_name=sql.Identifier(intermediate_target),
                    field_name=sql.Identifier(field_name_org + '_intermediate')))
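# Hedged illustration of the non-list, single-property case above. The identifiers are
# made-up stand-ins for the real property hashes and the hashed entity-type selection id,
# and 'name' stands for matching_method.field_name:
#
#   LEFT JOIN (
#       SELECT DISTINCT uri, "p_abc123" AS name
#       FROM "ets_hash"
#   ) AS target0
#   ON target.uri = target0.uri AND target0.name IS NOT NULL
#
# with the corresponding entry added to matching_fields:
#
#   target0.name AS name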
def extended_prop_label(self):
    return hash_string_min(self.resource_label + '.' + self.prop_label) + '_extended'
def hash(self):
    return hash_string_min((self.graphql_endpoint, self.dataset_id, self.collection_id))