def hash(self):
    """Return a compact hash built from the collection hash and the filters.

    Two callbacks are handed to ``with_filters_recursive``: the first hashes a
    condition from its ``children`` and ``type`` entries, the second returns
    the hash of a filter's ``filter_function``.
    """
    def hash_condition(condition):
        return hash_string_min((condition['children'], condition['type']))

    def hash_filter(filter_func):
        return filter_func['filter_function'].hash

    collection_hash = self.collection.hash
    filters_hash = self.with_filters_recursive(hash_condition, hash_filter)
    return hash_string_min((collection_hash, filters_hash))
Пример #2
0
    def _get_combined_entity_type_selections_sql(properties, is_source):
        """Build a single query that unions the matching fields of all entity-type selections.

        :param properties: mapping of entity-type selection id to a mapping whose
            values are dicts holding a ``matching_method`` and its ``properties``
        :param is_source: forwarded to ``MatchingSql._matching_methods_sql``
        :return: the per-selection SELECT statements joined with ``UNION ALL``
        """
        selection_template = sql.SQL(
            cleandoc(""" 
                    SELECT {collection} AS collection, target.uri, 
                           {matching_fields}
                    FROM (SELECT DISTINCT uri FROM {res}) AS target {joins}
               """))

        selection_sqls = []
        for ets_id, ets_properties in properties.items():
            joins = []
            matching_fields = []

            # Every matching function of this selection contributes its own joins and fields
            for ets_index, (_label, method_info) in enumerate(ets_properties.items()):
                MatchingSql._matching_methods_sql(
                    ets_id,
                    method_info['matching_method'],
                    method_info['properties'],
                    is_source,
                    joins,
                    matching_fields,
                    ets_index,
                )

            selection_sqls.append(
                selection_template.format(
                    collection=sql.Literal(int(ets_id)),
                    matching_fields=sql.SQL(',\n       ').join(matching_fields),
                    res=sql.Identifier(hash_string_min(ets_id)),
                    joins=get_sql_empty(sql.SQL('\n').join(joins)),
                ))

        return sql.SQL('\nUNION ALL\n').join(selection_sqls)
Пример #3
0
    def __init__(self, data, entity_type_selection=None, collection=None, transformers=None):
        """Initialise the property field.

        :param data: the property data; anything that is not a list is wrapped in one
        :param entity_type_selection: optional object providing ``collection`` and ``alias``
        :param collection: collection to use when no entity-type selection is given
        :param transformers: optional transformers; defaults to an empty list
        :raises Exception: when no collection could be determined
        """
        if isinstance(data, list):
            self._data = data
        else:
            self._data = [data]

        if entity_type_selection:
            self._collection = entity_type_selection.collection
        else:
            self._collection = collection

        if not self._collection:
            raise Exception('Property field should have either an entity-type selection or a collection')

        self._transformers = transformers or []

        # Without an entity-type selection the alias is derived from the table name
        if entity_type_selection:
            self._alias = entity_type_selection.alias
        else:
            self._alias = hash_string_min(self._collection.table_name)

        self._extend = True
        self._hash = None
        self._prop_path = None
    def _get_field_transformers(self, normalized=False):
        """Return the ordered list of transformers to apply to this field.

        Every configured transformer is enriched with its ``sql_template``; a
        ``stopwords`` transformer additionally receives a ``key`` parameter hashed
        from its ``dictionary`` and ``additional`` parameters. Depending on the
        field type, a lowercase step is prepended (no type) or a numeric/date
        conversion step is appended.

        The configured dicts in ``self._transformers`` are copied before being
        enriched, so the stored configuration is left untouched by this call.

        :param normalized: when true, append the normalisation step as the last transformer
        :return: list of transformer dicts carrying ``sql_template`` and ``parameters``
        :raises NameError: if a configured transformer is unknown to ``get_transformers()``
        """
        transformers_info = get_transformers()

        field_transformers = []
        for configured in self._transformers:
            name = configured['name']
            if name not in transformers_info:
                raise NameError('Transformer %s is not defined' % name)

            # A list copy alone would still share these dicts with the configuration,
            # so copy each dict before adding derived entries to it
            transformer = dict(configured)
            transformer['sql_template'] = transformers_info[name]['sql_template']

            if name == 'stopwords':
                parameters = dict(transformer['parameters'])
                parameters['key'] = hash_string_min(
                    (parameters['dictionary'], parameters['additional']))
                transformer['parameters'] = parameters

            field_transformers.append(transformer)

        field_type = self._field_type_info['type']

        # Fields without an explicit type are lowercased before anything else
        if not field_type:
            field_transformers.insert(0, {
                'sql_template': transformers_info['lowercase']['sql_template'],
                'parameters': {}
            })

        # Typed fields get their conversion after the configured transformers
        if field_type == 'number':
            field_transformers.append({
                'sql_template': transformers_info['to_numeric_immutable']['sql_template'],
                'parameters': {}
            })
        elif field_type == 'date':
            field_transformers.append({
                'sql_template': transformers_info['to_date_immutable']['sql_template'],
                'parameters': {
                    'format': self._field_type_info['parameters']['format']
                }
            })

        if normalized:
            field_transformers.append({
                'sql_template': self._norm_template,
                'parameters': self._norm_properties
            })

        return field_transformers
    def prepare_sql(self):
        """Build the ``init_dictionary`` statements required by the stopwords transformers.

        :return: one statement per ``stopwords`` transformer, joined by newlines,
            or ``None`` when there is no such transformer
        """
        init_template = sql.SQL('SELECT init_dictionary({key}, {dictionary}, {additional});')

        statements = []
        for transformer in self._transformers:
            if transformer['name'] != 'stopwords':
                continue

            parameters = transformer['parameters']
            dictionary = parameters['dictionary']
            additional_words = parameters['additional']

            statements.append(
                init_template.format(
                    key=sql.Literal(hash_string_min((dictionary, additional_words))),
                    dictionary=sql.Literal(dictionary),
                    additional=sql.SQL('ARRAY[{}]::text[]').format(
                        sql.SQL(', ').join(
                            sql.Literal(word) for word in additional_words)),
                ))

        if not statements:
            return None

        return sql.SQL('\n').join(statements)
Пример #6
0
    def _intermediate_property_path(self):
        """Return the list of hops described by ``self._data``, building it when not yet available.

        ``self._data`` is read at even offsets below ``len(self._data) - 2``: the
        item at the offset is the property, the next item a collection id and,
        when present, the item three positions ahead the following collection id.
        Each hop records the collection it starts from, the collection it reaches,
        an alias hashed from the path accumulated so far and the hashed column
        name of the property.
        """
        if self._prop_path:
            return self._prop_path

        self._prop_path = []
        path = self._collection.table_name
        prev_collection = self._collection

        data_length = len(self._data)
        for offset in range(0, data_length - 2, 2):
            prop = self._data[offset]
            collection_id = self._data[offset + 1]
            next_collection_id = self._data[offset + 3] if offset + 3 < data_length else None

            collection = prev_collection.get_collection_by_id(collection_id)

            next_collection = None
            if next_collection_id:
                next_collection = collection.get_collection_by_id(next_collection_id)

            # The path segment also names the following collection when there is one
            if next_collection:
                segment = f'[{collection.table_name}_{prop}_{next_collection.table_name}]'
            else:
                segment = f'[{collection.table_name}_{prop}]'
            path = path + segment

            hop = {
                'from_collection': prev_collection,
                'to_collection': collection,
                'alias': hash_string_min(path),
                'property': column_name_hash(prop)
            }
            self._prop_path.append(hop)

            prev_collection = collection

        return self._prop_path
Пример #7
0
    def hash(self):
        """Return the hash of this property, computing and storing it on first use.

        The hash covers the resource label, the property label, the extend flag,
        the list flag and the transformers.
        """
        if self._hash:
            return self._hash

        identity = (self.resource_label, self.prop_label, self._extend, self.is_list, self._transformers)
        self._hash = hash_string_min(identity)
        return self._hash
    def hash(self):
        """Return the hash of the original property, combined with the normalized one when present."""
        if not self.prop_normalized:
            return self.prop_original.hash

        combined = (self.prop_original.hash, self.prop_normalized.hash)
        return hash_string_min(combined)
Пример #9
0
 def alias(self):
     """Return the alias obtained by hashing ``table_name``."""
     table_name = self.table_name
     return hash_string_min(table_name)
Пример #10
0
 def config_hash(self):
     """Return the hash of the underlying configuration data (``self._data``)."""
     config = self._data
     return hash_string_min(config)
Пример #11
0
 def target_intermediates_hash(self):
     """Return a hash of the configuration plus the sorted hashes of the target/intermediate properties."""
     config_hash = self.config_hash
     # Sorting makes the result independent of the order of the properties
     prop_hashes = sorted(prop.hash for prop in self.target_intermediates_props)
     return hash_string_min((config_hash, prop_hashes))
Пример #12
0
 def sources_hash(self):
     """Return a hash of the configuration plus the sorted hashes of the source properties."""
     config_hash = self.config_hash
     # Sorting makes the result independent of the order of the properties
     prop_hashes = sorted(prop.hash for prop in self.sources_props)
     return hash_string_min((config_hash, prop_hashes))
Пример #13
0
 def hash(self):
     """Return a hash of the property field hash, the function name and the parameters."""
     identity = (self.property_field.hash, self.function_name, self.parameters)
     return hash_string_min(identity)
 def alias(self):
     """Return the alias obtained by hashing ``id``."""
     identifier = self.id
     return hash_string_min(identifier)
Пример #15
0
 def get_extended_property_alias(resource, prop):
     """Return the hash of ``resource.prop`` followed by the ``_extended`` suffix."""
     qualified_name = resource + '.' + prop
     hashed = hash_string_min(qualified_name)
     return hashed + '_extended'
Пример #16
0
    def _matching_methods_sql(ets_id, matching_method, properties, is_source,
                              joins, matching_fields, ets_index):
        """Append the joins and select fields needed by one matching method.

        Nothing is returned; ``joins`` and ``matching_fields`` are extended in place.

        A LEFT JOIN on a subquery over the table named ``hash_string_min(ets_id)``
        is added under the alias ``'target' + str(ets_index)``. The subquery exposes
        the values of the given properties as ``matching_method.field_name`` and,
        when at least one property has a normalized variant, as ``<field_name>_norm``:

        - list match: the values are aggregated into arrays per ``uri``;
        - single property: the property column is selected directly;
        - several properties: the columns are unnested through a LATERAL join.

        When ``matching_method.is_intermediate`` is set, every entry of
        ``matching_method.intermediates`` adds a further LEFT JOIN plus a
        ``<field_name>_intermediate`` field.

        :param ets_id: entity-type selection id; its hash names the table to read from
        :param matching_method: object providing ``field_name``, ``is_list_match``,
            ``is_intermediate`` and ``intermediates``
        :param properties: objects providing ``prop_original`` and ``prop_normalized``
        :param is_source: selects the ``'source'`` or ``'target'`` intermediate properties
        :param joins: list of SQL join fragments, appended to in place
        :param matching_fields: list of SQL select fragments, appended to in place
        :param ets_index: index used to make the join aliases distinct
        """
        target = 'target' + str(ets_index)

        field_name_org = matching_method.field_name
        field_name_norm = field_name_org + '_norm'

        props_org = [prop.prop_original for prop in properties]
        # Only properties that actually have a normalized variant take part here
        props_norm = [
            prop.prop_normalized for prop in properties if prop.prop_normalized
        ]

        # In case of list matching, combine all values into a field
        if matching_method.is_list_match:
            # Stays empty when there is nothing normalized to select
            field_norm = sql.SQL('')
            if props_norm:
                field_norm = sql.SQL(
                    cleandoc('''
                     , ARRAY(
                        SELECT {field_name_norm}
                        FROM unnest({fields_org}, {fields_norm}) AS x ({field_name_org}, {field_name_norm})
                        WHERE {field_name_org} IS NOT NULL
                        GROUP BY {field_name_org}, {field_name_norm}
                    ) AS {field_name_norm}
                ''')).format(
                        field_name_org=sql.Identifier(field_name_org),
                        field_name_norm=sql.Identifier(field_name_norm),
                        fields_org=sql.SQL(' || ').join([
                            sql.SQL('array_agg({})').format(
                                sql.Identifier(prop.hash))
                            for prop in props_org
                        ]),
                        fields_norm=sql.SQL(' || ').join([
                            sql.SQL('array_agg({})').format(
                                sql.Identifier(prop.hash))
                            for prop in props_norm
                        ]),
                    )

            joins.append(
                sql.SQL(
                    cleandoc('''
                LEFT JOIN (
                    SELECT uri, ARRAY(
                        SELECT {field_name_org}
                        FROM unnest({fields_org}) AS {field_name_org}
                        WHERE {field_name_org} IS NOT NULL
                        GROUP BY {field_name_org}
                    ) AS {field_name_org} {field_norm}
                    FROM {res}
                    GROUP BY uri
                ) AS {target}
                ON target.uri = {target}.uri
            ''')).format(fields_org=sql.SQL(' || ').join([
                        sql.SQL('array_agg({})').format(
                            sql.Identifier(prop.hash)) for prop in props_org
                    ]),
                         field_name_org=sql.Identifier(field_name_org),
                         field_norm=field_norm,
                         res=sql.Identifier(hash_string_min(ets_id)),
                         target=sql.Identifier(target)))
        # Otherwise combine all values into a new field to use as a join
        else:
            # A single property can be selected as is; no unnesting is required
            if len(props_org) == 1:
                field_template = '{field_org} AS {field_name_org}'
                if props_norm:
                    field_template += ', {field_norm} AS {field_name_norm}'

                fields_sql = sql.SQL(field_template).format(
                    field_org=sql.Identifier(props_org[0].hash),
                    field_name_org=sql.Identifier(field_name_org),
                    field_norm=sql.Identifier(props_norm[0].hash)
                    if props_norm else sql.SQL(''),
                    field_name_norm=sql.Identifier(field_name_norm)
                    if props_norm else sql.SQL(''),
                )

                lateral_sql = sql.SQL('')
            # Several properties: unnest their columns so each value gets its own row
            else:
                field_template = '{field_name_org}' if not props_norm else '{field_name_org}, {field_name_norm}'

                fields_sql = sql.SQL(field_template).format(
                    field_name_org=sql.Identifier(field_name_org),
                    field_name_norm=sql.Identifier(field_name_norm)
                    if props_norm else sql.SQL(''),
                )

                join_template = ', LATERAL unnest(ARRAY[{fields_org}]) AS {field_name_org}' if not props_norm else \
                    ', LATERAL unnest(ARRAY[{fields_org}], ARRAY[{fields_norm}]) AS x ({field_name_org}, {field_name_norm})'

                lateral_sql = sql.SQL(join_template).format(
                    fields_org=sql.SQL(', ').join(
                        [sql.Identifier(prop.hash) for prop in props_org]),
                    field_name_org=sql.Identifier(field_name_org),
                    fields_norm=sql.SQL(', ').join([
                        sql.Identifier(prop.hash) for prop in props_norm
                    ]) if props_norm else sql.SQL(''),
                    field_name_norm=sql.Identifier(field_name_norm)
                    if props_norm else sql.SQL(''),
                )

            joins.append(
                sql.SQL(
                    cleandoc('''
                LEFT JOIN (
                    SELECT DISTINCT uri, {fields}
                    FROM {res}{lateral}
                ) AS {target}
                ON target.uri = {target}.uri AND {target}.{field_name_org} IS NOT NULL
            ''')).format(
                        fields=fields_sql,
                        res=sql.Identifier(hash_string_min(ets_id)),
                        lateral=lateral_sql,
                        target=sql.Identifier(target),
                        field_name_org=sql.Identifier(field_name_org),
                    ))

        # Now that we have determined the target fields, add them to the list of matching fields
        matching_fields.append(
            sql.SQL('{target}.{field} AS {field}').format(
                target=sql.Identifier(target),
                field=sql.Identifier(field_name_org)))
        if props_norm:
            matching_fields.append(
                sql.SQL('{target}.{field} AS {field}').format(
                    target=sql.Identifier(target),
                    field=sql.Identifier(field_name_norm)))

        # Add properties to do the intermediate dataset matching
        if matching_method.is_intermediate:
            # NOTE(review): the join alias and the '_intermediate' field name below do not
            # vary per entry of `intermediates`; with more than one entry the same alias
            # would be emitted repeatedly - confirm that only one entry is expected
            for intermediate_ets, intermediate_ets_props in matching_method.intermediates.items(
            ):
                intermediate_res = hash_string_min(intermediate_ets)
                intermediate_target = 'intermediate' + str(ets_index)
                intermediate_fields = intermediate_ets_props[
                    'source' if is_source else 'target']

                # One equality per intermediate property; they are OR-ed in the join condition
                intermediate_match_sqls = [
                    sql.SQL(
                        '{target}.{field_name} = {intermediate_target}.{intermediate_field}'
                    ).format(target=sql.Identifier(target),
                             field_name=sql.Identifier(field_name_org),
                             intermediate_target=sql.Identifier(
                                 intermediate_target),
                             intermediate_field=sql.Identifier(
                                 intermediate_field.prop_original.hash))
                    for intermediate_field in intermediate_fields
                ]

                joins.append(
                    sql.SQL(
                        cleandoc('''
                        LEFT JOIN {intermediate_res} AS {intermediate_target}
                        ON {match_sqls}
                    ''')).format(
                            intermediate_res=sql.Identifier(intermediate_res),
                            intermediate_target=sql.Identifier(
                                intermediate_target),
                            match_sqls=sql.SQL(' OR ').join(
                                intermediate_match_sqls)))

                matching_fields.append(
                    sql.SQL('{join_name}.uri AS {field_name}').format(
                        join_name=sql.Identifier(intermediate_target),
                        field_name=sql.Identifier(field_name_org +
                                                  '_intermediate')))
Пример #17
0
 def extended_prop_label(self):
     """Return the hash of ``resource_label.prop_label`` followed by the ``_extended`` suffix."""
     qualified_label = self.resource_label + '.' + self.prop_label
     hashed = hash_string_min(qualified_label)
     return hashed + '_extended'
Пример #18
0
 def hash(self):
     """Return a hash of the GraphQL endpoint, the dataset id and the collection id."""
     identity = (self.graphql_endpoint, self.dataset_id, self.collection_id)
     return hash_string_min(identity)