def get_linkset_cte_sql(self, with_view_filters=False, apply_paging=True, apply_sorting=True, include_linkset_uris=True):
    """
    Build the 'WITH linkset AS (...)' CTE selecting links from this view's table.

    :param with_view_filters: also apply the view's per-collection filters
                              (only effective when the view actually has any).
    :param apply_paging: append the LIMIT/OFFSET clause.
    :param apply_sorting: allow ordering by similarity when a sort direction is set;
                          otherwise the CTE is ordered by sort_order only.
    :param include_linkset_uris: append a 'linkset_uris' CTE with every distinct
                                 source/target URI of the selected links.
    :return: a composed sql.SQL object (CTE only, no final SELECT).
    """
    apply_filters = bool(with_view_filters and self._view.filters_per_collection)

    filter_laterals_sql = get_sql_empty(self._filter_laterals_sql, flag=apply_filters)
    where_sql = self._links_filter.sql(
        additional_filter=self._additional_filter_sql if apply_filters else None)

    # Sort on similarity only when requested and a direction was configured.
    if apply_sorting and self._sort_desc is not None:
        direction_sql = sql.SQL('DESC') if self._sort_desc else sql.SQL('ASC')
        sort_sql = sql.SQL('ORDER BY similarity {}, sort_order ASC').format(direction_sql)
    else:
        sort_sql = sql.SQL('ORDER BY sort_order ASC')

    limit_offset_sql = get_sql_empty(
        sql.SQL(get_pagination_sql(self._limit, self._offset)), flag=apply_paging)

    include_linkset_uris_sql = get_sql_empty(sql.SQL(cleandoc('''
        , linkset_uris AS (
            SELECT DISTINCT nodes.uri
            FROM linkset, LATERAL (VALUES (linkset.source_uri), (linkset.target_uri)) AS nodes(uri)
        )
    ''')), flag=include_linkset_uris)

    return sql.SQL(cleandoc('''
        WITH linkset AS (
            SELECT source_uri, target_uri, link_order, source_collections, target_collections,
                   source_intermediates, target_intermediates, cluster_id, cluster_hash_id,
                   valid, similarity, motivation
            FROM {schema}.{view_name} AS linkset
            {filter_laterals_sql}
            {where_sql}
            {sort_sql} {limit_offset_sql}
        )
        {include_linkset_uris_sql}
    ''')).format(
        schema=sql.Identifier(self._schema),
        view_name=sql.Identifier(self._table_name),
        filter_laterals_sql=filter_laterals_sql,
        where_sql=where_sql,
        sort_sql=sort_sql,
        limit_offset_sql=limit_offset_sql,
        include_linkset_uris_sql=include_linkset_uris_sql,
    )
def get_entity_type_selection_sample_total(self, id, sql_only=False):
    """
    Count the rows matched by an entity-type selection's filters.

    :param id: identifier of the entity-type selection.
    :param sql_only: when True, return the composed query instead of executing it.
    :return: {'total': n} (or the sql.SQL object when sql_only is True);
             {'total': 0} when the selection or its data is not available.
    """
    selection = self.get_entity_type_selection_by_id(id)
    if not selection or not selection.collection.is_downloaded:
        return {'total': 0}

    # Every property used in the filters must be downloaded before we can count.
    props = selection.filter_properties
    if any(not prop.is_downloaded for prop in props):
        return {'total': 0}

    joins = Joins()
    joins.set_joins_for_props(props)

    condition_sql = selection.filters_sql
    if condition_sql:
        condition_sql = sql.SQL('WHERE {}').format(condition_sql)

    query_sql = sql.SQL(cleandoc('''
        SELECT count({resource}.uri) AS total
        FROM timbuctoo.{table_name} AS {resource}
        {joins}
        {condition}
    ''')).format(
        resource=sql.Identifier(selection.alias),
        table_name=sql.Identifier(selection.collection.table_name),
        joins=joins.sql,
        condition=get_sql_empty(condition_sql))

    return query_sql if sql_only else fetch_one(query_sql, dict=True)
def _get_combined_entity_type_selections_sql(properties, is_source):
    """
    Combine the matching-property selects of all entity-type selections
    (for either the source or the target side) into one UNION ALL query.

    :param properties: mapping of entity-type selection id to the
                       per-matching-function property descriptors it needs.
    :param is_source: True for the source side, False for the target side.
    :return: a composed sql.SQL of the UNION ALL of all per-selection selects.
    """
    union_parts = []

    for ets_id, ets_properties in properties.items():
        joins, matching_fields = [], []

        # Collect, per matching function, the joins and selected fields
        # this entity-type selection has to provide.
        for ets_index, (_, func_props) in enumerate(ets_properties.items()):
            MatchingSql._matching_methods_sql(
                ets_id, func_props['matching_method'], func_props['properties'],
                is_source, joins, matching_fields, ets_index)

        union_parts.append(sql.SQL(cleandoc("""
            SELECT {collection} AS collection, target.uri, {matching_fields}
            FROM (SELECT DISTINCT uri FROM {res}) AS target {joins}
        """)).format(
            collection=sql.Literal(int(ets_id)),
            matching_fields=sql.SQL(',\n ').join(matching_fields),
            res=sql.Identifier(hash_string_min(ets_id)),
            joins=get_sql_empty(sql.SQL('\n').join(joins)),
        ))

    return sql.SQL('\nUNION ALL\n').join(union_parts)
def generate_match_linkset_finish_sql(self):
    """
    Build the SQL that materializes the final linkset table: creates
    linksets.<name> from the 'linkset' result with a computed similarity,
    adds the key/cluster/validation columns and the supporting indexes.

    :return: a composed sql.SQL script (multiple statements).
    """
    similarity_field_sqls = MatchingMethod.get_similarity_fields_sqls(
        self._linkset.matching_methods)

    # Per-matching-method similarity thresholds, when configured.
    method_condition_sqls = [
        method.similarity_threshold_sql
        for method in self._linkset.matching_methods
        if method.similarity_threshold_sql
    ]

    # Thresholds attached to the similarity logic operations themselves.
    grouping_condition_sqls = [
        sql.SQL('{similarity} >= {threshold}').format(
            similarity=similarity_sql, threshold=sql.Literal(threshold))
        for (threshold, similarity_sql)
        in self._linkset.similarity_logic_ops_sql_per_threshold
    ]

    all_conditions = method_condition_sqls + grouping_condition_sqls
    condition_sql = get_sql_empty(
        sql.Composed([sql.SQL('WHERE '), sql.SQL(' AND ').join(all_conditions)]),
        flag=method_condition_sqls or grouping_condition_sqls)

    return sql.SQL(cleandoc("""
        DROP TABLE IF EXISTS linksets.{linkset} CASCADE;
        CREATE TABLE linksets.{linkset} AS
        SELECT linkset.*, similarity
        FROM linkset
        {sim_fields_sql}
        CROSS JOIN LATERAL coalesce({sim_logic_ops_sql}, 1) AS similarity
        {sim_condition_sql};

        ALTER TABLE linksets.{linkset}
        ADD PRIMARY KEY (source_uri, target_uri),
        ADD COLUMN cluster_id integer,
        ADD COLUMN cluster_hash_id char(15),
        ADD COLUMN valid link_validity DEFAULT 'unchecked' NOT NULL,
        ADD COLUMN motivation text;

        ALTER TABLE linksets.{linkset} ADD COLUMN sort_order serial;

        CREATE INDEX ON linksets.{linkset} USING hash (source_uri);
        CREATE INDEX ON linksets.{linkset} USING hash (target_uri);
        CREATE INDEX ON linksets.{linkset} USING hash (valid);
        CREATE INDEX ON linksets.{linkset} USING btree (cluster_id);
        CREATE INDEX ON linksets.{linkset} USING btree (similarity);
        CREATE INDEX ON linksets.{linkset} USING btree (sort_order);

        ANALYZE linksets.{linkset};
    """) + '\n').format(
        linkset=sql.Identifier(self._job.table_name(self._linkset.id)),
        sim_fields_sql=sql.SQL('\n').join(similarity_field_sqls),
        sim_logic_ops_sql=self._linkset.similarity_logic_ops_sql,
        sim_condition_sql=condition_sql)
def generate_lens_sql(self):
    """
    Build the SQL that materializes the lens table: recursively combines the
    underlying linksets/lenses and creates lenses.<name> with a computed
    similarity, filtered by any configured similarity thresholds.

    :return: a composed sql.SQL script.
    """
    def spec_select_sql(spec_id, spec_type):
        # SELECT for one leaf of the lens tree: a linkset or another lens.
        default_columns = sql.SQL('source_uri, target_uri, link_order, source_collections, target_collections, '
                                  'source_intermediates, target_intermediates, similarities, valid')

        if spec_type == 'linkset':
            return sql.SQL('SELECT {default_columns}, ARRAY[{id}] AS linksets, ARRAY[]::integer[] AS lenses '
                           'FROM linksets.{table}').format(
                default_columns=default_columns,
                id=sql.Literal(spec_id),
                table=sql.Identifier(self._job.table_name(spec_id))
            )

        return sql.SQL('SELECT {default_columns}, linksets, ARRAY[{id}] AS lenses '
                       'FROM lenses.{table}').format(
            default_columns=default_columns,
            id=sql.Literal(spec_id),
            table=sql.Identifier(self._job.table_name(spec_id))
        )

    # Fold the lens specification tree into one SQL expression.
    lens_sql = self._lens.with_lenses_recursive(
        lambda elem: self._lens_sql(elem['type'], elem['only_left'],
                                    sql.SQL('(\n{sql}\n)').format(sql=elem['left']),
                                    sql.SQL('(\n{sql}\n)').format(sql=elem['right'])),
        lambda spec: spec_select_sql(spec['id'], spec['type'])
    )

    similarity_field_sqls = MatchingMethod.get_similarity_fields_sqls(self._lens.matching_methods)

    threshold_condition_sqls = [
        sql.SQL('{similarity} >= {threshold}').format(
            similarity=similarity_sql, threshold=sql.Literal(threshold))
        for (threshold, similarity_sql) in self._lens.similarity_logic_ops_sql_per_threshold
    ]

    condition_sql = get_sql_empty(
        sql.Composed([sql.SQL('WHERE '), sql.SQL(' AND ').join(threshold_condition_sqls)]),
        flag=threshold_condition_sqls)

    return sql.SQL(cleandoc(
        """
        DROP TABLE IF EXISTS lenses.{lens} CASCADE;
        CREATE TABLE lenses.{lens} AS
        SELECT lens.*, similarity
        FROM (
        {lens_sql}
        ) AS lens
        {sim_fields_sql}
        CROSS JOIN LATERAL coalesce({sim_logic_ops_sql}, 1) AS similarity
        {sim_condition_sql};
        """
    ) + '\n').format(
        lens=sql.Identifier(self._job.table_name(self._lens.id)),
        lens_sql=lens_sql,
        sim_fields_sql=sql.SQL('\n').join(similarity_field_sqls),
        sim_logic_ops_sql=self._lens.similarity_logic_ops_sql,
        sim_condition_sql=condition_sql
    )
def get_links_generator_sql(self, with_view_properties='none', with_view_filters=False):
    """
    Build the query that yields the links of this view, optionally joined
    with the view's configured properties.

    :param with_view_properties: 'none', 'single' (one value per property)
                                 or any other value for all values.
    :param with_view_filters: pass the view's filters down into the linkset CTE.
    :return: a composed sql.SQL query.
    """
    single_value = with_view_properties == 'single'
    with_properties = bool(with_view_properties != 'none' and self._view.properties_per_collection)

    selection_sql = get_sql_empty(
        self._selection_props_sql(single_value),
        flag=with_properties, prefix=sql.SQL(', \n'), add_new_line=False)

    # Join property values for both ends of each link.
    props_joins_sql = get_sql_empty(
        self._properties_join_sql(
            sql.SQL('IN (linkset.source_uri, linkset.target_uri)'), single_value),
        flag=with_properties)

    # Aggregating the joined property values requires grouping on all link columns.
    group_by_sql = get_sql_empty(sql.SQL(
        'GROUP BY source_uri, target_uri, link_order, source_collections, target_collections, '
        'source_intermediates, target_intermediates, cluster_id, cluster_hash_id, valid, similarity, motivation'
    ), flag=with_properties, add_new_line=False)

    return sql.SQL(cleandoc('''
        {linkset_cte}

        SELECT source_uri, target_uri, link_order, source_collections, target_collections,
               source_intermediates, target_intermediates, cluster_id, cluster_hash_id,
               valid, similarity, motivation {selection_sql}
        FROM linkset
        {props_joins_sql}
        {group_by_sql}
    ''')).format(
        linkset_cte=self.get_linkset_cte_sql(with_view_filters=with_view_filters),
        selection_sql=selection_sql,
        props_joins_sql=props_joins_sql,
        group_by_sql=group_by_sql)
def get_clusters_generator_sql(self, with_view_properties='none', with_view_filters=False, include_nodes=False):
    """
    Build the query that yields the clusters of this view: per cluster its
    size, link counts per validity and (optionally) its nodes and property values.

    :param with_view_properties: 'none', 'single' (one value per property)
                                 or any other value for all values.
    :param with_view_filters: pass the view's filters down into the linkset CTE.
    :param include_nodes: also select the array of node URIs per cluster.
    :return: a composed sql.SQL query.
    """
    single_value = with_view_properties == 'single'
    with_properties = bool(with_view_properties != 'none' and self._view.properties_per_collection)

    selection_sql = get_sql_empty(self._cluster_selection_props_sql,
                                  flag=with_properties, prefix=sql.SQL(', \n'), add_new_line=False)
    if include_nodes:
        selection_sql = sql.Composed([selection_sql, sql.SQL(', all_nodes AS nodes')])

    # Property values are joined on (at most) the first 50 nodes of the cluster.
    props_joins_sql = get_sql_empty(
        self._properties_join_sql(sql.SQL('IN (nodes_limited)'),
                                  single_value=single_value, include_unnest=True),
        flag=with_properties)

    sort_sql = sql.SQL('ORDER BY cluster_id')
    if self._cluster_sort_type is not None:
        if self._cluster_sort_type in ('size_asc', 'size_desc'):
            primary = sql.SQL('ASC') if self._cluster_sort_type == 'size_asc' else sql.SQL('DESC')
            secondary = sql.SQL('DESC') if self._cluster_sort_type == 'size_asc' else sql.SQL('ASC')
            sort_sql = sql.SQL('ORDER BY size {}, cluster_id {}').format(primary, secondary)
        else:
            primary = sql.SQL('ASC') if self._cluster_sort_type == 'count_asc' else sql.SQL('DESC')
            secondary = sql.SQL('DESC') if self._cluster_sort_type == 'count_asc' else sql.SQL('ASC')
            sort_sql = sql.SQL('ORDER BY total_links {}, cluster_id {}').format(primary, secondary)

    return sql.SQL(cleandoc('''
        {linkset_cte}

        SELECT cluster_id, cluster_hash_id, size, links {selection_sql}
        FROM (
            SELECT cluster_id, cluster_hash_id, array_agg(DISTINCT nodes) AS all_nodes,
                   count(DISTINCT nodes) AS size, jsonb_object_agg(valid, valid_count) AS links,
                   sum(valid_count) AS total_links
            FROM (
                SELECT cluster_id, cluster_hash_id, array_agg(nodes.uri) AS all_nodes,
                       valid, count(valid) / 2 AS valid_count
                FROM linkset, LATERAL (VALUES (linkset.source_uri), (linkset.target_uri)) AS nodes(uri)
                GROUP BY cluster_id, cluster_hash_id, valid
            ) AS x, unnest(all_nodes) AS nodes
            GROUP BY cluster_id, cluster_hash_id
            {having_sql}
            {sort_sql} {limit_offset}
        ) AS clusters
        LEFT JOIN unnest(all_nodes[0:50]) AS nodes_limited ON true
        {props_joins_sql}
        GROUP BY cluster_id, cluster_hash_id, all_nodes, size, links, total_links
        {sort_sql}
    ''')).format(
        linkset_cte=self.get_linkset_cte_sql(
            with_view_filters=with_view_filters, apply_paging=False),
        selection_sql=selection_sql,
        having_sql=self._clusters_filter.sql(),
        limit_offset=sql.SQL(get_pagination_sql(self._limit, self._offset)),
        props_joins_sql=props_joins_sql,
        sort_sql=sort_sql,
    )
def generate_entity_type_selection_sql(self):
    """
    Build the SQL that (re)creates one materialized view per entity-type
    selection of the linkset, selecting the distinct URIs plus every
    property field required by the linkset's matching methods.

    :return: a composed sql.SQL script with all CREATE MATERIALIZED VIEW statements.
    """
    view_sqls = []

    for selection in self._linkset.all_entity_type_selections:
        # Optional random sampling and row limit around the inner select.
        random_order = '\nORDER BY RANDOM()' if selection.random else ''
        limit_sql = sql.SQL(f') AS x{random_order}\nLIMIT {selection.limit:d}') \
            if selection.limit > -1 else sql.SQL('')

        prepare_sqls = []
        matching_fields_sqls = [
            sql.SQL('{}.uri').format(sql.Identifier(selection.alias))
        ]

        matching_props = selection.get_fields(self._linkset)
        for prop in matching_props:
            if prop.prepare_sql:
                prepare_sqls.append(prop.prepare_sql)

        # Select every original and (when present) normalized property field once.
        property_fields = {mm_prop.prop_original for mm_prop in matching_props} \
            .union({mm_prop.prop_normalized for mm_prop in matching_props if mm_prop.prop_normalized})
        for property_field in property_fields:
            matching_fields_sqls.append(sql.SQL('{matching_field} AS {name}').format(
                matching_field=property_field.sql,
                name=sql.Identifier(property_field.hash)))

        joins = Joins()
        joins.set_joins_for_props(selection.properties_for_matching(self._linkset))

        filters_sql = selection.filters_sql
        if filters_sql:
            filters_sql = sql.SQL('WHERE {}').format(filters_sql)

        view_sql = sql.SQL(cleandoc(
            """
            DROP MATERIALIZED VIEW IF EXISTS {view_name} CASCADE;
            CREATE MATERIALIZED VIEW {view_name} AS
            {pre}SELECT DISTINCT {matching_fields}
            FROM timbuctoo.{table_name} AS {view_name}{joins}{wheres}{limit};

            ANALYZE {view_name};
            """) + '\n').format(
            pre=sql.SQL('SELECT * FROM (') if selection.limit > -1 else sql.SQL(''),
            view_name=sql.Identifier(selection.alias),
            matching_fields=sql.SQL(',\n ').join(matching_fields_sqls),
            table_name=sql.Identifier(selection.collection.table_name),
            joins=get_sql_empty(joins.sql),
            wheres=get_sql_empty(filters_sql),
            limit=get_sql_empty(limit_sql),
        )

        # Run any preparation statements of the matching methods first.
        if prepare_sqls:
            view_sql = sql.Composed([sql.SQL('\n').join(prepare_sqls), sql.SQL('\n'), view_sql])

        view_sqls.append(view_sql)

    return sql.Composed(view_sqls)