def update_authority_kind(searcher: SolrCollection) -> None:
    """
    Set the `authority_kind` field of every object that has an authority.

    The `kind` of each organization is looked up by the organization's
    `sys_uri`. Every object with an `authority` field then gets the kinds of
    the authorities it references; authorities that are not a known
    organization with a `kind` are left out. The updates are indexed with
    `commit=False`.

    :param SolrCollection searcher: The collection to read the objects from
                                    and to send the updates to
    """
    organizations = searcher.select_all_documents(
        'sys_type:organization', ['sys_uri', 'kind'], id_field='sys_id')
    organization_types = {
        organization['sys_uri']: organization['kind']
        for organization in organizations
        if 'kind' in organization and 'sys_uri' in organization
    }

    # Each requested field is its own list entry, like every other field list
    # handed to select_all_documents in this file.
    objects_with_authority = searcher.select_all_documents(
        'authority:[* TO *]', ['sys_id', 'authority'], id_field='sys_id')
    logging.info('Found %s objects with a relation with an authority',
                 len(objects_with_authority))

    updates = [{
        'sys_id': donl_object['sys_id'],
        'authority_kind': {
            'set': [
                organization_types[authority]
                for authority in donl_object['authority']
                if authority in organization_types
            ]
        }
    } for donl_object in objects_with_authority]

    searcher.index_documents(updates, commit=False)

    logging.info('results')
    logging.info(' indexed: %s', len(updates))
def main() -> None:
    """
    Aggregate the raw search signals into the aggregated signals collection.

    Three aggregations are indexed in turn: per query, per search timestamp
    (reduced to hours) and per filter, each combined with the handler.
    Afterwards every document in the raw signals collection is deleted.
    """
    utils.setup_logger(__file__)
    logging.info('aggregate_signals.py started')

    raw_collection = SolrCollection(os.getenv('SOLR_COLLECTION_SIGNALS'))
    aggregated_collection = SolrCollection(
        os.getenv('SOLR_COLLECTION_SIGNALS_AGGREGATED'))

    signals = raw_collection.select_all_documents()
    aggregated_signals = aggregated_collection.select_all_documents()

    # Aggregation per query
    per_query = aggregate_fields(signals, ['query', 'handler'])
    aggregated_collection.index_documents(
        get_aggregations(per_query, aggregated_signals, 'query'))

    # Aggregation per hour
    hourly_signals = reduce_date_field_to_hours(signals, 'search_timestamp')
    per_hour = aggregate_fields(hourly_signals,
                                ['search_timestamp', 'handler'])
    aggregated_collection.index_documents(
        get_aggregations(per_hour, aggregated_signals, 'search_timestamp'))

    # Aggregation per filter
    filter_signals = preprocess_filters(signals)
    per_filter = aggregate_fields(filter_signals, ['filters', 'handler'])
    aggregated_collection.index_documents(
        get_aggregations(per_filter, aggregated_signals, 'filters'))

    raw_collection.delete_documents('*:*')

    logging.info('aggregate_signals.py finished')
def _collect_referenced_uris(rels: list, mapping: dict) -> dict:
    """
    Collect, per relation type, the values its documents hold in the mapped
    field.

    :param list rels: The documents that may refer to a source object
    :param dict mapping: Relation type => name of the field holding the
                         reference(s) to the source object
    :return: Relation type => set of all values found in the mapped field of
             the documents of that type
    """
    referenced = {target: set() for target in mapping}

    for relation in rels:
        target = relation.get('sys_type')
        if target not in referenced:
            continue

        try:
            value = relation[mapping[target]]
        except KeyError:
            # This document does not have the mapped field at all.
            continue

        if isinstance(value, list):
            referenced[target].update(value)
        else:
            referenced[target].add(value)

    return referenced


def update_relations(searcher: SolrCollection) -> None:
    """
    Set the `related_to` field of all objects configured in the
    `has_relations` resource.

    For every source type in the resource, `related_to` of each source object
    is set to the relation types of which at least one document refers to the
    `sys_uri` of that object through the field configured for that relation
    type. Objects nothing refers to get an empty `related_to`. The updates are
    indexed with `commit=False`.

    :param SolrCollection searcher: The collection to read the objects from
                                    and to send the updates to
    """
    has_relations = utils.load_resource('has_relations')
    updates = {}

    for relation_source, mapping in has_relations.items():
        logging.info('relations for %s', relation_source)

        sources = searcher.select_all_documents(
            fq='sys_type:{0}'.format(relation_source), id_field='sys_id')
        rels = searcher.select_all_documents(
            fl=list(set(list(mapping.values()) + ['sys_uri', 'sys_type'])),
            fq='sys_type:{0}'.format(' OR sys_type:'.join(mapping.keys())),
            id_field='sys_id')

        logging.info(' subjects: %s', len(sources))
        logging.info(' relations: %s', len(rels))

        # Index the references once, so each source needs a set lookup per
        # relation type instead of a scan over every related document.
        referenced_uris = _collect_referenced_uris(rels, mapping)

        for source in sources:
            related_to = updates.setdefault(source['sys_id'], set())

            source_uri = source.get('sys_uri')
            if source_uri is None:
                # Nothing can refer to an object without a URI.
                continue

            related_to.update(
                target for target, uris in referenced_uris.items()
                if source_uri in uris)

    logging.info('indexing relations')

    updates = [{
        'sys_id': sys_id,
        'related_to': {
            'set': list(update)
        }
    } for sys_id, update in updates.items()]

    searcher.index_documents(updates, commit=False)

    logging.info('results')
    logging.info(' indexed: %s', len(updates))
def get_theme_suggestions(search_core: SolrCollection,
                          in_context: str) -> list:
    """
    Build theme suggestions for a context, weighted by how often each theme
    occurs on the entities of that context.

    :param search_core: The search core to get theme suggestions from
    :param in_context: The context (the `sys_type` of the entities to count
                       the themes of)
    :return: The list of theme suggestions
    """
    documents = search_core.select_all_documents(
        'sys_type:"{0}"'.format(in_context), ['theme'], id_field='sys_id'
    )

    # Entities without a theme do not contribute to the counts.
    occurrences = {}
    for document in documents:
        if 'theme' in document:
            for theme in document['theme']:
                occurrences[theme] = occurrences.get(theme, 0) + 1

    synonyms_uri_nl = search_core.select_managed_synonyms('uri_nl')

    suggestions = []
    for theme, weight in occurrences.items():
        # Use the managed synonym as the suggestion text when one exists,
        # otherwise fall back on the theme value itself.
        if theme in synonyms_uri_nl:
            label = synonyms_uri_nl[theme]
        else:
            label = theme

        suggestions.append({
            'theme': label,
            'weight': weight,
            'payload': theme,
            'type': 'theme',
            'language': ['nl', 'en'],
            'in_context_of': in_context
        })

    return suggestions
def main():
    """
    Collect the relation updates for every object type configured in the
    `property_to_relation` resource and index them in the search collection
    in a single call.
    """
    utils.setup_logger(__file__)
    logging.info('update_relations_with_object_property.py -- starting')

    search_collection = SolrCollection(os.getenv('SOLR_COLLECTION_SEARCH'))
    property_to_relation = utils.load_resource('property_to_relation')

    updates = []
    for object_type, mapping in property_to_relation.items():
        uri_to_source = get_object_uri_to_source_mapping(
            search_collection, object_type, mapping['source'])

        for relation in mapping['relations']:
            # Only objects of the relation type that have the match field set
            query = 'sys_type:{0} AND {1}:[* TO *]'.format(
                relation['type'], relation['match'])
            fields = ['sys_uri', relation['match']]
            relation_objects = search_collection.select_all_documents(
                query, fields, id_field='sys_id')

            updates.extend(get_relation_updates(
                relation, mapping, uri_to_source, relation_objects))

    search_collection.index_documents(updates)

    logging.info('update_relations_with_object_property.py -- finished')
def get_object_uri_to_source_mapping(search_collection: SolrCollection,
                                     object_type: str,
                                     source_field: str) -> dict:
    """
    Map the `sys_uri` of every object of a type to its value of a field.

    Only objects that have a value for the given field are selected.

    :param SolrCollection search_collection: The collection to select the
                                             objects from
    :param str object_type: The `sys_type` of the objects to select
    :param str source_field: The field to take the values from
    :return: A dictionary with object URIs as keys and the value of
             `source_field` as values
    """
    query = 'sys_type:{0} AND {1}:[* TO *]'.format(object_type, source_field)
    objects = search_collection.select_all_documents(
        query, ['sys_uri', source_field], id_field='sys_id')

    uri_to_source = {}
    for single_object in objects:
        uri_to_source[single_object['sys_uri']] = single_object[source_field]

    return uri_to_source
def get_doc_suggestions(search_core: SolrCollection, doc_type: str,
                        mappings: dict, relation_counts: dict,
                        communities: dict, fq: str = None) -> list:
    """
    Get the suggestions of a given doc_type from the search core.

    Only documents whose `sys_uri` occurs in `relation_counts` result in a
    suggestion; that count is used as the weight of the suggestion.

    :param SolrCollection search_core: The search core to get suggestions from
    :param str doc_type: The document type to get suggestions for
    :param dict mappings: The mapping from the search core to the suggester
                          core
    :param dict relation_counts: A dictionary of the relation facet field
    :param dict communities: A dictionary with community URIs as keys and
                             community names as value
    :param str fq: An optional string used for filtering documents for which
                   suggestions are returned
    :return: The list of doc suggestions
    """
    dict_mapper = DictMapper(mappings)

    query = 'sys_type:"{0}"'.format(doc_type)
    if fq is not None:
        query = ' AND '.join((query, fq))

    fields = list(mappings.keys()) + ['sys_uri']
    documents = search_core.select_all_documents(
        query, fields, id_field='sys_id'
    )

    suggestions = []
    for document in documents:
        sys_uri = document['sys_uri']
        if sys_uri in relation_counts:
            suggestion = dict_mapper.apply_map(document)

            # Replace the community URIs by the names of the known
            # communities; unknown URIs are dropped.
            if 'relation_community' in suggestion:
                suggestion['relation_community'] = [
                    communities[community_uri]
                    for community_uri in suggestion['relation_community']
                    if community_uri in communities
                ]

            suggestion['weight'] = relation_counts[sys_uri]
            suggestion['language'] = ['nl', 'en']
            suggestion['in_context_of'] = 'self'
            suggestions.append(suggestion)

    return suggestions
def update_popularity(searcher: SolrCollection) -> None:
    """
    Set the `popularity` of every object to its count in the `relation` facet.

    Objects without a `sys_uri`, objects whose `sys_uri` does not occur in the
    facet counts and objects whose `popularity` already equals the count are
    left untouched. The updates are indexed with `commit=False`.

    :param SolrCollection searcher: The collection to read the objects from
                                    and to send the updates to
    """
    logging.info('Updating popularity')

    relation_counts = searcher.get_facet_counts('relation')
    donl_objects = searcher.select_all_documents(
        fl=['sys_uri', 'popularity'], id_field='sys_id')

    updates = []
    for donl_object in donl_objects:
        # All documents are selected here, so a document without a sys_uri
        # must be skipped instead of aborting the whole run with a KeyError.
        sys_uri = donl_object.get('sys_uri')
        if sys_uri is None or sys_uri not in relation_counts:
            continue

        popularity = relation_counts[sys_uri]
        if 'popularity' in donl_object \
                and donl_object['popularity'] == popularity:
            # Already up to date, no update needed.
            continue

        updates.append({
            'sys_id': donl_object['sys_id'],
            'popularity': {
                'set': popularity
            }
        })

    searcher.index_documents(updates, commit=False)

    logging.info('results')
    logging.info(' indexed: %s', len(updates))
def main() -> None:
    """
    Regenerate all suggestions in the suggester collection.

    The existing suggestions are deleted, after which title suggestions, user
    defined synonym suggestions, context suggestions and theme suggestions are
    indexed without committing. One final commit makes all changes visible,
    after which the Solr suggester is built.
    """
    utils.setup_logger(__file__)
    logging.info('generate_suggestions.py -- starting')

    suggest = SolrCollection(os.getenv('SOLR_COLLECTION_SUGGESTER'))
    search = SolrCollection(os.getenv('SOLR_COLLECTION_SEARCH'))

    logging.info('clearing suggestions')
    suggest.delete_documents('*:*', commit=False)

    relation_counts = search.get_facet_counts('relation')

    communities = search.select_all_documents(
        fq='sys_type:community',
        fl=['sys_uri', 'sys_name'],
        id_field='sys_id'
    )
    community_uri_to_name = {community['sys_uri']: community['sys_name']
                             for community in communities}

    suggestion_types = utils.load_resource('suggestions')

    doc_suggestions = {
        doc_type: get_doc_suggestions(
            search, doc_type, config['mapping'], relation_counts,
            community_uri_to_name)
        for doc_type, config in suggestion_types.items()
    }

    logging.info('adding title suggestions:')
    for doc_type, doc_type_suggestions in doc_suggestions.items():
        suggest.index_documents(doc_type_suggestions, commit=False)
        logging.info(' titles: %s of type %s',
                     len(doc_type_suggestions), doc_type)

    user_defined_synonym_suggestions = {
        doc_type: get_doc_suggestions(
            search, doc_type, config['user_defined_synonyms'],
            relation_counts, community_uri_to_name,
            'user_defined_synonyms:[* TO *]')
        for doc_type, config in suggestion_types.items()
        if 'user_defined_synonyms' in config
    }

    logging.info('adding user defined synonym suggestions:')
    for doc_type, doc_type_suggestions in \
            user_defined_synonym_suggestions.items():
        suggest.index_documents(doc_type_suggestions, commit=False)
        logging.info(' user defined synonyms: %s of type %s',
                     len(doc_type_suggestions), doc_type)

    # One inner dictionary per doc type that holds ALL of its relations. A
    # flat comprehension keyed on doc_type would keep only the last relation
    # of each doc type, because every relation would overwrite the previous.
    context_suggestions = {
        doc_type: {
            relation: get_suggestions(search, doc_type, relation,
                                      suggestion_types[relation]['mapping'],
                                      community_uri_to_name)
            for relation in config['relations']
        }
        for doc_type, config in suggestion_types.items()
    }

    logging.info('adding context suggestions:')
    for doc_type, relations in context_suggestions.items():
        for relation, suggestions in relations.items():
            suggest.index_documents(suggestions, commit=False)
            logging.info(' titles: %s of type %s in context of %s',
                         len(suggestions), relation, doc_type)

    logging.info('adding theme suggestions:')
    theme_suggestions = get_theme_suggestions(search, 'dataset')
    suggest.index_documents(theme_suggestions, commit=False)
    logging.info(' themes: %s in context of %s',
                 len(theme_suggestions), 'dataset')

    # Indexing an empty batch with commit=True commits everything above.
    logging.info('committing changes to index')
    suggest.index_documents([], commit=True)

    logging.info('building Solr suggester')
    suggest.build_suggestions('build_suggest')

    logging.info('generate_suggestions.py -- finished')
def get_suggestions(search_core: SolrCollection, in_context: str,
                    doc_type: str, mappings: dict, communities: dict) -> list:
    """
    Get suggestions of a given doc_type in a given context from the search
    core.

    A document of `doc_type` results in a suggestion when its `sys_uri` occurs
    in the `relation` field of at least one document of type `in_context`; the
    number of such matches is the weight of the suggestion. The given
    `mappings` dictionary is not modified.

    :param search_core: The search core to get suggestions from
    :param in_context: The context
    :param doc_type: The doc type
    :param mappings: The mappings of the given doc type
    :param dict communities: A dictionary with community URIs as keys and
                             community names as value
    :return: The list of suggestions
    """
    # For context suggestions the payload is always the sys_uri. Build the
    # adjusted mapping as a copy, so the dictionary of the caller (which may
    # be reused for other contexts) is left untouched.
    context_mappings = {key: value for key, value in mappings.items()
                        if 'payload' not in value}
    context_mappings['sys_uri'] = ['payload']
    dict_mapper = DictMapper(context_mappings)

    doc_entities = search_core.select_all_documents(
        'sys_type:"{0}" AND sys_uri:[* TO *]'.format(doc_type),
        id_field='sys_id'
    )
    context_entities = search_core.select_all_documents(
        'sys_type:"{0}" AND relation:[* TO *]'.format(in_context),
        ['relation'],
        id_field='sys_id'
    )

    counts = {}
    for doc_entity in doc_entities:
        sys_uri = doc_entity['sys_uri']
        for context_entity in context_entities:
            if sys_uri in context_entity['relation']:
                counts[sys_uri] = counts.get(sys_uri, 0) + 1

    suggestions = []
    for doc_entity in doc_entities:
        sys_uri = doc_entity['sys_uri']
        if sys_uri not in counts:
            # Not related to anything in this context: no suggestion, so do
            # not spend time on mapping the entity either.
            continue

        entity = dict_mapper.apply_map(doc_entity)

        # Replace the community URIs by the names of the known communities;
        # unknown URIs are dropped.
        if 'relation_community' in entity:
            entity['relation_community'] = [
                communities[community_uri]
                for community_uri in entity['relation_community']
                if community_uri in communities
            ]

        entity.update({
            'weight': counts[sys_uri],
            'in_context_of': in_context,
            'language': ['nl', 'en'],
            'type': [suggestion_type + '_filter'
                     for suggestion_type in entity['type']]
            if 'type' in entity else ['filter']
        })
        suggestions.append(entity)

    return suggestions
def update_reverse_relations(searcher: SolrCollection) -> None:
    """
    Ensures that the relations in the following example are mirrored:

        [{
            "sys_type": "A",
            "identifier": "Foo",
            "relation_B": "Bar"
        }, {
            "sys_type": "B",
            "identifier": "Bar"
        }]

    Afterwards the relations are set as:

        [{
            "sys_type": "A",
            "identifier": "Foo",
            "relation_B": "Bar"
        }, {
            "sys_type": "B",
            "identifier": "Bar",
            "relation_A": "Foo"
        }]

    This ensures that all relations are traversable regardless which object
    is used as a reference point.

    The configuration is read from the `relations` resource. Per configured
    pair the reverse field is removed from objects nothing refers to anymore
    and set on objects that are referred to. All updates are indexed with
    `commit=False`.

    :param SolrCollection searcher: The searcher to find and update objects
                                    with
    """
    relations = utils.load_resource('relations')

    for source_object, source_data in relations.items():
        for relation, mapping in source_data.items():
            logging.info('updating reverse relations from %s to %s',
                         source_object, relation)

            field_entities = searcher.select_all_documents(
                'sys_type:{0}'.format(source_object),
                ['sys_id', mapping['match'], mapping['to']],
                id_field='sys_id')
            # Key the objects on their match value, for direct lookups below
            field_entities = {
                entity[mapping['match']]: entity
                for entity in field_entities
            }

            relation_entities = searcher.select_all_documents(
                'sys_type:{0}'.format(relation),
                [mapping['match'], mapping['from']],
                id_field='sys_id')

            # Match value of a source object => the match values of all
            # objects that refer to it
            entities_to_relation_entities = {}
            for relation_entity in relation_entities:
                if mapping['from'] not in relation_entity:
                    continue

                for uri in relation_entity[mapping['from']]:
                    if uri in field_entities:
                        entities_to_relation_entities.setdefault(
                            uri, []).append(relation_entity[mapping['match']])

            logging.info(
                ' found %s objects of type %s with relations to'
                ' objects of type %s',
                len(entities_to_relation_entities), relation, source_object)

            # Iterate the entities themselves (not the dictionary keys), so
            # reverse relations nothing refers to anymore really get removed.
            deletes = [{
                'sys_id': field_entity['sys_id'],
                mapping['to']: {
                    'remove': field_entity[mapping['to']]
                }
            } for uri, field_entity in field_entities.items()
                if mapping['to'] in field_entity
                and uri not in entities_to_relation_entities]

            updates = [{
                'sys_id': field_entities[uri]['sys_id'],
                mapping['to']: {
                    'set': entities_to_relation_entities[uri]
                }
            } for uri in entities_to_relation_entities]

            searcher.index_documents(deletes, commit=False)
            searcher.index_documents(updates, commit=False)

            logging.info('results')
            logging.info(' deleted: %s', len(deletes))
            logging.info(' updated: %s', len(updates))