예제 #1
0
def update_authority_kind(searcher: SolrCollection) -> None:
    """
    Set the `authority_kind` field of every object that has an authority.

    The kind of each organization is looked up by its `sys_uri`. For every
    object with an `authority` field, `authority_kind` is set to the kinds of
    those authorities that are known organizations with a kind; authorities
    that cannot be resolved are left out. The updates are indexed without a
    commit.

    :param SolrCollection searcher: The searcher to find and update objects with
    """
    organization_types = {
        organization['sys_uri']: organization['kind']
        for organization in searcher.select_all_documents(
            'sys_type:organization', ['sys_uri', 'kind'], id_field='sys_id')
        if 'kind' in organization and 'sys_uri' in organization
    }

    # Each requested field is its own list entry; the fields used to be
    # passed as the single malformed entry 'sys_id, authority'.
    objects_with_authority = searcher.select_all_documents(
        'authority:[* TO *]', ['sys_id', 'authority'], id_field='sys_id')

    logging.info('Found %s objects with a relation with an authority',
                 len(objects_with_authority))

    updates = [{
        'sys_id': donl_object['sys_id'],
        'authority_kind': {
            'set': [
                organization_types[authority]
                for authority in donl_object['authority']
                if authority in organization_types
            ]
        }
    } for donl_object in objects_with_authority]

    searcher.index_documents(updates, commit=False)

    logging.info('results')
    logging.info(' indexed:         %s', len(updates))
def main() -> None:
    """
    Aggregate the raw search signals into the aggregated signals collection.

    All documents of the signals collection and of the aggregated signals
    collection are loaded. Three aggregations are then indexed into the
    aggregated collection, each grouped together with `handler`: by `query`,
    by `search_timestamp` (after `reduce_date_field_to_hours`) and by
    `filters` (after `preprocess_filters`). Finally every document in the
    signals collection is deleted.
    """
    utils.setup_logger(__file__)

    logging.info('aggregate_signals.py started')

    raw_collection = SolrCollection(os.getenv('SOLR_COLLECTION_SIGNALS'))
    aggregated_collection = SolrCollection(
        os.getenv('SOLR_COLLECTION_SIGNALS_AGGREGATED'))

    signals = raw_collection.select_all_documents()
    known_aggregations = aggregated_collection.select_all_documents()

    # The three steps run strictly in this order; the preprocessing helpers
    # receive the same `signals` object and may modify it -- TODO confirm.
    per_query = aggregate_fields(signals, ['query', 'handler'])
    aggregated_collection.index_documents(
        get_aggregations(per_query, known_aggregations, 'query'))

    hourly_signals = reduce_date_field_to_hours(signals, 'search_timestamp')
    per_hour = aggregate_fields(hourly_signals,
                                ['search_timestamp', 'handler'])
    aggregated_collection.index_documents(
        get_aggregations(per_hour, known_aggregations, 'search_timestamp'))

    filter_signals = preprocess_filters(signals)
    per_filter = aggregate_fields(filter_signals, ['filters', 'handler'])
    aggregated_collection.index_documents(
        get_aggregations(per_filter, known_aggregations, 'filters'))

    raw_collection.delete_documents('*:*')

    logging.info('aggregate_signals.py finished')
예제 #3
0
def update_relations(searcher: SolrCollection) -> None:
    """
    Set the `related_to` field of objects to the types that refer to them.

    The `has_relations` resource maps a source type to a mapping of
    `{relating type: field of the relating type holding the reference}`. A
    source object is related to a type when at least one object of that type
    holds the `sys_uri` of the source in the configured field, either as the
    single value or as one of the values of a list.

    Every source object receives an update, so objects without any relation
    get an empty `related_to`. The updates are indexed without a commit.

    :param SolrCollection searcher: The searcher to find and update objects with
    """
    has_relations = utils.load_resource('has_relations')
    related_types = {}

    for relation_source, mapping in has_relations.items():
        logging.info('relations for %s', relation_source)

        sources = searcher.select_all_documents(
            fq='sys_type:{0}'.format(relation_source), id_field='sys_id')
        rels = searcher.select_all_documents(
            fl=list(set(list(mapping.values()) + ['sys_uri', 'sys_type'])),
            fq='sys_type:{0}'.format(' OR sys_type:'.join(mapping.keys())),
            id_field='sys_id')

        logging.info(' subjects:        %s', len(sources))
        logging.info(' relations:       %s', len(rels))

        # Collect the referenced URIs per relating type once, so that every
        # source needs a set lookup instead of a scan over all relations.
        referenced_uris = {target: set() for target in mapping}
        for relation in rels:
            relation_type = relation['sys_type']
            if relation_type not in referenced_uris:
                continue

            reference_field = mapping[relation_type]
            if reference_field not in relation:
                continue

            referenced = relation[reference_field]
            if isinstance(referenced, list):
                referenced_uris[relation_type].update(referenced)
            else:
                referenced_uris[relation_type].add(referenced)

        for source in sources:
            # Results accumulate over all relation sources by sys_id
            related_to = related_types.setdefault(source['sys_id'], set())

            # A source without a sys_uri cannot be referenced by anything
            if 'sys_uri' not in source:
                continue

            source_uri = source['sys_uri']
            related_to.update(
                target for target, uris in referenced_uris.items()
                if source_uri in uris)

    logging.info('indexing relations')

    # Sorted so the indexed value does not depend on set iteration order
    updates = [{
        'sys_id': sys_id,
        'related_to': {
            'set': sorted(types)
        }
    } for sys_id, types in related_types.items()]

    searcher.index_documents(updates, commit=False)

    logging.info('results')
    logging.info(' indexed:         %s', len(updates))
def get_theme_suggestions(search_core: SolrCollection, in_context: str) -> list:
    """
    Build one suggestion per theme that occurs within the given context.

    The weight of a suggestion is the number of times the theme occurs in the
    `theme` field of the documents of type `in_context`. The displayed theme
    is the managed `uri_nl` synonym of the theme when there is one, otherwise
    the theme itself; the payload is always the theme as found.

    :param search_core: The search core to get theme suggestions from
    :param in_context: The context
    :return: The list of theme suggestions
    """
    documents = search_core.select_all_documents(
        'sys_type:"{0}"'.format(in_context), ['theme'], id_field='sys_id'
    )

    # Documents without a theme do not contribute to the weights
    occurrences = {}
    for themes in (doc['theme'] for doc in documents if 'theme' in doc):
        for theme in themes:
            occurrences[theme] = occurrences.get(theme, 0) + 1

    labels = search_core.select_managed_synonyms('uri_nl')

    suggestions = []
    for theme, weight in occurrences.items():
        label = labels[theme] if theme in labels else theme
        suggestions.append({
            'theme': label,
            'weight': weight,
            'payload': theme,
            'type': 'theme',
            'language': ['nl', 'en'],
            'in_context_of': in_context
        })

    return suggestions
def main():
    """
    Index the relation updates derived from the `property_to_relation`
    resource.

    For every object type in the resource, the mapping from object URI to the
    configured source field is loaded. For each configured relation the
    objects of the relation type that have the match field are selected and
    handed to `get_relation_updates`. All collected updates are indexed in a
    single call at the end.
    """
    utils.setup_logger(__file__)

    logging.info('update_relations_with_object_property.py -- starting')

    search_collection = SolrCollection(os.getenv('SOLR_COLLECTION_SEARCH'))
    property_to_relation = utils.load_resource('property_to_relation')

    pending_updates = []

    for object_type, mapping in property_to_relation.items():
        uri_to_source = get_object_uri_to_source_mapping(
            search_collection, object_type, mapping['source'])

        for relation in mapping['relations']:
            relation_type = relation['type']
            match_field = relation['match']

            # Only objects that actually have the match field are relevant
            query = 'sys_type:{0} AND {1}:[* TO *]'.format(relation_type,
                                                           match_field)
            relation_objects = search_collection.select_all_documents(
                query, ['sys_uri', match_field], id_field='sys_id')

            pending_updates.extend(
                get_relation_updates(relation, mapping, uri_to_source,
                                     relation_objects))

    search_collection.index_documents(pending_updates)

    logging.info('update_relations_with_object_property.py -- finished')
def get_object_uri_to_source_mapping(search_collection: SolrCollection,
                                     object_type: str,
                                     source_field: str) -> dict:
    """
    Map the `sys_uri` of every object of a type to its source field value.

    Only objects of type `object_type` that have a value for `source_field`
    are selected.

    :param SolrCollection search_collection: The collection to select from
    :param str object_type: The `sys_type` of the objects to select
    :param str source_field: The field whose value becomes the mapped value
    :return: A dictionary with the `sys_uri` as key and the value of
    `source_field` as value
    """
    query = 'sys_type:{0} AND {1}:[* TO *]'.format(object_type, source_field)
    documents = search_collection.select_all_documents(
        query, ['sys_uri', source_field], id_field='sys_id')

    uri_to_source = {}
    for document in documents:
        uri = document['sys_uri']
        uri_to_source[uri] = document[source_field]

    return uri_to_source
def get_doc_suggestions(search_core: SolrCollection, doc_type: str,
                        mappings: dict, relation_counts: dict,
                        communities: dict, fq: str = None) -> list:
    """
    Build the suggestions for all documents of a type in the search core.

    Documents whose `sys_uri` is not present in `relation_counts` do not get
    a suggestion. The other documents are mapped with `mappings`, their
    community URIs are replaced by the names of the known communities and
    the relation count of the document is used as weight.

    :param SolrCollection search_core: The search core to get suggestions from
    :param str doc_type: The document type to get suggestions for
    :param dict mappings: The mapping from the search core to the suggester core
    :param dict relation_counts: A dictionary of the relation facet field
    :param dict communities: A dictionary with community URIs as keys and
    community names as value
    :param str fq: An optional string used for filtering documents for which
    suggestions are returned
    :return: The list of doc suggestions
    """
    dict_mapper = DictMapper(mappings)

    query = 'sys_type:"{0}"'.format(doc_type)
    if fq is not None:
        query = query + ' AND ' + fq

    # sys_uri is always requested; it is needed to look up the weight
    requested_fields = list(mappings.keys()) + ['sys_uri']
    documents = search_core.select_all_documents(
        query,
        requested_fields,
        id_field='sys_id'
    )

    suggestions = []

    for document in documents:
        uri = document['sys_uri']

        if uri in relation_counts:
            suggestion = dict_mapper.apply_map(document)

            if 'relation_community' in suggestion:
                # Unknown community URIs are dropped
                suggestion['relation_community'] = [
                    communities[community_uri]
                    for community_uri in suggestion['relation_community']
                    if community_uri in communities
                ]

            suggestion['weight'] = relation_counts[uri]
            suggestion['language'] = ['nl', 'en']
            suggestion['in_context_of'] = 'self'

            suggestions.append(suggestion)

    return suggestions
예제 #8
0
def update_popularity(searcher: SolrCollection) -> None:
    """
    Set the `popularity` of objects to their `relation` facet count.

    An object is updated when its `sys_uri` has a facet count and its current
    `popularity` is missing or differs from that count. Objects without a
    `sys_uri` are skipped. The updates are indexed without a commit.

    :param SolrCollection searcher: The searcher to find and update objects with
    """
    logging.info('Updating popularity')
    relation_counts = searcher.get_facet_counts('relation')

    donl_objects = searcher.select_all_documents(fl=['sys_uri', 'popularity'],
                                                 id_field='sys_id')

    updates = []
    for donl_object in donl_objects:
        # All documents are selected, including those without a sys_uri;
        # those can never have a facet count.
        if 'sys_uri' not in donl_object:
            continue

        sys_uri = donl_object['sys_uri']
        if sys_uri not in relation_counts:
            continue

        count = relation_counts[sys_uri]

        # Skip objects that already hold the right value
        if 'popularity' in donl_object and donl_object['popularity'] == count:
            continue

        updates.append({
            'sys_id': donl_object['sys_id'],
            'popularity': {
                'set': count
            }
        })

    searcher.index_documents(updates, commit=False)

    logging.info('results')
    logging.info(' indexed:         %s', len(updates))
def main() -> None:
    """
    Regenerate all suggestions in the suggester collection.

    The suggester collection is cleared, after which the following
    suggestions are indexed in order: title suggestions per document type,
    user defined synonym suggestions for the document types that configure
    them, context suggestions for every relation of every document type and
    theme suggestions in the context of datasets. Nothing is committed until
    all suggestions are indexed; afterwards the suggester is built.
    """
    utils.setup_logger(__file__)

    logging.info('generate_suggestions.py -- starting')

    suggest = SolrCollection(os.getenv('SOLR_COLLECTION_SUGGESTER'))
    search = SolrCollection(os.getenv('SOLR_COLLECTION_SEARCH'))

    logging.info('clearing suggestions')
    suggest.delete_documents('*:*', commit=False)

    relation_counts = search.get_facet_counts('relation')

    communities = search.select_all_documents(
        fq='sys_type:community',
        fl=['sys_uri', 'sys_name'],
        id_field='sys_id'
    )

    community_uri_to_name = {community['sys_uri']: community['sys_name']
                             for community in communities}

    suggestion_types = utils.load_resource('suggestions')
    doc_suggestions = {doc_type: get_doc_suggestions(
        search, doc_type, config['mapping'], relation_counts,
        community_uri_to_name)
        for doc_type, config in suggestion_types.items()}

    logging.info('adding title suggestions:')

    for doc_type, doc_type_suggestions in doc_suggestions.items():
        suggest.index_documents(doc_type_suggestions, commit=False)
        logging.info(' titles: %s of type %s',
                     len(doc_type_suggestions), doc_type)

    user_defined_synonym_suggestions = {doc_type: get_doc_suggestions(
        search, doc_type, config['user_defined_synonyms'], relation_counts,
        community_uri_to_name, 'user_defined_synonyms:[* TO *]')
        for doc_type, config in suggestion_types.items()
        if 'user_defined_synonyms' in config}

    logging.info('adding user defined synonym suggestions:')

    for doc_type, doc_type_suggestions in \
            user_defined_synonym_suggestions.items():
        suggest.index_documents(doc_type_suggestions, commit=False)
        logging.info(' user defined synonyms: %s of type %s',
                     len(doc_type_suggestions), doc_type)

    # The relations are collected in an inner dictionary per document type.
    # A single comprehension keyed on doc_type with two `for` clauses would
    # overwrite the entry for every next relation, so that only the last
    # relation of each document type would be indexed.
    context_suggestions = {
        doc_type: {
            relation: get_suggestions(search, doc_type, relation,
                                      suggestion_types[relation]['mapping'],
                                      community_uri_to_name)
            for relation in config['relations']
        } for doc_type, config in suggestion_types.items()
    }

    logging.info('adding context suggestions:')

    for doc_type, relations in context_suggestions.items():
        for relation, suggestions in relations.items():
            suggest.index_documents(suggestions, commit=False)
            logging.info(' titles: %s of type %s in context of %s',
                         len(suggestions), relation, doc_type)

    logging.info('adding theme suggestions:')
    theme_suggestions = get_theme_suggestions(search, 'dataset')
    suggest.index_documents(theme_suggestions, commit=False)
    logging.info(' themes: %s in context of %s',
                 len(theme_suggestions), 'dataset')

    # Indexing an empty batch with commit=True commits everything above
    logging.info('committing changes to index')
    suggest.index_documents([], commit=True)

    logging.info('building Solr suggester')
    suggest.build_suggestions('build_suggest')

    logging.info('generate_suggestions.py -- finished')
def get_suggestions(search_core: SolrCollection,
                    in_context: str, doc_type: str,
                    mappings: dict, communities: dict) -> list:
    """
    Get suggestions of a given doc_type in a given context from the search core

    A document of type `doc_type` gets a suggestion when at least one
    document of type `in_context` holds its `sys_uri` in the `relation`
    field. The weight of the suggestion is the number of such context
    documents. The given `mappings` are not modified.

    :param search_core: The search core to get suggestions from
    :param in_context: The context
    :param doc_type: The doc type
    :param mappings: The mappings of the given context
    :param dict communities: A dictionary with community URIs as keys and
    community names as value
    :return: The list of suggestions
    """

    # For context suggestions the payload is always the sys_uri: drop any
    # other payload mapping and map sys_uri => payload. This is done on a
    # copy, as the caller passes its shared configuration.
    context_mappings = {key: value for key, value in mappings.items()
                        if 'payload' not in value}
    context_mappings['sys_uri'] = ['payload']

    dict_mapper = DictMapper(context_mappings)
    doc_entities = search_core.select_all_documents(
        'sys_type:"{0}" AND sys_uri:[* TO *]'.format(doc_type),
        id_field='sys_id'
    )
    context_entities = search_core.select_all_documents(
        'sys_type:"{0}" AND relation:[* TO *]'.format(in_context),
        ['relation'],
        id_field='sys_id'
    )

    # Count per URI how many context documents refer to it. Counting once up
    # front avoids comparing every document with every context document.
    counts = {}

    for context_entity in context_entities:
        relation = context_entity['relation']
        related_uris = relation if isinstance(relation, list) else [relation]

        # A context document counts once, even if it lists a URI twice
        for uri in set(related_uris):
            counts[uri] = counts.get(uri, 0) + 1

    suggestions = []

    for doc_entity in doc_entities:
        sys_uri = doc_entity['sys_uri']

        # Documents that are not referred to from the context are skipped
        # before any mapping work is done
        if sys_uri not in counts:
            continue

        entity = dict_mapper.apply_map(doc_entity)

        if 'relation_community' in entity:
            # Unknown community URIs are dropped
            entity['relation_community'] = [
                communities[community_uri]
                for community_uri in entity['relation_community']
                if community_uri in communities
            ]

        entity.update({
            'weight': counts[sys_uri],
            'in_context_of': in_context,
            'language': ['nl', 'en'],
            'type': [suggestion_type + '_filter'
                     for suggestion_type in entity['type']]
            if 'type' in entity else ['filter']
        })
        suggestions.append(entity)

    return suggestions
예제 #11
0
def update_reverse_relations(searcher: SolrCollection) -> None:
    """
    Ensures that the relations in the following example are mirrored:

        [{
          "sys_type": "A",
          "identifier": "Foo",
          "relation_B": "Bar"
        },
        {
         "sys_type": "B",
         "identifier": "Bar"
        }]

    Afterwards the relations are set as:

        [{
          "sys_type": "A",
          "identifier": "Foo",
          "relation_B": "Bar"
        },
        {
         "sys_type": "B",
         "identifier": "Bar",
         "relation_A": "Foo"
        }]

    This ensures that all relations are traversable regardless which object is
    used as a reference point.

    The `relations` resource configures, per source type and relation type,
    the fields to use: `match` identifies an object, `from` is the field of
    the relation type that refers to source objects and `to` is the field of
    the source type that receives the mirrored relations. Source objects that
    are no longer referred to have the values of their `to` field removed.
    All changes are indexed without a commit.

    :param SolrCollection searcher: The searcher to find and update objects with
    """
    relations = utils.load_resource('relations')

    for source_object, source_data in relations.items():
        for relation, mapping in source_data.items():
            logging.info('updating reverse relations from %s to %s',
                         source_object, relation)

            match_field = mapping['match']
            from_field = mapping['from']
            to_field = mapping['to']

            # The source objects, keyed by the value they are matched on
            field_entities = {
                entity[match_field]: entity
                for entity in searcher.select_all_documents(
                    'sys_type:{0}'.format(source_object),
                    ['sys_id', match_field, to_field],
                    id_field='sys_id')
            }

            relation_entities = searcher.select_all_documents(
                'sys_type:{0}'.format(relation),
                [match_field, from_field],
                id_field='sys_id')

            # Per source object: the relation objects that refer to it
            entities_to_relation_entities = {}

            for relation_entity in relation_entities:
                if from_field not in relation_entity:
                    continue

                for uri in relation_entity[from_field]:
                    # References to unknown source objects are ignored
                    if uri in field_entities:
                        entities_to_relation_entities.setdefault(
                            uri, []).append(relation_entity[match_field])

            logging.info(
                ' found %s objects of type %s with relations to'
                ' objects of type %s',
                len(entities_to_relation_entities), relation,
                source_object)

            # Iterate over the entities themselves: iterating the dictionary
            # yields its keys, for which the checks below never select the
            # objects whose stale relations have to be removed.
            deletes = [{
                'sys_id': field_entity['sys_id'],
                to_field: {
                    'remove': field_entity[to_field]
                }
            } for field_entity in field_entities.values()
                       if to_field in field_entity
                       and field_entity[match_field] not in
                       entities_to_relation_entities]

            updates = [{
                'sys_id': field_entities[uri]['sys_id'],
                to_field: {
                    'set': related
                }
            } for uri, related in entities_to_relation_entities.items()]

            searcher.index_documents(deletes, commit=False)
            searcher.index_documents(updates, commit=False)

            logging.info('results')
            logging.info(' deleted: %s', len(deletes))
            logging.info(' updated: %s', len(updates))