Example #1
def bulk_index(self, es_actions):
    """Send the prepared actions to ElasticSearch via the bulk helper."""
    logger.debug(es_actions)
    nr, errors = helpers.bulk(get_es(), es_actions)
    if nr > 0 and not errors:
        logger.info("Indexed records: {}".format(nr))
        return True
    elif errors:
        logger.error("Something went wrong with bulk index: {}".format(errors))
        return False
    return False
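For reference, helpers.bulk consumes an iterable of action dictionaries. The sketch below shows one plausible shape for the es_actions passed into bulk_index; the index name, id and document body are invented for illustration, and get_es() is the project's client factory used throughout these examples.

# Hypothetical shape of one bulk action; all field values below are made up.
es_actions = [
    {
        "_op_type": "index",   # default; "create", "update" and "delete" are also accepted
        "_index": "my_index",  # assumed index name
        "_id": "record-1",
        "_source": {"title": "Example record", "system": {"content_hash": "abc123"}},
    },
]
# By default helpers.bulk returns (number_of_successes, list_of_errors).
nr, errors = helpers.bulk(get_es(), es_actions)
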
Example #2
def diff_by_content_hash(self):
    """Diff the prepared actions against the index using the system.content_hash field."""
    ids = [{"_id": key[0]} for key in self.es_actions.keys()]
    mget_ids = get_es().mget(
        body={"docs": ids},
        index=settings.SITE_NAME,
        _source_include=["system.content_hash"]
    )
    index_sets = {
        (doc.get("_id"), doc["_source"].get("system", {"content_hash": None}).get("content_hash"))
        for doc in mget_ids.get("docs")
        if doc["found"]
    }
    new_records = set(self.es_actions.keys()).difference(index_sets)
    self.records_stored = len(new_records)
    self.records_already_stored = len(ids) - self.records_stored
    es_actions = [es_action for k, es_action in self.es_actions.items() if k in new_records]
    sparql_updates = [sparql_update for k, sparql_update in self.sparql_update_queries.items() if k in new_records]
    return es_actions, sparql_updates
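To make the set difference concrete: the code implies that each key of self.es_actions is a (doc_id, content_hash) tuple, so the difference against the pairs fetched via mget keeps both brand-new records and records whose content hash changed. A toy illustration with invented ids and hashes:

# Toy data; ids and hashes are invented for illustration.
local_keys = {("rec-1", "aaa"), ("rec-2", "bbb"), ("rec-3", "ccc")}
already_indexed = {("rec-1", "aaa"), ("rec-2", "old")}      # as returned by the mget lookup
new_or_changed = local_keys.difference(already_indexed)
# new_or_changed == {("rec-2", "bbb"), ("rec-3", "ccc")}: one changed hash, one new record
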
Example #3
File: tasks.py  Project: delving/nave
def reindex_dataset(ds, acceptance=False):
    """ Reindex the dataset for elasticsearch from the edm_record cache.
    :param acceptance:
    :param ds: the DataSet
    :return: records processed
    """
    ds.processed_records = 0
    ds.save()

    def process_records():
        for edm in EDMRecord.objects.filter(dataset=ds):
            yield edm.create_es_action(
                    index=settings.SITE_NAME,
                    exclude_fields=ds.excluded_index_fields.names(),
                    context=False,
                    acceptance=acceptance
            )

    response = helpers.bulk(get_es(), actions=process_records())
    return response
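The idea here is to hand helpers.bulk a generator so actions are streamed in chunks rather than materialised up front. A minimal standalone sketch of the same pattern, with an invented record shape and index name (chunk_size is an optional helpers.bulk parameter; get_es() is the project's client factory as above):

# Minimal generator + helpers.bulk sketch; record shape and index name are illustrative.
def generate_actions(records, index_name):
    for record in records:
        yield {"_index": index_name, "_id": record["id"], "_source": record["body"]}

records = [{"id": "rec-1", "body": {"title": "Example record"}}]  # invented sample data
success_count, errors = helpers.bulk(get_es(), actions=generate_actions(records, "my_index"), chunk_size=500)
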
Example #4
File: tasks.py  Project: delving/nave
def synchronise_dataset_records(store, dataset_graph_uri=None, ds=None, index=settings.SITE_NAME):
    """Iterate over all records that are out of sync for a dataset and update them in the index and database. """
    if not ds and dataset_graph_uri:
        ds = DataSet.get_dataset_from_graph(dataset_graph_uri=dataset_graph_uri, store=store)
    elif ds and not dataset_graph_uri:
        dataset_graph_uri = ds.document_uri
    elif not dataset_graph_uri and not ds:
        raise ValueError("Unable to find dataset due to missing value in dataset_graph_uri and/or ds")
    logger.info("Graph uri to synchronise: {}".format(dataset_graph_uri))
    # materialize nodes
    # ore:aggregates + remove ore:isAggregatedBy
    graph_list = get_out_of_sync_dataset_record_graph_uris(dataset_graph_uri, store, 200)
    if not ds.stay_in_sync:
        logger.warning("Not starting synchronisation for {}: the dataset is marked as not staying in sync".format(ds.spec))
        return 0
    elif ds.has_sync_error:
        logger.warning("Can't start synchronisation of {} due to a previous sync error.".format(ds.spec))
        return 0
    ds.has_sync_error = False
    ds.sync_error_message = None
    ds.records_in_sync = False
    ds.processed_records = 0
    ds.save()
    records_processed = 0
    graph_uri = None  # so the error handler below can reference it even if the loop never runs
    try:
        valid_records = ds.valid
        while len(graph_list) > 0:
            actions = []
            # todo use the graphs instead of the URIs
            for graph_uri in graph_list:
                synchronise_record(graph_uri, ds, store, actions, index=index)
            # index actions
            logger.info("number of actions scheduled: {}".format(len(actions)))
            response = helpers.bulk(client=get_es(), actions=actions, stats_only=True)
            records_processed += len(graph_list)
            logger.info("processed {}/{} for {}".format(records_processed, valid_records, ds.spec))
            logger.debug("ElasticSearch bulk update: {}".format(response))
            update_switch = [QueryType.remove_insert.format(
                named_graph=g,
                remove="?s <http://schemas.delving.eu/narthex/terms/synced> false",
                insert="?s <http://schemas.delving.eu/narthex/terms/synced> true"
            ) for g in graph_list]
            response = store.update(query="\n".join(update_switch))
            logger.debug("SPARQL update succeeded: {}".format(response))
            ds = DataSet.objects.get(id=ds.id)
            ds.processed_records = records_processed
            ds.save()
            graph_list = get_out_of_sync_dataset_record_graph_uris(dataset_graph_uri, store, 200)

        ds.process_key = None
        ds.records_in_sync = True
        ds.dataset_type = DataSetType.aggregated
        if not ds.oai_pmh.real > 0:
            ds.oai_pmh = OaiPmhPublished.none
        ds.save()
        logger.info("Finishing synchronising {} records from dataset: {}".format(records_processed, dataset_graph_uri))
    except Exception as e:
        logger.error("Unable to index all records for dataset {} due to {} at record {}.".format(ds.spec, e, graph_uri))
        ds.sync_error_message = "{} with error: {}".format(graph_uri, e)
        ds.has_sync_error = True
        ds.process_key = None
        ds.save()
        logger.warn("Only index {} of {} valid for dataset {}".format(records_processed, valid_records, ds.spec))
    return records_processed
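A note on the stats_only=True flag used in the bulk call above: in that mode helpers.bulk returns a (successes, errors) tuple where errors is a count rather than a list of per-document error dicts. A small illustration with an invented action:

# stats_only=True makes helpers.bulk report an error count instead of error details.
actions = [{"_index": "my_index", "_id": "rec-1", "_source": {"title": "Example"}}]  # illustrative
successes, error_count = helpers.bulk(client=get_es(), actions=actions, stats_only=True)
logger.debug("ElasticSearch bulk update: {} succeeded, {} failed".format(successes, error_count))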