# NOTE: the names used below (logger, settings, rdfstore, RDFRecord, DataSet,
# ParseError, datetime, ctime) are assumed to be imported at module level
# elsewhere in this file.
def _process_action(self, action):
    """Process a single synchronisation action from the Narthex bulk API."""
    try:
        self.spec = action["dataset"]
        process_verb = action["action"]
        record = None
        if process_verb in ["clear_orphans"]:
            # Remove records that have not been touched since the given date.
            purge_date = action.get("modification_date")
            if purge_date:
                orphans_removed = RDFRecord.remove_orphans(spec=self.spec, timestamp=purge_date)
                logger.info("Deleted {} orphans for {} before {}".format(
                    orphans_removed, self.spec, purge_date))
        elif process_verb in ["disable_index"]:
            RDFRecord.delete_from_index(self.spec)
            logger.info("Deleted dataset {} from index.".format(self.spec))
        elif process_verb in ["drop_dataset"]:
            RDFRecord.delete_from_index(self.spec)
            DataSet.objects.filter(spec=self.spec).delete()
            logger.info("Deleted dataset {} from index and database.".format(self.spec))
        else:
            # Default: index the record carried in the action payload.
            record_graph_uri = action["graphUri"]
            graph_ntriples = action["graph"]
            acceptance_mode = action.get("acceptanceMode", "false")
            acceptance = acceptance_mode is not None and acceptance_mode.lower() == "true"
            content_hash = action.get("contentHash", None)
            # Local import, commonly done to avoid a circular import at module load time.
            from lod.utils.resolver import ElasticSearchRDFRecord
            record = ElasticSearchRDFRecord(spec=self.spec, rdf_string=graph_ntriples)
            try:
                # Use the default format unless the payload looks like RDF/XML.
                rdf_format = record.DEFAULT_RDF_FORMAT if "<rdf:RDF" not in graph_ntriples else "xml"
                record.from_rdf_string(
                    rdf_string=graph_ntriples,
                    named_graph=record_graph_uri,
                    input_format=rdf_format
                )
            except ParseError as e:
                self.rdf_errors.append((e, action))
                logger.error("Unable to parse RDF for action %s: %s", action, e)
                return None
            self.records_stored += 1
            self.es_actions[(record.hub_id, content_hash)] = record.create_es_action(
                action=process_verb,
                store=self.store,
                context=True,
                flat=True,
                exclude_fields=None,
                acceptance=acceptance,
                doc_type="void_edmrecord",
                record_type="mdr",
                content_hash=content_hash,
            )
            if settings.RDF_STORE_TRIPLES:
                self.sparql_update_queries[(record.hub_id, content_hash)] = record.create_sparql_update_query(
                    acceptance=acceptance
                )
        return record
    except KeyError as ke:
        self.json_errors.append((ke, action))
        self.records_with_errors += 1
        return None
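# Illustrative shape of the "action" payload consumed above. The field names
# are taken from the lookups in _process_action; the values and the "index"
# verb are hypothetical examples, not part of this module:
#
#   {
#       "dataset": "my-spec",                      # dataset spec
#       "action": "index",                         # or clear_orphans / disable_index / drop_dataset
#       "graphUri": "http://example.org/graph/1",  # named graph for the record
#       "graph": "<rdf:RDF ...>...</rdf:RDF>",     # RDF/XML or N-Triples payload
#       "acceptanceMode": "false",
#       "contentHash": "abc123",
#       "modification_date": "2017-01-01T00:00:00Z",  # only used by clear_orphans
#   }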
def process_narthex_file(self, spec, store=None, acceptance=False, path=None, console=False):
    """Bulk load a Narthex processed file: index its records, optionally store
    the triples, and finally remove orphaned records for the dataset."""
    start = datetime.now()
    if not store:
        store = rdfstore.get_rdfstore()
    if not path:
        processed_fname = self.get_narthex_processed_fname()
    else:
        processed_fname = path
    print("started processing {} for dataset {}".format(processed_fname, spec))
    with open(processed_fname, 'r') as f:
        rdf_record = []
        lines = 0
        records = 0
        stored = 0          # records with the same content hash; not updated in this loop
        new = 0
        not_orphaned = []   # currently unused
        sparql_update_queries = []
        es_actions = []
        # set orphaned records: records not re-indexed below are removed as orphans at the end
        for line in f:
            lines += 1
            # A marker line closes the record accumulated so far in rdf_record.
            exists, named_graph, content_hash = self.is_line_marker(line)
            if exists:
                new += 1
                records += 1
                triples = " ".join(rdf_record)
                record = ElasticSearchRDFRecord(rdf_string=triples, spec=spec)
                try:
                    record.from_rdf_string(named_graph=named_graph, rdf_string=triples, input_format="xml")
                    es_actions.append(
                        record.create_es_action(doc_type="void_edmrecord", record_type="mdr", context=True)
                    )
                except Exception as ex:
                    message = "problem with {} for spec {} caused by {}".format(triples, spec, ex)
                    if console:
                        print(message)
                    else:
                        logger.error(message)
                rdf_record[:] = []
                if settings.RDF_STORE_TRIPLES:
                    sparql_update_queries.append(
                        record.create_sparql_update_query(acceptance=acceptance)
                    )
                # Flush the SPARQL updates to the triple store in batches of 50.
                nr_sparql_updates = len(sparql_update_queries)
                if settings.RDF_STORE_TRIPLES and nr_sparql_updates > 0 and nr_sparql_updates % 50 == 0:
                    store.update("\n".join(sparql_update_queries))
                    sparql_update_queries[:] = []
                if records % 100 == 0 and records > 0:
                    logger.info("processed {} records of {} at {}".format(records, spec, ctime()))
                    if console:
                        print("processed {} records of {} at {}".format(records, spec, ctime()))
                # Flush the Elasticsearch bulk actions in batches of 100.
                if len(es_actions) > 100:
                    self.bulk_index(es_actions, spec)
                    es_actions[:] = []
            else:
                rdf_record.append(line)
        # store the remaining bulk items
        self.bulk_index(es_actions, spec)
        if settings.RDF_STORE_TRIPLES and len(sparql_update_queries) > 0:
            store.update("\n".join(sparql_update_queries))
        logger.info(
            "Dataset {}: records inserted {}, records with same content hash {}, "
            "lines parsed {}, total records processed {}".format(spec, new, stored, lines, records)
        )
        print("Finished loading {spec}: {lines} lines and {records} records in {seconds}\n".format(
            spec=spec, lines=lines, records=records, seconds=datetime.now() - start
        ))
        # Anything not re-indexed since the start of this run is an orphan.
        RDFRecord.remove_orphans(spec, start.isoformat())
        return lines, records
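# Usage sketch (illustrative, not from the original module). Assuming these
# methods live on a loader class that defines is_line_marker(), bulk_index()
# and get_narthex_processed_fname(), an invocation might look like:
#
#   loader = NarthexBulkLoader()             # hypothetical class name
#   lines, records = loader.process_narthex_file(
#       spec="my-spec",
#       console=True,                        # echo progress to stdout
#   )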