def load_datasets_for_update(doc_stream, index):
    """Consume a stream of dataset documents and associate each to a product
    by looking up the existing dataset in the index.

    Documents without an id, or whose id is not already in the database, are
    logged and skipped. Doesn't load lineage information.

    Generates tuples in the form ``(new_dataset, existing_dataset)``.

    :param doc_stream: iterable of ``(uri, metadata_document)`` pairs
    :param index: index used to resolve existing datasets by uuid
    """
    def mk_dataset(ds, uri):
        # Returns (dataset, existing, error_message); dataset is None on failure.
        uuid = ds.id
        if uuid is None:
            # Fixed grammar of the error message ("it missing" -> "is missing").
            return None, None, "Metadata document is missing id field"

        existing = index.datasets.get(uuid)
        if existing is None:
            return None, None, "No such dataset in the database: {}".format(uuid)

        # Re-wrap the incoming document with the product of the existing
        # record; lineage sources are deliberately dropped.
        return Dataset(existing.type,
                       ds.doc_without_lineage_sources,
                       uris=[uri]), existing, None

    for uri, doc in doc_stream:
        dataset, existing, error_msg = mk_dataset(doc, uri)
        if dataset is None:
            _LOG.error("Failure while processing: %s\n > Reason: %s", uri, error_msg)
        else:
            is_consistent, reason = check_dataset_consistent(dataset)
            if is_consistent:
                yield dataset, existing
            else:
                _LOG.error("Dataset %s inconsistency: %s", dataset.id, reason)
def load_datasets(path, ds_resolve):
    """Yield resolved, consistency-checked datasets found under *path*.

    Each document discovered by ``ui_path_doc_stream`` is passed through
    ``ds_resolve``; documents that fail to resolve or fail the consistency
    check are logged and skipped.

    :param path: location to scan for dataset documents
    :param ds_resolve: callable mapping ``(doc, uri)`` to ``(dataset, error)``
    """
    for uri, doc in ui_path_doc_stream(path):
        resolved, resolve_err = ds_resolve(doc, uri)

        # Guard: resolution failure — log and move on to the next document.
        if resolved is None:
            _LOG.error('dataset is empty', error=str(resolve_err))
            continue

        ok, why = check_dataset_consistent(resolved)

        # Guard: inconsistent dataset — log and move on.
        if not ok:
            _LOG.error("dataset inconsistency",
                       dataset=resolved.id,
                       reason=str(why))
            continue

        yield resolved