def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return
    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1

        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, records_index, records_total,
                     entities_count)

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    index.index_bulk(collection.id, entities)
    refresh_collection(collection)
def upgrade():
    bind = op.get_bind()
    meta = sa.MetaData()
    meta.bind = bind
    meta.reflect()
    entity_table = meta.tables['entity']
    collection_table = meta.tables['collection']
    q = sa.select([collection_table])
    crp = bind.execute(q)
    for collection in crp.fetchall():
        ns = Namespace(collection.foreign_id)
        q = sa.select([entity_table])
        q = q.where(entity_table.c.collection_id == collection.id)
        erp = bind.execute(q)
        while True:
            entity = erp.fetchone()
            if not entity:
                break
            proxy = model.get_proxy({
                'id': entity.id,
                'schema': entity.schema,
                'properties': entity.data
            }, cleaned=False)
            proxy.add('name', entity.name, quiet=True, cleaned=False)
            proxy = ns.apply(proxy)
            q = sa.update(entity_table)
            q = q.where(entity_table.c.id == entity.id)
            q = q.values(id=proxy.id, data=proxy.properties)
            bind.execute(q)
    op.drop_column('entity', 'foreign_id')
    op.drop_column('entity', 'name')
def upgrade(): bind = op.get_bind() meta = sa.MetaData() meta.bind = bind meta.reflect() entity_table = meta.tables["entity"] collection_table = meta.tables["collection"] q = sa.select([collection_table]) crp = bind.execute(q) for collection in crp.fetchall(): ns = Namespace(collection.foreign_id) q = sa.select([entity_table]) q = q.where(entity_table.c.collection_id == collection.id) erp = bind.execute(q) while True: entity = erp.fetchone() if not entity: break proxy = model.get_proxy( { "id": entity.id, "schema": entity.schema, "properties": entity.data }, cleaned=False, ) proxy.add("name", entity.name, quiet=True, cleaned=False) proxy = ns.apply(proxy) q = sa.update(entity_table) q = q.where(entity_table.c.id == entity.id) q = q.values(id=proxy.id, data=proxy.properties) bind.execute(q) op.drop_column("entity", "foreign_id") op.drop_column("entity", "name")
def stream_mapping(
    infile: Path, outfile: Path, mapping_yaml: Path, sign: bool = True
) -> None:
    queries: List[Tuple[str, QueryMapping]] = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, "queries", "query"):
            data.pop("database", None)
            data["csv_url"] = "/dev/null"
            query = model.make_mapping(data, key_prefix=dataset)
            queries.append((dataset, query))

    try:
        with path_writer(outfile) as outfh:
            with input_file(infile) as fh:
                for record in CSVSource.read_csv(fh):
                    for (dataset, query) in queries:
                        ns = Namespace(dataset)
                        if query.source.check_filters(record):  # type: ignore
                            entities = query.map(record)
                            for entity in entities.values():
                                if sign:
                                    entity = ns.apply(entity)
                                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
def bulk_load_query(queue, collection, query_id, query):
    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source)
    if records_total:
        queue.progress.mark_pending(records_total)
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entities_count = 0
    for idx, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            entities_count += 1
            fragment = '%s-%s' % (query_id, idx)
            writer.put(entity, fragment=fragment)

        if idx > 0 and idx % 1000 == 0:
            queue.progress.mark_finished(1000)
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, idx,
                     records_total or 'streaming', entities_count)

    writer.flush()
    aggregator.close()
    log.info("[%s] Query done (%s entities)",
             collection.foreign_id, entities_count)
def bulk_write(collection, items, merge=True):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other
    aspects of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)
        entity = model.get_proxy(item)
        entity = namespace.apply(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)
        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}
    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)
    refresh_collection(collection)
def sign(infile, outfile, signature):
    ns = Namespace(signature)
    try:
        for entity in read_entities(infile):
            signed = ns.apply(entity)
            write_object(outfile, signed)
    except BrokenPipeError:
        raise click.Abort()
def sign(infile: Path, outfile: Path, signature: Optional[str]) -> None:
    ns = Namespace(signature)
    try:
        with path_writer(outfile) as outfh:
            for entity in path_entities(infile, EntityProxy):
                signed = ns.apply(entity)
                write_entity(outfh, signed)
    except BrokenPipeError:
        raise click.Abort()
def test_apply(self): entity = { "id": "banana", "schema": "LegalEntity", "properties": {"sameAs": ["kumkwat"], "parent": ["pretzel"]}, } proxy = model.get_proxy(entity) assert proxy.id == "banana", proxy.id ns = Namespace("fruit") out = ns.apply(proxy) assert out.id == ns.sign(proxy.id), out
def sign(infile, outfile, signature):
    ns = Namespace(signature)
    try:
        while True:
            entity = read_entity(infile)
            if entity is None:
                break
            signed = ns.apply(entity)
            write_object(outfile, signed)
    except BrokenPipeError:
        raise click.Abort()
def test_apply(self):
    entity = {
        'id': 'banana',
        'schema': 'LegalEntity',
        'properties': {
            'sameAs': ['kumkwat'],
            'parent': ['pretzel']
        }
    }
    proxy = model.get_proxy(entity)
    assert proxy.id == 'banana', proxy.id
    ns = Namespace('fruit')
    out = ns.apply(proxy)
    assert out.id == ns.sign(proxy.id), out
def run_mapping(outfile, mapping_yaml, sign=True):
    config = load_mapping_file(mapping_yaml)
    try:
        for dataset, meta in config.items():
            ns = Namespace(dataset)
            for mapping in keys_values(meta, "queries", "query"):
                entities = model.map_entities(mapping, key_prefix=dataset)
                for entity in entities:
                    if sign:
                        entity = ns.apply(entity)
                    write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
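# --- Illustrative sketch (assumes the followthemoney mapping API used above;
# the dataset name, column names, and record values are hypothetical) --------
# run_mapping() feeds each query dict from the YAML file into the model's
# mapping machinery. The shape below is a minimal example of such a query;
# csv_url points at /dev/null only to satisfy source validation, mirroring
# the stream_mapping() trick above, since the record is supplied directly.
from followthemoney import model

query = {
    "csv_url": "/dev/null",  # placeholder source, never actually read here
    "entities": {
        "company": {
            "schema": "Company",
            "keys": ["company_id"],                  # hypothetical key column
            "properties": {
                "name": {"column": "company_name"},  # hypothetical data column
            },
        }
    },
}

mapping = model.make_mapping(query, key_prefix="my_dataset")
record = {"company_id": "c-1", "company_name": "ACME Inc."}
for entity in mapping.map(record).values():
    print(entity.schema.name, entity.id, entity.get("name"))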
def aggregate(infile, outfile):
    buffer = {}
    namespace = Namespace(None)
    try:
        for entity in read_entities(infile):
            entity = namespace.apply(entity)
            if entity.id in buffer:
                buffer[entity.id].merge(entity)
            else:
                buffer[entity.id] = entity
        for entity in buffer.values():
            write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def run_mapping(outfile: Path, mapping_yaml: Path, sign: bool = True) -> None:
    config = load_mapping_file(mapping_yaml)
    try:
        with path_writer(outfile) as outfh:
            for dataset, meta in config.items():
                ns = Namespace(dataset)
                for mapping in keys_values(meta, "queries", "query"):
                    entities = model.map_entities(mapping, key_prefix=dataset)
                    for entity in entities:
                        if sign:
                            entity = ns.apply(entity)
                        write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
def aggregate(infile: Path, outfile: Path) -> None:
    buffer: Dict[str, EntityProxy] = {}
    namespace = Namespace(None)
    try:
        with path_writer(outfile) as outfh:
            for entity in path_entities(infile, EntityProxy):
                entity = namespace.apply(entity)
                if entity.id in buffer:
                    buffer[entity.id].merge(entity)
                else:
                    buffer[entity.id] = entity
            for entity in buffer.values():
                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
def bulk_write(collection, iterable, job_id=None, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other
    aspects of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    stage = get_stage(collection, OP_INDEX, job_id=job_id)
    entities = []
    for item in iterable:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)
        entity = model.get_proxy(item)
        entity = namespace.apply(entity)
        if not unsafe:
            entity = remove_checksums(entity)
        entities.append(entity)
    index_entities(stage, collection, entities)
def aggregate():
    buffer = {}
    namespace = Namespace(None)
    try:
        stdin = click.get_text_stream('stdin')
        while True:
            entity = read_entity(stdin)
            if entity is None:
                break
            entity = namespace.apply(entity)
            if entity.id in buffer:
                buffer[entity.id].merge(entity)
            else:
                buffer[entity.id] = entity
        stdout = click.get_text_stream('stdout')
        for entity in buffer.values():
            write_object(stdout, entity)
    except BrokenPipeError:
        raise click.Abort()
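# --- Illustrative sketch (assumes followthemoney, as in the aggregate()
# snippets above; the entity IDs and property values are made up) ------------
# The aggregate() variants buffer fragments by ID and merge() duplicates;
# wrapping everything in Namespace(None) appears to normalize any signed IDs
# coming off the stream so fragments of the same logical entity share one key.
from followthemoney import model
from followthemoney.namespace import Namespace

null_ns = Namespace(None)
fragments = [
    model.get_proxy({"id": "acme", "schema": "Company",
                     "properties": {"name": ["ACME Inc."]}}),
    model.get_proxy({"id": "acme", "schema": "Company",
                     "properties": {"country": ["us"]}}),
]

buffer = {}
for fragment in fragments:
    fragment = null_ns.apply(fragment)
    if fragment.id in buffer:
        buffer[fragment.id].merge(fragment)
    else:
        buffer[fragment.id] = fragment

# Both fragments collapse into a single entity carrying name and country.
for entity in buffer.values():
    print(entity.id, entity.get("name"), entity.get("country"))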
def stream_mapping(infile, outfile, signature, mapping_yaml):
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, 'queries', 'query'):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append(source)

    try:
        ns = Namespace(signature)
        for record in StreamSource.read_csv(infile):
            for source in sources:
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        if signature is not None:
                            entity = ns.apply(entity)
                        write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def bulk_write(collection, items, merge=True, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other
    aspects of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)
        entity = model.get_proxy(item)
        if not unsafe:
            entity = namespace.apply(entity)
            entity = remove_checksums(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)
        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}
    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)
    refresh_collection(collection)
class Manager(object):
    """Handles the lifecycle of an ingestor. This can be subclassed to
    embed it into a larger processing framework."""

    #: Indicates that no errors or failures occurred during processing.
    STATUS_SUCCESS = u"success"
    #: Indicates the occurrence of errors during processing.
    STATUS_FAILURE = u"failure"

    MAGIC = magic.Magic(mime=True)

    def __init__(self, dataset, stage, context):
        self.dataset = dataset
        self.writer = dataset.bulk()
        self.stage = stage
        self.context = context
        self.ns = Namespace(self.context.get("namespace"))
        self.work_path = ensure_path(mkdtemp(prefix="ingestor-"))
        self.emitted = set()

    @property
    def archive(self):
        if not hasattr(settings, "_archive"):
            settings._archive = init_archive()
        return settings._archive

    def make_entity(self, schema, parent=None):
        schema = model.get(schema)
        entity = model.make_entity(schema, key_prefix=self.stage.job.dataset.name)
        self.make_child(parent, entity)
        return entity

    def make_child(self, parent, child):
        """Derive entity properties from knowledge of its parent folder."""
        if parent is not None and child is not None:
            # Folder hierarchy:
            child.add("parent", parent.id)
            child.add("ancestors", parent.get("ancestors"))
            child.add("ancestors", parent.id)
            self.apply_context(child, parent)

    def apply_context(self, entity, source):
        # Aleph-specific context data:
        entity.context = {
            "created_at": source.context.get("created_at"),
            "updated_at": source.context.get("updated_at"),
            "role_id": source.context.get("role_id"),
            "mutable": False,
        }

    def emit_entity(self, entity, fragment=None):
        entity = self.ns.apply(entity)
        # pprint(entity.to_dict())
        self.writer.put(entity.to_dict(), fragment)
        self.emitted.add(entity.id)

    def emit_text_fragment(self, entity, texts, fragment):
        texts = [t for t in ensure_list(texts) if filter_text(t)]
        if len(texts):
            doc = self.make_entity(entity.schema)
            doc.id = entity.id
            doc.add("indexText", texts)
            self.emit_entity(doc, fragment=safe_fragment(fragment))

    def auction(self, file_path, entity):
        if not entity.has("mimeType"):
            if file_path.is_dir():
                entity.add("mimeType", DirectoryIngestor.MIME_TYPE)
                return DirectoryIngestor
            entity.add("mimeType", self.MAGIC.from_file(file_path.as_posix()))

        best_score, best_cls = 0, None
        for cls in get_extensions("ingestors"):
            score = cls.match(file_path, entity)
            if score > best_score:
                best_score = score
                best_cls = cls

        if best_cls is None:
            raise ProcessingException("Format not supported")
        return best_cls

    def queue_entity(self, entity):
        log.debug("Queue: %r", entity)
        self.stage.queue(entity.to_dict(), self.context)

    def store(self, file_path, mime_type=None):
        file_path = ensure_path(file_path)
        mime_type = normalize_mimetype(mime_type)
        if file_path is not None and file_path.is_file():
            return self.archive.archive_file(file_path, mime_type=mime_type)

    def load(self, content_hash, file_name=None):
        # log.info("Local archive name: %s", file_name)
        return self.archive.load_file(content_hash,
                                      file_name=file_name,
                                      temp_path=self.work_path)

    def ingest_entity(self, entity):
        for content_hash in entity.get("contentHash", quiet=True):
            file_name = entity_filename(entity)
            file_path = self.load(content_hash, file_name=file_name)
            if file_path is None or not file_path.exists():
                continue
            self.ingest(file_path, entity)
            return
        self.finalize(entity)

    def ingest(self, file_path, entity, **kwargs):
        """Main execution step of an ingestor."""
        file_path = ensure_path(file_path)
        if file_path.is_file() and not entity.has("fileSize"):
            entity.add("fileSize", file_path.stat().st_size)

        entity.set("processingStatus", self.STATUS_FAILURE)
        try:
            ingestor_class = self.auction(file_path, entity)
            log.info("Ingestor [%r]: %s", entity, ingestor_class.__name__)
            self.delegate(ingestor_class, file_path, entity)
            entity.set("processingStatus", self.STATUS_SUCCESS)
        except ProcessingException as pexc:
            entity.set("processingError", stringify(pexc))
            log.error("[%r] Failed to process: %s", entity, pexc)
        finally:
            self.finalize(entity)

    def finalize(self, entity):
        self.emit_entity(entity)
        self.writer.flush()
        remove_directory(self.work_path)

    def delegate(self, ingestor_class, file_path, entity):
        ingestor_class(self).ingest(file_path, entity)

    def close(self):
        self.writer.flush()
        remove_directory(self.work_path)
class Analyzer(object):
    MENTIONS = {TAG_COMPANY: "Organization", TAG_PERSON: "Person"}

    def __init__(self, dataset, entity, context):
        self.dataset = dataset
        self.ns = Namespace(context.get("namespace", dataset.name))
        self.entity = model.make_entity(entity.schema)
        self.entity.id = entity.id
        self.aggregator_entities = TagAggregatorFasttext()
        self.aggregator_patterns = TagAggregator()

    def feed(self, entity):
        if not settings.ANALYZE_ENTITIES:
            return
        if not entity.schema.is_a(ANALYZABLE):
            return
        # HACK: Tables should be mapped, don't try to tag them here.
        if entity.schema.is_a("Table"):
            return

        texts = entity.get_type_values(registry.text)
        for text in text_chunks(texts):
            detect_languages(self.entity, text)
            for (prop, tag) in extract_entities(self.entity, text):
                self.aggregator_entities.add(prop, tag)
            for (prop, tag) in extract_patterns(self.entity, text):
                self.aggregator_patterns.add(prop, tag)

    def flush(self):
        writer = self.dataset.bulk()
        countries = set()
        results = list(
            chain(self.aggregator_entities.results(),
                  self.aggregator_patterns.results()))

        for (key, prop, values) in results:
            if prop.type == registry.country:
                countries.add(key)

        mention_ids = set()
        for (key, prop, values) in results:
            label = values[0]
            if prop.type == registry.name:
                label = registry.name.pick(values)

            schema = self.MENTIONS.get(prop)
            if schema is not None and self.entity.schema.is_a(DOCUMENT):
                mention = model.make_entity("Mention")
                mention.make_id("mention", self.entity.id, prop, key)
                mention_ids.add(mention.id)
                mention.add("resolved", make_entity_id(key))
                mention.add("document", self.entity.id)
                mention.add("name", values)
                mention.add("detectedSchema", schema)
                mention.add("contextCountry", countries)
                mention = self.ns.apply(mention)
                writer.put(mention)
                # pprint(mention.to_dict())

            self.entity.add(prop, label, cleaned=True, quiet=True)

        if len(results):
            log.debug(
                "Extracted %d prop values, %d mentions [%s]: %s",
                len(results),
                len(mention_ids),
                self.entity.schema.name,
                self.entity.id,
            )

        writer.put(self.entity)
        writer.flush()
        return mention_ids