Exemplo n.º 1
0
def _merge_schemata(proxy, schemata):
    """Fold every schema in ``schemata`` into ``proxy.schema``.

    Each entry is resolved with ``model.get`` and combined with the proxy's
    current schema through ``model.common_schema``. When that raises
    ``InvalidData`` the proxy is set to the ``Entity.LEGAL_ENTITY`` schema
    and processing continues with the next entry.
    """
    for name in schemata:
        try:
            candidate = model.get(name)
            proxy.schema = model.common_schema(proxy.schema, candidate)
        except InvalidData:
            fallback = model.get(Entity.LEGAL_ENTITY)
            proxy.schema = fallback
Exemplo n.º 2
0
def datagen(proxy):
    """Yield generated data for ``proxy``, dispatching on its schema.

    A "Page" is delegated entirely to ``type_datagen_page``. Anything else
    that is a "Document" yields nothing. Every other proxy goes through
    ``type_datagen`` followed by ``schema_datagen``.
    """
    schema = proxy.schema
    # The "Page" test runs first; "Document" is only consulted afterwards.
    if schema.is_a(model.get("Page")):
        yield from type_datagen_page(proxy)
    elif not schema.is_a(model.get("Document")):
        yield from type_datagen(proxy)
        yield from schema_datagen(proxy)
Exemplo n.º 3
0
    def ingest(self, file_path, entity):
        """Crawl a directory, attaching what is found to ``entity``.

        An entity whose schema equals the plain "Document" schema is re-typed
        as a "Folder" first. Nothing is crawled when ``file_path`` is None or
        is not a directory.
        """
        if entity.schema == model.get("Document"):
            entity.schema = model.get("Folder")

        is_directory = file_path is not None and file_path.is_dir()
        if is_directory:
            self.crawl(self.manager, file_path, parent=entity)
Exemplo n.º 4
0
def iter_proxies(**kw):
    """Yield an entity proxy for each record produced by ``iter_entities``.

    Only the ``schema`` and ``properties`` fields are requested; remaining
    keyword arguments are forwarded unchanged. Records whose schema is not
    known to the model are skipped. Document records that carry no
    ``properties`` key are first completed with
    ``Document.doc_data_to_schema``.
    """
    document_schema = model.get(Document.SCHEMA)
    for record in iter_entities(includes=['schema', 'properties'], **kw):
        record_schema = model.get(record.get('schema'))
        if record_schema is None:
            continue
        lacks_properties = 'properties' not in record
        if lacks_properties and record_schema.is_a(document_schema):
            record.update(Document.doc_data_to_schema(record))
        yield model.get_proxy(record)
Exemplo n.º 5
0
def enrich_xref(
    G,
    foreign_id,
    match_collection_ids=None,
    entity_schemata=None,
    match_schemata=None,
    min_score=0.5,
    skip_unknown_entities=True,
):
    """Merge cross-reference matches of a collection into the graph ``G``.

    The collection is looked up by ``foreign_id`` and its xrefs are fetched
    through ``alephclient``. Each xref that passes the filters below has its
    entity and match parsed into proxies, which are added to ``G`` and
    merged.

    Args:
        G: graph object; must support ``in``, ``add_proxy`` and
            ``merge_proxies``.
        foreign_id: foreign id of the collection whose xrefs are read.
        match_collection_ids: optional container of collection ids; xrefs
            whose match collection is not in it are skipped.
        entity_schemata: optional schema (resolved with ``model.get``) that
            the xref entity must be an instance of.
        match_schemata: optional schema (resolved with ``model.get``) that
            the xref match must be an instance of.
        min_score: processing stops at the first xref scoring below this.
        skip_unknown_entities: when true, skip xrefs whose entity id is not
            already in ``G``.

    Returns:
        The number of xrefs that passed every filter, including those whose
        merge was rejected with ``InvalidData``.
    """
    # Bind both names unconditionally so they are never left undefined.
    entity_schema = model.get(entity_schemata) if entity_schemata else None
    match_schema = model.get(match_schemata) if match_schemata else None
    collection = alephclient.get_collection_by_foreign_id(foreign_id)
    collection_id = collection["id"]
    xrefs = alephclient.get_collection_xref(collection_id, publisher=True)
    N = 0
    for xref in tqdm(xrefs):
        if xref["score"] < min_score:
            # NOTE(review): breaking here presumably relies on xrefs being
            # returned in descending score order - confirm with the client.
            log.debug(
                f"Stoping xref enrichment due to low xref score: {xref['score']} < {min_score}"
            )
            break
        match_collection_id = int(xref["match_collection"]["collection_id"])
        if match_collection_ids and match_collection_id not in match_collection_ids:
            log.debug(
                f"Collection not wanted: {match_collection_ids}: {match_collection_id}"
            )
            continue
        if skip_unknown_entities and xref["entity"]["id"] not in G:
            log.debug(f"Entity not in graph: {xref['entity']}")
            continue
        entity_proxy = parse_entity(xref["entity"])
        match_proxy = parse_entity(xref["match"])
        if entity_schemata and not entity_proxy.schema.is_a(entity_schema):
            log.debug(
                f"Entity is not the right schema: {entity_schema}: {entity_proxy.schema}"
            )
            continue
        if match_schemata and not match_proxy.schema.is_a(match_schema):
            log.debug(
                f"Match is not the right schema: {match_schema}: {match_proxy.schema}"
            )
            continue
        try:
            G.add_proxy(entity_proxy)
            G.add_proxy(match_proxy)
            G.merge_proxies(entity_proxy, match_proxy)
        except InvalidData as exc:
            # Still best-effort, but leave a trace instead of hiding it.
            log.debug(f"Could not merge xref proxies: {exc}")
        N += 1
    return N
Exemplo n.º 6
0
 def ingest(self, file_path, entity):
     """Read audio metadata from ``file_path`` into ``entity``.

     The entity is typed as "Audio" and, for every track reported by
     ``MediaInfo.parse``, the title, generator, authoring/modification
     timestamps, sampling rate (when set) and duration are added.

     Raises:
         ProcessingException: if any step fails; the original exception is
             chained as the cause.
     """
     try:
         entity.schema = model.get("Audio")
         metadata = MediaInfo.parse(file_path)
         for track in metadata.tracks:
             entity.add("title", track.title)
             entity.add("generator", track.writing_application)
             entity.add("generator", track.writing_library)
             entity.add("generator", track.publisher)
             recorded_at = self.parse_timestamp(track.recorded_date)
             entity.add("authoredAt", recorded_at)
             tagged_at = self.parse_timestamp(track.tagged_date)
             entity.add("authoredAt", tagged_at)
             encoded_at = self.parse_timestamp(track.encoded_date)
             entity.add("authoredAt", encoded_at)
             modified_at = self.parse_timestamp(
                 track.file_last_modification_date
             )
             entity.add("modifiedAt", modified_at)
             if track.sampling_rate:
                 entity.add("samplingRate", track.sampling_rate)
             entity.add("duration", track.duration)
     except Exception as ex:
         # Interpolate the cause into the message; handing it over as a
         # second argument leaves the "%r" placeholder unformatted.
         message = "Could not read audio: %r" % (ex,)
         raise ProcessingException(message) from ex
Exemplo n.º 7
0
    def ingest(self, file_path, entity):
        """Parse a CSV file and emit its rows for a "Table" entity.

        The encoding is detected from the raw bytes first. The file is then
        re-opened as text (undecodable bytes replaced), the dialect and the
        presence of a header are sniffed from the first 40960 characters,
        and the rows are passed to ``emit_row_dicts``.

        Raises:
            ProcessingException: on a decoding error or any other failure
                while sniffing or reading the CSV data.
        """
        entity.schema = model.get('Table')
        with io.open(file_path, 'rb') as raw:
            encoding = self.detect_stream_encoding(raw)
            log.debug("Detected encoding [%r]: %s", entity, encoding)

        text_options = {'encoding': encoding, 'errors': 'replace'}
        with io.open(file_path, 'r', **text_options) as text:
            try:
                sample = text.read(4096 * 10)
                text.seek(0)

                sniffer = csv.Sniffer()
                dialect = sniffer.sniff(sample)
                has_header = sniffer.has_header(sample)

                reader = csv.reader(text, dialect=dialect)
                rows = self.generate_rows(reader, has_header=has_header)
                self.emit_row_dicts(entity, rows)
            except UnicodeDecodeError as ude:
                log.warning("Encoding error: %r", entity)
                message = "Could not decode CSV (%s)" % encoding
                raise ProcessingException(message) from ude
            except Exception as err:
                log.exception("CSV error: %s", err)
                raise ProcessingException("Invalid CSV: %s" % err) from err
Exemplo n.º 8
0
    def ingest(self, file_path, entity):
        """Ingest an Outlook message file as an "Email" entity.

        Opens the file with ``Message``, copies headers, subject, thread
        topic, encoding, bodies, message ids, date and the sender/recipient
        identities onto ``entity``, then hands every attachment of type
        "data" to ``ingest_attachment``.

        Raises:
            ProcessingException: if the message file cannot be opened.
        """
        entity.schema = model.get("Email")
        try:
            message = Message(file_path.as_posix())
        except Exception as exc:
            error = "Cannot open message file: %s" % exc
            raise ProcessingException(error) from exc

        self.extract_olefileio_metadata(message, entity)

        try:
            self.extract_msg_headers(entity, message.header)
        except Exception:
            # Header parsing is best-effort: log it and carry on.
            log.exception("Cannot parse Outlook-stored headers")

        entity.add("subject", message.subject)
        entity.add("threadTopic", message.getStringField("0070"))
        entity.add("encoding", message.encoding)
        entity.add("bodyText", message.body)
        entity.add("bodyHtml", message.htmlBody)
        message_ids = self.parse_message_ids(message.message_id)
        entity.add("messageId", message_ids)

        if not entity.has("inReplyTo"):
            references = self.parse_references(message.references, [])
            entity.add("inReplyTo", references)

        try:
            sent_at = parsedate_to_datetime(message.date)
            entity.add("date", sent_at.isoformat())
        except Exception:
            log.warning("Could not parse date: %s", message.date)

        # Sender name and email.
        sender = self.get_identities(message.sender)
        self.apply_identities(entity, sender, "emitters", "sender")

        # Received by.
        received_by = self.get_identity(message.getStringField("0040"),
                                        message.getStringField("0076"))
        self.apply_identities(entity, received_by, "emitters")

        from_identities = self.get_identities(message.getStringField("1046"))
        self.apply_identities(entity, from_identities, "emitters", "from")

        # The three recipient headers share the same handling.
        for header in ("to", "cc", "bcc"):
            identities = self.get_identities(getattr(message, header))
            self.apply_identities(entity, identities, "recipients", header)

        self.resolve_message_ids(entity)
        for attachment in message.attachments:
            if attachment.type == "data":
                # Prefer the long file name, fall back to the short one.
                file_name = (stringify(attachment.longFilename)
                             or stringify(attachment.shortFilename))
                self.ingest_attachment(entity, file_name, attachment.type,
                                       attachment.data)
Exemplo n.º 9
0
 def ingest(self, file_path, entity):
     """Convert a TIFF file to PDF and extract it as a "Pages" entity.

     Runs ``tiff2pdf`` at 300x300 resolution into a work file, checks that
     the output file was produced and passes it on to
     ``pdf_alternative_extract``.
     """
     entity.schema = model.get('Pages')
     converted = self.make_work_file('tiff.pdf')
     command = ['tiff2pdf', file_path, '-x', '300', '-y', '300',
                '-o', converted]
     self.exec_command(*command)
     self.assert_outfile(converted)
     self.pdf_alternative_extract(entity, converted)
Exemplo n.º 10
0
def convert_party(party):
    """Yield entities for ``party`` and for the organizations it belongs to.

    A "LegalEntity" is built from the party dict (consuming the keys it
    uses via ``pop``) and yielded first. Every party listed under
    ``memberOf`` is converted recursively; each entity coming out of that
    recursion is re-typed as "Organization", yielded, and followed by a
    "Membership" entity linking the party to it.
    """
    entity = model.make_entity('LegalEntity')
    party_id = party.pop('id', None)
    main_identifier = party.pop('identifier', {})
    if party_id is None:
        party_id = main_identifier.get('id')
    entity.make_id(party_id)
    convert_name(entity, party)
    for address_key in ('address', 'deliveryAddress'):
        convert_address(entity, party.pop(address_key, {}))
    entity.add('legalForm', party.pop('organizationType', None))
    contact = party.pop('contactPoint', {})
    contact_fields = (
        ('website', 'url'),
        ('phone', 'telephone'),
        ('email', 'email'),
    )
    for prop_name, contact_key in contact_fields:
        entity.add(prop_name, contact.pop(contact_key, None))
    convert_identifier(entity, main_identifier)
    for extra_identifier in party.pop('additionalIdentifiers', []):
        convert_identifier(entity, extra_identifier)
    yield entity
    for parent_party in ensure_list(party.pop('memberOf', [])):
        # NOTE(review): the recursion also yields the Membership entities of
        # deeper levels, and those get re-typed and linked here as well -
        # confirm that this is intended.
        for other in convert_party(parent_party):
            other.schema = model.get('Organization')
            yield other
            membership = model.make_entity('Membership')
            membership.make_id(entity.id, other.id)
            membership.add('member', entity)
            membership.add('organization', other)
            yield membership

    party.pop('roles', None)
Exemplo n.º 11
0
def entity_tags(entity, authz):
    """Yield ``(field, value, total)`` for tags this entity shares.

    For the name, email, identifier, iban, phone and address types, every
    value of the entity with a specificity of at least 0.1 becomes a facet
    over the read index of the Thing schemata using that type. Only values
    counted more than once are yielded.
    """
    proxy = model.get_proxy(entity)
    thing_schema = model.get(Entity.THING)
    tag_types = (registry.name, registry.email, registry.identifier,
                 registry.iban, registry.phone, registry.address)
    facets = []
    # Collect one facet per sufficiently specific tag value of the entity.
    for tag_type in tag_types:
        group = tag_type.group
        if group is None:
            continue
        for position, value in enumerate(proxy.get_type_values(tag_type)):
            if tag_type.specificity(value) < 0.1:
                continue
            thing_schemata = [
                schema for schema in model.get_type_schemata(tag_type)
                if schema.is_a(thing_schema)
            ]
            index = entities_read_index(thing_schemata)
            alias = '%s_%s' % (tag_type.name, position)
            facets.append((index, alias, group, group, value))

    counts = _filters_faceted_query(authz, facets)
    for facet in facets:
        alias, field, value = facet[1], facet[2], facet[4]
        total = counts.get(alias, 0)
        if total > 1:
            yield (field, value, total)
Exemplo n.º 12
0
    def ingest(self, file_path, entity):
        """Ingest an Outlook message file as an "Email" entity.

        Opens the file with ``Message``, copies headers, subject, thread
        topic, encoding, bodies, message ids, date and the sender/recipient
        identities onto ``entity``, then hands every attachment of type
        "data" to ``ingest_attachment``.

        Raises:
            ProcessingException: if the message file cannot be opened.
        """
        entity.schema = model.get('Email')
        try:
            message = Message(file_path.as_posix())
        except Exception as exc:
            error = "Cannot open message file: %s" % exc
            raise ProcessingException(error) from exc

        self.extract_olefileio_metadata(message, entity)

        try:
            self.extract_msg_headers(entity, message.header)
        except Exception:
            # Header parsing is best-effort: log it and carry on.
            log.exception("Cannot parse Outlook-stored headers")

        entity.add('subject', message.subject)
        entity.add('threadTopic', message.getStringField('0070'))
        entity.add('encoding', message.encoding)
        entity.add('bodyText', message.body)
        entity.add('bodyHtml', message.htmlBody)
        message_ids = self.parse_message_ids(message.message_id)
        entity.add('messageId', message_ids)

        if not entity.has('inReplyTo'):
            references = self.parse_references(message.references, [])
            entity.add('inReplyTo', references)

        try:
            sent_at = parsedate_to_datetime(message.date)
            entity.add('date', sent_at.isoformat())
        except Exception:
            log.warning("Could not parse date: %s", message.date)

        # Sender name and email.
        sender = self.get_identities(message.sender)
        self.apply_identities(entity, sender, 'emitters', 'sender')

        # Received by.
        received_by = self.get_identity(message.getStringField('0040'),
                                        message.getStringField('0076'))
        self.apply_identities(entity, received_by, 'emitters')

        from_identities = self.get_identities(message.getStringField('1046'))
        self.apply_identities(entity, from_identities, 'emitters', 'from')

        # The three recipient headers share the same handling.
        for header in ('to', 'cc', 'bcc'):
            identities = self.get_identities(getattr(message, header))
            self.apply_identities(entity, identities, 'recipients', header)

        self.resolve_message_ids(entity)
        for attachment in message.attachments:
            if attachment.type == 'data':
                # Prefer the long file name, fall back to the short one.
                file_name = (stringify(attachment.longFilename)
                             or stringify(attachment.shortFilename))
                self.ingest_attachment(entity, file_name, attachment.type,
                                       attachment.data)
Exemplo n.º 13
0
 def _serialize(self, obj):
     """Expand a diagram object into its API representation.

     Stringifies the id, replaces ``collection_id`` with the resolved
     collection and a ``writeable`` flag, swaps the entity ids for the
     resolved entities (unresolvable ones are dropped) and resolves the
     non-empty entity-typed properties of those entities in turn.
     """
     obj['id'] = str(obj.get('id'))
     collection_id = obj.pop('collection_id', None)
     authz = request.authz
     obj['writeable'] = authz.can(collection_id, authz.WRITE)
     obj['collection'] = self.resolve(
         Collection, collection_id, CollectionSerializer
     )
     entity_ids = obj.pop('entities')
     obj['entities'] = []
     for entity_id in entity_ids:
         resolved = self.resolve(Entity, entity_id, DiagramEntitySerializer)
         if resolved is None:
             continue
         obj['entities'].append(resolved)
     for ent in obj['entities']:
         schema = model.get(ent.get('schema'))
         properties = ent.get('properties', {})
         for prop in schema.properties.values():
             if prop.type != registry.entity:
                 continue
             references = ensure_list(properties.get(prop.name))
             if not references:
                 continue
             # Unlike the top-level list, unresolved references are kept
             # here as whatever ``resolve`` returned.
             properties[prop.name] = [
                 self.resolve(Entity, reference, DiagramEntitySerializer)
                 for reference in references
             ]
     return self._clean_response(obj)
Exemplo n.º 14
0
    def add_cast(
        self,
        schema: Union[str, Schema],
        prop: Union[str, Property],
        values: Any,
        cleaned: bool = False,
        fuzzy: bool = False,
        format: Optional[str] = None,
    ):
        """Add values for a property, widening the entity's schema if needed.

        When the entity's current schema already has ``prop`` this simply
        delegates to ``add``. Otherwise the property is looked up on the
        given ``schema``; every value that survives cleaning triggers
        ``add_schema(schema)`` and is then stored with ``unsafe_add`` (e.g.
        something with a birthDate is assumed to be a Person rather than a
        LegalEntity).

        Raises:
            RuntimeError: if ``schema`` is unknown to the model or does not
                define ``prop``.
        """
        if self.schema.get(prop) is not None:
            return self.add(
                prop, values, cleaned=cleaned, fuzzy=fuzzy, format=format
            )

        target_schema = model.get(schema)
        if target_schema is None:
            raise RuntimeError("Invalid schema: %s" % schema)
        target_prop = target_schema.get(prop)
        if target_prop is None:
            raise RuntimeError("Invalid prop: %s" % prop)
        for raw in self._lookup_values(target_prop, values):
            cleaned_value = self._verbose_clean(
                target_prop, raw, fuzzy, format
            )
            if cleaned_value is None:
                continue
            self.add_schema(schema)
            self.unsafe_add(target_prop, cleaned_value, cleaned=True)
Exemplo n.º 15
0
def expand_group(node):
    """Yield links from ``node`` to the entities that mention its value.

    Nothing is yielded when the node's type has no group or the node has no
    value. Otherwise the entities index is scanned for documents whose
    group field contains the value; for each property of the node's type
    that holds the value (after normalisation) a ``Link`` is yielded, using
    the reverse property when there is one and an inverted link otherwise.
    """
    group = node.type.group
    if group is None or node.value is None:
        return
    needle = str(node.value)
    query = {
        'query': {'term': {group: needle}},
        '_source': {'includes': ['schema', 'properties']},
    }
    for hit in scan(es, index=entities_index(), query=query):
        entity_id = hit.get('_id')
        source = hit.get('_source')
        properties = source.get('properties')
        schema = model.get(source.get('schema'))
        for prop in schema.properties.values():
            if prop.type != node.type:
                continue
            normalized = node.type.normalize_set(properties.get(prop.name))
            if needle not in normalized:
                continue
            if not prop.reverse:
                yield Link(node, prop, entity_id, inverted=True)
            else:
                yield Link(node, prop.reverse, entity_id)
Exemplo n.º 16
0
def suggest_property():
    """Suggest schema properties whose name or label contains a prefix.

    Reads ``prefix`` and ``schema`` (defaulting to ``Entity.THING``) from
    the request arguments and returns a JSON response listing every
    property of that schema whose lower-cased name or label contains the
    lower-cased, stripped prefix. An empty prefix matches every property;
    a schema name the model does not resolve gives an empty result list.
    """
    prefix = request.args.get('prefix', '').lower().strip()
    tag_request(prefix=prefix)
    schema = model.get(request.args.get('schema', Entity.THING))
    # model.get may hand back None for an unknown schema name; report no
    # matches for it instead of failing on the attribute access.
    properties = schema.properties.values() if schema is not None else ()
    matches = []
    for prop in properties:
        # The empty prefix is a substring of every string, so the "no
        # prefix given" case needs no separate (previously dead) check.
        if prefix in prop.name.lower() or prefix in prop.label.lower():
            matches.append({
                'id': prop.name,
                'quid': prop.name,
                'name': prop.label,
                'r:score': 100,
                'n:type': {
                    'id': '/properties/property',
                    'name': 'Property'
                }
            })
    return jsonify({
        "code": "/api/status/ok",
        "status": "200 OK",
        "prefix": request.args.get('prefix', ''),
        "result": matches
    })
Exemplo n.º 17
0
 def test_model_basics(self):
     """Check that "Thing" resolves through the model and is exported."""
     assert model.schemata['Thing'], model.schemata
     thing = model.schemata['Thing']
     # Passing a schema object to model.get must give the same schema back.
     assert thing == model.get(thing)
     assert thing in list(model), list(model)
     for schema_name in ('Person', 'Thing'):
         assert schema_name in model.to_dict(), model.to_dict()
Exemplo n.º 18
0
 def ingest(self, file_path, entity):
     """Read a DBF file and emit its rows for a "Table" entity.

     Raises:
         ProcessingException: if opening or reading the table raises
             ``DbfError``.
     """
     entity.schema = model.get('Table')
     try:
         # NOTE(review): the opened table is never explicitly closed here -
         # confirm whether the library needs that.
         table = Table(file_path.as_posix()).open()
         rows = self.generate_rows(table)
         self.emit_row_dicts(entity, rows)
     except DbfError as err:
         message = 'Cannot open DBF file: %s' % err
         raise ProcessingException(message) from err
Exemplo n.º 19
0
def iter_proxies(**kw):
    """Yield an entity proxy for each record produced by ``iter_entities``.

    Only the ``schema`` and ``properties`` fields are requested; remaining
    keyword arguments are forwarded unchanged. Records whose schema is not
    known to the model are skipped.
    """
    for record in iter_entities(includes=['schema', 'properties'], **kw):
        if model.get(record.get('schema')) is not None:
            yield model.get_proxy(record)
Exemplo n.º 20
0
def parse_reference(context, reference, rows):
    """Build and emit one sanctioned entity from the rows of a reference.

    All ``rows`` contribute to a single entity (id derived from
    ``reference``) and its sanction: names, address, programme,
    nationality, birth details, notes and modification date. The entity
    starts as a "LegalEntity" and becomes a "Person" as soon as a row has
    the type "Individual". Consumed keys are popped from each row.
    """
    entity = context.make("LegalEntity")
    entity.id = context.make_slug(reference)
    sanction = h.make_sanction(context, entity)

    for row in rows:
        is_individual = row.pop("type") == "Individual"
        if is_individual:
            entity.schema = model.get("Person")

        name = row.pop("name_of_individual_or_entity", None)
        name_prop = "alias" if row.pop("name_type") == "aka" else "name"
        entity.add(name_prop, name)

        address = h.make_address(context, full=row.pop("address"))
        h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))
        markers = ["a)", "b)", "c)", "d)"]
        citizenship = multi_split(row.pop("citizenship"), markers)
        entity.add("nationality", citizenship, quiet=True)
        birth_dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", birth_dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", row.pop("additional_information"))
        entity.add("notes", row.pop("listing_information"), quiet=True)

        control_date = row.pop("control_date")
        for target in (sanction, entity):
            target.add("modifiedAt", control_date)
        entity.context["updated_at"] = control_date.isoformat()

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Exemplo n.º 21
0
def suggest_property():
    """Suggest schema properties whose name or label contains a prefix.

    Reads ``prefix`` and ``schema`` (defaulting to ``Entity.THING``) from
    the request arguments and returns a JSON response listing every
    property of that schema whose lower-cased name or label contains the
    lower-cased, stripped prefix. An empty prefix matches every property;
    a schema name the model does not resolve gives an empty result list.
    """
    prefix = request.args.get("prefix", "").lower().strip()
    tag_request(prefix=prefix)
    schema = model.get(request.args.get("schema", Entity.THING))
    # model.get may hand back None for an unknown schema name; report no
    # matches for it instead of failing on the attribute access.
    properties = schema.properties.values() if schema is not None else ()
    matches = []
    for prop in properties:
        # The empty prefix is a substring of every string, so the "no
        # prefix given" case needs no separate (previously dead) check.
        if prefix in prop.name.lower() or prefix in prop.label.lower():
            matches.append({
                "id": prop.name,
                "quid": prop.name,
                "name": prop.label,
                "r:score": 100,
                "n:type": {
                    "id": "/properties/property",
                    "name": "Property"
                },
            })
    return jsonify({
        "code": "/api/status/ok",
        "status": "200 OK",
        "prefix": request.args.get("prefix", ""),
        "result": matches,
    })
Exemplo n.º 22
0
    def ingest(self, file_path, entity):
        """Ingest an Excel workbook, emitting one "Table" entity per sheet.

        OOXML metadata is extracted first. Each sheet exposing ``rows`` is
        turned into a table entity (id derived from the workbook id and the
        sheet name) and emitted if it ended up with a ``csvHash``; sheets
        without ``rows`` are logged and skipped. The workbook is always
        closed afterwards.

        Raises:
            ProcessingException: if the workbook cannot be loaded or read.
        """
        entity.schema = model.get("Workbook")
        self.ooxml_extract_metadata(file_path, entity)
        try:
            workbook = load_workbook(file_path, read_only=True)
        except Exception as err:
            raise ProcessingException("Invalid Excel file: %s" % err) from err

        try:
            for sheet_name in workbook.sheetnames:
                sheet = workbook[sheet_name]
                if hasattr(sheet, "rows"):
                    table = self.manager.make_entity("Table", parent=entity)
                    table.make_id(entity.id, sheet_name)
                    table.set("title", sheet_name)
                    log.debug("Sheet: %s", sheet_name)
                    self.emit_row_tuples(table, self.generate_rows(sheet))
                    if table.has("csvHash"):
                        self.manager.emit_entity(table)
                else:
                    log.warning("Cannot parse chart sheet: %s", sheet_name)
        except Exception as err:
            message = "Cannot read Excel file: %s" % err
            raise ProcessingException(message) from err
        finally:
            workbook.close()
Exemplo n.º 23
0
def suggest_property():
    """Suggest schema properties whose name or label contains a prefix.

    Reads ``prefix`` and ``schema`` (defaulting to ``Entity.THING``) from
    the request arguments and returns a JSON response listing every
    property of that schema whose lower-cased name or label contains the
    lower-cased, stripped prefix. An empty prefix matches every property;
    a schema name the model does not resolve gives an empty result list.
    """
    prefix = request.args.get('prefix', '').lower().strip()
    tag_request(prefix=prefix)
    schema = model.get(request.args.get('schema', Entity.THING))
    # model.get may hand back None for an unknown schema name; report no
    # matches for it instead of failing on the attribute access.
    properties = schema.properties.values() if schema is not None else ()
    matches = []
    for prop in properties:
        # The empty prefix is a substring of every string, so the "no
        # prefix given" case needs no separate (previously dead) check.
        if prefix in prop.name.lower() or prefix in prop.label.lower():
            matches.append({
                'id': prop.name,
                'quid': prop.name,
                'name': prop.label,
                'r:score': 100,
                'n:type': {
                    'id': '/properties/property',
                    'name': 'Property'
                }
            })
    return jsonify({
        "code": "/api/status/ok",
        "status": "200 OK",
        "prefix": request.args.get('prefix', ''),
        "result": matches
    })
Exemplo n.º 24
0
def iter_proxies(**kw):
    """Yield an entity proxy for each record produced by ``iter_entities``.

    Only the ``schema`` and ``properties`` fields are requested; remaining
    keyword arguments are forwarded unchanged. Records whose schema is not
    known to the model are skipped.
    """
    for record in iter_entities(includes=['schema', 'properties'], **kw):
        if model.get(record.get('schema')) is not None:
            yield model.get_proxy(record)
Exemplo n.º 25
0
def entity_tags(entity, authz=None):
    """Yield ``(field, value, total)`` for tags this entity shares.

    For the name, email, identifier, iban, phone and address types, every
    value of the entity with a specificity of at least 0.1 becomes a facet
    over the read index of the Thing schemata using that type. Only values
    counted more than once are yielded.
    """
    proxy = model.get_proxy(entity)
    thing_schema = model.get(Entity.THING)
    tag_types = (registry.name, registry.email, registry.identifier,
                 registry.iban, registry.phone, registry.address)
    facets = []
    # Collect one facet per sufficiently specific tag value of the entity.
    for tag_type in tag_types:
        group = tag_type.group
        if group is None:
            continue
        for position, value in enumerate(proxy.get_type_values(tag_type)):
            if tag_type.specificity(value) < 0.1:
                continue
            thing_schemata = [
                schema for schema in model.get_type_schemata(tag_type)
                if schema.is_a(thing_schema)
            ]
            index = entities_read_index(thing_schemata)
            alias = '%s_%s' % (tag_type.name, position)
            facets.append((index, alias, group, group, value))

    counts = _filters_faceted_query(facets, authz=authz)
    for facet in facets:
        alias, field, value = facet[1], facet[2], facet[4]
        total = counts.get(alias, 0)
        if total > 1:
            yield (field, value, total)
Exemplo n.º 26
0
def configure_schema(schema, version):
    """Configure the search index for ``schema`` at the given ``version``.

    Builds the index mapping (one field per schema property, the type
    group fields, the text and fingerprint fields plus bookkeeping fields)
    and passes it with the index name and settings to ``configure_index``,
    whose result is returned.
    """
    # Generate relevant type mappings for entity properties so that
    # we can do correct searches on each.
    schema_mapping = {}
    numeric_mapping = {registry.date.group: NUMERIC}
    for prop in schema.properties.values():
        field = dict(TYPE_MAPPINGS.get(prop.type, KEYWORD))
        field["copy_to"] = ["text"]
        schema_mapping[prop.name] = field
        if prop.type in NUMERIC_TYPES:
            numeric_mapping[prop.name] = NUMERIC

    # One top-level field per type group; only dates are not keywords.
    group_fields = (
        (registry.entity, KEYWORD),
        (registry.language, KEYWORD),
        (registry.country, KEYWORD),
        (registry.checksum, KEYWORD),
        (registry.ip, KEYWORD),
        (registry.url, KEYWORD),
        (registry.iban, KEYWORD),
        (registry.email, KEYWORD),
        (registry.phone, KEYWORD),
        (registry.mimetype, KEYWORD),
        (registry.identifier, KEYWORD),
        (registry.date, PARTIAL_DATE),
        (registry.address, KEYWORD),
        (registry.name, KEYWORD),
    )
    properties = {"caption": KEYWORD, "schema": KEYWORD, "schemata": KEYWORD}
    properties.update((type_.group, field) for type_, field in group_fields)
    properties.update({
        "fingerprints": {
            "type": "keyword",
            "normalizer": "latin_index",
            "copy_to": "text",
            "fields": {"text": LATIN_TEXT},
        },
        "text": {
            "type": "text",
            "analyzer": "latin_index",
            "search_analyzer": "latin_query",
            "search_quote_analyzer": "latin_index",
            "term_vector": "with_positions_offsets",
        },
        "properties": {"type": "object", "properties": schema_mapping},
        "numeric": {"type": "object", "properties": numeric_mapping},
        "role_id": KEYWORD,
        "collection_id": KEYWORD,
        "origin": KEYWORD,
        "created_at": {"type": "date"},
        "updated_at": {"type": "date"},
    })

    mapping = {
        "date_detection": False,
        "dynamic": False,
        "_source": {"excludes": ["text", "fingerprints"]},
        "properties": properties,
    }
    index = schema_index(model.get(schema), version)
    settings = index_settings(shards=get_shard_weight(schema))
    return configure_index(index, mapping, settings)
Exemplo n.º 27
0
def parse_nested(edge):
    """Yield parsed entities nested inside ``edge``, then ``edge`` itself.

    Every dict found among the property values of ``edge`` is treated as a
    nested entity and passed through ``parse_entity``; the edge is parsed
    and yielded last.
    """
    # The schema lookup that used to sit here was never used and has been
    # dropped.
    for values in edge["properties"].values():
        for value in values:
            if isinstance(value, dict):
                log.debug(f"Found nested item: {value['id']}")
                yield parse_entity(value)
    yield parse_entity(edge)
Exemplo n.º 28
0
 def ingest(self, file_path, entity):
     """Convert a DjVu file to PDF and extract it as a "Pages" entity.

     Runs ``ddjvu`` into a work file, checks that the output file was
     produced and passes it on to ``pdf_alternative_extract``.
     """
     entity.schema = model.get("Pages")
     converted = self.make_work_file("page.pdf")
     command = ["ddjvu", "-format=pdf", "-quality=100", "-skip",
                file_path, converted]
     self.exec_command(*command)
     self.assert_outfile(converted)
     self.pdf_alternative_extract(entity, converted)
Exemplo n.º 29
0
def get_schemata(dataset: Dataset) -> List[Schema]:
    """Return the model schemata for the schema names used in ``dataset``.

    Names reported by ``Statement.all_schemata`` that the model does not
    know are left out.
    """
    names = Statement.all_schemata(dataset=dataset)
    resolved = (model.get(name) for name in names)
    return [schema for schema in resolved if schema is not None]
Exemplo n.º 30
0
 def ingest(self, file_path, entity):
     """Convert a TIFF file to PDF and extract it as a "Pages" entity.

     Runs ``tiff2pdf`` at 300x300 resolution into a work file, checks that
     the output file was produced and passes it on to
     ``pdf_alternative_extract``.
     """
     entity.schema = model.get("Pages")
     converted = self.make_work_file("tiff.pdf")
     command = ["tiff2pdf", file_path, "-x", "300", "-y", "300",
                "-o", converted]
     self.exec_command(*command)
     self.assert_outfile(converted)
     self.pdf_alternative_extract(entity, converted)
Exemplo n.º 31
0
 def ingest(self, file_path, entity):
     """Convert a DjVu file to PDF and extract it as a "Pages" entity.

     Runs ``ddjvu`` into a work file, checks that the output file was
     produced and passes it on to ``pdf_alternative_extract``.
     """
     entity.schema = model.get('Pages')
     converted = self.make_work_file('page.pdf')
     command = ['ddjvu', '-format=pdf', '-quality=100', '-skip',
                file_path, converted]
     self.exec_command(*command)
     self.assert_outfile(converted)
     self.pdf_alternative_extract(entity, converted)
Exemplo n.º 32
0
    def _serialize(self, obj):
        """Expand an entity object into its API representation.

        Stringifies the id, resolves the collection, adds ``schemata``,
        resolves every entity-typed property, attaches API/UI links (plus
        content and archive links for documents) and a ``writeable`` flag.
        Returns None when the object's schema is unknown to the model.
        """
        pk = obj.get('id')
        obj['id'] = str(pk)
        authz = request.authz
        collection_id = obj.pop('collection_id', None)
        obj['collection'] = self.resolve(
            Collection, collection_id, CollectionSerializer
        )
        schema = model.get(obj.get('schema'))
        if schema is None:
            return None
        obj['schemata'] = schema.names
        properties = obj.get('properties', {})
        for prop in schema.properties.values():
            if prop.type != registry.entity:
                continue
            references = ensure_list(properties.get(prop.name))
            properties[prop.name] = [
                self.resolve(Entity, reference, EntitySerializer)
                for reference in references
            ]

        links = {
            'self': url_for('entities_api.view', entity_id=pk),
            'references': url_for('entities_api.references', entity_id=pk),
            'tags': url_for('entities_api.tags', entity_id=pk),
            'ui': entity_url(pk)
        }
        if schema.is_a(Document.SCHEMA):
            links['content'] = url_for('entities_api.content', entity_id=pk)
            file_name = first(properties.get('fileName'))

            # Original file, linked with its own mime type.
            content_hash = first(properties.get('contentHash'))
            if content_hash:
                mime_type = first(properties.get('mimeType'))
                download_name = safe_filename(file_name, default=pk)
                links['file'] = archive_url(
                    request.authz.id,
                    content_hash,
                    file_name=download_name,
                    mime_type=mime_type,
                )

            # PDF rendition, when one has been stored.
            pdf_hash = first(properties.get('pdfHash'))
            if pdf_hash:
                download_name = safe_filename(
                    file_name, default=pk, extension='.pdf'
                )
                links['pdf'] = archive_url(
                    request.authz.id,
                    pdf_hash,
                    file_name=download_name,
                    mime_type=PDF,
                )

            # CSV rendition, when one has been stored.
            csv_hash = first(properties.get('csvHash'))
            if csv_hash:
                download_name = safe_filename(
                    file_name, default=pk, extension='.csv'
                )
                links['csv'] = archive_url(
                    request.authz.id,
                    csv_hash,
                    file_name=download_name,
                    mime_type=CSV,
                )

        obj['links'] = links
        obj['writeable'] = authz.can(collection_id, authz.WRITE)
        obj.pop('_index', None)
        return self._clean_response(obj)
Exemplo n.º 33
0
 def ingest(self, file_path, entity):
     """Ingest a vCard file as "PlainText" and process each card in it.

     The decoded file content becomes the entity's ``bodyText``; every
     component read by ``vobject`` is passed to ``ingest_card``.

     Raises:
         ProcessingException: if ``vobject`` fails to parse the text.
     """
     entity.schema = model.get('PlainText')
     body = self.read_file_decoded(entity, file_path)
     entity.set('bodyText', body)
     try:
         for component in vobject.readComponents(body):
             self.ingest_card(entity, component)
     except vobject.base.ParseError as err:
         message = 'Cannot parse vcard: %s' % err
         raise ProcessingException(message) from err
Exemplo n.º 34
0
def schema_scope(schema, expand=True):
    """Yield every non-abstract schema selected by ``schema``.

    ``schema`` is passed through ``ensure_list``; when that result is empty,
    all schemata of the model are used instead. Entries that ``model.get``
    resolves to ``None`` are ignored. When ``expand`` is true, the
    descendants of each resolved schema are included as well.
    """
    candidates = ensure_list(schema) or model.schemata.values()
    selected = set()
    for candidate in candidates:
        resolved = model.get(candidate)
        if resolved is None:
            continue
        selected.add(resolved)
        if expand:
            selected.update(resolved.descendants)
    # Collect first, yield afterwards, so duplicates are emitted only once.
    for item in selected:
        if item.abstract:
            continue
        yield item
Exemplo n.º 35
0
 def ancestors(self):
     """Return the list of ancestor IDs, with the direct parent last.

     A document without a parent has no ancestors. Otherwise the list is
     taken from the cache entry of this document, or derived from the
     parent's chain (cached or freshly loaded) plus the parent's own ID.
     The result is written back to the cache only for folder documents.
     """
     if self.parent_id is None:
         return []

     own_key = cache.key('ancestors', self.id)
     cached = cache.get_list(own_key)
     if cached:
         return cached

     # Try the parent's cached chain before loading the parent itself.
     chain = cache.get_list(cache.key('ancestors', self.parent_id))
     if not chain:
         chain = []
         parent = Document.by_id(self.parent_id)
         if parent is not None:
             chain = parent.ancestors
     chain.append(self.parent_id)

     folder_schema = model.get(self.SCHEMA_FOLDER)
     if self.model.is_a(folder_schema):
         cache.set_list(own_key, chain, expire=cache.EXPIRE)
     return chain
Exemplo n.º 36
0
def entity_references(entity, authz):
    """Given a particular entity, find all the references to it from other
    entities, grouped by the property where they are used.

    Yields ``(property, total)`` tuples, where ``property`` is the result of
    ``model.get_qname`` for an entity-typed property whose range matches the
    entity's schema, and ``total`` is the (positive) number of hits reported
    by the faceted query for that property.

    Yields nothing when the entity's schema is not known to the model.
    """
    schema = model.get(entity.get('schema'))
    if schema is None:
        # ``model.get`` returns None for unknown schema names; without a
        # schema no property range can match, so there is nothing to query.
        return

    group = registry.entity.group
    entity_id = entity.get('id')
    facets = []
    for prop in model.properties:
        # Only entity-typed properties that can point at this schema count.
        if prop.type != registry.entity or not schema.is_a(prop.range):
            continue
        index = entities_read_index(prop.schema)
        field = 'properties.%s' % prop.name
        facets.append((index, prop.qname, group, field, entity_id))

    res = _filters_faceted_query(authz, facets)
    for qname, total in res.items():
        if total > 0:
            yield (model.get_qname(qname), total)
Exemplo n.º 37
0
def reconcile_index(collection=None):
    """Return the JSON service description for the reconciliation API.

    Without a ``collection`` the default types cover every matchable schema
    in the model. With a ``collection`` the service name includes the
    collection label, entity suggestions are filtered to that collection and
    the default types are limited to the schemata listed for it.

    A collection without a ``schemata`` entry, or one that lists schema
    names the model does not know, contributes no default types for those
    entries instead of raising.
    """
    domain = settings.APP_UI_URL.strip('/')
    label = settings.APP_TITLE
    suggest_query = []
    if collection is None:
        schemata = list(model)
    else:
        label = '%s (%s)' % (collection.get('label'), label)
        suggest_query.append(('filter:collection_id', collection.get('id')))
        # ``schemata`` may be absent, and may name schemata that
        # ``model.get`` can no longer resolve (it returns None for those).
        names = collection.get('schemata') or {}
        schemata = [model.get(name) for name in names]
    default_types = [
        get_freebase_type(s)
        for s in schemata
        if s is not None and s.matchable
    ]
    return jsonify({
        'name': label,
        'identifierSpace': 'http://rdf.freebase.com/ns/type.object.id',
        'schemaSpace': 'http://rdf.freebase.com/ns/type.object.id',
        'view': {'url': entity_url('{{id}}')},
        'preview': {
            'url': entity_url('{{id}}'),
            'width': 800,
            'height': 400
        },
        'suggest': {
            'entity': {
                'service_url': domain,
                'service_path': url_for('reconcile_api.suggest_entity',
                                        _query=suggest_query,
                                        _authorize=True,
                                        _relative=True)
            },
            'type': {
                'service_url': domain,
                'service_path': url_for('reconcile_api.suggest_type',
                                        _relative=True)
            },
            'property': {
                'service_url': domain,
                'service_path': url_for('reconcile_api.suggest_property',
                                        _relative=True)
            }
        },
        'defaultTypes': default_types
    })
Exemplo n.º 38
0
 def model(self):
     """Return the result of looking up ``self.schema`` in the model."""
     # ``model`` here is the module-level model object, not this method.
     schema = model.get(self.schema)
     return schema
Exemplo n.º 39
0
def configure_schema(schema, version):
    """Configure the index for ``schema`` at the given ``version``.

    Builds a mapping with a fixed set of top-level fields plus one entry
    per schema property under ``properties``, then hands it to
    ``configure_index`` and returns that call's result.
    """
    # Generate relevant type mappings for entity properties so that
    # we can do correct searches on each. Each property mapping is a copy
    # of its type's mapping (keyword by default) that also copies to "text".
    property_mapping = {
        prop.name: dict(TYPE_MAPPINGS.get(prop.type, KEYWORD),
                        copy_to=['text'])
        for prop in schema.properties.values()
    }

    fields = {
        "name": {
            "type": "text",
            "analyzer": "icu_latin",
            "fields": {"kw": KEYWORD},
            "boost": 3.0,
            "copy_to": "text"
        },
        "schema": KEYWORD,
        "schemata": KEYWORD,
        "bulk": {"type": "boolean"},
        "status": KEYWORD,
        "error_message": {"type": "text", "copy_to": "text", "index": False},
        "foreign_id": KEYWORD,
        "document_id": KEYWORD,
        "collection_id": KEYWORD,
        "uploader_id": KEYWORD,
        "fingerprints": {
            "type": "keyword",
            "normalizer": "icu_latin",
            "copy_to": "text",
            "fields": {"text": LATIN_TEXT}
        },
        "entities": KEYWORD,
        "languages": KEYWORD,
        "countries": KEYWORD,
        "checksums": KEYWORD,
        "keywords": KEYWORD,
        "ips": KEYWORD,
        "urls": KEYWORD,
        "ibans": KEYWORD,
        "emails": KEYWORD,
        "phones": KEYWORD,
        "mimetypes": KEYWORD,
        "identifiers": KEYWORD,
        "addresses": {"type": "keyword", "fields": {"text": LATIN_TEXT}},
        "dates": PARTIAL_DATE,
        "names": {
            "type": "keyword",
            "fields": {"text": LATIN_TEXT},
            "copy_to": "text"
        },
        "created_at": {"type": "date"},
        "updated_at": {"type": "date"},
        "text": {
            "type": "text",
            "analyzer": "icu_latin",
            "term_vector": "with_positions_offsets",
            "store": True
        },
        "properties": {"type": "object", "properties": property_mapping}
    }
    mapping = {
        "date_detection": False,
        "dynamic": False,
        "_source": {"excludes": ["text", "fingerprints"]},
        "properties": fields
    }

    index = schema_index(model.get(schema), version)
    index_config = index_settings(shards=get_shard_weight(schema))
    return configure_index(index, mapping, index_config)
Exemplo n.º 40
0
def entities_write_index(schema):
    """Return the index that is currently written to for ``schema``.

    The schema is resolved through ``model.get`` and combined with the
    ``settings.INDEX_WRITE`` version.
    """
    return schema_index(model.get(schema), settings.INDEX_WRITE)
Exemplo n.º 41
0
Arquivo: facet.py Projeto: pudo/aleph
 def update(self, result, key):
     """Store a display label for schema ``key`` in ``result['label']``.

     The label is the ``plural`` attribute of ``model.get(key)``; when that
     lookup raises ``AttributeError`` (for instance because the schema is
     unknown and ``None`` is returned), ``key`` itself is used.
     """
     try:
         label = model.get(key).plural
     except AttributeError:
         label = key
     result['label'] = label