예제 #1
0
def search(raw_query, query_type='/geo/country'):
    """Match a raw name against the country lookup and return candidates.

    :param raw_query: free-text name to reconcile.
    :param query_type: reconciliation type id (not used by the lookup).
    :return: candidate dicts sorted by similarity, with id/name/type/score.
    """
    # BUG FIX: normalise the query exactly once. The original reassigned
    # raw_query to its fingerprint and then fingerprinted it *again* when
    # indexing into the countries table.
    fp_query = fingerprints.generate(raw_query)
    countries = get_countries()

    rv = []

    matches = countries[fp_query]
    for m in matches:
        # Compare normalised query against the normalised candidate name.
        m['comparison_score'] = difflib.SequenceMatcher(
            None, fp_query, fingerprints.generate(m['name_to_match'])) \
            .quick_ratio()

    for m in sorted(matches, key=lambda i: i['comparison_score'],
                    reverse=True):
        score = m['comparison_score']
        rv.append({
            'id': str(m['id']),
            'name': m['canonical_name'],
            'type': [QUERY_TYPES[0]['id']],
            'score': score * 100,
            # quick_ratio() == 1.0 means the fingerprints are identical.
            'match': score == 1.0,
            'all_labels': {
                'score': score * 100,
                'weighted': score * 100
            }
        })

    return rv
예제 #2
0
def load_entity(tx, entity):
    """Merge an entity into the graph.

    Creates the entity node (keyed by name fingerprint), attaches it to
    its collections via PART_OF, and merges one alias node per distinct
    other-name fingerprint, linked via AKA. Returns the entity node.
    """
    log.info("Load node [%s]: %s", entity.id, entity.name)
    node = Node(Vocab.Entity,
                fingerprint=fingerprints.generate(entity.name),
                name=entity.name,
                alephState=entity.state,
                alephEntity=entity.id)
    if entity.jurisdiction_code is not None:
        node['countryCode'] = entity.jurisdiction_code.upper()

    tx.merge(node, Vocab.Entity, 'fingerprint')

    # Attach the entity to each collection it belongs to.
    for collection in entity.collections:
        membership = Relationship(node, Vocab.PART_OF,
                                  load_collection(tx, collection),
                                  alephEntity=entity.id)
        tx.merge(membership, Vocab.PART_OF)

    # One alias node per distinct, non-null name fingerprint.
    seen = {node['fingerprint']}
    for other_name in entity.other_names:
        fingerprint = fingerprints.generate(other_name.display_name)
        if fingerprint is None or fingerprint in seen:
            continue
        seen.add(fingerprint)

        alias = Node(Vocab.Entity,
                     fingerprint=fingerprint,
                     name=other_name.display_name,
                     alephEntity=entity.id,
                     isAlias=True)
        tx.merge(alias, Vocab.Entity, 'fingerprint')
        aka = Relationship(node, Vocab.AKA, alias,
                           alephId=other_name.id)
        tx.merge(aka, Vocab.AKA, 'alephId')
    # TODO contact details, addresses
    return node
예제 #3
0
 def fingerprint(self):
     """Collect pairs of group members whose name fingerprints match.

     Returns a list of [a, b] pairs — one per unordered pair of members
     with identical fingerprints — and stores it on self.cluster.
     """
     # Compute each member's fingerprint once, instead of regenerating
     # both fingerprints inside the O(n^2) pair loop as the original did.
     fps = [fingerprints.generate(member) for member in self.group]
     self.cluster = []
     for i in range(0, len(self.group)):
         for j in range(i + 1, len(self.group)):
             if fps[i] == fps[j]:
                 self.cluster.append([self.group[i], self.group[j]])
     return self.cluster
예제 #4
0
def load_to_neo4j(project, neo4j_uri=None):
    """Export a project's merged entities and links into a Neo4J graph.

    Wipes the target database, creates one node per entity plus merged
    'Name' and 'Address' helper nodes, then creates LINK relationships.
    Rolls the transaction back on any error.
    """
    neo4j_uri = neo4j_uri or env.NEO4J_URI
    if neo4j_uri is None:
        project.log.error("No $NEO4J_URI set, cannot load graph.")
        return
    project.log.info("Loading graph to Neo4J: %s", neo4j_uri)
    graph = Graph(neo4j_uri)
    tx = graph.begin()
    try:
        tx.run('MATCH (n) DETACH DELETE n')
        entities = {}
        for entity in project.iter_merged_entities():
            label = entity.pop('type', None) or 'Other'
            node = Node(label, **normalise(entity))
            tx.create(node)
            entities[entity['uid']] = node

            # create "Name" fake nodes
            fps = set()
            for name in entity.get('names', []):
                fp = fingerprints.generate(name)
                if fp is None:
                    continue
                fp = fp.replace(' ', '-')
                if fp in fps:
                    continue
                fps.add(fp)
                alias = Node('Name', name=name, fp=fp)
                tx.merge(alias, 'Name', 'fp')
                rel = Relationship(node, 'ALIAS', alias)
                tx.create(rel)

            address = entity.get('address')
            fp = fingerprints.generate(address)
            if fp is not None:
                fp = fp.replace(' ', '-')
                loc = Node('Address', name=address, fp=fp)
                tx.merge(loc, 'Address', 'fp')
                # BUG FIX: the LOCATION edge must point at the address
                # node `loc`; the original referenced `alias`, i.e. the
                # last Name node (or an unbound name when the entity had
                # no usable names).
                rel = Relationship(node, 'LOCATION', loc)
                tx.create(rel)

        for link in project.iter_merged_links():
            source = entities.get(link.pop('source'))
            target = entities.get(link.pop('target'))
            if source is None or target is None:
                continue
            rel = Relationship(source, 'LINK', target, **normalise(link))
            tx.create(rel)

        # Drop helper nodes that ended up without relationships.
        clear_leaf_nodes(tx, 'Name')
        clear_leaf_nodes(tx, 'Address')
        tx.commit()
    except Exception as ex:
        project.log.exception(ex)
        tx.rollback()
예제 #5
0
    def fingerprint(self):
        """Replace self.key with its generated fingerprint.

        Non-str keys are transliterated to ASCII via unidecode before
        fingerprinting. Mutates self.key in place; returns None.
        """
        key = self.key
        if not isinstance(key, str):
            # assumes non-str keys are unicode/byte-like values accepted
            # by unidecode — TODO confirm against callers
            key = unidecode(key)
        fp = fingerprints.generate(key)

        if TRACE_TEXT or TRACE_FP:
            logger_debug('Text.fingerprint:key: ', repr(self.key))
            # NOTE(review): the traced value is re-generated from the raw
            # key, so it can differ from the `fp` computed above.
            logger_debug('Text.fingerprint:fp :    ',
                         fingerprints.generate(unidecode(self.key)))

        self.key = fp
예제 #6
0
def entity_similarity(left, right):
    """Score the similarity of two entity dicts by their names.

    Blends a Jaro-Winkler comparison of the raw names (weight 0.6) with
    one of the name fingerprints (weight 0.4); the result is capped at 1.0.
    """
    left_name = left.get('name')
    right_name = right.get('name')
    score = 0
    if left_name is not None and right_name is not None:
        score += 0.6 * jaro_winkler(chomp(left_name), chomp(right_name))

    left_fp = fingerprints.generate(left_name)
    right_fp = fingerprints.generate(right_name)
    if left_fp is not None and right_fp is not None:
        score += 0.4 * jaro_winkler(left_fp, right_fp)

    return min(1.0, score)
예제 #7
0
def _make_queries(prop, value, specificity):
    """Yield Elasticsearch query clauses for a property value.

    Name properties produce boosted fingerprint-text matches — one for the
    raw value and one for its normalised fingerprint when that differs;
    other grouped property types produce a plain term query.
    """
    if prop.type == registry.name:
        boost = (1 + specificity) * 2
        yield {
            "match": {
                "fingerprints.text": {
                    "query": value,
                    "operator": "and",
                    "minimum_should_match": "60%",
                    "boost": boost,
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is not None and fp != value:
            yield {
                "match": {
                    "fingerprints.text": {
                        # BUG FIX: query on the generated fingerprint; the
                        # original repeated `value`, making this clause an
                        # exact duplicate of the one above.
                        "query": fp,
                        "operator": "and",
                        "minimum_should_match": "60%",
                        "boost": boost,
                    }
                }
            }
    elif prop.type.group is not None:
        yield {"term": {prop.type.group: {"value": value}}}
예제 #8
0
def finalize_index(data, schema, texts):
    """Apply final denormalisations to the index."""
    data['schema'] = schema.name
    # Get implied schemata (i.e. parents of the actual schema)
    data['schemata'] = schema.names

    properties = data.get('properties', {})
    for name, prop in schema.properties.items():
        if name not in properties:
            continue
        # Skip types that are indexed in dedicated fields or are not
        # useful as free text.
        if prop.type_name in ['entity', 'date', 'url', 'uri', 'country']:
            continue
        for value in ensure_list(properties[name]):
            if name == 'name':
                data['name'] = value
            texts.append(value)

    data = schema.invert(data)
    data['text'] = index_form(texts)

    # Deduplicated, non-null name fingerprints.
    names = data.get('names', [])
    fps = [fingerprints.generate(name) for name in names]
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = list(set(fps))

    # Add latinised names
    # NOTE(review): latinize_text may return None, which would then be
    # added to `names` — confirm upstream guarantees non-null output.
    for name in list(names):
        names.append(latinize_text(name))
    data['names'] = list(set(names))

    if 'created_at' not in data:
        data['created_at'] = data.get('updated_at')
    return data
예제 #9
0
def format_proxy(proxy, collection, job_id=None):
    """Apply final denormalisations to the index.

    Builds the Elasticsearch action dict for an entity proxy: attaches
    collection/job context, computes name fingerprints, folds the magic
    indexText property into the text field and picks the write index.
    """
    proxy.context = {}
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['job_id'] = job_id
    names = ensure_list(data.get('names'))
    fps = set(fingerprints.generate(name) for name in names)
    fps.update(names)
    # BUG FIX: drop the None produced by fingerprints.generate() before
    # the collection is reused; the original only filtered it out of
    # data['fingerprints'] and still extended `text` with None.
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = fps

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    text.append(collection.label)
    data['text'] = text

    data['updated_at'] = collection.updated_at
    # The last indexUpdatedAt value wins, as in the original loop.
    for updated_at in properties.pop('indexUpdatedAt', []):
        data['updated_at'] = updated_at

    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
def parse_company(line):
    """Parse one fixed-width company record line into a dict.

    Field offsets follow the fixed-width snapshot format; the company
    name occupies the remainder of the line, terminated by '<'.
    """
    results = {
        # same nomenclature for company_number in function parse_officer()
        # applies.
        'company_number': line[0:8],
        # record_type is always 1 since we're parsing companies.
        'record_type': line[8],
        # company_status (dissolved, active...)
        'company_status_code': line[9],
        'is_company': line[24] == 'Y',
        # filler, can throw away.
        'filler': line[10:32],
        'number_of_officers': line[32:36],
        # holds the length of the name variable (incl. "<" char), used for
        # validation, do not insert in database.
        'unwanted_company_name_length': line[36:40],
        # company names will be of varying length and will always end with
        # '...< \n'.
        'company_name': line[40:].strip('< \n'),
    }
    results["company_name_norm"] = generate(results.get("company_name", None))
    return results
예제 #11
0
파일: matching.py 프로젝트: sunu/aleph
def _make_queries(type_, value):
    """Yield Elasticsearch query clauses for a typed value.

    Name values produce a fingerprint-text match for the raw value plus,
    when it differs, one for the normalised fingerprint; other grouped
    types produce a term query on their group field.
    """
    if type_ == registry.name:
        yield {
            "match": {
                "fingerprints.text": {
                    "query": value,
                    "operator": "and",
                    "minimum_should_match": "60%",
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is None:
            return
        if fp.lower() != value.lower():
            yield {
                "match": {
                    "fingerprints.text": {
                        # BUG FIX: search for the fingerprint itself; the
                        # original re-used `value`, duplicating the first
                        # clause despite having computed `fp`.
                        "query": fp,
                        "operator": "and",
                        "minimum_should_match": "60%",
                    }
                }
            }
    elif type_.group is not None:
        yield {"term": {type_.group: {"value": value}}}
예제 #12
0
def load_entities(graph):
    """Load composite entities into the graph."""
    tx = graph.begin()
    entities = {}
    try:
        for entity in Entity.iter_composite():
            label = entity.schema or 'Other'
            data = dict(entity.data)
            # Aliases become separate Name nodes below, not node props.
            data.pop('aliases', None)
            node = Node(label, origin=entity.origin, **data)
            # NOTE(review): `project` is not a parameter here — presumably
            # a module-level global; confirm it is in scope.
            project.log.info("Node [%s]: %s", label, entity.name)
            tx.create(node)
            # Every uid of the composite maps onto the same graph node.
            for uid in entity.uids:
                entities[uid] = node

            for name in entity.names:
                fp = fingerprints.generate(name)
                # NOTE(review): fp may be None, which would merge all such
                # names into one Name node keyed fp=None — confirm intent.
                name_node = Node(NAME, name=name, fp=fp)
                tx.merge(name_node, NAME, 'fp')

                rel = Relationship(node, 'ALIAS', name_node)
                tx.create(rel)

        # Remove Name nodes left without useful connections.
        clear_leaf_nodes(tx, NAME)
        tx.commit()
        return entities
    except Exception:
        tx.rollback()
        raise
예제 #13
0
파일: match.py 프로젝트: pudo/aleph
def _make_queries(prop, value, specificity):
    """Yield ES clauses matching `value` for property `prop`.

    Name properties get a boosted full-text match on names plus an exact
    fingerprint term when one can be generated; other grouped property
    types get a plain term query on their group field.
    """
    if prop.type != registry.name:
        # Non-name properties: only grouped types are queryable.
        if prop.type.group is not None:
            yield {
                'term': {
                    prop.type.group: {
                        'value': value
                    }
                }
            }
        return

    boost = (1 + specificity) * 2
    yield {
        'match': {
            'names.text': {
                'query': value,
                'operator': 'and',
                'minimum_should_match': '60%',
                'boost': boost
            }
        }
    }
    fp = fingerprints.generate(value)
    if fp is not None:
        yield {
            'term': {
                'fingerprints': {
                    'value': fp,
                    'boost': boost
                }
            }
        }
예제 #14
0
 def fingerprint(self, values):
     """Generate a fingerprint for each value, dropping null results."""
     # TODO: this should not be a property thing, so that fp's can include
     # dates etx.
     generated = (fingerprints.generate(value) for value in values)
     return [fp for fp in generated if fp is not None]
예제 #15
0
def _make_queries(prop, value, specificity):
    """Yield Elasticsearch query clauses for a property value.

    Name properties produce boosted fingerprint-text matches for the raw
    value and, when it differs, for its normalised fingerprint; other
    grouped property types produce a term query on their group field.
    """
    if prop.type == registry.name:
        boost = (1 + specificity) * 2
        yield {
            'match': {
                'fingerprints.text': {
                    'query': value,
                    'operator': 'and',
                    'minimum_should_match': '60%',
                    'boost': boost
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is not None and fp != value:
            yield {
                'match': {
                    'fingerprints.text': {
                        # BUG FIX: query the generated fingerprint; the
                        # original repeated `value`, making this clause a
                        # duplicate of the one above.
                        'query': fp,
                        'operator': 'and',
                        'minimum_should_match': '60%',
                        'boost': boost
                    }
                }
            }
    elif prop.type.group is not None:
        yield {
            'term': {
                prop.type.group: {
                    'value': value
                }
            }
        }
예제 #16
0
def reconcile_op(query):
    """Reconcile operation for a single query."""
    # Build a pseudo search state from the freebase-style query options.
    state = QueryState({
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }, request.authz)

    name = query.get('query', '')
    # Fake entity used as the probe for the similarity search.
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [fingerprints.generate(name)],
        'schemata': ensure_list(query.get('type'))
    }

    # Copy additional reconciliation properties onto the probe entity.
    for p in query.get('properties', []):
        entity[p.get('pid')] = ensure_list(p.get('v'))

    suggested = similar_entities(entity, state)
    matches = []
    for ent in suggested.get('results'):
        # Map the result schema onto the matching freebase type, if any.
        types = [t for t in get_freebase_types() if ent['schema'] == t['id']]
        matches.append({
            'id': ent.get('id'),
            'name': ent.get('name'),
            'type': types,
            # Clamp into the 0..100 score range the protocol expects.
            'score': min(100, ent.get('score') * 10),
            'uri': entity_link(ent.get('id')),
            'match': ent.get('name') == name
        })
    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {
        'result': matches,
        'num': len(matches)
    }
예제 #17
0
def process_single(resource):
    """Yield rows with beneficiary_id derived from the name fingerprint.

    The id is the capitalised fingerprint of beneficiary_name; when no
    fingerprint can be generated, the raw name is used instead.
    """
    for row in resource:
        fp = fingerprints.generate(row['beneficiary_name'])
        if fp is None:
            row['beneficiary_id'] = row['beneficiary_name']
        else:
            row['beneficiary_id'] = fp.capitalize()
        yield row
예제 #18
0
 def fingerprints(self):
     """Lazily build and cache the set of name fingerprints.

     Names that produce no fingerprint are skipped. The result is cached
     on self._fingerprints after the first call.
     """
     if not hasattr(self, '_fingerprints'):
         self._fingerprints = set()
         for name in self.names:
             candidate = fingerprints.generate(name)
             if candidate is None:
                 continue
             self._fingerprints.add(candidate)
     return self._fingerprints
def process_single(resource):
    """Assign each row a beneficiary_id from its name fingerprint.

    Falls back to the raw beneficiary_name when fingerprint generation
    returns None. Yields the mutated rows.
    """
    for row in resource:
        fp = fingerprints.generate(row['beneficiary_name'])
        row['beneficiary_id'] = (row['beneficiary_name'] if fp is None
                                 else fp.capitalize())
        yield row
예제 #20
0
def get_declared_holders(codebase, holders_tallies):
    """
    Return a list of declared holders from a codebase using the holders
    detected from key files.

    A declared holder is a copyright holder present in the key files who has the
    highest amount of references throughout the codebase.
    """
    # Index codebase-wide holder tallies by fingerprint of the holder text.
    entry_by_holders = {
        fingerprints.generate(entry['value']): entry
        for entry in holders_tallies if entry['value']
    }
    key_file_holders = get_field_values_from_codebase_resources(
        codebase, 'holders', key_files_only=True)
    entry_by_key_file_holders = {
        fingerprints.generate(entry['holder']): entry
        for entry in key_file_holders if entry['holder']
    }
    unique_key_file_holders = unique(entry_by_key_file_holders.keys())
    # NOTE(review): this assumes every key-file holder fingerprint also
    # appears in `entry_by_holders`; a holder seen only in key files would
    # raise KeyError here — confirm upstream guarantees this.
    unique_key_file_holders_entries = [
        entry_by_holders[holder] for holder in unique_key_file_holders
    ]

    # Group holders by their reference count across the codebase.
    holder_by_counts = defaultdict(list)
    for holder_entry in unique_key_file_holders_entries:
        count = holder_entry.get('count')
        if count:
            holder = holder_entry.get('value')
            holder_by_counts[count].append(holder)

    declared_holders = []
    if holder_by_counts:
        highest_count = max(holder_by_counts)
        declared_holders = holder_by_counts[highest_count]

    # If we could not determine a holder, then we return a list of all the
    # unique key file holders
    if not declared_holders:
        declared_holders = [
            entry['value'] for entry in unique_key_file_holders_entries
        ]

    return declared_holders
예제 #21
0
def index_names(data):
    """Handle entity names on documents and entities."""
    names = data.get('names', [])
    # Deduplicated, non-null fingerprints of all names.
    generated = (fingerprints.generate(name) for name in names)
    data['fingerprints'] = list({fp for fp in generated if fp is not None})

    # Add latinised names
    for name in list(names):
        names.append(ascii_text(name))
    data['names'] = list(set(names))
예제 #22
0
def format_proxy(proxy, collection):
    """Apply final denormalisations to the index."""
    # Abstract entities can appear when profile fragments for a missing entity
    # are present.
    if proxy.schema.abstract:
        return None

    data = proxy.to_full_dict()
    data["schemata"] = list(proxy.schema.names)
    data["caption"] = proxy.caption

    # Fingerprints: normalised name forms plus the raw names, minus nulls.
    names = data.get("names", [])
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data["fingerprints"] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get("properties")
    data["text"] = properties.pop("indexText", [])

    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric["dates"] = _numeric_values(registry.date, data.get("dates"))
    data["numeric"] = numeric

    # Context data - from aleph system, not followthemoney.
    data["collection_id"] = collection.id
    data["role_id"] = first(data.get("role_id"))
    data["profile_id"] = first(data.get("profile_id"))
    data["mutable"] = max(ensure_list(data.get("mutable")), default=False)
    data["origin"] = ensure_list(data.get("origin"))
    # Logical simplifications of dates:
    # earliest creation, latest update (falling back to creation dates).
    created_at = ensure_list(data.get("created_at"))
    if len(created_at) > 0:
        data["created_at"] = min(created_at)
    updated_at = ensure_list(data.get("updated_at")) or created_at
    if len(updated_at) > 0:
        data["updated_at"] = max(updated_at)

    entity_id = data.pop("id")
    return {
        "_id": entity_id,
        "_index": entities_write_index(proxy.schema),
        "_source": data,
    }
예제 #23
0
def get_countries():
    """Return the fingerprint -> country-rows lookup, cached on flask.g.

    On first use, loads all country match rows from MySQL and groups them
    by the fingerprint of their name_to_match column.
    """
    if 'countries' in g:
        return g.countries

    lookup = collections.defaultdict(list)
    cursor = mysql.connection.cursor(MySQLdb.cursors.DictCursor)
    cursor.execute(LOAD_COUNTRIES_SQL)
    for record in cursor.fetchall():
        key = fingerprints.generate(record["name_to_match"])
        lookup[key].append(record)

    g.countries = lookup
    return g.countries
예제 #24
0
def _normalize_names(names):
    """Generate a sequence of comparable names for an entity. This also
    generates a `fingerprint`, i.e. a version of the name where all tokens
    are sorted alphabetically, and some parts, such as company suffixes,
    have been removed."""
    emitted = set()
    for name in names:
        plain = normalize(name, ascii=True)
        if plain is not None and plain not in emitted:
            emitted.add(plain)
            yield plain
        fp = fingerprints.generate(name)
        # Short fingerprints are too ambiguous to be useful.
        if fp is None or fp in emitted or len(fp) <= 6:
            continue
        emitted.add(fp)
        yield fp
예제 #25
0
파일: mapper.py 프로젝트: wcyn/aleph
 def compute_key(self, record):
     """Derive a stable sha1 key for a record from the mapper's key fields.

     The digest is seeded with the dataset name and updated with each
     non-null key value (fingerprinted when key_fingerprint is set).
     Returns None when no key field yielded a value.
     """
     digest = sha1(self.query.dataset.name.encode('utf-8'))
     has_key = False
     for key in self.keys:
         value = record.get(key)
         if self.key_fingerprint:
             # Normalise via fingerprinting for fuzzier de-duplication.
             value = fingerprints.generate(value)
         else:
             value = string_value(value)
         if value is None:
             continue
         digest.update(value.encode('utf-8'))
         has_key = True
     if has_key:
         return digest.hexdigest()
예제 #26
0
파일: entities.py 프로젝트: x0rzkov/aleph
def format_proxy(proxy, collection, extra):
    """Apply final denormalisations to the index."""
    proxy.context = {}
    proxy = collection.ns.apply(proxy)
    # Pull `indexUpdatedAt` before constructing `data`, so that it doesn't
    # creep into `data['dates']` and mess up date sorting afterwards
    updated_at = proxy.pop('indexUpdatedAt', quiet=True)
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['schemata'] = list(proxy.schema.names)

    # Fingerprints: normalised name forms plus the raw names, minus nulls.
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    # NOTE(review): `fps` may still contain None here (only filtered from
    # data['fingerprints'] above), so None can end up in `text` — confirm.
    text.extend(fps)
    data['text'] = text

    data['updated_at'] = collection.updated_at
    # The last indexUpdatedAt value wins.
    for value in updated_at:
        data['updated_at'] = value

    # integer casting
    numeric = {}
    for prop, values in properties.items():
        prop = proxy.schema.get(prop)
        if prop.type in NUMERIC_TYPES:
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    # add possible overrides
    data.update(extra)

    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
예제 #27
0
 def compute_key(self, record):
     """Compute the sha1 content key for `record`, or None if keyless.

     Seeds the digest with the dataset name, then folds in every non-null
     key value, fingerprinted when key_fingerprint is enabled.
     """
     if not len(self.keys):
         return None
     digest = sha1(self.query.dataset.name.encode('utf-8'))
     has_key = False
     for key in self.keys:
         raw = record.get(key)
         if self.key_fingerprint:
             part = fingerprints.generate(raw)
         else:
             part = clean_text(raw)
         if part is None:
             continue
         digest.update(part.encode('utf-8'))
         has_key = True
     return digest.hexdigest() if has_key else None
예제 #28
0
def format_proxy(proxy, collection):
    """Apply final denormalisations to the index."""
    data = proxy.to_full_dict()
    data["schemata"] = list(proxy.schema.names)

    # Fingerprints: normalised name forms plus the raw names, minus nulls.
    names = data.get("names", [])
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data["fingerprints"] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get("properties")
    text = properties.pop("indexText", [])
    # NOTE(review): `fps` may still contain None here (only filtered from
    # data["fingerprints"]) — confirm None in text is acceptable.
    text.extend(fps)
    data["text"] = text

    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric["dates"] = _numeric_values(registry.date, data.get("dates"))
    data["numeric"] = numeric

    # Context data - from aleph system, not followthemoney.
    # FIXME: Can there ever really be multiple role_ids?
    data["role_id"] = first(data.get("role_id"))
    data["mutable"] = max(ensure_list(data.get("mutable")), default=False)
    data["origin"] = ensure_list(data.get("origin"))
    created_at = data.get("created_at")
    if created_at:
        # dict.get's default applies only when the key is absent entirely.
        data["updated_at"] = data.get("updated_at", created_at)
    data["collection_id"] = collection.id
    entity_id = data.pop("id")
    return {
        "_id": entity_id,
        "_index": entities_write_index(data.get("schema")),
        "_source": data,
    }
예제 #29
0
파일: entities.py 프로젝트: djoffrey/aleph
def format_proxy(proxy, collection):
    """Apply final denormalisations to the index.

    Builds the Elasticsearch bulk-action dict for the proxy: schema
    names, name fingerprints, magic indexText handling, numeric casts
    and aleph context fields.
    """
    data = proxy.to_full_dict()
    data['schemata'] = list(proxy.schema.names)

    # Fingerprints: normalised name forms plus the raw names, minus nulls.
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    data['text'] = text

    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    # Context data - from aleph system, not followthemoney.
    now = iso_text(datetime.utcnow())
    data['created_at'] = min(ensure_list(data.get('created_at')), default=now)
    # BUG FIX: updated_at should be the *latest* update timestamp, so
    # aggregate with max(); the original used min(), which reported the
    # oldest update (compare the sibling format_proxy using max here).
    data['updated_at'] = max(ensure_list(data.get('updated_at')), default=now)
    # FIXME: Can there ever really be multiple role_ids?
    data['role_id'] = first(data.get('role_id'))
    data['mutable'] = max(ensure_list(data.get('mutable')), default=False)
    data['origin'] = ensure_list(data.get('origin'))
    data['collection_id'] = collection.id
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
예제 #30
0
파일: util.py 프로젝트: occrp/tabref
def normalize_value(text):
    """Normalise a cell value for indexing.

    Returns a UTF-8 encoded, space-padded, order-preserving fingerprint
    of the text, or None for null, numeric or very short values.
    """
    if text is None:
        return

    # Skip clearly numeric cell values.
    try:
        float(text)
        return
    except (TypeError, ValueError):
        # Not numeric — fall through to fingerprinting. The original used
        # a bare `except:`, which also swallows KeyboardInterrupt etc.
        pass

    text = fingerprints.generate(text, keep_order=True)
    if text is None:
        return

    # Very short values are too ambiguous to be useful as match keys.
    if len(text) <= 3:
        return

    # Pad with spaces so substring search only hits whole tokens.
    text = u' %s ' % text
    return text.encode('utf-8')
예제 #31
0
def reconcile_op(query):
    """Reconcile operation for a single query."""
    parser = SearchQueryParser({
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }, request.authz)

    name = query.get('query', '')
    schema = query.get('type') or 'Thing'
    # Fake entity used as the probe for the similarity query.
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [fingerprints.generate(name)],
        'schemata': ensure_list(schema),
        'schema': schema
    }

    # Extra reconciliation properties are copied onto the probe entity.
    for p in query.get('properties', []):
        entity[p.get('pid')] = ensure_list(p.get('v'))

    query = SimilarEntitiesQuery(parser, entity=entity)
    matches = []
    for doc in query.search().get('hits').get('hits'):
        source = doc.get('_source')
        match = {
            'id': doc.get('_id'),
            'name': source.get('name'),
            # Clamp into the 0..100 score range the protocol expects.
            'score': min(100, doc.get('_score') * 10),
            'uri': entity_url(doc.get('_id')),
            'match': source.get('name') == name
        }
        # Attach the freebase type matching the result schema, if any.
        for type_ in get_freebase_types():
            if source['schema'] == type_['id']:
                match['type'] = [type_]
        matches.append(match)

    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {
        'result': matches,
        'num': len(matches)
    }
예제 #32
0
파일: entities.py 프로젝트: wdsn/aleph
def format_proxy(proxy, collection, job_id=None):
    """Apply final denormalisations to the index."""
    proxy.context = {}
    proxy = collection.ns.apply(proxy)
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['job_id'] = job_id
    # Fingerprints: normalised name forms plus the raw names, minus nulls.
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    # NOTE(review): `fps` can still contain None here; only
    # data['fingerprints'] was filtered above — confirm intended.
    text.extend(fps)
    text.append(collection.label)
    data['text'] = text

    data['updated_at'] = collection.updated_at
    # The last indexUpdatedAt value wins.
    for updated_at in properties.pop('indexUpdatedAt', []):
        data['updated_at'] = updated_at

    # integer casting
    numeric = {}
    for prop, values in properties.items():
        prop = proxy.schema.get(prop)
        if prop.type in NUMERIC_TYPES:
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
예제 #33
0
def finalize_index(proxy, context, texts):
    """Apply final denormalisations to the index."""
    # Collect free-text values, skipping types indexed in dedicated fields.
    for prop, value in proxy.itervalues():
        if prop.type.name in ['entity', 'date', 'url', 'country', 'language']:
            continue
        texts.append(value)

    entity = proxy.to_full_dict()
    data = merge_data(context, entity)
    data['name'] = proxy.caption
    data['text'] = index_form(texts)

    # Deduplicated, non-null name fingerprints.
    names = data.get('names', [])
    fps = [fingerprints.generate(name) for name in names]
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = list(set(fps))

    # Fall back to updated_at when the entity has no creation timestamp.
    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')
    data.pop('id', None)
    return clean_dict(data)
예제 #34
0
파일: entities.py 프로젝트: pudo/aleph
def index_operation(data):
    """Apply final denormalisations to the index.

    Returns an (entity_id, index, body) triple ready for use in an
    Elasticsearch bulk operation.
    """
    data['bulk'] = data.get('bulk', False)
    names = ensure_list(data.get('names'))
    fps = set(fingerprints.generate(name) for name in names)
    fps.update(names)
    # BUG FIX: drop None fingerprints before reuse; the original filtered
    # them out of data['fingerprints'] but still pushed None into `text`.
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = fps

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    texts = data.pop('text', [])
    texts.extend(data.get('properties', {}).pop('indexText', []))
    texts.extend(fps)
    data['text'] = texts

    # Fall back to updated_at when no creation timestamp is present.
    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')

    entity_id = str(data.pop('id'))
    data.pop('_index', None)
    index = entities_write_index(data.get('schema'))
    return entity_id, index, data
예제 #35
0
파일: match.py 프로젝트: modulexcite/aleph
def _make_queries(prop, value):
    """Yield ES query clauses for a property/value pair.

    Zero-specificity values produce nothing. Name values produce a
    boosted names.text match plus, when available, an exact fingerprint
    term; other grouped types produce a specificity-boosted term query.
    """
    specificity = prop.type.specificity(value)
    if specificity == 0:
        return

    if prop.type == registry.name:
        boost = (1 + specificity) * 2
        yield {
            'match': {
                'names.text': {
                    'query': value,
                    'operator': 'and',
                    'minimum_should_match': '60%',
                    'boost': boost
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is not None:
            yield {
                'term': {
                    'fingerprints': {
                        'value': fp,
                        'boost': boost
                    }
                }
            }
    elif prop.type.group is not None:
        yield {
            'term': {
                prop.type.group: {
                    'value': value,
                    'boost': specificity
                }
            }
        }
예제 #36
0
파일: model.py 프로젝트: pudo/linkage
 def generate_linktab(self, chunk_size=10000):
     """Rebuild this view's rows in the link table.

     Deletes existing linktab rows for the view, then inserts one row per
     distinct key value that yields a fingerprint, batched `chunk_size`
     inserts per statement.
     """
     with self.config.engine.begin() as connection:
         q = self.config.linktab.delete()
         q = q.where(self.config.linktab.c.view == self.name)
         connection.execute(q)
         chunk = []
         for i, value in enumerate(self.distinct_key()):
             fp = fingerprints.generate(value)
             if fp is None:
                 continue
             # this is due to postgres' levenshtein
             fp = fp[:255]
             chunk.append({
                 'view': self.name,
                 'serial': self.serial,
                 'key': value,
                 'fingerprint': fp
             })
             if len(chunk) == chunk_size:
                 log.info('Linktab %s (%s): %s', self.name, self.key_ref, i)
                 connection.execute(self.config.linktab.insert(), chunk)
                 chunk = []
         # Flush the final partial chunk.
         if len(chunk):
             connection.execute(self.config.linktab.insert(), chunk)
예제 #37
0
def make_fingerprint(text, **kwargs):
    """Generate a normalised entity name, used for the graph."""
    # string_value coerces the input to clean text (or None) before
    # fingerprinting; kwargs are accepted for API symmetry but unused.
    return fingerprints.generate(string_value(text))
예제 #38
0
def normalizeaddress(value):
    """Return the fingerprint of an address value.

    Returns None when no fingerprint can be generated from the input.
    """
    return fingerprints.generate(value)
예제 #39
0
 def normalize_value(self, value, prop, record):
     """Normalise a mapped value into a single-element fingerprint list."""
     # prop and record are unused here but part of the normaliser contract.
     return [fingerprints.generate(value)]
예제 #40
0
def fingerprint(value, **kwargs):
    """Fingerprint a value after coercing it to text; kwargs are ignored."""
    return fingerprints.generate(string_value(value))
예제 #41
0
def addressfp(value, **kwargs):
    """Fingerprint an address value, preserving token order.

    Returns None for empty input; "<br/>" line breaks are flattened to
    spaces before fingerprinting.
    """
    text = string_value(value)
    if text is None:
        return
    return fingerprints.generate(text.replace("<br/>", " "), keep_order=True)
예제 #42
0
# coding: utf-8
"""Smoke-test fingerprints.generate() against a set of tricky names."""
import fingerprints

tests = [
    u'Foo (Bar) Corp',
    u'ähnlIIch',
    'Open S.A.R.L.',
    'Mr. Boaty McBoatface',
    u'РАДИК ІВАН ЛЬВОВИЧ',
    u'КУШНАРЬОВ ДМИТРО ВІТАЛІЙОВИЧ',
    u'Foo (Bar) CORPORATION',
    'Mr. Sherlock Holmes',
    'Siemens Aktiengesellschaft',
    'New York, New York',
    u'Foo Gesellschaft mit beschränkter Haftung',
    'Software und- Systemgesellschaft mit beschr Haftung'
]

for test in tests:
    out = fingerprints.generate(test)
    # BUG FIX: use the print() function — the Python 2 print statement is
    # a syntax error on Python 3 (print() works on both).
    print(out)