Пример #1
0
def search(raw_query, query_type='/geo/country'):
    """Fuzzy-match a query string against the country lookup table.

    Returns a list of reconciliation candidates sorted by descending
    similarity score (0-100); a score of 100 marks an exact match.
    """
    # Normalise the query once; `raw_query` holds the fingerprint from
    # here on.
    raw_query = fingerprints.generate(raw_query)
    countries = get_countries()

    rv = []

    # FIX: the original re-fingerprinted the already-fingerprinted query
    # (`fingerprints.generate(raw_query)`); use the value computed above.
    matches = countries[raw_query]
    for m in matches:
        m['comparison_score'] = difflib.SequenceMatcher(
            None, raw_query, fingerprints.generate(m['name_to_match'])) \
            .quick_ratio()

    for m in sorted(matches, key=lambda i: i['comparison_score'],
                    reverse=True):
        score = m['comparison_score']
        rv.append({
            'id': str(m['id']),
            'name': m['canonical_name'],
            'type': [QUERY_TYPES[0]['id']],
            'score': score * 100,
            'match': score == 1.0,
            'all_labels': {
                'score': score * 100,
                'weighted': score * 100
            }
        })

    return rv
Пример #2
0
def load_entity(tx, entity):
    """Merge an Aleph entity into the graph as an Entity node.

    Creates/merges the main node keyed on its name fingerprint, links it
    to its collections via PART_OF, and adds alias nodes (AKA) for other
    names whose fingerprint differs from any seen so far. Returns the
    main node.
    """
    log.info("Load node [%s]: %s", entity.id, entity.name)
    node = Node(Vocab.Entity,
                fingerprint=fingerprints.generate(entity.name),
                name=entity.name,
                alephState=entity.state,
                alephEntity=entity.id)
    if entity.jurisdiction_code is not None:
        node['countryCode'] = entity.jurisdiction_code.upper()

    # Entity nodes are de-duplicated on the name fingerprint.
    tx.merge(node, Vocab.Entity, 'fingerprint')
    for collection in entity.collections:
        coll_node = load_collection(tx, collection)
        rel = Relationship(node, Vocab.PART_OF, coll_node,
                           alephEntity=entity.id)
        tx.merge(rel, Vocab.PART_OF)

    # Skip aliases that fingerprint to the main name, to a previously
    # handled alias, or to nothing at all.
    seen = set([node['fingerprint']])
    for other_name in entity.other_names:
        fingerprint = fingerprints.generate(other_name.display_name)
        if fingerprint in seen or fingerprint is None:
            continue
        seen.add(fingerprint)

        alias = Node(Vocab.Entity,
                     fingerprint=fingerprint,
                     name=other_name.display_name,
                     alephEntity=entity.id,
                     isAlias=True)
        tx.merge(alias, Vocab.Entity, 'fingerprint')
        rel = Relationship(node, Vocab.AKA, alias,
                           alephId=other_name.id)
        tx.merge(rel, Vocab.AKA, 'alephId')
    # TODO contact details, addresses
    return node
Пример #3
0
 def fingerprint(self):
     """Pair up members of self.group whose name fingerprints match.

     Populates and returns self.cluster, a list of [a, b] pairs.
     """
     self.cluster = []
     # FIX: compute each member's fingerprint once instead of
     # re-generating it inside the O(n^2) comparison loop.
     fps = [fingerprints.generate(member) for member in self.group]
     for i in range(0, len(self.group)):
         for j in range(i + 1, len(self.group)):
             if fps[i] == fps[j]:
                 self.cluster.append([self.group[i], self.group[j]])
     return self.cluster
Пример #4
0
def load_to_neo4j(project, neo4j_uri=None):
    """Export the project's merged entities and links into a Neo4J graph.

    Wipes the target database, creates one node per merged entity plus
    fake "Name" and "Address" nodes keyed on fingerprints, then creates
    LINK relationships between entities. Rolls back on any error.
    """
    neo4j_uri = neo4j_uri or env.NEO4J_URI
    if neo4j_uri is None:
        project.log.error("No $NEO4J_URI set, cannot load graph.")
        return
    project.log.info("Loading graph to Neo4J: %s", neo4j_uri)
    graph = Graph(neo4j_uri)
    tx = graph.begin()
    try:
        tx.run('MATCH (n) DETACH DELETE n')
        entities = {}
        for entity in project.iter_merged_entities():
            label = entity.pop('type', None) or 'Other'
            node = Node(label, **normalise(entity))
            tx.create(node)
            entities[entity['uid']] = node

            # create "Name" fake nodes
            fps = set()
            for name in entity.get('names', []):
                fp = fingerprints.generate(name)
                if fp is None:
                    continue
                fp = fp.replace(' ', '-')
                if fp in fps:
                    continue
                fps.add(fp)
                alias = Node('Name', name=name, fp=fp)
                tx.merge(alias, 'Name', 'fp')
                rel = Relationship(node, 'ALIAS', alias)
                tx.create(rel)

            address = entity.get('address')
            fp = fingerprints.generate(address)
            if fp is not None:
                fp = fp.replace(' ', '-')
                loc = Node('Address', name=address, fp=fp)
                tx.merge(loc, 'Address', 'fp')
                # BUG FIX: the LOCATION relationship previously pointed at
                # `alias` — a stale Name node from the loop above, or an
                # unbound name when the entity had no names — instead of
                # the Address node `loc` just created.
                rel = Relationship(node, 'LOCATION', loc)
                tx.create(rel)

        for link in project.iter_merged_links():
            source = entities.get(link.pop('source'))
            target = entities.get(link.pop('target'))
            if source is None or target is None:
                continue
            rel = Relationship(source, 'LINK', target, **normalise(link))
            tx.create(rel)

        clear_leaf_nodes(tx, 'Name')
        clear_leaf_nodes(tx, 'Address')
        tx.commit()
    except Exception as ex:
        project.log.exception(ex)
        tx.rollback()
Пример #5
0
    def fingerprint(self):
        """Replace self.key with its name fingerprint.

        Non-str keys are transliterated to ASCII first; when tracing is
        enabled, the original key and its fingerprint are logged.
        """
        text = self.key
        if not isinstance(text, str):
            text = unidecode(text)

        if TRACE_TEXT or TRACE_FP:
            logger_debug('Text.fingerprint:key: ', repr(self.key))
            logger_debug('Text.fingerprint:fp :    ',
                         fingerprints.generate(unidecode(self.key)))

        self.key = fingerprints.generate(text)
Пример #6
0
def entity_similarity(left, right):
    """Score the similarity of two entities on a 0.0-1.0 scale.

    Jaro-Winkler on the plain names contributes up to 0.6 of the score,
    and on the name fingerprints up to 0.4; a missing name or missing
    fingerprint simply contributes nothing.
    """
    name_a = left.get('name')
    name_b = right.get('name')
    total = 0
    if name_a is not None and name_b is not None:
        total += 0.6 * jaro_winkler(chomp(name_a), chomp(name_b))

    fp_a = fingerprints.generate(name_a)
    fp_b = fingerprints.generate(name_b)
    if fp_a is not None and fp_b is not None:
        total += 0.4 * jaro_winkler(fp_a, fp_b)

    return min(1.0, total)
Пример #7
0
def _make_queries(prop, value, specificity):
    """Yield Elasticsearch queries matching `value` for property `prop`.

    Name properties produce a boosted fuzzy match on the fingerprint
    text field, plus a second match on the fingerprinted form of the
    value when it differs; other grouped types produce an exact term
    query.
    """
    if prop.type == registry.name:
        boost = (1 + specificity) * 2
        yield {
            "match": {
                "fingerprints.text": {
                    "query": value,
                    "operator": "and",
                    "minimum_should_match": "60%",
                    "boost": boost,
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is not None and fp != value:
            # BUG FIX: this second query previously repeated `value`,
            # making it an exact duplicate of the query above; the
            # `fp != value` guard shows it is meant to search for the
            # fingerprinted form.
            yield {
                "match": {
                    "fingerprints.text": {
                        "query": fp,
                        "operator": "and",
                        "minimum_should_match": "60%",
                        "boost": boost,
                    }
                }
            }
    elif prop.type.group is not None:
        yield {"term": {prop.type.group: {"value": value}}}
Пример #8
0
def finalize_index(data, schema, texts):
    """Apply final denormalisations to the index."""
    data['schema'] = schema.name
    # Get implied schemata (i.e. parents of the actual schema)
    data['schemata'] = schema.names

    # Collect free-text values from the indexable properties; entity
    # references, dates, URLs/URIs and countries are kept out of the
    # full-text blob.
    properties = data.get('properties', {})
    for name, prop in schema.properties.items():
        if name not in properties:
            continue
        if prop.type_name in ['entity', 'date', 'url', 'uri', 'country']:
            continue
        for value in ensure_list(properties[name]):
            if name == 'name':
                data['name'] = value
            texts.append(value)

    data = schema.invert(data)
    data['text'] = index_form(texts)

    # De-duplicated name fingerprints for exact-ish matching.
    names = data.get('names', [])
    fps = [fingerprints.generate(name) for name in names]
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = list(set(fps))

    # Add latinised names
    # NOTE(review): if latinize_text can return None, a None ends up in
    # data['names'] — confirm downstream consumers tolerate that.
    for name in list(names):
        names.append(latinize_text(name))
    data['names'] = list(set(names))

    if 'created_at' not in data:
        data['created_at'] = data.get('updated_at')
    return data
Пример #9
0
def format_proxy(proxy, collection, job_id=None):
    """Apply final denormalisations to the index."""
    # Drop context metadata before serialising the proxy.
    proxy.context = {}
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['job_id'] = job_id
    # Fingerprints: generated name fingerprints plus the raw names.
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    # NOTE(review): `fps` may still contain None here (only the
    # 'fingerprints' field above is filtered) — confirm the indexer
    # tolerates a None in the text list.
    text.extend(fps)
    text.append(collection.label)
    data['text'] = text

    # The last indexUpdatedAt value wins; default to the collection's
    # own timestamp.
    data['updated_at'] = collection.updated_at
    for updated_at in properties.pop('indexUpdatedAt', []):
        data['updated_at'] = updated_at

    # pprint(data)
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
def parse_company(line):
    """Parse one fixed-width company record line into a field dict."""
    results = {
        # same nomenclature for company_number as in parse_officer().
        'company_number': line[0:8],
        # record_type is always 1 since we're parsing companies.
        'record_type': line[8],
        # company_status (dissolved, active...)
        'company_status_code': line[9],
        'is_company': line[24] == 'Y',
        # filler, can throw away.
        'filler': line[10:32],
        'number_of_officers': line[32:36],
        # holds the length of the name variable (incl. "<" char), used
        # for validation, do not insert in database.
        'unwanted_company_name_length': line[36:40],
        # company names are of varying length and always end '...< \n'.
        'company_name': line[40:].strip('< \n'),
    }
    results["company_name_norm"] = generate(results["company_name"])
    return results
Пример #11
0
def _make_queries(type_, value):
    """Yield Elasticsearch queries matching `value` for a property type.

    Name types produce a fuzzy match on the fingerprint text field, plus
    a second match on the fingerprinted form when it differs (ignoring
    case); other grouped types produce an exact term query.
    """
    if type_ == registry.name:
        yield {
            "match": {
                "fingerprints.text": {
                    "query": value,
                    "operator": "and",
                    "minimum_should_match": "60%",
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is None:
            return
        if fp.lower() != value.lower():
            # BUG FIX: this query previously repeated `value`, making it
            # an exact duplicate of the one above; the case-insensitive
            # difference guard shows it is meant to search the
            # fingerprinted form.
            yield {
                "match": {
                    "fingerprints.text": {
                        "query": fp,
                        "operator": "and",
                        "minimum_should_match": "60%",
                    }
                }
            }
    elif type_.group is not None:
        yield {"term": {type_.group: {"value": value}}}
Пример #12
0
def load_entities(graph):
    """Load composite entities into the graph."""
    tx = graph.begin()
    entities = {}
    try:
        for entity in Entity.iter_composite():
            label = entity.schema or 'Other'
            data = dict(entity.data)
            # aliases are modelled as separate Name nodes below.
            data.pop('aliases', None)
            node = Node(label, origin=entity.origin, **data)
            project.log.info("Node [%s]: %s", label, entity.name)
            tx.create(node)
            # Every uid of the composite maps to the same graph node.
            for uid in entity.uids:
                entities[uid] = node

            for name in entity.names:
                fp = fingerprints.generate(name)
                # NOTE(review): fp can be None, which would merge all
                # un-fingerprintable names into one Name node — confirm
                # this is intended.
                name_node = Node(NAME, name=name, fp=fp)
                tx.merge(name_node, NAME, 'fp')

                rel = Relationship(node, 'ALIAS', name_node)
                tx.create(rel)

        # Presumably prunes Name nodes left without connections —
        # see clear_leaf_nodes.
        clear_leaf_nodes(tx, NAME)
        tx.commit()
        return entities
    except Exception:
        tx.rollback()
        raise
Пример #13
0
def _make_queries(prop, value, specificity):
    """Yield Elasticsearch queries for `value` on property `prop`.

    Name properties produce a boosted fuzzy match on names.text and,
    when a fingerprint can be generated, an exact boosted term query on
    the fingerprints field. Other grouped types produce a term query.
    """
    if prop.type != registry.name:
        if prop.type.group is not None:
            yield {'term': {prop.type.group: {'value': value}}}
        return

    boost = (1 + specificity) * 2
    match_clause = {
        'query': value,
        'operator': 'and',
        'minimum_should_match': '60%',
        'boost': boost
    }
    yield {'match': {'names.text': match_clause}}

    fp = fingerprints.generate(value)
    if fp is not None:
        yield {'term': {'fingerprints': {'value': fp, 'boost': boost}}}
Пример #14
0
 def fingerprint(self, values):
     """Return the non-None fingerprints of `values`.

     TODO: this should not be a property thing, so that fp's can include
     dates etx.
     """
     generated = (fingerprints.generate(value) for value in values)
     return [fp for fp in generated if fp is not None]
Пример #15
0
def _make_queries(prop, value, specificity):
    """Yield Elasticsearch queries matching `value` for property `prop`.

    Name properties produce a boosted fuzzy match on the fingerprint
    text field, plus a second match on the fingerprinted form when it
    differs; other grouped types produce an exact term query.
    """
    if prop.type == registry.name:
        boost = (1 + specificity) * 2
        yield {
            'match': {
                'fingerprints.text': {
                    'query': value,
                    'operator': 'and',
                    'minimum_should_match': '60%',
                    'boost': boost
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is not None and fp != value:
            # BUG FIX: this second query previously repeated `value`,
            # making it an exact duplicate of the query above; the
            # `fp != value` guard shows it is meant to search for the
            # fingerprinted form.
            yield {
                'match': {
                    'fingerprints.text': {
                        'query': fp,
                        'operator': 'and',
                        'minimum_should_match': '60%',
                        'boost': boost
                    }
                }
            }
    elif prop.type.group is not None:
        yield {
            'term': {
                prop.type.group: {
                    'value': value
                }
            }
        }
Пример #16
0
def reconcile_op(query):
    """Reconcile operation for a single query."""
    # Build a pared-down search state; limit defaults to 5 results.
    state = QueryState({
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }, request.authz)

    # Fake entity used as the query document for similarity search.
    name = query.get('query', '')
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [fingerprints.generate(name)],
        'schemata': ensure_list(query.get('type'))
    }

    # Extra reconciliation properties are copied onto the fake entity.
    for p in query.get('properties', []):
        entity[p.get('pid')] = ensure_list(p.get('v'))

    suggested = similar_entities(entity, state)
    matches = []
    for ent in suggested.get('results'):
        # Map the entity schema to freebase-style type descriptors.
        types = [t for t in get_freebase_types() if ent['schema'] == t['id']]
        matches.append({
            'id': ent.get('id'),
            'name': ent.get('name'),
            'type': types,
            'score': min(100, ent.get('score') * 10),
            'uri': entity_link(ent.get('id')),
            'match': ent.get('name') == name
        })
    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {
        'result': matches,
        'num': len(matches)
    }
Пример #17
0
def process_single(resource):
    """Yield rows with a beneficiary_id derived from the name.

    The id is the capitalised name fingerprint, falling back to the raw
    beneficiary_name when no fingerprint can be generated.
    """
    for row in resource:
        fp = fingerprints.generate(row['beneficiary_name'])
        row['beneficiary_id'] = (
            row['beneficiary_name'] if fp is None else fp.capitalize())
        yield row
Пример #18
0
 def fingerprints(self):
     """Return (and memoise) the set of fingerprints of self.names."""
     if not hasattr(self, '_fingerprints'):
         generated = (fingerprints.generate(name) for name in self.names)
         self._fingerprints = {fp for fp in generated if fp is not None}
     return self._fingerprints
def process_single(resource):
    """Yield rows with a beneficiary_id based on the name fingerprint,
    falling back to the raw name when no fingerprint can be made."""
    for row in resource:
        name = row['beneficiary_name']
        fp = fingerprints.generate(name)
        if fp is None:
            row['beneficiary_id'] = name
        else:
            row['beneficiary_id'] = fp.capitalize()
        yield row
Пример #20
0
def get_declared_holders(codebase, holders_tallies):
    """
    Return a list of declared holders from a codebase using the holders
    detected from key files.

    A declared holder is a copyright holder present in the key files who has the
    highest amount of references throughout the codebase.
    """
    # Index codebase-wide tally entries by holder-text fingerprint.
    entry_by_holders = {
        fingerprints.generate(entry['value']): entry
        for entry in holders_tallies if entry['value']
    }
    key_file_holders = get_field_values_from_codebase_resources(
        codebase, 'holders', key_files_only=True)
    entry_by_key_file_holders = {
        fingerprints.generate(entry['holder']): entry
        for entry in key_file_holders if entry['holder']
    }
    unique_key_file_holders = unique(entry_by_key_file_holders.keys())
    # NOTE(review): this raises KeyError when a key-file holder's
    # fingerprint is absent from holders_tallies — confirm key-file
    # holders are always a subset of the tallies.
    unique_key_file_holders_entries = [
        entry_by_holders[holder] for holder in unique_key_file_holders
    ]

    # Bucket holders by how often they are referenced.
    holder_by_counts = defaultdict(list)
    for holder_entry in unique_key_file_holders_entries:
        count = holder_entry.get('count')
        if count:
            holder = holder_entry.get('value')
            holder_by_counts[count].append(holder)

    # Declared holders are those with the highest reference count.
    declared_holders = []
    if holder_by_counts:
        highest_count = max(holder_by_counts)
        declared_holders = holder_by_counts[highest_count]

    # If we could not determine a holder, then we return a list of all the
    # unique key file holders
    if not declared_holders:
        declared_holders = [
            entry['value'] for entry in unique_key_file_holders_entries
        ]

    return declared_holders
Пример #21
0
def index_names(data):
    """Handle entity names on documents and entities."""
    names = data.get('names', [])
    # Unique, non-None fingerprints of every name.
    generated = (fingerprints.generate(name) for name in names)
    data['fingerprints'] = list({fp for fp in generated if fp is not None})

    # Add latinised names
    for name in list(names):
        names.append(ascii_text(name))
    data['names'] = list(set(names))
Пример #22
0
def format_proxy(proxy, collection):
    """Apply final denormalisations to the index."""
    # Abstract entities can appear when profile fragments for a missing entity
    # are present.
    if proxy.schema.abstract:
        return None

    data = proxy.to_full_dict()
    data["schemata"] = list(proxy.schema.names)
    data["caption"] = proxy.caption

    # Fingerprints: generated name fingerprints plus the raw names.
    names = data.get("names", [])
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data["fingerprints"] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get("properties")
    data["text"] = properties.pop("indexText", [])

    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric["dates"] = _numeric_values(registry.date, data.get("dates"))
    data["numeric"] = numeric

    # Context data - from aleph system, not followthemoney.
    data["collection_id"] = collection.id
    data["role_id"] = first(data.get("role_id"))
    data["profile_id"] = first(data.get("profile_id"))
    data["mutable"] = max(ensure_list(data.get("mutable")), default=False)
    data["origin"] = ensure_list(data.get("origin"))
    # Logical simplifications of dates:
    # earliest creation, latest update (falling back to created_at).
    created_at = ensure_list(data.get("created_at"))
    if len(created_at) > 0:
        data["created_at"] = min(created_at)
    updated_at = ensure_list(data.get("updated_at")) or created_at
    if len(updated_at) > 0:
        data["updated_at"] = max(updated_at)

    # log.info("%s", pformat(data))
    entity_id = data.pop("id")
    return {
        "_id": entity_id,
        "_index": entities_write_index(proxy.schema),
        "_source": data,
    }
Пример #23
0
def get_countries():
    """Return the fingerprint -> country-rows lookup, cached on `g`."""
    if 'countries' in g:
        return g.countries

    lookup = collections.defaultdict(list)
    cursor = mysql.connection.cursor(MySQLdb.cursors.DictCursor)
    cursor.execute(LOAD_COUNTRIES_SQL)
    for row in cursor.fetchall():
        # Group rows by the fingerprint of the matchable name.
        lookup[fingerprints.generate(row["name_to_match"])].append(row)

    g.countries = lookup
    return g.countries
Пример #24
0
def _normalize_names(names):
    """Generate a sequence of comparable names for an entity. This also
    generates a `fingerprint`, i.e. a version of the name where all tokens
    are sorted alphabetically, and some parts, such as company suffixes,
    have been removed."""
    emitted = set()
    for name in names:
        plain = normalize(name, ascii=True)
        if plain is not None and plain not in emitted:
            emitted.add(plain)
            yield plain
        fp = fingerprints.generate(name)
        # Skip missing, very short, or already-emitted fingerprints.
        if fp is None or len(fp) <= 6 or fp in emitted:
            continue
        emitted.add(fp)
        yield fp
Пример #25
0
 def compute_key(self, record):
     """Derive a stable sha1 hex key for `record` from the configured key
     columns, or None when no key column yields a usable value.

     The digest is seeded with the dataset name; each value is either
     fingerprinted or string-normalised before hashing, depending on
     self.key_fingerprint.
     """
     digest = sha1(self.query.dataset.name.encode('utf-8'))
     has_key = False
     for key in self.keys:
         value = record.get(key)
         if self.key_fingerprint:
             value = fingerprints.generate(value)
         else:
             value = string_value(value)
         if value is None:
             continue
         digest.update(value.encode('utf-8'))
         has_key = True
     # Implicitly returns None when every key column was empty.
     if has_key:
         return digest.hexdigest()
Пример #26
0
def format_proxy(proxy, collection, extra):
    """Apply final denormalisations to the index."""
    # Drop context metadata before serialising the proxy.
    proxy.context = {}
    proxy = collection.ns.apply(proxy)
    # Pull `indexUpdatedAt` before constructing `data`, so that it doesn't
    # creep into `data['dates']` and mess up date sorting afterwards
    updated_at = proxy.pop('indexUpdatedAt', quiet=True)
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['schemata'] = list(proxy.schema.names)

    # Fingerprints: generated name fingerprints plus the raw names.
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    data['text'] = text

    # The last indexUpdatedAt value wins; default to the collection's.
    data['updated_at'] = collection.updated_at
    for value in updated_at:
        data['updated_at'] = value

    # integer casting
    numeric = {}
    for prop, values in properties.items():
        prop = proxy.schema.get(prop)
        if prop.type in NUMERIC_TYPES:
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    # add possible overrides
    data.update(extra)

    # log.info("%s", pformat(data))
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
Пример #27
0
 def compute_key(self, record):
     """Derive a stable sha1 hex key for `record` from the configured key
     columns; returns None when no keys are configured or none of them
     yields a usable value.

     The digest is seeded with the dataset name; each value is either
     fingerprinted or text-cleaned before hashing, depending on
     self.key_fingerprint.
     """
     if not len(self.keys):
         return None
     digest = sha1(self.query.dataset.name.encode('utf-8'))
     # digest.update(self.schema.name.encode('utf-8'))
     has_key = False
     for key in self.keys:
         value = record.get(key)
         if self.key_fingerprint:
             value = fingerprints.generate(value)
         else:
             value = clean_text(value)
         if value is None:
             continue
         digest.update(value.encode('utf-8'))
         has_key = True
     # Implicitly returns None when every key column was empty.
     if has_key:
         return digest.hexdigest()
Пример #28
0
def format_proxy(proxy, collection):
    """Apply final denormalisations to the index."""
    data = proxy.to_full_dict()
    data["schemata"] = list(proxy.schema.names)

    # Fingerprints: generated name fingerprints plus the raw names.
    names = data.get("names", [])
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data["fingerprints"] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get("properties")
    text = properties.pop("indexText", [])
    text.extend(fps)
    data["text"] = text

    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric["dates"] = _numeric_values(registry.date, data.get("dates"))
    data["numeric"] = numeric

    # Context data - from aleph system, not followthemoney.
    # FIXME: Can there ever really be multiple role_ids?
    data["role_id"] = first(data.get("role_id"))
    data["mutable"] = max(ensure_list(data.get("mutable")), default=False)
    data["origin"] = ensure_list(data.get("origin"))
    # NOTE(review): .get's second argument only applies when the key is
    # absent; an existing falsy 'updated_at' is kept — confirm intended.
    created_at = data.get("created_at")
    if created_at:
        data["updated_at"] = data.get("updated_at", created_at)
    data["collection_id"] = collection.id
    # log.info("%s", pformat(data))
    entity_id = data.pop("id")
    return {
        "_id": entity_id,
        "_index": entities_write_index(data.get("schema")),
        "_source": data,
    }
Пример #29
0
def format_proxy(proxy, collection):
    """Apply final denormalisations to the index."""
    data = proxy.to_full_dict()
    data['schemata'] = list(proxy.schema.names)

    # Fingerprints: generated name fingerprints plus the raw names.
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    data['text'] = text

    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    # Context data - from aleph system, not followthemoney.
    now = iso_text(datetime.utcnow())
    data['created_at'] = min(ensure_list(data.get('created_at')), default=now)
    # BUG FIX: updated_at previously used min(), picking the *earliest*
    # update; the latest timestamp is the meaningful "updated" value
    # (cf. the max() handling in the sibling format_proxy variant).
    data['updated_at'] = max(ensure_list(data.get('updated_at')), default=now)
    # FIXME: Can there ever really be multiple role_ids?
    data['role_id'] = first(data.get('role_id'))
    data['mutable'] = max(ensure_list(data.get('mutable')), default=False)
    data['origin'] = ensure_list(data.get('origin'))
    data['collection_id'] = collection.id
    # log.info("%s", pformat(data))
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
Пример #30
0
def normalize_value(text):
    """Normalise a cell value for comparison.

    Returns the space-padded, order-preserving fingerprint encoded as
    UTF-8 bytes, or None for empty, numeric, or very short values.
    """
    if text is None:
        return

    try:
        # see if this the cell value clearly numeric:
        float(text)
        return
    except (TypeError, ValueError):
        # FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; only conversion failures matter.
        pass

    text = fingerprints.generate(text, keep_order=True)
    if text is None:
        return

    # Too short to be a meaningful comparison key.
    if len(text) <= 3:
        return

    text = u' %s ' % text
    return text.encode('utf-8')
Пример #31
0
def reconcile_op(query):
    """Reconcile operation for a single query."""
    # Build a pared-down search parser; limit defaults to 5 results.
    parser = SearchQueryParser({
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }, request.authz)

    # Fake entity used as the query document for similarity search.
    name = query.get('query', '')
    schema = query.get('type') or 'Thing'
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [fingerprints.generate(name)],
        'schemata': ensure_list(schema),
        'schema': schema
    }

    # Extra reconciliation properties are copied onto the fake entity.
    for p in query.get('properties', []):
        entity[p.get('pid')] = ensure_list(p.get('v'))

    query = SimilarEntitiesQuery(parser, entity=entity)
    matches = []
    for doc in query.search().get('hits').get('hits'):
        source = doc.get('_source')
        match = {
            'id': doc.get('_id'),
            'name': source.get('name'),
            'score': min(100, doc.get('_score') * 10),
            'uri': entity_url(doc.get('_id')),
            'match': source.get('name') == name
        }
        # Attach the freebase-style type descriptor for the schema.
        for type_ in get_freebase_types():
            if source['schema'] == type_['id']:
                match['type'] = [type_]
        matches.append(match)

    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {
        'result': matches,
        'num': len(matches)
    }
Пример #32
0
def format_proxy(proxy, collection, job_id=None):
    """Apply final denormalisations to the index."""
    # Drop context metadata before serialising the proxy.
    proxy.context = {}
    proxy = collection.ns.apply(proxy)
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['job_id'] = job_id
    # Fingerprints: generated name fingerprints plus the raw names.
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    text.append(collection.label)
    data['text'] = text

    # The last indexUpdatedAt value wins; default to the collection's.
    data['updated_at'] = collection.updated_at
    for updated_at in properties.pop('indexUpdatedAt', []):
        data['updated_at'] = updated_at

    # integer casting
    numeric = {}
    for prop, values in properties.items():
        prop = proxy.schema.get(prop)
        if prop.type in NUMERIC_TYPES:
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    # pprint(data)
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
Пример #33
0
def finalize_index(proxy, context, texts):
    """Apply final denormalisations to the index."""
    # Collect indexable property values into the free-text blob; entity
    # references, dates, URLs, countries and languages are excluded.
    for prop, value in proxy.itervalues():
        if prop.type.name in ['entity', 'date', 'url', 'country', 'language']:
            continue
        texts.append(value)

    entity = proxy.to_full_dict()
    data = merge_data(context, entity)
    data['name'] = proxy.caption
    data['text'] = index_form(texts)

    # De-duplicated name fingerprints for matching.
    names = data.get('names', [])
    fps = [fingerprints.generate(name) for name in names]
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = list(set(fps))

    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')
    data.pop('id', None)
    return clean_dict(data)
Пример #34
0
def index_operation(data):
    """Apply final denormalisations to the index."""
    data['bulk'] = data.get('bulk', False)
    # Fingerprints: generated name fingerprints plus the raw names.
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    texts = data.pop('text', [])
    texts.extend(data.get('properties', {}).pop('indexText', []))
    # NOTE(review): fps may still contain None here (only the
    # 'fingerprints' field above is filtered) — confirm the indexer
    # tolerates a None in the text list.
    texts.extend(fps)
    data['text'] = texts

    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')

    entity_id = str(data.pop('id'))
    data.pop('_index', None)
    index = entities_write_index(data.get('schema'))
    return entity_id, index, data
Пример #35
0
def _make_queries(prop, value):
    """Yield Elasticsearch queries for `value` on `prop`.

    Zero-specificity values produce nothing. Name properties produce a
    boosted fuzzy match on names.text plus an exact boosted term on the
    fingerprints field when one can be generated; other grouped types
    produce a specificity-boosted term query.
    """
    specificity = prop.type.specificity(value)
    if specificity == 0:
        return

    if prop.type == registry.name:
        boost = (1 + specificity) * 2
        match_clause = {
            'query': value,
            'operator': 'and',
            'minimum_should_match': '60%',
            'boost': boost
        }
        yield {'match': {'names.text': match_clause}}
        fp = fingerprints.generate(value)
        if fp is not None:
            yield {'term': {'fingerprints': {'value': fp, 'boost': boost}}}
        return

    if prop.type.group is None:
        return
    yield {
        'term': {
            prop.type.group: {
                'value': value,
                'boost': specificity
            }
        }
    }
Пример #36
0
 def generate_linktab(self, chunk_size=10000):
     """Rebuild this view's rows in the link table.

     Deletes existing rows for the view, then inserts one row per
     distinct key value with its fingerprint, batching inserts in
     groups of `chunk_size`.
     """
     with self.config.engine.begin() as connection:
         q = self.config.linktab.delete()
         q = q.where(self.config.linktab.c.view == self.name)
         connection.execute(q)
         chunk = []
         for i, value in enumerate(self.distinct_key()):
             fp = fingerprints.generate(value)
             # Values with no fingerprint cannot be linked; skip them.
             if fp is None:
                 continue
             # this is due to postgres' levenshtein
             fp = fp[:255]
             chunk.append({
                 'view': self.name,
                 'serial': self.serial,
                 'key': value,
                 'fingerprint': fp
             })
             if len(chunk) == chunk_size:
                 log.info('Linktab %s (%s): %s', self.name, self.key_ref, i)
                 connection.execute(self.config.linktab.insert(), chunk)
                 chunk = []
         # Flush the final partial batch.
         if len(chunk):
             connection.execute(self.config.linktab.insert(), chunk)
Пример #37
0
def make_fingerprint(text, **kwargs):
    """Generate a normalised entity name, used for the graph."""
    cleaned = string_value(text)
    return fingerprints.generate(cleaned)
Пример #38
0
def normalizeaddress(value):
    """Return the name fingerprint of an address value."""
    fp = fingerprints.generate(value)
    return fp
Пример #39
0
 def normalize_value(self, value, prop, record):
     """Return the value's fingerprint wrapped in a one-element list."""
     fp = fingerprints.generate(value)
     return [fp]
Пример #40
0
def fingerprint(value, **kwargs):
    """Fingerprint the string form of `value`."""
    text = string_value(value)
    return fingerprints.generate(text)
Пример #41
0
def addressfp(value, **kwargs):
    """Fingerprint an address value, preserving token order.

    HTML line breaks are flattened to spaces first; returns None when
    string_value yields nothing for the input.
    """
    text = string_value(value)
    if text is None:
        return
    flattened = text.replace("<br/>", " ")
    return fingerprints.generate(flattened, keep_order=True)
Пример #42
0
# coding: utf-8
"""Smoke-test fingerprints.generate() against a set of tricky names."""
import fingerprints

tests = [
    u'Foo (Bar) Corp',
    u'ähnlIIch',
    'Open S.A.R.L.',
    'Mr. Boaty McBoatface',
    u'РАДИК ІВАН ЛЬВОВИЧ',
    u'КУШНАРЬОВ ДМИТРО ВІТАЛІЙОВИЧ',
    u'Foo (Bar) CORPORATION',
    'Mr. Sherlock Holmes',
    'Siemens Aktiengesellschaft',
    'New York, New York',
    u'Foo Gesellschaft mit beschränkter Haftung',
    'Software und- Systemgesellschaft mit beschr Haftung'
]

for test in tests:
    out = fingerprints.generate(test)
    # BUG FIX: `print out` is Python 2 statement syntax and a
    # SyntaxError on Python 3; the function form works on both.
    print(out)