Пример #1
0
def search(raw_query, query_type='/geo/country'):
    """Fuzzy-match a query string against the country lookup table.

    Returns a list of reconciliation candidates sorted by descending
    similarity score (0-100); a score of 100 marks an exact match.
    """
    # Normalise the query once; `raw_query` holds the fingerprint from
    # here on.
    raw_query = fingerprints.generate(raw_query)
    countries = get_countries()

    rv = []

    # FIX: the original re-fingerprinted the already-fingerprinted query
    # (`fingerprints.generate(raw_query)`); use the value computed above.
    matches = countries[raw_query]
    for m in matches:
        m['comparison_score'] = difflib.SequenceMatcher(
            None, raw_query, fingerprints.generate(m['name_to_match'])) \
            .quick_ratio()

    for m in sorted(matches, key=lambda i: i['comparison_score'],
                    reverse=True):
        score = m['comparison_score']
        rv.append({
            'id': str(m['id']),
            'name': m['canonical_name'],
            'type': [QUERY_TYPES[0]['id']],
            'score': score * 100,
            'match': score == 1.0,
            'all_labels': {
                'score': score * 100,
                'weighted': score * 100
            }
        })

    return rv
Пример #2
0
def load_entity(tx, entity):
    """Merge an Aleph entity into the graph as an Entity node.

    Creates/merges the main node keyed on its name fingerprint, links it
    to its collections via PART_OF, and adds alias nodes (AKA) for other
    names whose fingerprint differs from any seen so far. Returns the
    main node.
    """
    log.info("Load node [%s]: %s", entity.id, entity.name)
    node = Node(Vocab.Entity,
                fingerprint=fingerprints.generate(entity.name),
                name=entity.name,
                alephState=entity.state,
                alephEntity=entity.id)
    if entity.jurisdiction_code is not None:
        node['countryCode'] = entity.jurisdiction_code.upper()

    # Entity nodes are de-duplicated on the name fingerprint.
    tx.merge(node, Vocab.Entity, 'fingerprint')
    for collection in entity.collections:
        coll_node = load_collection(tx, collection)
        rel = Relationship(node, Vocab.PART_OF, coll_node,
                           alephEntity=entity.id)
        tx.merge(rel, Vocab.PART_OF)

    # Skip aliases that fingerprint to the main name, to a previously
    # handled alias, or to nothing at all.
    seen = set([node['fingerprint']])
    for other_name in entity.other_names:
        fingerprint = fingerprints.generate(other_name.display_name)
        if fingerprint in seen or fingerprint is None:
            continue
        seen.add(fingerprint)

        alias = Node(Vocab.Entity,
                     fingerprint=fingerprint,
                     name=other_name.display_name,
                     alephEntity=entity.id,
                     isAlias=True)
        tx.merge(alias, Vocab.Entity, 'fingerprint')
        rel = Relationship(node, Vocab.AKA, alias,
                           alephId=other_name.id)
        tx.merge(rel, Vocab.AKA, 'alephId')
    # TODO contact details, addresses
    return node
Пример #3
0
 def fingerprint(self):
     """Pair up members of self.group whose name fingerprints match.

     Populates and returns self.cluster, a list of [a, b] pairs.
     """
     self.cluster = []
     # FIX: compute each member's fingerprint once instead of
     # re-generating it inside the O(n^2) comparison loop.
     fps = [fingerprints.generate(member) for member in self.group]
     for i in range(0, len(self.group)):
         for j in range(i + 1, len(self.group)):
             if fps[i] == fps[j]:
                 self.cluster.append([self.group[i], self.group[j]])
     return self.cluster
Пример #4
0
def load_to_neo4j(project, neo4j_uri=None):
    """Export the project's merged entities and links into a Neo4J graph.

    Wipes the target database, creates one node per merged entity plus
    fake "Name" and "Address" nodes keyed on fingerprints, then creates
    LINK relationships between entities. Rolls back on any error.
    """
    neo4j_uri = neo4j_uri or env.NEO4J_URI
    if neo4j_uri is None:
        project.log.error("No $NEO4J_URI set, cannot load graph.")
        return
    project.log.info("Loading graph to Neo4J: %s", neo4j_uri)
    graph = Graph(neo4j_uri)
    tx = graph.begin()
    try:
        tx.run('MATCH (n) DETACH DELETE n')
        entities = {}
        for entity in project.iter_merged_entities():
            label = entity.pop('type', None) or 'Other'
            node = Node(label, **normalise(entity))
            tx.create(node)
            entities[entity['uid']] = node

            # create "Name" fake nodes
            fps = set()
            for name in entity.get('names', []):
                fp = fingerprints.generate(name)
                if fp is None:
                    continue
                fp = fp.replace(' ', '-')
                if fp in fps:
                    continue
                fps.add(fp)
                alias = Node('Name', name=name, fp=fp)
                tx.merge(alias, 'Name', 'fp')
                rel = Relationship(node, 'ALIAS', alias)
                tx.create(rel)

            address = entity.get('address')
            fp = fingerprints.generate(address)
            if fp is not None:
                fp = fp.replace(' ', '-')
                loc = Node('Address', name=address, fp=fp)
                tx.merge(loc, 'Address', 'fp')
                # BUG FIX: the LOCATION relationship previously pointed at
                # `alias` — a stale Name node from the loop above, or an
                # unbound name when the entity had no names — instead of
                # the Address node `loc` just created.
                rel = Relationship(node, 'LOCATION', loc)
                tx.create(rel)

        for link in project.iter_merged_links():
            source = entities.get(link.pop('source'))
            target = entities.get(link.pop('target'))
            if source is None or target is None:
                continue
            rel = Relationship(source, 'LINK', target, **normalise(link))
            tx.create(rel)

        clear_leaf_nodes(tx, 'Name')
        clear_leaf_nodes(tx, 'Address')
        tx.commit()
    except Exception as ex:
        project.log.exception(ex)
        tx.rollback()
Пример #5
0
    def fingerprint(self):
        """Replace self.key with its name fingerprint.

        Non-str keys are transliterated to ASCII first; when tracing is
        enabled, the original key and its fingerprint are logged.
        """
        text = self.key
        if not isinstance(text, str):
            text = unidecode(text)

        if TRACE_TEXT or TRACE_FP:
            logger_debug('Text.fingerprint:key: ', repr(self.key))
            logger_debug('Text.fingerprint:fp :    ',
                         fingerprints.generate(unidecode(self.key)))

        self.key = fingerprints.generate(text)
Пример #6
0
def entity_similarity(left, right):
    """Score the similarity of two entities on a 0.0-1.0 scale.

    Jaro-Winkler on the plain names contributes up to 0.6 of the score,
    and on the name fingerprints up to 0.4; a missing name or missing
    fingerprint simply contributes nothing.
    """
    name_a = left.get('name')
    name_b = right.get('name')
    total = 0
    if name_a is not None and name_b is not None:
        total += 0.6 * jaro_winkler(chomp(name_a), chomp(name_b))

    fp_a = fingerprints.generate(name_a)
    fp_b = fingerprints.generate(name_b)
    if fp_a is not None and fp_b is not None:
        total += 0.4 * jaro_winkler(fp_a, fp_b)

    return min(1.0, total)
Пример #7
0
def _make_queries(prop, value, specificity):
    """Yield Elasticsearch queries matching `value` for property `prop`.

    Name properties produce a boosted fuzzy match on the fingerprint
    text field, plus a second match on the fingerprinted form of the
    value when it differs; other grouped types produce an exact term
    query.
    """
    if prop.type == registry.name:
        boost = (1 + specificity) * 2
        yield {
            "match": {
                "fingerprints.text": {
                    "query": value,
                    "operator": "and",
                    "minimum_should_match": "60%",
                    "boost": boost,
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is not None and fp != value:
            # BUG FIX: this second query previously repeated `value`,
            # making it an exact duplicate of the query above; the
            # `fp != value` guard shows it is meant to search for the
            # fingerprinted form.
            yield {
                "match": {
                    "fingerprints.text": {
                        "query": fp,
                        "operator": "and",
                        "minimum_should_match": "60%",
                        "boost": boost,
                    }
                }
            }
    elif prop.type.group is not None:
        yield {"term": {prop.type.group: {"value": value}}}
Пример #8
0
def finalize_index(data, schema, texts):
    """Apply final denormalisations to the index."""
    data['schema'] = schema.name
    # Get implied schemata (i.e. parents of the actual schema)
    data['schemata'] = schema.names

    # Collect free-text values from the indexable properties; entity
    # references, dates, URLs/URIs and countries are kept out of the
    # full-text blob.
    properties = data.get('properties', {})
    for name, prop in schema.properties.items():
        if name not in properties:
            continue
        if prop.type_name in ['entity', 'date', 'url', 'uri', 'country']:
            continue
        for value in ensure_list(properties[name]):
            if name == 'name':
                data['name'] = value
            texts.append(value)

    data = schema.invert(data)
    data['text'] = index_form(texts)

    # De-duplicated name fingerprints for exact-ish matching.
    names = data.get('names', [])
    fps = [fingerprints.generate(name) for name in names]
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = list(set(fps))

    # Add latinised names
    # NOTE(review): if latinize_text can return None, a None ends up in
    # data['names'] — confirm downstream consumers tolerate that.
    for name in list(names):
        names.append(latinize_text(name))
    data['names'] = list(set(names))

    if 'created_at' not in data:
        data['created_at'] = data.get('updated_at')
    return data
Пример #9
0
def format_proxy(proxy, collection, job_id=None):
    """Apply final denormalisations to the index."""
    # Drop context metadata before serialising the proxy.
    proxy.context = {}
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['job_id'] = job_id
    # Fingerprints: generated name fingerprints plus the raw names.
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    # NOTE(review): `fps` may still contain None here (only the
    # 'fingerprints' field above is filtered) — confirm the indexer
    # tolerates a None in the text list.
    text.extend(fps)
    text.append(collection.label)
    data['text'] = text

    # The last indexUpdatedAt value wins; default to the collection's
    # own timestamp.
    data['updated_at'] = collection.updated_at
    for updated_at in properties.pop('indexUpdatedAt', []):
        data['updated_at'] = updated_at

    # pprint(data)
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
def parse_company(line):
    """Parse one fixed-width company record line into a field dict."""
    results = {
        # same nomenclature for company_number as in parse_officer().
        'company_number': line[0:8],
        # record_type is always 1 since we're parsing companies.
        'record_type': line[8],
        # company_status (dissolved, active...)
        'company_status_code': line[9],
        'is_company': line[24] == 'Y',
        # filler, can throw away.
        'filler': line[10:32],
        'number_of_officers': line[32:36],
        # holds the length of the name variable (incl. "<" char), used
        # for validation, do not insert in database.
        'unwanted_company_name_length': line[36:40],
        # company names are of varying length and always end '...< \n'.
        'company_name': line[40:].strip('< \n'),
    }
    results["company_name_norm"] = generate(results["company_name"])
    return results
Пример #11
0
def _make_queries(type_, value):
    """Yield Elasticsearch queries matching `value` for a property type.

    Name types produce a fuzzy match on the fingerprint text field, plus
    a second match on the fingerprinted form when it differs (ignoring
    case); other grouped types produce an exact term query.
    """
    if type_ == registry.name:
        yield {
            "match": {
                "fingerprints.text": {
                    "query": value,
                    "operator": "and",
                    "minimum_should_match": "60%",
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is None:
            return
        if fp.lower() != value.lower():
            # BUG FIX: this query previously repeated `value`, making it
            # an exact duplicate of the one above; the case-insensitive
            # difference guard shows it is meant to search the
            # fingerprinted form.
            yield {
                "match": {
                    "fingerprints.text": {
                        "query": fp,
                        "operator": "and",
                        "minimum_should_match": "60%",
                    }
                }
            }
    elif type_.group is not None:
        yield {"term": {type_.group: {"value": value}}}
Пример #12
0
def load_entities(graph):
    """Load composite entities into the graph."""
    tx = graph.begin()
    entities = {}
    try:
        for entity in Entity.iter_composite():
            label = entity.schema or 'Other'
            data = dict(entity.data)
            # aliases are modelled as separate Name nodes below.
            data.pop('aliases', None)
            node = Node(label, origin=entity.origin, **data)
            project.log.info("Node [%s]: %s", label, entity.name)
            tx.create(node)
            # Every uid of the composite maps to the same graph node.
            for uid in entity.uids:
                entities[uid] = node

            for name in entity.names:
                fp = fingerprints.generate(name)
                # NOTE(review): fp can be None, which would merge all
                # un-fingerprintable names into one Name node — confirm
                # this is intended.
                name_node = Node(NAME, name=name, fp=fp)
                tx.merge(name_node, NAME, 'fp')

                rel = Relationship(node, 'ALIAS', name_node)
                tx.create(rel)

        # Presumably prunes Name nodes left without connections —
        # see clear_leaf_nodes.
        clear_leaf_nodes(tx, NAME)
        tx.commit()
        return entities
    except Exception:
        tx.rollback()
        raise
Пример #13
0
def _make_queries(prop, value, specificity):
    """Yield Elasticsearch queries for `value` on property `prop`.

    Name properties produce a boosted fuzzy match on names.text and,
    when a fingerprint can be generated, an exact boosted term query on
    the fingerprints field. Other grouped types produce a term query.
    """
    if prop.type != registry.name:
        if prop.type.group is not None:
            yield {'term': {prop.type.group: {'value': value}}}
        return

    boost = (1 + specificity) * 2
    match_clause = {
        'query': value,
        'operator': 'and',
        'minimum_should_match': '60%',
        'boost': boost
    }
    yield {'match': {'names.text': match_clause}}

    fp = fingerprints.generate(value)
    if fp is not None:
        yield {'term': {'fingerprints': {'value': fp, 'boost': boost}}}
Пример #14
0
 def fingerprint(self, values):
     """Return the non-None fingerprints of `values`.

     TODO: this should not be a property thing, so that fp's can include
     dates etx.
     """
     generated = (fingerprints.generate(value) for value in values)
     return [fp for fp in generated if fp is not None]
Пример #15
0
def _make_queries(prop, value, specificity):
    """Yield Elasticsearch queries matching `value` for property `prop`.

    Name properties produce a boosted fuzzy match on the fingerprint
    text field, plus a second match on the fingerprinted form when it
    differs; other grouped types produce an exact term query.
    """
    if prop.type == registry.name:
        boost = (1 + specificity) * 2
        yield {
            'match': {
                'fingerprints.text': {
                    'query': value,
                    'operator': 'and',
                    'minimum_should_match': '60%',
                    'boost': boost
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is not None and fp != value:
            # BUG FIX: this second query previously repeated `value`,
            # making it an exact duplicate of the query above; the
            # `fp != value` guard shows it is meant to search for the
            # fingerprinted form.
            yield {
                'match': {
                    'fingerprints.text': {
                        'query': fp,
                        'operator': 'and',
                        'minimum_should_match': '60%',
                        'boost': boost
                    }
                }
            }
    elif prop.type.group is not None:
        yield {
            'term': {
                prop.type.group: {
                    'value': value
                }
            }
        }
Пример #16
0
def reconcile_op(query):
    """Reconcile operation for a single query."""
    # Build a pared-down search state; limit defaults to 5 results.
    state = QueryState({
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }, request.authz)

    # Fake entity used as the query document for similarity search.
    name = query.get('query', '')
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [fingerprints.generate(name)],
        'schemata': ensure_list(query.get('type'))
    }

    # Extra reconciliation properties are copied onto the fake entity.
    for p in query.get('properties', []):
        entity[p.get('pid')] = ensure_list(p.get('v'))

    suggested = similar_entities(entity, state)
    matches = []
    for ent in suggested.get('results'):
        # Map the entity schema to freebase-style type descriptors.
        types = [t for t in get_freebase_types() if ent['schema'] == t['id']]
        matches.append({
            'id': ent.get('id'),
            'name': ent.get('name'),
            'type': types,
            'score': min(100, ent.get('score') * 10),
            'uri': entity_link(ent.get('id')),
            'match': ent.get('name') == name
        })
    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {
        'result': matches,
        'num': len(matches)
    }
Пример #17
0
def process_single(resource):
    """Yield rows with a beneficiary_id derived from the name.

    The id is the capitalised name fingerprint, falling back to the raw
    beneficiary_name when no fingerprint can be generated.
    """
    for row in resource:
        fp = fingerprints.generate(row['beneficiary_name'])
        row['beneficiary_id'] = (
            row['beneficiary_name'] if fp is None else fp.capitalize())
        yield row
Пример #18
0
 def fingerprints(self):
     """Return (and memoise) the set of fingerprints of self.names."""
     if not hasattr(self, '_fingerprints'):
         generated = (fingerprints.generate(name) for name in self.names)
         self._fingerprints = {fp for fp in generated if fp is not None}
     return self._fingerprints
def process_single(resource):
    """Yield rows with a beneficiary_id based on the name fingerprint,
    falling back to the raw name when no fingerprint can be made."""
    for row in resource:
        name = row['beneficiary_name']
        fp = fingerprints.generate(name)
        if fp is None:
            row['beneficiary_id'] = name
        else:
            row['beneficiary_id'] = fp.capitalize()
        yield row
Пример #20
0
def get_declared_holders(codebase, holders_tallies):
    """
    Return a list of declared holders from a codebase using the holders
    detected from key files.

    A declared holder is a copyright holder present in the key files who has the
    highest amount of references throughout the codebase.
    """
    # Index codebase-wide tally entries by holder-text fingerprint.
    entry_by_holders = {
        fingerprints.generate(entry['value']): entry
        for entry in holders_tallies if entry['value']
    }
    key_file_holders = get_field_values_from_codebase_resources(
        codebase, 'holders', key_files_only=True)
    entry_by_key_file_holders = {
        fingerprints.generate(entry['holder']): entry
        for entry in key_file_holders if entry['holder']
    }
    unique_key_file_holders = unique(entry_by_key_file_holders.keys())
    # NOTE(review): this raises KeyError when a key-file holder's
    # fingerprint is absent from holders_tallies — confirm key-file
    # holders are always a subset of the tallies.
    unique_key_file_holders_entries = [
        entry_by_holders[holder] for holder in unique_key_file_holders
    ]

    # Bucket holders by how often they are referenced.
    holder_by_counts = defaultdict(list)
    for holder_entry in unique_key_file_holders_entries:
        count = holder_entry.get('count')
        if count:
            holder = holder_entry.get('value')
            holder_by_counts[count].append(holder)

    # Declared holders are those with the highest reference count.
    declared_holders = []
    if holder_by_counts:
        highest_count = max(holder_by_counts)
        declared_holders = holder_by_counts[highest_count]

    # If we could not determine a holder, then we return a list of all the
    # unique key file holders
    if not declared_holders:
        declared_holders = [
            entry['value'] for entry in unique_key_file_holders_entries
        ]

    return declared_holders
Пример #21
0
def index_names(data):
    """Handle entity names on documents and entities."""
    names = data.get('names', [])
    # Unique, non-None fingerprints of every name.
    generated = (fingerprints.generate(name) for name in names)
    data['fingerprints'] = list({fp for fp in generated if fp is not None})

    # Add latinised names
    for name in list(names):
        names.append(ascii_text(name))
    data['names'] = list(set(names))
Пример #22
0
def format_proxy(proxy, collection):
    """Apply final denormalisations to the index."""
    # Abstract entities can appear when profile fragments for a missing entity
    # are present.
    if proxy.schema.abstract:
        return None

    data = proxy.to_full_dict()
    data["schemata"] = list(proxy.schema.names)
    data["caption"] = proxy.caption

    # Fingerprints: generated name fingerprints plus the raw names.
    names = data.get("names", [])
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data["fingerprints"] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get("properties")
    data["text"] = properties.pop("indexText", [])

    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric["dates"] = _numeric_values(registry.date, data.get("dates"))
    data["numeric"] = numeric

    # Context data - from aleph system, not followthemoney.
    data["collection_id"] = collection.id
    data["role_id"] = first(data.get("role_id"))
    data["profile_id"] = first(data.get("profile_id"))
    data["mutable"] = max(ensure_list(data.get("mutable")), default=False)
    data["origin"] = ensure_list(data.get("origin"))
    # Logical simplifications of dates:
    # earliest creation, latest update (falling back to created_at).
    created_at = ensure_list(data.get("created_at"))
    if len(created_at) > 0:
        data["created_at"] = min(created_at)
    updated_at = ensure_list(data.get("updated_at")) or created_at
    if len(updated_at) > 0:
        data["updated_at"] = max(updated_at)

    # log.info("%s", pformat(data))
    entity_id = data.pop("id")
    return {
        "_id": entity_id,
        "_index": entities_write_index(proxy.schema),
        "_source": data,
    }
Пример #23
0
def get_countries():
    """Return the fingerprint -> country-rows lookup, cached on `g`."""
    if 'countries' in g:
        return g.countries

    lookup = collections.defaultdict(list)
    cursor = mysql.connection.cursor(MySQLdb.cursors.DictCursor)
    cursor.execute(LOAD_COUNTRIES_SQL)
    for row in cursor.fetchall():
        # Group rows by the fingerprint of the matchable name.
        lookup[fingerprints.generate(row["name_to_match"])].append(row)

    g.countries = lookup
    return g.countries
Пример #24
0
def _normalize_names(names):
    """Generate a sequence of comparable names for an entity. This also
    generates a `fingerprint`, i.e. a version of the name where all tokens
    are sorted alphabetically, and some parts, such as company suffixes,
    have been removed."""
    emitted = set()
    for name in names:
        plain = normalize(name, ascii=True)
        if plain is not None and plain not in emitted:
            emitted.add(plain)
            yield plain
        fp = fingerprints.generate(name)
        # Skip missing, very short, or already-emitted fingerprints.
        if fp is None or len(fp) <= 6 or fp in emitted:
            continue
        emitted.add(fp)
        yield fp
Пример #25
0
 def compute_key(self, record):
     """Derive a stable sha1 hex key for `record` from the configured key
     columns, or None when no key column yields a usable value.

     The digest is seeded with the dataset name; each value is either
     fingerprinted or string-normalised before hashing, depending on
     self.key_fingerprint.
     """
     digest = sha1(self.query.dataset.name.encode('utf-8'))
     has_key = False
     for key in self.keys:
         value = record.get(key)
         if self.key_fingerprint:
             value = fingerprints.generate(value)
         else:
             value = string_value(value)
         if value is None:
             continue
         digest.update(value.encode('utf-8'))
         has_key = True
     # Implicitly returns None when every key column was empty.
     if has_key:
         return digest.hexdigest()
Пример #26
0
def format_proxy(proxy, collection, extra):
    """Apply final denormalisations to the index."""
    # Drop context metadata before serialising the proxy.
    proxy.context = {}
    proxy = collection.ns.apply(proxy)
    # Pull `indexUpdatedAt` before constructing `data`, so that it doesn't
    # creep into `data['dates']` and mess up date sorting afterwards
    updated_at = proxy.pop('indexUpdatedAt', quiet=True)
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['schemata'] = list(proxy.schema.names)

    # Fingerprints: generated name fingerprints plus the raw names.
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    data['text'] = text

    # The last indexUpdatedAt value wins; default to the collection's.
    data['updated_at'] = collection.updated_at
    for value in updated_at:
        data['updated_at'] = value

    # integer casting
    numeric = {}
    for prop, values in properties.items():
        prop = proxy.schema.get(prop)
        if prop.type in NUMERIC_TYPES:
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    # add possible overrides
    data.update(extra)

    # log.info("%s", pformat(data))
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
Пример #27
0
 def compute_key(self, record):
     """Derive a stable sha1 hex key for `record` from the configured key
     columns; returns None when no keys are configured or none of them
     yields a usable value.

     The digest is seeded with the dataset name; each value is either
     fingerprinted or text-cleaned before hashing, depending on
     self.key_fingerprint.
     """
     if not len(self.keys):
         return None
     digest = sha1(self.query.dataset.name.encode('utf-8'))
     # digest.update(self.schema.name.encode('utf-8'))
     has_key = False
     for key in self.keys:
         value = record.get(key)
         if self.key_fingerprint:
             value = fingerprints.generate(value)
         else:
             value = clean_text(value)
         if value is None:
             continue
         digest.update(value.encode('utf-8'))
         has_key = True
     # Implicitly returns None when every key column was empty.
     if has_key:
         return digest.hexdigest()
Пример #28
0
def format_proxy(proxy, collection):
    """Apply final denormalisations to the index."""
    data = proxy.to_full_dict()
    data["schemata"] = list(proxy.schema.names)

    # Fingerprints: generated name fingerprints plus the raw names.
    names = data.get("names", [])
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data["fingerprints"] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get("properties")
    text = properties.pop("indexText", [])
    text.extend(fps)
    data["text"] = text

    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric["dates"] = _numeric_values(registry.date, data.get("dates"))
    data["numeric"] = numeric

    # Context data - from aleph system, not followthemoney.
    # FIXME: Can there ever really be multiple role_ids?
    data["role_id"] = first(data.get("role_id"))
    data["mutable"] = max(ensure_list(data.get("mutable")), default=False)
    data["origin"] = ensure_list(data.get("origin"))
    # NOTE(review): .get's second argument only applies when the key is
    # absent; an existing falsy 'updated_at' is kept — confirm intended.
    created_at = data.get("created_at")
    if created_at:
        data["updated_at"] = data.get("updated_at", created_at)
    data["collection_id"] = collection.id
    # log.info("%s", pformat(data))
    entity_id = data.pop("id")
    return {
        "_id": entity_id,
        "_index": entities_write_index(data.get("schema")),
        "_source": data,
    }
Пример #29
0
def format_proxy(proxy, collection):
    """Apply final denormalisations to the index."""
    data = proxy.to_full_dict()
    data['schemata'] = list(proxy.schema.names)

    # Fingerprints: generated name fingerprints plus the raw names.
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    data['text'] = text

    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    # Context data - from aleph system, not followthemoney.
    now = iso_text(datetime.utcnow())
    data['created_at'] = min(ensure_list(data.get('created_at')), default=now)
    # BUG FIX: updated_at previously used min(), picking the *earliest*
    # update; the latest timestamp is the meaningful "updated" value
    # (cf. the max() handling in the sibling format_proxy variant).
    data['updated_at'] = max(ensure_list(data.get('updated_at')), default=now)
    # FIXME: Can there ever really be multiple role_ids?
    data['role_id'] = first(data.get('role_id'))
    data['mutable'] = max(ensure_list(data.get('mutable')), default=False)
    data['origin'] = ensure_list(data.get('origin'))
    data['collection_id'] = collection.id
    # log.info("%s", pformat(data))
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
Пример #30
0
def normalize_value(text):
    """Normalise a cell value for comparison.

    Returns the space-padded, order-preserving fingerprint encoded as
    UTF-8 bytes, or None for empty, numeric, or very short values.
    """
    if text is None:
        return

    try:
        # see if this the cell value clearly numeric:
        float(text)
        return
    except (TypeError, ValueError):
        # FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; only conversion failures matter.
        pass

    text = fingerprints.generate(text, keep_order=True)
    if text is None:
        return

    # Too short to be a meaningful comparison key.
    if len(text) <= 3:
        return

    text = u' %s ' % text
    return text.encode('utf-8')
Пример #31
0
def reconcile_op(query):
    """Reconcile operation for a single query."""
    # Build a pared-down search parser; limit defaults to 5 results.
    parser = SearchQueryParser({
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }, request.authz)

    # Fake entity used as the query document for similarity search.
    name = query.get('query', '')
    schema = query.get('type') or 'Thing'
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [fingerprints.generate(name)],
        'schemata': ensure_list(schema),
        'schema': schema
    }

    # Extra reconciliation properties are copied onto the fake entity.
    for p in query.get('properties', []):
        entity[p.get('pid')] = ensure_list(p.get('v'))

    query = SimilarEntitiesQuery(parser, entity=entity)
    matches = []
    for doc in query.search().get('hits').get('hits'):
        source = doc.get('_source')
        match = {
            'id': doc.get('_id'),
            'name': source.get('name'),
            'score': min(100, doc.get('_score') * 10),
            'uri': entity_url(doc.get('_id')),
            'match': source.get('name') == name
        }
        # Attach the freebase-style type descriptor for the schema.
        for type_ in get_freebase_types():
            if source['schema'] == type_['id']:
                match['type'] = [type_]
        matches.append(match)

    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {
        'result': matches,
        'num': len(matches)
    }
Пример #32
0
def format_proxy(proxy, collection, job_id=None):
    """Apply final denormalisations to the index."""
    # Drop context metadata before serialising the proxy.
    proxy.context = {}
    proxy = collection.ns.apply(proxy)
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['job_id'] = job_id
    # Fingerprints: generated name fingerprints plus the raw names.
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    text.append(collection.label)
    data['text'] = text

    # The last indexUpdatedAt value wins; default to the collection's.
    data['updated_at'] = collection.updated_at
    for updated_at in properties.pop('indexUpdatedAt', []):
        data['updated_at'] = updated_at

    # integer casting
    numeric = {}
    for prop, values in properties.items():
        prop = proxy.schema.get(prop)
        if prop.type in NUMERIC_TYPES:
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    # pprint(data)
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
Пример #33
0
def finalize_index(proxy, context, texts):
    """Apply final denormalisations to the index."""
    # Collect indexable property values into the free-text blob; entity
    # references, dates, URLs, countries and languages are excluded.
    for prop, value in proxy.itervalues():
        if prop.type.name in ['entity', 'date', 'url', 'country', 'language']:
            continue
        texts.append(value)

    entity = proxy.to_full_dict()
    data = merge_data(context, entity)
    data['name'] = proxy.caption
    data['text'] = index_form(texts)

    # De-duplicated name fingerprints for matching.
    names = data.get('names', [])
    fps = [fingerprints.generate(name) for name in names]
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = list(set(fps))

    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')
    data.pop('id', None)
    return clean_dict(data)
Пример #34
0
def index_operation(data):
    """Apply final denormalisations to the index."""
    data['bulk'] = data.get('bulk', False)
    # Fingerprints: generated name fingerprints plus the raw names.
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    texts = data.pop('text', [])
    texts.extend(data.get('properties', {}).pop('indexText', []))
    # NOTE(review): fps may still contain None here (only the
    # 'fingerprints' field above is filtered) — confirm the indexer
    # tolerates a None in the text list.
    texts.extend(fps)
    data['text'] = texts

    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')

    entity_id = str(data.pop('id'))
    data.pop('_index', None)
    index = entities_write_index(data.get('schema'))
    return entity_id, index, data
Пример #35
0
def _make_queries(prop, value):
    """Yield Elasticsearch queries for `value` on `prop`.

    Zero-specificity values produce nothing. Name properties produce a
    boosted fuzzy match on names.text plus an exact boosted term on the
    fingerprints field when one can be generated; other grouped types
    produce a specificity-boosted term query.
    """
    specificity = prop.type.specificity(value)
    if specificity == 0:
        return

    if prop.type == registry.name:
        boost = (1 + specificity) * 2
        match_clause = {
            'query': value,
            'operator': 'and',
            'minimum_should_match': '60%',
            'boost': boost
        }
        yield {'match': {'names.text': match_clause}}
        fp = fingerprints.generate(value)
        if fp is not None:
            yield {'term': {'fingerprints': {'value': fp, 'boost': boost}}}
        return

    if prop.type.group is None:
        return
    yield {
        'term': {
            prop.type.group: {
                'value': value,
                'boost': specificity
            }
        }
    }
Пример #36
0
 def generate_linktab(self, chunk_size=10000):
     """Rebuild this view's rows in the link table.

     Deletes existing rows for the view, then inserts one row per
     distinct key value with its fingerprint, batching inserts in
     groups of `chunk_size`.
     """
     with self.config.engine.begin() as connection:
         q = self.config.linktab.delete()
         q = q.where(self.config.linktab.c.view == self.name)
         connection.execute(q)
         chunk = []
         for i, value in enumerate(self.distinct_key()):
             fp = fingerprints.generate(value)
             # Values with no fingerprint cannot be linked; skip them.
             if fp is None:
                 continue
             # this is due to postgres' levenshtein
             fp = fp[:255]
             chunk.append({
                 'view': self.name,
                 'serial': self.serial,
                 'key': value,
                 'fingerprint': fp
             })
             if len(chunk) == chunk_size:
                 log.info('Linktab %s (%s): %s', self.name, self.key_ref, i)
                 connection.execute(self.config.linktab.insert(), chunk)
                 chunk = []
         # Flush the final partial batch.
         if len(chunk):
             connection.execute(self.config.linktab.insert(), chunk)
Пример #37
0
def make_fingerprint(text, **kwargs):
    """Generate a normalised entity name, used for the graph."""
    cleaned = string_value(text)
    return fingerprints.generate(cleaned)
Пример #38
0
def normalizeaddress(value):
    """Return the name fingerprint of an address value."""
    fp = fingerprints.generate(value)
    return fp
Пример #39
0
 def normalize_value(self, value, prop, record):
     """Return the value's fingerprint wrapped in a one-element list."""
     fp = fingerprints.generate(value)
     return [fp]
Пример #40
0
def fingerprint(value, **kwargs):
    """Fingerprint the string form of `value`."""
    text = string_value(value)
    return fingerprints.generate(text)
Пример #41
0
def addressfp(value, **kwargs):
    """Fingerprint an address value, preserving token order.

    HTML line breaks are flattened to spaces first; returns None when
    string_value yields nothing for the input.
    """
    text = string_value(value)
    if text is None:
        return
    flattened = text.replace("<br/>", " ")
    return fingerprints.generate(flattened, keep_order=True)
Пример #42
0
# coding: utf-8
"""Smoke-test fingerprints.generate() against a set of tricky names."""
import fingerprints

tests = [
    u'Foo (Bar) Corp',
    u'ähnlIIch',
    'Open S.A.R.L.',
    'Mr. Boaty McBoatface',
    u'РАДИК ІВАН ЛЬВОВИЧ',
    u'КУШНАРЬОВ ДМИТРО ВІТАЛІЙОВИЧ',
    u'Foo (Bar) CORPORATION',
    'Mr. Sherlock Holmes',
    'Siemens Aktiengesellschaft',
    'New York, New York',
    u'Foo Gesellschaft mit beschränkter Haftung',
    'Software und- Systemgesellschaft mit beschr Haftung'
]

for test in tests:
    out = fingerprints.generate(test)
    # BUG FIX: `print out` is Python 2 statement syntax and a
    # SyntaxError on Python 3; the function form works on both.
    print(out)