def similar(id):
    entity, _ = get_entity(id, request.authz.READ)
    schema = schemata.get(entity.get('schema'))
    # Fuzzy cross-referencing only makes sense for schemata flagged as
    # such (e.g. people, companies); bail out for everything else.
    if not schema.fuzzy:
        return jsonify({'status': 'ignore', 'results': [], 'total': 0})
    state = QueryState(request.args, request.authz)
    combined = combined_entity(entity)
    return jsonify(similar_entities(combined, state))


def expand(self, keys):
    labels = {}
    for key in keys:
        try:
            labels[key] = {'label': schemata.get(key).plural}
        except TypeError:
            # schemata.get() raises TypeError for unknown schema names
            # (see _validate below); fall back to the raw key.
            labels[key] = {'label': key}
    return labels
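

# A quick sketch of the expand() behaviour above, assuming a facet class
# named SchemaFacet and a registry in which 'Person' pluralises to
# 'People'; both names are illustrative assumptions, not taken from the
# original source.
#
#   facet = SchemaFacet(...)
#   facet.expand(['Person', 'NoSuchSchema'])
#   # -> {'Person': {'label': 'People'},
#   #     'NoSuchSchema': {'label': 'NoSuchSchema'}}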


def __init__(self, query, data):
    self.query = query
    self.data = data
    self.keys = dict_list(data, 'keys', 'key')
    self.key_fingerprint = data.get('key_fingerprint', False)
    self.schema = schemata.get(data.get('schema'))
    if self.schema is None or self.schema.section != self.section:
        raise TypeError("Invalid schema: %r" % data.get('schema'))
    self.properties = []
    for name, prop in data.get('properties', {}).items():
        schema = self.schema.get(name)
        self.properties.append(MapperProperty(self, name, prop, schema))
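

# For orientation: a sketch of the mapping data this constructor consumes,
# inferred only from the keys it reads ('schema', 'keys'/'key',
# 'key_fingerprint', 'properties'). The subclass name and the per-property
# 'column' layout are assumptions for illustration, not taken from the
# original.
#
#   data = {
#       'schema': 'Person',
#       'keys': ['passport_number'],
#       'key_fingerprint': True,
#       'properties': {
#           'name': {'column': 'full_name'},
#           'nationality': {'column': 'country'},
#       },
#   }
#   mapper = EntityMapper(query, data)  # hypothetical subclass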


def schema(self):
    return schemata.get(self.type)


def entity_query(sample, collection_id=None, query=None):
    """Given a document or entity in indexed form, build a query that
    will find similar entities based on a variety of criteria."""

    # Do not attempt to find xrefs for entity types such as land,
    # buildings, etc.
    schema = schemata.get(sample.get('schema'))
    if sample.get('schema') != Document.SCHEMA and not schema.fuzzy:
        return {'match_none': {}}

    if query is None:
        query = {
            'bool': {
                'should': [],
                'filter': [],
                'must': [],
                'must_not': []
            }
        }
    required = []

    if collection_id is not None:
        query['bool']['must'].append({
            'term': {'collection_id': collection_id}
        })

    for fp in sample.get('fingerprints', []):
        required.append({
            'fuzzy': {
                'fingerprints': {
                    'value': fp,
                    'fuzziness': 2,
                    'boost': 3.0
                }
            }
        })

    # TODO: put names in FIELDS_XREF up there ^^^
    # for value in sample.get('names', []):
    #     required.append({
    #         'match': {
    #             'names': {
    #                 'query': value,
    #                 'operator': 'and',
    #                 'cutoff_frequency': 0.01,
    #             }
    #         }
    #     })

    for index in ['emails', 'phones']:
        for value in sample.get(index, []):
            required.append({'term': {index: {'value': value, 'boost': 2}}})

    if not required:
        # e.g. a document from which no features have been extracted.
        return {'match_none': {}}

    # make it mandatory to have either a fingerprint or name match
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })

    # boost by "contributing criteria"
    for field in ['dates', 'countries', 'schemata', 'identifiers']:
        for val in sample.get(field, []):
            query['bool']['should'].append({'term': {field: val}})

    for val in sample.get('addresses', []):
        query['bool']['should'].append({
            'common': {'addresses': {'query': val}}
        })

    # exclude the sample itself, and filter types which cannot be
    # resolved via fuzzy matching.
    query['bool']['must_not'].extend([
        {'ids': {'values': [sample.get('id')]}},
        {'terms': {'schema': [s.name for s in schemata if not s.fuzzy]}}
    ])
    return query
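

# A minimal sketch of executing the query built by entity_query() against
# Elasticsearch, assuming an elasticsearch-py client and an entities index
# named 'aleph-entities'; the client, index name, and sample record are
# all illustrative assumptions, not part of the original code.
if __name__ == '__main__':
    from elasticsearch import Elasticsearch

    es = Elasticsearch()
    sample = {
        'id': 'deadbeef',
        'schema': 'Person',
        'fingerprints': ['smith, john'],
        'countries': ['gb'],
    }
    body = {'query': entity_query(sample), 'size': 10}
    result = es.search(index='aleph-entities', body=body)
    for hit in result['hits']['hits']:
        print(hit['_score'], hit['_source'].get('name'))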


def update(self, result, key):
    # The passed-in key is superseded by the result's own id.
    key = result.get('id')
    try:
        result['label'] = schemata.get(key).plural
    except TypeError:
        result['label'] = key


def _validate(self, value):
    try:
        schemata.get(value)
    except TypeError:
        raise ValidationError('Invalid schema name.')


def generate_matches_sheet(workbook, sheet, collection, match_collection,
                           authz, links=True, one_sheet=False, offset=0,
                           limit=1000):
    from aleph.views.serializers import MatchSchema

    if one_sheet:
        sheet_label = "All matches (top %s per collection)" % limit
    else:
        sheet_label = "%s (top %s)" % (match_collection.label, limit)

    sheet.set_zoom(125)
    parser = QueryParser({}, authz, limit=limit)
    q_match = Match.find_by_collection(collection.id, match_collection.id)
    matches = MatchQueryResult({}, q_match, parser=parser, schema=MatchSchema)

    # Write the two header rows only once, i.e. before any data rows
    # have been emitted.
    if offset < 3:
        sheet.write(0, 0, '', workbook.header_format)
        sheet.write(1, 0, 'Score', workbook.header_format)
        sheet.merge_range(0, 1, 0, 4, collection.label, workbook.header_format)
        sheet.write(1, 1, 'Name', workbook.header_format)
        sheet.write(1, 2, 'Type', workbook.header_format)
        sheet.write(1, 3, 'Country', workbook.header_format)
        sheet.write(1, 4, 'Source URL', workbook.header_format)
        sheet.merge_range(0, 5, 0, 8, sheet_label, workbook.header_format)
        sheet.write(1, 5, 'Name', workbook.header_format)
        sheet.write(1, 6, 'Type', workbook.header_format)
        sheet.write(1, 7, 'Country', workbook.header_format)
        if one_sheet:
            sheet.write(1, 8, 'Collection', workbook.header_format)
        sheet.freeze_panes(2, 0)

    sheet.autofilter(1, 1, 2 + len(matches.results), 8)
    widths = {}
    for row, result in enumerate(matches.results, offset):
        sheet.write_number(row, 0, int(result.score))

        # Left-hand side: the entity from the exporting collection.
        name = result.entity.get('name')
        widths[1] = max(widths.get(1, 0), len(name))
        if links:
            url = entity_url(result.entity_id)
            sheet.write_url(row, 1, url, workbook.link_format, name)
        else:
            sheet.write_string(row, 1, name)
        schema = schemata.get(result.entity['schema'])
        sheet.write_string(row, 2, schema.label)
        countries = ', '.join(sorted(result.entity.get('countries', [])))
        sheet.write_string(row, 3, countries.upper())
        ent_props = result.entity.get('properties', {})
        if ent_props.get('sourceUrl') is not None:
            source_url = ', '.join(ent_props.get('sourceUrl'))
        else:
            source_url = ''
        sheet.write_string(row, 4, source_url)

        # Right-hand side: the matched entity.
        name = result.match.get('name')
        widths[5] = max(widths.get(5, 0), len(name))
        if links:
            url = entity_url(result.match_id)
            sheet.write_url(row, 5, url, workbook.link_format, name)
        else:
            sheet.write_string(row, 5, name)
        schema = schemata.get(result.match['schema'])
        sheet.write_string(row, 6, schema.label)
        countries = ', '.join(sorted(result.match.get('countries', [])))
        sheet.write_string(row, 7, countries.upper())
        if one_sheet:
            sheet.write_string(row, 8, match_collection.label)

    for idx, max_len in widths.items():
        max_len = min(70, max(7, max_len + 1))
        sheet.set_column(idx, idx, float(max_len))
    return sheet
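

# A minimal sketch of driving generate_matches_sheet(), assuming a thin
# wrapper around xlsxwriter that carries the header_format / link_format
# attributes the function expects; the wrapper class and the collection,
# match_collection and authz objects are illustrative assumptions, not
# part of the original source.
if __name__ == '__main__':
    import xlsxwriter

    class ExcelWorkbook(object):
        def __init__(self, path):
            self.book = xlsxwriter.Workbook(path)
            self.header_format = self.book.add_format({'bold': True})
            self.link_format = self.book.add_format({'font_color': 'blue',
                                                     'underline': True})

    workbook = ExcelWorkbook('matches.xlsx')
    sheet = workbook.book.add_worksheet('Matches')
    # collection, match_collection and authz would come from the
    # surrounding application context.
    generate_matches_sheet(workbook, sheet, collection, match_collection,
                           authz, links=True)
    workbook.book.close()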