def index(collection_id):
    collection = get_db_collection(collection_id)
    record_audit(Audit.ACT_COLLECTION, id=collection.id)
    parser = QueryParser(request.args, request.authz)
    q = Match.group_by_collection(collection.id, authz=request.authz)
    result = DatabaseQueryResult(request, q, parser=parser)
    return MatchCollectionsSerializer.jsonify_result(result)

def matches(collection_id, other_id):
    collection = get_db_collection(collection_id)
    record_audit(Audit.ACT_COLLECTION, id=collection.id)
    other = get_db_collection(other_id)
    record_audit(Audit.ACT_COLLECTION, id=other.id)
    parser = QueryParser(request.args, request.authz)
    q = Match.find_by_collection(collection.id, other.id)
    result = DatabaseQueryResult(request, q, parser=parser)
    return MatchSerializer.jsonify_result(result)

def delete_collection_content(collection_id):
    # Deleting a collection affects many associated objects and requires
    # checks, so this is done manually and in detail here.
    q = db.session.query(Collection)
    q = q.filter(Collection.id == collection_id)
    collection = q.first()
    if collection is None:
        log.error("No collection with ID: %r", collection_id)
        return

    log.info("Deleting collection [%r]: %r", collection.id, collection.label)
    deleted_at = collection.deleted_at or datetime.utcnow()
    Entity.delete_by_collection(collection_id, deleted_at=deleted_at)
    Match.delete_by_collection(collection_id, deleted_at=deleted_at)
    Permission.delete_by_collection(collection_id, deleted_at=deleted_at)
    index.delete_collection(collection_id)
    index.delete_entities(collection_id)
    collection.delete(deleted_at=deleted_at)
    db.session.commit()

def _xref_item(item, collection_id=None):
    """Cross-reference an entity or document, given as an indexed document."""
    name = item.get('name') or item.get('title')
    query = entity_query(item, collection_id=collection_id)
    if 'match_none' in query:
        return

    query = {
        'query': query,
        'size': 10,
        '_source': ['collection_id', 'name'],
    }
    result = search_safe(index=entities_index(), body=query)
    results = result.get('hits').get('hits')
    entity_id, document_id = None, None
    if Document.SCHEMA in item.get('schemata'):
        document_id = item.get('id')
    else:
        entity_id = item.get('id')

    dq = db.session.query(Match)
    dq = dq.filter(Match.entity_id == entity_id)
    dq = dq.filter(Match.document_id == document_id)
    if collection_id is not None:
        dq = dq.filter(Match.match_collection_id == collection_id)
    dq.delete()

    for result in results:
        source = result.get('_source', {})
        log.info("Xref [%.1f]: %s <=> %s",
                 result.get('_score'), name, source.get('name'))
        obj = Match()
        obj.entity_id = entity_id
        obj.document_id = document_id
        obj.collection_id = item.get('collection_id')
        obj.match_id = result.get('_id')
        obj.match_collection_id = source.get('collection_id')
        obj.score = result.get('_score')
        db.session.add(obj)
    db.session.commit()

def xref_item(item, collection_id=None):
    """Cross-reference an entity or document, given as an indexed document."""
    name = item.get('name') or item.get('title')
    result = es.search(index=es_index, doc_type=TYPE_ENTITY, body={
        'query': entity_query(item, collection_id),
        'size': 10,
        '_source': ['collection_id', 'name'],
    })
    results = result.get('hits').get('hits')
    entity_id, document_id = None, None
    if item.get('$type') == TYPE_DOCUMENT:
        document_id = item.get('id')
    else:
        entity_id = item.get('id')

    dq = db.session.query(Match)
    dq = dq.filter(Match.entity_id == entity_id)
    dq = dq.filter(Match.document_id == document_id)
    if collection_id is not None:
        dq = dq.filter(Match.match_collection_id == collection_id)
    dq.delete()

    for result in results:
        source = result.get('_source', {})
        log.info("Xref [%.1f]: %s <=> %s",
                 result.get('_score'), name, source.get('name'))
        obj = Match()
        obj.entity_id = entity_id
        obj.document_id = document_id
        obj.collection_id = item.get('collection_id')
        obj.match_id = result.get('_id')
        obj.match_collection_id = source.get('collection_id')
        obj.score = result.get('_score')
        db.session.add(obj)
    db.session.commit()

def xref_collection(collection_id, against_collection_ids=None):
    """Cross-reference all the entities and documents in a collection."""
    matchable = [s.name for s in model if s.matchable]
    entities = iter_proxies(collection_id=collection_id, schemata=matchable)
    for entity in entities:
        proxy = model.get_proxy(entity)
        dq = db.session.query(Match)
        dq = dq.filter(Match.entity_id == proxy.id)
        dq.delete()
        matches = xref_item(proxy, collection_ids=against_collection_ids)
        for (score, other_id, other) in matches:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            obj = Match()
            obj.entity_id = proxy.id
            obj.collection_id = collection_id
            obj.match_id = other.id
            obj.match_collection_id = other_id
            obj.score = score
            db.session.add(obj)
        db.session.commit()

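# A minimal usage sketch, not part of the original module: re-run
# cross-referencing for one collection, restricted to matches from two
# other collections. The IDs are placeholders for illustration.
xref_collection(4, against_collection_ids=[7, 9])
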
def index(collection_id): """ --- get: summary: Fetch cross-reference summary description: >- Fetch cross-reference matches grouped by collection, for entities in the collection with id `collection_id` parameters: - in: path name: collection_id required: true schema: type: integer responses: '200': description: OK content: application/json: schema: type: object allOf: - $ref: '#/components/schemas/QueryResponse' properties: results: type: array items: $ref: '#/components/schemas/XrefCollection' tags: - Xref - Collection """ collection = get_db_collection(collection_id) parser = QueryParser(request.args, request.authz) q = Match.group_by_collection(collection.id, authz=request.authz) result = DatabaseQueryResult(request, q, parser=parser) return MatchCollectionsSerializer.jsonify_result(result)
def xref_item(stage, collection, entity_id=None, against_collection_ids=None):
    "Cross-reference an entity against others to generate potential matches."
    entity_ids = [entity_id]
    # This is running as a background job. In order to avoid running each
    # entity one by one, we do them up to 51 at a time (the entity given
    # directly, plus 50 queued tasks). This avoids sending redundant queries
    # to the database and elasticsearch, making cross-ref much faster.
    for task in stage.get_tasks(limit=50):
        entity_ids.append(task.payload.get('entity_id'))
    stage.mark_done(len(entity_ids) - 1)
    # log.debug("Have %d entity IDs for xref", len(entity_ids))
    for data in entities_by_ids(entity_ids, includes=['schema', 'properties']):
        proxy = model.get_proxy(data)
        # log.info("XRef: %r", proxy)
        matches = xref_query_item(proxy, collection_ids=against_collection_ids)
        for (score, other_id, other) in matches:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            obj = Match()
            obj.entity_id = proxy.id
            obj.collection_id = collection.id
            obj.match_id = other.id
            obj.match_collection_id = other_id
            obj.score = score
            db.session.add(obj)
        db.session.commit()

def index(collection_id):
    collection = get_db_collection(collection_id)
    parser = QueryParser(request.args, request.authz)
    q = Match.group_by_collection(collection.id, authz=request.authz)
    result = DatabaseQueryResult(request, q, parser=parser)
    return MatchCollectionsSerializer.jsonify_result(result)

def delete_bulk_entities(collection_id, deleted_at=None):
    deleted_at = deleted_at or datetime.utcnow()
    log.info("Deleting entities...")
    index.delete_entities(collection_id, bulk_only=True)
    Match.delete_by_collection(collection_id, deleted_at=deleted_at)

def generate_excel(collection, authz, links=True, one_sheet=False):
    limit = 1000
    output = StringIO.StringIO()
    workbook = xlsxwriter.Workbook(output)
    workbook.link_format = workbook.add_format({
        'font_color': 'blue',
        'underline': True
    })
    workbook.header_format = workbook.add_format({
        'font_color': 'white',
        'fg_color': '#982022',
        'bold': True
    })

    # Write the summary worksheet (Collection names and count)
    sheet = workbook.add_worksheet('Summary')
    sheet.set_zoom(125)
    title = 'Cross-referencing: %s' % collection.label
    sheet.merge_range(0, 0, 0, 2, title, workbook.header_format)
    sheet.write(1, 0, 'Collection', workbook.header_format)
    sheet.write(1, 1, 'Matches', workbook.header_format)
    if not one_sheet:
        sheet.write(1, 2, 'Details', workbook.header_format)
    sheet.set_column(2, 2, 20)
    sheet.freeze_panes(1, 0)

    # Query for all the collections with matches
    collections = Match.group_by_collection(collection.id, authz=authz)
    max_label = 70
    offset = 2  # Number of header rows
    for row, result in enumerate(collections, 2):
        if links:
            url = collection_url(result.collection.id)
            sheet.write_url(row, 0, url, workbook.link_format,
                            result.collection.label)
        else:
            sheet.write_string(row, 0, result.collection.label)
        max_label = max(max_label, len(result.collection.label))
        sheet.set_column(0, 0, float(max_label))
        sheet.write_number(row, 1, result.matches)
        if not one_sheet:
            matches_sheet_name = make_excel_safe_name(result.collection)
            matches_sheet = workbook.add_worksheet(matches_sheet_name)
            url = "internal:'%s'!B3" % matches_sheet_name
            sheet.write_url(row, 2, url, workbook.link_format, 'See matches')
        # In one-sheet mode, matches_sheet is not created above; lazily
        # create the single shared sheet on first use.
        try:
            matches_sheet
        except NameError:
            matches_sheet = workbook.add_worksheet("All matches")
        matches_sheet = generate_matches_sheet(workbook,
                                               matches_sheet,
                                               collection,
                                               result.collection,
                                               authz,
                                               links=links,
                                               one_sheet=one_sheet,
                                               offset=offset,
                                               limit=limit)
        if one_sheet:
            if result.matches > limit:
                offset = offset + limit
            else:
                offset = offset + result.matches

    workbook.close()
    output.seek(0)
    return output

def generate_matches_sheet(workbook, sheet, collection, match_collection,
                           authz, links=True, one_sheet=False, offset=0,
                           limit=1000):
    from aleph.views.serializers import MatchSchema

    if one_sheet:
        sheet_label = "All matches (top %s per collection)" % limit
    else:
        sheet_label = "%s (top %s)" % (match_collection.label, limit)

    sheet.set_zoom(125)
    parser = QueryParser({}, authz, limit=limit)
    q_match = Match.find_by_collection(collection.id, match_collection.id)
    matches = MatchQueryResult({}, q_match, parser=parser, schema=MatchSchema)

    # Only write the header rows once; offset grows past 2 after the first
    # batch of rows has been written in one-sheet mode.
    if offset < 3:
        sheet.write(0, 0, '', workbook.header_format)
        sheet.write(1, 0, 'Score', workbook.header_format)
        sheet.merge_range(0, 1, 0, 4, collection.label, workbook.header_format)
        sheet.write(1, 1, 'Name', workbook.header_format)
        sheet.write(1, 2, 'Type', workbook.header_format)
        sheet.write(1, 3, 'Country', workbook.header_format)
        sheet.write(1, 4, 'Source URL', workbook.header_format)
        sheet.merge_range(0, 5, 0, 8, sheet_label, workbook.header_format)
        sheet.write(1, 5, 'Name', workbook.header_format)
        sheet.write(1, 6, 'Type', workbook.header_format)
        sheet.write(1, 7, 'Country', workbook.header_format)
        if one_sheet:
            sheet.write(1, 8, 'Collection', workbook.header_format)
        sheet.freeze_panes(2, 0)

    sheet.autofilter(1, 1, 2 + len(matches.results), 8)
    widths = {}
    for row, result in enumerate(matches.results, offset):
        sheet.write_number(row, 0, int(result.score))
        name = result.entity.get('name')
        widths[1] = max(widths.get(1, 0), len(name))
        if links:
            url = entity_url(result.entity_id)
            sheet.write_url(row, 1, url, workbook.link_format, name)
        else:
            sheet.write_string(row, 1, name)
        schema = model.get(result.entity['schema'])
        sheet.write_string(row, 2, schema.label)
        countries = ', '.join(sorted(result.entity.get('countries', [])))
        sheet.write_string(row, 3, countries.upper())
        ent_props = result.entity.get('properties', {})
        if ent_props.get('sourceUrl') is not None:
            source_url = ', '.join(ent_props.get('sourceUrl'))
        else:
            source_url = ''
        sheet.write_string(row, 4, source_url)

        name = result.match.get('name')
        widths[5] = max(widths.get(5, 0), len(name))
        if links:
            url = entity_url(result.match_id)
            sheet.write_url(row, 5, url, workbook.link_format, name)
        else:
            sheet.write_string(row, 5, name)
        schema = model.get(result.match['schema'])
        sheet.write_string(row, 6, schema.label)
        countries = ', '.join(sorted(result.match.get('countries', [])))
        sheet.write_string(row, 7, countries.upper())
        if one_sheet:
            sheet.write_string(row, 8, match_collection.label)

    for idx, max_len in widths.items():
        max_len = min(70, max(7, max_len + 1))
        sheet.set_column(idx, idx, float(max_len))

    return sheet

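# A minimal usage sketch, assuming `collection` (a Collection row) and
# `authz` (an Authz object) were already obtained from aleph's database
# and authorization layers; both are placeholders here. The returned
# stream can be written straight to an .xlsx file.
excel = generate_excel(collection, authz, links=False, one_sheet=True)
with open('xref-report.xlsx', 'wb') as fh:
    fh.write(excel.read())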