def aleph_folder(context, data):
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    foreign_id = data.get("foreign_id")
    if foreign_id is None:
        context.log.warning("No folder foreign ID!")
        return
    meta = clean_dict(_create_meta_object(context, data))
    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Make folder: %s", label)
    for try_number in range(api.retries):
        rate = settings.MEMORIOUS_RATE_LIMIT
        rate_limit = get_rate_limit("aleph", limit=rate)
        rate_limit.comply()
        try:
            res = api.ingest_upload(collection_id, metadata=meta, sync=True)
            document_id = res.get("id")
            context.log.info("Aleph folder entity ID: %s", document_id)
            # Save the document id in cache for future use
            context.set_tag(make_key(collection_id, foreign_id), document_id)
            data["aleph_folder_id"] = document_id
            data["aleph_collection_id"] = collection_id
            context.emit(data=data, optional=True)
            return
        except AlephException as ae:
            # Give up after the final retry, or immediately on
            # non-transient errors.
            if try_number >= api.retries - 1 or not ae.transient:
                context.emit_warning("Error: %s" % ae)
                return
            backoff(ae, try_number)
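# Downstream, one of the aleph_emit() variants below picks up the
# "aleph_folder_id" emitted here and uses it to parent uploaded
# documents to this folder:
#   if data.get("aleph_folder_id"):
#       meta["parent"] = {"id": data.get("aleph_folder_id")}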
def submit_result(context, result, data):
    if result.file_path is None:
        context.log.info("Cannot ingest non-existent response: %s", result)
        return
    session = requests.Session()
    session.headers['Authorization'] = 'apikey %s' % settings.ALEPH_API_KEY
    collection_id = get_collection_id(context, session)
    meta = {
        'crawler': context.crawler.name,
        'source_url': data.get('source_url', result.url),
        'file_name': data.get('file_name', result.file_name),
        'title': data.get('title'),
        'author': data.get('author'),
        'foreign_id': data.get('foreign_id', result.request_id),
        'mime_type': data.get('mime_type', result.content_type),
        'countries': data.get('countries'),
        'languages': data.get('languages'),
        'headers': dict(result.headers or {})
    }
    meta = clean_dict(meta)
    url = make_url('collections/%s/ingest' % collection_id)
    title = meta.get('title', meta.get('file_name', meta.get('source_url')))
    context.log.info("Sending '%s' to %s", title, url)
    # Use a context manager so the file handle is closed after the upload.
    with open(result.file_path, 'rb') as fh:
        res = session.post(url, data={'meta': json.dumps(meta)},
                           files={'file': fh})
    if not res.ok:
        context.emit_warning("Could not ingest '%s': %r" % (title, res.text))
    else:
        document = res.json().get('documents')[0]
        context.log.info("Ingesting, document ID: %s", document['id'])
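# A minimal, hypothetical stand-in for the clean_dict() helper used
# throughout these snippets (semantics inferred from usage: drop keys
# whose values are empty), to show why optional metadata fields never
# reach the Aleph API as explicit nulls:
def clean_dict_sketch(data):
    # keep only entries that carry an actual value
    return {k: v for k, v in data.items() if v is not None and v != {}}

# clean_dict_sketch({'title': None, 'file_name': 'report.pdf'})
# -> {'file_name': 'report.pdf'}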
def merge_docs(old, new):
    """Extend the values of the new doc with extra values from the old."""
    old = clean_dict(old)
    new = dict(clean_dict(new))
    for k, v in old.items():
        if k == 'created_at':
            new[k] = v
        elif k in new:
            if is_sequence(v):
                v = new[k] + v
                new[k] = unique_list(v)
            elif isinstance(v, dict):
                new[k] = merge_docs(v, new[k])
        else:
            new[k] = v
    return new
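# Worked example for merge_docs(), assuming clean_dict drops None
# values, is_sequence detects lists, and unique_list de-duplicates
# while keeping order (helper semantics inferred from usage above):
old = {'created_at': '2018-01-01', 'languages': ['en'], 'title': 'Annual Report'}
new = {'created_at': '2018-06-01', 'languages': ['de'], 'title': None}
# merge_docs(old, new) would return:
# {'created_at': '2018-01-01',      # created_at is always taken from old
#  'languages': ['de', 'en'],       # sequences are concatenated, de-duplicated
#  'title': 'Annual Report'}        # keys missing from new fall back to old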
def aleph_emit(context, data):
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get('content_hash')
    source_url = data.get('source_url', data.get('url'))
    foreign_id = data.get('foreign_id', data.get('request_id', source_url))
    if context.skip_incremental(collection_id, foreign_id, content_hash):
        context.log.info("Skip aleph upload: %s", foreign_id)
        return
    meta = {
        'crawler': context.crawler.name,
        'foreign_id': foreign_id,
        'source_url': source_url,
        'title': data.get('title'),
        'author': data.get('author'),
        'file_name': data.get('file_name'),
        'retrieved_at': data.get('retrieved_at'),
        'modified_at': data.get('modified_at'),
        'published_at': data.get('published_at'),
        'headers': data.get('headers', {})
    }
    languages = context.params.get('languages')
    meta['languages'] = data.get('languages', languages)
    countries = context.params.get('countries')
    meta['countries'] = data.get('countries', countries)
    mime_type = context.params.get('mime_type')
    meta['mime_type'] = data.get('mime_type', mime_type)
    if data.get('parent_foreign_id'):
        meta['parent'] = {'foreign_id': data.get('parent_foreign_id')}
    meta = clean_dict(meta)
    # pprint(meta)
    label = meta.get('file_name', meta.get('source_url'))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()
        for try_number in range(api.retries):
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get('id')
                context.log.info("Aleph document entity ID: %s", document_id)
                data['aleph_id'] = document_id
                data['aleph_document'] = meta
                data['aleph_collection_id'] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as ae:
                # Give up after the final retry, or immediately on
                # non-transient errors.
                if try_number >= api.retries - 1 or not ae.transient:
                    context.emit_warning("Error: %s" % ae)
                    return
                backoff(ae, try_number)
def index_single(obj, data, texts):
    """Indexing aspects common to entities and documents."""
    data['bulk'] = False
    data['roles'] = obj.collection.roles
    data['collection_id'] = obj.collection.id
    data['created_at'] = obj.created_at
    data['updated_at'] = obj.updated_at
    data = finalize_index(data, obj.model, texts)
    data = clean_dict(data)
    return index_safe(entity_index(), obj.id, data)
def index_single(obj, data, texts):
    """Indexing aspects common to entities and documents."""
    data['bulk'] = False
    data['roles'] = obj.collection.roles
    data['collection_id'] = obj.collection.id
    data['created_at'] = obj.created_at
    data['updated_at'] = obj.updated_at
    data = finalize_index(data, obj.model, texts)
    data = clean_dict(data)
    es.index(index=entity_index(), doc_type='doc', id=str(obj.id), body=data)
    data['id'] = str(obj.id)
    return data
def aleph_emit_document(context, data):
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get("content_hash")
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    # Fetch document id from cache
    document = context.get_tag(
        make_key(collection_id, foreign_id, content_hash))
    if document:
        context.log.info("Skip aleph upload: %s", foreign_id)
        data["aleph_id"] = document["id"]
        data["aleph_document"] = document
        data["aleph_collection_id"] = collection_id
        context.emit(data=data, optional=True)
        return
    meta = clean_dict(_create_meta_object(context, data))
    meta.update(_create_document_metadata(context, data))
    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()
        for try_number in range(api.retries):
            rate = settings.MEMORIOUS_RATE_LIMIT
            rate_limit = get_rate_limit("aleph", limit=rate)
            rate_limit.comply()
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get("id")
                context.log.info("Aleph document ID: %s", document_id)
                # Save the document id in cache for future use
                meta["id"] = document_id
                context.set_tag(
                    make_key(collection_id, foreign_id, content_hash), meta)
                data["aleph_id"] = document_id
                data["aleph_document"] = meta
                data["aleph_collection_id"] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as exc:
                # Give up after the final retry, or immediately on
                # non-transient errors.
                if try_number >= api.retries - 1 or not exc.transient:
                    context.emit_warning("Error: %s" % exc)
                    return
                backoff(exc, try_number)
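# Hedged illustration of the cache key built above, assuming the
# make_key() helper from servicelayer (an assumption about this
# codebase's imports), which joins its arguments with ':' and skips
# None values:
from servicelayer.cache import make_key

key = make_key(42, 'https://example.org/doc.pdf', 'f00dfeed')
# -> '42:https://example.org/doc.pdf:f00dfeed'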
def finalize_index(proxy, context, texts):
    """Apply final denormalisations to the index."""
    for prop, value in proxy.itervalues():
        if prop.type.name in ['entity', 'date', 'url', 'country', 'language']:
            continue
        texts.append(value)
    entity = proxy.to_full_dict()
    data = merge_data(context, entity)
    data['name'] = proxy.caption
    data['text'] = index_form(texts)
    names = data.get('names', [])
    fps = [fingerprints.generate(name) for name in names]
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = list(set(fps))
    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')
    data.pop('id', None)
    return clean_dict(data)
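# Hedged sketch of the name-fingerprinting step above, assuming the
# `fingerprints` package, whose generate() normalises a name string and
# returns None for input it cannot use:
import fingerprints

names = ['Siemens AG', 'Siemens Aktiengesellschaft', '']
fps = [fingerprints.generate(name) for name in names]
fps = [fp for fp in fps if fp is not None]  # drop unusable names
# set(fps) collapses spelling variants of the same name, e.g. both
# company forms above may normalise to the same fingerprint.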
def aleph_emit(context, data):
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get("content_hash")
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    # Fetch document id from cache
    document_id = context.get_tag(
        make_key(collection_id, foreign_id, content_hash))
    if document_id:
        context.log.info("Skip aleph upload: %s", foreign_id)
        data["aleph_id"] = document_id
        context.emit(data=data, optional=True)
        return
    meta = {
        "crawler": context.crawler.name,
        "foreign_id": foreign_id,
        "source_url": source_url,
        "title": data.get("title"),
        "author": data.get("author"),
        "file_name": data.get("file_name"),
        "retrieved_at": data.get("retrieved_at"),
        "modified_at": data.get("modified_at"),
        "published_at": data.get("published_at"),
        "headers": data.get("headers", {}),
    }
    languages = context.params.get("languages")
    meta["languages"] = data.get("languages", languages)
    countries = context.params.get("countries")
    meta["countries"] = data.get("countries", countries)
    mime_type = context.params.get("mime_type")
    meta["mime_type"] = data.get("mime_type", mime_type)
    if data.get("aleph_folder_id"):
        meta["parent"] = {"id": data.get("aleph_folder_id")}
    meta = clean_dict(meta)
    # pprint(meta)
    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()
        for try_number in range(api.retries):
            rate = settings.MEMORIOUS_RATE_LIMIT
            rate_limit = get_rate_limit("aleph", limit=rate)
            rate_limit.comply()
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get("id")
                context.log.info("Aleph document entity ID: %s", document_id)
                # Save the document id in cache for future use
                context.set_tag(
                    make_key(collection_id, foreign_id, content_hash), document_id
                )
                data["aleph_id"] = document_id
                data["aleph_document"] = meta
                data["aleph_collection_id"] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as exc:
                # Give up after the final retry, or immediately on
                # non-transient errors.
                if try_number >= api.retries - 1 or not exc.transient:
                    context.emit_warning("Error: %s" % exc)
                    return
                backoff(exc, try_number)
def aleph_emit(context, data):
    if not settings.ALEPH_HOST:
        context.log.warning("No $MEMORIOUS_ALEPH_HOST, skipping upload...")
        return
    if not settings.ALEPH_API_KEY:
        context.log.warning("No $MEMORIOUS_ALEPH_API_KEY, skipping upload...")
        return
    session_id = 'memorious:%s' % context.crawler.name
    api = AlephAPI(settings.ALEPH_HOST, settings.ALEPH_API_KEY,
                   session_id=session_id)
    collection_id = get_collection_id(context, api)
    if collection_id is None:
        context.log.warning("Cannot get aleph collection.")
        return
    content_hash = data.get('content_hash')
    source_url = data.get('source_url', data.get('url'))
    foreign_id = data.get('foreign_id', data.get('request_id', source_url))
    if context.skip_incremental(collection_id, foreign_id, content_hash):
        context.log.info("Skip aleph upload: %s", foreign_id)
        return
    meta = {
        'crawler': context.crawler.name,
        'foreign_id': foreign_id,
        'source_url': source_url,
        'title': data.get('title'),
        'author': data.get('author'),
        'file_name': data.get('file_name'),
        'retrieved_at': data.get('retrieved_at'),
        'modified_at': data.get('modified_at'),
        'published_at': data.get('published_at'),
        'headers': data.get('headers', {})
    }
    languages = context.params.get('languages')
    meta['languages'] = data.get('languages', languages)
    countries = context.params.get('countries')
    meta['countries'] = data.get('countries', countries)
    mime_type = context.params.get('mime_type')
    meta['mime_type'] = data.get('mime_type', mime_type)
    if data.get('parent_foreign_id'):
        meta['parent'] = {'foreign_id': data.get('parent_foreign_id')}
    meta = clean_dict(meta)
    # pprint(meta)
    label = meta.get('file_name', meta.get('source_url'))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()
        res = api.ingest_upload(collection_id, file_path, meta)
        if res.get('status') == 'ok':
            document = res.get('documents')[0]
            context.log.info("Document ID: %s", document['id'])
        else:
            context.emit_warning("Error: %r" % res)
def index_document(document):
    if document.status == Document.STATUS_PENDING:
        return
    log.info("Index document [%s]: %s", document.id, document.title)
    schema = model.get(Document.SCHEMA)
    data = {
        'schema': schema.name,
        'schemata': schema.names,
        'collection_id': document.collection_id,
        'roles': document.collection.roles,
        'type': document.type,
        'status': document.status,
        'content_hash': document.content_hash,
        'foreign_id': document.foreign_id,
        'error_message': document.error_message,
        'uploader_id': document.uploader_id,
        'created_at': document.created_at,
        'updated_at': document.updated_at,
        'title': document.title,
        'name': document.title,
        'summary': document.summary,
        'author': document.author,
        'file_size': document.file_size,
        'file_name': document.file_title,
        'source_url': document.source_url,
        'languages': document.languages,
        'countries': document.countries,
        'keywords': document.keywords,
        'dates': document.dates,
        'extension': document.extension,
        'encoding': document.encoding,
        'mime_type': document.mime_type,
        'pdf_version': document.pdf_version,
        'columns': document.columns,
        'children': document.children.count(),
        'text': index_form(document.texts)
    }
    if document.parent_id is not None:
        data['parent'] = {
            'id': document.parent_id,
            'type': document.parent.type,
            'title': document.parent.title,
        }
    q = db.session.query(DocumentTag)
    q = q.filter(DocumentTag.document_id == document.id)
    for tag in q.yield_per(5000):
        field = TAG_FIELDS.get(tag.type)
        if field is None:
            log.warning("Cannot index document tag: %r", tag)
            continue
        if field not in data:
            data[field] = []
        data[field].append(tag.text)
    index_names(data)
    data = clean_dict(data)
    # pprint(data)
    es.index(index=entity_index(), doc_type=entity_type(),
             body=data, id=document.id)
    data['id'] = document.id
    return data