def aleph_folder(context, data):
    """Create (or reuse) a folder entity in Aleph for crawled data.

    Expects ``data["foreign_id"]`` to identify the folder. On success the
    data is emitted onward with ``aleph_folder_id`` and
    ``aleph_collection_id`` set, and the document id is cached via
    ``context.set_tag`` for later lookups.
    """
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    foreign_id = data.get("foreign_id")
    if foreign_id is None:
        context.log.warning("No folder foreign ID!")
        return
    meta = clean_dict(_create_meta_object(context, data))
    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Make folder: %s", label)
    for try_number in range(api.retries):
        rate = settings.MEMORIOUS_RATE_LIMIT
        rate_limit = get_rate_limit("aleph", limit=rate)
        rate_limit.comply()
        try:
            res = api.ingest_upload(collection_id, metadata=meta, sync=True)
            document_id = res.get("id")
            context.log.info("Aleph folder entity ID: %s", document_id)
            # Save the document id in cache for future use
            context.set_tag(make_key(collection_id, foreign_id), document_id)
            data["aleph_folder_id"] = document_id
            data["aleph_collection_id"] = collection_id
            context.emit(data=data, optional=True)
            return
        except AlephException as ae:
            # BUG FIX: try_number is 0-based and capped at api.retries - 1,
            # so the old check (try_number > api.retries) could never fire
            # and exhausted transient failures returned silently. Warn on
            # the final attempt instead.
            if try_number >= api.retries - 1 or not ae.transient:
                context.emit_warning("Error: %s" % ae)
                return
            backoff(ae, try_number)
def ingest_upload(self, collection_id, file_path=None, metadata=None):
    """Upload a document into a collection, or create an empty folder.

    params
    ------
    collection_id: id of the collection to upload to
    file_path: path of the file to upload. None while creating folders
    metadata: dict containing metadata for the file or folders. In case
        of files, metadata contains foreign_id of the parent. Metadata
        for a directory contains foreign_id for itself as well as its
        parent and the name of the directory.
    """
    url = self._make_url("collections/{0}/ingest".format(collection_id))
    # No file (or a directory path) means folder creation: a single POST
    # carrying only the metadata blob, with no retry loop.
    if not file_path or file_path.is_dir():
        return self._request("POST", url, data={"meta": json.dumps(metadata)})
    attempt = 0
    while True:
        attempt += 1
        try:
            with file_path.open('rb') as fh:
                # Stream through a multipart encoder so very large files
                # never have to be held in memory at once.
                encoder = MultipartEncoder(fields={
                    'meta': json.dumps(metadata),
                    'file': (file_path.name, fh, MIME),
                })
                headers = {'Content-Type': encoder.content_type}
                return self._request("POST", url, data=encoder,
                                     headers=headers)
        except AlephException as ae:
            # Only transient errors within the retry budget are retried;
            # everything else propagates to the caller.
            if not ae.transient or attempt > self.retries:
                raise ae
            backoff(ae, attempt)
def aleph_emit_entity(context, data):
    """Write a schema/properties entity to Aleph and cache the result.

    Uses ``context.get_tag``/``set_tag`` keyed on (collection, foreign_id,
    entity_id) to skip re-creation on subsequent runs. Emits the data
    onward with ``aleph_id``, ``aleph_collection_id`` and ``aleph_entity``.
    """
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    entity_id = data.get("entity_id", data.get("id"))
    if not entity_id:
        # BUG FIX: corrected typo in the warning message ("definied").
        context.emit_warning(
            "Error: Can not create entity. `id` is not defined")
        return
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    # Fetch entity from cache
    cached_entity = context.get_tag(
        make_key(collection_id, foreign_id, entity_id))
    if cached_entity and isinstance(cached_entity, dict):
        context.log.info("Skip entity creation: {}".format(foreign_id))
        data["aleph_id"] = cached_entity["id"]
        data["aleph_collection_id"] = collection_id
        data["aleph_entity"] = cached_entity
        context.emit(data=data, optional=True)
        return
    for try_number in range(api.retries):
        rate = settings.MEMORIOUS_RATE_LIMIT
        rate_limit = get_rate_limit("aleph", limit=rate)
        rate_limit.comply()
        try:
            res = api.write_entity(
                collection_id,
                {
                    "schema": data.get("schema"),
                    "properties": data.get("properties"),
                },
                entity_id,
            )
            entity = {
                "id": res.get("id"),
                "schema": res.get("schema"),
                "properties": res.get("properties"),
            }
            context.log.info("Aleph entity ID: %s", entity["id"])
            # Save the entity in cache for future use
            context.set_tag(
                make_key(collection_id, foreign_id, entity_id), entity)
            data["aleph_id"] = entity["id"]
            data["aleph_collection_id"] = collection_id
            data["aleph_entity"] = entity
            context.emit(data=data, optional=True)
            return
        except AlephException as exc:
            # BUG FIX: try_number is 0-based and capped at api.retries - 1,
            # so the old check (try_number > api.retries) never fired and
            # exhausted transient errors returned without a warning.
            if try_number >= api.retries - 1 or not exc.transient:
                context.emit_warning("Error: %s" % exc)
                return
            backoff(exc, try_number)
def aleph_emit(context, data):
    """Upload a crawled file to Aleph (legacy variant, no id cache).

    Skips already-uploaded content via ``context.skip_incremental``,
    assembles document metadata from the crawl data and params, then
    uploads with retry/backoff. Emits the data onward with ``aleph_id``,
    ``aleph_document`` and ``aleph_collection_id`` set.
    """
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get('content_hash')
    source_url = data.get('source_url', data.get('url'))
    foreign_id = data.get('foreign_id', data.get('request_id', source_url))
    if context.skip_incremental(collection_id, foreign_id, content_hash):
        context.log.info("Skip aleph upload: %s", foreign_id)
        return
    meta = {
        'crawler': context.crawler.name,
        'foreign_id': foreign_id,
        'source_url': source_url,
        'title': data.get('title'),
        'author': data.get('author'),
        'file_name': data.get('file_name'),
        'retrieved_at': data.get('retrieved_at'),
        'modified_at': data.get('modified_at'),
        'published_at': data.get('published_at'),
        'headers': data.get('headers', {})
    }
    # Crawl-level params act as fallbacks for per-document values.
    languages = context.params.get('languages')
    meta['languages'] = data.get('languages', languages)
    countries = context.params.get('countries')
    meta['countries'] = data.get('countries', countries)
    mime_type = context.params.get('mime_type')
    meta['mime_type'] = data.get('mime_type', mime_type)
    if data.get('parent_foreign_id'):
        meta['parent'] = {'foreign_id': data.get('parent_foreign_id')}
    meta = clean_dict(meta)
    label = meta.get('file_name', meta.get('source_url'))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()
        for try_number in range(api.retries):
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get('id')
                context.log.info("Aleph document entity ID: %s", document_id)
                data['aleph_id'] = document_id
                data['aleph_document'] = meta
                data['aleph_collection_id'] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as ae:
                # BUG FIX: try_number never exceeds api.retries - 1, so the
                # old check (try_number > api.retries) could not fire and
                # exhausted transient errors were dropped silently.
                if try_number >= api.retries - 1 or not ae.transient:
                    context.emit_warning("Error: %s" % ae)
                    return
                backoff(ae, try_number)
def aleph_emit_document(context, data):
    """Upload a crawled document to Aleph, caching metadata for reuse.

    Checks the (collection, foreign_id, content_hash) tag cache first and
    short-circuits with the cached document. Otherwise builds metadata,
    uploads with rate limiting and retry/backoff, then caches the enriched
    metadata and emits the data onward.
    """
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get("content_hash")
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    # Fetch document id from cache
    document = context.get_tag(
        make_key(collection_id, foreign_id, content_hash))
    if document:
        # BUG FIX: this log line was duplicated in the original.
        context.log.info("Skip aleph upload: %s", foreign_id)
        data["aleph_id"] = document["id"]
        data["aleph_document"] = document
        data["aleph_collection_id"] = collection_id
        context.emit(data=data, optional=True)
        return
    meta = clean_dict(_create_meta_object(context, data))
    meta.update(_create_document_metadata(context, data))
    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()
        for try_number in range(api.retries):
            rate = settings.MEMORIOUS_RATE_LIMIT
            rate_limit = get_rate_limit("aleph", limit=rate)
            rate_limit.comply()
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get("id")
                context.log.info("Aleph document ID: %s", document_id)
                # Save the document id in cache for future use
                meta["id"] = document_id
                context.set_tag(
                    make_key(collection_id, foreign_id, content_hash), meta)
                data["aleph_id"] = document_id
                data["aleph_document"] = meta
                data["aleph_collection_id"] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as exc:
                # BUG FIX: try_number never exceeds api.retries - 1, so the
                # old check (try_number > api.retries) never fired; warn on
                # the final attempt so exhausted retries are not silent.
                if try_number >= api.retries - 1 or not exc.transient:
                    context.emit_warning("Error: %s" % exc)
                    return
                backoff(exc, try_number)
def _upload(q: Queue, api: AlephAPI, collection_id: str, root_path: Path):
    """Worker loop: drain queued paths and crawl each one into Aleph.

    Each queue item is a (path, parent_id, try_number) tuple. Transient
    Aleph errors are re-queued with an incremented try counter while under
    ``api.retries``; anything else is logged and dropped. ``task_done`` is
    signalled for every item taken from the queue.
    """
    while not q.empty():
        path, parent_id, try_number = q.get()
        try:
            _crawl_path(q, api, collection_id, parent_id, root_path, path)
        except AlephException as exc:
            retryable = exc.transient and try_number < api.retries
            if retryable:
                backoff(exc, try_number)
                q.put((path, parent_id, try_number + 1))
            else:
                log.error(exc.message)
        except Exception:
            log.exception('Failed [%s]: %s', collection_id, path)
        q.task_done()
def execute(self):
    """Process the crawl queue until it is empty.

    Items are (path, parent_id, try_number) tuples. Transient Aleph
    errors within the retry budget are re-queued with a bumped counter;
    other failures are logged. ``task_done`` always fires, even when a
    handler raises.
    """
    while not self.queue.empty():
        path, parent_id, try_number = self.queue.get()
        try:
            self.crawl_path(parent_id, path)
        except AlephException as exc:
            can_retry = exc.transient and try_number < self.api.retries
            if not can_retry:
                log.error(exc.message)
            else:
                backoff(exc, try_number)
                self.queue.put((path, parent_id, try_number + 1))
        except Exception:
            log.exception('Failed [%s]: %s', self.collection_id, path)
        finally:
            self.queue.task_done()
def _bulk_chunk(self, collection_id, chunk, force=False, unsafe=False):
    """POST one chunk of entities to the collection's _bulk endpoint.

    Retries transient failures up to ``self.retries``. On a permanent (or
    exhausted) failure the exception is raised, unless ``force`` is set,
    in which case it is only logged and the chunk is skipped.
    """
    attempt = 0
    while True:
        attempt += 1
        url = self._make_url("collections/{0}/_bulk".format(collection_id))
        params = {'unsafe': unsafe}
        try:
            response = self.session.post(url, json=chunk, params=params)
            response.raise_for_status()
            return
        except RequestException as exc:
            ae = AlephException(exc)
            if not ae.transient or attempt > self.retries:
                if not force:
                    raise ae
                # force=True: swallow the failure and keep going.
                log.error(ae)
                return
            backoff(ae, attempt)
def backoff_ingest_upload(self, path: Path, parent_id: str,
                          foreign_id: str) -> Optional[str]:
    """Upload a path with retry/backoff; return the new id or None.

    Transient Aleph errors are retried up to ``self.api.retries`` times
    with exponential backoff. Permanent failures and unexpected
    exceptions are logged and yield None.
    """
    attempt = 1
    while True:
        try:
            return self.ingest_upload(Path(path), parent_id, foreign_id)
        except AlephException as err:
            if not (err.transient and attempt < self.api.retries):
                log.error(err.message)
                return None
            attempt += 1
            backoff(err, attempt)
        except Exception:
            log.exception("Failed [%s]: %s", self.collection_id, path)
            return None
def ingest_upload(
    self,
    collection_id: str,
    file_path: Optional[Path] = None,
    metadata: Optional[Dict] = None,
    sync: bool = False,
    index: bool = True,
) -> Dict:
    """Upload a document into a collection, or create an empty folder.

    params
    ------
    collection_id: id of the collection to upload to
    file_path: path of the file to upload. None while creating folders
    metadata: dict containing metadata for the file or folders. In case
        of files, metadata contains foreign_id of the parent. Metadata
        for a directory contains foreign_id for itself as well as its
        parent and the name of the directory.
    sync: wait for ingest to complete before returning
    index: whether the uploaded document should be indexed
    """
    url = self._make_url(
        "collections/{0}/ingest".format(collection_id),
        params={"sync": sync, "index": index},
    )
    # A missing file (or a directory path) means folder creation: one
    # POST of the metadata blob, no upload retry loop.
    if not file_path or file_path.is_dir():
        return self._request("POST", url, data={"meta": json.dumps(metadata)})
    attempt = 0
    while True:
        attempt += 1
        try:
            with file_path.open("rb") as fh:
                # Multipart streaming keeps memory flat for huge files.
                encoder = MultipartEncoder(fields={
                    "meta": json.dumps(metadata),
                    "file": (file_path.name, fh, MIME),
                })
                headers = {"Content-Type": encoder.content_type}
                return self._request("POST", url, data=encoder,
                                     headers=headers)
        except AlephException as ae:
            if not ae.transient or attempt > self.retries:
                raise ae
            backoff(ae, attempt)
    return {}  # unreachable; kept to satisfy the declared return type
def _bulk_chunk(
    self,
    collection_id: str,
    chunk: List,
    entityset_id: Optional[str] = None,
    force: bool = False,
    unsafe: bool = False,
):
    """POST one chunk of entities to the collection's _bulk endpoint.

    Transient failures are retried up to ``self.retries`` times. A
    permanent (or exhausted) failure raises, unless ``force`` is set,
    in which case the error is logged and the chunk is skipped.
    """
    attempt = 0
    while True:
        attempt += 1
        url = self._make_url(f"collections/{collection_id}/_bulk")
        params = {"unsafe": unsafe, "entityset_id": entityset_id}
        try:
            response = self.session.post(url, json=chunk, params=params)
            response.raise_for_status()
            return
        except RequestException as exc:
            ae = AlephException(exc)
            if not ae.transient or attempt > self.retries:
                if not force:
                    raise ae
                # force=True: log and drop this chunk instead of raising.
                log.error(ae)
                return
            backoff(ae, attempt)
def write_entity(self, collection_id: str, entity: Dict,
                 entity_id: str = None, **kw) -> Dict:
    """Create a single entity via the API, in the given collection.

    params
    ------
    collection_id: id of the collection to use. This will overwrite any
        existing collection specified in the entity dict
    entity_id: id for the entity to be created. This will overwrite any
        existing entity specified in the entity dict
    entity: A dict object containing the values of the entity
    """
    entity["collection_id"] = collection_id
    if entity_id is not None:
        entity["id"] = entity_id
    for attempt in count(1):
        if entity_id is not None:
            # BUG FIX: format the path *before* building the URL. The old
            # code did self._make_url("entities/{}").format(entity_id),
            # which relies on the literal "{}" surviving URL construction
            # and breaks if _make_url quotes the path or appends params.
            url = self._make_url("entities/{}".format(entity_id))
        else:
            url = self._make_url("entities")
        try:
            return self._request("POST", url, json=entity)
        except RequestException as exc:
            ae = AlephException(exc)
            if not ae.transient or attempt > self.retries:
                log.error(ae)
                raise exc
            backoff(ae, attempt)
    return {}
def aleph_emit(context, data):
    """Upload a crawled file to Aleph, caching the resulting document id.

    Checks the (collection, foreign_id, content_hash) tag cache first and
    short-circuits with the cached id. Otherwise builds metadata from the
    crawl data and params, uploads with rate limiting and retry/backoff,
    caches the new id, and emits the data onward with ``aleph_id``,
    ``aleph_document`` and ``aleph_collection_id`` set.
    """
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get("content_hash")
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    # Fetch document id from cache
    document_id = context.get_tag(
        make_key(collection_id, foreign_id, content_hash))
    if document_id:
        context.log.info("Skip aleph upload: %s", foreign_id)
        data["aleph_id"] = document_id
        context.emit(data=data, optional=True)
        return
    meta = {
        "crawler": context.crawler.name,
        "foreign_id": foreign_id,
        "source_url": source_url,
        "title": data.get("title"),
        "author": data.get("author"),
        "file_name": data.get("file_name"),
        "retrieved_at": data.get("retrieved_at"),
        "modified_at": data.get("modified_at"),
        "published_at": data.get("published_at"),
        "headers": data.get("headers", {}),
    }
    # Crawl-level params act as fallbacks for per-document values.
    languages = context.params.get("languages")
    meta["languages"] = data.get("languages", languages)
    countries = context.params.get("countries")
    meta["countries"] = data.get("countries", countries)
    mime_type = context.params.get("mime_type")
    meta["mime_type"] = data.get("mime_type", mime_type)
    if data.get("aleph_folder_id"):
        meta["parent"] = {"id": data.get("aleph_folder_id")}
    meta = clean_dict(meta)
    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()
        for try_number in range(api.retries):
            rate = settings.MEMORIOUS_RATE_LIMIT
            rate_limit = get_rate_limit("aleph", limit=rate)
            rate_limit.comply()
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get("id")
                context.log.info("Aleph document entity ID: %s", document_id)
                # Save the document id in cache for future use
                context.set_tag(
                    make_key(collection_id, foreign_id, content_hash),
                    document_id
                )
                data["aleph_id"] = document_id
                data["aleph_document"] = meta
                data["aleph_collection_id"] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as exc:
                # BUG FIX: try_number tops out at api.retries - 1, so the
                # old check (try_number > api.retries) never fired; warn on
                # the last attempt so exhausted retries are not silent.
                if try_number >= api.retries - 1 or not exc.transient:
                    context.emit_warning("Error: %s" % exc)
                    return
                backoff(exc, try_number)