def bulk_load(api: AlephAPI, mapping_file: str):
    data = load_config_file(mapping_file)
    if not isinstance(data, dict):
        raise AlephException('mapping_file has to be a json dictionary')
    for foreign_id, config in data.items():
        collection = api.load_collection_by_foreign_id(foreign_id, config)
        collection_id = collection['id']
        log.info(f"Bulk mapping collection ID: {collection_id}")
        api.map_collection(collection_id, data)
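# Hedged usage sketch for bulk_load() above: the host, API key and mapping
# file path are hypothetical placeholders, not values taken from the snippets.
api = AlephAPI("https://aleph.example.org/api/2/", "fake_key")
bulk_load(api, "mappings/my_dataset.yml")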
def stream_entities():
    api = AlephAPI()
    collections = api.filter_collections("*")
    # collections = filter(lambda c: not c.get("secret", False), collections)
    collections = list(collections)
    random.shuffle(collections)
    for c in tqdm(collections, desc="Collections"):
        cid = c["id"]
        for entity in _stream_collection(c):
            if entity:
                yield cid, entity
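# Minimal consumption sketch for stream_entities(): it yields
# (collection_id, entity) pairs, so a caller can group entities per
# collection. The per-collection counting below is illustrative only.
from collections import Counter

per_collection = Counter()
for collection_id, entity in stream_entities():
    per_collection[collection_id] += 1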
def fetch_object(api: AlephAPI, path: Path, entity: Dict, overwrite: bool = False):
    file_name = _get_filename(entity)
    path.mkdir(exist_ok=True, parents=True)
    object_path = path.joinpath(file_name)
    url = entity.get("links", {}).get("file")
    if url is not None:
        # Skip existing files after checking file size:
        if not overwrite and object_path.exists():
            for file_size in entity.get("properties", {}).get("fileSize", []):
                if int(file_size) == object_path.stat().st_size:
                    log.info("Skip [%s]: %s", path, file_name)
                    return
        log.info("Fetch [%s]: %s", path, file_name)
        return fetch_archive(url, object_path)
    filters = [("properties.parent", entity.get("id"))]
    results = api.search("", filters=filters, schemata="Document")
    log.info("Directory [%s]: %s (%d children)", path, file_name, len(results))
    for entity in results:
        fetch_object(api, object_path, entity, overwrite=overwrite)
def crawl_dir(api: AlephAPI, path: str, foreign_id: str, config: Dict, index: bool = True):
    """Crawl a directory and upload its content to a collection

    params
    ------
    path: path of the directory
    foreign_id: foreign_id of the collection to use.
    language: language hint for the documents
    """
    root = Path(path).resolve()
    collection = api.load_collection_by_foreign_id(foreign_id, config)
    crawler = CrawlDirectory(api, collection, root, index=index)
    threads = []
    for i in range(settings.THREADS):
        thread = threading.Thread(target=crawler.execute)
        thread.daemon = True
        thread.start()
        threads.append(thread)
    # block until all tasks are done
    crawler.queue.join()
    for thread in threads:
        thread.join()
def crawl_dir(api: AlephAPI, path: str, foreign_id: str, config: Dict):
    """Crawl a directory and upload its content to a collection

    params
    ------
    path: path of the directory
    foreign_id: foreign_id of the collection to use.
    language: language hint for the documents
    """
    _path = Path(path).resolve()
    collection = api.load_collection_by_foreign_id(foreign_id, config)
    collection_id = collection.get('id')
    _queue: Queue = Queue()
    _queue.put((_path, None, 1))
    threads = []
    for i in range(settings.THREADS):
        args = (_queue, api, collection_id, _path)
        thread = threading.Thread(target=_upload, args=args)
        thread.daemon = True
        thread.start()
        threads.append(thread)
    # block until all tasks are done
    _queue.join()
    for thread in threads:
        thread.join()
def aleph_initializer(initializer=None):
    global alephclient
    alephclient = AlephAPI(timeout=60)
    adapter = requests.adapters.HTTPAdapter(pool_connections=52)
    alephclient.session.mount("http://", adapter)
    alephclient.session.mount("https://", adapter)
    if initializer is not None:
        initializer()
def fetch_collection(api: AlephAPI, prefix: Optional[str], foreign_id: str, overwrite: bool = False):
    path = _fix_path(prefix)
    collection = api.get_collection_by_foreign_id(foreign_id)
    if collection is None:
        return
    filters = [("collection_id", collection.get("id"))]
    params = {"empty:properties.parent": "true"}
    results = api.search("", filters=filters, schemata="Document", params=params)
    label = collection.get("label")
    log.info("Dataset [%s]: %s (%d children)", path, label, len(results))
    for entity in results:
        fetch_object(api, path, entity, overwrite=overwrite)
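# Hedged usage sketch for fetch_collection(): host, API key, download prefix
# and foreign_id are hypothetical placeholders.
api = AlephAPI("https://aleph.example.org/api/2/", "fake_key")
fetch_collection(api, "downloads", "my_dataset", overwrite=False)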
def get_api(context):
    if not settings.HOST:
        context.log.warning("No $ALEPHCLIENT_HOST, skipping upload...")
        return None
    if not settings.API_KEY:
        context.log.warning("No $ALEPHCLIENT_API_KEY, skipping upload...")
        return None
    session_id = "memorious:%s" % context.crawler.name
    return AlephAPI(settings.HOST, settings.API_KEY, session_id=session_id)
def load_aleph(foreign_id, api_url, api_key):
    api = AlephAPI(api_url, api_key)
    collection_id = None
    if foreign_id is not None:
        collection = api.get_collection_by_foreign_id(foreign_id)
        if collection is None:
            raise click.BadParameter("Cannot find collection: %s" % foreign_id)
        collection_id = collection.get('id')
    stdout = click.get_text_stream('stdout')
    entities = api.stream_entities(collection_id=collection_id,
                                   include=['schema', 'properties'])
    for data in entities:
        if 'properties' not in data:
            continue
        entity = model.get_proxy(data)
        api_url = urljoin(api.base_url, 'entities/%s' % entity.id)
        entity.add('alephUrl', api_url, quiet=True)
        write_object(stdout, entity)
def cli(ctx, host, api_key, retries):
    """API client for Aleph API"""
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.getLogger("httpstream").setLevel(logging.WARNING)
    if not host:
        raise click.BadParameter("Missing Aleph host URL")
    if ctx.obj is None:
        ctx.obj = {}
    ctx.obj["api"] = AlephAPI(host, api_key, retries=retries)
def _upload_path(api: AlephAPI, path: Path, collection_id: str, parent_id: str, foreign_id: str) -> str:
    metadata = {
        'foreign_id': foreign_id,
        'file_name': path.name,
    }
    log.info('Upload [%s->%s]: %s', collection_id, parent_id, foreign_id)
    if parent_id is not None:
        metadata['parent_id'] = parent_id
    result = api.ingest_upload(collection_id, path, metadata=metadata)
    if 'id' not in result:
        raise AlephException('Upload failed')
    return result['id']
class TestApiCollection:
    fake_url = "http://aleph.test/api/2/"

    def setup_method(self, mocker):
        self.api = AlephAPI(host=self.fake_url, api_key="fake_key")

    def test_get_collection(self, mocker):
        collection_id = "8"
        mocker.patch.object(self.api, "_request")
        self.api.get_collection(collection_id)
        self.api._request.assert_called_with(
            "GET", "{}collections/{}".format(self.fake_url, collection_id))

    def test_reingest_collection(self, mocker):
        pass

    def test_reindex_collection(self, mocker):
        pass

    def test_delete_collection(self, mocker):
        pass

    def test_flush_collection(self, mocker):
        pass

    def test_get_collection_by_foreign_id(self, mocker):
        pass

    def test_load_collection_by_foreign_id(self, mocker):
        pass

    def test_filter_collections(self, mocker):
        pass

    def test_create_collection(self, mocker):
        pass
def crawl_dir(
    api: AlephAPI,
    path: str,
    foreign_id: str,
    config: Dict,
    index: bool = True,
    nojunk: bool = False,
    parallel: int = 1,
):
    """Crawl a directory and upload its content to a collection

    params
    ------
    path: path of the directory
    foreign_id: foreign_id of the collection to use.
    language: language hint for the documents
    """
    root = Path(path).resolve()
    collection = api.load_collection_by_foreign_id(foreign_id, config)
    crawler = CrawlDirectory(api, collection, root, index=index, nojunk=nojunk)
    consumers = []
    # Use one thread to produce using scandir and at least one to consume
    # files for upload.
    producer = threading.Thread(target=crawler.crawl, daemon=True)
    producer.start()
    for i in range(max(1, parallel)):
        consumer = threading.Thread(target=crawler.consume, daemon=True)
        consumer.start()
        consumers.append(consumer)
    # Block until the producer is done with queueing the tree.
    producer.join()
    # Block until the file upload queue is drained.
    crawler.queue.join()
    # Poison the queue to signal end to each consumer.
    for consumer in consumers:
        crawler.queue.put((None, None))
    # Block until all file upload queue consumers are done.
    for consumer in consumers:
        consumer.join()
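# Hedged usage sketch for the parallel crawl_dir() above: host, API key, the
# local path, foreign_id and the collection config dict are hypothetical
# placeholders (the config keys mirror those used in the tests below).
api = AlephAPI("https://aleph.example.org/api/2/", "fake_key")
config = {"label": "My documents", "languages": ["eng"]}
crawl_dir(api, "/data/documents", "my_dataset", config,
          index=True, nojunk=True, parallel=4)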
class TestApiSearch:
    fake_url = "http://aleph.test/api/2/"
    fake_query = "fleem"

    def setup_method(self, mocker):
        self.api = AlephAPI(host="http://aleph.test/api/2/", api_key="fake_key")

    def test_search(self, mocker):
        mocker.patch.object(self.api, "_request")
        search_result = self.api.search(self.fake_query)
        assert isinstance(search_result, APIResultSet)

    def test_search_url(self, mocker):
        mocker.patch.object(self.api, "_request")
        search_result = self.api.search(self.fake_query)
        assert self.fake_url in search_result.url

    def test_search_query(self, mocker):
        mocker.patch.object(self.api, "_request")
        search_result = self.api.search(self.fake_query)
        assert self.fake_query in search_result.url

    def test_search_schema(self, mocker):
        schema = "Article"
        mocker.patch.object(self.api, "_request")
        search_result = self.api.search(self.fake_query, schema)
        assert "schema={}".format(schema) in search_result.url

    def test_search_schemata(self, mocker):
        schemata = "Document"
        mocker.patch.object(self.api, "_request")
        search_result = self.api.search(self.fake_query, None, schemata)
        assert "schemata={}".format(schemata) in search_result.url

    def test_search_params(self, mocker):
        params = {"first": "first", "second": "second"}
        mocker.patch.object(self.api, "_request")
        search_result = self.api.search(self.fake_query, params=params)
        assert "first=first" in search_result.url
        assert "second=second" in search_result.url
class TestTasks(object):
    def setup_method(self):
        self.api = AlephAPI(host="http://aleph.test/api/2/", api_key="fake_key")

    def test_new_collection(self, mocker):
        mocker.patch.object(self.api, "filter_collections", return_value=[])
        mocker.patch.object(self.api, "create_collection")
        mocker.patch.object(self.api, "update_collection")
        mocker.patch.object(self.api, "ingest_upload")
        crawl_dir(self.api, "alephclient/tests/testdata", "test153", {}, True, True)
        self.api.create_collection.assert_called_once_with({
            "category": "other",
            "foreign_id": "test153",
            "label": "test153",
            "languages": [],
            "summary": "",
            "casefile": False,
        })

    def test_write_entity(self, mocker):
        mocker.patch.object(self.api, "write_entity", return_value={"id": 24})
        collection_id = 8
        entity = {
            "id": 24,
            "schema": "Article",
            "properties": {
                "title": "",
                "author": "",
                "publishedAt": "",
                "bodyText": "",
            },
        }
        res = self.api.write_entity(collection_id, entity)
        assert res["id"] == 24

    def test_ingest(self, mocker):
        mocker.patch.object(self.api, "ingest_upload", return_value={"id": 42})
        mocker.patch.object(self.api, "load_collection_by_foreign_id", return_value={"id": 2})
        mocker.patch.object(self.api, "update_collection")
        crawl_dir(self.api, "alephclient/tests/testdata", "test153", {}, True, True)
        base_path = os.path.abspath("alephclient/tests/testdata")
        assert self.api.ingest_upload.call_count == 6
        expected_calls = [
            mocker.call(
                2,
                Path(os.path.join(base_path, "feb")),
                metadata={"foreign_id": "feb", "file_name": "feb"},
                index=True,
            ),
            mocker.call(
                2,
                Path(os.path.join(base_path, "jan")),
                metadata={"foreign_id": "jan", "file_name": "jan"},
                index=True,
            ),
            mocker.call(
                2,
                Path(os.path.join(base_path, "feb/2.txt")),
                metadata={
                    "parent_id": 42,
                    "foreign_id": "feb/2.txt",
                    "file_name": "2.txt",
                },
                index=True,
            ),
            mocker.call(
                2,
                Path(os.path.join(base_path, "jan/week1")),
                metadata={
                    "parent_id": 42,
                    "foreign_id": "jan/week1",
                    "file_name": "week1",
                },
                index=True,
            ),
            mocker.call(
                2,
                Path(os.path.join(base_path, "jan/week1/1.txt")),
                metadata={
                    "parent_id": 42,
                    "foreign_id": "jan/week1/1.txt",
                    "file_name": "1.txt",
                },
                index=True,
            ),
        ]
        for call in expected_calls:
            assert call in self.api.ingest_upload.mock_calls
def load_entities(json_file, root_path):
    api = AlephAPI()
    collection = api.load_collection_by_foreign_id('zz_occrp_pdi')
    cid = collection.get('id')
    api.write_entities(cid, generate_entities(json_file, root_path, api, cid))
def aleph_emit(context, data):
    if not settings.ALEPH_HOST:
        context.log.warning("No $MEMORIOUS_ALEPH_HOST, skipping upload...")
        return
    if not settings.ALEPH_API_KEY:
        context.log.warning("No $MEMORIOUS_ALEPH_API_KEY, skipping upload...")
        return
    session_id = 'memorious:%s' % context.crawler.name
    api = AlephAPI(settings.ALEPH_HOST, settings.ALEPH_API_KEY,
                   session_id=session_id)
    collection_id = get_collection_id(context, api)
    if collection_id is None:
        context.log.warning("Cannot get aleph collection.")
        return
    content_hash = data.get('content_hash')
    source_url = data.get('source_url', data.get('url'))
    foreign_id = data.get('foreign_id', data.get('request_id', source_url))
    if context.skip_incremental(collection_id, foreign_id, content_hash):
        context.log.info("Skip aleph upload: %s", foreign_id)
        return
    meta = {
        'crawler': context.crawler.name,
        'foreign_id': foreign_id,
        'source_url': source_url,
        'title': data.get('title'),
        'author': data.get('author'),
        'file_name': data.get('file_name'),
        'retrieved_at': data.get('retrieved_at'),
        'modified_at': data.get('modified_at'),
        'published_at': data.get('published_at'),
        'headers': data.get('headers', {}),
    }
    languages = context.params.get('languages')
    meta['languages'] = data.get('languages', languages)
    countries = context.params.get('countries')
    meta['countries'] = data.get('countries', countries)
    mime_type = context.params.get('mime_type')
    meta['mime_type'] = data.get('mime_type', mime_type)
    if data.get('parent_foreign_id'):
        meta['parent'] = {'foreign_id': data.get('parent_foreign_id')}
    meta = clean_dict(meta)
    # pprint(meta)
    label = meta.get('file_name', meta.get('source_url'))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()
        res = api.ingest_upload(collection_id, file_path, meta)
        if res.get('status') == 'ok':
            document = res.get('documents')[0]
            context.log.info("Document ID: %s", document['id'])
        else:
            context.emit_warning("Error: %r" % res)
def __init__(self):
    self.api = AlephAPI()
class AlephEnricher(Enricher):
    def __init__(self):
        self.api = AlephAPI()

    def get_api(self, url):
        data = self.cache.get(url)
        if data is not None:
            return data
        try:
            log.info("Aleph fetch: %s", url)
            res = self.api.session.get(url)
            if res.status_code != 200:
                return {}
            data = res.json()
            self.cache.store(url, data)
            return data
        except RequestException:
            log.exception("Error calling Aleph API")
            return {}

    def post_match(self, url, proxy):
        data = proxy.to_dict()
        key = proxy.id or hash_data(data)
        key = hash_data((url, key))
        if self.cache.has(key):
            log.info("Cached [%s]: %s", key, url)
            return self.cache.get(key)
        log.info("Enrich: %r", proxy)
        try:
            res = self.api.session.post(url, json=data)
        except RequestException:
            log.exception("Error calling Aleph matcher")
            return {}
        if res.status_code != 200:
            return {}
        data = res.json()
        self.cache.store(key, data)
        return data

    def convert_entity(self, data):
        data = ensure_dict(data)
        if "properties" not in data or "schema" not in data:
            return
        try:
            entity = model.get_proxy(data, cleaned=False)
        except InvalidData:
            log.error("Server model mismatch: %s" % data.get("schema"))
            return
        entity.id = data.get("id")
        links = ensure_dict(data.get("links"))
        entity.add("alephUrl", links.get("self"), quiet=True, cleaned=True)
        collection = data.get("collection", {})
        entity.add("publisher", collection.get("label"), quiet=True, cleaned=True)
        clinks = collection.get("links", {})
        entity.add("publisherUrl", clinks.get("ui"), quiet=True, cleaned=True)
        return entity

    def convert_nested(self, data):
        entity = self.convert_entity(data)
        properties = ensure_dict(data.get("properties"))
        for prop, values in properties.items():
            for value in ensure_list(values):
                if is_mapping(value):
                    yield self.convert_entity(value)
        yield entity

    def enrich_entity(self, entity):
        url = self.api._make_url("match")
        for page in range(10):
            data = self.post_match(url, entity)
            for res in data.get("results", []):
                proxy = self.convert_entity(res)
                yield self.make_match(entity, proxy)
            url = data.get("next")
            if url is None:
                break

    def expand_entity(self, entity):
        for url in entity.get("alephUrl", quiet=True):
            data = self.get_api(url)
            yield from self.convert_nested(data)
            _, entity_id = url.rsplit("/", 1)
            filters = (("entities", entity_id),)
            search_api = self.api._make_url("entities", filters=filters)
            while True:
                res = self.get_api(search_api)
                for data in ensure_list(res.get("results")):
                    yield from self.convert_nested(data)
                search_api = res.get("next")
                if search_api is None:
                    break
def __init__(self, host=None):
    self.host = host or os.environ.get('ENRICH_ALEPH_HOST')
    self.host = self.host or os.environ.get('ALEPH_HOST')
    self.api_key = os.environ.get('ALEPH_API_KEY')
    self.api_key = os.environ.get('ENRICH_ALEPH_API_KEY', self.api_key)
    self.api = AlephAPI(self.host, self.api_key)
class AlephEnricher(Enricher):
    key_prefix = 'aleph'
    TYPE_CONSTRAINT = 'LegalEntity'

    def __init__(self, host=None):
        self.host = host or os.environ.get('ENRICH_ALEPH_HOST')
        self.host = self.host or os.environ.get('ALEPH_HOST')
        self.api_key = os.environ.get('ALEPH_API_KEY')
        self.api_key = os.environ.get('ENRICH_ALEPH_API_KEY', self.api_key)
        self.api = AlephAPI(self.host, self.api_key)

    def get_api(self, url, params=None):
        url = make_url(url, params)
        data = self.cache.get(url)
        if data is None:
            try:
                res = self.api.session.get(url)
                if res.status_code != 200:
                    return {}
                data = res.json()
                self.cache.store(url, data)
            except RequestException:
                log.exception("Error calling Aleph API")
                return {}
        return data

    def post_match(self, url, proxy):
        data = proxy.to_dict()
        key = proxy.id or hash_data(data)
        key = hash_data((url, key))
        if self.cache.has(key):
            # log.info("Cached [%s]: %s", self.host, proxy)
            return self.cache.get(key)
        log.info("Enrich [%s]: %s", self.host, proxy)
        try:
            res = self.api.session.post(url, json=data)
        except RequestException:
            log.exception("Error calling Aleph matcher")
            return {}
        if res.status_code != 200:
            return {}
        data = res.json()
        self.cache.store(key, data)
        return data

    def convert_entity(self, result, data):
        data = ensure_dict(data)
        if 'properties' not in data or 'schema' not in data:
            return
        try:
            entity = result.make_entity(data.get('schema'))
        except InvalidData:
            log.error("Server model mismatch: %s" % data.get('schema'))
            return
        entity.id = data.get('id')
        links = ensure_dict(data.get('links'))
        entity.add('alephUrl', links.get('self'), quiet=True, cleaned=True)
        collection = data.get('collection', {})
        entity.add('publisher', collection.get('label'), quiet=True, cleaned=True)
        clinks = collection.get('links', {})
        entity.add('publisherUrl', clinks.get('ui'), quiet=True, cleaned=True)
        properties = ensure_dict(data.get('properties'))
        for prop, values in properties.items():
            for value in ensure_list(values):
                if is_mapping(value):
                    child = self.convert_entity(result, value)
                    if child.id is None:
                        continue
                    value = child.id
                try:
                    entity.add(prop, value, cleaned=True)
                except InvalidData:
                    msg = "Server property mismatch (%s): %s"
                    log.warning(msg % (entity.schema.name, prop))
        result.add_entity(entity)
        return entity

    def enrich_entity(self, entity):
        if not entity.schema.matchable:
            return
        url = self.api._make_url('match')
        for page in range(10):
            data = self.post_match(url, entity)
            for res in data.get('results', []):
                result = self.make_result(entity)
                proxy = self.convert_entity(result, res)
                result.set_candidate(proxy)
                if result.candidate is not None:
                    yield result
            url = data.get('next')
            if url is None:
                break

    def expand_entity(self, entity):
        result = super(AlephEnricher, self).expand_entity(entity)
        for url in entity.get('alephUrl', quiet=True):
            _, entity_id = url.rsplit('/', 1)
            data = self.get_api(url)
            self.convert_entity(result, data)
            search_api = self.api._make_url('search')
            params = {'filter:entities': entity_id}
            entities = self.get_api(search_api, params=params)
            for data in ensure_list(entities.get('results')):
                self.convert_entity(result, data)
        return result
def fetch_entity(api: AlephAPI, prefix: Optional[str], entity_id: str, overwrite: bool = False):
    entity = api.get_entity(entity_id)
    return fetch_object(api, _fix_path(prefix), entity, overwrite=overwrite)
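# Hedged usage sketch for fetch_entity(): the entity ID, prefix, host and API
# key are hypothetical placeholders.
api = AlephAPI("https://aleph.example.org/api/2/", "fake_key")
fetch_entity(api, "downloads", "deadbeef.0a1b2c", overwrite=True)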
def setup_method(self, mocker):
    self.api = AlephAPI(host=self.fake_url, api_key="fake_key")
def init_aleph(lock=None):
    global api
    api = AlephAPI()
    if lock is not None:
        tqdm.set_lock(lock)
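# Hedged sketch of how init_aleph() might be wired up as a multiprocessing
# initializer, passing tqdm's shared lock so worker processes can render
# progress bars safely. The pool size, worker function (process_collection)
# and the collections iterable are hypothetical.
from multiprocessing import Pool

with Pool(processes=4, initializer=init_aleph, initargs=(tqdm.get_lock(),)) as pool:
    pool.map(process_collection, collections)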
def setup_method(self):
    self.api = AlephAPI(
        host="http://aleph.test/api/2/",
        api_key="fake_key"
    )
import logging
from functools import lru_cache
from copy import deepcopy
from functools import partial, wraps
from concurrent.futures import ThreadPoolExecutor

from alephclient.api import AlephAPI, EntityResultSet, AlephException
from tqdm.autonotebook import tqdm
import requests

from followthemoney import model
from followthemoney.exc import InvalidData

log = logging.getLogger(__name__)

alephclient = AlephAPI(timeout=60)
alephclient._request = lru_cache(2048)(alephclient._request)


def aleph_initializer(initializer=None):
    global alephclient
    alephclient = AlephAPI(timeout=60)
    adapter = requests.adapters.HTTPAdapter(pool_connections=52)
    alephclient.session.mount("http://", adapter)
    alephclient.session.mount("https://", adapter)
    if initializer is not None:
        initializer()


@wraps(ThreadPoolExecutor)
def AlephPool(*args, **kwargs):