Example #1
def bulk_load(api: AlephAPI, mapping_file: str):
    data = load_config_file(mapping_file)
    if not isinstance(data, dict):
        raise AlephException('mapping_file has to be a json dictionary')
    for foreign_id, config in data.items():
        collection = api.load_collection_by_foreign_id(foreign_id, config)
        collection_id = collection['id']
        log.info(f"Bulk mapping collection ID: {collection_id}")
        api.map_collection(collection_id, data)
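For context, `load_config_file` has to return a dictionary keyed by collection foreign_id, as the type check above enforces. A minimal, hypothetical invocation (the file name and its contents are placeholders, not part of the original example):

# Hedged usage sketch: "mappings.json" is a placeholder path. The file must
# parse to a dict of {foreign_id: collection/mapping config}, otherwise the
# AlephException above is raised.
api = AlephAPI()  # host and API key are typically read from $ALEPHCLIENT_HOST / $ALEPHCLIENT_API_KEY
bulk_load(api, "mappings.json")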
Example #2
def stream_entities():
    api = AlephAPI()
    collections = api.filter_collections("*")
    # collections = filter(lambda c: not c.get("secret", False), collections)
    collections = list(collections)
    random.shuffle(collections)
    for c in tqdm(collections, desc="Collections"):
        cid = c["id"]
        for entity in _stream_collection(c):
            if entity:
                yield cid, entity
Example #3
def fetch_object(api: AlephAPI,
                 path: Path,
                 entity: Dict,
                 overwrite: bool = False):
    file_name = _get_filename(entity)
    path.mkdir(exist_ok=True, parents=True)
    object_path = path.joinpath(file_name)
    url = entity.get("links", {}).get("file")
    if url is not None:

        # Skip existing files after checking file size:
        if not overwrite and object_path.exists():
            for file_size in entity.get("properties", {}).get("fileSize", []):
                if int(file_size) == object_path.stat().st_size:
                    log.info("Skip [%s]: %s", path, file_name)
                    return

        log.info("Fetch [%s]: %s", path, file_name)
        return fetch_archive(url, object_path)

    filters = [("properties.parent", entity.get("id"))]
    results = api.search("", filters=filters, schemata="Document")
    log.info("Directory [%s]: %s (%d children)", path, file_name, len(results))
    for entity in results:
        fetch_object(api, object_path, entity, overwrite=overwrite)
Example #4
def crawl_dir(api: AlephAPI,
              path: str,
              foreign_id: str,
              config: Dict,
              index: bool = True):
    """Crawl a directory and upload its content to a collection

    params
    ------
    path: path of the directory
    foreign_id: foreign_id of the collection to use.
    config: collection metadata used if the collection has to be created
    index: index the documents after ingest
    """
    root = Path(path).resolve()
    collection = api.load_collection_by_foreign_id(foreign_id, config)
    crawler = CrawlDirectory(api, collection, root, index=index)
    threads = []
    for i in range(settings.THREADS):
        thread = threading.Thread(target=crawler.execute)
        thread.daemon = True
        thread.start()
        threads.append(thread)

    # block until all tasks are done
    crawler.queue.join()
    for thread in threads:
        thread.join()
Example #5
def crawl_dir(api: AlephAPI, path: str, foreign_id: str, config: Dict):
    """Crawl a directory and upload its content to a collection

    params
    ------
    path: path of the directory
    foreign_id: foreign_id of the collection to use.
    config: collection metadata used if the collection has to be created
    """
    _path = Path(path).resolve()
    collection = api.load_collection_by_foreign_id(foreign_id, config)
    collection_id = collection.get('id')
    _queue: Queue = Queue()
    _queue.put((_path, None, 1))
    threads = []
    for i in range(settings.THREADS):
        args = (_queue, api, collection_id, _path)
        thread = threading.Thread(target=_upload, args=args)
        thread.daemon = True
        thread.start()
        threads.append(thread)

    # block until all tasks are done
    _queue.join()
    for thread in threads:
        thread.join()
Example #6
def aleph_initializer(initializer=None):
    global alephclient
    alephclient = AlephAPI(timeout=60)
    adapter = requests.adapters.HTTPAdapter(pool_connections=52)
    alephclient.session.mount("http://", adapter)
    alephclient.session.mount("https://", adapter)
    if initializer is not None:
        initializer()
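Since it mutates a module-level `alephclient` and then chains an optional `initializer`, this function is shaped like a worker-pool initializer. A minimal sketch of how it could be wired up, assuming a `ThreadPoolExecutor` and placeholder task names:

from concurrent.futures import ThreadPoolExecutor

# Hypothetical wiring: each worker runs aleph_initializer once before tasks start.
with ThreadPoolExecutor(max_workers=8, initializer=aleph_initializer) as pool:
    results = list(pool.map(fetch_one, entity_ids))  # fetch_one / entity_ids are placeholders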
Example #7
def fetch_collection(api: AlephAPI,
                     prefix: Optional[str],
                     foreign_id: str,
                     overwrite: bool = False):
    path = _fix_path(prefix)
    collection = api.get_collection_by_foreign_id(foreign_id)
    if collection is None:
        return
    filters = [("collection_id", collection.get("id"))]
    params = {"empty:properties.parent": "true"}
    results = api.search("",
                         filters=filters,
                         schemata="Document",
                         params=params)
    label = collection.get("label")
    log.info("Dataset [%s]: %s (%d children)", path, label, len(results))
    for entity in results:
        fetch_object(api, path, entity, overwrite=overwrite)
Example #8
def get_api(context):
    if not settings.HOST:
        context.log.warning("No $ALEPHCLIENT_HOST, skipping upload...")
        return None
    if not settings.API_KEY:
        context.log.warning("No $ALEPHCLIENT_API_KEY, skipping upload...")
        return None

    session_id = "memorious:%s" % context.crawler.name
    return AlephAPI(settings.HOST, settings.API_KEY, session_id=session_id)
Example #9
def load_aleph(foreign_id, api_url, api_key):
    api = AlephAPI(api_url, api_key)
    collection_id = None
    if foreign_id is not None:
        collection = api.get_collection_by_foreign_id(foreign_id)
        if collection is None:
            raise click.BadParameter("Cannot find collection: %s" % foreign_id)
        collection_id = collection.get('id')

    stdout = click.get_text_stream('stdout')
    entities = api.stream_entities(collection_id=collection_id,
                                   include=['schema', 'properties'])
    for data in entities:
        if 'properties' not in data:
            continue
        entity = model.get_proxy(data)
        api_url = urljoin(api.base_url, 'entities/%s' % entity.id)
        entity.add('alephUrl', api_url, quiet=True)
        write_object(stdout, entity)
Example #10
def cli(ctx, host, api_key, retries):
    """API client for Aleph API"""
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.getLogger("httpstream").setLevel(logging.WARNING)
    if not host:
        raise click.BadParameter("Missing Aleph host URL")
    if ctx.obj is None:
        ctx.obj = {}
    ctx.obj["api"] = AlephAPI(host, api_key, retries=retries)
Example #11
def _upload_path(api: AlephAPI, path: Path, collection_id: str, parent_id: str,
                 foreign_id: str) -> str:
    metadata = {
        'foreign_id': foreign_id,
        'file_name': path.name,
    }
    log.info('Upload [%s->%s]: %s', collection_id, parent_id, foreign_id)
    if parent_id is not None:
        metadata['parent_id'] = parent_id
    result = api.ingest_upload(collection_id, path, metadata=metadata)
    if 'id' not in result:
        raise AlephException('Upload failed')
    return result['id']
Example #12
class TestApiCollection:
    fake_url = "http://aleph.test/api/2/"

    def setup_method(self, mocker):
        self.api = AlephAPI(host=self.fake_url, api_key="fake_key")

    def test_get_collection(self, mocker):
        collection_id = "8"
        mocker.patch.object(self.api, "_request")
        self.api.get_collection(collection_id)
        self.api._request.assert_called_with(
            "GET", "{}collections/{}".format(self.fake_url, collection_id))

    def test_reingest_collection(self, mocker):
        pass

    def test_reindex_collection(self, mocker):
        pass

    def test_delete_collection(self, mocker):
        pass

    def test_flush_collection(self, mocker):
        pass

    def test_get_collection_by_foreign_id(self, mocker):
        pass

    def test_load_collection_by_foreign_id(self, mocker):
        pass

    def test_filter_collections(self, mocker):
        pass

    def test_create_collection(self, mocker):
        pass
Example #13
def crawl_dir(
    api: AlephAPI,
    path: str,
    foreign_id: str,
    config: Dict,
    index: bool = True,
    nojunk: bool = False,
    parallel: int = 1,
):
    """Crawl a directory and upload its content to a collection

    params
    ------
    path: path of the directory
    foreign_id: foreign_id of the collection to use.
    config: collection metadata used if the collection has to be created
    index: index the documents after ingest
    nojunk: skip well-known junk files and directories
    parallel: number of parallel upload threads
    """
    root = Path(path).resolve()
    collection = api.load_collection_by_foreign_id(foreign_id, config)
    crawler = CrawlDirectory(api, collection, root, index=index, nojunk=nojunk)
    consumers = []

    # Use one thread to produce using scandir and at least one to consume
    # files for upload.
    producer = threading.Thread(target=crawler.crawl, daemon=True)
    producer.start()
    for i in range(max(1, parallel)):
        consumer = threading.Thread(target=crawler.consume, daemon=True)
        consumer.start()
        consumers.append(consumer)

    # Block until the producer is done with queueing the tree.
    producer.join()

    # Block until the file upload queue is drained.
    crawler.queue.join()

    # Poison the queue to signal end to each consumer.
    for consumer in consumers:
        crawler.queue.put((None, None))

    # Block until all file upload queue consumers are done.
    for consumer in consumers:
        consumer.join()
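A hedged usage sketch of this variant, assuming standard alephclient settings; the directory path, foreign_id, label, and thread count below are placeholders:

api = AlephAPI()
crawl_dir(
    api,
    "/data/imports/my_leak",        # placeholder directory to crawl
    "my_leak",                      # placeholder collection foreign_id
    config={"label": "My Leak"},    # metadata used if the collection is created
    index=True,
    nojunk=True,
    parallel=4,
)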
Example #14
class TestApiSearch:
    fake_url = "http://aleph.test/api/2/"
    fake_query = "fleem"

    def setup_method(self, mocker):
        self.api = AlephAPI(host="http://aleph.test/api/2/", api_key="fake_key")

    def test_search(self, mocker):
        mocker.patch.object(self.api, "_request")
        search_result = self.api.search(self.fake_query)

        assert isinstance(search_result, APIResultSet)

    def test_search_url(self, mocker):
        mocker.patch.object(self.api, "_request")
        search_result = self.api.search(self.fake_query)

        assert self.fake_url in search_result.url

    def test_search_query(self, mocker):
        mocker.patch.object(self.api, "_request")
        search_result = self.api.search(self.fake_query)

        assert self.fake_query in search_result.url

    def test_search_schema(self, mocker):
        schema = "Article"
        mocker.patch.object(self.api, "_request")
        search_result = self.api.search(self.fake_query, schema)

        assert "schema={}".format(schema) in search_result.url

    def test_search_schemata(self, mocker):
        schemata = "Document"
        mocker.patch.object(self.api, "_request")
        search_result = self.api.search(self.fake_query, None, schemata)

        assert "schemata={}".format(schemata) in search_result.url

    def test_search_params(self, mocker):
        params = {"first": "first", "second": "second"}
        mocker.patch.object(self.api, "_request")
        search_result = self.api.search(self.fake_query, params=params)

        assert "first=first" in search_result.url
        assert "second=second" in search_result.url
Example #15
class TestTasks(object):
    def setup_method(self):
        self.api = AlephAPI(host="http://aleph.test/api/2/",
                            api_key="fake_key")

    def test_new_collection(self, mocker):
        mocker.patch.object(self.api, "filter_collections", return_value=[])
        mocker.patch.object(self.api, "create_collection")
        mocker.patch.object(self.api, "update_collection")
        mocker.patch.object(self.api, "ingest_upload")
        crawl_dir(self.api, "alephclient/tests/testdata", "test153", {}, True,
                  True)
        self.api.create_collection.assert_called_once_with({
            "category": "other",
            "foreign_id": "test153",
            "label": "test153",
            "languages": [],
            "summary": "",
            "casefile": False,
        })

    def test_write_entity(self, mocker):
        mocker.patch.object(self.api, "write_entity", return_value={"id": 24})
        collection_id = 8
        entity = {
            "id": 24,
            "schema": "Article",
            "properties": {
                "title": "",
                "author": "",
                "publishedAt": "",
                "bodyText": "",
            },
        }

        res = self.api.write_entity(collection_id, entity)
        assert res["id"] == 24

    def test_ingest(self, mocker):
        mocker.patch.object(self.api, "ingest_upload", return_value={"id": 42})
        mocker.patch.object(self.api,
                            "load_collection_by_foreign_id",
                            return_value={"id": 2})
        mocker.patch.object(self.api, "update_collection")
        crawl_dir(self.api, "alephclient/tests/testdata", "test153", {}, True,
                  True)
        base_path = os.path.abspath("alephclient/tests/testdata")
        assert self.api.ingest_upload.call_count == 6
        expected_calls = [
            mocker.call(
                2,
                Path(os.path.join(base_path, "feb")),
                metadata={
                    "foreign_id": "feb",
                    "file_name": "feb"
                },
                index=True,
            ),
            mocker.call(
                2,
                Path(os.path.join(base_path, "jan")),
                metadata={
                    "foreign_id": "jan",
                    "file_name": "jan"
                },
                index=True,
            ),
            mocker.call(
                2,
                Path(os.path.join(base_path, "feb/2.txt")),
                metadata={
                    "parent_id": 42,
                    "foreign_id": "feb/2.txt",
                    "file_name": "2.txt",
                },
                index=True,
            ),
            mocker.call(
                2,
                Path(os.path.join(base_path, "jan/week1")),
                metadata={
                    "parent_id": 42,
                    "foreign_id": "jan/week1",
                    "file_name": "week1",
                },
                index=True,
            ),
            mocker.call(
                2,
                Path(os.path.join(base_path, "jan/week1/1.txt")),
                metadata={
                    "parent_id": 42,
                    "foreign_id": "jan/week1/1.txt",
                    "file_name": "1.txt",
                },
                index=True,
            ),
        ]
        for call in expected_calls:
            assert call in self.api.ingest_upload.mock_calls
Example #16
def load_entities(json_file, root_path):
    api = AlephAPI()
    collection = api.load_collection_by_foreign_id('zz_occrp_pdi')
    cid = collection.get('id')
    api.write_entities(cid, generate_entities(json_file, root_path, api, cid))
Example #17
def aleph_emit(context, data):
    if not settings.ALEPH_HOST:
        context.log.warning("No $MEMORIOUS_ALEPH_HOST, skipping upload...")
        return
    if not settings.ALEPH_API_KEY:
        context.log.warning("No $MEMORIOUS_ALEPH_API_KEY, skipping upload...")
        return

    session_id = 'memorious:%s' % context.crawler.name
    api = AlephAPI(settings.ALEPH_HOST,
                   settings.ALEPH_API_KEY,
                   session_id=session_id)
    collection_id = get_collection_id(context, api)
    if collection_id is None:
        context.log.warning("Cannot get aleph collection.")
        return

    content_hash = data.get('content_hash')
    source_url = data.get('source_url', data.get('url'))
    foreign_id = data.get('foreign_id', data.get('request_id', source_url))
    if context.skip_incremental(collection_id, foreign_id, content_hash):
        context.log.info("Skip aleph upload: %s", foreign_id)
        return

    meta = {
        'crawler': context.crawler.name,
        'foreign_id': foreign_id,
        'source_url': source_url,
        'title': data.get('title'),
        'author': data.get('author'),
        'file_name': data.get('file_name'),
        'retrieved_at': data.get('retrieved_at'),
        'modified_at': data.get('modified_at'),
        'published_at': data.get('published_at'),
        'headers': data.get('headers', {})
    }

    languages = context.params.get('languages')
    meta['languages'] = data.get('languages', languages)
    countries = context.params.get('countries')
    meta['countries'] = data.get('countries', countries)
    mime_type = context.params.get('mime_type')
    meta['mime_type'] = data.get('mime_type', mime_type)

    if data.get('parent_foreign_id'):
        meta['parent'] = {'foreign_id': data.get('parent_foreign_id')}

    meta = clean_dict(meta)
    # pprint(meta)

    label = meta.get('file_name', meta.get('source_url'))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()
        res = api.ingest_upload(collection_id, file_path, meta)
        if res.get('status') == 'ok':
            document = res.get('documents')[0]
            context.log.info("Document ID: %s", document['id'])
        else:
            context.emit_warning("Error: %r" % res)
Example #18
 def __init__(self):
     self.api = AlephAPI()
Example #19
class AlephEnricher(Enricher):
    def __init__(self):
        self.api = AlephAPI()

    def get_api(self, url):
        data = self.cache.get(url)
        if data is not None:
            return data
        try:
            log.info("Aleph fetch: %s", url)
            res = self.api.session.get(url)
            if res.status_code != 200:
                return {}
            data = res.json()
            self.cache.store(url, data)
            return data
        except RequestException:
            log.exception("Error calling Aleph API")
            return {}

    def post_match(self, url, proxy):
        data = proxy.to_dict()
        key = proxy.id or hash_data(data)
        key = hash_data((url, key))
        if self.cache.has(key):
            log.info("Cached [%s]: %s", key, url)
            return self.cache.get(key)

        log.info("Enrich: %r", proxy)
        try:
            res = self.api.session.post(url, json=data)
        except RequestException:
            log.exception("Error calling Aleph matcher")
            return {}
        if res.status_code != 200:
            return {}
        data = res.json()
        self.cache.store(key, data)
        return data

    def convert_entity(self, data):
        data = ensure_dict(data)
        if "properties" not in data or "schema" not in data:
            return
        try:
            entity = model.get_proxy(data, cleaned=False)
        except InvalidData:
            log.error("Server model mismatch: %s" % data.get("schema"))
            return
        entity.id = data.get("id")
        links = ensure_dict(data.get("links"))
        entity.add("alephUrl", links.get("self"), quiet=True, cleaned=True)
        collection = data.get("collection", {})
        entity.add("publisher",
                   collection.get("label"),
                   quiet=True,
                   cleaned=True)
        clinks = collection.get("links", {})
        entity.add("publisherUrl", clinks.get("ui"), quiet=True, cleaned=True)
        return entity

    def convert_nested(self, data):
        entity = self.convert_entity(data)
        properties = ensure_dict(data.get("properties"))
        for prop, values in properties.items():
            for value in ensure_list(values):
                if is_mapping(value):
                    yield self.convert_entity(value)
        yield entity

    def enrich_entity(self, entity):
        url = self.api._make_url("match")
        for page in range(10):
            data = self.post_match(url, entity)
            for res in data.get("results", []):
                proxy = self.convert_entity(res)
                yield self.make_match(entity, proxy)

            url = data.get("next")
            if url is None:
                break

    def expand_entity(self, entity):
        for url in entity.get("alephUrl", quiet=True):
            data = self.get_api(url)
            yield from self.convert_nested(data)

            _, entity_id = url.rsplit("/", 1)
            filters = (("entities", entity_id), )
            search_api = self.api._make_url("entities", filters=filters)
            while True:
                res = self.get_api(search_api)
                for data in ensure_list(res.get("results")):
                    yield from self.convert_nested(data)

                search_api = res.get("next")
                if search_api is None:
                    break
Example #20
 def __init__(self, host=None):
     self.host = host or os.environ.get('ENRICH_ALEPH_HOST')
     self.host = self.host or os.environ.get('ALEPH_HOST')
     self.api_key = os.environ.get('ALEPH_API_KEY')
     self.api_key = os.environ.get('ENRICH_ALEPH_API_KEY', self.api_key)
     self.api = AlephAPI(self.host, self.api_key)
Example #21
class AlephEnricher(Enricher):
    key_prefix = 'aleph'
    TYPE_CONSTRAINT = 'LegalEntity'

    def __init__(self, host=None):
        self.host = host or os.environ.get('ENRICH_ALEPH_HOST')
        self.host = self.host or os.environ.get('ALEPH_HOST')
        self.api_key = os.environ.get('ALEPH_API_KEY')
        self.api_key = os.environ.get('ENRICH_ALEPH_API_KEY', self.api_key)
        self.api = AlephAPI(self.host, self.api_key)

    def get_api(self, url, params=None):
        url = make_url(url, params)
        data = self.cache.get(url)
        if data is None:
            try:
                res = self.api.session.get(url)
                if res.status_code != 200:
                    return {}
                data = res.json()
                self.cache.store(url, data)
            except RequestException:
                log.exception("Error calling Aleph API")
                return {}
        return data

    def post_match(self, url, proxy):
        data = proxy.to_dict()
        key = proxy.id or hash_data(data)
        key = hash_data((url, key))
        if self.cache.has(key):
            # log.info("Cached [%s]: %s", self.host, proxy)
            return self.cache.get(key)

        log.info("Enrich [%s]: %s", self.host, proxy)
        try:
            res = self.api.session.post(url, json=data)
        except RequestException:
            log.exception("Error calling Aleph matcher")
            return {}
        if res.status_code != 200:
            return {}
        data = res.json()
        self.cache.store(key, data)
        return data

    def convert_entity(self, result, data):
        data = ensure_dict(data)
        if 'properties' not in data or 'schema' not in data:
            return
        try:
            entity = result.make_entity(data.get('schema'))
        except InvalidData:
            log.error("Server model mismatch: %s" % data.get('schema'))
            return
        entity.id = data.get('id')
        links = ensure_dict(data.get('links'))
        entity.add('alephUrl', links.get('self'),
                   quiet=True, cleaned=True)
        collection = data.get('collection', {})
        entity.add('publisher', collection.get('label'),
                   quiet=True, cleaned=True)
        clinks = collection.get('links', {})
        entity.add('publisherUrl', clinks.get('ui'),
                   quiet=True, cleaned=True)
        properties = ensure_dict(data.get('properties'))
        for prop, values in properties.items():
            for value in ensure_list(values):
                if is_mapping(value):
                    child = self.convert_entity(result, value)
                    if child is None or child.id is None:
                        continue
                    value = child.id
                try:
                    entity.add(prop, value, cleaned=True)
                except InvalidData:
                    msg = "Server property mismatch (%s): %s"
                    log.warning(msg % (entity.schema.name, prop))
        result.add_entity(entity)
        return entity

    def enrich_entity(self, entity):
        if not entity.schema.matchable:
            return

        url = self.api._make_url('match')
        for page in range(10):
            data = self.post_match(url, entity)
            for res in data.get('results', []):
                result = self.make_result(entity)
                proxy = self.convert_entity(result, res)
                result.set_candidate(proxy)
                if result.candidate is not None:
                    yield result

            url = data.get('next')
            if url is None:
                break

    def expand_entity(self, entity):
        result = super(AlephEnricher, self).expand_entity(entity)
        for url in entity.get('alephUrl', quiet=True):
            _, entity_id = url.rsplit('/', 1)
            data = self.get_api(url)
            self.convert_entity(result, data)
            search_api = self.api._make_url('search')
            params = {'filter:entities': entity_id}
            entities = self.get_api(search_api, params=params)
            for data in ensure_list(entities.get('results')):
                self.convert_entity(result, data)
        return result
Example #22
def fetch_entity(api: AlephAPI,
                 prefix: Optional[str],
                 entity_id: str,
                 overwrite: bool = False):
    entity = api.get_entity(entity_id)
    return fetch_object(api, _fix_path(prefix), entity, overwrite=overwrite)
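A short, hypothetical call, with the prefix directory and entity id as placeholders; for folder-like entities, `fetch_object` (Example #3) recurses into the children:

api = AlephAPI()
fetch_entity(api, "downloads", "1234.deadbeef", overwrite=False)  # both arguments are placeholders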
Example #23
 def setup_method(self, mocker):
     self.api = AlephAPI(host=self.fake_url, api_key="fake_key")
Example #24
def init_aleph(lock=None):
    global api
    api = AlephAPI()
    if lock is not None:
        tqdm.set_lock(lock)
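This initializer looks intended for process pools that also render tqdm progress bars. A minimal sketch, assuming `multiprocessing.Pool` and placeholder worker names:

from multiprocessing import Pool

# Hypothetical wiring: pass tqdm's lock so bars from different processes do not clobber each other.
with Pool(processes=4, initializer=init_aleph, initargs=(tqdm.get_lock(),)) as pool:
    pool.map(process_collection, collection_ids)  # process_collection / collection_ids are placeholders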
Example #25
 def setup_method(self):
     self.api = AlephAPI(
         host="http://aleph.test/api/2/", api_key="fake_key"
     )
Example #26
import logging
from copy import deepcopy
from functools import lru_cache, partial, wraps
from concurrent.futures import ThreadPoolExecutor

from alephclient.api import AlephAPI, EntityResultSet, AlephException
from tqdm.autonotebook import tqdm
import requests

from followthemoney import model
from followthemoney.exc import InvalidData

log = logging.getLogger(__name__)

alephclient = AlephAPI(timeout=60)
alephclient._request = lru_cache(2048)(alephclient._request)


def aleph_initializer(initializer=None):
    global alephclient
    alephclient = AlephAPI(timeout=60)
    adapter = requests.adapters.HTTPAdapter(pool_connections=52)
    alephclient.session.mount("http://", adapter)
    alephclient.session.mount("https://", adapter)
    if initializer is not None:
        initializer()


@wraps(ThreadPoolExecutor)
def AlephPool(*args, **kwargs):