Example #1
File: util.py  Project: stofstar/aleph
def get_session_id():
    role_id = stringify(request.authz.id) or 'anonymous'
    session_id = None
    if hasattr(request, '_session_id'):
        session_id = stringify(request._session_id)
    session_id = session_id or Job.random_id()
    return '%s:%s' % (role_id, session_id)
Example #2
def reingest_collection(collection, job_id=None, index=False, flush=True):
    """Trigger a re-ingest for all documents in the collection."""
    job_id = job_id or Job.random_id()
    if flush:
        ingest_flush(collection)
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)
Example #3
def reingest_collection(collection, job_id=None, index=False):
    """Trigger a re-ingest for all documents in the collection."""
    job_id = job_id or Job.random_id()
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=OP_ANALYZE)
    aggregator.delete(origin=OP_INGEST)
    aggregator.close()
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)
Example #4
def crawl_directory(collection, path, parent=None, job_id=None):
    """Crawl the contents of the given path."""
    try:
        content_hash = None
        if not path.is_dir():
            content_hash = archive.archive_file(path)
        foreign_id = path.name
        if parent is not None:
            foreign_id = os.path.join(parent.foreign_id, foreign_id)

        # if the job_id is not set yet and path.is_dir(), we know it is the
        # first iteration and we don't create an initial root folder as parent
        # to be consistent with the behaviour of alephclient
        if path.is_dir() and job_id is None:
            document = None
            job_id = Job.random_id()
        else:
            meta = {"file_name": path.name}
            document = Document.save(
                collection,
                parent=parent,
                foreign_id=foreign_id,
                content_hash=content_hash,
                meta=meta,
            )
            db.session.commit()
            job_id = job_id or Job.random_id()
            proxy = document.to_proxy()
            ingest_flush(collection, entity_id=proxy.id)
            ingest_entity(collection, proxy, job_id=job_id)
            log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)

        if path.is_dir():
            for child in path.iterdir():
                crawl_directory(collection, child, document, job_id)
    except OSError:
        log.exception("Cannot crawl directory: %s", path)
Example #5
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            "crawler": self.name,
            "run_id": run_id or Job.random_id(),
            "incremental": settings.INCREMENTAL,
            "continue_on_error": settings.CONTINUE_ON_ERROR,
        }
        if incremental is not None:
            state["incremental"] = incremental

        # Cancel previous runs:
        self.cancel()
        init_stage = self.get(self.init_stage)
        Queue.queue(init_stage, state, {})
Example #6
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            'crawler': self.name,
            'run_id': run_id or Job.random_id(),
            'incremental': settings.INCREMENTAL
        }
        if incremental is not None:
            state['incremental'] = incremental

        # Cancel previous runs:
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        Queue.queue(self.init_stage, state, {})
Example #7
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            "crawler": self.name,
            "run_id": run_id or Job.random_id(),
            "incremental": settings.INCREMENTAL,
        }
        if incremental is not None:
            state["incremental"] = incremental

        # Cancel previous runs:
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        init_stage = self.get(self.init_stage)
        Queue.queue(init_stage, state, {})
Example #8
def load_entities(foreign_id, infile, unsafe=False):
    """Load FtM entities from the specified iJSON file."""
    collection = ensure_collection(foreign_id, foreign_id)

    def read_entities():
        for idx in count(1):
            line = infile.readline()
            if not line:
                return
            if idx % 1000 == 0:
                log.info("[%s] Loaded %s entities from: %s",
                         foreign_id, idx, infile.name)
            yield json.loads(line)

    job_id = Job.random_id()
    log.info("Loading [%s]: %s", job_id, foreign_id)
    bulk_write(collection, read_entities(), job_id=job_id, unsafe=unsafe)
    update_collection(collection)
Example #9
def crawl_directory(collection, path, parent=None, job_id=None):
    """Crawl the contents of the given path."""
    content_hash = None
    if not path.is_dir():
        content_hash = archive.archive_file(path)
    foreign_id = path.name
    if parent is not None:
        foreign_id = os.path.join(parent.foreign_id, foreign_id)
    meta = {'file_name': path.name}
    document = Document.save(collection,
                             parent=parent,
                             foreign_id=foreign_id,
                             content_hash=content_hash,
                             meta=meta)
    db.session.commit()
    job_id = job_id or Job.random_id()
    ingest_entity(collection, document.to_proxy(), job_id=job_id)
    log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)
    if path.is_dir():
        for child in path.iterdir():
            crawl_directory(collection, child, document, job_id)
Example #10
File: queues.py  Project: wdsn/aleph
def get_stage(collection, stage, job_id=None):
    job_id = job_id or Job.random_id()
    job = Job(kv, collection.foreign_id, job_id)
    return job.get_stage(stage)
Example #11
# execute in the aleph shell:
# exec(open("reingest_partial.py").read())
from servicelayer.jobs import Job
from aleph.queues import ingest_entity
from aleph.model import Collection, Document

job_id = Job.random_id()
collection = Collection.by_id(125)
with open('collection_ftm_failures.csv', 'r') as f:
    for line in f:
        # Lines read from the file keep their trailing newline; strip it so
        # Document.by_id() receives a clean id, and skip any blank lines.
        document_id = line.strip()
        if not document_id:
            continue
        print("reingest " + document_id)
        document = Document.by_id(document_id)
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=True)

Example #12
def get_session_id():
    role_id = stringify(request.authz.id) or "anonymous"
    session_id = stringify(request._session_id)
    session_id = session_id or Job.random_id()
    return "%s:%s" % (role_id, session_id)
Example #13
File: queues.py  Project: moreymat/aleph
def get_stage(collection, stage, job_id=None):
    dataset = dataset_from_collection(collection)
    job_id = job_id or Job.random_id()
    job = Job(kv, dataset, job_id)
    return job.get_stage(stage)