Example #1
def create_from_json(records, apply_async=True):
    current_app.logger.info('Loading dump...')

    for i, record in enumerate(records['records']):
        engine = WorkflowEngine.with_name("articles_upload")
        engine.save()
        obj = workflow_object_class.create(data=record)
        obj.id_workflow = str(engine.uuid)
        extra_data = {}
        record_extra = record.pop('extra_data', {})
        if record_extra:
            extra_data['record_extra'] = record_extra

        obj.extra_data['source_data'] = {
            'data': copy.deepcopy(record),
            'extra_data': copy.deepcopy(extra_data),
        }
        obj.extra_data.update(extra_data)

        obj.data_type = current_app.config['CRAWLER_DATA_TYPE']
        obj.save()
        db.session.commit()

        job_id = uuid1()

        # Persist the link between the crawler job and the workflow object
        # before dispatching the task, so the last record is not left pending.
        crawler_object = CrawlerWorkflowObject(job_id=job_id, object_id=obj.id)
        db.session.add(crawler_object)
        db.session.commit()
        queue = current_app.config['CRAWLER_CELERY_QUEUE']

        if apply_async:
            start.apply_async(
                kwargs={
                    'workflow_name': "articles_upload",
                    'object_id': obj.id,
                },
                queue=queue,
            )
        else:
            start(workflow_name="articles_upload", object_id=obj.id)

        current_app.logger.info('Parsed record {}.'.format(i))
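
A minimal usage sketch for this variant, assuming the dump is a JSON file whose top-level object holds a 'records' list (the file name below is illustrative):

import json

# Hypothetical dump file; expected shape: {'records': [{...}, {...}]}.
with open('articles_dump.json') as dump_file:
    records = json.load(dump_file)

# Queue one "articles_upload" workflow per record through Celery.
create_from_json(records, apply_async=True)

# Or run the workflows eagerly, without a Celery worker:
create_from_json(records, apply_async=False)
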
Example #2
def create_from_json(records, apply_async=True):
    current_app.logger.info('Loading dump...')

    results = []

    for i, record in enumerate(records['records']):
        obj = workflow_object_class.create(data=record)
        extra_data = {}
        record_extra = record.pop('extra_data', {})
        if record_extra:
            extra_data['record_extra'] = record_extra

        obj.extra_data['source_data'] = {
            'data': copy.deepcopy(record),
            'extra_data': copy.deepcopy(extra_data),
        }
        obj.extra_data.update(extra_data)

        obj.data_type = current_app.config['CRAWLER_DATA_TYPE']
        obj.save()
        db.session.commit()

        queue = current_app.config['CRAWLER_CELERY_QUEUE']

        if apply_async:
            workflow = start.apply_async(
                kwargs={
                    'workflow_name': "articles_upload",
                    'object_id': obj.id,
                },
                queue=queue,
            )
        else:
            workflow = start(workflow_name="articles_upload", object_id=obj.id)
        results.append(workflow)

        current_app.logger.info('Parsed record {}.'.format(i))

    return results
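
Unlike Example #1, this variant collects and returns whatever `start` produces for each record. A short sketch of how the return value might be used, reusing the `records` dict from the sketch above; the timeout is illustrative and a configured Celery broker is assumed when `apply_async=True`:

# Each entry is a Celery AsyncResult when dispatched asynchronously.
async_results = create_from_json(records, apply_async=True)
for result in async_results:
    result.get(timeout=300)  # block until the dispatched workflow task finishes

# With apply_async=False the workflows run eagerly and the raw return
# values of start() are collected instead.
workflow_results = create_from_json(records, apply_async=False)
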
Example #3
def submit_results(job_id, errors, log_file, results_uri, results_data=None):
    """Receive the submission of the results of a crawl job.

    It then spawns the appropriate workflow, as specified by the crawl job.

    :param job_id: Id of the crawler job.
    :param errors: Errors that happened during the crawl, if any; a truthy
        value marks the job as failed.
    :param log_file: Path to the log file of the crawler job.
    :param results_uri: URI to the file containing the results of the crawl
       job, namely the records extracted.
    :param results_data: Optional payload with the list of result records, used
        to skip retrieving them from `results_uri`; useful for slow or
        unreliable storage backends.
    """
    def _extract_results_data(results_path):
        if not os.path.exists(results_path):
            raise CrawlerInvalidResultsPath(
                "Path specified in result does not exist: {0}".format(
                    results_path))

        current_app.logger.info('Parsing records from {}'.format(results_path))
        results_data = []
        with open(results_path) as records:
            lines = (line.strip() for line in records if line.strip())

            for line in lines:
                current_app.logger.debug(
                    'Reading record line: {}'.format(line))
                record = json.loads(line)
                results_data.append(record)

        current_app.logger.debug('Read {} records from {}'.format(
            len(results_data), results_path))
        return results_data

    results_path = urlparse(results_uri).path
    job = CrawlerJob.get_by_job(job_id)
    job.logs = log_file
    job.results = results_uri

    if errors:
        job.status = JobStatus.ERROR
        job.save()
        db.session.commit()
        raise CrawlerJobError(str(errors))

    if results_data is None:
        results_data = _extract_results_data(results_path)

    for record in results_data:
        current_app.logger.debug('Parsing record: {}'.format(record))
        obj = workflow_object_class.create(data=record)
        extra_data = {
            'crawler_job_id': job_id,
            'crawler_results_path': results_path,
        }
        record_extra = record.pop('extra_data', {})
        if record_extra:
            extra_data['record_extra'] = record_extra

        obj.extra_data['source_data'] = {
            'data': copy.deepcopy(record),
            'extra_data': copy.deepcopy(extra_data),
        }
        obj.extra_data.update(extra_data)

        obj.data_type = current_app.config['CRAWLER_DATA_TYPE']
        obj.save()
        db.session.commit()

        crawler_object = CrawlerWorkflowObject(job_id=job_id, object_id=obj.id)
        db.session.add(crawler_object)
        queue = current_app.config['CRAWLER_CELERY_QUEUE']

        start.apply_async(
            kwargs={
                'workflow_name': job.workflow,
                'object_id': obj.id,
            },
            queue=queue,
        )

    current_app.logger.info('Parsed {} records.'.format(len(results_data)))

    job.status = JobStatus.FINISHED
    job.save()
    db.session.commit()
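
A hedged invocation sketch; the job id, paths and URI below are placeholders, and the results file is expected to contain one JSON record per line, as parsed by `_extract_results_data`:

# Placeholder values standing in for what the crawler callback would provide.
submit_results(
    job_id='example-crawler-job-id',
    errors=None,                                      # a truthy value marks the job as ERROR
    log_file='/tmp/crawler/example-job.log',
    results_uri='file:///tmp/crawler/results.jsonl',  # line-delimited JSON records
)

Passing `results_data` directly skips reading the file behind `results_uri`, which the docstring suggests for slow or unreliable storage.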