import copy
from uuid import uuid1

from flask import current_app
from invenio_db import db

# Third-party and local-module import paths below are assumed from how the
# names are used in this module; adjust to the package's actual layout.
from invenio_workflows import (
    ObjectStatus,
    WorkflowEngine,
    start,
    workflow_object_class,
)

from .errors import CrawlerJobError
from .models import CrawlerJob, CrawlerWorkflowObject, JobStatus

from urllib.parse import urlparse


def create_from_json(records, apply_async=True):
    """Create a workflow object per record in a JSON dump and start the
    ``articles_upload`` workflow for each of them."""
    current_app.logger.info('Loading dump...')

    for i, record in enumerate(records['records']):
        engine = WorkflowEngine.with_name("articles_upload")
        engine.save()

        obj = workflow_object_class.create(data=record)
        obj.id_workflow = str(engine.uuid)

        extra_data = {}
        record_extra = record.pop('extra_data', {})
        if record_extra:
            extra_data['record_extra'] = record_extra

        obj.extra_data['source_data'] = {
            'data': copy.deepcopy(record),
            'extra_data': copy.deepcopy(extra_data),
        }
        obj.extra_data.update(extra_data)

        obj.data_type = current_app.config['CRAWLER_DATA_TYPE']
        obj.save()
        db.session.commit()

        job_id = uuid1()

        crawler_object = CrawlerWorkflowObject(job_id=job_id, object_id=obj.id)
        db.session.add(crawler_object)
        queue = current_app.config['CRAWLER_CELERY_QUEUE']

        if apply_async:
            start.apply_async(
                kwargs={
                    'workflow_name': "articles_upload",
                    'object_id': obj.id,
                },
                queue=queue,
            )
        else:
            start(workflow_name="articles_upload", object_id=obj.id)

        current_app.logger.info('Parsed record {}.'.format(i))
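
# A minimal usage sketch for create_from_json. The application factory name
# `create_app` and the dump layout are assumptions for illustration, not part
# of this module:
#
#     from myapp.factory import create_app  # hypothetical factory
#
#     app = create_app()
#     with app.app_context():
#         dump = {'records': [{'title': 'A sample record'}]}
#         # apply_async=False runs the workflow inline, so no Celery worker
#         # is needed (handy for tests and one-off loads).
#         create_from_json(dump, apply_async=False)
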
def submit_results(job_id, errors, log_file, results_uri, results_data=None):
    """Receive the submission of the results of a crawl job.

    Spawns the appropriate workflow, according to whichever workflow the
    crawl job specifies.

    :param job_id: Id of the crawler job.
    :param errors: Errors that occurred during the crawl, if any.
    :param log_file: Path to the log file of the crawler job.
    :param results_uri: URI to the file containing the results of the crawl
        job, namely the records extracted.
    :param results_data: Optional data payload with the results list, to skip
        retrieving them from the `results_uri`, useful for slow or unreliable
        storages.
    """
    results_path = urlparse(results_uri).path
    job = CrawlerJob.get_by_job(job_id)
    job.logs = log_file
    job.results = results_uri

    if errors:
        job.status = JobStatus.ERROR
        job.save()
        db.session.commit()
        raise CrawlerJobError(str(errors))

    if results_data is None:
        results_data = _extract_results_data(results_path)

    for crawl_result in results_data:
        crawl_result = copy.deepcopy(crawl_result)
        try:
            _check_crawl_result_format(crawl_result)
        except KeyError as e:
            crawl_result = _crawl_result_from_exception(e, crawl_result)

        record = crawl_result.pop('record')
        crawl_errors = crawl_result['errors']

        current_app.logger.debug('Parsing record: {}'.format(record))

        engine = WorkflowEngine.with_name(job.workflow)
        engine.save()

        obj = workflow_object_class.create(data=record)
        obj.id_workflow = str(engine.uuid)

        if crawl_errors:
            obj.status = ObjectStatus.ERROR
            obj.extra_data['crawl_errors'] = crawl_result
        else:
            extra_data = {
                'crawler_job_id': job_id,
                'crawler_results_path': results_path,
            }
            record_extra = record.pop('extra_data', {})
            if record_extra:
                extra_data['record_extra'] = record_extra

            obj.extra_data['source_data'] = {
                'data': copy.deepcopy(record),
                'extra_data': copy.deepcopy(extra_data),
            }
            obj.extra_data.update(extra_data)

        obj.data_type = current_app.config['CRAWLER_DATA_TYPE']
        obj.save()
        db.session.commit()

        crawler_object = CrawlerWorkflowObject(job_id=job_id, object_id=obj.id)
        db.session.add(crawler_object)
        queue = current_app.config['CRAWLER_CELERY_QUEUE']

        if not crawl_errors:
            start.apply_async(
                kwargs={
                    'workflow_name': job.workflow,
                    'object_id': obj.id,
                },
                queue=queue,
            )

    current_app.logger.info('Parsed {} records.'.format(len(results_data)))

    job.status = JobStatus.FINISHED
    job.save()
    db.session.commit()
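
# submit_results relies on three module-level helpers that are not defined in
# this section. Minimal sketches follow, inferred only from how they are
# called above; the bodies are assumptions, not the module's actual
# implementation.

import json
import os


def _extract_results_data(results_path):
    """Load one JSON-encoded crawl result per line from the results file.

    Sketch only: the line-per-result file layout is an assumption; the caller
    above implies just this signature and a list-of-dicts return value.
    """
    if not os.path.exists(results_path):
        raise CrawlerJobError(
            'Results path does not exist: {}'.format(results_path)
        )
    with open(results_path) as results_file:
        return [json.loads(line) for line in results_file if line.strip()]


def _check_crawl_result_format(crawl_result):
    """Raise ``KeyError`` if a crawl result misses a mandatory key (sketch)."""
    crawl_result['record']
    crawl_result['errors']


def _crawl_result_from_exception(exc, crawl_result):
    """Rebuild a malformed crawl result so it can be stored as errored (sketch)."""
    return {
        'record': crawl_result.get('record', {}),
        'errors': [{
            'exception': exc.__class__.__name__,
            'traceback': repr(exc),
        }],
    }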