def notify_progress(self):
    if self.self_task_id:
        service = RedisService()
        service.set(self.self_task_id, self.processed)
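
# A hedged sketch of the read side of this hand-off: workers write their
# processed-count under the task id, and the parallel runner below reads it
# back with RedisService().get_int(task_id) (see get_overall_tasks_progress).
# `watch_progress` and its arguments are illustrative, not part of the
# codebase; it assumes the module-level `time` and `RedisService` imports.
def watch_progress(task_id, interval=2, timeout=300):
    """Print the processed-count written for `task_id` until `timeout` elapses."""
    service = RedisService()
    deadline = time.time() + timeout
    while time.time() < deadline:
        print('processed so far:', service.get_int(task_id))
        time.sleep(interval)
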
# Relies on module-level imports not shown in this excerpt: json, time,
# datetime, pydash's compact/get, celery's group, RedisService and the
# bulk_import_parts_inline task.
class BulkImportParallelRunner(BaseImporter):  # pragma: no cover
    def __init__(
            self, content, username, update_if_exists, parallel=None, self_task_id=None
    ):  # pylint: disable=too-many-arguments
        super().__init__(content, username, update_if_exists, None, False)
        self.start_time = time.time()
        self.self_task_id = self_task_id
        self.username = username
        self.total = 0
        self.resource_distribution = dict()
        self.parallel = int(parallel) if parallel else 5
        self.tasks = []
        self.groups = []
        self.results = []
        self.elapsed_seconds = 0
        self.resource_wise_time = dict()
        self.parts = [[]]
        self.result = None
        self._json_result = None
        self.redis_service = RedisService()
        if self.content:
            self.input_list = self.content.splitlines()
            self.total = len(self.input_list)
            self.make_resource_distribution()
            self.make_parts()

    def make_resource_distribution(self):
        # Group the parsed NDJSON lines by their resource type.
        for line in self.input_list:
            data = json.loads(line)
            data_type = data['type']
            if data_type not in self.resource_distribution:
                self.resource_distribution[data_type] = []
            self.resource_distribution[data_type].append(data)

    def make_parts(self):
        # Orgs, sources and collections are imported first (in that order);
        # the remaining lines are grouped into runs of compatible types so each
        # run can be dispatched as one batch.
        prev_line = None
        orgs = self.resource_distribution.get('Organization', None)
        sources = self.resource_distribution.get('Source', None)
        collections = self.resource_distribution.get('Collection', None)

        if orgs:
            self.parts = [orgs]
        if sources:
            self.parts.append(sources)
        if collections:
            self.parts.append(collections)

        self.parts = compact(self.parts)
        self.parts.append([])

        for data in self.input_list:
            line = json.loads(data)
            data_type = line.get('type', None).lower()
            if data_type not in ['organization', 'source', 'collection']:
                if prev_line:
                    prev_type = prev_line.get('type').lower()
                    if prev_type == data_type or (
                            data_type not in ['concept', 'mapping'] and
                            prev_type not in ['concept', 'mapping']
                    ):
                        self.parts[-1].append(line)
                    else:
                        self.parts.append([line])
                else:
                    self.parts[-1].append(line)
                prev_line = line

        self.parts = compact(self.parts)

    @staticmethod
    def chunker_list(seq, size):
        # Splits `seq` into `size` interleaved slices, e.g.
        # chunker_list([1, 2, 3, 4, 5], 2) -> [1, 3, 5], [2, 4].
        return (seq[i::size] for i in range(size))

    def is_any_process_alive(self):
        if not self.groups:
            return False
        result = True
        try:
            result = any(grp.completed_count() != len(grp) for grp in self.groups)
        except:  # pylint: disable=bare-except
            pass

        return result

    def get_overall_tasks_progress(self):
        # Sum the processed-counts each sub-task has written to Redis.
        total_processed = 0
        if not self.tasks:
            return total_processed

        for task in self.tasks:
            try:
                if task.task_id:
                    total_processed += self.redis_service.get_int(task.task_id)
            except:  # pylint: disable=bare-except
                pass

        return total_processed

    def get_details_to_notify(self):
        summary = "Started: {} | Processed: {}/{} | Time: {}secs".format(
            self.start_time_formatted, self.get_overall_tasks_progress(),
            self.total, self.elapsed_seconds
        )
        return dict(summary=summary)

    def get_sub_task_ids(self):
        return {task.task_id: task.state for task in self.tasks}

    def notify_progress(self):
        if self.self_task_id:
            try:
                self.redis_service.set_json(self.self_task_id, self.get_details_to_notify())
            except:  # pylint: disable=bare-except
                pass

    def wait_till_tasks_alive(self):
        while self.is_any_process_alive():
            self.update_elapsed_seconds()
            self.notify_progress()
            time.sleep(1)

    def run(self):
        if self.self_task_id:
            print("****STARTED MAIN****")
            print("TASK ID: {}".format(self.self_task_id))
            print("***************")
        for part_list in self.parts:
            if part_list:
                part_type = get(part_list, '0.type', '').lower()
                if part_type:
                    is_child = part_type in ['concept', 'mapping', 'reference']
                    start_time = time.time()
                    self.queue_tasks(part_list, is_child)
                    self.wait_till_tasks_alive()
                    if is_child:
                        if part_type not in self.resource_wise_time:
                            self.resource_wise_time[part_type] = 0
                        self.resource_wise_time[part_type] += (time.time() - start_time)

        self.update_elapsed_seconds()
        self.make_result()
        return self.result

    def update_elapsed_seconds(self):
        self.elapsed_seconds = time.time() - self.start_time

    @property
    def detailed_summary(self):
        result = self.json_result
        return "Started: {} | Processed: {}/{} | Created: {} | Updated: {} | Existing: {} | Time: {}secs".format(
            self.start_time_formatted, result.get('processed'), result.get('total'),
            len(result.get('created')), len(result.get('updated')), len(result.get('exists')),
            self.elapsed_seconds
        )

    @property
    def start_time_formatted(self):
        return datetime.fromtimestamp(self.start_time)

    @property
    def json_result(self):
        if self._json_result:
            return self._json_result

        # Merge the per-sub-task results into one aggregate result.
        total_result = dict(
            total=0, processed=0, created=[], updated=[], invalid=[], exists=[],
            failed=[], exception=[], others=[], unknown=[],
            elapsed_seconds=self.elapsed_seconds
        )
        for task in self.tasks:
            result = task.result.get('json')
            for key in total_result:
                total_result[key] += result.get(key)

        total_result['start_time'] = self.start_time_formatted
        total_result['elapsed_seconds'] = self.elapsed_seconds
        total_result['child_resource_time_distribution'] = self.resource_wise_time
        self._json_result = total_result
        return self._json_result

    @property
    def report(self):
        data = {
            k: len(v) if isinstance(v, list) else v for k, v in self.json_result.items()
        }
        data['child_resource_time_distribution'] = self.resource_wise_time
        return data

    def make_result(self):
        self.result = dict(
            json=self.json_result, detailed_summary=self.detailed_summary, report=self.report
        )

    def queue_tasks(self, part_list, is_child):
        # Child resources (concepts/mappings/references) are fanned out across
        # `self.parallel` workers; everything else runs as a single batch.
        chunked_lists = compact(
            self.chunker_list(part_list, self.parallel) if is_child else [part_list]
        )
        jobs = group(
            bulk_import_parts_inline.s(_list, self.username, self.update_if_exists)
            for _list in chunked_lists
        )
        group_result = jobs.apply_async(queue='concurrent')
        self.groups.append(group_result)
        self.tasks += group_result.results
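
# A minimal usage sketch for the runner above, assuming a configured Django/
# Celery environment with workers listening on the 'concurrent' queue. The
# NDJSON lines and the `demo_parallel_import` helper are illustrative, not
# part of the codebase; real payloads follow the OCL bulk-import format.
# `run()` blocks until every queued sub-task has completed.
def demo_parallel_import():
    content = '\n'.join([
        '{"type": "Organization", "id": "DemoOrg", "name": "Demo Org"}',
        '{"type": "Source", "id": "DemoSource", "owner": "DemoOrg", "name": "Demo Source"}',
    ])
    runner = BulkImportParallelRunner(
        content, username='admin', update_if_exists=True, parallel=2)
    result = runner.run()
    print(result['detailed_summary'])
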
def get(
        self, request, import_queue=None
):  # pylint: disable=too-many-return-statements,too-many-locals,too-many-branches
    # With ?task=<id>, report that task's state or result; without it, list
    # the bulk-import tasks known to Flower that the user may see.
    task_id = request.GET.get('task')
    result_format = request.GET.get('result')
    username = request.GET.get('username')
    user = self.request.user

    if task_id:
        parsed_task = parse_bulk_import_task_id(task_id)
        username = parsed_task['username']

        if not user.is_staff and user.username != username:
            return Response(status=status.HTTP_403_FORBIDDEN)

        task = AsyncResult(task_id)

        if task.successful():
            result = task.get()
            if result and result_format == 'json':
                return Response(result.get('json', None), content_type="application/json")
            if result and result_format == 'report':
                return Response(result.get('report', None))
            if result:
                return Response(result.get('detailed_summary', None))
        if task.failed():
            return Response(dict(exception=str(task.result)), status=status.HTTP_400_BAD_REQUEST)
        if task.state == 'STARTED':
            service = RedisService()
            if service.exists(task_id):
                return Response(
                    dict(
                        details=service.get_formatted(task_id), task=task.id,
                        state=task.state, username=username, queue=parsed_task['queue']
                    ),
                    status=status.HTTP_200_OK
                )
        if task.state == 'PENDING' and not task_exists(task_id):
            return Response(
                dict(exception='task ' + task_id + ' not found'),
                status=status.HTTP_404_NOT_FOUND
            )

        return Response(
            dict(task=task.id, state=task.state, username=username, queue=parsed_task['queue']),
            status=status.HTTP_202_ACCEPTED
        )

    try:
        response = flower_get('api/tasks')
        flower_tasks = response.json()
    except Exception as ex:
        return Response(
            dict(
                detail='Flower service returned unexpected result. Maybe check healthcheck.',
                exception=str(ex)
            ),
            status=status.HTTP_422_UNPROCESSABLE_ENTITY
        )

    tasks = []
    for task_id, value in flower_tasks.items():
        if not value.get('name', None) or not value['name'].startswith(
                'core.common.tasks.bulk_import'):
            continue

        task = parse_bulk_import_task_id(task_id)
        if user.is_staff or user.username == task['username']:
            if (not import_queue or task['queue'] == import_queue) and \
                    (not username or task['username'] == username):
                tasks.append(
                    dict(task=task_id, state=value['state'],
                         queue=task['queue'], username=task['username'])
                )

    return Response(tasks)
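
# A hedged client-side sketch of polling the view above. The base URL, route
# and token are illustrative assumptions; the `task`/`result` query params and
# the 202-while-running / 200-when-done behaviour come from the view itself.
import time
import requests

API_BASE = 'https://api.example.com'  # assumption: wherever this view is mounted

def poll_bulk_import(task_id, token, interval=5):
    """Poll until the bulk import task resolves, then return its JSON result."""
    while True:
        response = requests.get(
            API_BASE + '/importers/bulk-import/',  # assumption: route for this view
            params={'task': task_id, 'result': 'json'},
            headers={'Authorization': 'Token ' + token},
        )
        if response.status_code == 202:  # task still queued or running
            time.sleep(interval)
            continue
        response.raise_for_status()      # 400/403/404/422 surface as exceptions
        return response.json()
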