예제 #1
0
 def __init__(self,
              content,
              username,
              update_if_exists,
              parallel=None,
              self_task_id=None):  # pylint: disable=too-many-arguments
     """Initialise bookkeeping state and pre-compute the import partitions.

     Args:
         content: Raw import payload; when truthy it is split into lines
             with ``str.splitlines`` and the line count becomes ``total``.
         username: Stored on the instance and forwarded to the base class.
         update_if_exists: Forwarded to the base class unchanged.
         parallel: Desired worker count (int or numeric string). Missing,
             zero or negative values fall back to 5.
         self_task_id: Optional id of the task running this importer;
             stored on the instance as-is.

     Raises:
         ValueError: If ``parallel`` is a non-numeric string.
     """
     # NOTE(review): the trailing ``None, False`` are positional arguments
     # of the base initialiser, which is not visible here -- confirm their
     # meaning against BaseImporter.
     super().__init__(content, username, update_if_exists, None, False)
     self.start_time = time.time()
     self.self_task_id = self_task_id
     self.username = username
     self.total = 0
     self.resource_distribution = dict()
     # A string such as "0" is truthy, so it used to slip through as a
     # worker count of 0; anything below 1 now uses the default instead.
     requested_parallel = int(parallel) if parallel else 0
     self.parallel = requested_parallel if requested_parallel > 0 else 5
     self.tasks = []
     self.groups = []
     self.results = []
     self.elapsed_seconds = 0
     self.resource_wise_time = dict()
     self.parts = [[]]
     self.result = None
     self._json_result = None
     self.redis_service = RedisService()
     if self.content:
         self.input_list = self.content.splitlines()
         self.total = len(self.input_list)
     elif getattr(self, 'input_list', None) is None:
         # Keep whatever the base class may have set; otherwise guarantee
         # the attribute exists so the helpers below can iterate it.
         self.input_list = []
     self.make_resource_distribution()
     self.make_parts()
예제 #2
0
 def notify_progress(self):
     """Push ``self.processed`` to ``RedisService`` keyed by this task's id.

     Does nothing when ``self.self_task_id`` is falsy.
     """
     if not self.self_task_id:
         return
     RedisService().set(self.self_task_id, self.processed)
예제 #3
0
class BulkImportParallelRunner(BaseImporter):  # pragma: no cover
    """Bulk importer that fans each partition of the input out to task groups.

    The input is newline-delimited JSON. ``make_parts`` arranges the parsed
    lines into ordered partitions; ``run`` queues one task group per
    partition on the ``concurrent`` queue and waits for that group to finish
    before queuing the next, publishing progress through ``RedisService``
    while it waits.
    """

    # Worker count used when ``parallel`` is missing or not a positive number.
    DEFAULT_PARALLEL = 5

    def __init__(self,
                 content,
                 username,
                 update_if_exists,
                 parallel=None,
                 self_task_id=None):  # pylint: disable=too-many-arguments
        """Initialise bookkeeping state and pre-compute the partitions.

        Args:
            content: Newline-delimited JSON text; each line must decode to
                an object with a ``type`` key.
            username: Stored on the instance and passed on to every sub-task.
            update_if_exists: Forwarded to the base class; ``queue_tasks``
                later reads ``self.update_if_exists`` for every sub-task.
            parallel: Desired number of chunks for child partitions (int or
                numeric string). Missing, zero or negative values fall back
                to ``DEFAULT_PARALLEL``.
            self_task_id: Optional id of the task running this importer; it
                is the key under which progress is published.

        Raises:
            ValueError: If ``parallel`` is a non-numeric string.
            json.JSONDecodeError: If a line of ``content`` is not valid JSON.
            KeyError: If a decoded line has no ``type`` key.
        """
        # NOTE(review): the trailing ``None, False`` are positional arguments
        # of BaseImporter.__init__, which is not visible here -- confirm.
        super().__init__(content, username, update_if_exists, None, False)
        self.start_time = time.time()
        self.self_task_id = self_task_id
        self.username = username
        self.total = 0
        self.resource_distribution = dict()
        # A string such as "0" is truthy, so it used to slip through as a
        # chunk count of 0, for which chunker_list yields nothing and child
        # resources were silently never queued.
        requested_parallel = int(parallel) if parallel else 0
        self.parallel = (requested_parallel if requested_parallel > 0
                         else self.DEFAULT_PARALLEL)
        self.tasks = []
        self.groups = []
        self.results = []
        self.elapsed_seconds = 0
        self.resource_wise_time = dict()
        self.parts = [[]]
        self.result = None
        self._json_result = None
        self.redis_service = RedisService()
        if self.content:
            self.input_list = self.content.splitlines()
            self.total = len(self.input_list)
        elif getattr(self, 'input_list', None) is None:
            # Keep whatever the base class may have set; otherwise guarantee
            # the attribute exists so the helpers below can iterate it.
            self.input_list = []
        self.make_resource_distribution()
        self.make_parts()

    def make_resource_distribution(self):
        """Group the decoded input lines by their exact ``type`` value."""
        for line in self.input_list:
            data = json.loads(line)
            self.resource_distribution.setdefault(data['type'],
                                                  []).append(data)

    def make_parts(self):
        """Arrange the input into the ordered partitions consumed by ``run``.

        Organizations, sources and collections (in that order) each form one
        leading partition. Every other line is then appended in input order:
        it joins the current partition when it has the same type as the
        previous such line, or when neither of the two is a concept or a
        mapping; otherwise it opens a new partition.
        """
        # NOTE(review): the lookups below are case-sensitive while the loop
        # further down compares lower-cased types, so a line typed e.g.
        # 'organization' lands in no partition at all -- confirm whether
        # such input can occur.
        orgs = self.resource_distribution.get('Organization', None)
        sources = self.resource_distribution.get('Source', None)
        collections = self.resource_distribution.get('Collection', None)
        if orgs:
            self.parts = [orgs]
        if sources:
            self.parts.append(sources)
        if collections:
            self.parts.append(collections)

        self.parts = compact(self.parts)

        # Open partition that receives the first non-container lines.
        self.parts.append([])

        container_types = ['organization', 'source', 'collection']
        splitting_types = ['concept', 'mapping']
        prev_type = None
        for data in self.input_list:
            line = json.loads(data)
            data_type = line.get('type', None).lower()
            if data_type in container_types:
                continue
            same_partition = prev_type is None or prev_type == data_type or (
                data_type not in splitting_types
                and prev_type not in splitting_types)
            if same_partition:
                self.parts[-1].append(line)
            else:
                self.parts.append([line])
            prev_type = data_type

        self.parts = compact(self.parts)

    @staticmethod
    def chunker_list(seq, size):
        """Lazily yield ``size`` strided slices ``seq[i::size]`` of ``seq``.

        Slices can be empty when ``seq`` has fewer than ``size`` items.
        """
        return (seq[i::size] for i in range(size))

    def is_any_process_alive(self):
        """Return whether any queued group still has unfinished tasks.

        Returns ``False`` when nothing was queued. If the check itself
        raises, ``True`` is returned so the caller keeps polling.
        """
        if not self.groups:
            return False

        result = True

        try:
            result = any(grp.completed_count() != len(grp)
                         for grp in self.groups)
        except Exception:  # pylint: disable=broad-except
            # Best effort: a failing status lookup must not end the wait,
            # but SystemExit/KeyboardInterrupt are allowed to propagate.
            pass

        return result

    def get_overall_tasks_progress(self):
        """Sum the per-task counters read via ``redis_service.get_int``.

        Tasks without an id, or whose lookup raises, contribute nothing.
        """
        total_processed = 0
        if not self.tasks:
            return total_processed

        for task in self.tasks:
            try:
                if task.task_id:
                    total_processed += self.redis_service.get_int(task.task_id)
            except Exception:  # pylint: disable=broad-except
                # Best effort: skip counters that cannot be read right now.
                pass

        return total_processed

    def get_details_to_notify(self):
        """Build the progress payload: a dict with a one-line ``summary``."""
        summary = "Started: {} | Processed: {}/{} | Time: {}secs".format(
            self.start_time_formatted, self.get_overall_tasks_progress(),
            self.total, self.elapsed_seconds)

        return dict(summary=summary)

    def get_sub_task_ids(self):
        """Map every queued sub-task id to that task's current state."""
        return {task.task_id: task.state for task in self.tasks}

    def notify_progress(self):
        """Publish the progress payload under ``self_task_id``, if one is set.

        Failures while publishing are ignored so reporting never aborts the
        import.
        """
        if self.self_task_id:
            try:
                self.redis_service.set_json(self.self_task_id,
                                            self.get_details_to_notify())
            except Exception:  # pylint: disable=broad-except
                # Best effort: progress reporting is advisory only.
                pass

    def wait_till_tasks_alive(self):
        """Poll once per second until every queued group has completed."""
        while self.is_any_process_alive():
            self.update_elapsed_seconds()
            self.notify_progress()
            time.sleep(1)

    def run(self):
        """Process every partition in order and return the combined result.

        Each non-empty partition is queued and awaited before the next one
        starts. Wall-clock time spent on concept, mapping and reference
        partitions is accumulated per type in ``resource_wise_time``.

        Returns:
            dict: The value built by ``make_result``, with the keys
            ``json``, ``detailed_summary`` and ``report``.
        """
        if self.self_task_id:
            print("****STARTED MAIN****")
            print("TASK ID: {}".format(self.self_task_id))
            print("***************")
        for part_list in self.parts:
            if not part_list:
                continue
            # The type of the first entry decides how the partition is run.
            part_type = get(part_list, '0.type', '').lower()
            if not part_type:
                continue
            is_child = part_type in ['concept', 'mapping', 'reference']
            start_time = time.time()
            self.queue_tasks(part_list, is_child)
            self.wait_till_tasks_alive()
            if is_child:
                self.resource_wise_time[part_type] = (
                    self.resource_wise_time.get(part_type, 0) +
                    (time.time() - start_time))

        self.update_elapsed_seconds()

        self.make_result()

        return self.result

    def update_elapsed_seconds(self):
        """Refresh ``elapsed_seconds`` from the clock and ``start_time``."""
        self.elapsed_seconds = time.time() - self.start_time

    @property
    def detailed_summary(self):
        """One-line summary of the aggregated result counts and timing."""
        result = self.json_result
        return "Started: {} | Processed: {}/{} | Created: {} | Updated: {} | Existing: {} | Time: {}secs".format(
            self.start_time_formatted, result.get('processed'),
            result.get('total'), len(result.get('created')),
            len(result.get('updated')), len(result.get('exists')),
            self.elapsed_seconds)

    @property
    def start_time_formatted(self):
        """``start_time`` converted with ``datetime.fromtimestamp``."""
        return datetime.fromtimestamp(self.start_time)

    @property
    def json_result(self):
        """Aggregate of every sub-task's ``json`` result, cached once built.

        Counters are summed and lists concatenated across tasks; the start
        time, elapsed seconds and per-type timings of this runner are then
        attached.
        """
        if self._json_result:
            return self._json_result

        total_result = dict(total=0,
                            processed=0,
                            created=[],
                            updated=[],
                            invalid=[],
                            exists=[],
                            failed=[],
                            exception=[],
                            others=[],
                            unknown=[])
        # Only the keys above are summed. 'elapsed_seconds' always reports
        # this runner's own wall-clock time, so it is set after the loop
        # rather than accumulated from the tasks and then overwritten.
        summed_keys = tuple(total_result)
        for task in self.tasks:
            result = task.result.get('json')
            for key in summed_keys:
                total_result[key] += result.get(key)

        total_result['elapsed_seconds'] = self.elapsed_seconds
        total_result['start_time'] = self.start_time_formatted
        total_result[
            'child_resource_time_distribution'] = self.resource_wise_time
        self._json_result = total_result
        return self._json_result

    @property
    def report(self):
        """``json_result`` with every list value replaced by its length."""
        data = {
            k: len(v) if isinstance(v, list) else v
            for k, v in self.json_result.items()
        }

        data['child_resource_time_distribution'] = self.resource_wise_time

        return data

    def make_result(self):
        """Store the ``json``, ``detailed_summary`` and ``report`` views."""
        self.result = dict(json=self.json_result,
                           detailed_summary=self.detailed_summary,
                           report=self.report)

    def queue_tasks(self, part_list, is_child):
        """Queue one partition as a task group on the ``concurrent`` queue.

        Args:
            part_list: Decoded resources belonging to one partition.
            is_child: When true the partition is split into up to
                ``self.parallel`` chunks, one task each; otherwise the whole
                partition goes to a single task.
        """
        # compact() drops the empty chunks produced when the partition has
        # fewer entries than ``self.parallel``.
        chunked_lists = compact(
            self.chunker_list(part_list, self.parallel
                              ) if is_child else [part_list])
        jobs = group(
            bulk_import_parts_inline.s(_list, self.username,
                                       self.update_if_exists)
            for _list in chunked_lists)
        group_result = jobs.apply_async(queue='concurrent')
        self.groups.append(group_result)
        self.tasks += group_result.results
예제 #4
0
    def get(self, request, import_queue=None):  # pylint: disable=too-many-return-statements,too-many-locals,too-many-branches
        """Report on one bulk-import task, or list the tasks the user may see.

        Query parameters read from ``request.GET``:
            task: Task id to inspect. When absent, tasks are listed.
            result: ``json`` or ``report`` to pick that view of a finished
                task; anything else yields the detailed summary.
            username: Listing filter; ignored when ``task`` is given, where
                the username embedded in the task id is used instead.

        Non-staff users only get access to tasks whose id carries their own
        username. Listing reads ``api/tasks`` through ``flower_get`` and
        keeps entries whose name starts with
        ``core.common.tasks.bulk_import``, optionally narrowed by
        ``import_queue`` and ``username``.
        """
        params = request.GET
        task_id = params.get('task')
        result_format = params.get('result')
        username = params.get('username')
        user = self.request.user

        if task_id:
            parsed_task = parse_bulk_import_task_id(task_id)
            username = parsed_task['username']

            if not (user.is_staff or user.username == username):
                return Response(status=status.HTTP_403_FORBIDDEN)

            task = AsyncResult(task_id)

            if task.successful():
                result = task.get()
                if result:
                    if result_format == 'json':
                        return Response(result.get('json', None),
                                        content_type="application/json")
                    if result_format == 'report':
                        return Response(result.get('report', None))
                    return Response(result.get('detailed_summary', None))
                # A successful task with an empty result falls through to
                # the generic state response at the end of this branch.
            if task.failed():
                return Response({'exception': str(task.result)},
                                status=status.HTTP_400_BAD_REQUEST)
            if task.state == 'STARTED':
                service = RedisService()
                if service.exists(task_id):
                    progress = {
                        'details': service.get_formatted(task_id),
                        'task': task.id,
                        'state': task.state,
                        'username': username,
                        'queue': parsed_task['queue'],
                    }
                    return Response(progress, status=status.HTTP_200_OK)
            if task.state == 'PENDING' and not task_exists(task_id):
                not_found = {'exception': 'task ' + task_id + ' not found'}
                return Response(not_found, status=status.HTTP_404_NOT_FOUND)

            accepted = {
                'task': task.id,
                'state': task.state,
                'username': username,
                'queue': parsed_task['queue'],
            }
            return Response(accepted, status=status.HTTP_202_ACCEPTED)

        try:
            flower_tasks = flower_get('api/tasks').json()
        except Exception as ex:
            failure = {
                'detail':
                'Flower service returned unexpected result. Maybe check healthcheck.',
                'exception': str(ex),
            }
            return Response(failure,
                            status=status.HTTP_422_UNPROCESSABLE_ENTITY)

        tasks = []
        for flower_task_id, info in flower_tasks.items():
            task_name = info.get('name', None)
            if not task_name or not task_name.startswith(
                    'core.common.tasks.bulk_import'):
                continue

            parsed = parse_bulk_import_task_id(flower_task_id)

            # Non-staff users only see their own tasks.
            if not user.is_staff and user.username != parsed['username']:
                continue
            if import_queue and parsed['queue'] != import_queue:
                continue
            if username and parsed['username'] != username:
                continue

            tasks.append({
                'task': flower_task_id,
                'state': info['state'],
                'queue': parsed['queue'],
                'username': parsed['username'],
            })

        return Response(tasks)