Example #1
0
    def process(self, **kwargs):
        """
        Locate employees.

        Optionally wipes previously extracted Employee/Employer/Provision
        rows, narrows the document queryset by type and/or id taken from
        ``kwargs``, then fans out one ``parse_document_for_employee``
        subtask per document.

        :param kwargs: delete, document_type, document_id, no_detect, task_id
        :return: None
        """
        if kwargs.get('delete'):
            # Django delete() returns (count, per-model dict); '+' concatenates them
            removed = (Employee.objects.all().delete()
                       + Employer.objects.all().delete()
                       + Provision.objects.all().delete())
            self.log('Deleted: ' + str(removed))

        doc_qs = Document.objects.all()
        # TODO: outdated
        if kwargs.get('document_type'):
            doc_qs = doc_qs.filter(document_type__in=kwargs['document_type'])
            self.log('Filter documents by "%s" document type.' %
                     str(kwargs['document_type']))

        if kwargs.get('document_id'):
            doc_qs = doc_qs.filter(pk=kwargs['document_id'])
            self.log('Process document id={}.'.format(kwargs['document_id']))

        self.task.subtasks_total = doc_qs.count()
        self.task.save()
        self.log('Found {0} Documents. Added {0} subtasks.'.format(
            self.task.subtasks_total))

        # one subtask per document, keyed by parent task id + uuid
        for doc in doc_qs:
            self.parse_document_for_employee.apply_async(
                args=(doc.id, kwargs.get('no_detect', True), kwargs['task_id']),
                task_id='%d_%s' % (self.task.id, fast_uuid()))
Example #2
0
def push_or_pop(key, value,
                batch_size=None, batch_time=None):
    """
    Push pickled values to redis store until some limit,
    use uid suffix to make keys unique,
    then return unpickled list using key mask
    """
    mask_keys = r.keys('{}__*'.format(key))
    time_key = '{}_time'.format(key)
    cached_ts = r.get(time_key)
    current_ts = time.mktime(now().timetuple())

    size_reached = batch_size is not None and len(mask_keys) >= batch_size - 1
    time_reached = (batch_time is not None and cached_ts is not None
                    and current_ts - float(cached_ts) >= batch_time)

    if size_reached or time_reached:
        # flush: pop everything collected so far plus the incoming value
        result = [popd(k) for k in mask_keys]
        result.append(value)
        r.delete(time_key)
        return result, True

    # keep accumulating under a unique suffixed key; refresh the batch timestamp
    key = '{}__{}'.format(key, fast_uuid())
    result = push(key, value)
    r.getset(time_key, time.mktime(now().timetuple()))
    return result, False
Example #3
0
    def detect_field_values_for_document_type(document_type_pk,
                                              document_ids: List, do_not_write,
                                              drop_classifier_model, task_id):
        """
        Fan out per-document field-value detection subtasks for every
        document of the given type (or only for explicit ``document_ids``).

        :param document_type_pk: pk of the DocumentType to process
        :param document_ids: explicit ids; falsy means all docs of the type
        :param do_not_write: forwarded to each detection subtask
        :param drop_classifier_model: when truthy, delete the type's
            ClassifierModel rows first
        :param task_id: parent task id used for logging and subtask ids
        :return: number of subtasks scheduled, or None when the type has
            no fields configured
        """
        log('Detecting field values for document type: {0}'.format(
            document_type_pk),
            task=task_id)
        doc_type = DocumentType.objects.get(pk=document_type_pk)
        if not doc_type.fields:
            log('Can not find any fields assigned to document type: {0}'.
                format(doc_type),
                task=task_id)
            return

        if drop_classifier_model:
            log('Deleting field values and classifier models for document type: {0}'
                .format(doc_type),
                task=task_id)
            ClassifierModel.objects.filter(document_type=doc_type).delete()

        ids = document_ids or Document.objects.filter(
            document_type=doc_type).values_list('id', flat=True)
        scheduled = 0
        for doc_id in ids:
            DetectFieldValues.detect_field_values_for_document.apply_async(
                args=(doc_id, do_not_write, task_id),
                task_id='%d_%s' % (task_id, fast_uuid()))
            scheduled += 1
        if not scheduled:
            log('No documents in DB for document type: {0}'.format(doc_type),
                task=task_id)
        return scheduled
Example #4
0
    def process(self, **kwargs):
        """
        Reassign all documents belonging to the given clusters to a new
        project, record the reassignment in the ProjectClustering metadata,
        and re-run each custom app's field-value detection over the moved
        documents.

        :param kwargs: project_id, cluster_ids, new_project_id
        :return: None
        """
        project_id = kwargs.get('project_id')
        cluster_ids = kwargs.get('cluster_ids')
        new_project_id = kwargs.get('new_project_id')
        new_project = Project.objects.get(pk=new_project_id)

        documents = Document.objects.filter(
            documentcluster__pk__in=cluster_ids)
        # NOTE(review): a Project instance is passed for the raw FK column
        # 'project_id' (not 'project') — presumably Django coerces it to the
        # pk; verify this against the Document model.
        documents.update(project_id=new_project,
                         document_type=new_project.type)

        # expose the reassignment parameters on the task record
        self.task.metadata = {
            'task_name': 'reassigning',
            'old_project_id': project_id,
            'new_project_id': new_project_id,
            'cluster_ids': cluster_ids,
        }
        self.task.save()

        # append this reassignment to the clustering's history; all clusters
        # are assumed to belong to the clustering of cluster_ids[0]
        reassigning = {
            'date': now().isoformat(),
            'new_project_id': new_project_id,
            'cluster_ids': cluster_ids,
            'task_id': self.task.id
        }
        p_cl = ProjectClustering.objects.get(
            document_clusters__pk=cluster_ids[0])
        reassignings = p_cl.metadata.get('reassigning', [])
        reassignings.append(reassigning)
        p_cl.metadata['reassigning'] = reassignings
        reassigned_cluster_ids = list(
            set(p_cl.metadata.get('reassigned_cluster_ids', []) + cluster_ids))
        p_cl.metadata['reassigned_cluster_ids'] = reassigned_cluster_ids
        p_cl.save()

        # collect every custom app's
        # DetectFieldValues.detect_field_values_for_document task (if defined)
        tasks = []
        for app_name in custom_apps:
            module_str = 'apps.%s.tasks' % app_name
            module = sys.modules.get(module_str)
            detector_task = getattr(module, 'DetectFieldValues', None)
            if detector_task and hasattr(detector_task,
                                         'detect_field_values_for_document'):
                tasks.append(
                    getattr(detector_task, 'detect_field_values_for_document'))

        if tasks:
            self.task.subtasks_total = documents.count() * len(tasks)
            self.task.save()

            # one subtask per (document, detector) pair
            for document in documents:
                for task in tasks:
                    task.apply_async(
                        args=(document.id, False, self.task.id, None),
                        task_id='%d_%s' % (self.task.id, fast_uuid()))
Example #5
0
    def __init__(self, prefix, instance: Task, initial):
        """
        Populate the form: 'name' gets the task name and 'log' gets the
        task's Elasticsearch log records rendered to HTML (level-colored
        lines plus collapsible stack traces).

        :param prefix: unused here — presumably required by the form API
        :param instance: Task whose log is displayed
        :param initial: unused here
        """
        super().__init__()
        self.fields['name'].initial = instance.name

        logs = list()
        # on this stage it was quite hard to implement proper formatting in templates
        # so putting some html/js right here.
        # TODO: Refactor, put formatting to the templates

        # Main problem is that this form's template uses some base template which replaces \n with <br />
        for record in instance.get_task_log_from_elasticsearch():
            # map log level to a display color (default green = info)
            color = 'green'
            if record.log_level == 'WARN':
                color = 'yellow'
            elif record.log_level == 'ERROR':
                color = 'red'

            if not record.timestamp:
                ts = ''
            else:
                ts = record.timestamp.strftime('%Y-%m-%d %H:%M:%S')

            level = record.log_level or 'INFO'
            message = record.message
            # start multi-line messages on their own line in the rendered HTML
            if message and '\n' in message:
                message = '<br />' + message

            log_add = f'<b><span style="color: {color}">{level}</span> {ts} | {record.task_name or "no task"} |</b> ' \
                      f'{message}'

            logs.append(log_add)

            if record.stack_trace:
                # Adding JS to toggle stack trace showing/hiding
                stack = record.stack_trace.replace('\n', '<br />')
                uid = str(fast_uuid())
                uid_toggle = uid + '_toggle'
                show_hide = f'''e = document.getElementById('{uid}');
                                e.style.display = e.style.display === 'block' ? 'none' : 'block';
                                document.getElementById('{uid_toggle}').innerText 
                                        = e.style.display === 'block' ? '[-] Stack trace:' : '[+] Stack trace';
                            '''.replace('\n', '')
                logs.append(
                    f'<a id="{uid_toggle}" onclick="{show_hide}">[+] Stack trace:</a>'
                )
                logs.append(
                    f'<div id="{uid}" style="display: none; border-left: 1px solid grey; padding-left: 16px">'
                    f'{stack}</div>')

        self.fields['log'].initial = '\n'.join(logs)
    def send_task(self,
                  name,
                  args=None,
                  kwargs=None,
                  countdown=None,
                  eta=None,
                  task_id=None,
                  producer=None,
                  connection=None,
                  router=None,
                  result_cls=None,
                  expires=None,
                  publisher=None,
                  link=None,
                  link_error=None,
                  add_to_parent=True,
                  group_id=None,
                  retries=0,
                  chord=None,
                  reply_to=None,
                  time_limit=None,
                  soft_time_limit=None,
                  root_id=None,
                  parent_id=None,
                  source_data=None,
                  run_after_sub_tasks_finished=False,
                  route_name=None,
                  shadow=None,
                  chain=None,
                  task_type=None,
                  main_task_id=None,
                  **options):
        """
        Celery ``send_task`` override: records the task in the DB via
        ``Task.objects.init_task`` before delegating to the stock Celery
        implementation.
        """
        task_id = task_id or str(fast_uuid())

        TaskUtils.prepare_task_execution()

        # the owning task id falls back along the chain:
        # explicit main_task_id -> parent_id -> root_id
        main_task_id = main_task_id or parent_id or root_id
        Task.objects.init_task(task_id, name, main_task_id,
                               'Args: {0}\nKwargs: {1}'.format(
                                   str(args), str(kwargs)), args, source_data,
                               run_after_sub_tasks_finished)  # type: Task

        # NOTE: source_data, run_after_sub_tasks_finished and main_task_id are
        # consumed above and intentionally not forwarded to Celery
        return super().send_task(name, args, kwargs, countdown, eta, task_id,
                                 producer, connection, router, result_cls,
                                 expires, publisher, link, link_error,
                                 add_to_parent, group_id, retries, chord,
                                 reply_to, time_limit, soft_time_limit,
                                 root_id, parent_id, route_name, shadow, chain,
                                 task_type, **options)
Example #7
0
    def build_sentences_to_fields_relations_dataset(document_class_name: str, document_ids: List,
                                                    task_id):
        """
        Build (or reuse) the classifier data set relating sentences to
        fields for the given document class, then schedule one
        ``build_dataset_on_document`` subtask per document.

        :param document_class_name: key into DOCUMENT_FIELDS
        :param document_ids: explicit document ids; falsy means all
            documents of the class
        :param task_id: parent task id used for logging and subtask ids
        :return: number of subtasks scheduled, or None when the class has
            no field configs
        """
        log(
            'Building classifier for detecting sentences related to fields '
            'for document class: {0}'.format(
                document_class_name),
            task=task_id)
        field_configs = DOCUMENT_FIELDS[document_class_name]
        if not field_configs:
            log('Can not find any field configs for document class: {0}'.format(
                document_class_name),
                task=task_id)
            return

        document_class = BuildFieldDetectorDataset._get_doc_class(document_class_name)

        classifier_model, created = ClassifierModel.objects.get_or_create(
            kind=ClassifierModel.KIND_SENTENCES_RELATED_TO_FIELDS,
            document_class=document_class_name,
            document_field=None)

        # BUG FIX: get_or_create() returns created=True for a NEW row; the
        # original logged "already exists" on created and "New ... created"
        # otherwise — the two messages were swapped.
        if created:
            log('New classifier data created for document class: {0}'.format(
                document_class_name),
                task=task_id)
        else:
            log('Classifier data set already exists for document class: {0}'.format(
                document_class_name),
                task=task_id)

        task_count = 0

        # one dataset-building subtask per document of the class
        for doc_id in document_ids or document_class.objects.all().values_list('id', flat=True):
            BuildFieldDetectorDataset.build_dataset_on_document.apply_async(
                args=(document_class_name, doc_id, False, task_id),
                task_id='%d_%s' % (
                    task_id, fast_uuid()))
            task_count += 1
        if task_count == 0:
            log('No documents in DB for document class: {0}'.format(
                document_class_name),
                task=task_id)
        return task_count
Example #8
0
    def train_model_for_document_type(self,
                                      document_type_pk,
                                      task_id=None) -> int:
        """
        Schedule one ``train_model_for_field`` subtask per field of the
        document type.

        :param document_type_pk: pk of the DocumentType to train for
        :param task_id: parent task id embedded into subtask ids
        :return: number of subtasks scheduled
        """
        self.log('Building classifier model for document type: {0}'.format(
            document_type_pk))

        doc_type = DocumentType.objects.get(pk=document_type_pk)

        # sample of sentences with no related field values, shared by every
        # per-field training subtask (presumably as negative examples — TODO confirm)
        no_field_sentences = list(
            TextUnit.objects.filter(
                unit_type='sentence',
                related_field_values=None).values_list('text', flat=True)[:10000])

        scheduled = 0
        for doc_field in doc_type.fields.all():
            # NOTE(review): '%d' would fail for the default task_id=None —
            # callers appear to always pass a numeric id
            self.train_model_for_field.apply_async(
                args=(doc_type.uid, doc_field.uid, task_id,
                      no_field_sentences),
                task_id='%d_%s' % (task_id, fast_uuid()))
            scheduled += 1

        return scheduled
Example #9
0
    def process(self, **kwargs):
        """
        Detect lease documents among all loaded documents and schedule one
        processing subtask per document.

        :param kwargs: delete, document_type, no_detect, task_id
        :return: None
        """
        self.log(
            "Going to detect lease documents among the all loaded documents in the system..."
        )

        if kwargs['delete']:
            # drop only LeaseDocument rows, keeping the base Document rows
            for lease_doc in LeaseDocument.objects.all():
                lease_doc.delete(keep_parents=True)

        doc_qs = Document.objects.all()
        if kwargs.get('document_type'):
            doc_qs = doc_qs.filter(document_type__in=kwargs['document_type'])
            self.log('Filter documents by "%s" document type.' %
                     str(kwargs['document_type']))

        self.task.subtasks_total = doc_qs.count()
        self.task.save()

        # fetch only ids; one detection subtask per document
        for (doc_id,) in doc_qs.values_list('id'):
            ProcessLeaseDocuments.detect_and_process_lease_document.apply_async(
                args=(doc_id, kwargs['no_detect'], kwargs['task_id']),
                task_id='%d_%s' % (self.task.id, fast_uuid()))
Example #10
0
    def __init__(self, prefix, instance: Task, initial):
        """
        Populate the task-detail form: a headline with status/progress, the
        ancestor chain and child-task links as HTML, and the task's
        Elasticsearch log rendered to HTML with collapsible stack traces.

        :param prefix: unused here — presumably required by the form API
        :param instance: Task being displayed
        :param initial: unused here
        """
        super().__init__()
        # headline: display name, optionally suffixed with internal name,
        # non-success status and progress percentage
        display_name = instance.display_name or instance.name
        if display_name != instance.name:
            display_name += f' ({instance.name})'
        if instance.status != SUCCESS:
            display_name += f' STATUS: {instance.status}'
        if instance.progress < 100:
            display_name += f' ({instance.progress}%)'

        self.fields['task'].initial = display_name
        self.fields['parents'].initial = ''
        self.fields['child_tasks'].initial = ''

        logs = list()
        # on this stage it was quite hard to implement proper formatting in templates
        # so putting some html/js right here.
        # TODO: Refactor, put formatting to the templates

        # list ancestors (parent tasks) up to the root
        parents_markup = []
        this_task = instance
        while this_task.parent_task_id:
            parent = this_task.parent_task
            task_name = parent.display_name or parent.name
            url = reverse('task:task-detail', args=[parent.pk])
            color = self.COLOR_BY_STATUS.get(
                parent.status) or self.COLOR_BY_STATUS['default']
            link_name = task_name if parent.progress == 100 else f'{task_name} ({parent.progress}%)'
            parents_markup.append(
                f'<a style="{color}" href="{url}">{link_name}</a>')
            this_task = this_task.parent_task

        markup = ''
        if parents_markup:
            # HTML-escaped '<-' arrows, from closest parent to root
            markup = ' &lt;- '.join(parents_markup)
        self.fields['parents'].initial = markup

        # list child tasks (links capped at 30, with a "... and N more" tail)
        child_query = Task.objects.filter(parent_task_id=instance.pk)
        children_count = child_query.count()
        children = list(
            child_query.values_list('pk', 'name', 'display_name', 'status',
                                    'progress')[:30])
        children_markup = []
        for pk, name, display_name, status, progress in children:
            url = reverse('task:task-detail', args=[pk])
            color = self.COLOR_BY_STATUS.get(
                status) or self.COLOR_BY_STATUS['default']
            task_name = display_name or name
            link_name = task_name if progress == 100 else f'{task_name} ({progress}%)'
            children_markup.append(
                f'<a style="{color}" href="{url}">{link_name}</a>')
        if children_count > len(children):
            children_markup.append(
                f' ... and {children_count - len(children)} more')
        self.fields['child_tasks'].initial = ', '.join(children_markup)

        # Main problem is that this form's template uses some base template which replaces \n with <br />
        for record in instance.get_task_log_from_elasticsearch():
            # map log level to a display color (default green = info)
            color = 'green'
            if record.log_level == 'WARN':
                color = 'yellow'
            elif record.log_level == 'ERROR':
                color = 'red'

            if not record.timestamp:
                ts = ''
            else:
                ts = record.timestamp.strftime('%Y-%m-%d %H:%M:%S')

            level = record.log_level or 'INFO'
            message = record.message
            # start multi-line messages on their own line in the rendered HTML
            if message and '\n' in message:
                message = '<br />' + message

            log_add = f'<b><span style="color: {color}">{level}</span> {ts} | {record.task_name or "no task"} |</b> ' \
                      f'{message}'

            logs.append(log_add)

            if record.stack_trace:
                # Adding JS to toggle stack trace showing/hiding
                stack = record.stack_trace.replace('\n', '<br />')
                uid = str(fast_uuid())
                uid_toggle = uid + '_toggle'
                show_hide = f'''e = document.getElementById('{uid}');
                                e.style.display = e.style.display === 'block' ? 'none' : 'block';
                                document.getElementById('{uid_toggle}').innerText 
                                        = e.style.display === 'block' ? '[-] Stack trace:' : '[+] Stack trace';
                            '''.replace('\n', '')
                logs.append(
                    f'<a id="{uid_toggle}" onclick="{show_hide}">[+] Stack trace:</a>'
                )
                logs.append(
                    f'<div id="{uid}" style="display: none; border-left: 1px solid grey; padding-left: 16px">'
                    f'{stack}</div>')

        self.fields['log'].initial = '\n'.join(logs)
    def send_task(self,
                  name,
                  args=None,
                  kwargs=None,
                  countdown=None,
                  eta=None,
                  task_id=None,
                  producer=None,
                  connection=None,
                  router=None,
                  result_cls=None,
                  expires=None,
                  publisher=None,
                  link=None,
                  link_error=None,
                  add_to_parent=True,
                  group_id=None,
                  retries=0,
                  chord=None,
                  reply_to=None,
                  time_limit=None,
                  soft_time_limit=None,
                  root_id=None,
                  parent_id=None,
                  source_data=None,
                  run_after_sub_tasks_finished=False,
                  run_if_parent_task_failed=False,
                  route_name=None,
                  shadow=None,
                  chain=None,
                  task_type=None,
                  main_task_id=None,
                  **options):
        """
        Celery ``send_task`` override: registers the task (and its parent
        link) in the DB inside a transaction, then delegates to the stock
        Celery implementation.
        """
        task_id = task_id or str(fast_uuid())

        # the owning task id falls back along the chain:
        # explicit main_task_id -> parent_id -> root_id
        main_task_id = main_task_id or parent_id or root_id
        # human-readable description assembled from the call args/kwargs
        args_str = ', '.join([str(arg) for arg in args]) if args else ''
        kwargs_str = '\n'.join([f'{f}: {str(v)}'
                                for f, v in kwargs.items()]) if kwargs else ''

        description = list()
        if args_str:
            description.append(args_str)

        if kwargs_str:
            description.append(kwargs_str)

        TaskUtils.prepare_task_execution()
        with transaction.atomic():
            Task.objects.init_task(
                task_id=task_id,
                task_name=name,
                main_task_id=main_task_id,
                parent_task_id=parent_id,
                description='\n'.join(description),
                args=args,
                source_data=source_data,
                run_after_sub_tasks_finished=run_after_sub_tasks_finished,
                run_if_parent_task_failed=run_if_parent_task_failed
            )  # type: Task
            # flag the parent as having children, in the same transaction
            if parent_id is not None:
                Task.objects.filter(id=parent_id).update(has_sub_tasks=True)

        # NOTE: source_data, run_* flags and main_task_id are consumed above
        # and intentionally not forwarded to Celery
        return super().send_task(name, args, kwargs, countdown, eta, task_id,
                                 producer, connection, router, result_cls,
                                 expires, publisher, link, link_error,
                                 add_to_parent, group_id, retries, chord,
                                 reply_to, time_limit, soft_time_limit,
                                 root_id, parent_id, route_name, shadow, chain,
                                 task_type, **options)
    def send_task(self,
                  name,
                  args=None,
                  kwargs=None,
                  countdown=None,
                  eta=None,
                  task_id=None,
                  producer=None,
                  connection=None,
                  router=None,
                  result_cls=None,
                  expires=None,
                  publisher=None,
                  link=None,
                  link_error=None,
                  add_to_parent=True,
                  group_id=None,
                  retries=0,
                  chord=None,
                  reply_to=None,
                  time_limit=None,
                  soft_time_limit=None,
                  root_id=None,
                  parent_id=None,
                  source_data=None,
                  run_after_sub_tasks_finished=False,
                  run_if_parent_task_failed=False,
                  route_name=None,
                  shadow=None,
                  chain=None,
                  task_type=None,
                  main_task_id=None,
                  **options):
        """
        Custom Celery send_task() method which stores a lot of additional required info in the DB.
        There is a task re-sending method in this class which is used to re-start hanged tasks
        lost by workers because of unexpected too fast restart or any other similar reason.
        Task re-send feature requires all the required task info to be stored in the DB
        because the info in the RabbitMQ usually appears lost in case the worker has lost the task.

        Take into account that args, kwargs, queue, priority are required to save in the DB in the same
        form as can be used for loading and re-sending.
        """
        task_id = task_id or str(fast_uuid())

        # the owning task id falls back along the chain:
        # explicit main_task_id -> parent_id -> root_id
        main_task_id = main_task_id or parent_id or root_id
        # human-readable description assembled from the call args/kwargs
        args_str = ', '.join([str(arg) for arg in args]) if args else ''
        kwargs_str = '\n'.join([f'{f}: {str(v)}'
                                for f, v in kwargs.items()]) if kwargs else ''

        description = list()
        if args_str:
            description.append(args_str)

        if kwargs_str:
            description.append(kwargs_str)

        TaskUtils.prepare_task_execution()
        with transaction.atomic():
            # it is important to save args, kwargs, queue, priority and other task fields
            # because they can be used for the task re-sending
            Task.objects.init_task(
                task_id=task_id,
                task_name=name,
                main_task_id=main_task_id,
                parent_task_id=parent_id,
                description='\n'.join(description),
                args=args,
                kwargs=kwargs,
                queue=options.get('queue'),
                priority=options.get('priority'),
                source_data=source_data,
                run_after_sub_tasks_finished=run_after_sub_tasks_finished,
                run_if_parent_task_failed=run_if_parent_task_failed
            )  # type: Task
            # flag the parent as having children; exclude() avoids a
            # redundant UPDATE when the flag is already set
            if parent_id is not None:
                Task.objects.filter(id=parent_id).exclude(
                    has_sub_tasks=True).update(has_sub_tasks=True)

        # NOTE: source_data, run_* flags and main_task_id are consumed above
        # and intentionally not forwarded to Celery
        return super().send_task(name, args, kwargs, countdown, eta, task_id,
                                 producer, connection, router, result_cls,
                                 expires, publisher, link, link_error,
                                 add_to_parent, group_id, retries, chord,
                                 reply_to, time_limit, soft_time_limit,
                                 root_id, parent_id, route_name, shadow, chain,
                                 task_type, **options)