def process(self, **kwargs):
    """
    Locate employees: spawn a parse_document_for_employee sub-task per document.

    :param kwargs: delete, document_type, document_id, no_detect, task_id
    :return: None
    """
    if kwargs.get('delete'):
        # QuerySet.delete() returns a (total_count, per_model_dict) tuple.
        # The previous code added the three tuples together, which
        # concatenated them and produced a misleading "Deleted:" log line;
        # sum the totals instead.
        deleted = (Employee.objects.all().delete()[0]
                   + Employer.objects.all().delete()[0]
                   + Provision.objects.all().delete()[0])
        self.log('Deleted: ' + str(deleted))
    documents = Document.objects.all()  # TODO: outdated
    if kwargs.get('document_type'):
        documents = documents.filter(
            document_type__in=kwargs['document_type'])
        self.log('Filter documents by "%s" document type.'
                 % str(kwargs['document_type']))
    if kwargs.get('document_id'):
        documents = documents.filter(pk=kwargs['document_id'])
        self.log('Process document id={}.'.format(kwargs['document_id']))
    self.task.subtasks_total = documents.count()
    self.task.save()
    self.log('Found {0} Documents. Added {0} subtasks.'.format(
        self.task.subtasks_total))
    for d in documents:
        # NOTE(review): no_detect defaults to True here but sibling tasks
        # read kwargs['no_detect'] directly — confirm the intended default.
        self.parse_document_for_employee.apply_async(
            args=(d.id, kwargs.get('no_detect', True), kwargs['task_id']),
            task_id='%d_%s' % (self.task.id, fast_uuid()))
def push_or_pop(key, value, batch_size=None, batch_time=None):
    """
    Accumulate pickled values in the redis store under unique
    '<key>__<uuid>' entries until a size or age limit is hit, then drain
    and return every accumulated value as an unpickled list.

    :param key: base redis key; per-value keys get a uuid suffix
    :param value: value to store (or to append to the drained batch)
    :param batch_size: flush when this many values are accumulated
    :param batch_time: flush when this many seconds passed since the
        stored timestamp (refreshed on every push, per the getset below)
    :return: (values, resume) — drained list and True on flush, otherwise
        the push() result and False
    """
    time_key = '{}_time'.format(key)
    stored_keys = r.keys('{}__*'.format(key))
    cached_ts = r.get(time_key)
    current_ts = time.mktime(now().timetuple())

    size_reached = batch_size is not None and len(stored_keys) >= batch_size - 1
    age_reached = (batch_time is not None and cached_ts is not None
                   and current_ts - float(cached_ts) >= batch_time)

    if size_reached or age_reached:
        # Drain: pop every stored value, include the incoming one, and
        # drop the timestamp key so the next batch starts fresh.
        collected = [popd(k) for k in stored_keys]
        collected.append(value)
        r.delete(time_key)
        return collected, True

    # Accumulate: store under a unique suffixed key and refresh the
    # stored timestamp (so the age limit measures time since last push).
    unique_key = '{}__{}'.format(key, fast_uuid())
    stored = push(unique_key, value)
    r.getset(time_key, time.mktime(now().timetuple()))
    return stored, False
def detect_field_values_for_document_type(document_type_pk,
                                          document_ids: List,
                                          do_not_write,
                                          drop_classifier_model,
                                          task_id):
    """
    Spawn a detect_field_values_for_document sub-task for every document
    of the given document type (or only for the given document ids).

    :param document_type_pk: pk of the DocumentType to process
    :param document_ids: optional explicit list of document ids; falsy
        means "all documents of the type"
    :param do_not_write: passed through to each sub-task
    :param drop_classifier_model: when truthy, delete the type's
        ClassifierModel rows before detection
    :param task_id: parent task id, used for logging and sub-task ids
    :return: number of sub-tasks spawned, or None when the type has no fields
    """
    log('Detecting field values for document type: {0}'.format(
        document_type_pk), task=task_id)
    document_type = DocumentType.objects.get(pk=document_type_pk)
    document_fields = document_type.fields
    # NOTE(review): if `fields` is a related manager this truthiness test is
    # always True; it only works if `fields` is a queryset/property — confirm.
    if not document_fields:
        log('Can not find any fields assigned to document type: {0}'.
            format(document_type), task=task_id)
        return
    if drop_classifier_model:
        # NOTE(review): the message mentions field values, but only
        # ClassifierModel rows are deleted here — confirm intent.
        log('Deleting field values and classifier models for document type: {0}'
            .format(document_type), task=task_id)
        ClassifierModel.objects.filter(
            document_type=document_type).delete()
    task_count = 0
    for doc_id in document_ids or Document.objects.filter(
            document_type=document_type).values_list('id', flat=True):
        # '%d' assumes task_id is numeric — TODO confirm against callers
        DetectFieldValues.detect_field_values_for_document.apply_async(
            args=(doc_id, do_not_write, task_id),
            task_id='%d_%s' % (task_id, fast_uuid()))
        task_count += 1
    if task_count == 0:
        log('No documents in DB for document type: {0}'.format(
            document_type), task=task_id)
    return task_count
def process(self, **kwargs):
    """
    Reassign the documents of the given clusters to a new project, record
    the reassignment in task and clustering metadata, and re-run field
    detection for every moved document.

    :param kwargs: project_id, cluster_ids, new_project_id
    :return: None
    """
    project_id = kwargs.get('project_id')
    cluster_ids = kwargs.get('cluster_ids')
    new_project_id = kwargs.get('new_project_id')
    new_project = Project.objects.get(pk=new_project_id)
    documents = Document.objects.filter(
        documentcluster__pk__in=cluster_ids)
    # NOTE(review): a Project instance is assigned to the `project_id`
    # column (not `project`) — confirm Django coerces it to the pk here.
    documents.update(project_id=new_project,
                     document_type=new_project.type)
    self.task.metadata = {
        'task_name': 'reassigning',
        'old_project_id': project_id,
        'new_project_id': new_project_id,
        'cluster_ids': cluster_ids,
    }
    self.task.save()
    # Append an audit record of this reassignment to the clustering metadata.
    reassigning = {
        'date': now().isoformat(),
        'new_project_id': new_project_id,
        'cluster_ids': cluster_ids,
        'task_id': self.task.id
    }
    # assumes all cluster_ids belong to the same ProjectClustering — the
    # lookup only uses the first id; TODO confirm
    p_cl = ProjectClustering.objects.get(
        document_clusters__pk=cluster_ids[0])
    reassignings = p_cl.metadata.get('reassigning', [])
    reassignings.append(reassigning)
    p_cl.metadata['reassigning'] = reassignings
    # Deduplicate the accumulated set of reassigned cluster ids.
    reassigned_cluster_ids = list(
        set(p_cl.metadata.get('reassigned_cluster_ids', []) + cluster_ids))
    p_cl.metadata['reassigned_cluster_ids'] = reassigned_cluster_ids
    p_cl.save()
    # Collect detect_field_values_for_document tasks from every already
    # imported custom app module (sys.modules.get returns None for
    # apps whose tasks module was never imported).
    tasks = []
    for app_name in custom_apps:
        module_str = 'apps.%s.tasks' % app_name
        module = sys.modules.get(module_str)
        detector_task = getattr(module, 'DetectFieldValues', None)
        if detector_task and hasattr(detector_task,
                                     'detect_field_values_for_document'):
            tasks.append(
                getattr(detector_task, 'detect_field_values_for_document'))
    if tasks:
        self.task.subtasks_total = documents.count() * len(tasks)
        self.task.save()
        for document in documents:
            for task in tasks:
                task.apply_async(
                    args=(document.id, False, self.task.id, None),
                    task_id='%d_%s' % (self.task.id, fast_uuid()))
def __init__(self, prefix, instance: Task, initial):
    """
    Build the task-log form: set the 'name' field and render the task's
    elasticsearch log records as HTML into the 'log' field.

    :param prefix: unused here; presumably required by the form API — TODO confirm
    :param instance: Task whose log is rendered
    :param initial: unused here; presumably required by the form API — TODO confirm
    """
    super().__init__()
    self.fields['name'].initial = instance.name
    logs = list()
    # on this stage it was quite hard to implement proper formatting in templates
    # so putting some html/js right here.
    # TODO: Refactor, put formatting to the templates
    # Main problem is that this form's template uses some base template which replaces \n with <br />
    for record in instance.get_task_log_from_elasticsearch():
        # Color-code the log level: WARN yellow, ERROR red, else green.
        color = 'green'
        if record.log_level == 'WARN':
            color = 'yellow'
        elif record.log_level == 'ERROR':
            color = 'red'
        if not record.timestamp:
            ts = ''
        else:
            ts = record.timestamp.strftime('%Y-%m-%d %H:%M:%S')
        level = record.log_level or 'INFO'
        message = record.message
        # Multi-line messages start on their own line in the rendered HTML.
        if message and '\n' in message:
            message = '<br />' + message
        log_add = f'<b><span style="color: {color}">{level}</span> {ts} | {record.task_name or "no task"} |</b> ' \
                  f'{message}'
        logs.append(log_add)
        if record.stack_trace:
            # Adding JS to toggle stack trace showing/hiding
            stack = record.stack_trace.replace('\n', '<br />')
            uid = str(fast_uuid())
            uid_toggle = uid + '_toggle'
            show_hide = f'''e = document.getElementById('{uid}');
e.style.display = e.style.display === 'block' ? 'none' : 'block';
document.getElementById('{uid_toggle}').innerText = e.style.display === 'block' ? '[-] Stack trace:' : '[+] Stack trace';
'''.replace('\n', '')
            logs.append(
                f'<a id="{uid_toggle}" onclick="{show_hide}">[+] Stack trace:</a>'
            )
            logs.append(
                f'<div id="{uid}" style="display: none; border-left: 1px solid grey; padding-left: 16px">'
                f'{stack}</div>')
    self.fields['log'].initial = '\n'.join(logs)
def send_task(self, name, args=None, kwargs=None, countdown=None, eta=None,
              task_id=None, producer=None, connection=None, router=None,
              result_cls=None, expires=None, publisher=None, link=None,
              link_error=None, add_to_parent=True, group_id=None, retries=0,
              chord=None, reply_to=None, time_limit=None, soft_time_limit=None,
              root_id=None, parent_id=None, source_data=None,
              run_after_sub_tasks_finished=False,
              route_name=None, shadow=None, chain=None, task_type=None,
              main_task_id=None, **options):
    """
    Celery send_task() override which records the task in the project DB
    (via Task.objects.init_task) before dispatching it.

    Extra parameters on top of Celery's: source_data,
    run_after_sub_tasks_finished, task_type and main_task_id are consumed
    here; the rest are passed through to super().send_task().
    """
    task_id = task_id or str(fast_uuid())
    TaskUtils.prepare_task_execution()
    # Fall back to the closest known ancestor as the main task id.
    main_task_id = main_task_id or parent_id or root_id
    Task.objects.init_task(task_id, name, main_task_id,
                           'Args: {0}\nKwargs: {1}'.format(
                               str(args), str(kwargs)),
                           args, source_data,
                           run_after_sub_tasks_finished)  # type: Task
    # Positional pass-through: source_data / run_after_sub_tasks_finished /
    # main_task_id are NOT forwarded; task_type IS.
    return super().send_task(name, args, kwargs, countdown, eta, task_id,
                             producer, connection, router, result_cls,
                             expires, publisher, link, link_error,
                             add_to_parent, group_id, retries, chord,
                             reply_to, time_limit, soft_time_limit, root_id,
                             parent_id, route_name, shadow, chain, task_type,
                             **options)
def build_sentences_to_fields_relations_dataset(document_class_name: str,
                                                document_ids: List,
                                                task_id):
    """
    Spawn a build_dataset_on_document sub-task per document to build the
    classifier data set relating sentences to fields.

    :param document_class_name: key into DOCUMENT_FIELDS and name resolvable
        by BuildFieldDetectorDataset._get_doc_class()
    :param document_ids: optional explicit document ids; falsy means all
        documents of the class
    :param task_id: parent task id used for logging and sub-task ids
    :return: number of sub-tasks spawned, or None when no field configs exist
    """
    log(
        'Building classifier for detecting sentences related to fields '
        'for document class: {0}'.format(
            document_class_name),
        task=task_id)
    field_configs = DOCUMENT_FIELDS[document_class_name]
    if not field_configs:
        log('Can not find any field configs for document class: {0}'.format(
            document_class_name), task=task_id)
        return
    document_class = BuildFieldDetectorDataset._get_doc_class(document_class_name)
    classifier_model, created = ClassifierModel.objects.get_or_create(
        kind=ClassifierModel.KIND_SENTENCES_RELATED_TO_FIELDS,
        document_class=document_class_name,
        document_field=None)
    # get_or_create() returns created=True for a NEW row; the two log
    # messages were previously swapped.
    if created:
        log('New classifier data created for document class: {0}'.format(
            document_class_name), task=task_id)
    else:
        log('Classifier data set already exists for document class: {0}'.format(
            document_class_name), task=task_id)
    task_count = 0
    for doc_id in document_ids or document_class.objects.all().values_list('id', flat=True):
        BuildFieldDetectorDataset.build_dataset_on_document.apply_async(
            args=(document_class_name, doc_id, False, task_id),
            task_id='%d_%s' % (
                task_id,
                fast_uuid()))
        task_count += 1
    if task_count == 0:
        log('No documents in DB for document class: {0}'.format(
            document_class_name), task=task_id)
    return task_count
def train_model_for_document_type(self, document_type_pk, task_id=None) -> int:
    """
    Spawn one train_model_for_field sub-task per field of the given
    document type.

    :param document_type_pk: pk of the DocumentType to train models for
    :param task_id: parent task id embedded in each sub-task id
    :return: number of sub-tasks spawned
    """
    self.log('Building classifier model for document type: {0}'.format(
        document_type_pk))
    doc_type = DocumentType.objects.get(pk=document_type_pk)
    # Up to 10000 sentence texts with no related field values, shared by
    # every per-field training sub-task.
    sentences_without_fields = list(
        TextUnit.objects.filter(
            unit_type='sentence',
            related_field_values=None).values_list('text', flat=True)[:10000])
    spawned = 0
    for doc_field in doc_type.fields.all():
        self.train_model_for_field.apply_async(
            args=(doc_type.uid, doc_field.uid, task_id,
                  sentences_without_fields),
            task_id='%d_%s' % (task_id, fast_uuid()))
        spawned += 1
    return spawned
def process(self, **kwargs):
    """
    Detect lease documents: spawn a detect_and_process_lease_document
    sub-task per document.

    :param kwargs: delete, document_type, no_detect, task_id
    :return: None
    """
    self.log(
        "Going to detect lease documents among the all loaded documents in the system..."
    )
    # Use .get() so a missing 'delete' flag means "do not delete" instead
    # of raising KeyError (consistent with the sibling tasks in this file).
    if kwargs.get('delete'):
        for ld in LeaseDocument.objects.all():
            ld.delete(keep_parents=True)
    documents = Document.objects.all()
    if kwargs.get('document_type'):
        documents = documents.filter(
            document_type__in=kwargs['document_type'])
        self.log('Filter documents by "%s" document type.'
                 % str(kwargs['document_type']))
    self.task.subtasks_total = documents.count()
    self.task.save()
    # flat=True yields ids directly instead of 1-tuples needing row[0].
    for doc_id in documents.values_list('id', flat=True):
        ProcessLeaseDocuments.detect_and_process_lease_document.apply_async(
            args=(doc_id, kwargs['no_detect'], kwargs['task_id']),
            task_id='%d_%s' % (self.task.id, fast_uuid()))
def __init__(self, prefix, instance: Task, initial):
    """
    Build the task-detail form: render the task's display name with
    status/progress, its parent chain, its child tasks and its
    elasticsearch log records as HTML.

    :param prefix: unused here; presumably required by the form API — TODO confirm
    :param instance: Task being displayed
    :param initial: unused here; presumably required by the form API — TODO confirm
    """
    super().__init__()
    # Compose the display name with status and progress decorations.
    display_name = instance.display_name or instance.name
    if display_name != instance.name:
        display_name += f' ({instance.name})'
    if instance.status != SUCCESS:
        display_name += f' STATUS: {instance.status}'
        if instance.progress < 100:
            display_name += f' ({instance.progress}%)'
    self.fields['task'].initial = display_name
    self.fields['parents'].initial = ''
    self.fields['child_tasks'].initial = ''
    logs = list()
    # on this stage it was quite hard to implement proper formatting in templates
    # so putting some html/js right here.
    # TODO: Refactor, put formatting to the templates

    # list ancestors (parent tasks) up to the root
    parents_markup = []
    this_task = instance
    while this_task.parent_task_id:
        parent = this_task.parent_task
        task_name = parent.display_name or parent.name
        url = reverse('task:task-detail', args=[parent.pk])
        color = self.COLOR_BY_STATUS.get(
            parent.status) or self.COLOR_BY_STATUS['default']
        link_name = task_name if parent.progress == 100 else f'{task_name} ({parent.progress}%)'
        parents_markup.append(
            f'<a style="{color}" href="{url}">{link_name}</a>')
        this_task = this_task.parent_task
    markup = ''
    if parents_markup:
        markup = ' <- '.join(parents_markup)
    self.fields['parents'].initial = markup

    # list child tasks (render at most 30, then a "... and N more" note)
    child_query = Task.objects.filter(parent_task_id=instance.pk)
    children_count = child_query.count()
    children = list(
        child_query.values_list('pk', 'name', 'display_name',
                                'status', 'progress')[:30])
    children_markup = []
    for pk, name, display_name, status, progress in children:
        url = reverse('task:task-detail', args=[pk])
        color = self.COLOR_BY_STATUS.get(
            status) or self.COLOR_BY_STATUS['default']
        task_name = display_name or name
        link_name = task_name if progress == 100 else f'{task_name} ({progress}%)'
        children_markup.append(
            f'<a style="{color}" href="{url}">{link_name}</a>')
    if children_count > len(children):
        children_markup.append(
            f' ... and {children_count - len(children)} more')
    self.fields['child_tasks'].initial = ', '.join(children_markup)

    # Main problem is that this form's template uses some base template which replaces \n with <br />
    for record in instance.get_task_log_from_elasticsearch():
        # Color-code the log level: WARN yellow, ERROR red, else green.
        color = 'green'
        if record.log_level == 'WARN':
            color = 'yellow'
        elif record.log_level == 'ERROR':
            color = 'red'
        if not record.timestamp:
            ts = ''
        else:
            ts = record.timestamp.strftime('%Y-%m-%d %H:%M:%S')
        level = record.log_level or 'INFO'
        message = record.message
        # Multi-line messages start on their own line in the rendered HTML.
        if message and '\n' in message:
            message = '<br />' + message
        log_add = f'<b><span style="color: {color}">{level}</span> {ts} | {record.task_name or "no task"} |</b> ' \
                  f'{message}'
        logs.append(log_add)
        if record.stack_trace:
            # Adding JS to toggle stack trace showing/hiding
            stack = record.stack_trace.replace('\n', '<br />')
            uid = str(fast_uuid())
            uid_toggle = uid + '_toggle'
            show_hide = f'''e = document.getElementById('{uid}');
e.style.display = e.style.display === 'block' ? 'none' : 'block';
document.getElementById('{uid_toggle}').innerText = e.style.display === 'block' ? '[-] Stack trace:' : '[+] Stack trace';
'''.replace('\n', '')
            logs.append(
                f'<a id="{uid_toggle}" onclick="{show_hide}">[+] Stack trace:</a>'
            )
            logs.append(
                f'<div id="{uid}" style="display: none; border-left: 1px solid grey; padding-left: 16px">'
                f'{stack}</div>')
    self.fields['log'].initial = '\n'.join(logs)
def send_task(self, name, args=None, kwargs=None, countdown=None, eta=None,
              task_id=None, producer=None, connection=None, router=None,
              result_cls=None, expires=None, publisher=None, link=None,
              link_error=None, add_to_parent=True, group_id=None, retries=0,
              chord=None, reply_to=None, time_limit=None, soft_time_limit=None,
              root_id=None, parent_id=None, source_data=None,
              run_after_sub_tasks_finished=False,
              run_if_parent_task_failed=False,
              route_name=None, shadow=None, chain=None, task_type=None,
              main_task_id=None, **options):
    """
    Celery send_task() override which records the task in the project DB
    (via Task.objects.init_task, inside a transaction) before dispatching,
    and flags the parent task as having sub-tasks.

    Extra parameters on top of Celery's: source_data,
    run_after_sub_tasks_finished, run_if_parent_task_failed, task_type and
    main_task_id are consumed here; the rest are passed through to
    super().send_task().
    """
    task_id = task_id or str(fast_uuid())
    # Fall back to the closest known ancestor as the main task id.
    main_task_id = main_task_id or parent_id or root_id
    # Human-readable description built from the call arguments.
    args_str = ', '.join([str(arg) for arg in args]) if args else ''
    kwargs_str = '\n'.join([f'{f}: {str(v)}'
                            for f, v in kwargs.items()]) if kwargs else ''
    description = list()
    if args_str:
        description.append(args_str)
    if kwargs_str:
        description.append(kwargs_str)
    TaskUtils.prepare_task_execution()
    with transaction.atomic():
        Task.objects.init_task(
            task_id=task_id,
            task_name=name,
            main_task_id=main_task_id,
            parent_task_id=parent_id,
            description='\n'.join(description),
            args=args,
            source_data=source_data,
            run_after_sub_tasks_finished=run_after_sub_tasks_finished,
            run_if_parent_task_failed=run_if_parent_task_failed
        )  # type: Task
        if parent_id is not None:
            Task.objects.filter(id=parent_id).update(has_sub_tasks=True)
    # Positional pass-through: the extra DB-only parameters are NOT
    # forwarded to Celery; task_type IS.
    return super().send_task(name, args, kwargs, countdown, eta, task_id,
                             producer, connection, router, result_cls,
                             expires, publisher, link, link_error,
                             add_to_parent, group_id, retries, chord,
                             reply_to, time_limit, soft_time_limit, root_id,
                             parent_id, route_name, shadow, chain, task_type,
                             **options)
def send_task(self, name, args=None, kwargs=None, countdown=None, eta=None,
              task_id=None, producer=None, connection=None, router=None,
              result_cls=None, expires=None, publisher=None, link=None,
              link_error=None, add_to_parent=True, group_id=None, retries=0,
              chord=None, reply_to=None, time_limit=None, soft_time_limit=None,
              root_id=None, parent_id=None, source_data=None,
              run_after_sub_tasks_finished=False,
              run_if_parent_task_failed=False,
              route_name=None, shadow=None, chain=None, task_type=None,
              main_task_id=None, **options):
    """
    Custom Celery send_task() method which stores a lot of additional
    required info in the DB.

    There is a task re-sending method in this class which is used to
    re-start hanged tasks lost by workers because of unexpected too fast
    restart or any other similar reason. Task re-send feature requires all
    the required task info to be stored in the DB because the info in the
    RabbitMQ usually appears lost in case the worker has lost the task.

    Take into account that args, kwargs, queue, priority are required to
    save in the DB in the same form as can be used for loading and
    re-sending.
    """
    task_id = task_id or str(fast_uuid())
    # Fall back to the closest known ancestor as the main task id.
    main_task_id = main_task_id or parent_id or root_id
    # Human-readable description built from the call arguments.
    args_str = ', '.join([str(arg) for arg in args]) if args else ''
    kwargs_str = '\n'.join([f'{f}: {str(v)}'
                            for f, v in kwargs.items()]) if kwargs else ''
    description = list()
    if args_str:
        description.append(args_str)
    if kwargs_str:
        description.append(kwargs_str)
    TaskUtils.prepare_task_execution()
    with transaction.atomic():
        # it is important to save args, kwargs, queue, priority and other task fields
        # because they can be used for the task re-sending
        Task.objects.init_task(
            task_id=task_id,
            task_name=name,
            main_task_id=main_task_id,
            parent_task_id=parent_id,
            description='\n'.join(description),
            args=args,
            kwargs=kwargs,
            queue=options.get('queue'),
            priority=options.get('priority'),
            source_data=source_data,
            run_after_sub_tasks_finished=run_after_sub_tasks_finished,
            run_if_parent_task_failed=run_if_parent_task_failed
        )  # type: Task
        if parent_id is not None:
            # exclude() avoids rewriting rows where the flag is already set
            Task.objects.filter(id=parent_id).exclude(
                has_sub_tasks=True).update(has_sub_tasks=True)
    # Positional pass-through: the extra DB-only parameters are NOT
    # forwarded to Celery; task_type IS.
    return super().send_task(name, args, kwargs, countdown, eta, task_id,
                             producer, connection, router, result_cls,
                             expires, publisher, link, link_error,
                             add_to_parent, group_id, retries, chord,
                             reply_to, time_limit, soft_time_limit, root_id,
                             parent_id, route_name, shadow, chain, task_type,
                             **options)