def start_task(request):
    """Create a task from a POST request and queue it for execution.

    Reads ``task_type`` and the filtered POST parameters; the active
    dataset and language model are pulled from the session when present.
    The task record is always created so failures stay visible: an
    unknown task type marks the task FAILED with an error payload
    instead of queueing it.  Always returns an empty HttpResponse.
    """
    user = request.user
    task_type = request.POST['task_type']
    task_params = filter_params(request.POST)
    description = task_params['description']
    # Session may carry the user's active dataset / language-model selection.
    if 'dataset' in request.session:
        task_params['dataset'] = request.session['dataset']
    if 'model' in request.session:
        task_params['language_model'] = request.session['model']
    # Create execution task
    task_id = create_task(task_type, description, task_params, user)
    task = Task.get_by_id(task_id)
    if not TaskTypes.hasValue(task_type):
        # Record the invalid type on the task itself rather than raising,
        # so the UI can show why the task failed.
        task.result = json.dumps({'error': '{} is not a proper Task Type value'.format(task_type)})
        task.update_status(Task.STATUS_FAILED, set_time_completed=True)
    else:
        # Valid type: hand the task over to the queue.
        task.update_status(Task.STATUS_QUEUED)
    return HttpResponse()
def api_mass_tagger(request, user, params):
    """Apply the mass tagger on behalf of an auth-token API user."""
    # Fill in defaults for the optional parameters.
    params.setdefault('search', 'all_docs')
    params.setdefault('description', "via API call")
    # Project the request parameters onto a preprocessor task.
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']
    # Default to every completed trained tagger when none were given.
    taggers = params.get('taggers', None)
    if taggers is None:
        completed = Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value).filter(status=Task.STATUS_COMPLETED)
        taggers = [model.id for model in completed]
    params['text_tagger_taggers'] = taggers
    description = params['description']
    # Create the execution task and place it in the queue.
    task_id = create_task(task_type, description, params, user)
    task = Task.get_by_id(task_id)
    task.update_status(Task.STATUS_QUEUED)
    # Echo a reference to the queued task back to the caller.
    reference = {
        'task_id': task_id,
        'task_type': task_type,
        'status': task.status,
        'user': task.user.username
    }
    return HttpResponse(json.dumps(reference), status=200, content_type='application/json')
def update_view(self, percentage):
    """Persist progress for this task, aborting if it was cancelled."""
    task = Task.get_by_id(self.task_pk)
    # A cancelled task stops the running worker via an exception.
    if task.status == Task.STATUS_CANCELED:
        raise TaskCanceledException()
    task.status = Task.STATUS_RUNNING
    message = '{0:3.0f} %'.format(percentage)
    # Prefix the step name when one is configured.
    if self.step:
        message = '{1}: {0}'.format(message, self.step)
    task.update_progress(percentage, message)
def update_view(self):
    """Advance the stepwise progress view for the associated task."""
    current = self.n_step
    percentage = (100.0 * current) / self.n_total
    task = Task.get_by_id(self.model_pk)
    # Stop the worker as soon as the task has been cancelled.
    if task.status == Task.STATUS_CANCELED:
        raise TaskCanceledException()
    task.status = Task.STATUS_RUNNING
    # Report the current step message together with its position.
    message = '{0} [{1}/{2}]'.format(self.step_messages[current], current + 1, self.n_total)
    task.update_progress(percentage, message)
def api_get_task_status(request, user, params):
    """Return the JSON status of the task identified by ``task_id``.

    Responds 200 with the task's JSON representation, or 400 with an
    error payload when the id is missing or does not match a task.
    """
    task_id = params.get('task_id', None)
    # A missing id can never resolve to a task; fail fast with the same
    # 400 payload instead of letting the ORM raise a non-DoesNotExist
    # error for a None primary key.
    if task_id is None:
        data_json = json.dumps({'error': 'task id is not valid'})
        return HttpResponse(data_json, status=400, content_type='application/json')
    try:
        # Keep the try body minimal: only the lookup can raise DoesNotExist.
        task = Task.get_by_id(task_id)
    except Task.DoesNotExist:
        data_json = json.dumps({'error': 'task id is not valid'})
        return HttpResponse(data_json, status=400, content_type='application/json')
    data_json = json.dumps(task.to_json())
    return HttpResponse(data_json, status=200, content_type='application/json')
def api_train_tagger(request, user, params):
    """Queue a train-tagger task and return a JSON reference to it."""
    task_type = TaskTypes.TRAIN_TAGGER
    description = params['description']
    # Create the execution task and push it onto the queue.
    task_id = create_task(task_type, description, params, user)
    task = Task.get_by_id(task_id)
    task.update_status(Task.STATUS_QUEUED)
    # Hand back enough information for the caller to poll the task.
    payload = json.dumps({
        'task_id': task_id,
        'task_type': task_type,
        'status': task.status,
        'user': task.user.username
    })
    return HttpResponse(payload, status=200, content_type='application/json')
def api_train_model(request, user, params):
    """Queue a train-model task and return a JSON reference to it."""
    task_type = TaskTypes.TRAIN_MODEL
    description = params['description']
    # Create the execution task and push it onto the queue.
    task_id = create_task(task_type, description, params, user)
    task = Task.get_by_id(task_id)
    task.update_status(Task.STATUS_QUEUED)
    # Hand back enough information for the caller to poll the task.
    payload = json.dumps({
        'task_id': task_id,
        'task_type': task_type,
        'status': task.status,
        'user': task.user.username
    })
    return HttpResponse(payload, status=200, content_type='application/json')
def api_mass_tagger(request, user, params):
    """Run the text_tagger preprocessor over a search via the API."""
    # Optional parameters fall back to sensible defaults.
    if 'search' not in params:
        params['search'] = 'all_docs'
    if 'description' not in params:
        params['description'] = "via API call"
    # Map the call onto an APPLY_PREPROCESSOR task.
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']
    # When the caller named no taggers, fall back to all completed ones.
    selected = params.get('taggers', None)
    if selected is None:
        finished = Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value).filter(status=Task.STATUS_COMPLETED)
        selected = [tagger.id for tagger in finished]
    params['text_tagger_taggers'] = selected
    description = params['description']
    # Create and queue the execution task.
    task_id = create_task(task_type, description, params, user)
    queued = Task.get_by_id(task_id)
    queued.update_status(Task.STATUS_QUEUED)
    # Return a reference the caller can use to track progress.
    data_json = json.dumps({
        'task_id': task_id,
        'task_type': task_type,
        'status': queued.status,
        'user': queued.user.username
    })
    return HttpResponse(data_json, status=200, content_type='application/json')
def schedule_tasks(self, selected_tags, normalizer_opt, classifier_opt, reductor_opt, extractor_opt, field, dataset_id, user):
    """Requeue existing tagger models and create tasks for new tags.

    Returns a dict with ``retrain_models`` (existing TRAIN_TAGGER tasks
    requeued with a refreshed tag query) and ``new_models`` (freshly
    created TRAIN_TAGGER tasks for tags that had no model yet).  Tags
    with fewer than MIN_DOCS_TAGGED tagged documents are skipped in
    both passes.
    """
    tag_frequency = self.get_tag_frequency(selected_tags)
    # Materialize once: both passes consult the existing model list.
    existing_taggers = list(Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value))
    retrain_tasks = self._requeue_existing_taggers(existing_taggers, tag_frequency)
    known_tags = set(tagger.description for tagger in existing_taggers)
    new_model_tasks = self._create_new_tagger_tasks(
        selected_tags, known_tags, tag_frequency,
        normalizer_opt, classifier_opt, reductor_opt, extractor_opt,
        field, dataset_id, user)
    return {'retrain_models': retrain_tasks, 'new_models': new_model_tasks}

def _requeue_existing_taggers(self, taggers, tag_frequency):
    """Requeue each eligible existing tagger; return their task references."""
    retrain_tasks = []
    for task_tagger in taggers:
        tag_label = task_tagger.description
        # Skip models whose tag is not in the frequency map ...
        if tag_label not in tag_frequency:
            continue
        # ... or has fewer than MIN_DOCS_TAGGED tagged documents ...
        if tag_frequency.get(tag_label, 0) < MIN_DOCS_TAGGED:
            continue
        # ... or is currently running.
        if task_tagger.is_running():
            continue
        retrain_tasks.append({'task_id': task_tagger.pk, 'tag': tag_label})
        # Refresh the stored search query with the current tag before requeueing.
        tag_parameters = json.loads(task_tagger.parameters)
        self._add_search_tag_query(tag_parameters, tag_label)
        task_tagger.parameters = json.dumps(tag_parameters)
        task_tagger.requeue_task()
    return retrain_tasks

def _create_new_tagger_tasks(self, selected_tags, known_tags, tag_frequency, normalizer_opt, classifier_opt, reductor_opt, extractor_opt, field, dataset_id, user):
    """Create and queue a TRAIN_TAGGER task for each new eligible tag."""
    new_model_tasks = []
    for tag_label in selected_tags:
        # Skip tags that already have a model.
        if tag_label in known_tags:
            continue
        # Skip tags with fewer than MIN_DOCS_TAGGED tagged documents.
        if tag_frequency.get(tag_label, 0) < MIN_DOCS_TAGGED:
            continue
        # Build task parameters for the new model.
        task_param = {}
        task_param["description"] = tag_label
        task_param["normalizer_opt"] = normalizer_opt
        task_param["classifier_opt"] = classifier_opt
        task_param["reductor_opt"] = reductor_opt
        task_param["extractor_opt"] = extractor_opt
        task_param["field"] = field
        task_param["dataset"] = dataset_id
        self._add_search_tag_query(task_param, tag_label)
        # Create the execution task and queue it.
        task_id = create_task(TaskTypes.TRAIN_TAGGER, tag_label, task_param, user)
        Task.get_by_id(task_id).update_status(Task.STATUS_QUEUED)
        # Record the task id for the response.
        new_model_tasks.append({'task_id': task_id, 'tag': tag_label})
    return new_model_tasks
def api_hybrid_tagger(request, user, params):
    """Apply the hybrid tagger (via auth_token).

    Finds documents similar to the given search, counts TEXTA_TAG facts
    on ``field`` across that neighborhood, keeps the most frequent tags
    above a count threshold, and queues one APPLY_PREPROCESSOR task that
    runs the matching completed taggers.  Responds 400 for an invalid
    dataset; otherwise 200 with an ``explain`` list of per-tag decisions
    and either a task reference or an error entry when no tagger qualified.
    """
    DEFAULT_TAGS_THRESHOLD = 50
    DEFAULT_MAX_TAGGERS = 20
    dataset_id = params['dataset']
    search = params['search']
    field = params['field']
    max_taggers = int(params.get('max_taggers', DEFAULT_MAX_TAGGERS))
    min_count_threshold = int(params.get('min_count_threshold', DEFAULT_TAGS_THRESHOLD))
    if 'description' not in params:
        params['description'] = "via API call"
    # Parameter projection for the preprocessor task.
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']
    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Reject invalid dataset ids before touching Elasticsearch.
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        return HttpResponse(json.dumps(error), status=400, content_type='application/json')
    param_query = json.loads(Search.objects.get(pk=int(search)).query)
    es_m = ds.build_manager(ES_Manager)
    es_m.load_combined_query(param_query)
    # Get similar documents in a neighborhood of size 1000.
    response = es_m.more_like_this_search([field], search_size=1000)
    docs = response['hits']['hits']
    # Count TEXTA_TAG facts on the target field across the neighborhood.
    tag_freq = {}
    for doc in docs:
        for f in doc['_source'].get('texta_facts', []):
            if f['fact'] == 'TEXTA_TAG' and f['doc_path'] == field:
                doc_tag = f['str_val']
                tag_freq[doc_tag] = tag_freq.get(doc_tag, 0) + 1
    # Keep only the most frequent tags to bound the number of taggers.
    top_tags = [t[0] for t in sorted(tag_freq.items(), key=lambda x: x[1], reverse=True)]
    top_tags = set(top_tags[:max_taggers])
    # Select tags, recording the decision for each in the explain list.
    data = {'task': {}, 'explain': []}
    candidate_tags = set()
    for tag, count in tag_freq.items():
        selected = 0
        if count >= min_count_threshold and tag in top_tags:
            selected = 1
            candidate_tags.add(tag)
        data['explain'].append({'tag': tag, 'selected': selected, 'count': count})
    # Completed taggers whose description matches a candidate tag.
    tagger_search = Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value).filter(status=Task.STATUS_COMPLETED)
    taggers = [tagger.id for tagger in tagger_search if tagger.description in candidate_tags]
    # Only create the task when at least one tagger qualified.
    if taggers:
        description = params['description']
        params['text_tagger_taggers'] = taggers
        # Create the execution task and queue it.
        task_id = create_task(task_type, description, params, user)
        task = Task.get_by_id(task_id)
        task.update_status(Task.STATUS_QUEUED)
        data['task'] = {
            'task_id': task_id,
            'task_type': task_type,
            'status': task.status,
            'user': task.user.username
        }
    else:
        # No candidate tag had a completed tagger above the threshold.
        data['task'] = {"error": "no similar documents have tags count above threshold"}
    # Echo the effective thresholds back to the caller.
    data['min_count_threshold'] = min_count_threshold
    data['max_taggers'] = max_taggers
    return HttpResponse(json.dumps(data), status=200, content_type='application/json')
def schedule_tasks(self, selected_tags, normalizer_opt, classifier_opt, reductor_opt, extractor_opt, field, dataset_id, user):
    """Requeue existing tagger models and create TRAIN_TAGGER tasks for new tags.

    Returns a dict with 'retrain_models' (existing models requeued with a
    refreshed tag query) and 'new_models' (newly created tasks for tags
    that had no model yet).  Tags below MIN_DOCS_TAGGED occurrences are
    skipped in both passes.
    """
    tag_frequency = self.get_tag_frequency(selected_tags)
    retrain_tasks = []
    # Get list of available models
    task_tagger_list = [tagger for tagger in Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value)]
    # Tags that already have a model (used to detect new tags below).
    task_tagger_tag_set = set([tagger.description for tagger in task_tagger_list])
    for task_tagger in task_tagger_list:
        # Get tag label
        tag_label = task_tagger.description
        # Filter models that are not included in the tag_frequency map
        if tag_label not in tag_frequency:
            continue
        # Filter models with less than MIN_DOCS_TAGGED docs
        if tag_frequency.get(tag_label, 0) < MIN_DOCS_TAGGED:
            continue
        # Filter running tasks
        if task_tagger.is_running():
            continue
        # If here, retrain model with tags (re-queue task)
        task_id = task_tagger.pk
        retrain_task = {'task_id': task_id, 'tag': tag_label}
        retrain_tasks.append(retrain_task)
        # Update query parameter from task before requeueing.
        tag_parameters = json.loads(task_tagger.parameters)
        self._add_search_tag_query(tag_parameters, tag_label)
        task_tagger.parameters = json.dumps(tag_parameters)
        task_tagger.requeue_task()
    new_model_tasks = []
    for tag_label in selected_tags:
        # Check if it is a new model (existing tags were handled above).
        if tag_label in task_tagger_tag_set:
            continue
        # Filter models with less than MIN_DOCS_TAGGED docs
        if tag_frequency.get(tag_label, 0) < MIN_DOCS_TAGGED:
            continue
        # Build task parameters
        task_param = {}
        task_param["description"] = tag_label
        task_param["normalizer_opt"] = normalizer_opt
        task_param["classifier_opt"] = classifier_opt
        task_param["reductor_opt"] = reductor_opt
        task_param["extractor_opt"] = extractor_opt
        task_param["field"] = field
        task_param["dataset"] = dataset_id
        self._add_search_tag_query(task_param, tag_label)
        # Create execution task
        task_type = TaskTypes.TRAIN_TAGGER
        task_id = create_task(task_type, tag_label, task_param, user)
        # Add task to queue
        task = Task.get_by_id(task_id)
        task.update_status(Task.STATUS_QUEUED)
        # Add task id to response
        new_model_task = {'task_id': task_id, 'tag': tag_label}
        new_model_tasks.append(new_model_task)
    data = {'retrain_models': retrain_tasks, 'new_models': new_model_tasks}
    return data
def api_hybrid_tagger(request, user, params):
    """Apply hybrid tagger (via auth_token).

    Counts TEXTA_TAG facts on ``field`` over documents similar to the
    given search, selects frequent tags, and queues one preprocessor
    task running the matching completed taggers.  Responds 400 for an
    invalid dataset, otherwise 200 with an explain list and either a
    task reference or an error entry.
    """
    DEFAULT_TAGS_THRESHOLD = 50
    DEFAULT_MAX_TAGGERS = 20
    dataset_id = params['dataset']
    search = params['search']
    field = params['field']
    max_taggers = int(params.get('max_taggers', DEFAULT_MAX_TAGGERS))
    min_count_threshold = int(params.get('min_count_threshold', DEFAULT_TAGS_THRESHOLD))
    if 'description' not in params:
        params['description'] = "via API call"
    # Parameter projection for preprocessor task
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']
    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')
    param_query = json.loads(Search.objects.get(pk=int(search)).query)
    es_m = ds.build_manager(ES_Manager)
    es_m.load_combined_query(param_query)
    # Get similar documents in a neighborhood of size 1000
    response = es_m.more_like_this_search([field], search_size=1000)
    docs = response['hits']['hits']
    # Build tag frequency map from TEXTA_TAG facts on the target field.
    tag_freq = {}
    for doc in docs:
        for f in doc['_source'].get('texta_facts', []):
            if f['fact'] == 'TEXTA_TAG' and f['doc_path'] == field:
                doc_tag = f['str_val']
                if doc_tag not in tag_freq:
                    tag_freq[doc_tag] = 0
                tag_freq[doc_tag] += 1
    # Top tags (by count, descending) to limit the number of taggers.
    top_tags = [t[0] for t in sorted(tag_freq.items(), key=lambda x: x[1], reverse=True)]
    top_tags = set(top_tags[:max_taggers])
    # Perform tag selection; record each decision in the explain list.
    data = {'task': {}, 'explain': []}
    candidate_tags = set()
    for tag in tag_freq:
        selected = 0
        count = tag_freq[tag]
        if count >= min_count_threshold and tag in top_tags:
            selected = 1
            candidate_tags.add(tag)
        data['explain'].append({'tag': tag, 'selected': selected, 'count': count })
    # Filter tags: completed taggers whose description is a candidate tag.
    tagger_search = Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value).filter(status=Task.STATUS_COMPLETED)
    taggers = [tagger.id for tagger in tagger_search if tagger.description in candidate_tags]
    # Create Task if taggers is not zero
    if len(taggers) > 0:
        description = params['description']
        params['text_tagger_taggers'] = taggers
        # Create execution task
        task_id = create_task(task_type, description, params, user)
        # Add task to queue
        task = Task.get_by_id(task_id)
        task.update_status(Task.STATUS_QUEUED)
        # Return reference to task
        data['task'] = {
            'task_id': task_id,
            'task_type': task_type,
            'status': task.status,
            'user': task.user.username
        }
    else:
        # If here, no taggers were selected
        data['task'] = {"error": "no similar documents have tags count above threshold"}
    # Generate response, echoing the effective thresholds.
    data['min_count_threshold'] = min_count_threshold
    data['max_taggers'] = max_taggers
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')