def api_mass_tagger(request, user, params):
    """Apply mass tagger (via auth_token).

    Requires ``params['field']``; ``search``, ``description`` and ``taggers``
    are optional and receive defaults. Queues an APPLY_PREPROCESSOR task and
    returns a JSON reference to it.
    """
    # Get parameters with default values
    params.setdefault('search', 'all_docs')
    params.setdefault('description', "via API call")

    # FIX: a missing 'field' previously raised KeyError (HTTP 500); return a
    # 400 JSON error instead, matching api_hybrid_tagger's error style.
    if 'field' not in params:
        error = {'error': 'missing field parameter'}
        return HttpResponse(json.dumps(error), status=400,
                            content_type='application/json')

    # Parameter projection for the preprocessor task
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']

    # Select taggers: default to every completed TRAIN_TAGGER task
    taggers = params.get('taggers')
    if taggers is None:
        taggers = [tagger.id for tagger in
                   Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value)
                               .filter(status=Task.STATUS_COMPLETED)]
    params['text_tagger_taggers'] = taggers

    # Create the execution task and mark it queued for a worker to pick up
    description = params['description']
    task_id = create_task(task_type, description, params, user)
    task = Task.get_by_id(task_id)
    task.update_status(Task.STATUS_QUEUED)

    # Return reference to task
    data = {
        'task_id': task_id,
        'task_type': task_type,
        'status': task.status,
        'user': task.user.username
    }
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
def start_task(request):
    """Create a task from POST data and queue it.

    If the submitted ``task_type`` is not a recognized TaskTypes value, the
    task record is still created but immediately marked FAILED with an
    explanatory error, so the client can inspect it.
    """
    user = request.user
    task_type = request.POST['task_type']
    task_params = filter_params(request.POST)
    description = task_params['description']

    # Session values extend the submitted parameters.
    # Idiom fix: membership test directly on the session mapping instead of
    # materializing .keys() first.
    if 'dataset' in request.session:
        task_params['dataset'] = request.session['dataset']
    if 'model' in request.session:
        task_params['language_model'] = request.session['model']

    # Create execution task
    task_id = create_task(task_type, description, task_params, user)

    # Add task to queue (or fail it when the task type is unknown)
    task = Task.get_by_id(task_id)
    if not TaskTypes.hasValue(task_type):
        task.result = json.dumps({'error': '{} is not a proper Task Type value'.format(task_type)})
        task.update_status(Task.STATUS_FAILED, set_time_completed=True)
    else:
        task.update_status(Task.STATUS_QUEUED)
    return HttpResponse()
def start_fact_deleter_task(self, rm_facts_dict, doc_id=None):
    """Remove facts from documents by queueing a fact_deleter management task.

    Arguments:
        rm_facts_dict {Dict[str: List[str]]} -- Dict of fact values to remove
            Examples:
                General format - { 'factname1': ['factvalue1','factvalue2', ...]}
                Real example - {'CITY': ['tallinna', 'tallinn'], 'CAR': ['bmw', 'audi']}

    Keyword Arguments:
        doc_id {str} -- If present, deletes the facts only in a given document (default: {None})
    """
    task_type = TaskTypes.MANAGEMENT_TASK
    description = 'fact_manager_fact_deletion'
    # Parameters handed to the fact_deleter manager.
    params = {
        'fact_deleter_fact_values': rm_facts_dict,
        'fact_deleter_doc_id': doc_id,
        'task_type': task_type,
        'manager_key': ManagerKeys.FACT_DELETER,
        'description': description,
        'dataset': self.request.session['dataset'],
    }
    # Register the task, then mark it queued for a worker to pick up.
    new_task_id = create_task(task_type, description, params, self.request.user)
    queued_task = Task.objects.get(pk=new_task_id)
    queued_task.update_status(Task.STATUS_QUEUED)
def api_train_tagger(request, user, params):
    """Create and queue a TRAIN_TAGGER task; respond with a JSON task reference."""
    task_type = TaskTypes.TRAIN_TAGGER
    # Register the task, then mark it queued for a worker to pick up.
    task_id = create_task(task_type, params['description'], params, user)
    queued = Task.get_by_id(task_id)
    queued.update_status(Task.STATUS_QUEUED)
    # Build the JSON reference returned to the caller.
    payload = json.dumps({
        'task_id': task_id,
        'task_type': task_type,
        'status': queued.status,
        'user': queued.user.username,
    })
    return HttpResponse(payload, status=200, content_type='application/json')
def api_train_model(request, user, params):
    """Create and queue a TRAIN_MODEL task; respond with a JSON task reference."""
    task_type = TaskTypes.TRAIN_MODEL
    description = params['description']
    # Register the task and immediately queue it.
    task_id = create_task(task_type, description, params, user)
    queued = Task.get_by_id(task_id)
    queued.update_status(Task.STATUS_QUEUED)
    # Reference to the freshly queued task, serialized for the caller.
    reference = {
        'task_id': task_id,
        'task_type': task_type,
        'status': queued.status,
        'user': queued.user.username,
    }
    return HttpResponse(json.dumps(reference), status=200,
                        content_type='application/json')
def start_fact_adder_task(self, fact_name: str, fact_value: str, fact_field: str, doc_id: str, method: str, match_type: str, case_sens: bool):
    """Add custom facts to documents by queueing a fact_adder management task."""
    task_type = TaskTypes.MANAGEMENT_TASK.value
    description = 'fact_manager_fact_adding'
    # Parameters handed to the fact_adder manager: the fact definition
    # followed by the bookkeeping fields the task framework expects.
    params = {
        'fact_name': fact_name,
        'fact_value': fact_value,
        'fact_field': fact_field,
        'doc_id': doc_id,
        'method': method,
        'match_type': match_type,
        'case_sens': case_sens,
        'task_type': task_type,
        'manager_key': ManagerKeys.FACT_ADDER,
        'description': description,
        'dataset': self.request.session['dataset'],
    }
    # Register the task, then mark it queued for a worker to pick up.
    queued_task = Task.objects.get(
        pk=create_task(task_type, description, params, self.request.user))
    queued_task.update_status(Task.STATUS_QUEUED)
def api_mass_tagger(request, user, params):
    """Apply mass tagger (via auth_token); queues an APPLY_PREPROCESSOR task
    and returns a JSON reference to it."""
    # Fill in defaults for the optional parameters.
    params.setdefault('search', 'all_docs')
    params.setdefault('description', "via API call")

    # Project the request parameters onto the preprocessor task's schema.
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']

    # Tagger selection: when the caller names none, fall back to every
    # completed tagger-training task.
    chosen_taggers = params.get('taggers')
    if chosen_taggers is None:
        completed = Task.objects.filter(
            task_type=TaskTypes.TRAIN_TAGGER.value).filter(
            status=Task.STATUS_COMPLETED)
        chosen_taggers = [t.id for t in completed]
    params['text_tagger_taggers'] = chosen_taggers

    # Register the execution task and mark it queued.
    task_id = create_task(task_type, params['description'], params, user)
    queued = Task.get_by_id(task_id)
    queued.update_status(Task.STATUS_QUEUED)

    # Serialize a reference to the queued task for the caller.
    payload = json.dumps({
        'task_id': task_id,
        'task_type': task_type,
        'status': queued.status,
        'user': queued.user.username,
    })
    return HttpResponse(payload, status=200, content_type='application/json')
def api_hybrid_tagger(request, user, params):
    """Apply hybrid tagger (via auth_token).

    Finds documents similar to a stored search, tallies their TEXTA_TAG
    facts on ``field``, selects tags that are both frequent enough and in
    the top-N, and queues an APPLY_PREPROCESSOR task running the completed
    taggers whose description matches a selected tag. Returns a JSON
    explanation of the tag selection plus the task reference (or an error
    entry when no taggers qualify).
    """
    DEFAULT_TAGS_THRESHOLD = 50
    DEFAULT_MAX_TAGGERS = 20
    dataset_id = params['dataset']
    search = params['search']
    field = params['field']
    max_taggers = int(params.get('max_taggers', DEFAULT_MAX_TAGGERS))
    min_count_threshold = int(params.get('min_count_threshold', DEFAULT_TAGS_THRESHOLD))
    params.setdefault('description', "via API call")

    # Parameter projection for the preprocessor task
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        return HttpResponse(json.dumps(error), status=400,
                            content_type='application/json')

    param_query = json.loads(Search.objects.get(pk=int(search)).query)
    es_m = ds.build_manager(ES_Manager)
    es_m.load_combined_query(param_query)

    # Get similar documents in a neighborhood of size 1000
    response = es_m.more_like_this_search([field], search_size=1000)
    docs = response['hits']['hits']

    # Build tag frequency. Idiom fix: dict.get replaces the original's
    # manual initialise-then-increment dance.
    tag_freq = {}
    for doc in docs:
        for f in doc['_source'].get('texta_facts', []):
            if f['fact'] == 'TEXTA_TAG' and f['doc_path'] == field:
                doc_tag = f['str_val']
                tag_freq[doc_tag] = tag_freq.get(doc_tag, 0) + 1

    # Top tags (by count, descending) limit the number of taggers.
    # Idiom fix: build the set directly instead of list -> slice -> set.
    ranked = sorted(tag_freq.items(), key=lambda kv: kv[1], reverse=True)
    top_tags = {tag for tag, _ in ranked[:max_taggers]}

    # Perform tag selection; 'explain' records the decision for every tag.
    data = {'task': {}, 'explain': []}
    candidate_tags = set()
    for tag, count in tag_freq.items():
        selected = 1 if (count >= min_count_threshold and tag in top_tags) else 0
        if selected:
            candidate_tags.add(tag)
        data['explain'].append({'tag': tag, 'selected': selected, 'count': count})

    # Completed tagger-training tasks whose description matches a candidate tag
    tagger_search = Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value).filter(status=Task.STATUS_COMPLETED)
    taggers = [tagger.id for tagger in tagger_search if tagger.description in candidate_tags]

    # Create the task only when at least one tagger qualified.
    # Idiom fix: truthiness instead of len(taggers) > 0.
    if taggers:
        description = params['description']
        params['text_tagger_taggers'] = taggers
        # Create the execution task and queue it
        task_id = create_task(task_type, description, params, user)
        task = Task.get_by_id(task_id)
        task.update_status(Task.STATUS_QUEUED)
        # Return reference to the queued task
        data['task'] = {
            'task_id': task_id,
            'task_type': task_type,
            'status': task.status,
            'user': task.user.username
        }
    else:
        # If here, no taggers were selected
        data['task'] = {"error": "no similar documents have tags count above threshold"}

    # Generate response
    data['min_count_threshold'] = min_count_threshold
    data['max_taggers'] = max_taggers
    return HttpResponse(json.dumps(data), status=200,
                        content_type='application/json')
def api_hybrid_tagger(request, user, params):
    """Apply hybrid tagger (via auth_token): tally TEXTA_TAG facts on documents
    similar to a stored search, pick the frequent top tags, and queue the
    matching completed taggers as one preprocessor task."""
    DEFAULT_TAGS_THRESHOLD = 50
    DEFAULT_MAX_TAGGERS = 20
    dataset_id = params['dataset']
    search = params['search']
    field = params['field']
    max_taggers = int(params.get('max_taggers', DEFAULT_MAX_TAGGERS))
    min_count_threshold = int(params.get('min_count_threshold', DEFAULT_TAGS_THRESHOLD))
    if 'description' not in params:
        params['description'] = "via API call"

    # Project the request parameters onto the preprocessor task's schema.
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']

    # Activate the requested dataset; reject the call if the id is unknown.
    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    if not ds.is_active():
        error_json = json.dumps({'error': 'invalid dataset parameter'})
        return HttpResponse(error_json, status=400, content_type='application/json')

    # Replay the stored search and fetch up to 1000 similar documents.
    param_query = json.loads(Search.objects.get(pk=int(search)).query)
    es_m = ds.build_manager(ES_Manager)
    es_m.load_combined_query(param_query)
    response = es_m.more_like_this_search([field], search_size=1000)
    similar_docs = response['hits']['hits']

    # Count TEXTA_TAG facts on the requested field across the neighborhood.
    tag_freq = {}
    for doc in similar_docs:
        for fact in doc['_source'].get('texta_facts', []):
            if fact['fact'] == 'TEXTA_TAG' and fact['doc_path'] == field:
                tag_freq[fact['str_val']] = tag_freq.get(fact['str_val'], 0) + 1

    # Keep only the max_taggers most frequent tags.
    by_count_desc = sorted(tag_freq.items(), key=lambda item: item[1], reverse=True)
    top_tags = set(tag for tag, _ in by_count_desc[:max_taggers])

    # Decide per tag whether it qualifies, recording the decision for the caller.
    data = {'task': {}, 'explain': []}
    candidate_tags = set()
    for tag, count in tag_freq.items():
        is_selected = count >= min_count_threshold and tag in top_tags
        if is_selected:
            candidate_tags.add(tag)
        data['explain'].append({'tag': tag,
                                'selected': 1 if is_selected else 0,
                                'count': count})

    # Completed tagger-training tasks whose description names a candidate tag.
    completed_taggers = Task.objects.filter(
        task_type=TaskTypes.TRAIN_TAGGER.value).filter(
        status=Task.STATUS_COMPLETED)
    taggers = [t.id for t in completed_taggers if t.description in candidate_tags]

    if len(taggers) > 0:
        # Queue the preprocessor task and return a reference to it.
        params['text_tagger_taggers'] = taggers
        task_id = create_task(task_type, params['description'], params, user)
        queued = Task.get_by_id(task_id)
        queued.update_status(Task.STATUS_QUEUED)
        data['task'] = {
            'task_id': task_id,
            'task_type': task_type,
            'status': queued.status,
            'user': queued.user.username
        }
    else:
        # No tag cleared the threshold, so there is nothing to run.
        data['task'] = {"error": "no similar documents have tags count above threshold"}

    # Echo the effective selection parameters in the response.
    data['min_count_threshold'] = min_count_threshold
    data['max_taggers'] = max_taggers
    return HttpResponse(json.dumps(data), status=200, content_type='application/json')