Пример #1
0
def start_task(request):
    """Create a task from POST parameters and place it on the queue.

    Reads ``task_type`` plus the filtered POST parameters, folds the
    session's active dataset/model into the parameters when present,
    persists the task, and marks it QUEUED — or FAILED (with a JSON
    error result) when the requested type is not a valid TaskTypes value.

    Returns an empty HttpResponse; the outcome lives on the Task record.
    """
    user = request.user
    task_type = request.POST['task_type']
    task_params = filter_params(request.POST)

    description = task_params['description']
    # Membership test directly on the session mapping — .keys() is redundant.
    if 'dataset' in request.session:
        task_params['dataset'] = request.session['dataset']

    if 'model' in request.session:
        task_params['language_model'] = request.session['model']

    # Create execution task. NOTE: the record is persisted even for an
    # invalid task_type; it is immediately marked FAILED below so the bad
    # request remains visible.
    task_id = create_task(task_type, description, task_params, user)
    # Add task to queue
    task = Task.get_by_id(task_id)

    if not TaskTypes.hasValue(task_type):
        task.result = json.dumps({'error': '{} is not a proper Task Type value'.format(task_type)})
        task.update_status(Task.STATUS_FAILED, set_time_completed=True)
    else:
        task.update_status(Task.STATUS_QUEUED)

    return HttpResponse()
Пример #2
0
def api_mass_tagger(request, user, params):
    """Queue a mass-tagging run over the selected documents (auth_token API).

    Projects the request parameters onto an APPLY_PREPROCESSOR task that
    runs the "text_tagger" preprocessor, then returns a JSON reference to
    the queued task.
    """
    # Optional parameters fall back to sensible defaults.
    params.setdefault('search', 'all_docs')
    params.setdefault('description', "via API call")
    # Parameter projection for the preprocessor task.
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']
    # When no taggers were given, default to every completed trained tagger.
    taggers = params.get('taggers')
    if taggers is None:
        completed = Task.objects.filter(
            task_type=TaskTypes.TRAIN_TAGGER.value).filter(
                status=Task.STATUS_COMPLETED)
        taggers = [tagger.id for tagger in completed]
    params['text_tagger_taggers'] = taggers
    # Persist the task and move it onto the execution queue.
    description = params['description']
    task_id = create_task(task_type, description, params, user)
    task = Task.get_by_id(task_id)
    task.update_status(Task.STATUS_QUEUED)
    # Respond with a reference describing the queued task.
    payload = {
        'task_id': task_id,
        'task_type': task_type,
        'status': task.status,
        'user': task.user.username
    }
    return HttpResponse(json.dumps(payload), status=200, content_type='application/json')
Пример #3
0
def start_task(request):
    """Persist a task built from POST data and place it on the queue.

    Session-scoped dataset/model selections are folded into the task
    parameters when present. Unknown task types are recorded as FAILED
    with an explanatory JSON result instead of being queued.
    """
    requested_type = request.POST['task_type']
    parameters = filter_params(request.POST)
    summary = parameters['description']

    session = request.session
    if 'dataset' in session.keys():
        parameters['dataset'] = session['dataset']
    if 'model' in session.keys():
        parameters['language_model'] = session['model']

    # Persist the task, then fetch the stored record for status updates.
    new_id = create_task(requested_type, summary, parameters, request.user)
    stored = Task.get_by_id(new_id)

    if TaskTypes.hasValue(requested_type):
        stored.update_status(Task.STATUS_QUEUED)
    else:
        message = '{} is not a proper Task Type value'.format(requested_type)
        stored.result = json.dumps({'error': message})
        stored.update_status(Task.STATUS_FAILED, set_time_completed=True)

    return HttpResponse()
Пример #4
0
 def update_view(self, percentage):
     """Push a percentage progress update for this task.

     Raises:
         TaskCanceledException: if the task was canceled elsewhere.
     """
     task = Task.get_by_id(self.task_pk)
     # A canceled task must stop the worker immediately.
     if task.status == Task.STATUS_CANCELED:
         raise TaskCanceledException()
     task.status = Task.STATUS_RUNNING
     message = '{0:3.0f} %'.format(percentage)
     if self.step:
         message = '{1}: {0}'.format(message, self.step)
     task.update_progress(percentage, message)
Пример #5
0
 def update_view(self, percentage):
     """Report task progress as a percentage, aborting on cancellation.

     Raises:
         TaskCanceledException: if the task status is STATUS_CANCELED.
     """
     r = Task.get_by_id(self.task_pk)
     # Check if task was canceled
     if r.status == Task.STATUS_CANCELED:
         raise TaskCanceledException()
     r.status = Task.STATUS_RUNNING
     # Format e.g. ' 42 %'; prefix with the current step name when one is set.
     progress_message = '{0:3.0f} %'.format(percentage)
     if self.step:
         progress_message = '{1}: {0}'.format(progress_message, self.step)
     r.update_progress(percentage, progress_message)
Пример #6
0
 def update_view(self):
     """Advance the progress view to the current step.

     Raises:
         TaskCanceledException: if the task was canceled elsewhere.
     """
     step_index = self.n_step
     percentage = (100.0 * step_index) / self.n_total
     task = Task.get_by_id(self.model_pk)
     # Stop immediately when the task was canceled.
     if task.status == Task.STATUS_CANCELED:
         raise TaskCanceledException()
     task.status = Task.STATUS_RUNNING
     message = '{0} [{1}/{2}]'.format(self.step_messages[step_index],
                                      step_index + 1, self.n_total)
     task.update_progress(percentage, message)
Пример #7
0
 def update_view(self):
     """Report progress for the current step of a multi-step task.

     Computes the percentage from ``self.n_step`` / ``self.n_total`` and
     shows the step message with a 1-based [step/total] counter.

     Raises:
         TaskCanceledException: if the task status is STATUS_CANCELED.
     """
     i = self.n_step
     percentage = (100.0 * i) / self.n_total
     r = Task.get_by_id(self.model_pk)
     # Check if task was canceled
     if r.status == Task.STATUS_CANCELED:
         raise TaskCanceledException()
     r.status = Task.STATUS_RUNNING
     progress_message = '{0} [{1}/{2}]'.format(self.step_messages[i], i + 1,
                                               self.n_total)
     r.update_progress(percentage, progress_message)
Пример #8
0
def api_get_task_status(request, user, params):
    """Return the JSON status of the task identified by ``params['task_id']``.

    Responds 200 with the task's JSON representation, or 400 with an
    error payload when no task exists for the given (or missing) id.
    """
    task_id = params.get('task_id', None)
    # Keep the try body minimal: only the lookup can raise DoesNotExist.
    try:
        task = Task.get_by_id(task_id)
    except Task.DoesNotExist:
        error = {'error': 'task id is not valid'}
        return HttpResponse(json.dumps(error), status=400,
                            content_type='application/json')
    data_json = json.dumps(task.to_json())
    return HttpResponse(data_json, status=200, content_type='application/json')
Пример #9
0
def api_get_task_status(request, user, params):
    """Look up a task by id and return its status as JSON (400 if invalid)."""
    requested_id = params.get('task_id', None)
    try:
        found = Task.get_by_id(requested_id)
        payload = json.dumps(found.to_json())
        return HttpResponse(payload,
                            status=200,
                            content_type='application/json')
    except Task.DoesNotExist:
        # Unknown task id — report a client error.
        payload = json.dumps({'error': 'task id is not valid'})
        return HttpResponse(payload,
                            status=400,
                            content_type='application/json')
Пример #10
0
def api_train_tagger(request, user, params):
    """Queue a TRAIN_TAGGER task and return a JSON reference to it."""
    task_type = TaskTypes.TRAIN_TAGGER
    description = params['description']
    # Persist the task and move it onto the execution queue.
    task_id = create_task(task_type, description, params, user)
    task = Task.get_by_id(task_id)
    task.update_status(Task.STATUS_QUEUED)
    # Hand back a JSON reference describing the queued task.
    reference = json.dumps({
        'task_id': task_id,
        'task_type': task_type,
        'status': task.status,
        'user': task.user.username
    })
    return HttpResponse(reference, status=200, content_type='application/json')
Пример #11
0
def api_train_model(request, user, params):
    """Queue a TRAIN_MODEL task and return a JSON reference to it."""
    kind = TaskTypes.TRAIN_MODEL
    summary = params['description']
    # Persist the task, then mark the stored record as queued.
    new_task_id = create_task(kind, summary, params, user)
    queued = Task.get_by_id(new_task_id)
    queued.update_status(Task.STATUS_QUEUED)
    # Build the response describing the queued task.
    response_body = {
        'task_id': new_task_id,
        'task_type': kind,
        'status': queued.status,
        'user': queued.user.username
    }
    return HttpResponse(json.dumps(response_body), status=200,
                        content_type='application/json')
Пример #12
0
def api_mass_tagger(request, user, params):
    """ Apply mass tagger (via auth_token)

    Projects the request parameters onto an APPLY_PREPROCESSOR task running
    the "text_tagger" preprocessor, queues it, and returns a JSON reference
    to the created task.
    """
    # Get parameters with default values
    if 'search' not in params:
        params['search'] = 'all_docs'
    if 'description' not in params:
        params['description'] = "via API call"
    # Parameter projection for preprocessor task
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']
    # Select taggers; default to every completed trained tagger when the
    # caller did not specify any.
    taggers = params.get('taggers', None)
    if taggers is None:
        taggers = [
            tagger.id for tagger in Task.objects.filter(
                task_type=TaskTypes.TRAIN_TAGGER.value).filter(
                    status=Task.STATUS_COMPLETED)
        ]
    params['text_tagger_taggers'] = taggers
    # Prepare description
    description = params['description']
    # Create execution task
    task_id = create_task(task_type, description, params, user)
    # Add task to queue
    task = Task.get_by_id(task_id)
    task.update_status(Task.STATUS_QUEUED)
    # Return reference to task
    data = {
        'task_id': task_id,
        'task_type': task_type,
        'status': task.status,
        'user': task.user.username
    }
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
Пример #13
0
    def schedule_tasks(self, selected_tags, normalizer_opt, classifier_opt, reductor_opt, extractor_opt, field, dataset_id, user):
        """Re-queue existing tagger models and create new ones for the given tags.

        Existing TRAIN_TAGGER tasks whose tag appears in *selected_tags* with
        at least MIN_DOCS_TAGGED tagged documents are re-queued for retraining
        (with a refreshed search query); tags without an existing model get a
        brand-new TRAIN_TAGGER task built from the supplied pipeline options.

        Returns:
            dict: {'retrain_models': [...], 'new_models': [...]}, each entry a
            {'task_id': ..., 'tag': ...} reference.
        """

        tag_frequency = self.get_tag_frequency(selected_tags)
        retrain_tasks = []
        # Get list of available models
        task_tagger_list = [tagger for tagger in Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value)]
        task_tagger_tag_set = set([tagger.description for tagger in task_tagger_list])

        for task_tagger in task_tagger_list:

            # Get tag label (tagger tasks carry their tag in the description)
            tag_label = task_tagger.description
            # Filter models that are not included in the tag_frequency map
            if tag_label not in tag_frequency:
                continue
            # Filter models with less than MIN_DOCS_TAGGED docs
            if tag_frequency.get(tag_label, 0) < MIN_DOCS_TAGGED:
                continue
            # Filter running tasks
            if task_tagger.is_running():
                continue
            # If here, retrain model with tags (re-queue task)
            task_id = task_tagger.pk
            retrain_task = {'task_id': task_id, 'tag': tag_label}
            retrain_tasks.append(retrain_task)

            # Update query parameter from task before re-queueing it
            tag_parameters = json.loads(task_tagger.parameters)
            self._add_search_tag_query(tag_parameters, tag_label)
            task_tagger.parameters = json.dumps(tag_parameters)
            task_tagger.requeue_task()

        new_model_tasks = []
        for tag_label in selected_tags:
            # Check if it is a new model (skip tags that already have one)
            if tag_label in task_tagger_tag_set:
                continue
            # Filter models with less than MIN_DOCS_TAGGED docs
            if tag_frequency.get(tag_label, 0) < MIN_DOCS_TAGGED:
                continue
            # Build task parameters
            task_param = {}
            task_param["description"] = tag_label
            task_param["normalizer_opt"] = normalizer_opt
            task_param["classifier_opt"] = classifier_opt
            task_param["reductor_opt"] = reductor_opt
            task_param["extractor_opt"] = extractor_opt
            task_param["field"] = field
            task_param["dataset"] = dataset_id
            self._add_search_tag_query(task_param, tag_label)
            # Create execution task
            task_type = TaskTypes.TRAIN_TAGGER
            task_id = create_task(task_type, tag_label, task_param, user)
            # Add task to queue
            task = Task.get_by_id(task_id)
            task.update_status(Task.STATUS_QUEUED)
            # Add task id to response
            new_model_task = {'task_id': task_id, 'tag': tag_label}
            new_model_tasks.append(new_model_task)

        data = {'retrain_models': retrain_tasks, 'new_models': new_model_tasks}
        return data
Пример #14
0
def api_hybrid_tagger(request, user, params):
    """ Apply hybrid tagger (via auth_token)

    Finds documents similar to the given search, counts TEXTA_TAG facts on
    the chosen field, selects the most frequent tags above a threshold, and
    queues an APPLY_PREPROCESSOR task using the matching completed taggers.
    Responds with the per-tag selection explanation and a task reference
    (or an error entry when no tagger qualifies).
    """
    DEFAULT_TAGS_THRESHOLD = 50
    DEFAULT_MAX_TAGGERS = 20

    dataset_id = params['dataset']
    search = params['search']
    field = params['field']
    max_taggers = int(params.get('max_taggers', DEFAULT_MAX_TAGGERS))
    min_count_threshold = int(params.get('min_count_threshold', DEFAULT_TAGS_THRESHOLD))

    if 'description' not in params:
        params['description'] = "via API call"
    # Parameter projection for preprocessor task
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    param_query = json.loads(Search.objects.get(pk=int(search)).query)
    es_m = ds.build_manager(ES_Manager)
    es_m.load_combined_query(param_query)
    # Get similar documents in a neighborhood of size 1000
    response = es_m.more_like_this_search([field], search_size=1000)
    docs = response['hits']['hits']
    # Build tag frequency from TEXTA_TAG facts on the requested field
    tag_freq = {}
    for doc in docs:
        for f in doc['_source'].get('texta_facts', []):
            if f['fact'] == 'TEXTA_TAG' and f['doc_path'] == field:
                doc_tag = f['str_val']
                if doc_tag not in tag_freq:
                    tag_freq[doc_tag] = 0
                tag_freq[doc_tag] += 1

    # Top tags (by descending frequency) to limit the number of taggers
    top_tags = [t[0] for t in sorted(tag_freq.items(), key=lambda x: x[1], reverse=True)]
    top_tags = set(top_tags[:max_taggers])
    # Perform tag selection; 'explain' records why each tag was (not) chosen
    data = {'task': {}, 'explain': []}
    candidate_tags = set()
    for tag in tag_freq:
        selected = 0
        count = tag_freq[tag]
        if count >= min_count_threshold and tag in top_tags:
            selected = 1
            candidate_tags.add(tag)
        data['explain'].append({'tag': tag,
                                'selected': selected,
                                'count': count })
    # Keep only completed trained taggers whose description is a candidate tag
    tagger_search = Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value).filter(status=Task.STATUS_COMPLETED)
    taggers = [tagger.id for tagger in tagger_search if tagger.description in candidate_tags]
    # Create Task if taggers is not zero
    if len(taggers) > 0:
        description = params['description']
        params['text_tagger_taggers'] = taggers
        # Create execution task
        task_id = create_task(task_type, description, params, user)
        # Add task to queue
        task = Task.get_by_id(task_id)
        task.update_status(Task.STATUS_QUEUED)
        # Return reference to task
        data['task'] = {
            'task_id': task_id,
            'task_type': task_type,
            'status': task.status,
            'user': task.user.username
        }
    else:
        # If here, no taggers were selected
        data['task'] = {"error": "no similar documents have tags count above threshold"}
    # Generate response
    data['min_count_threshold'] = min_count_threshold
    data['max_taggers'] = max_taggers
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
Пример #15
0
    def schedule_tasks(self, selected_tags, normalizer_opt, classifier_opt,
                       reductor_opt, extractor_opt, field, dataset_id, user):
        """Re-queue existing tagger models and create new ones for the given tags.

        Existing TRAIN_TAGGER tasks whose tag appears in *selected_tags* with
        at least MIN_DOCS_TAGGED tagged documents are re-queued for retraining
        (with a refreshed search query); tags without an existing model get a
        brand-new TRAIN_TAGGER task built from the supplied pipeline options.

        Returns:
            dict: {'retrain_models': [...], 'new_models': [...]}, each entry a
            {'task_id': ..., 'tag': ...} reference.
        """

        tag_frequency = self.get_tag_frequency(selected_tags)
        retrain_tasks = []
        # Get list of available models
        task_tagger_list = [
            tagger for tagger in Task.objects.filter(
                task_type=TaskTypes.TRAIN_TAGGER.value)
        ]
        task_tagger_tag_set = set(
            [tagger.description for tagger in task_tagger_list])

        for task_tagger in task_tagger_list:

            # Get tag label (tagger tasks carry their tag in the description)
            tag_label = task_tagger.description
            # Filter models that are not included in the tag_frequency map
            if tag_label not in tag_frequency:
                continue
            # Filter models with less than MIN_DOCS_TAGGED docs
            if tag_frequency.get(tag_label, 0) < MIN_DOCS_TAGGED:
                continue
            # Filter running tasks
            if task_tagger.is_running():
                continue
            # If here, retrain model with tags (re-queue task)
            task_id = task_tagger.pk
            retrain_task = {'task_id': task_id, 'tag': tag_label}
            retrain_tasks.append(retrain_task)

            # Update query parameter from task before re-queueing it
            tag_parameters = json.loads(task_tagger.parameters)
            self._add_search_tag_query(tag_parameters, tag_label)
            task_tagger.parameters = json.dumps(tag_parameters)
            task_tagger.requeue_task()

        new_model_tasks = []
        for tag_label in selected_tags:
            # Check if it is a new model (skip tags that already have one)
            if tag_label in task_tagger_tag_set:
                continue
            # Filter models with less than MIN_DOCS_TAGGED docs
            if tag_frequency.get(tag_label, 0) < MIN_DOCS_TAGGED:
                continue
            # Build task parameters
            task_param = {}
            task_param["description"] = tag_label
            task_param["normalizer_opt"] = normalizer_opt
            task_param["classifier_opt"] = classifier_opt
            task_param["reductor_opt"] = reductor_opt
            task_param["extractor_opt"] = extractor_opt
            task_param["field"] = field
            task_param["dataset"] = dataset_id
            self._add_search_tag_query(task_param, tag_label)
            # Create execution task
            task_type = TaskTypes.TRAIN_TAGGER
            task_id = create_task(task_type, tag_label, task_param, user)
            # Add task to queue
            task = Task.get_by_id(task_id)
            task.update_status(Task.STATUS_QUEUED)
            # Add task id to response
            new_model_task = {'task_id': task_id, 'tag': tag_label}
            new_model_tasks.append(new_model_task)

        data = {'retrain_models': retrain_tasks, 'new_models': new_model_tasks}
        return data
Пример #16
0
def api_hybrid_tagger(request, user, params):
    """ Apply hybrid tagger (via auth_token)

    Finds documents similar to the given search, counts TEXTA_TAG facts on
    the chosen field, selects the most frequent tags above a threshold, and
    queues an APPLY_PREPROCESSOR task using the matching completed taggers.
    Responds with the per-tag selection explanation and a task reference
    (or an error entry when no tagger qualifies).
    """
    DEFAULT_TAGS_THRESHOLD = 50
    DEFAULT_MAX_TAGGERS = 20

    dataset_id = params['dataset']
    search = params['search']
    field = params['field']
    max_taggers = int(params.get('max_taggers', DEFAULT_MAX_TAGGERS))
    min_count_threshold = int(
        params.get('min_count_threshold', DEFAULT_TAGS_THRESHOLD))

    if 'description' not in params:
        params['description'] = "via API call"
    # Parameter projection for preprocessor task
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json,
                            status=400,
                            content_type='application/json')

    param_query = json.loads(Search.objects.get(pk=int(search)).query)
    es_m = ds.build_manager(ES_Manager)
    es_m.load_combined_query(param_query)
    # Get similar documents in a neighborhood of size 1000
    response = es_m.more_like_this_search([field], search_size=1000)
    docs = response['hits']['hits']
    # Build tag frequency from TEXTA_TAG facts on the requested field
    tag_freq = {}
    for doc in docs:
        for f in doc['_source'].get('texta_facts', []):
            if f['fact'] == 'TEXTA_TAG' and f['doc_path'] == field:
                doc_tag = f['str_val']
                if doc_tag not in tag_freq:
                    tag_freq[doc_tag] = 0
                tag_freq[doc_tag] += 1

    # Top tags (by descending frequency) to limit the number of taggers
    top_tags = [
        t[0]
        for t in sorted(tag_freq.items(), key=lambda x: x[1], reverse=True)
    ]
    top_tags = set(top_tags[:max_taggers])
    # Perform tag selection; 'explain' records why each tag was (not) chosen
    data = {'task': {}, 'explain': []}
    candidate_tags = set()
    for tag in tag_freq:
        selected = 0
        count = tag_freq[tag]
        if count >= min_count_threshold and tag in top_tags:
            selected = 1
            candidate_tags.add(tag)
        data['explain'].append({
            'tag': tag,
            'selected': selected,
            'count': count
        })
    # Keep only completed trained taggers whose description is a candidate tag
    tagger_search = Task.objects.filter(
        task_type=TaskTypes.TRAIN_TAGGER.value).filter(
            status=Task.STATUS_COMPLETED)
    taggers = [
        tagger.id for tagger in tagger_search
        if tagger.description in candidate_tags
    ]
    # Create Task if taggers is not zero
    if len(taggers) > 0:
        description = params['description']
        params['text_tagger_taggers'] = taggers
        # Create execution task
        task_id = create_task(task_type, description, params, user)
        # Add task to queue
        task = Task.get_by_id(task_id)
        task.update_status(Task.STATUS_QUEUED)
        # Return reference to task
        data['task'] = {
            'task_id': task_id,
            'task_type': task_type,
            'status': task.status,
            'user': task.user.username
        }
    else:
        # If here, no taggers were selected
        data['task'] = {
            "error": "no similar documents have tags count above threshold"
        }
    # Generate response
    data['min_count_threshold'] = min_count_threshold
    data['max_taggers'] = max_taggers
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')