def _validate_task(self, key, parsed_data):
        """ Validate parsed data with labeling config and task structure
        """
        is_list = isinstance(parsed_data, list)
        # we support only one task per JSON file
        if not (is_list and len(parsed_data) == 1
                or isinstance(parsed_data, dict)):
            raise TaskValidationError(
                'Error at ' + key + ':\n'
                'Cloud storages support only one task per JSON file. '
                'Task must be {} or [{}] with length = 1')

        # classic validation for one task
        validator = TaskValidator(self.project)
        try:
            new_tasks = validator.to_internal_value(
                parsed_data if is_list else [parsed_data])
        except TaskValidationError as e:
            # pretty format of errors
            messages = e.msg_to_list()
            out = [(key + ' :: ' + msg) for msg in messages]
            out = "\n".join(out)
            raise TaskValidationError(out)

        return new_tasks[0]
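A minimal sketch of the shape check above, for reference (the payloads are illustrative, not from the source): a cloud-storage JSON file must hold exactly one task, either as a plain dict or as a one-element list.

# Illustrative only - mirrors the condition used in _validate_task()
valid_single = {"data": {"image": "s3://bucket/1.jpg"}}        # accepted: plain dict
valid_wrapped = [{"data": {"image": "s3://bucket/1.jpg"}}]     # accepted: list of length 1
invalid_many = [{"data": {"image": "a.jpg"}}, {"data": {"image": "b.jpg"}}]  # rejected

def is_single_task(parsed_data):
    is_list = isinstance(parsed_data, list)
    return isinstance(parsed_data, dict) or (is_list and len(parsed_data) == 1)

assert is_single_task(valid_single)
assert is_single_task(valid_wrapped)
assert not is_single_task(invalid_many)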
Example #2
def api_import():
    project = project_get_or_create()

    # make django compatibility for uploader module
    class DjangoRequest:
        POST = request.form
        GET = request.args
        FILES = request.files
        data = request.json if request.json else request.form
        content_type = request.content_type

    start = time.time()
    # get tasks from request
    parsed_data = uploader.load_tasks(DjangoRequest())
    # validate tasks
    validator = TaskValidator(project)
    try:
        new_tasks = validator.to_internal_value(parsed_data)
    except ValidationError as e:
        return make_response(jsonify(e.msg_to_list()),
                             status.HTTP_400_BAD_REQUEST)

    # save task file to input dir
    if os.path.isdir(project.config['input_path']):
        # tasks are stored in a directory: write the new tasks to a new file
        task_dir = project.config['input_path']
        now = datetime.now()
        data = json.dumps(new_tasks, ensure_ascii=False)
        md5 = hashlib.md5(json.dumps(data).encode('utf-8')).hexdigest()
        name = 'import-' + now.strftime('%Y-%m-%d-%H-%M') + '-' + str(md5[0:8])
        path = os.path.join(task_dir, name + '.json')
        tasks = new_tasks
    else:
        # tasks are all in one file: append to it
        path = project.config['input_path']
        old_tasks = json.load(open(path))
        assert isinstance(old_tasks,
                          list), 'Tasks from input_path must be list'
        tasks = old_tasks + new_tasks
        logger.error("It's recommended to use directory as input_path: " +
                     project.config['input_path'] + ' -> ' +
                     os.path.dirname(project.config['input_path']))

    with open(path, 'w') as f:
        json.dump(tasks, f, ensure_ascii=False, indent=4)

    # load new tasks
    project.reload()

    duration = time.time() - start
    return make_response(
        jsonify({
            'task_count': len(new_tasks),
            'completion_count': validator.completion_count,
            'prediction_count': validator.prediction_count,
            'duration': duration
        }), status.HTTP_201_CREATED)
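A hedged usage sketch of how a client might call this endpoint with `requests`; the route "/api/import" and the port are assumptions, since the snippet does not show how the Flask view is registered.

# Assumed route and host - adjust to how api_import() is actually registered.
import requests

tasks = [
    {"data": {"text": "first task"}},
    {"data": {"text": "second task"}},
]
resp = requests.post("http://localhost:8080/api/import", json=tasks)
print(resp.status_code)  # 201 on success, 400 on validation errors
print(resp.json())       # {'task_count': ..., 'completion_count': ..., 'prediction_count': ..., 'duration': ...}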
Example #3
def api_import():
    """ The main API for task import, supports
        * json task data
        * files (as web form, files will be hosted by this flask server)
        * url links to images, audio, csv (if you use TimeSeries in labeling config)
    """
    # make django compatibility for uploader module
    class DjangoRequest:
        def __init__(self): pass
        POST = request.form
        GET = request.args
        FILES = request.files
        data = request.json if request.json else request.form
        content_type = request.content_type

    start = time.time()
    # get tasks from request
    parsed_data, formats = uploader.load_tasks(DjangoRequest(), g.project)
    # validate tasks
    validator = TaskValidator(g.project)
    try:
        new_tasks = validator.to_internal_value(parsed_data)
    except ValidationError as e:
        return make_response(jsonify(e.msg_to_list()), status.HTTP_400_BAD_REQUEST)

    # get the last task id
    max_id_in_old_tasks = -1
    if not g.project.no_tasks():
        max_id_in_old_tasks = g.project.source_storage.max_id()

    new_tasks = Tasks().from_list_of_dicts(new_tasks, max_id_in_old_tasks + 1)
    try:
        g.project.source_storage.set_many(new_tasks.keys(), new_tasks.values())
    except NotImplementedError:
        raise NotImplementedError('Import is not supported for the current storage ' + str(g.project.source_storage))

    # if tasks have completions, we need to implicitly save them to the target storage
    for i in new_tasks.keys():
        for completion in new_tasks[i].get('completions', []):
            g.project.save_completion(int(i), completion)

    # update schemas based on newly uploaded tasks
    g.project.update_derived_input_schema()
    g.project.update_derived_output_schema()

    duration = time.time() - start
    return make_response(jsonify({
        'task_count': len(new_tasks),
        'completion_count': validator.completion_count,
        'prediction_count': validator.prediction_count,
        'duration': duration,
        'formats': formats,
        'new_task_ids': [t for t in new_tasks]
    }), status.HTTP_201_CREATED)
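The id handling above relies on Tasks().from_list_of_dicts(new_tasks, max_id_in_old_tasks + 1). A rough stand-in for the behavior it is used for here - assigning sequential integer ids starting right after the largest existing id - follows; it is only an illustration under that assumption, not the library's implementation.

def from_list_of_dicts_sketch(task_list, start_id=0):
    # hypothetical equivalent: key each incoming task by a sequential integer id
    return {start_id + i: task for i, task in enumerate(task_list)}

tasks_by_id = from_list_of_dicts_sketch(
    [{"data": {"text": "a"}}, {"data": {"text": "b"}}], start_id=5)
# -> {5: {'data': {'text': 'a'}}, 6: {'data': {'text': 'b'}}}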
Example #4
def api_import():
    project = project_get_or_create()

    # make django compatibility for uploader module
    class DjangoRequest:
        POST = request.form
        GET = request.args
        FILES = request.files
        data = request.json if request.json else request.form
        content_type = request.content_type

    start = time.time()
    # get tasks from request
    parsed_data = uploader.load_tasks(DjangoRequest(), project)
    # validate tasks
    validator = TaskValidator(project)
    try:
        new_tasks = validator.to_internal_value(parsed_data)
    except ValidationError as e:
        return make_response(jsonify(e.msg_to_list()),
                             status.HTTP_400_BAD_REQUEST)

    # tasks are all in one file: append to it
    path = project.config['input_path']
    old_tasks = json.load(open(path))
    max_id_in_old_tasks = int(max(map(int,
                                      old_tasks.keys()))) if old_tasks else -1

    new_tasks = Tasks().from_list_of_dicts(new_tasks, max_id_in_old_tasks + 1)
    old_tasks.update(new_tasks)

    with open(path, 'w') as f:
        json.dump(old_tasks, f, ensure_ascii=False, indent=4)

    # load new tasks and everything related
    project.load_tasks()
    project.load_derived_schemas()

    duration = time.time() - start
    return make_response(
        jsonify({
            'task_count': len(new_tasks),
            'completion_count': validator.completion_count,
            'prediction_count': validator.prediction_count,
            'duration': duration,
            'new_task_ids': [t for t in new_tasks]
        }), status.HTTP_201_CREATED)
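In this variant all tasks live in a single JSON dict keyed by task id, so the next id comes from the largest existing key. A small illustration of that computation (the sample data is hypothetical; json.load yields string keys, hence the int conversion):

old_tasks = {"0": {"data": {"text": "a"}}, "3": {"data": {"text": "b"}}}
max_id_in_old_tasks = int(max(map(int, old_tasks.keys()))) if old_tasks else -1  # -> 3
# new tasks are then numbered from max_id_in_old_tasks + 1, i.e. starting at 4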
Example #5
def api_import():
    # make django compatibility for uploader module
    class DjangoRequest:
        POST = request.form
        GET = request.args
        FILES = request.files
        data = request.json if request.json else request.form
        content_type = request.content_type

    start = time.time()
    # get tasks from request
    parsed_data, formats = uploader.load_tasks(DjangoRequest(), g.project)
    # validate tasks
    validator = TaskValidator(g.project)
    try:
        new_tasks = validator.to_internal_value(parsed_data)
    except ValidationError as e:
        return make_response(jsonify(e.msg_to_list()),
                             status.HTTP_400_BAD_REQUEST)

    max_id_in_old_tasks = -1
    if not g.project.no_tasks():
        max_id_in_old_tasks = g.project.source_storage.max_id()

    new_tasks = Tasks().from_list_of_dicts(new_tasks, max_id_in_old_tasks + 1)
    g.project.source_storage.set_many(new_tasks.keys(), new_tasks.values())

    # if tasks have completions, we need to implicitly save them to the target storage
    for i in new_tasks.keys():
        for completion in new_tasks[i].get('completions', []):
            g.project.save_completion(int(i), completion)

    # update schemas based on newly uploaded tasks
    g.project.update_derived_input_schema()
    g.project.update_derived_output_schema()

    duration = time.time() - start
    return make_response(
        jsonify({
            'task_count': len(new_tasks),
            'completion_count': validator.completion_count,
            'prediction_count': validator.prediction_count,
            'duration': duration,
            'formats': formats,
            'new_task_ids': [t for t in new_tasks]
        }), status.HTTP_201_CREATED)
Example #6
    def _update(self):
        if self.filelist:
            self.tasks, found_formats, self.data_keys = self._read_tasks()

            self._raise_if_inconsistent_with_current_project()

            if not self.found_formats:
                # first time we see the formats: remember them all
                self.found_formats = found_formats
            if self.selected_formats is None:
                # nothing selected yet: select every found format by default
                self.selected_formats, self.selected_objects = [], []
                for format in sorted(found_formats.keys()):
                    self.selected_formats.append(format)

            self.selected_objects = self._get_selected_objects()
            self.show_files_as_tasks_list = self._show_files_as_tasks_list()

        # validate tasks
        self._validator = TaskValidator(self.project)
        self.tasks = self._validator.to_internal_value(self.tasks)
Example #7
class ImportState(object):

    # TODO: define SQLAlchemy declarative_base()
    _db = {}

    object_to_formats, format_to_object = read_object_formats()
    AMBIGUOUS_TASKS_LIST_FORMATS = {'csv', 'tsv', 'txt'}

    def __init__(self, filelist=(), tasks=(), project=None, **kwargs):
        super(ImportState, self).__init__(**kwargs)

        # these are actual db columns
        self.id = 0
        self.reset()
        self.project = project
        self.filelist = filelist
        self.tasks = tasks
        self.preview_size = 10

        self._validator = None

        if project and (filelist or tasks):
            self._update()

    def reset(self):
        self.project = None
        self.filelist = ()
        self.tasks = ()
        self.found_formats = {}
        self.selected_formats = None
        self.selected_objects = None
        self.columns_to_draw = []
        self.data_keys = []
        self.files_as_tasks_list = {'type': None, 'selected': True}
        self.show_files_as_tasks_list = None

    def serialize(self):
        return {
            'id': self.id,
            'project': self.project.name,
            'task_preview': self.tasks_preview,
            'columns_to_draw': self.columns_to_draw,
            'total_tasks': self.total_tasks,
            'total_completions': self.total_completions,
            'total_predictions': self.total_predictions,
            'found_formats': self.found_formats,
            'selected_formats': self.selected_formats,
            'selected_objects': self.selected_objects,
            'files_as_tasks_list': self.files_as_tasks_list,
            'show_files_as_tasks_list': self.show_files_as_tasks_list
        }

    def _get_selected_objects(self):
        objects = []
        for format in self.selected_formats:
            normalized_format = format.lower().lstrip('.')
            if (self.files_as_tasks_list['selected']
                    and normalized_format in self.AMBIGUOUS_TASKS_LIST_FORMATS):
                objects.append(None)
            else:
                objects.append(self.format_to_object.get(normalized_format))
        return objects

    def _show_files_as_tasks_list(self):
        for format in self.selected_formats:
            norm_format = format.lower().lstrip('.')
            if norm_format in self.AMBIGUOUS_TASKS_LIST_FORMATS:
                return True
        return False

    def _generate_label_config(self):
        # TODO: this is a temp workaround to guess initial config - we should make it prettier
        data_keys = list(self.project.derived_input_schema)
        if len(data_keys) > 1:
            # better to use Table here
            return '<View></View>'
        if len(data_keys) == 1:
            data_key = data_keys[0]
            objects = set(self.selected_objects) if self.selected_objects else [None]
            if len(objects) > 1:
                raise ValidationError('More than one data type is present')
            object_tag = list(objects)[0]
            if not object_tag:
                return '<View></View>'
            if data_key == Settings.UPLOAD_DATA_UNDEFINED_NAME:
                data_key = object_tag.lower()
            return '<View><{0} name="{1}" value="${2}"/></View>'.format(
                object_tag, object_tag.lower(), data_key)

    def _read_tasks(self, num_tasks=None):
        request_files = {}
        for filename in self.filelist:
            request_files[filename] = open(self.project.upload_dir + '/' +
                                           filename,
                                           mode='rb')
        with get_temp_dir() as tmpdir:
            files = aggregate_files(request_files, tmpdir,
                                    self.project.upload_dir)
            tasks, found_formats, data_keys = aggregate_tasks(
                files, self.project, self.selected_formats,
                self.files_as_tasks_list['selected'], num_tasks)
            for file in files.values():
                try:
                    file.close()
                except Exception:
                    # best effort: the file handle may already be closed
                    pass
        return tasks, found_formats, data_keys

    def _raise_if_inconsistent_with_current_project(self):
        project_data_keys = self.project.data_keys
        if project_data_keys:
            import_data_keys = list(
                filter(lambda k: k != Settings.UPLOAD_DATA_UNDEFINED_NAME,
                       self.data_keys))
            if import_data_keys and import_data_keys != project_data_keys:
                raise ValidationError(
                    "Import data inconsistent with current project:\n"
                    "Imported column names {}\nare inconsistent with common columns found in dataset: {}"
                    .format(','.join(import_data_keys),
                            ','.join(project_data_keys)))

    def _update(self):
        if self.filelist:
            self.tasks, found_formats, self.data_keys = self._read_tasks()

            self._raise_if_inconsistent_with_current_project()

            if not self.found_formats:
                # first time we see the formats: remember them all
                self.found_formats = found_formats
            if self.selected_formats is None:
                # nothing selected yet: select every found format by default
                self.selected_formats, self.selected_objects = [], []
                for format in sorted(found_formats.keys()):
                    self.selected_formats.append(format)

            self.selected_objects = self._get_selected_objects()
            self.show_files_as_tasks_list = self._show_files_as_tasks_list()

        # validate tasks
        self._validator = TaskValidator(self.project)
        self.tasks = self._validator.to_internal_value(self.tasks)

    def apply(self):
        # get the last task id
        max_id_in_old_tasks = -1
        if not self.project.no_tasks():
            max_id_in_old_tasks = self.project.source_storage.max_id()

        # now read all tasks
        # currently self._update() reads all tasks - uncomment this on change
        # all_tasks, _, _ = self._read_tasks()
        all_tasks = self.tasks

        new_tasks = Tasks().from_list_of_dicts(all_tasks,
                                               max_id_in_old_tasks + 1)
        try:
            self.project.source_storage.set_many(new_tasks.keys(),
                                                 new_tasks.values())
        except NotImplementedError:
            raise NotImplementedError(
                'Import is not supported for the current storage, '
                'change the storage type in the project settings: ' +
                str(self.project.source_storage))

        # if tasks have completions, we need to implicitly save them to the target storage
        for i in new_tasks.keys():
            for completion in new_tasks[i].get('completions', []):
                self.project.save_completion(int(i), completion)

        # update schemas based on newly uploaded tasks
        self.project.update_derived_input_schema()
        self.project.update_derived_output_schema()

        if self.project.label_config_is_empty:
            generated_label_config = self._generate_label_config()
            self.project.update_label_config(generated_label_config)
        return new_tasks

    @property
    def tasks_preview(self):
        preview = []
        for task in self.tasks[:self.preview_size]:
            t = deepcopy(task['data'])
            if 'completions' in task:
                t['completions'] = task['completions']
            if 'predictions' in task:
                t['predictions'] = task['predictions']
            preview.append(t)
        return preview

    @property
    def total_tasks(self):
        return len(self.tasks)

    @property
    def total_completions(self):
        return self._validator.completion_count

    @property
    def total_predictions(self):
        return self._validator.prediction_count

    @classmethod
    def create_from_filelist(cls, filelist, project):
        _id = 1
        if _id not in cls._db:
            i = ImportState()
            i.id = _id
            cls._db[_id] = i
        import_state = cls._db[_id]
        import_state.reset()
        import_state.filelist = filelist
        import_state.project = project
        import_state._update()

        return import_state

    @classmethod
    def create_from_data(cls, data, project):
        if isinstance(data, dict):
            tasks = [data]
        elif isinstance(data, list):
            tasks = data
        else:
            raise ValidationError(
                'Incorrect input data type: it must be a JSON dict or list')

        _id = 1
        if _id not in cls._db:
            i = ImportState()
            i.id = _id
            cls._db[_id] = i
        import_state = cls._db[_id]
        import_state.reset()
        import_state.tasks = tasks
        import_state.project = project
        import_state._update()
        return import_state

    @classmethod
    def get_by_id(cls, id):
        return cls._db[id]

    def update(self, **import_state_interface):
        for name, value in import_state_interface.items():
            setattr(self, name, value)
        self._update()
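A hedged sketch of the intended call sequence for ImportState; the surrounding web handlers are not shown, so the project object and the exact options passed to update() are assumptions.

# my_project is assumed to be an already configured project instance
state = ImportState.create_from_data([{"data": {"text": "hello"}}], project=my_project)
preview = state.serialize()   # counts, task preview and detected formats for the UI
state.update(files_as_tasks_list={'type': 'csv', 'selected': False})  # re-parse with new options
new_tasks = state.apply()     # write tasks to source storage and update derived schemas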
Example #8
def api_import():
    print("in api import")
    project = project_get_or_create()

    # make django compatibility for uploader module
    class DjangoRequest:
        POST = request.form
        GET = request.args
        FILES = request.files
        data = request.json if request.json else request.form
        content_type = request.content_type

    print("In api_import")
    start = time.time()
    # get tasks from request
    parsed_data = uploader.load_tasks(DjangoRequest())
    # validate tasks
    validator = TaskValidator(project)
    try:
        new_tasks = validator.to_internal_value(parsed_data)
    except ValidationError as e:
        return make_response(jsonify(e.msg_to_list()),
                             status.HTTP_400_BAD_REQUEST)

    # save task file to input dir
    if os.path.isdir(project.config['input_path']):
        # tasks are stored in a directory: write the new tasks to a new file
        task_dir = project.config['input_path']
        now = datetime.now()
        print("In new tasks api_import")
        data = json.dumps(new_tasks, ensure_ascii=False)
        md5 = hashlib.md5(json.dumps(data).encode('utf-8')).hexdigest()
        name = 'import-' + now.strftime('%Y-%m-%d-%H-%M') + '-' + str(md5[0:8])
        path = os.path.join(task_dir, name + '.json')
        tasks = new_tasks
    else:
        # tasks are all in one file: append to it
        path = project.config['input_path']
        print("in old tasks section api_import")
        old_tasks = json.load(open(path))
        assert isinstance(old_tasks,
                          list), 'Tasks from input_path must be list'
        tasks = old_tasks + new_tasks

        # replicate the incoming tasks so that each task is queued `numcomps` times,
        # walking through the list in windows of `param` tasks
        # (`param` is assumed to be defined elsewhere, e.g. at module level)
        temp = copy.deepcopy(tasks)
        tasks[:] = []
        numcomps = 3
        startingindex = 0
        count = [0] * len(temp)
        c = 0

        for i in range(0, len(temp)):
            for j in range(0, param):
                if j + startingindex < len(temp):
                    if count[j + startingindex] < numcomps:
                        tasks.append(temp[j + startingindex])
                        count[j + startingindex] += 1
                        print(temp[j + startingindex])
                        c = c + 1

            # move to the next window once the current one is fully replicated
            if len(tasks) % (numcomps * param) == 0:
                startingindex = startingindex + param

        print(c)
        logger.error("It's recommended to use directory as input_path: " +
                     project.config['input_path'] + ' -> ' +
                     os.path.dirname(project.config['input_path']))

    with open(path, 'w') as f:
        json.dump(tasks, f, ensure_ascii=False, indent=4)

    # load new tasks
    project.reload()

    duration = time.time() - start

    # #add to tasks queues
    # num_tasks = len(new_tasks)
    # temp = []
    # a = 1
    # while a < num_tasks:
    #     for b in range(a,a+param):
    #         temp.append(b)

    #     a = a + param
    #     if num_tasks - a < param:
    #         # add all the rest
    #         while(a <= num_tasks):
    #             temp.append(a)
    #             a = a + 1;

    #     task_queue.append(temp)
    #     temp=[]

    # print("JUST MADE THE QUEUE!!!*********")
    # print(task_queue)

    num_tasks = len(new_tasks)
    task_queue = make_task_queue(num_tasks)

    return make_response(
        jsonify({
            'task_count': len(new_tasks),
            'completion_count': validator.completion_count,
            'prediction_count': validator.prediction_count,
            'duration': duration
        }), status.HTTP_201_CREATED)
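make_task_queue() itself is not shown in the source; the commented-out block above suggests what it is meant to do. A hypothetical reconstruction consistent with that block follows (the real signature and the batch size `param` are assumptions).

def make_task_queue(num_tasks, param=3):
    # group task ids 1..num_tasks into batches of `param`,
    # folding a short final remainder into the last batch
    task_queue, a = [], 1
    while a < num_tasks:
        batch = list(range(a, a + param))
        a = a + param
        if num_tasks - a < param:
            # add all the rest
            while a <= num_tasks:
                batch.append(a)
                a = a + 1
        task_queue.append(batch)
    return task_queue

# make_task_queue(7, param=3) -> [[1, 2, 3], [4, 5, 6, 7]]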