示例#1
0
    def reload(self):
        """Reset derived state and rebuild the project from its config.

        Re-runs ``_init``, re-reads the label config file, creates or
        refreshes the analytics object, rebuilds the project object, connects
        the ML backend when ``config['ml_backend']`` is set and no backend is
        stored on the instance, and recreates the converter.
        """
        self.tasks = None
        self.derived_input_schema = []
        self.derived_output_schema = {
            'from_name_to_name_type': set(),
            'labels': defaultdict(set)
        }

        self._init()

        # a context manager closes the label config file deterministically
        with open(self.config['label_config']) as config_file:
            self.label_config_full = config_comments_free(config_file.read())
        self.label_config_line = config_line_stripped(self.label_config_full)

        if self.analytics is None:
            self.analytics = Analytics(
                self.label_config_line,
                self.config.get('collect_analytics', True), self.name)
        else:
            self.analytics.update_info(
                self.label_config_line,
                self.config.get('collect_analytics', True), self.name)

        # configure project
        self.project_obj = ProjectObj(label_config=self.label_config_line,
                                      label_config_full=self.label_config_full)

        # configure machine learning backend
        # NOTE(review): the backend is not stored on self.ml_backend here, so
        # it is re-created on every reload - confirm this is intended
        if self.ml_backend is None:
            ml_backend_params = self.config.get('ml_backend')
            if ml_backend_params:
                ml_backend = MLBackend.from_params(ml_backend_params)
                self.project_obj.connect(ml_backend)

        self.converter = Converter(self.label_config_full)
示例#2
0
 def load_project_ml_backend(self):
     """Create the project object and connect the configured ML backend.

     Nothing is connected when ``config['ml_backend']`` is missing or empty.
     """
     self.project_obj = ProjectObj(
         label_config=self.label_config_line,
         label_config_full=self.label_config_full,
     )

     backend_params = self.config.get('ml_backend')
     if not backend_params:
         return

     self.ml_backend = MLBackend.from_params(backend_params)
     self.project_obj.connect(self.ml_backend)
示例#3
0
    def load_project_ml_backend(self):
        """Create the project object and register every configured ML backend.

        Each entry of ``config['ml_backends']`` is passed to
        ``add_ml_backend`` with ``raise_on_error=False``.
        """
        self.project_obj = ProjectObj(
            label_config=self.label_config_line,
            label_config_full=self.label_config_full,
        )

        self.ml_backends = []
        for backend_params in self.config.get('ml_backends', []):
            self.add_ml_backend(backend_params, raise_on_error=False)
示例#4
0
 def load_project_ml_backend(self):
     """Create the project object and load the configured ML backend.

     :raises ValueError: when a backend is configured but not connected
     """
     self.project_obj = ProjectObj(
         label_config=self.label_config_line,
         label_config_full=self.label_config_full,
     )

     backend_params = self.config.get('ml_backend')
     if not backend_params:
         return

     backend = MLBackend.from_params(backend_params)
     self.ml_backend = backend
     if not backend.connected:
         raise ValueError('ML backend is not connected.')
示例#5
0
    def load_project_ml_backend(self):
        """Create the project object and load every configured ML backend.

        :raises ValueError: as soon as one configured backend is not connected
        """
        self.project_obj = ProjectObj(
            label_config=self.label_config_line,
            label_config_full=self.label_config_full,
        )

        self.ml_backends = []
        for backend_params in self.config.get('ml_backends', []):
            backend = MLBackend.from_params(backend_params)
            if backend.connected:
                self.ml_backends.append(backend)
                continue
            raise ValueError('ML backend ' + str(backend_params) +
                             ' is not connected.')
示例#6
0
class Project(object):

    _storage = {}

    _allowed_extensions = {
        'Text': ('.txt', ),
        'Image': ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'),
        'Audio': ('.wav', '.aiff', '.mp3', '.au', '.flac')
    }

    def __init__(self, config, name, context=None):
        """Store the settings, reset all state and load the project.

        :param config: project config mapping
        :param name: project name
        :param context: optional extra context; an empty dict is used when falsy
        """
        # identity and settings
        self.name = name
        self.config = config
        self.context = context if context else {}
        self.on_boarding = {}

        # state derived from tasks and completions
        self.tasks = None
        self.derived_input_schema = []
        self.derived_output_schema = {
            'from_name_to_name_type': set(),
            'labels': defaultdict(set)
        }

        # placeholders that reload() fills in
        self.label_config_full = None
        self.label_config_line = None
        self.project_obj = None
        self.ml_backend = None
        self.converter = None
        self.analytics = None

        self.reload()

    @property
    def id(self):
        return self.project_obj.id

    @property
    def data_types(self):
        return self.project_obj.data_types

    @property
    def label_config(self):
        return self.project_obj.label_config

    def extract_data_types(self, config):
        return self.project_obj.extract_data_types(config)

    def validate_label_config(self, config_string):
        """Validate a label config string against the project and its data.

        The project object validates the raw string first; the parsed config
        is then checked against the derived input and output schemas.
        """
        self.project_obj.validate_label_config(config_string)

        parsed = parse_config(config_string)
        for check in (self.validate_label_config_on_derived_input_schema,
                      self.validate_label_config_on_derived_output_schema):
            check(parsed)

    def update_label_config(self, new_label_config):
        """Persist a new label config and mark the project config as updated.

        Writes ``new_label_config`` to ``config['label_config']``, sets
        ``config['label_config_updated']`` and dumps the whole config to
        ``config['config_path']``.

        :param new_label_config: label config text to store
        """
        label_config_file = self.config['label_config']
        # save xml label config to file
        with io.open(label_config_file, mode='w') as f:
            f.write(new_label_config)

        # save project config state
        self.config['label_config_updated'] = True
        with io.open(self.config['config_path'], mode='w') as f:
            json.dump(self.config, f)
        logger.info(
            'Label config saved to: {path}'.format(path=label_config_file))

    def _get_single_input_value(self, input_data_tags):
        print("in get single input value")
        if len(input_data_tags) > 1:
            val = ",".join(tag.attrib.get("name") for tag in input_data_tags)
            print('Warning! Multiple input data tags found: ' + val +
                  '. Only first one is used.')
        input_data_tag = input_data_tags[0]
        data_key = input_data_tag.attrib.get('value').lstrip('$')
        return data_key

    def _create_task_with_local_uri(self, filepath, data_key, task_id):
        print("in create task with local uri")
        """ Convert filepath to task with flask serving URL
        """
        filename = os.path.basename(self, filepath)
        params = urllib.parse.urlencode({'d': os.path.dirname(filepath)})
        base_url = 'http://localhost:{port}/'.format(
            port=self.config.get("port"))
        image_url_path = base_url + urllib.parse.quote('data/' + filename)
        image_local_url = '{image_url_path}?{params}'.format(
            image_url_path=image_url_path, params=params)
        return {
            'id': task_id,
            'task_path': filepath,
            'data': {
                data_key: image_local_url
            }
        }

    def is_text_annotation(self, input_data_tags, filepath):
        print("in text annotation")
        return (len(input_data_tags) == 1 and input_data_tags[0].tag == 'Text'
                and filepath.endswith(self._allowed_extensions['Text']))

    def is_image_annotation(self, input_data_tags, filepath):
        return (len(input_data_tags) == 1 and input_data_tags[0].tag == 'Image'
                and filepath.lower().endswith(
                    self._allowed_extensions['Image']))

    def is_audio_annotation(self, input_data_tags, filepath):
        return (len(input_data_tags) == 1
                and input_data_tags[0].tag in ('Audio', 'AudioPlus') and
                filepath.lower().endswith(self._allowed_extensions['Audio']))

    def _update_derived_output_schema(self, completion):
        print("in update_derived_output_schema")
        """
        Given completion, output schema is updated. Output schema consists of unique tuples (from_name, to_name, type)
        and list of unique labels derived from existed completions
        :param completion:
        :return:
        """
        for result in completion['result']:
            self.derived_output_schema['from_name_to_name_type'].add((
                result['from_name'],
                result['to_name'],
                result['type']  #,completion['user']
            ))
            print("in update derived output schema")
            for label in result['value'][result['type']]:
                self.derived_output_schema['labels'][result['from_name']].add(
                    label)
        print(self.derived_output_schema)

    def get_result_ds(self):
        return self.result_ds

    def validate_label_config_on_derived_input_schema(
            self, config_string_or_parsed_config):
        print("in validate_label_config_on_derived_input_schema")
        """
        Validate label config on input schemas (tasks types and data keys) derived from imported tasks
        :param config_string_or_parsed_config: label config string or parsed config object
        :return: True if config match already imported tasks
        """
        input_schema = self.derived_input_schema

        # check if schema exists, i.e. at least one task has been uploaded
        if not input_schema:
            return

        config = config_string_or_parsed_config
        if isinstance(config, str):
            config = parse_config(config)
        input_types, input_values = set(), set()
        for input_items in map(itemgetter('inputs'), config.values()):
            for input_item in input_items:
                input_types.add(input_item['type'])
                input_values.add(input_item['value'])

        input_schema_types = set([item['type'] for item in input_schema])
        input_schema_values = set([item['value'] for item in input_schema])

        # check input data types: they must be in schema
        for item in input_types:
            if item not in input_schema_types:
                raise ValidationError(
                    'You have already imported tasks and they are incompatible with a new config. '
                    'Can\'t find type "{item}" among already imported tasks with types {input_schema_types}'
                    .format(item=item,
                            input_schema_types=list(input_schema_types)))

        # check input data values: they must be in schema
        for item in input_values:
            if item not in input_schema_values:
                raise ValidationError(
                    'You have already imported tasks and they are incompatible with a new config. '
                    'Can\t find key "{item}" among already imported tasks with keys {input_schema_values}'
                    .format(item=item,
                            input_schema_values=list(input_schema_types)))

    def validate_label_config_on_derived_output_schema(
            self, config_string_or_parsed_config):
        print("in validate_label_config_on_derived_output_schema")
        """
        Validate label config on output schema (from_names, to_names and labeling types) derived from completions
        :param config_string_or_parsed_config: label config string or parsed config object
        :return: True if config match already created completions
        """
        output_schema = self.derived_output_schema

        # check if schema exists, i.e. at least one completion has been created
        if not output_schema['from_name_to_name_type']:
            return

        config = config_string_or_parsed_config
        if isinstance(config, str):
            config = parse_config(config)

        completion_tuples = set()

        for from_name, to in config.items():
            completion_tuples.add(
                (from_name, to['to_name'][0], to['type'].lower()))

        for from_name, to_name, type in output_schema[
                'from_name_to_name_type']:
            if (from_name, to_name, type) not in completion_tuples:
                raise ValidationError(
                    'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: '
                    'name={from_name}, toName={to_name}, type={type} are expected'
                    .format(from_name=from_name, to_name=to_name, type=type))
        for from_name, expected_label_set in output_schema['labels'].items():
            if from_name not in config:
                raise ValidationError(
                    'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: '
                    'name=' + from_name + ' is expected')
            found_labels = set(config[from_name]['labels'])
            extra_labels = list(expected_label_set - found_labels)
            if extra_labels:
                raise ValidationError(
                    'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: '
                    'there are labels already created for "{from_name}":\n{extra_labels}'
                    .format(from_name=from_name, extra_labels=extra_labels))

    def tasks_from_json_file(self, path):
        """ Prepare tasks from json

        The file holds either one task (a dict) or a list of tasks.  A task
        is the data dict itself or a dict with a ``data`` key; ``predictions``
        found in either place are stored beside the task data.

        :param path: path to json with list or dict
        :return: None; tasks are appended to ``self.tasks``
        :raises Exception: when the JSON root is neither a list nor a dict
        """
        def push_task(root):
            # ids continue after the tasks that are already loaded
            task_id = len(self.tasks) + 1
            data = root['data'] if 'data' in root else root
            task = {'id': task_id, 'task_path': path, 'data': data}
            if 'predictions' in data:
                # predictions are kept next to the data, not inside it
                task['predictions'] = data.pop('predictions')
            if 'predictions' in root:
                task['predictions'] = root['predictions']
            self.tasks[task_id] = task

        logger.debug('Reading tasks from JSON file ' + path)
        with open(path) as f:
            json_body = orjson.loads(f.read())

        # multiple tasks in file
        if isinstance(json_body, list):
            for item in json_body:
                push_task(item)

        # one task in file
        elif isinstance(json_body, dict):
            push_task(json_body)

        # unsupported task type
        else:
            raise Exception('Unsupported task data: ' + path)

    def _init(self):
        """Load tasks from ``config['input_path']`` and rebuild derived schemas.

        ``input_path`` is either a single file or a directory that is walked
        recursively; paths containing ``completion`` are skipped.  ``.json``
        files are read by ``tasks_from_json_file``, text files give one task
        per line, and image/audio files become tasks pointing to a local URL.
        The derived input schema is then filled from the label config's input
        tags, and the derived output schema from the stored completions.
        """
        label_config = LabelConfigParser(self.config['label_config'])

        # makedirs also creates missing parents and tolerates a concurrent
        # creation of the directory
        if not os.path.exists(self.config['output_dir']):
            os.makedirs(self.config['output_dir'], exist_ok=True)

        data_key = None
        input_data_tags = label_config.get_input_data_tags()

        # load at first start
        self.tasks = OrderedDict()

        input_path = self.config['input_path']
        if os.path.isfile(input_path):
            # file
            files = [os.path.basename(input_path)]
            root_dir = os.path.normpath(os.path.dirname(input_path))
        else:
            # directory
            root_dir = os.path.normpath(input_path)
            files = [
                os.path.join(root, name)
                for root, _, names in os.walk(root_dir) for name in names
                if 'completion' not in name and 'completion' not in root
            ]

        # walk over all the files
        for f in files:
            norm_f = os.path.normpath(f)
            if norm_f.startswith(root_dir):
                path = f
            else:
                path = os.path.join(root_dir, norm_f)

            # load tasks from json
            if f.endswith('.json'):
                self.tasks_from_json_file(path)

            # load tasks from txt: line by line, task by task
            elif self.is_text_annotation(input_data_tags, f):
                if data_key is None:
                    data_key = self._get_single_input_value(input_data_tags)
                with io.open(path) as fin:
                    for line in fin:
                        task_id = len(self.tasks) + 1
                        self.tasks[task_id] = {
                            'id': task_id,
                            'task_path': path,
                            'data': {
                                data_key: line.strip()
                            }
                        }

            # load tasks from files: creating URI to local resources
            elif (self.is_image_annotation(input_data_tags, f)
                  or self.is_audio_annotation(input_data_tags, f)):
                if data_key is None:
                    data_key = self._get_single_input_value(input_data_tags)
                task_id = len(self.tasks) + 1
                # NOTE(review): this passes `f`, not `path`; for a single-file
                # input `f` is only the base name - confirm this is intended
                self.tasks[task_id] = self._create_task_with_local_uri(
                    f, data_key, task_id)
            else:
                logger.warning('Unrecognized file format for file ' + f)

        # make derived input schema
        if self.tasks:
            for tag in input_data_tags:
                self.derived_input_schema.append({
                    'type': tag.tag,
                    'value': tag.attrib['value'].lstrip('$')
                })

        # for all already completed tasks we update derived output schema for further label config validation
        for task_id in self.get_task_ids():
            task_with_completions = self.get_task_with_completions(task_id)
            if task_with_completions and 'completions' in task_with_completions:
                for completion in task_with_completions['completions']:
                    self._update_derived_output_schema(completion)

        print(
            str(len(self.tasks)) + ' tasks loaded from: ' +
            self.config['input_path'])

    def get_tasks(self):
        print("in get tasks")
        """ Load tasks from JSON files in input_path directory

        :return: file list
        """
        return self.tasks

    def delete_tasks(self):
        """
        Deletes all tasks & completions from filesystem, then reloads clean project

        The content of ``config['output_dir']`` is removed and
        ``config['input_path']`` is overwritten with an empty JSON list.
        :return: None
        """
        delete_dir_content(self.config['output_dir'])
        with io.open(self.config['input_path'], mode='w') as f:
            json.dump([], f)
        self.reload()

    def iter_tasks(self):
        print("in iter tasks")
        sampling = self.config.get('sampling', 'sequential')
        if sampling == 'sequential':
            print("sequential")
            return self.tasks.items()
        elif sampling == 'uniform':
            print("uniform")
            keys = list(self.tasks.keys())
            random.shuffle(keys)
            return ((k, self.tasks[k]) for k in keys)
        else:
            print("unknown")
            raise NotImplementedError('Unknown sampling method ' + sampling)

    def get_task_ids(self):
        print("in get task ids")
        """ Get task ids only

        :return: list of task ids
        """
        return list(self.tasks.keys())

    def get_task(self, task_id):
        print("in get task")
        """ Get one task

        :param task_id:
        :return: task
        """
        try:
            task_id = int(task_id)
        except ValueError:
            return None
        return self.tasks.get(task_id)

    def get_completions_ids(self):
        """ List completion ids from output_dir directory

        The directory is created when it does not exist.  ``*.json`` files
        whose name is not an integer are ignored.

        :return: sorted list of integer ids (filenames without extensions and directories)
        """
        root_dir = self.config['output_dir']
        if not os.path.exists(root_dir):
            os.makedirs(root_dir, exist_ok=True)

        completions = []
        for f in os.listdir(root_dir):
            if not f.endswith('.json'):
                continue
            try:
                completions.append(int(os.path.splitext(f)[0]))
            except ValueError:
                # a stray non-numeric json file is not a completion
                continue

        logger.debug('{num} completions found in {output_dir}'.format(
            num=len(completions), output_dir=self.config["output_dir"]))
        return sorted(completions)

    def get_completions_user(self):
        """Map each completed task id to the user of its first completion.

        :return: dict of task id to user; the value is ``''`` when the first
            completion has no (or an empty) ``user`` or cannot be read
        """
        user_dict = {}
        for i in self.get_completions_ids():
            data = self.get_task_with_completions(i) or {}
            # fall back to an empty record so old completions do not raise
            completions = data.get('completions') or [{}]
            user_dict[i] = completions[0].get('user') or ''
        return user_dict

    def get_completed_at(self, task_ids):
        """ Get completed time for list of task ids

        The time is the modification time of the completion file.

        :param task_ids: list of task ids
        :return: dict of task id to ``'%Y-%m-%d %H:%M:%S'`` formatted string,
            only for ids that have a completion file
        """
        root_dir = self.config['output_dir']
        ids = set(self.get_completions_ids()).intersection(task_ids)

        times = {}
        for i in ids:
            modified = os.path.getmtime(
                os.path.join(root_dir, str(i) + '.json'))
            times[i] = datetime.fromtimestamp(modified).strftime(
                '%Y-%m-%d %H:%M:%S')
        return times

    def get_task_with_completions(self, task_id):
        print("in get task with completions")
        """ Get task with completions

        :param task_id: task ids
        :return: json dict with completion
        """
        try:
            task_id = int(
                task_id
            )  # check task_id is int (disallow to escape from output_dir)
        except ValueError:
            return None

        filename = os.path.join(self.config['output_dir'],
                                str(task_id) + '.json')

        if os.path.exists(filename):
            data = json.load(open(filename))
            # tasks can hold the newest version of predictions, so task it from tasks
            data['predictions'] = self.tasks[task_id].get('predictions', [])
        else:
            data = None
        return data

    def save_completion(self, task_id, completion, user_id):
        print("in save completion")
        """ Save completion

        :param task_id: task id
        :param completion: json data from label (editor)
        """

        # try to get completions with task first
        task = self.get_task_with_completions(task_id)

        # init task if completions with task not exists
        if not task:
            task = self.get_task(task_id)
            task['completions'] = []

        # update old completion
        updated = False
        if 'id' in completion:
            for i, item in enumerate(task['completions']):
                if item['id'] == completion['id']:
                    task['completions'][i].update(completion)
                    completion['user'] = user_id
                    updated = True

        # write new completion
        if not updated:
            completion['id'] = task['id'] * 1000 + len(task['completions']) + 1
            completion['user'] = user_id
            task['completions'].append(completion)

        self._update_derived_output_schema(completion)

        # write task + completions to file
        filename = os.path.join(self.config['output_dir'],
                                str(task_id) + '.json')
        os.mkdir(self.config['output_dir']) if not os.path.exists(
            self.config['output_dir']) else ()
        json.dump(task, open(filename, 'w'), indent=4, sort_keys=True)
        return [completion['id'], completion['user']]

    def delete_completion(self, task_id):
        print("in delete completion")
        """ Delete completion from disk

        :param task_id: task id
        """
        filename = os.path.join(self.config['output_dir'],
                                str(task_id) + '.json')
        os.remove(filename)

    def reload(self):
        """Reset derived state and rebuild the project from its config.

        Re-runs ``_init``, re-reads the label config file, creates or
        refreshes the analytics object, rebuilds the project object, connects
        the ML backend when ``config['ml_backend']`` is set and no backend is
        stored on the instance, and recreates the converter.
        """
        self.tasks = None
        self.derived_input_schema = []
        self.derived_output_schema = {
            'from_name_to_name_type': set(),
            'labels': defaultdict(set)
        }

        self._init()

        # a context manager closes the label config file deterministically
        with open(self.config['label_config']) as config_file:
            self.label_config_full = config_comments_free(config_file.read())
        self.label_config_line = config_line_stripped(self.label_config_full)

        # the environment variable takes precedence over the project config
        collect_analytics = os.getenv('collect_analytics')
        if collect_analytics is None:
            collect_analytics = self.config.get('collect_analytics', True)
        if self.analytics is None:
            self.analytics = Analytics(self.label_config_line,
                                       collect_analytics, self.name,
                                       self.context)
        else:
            self.analytics.update_info(self.label_config_line,
                                       collect_analytics, self.name,
                                       self.context)

        # configure project
        self.project_obj = ProjectObj(label_config=self.label_config_line,
                                      label_config_full=self.label_config_full)

        # configure machine learning backend
        # NOTE(review): the backend is not stored on self.ml_backend, so it is
        # re-created for each new project object - confirm this is intended
        if self.ml_backend is None:
            ml_backend_params = self.config.get('ml_backend')
            if ml_backend_params:
                ml_backend = MLBackend.from_params(ml_backend_params)
                self.project_obj.connect(ml_backend)

        self.converter = Converter(self.label_config_full)

    @classmethod
    def get_project_dir(cls, project_name, args):
        return os.path.join(args.root_dir, project_name)

    @classmethod
    def create_project_dir(cls, project_name, args):
        print("in create project dir")
        """
        Create project directory in args.root_dir/project_name, and initialize there all required files
        If some files are missed, restore them from defaults.
        If config files are specified by args, copy them in project directory
        :param project_name:
        :param args:
        :return:
        """
        dir = cls.get_project_dir(project_name, args)
        os.makedirs(dir, exist_ok=True)
        label_config_name = 'config.xml'
        output_dir_name = 'completions'
        input_path_name = 'tasks.json'
        default_config_file = os.path.join(dir, 'config.json')
        default_label_config_file = os.path.join(dir, label_config_name)
        default_output_dir = os.path.join(dir, output_dir_name)
        default_input_path = os.path.join(dir, input_path_name)

        if hasattr(args, 'config_path') and args.config_path:
            copy2(args.config_path, default_config_file)
        if hasattr(args, 'input_path') and args.input_path:
            copy2(args.input_path, default_input_path)
        if hasattr(args, 'output_dir') and args.output_dir:
            if os.path.exists(args.output_dir):
                copy2(args.output_dir, default_output_dir)
        if hasattr(args, 'label_config') and args.label_config:
            copy2(args.label_config, default_label_config_file)

        default_config = {
            'title': 'Label Studio',
            'port': 8200,
            'debug': False,
            'label_config': label_config_name,
            'input_path': input_path_name,
            'output_dir': output_dir_name,
            'instruction': 'Type some <b>hypertext</b> for label experts!',
            'allow_delete_completions': True,
            'templates_dir': 'examples',
            'editor': {
                'debug': False
            },
            '!ml_backend': {
                'url': 'http://localhost:9090',
                'model_name': 'my_super_model'
            },
            'sampling': 'uniform'
        }

        # create input_path (tasks.json)
        if not os.path.exists(default_input_path):
            with io.open(default_input_path, mode='w') as fout:
                json.dump([], fout, indent=2)
            print(default_input_path + ' input path has been created.')
        else:
            print(default_input_path + ' input path already exists.')

        # create config file (config.json)
        if not os.path.exists(default_config_file):
            with io.open(default_config_file, mode='w') as fout:
                json.dump(default_config, fout, indent=2)
            print(default_config_file + ' config file has been created.')
        else:
            print(default_config_file + ' config file already exists.')

        # create label config (config.xml)
        if not os.path.exists(default_label_config_file):
            path = find_file('examples/image_polygons/config.xml')
            default_label_config = open(path).read()

            with io.open(default_label_config_file, mode='w') as fout:
                fout.write(default_label_config)
            print(default_label_config_file +
                  ' label config file has been created.')
        else:
            print(default_label_config_file +
                  ' label config file already exists.')

        # create output dir (completions)
        if not os.path.exists(default_output_dir):
            os.makedirs(default_output_dir)
            print(default_output_dir + ' output directory has been created.')
        else:
            print(default_output_dir + ' output directory already exists.')

        print('')
        print(
            'Label Studio has been successfully initialized. Check project states in '
            + dir)
        print('Start the server: label-studio start ' + dir)
        return dir

    @classmethod
    def _get_config(cls, project_dir, args):
        print("in get config")
        """
        Get config path from input args Namespace acquired by Argparser
        :param args:
        :param args:
        :return:
        """
        # if config is explicitly specified, just return it
        if args.config_path:
            config_path = args.config_path
        else:
            # check if project directory exists
            if not os.path.exists(project_dir):
                raise FileNotFoundError(
                    'Couldn\'t find directory ' + project_dir +
                    ', maybe you\'ve missed appending "--init" option:\nlabel-studio start '
                    + args.project_name + ' --init')

            # check config.json exists in directory
            config_path = os.path.join(project_dir, 'config.json')
            if not os.path.exists(config_path):
                raise FileNotFoundError(
                    'Couldn\'t find config file ' + config_path +
                    ' in project directory ' + project_dir +
                    ', maybe you\'ve missed appending "--init" option:\nlabel-studio start '
                    + args.project_name + ' --init')

        config_path = os.path.abspath(config_path)
        with io.open(config_path) as c:
            config = json.load(c)

        if args.port:
            config['port'] = args.port

        if args.label_config:
            config['label_config'] = args.label_config

        if args.input_path:
            config['input_path'] = args.input_path

        if args.output_dir:
            config['output_dir'] = args.output_dir

        if args.debug is not None:
            config['debug'] = args.debug

        if args.ml_backend_url:
            if 'ml_backend' not in config:
                config['ml_backend'] = {}
            config['ml_backend']['url'] = args.ml_backend_url

        if args.ml_backend_name:
            if 'ml_backend' not in config:
                config['ml_backend'] = {}
            config['ml_backend']['name'] = args.ml_backend_name

        # absolutize paths relative to config.json
        config_dir = os.path.dirname(config_path)
        config['label_config'] = os.path.join(config_dir,
                                              config['label_config'])
        config['input_path'] = os.path.join(config_dir, config['input_path'])
        config['output_dir'] = os.path.join(config_dir, config['output_dir'])
        config['config_path'] = config_path

        return config

    @classmethod
    def _load_from_dir(cls, project_dir, project_name, args, context):
        """Build a project instance from the config found for ``project_dir``."""
        return cls(cls._get_config(project_dir, args), project_name, context)

    @classmethod
    def get(cls, project_name, args, context):

        # If project stored in memory, just return it
        if project_name in cls._storage:
            return cls._storage[project_name]

        # If project directory exists, load project from directory and update in-memory storage
        project_dir = cls.get_project_dir(project_name, args)
        if os.path.exists(project_dir):
            project = cls._load_from_dir(project_dir, project_name, args,
                                         context)
            cls._storage[project_name] = project

        raise KeyError('Project {p} doesn\'t exist'.format(p=project_name))

    @classmethod
    def create(cls, project_name, args, context):
        """Create the project directory, load the project and store it.

        Unlike ``get``, this may create a new directory with project resources.
        """
        new_dir = cls.create_project_dir(project_name, args)
        cls._storage[project_name] = cls._load_from_dir(
            new_dir, project_name, args, context)
        return cls._storage[project_name]

    @classmethod
    def get_or_create(cls, project_name, args, context):
        """Return the project from ``get``; on KeyError build it with ``create``."""
        try:
            project = cls.get(project_name, args, context)
        except KeyError:
            project = cls.create(project_name, args, context)
            logger.info('Project "' + project_name + '" created.')
        else:
            logger.info('Get project "' + project_name + '".')
        return project

    def update_on_boarding_state(self):
        self.on_boarding['setup'] = self.config.get('label_config_updated',
                                                    False)
        self.on_boarding['import'] = len(self.tasks) > 0
        self.on_boarding['labeled'] = len(os.listdir(
            self.config['output_dir'])) > 0
        return self.on_boarding
示例#7
0
class Project(object):

    _storage = {}

    @classmethod
    def get_user_projects(cls, user, root):
        """Get all project names by user, this is used in multi-session mode"""
        return os.listdir(os.path.join(root, user))

    @classmethod
    def get_all_projects(cls, root):
        """Get all projects in the system, this is used in multi-session mode
        Returns {user: projects}
        """
        result = {}
        regex = r"........-....-....-....-............"  # user uuid filter

        for user in os.listdir(root):
            # leave user dirs satisfied regex only
            matches = re.search(regex, user)
            if matches:
                user_dir = os.path.join(root, user)
                result[user] = os.listdir(user_dir)
        return result

    @classmethod
    def get_user_by_project(cls, project_uuid, root):
        all_projects = cls.get_all_projects(root)
        for user in all_projects:
            if project_uuid in all_projects[user]:
                return user

    def __init__(self, config, name, root_dir=".", context=None):
        self.config = config
        self.name = name
        self.path = os.path.join(root_dir, self.name)
        self.ml_backends = []

        self.on_boarding = {}
        self.context = context or {}
        self.project_obj = None
        self.source_storage = None
        self.target_storage = None
        self.create_storages()

        (
            self.label_config_line,
            self.label_config_full,
            self.parsed_label_config,
            self.input_data_tags,
        ) = (
            None,
            None,
            None,
            None,
        )  # noqa
        self.derived_input_schema, self.derived_output_schema = None, None

        self.load_label_config()
        self.load_project_and_ml_backends()
        self.update_derived_input_schema()
        self.update_derived_output_schema()

        self.converter = None
        self.load_converter()
        self.max_tasks_file_size = 250

    def get_storage(self, storage_for):
        if storage_for == "source":
            return self.source_storage
        elif storage_for == "target":
            return self.target_storage

    def get_available_storage_names(self, storage_for):
        if storage_for == "source":
            return self.get_available_source_storage_names()
        elif storage_for == "target":
            return self.get_available_target_storage_names()

    @classmethod
    def get_available_source_storages(cls):
        return ["tasks-json", "s3", "gcs"]

    @classmethod
    def get_available_target_storages(cls):
        return ["completions-dir", "s3-completions", "gcs-completions"]

    def get_available_source_storage_names(self):
        """Return an ``OrderedDict`` of ``{name: description}`` for allowed source storages.

        Entries come from the module-level ``get_available_storage_names()`` and
        are kept only when listed by ``get_available_source_storages``.
        """
        allowed = set(self.get_available_source_storages())
        # we don't expose configurable filesystem storage in UI to avoid security problems
        return OrderedDict(
            (name, desc)
            for name, desc in get_available_storage_names().items()
            if name in allowed
        )

    def get_available_target_storage_names(self):
        """Return an ``OrderedDict`` of ``{name: description}`` for allowed target storages.

        Entries come from the module-level ``get_available_storage_names()`` and
        are kept only when listed by ``get_available_target_storages``.
        """
        allowed = set(self.get_available_target_storages())
        # blobs have no sense for target storages
        return OrderedDict(
            (name, desc)
            for name, desc in get_available_storage_names().items()
            if name in allowed
        )

    def create_storages(self):
        """Build the source and target storages described by the project config.

        Each of the ``source`` and ``target`` config entries supplies ``type``,
        ``path`` and optional ``params`` (passed on as keyword arguments).
        """
        source_cfg = self.config["source"]
        target_cfg = self.config["target"]

        source_params = source_cfg.get("params", {})
        self.source_storage = create_storage(
            source_cfg["type"], "source", source_cfg["path"], self.path, self,
            **source_params)

        target_params = target_cfg.get("params", {})
        self.target_storage = create_storage(
            target_cfg["type"], "target", target_cfg["path"], self.path, self,
            **target_params)

    def update_storage(self, storage_for, storage_kwargs):
        """Replace the source or target storage and persist its settings.

        ``name``, ``type`` and ``path`` are popped from ``storage_kwargs`` (so the
        caller's dict is mutated); what remains is passed to ``create_storage``
        as keyword arguments and saved as ``params``. For any ``storage_for``
        other than ``"source"``/``"target"`` only the derived schemas are rebuilt.

        :raises KeyError: if ``storage_kwargs`` has no ``type`` entry
        """
        if storage_for in ("source", "target"):
            name = storage_kwargs.pop("name", storage_for)
            kind = storage_kwargs.pop("type")
            location = storage_kwargs.pop("path", None)
            new_storage = create_storage(kind, name, location, self.path, self,
                                         **storage_kwargs)
            self.config[storage_for] = {
                "name": name,
                "type": kind,
                "path": location,
                "params": storage_kwargs,
            }
            self._save_config()
            logger.debug('Created storage type "' + kind + '"')

            if storage_for == "source":
                self.source_storage = new_storage
            else:
                self.target_storage = new_storage

        self.update_derived_input_schema()
        self.update_derived_output_schema()

    @property
    def can_manage_tasks(self):
        return self.config["source"]["type"] not in {
            "s3",
            "s3-completions",
            "gcs",
            "gcs-completions",
        }

    @property
    def can_manage_completions(self):
        return self.config["target"]["type"] not in {
            "s3",
            "s3-completions",
            "gcs",
            "gcs-completions",
        }

    @property
    def can_delete_tasks(self):
        return self.can_manage_tasks and self.can_manage_completions

    @property
    def data_types_json(self):
        return self.project_obj.data_types_json

    def load_label_config(self):
        """Read the label config file and refresh every attribute derived from it.

        Sets ``label_config_full``, ``label_config_line``, ``parsed_label_config``
        and ``input_data_tags`` from the file at ``self.config["label_config"]``.
        """
        # Read through a context manager so the file handle is closed right
        # away instead of being left for the garbage collector.
        with open(self.config["label_config"], encoding="utf8") as config_file:
            raw_config = config_file.read()

        self.label_config_full = config_comments_free(raw_config)
        self.label_config_line = config_line_stripped(self.label_config_full)
        self.parsed_label_config = parse_config(self.label_config_line)
        self.input_data_tags = self.get_input_data_tags(self.label_config_line)

    def update_derived_input_schema(self):
        self.derived_input_schema = set()
        for task_id, task in self.source_storage.items():
            data_keys = set(task["data"].keys())
            if not self.derived_input_schema:
                self.derived_input_schema = data_keys
            else:
                self.derived_input_schema &= data_keys
        logger.debug("Derived input schema: " + str(self.derived_input_schema))

    def update_derived_output_schema(self):
        self.derived_output_schema = {
            "from_name_to_name_type": set(),
            "labels": defaultdict(set),
        }

        # for all already completed tasks we update derived output schema for further label config validation
        for task_id, c in self.target_storage.items():
            for completion in c["completions"]:
                self._update_derived_output_schema(completion)
        logger.debug("Derived output schema: " +
                     str(self.derived_output_schema))

    def add_ml_backend(self, params, raise_on_error=True):
        """Create an ML backend from ``params`` and append it to ``self.ml_backends``.

        :param params: backend parameters; ``params["url"]`` is used in the error message
        :param raise_on_error: when true, an unconnected backend is rejected
        :raises ValueError: if the backend is not connected and ``raise_on_error`` is true
        """
        backend = MLBackend.from_params(params)
        is_connected = backend.connected
        if raise_on_error and not is_connected:
            raise ValueError('ML backend with URL: "' + str(params["url"]) +
                             '" is not connected.')
        self.ml_backends.append(backend)

    def remove_ml_backend(self, name):
        # remove from memory
        remove_idx = next(
            (i
             for i, b in enumerate(self.ml_backends) if b.model_name == name),
            None)
        if remove_idx is None:
            raise KeyError("Can't remove ML backend with name \"" + name +
                           '": not found.')
        self.ml_backends.pop(remove_idx)

        # remove from config
        config_params = self.config.get("ml_backends", [])
        remove_idx = next(
            (i for i, b in enumerate(config_params) if b["name"] == name),
            None)
        if remove_idx is not None:
            config_params.pop(remove_idx)
        self.config["ml_backends"] = config_params
        self._save_config()

    def load_project_and_ml_backends(self):
        """Recreate the project object and reconnect the configured ML backends.

        Backends are added with ``raise_on_error=False``, so an unconnected
        backend does not abort loading.
        """
        # configure project
        self.project_obj = ProjectObj(label_config=self.label_config_line,
                                      label_config_full=self.label_config_full)

        # configure multiple machine learning backends
        self.ml_backends = []
        for backend_params in self.config.get("ml_backends", []):
            self.add_ml_backend(backend_params, raise_on_error=False)

    def load_converter(self):
        """Build ``self.converter`` from the parsed label config."""
        parsed = self.parsed_label_config
        self.converter = Converter(parsed)

    @property
    def id(self):
        return self.project_obj.id

    @property
    def uuid(self):
        return os.path.basename(self.path)

    @property
    def data_types(self):
        return self.project_obj.data_types

    @property
    def label_config(self):
        return self.project_obj.label_config

    @property
    def ml_backends_connected(self):
        return len(self.ml_backends) > 0

    @property
    def task_data_login(self):
        return self.project_obj.task_data_login

    @property
    def task_data_password(self):
        return self.project_obj.task_data_password

    def extract_data_types(self, config):
        return self.project_obj.extract_data_types(config)

    def validate_label_config(self, config_string):
        """Run every label config check on ``config_string``.

        The project object validates the raw string first; the parsed config is
        then checked against the derived input schema and the derived output
        schema, in that order. Each check raises on failure.
        """
        logger.debug("Validate label config")
        self.project_obj.validate_label_config(config_string)

        logger.debug("Get parsed config")
        parsed = parse_config(config_string)

        schema_checks = (
            ("Validate label config on derived input schema",
             self.validate_label_config_on_derived_input_schema),
            ("Validate label config on derived output schema",
             self.validate_label_config_on_derived_output_schema),
        )
        for message, check in schema_checks:
            logger.debug(message)
            check(parsed)

    def _save_config(self):
        with io.open(self.config["config_path"], mode="w") as f:
            json.dump(self.config, f, indent=2)

    def update_params(self, params):
        if "ml_backend" in params:
            ml_backend_params = self._create_ml_backend_params(
                params["ml_backend"], self.name)
            self.add_ml_backend(ml_backend_params)
            self.config["ml_backends"].append(ml_backend_params)
            self._save_config()

    def update_label_config(self, new_label_config):
        label_config_file = self.config["label_config"]
        # save xml label config to file
        new_label_config = new_label_config.replace("\r\n", "\n")
        with io.open(label_config_file, mode="w", encoding="utf8") as f:
            f.write(new_label_config)

        # reload everything that depends on label config
        self.load_label_config()
        self.update_derived_output_schema()
        self.load_project_and_ml_backends()
        self.load_converter()

        # save project config state
        self.config["label_config_updated"] = True
        with io.open(self.config["config_path"], mode="w",
                     encoding="utf8") as f:
            json.dump(self.config, f)
        logger.info(
            "Label config saved to: {path}".format(path=label_config_file))

    def _update_derived_output_schema(self, completion):
        """
        Given completion, output schema is updated. Output schema consists of unique tuples (from_name, to_name, type)
        and list of unique labels derived from existed completions
        :param completion:
        :return:
        """
        for result in completion["result"]:
            result_type = result.get("type")
            if result_type in ("relation", "rating", "pairwise"):
                continue
            if "from_name" not in result or "to_name" not in result:
                logger.error(
                    'Unexpected completion.result format: "from_name" or "to_name" not found in %r'
                    % result)
                continue

            self.derived_output_schema["from_name_to_name_type"].add(
                (result["from_name"], result["to_name"], result_type))
            for label in result["value"].get(result_type, []):
                self.derived_output_schema["labels"][result["from_name"]].add(
                    label)

    def validate_label_config_on_derived_input_schema(
            self, config_string_or_parsed_config):
        """
        Validate label config on input schemas (tasks types and data keys) derived from imported tasks
        :param config_string_or_parsed_config: label config string or parsed config object
        :return: True if config match already imported tasks
        """

        # check if schema exists, i.e. at least one task has been uploaded
        if not self.derived_input_schema:
            return

        config = config_string_or_parsed_config
        if isinstance(config, str):
            config = parse_config(config)
        input_types, input_values = set(), set()
        for input_items in map(itemgetter("inputs"), config.values()):
            for input_item in input_items:
                input_types.add(input_item["type"])
                input_values.add(input_item["value"])

        # check input data values: they must be in schema
        for item in input_values:
            if item not in self.derived_input_schema:
                raise ValidationError(
                    "You have already imported tasks and they are incompatible with a new config. "
                    "You've specified value=${item}, but imported tasks contain only keys: {input_schema_values}"
                    .format(item=item,
                            input_schema_values=list(
                                self.derived_input_schema)))

    def validate_label_config_on_derived_output_schema(
            self, config_string_or_parsed_config):
        """
        Validate label config on output schema (from_names, to_names and labeling types) derived from completions
        :param config_string_or_parsed_config: label config string or parsed config object
        :return: True if config match already created completions
        """
        output_schema = self.derived_output_schema

        # check if schema exists, i.e. at least one completion has been created
        if not output_schema["from_name_to_name_type"]:
            return

        config = config_string_or_parsed_config
        if isinstance(config, str):
            config = parse_config(config)
        completion_tuples = set()

        for from_name, to in config.items():
            completion_tuples.add(
                (from_name, to["to_name"][0], to["type"].lower()))
        for from_name, to_name, type in output_schema[
                "from_name_to_name_type"]:
            if (from_name, to_name, type) not in completion_tuples:
                raise ValidationError(
                    "You've already completed some tasks, but some of them couldn't be loaded with this config: "
                    "name={from_name}, toName={to_name}, type={type} are expected"
                    .format(from_name=from_name, to_name=to_name, type=type))
        for from_name, expected_label_set in output_schema["labels"].items():
            if from_name not in config:
                raise ValidationError(
                    "You've already completed some tasks, but some of them couldn't be loaded with this config: "
                    "name=" + from_name + " is expected")
            found_labels = set(config[from_name]["labels"])
            extra_labels = list(expected_label_set - found_labels)
            if extra_labels:
                raise ValidationError(
                    "You've already completed some tasks, but some of them couldn't be loaded with this config: "
                    'there are labels already created for "{from_name}":\n{extra_labels}'
                    .format(from_name=from_name, extra_labels=extra_labels))

    def no_tasks(self):
        return self.source_storage.empty()

    def delete_tasks(self):
        """
        Deletes all tasks & completions from filesystem, then reloads clean project
        :return:
        """
        self.source_storage.remove_all()
        self.target_storage.remove_all()
        self.update_derived_input_schema()
        self.update_derived_output_schema()

        # delete everything on ML backend
        if self.ml_backends_connected:
            for m in self.ml_backends:
                m.clear(self)

    def next_task(self, completed_tasks_ids):
        completed_tasks_ids = set(completed_tasks_ids)
        sampling = self.config.get("sampling", "sequential")

        # Tasks are ordered ascending by their "id" fields. This is default mode.
        task_iter = filter(lambda i: i not in completed_tasks_ids,
                           sorted(self.source_storage.ids()))
        if sampling == "sequential":
            task_id = next(task_iter, None)
            if task_id is not None:
                return self.source_storage.get(task_id)

        # Tasks are sampled with equal probabilities
        elif sampling == "uniform":
            actual_tasks_ids = list(task_iter)
            if not actual_tasks_ids:
                return None
            random.shuffle(actual_tasks_ids)
            return self.source_storage.get(actual_tasks_ids[0])

        # Task with minimum / maximum average prediction score is taken
        elif sampling.startswith("prediction-score"):
            id_score_map = {}
            for task_id, task in self.source_storage.items():
                if task_id in completed_tasks_ids:
                    continue
                if "predictions" in task and len(task["predictions"]) > 0:
                    score = sum((p["score"] for p in task["predictions"]),
                                0) / len(task["predictions"])
                    id_score_map[task_id] = score
            if not id_score_map:
                return None
            if sampling.endswith("-min"):
                best_idx = min(id_score_map, key=id_score_map.get)
            elif sampling.endswith("-max"):
                best_idx = max(id_score_map, key=id_score_map.get)
            else:
                raise NotImplementedError("Unknown sampling method " +
                                          sampling)
            return self.source_storage.get(best_idx)
        else:
            raise NotImplementedError("Unknown sampling method " + sampling)

    def remove_task(self, task_id):
        self.source_storage.remove(task_id)
        self.delete_task_completions(task_id)

        self.update_derived_input_schema()
        self.update_derived_output_schema()

    def get_completions_ids(self):
        """List completion ids from output_dir directory

        :return: filenames without extensions and directories
        """
        task_ids = set(self.source_storage.ids())
        completion_ids = set(self.target_storage.ids())
        completions = completion_ids.intersection(task_ids)
        # completions = list(self.target_storage.ids())
        logger.debug("{num} completions found in {output_dir}".format(
            num=len(completions), output_dir=self.config["output_dir"]))
        return sorted(completions)

    def get_completed_at(self):
        """Get completed time for tasks

        :return: list of string with formatted datetime
        """
        times = {}
        for _, data in self.target_storage.items():
            id = data["id"]
            try:
                times[id] = max(data["completions"],
                                key=itemgetter("created_at"))["created_at"]
            except Exception as exc:
                times[id] = "undefined"
        return times

    def get_cancelled_status(self):
        """Get was_cancelled (skipped) status for tasks: returns cancelled completion number for task

        :return: list of int
        """
        items = {}
        for _, data in self.target_storage.items():
            id = data["id"]
            try:
                # note: skipped will be deprecated
                flag = sum([
                    completion.get("skipped", False)
                    or completion.get("was_cancelled", False)
                    for completion in data["completions"]
                ])
            except Exception as exc:
                items[id] = -1
            else:
                items[id] = flag
        return items

    def get_task_with_completions(self, task_id):
        """Get task with completions

        :param task_id: task ids
        :return: json dict with completion
        """
        data = self.target_storage.get(task_id)
        logger.debug("Get task " + str(task_id) + " from target storage")

        if data:
            logger.debug("Get predictions " + str(task_id) +
                         " from source storage")
            # tasks can hold the newest version of predictions, so task it from tasks
            data["predictions"] = self.source_storage.get(task_id).get(
                "predictions", [])
        return data

    def save_completion(self, task_id, completion):
        """Save completion

        :param task_id: task id
        :param completion: json data from label (editor)
        """
        # try to get completions with task first
        task = self.get_task_with_completions(task_id)

        # init task if completions with task not exists
        if not task:
            task = deepcopy(self.source_storage.get(task_id))
            task["completions"] = []
        else:
            task = deepcopy(task)

        # remove possible stored predictions
        task.pop("predictions", None)
        # update old completion
        updated = False
        if "id" in completion:
            for i, item in enumerate(task["completions"]):
                if item["id"] == completion["id"]:
                    task["completions"][i].update(completion)
                    updated = True
        # write new completion
        if not updated:
            completion["id"] = task["id"] * 1000 + len(task["completions"]) + 1
            task["completions"].append(completion)

        try:
            self._update_derived_output_schema(completion)
        except Exception as exc:
            logger.error(exc, exc_info=True)
            logger.debug(json.dumps(completion, indent=2))

        # save completion time
        completion["created_at"] = timestamp_now()

        # write task + completions to file
        self.target_storage.set(task_id, task)
        logger.debug("Completion for task " + str(task_id) +
                     " saved with id =" + str(completion["id"]))
        return completion["id"]

    def delete_task_completion(self, task_id, completion_id):
        """Delete one task completion by id"""
        # try to get completions with task first
        task = self.get_task_with_completions(task_id)

        if not task:
            return False
        else:
            task = deepcopy(task)

        # remove completion from task
        for i, item in enumerate(task["completions"]):
            if item["id"] == completion_id:
                del task["completions"][i]

        self.update_derived_output_schema()

        # write task + completions to file
        self.target_storage.set(task_id, task)
        logger.debug("Completion " + str(completion_id) + " removed:\n")
        return True

    def delete_task_completions(self, task_id):
        """Delete all task completions"""
        self.target_storage.remove(task_id)
        self.update_derived_output_schema()

    def delete_all_completions(self):
        """Delete all completions from project"""
        self.target_storage.remove_all()
        self.update_derived_output_schema()

    def make_predictions(self, task):
        task = deepcopy(task)
        stored_predictions = task.get("predictions")
        task["predictions"] = []
        try:
            for ml_backend in self.ml_backends:
                if not ml_backend.connected:
                    continue
                predictions = ml_backend.make_predictions(task, self)
                predictions["created_by"] = ml_backend.model_name
                predictions["created_date"] = datetime.now().isoformat()
                task["predictions"].append(predictions)
        except Exception as exc:
            logger.debug(exc, exc_info=True)
        if not task["predictions"] and stored_predictions:
            task["predictions"] = stored_predictions
        return task

    def train(self):
        completions = []
        for _, c in self.target_storage.items():
            completions.append(c)
        train_status = False
        if self.ml_backends_connected:
            for ml_backend in self.ml_backends:
                if ml_backend.connected:
                    ml_backend.train(completions, self)
                    train_status = True
        return train_status

    @classmethod
    def get_project_dir(cls, project_name, args):
        return os.path.join(args.root_dir, project_name)

    @classmethod
    def get_input_data_tags(cls, label_config):
        tag_iter = ElementTree.fromstring(label_config).iter()
        return [
            tag for tag in tag_iter if tag.attrib.get("name")
            and tag.attrib.get("value", "").startswith("$")
        ]

    @classmethod
    def _load_tasks(cls, input_path, args, label_config_file):
        """Load tasks from ``input_path`` according to ``args.input_format``.

        The ``json`` and ``json-dir`` formats are loaded directly. Every other
        format needs a data key, taken from the first input data tag of the
        label config (its ``value`` attribute without the leading ``$``).

        :param input_path: file or directory to load tasks from
        :param args: arguments object providing ``input_format``
        :param label_config_file: path of the label config XML file
        :return: whatever the matching ``Tasks`` loader returns
        :raises ValueError: if a data key is needed but the config has no input data tag
        :raises RuntimeError: if the input format is not supported
        """
        with io.open(label_config_file, encoding="utf8") as config_file:
            label_config = config_file.read()

        loader = Tasks()
        input_format = args.input_format
        if input_format == "json":
            return loader.from_json_file(input_path)
        if input_format == "json-dir":
            return loader.from_dir_with_json_files(input_path)

        data_tags = cls.get_input_data_tags(label_config)
        if len(data_tags) == 0:
            raise ValueError(
                'You\'ve specified input format "{fmt}" which requires label config being explicitly defined. '
                "Please specify --label-config=path/to/config.xml or use --format=json or format=json_dir"
                .format(fmt=args.input_format))
        if len(data_tags) > 1:
            tag_names = ",".join(tag.attrib.get("name") for tag in data_tags)
            print("Warning! Multiple input data tags found: " + tag_names +
                  ". Only first one is used.")
        data_key = data_tags[0].attrib.get("value").lstrip("$")

        # Loader method name for each format that needs a data key.
        keyed_loaders = {
            "text": "from_text_file",
            "text-dir": "from_dir_with_text_files",
            "image-dir": "from_dir_with_image_files",
            "audio-dir": "from_dir_with_audio_files",
        }
        loader_name = keyed_loaders.get(input_format)
        if loader_name is None:
            raise RuntimeError("Can't load tasks for input format={}".format(
                args.input_format))
        return getattr(loader, loader_name)(input_path, data_key)

    @classmethod
    def _create_ml_backend_params(cls, url, project_name=None):
        """Build the ``{"url": ..., "name": ...}`` parameters for an ML backend.

        A ``name=http...`` string supplies the name explicitly; otherwise the
        name is the base name of ``project_name`` followed by the first four
        characters of a random UUID.

        :raises ValueError: if the URL part is rejected by ``is_url``
        """
        if "=http" in url:
            name, _, url = url.partition("=")
        else:
            prefix = os.path.basename(project_name or "")
            name = prefix + str(uuid4())[:4]

        if not is_url(url):
            raise ValueError('Specified string "' + url +
                             "\" doesn't look like URL.")
        return {"url": url, "name": name}

    @classmethod
    def create_project_dir(cls, project_name, args):
        """
        Create project directory in args.root_dir/project_name, and initialize there all required files
        If some files are missed, restore them from defaults.
        If config files are specified by args, copy them in project directory

        Files written inside the directory: config.xml (label config), tasks.json
        (unless args.source is set), a completions directory (unless args.target
        is set) and config.json (the assembled project config).
        :param project_name: name of the project; also the directory name under args.root_dir
        :param args: parsed arguments; the attributes read here include force, config_path,
            input_path, label_config, source/target (with *_path and *_params), ml_backends,
            sampling, port, host, allow_serving_local_files, key_file, cert_file and project_desc
        :return: path of the project directory
        :raises RuntimeError: if config.xml, the completions dir or config.json already exists
            and args.force is not set
        """
        dir = cls.get_project_dir(project_name, args)
        # NOTE(review): delete_dir_content presumably empties the directory - confirm
        if args.force:
            delete_dir_content(dir)
        os.makedirs(dir, exist_ok=True)

        # start from the user-supplied config file, or from the bundled default one
        config = (json_load(args.config_path) if args.config_path else
                  json_load(find_file("default_config.json")))

        def already_exists_error(what, path):
            # shared failure path for resources that must not be overwritten without --force
            raise RuntimeError(
                '{path} {what} already exists. Use "--force" option to recreate it.'
                .format(path=path, what=what))

        # the command-line value takes precedence over the one stored in the config
        input_path = args.input_path or config.get("input_path")

        # save label config
        config_xml = "config.xml"
        config_xml_path = os.path.join(dir, config_xml)
        label_config_file = args.label_config or config.get("label_config")
        if label_config_file:
            copy2(label_config_file, config_xml_path)
            print(label_config_file + " label config copied to " +
                  config_xml_path)
        else:
            if os.path.exists(config_xml_path) and not args.force:
                already_exists_error("label config", config_xml_path)
            if not input_path:
                # create default config with polygons only if input data is not set
                default_label_config = find_file(
                    "examples/image_polygons/config.xml")
                copy2(default_label_config, config_xml_path)
                print(default_label_config + " label config copied to " +
                      config_xml_path)
            else:
                # input data is given but no label config: start from an empty view
                with io.open(config_xml_path, mode="w") as fout:
                    fout.write("<View></View>")
                print("Empty config has been created in " + config_xml_path)

        # the config stores the file name only, not the full path
        config["label_config"] = config_xml

        if args.source:
            # explicit source storage requested: no local tasks.json is written
            config["source"] = {
                "type": args.source,
                "path": args.source_path,
                "params": args.source_params,
            }
        else:
            # save tasks.json
            tasks_json = "tasks.json"
            tasks_json_path = os.path.join(dir, tasks_json)
            if input_path:
                tasks = cls._load_tasks(input_path, args, config_xml_path)
            else:
                tasks = {}
            with io.open(tasks_json_path, mode="w") as fout:
                json.dump(tasks, fout, indent=2)
            config["input_path"] = tasks_json
            config["source"] = {
                "name": "Tasks",
                "type": "tasks-json",
                "path": os.path.abspath(tasks_json_path),
            }
            logger.debug(
                "{tasks_json_path} input file with {n} tasks has been created from {input_path}"
                .format(tasks_json_path=tasks_json_path,
                        n=len(tasks),
                        input_path=input_path))

        if args.target:
            # explicit target storage requested: no local completions dir is created
            config["target"] = {
                "type": args.target,
                "path": args.target_path,
                "params": args.target_params,
            }
        else:
            completions_dir = os.path.join(dir, "completions")
            if os.path.exists(completions_dir) and not args.force:
                already_exists_error("output dir", completions_dir)
            # reaching this point with an existing dir means --force was given
            if os.path.exists(completions_dir):
                delete_dir_content(completions_dir)
                print(completions_dir +
                      " output dir already exists. Clear it.")
            else:
                os.makedirs(completions_dir, exist_ok=True)
                print(completions_dir + " output dir has been created.")
            config["output_dir"] = "completions"
            config["target"] = {
                "name": "Completions",
                "type": "completions-dir",
                "path": os.path.abspath(completions_dir),
            }

        # make sure "ml_backends" is a list before extending it with the command-line URLs
        if "ml_backends" not in config or not isinstance(
                config["ml_backends"], list):
            config["ml_backends"] = []
        if args.ml_backends:
            for url in args.ml_backends:
                config["ml_backends"].append(
                    cls._create_ml_backend_params(url, project_name))

        # optional overrides: copied into the config only when set on the command line
        if args.sampling:
            config["sampling"] = args.sampling
        if args.port:
            config["port"] = args.port
        if args.host:
            config["host"] = args.host
        if args.allow_serving_local_files:
            config["allow_serving_local_files"] = True
        # https is configured only when both the key and the certificate are given
        if args.key_file and args.cert_file:
            config["protocol"] = "https://"
            config["cert"] = args.cert_file
            config["key"] = args.key_file
        # web_gui_project_desc is optional on args and wins over project_desc when set
        if (hasattr(args, "web_gui_project_desc")
                and args.web_gui_project_desc) or args.project_desc:
            config[
                "description"] = args.web_gui_project_desc or args.project_desc

        # create config.json
        config_json = "config.json"
        config_json_path = os.path.join(dir, config_json)
        if os.path.exists(config_json_path) and not args.force:
            already_exists_error("config", config_json_path)
        with io.open(config_json_path, mode="w") as f:
            json.dump(config, f, indent=2)

        print("")
        print(
            "Label Studio has been successfully initialized. Check project states in "
            + dir)
        print("Start the server: label-studio start " + dir)
        return dir

    @classmethod
    def get_config(cls, project_name, args):
        """
        Load the stored config of a project addressed by its name
        :param project_name: project name, resolved to a directory through get_project_dir
        :param args: arguments object forwarded to get_project_dir
        :return: result of _get_config for the resolved project directory
        """
        project_dir = cls.get_project_dir(project_name, args)
        return cls._get_config(project_dir)

    @classmethod
    def _get_config(cls, project_dir, args=None):
        """
        Read config.json from a project directory and resolve the paths stored in it
        :param project_dir: directory that is expected to contain config.json
        :param args: optional object with a "project_name" attribute; only used in error messages
        :return: config dict where "config_path" is absolute, "input_path"/"label_config"/"output_dir"
                 are joined with the config directory, and "source"/"target" get defaults when missing
        :raises FileNotFoundError: when the directory or its config.json doesn't exist
        """
        init_hint = ', maybe you\'ve missed appending "--init" option:\nlabel-studio start '

        def not_found(what):
            # args is absent when the call comes without a parsed namespace
            shown_name = "<project_name>" if args is None else args.project_name
            return FileNotFoundError(what + init_hint + shown_name + " --init")

        # the project directory itself must exist
        if not os.path.exists(project_dir):
            raise not_found("Couldn't find directory " + project_dir)

        # ... and it must contain config.json
        config_path = os.path.join(project_dir, "config.json")
        if not os.path.exists(config_path):
            raise not_found("Couldn't find config file " + config_path +
                            " in project directory " + project_dir)

        config_path = os.path.abspath(config_path)
        with io.open(config_path) as config_file:
            config = json.load(config_file)

        # paths inside config.json are stored relative to the config location
        base_dir = os.path.dirname(config_path)
        config["config_path"] = config_path
        if config.get("input_path"):
            config["input_path"] = os.path.join(base_dir, config["input_path"])
        config["label_config"] = os.path.join(base_dir, config["label_config"])
        if config.get("output_dir"):
            config["output_dir"] = os.path.join(base_dir, config["output_dir"])

        # fall back to the local tasks file / completions directory storages
        if not config.get("source"):
            config["source"] = {
                "name": "Tasks",
                "type": "tasks-json",
                "path": os.path.abspath(config["input_path"]),
            }
        if not config.get("target"):
            config["target"] = {
                "name": "Completions",
                "type": "completions-dir",
                "path": os.path.abspath(config["output_dir"]),
            }
        return config

    @classmethod
    def _load_from_dir(cls, project_dir, project_name, args, context):
        """
        Instantiate a project from the config stored in its directory
        :param project_dir: directory passed to _get_config
        :param project_name: name given to the new instance
        :param args: arguments object; its "root_dir" attribute is forwarded to the constructor
        :param context: context passed through to the constructor
        :return: new instance of cls
        """
        loaded_config = cls._get_config(project_dir, args)
        init_kwargs = {"context": context, "root_dir": args.root_dir}
        return cls(loaded_config, project_name, **init_kwargs)

    @classmethod
    def get(cls, project_name, args, context):
        """
        Return an already existing project
        :param project_name: name of the project
        :param args: arguments object forwarded to get_project_dir and _load_from_dir
        :param context: context forwarded to _load_from_dir
        :return: project taken from the in-memory storage, or loaded from its directory and cached
        :raises ProjectNotFound: when the project is neither cached nor present on disk
        """
        cache = cls._storage

        # in-memory storage has priority over the filesystem
        if project_name in cache:
            return cache[project_name]

        project_dir = cls.get_project_dir(project_name, args)
        if not os.path.exists(project_dir):
            raise ProjectNotFound(
                "Project {p} doesn't exist".format(p=project_name))

        # directory exists: load the project and remember it for the next calls
        loaded = cls._load_from_dir(project_dir, project_name, args, context)
        cls._storage[project_name] = loaded
        return loaded

    @classmethod
    def create(cls, project_name, args, context):
        """
        Create the project resources on disk, then load the project from them
        Unlike "get", this may create a new project directory.
        :param project_name: name of the project
        :param args: arguments object forwarded to create_project_dir and _load_from_dir
        :param context: context forwarded to _load_from_dir
        :return: loaded project, also registered in the in-memory storage
        """
        new_dir = cls.create_project_dir(project_name, args)
        cls._storage[project_name] = created = cls._load_from_dir(
            new_dir, project_name, args, context)
        return created

    @classmethod
    def get_or_create(cls, project_name, args, context):
        """
        Return the project if "get" can find it, otherwise create it
        :param project_name: name of the project
        :param args: arguments object forwarded to get / create
        :param context: context forwarded to get / create
        :return: existing or freshly created project
        """
        try:
            found = cls.get(project_name, args, context)
            logger.info('Get project "' + project_name + '".')
            return found
        except ProjectNotFound:
            # nothing in memory or on disk: build the project from scratch
            made = cls.create(project_name, args, context)
            logger.info('Project "' + project_name + '" created.')
            return made

    def update_on_boarding_state(self):
        """
        Refresh the on-boarding flags
        "setup" mirrors config["label_config_updated"], "import" is set when no_tasks() is false,
        "labeled" is set when the target storage is not empty.
        :return: the updated on_boarding dict
        """
        state = self.on_boarding
        state["setup"] = self.config.get("label_config_updated", False)
        state["import"] = not self.no_tasks()
        state["labeled"] = not self.target_storage.empty()
        return state

    @property
    def generate_sample_task_escape(self):
        """Expose generate_sample_task_escape of the underlying project object"""
        project = self.project_obj
        return project.generate_sample_task_escape

    @property
    def supported_formats(self):
        """Expose supported_formats of the underlying project object"""
        project = self.project_obj
        return project.supported_formats

    def serialize(self):
        """
        Serialize project to json dict
        :return: dict with project name, task/completion counts, config, permissions,
                 storage descriptions, syncing flags, data types and the label config line
        """
        # storage types listed here are left out of "available_storages"
        hidden_storages = ("json", "dir-jsons")
        available_storages = [
            entry for entry in get_available_storage_names().items()
            if entry[0] not in hidden_storages
        ]

        source = self.source_storage
        target = self.target_storage
        return {
            "project_name": self.name,
            "task_count": len(source.ids()),
            "completion_count": len(self.get_completions_ids()),
            "config": self.config,
            "instruction": self.config["instruction"],
            "can_manage_tasks": self.can_manage_tasks,
            "can_manage_completions": self.can_manage_completions,
            "can_delete_tasks": self.can_delete_tasks,
            "target_storage": {"readable_path": target.readable_path},
            "source_storage": {"readable_path": source.readable_path},
            "available_storages": available_storages,
            "source_syncing": source.is_syncing,
            "target_syncing": target.is_syncing,
            "data_types": self.data_types,
            "label_config_line": self.label_config_line,
        }
示例#8
0
class Project(object):

    _storage = {}

    _allowed_extensions = {
        'Text': ('.txt', ),
        'Image': ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'),
        'Audio': ('.wav', '.aiff', '.mp3', '.au', '.flac')
    }

    def __init__(self, config, name, context=None):
        """
        Build a project from its config and load everything that depends on it
        :param config: project config dict
        :param name: project name
        :param context: optional context dict; an empty dict is used when it is falsy
        """
        self.config = config
        self.name = name

        self.on_boarding = {}
        self.context = context or {}

        # placeholders, filled in by the load_* calls right below
        self.tasks = None
        self.label_config_line = None
        self.label_config_full = None
        self.input_data_tags = None
        self.derived_input_schema = None
        self.derived_output_schema = None
        self.load_tasks()
        self.load_label_config()
        self.load_derived_schemas()

        self.analytics = None
        self.load_analytics()

        self.project_obj = None
        self.ml_backend = None
        self.load_project_ml_backend()

        self.converter = None
        self.load_converter()

    def load_tasks(self):
        """
        Load tasks from config['input_path'] and derive the input schema from them
        Tasks are stored in self.tasks keyed by integer id. self.derived_input_schema becomes the set
        of data keys shared by ALL loaded tasks (empty when tasks have no key in common or when
        nothing has been loaded).
        """
        self.tasks = {}
        self.derived_input_schema = set()
        input_path = self.config['input_path']
        tasks = json_load(input_path)
        if not tasks:
            logger.warning('No tasks loaded from ' + input_path)
            return

        # None means "no task seen yet". An empty set is a legitimate intersection result and
        # must not be mistaken for the initial state, otherwise a later task would reset the schema.
        common_keys = None
        for task_id, task in tasks.items():
            self.tasks[int(task_id)] = task
            data_keys = set(task['data'].keys())
            common_keys = data_keys if common_keys is None else common_keys & data_keys
        self.derived_input_schema = common_keys

        print(
            str(len(self.tasks)) + ' tasks loaded from: ' + input_path)

    def load_label_config(self):
        """
        Read the label config file and refresh everything derived from its text
        Sets label_config_full (file content passed through config_comments_free),
        label_config_line (config_line_stripped of the former) and input_data_tags.
        """
        # context manager closes the handle deterministically instead of leaking it
        with open(self.config['label_config']) as f:
            raw_label_config = f.read()
        self.label_config_full = config_comments_free(raw_label_config)
        self.label_config_line = config_line_stripped(self.label_config_full)
        self.input_data_tags = self.get_input_data_tags(self.label_config_line)

    def load_derived_schemas(self):

        self.derived_output_schema = {
            'from_name_to_name_type': set(),
            'labels': defaultdict(set)
        }

        # for all already completed tasks we update derived output schema for further label config validation
        for task_id in self.get_task_ids():
            task_with_completions = self.get_task_with_completions(task_id)
            if task_with_completions and 'completions' in task_with_completions:
                completions = task_with_completions['completions']
                for completion in completions:
                    self._update_derived_output_schema(completion)

    def load_analytics(self):
        """
        (Re)create the Analytics object for the current label config
        The "collect_analytics" environment variable has priority over config['collect_analytics'],
        which defaults to True. String values are interpreted as flags: an empty string, "0",
        "false", "no" and "off" (case-insensitive) disable collection.
        """
        flag = os.getenv('collect_analytics')
        if flag is None:
            flag = self.config.get('collect_analytics', True)
        if isinstance(flag, str):
            # bool() of any non-empty string is True, so "0"/"false" would silently enable
            # analytics; parse the text instead
            flag = flag.strip().lower() not in ('', '0', 'false', 'no', 'off')
        collect_analytics = bool(flag)
        self.analytics = Analytics(self.label_config_line, collect_analytics,
                                   self.name, self.context)

    def load_project_ml_backend(self):
        """
        Recreate the project object and, when config['ml_backend'] is set, connect an ML backend to it
        self.ml_backend is left as it was when no backend params are configured.
        """
        self.project_obj = ProjectObj(
            label_config=self.label_config_line,
            label_config_full=self.label_config_full)

        backend_params = self.config.get('ml_backend')
        if not backend_params:
            return
        self.ml_backend = MLBackend.from_params(backend_params)
        self.project_obj.connect(self.ml_backend)

    def load_converter(self):
        """Recreate the converter from the full label config"""
        full_config = self.label_config_full
        self.converter = Converter(full_config)

    @property
    def id(self):
        return self.project_obj.id

    @property
    def data_types(self):
        return self.project_obj.data_types

    @property
    def label_config(self):
        return self.project_obj.label_config

    def extract_data_types(self, config):
        return self.project_obj.extract_data_types(config)

    def validate_label_config(self, config_string):
        """
        Validate a label config string
        The project object validates it first; then the parsed config is checked against the
        schemas derived from already imported tasks and already created completions.
        :param config_string: label config as a string
        """
        self.project_obj.validate_label_config(config_string)

        parsed = parse_config(config_string)
        self.validate_label_config_on_derived_input_schema(parsed)
        self.validate_label_config_on_derived_output_schema(parsed)

    def update_label_config(self, new_label_config):
        """
        Persist a new label config and reload everything that depends on it
        :param new_label_config: label config text written to config['label_config']
        """
        label_config_path = self.config['label_config']
        with io.open(label_config_path, mode='w') as label_config_out:
            label_config_out.write(new_label_config)

        # order matters: later loaders read what the earlier ones have set
        self.load_label_config()
        self.load_derived_schemas()
        self.load_analytics()
        self.load_project_ml_backend()
        self.load_converter()

        # remember in config.json that the label config has been changed
        self.config['label_config_updated'] = True
        with io.open(self.config['config_path'], mode='w') as config_out:
            json.dump(self.config, config_out)
        logger.info(
            'Label config saved to: {path}'.format(path=label_config_path))

    @classmethod
    def _get_single_input_value(cls, input_data_tags):
        if len(input_data_tags) > 1:
            val = ",".join(tag.attrib.get("name") for tag in input_data_tags)
            print('Warning! Multiple input data tags found: ' + val +
                  '. Only first one is used.')
        input_data_tag = input_data_tags[0]
        data_key = input_data_tag.attrib.get('value').lstrip('$')
        return data_key

    def _update_derived_output_schema(self, completion):
        """
        Given completion, output schema is updated. Output schema consists of unique tuples (from_name, to_name, type)
        and list of unique labels derived from existed completions
        :param completion:
        :return:
        """
        for result in completion['result']:
            result_type = result.get('type')
            if result_type == 'relation':
                continue
            if 'from_name' not in result or 'to_name' not in result:
                logger.error(
                    'Unexpected completion.result format: "from_name" or "to_name" not found in %r'
                    % result)
                continue

            self.derived_output_schema['from_name_to_name_type'].add(
                (result['from_name'], result['to_name'], result_type))
            for label in result['value'].get(result_type, []):
                self.derived_output_schema['labels'][result['from_name']].add(
                    label)

    def validate_label_config_on_derived_input_schema(
            self, config_string_or_parsed_config):
        """
        Validate label config on input schemas (tasks types and data keys) derived from imported tasks
        :param config_string_or_parsed_config: label config string or parsed config object
        :return: True if config match already imported tasks
        """

        # check if schema exists, i.e. at least one task has been uploaded
        if not self.derived_input_schema:
            return

        config = config_string_or_parsed_config
        if isinstance(config, str):
            config = parse_config(config)
        input_types, input_values = set(), set()
        for input_items in map(itemgetter('inputs'), config.values()):
            for input_item in input_items:
                input_types.add(input_item['type'])
                input_values.add(input_item['value'])

        # check input data values: they must be in schema
        for item in input_values:
            if item not in self.derived_input_schema:
                raise ValidationError(
                    'You have already imported tasks and they are incompatible with a new config. '
                    'You\'ve specified value=${item}, but imported tasks contain only keys: {input_schema_values}'
                    .format(item=item,
                            input_schema_values=list(
                                self.derived_input_schema)))

    def validate_label_config_on_derived_output_schema(
            self, config_string_or_parsed_config):
        """
        Validate label config on output schema (from_names, to_names and labeling types) derived from completions
        :param config_string_or_parsed_config: label config string or parsed config object
        :return: True if config match already created completions
        """
        output_schema = self.derived_output_schema

        # check if schema exists, i.e. at least one completion has been created
        if not output_schema['from_name_to_name_type']:
            return

        config = config_string_or_parsed_config
        if isinstance(config, str):
            config = parse_config(config)

        completion_tuples = set()

        for from_name, to in config.items():
            completion_tuples.add(
                (from_name, to['to_name'][0], to['type'].lower()))

        for from_name, to_name, type in output_schema[
                'from_name_to_name_type']:
            if (from_name, to_name, type) not in completion_tuples:
                raise ValidationError(
                    'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: '
                    'name={from_name}, toName={to_name}, type={type} are expected'
                    .format(from_name=from_name, to_name=to_name, type=type))
        for from_name, expected_label_set in output_schema['labels'].items():
            if from_name not in config:
                raise ValidationError(
                    'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: '
                    'name=' + from_name + ' is expected')
            found_labels = set(config[from_name]['labels'])
            extra_labels = list(expected_label_set - found_labels)
            if extra_labels:
                raise ValidationError(
                    'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: '
                    'there are labels already created for "{from_name}":\n{extra_labels}'
                    .format(from_name=from_name, extra_labels=extra_labels))

    def get_tasks(self):
        """ Load tasks from JSON files in input_path directory

        :return: file list
        """
        return self.tasks

    def delete_tasks(self):
        """
        Remove all tasks and completions, then reload the now empty project state
        The content of config['output_dir'] is deleted, the input file (when it is an existing
        regular file) is overwritten with an empty JSON object and the ML backend, if any, is cleared.
        :return:
        """
        delete_dir_content(self.config['output_dir'])

        input_path = self.config['input_path']
        if os.path.exists(input_path) and os.path.isfile(input_path):
            with io.open(self.config['input_path'], mode='w') as tasks_out:
                json.dump({}, tasks_out)

        # the backend keeps its own state for this project
        if self.ml_backend:
            self.ml_backend.clear(self)

        # refresh in-memory tasks and the schema derived from completions
        self.load_tasks()
        self.load_derived_schemas()

    def next_task(self, completed_tasks_ids):
        completed_tasks_ids = set(completed_tasks_ids)
        sampling = self.config.get('sampling', 'sequential')
        if sampling == 'sequential':
            actual_tasks = (self.tasks[task_id] for task_id in self.tasks
                            if task_id not in completed_tasks_ids)
            return next(actual_tasks, None)
        elif sampling == 'uniform':
            actual_tasks_ids = [
                task_id for task_id in self.tasks
                if task_id not in completed_tasks_ids
            ]
            if not actual_tasks_ids:
                return None
            random.shuffle(actual_tasks_ids)
            return self.tasks[actual_tasks_ids[0]]
        else:
            raise NotImplementedError('Unknown sampling method ' + sampling)

    def get_task_ids(self):
        """ Get task ids only

        :return: list of task ids
        """
        return list(self.tasks.keys())

    def get_task(self, task_id):
        """ Get one task

        :param task_id:
        :return: task
        """
        try:
            task_id = int(task_id)
        except ValueError:
            return None
        return self.tasks.get(task_id)

    def get_completions_ids(self):
        """ List completion ids found in the output_dir directory

        The directory is created when it doesn't exist yet. Completion files are named
        "<task id>.json"; .json files whose name is not an integer are logged and skipped.

        :return: sorted list of integer ids
        """
        root_dir = self.config['output_dir']
        # exist_ok avoids the check-then-create race and also creates missing parent directories
        os.makedirs(root_dir, exist_ok=True)

        completions = []
        for file_name in os.listdir(root_dir):
            if not file_name.endswith('.json'):
                continue
            try:
                completions.append(int(os.path.splitext(file_name)[0]))
            except ValueError:
                # a stray file must not break listing of the valid completions
                logger.warning(
                    'Skipping unexpected file {name} in {output_dir}'.format(
                        name=file_name, output_dir=root_dir))
        logger.debug('{num} completions found in {output_dir}'.format(
            num=len(completions), output_dir=self.config["output_dir"]))
        return sorted(completions)

    def get_completed_at(self, task_ids):
        """ Get completion time for the given task ids

        The time is the modification time of the completion file, formatted as
        '%Y-%m-%d %H:%M:%S'. Ids without a completion file are left out.

        :param task_ids: iterable of task ids
        :return: dict mapping task id to the formatted datetime string
        """
        root_dir = self.config['output_dir']
        completed_ids = set(self.get_completions_ids()).intersection(task_ids)

        modified = {}
        for completion_id in completed_ids:
            completion_path = os.path.join(root_dir, str(completion_id) + '.json')
            modified[completion_id] = os.path.getmtime(completion_path)

        completed_at = {}
        for completion_id, timestamp in modified.items():
            moment = datetime.fromtimestamp(timestamp)
            completed_at[completion_id] = moment.strftime('%Y-%m-%d %H:%M:%S')
        return completed_at

    def get_task_with_completions(self, task_id):
        """ Get task with completions

        :param task_id: task ids
        :return: json dict with completion
        """
        try:
            task_id = int(
                task_id
            )  # check task_id is int (disallow to escape from output_dir)
        except ValueError:
            return None

        if 'completions' in self.tasks[task_id]:
            return self.tasks[task_id]

        filename = os.path.join(self.config['output_dir'],
                                str(task_id) + '.json')

        if os.path.exists(filename):
            data = json.load(open(filename))
            # tasks can hold the newest version of predictions, so task it from tasks
            data['predictions'] = self.tasks[task_id].get('predictions', [])
        else:
            data = None
        return data

    def save_completion(self, task_id, completion):
        """ Save completion

        :param task_id: task id
        :param completion: json data from label (editor)
        """

        # try to get completions with task first
        task = self.get_task_with_completions(task_id)

        # init task if completions with task not exists
        if not task:
            task = self.get_task(task_id)
            task['completions'] = []

        # update old completion
        updated = False
        if 'id' in completion:
            for i, item in enumerate(task['completions']):
                if item['id'] == completion['id']:
                    task['completions'][i].update(completion)
                    updated = True

        # write new completion
        if not updated:
            completion['id'] = task['id'] * 1000 + len(task['completions']) + 1
            task['completions'].append(completion)

        self._update_derived_output_schema(completion)

        # write task + completions to file
        filename = os.path.join(self.config['output_dir'],
                                str(task_id) + '.json')
        os.mkdir(self.config['output_dir']) if not os.path.exists(
            self.config['output_dir']) else ()
        json.dump(task, open(filename, 'w'), indent=4, sort_keys=True)
        return completion['id']

    def delete_completion(self, task_id):
        """ Delete completion from disk

        :param task_id: task id
        """
        filename = os.path.join(self.config['output_dir'],
                                str(task_id) + '.json')
        os.remove(filename)

    @classmethod
    def get_project_dir(cls, project_name, args):
        return os.path.join(args.root_dir, project_name)

    @classmethod
    def get_input_data_tags(cls, label_config):
        tag_iter = ElementTree.fromstring(label_config).iter()
        return [
            tag for tag in tag_iter if tag.attrib.get('name')
            and tag.attrib.get('value', '').startswith('$')
        ]

    @classmethod
    def _load_tasks(cls, input_path, args, label_config_file):
        """
        Load tasks from input_path according to args.input_format
        "json" and "json-dir" are loaded as is; "text", "text-dir", "image-dir" and "audio-dir"
        additionally need the data key taken from the first input data tag of the label config.
        :param input_path: file or directory with the input data
        :param args: object with an "input_format" attribute
        :param label_config_file: path to the label config file
        :return: whatever the matching Tasks loader returns
        :raises RuntimeError: for an unsupported input format
        """
        with io.open(label_config_file) as config_in:
            label_config = config_in.read()

        task_loader = Tasks()
        input_format = args.input_format

        # formats which don't depend on the label config
        for fmt, loader_name in (('json', 'from_json_file'),
                                 ('json-dir', 'from_dir_with_json_files')):
            if input_format == fmt:
                return getattr(task_loader, loader_name)(input_path)

        data_key = Project._get_single_input_value(
            cls.get_input_data_tags(label_config))

        # formats which store raw data under the key referenced by the label config
        for fmt, loader_name in (('text', 'from_text_file'),
                                 ('text-dir', 'from_dir_with_text_files'),
                                 ('image-dir', 'from_dir_with_image_files'),
                                 ('audio-dir', 'from_dir_with_audio_files')):
            if input_format == fmt:
                return getattr(task_loader, loader_name)(input_path, data_key)

        raise RuntimeError('Can\'t load tasks for input format={}'.format(
            args.input_format))

    @classmethod
    def create_project_dir(cls, project_name, args):
        """
        Create project directory in args.root_dir/project_name, and initialize there all required files
        (config.xml, tasks.json, completions directory and config.json).
        If some files are missed, restore them from defaults.
        If config files are specified by args, copy them in project directory
        :param project_name: name of the project; the directory is resolved with get_project_dir
        :param args: arguments object; the attributes read here are config_path, input_path,
                     label_config, force, ml_backend_url, ml_backend_name and (through _load_tasks)
                     input_format
        :return: path of the project directory
        :raises RuntimeError: when a resource that would be generated already exists and
                              args.force is not set
        """
        dir = cls.get_project_dir(project_name, args)
        os.makedirs(dir, exist_ok=True)

        # start from the user supplied config, or from the bundled default one
        config = json_load(
            args.config_path) if args.config_path else json_load(
                find_file('default_config.json'))

        def already_exists_error(what, path):
            # shared failure for every resource that must not be overwritten without --force
            raise RuntimeError(
                '{path} {what} already exists. Use "--force" option to recreate it.'
                .format(path=path, what=what))

        # command line value has priority over the one from the config
        input_path = args.input_path or config.get('input_path')

        # save label config
        config_xml = 'config.xml'
        config_xml_path = os.path.join(dir, config_xml)
        label_config_file = args.label_config or config.get('label_config')
        if label_config_file:
            # an explicitly given label config is copied without checking for an existing file
            copy2(label_config_file, config_xml_path)
            print(label_config_file + ' label config copied to ' +
                  config_xml_path)
        else:
            if os.path.exists(config_xml_path) and not args.force:
                already_exists_error('label config', config_xml_path)
            if not input_path:
                # create default config with polygons only if input data is not set
                default_label_config = find_file(
                    'examples/image_polygons/config.xml')
                copy2(default_label_config, config_xml_path)
                print(default_label_config + ' label config copied to ' +
                      config_xml_path)
            else:
                # input data is given but no label config: start from an empty view
                with io.open(config_xml_path, mode='w') as fout:
                    fout.write('<View></View>')
                print('Empty config has been created in ' + config_xml_path)

        # stored as a bare file name, i.e. relative to the project directory
        config['label_config'] = config_xml

        # save tasks.json
        tasks_json = 'tasks.json'
        tasks_json_path = os.path.join(dir, tasks_json)
        if input_path:
            # tasks converted from the input data overwrite tasks.json unconditionally
            tasks = cls._load_tasks(input_path, args, config_xml_path)
            with io.open(tasks_json_path, mode='w') as fout:
                json.dump(tasks, fout, indent=2)
            print(tasks_json_path + ' input path has been created from ' +
                  input_path)
        else:
            if os.path.exists(tasks_json_path) and not args.force:
                already_exists_error('input path', tasks_json_path)
            with io.open(tasks_json_path, mode='w') as fout:
                json.dump({}, fout)
            print(tasks_json_path +
                  ' input path has been created with empty tasks.')
        config['input_path'] = tasks_json

        # create completions dir
        completions_dir = os.path.join(dir, 'completions')
        if os.path.exists(completions_dir) and not args.force:
            already_exists_error('output dir', completions_dir)
        if os.path.exists(completions_dir):
            # reached only with --force: existing completions are wiped
            delete_dir_content(completions_dir)
            print(completions_dir + ' output dir already exists. Clear it.')
        else:
            os.makedirs(completions_dir, exist_ok=True)
            print(completions_dir + ' output dir has been created.')
        config['output_dir'] = 'completions'

        if args.ml_backend_url:
            # make sure there is a dict to put the backend settings into
            if 'ml_backend' not in config or not isinstance(
                    config['ml_backend'], dict):
                config['ml_backend'] = {}
            config['ml_backend']['url'] = args.ml_backend_url
            if args.ml_backend_name:
                config['ml_backend']['name'] = args.ml_backend_name
            else:
                # no name given: generate a random one
                config['ml_backend']['name'] = str(uuid4())

        # create config.json
        config_json = 'config.json'
        config_json_path = os.path.join(dir, config_json)
        if os.path.exists(config_json_path) and not args.force:
            already_exists_error('config', config_json_path)
        with io.open(config_json_path, mode='w') as f:
            json.dump(config, f, indent=2)

        print('')
        print(
            'Label Studio has been successfully initialized. Check project states in '
            + dir)
        print('Start the server: label-studio start ' + dir)
        return dir

    @classmethod
    def _get_config(cls, project_dir, args):
        """
        Get config from input args Namespace acquired by Argparser
        :param args:
        :return:
        """
        # check if project directory exists
        if not os.path.exists(project_dir):
            raise FileNotFoundError(
                'Couldn\'t find directory ' + project_dir +
                ', maybe you\'ve missed appending "--init" option:\nlabel-studio start '
                + args.project_name + ' --init')

        # check config.json exists in directory
        config_path = os.path.join(project_dir, 'config.json')
        if not os.path.exists(config_path):
            raise FileNotFoundError(
                'Couldn\'t find config file ' + config_path +
                ' in project directory ' + project_dir +
                ', maybe you\'ve missed appending "--init" option:\nlabel-studio start '
                + args.project_name + ' --init')

        config_path = os.path.abspath(config_path)
        with io.open(config_path) as c:
            config = json.load(c)

        config['config_path'] = config_path
        config['input_path'] = os.path.join(os.path.dirname(config_path),
                                            config['input_path'])
        config['label_config'] = os.path.join(os.path.dirname(config_path),
                                              config['label_config'])
        config['output_dir'] = os.path.join(os.path.dirname(config_path),
                                            config['output_dir'])
        return config

    @classmethod
    def _load_from_dir(cls, project_dir, project_name, args, context):
        config = cls._get_config(project_dir, args)
        return cls(config, project_name, context)

    @classmethod
    def get(cls, project_name, args, context):

        # If project stored in memory, just return it
        if project_name in cls._storage:
            return cls._storage[project_name]

        # If project directory exists, load project from directory and update in-memory storage
        project_dir = cls.get_project_dir(project_name, args)
        if os.path.exists(project_dir):
            project = cls._load_from_dir(project_dir, project_name, args,
                                         context)
            cls._storage[project_name] = project
            return project

        raise ProjectNotFound(
            'Project {p} doesn\'t exist'.format(p=project_name))

    @classmethod
    def create(cls, project_name, args, context):
        # "create" method differs from "get" as it can create new directory with project resources
        project_dir = cls.create_project_dir(project_name, args)
        project = cls._load_from_dir(project_dir, project_name, args, context)
        cls._storage[project_name] = project
        return project

    @classmethod
    def get_or_create(cls, project_name, args, context):
        """
        Return the project if "get" can find it, otherwise create it
        :param project_name: name of the project
        :param args: arguments object forwarded to get / create
        :param context: context forwarded to get / create
        :return: existing or freshly created project
        """
        try:
            found = cls.get(project_name, args, context)
            logger.info('Get project "' + project_name + '".')
            return found
        except ProjectNotFound:
            # nothing in memory or on disk: build the project from scratch
            made = cls.create(project_name, args, context)
            logger.info('Project "' + project_name + '" created.')
            return made

    def update_on_boarding_state(self):
        self.on_boarding['setup'] = self.config.get('label_config_updated',
                                                    False)
        self.on_boarding['import'] = len(self.tasks) > 0
        self.on_boarding['labeled'] = len(os.listdir(
            self.config['output_dir'])) > 0
        return self.on_boarding
示例#9
0
class Project(object):

    _storage = {}

    def __init__(self, config, name, root_dir='.', context=None):
        """Initialise a project from an already loaded config dict.

        :param config: project config dict; must contain the ``source``,
            ``target`` and ``label_config`` keys read by the loaders below
        :param name: project name, also used as the last component of the path
        :param root_dir: directory joined with ``name`` to form ``self.path``
        :param context: optional context object; a falsy value becomes ``{}``
        """
        self.config = config
        self.name = name
        self.path = os.path.join(root_dir, self.name)
        self.ml_backends = []

        self.on_boarding = {}
        self.context = context or {}
        self.project_obj = None
        self.source_storage = None
        self.target_storage = None
        # Storages are built first: the schema derivation below iterates them
        self.create_storages()

        self.tasks = None
        self.label_config_line, self.label_config_full, self.parsed_label_config, self.input_data_tags = None, None, None, None  # noqa
        self.derived_input_schema, self.derived_output_schema = None, None

        # Order matters: the label config must be parsed before the project
        # object and the converter are created from it
        self.load_label_config()
        self.load_project_and_ml_backends()
        self.update_derived_input_schema()
        self.update_derived_output_schema()

        self.converter = None
        self.load_converter()
        # NOTE(review): the unit of this limit is not visible here - confirm
        self.max_tasks_file_size = 250

    def get_storage(self, storage_for):
        if storage_for == 'source':
            return self.source_storage
        elif storage_for == 'target':
            return self.target_storage

    def get_available_storage_names(self, storage_for):
        if storage_for == 'source':
            return self.get_available_source_storage_names()
        elif storage_for == 'target':
            return self.get_available_target_storage_names()

    @classmethod
    def get_available_source_storages(cls):
        return ['tasks-json', 's3', 'gcs']

    @classmethod
    def get_available_target_storages(cls):
        return ['completions-dir', 's3-completions', 'gcs-completions']

    def get_available_source_storage_names(self):
        """Return ``name -> description`` for storages usable as task source.

        The global registry returned by ``get_available_storage_names()`` is
        filtered down to the types listed by ``get_available_source_storages``,
        keeping the registry's order.

        :return: ``OrderedDict`` of the permitted storage names
        """
        allowed = set(self.get_available_source_storages())
        # we don't expose configurable filesystem storage in UI to avoid security problems
        return OrderedDict(
            (name, desc)
            for name, desc in get_available_storage_names().items()
            if name in allowed)

    def get_available_target_storage_names(self):
        """Return ``name -> description`` for storages usable as completions target.

        The global registry returned by ``get_available_storage_names()`` is
        filtered down to the types listed by ``get_available_target_storages``,
        keeping the registry's order.

        :return: ``OrderedDict`` of the permitted storage names
        """
        allowed = set(self.get_available_target_storages())
        # blobs have no sense for target storages
        return OrderedDict(
            (name, desc)
            for name, desc in get_available_storage_names().items()
            if name in allowed)

    def create_storages(self):
        """Instantiate the source and target storages described in the config.

        Reads ``config['source']`` and ``config['target']`` (each providing
        ``type``, ``path`` and optional ``params``) and assigns the results to
        ``self.source_storage`` and ``self.target_storage``.
        """
        source_cfg = self.config['source']
        target_cfg = self.config['target']

        def _build(role, cfg):
            # Optional per-storage params are passed through as keyword args
            return create_storage(cfg['type'], role, cfg['path'], self.path,
                                  self, **cfg.get('params', {}))

        self.source_storage = _build('source', source_cfg)
        self.target_storage = _build('target', target_cfg)

    def update_storage(self, storage_for, storage_kwargs):
        """Replace the source or target storage and persist the new settings.

        ``name``, ``type`` and ``path`` are popped from ``storage_kwargs`` (so
        the caller's dict is modified); whatever remains is passed to
        ``create_storage`` and saved as the storage ``params``. Derived schemas
        are recomputed afterwards, even when ``storage_for`` matches no role.

        :param storage_for: ``'source'`` or ``'target'``
        :param storage_kwargs: dict with a mandatory ``type`` and optional
            ``name`` / ``path`` plus storage specific parameters
        :raises KeyError: if ``type`` is missing from ``storage_kwargs``
        """
        if storage_for in ('source', 'target'):
            storage_name = storage_kwargs.pop('name', storage_for)
            storage_type = storage_kwargs.pop('type')
            storage_path = storage_kwargs.pop('path', None)
            new_storage = create_storage(storage_type, storage_name,
                                         storage_path, self.path, self,
                                         **storage_kwargs)
            self.config[storage_for] = {
                'name': storage_name,
                'type': storage_type,
                'path': storage_path,
                'params': storage_kwargs
            }
            self._save_config()
            logger.debug('Created storage type "' + storage_type + '"')

            if storage_for == 'source':
                self.source_storage = new_storage
            else:
                self.target_storage = new_storage

        self.update_derived_input_schema()
        self.update_derived_output_schema()

    @property
    def can_manage_tasks(self):
        return self.config['source']['type'] not in {
            's3', 's3-completions', 'gcs', 'gcs-completions'
        }

    @property
    def can_manage_completions(self):
        return self.config['target']['type'] not in {
            's3', 's3-completions', 'gcs', 'gcs-completions'
        }

    @property
    def can_delete_tasks(self):
        return self.can_manage_tasks and self.can_manage_completions

    @property
    def data_types_json(self):
        return self.project_obj.data_types_json

    def load_label_config(self):
        """Read the label config file and refresh everything derived from it.

        Populates ``label_config_full`` (comments removed),
        ``label_config_line`` (line-stripped form), ``parsed_label_config``
        and ``input_data_tags`` from the file named by
        ``config['label_config']``.
        """
        # Read through a context manager so the handle is closed right away
        # instead of lingering until garbage collection
        with open(self.config['label_config'], encoding='utf8') as f:
            raw_config = f.read()

        self.label_config_full = config_comments_free(raw_config)
        self.label_config_line = config_line_stripped(self.label_config_full)
        self.parsed_label_config = parse_config(self.label_config_line)
        self.input_data_tags = self.get_input_data_tags(self.label_config_line)

    def update_derived_input_schema(self):
        self.derived_input_schema = set()
        for task_id, task in self.source_storage.items():
            data_keys = set(task['data'].keys())
            if not self.derived_input_schema:
                self.derived_input_schema = data_keys
            else:
                self.derived_input_schema &= data_keys
        logger.debug('Derived input schema: ' + str(self.derived_input_schema))

    def update_derived_output_schema(self):
        self.derived_output_schema = {
            'from_name_to_name_type': set(),
            'labels': defaultdict(set)
        }

        # for all already completed tasks we update derived output schema for further label config validation
        for task_id, c in self.target_storage.items():
            for completion in c['completions']:
                self._update_derived_output_schema(completion)
        logger.debug('Derived output schema: ' +
                     str(self.derived_output_schema))

    def add_ml_backend(self, params, raise_on_error=True):
        """Create an ML backend from ``params`` and register it in memory.

        :param params: backend parameters passed to ``MLBackend.from_params``;
            ``params['url']`` is used in the error message
        :param raise_on_error: when true, refuse a backend that is not connected
        :raises ValueError: if the backend is not connected and
            ``raise_on_error`` is true
        """
        backend = MLBackend.from_params(params)
        is_connected = backend.connected
        if raise_on_error and not is_connected:
            raise ValueError('ML backend with URL: "' + str(params['url']) +
                             '" is not connected.')
        self.ml_backends.append(backend)

    def remove_ml_backend(self, name):
        # remove from memory
        remove_idx = next(
            (i
             for i, b in enumerate(self.ml_backends) if b.model_name == name),
            None)
        if remove_idx is None:
            raise KeyError('Can\'t remove ML backend with name "' + name +
                           '": not found.')
        self.ml_backends.pop(remove_idx)

        # remove from config
        config_params = self.config.get('ml_backends', [])
        remove_idx = next(
            (i for i, b in enumerate(config_params) if b['name'] == name),
            None)
        if remove_idx is not None:
            config_params.pop(remove_idx)
        self.config['ml_backends'] = config_params
        self._save_config()

    def load_project_and_ml_backends(self):
        """Recreate the project object and reload ML backends from the config.

        Backends listed under ``config['ml_backends']`` are added with
        ``raise_on_error=False``, so a backend that is not connected is still
        registered instead of aborting the load.
        """
        # configure project
        self.project_obj = ProjectObj(
            label_config=self.label_config_line,
            label_config_full=self.label_config_full)

        # configure multiple machine learning backends
        self.ml_backends = []
        for backend_params in self.config.get('ml_backends', []):
            self.add_ml_backend(backend_params, raise_on_error=False)

    def load_converter(self):
        """Rebuild ``self.converter`` from the currently parsed label config."""
        parsed_config = self.parsed_label_config
        self.converter = Converter(parsed_config)

    @property
    def id(self):
        return self.project_obj.id

    @property
    def data_types(self):
        return self.project_obj.data_types

    @property
    def label_config(self):
        return self.project_obj.label_config

    @property
    def ml_backends_connected(self):
        return len(self.ml_backends) > 0

    @property
    def task_data_login(self):
        return self.project_obj.task_data_login

    @property
    def task_data_password(self):
        return self.project_obj.task_data_password

    def extract_data_types(self, config):
        return self.project_obj.extract_data_types(config)

    def validate_label_config(self, config_string):
        """Validate a label config against the project and its existing data.

        The config is first checked by the project object, then parsed once and
        checked against the schemas derived from imported tasks and from
        stored completions. Any validator may raise to reject the config.

        :param config_string: label config as a string
        """
        logger.debug('Validate label config')
        self.project_obj.validate_label_config(config_string)

        logger.debug('Get parsed config')
        parsed = parse_config(config_string)

        # The same parsed config feeds both schema checks
        schema_checks = (
            ('input', self.validate_label_config_on_derived_input_schema),
            ('output', self.validate_label_config_on_derived_output_schema),
        )
        for schema_name, check in schema_checks:
            logger.debug('Validate label config on derived ' + schema_name +
                         ' schema')
            check(parsed)

    def _save_config(self):
        with io.open(self.config['config_path'], mode='w') as f:
            json.dump(self.config, f, indent=2)

    def update_params(self, params):
        if 'ml_backend' in params:
            ml_backend_params = self._create_ml_backend_params(
                params['ml_backend'], self.name)
            self.add_ml_backend(ml_backend_params)
            self.config['ml_backends'].append(ml_backend_params)
            self._save_config()

    def update_label_config(self, new_label_config):
        label_config_file = self.config['label_config']
        # save xml label config to file
        new_label_config = new_label_config.replace('\r\n', '\n')
        with io.open(label_config_file, mode='w', encoding='utf8') as f:
            f.write(new_label_config)

        # reload everything that depends on label config
        self.load_label_config()
        self.update_derived_output_schema()
        self.load_project_and_ml_backends()
        self.load_converter()

        # save project config state
        self.config['label_config_updated'] = True
        with io.open(self.config['config_path'], mode='w',
                     encoding='utf8') as f:
            json.dump(self.config, f)
        logger.info(
            'Label config saved to: {path}'.format(path=label_config_file))

    def _update_derived_output_schema(self, completion):
        """
        Given completion, output schema is updated. Output schema consists of unique tuples (from_name, to_name, type)
        and list of unique labels derived from existed completions
        :param completion:
        :return:
        """
        for result in completion['result']:
            result_type = result.get('type')
            if result_type in ('relation', 'rating', 'pairwise'):
                continue
            if 'from_name' not in result or 'to_name' not in result:
                logger.error(
                    'Unexpected completion.result format: "from_name" or "to_name" not found in %r'
                    % result)
                continue

            self.derived_output_schema['from_name_to_name_type'].add(
                (result['from_name'], result['to_name'], result_type))
            for label in result['value'].get(result_type, []):
                self.derived_output_schema['labels'][result['from_name']].add(
                    label)

    def validate_label_config_on_derived_input_schema(
            self, config_string_or_parsed_config):
        """
        Validate label config on input schemas (tasks types and data keys) derived from imported tasks
        :param config_string_or_parsed_config: label config string or parsed config object
        :return: True if config match already imported tasks
        """

        # check if schema exists, i.e. at least one task has been uploaded
        if not self.derived_input_schema:
            return

        config = config_string_or_parsed_config
        if isinstance(config, str):
            config = parse_config(config)
        input_types, input_values = set(), set()
        for input_items in map(itemgetter('inputs'), config.values()):
            for input_item in input_items:
                input_types.add(input_item['type'])
                input_values.add(input_item['value'])

        # check input data values: they must be in schema
        for item in input_values:
            if item not in self.derived_input_schema:
                raise ValidationError(
                    'You have already imported tasks and they are incompatible with a new config. '
                    'You\'ve specified value=${item}, but imported tasks contain only keys: {input_schema_values}'
                    .format(item=item,
                            input_schema_values=list(
                                self.derived_input_schema)))

    def validate_label_config_on_derived_output_schema(
            self, config_string_or_parsed_config):
        """
        Validate label config on output schema (from_names, to_names and labeling types) derived from completions
        :param config_string_or_parsed_config: label config string or parsed config object
        :return: True if config match already created completions
        """
        output_schema = self.derived_output_schema

        # check if schema exists, i.e. at least one completion has been created
        if not output_schema['from_name_to_name_type']:
            return

        config = config_string_or_parsed_config
        if isinstance(config, str):
            config = parse_config(config)
        completion_tuples = set()

        for from_name, to in config.items():
            completion_tuples.add(
                (from_name, to['to_name'][0], to['type'].lower()))
        for from_name, to_name, type in output_schema[
                'from_name_to_name_type']:
            if (from_name, to_name, type) not in completion_tuples:
                raise ValidationError(
                    'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: '
                    'name={from_name}, toName={to_name}, type={type} are expected'
                    .format(from_name=from_name, to_name=to_name, type=type))
        for from_name, expected_label_set in output_schema['labels'].items():
            if from_name not in config:
                raise ValidationError(
                    'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: '
                    'name=' + from_name + ' is expected')
            found_labels = set(config[from_name]['labels'])
            extra_labels = list(expected_label_set - found_labels)
            if extra_labels:
                raise ValidationError(
                    'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: '
                    'there are labels already created for "{from_name}":\n{extra_labels}'
                    .format(from_name=from_name, extra_labels=extra_labels))

    def no_tasks(self):
        return self.source_storage.empty()

    def delete_tasks(self):
        """
        Deletes all tasks & completions from filesystem, then reloads clean project
        :return:
        """
        self.source_storage.remove_all()
        self.target_storage.remove_all()
        self.update_derived_input_schema()
        self.update_derived_output_schema()

        # delete everything on ML backend
        if self.ml_backends_connected:
            for m in self.ml_backends:
                m.clear(self)

    def next_task(self, completed_tasks_ids):
        completed_tasks_ids = set(completed_tasks_ids)
        sampling = self.config.get('sampling', 'sequential')

        # Tasks are ordered ascending by their "id" fields. This is default mode.
        task_iter = filter(lambda i: i not in completed_tasks_ids,
                           sorted(self.source_storage.ids()))
        if sampling == 'sequential':
            task_id = next(task_iter, None)
            if task_id is not None:
                return self.source_storage.get(task_id)

        # Tasks are sampled with equal probabilities
        elif sampling == 'uniform':
            actual_tasks_ids = list(task_iter)
            if not actual_tasks_ids:
                return None
            random.shuffle(actual_tasks_ids)
            return self.source_storage.get(actual_tasks_ids[0])

        # Task with minimum / maximum average prediction score is taken
        elif sampling.startswith('prediction-score'):
            id_score_map = {}
            for task_id, task in self.source_storage.items():
                if task_id in completed_tasks_ids:
                    continue
                if 'predictions' in task and len(task['predictions']) > 0:
                    score = sum((p['score'] for p in task['predictions']),
                                0) / len(task['predictions'])
                    id_score_map[task_id] = score
            if not id_score_map:
                return None
            if sampling.endswith('-min'):
                best_idx = min(id_score_map, key=id_score_map.get)
            elif sampling.endswith('-max'):
                best_idx = max(id_score_map, key=id_score_map.get)
            else:
                raise NotImplementedError('Unknown sampling method ' +
                                          sampling)
            return self.source_storage.get(best_idx)
        else:
            raise NotImplementedError('Unknown sampling method ' + sampling)

    def remove_task(self, task_id):
        self.source_storage.remove(task_id)
        self.delete_completion(task_id)

        self.update_derived_input_schema()
        self.update_derived_output_schema()

    def get_completions_ids(self):
        """ List completion ids from output_dir directory

        :return: filenames without extensions and directories
        """
        task_ids = set(self.source_storage.ids())
        completion_ids = set(self.target_storage.ids())
        completions = completion_ids.intersection(task_ids)
        #completions = list(self.target_storage.ids())
        logger.debug('{num} completions found in {output_dir}'.format(
            num=len(completions), output_dir=self.config["output_dir"]))
        return sorted(completions)

    def get_completed_at(self):
        """ Get completed time for tasks

        :return: list of string with formatted datetime
        """
        times = {}
        for _, data in self.target_storage.items():
            id = data['id']
            try:
                times[id] = max(data['completions'],
                                key=itemgetter('created_at'))['created_at']
            except Exception as exc:
                times[id] = 'undefined'
        return times

    def get_skipped_status(self):
        """ Get skipped status for tasks: returns skipped completion number for task

        :return: list of int
        """
        items = {}
        for _, data in self.target_storage.items():
            id = data['id']
            try:
                flag = sum([
                    completion.get('skipped', False)
                    for completion in data['completions']
                ])
            except Exception as exc:
                items[id] = -1
            else:
                items[id] = flag
        return items

    def get_task_with_completions(self, task_id):
        """ Get task with completions

        :param task_id: task ids
        :return: json dict with completion
        """
        data = self.target_storage.get(task_id)
        logger.debug('Get task ' + str(task_id) + ' from target storage')

        if data:
            logger.debug('Get predictions ' + str(task_id) +
                         ' from source storage')
            # tasks can hold the newest version of predictions, so task it from tasks
            data['predictions'] = self.source_storage.get(task_id).get(
                'predictions', [])
        return data

    def save_completion(self, task_id, completion):
        """ Save completion

        Merges ``completion`` into the completions stored for the task (or
        appends it as a new one), updates the derived output schema, stamps
        ``completion['created_at']`` and writes the task to the target storage.
        The passed ``completion`` dict is modified in place.

        :param task_id: task id
        :param completion: json data from label (editor)
        :return: id of the saved completion
        """
        # try to get completions with task first
        task = self.get_task_with_completions(task_id)

        # init task if completions with task not exists
        if not task:
            task = deepcopy(self.source_storage.get(task_id))
            task['completions'] = []
        else:
            # work on a copy so the object returned by the storage is not
            # mutated before the explicit ``set`` below
            task = deepcopy(task)

        # remove possible stored predictions
        task.pop('predictions', None)
        # update old completion
        updated = False
        if 'id' in completion:
            for i, item in enumerate(task['completions']):
                if item['id'] == completion['id']:
                    task['completions'][i].update(completion)
                    updated = True
        # write new completion
        if not updated:
            # new id is derived from the task id and the completion's position
            completion['id'] = task['id'] * 1000 + len(task['completions']) + 1
            task['completions'].append(completion)

        # schema update is best effort: a malformed completion is logged but
        # still saved
        try:
            self._update_derived_output_schema(completion)
        except Exception as exc:
            logger.error(exc, exc_info=True)
            logger.debug(json.dumps(completion, indent=2))

        # save completion time
        # NOTE(review): on the update path the stored item was already merged
        # above, so this timestamp only reaches the caller's dict, not the
        # stored completion - confirm this is intended
        completion['created_at'] = timestamp_now()

        # write task + completions to file
        self.target_storage.set(task_id, task)
        logger.debug('Completion ' + str(task_id) + ' saved:\n' +
                     json.dumps(task, indent=2))
        return completion['id']

    def delete_completion(self, task_id):
        """ Delete completion

        :param task_id: task id
        """
        self.target_storage.remove(task_id)
        self.update_derived_output_schema()

    def delete_all_completions(self):
        """ Delete all completions from project
        """
        self.target_storage.remove_all()
        self.update_derived_output_schema()

    def make_predictions(self, task):
        task = deepcopy(task)
        stored_predictions = task.get('predictions')
        task['predictions'] = []
        try:
            for ml_backend in self.ml_backends:
                if not ml_backend.connected:
                    continue
                predictions = ml_backend.make_predictions(task, self)
                predictions['created_by'] = ml_backend.model_name
                predictions['created_date'] = datetime.now().isoformat()
                task['predictions'].append(predictions)
        except Exception as exc:
            logger.debug(exc, exc_info=True)
        if not task['predictions'] and stored_predictions:
            task['predictions'] = stored_predictions
        return task

    def train(self):
        completions = []
        for _, c in self.target_storage.items():
            completions.append(c)
        train_status = False
        if self.ml_backends_connected:
            for ml_backend in self.ml_backends:
                if ml_backend.connected:
                    ml_backend.train(completions, self)
                    train_status = True
        return train_status

    @classmethod
    def get_project_dir(cls, project_name, args):
        return os.path.join(args.root_dir, project_name)

    @classmethod
    def get_input_data_tags(cls, label_config):
        tag_iter = ElementTree.fromstring(label_config).iter()
        return [
            tag for tag in tag_iter if tag.attrib.get('name')
            and tag.attrib.get('value', '').startswith('$')
        ]

    @classmethod
    def _load_tasks(cls, input_path, args, label_config_file):
        """Load tasks from ``input_path`` in the format named by ``args.input_format``.

        ``json`` and ``json-dir`` inputs are loaded directly. All other formats
        need a data key, taken from the first label config element bound to
        task data (see ``get_input_data_tags``).

        :param input_path: file or directory holding the input data
        :param args: object exposing ``input_format``
        :param label_config_file: path of the label config XML file
        :return: tasks produced by the matching ``Tasks`` loader
        :raises ValueError: if the format needs a data key but the label
            config has no data-bound element
        :raises RuntimeError: if the input format is not supported
        """
        with io.open(label_config_file, encoding='utf8') as fin:
            label_config = fin.read()

        loader = Tasks()
        fmt = args.input_format
        if fmt == 'json':
            return loader.from_json_file(input_path)
        if fmt == 'json-dir':
            return loader.from_dir_with_json_files(input_path)

        data_tags = cls.get_input_data_tags(label_config)
        if len(data_tags) == 0:
            raise ValueError(
                'You\'ve specified input format "{fmt}" which requires label config being explicitly defined. '
                'Please specify --label-config=path/to/config.xml or use --format=json or format=json_dir'
                .format(fmt=args.input_format))
        if len(data_tags) > 1:
            val = ",".join(tag.attrib.get("name") for tag in data_tags)
            print('Warning! Multiple input data tags found: ' + val +
                  '. Only first one is used.')

        # the data key is the first tag's value without the leading '$'
        data_key = data_tags[0].attrib.get('value').lstrip('$')

        # loader method to use for each format that needs a data key
        keyed_loaders = {
            'text': 'from_text_file',
            'text-dir': 'from_dir_with_text_files',
            'image-dir': 'from_dir_with_image_files',
            'audio-dir': 'from_dir_with_audio_files',
        }
        if fmt in keyed_loaders:
            load = getattr(loader, keyed_loaders[fmt])
            return load(input_path, data_key)
        raise RuntimeError('Can\'t load tasks for input format={}'.format(
            args.input_format))

    @classmethod
    def _create_ml_backend_params(cls, url, project_name=None):
        """Build the ``{'url', 'name'}`` params dict for an ML backend.

        ``url`` may be given as ``<name>=http...``, in which case the part
        before the first ``=`` is the backend name. Otherwise the name is the
        basename of ``project_name`` followed by the first four characters of
        a random UUID.

        :param url: backend URL, optionally prefixed with ``<name>=``
        :param project_name: project name used to generate a default name
        :return: dict with ``url`` and ``name``
        :raises ValueError: if the URL part is rejected by ``is_url``
        """
        if '=http' in url:
            name, _, url = url.partition('=')
        else:
            prefix = os.path.basename(project_name or '')
            name = prefix + str(uuid4())[:4]

        if is_url(url):
            return {'url': url, 'name': name}
        raise ValueError('Specified string "' + url +
                         '" doesn\'t look like URL.')

    @classmethod
    def create_project_dir(cls, project_name, args):
        """
        Create project directory in args.root_dir/project_name, and initialize there all required files
        If some files are missed, restore them from defaults.
        If config files are specified by args, copy them in project directory

        Writes config.xml, tasks.json (unless a source storage is given),
        the completions directory (unless a target storage is given) and
        config.json into the project directory.
        :param project_name: name of the project, used as the directory name
        :param args: parsed command line arguments (reads force, config_path, input_path,
            label_config, source*, target*, ml_backends, sampling, port, host,
            allow_serving_local_files, key_file, cert_file; input_format is read when tasks are loaded)
        :return: path of the project directory
        :raises RuntimeError: if a project file already exists and args.force is not set
        """
        dir = cls.get_project_dir(project_name, args)
        # --force wipes whatever the directory already contains
        if args.force:
            delete_dir_content(dir)
        os.makedirs(dir, exist_ok=True)

        # start from the user supplied config, or from the packaged default
        config = json_load(
            args.config_path) if args.config_path else json_load(
                find_file('default_config.json'))

        def already_exists_error(what, path):
            raise RuntimeError(
                '{path} {what} already exists. Use "--force" option to recreate it.'
                .format(path=path, what=what))

        input_path = args.input_path or config.get('input_path')

        # save label config
        config_xml = 'config.xml'
        config_xml_path = os.path.join(dir, config_xml)
        label_config_file = args.label_config or config.get('label_config')
        if label_config_file:
            copy2(label_config_file, config_xml_path)
            print(label_config_file + ' label config copied to ' +
                  config_xml_path)
        else:
            if os.path.exists(config_xml_path) and not args.force:
                already_exists_error('label config', config_xml_path)
            if not input_path:
                # create default config with polygons only if input data is not set
                default_label_config = find_file(
                    'examples/image_polygons/config.xml')
                copy2(default_label_config, config_xml_path)
                print(default_label_config + ' label config copied to ' +
                      config_xml_path)
            else:
                with io.open(config_xml_path, mode='w') as fout:
                    fout.write('<View></View>')
                print('Empty config has been created in ' + config_xml_path)

        # stored relative to the project dir; _get_config resolves it on load
        config['label_config'] = config_xml

        if args.source:
            # explicit source storage: no local tasks.json is written
            config['source'] = {
                'type': args.source,
                'path': args.source_path,
                'params': args.source_params
            }
        else:
            # save tasks.json
            tasks_json = 'tasks.json'
            tasks_json_path = os.path.join(dir, tasks_json)
            if input_path:
                tasks = cls._load_tasks(input_path, args, config_xml_path)
            else:
                tasks = {}
            with io.open(tasks_json_path, mode='w') as fout:
                json.dump(tasks, fout, indent=2)
            config['input_path'] = tasks_json
            config['source'] = {
                'name': 'Tasks',
                'type': 'tasks-json',
                'path': os.path.abspath(tasks_json_path)
            }
            logger.debug(
                '{tasks_json_path} input file with {n} tasks has been created from {input_path}'
                .format(tasks_json_path=tasks_json_path,
                        n=len(tasks),
                        input_path=input_path))

        if args.target:
            # explicit target storage: 'output_dir' is not set in this case
            config['target'] = {
                'type': args.target,
                'path': args.target_path,
                'params': args.target_params
            }
        else:
            completions_dir = os.path.join(dir, 'completions')
            if os.path.exists(completions_dir) and not args.force:
                already_exists_error('output dir', completions_dir)
            if os.path.exists(completions_dir):
                delete_dir_content(completions_dir)
                print(completions_dir +
                      ' output dir already exists. Clear it.')
            else:
                os.makedirs(completions_dir, exist_ok=True)
                print(completions_dir + ' output dir has been created.')
            config['output_dir'] = 'completions'
            config['target'] = {
                'name': 'Completions',
                'type': 'completions-dir',
                'path': os.path.abspath(completions_dir)
            }

        # make sure 'ml_backends' is always a list before appending to it
        if 'ml_backends' not in config or not isinstance(
                config['ml_backends'], list):
            config['ml_backends'] = []
        if args.ml_backends:
            for url in args.ml_backends:
                config['ml_backends'].append(
                    cls._create_ml_backend_params(url, project_name))

        # command line options override the values from the config file
        if args.sampling:
            config['sampling'] = args.sampling
        if args.port:
            config['port'] = args.port
        if args.host:
            config['host'] = args.host
        if args.allow_serving_local_files:
            config['allow_serving_local_files'] = True
        # https is configured only when both key and certificate are given
        if args.key_file and args.cert_file:
            config['protocol'] = 'https://'
            config['cert'] = args.cert_file
            config['key'] = args.key_file

        # create config.json
        config_json = 'config.json'
        config_json_path = os.path.join(dir, config_json)
        if os.path.exists(config_json_path) and not args.force:
            already_exists_error('config', config_json_path)
        with io.open(config_json_path, mode='w') as f:
            json.dump(config, f, indent=2)

        print('')
        print(
            'Label Studio has been successfully initialized. Check project states in '
            + dir)
        print('Start the server: label-studio start ' + dir)
        return dir

    @classmethod
    def get_config(cls, project_name, args):
        return cls._get_config(cls.get_project_dir(project_name, args))

    @classmethod
    def _get_config(cls, project_dir, args=None):
        """
        Read ``config.json`` from a project directory and resolve its paths.

        ``input_path``, ``label_config`` and ``output_dir`` are joined onto the
        directory containing ``config.json``; default ``source`` and ``target``
        descriptions are added when the config has none.

        :param project_dir: directory expected to contain ``config.json``
        :param args: optional object whose ``project_name`` is only used to
            build the hint shown in error messages
        :return: the loaded config dict
        :raises FileNotFoundError: if the directory or ``config.json`` is missing
        """
        def init_hint():
            # built lazily: ``args.project_name`` is only read on the error path
            project_name = args.project_name if args is not None else '<project_name>'
            return (
                ', maybe you\'ve missed appending "--init" option:\nlabel-studio start '
                + project_name + ' --init')

        # the project directory itself must exist
        if not os.path.exists(project_dir):
            hint = init_hint()
            raise FileNotFoundError(
                'Couldn\'t find directory ' + project_dir + hint)

        # ... and it must contain config.json
        config_path = os.path.join(project_dir, 'config.json')
        if not os.path.exists(config_path):
            hint = init_hint()
            raise FileNotFoundError(
                'Couldn\'t find config file ' + config_path +
                ' in project directory ' + project_dir + hint)

        config_path = os.path.abspath(config_path)
        with io.open(config_path) as config_file:
            config = json.load(config_file)

        # every relative path in the config is relative to config.json
        base_dir = os.path.dirname(config_path)
        config['config_path'] = config_path
        if config.get('input_path'):
            config['input_path'] = os.path.join(base_dir, config['input_path'])
        config['label_config'] = os.path.join(base_dir, config['label_config'])
        if config.get('output_dir'):
            config['output_dir'] = os.path.join(base_dir, config['output_dir'])

        # fall back to file based storages when none are configured
        if not config.get('source'):
            tasks_path = os.path.abspath(config['input_path'])
            config['source'] = {
                'name': 'Tasks',
                'type': 'tasks-json',
                'path': tasks_path
            }
        if not config.get('target'):
            completions_path = os.path.abspath(config['output_dir'])
            config['target'] = {
                'name': 'Completions',
                'type': 'completions-dir',
                'path': completions_path
            }
        return config

    @classmethod
    def _load_from_dir(cls, project_dir, project_name, args, context):
        """
        Build a project instance from the config stored in ``project_dir``.

        :param project_dir: directory passed to ``_get_config``
        :param project_name: name given to the new instance
        :param args: arguments object; its ``root_dir`` is passed to the constructor
        :param context: passed to the constructor as ``context``
        :return: new instance of ``cls``
        """
        loaded_config = cls._get_config(project_dir, args)
        init_kwargs = {'context': context, 'root_dir': args.root_dir}
        return cls(loaded_config, project_name, **init_kwargs)

    @classmethod
    def get(cls, project_name, args, context):
        """
        Return the project called ``project_name``, loading it if necessary.

        :param project_name: name of the project
        :param args: arguments object forwarded to ``get_project_dir`` / ``_load_from_dir``
        :param context: forwarded to ``_load_from_dir``
        :return: the cached or freshly loaded project
        :raises ProjectNotFound: if the project directory does not exist
        """
        # Fast path: the project is already held in memory
        cached = cls._storage
        if project_name in cached:
            return cached[project_name]

        # Without a directory on disk there is nothing to load
        project_dir = cls.get_project_dir(project_name, args)
        if not os.path.exists(project_dir):
            raise ProjectNotFound(
                'Project {p} doesn\'t exist'.format(p=project_name))

        # Load from the directory and remember the instance for later calls
        loaded = cls._load_from_dir(project_dir, project_name, args, context)
        cls._storage[project_name] = loaded
        return loaded

    @classmethod
    def create(cls, project_name, args, context):
        """
        Create the project resources, load the project and cache it.

        Differs from ``get`` in that ``create_project_dir`` is called first,
        which can create a new directory with project resources.

        :param project_name: name of the project
        :param args: arguments object forwarded to the helpers
        :param context: forwarded to ``_load_from_dir``
        :return: the loaded project
        """
        new_dir = cls.create_project_dir(project_name, args)
        created = cls._load_from_dir(new_dir, project_name, args, context)
        cls._storage[project_name] = created
        return created

    @classmethod
    def get_or_create(cls, project_name, args, context):
        """
        Return an existing project, creating it when ``get`` cannot find it.

        :param project_name: name of the project
        :param args: arguments object forwarded to ``get`` / ``create``
        :param context: forwarded to ``get`` / ``create``
        :return: the project
        """
        try:
            project = cls.get(project_name, args, context)
        except ProjectNotFound:
            # nothing in memory or on disk yet: build the project from scratch
            project = cls.create(project_name, args, context)
            logger.info('Project "' + project_name + '" created.')
        else:
            logger.info('Get project "' + project_name + '".')
        return project

    def update_on_boarding_state(self):
        """
        Refresh the on-boarding flags and return them.

        ``setup`` mirrors ``config['label_config_updated']`` (False when
        absent), ``import`` is the negation of ``no_tasks()`` and ``labeled``
        is the negation of ``target_storage.empty()``.

        :return: the updated ``on_boarding`` dict
        """
        state = self.on_boarding
        state['setup'] = self.config.get('label_config_updated', False)
        state['import'] = not self.no_tasks()
        state['labeled'] = not self.target_storage.empty()
        return state

    @property
    def generate_sample_task_escape(self):
        """Expose ``generate_sample_task_escape`` of the wrapped project object."""
        project_obj = self.project_obj
        return project_obj.generate_sample_task_escape

    @property
    def supported_formats(self):
        """Expose ``supported_formats`` of the wrapped project object."""
        project_obj = self.project_obj
        return project_obj.supported_formats

    def serialize(self):
        """ Serialize project to json dict

        :return: dict with the project name, task/completion counts, config,
            permission flags, storage information, data types and label config
        """
        # storage names that are filtered out of the "available_storages" list
        hidden_storages = ('json', 'dir-jsons')
        available_storages = [
            entry for entry in get_available_storage_names().items()
            if entry[0] not in hidden_storages
        ]

        return {
            'project_name': self.name,
            'task_count': len(self.source_storage.ids()),
            'completion_count': len(self.get_completions_ids()),
            'config': self.config,
            'can_manage_tasks': self.can_manage_tasks,
            'can_manage_completions': self.can_manage_completions,
            'can_delete_tasks': self.can_delete_tasks,
            'target_storage': {
                'readable_path': self.target_storage.readable_path
            },
            'source_storage': {
                'readable_path': self.source_storage.readable_path
            },
            'available_storages': available_storages,
            'source_syncing': self.source_storage.is_syncing,
            'target_syncing': self.target_storage.is_syncing,
            'data_types': self.data_types,
            'label_config_line': self.label_config_line
        }
示例#10
0
class Project(object):

    _storage = {}

    def __init__(self, config, name, context=None):
        self.config = config
        self.name = name

        self.on_boarding = {}
        self.context = context or {}

        self.tasks = None
        self.labeling_classes = None
        self.label_config_line, self.label_config_full, self.parsed_label_config, self.input_data_tags = None, None, None, None  # noqa
        self.derived_input_schema, self.derived_output_schema = None, None

        self.load_tasks()
        self.load_label_config()
        self.load_derived_schemas()
        self.load_labeling_classes()

        self.analytics = None
        self.load_analytics()

        self.project_obj = None
        self.ml_backends = []
        self.load_project_ml_backend()

        self.converter = None
        self.load_converter()
        self.max_tasks_file_size = 250

    def load_tasks(self):
        """
        Load tasks from ``config['input_path']`` and derive the input schema.

        ``self.tasks`` maps integer task ids to task dicts and
        ``self.derived_input_schema`` becomes the set of ``data`` keys shared
        by every loaded task (an empty set when no tasks are loaded).
        """
        self.tasks = {}
        self.derived_input_schema = set()
        input_path = self.config['input_path']
        tasks = json_load(input_path)
        if not tasks:
            logger.warning('No tasks loaded from ' + input_path)
            return

        # Track the running intersection separately from the attribute: an
        # emptiness test cannot tell "no task seen yet" from "tasks share no
        # keys", and would restart the intersection from a later task.
        common_keys = None
        for task_id, task in tasks.items():
            self.tasks[int(task_id)] = task
            data_keys = set(task['data'].keys())
            if common_keys is None:
                common_keys = data_keys
            else:
                common_keys &= data_keys
        self.derived_input_schema = common_keys
        print(str(len(self.tasks)) + ' tasks loaded from: ' + self.config['input_path'])

    def load_label_config(self):
        """
        Read the label config file and refresh everything parsed from it.

        Sets ``label_config_full`` (comments removed), ``label_config_line``
        (line-stripped form), ``parsed_label_config`` and ``input_data_tags``.
        """
        # read through a context manager so the file handle is always closed
        with open(self.config['label_config']) as config_file:
            raw_config = config_file.read()
        self.label_config_full = config_comments_free(raw_config)
        self.label_config_line = config_line_stripped(self.label_config_full)
        self.parsed_label_config = parse_config(self.label_config_line)
        self.input_data_tags = self.get_input_data_tags(self.label_config_line)

    def load_derived_schemas(self):

        self.derived_output_schema = {
            'from_name_to_name_type': set(),
            'labels': defaultdict(set)
        }

        # for all already completed tasks we update derived output schema for further label config validation
        for task_id in self.get_task_ids():
            task_with_completions = self.get_task_with_completions(task_id)
            if task_with_completions and 'completions' in task_with_completions:
                completions = task_with_completions['completions']
                for completion in completions:
                    self._update_derived_output_schema(completion)

    def load_analytics(self):
        """
        (Re)create ``self.analytics``.

        The ``collect_analytics`` environment variable takes precedence over
        the config entry of the same name, which defaults to True.
        """
        env_value = os.getenv('collect_analytics')
        if env_value is None:
            collect_analytics = bool(self.config.get('collect_analytics', True))
        else:
            # Environment values are strings, so plain bool() would treat
            # "0" or "false" as enabled; interpret common "off" spellings.
            collect_analytics = env_value.strip().lower() not in (
                '', '0', 'false', 'no', 'off')
        self.analytics = Analytics(self.label_config_line, collect_analytics, self.name, self.context)

    def add_ml_backend(self, params, raise_on_error=True):
        """
        Create an ML backend from ``params`` and register it in memory.

        :param params: backend parameters passed to ``MLBackend.from_params``;
            ``params['url']`` is used in the error message
        :param raise_on_error: raise when the backend is not connected
        :raises ValueError: if the backend is not connected and ``raise_on_error`` is set
        """
        backend = MLBackend.from_params(params)
        disconnected = not backend.connected
        if disconnected and raise_on_error:
            raise ValueError('ML backend with URL: "' + str(params['url']) + '" is not connected.')
        self.ml_backends.append(backend)

    def remove_ml_backend(self, name):
        # remove from memory
        remove_idx = next((i for i, b in enumerate(self.ml_backends) if b.model_name == name), None)
        if remove_idx is None:
            raise KeyError('Can\'t remove ML backend with name "' + name + '": not found.')
        self.ml_backends.pop(remove_idx)

        # remove from config
        config_params = self.config.get('ml_backends', [])
        remove_idx = next((i for i, b in enumerate(config_params) if b['name'] == name), None)
        if remove_idx is not None:
            config_params.pop(remove_idx)
        self.config['ml_backends'] = config_params
        self._save_config()

    def load_project_ml_backend(self):
        """
        Recreate the project object and the list of ML backends.

        Backends come from ``config['ml_backends']``; they are added with
        ``raise_on_error=False`` so a disconnected backend does not raise.
        """
        # configure project
        self.project_obj = ProjectObj(
            label_config=self.label_config_line,
            label_config_full=self.label_config_full)

        # configure multiple machine learning backends
        self.ml_backends = []
        for backend_params in self.config.get('ml_backends', []):
            self.add_ml_backend(backend_params, raise_on_error=False)

    def load_converter(self):
        """(Re)create ``self.converter`` from the parsed label config."""
        parsed_config = self.parsed_label_config
        self.converter = Converter(parsed_config)

    def load_labeling_classes(self):
        tree = ElementTree.parse(self.config['label_config'])
        root = tree.getroot()
        self.labeling_classes = [i.attrib['value'] for i in root.iter('Label')]

    @property
    def id(self):
        return self.project_obj.id

    @property
    def data_types(self):
        return self.project_obj.data_types

    @property
    def label_config(self):
        return self.project_obj.label_config

    @property
    def ml_backends_connected(self):
        return len(self.ml_backends) > 0

    @property
    def task_data_login(self):
        return self.project_obj.task_data_login

    @property
    def task_data_password(self):
        return self.project_obj.task_data_password

    def extract_data_types(self, config):
        return self.project_obj.extract_data_types(config)

    def validate_label_config(self, config_string):
        """
        Validate a label config string.

        The project object validates the string first; the parsed config is
        then checked against the derived input and output schemas, in that order.

        :param config_string: label config string
        """
        logger.debug('Validate label config')
        self.project_obj.validate_label_config(config_string)

        logger.debug('Get parsed config')
        parsed_config = parse_config(config_string)

        schema_checks = (
            ('Validate label config on derived input schema',
             self.validate_label_config_on_derived_input_schema),
            ('Validate label config on derived output schema',
             self.validate_label_config_on_derived_output_schema),
        )
        for message, check in schema_checks:
            logger.debug(message)
            check(parsed_config)

    def _save_config(self):
        with io.open(self.config['config_path'], mode='w') as f:
            json.dump(self.config, f, indent=2)

    def update_params(self, params):
        if 'ml_backend' in params:
            ml_backend_params = self._create_ml_backend_params(params['ml_backend'])
            self.add_ml_backend(ml_backend_params)
            self.config['ml_backends'].append(ml_backend_params)
            self._save_config()

    def update_label_config(self, new_label_config):
        """
        Replace the label config and reload everything that depends on it.

        :param new_label_config: new XML label config as a string
        """
        label_config_file = self.config['label_config']
        # save xml label config to file
        with io.open(label_config_file, mode='w') as f:
            f.write(new_label_config)

        # reload everything that depends on label config
        self.load_label_config()
        self.load_derived_schemas()
        self.load_analytics()
        self.load_project_ml_backend()
        self.load_converter()

        # save project config state through the shared helper so config.json
        # is always written with the same formatting
        self.config['label_config_updated'] = True
        self._save_config()
        logger.info('Label config saved to: {path}'.format(path=label_config_file))

    def _update_derived_output_schema(self, completion):
        """
        Given completion, output schema is updated. Output schema consists of unique tuples (from_name, to_name, type)
        and list of unique labels derived from existed completions
        :param completion:
        :return:
        """
        for result in completion['result']:
            result_type = result.get('type')
            if result_type in ('relation', 'rating', 'pairwise'):
                continue
            if 'from_name' not in result or 'to_name' not in result:
                logger.error('Unexpected completion.result format: "from_name" or "to_name" not found in %r' % result)
                continue

            self.derived_output_schema['from_name_to_name_type'].add((
                result['from_name'], result['to_name'], result_type
            ))
            for label in result['value'].get(result_type, []):
                self.derived_output_schema['labels'][result['from_name']].add(label)

    def validate_label_config_on_derived_input_schema(self, config_string_or_parsed_config):
        """
        Validate label config on input schemas (tasks types and data keys) derived from imported tasks
        :param config_string_or_parsed_config: label config string or parsed config object
        :return: True if config match already imported tasks
        """

        # check if schema exists, i.e. at least one task has been uploaded
        if not self.derived_input_schema:
            return

        config = config_string_or_parsed_config
        if isinstance(config, str):
            config = parse_config(config)
        input_types, input_values = set(), set()
        for input_items in map(itemgetter('inputs'), config.values()):
            for input_item in input_items:
                input_types.add(input_item['type'])
                input_values.add(input_item['value'])

        # check input data values: they must be in schema
        for item in input_values:
            if item not in self.derived_input_schema:
                raise ValidationError(
                    'You have already imported tasks and they are incompatible with a new config. '
                    'You\'ve specified value=${item}, but imported tasks contain only keys: {input_schema_values}'
                        .format(item=item, input_schema_values=list(self.derived_input_schema)))

    def validate_label_config_on_derived_output_schema(self, config_string_or_parsed_config):
        """
        Validate label config on output schema (from_names, to_names and labeling types) derived from completions
        :param config_string_or_parsed_config: label config string or parsed config object
        :return: True if config match already created completions
        """
        output_schema = self.derived_output_schema

        # check if schema exists, i.e. at least one completion has been created
        if not output_schema['from_name_to_name_type']:
            return

        config = config_string_or_parsed_config
        if isinstance(config, str):
            config = parse_config(config)
        completion_tuples = set()

        for from_name, to in config.items():
            completion_tuples.add((from_name, to['to_name'][0], to['type'].lower()))
        for from_name, to_name, type in output_schema['from_name_to_name_type']:
            if (from_name, to_name, type) not in completion_tuples:
                raise ValidationError(
                    'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: '
                    'name={from_name}, toName={to_name}, type={type} are expected'
                    .format(from_name=from_name, to_name=to_name, type=type)
                )
        for from_name, expected_label_set in output_schema['labels'].items():
            if from_name not in config:
                raise ValidationError(
                    'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: '
                    'name=' + from_name + ' is expected'
                )
            found_labels = set(config[from_name]['labels'])
            extra_labels = list(expected_label_set - found_labels)
            if extra_labels:
                raise ValidationError(
                    'You\'ve already completed some tasks, but some of them couldn\'t be loaded with this config: '
                    'there are labels already created for "{from_name}":\n{extra_labels}'
                    .format(from_name=from_name, extra_labels=extra_labels)
                )

    def get_tasks(self):
        """ Load tasks from JSON files in input_path directory

        :return: file list
        """
        return self.tasks

    def delete_tasks(self):
        """
        Remove all tasks and completions, then reload the now empty project.

        The output directory content is deleted, the input file (when it is a
        regular file) is overwritten with an empty JSON object, every ML
        backend is asked to clear its state, and tasks and derived schemas
        are reloaded.
        :return:
        """
        delete_dir_content(self.config['output_dir'])
        input_path = self.config['input_path']
        if os.path.isfile(input_path):
            with io.open(input_path, mode='w') as tasks_file:
                json.dump({}, tasks_file)

        # delete everything on ML backend
        for backend in self.ml_backends:
            backend.clear(self)

        # reload everything related to tasks
        self.load_tasks()
        self.load_derived_schemas()

    def next_task(self, completed_tasks_ids):
        completed_tasks_ids = set(completed_tasks_ids)
        sampling = self.config.get('sampling', 'sequential')
        if sampling == 'sequential':
            actual_tasks = (self.tasks[task_id] for task_id in self.tasks if task_id not in completed_tasks_ids)
            return next(actual_tasks, None)
        elif sampling == 'uniform':
            actual_tasks_ids = [task_id for task_id in self.tasks if task_id not in completed_tasks_ids]
            if not actual_tasks_ids:
                return None
            random.shuffle(actual_tasks_ids)
            return self.tasks[actual_tasks_ids[0]]
        else:
            raise NotImplementedError('Unknown sampling method ' + sampling)

    def get_task_ids(self):
        """ Get task ids only

        :return: list of task ids
        """
        return list(self.tasks.keys())

    def get_task(self, task_id):
        """ Get one task

        :param task_id:
        :return: task
        """
        try:
            task_id = int(task_id)
        except ValueError:
            return None
        return self.tasks.get(task_id)

    def iter_completions(self):
        root_dir = self.config['output_dir']
        os.mkdir(root_dir) if not os.path.exists(root_dir) else ()
        files = os.listdir(root_dir)
        for f in files:
            if f.endswith('.json'):
                yield os.path.join(root_dir, f)

    def get_completions_ids(self):
        """ List completion ids found in the output_dir directory

        :return: sorted list of file names converted to int (extension and
            directory removed)
        """
        completion_ids = [
            int(os.path.splitext(os.path.basename(path))[0])
            for path in self.iter_completions()
        ]
        logger.debug('{num} completions found in {output_dir}'.format(
            num=len(completion_ids), output_dir=self.config["output_dir"]))
        completion_ids.sort()
        return completion_ids

    def get_completed_at(self, task_ids):
        """ Get completion time for the given task ids

        The time is the modification time of the completion file.

        :param task_ids: list of task ids
        :return: dict mapping each task id that has a completion file to a
            ``YYYY-MM-DD HH:MM:SS`` string
        """
        root_dir = self.config['output_dir']
        completed_ids = set(self.get_completions_ids()).intersection(task_ids)
        modified = {}
        for task_id in completed_ids:
            completion_path = os.path.join(root_dir, str(task_id) + '.json')
            modified[task_id] = os.path.getmtime(completion_path)
        return {
            task_id: datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
            for task_id, timestamp in modified.items()
        }

    def get_task_with_completions(self, task_id):
        """ Get task with completions

        :param task_id: task ids
        :return: json dict with completion
        """
        try:
            task_id = int(task_id)  # check task_id is int (disallow to escape from output_dir)
        except ValueError:
            return None

        if 'completions' in self.tasks[task_id]:
            return self.tasks[task_id]

        filename = os.path.join(self.config['output_dir'], str(task_id) + '.json')

        if os.path.exists(filename):
            data = json.load(open(filename))
            # tasks can hold the newest version of predictions, so task it from tasks
            data['predictions'] = self.tasks[task_id].get('predictions', [])
        else:
            data = None
        return data

    def get_area_set(self):
        area_set = set()
        number_set = set()
        area_set.add('ALL')
        number_set.add('ALL')
        for task_id in self.get_task_ids():
            task_with_completions = self.get_task_with_completions(task_id)
            if task_with_completions and 'completions' in task_with_completions:
                area, num = task_with_completions['data']['area'].split()[-1].split('_')
                area_set.add(area)   # show col
                number_set.add(num)
        return list(area_set), list(number_set)

    def get_object_points(self):
        area_points_dict = defaultdict(list)
        for task_id in self.get_task_ids():
            task_with_completions = self.get_task_with_completions(task_id)
            if task_with_completions and 'completions' in task_with_completions:
                completions = task_with_completions['completions']
                area_key = task_with_completions['data']['area'].split()[-1]
                for completion in completions:
                    for result in completion['result']:
                        points = result['value']['points']
                        area_points_dict[area_key].append(points)
        return area_points_dict

    def get_area_class_number(self):
        area_dict = {}
        for task_id in self.get_task_ids():
            task_with_completions = self.get_task_with_completions(task_id)
            if task_with_completions and 'completions' in task_with_completions:
                completions = task_with_completions['completions']
                area_key = task_with_completions['data']['area'].split()[-1]
                class_defect_number = area_dict.get(area_key, {c: 0 for c in self.labeling_classes})
                for completion in completions:
                    for result in completion['result']:
                        class_defect_number[result['value']['polygonlabels'][0]] += 1
                area_dict[area_key] = class_defect_number
        return area_dict

    def get_class_area_number(self):
        class_defect_dict = {c: {} for c in self.labeling_classes}
        for task_id in self.get_task_ids():
            task_with_completions = self.get_task_with_completions(task_id)
            if task_with_completions and 'completions' in task_with_completions:
                completions = task_with_completions['completions']
                area_key = task_with_completions['data']['area'].split()[-1]
                for completion in completions:
                    for result in completion['result']:
                        c = result['value']['polygonlabels'][0]
                        class_defect_dict[c][area_key] = class_defect_dict[c].get(area_key, 0) + 1
                        class_defect_dict[c]['total'] = class_defect_dict[c].get('total', 0) + 1
        return class_defect_dict

    def save_completion(self, task_id, completion):
        """ Save completion and render the labeled image

        The completion is merged into the task (updating an existing entry
        with the same id, otherwise appended with a newly generated id), the
        task is written to ``<output_dir>/<task_id>.json`` and an annotated
        copy of the task image is written to an ``image-finish`` directory.

        :param task_id: task id
        :param completion: json data from label (editor)
        :return: id of the saved completion
        """

        # try to get completions with task first
        task = self.get_task_with_completions(task_id)

        # init task if completions with task not exists
        if not task:
            task = self.get_task(task_id)
            task['completions'] = []

        # update old completion
        updated = False
        if 'id' in completion:
            for i, item in enumerate(task['completions']):
                if item['id'] == completion['id']:
                    task['completions'][i].update(completion)
                    updated = True

        # write new completion
        if not updated:
            completion['id'] = task['id'] * 1000 + len(task['completions']) + 1
            task['completions'].append(completion)

        try:
            self._update_derived_output_schema(completion)
        except Exception as exc:
            logger.error(exc, exc_info=True)
            logger.debug(json.dumps(completion, indent=2))

        # write task + completions to file; the context manager makes sure
        # the data is flushed and the handle closed before continuing
        output_dir = self.config['output_dir']
        filename = os.path.join(output_dir, str(task_id) + '.json')
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        with open(filename, 'w') as completion_file:
            json.dump(task, completion_file, indent=4, sort_keys=True)

        # -------------------------  Save Labeling images  ------------------------- #
        # Read Image: the image value is split on '?d=%2F' into a file part
        # and a url-encoded directory part
        # NOTE(review): presumably the local-file serving URL format - confirm
        image_name, root = task['data']['image'].split('?d=%2F')
        root = '/' + root.replace('%2F', '/')
        image_name = os.path.split(image_name)[-1]
        filepath = os.path.join(root, image_name)
        # NOTE(review): only the first completion of the task is rendered
        objs = [[obj['value']['polygonlabels'][0], obj['value']['points']] for obj in task['completions'][0]['result']]
        img = cv2.imread(filepath)
        img_draw = np.zeros_like(img)
        h, w, _ = img.shape

        # draw and save image next to the source directory
        save_dir = os.path.join(os.path.split(root)[0], 'image-finish')
        save_filepath = os.path.join(save_dir, image_name)
        if objs:
            for label, points in objs:
                # points are divided by 100 and scaled by the image size
                points = (np.array([points]) / 100 * [w, h]).astype(np.int32)
                img_draw = cv2.fillPoly(img_draw, points, [0, 0, 255])
                img_draw = self.print_chinese_opencv(img_draw, label, (points[0, :, 0].min(), points[0, :, 1].min()),
                                                     (0, 0, 255))
                img = cv2.addWeighted(img, 1, img_draw, 0.7, 0)
        else:
            cv2.putText(img_draw, 'No defect', (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            img = cv2.addWeighted(img, 1, img_draw, 0.7, 0)
        cv2.imwrite(save_filepath, img)
        return completion['id']

    def delete_completion(self, task_id):
        """ Delete completion from disk

        :param task_id: task id
        """
        filename = os.path.join(self.config['output_dir'], str(task_id) + '.json')
        os.remove(filename)

        self.load_tasks()
        self.load_derived_schemas()

    def make_predictions(self, task):
        task = deepcopy(task)
        task['predictions'] = []
        try:
            for ml_backend in self.ml_backends:
                if not ml_backend.connected:
                    continue
                predictions = ml_backend.make_predictions(task, self)
                predictions['created_by'] = ml_backend.model_name
                task['predictions'].append(predictions)
        except Exception as exc:
            logger.debug(exc)
        return task

    def train(self):
        completions = []
        for f in self.iter_completions():
            completions.append(json_load(f))
        train_status = False
        if self.ml_backends_connected:
            for ml_backend in self.ml_backends:
                if ml_backend.connected:
                    ml_backend.train(completions, self)
                    train_status = True
        return train_status

    @classmethod
    def print_chinese_opencv(cls, im, chinese, pos, color):
        """
        Draw text on an OpenCV image through PIL and return the new image.

        The text is rendered with the ``NotoSansCJK-Bold.ttc`` font at size 12,
        12 pixels above ``pos``.

        :param im: image array, converted BGR -> RGB before drawing
        :param chinese: text to draw
        :param pos: ``(x, y)`` anchor position
        :param color: color sequence; it is reversed before being used as fill
        :return: image array converted back RGB -> BGR
        """
        pil_image = Image.fromarray(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))
        font = ImageFont.truetype('NotoSansCJK-Bold.ttc', 12)
        fill_color = color[::-1]
        text_position = (pos[0], pos[1]-12)
        ImageDraw.Draw(pil_image).text(text_position, chinese, font=font, fill=fill_color)
        return cv2.cvtColor(np.asarray(pil_image), cv2.COLOR_RGB2BGR)

    @classmethod
    def get_project_dir(cls, project_name, args):
        return os.path.join(args.root_dir, project_name)

    @classmethod
    def get_input_data_tags(cls, label_config):
        tag_iter = ElementTree.fromstring(label_config).iter()
        return [
            tag for tag in tag_iter
            if tag.attrib.get('name') and tag.attrib.get('value', '').startswith('$')
        ]

    @classmethod
    def _load_tasks(cls, input_path, args, label_config_file):
        """
        Load tasks from ``input_path`` according to ``args.input_format``.

        ``json`` and ``json-dir`` inputs are loaded directly. The remaining
        formats (``text``, ``text-dir``, ``image-dir``, ``audio-dir``) need the
        data key of the first input data tag found in the label config.

        :param input_path: file or directory with the input data
        :param args: arguments object providing ``input_format``
        :param label_config_file: path of the label config XML file
        :return: tasks produced by the matching ``Tasks`` loader
        :raises ValueError: if the format needs a data key but the label config
            declares no input data tag
        :raises RuntimeError: if the input format is not supported
        """
        with io.open(label_config_file) as f:
            label_config = f.read()

        task_loader = Tasks()
        input_format = args.input_format
        if input_format == 'json':
            return task_loader.from_json_file(input_path)
        if input_format == 'json-dir':
            return task_loader.from_dir_with_json_files(input_path)
        input_data_tags = cls.get_input_data_tags(label_config)

        if len(input_data_tags) > 1:
            val = ",".join(tag.attrib.get("name") for tag in input_data_tags)
            print('Warning! Multiple input data tags found: ' +
                  val + '. Only first one is used.')
        elif len(input_data_tags) == 0:
            # the hint names the format values this method actually accepts
            raise ValueError(
                'You\'ve specified input format "{fmt}" which requires label config being explicitly defined. '
                'Please specify --label-config=path/to/config.xml or use --format=json or --format=json-dir'.format(
                    fmt=input_format)
            )
        input_data_tag = input_data_tags[0]
        data_key = input_data_tag.attrib.get('value').lstrip('$')

        # formats that wrap raw content under the label config's data key
        keyed_loaders = {
            'text': task_loader.from_text_file,
            'text-dir': task_loader.from_dir_with_text_files,
            'image-dir': task_loader.from_dir_with_image_files,
            'audio-dir': task_loader.from_dir_with_audio_files,
        }
        loader = keyed_loaders.get(input_format)
        if loader is None:
            raise RuntimeError('Can\'t load tasks for input format={}'.format(input_format))
        return loader(input_path, data_key)

    @classmethod
    def _create_ml_backend_params(cls, url):
        """
        Build ML backend parameters from a ``url`` or ``name=url`` string.

        When the string contains ``=http`` the part before the first ``=`` is
        the backend name; otherwise the name is the first 8 characters of a
        random UUID.

        :param url: backend URL, optionally prefixed with ``name=``
        :return: dict with ``url`` and ``name``
        :raises ValueError: if the URL part is rejected by ``is_url``
        """
        if '=http' not in url:
            name = str(uuid4())[:8]
        else:
            name, url = url.split('=', 1)
        if is_url(url):
            return {'url': url, 'name': name}
        raise ValueError('Specified string "' + url + '" doesn\'t look like URL.')

    @classmethod
    def create_project_dir(cls, project_name, args):
        """
        Create the project directory and initialize all required files in it.

        The directory is resolved with ``cls.get_project_dir(project_name, args)``.
        The base config is loaded from ``args.config_path`` or, when that is not
        set, from the bundled ``default_config.json``. The following entries are
        then written into the project directory:

        * ``config.xml`` - copied from ``args.label_config`` (or the base config's
          ``label_config``); without one, the image polygons example is copied
          when no input data is given, otherwise an empty ``<View></View>``;
        * ``tasks.json`` - tasks loaded from the input path, or ``{}`` without one;
        * ``completions`` - output directory, emptied if it already exists;
        * ``config.json`` - the resulting config, which refers to the three
          entries above by their names relative to the project directory.

        All "already exists" checks run before anything is written, so a refused
        initialization leaves an existing project untouched.

        :param project_name: name of the project, used to resolve its directory
        :param args: parsed command-line namespace; the attributes read here are
            config_path, input_path, label_config, force, ml_backends, sampling,
            port and host (the namespace is also passed to ``cls._load_tasks``)
        :return: path of the project directory
        :raises RuntimeError: if an entry that would have to be recreated already
            exists and ``args.force`` is not set
        """
        project_dir = cls.get_project_dir(project_name, args)

        config = json_load(args.config_path) if args.config_path else json_load(find_file('default_config.json'))

        def already_exists_error(what, path):
            raise RuntimeError('{path} {what} already exists. Use "--force" option to recreate it.'.format(
                path=path, what=what
            ))

        input_path = args.input_path or config.get('input_path')
        label_config_file = args.label_config or config.get('label_config')

        config_xml = 'config.xml'
        tasks_json = 'tasks.json'
        config_json = 'config.json'
        config_xml_path = os.path.join(project_dir, config_xml)
        tasks_json_path = os.path.join(project_dir, tasks_json)
        completions_dir = os.path.join(project_dir, 'completions')
        config_json_path = os.path.join(project_dir, config_json)

        # Validate before mutating: every existence check happens up front, in the
        # same order in which the files are written below, so the reported error is
        # the same one but nothing has been overwritten when it is raised.
        if not args.force:
            guarded = []
            if not label_config_file:
                # an explicitly given label config always replaces config.xml
                guarded.append(('label config', config_xml_path))
            if not input_path:
                # explicitly given input data always replaces tasks.json
                guarded.append(('input path', tasks_json_path))
            guarded.append(('output dir', completions_dir))
            guarded.append(('config', config_json_path))
            for what, path in guarded:
                if os.path.exists(path):
                    already_exists_error(what, path)

        os.makedirs(project_dir, exist_ok=True)

        # save label config
        if label_config_file:
            copy2(label_config_file, config_xml_path)
            print(label_config_file + ' label config copied to ' + config_xml_path)
        elif not input_path:
            # create default config with polygons only if input data is not set
            default_label_config = find_file('examples/image_polygons/config.xml')
            copy2(default_label_config, config_xml_path)
            print(default_label_config + ' label config copied to ' + config_xml_path)
        else:
            with io.open(config_xml_path, mode='w') as fout:
                fout.write('<View></View>')
            print('Empty config has been created in ' + config_xml_path)

        config['label_config'] = config_xml

        # save tasks.json
        if input_path:
            tasks = cls._load_tasks(input_path, args, config_xml_path)
            with io.open(tasks_json_path, mode='w') as fout:
                json.dump(tasks, fout, indent=2)
            print('{tasks_json_path} input file with {n} tasks has been created from {input_path}'.format(
                tasks_json_path=tasks_json_path, n=len(tasks), input_path=input_path))
        else:
            with io.open(tasks_json_path, mode='w') as fout:
                json.dump({}, fout)
            print(tasks_json_path + ' input path has been created with empty tasks.')
        config['input_path'] = tasks_json

        # create completions dir (it can only exist here when --force was given)
        if os.path.exists(completions_dir):
            delete_dir_content(completions_dir)
            print(completions_dir + ' output dir already exists. Clear it.')
        else:
            os.makedirs(completions_dir, exist_ok=True)
            print(completions_dir + ' output dir has been created.')
        config['output_dir'] = 'completions'

        # command-line options extend or override the base config
        if 'ml_backends' not in config or not isinstance(config['ml_backends'], list):
            config['ml_backends'] = []
        if args.ml_backends:
            for url in args.ml_backends:
                config['ml_backends'].append(cls._create_ml_backend_params(url))
        if args.sampling:
            config['sampling'] = args.sampling
        if args.port:
            config['port'] = args.port
        if args.host:
            config['host'] = args.host

        # create config.json
        with io.open(config_json_path, mode='w') as f:
            json.dump(config, f, indent=2)

        print('')
        print('Label Studio has been successfully initialized. Check project states in ' + project_dir)
        print('Start the server: label-studio start ' + project_dir)
        return project_dir

    @classmethod
    def get_config(cls, project_name, args):
        """
        Load the stored configuration of a project given its name.

        :param project_name: name used to resolve the project directory
        :param args: namespace forwarded to ``get_project_dir``
        :return: result of ``_get_config`` for the resolved directory
        """
        project_dir = cls.get_project_dir(project_name, args)
        return cls._get_config(project_dir)

    @classmethod
    def _get_config(cls, project_dir, args=None):
        """
        Load ``config.json`` from a project directory.

        The returned dict is the parsed file plus a ``config_path`` entry (the
        absolute path of the file); its ``input_path``, ``label_config`` and
        ``output_dir`` values are joined onto the directory containing the file.

        :param project_dir: directory expected to contain ``config.json``
        :param args: optional namespace; only its ``project_name`` attribute is
            read, to make the hint in the error message concrete. When the
            attribute is missing or empty, ``<project_name>`` is shown instead.
        :return: config dict
        :raises FileNotFoundError: if the directory or its ``config.json`` is missing
        """
        def _not_found(what):
            # A namespace without a usable project_name must not replace the
            # FileNotFoundError with an AttributeError/TypeError.
            project_name = getattr(args, 'project_name', None) or '<project_name>'
            return FileNotFoundError(
                what + ', maybe you\'ve missed appending "--init" option:\nlabel-studio start ' +
                project_name + ' --init'
            )

        # check if project directory exists
        if not os.path.exists(project_dir):
            raise _not_found('Couldn\'t find directory ' + project_dir)

        # check config.json exists in directory
        config_path = os.path.join(project_dir, 'config.json')
        if not os.path.exists(config_path):
            raise _not_found(
                'Couldn\'t find config file ' + config_path + ' in project directory ' + project_dir)

        config_path = os.path.abspath(config_path)
        with io.open(config_path) as c:
            config = json.load(c)

        # the file stores these locations relative to its own directory
        base_dir = os.path.dirname(config_path)
        config['config_path'] = config_path
        for key in ('input_path', 'label_config', 'output_dir'):
            config[key] = os.path.join(base_dir, config[key])
        return config

    @classmethod
    def _load_from_dir(cls, project_dir, project_name, args, context):
        """
        Instantiate a project from the configuration stored in ``project_dir``.

        :param project_dir: directory passed to ``_get_config``
        :param project_name: name handed to the constructor
        :param args: namespace passed to ``_get_config``
        :param context: value handed to the constructor unchanged
        :return: new instance of ``cls``
        """
        return cls(cls._get_config(project_dir, args), project_name, context)

    @classmethod
    def get(cls, project_name, args, context):
        """
        Return an existing project, loading it from disk on first access.

        :param project_name: key in the in-memory storage and name of the project
        :param args: namespace used to resolve and load the project directory
        :param context: value forwarded to ``_load_from_dir``
        :return: the project
        :raises ProjectNotFound: if the project is neither in memory nor on disk
        """
        # Fast path: the project is already held in memory
        if project_name in cls._storage:
            return cls._storage[project_name]

        location = cls.get_project_dir(project_name, args)
        if not os.path.exists(location):
            raise ProjectNotFound('Project {p} doesn\'t exist'.format(p=project_name))

        # The directory exists: load the project and remember it for next time
        loaded = cls._load_from_dir(location, project_name, args, context)
        cls._storage[project_name] = loaded
        return loaded

    @classmethod
    def create(cls, project_name, args, context):
        """
        Create the project resources on disk, load the project and register it.

        Unlike ``get``, this may create a new directory with project resources.

        :param project_name: name of the project and its key in the storage
        :param args: namespace forwarded to ``create_project_dir`` and the loader
        :param context: value forwarded to ``_load_from_dir``
        :return: the newly loaded project
        """
        location = cls.create_project_dir(project_name, args)
        instance = cls._load_from_dir(location, project_name, args, context)
        cls._storage[project_name] = instance
        return instance

    @classmethod
    def get_or_create(cls, project_name, args, context):
        """
        Return the project called ``project_name``, creating it if it is unknown.

        :param project_name: name of the project
        :param args: namespace forwarded to ``get`` / ``create``
        :param context: value forwarded to ``get`` / ``create``
        :return: the existing or the newly created project
        """
        try:
            existing = cls.get(project_name, args, context)
        except ProjectNotFound:
            # Not in memory and not on disk: build it from scratch
            created = cls.create(project_name, args, context)
            logger.info('Project "' + project_name + '" created.')
            return created
        logger.info('Get project "' + project_name + '".')
        return existing

    def update_on_boarding_state(self):
        self.on_boarding['setup'] = self.config.get('label_config_updated', False)
        self.on_boarding['import'] = len(self.tasks) > 0
        self.on_boarding['labeled'] = len(os.listdir(self.config['output_dir'])) > 0
        return self.on_boarding