Example #1
    def can_accumulate_gradients(self):
        if config_value('caffe_root')['flavor'] == 'BVLC':
            return True
        elif config_value('caffe_root')['flavor'] == 'NVIDIA':
            # parse both sides so the comparison is semantic, not lexicographic
            return parse_version(config_value('caffe_root')['version']) > parse_version('0.14.0-alpha')
        else:
            raise ValueError('Unknown flavor. Only NVIDIA and BVLC flavors are supported.')
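A quick sanity check of why both sides must be parsed, assuming parse_version here is pkg_resources.parse_version or a compatible wrapper: version strings do not compare numerically, parsed versions do.

from pkg_resources import parse_version

# '-alpha' parses as a pre-release, so it sorts before the final release
assert parse_version('0.14.0') > parse_version('0.14.0-alpha')
assert parse_version('0.15.0') > parse_version('0.14.0-alpha')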
Example #2
def save_weights(network_path, weights_path, gpu=None, logger=None):

    if config_value('torch_root') == '<PATHS>':
        torch_bin = 'th'
    else:
        torch_bin = os.path.join(config_value('torch_root'), 'bin', 'th')

    args = [
        torch_bin,
        os.path.join(os.path.dirname(os.path.dirname(digits.__file__)),
                     'tools', 'torch', 'wrapper.lua'), 'getWeights.lua',
        '--network=%s' % os.path.basename(network_path).split(".")[0],
        '--networkDirectory=%s' % os.path.split(network_path)[0],
        '--snapshot=%s' % os.path.split(weights_path)[1],
        '--save=.',
        '--type=float'
    ]

    # Convert them all to strings
    args = [str(x) for x in args]

    env = os.environ.copy()

    p = subprocess.Popen(args,
                         cwd=os.path.split(network_path)[0],
                         close_fds=True,
                         env=env)
    p.wait()
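A hypothetical invocation, purely for illustration (the job directory and snapshot filename below are made up):

# hypothetical paths; cwd is set to the network's directory, and --save=. writes output there
save_weights('/jobs/20200101-120000-4a3f/model.lua',
             '/jobs/20200101-120000-4a3f/snapshot_30_Weights.t7')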
Example #3
    def can_accumulate_gradients(self):
        if config_value('caffe')['flavor'] == 'BVLC':
            return True
        elif config_value('caffe')['flavor'] == 'NVIDIA':
            return (parse_version(config_value('caffe')['version']) > parse_version('0.14.0-alpha'))
        else:
            raise ValueError('Unknown flavor. Only NVIDIA and BVLC flavors are supported.')
Example #4
class DistributedCaffeFramework(CaffeFramework):
    """
    Derives from CaffeFramework; used for training on a remote server.

    Defines the required methods for interacting with the Caffe framework.
    This class can be instantiated as many times as there are compatible
    instances of Caffe.
    """

    # short descriptive name
    NAME = 'Caffe'

    # identifier of framework class (intended to be the same across
    # all instances of this class)
    CLASS = 'caffe'

    # whether this framework can shuffle data during training
    CAN_SHUFFLE_DATA = False
    SUPPORTS_PYTHON_LAYERS_FILE = True
    SUPPORTS_TIMELINE_TRACING = False

    if config_value('caffe')['flavor'] == 'NVIDIA':
        if parse_version(config_value('caffe')['version']) > parse_version(
                '0.14.0-alpha'):
            SUPPORTED_SOLVER_TYPES = [
                'SGD', 'NESTEROV', 'ADAGRAD', 'RMSPROP', 'ADADELTA', 'ADAM'
            ]
        else:
            SUPPORTED_SOLVER_TYPES = ['SGD', 'NESTEROV', 'ADAGRAD']
    elif config_value('caffe')['flavor'] == 'BVLC':
        SUPPORTED_SOLVER_TYPES = [
            'SGD', 'NESTEROV', 'ADAGRAD', 'RMSPROP', 'ADADELTA', 'ADAM'
        ]
    else:
        raise ValueError(
            'Unknown flavor. Only NVIDIA and BVLC flavors are supported.')

    SUPPORTED_DATA_TRANSFORMATION_TYPES = ['MEAN_SUBTRACTION', 'CROPPING']
    SUPPORTED_DATA_AUGMENTATION_TYPES = []

    @override
    def __init__(self):
        # call CaffeFramework's initializer rather than skipping it
        super(DistributedCaffeFramework, self).__init__()
        self.framework_id = self.CLASS

    @override
    def create_train_task(self, **kwargs):
        """
        create train task
        """
        print 'return DistributedTrainTask'
        return DistributedTrainTask(framework_id=self.framework_id, **kwargs)
Example #5
def setup_logging():
    socketio_logger = logging.getLogger('socketio')
    socketio_logger.addHandler(logging.StreamHandler(sys.stdout))

    # Set custom logger
    logging.setLoggerClass(JobIdLogger)

    formatter = logging.Formatter(
            fmt="%(asctime)s%(job_id)s [%(levelname)-5s] %(message)s",
            datefmt=DATE_FORMAT,
            )

    ### digits logger

    main_logger = logging.getLogger('digits')
    main_logger.setLevel(logging.DEBUG)
    # Log to stdout
    stdoutHandler = logging.StreamHandler(sys.stdout)
    stdoutHandler.setFormatter(formatter)
    stdoutHandler.setLevel(logging.DEBUG)
    main_logger.addHandler(stdoutHandler)

    ### digits.webapp logger

    if config_value('log_file'):
        webapp_logger = logging.getLogger('digits.webapp')
        webapp_logger.setLevel(logging.DEBUG)
        # Log to file
        fileHandler = logging.handlers.RotatingFileHandler(
                config_value('log_file'),
                maxBytes=(1024*1024*10), # 10 MB
                backupCount=10,
                )
        fileHandler.setFormatter(formatter)
        level = config_value('log_level')
        if level == 'debug':
            fileHandler.setLevel(logging.DEBUG)
        elif level == 'info':
            fileHandler.setLevel(logging.INFO)
        elif level == 'warning':
            fileHandler.setLevel(logging.WARNING)
        elif level == 'error':
            fileHandler.setLevel(logging.ERROR)
        elif level == 'critical':
            fileHandler.setLevel(logging.CRITICAL)
        webapp_logger.addHandler(fileHandler)

        ### Useful shortcut for the webapp, which may set job_id

        return JobIdLoggerAdapter(webapp_logger, {})
    else:
        print 'WARNING: log_file config option not found - no log file is being saved'
        return JobIdLoggerAdapter(main_logger, {})
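A minimal usage sketch, assuming JobIdLoggerAdapter behaves like a standard logging.LoggerAdapter:

logger = setup_logging()
# the %(job_id)s field in the format string is supplied by JobIdLogger,
# so plain calls work without passing extra context
logger.info('web server starting')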
Example #7
    def write_deploy(self):
        # Write torch layers to json for layerwise graph visualization
        if config_value('torch_root') == '<PATHS>':
            torch_bin = 'th'
        else:
            torch_bin = os.path.join(config_value('torch_root'), 'bin', 'th')

        args = [torch_bin,
                os.path.join(os.path.dirname(os.path.dirname(digits.__file__)), 'tools', 'torch', 'toGraph.lua'),
                '--network=%s' % os.path.split(self.get_deploy_path())[1].split(".")[0],
                '--output=%s' % self.get_model_def_path(True),
                ]
        env = os.environ.copy()

        p = subprocess.Popen(args, cwd=self.job_dir, env=env)
Example #8
def save_max_activations(network_path,
                         weights_path,
                         height,
                         width,
                         layer,
                         units=[-1],
                         mean_file_path=None,
                         gpu=None,
                         logger=None):
    if config_value('torch_root') == '<PATHS>':
        torch_bin = 'th'
    else:
        torch_bin = os.path.join(config_value('torch_root'), 'bin', 'th')

    args = [
        torch_bin,
        os.path.join(os.path.dirname(os.path.dirname(digits.__file__)),
                     'tools', 'torch', 'wrapper.lua'), 'gradientOptimizer.lua',
        '--network=%s' % os.path.basename(network_path).split(".")[0],
        '--weights=%s' % os.path.split(weights_path)[1],
        '--networkDirectory=%s' % os.path.split(network_path)[0],
        '--height=%s' % height,
        '--width=%s' % width,
        '--chain=%s' % layer,
        '--units=%s' % (','.join(str(x) for x in units))
    ]

    # Convert them all to strings
    args = [str(x) for x in args]

    env = os.environ.copy()

    if mean_file_path is not None:
        args.append('--mean_file_path=%s' % mean_file_path)

    if gpu is not None:
        args.append('--type=cuda')
        # make only the selected GPU visible
        env['CUDA_VISIBLE_DEVICES'] = "%d" % gpu
    else:
        args.append('--type=float')

    p = subprocess.Popen(args,
                         cwd=os.path.split(network_path)[0],
                         close_fds=True,
                         env=env)
    p.wait()
Example #9
def new(extension_id=None):
    """
    Return a form for a new GenericImageModelJob
    """
    form = GenericImageModelForm()
    form.dataset.choices = get_datasets(extension_id)
    form.standard_networks.choices = []
    form.previous_networks.choices = get_previous_networks()
    form.pretrained_networks.choices = get_pretrained_networks()
    prev_network_snapshots = get_previous_network_snapshots()

    # Is there a request to clone a job with ?clone=<job_id>
    fill_form_if_cloned(form)

    return flask.render_template(
        'models/images/generic/new.html',
        extension_id=extension_id,
        extension_title=extensions.data.get_extension(
            extension_id).get_title() if extension_id else None,
        form=form,
        frameworks=frameworks.get_frameworks(),
        previous_network_snapshots=prev_network_snapshots,
        previous_networks_fullinfo=get_previous_networks_fulldetails(),
        pretrained_networks_fullinfo=get_pretrained_networks_fulldetails(),
        multi_gpu=config_value('caffe')['multi_gpu'],
    )
Example #10
def new():
    """
    Return a form for a new ImageClassificationModelJob
    """
    form = ImageClassificationModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = get_standard_networks()
    form.standard_networks.default = get_default_standard_network()
    form.previous_networks.choices = get_previous_networks()
    form.pretrained_networks.choices = get_pretrained_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    # Is there a request to clone a job with ?clone=<job_id>
    fill_form_if_cloned(form)

    return flask.render_template(
        'models/images/classification/new.html',
        form=form,
        frameworks=frameworks.get_frameworks(),
        previous_network_snapshots=prev_network_snapshots,
        previous_networks_fullinfo=get_previous_networks_fulldetails(),
        pretrained_networks_fullinfo=get_pretrained_networks_fulldetails(),
        multi_gpu=config_value('caffe')['multi_gpu'],
    )
Example #12
    def test_inference_while_training(self):
        # make sure we can do inference while all GPUs are in use for training
        # if no GPUs, just test inference during a normal training job

        # get number of GPUs
        if self.FRAMEWORK == 'tensorflow':
            raise unittest.SkipTest('Tensorflow CPU inference during training not supported')

        gpu_count = 1
        if (config_value('gpu_list') and
                config_value('caffe')['cuda_enabled'] and
                config_value('caffe')['multi_gpu']):
            gpu_count = len(config_value('gpu_list').split(','))

        # grab an image for testing
        category = self.imageset_paths.keys()[-1]
        image_path = self.imageset_paths[category][-1]
        image_path = os.path.join(self.imageset_folder, image_path)
        with open(image_path, 'rb') as infile:
            # StringIO wrapping is needed to simulate POST file upload.
            image_upload = (StringIO(infile.read()), 'image.png')

        # create a long-running training job
        job2_id = self.create_model(
            select_gpu_count=gpu_count,
            batch_size=10 * gpu_count,
            train_epochs=1000,
        )
        try:
            while True:
                status = self.model_status(job2_id)
                if status in ['Initialized', 'Waiting']:
                    time.sleep(0.01)
                elif status == 'Running':
                    break
                else:
                    raise RuntimeError('job status is %s' % status)

            rv = self.app.post(
                '/models/images/classification/classify_one/json?job_id=%s' % self.model_id,
                data={'image_file': image_upload}
            )
            json.loads(rv.data)
            assert rv.status_code == 200, 'POST failed with %s' % rv.status_code
        finally:
            self.delete_model(job2_id)
Example #13
    def test_select_gpus(self):
        # test all possible combinations
        gpu_list = config_value('gpu_list').split(',')
        for i in xrange(len(gpu_list)):
            for combination in itertools.combinations(gpu_list, i + 1):
                if self.FRAMEWORK == 'torch' and len(combination) > 1:
                    raise unittest.SkipTest('Torch not tested with multi-GPU')
                yield self.check_select_gpus, combination
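For reference, the nested loops enumerate every non-empty subset of the GPU list; itertools.combinations yields each subset of a given size:

import itertools

gpu_list = ['0', '1', '2']
subsets = [c for i in range(len(gpu_list))
           for c in itertools.combinations(gpu_list, i + 1)]
# 7 subsets: ('0',), ('1',), ('2',), ('0','1'), ('0','2'), ('1','2'), ('0','1','2')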
Example #14
    def test_inference_while_training(self):
        # make sure we can do inference while all GPUs are in use for training
        # if no GPUs, just test inference during a normal training job

        # get number of GPUs
        if self.FRAMEWORK == 'tensorflow':
            raise unittest.SkipTest(
                'Tensorflow CPU inference during training not supported')

        gpu_count = 1
        if (config_value('gpu_list') and config_value('caffe')['cuda_enabled']
                and config_value('caffe')['multi_gpu']):
            gpu_count = len(config_value('gpu_list').split(','))

        # grab an image for testing
        category = self.imageset_paths.keys()[-1]
        image_path = self.imageset_paths[category][-1]
        image_path = os.path.join(self.imageset_folder, image_path)
        with open(image_path, 'rb') as infile:
            # StringIO wrapping is needed to simulate POST file upload.
            image_upload = (StringIO(infile.read()), 'image.png')

        # create a long-running training job
        job2_id = self.create_model(
            select_gpu_count=gpu_count,
            batch_size=10 * gpu_count,
            train_epochs=1000,
        )
        try:
            while True:
                status = self.model_status(job2_id)
                if status in ['Initialized', 'Waiting']:
                    time.sleep(0.01)
                elif status == 'Running':
                    break
                else:
                    raise RuntimeError('job status is %s' % status)

            rv = self.app.post(
                '/models/images/classification/classify_one.json?job_id=%s' %
                self.model_id,
                data={'image_file': image_upload})
            json.loads(rv.data)
            assert rv.status_code == 200, 'POST failed with %s' % rv.status_code
        finally:
            self.delete_model(job2_id)
Example #16
    def test_select_gpus(self):
        # test all possible combinations
        gpu_list = config_value('gpu_list').split(',')
        for i in range(len(gpu_list)):
            for combination in itertools.combinations(gpu_list, i + 1):
                # Don't test more than 4 GPUs
                if len(combination) <= 4:
                    yield self.check_select_gpus, combination
Example #17
def setup_logging():
    # Set custom logger
    logging.setLoggerClass(JobIdLogger)

    formatter = logging.Formatter(
        fmt="%(asctime)s%(job_id)s [%(levelname)-5s] %(message)s",
        datefmt=DATE_FORMAT,
    )

    # digits logger

    main_logger = logging.getLogger('digits')
    main_logger.setLevel(logging.DEBUG)
    # Log to stdout
    stdoutHandler = logging.StreamHandler(sys.stdout)
    stdoutHandler.setFormatter(formatter)
    stdoutHandler.setLevel(logging.DEBUG)
    main_logger.addHandler(stdoutHandler)

    # digits.webapp logger

    logfile_filename = config_value('log_file')['filename']
    logfile_level = config_value('log_file')['level']

    if logfile_filename is not None:
        webapp_logger = logging.getLogger('digits.webapp')
        webapp_logger.setLevel(logging.DEBUG)
        # Log to file
        fileHandler = logging.handlers.RotatingFileHandler(
            logfile_filename,
            maxBytes=(1024 * 1024 * 10),  # 10 MB
            backupCount=10,
        )
        fileHandler.setFormatter(formatter)
        fileHandler.setLevel(logfile_level)
        webapp_logger.addHandler(fileHandler)

        # Useful shortcut for the webapp, which may set job_id

        return JobIdLoggerAdapter(webapp_logger, {})
    else:
        print(
            'WARNING: log_file config option not found - no log file is being saved'
        )
        return JobIdLoggerAdapter(main_logger, {})
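Example #17 hands the configured level straight to setLevel instead of using the if/elif chain in Example #5; since Python 2.7, setLevel also accepts level names, and the logging module exposes the name-to-number mapping directly:

import logging

# the mapping works in both directions for registered names
assert logging.getLevelName('WARNING') == logging.WARNING   # 30
assert logging.getLevelName(logging.WARNING) == 'WARNING'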
Example #18
def get_view_extensions():
    """
    return all enabled view extensions
    """
    view_extensions = {}
    all_extensions = config_value('view_extension_list')
    for extension in all_extensions:
        view_extensions[extension.get_id()] = extension.get_title()
    return view_extensions
Example #20
    def setUpClass(cls):
        skipIfNotFramework('torch')
        if cls.FRAMEWORK == 'torch' and not config_value('torch')['enabled']:
            raise unittest.SkipTest('Torch not found')

        # Call super.setUpClass() unless we're the last in the class hierarchy
        supercls = super(TorchMixin, cls)
        if hasattr(supercls, 'setUpClass'):
            supercls.setUpClass()
Example #21
    def setUpClass(cls):
        skipIfNotFramework('caffe')
        if cls.FRAMEWORK == 'caffe' and not config_value('caffe')['loaded']:
            raise unittest.SkipTest('Caffe not found')

        # Call super.setUpClass() unless we're the last in the class hierarchy
        supercls = super(CaffeMixin, cls)
        if hasattr(supercls, 'setUpClass'):
            supercls.setUpClass()
Example #24
    def get_net(self, epoch=None):
        """
        Returns an instance of caffe.Net

        Keyword Arguments:
        epoch -- which snapshot to load (default is None, which loads the most recently generated snapshot)
        """
        if not self.has_model():
            return False

        file_to_load = None

        if not epoch:
            epoch = self.snapshots[-1][1]
            file_to_load = self.snapshots[-1][0]
        else:
            for snapshot_file, snapshot_epoch in self.snapshots:
                if snapshot_epoch == epoch:
                    file_to_load = snapshot_file
                    break
        if file_to_load is None:
            raise Exception('snapshot not found for epoch "%s"' % epoch)

        # check if already loaded
        if (
            self.loaded_snapshot_file
            and self.loaded_snapshot_file == file_to_load
            and hasattr(self, "_caffe_net")
            and self._caffe_net is not None
        ):
            return self._caffe_net

        if config_value("caffe_root")["cuda_enabled"] and config_value("gpu_list"):
            caffe.set_mode_gpu()

        # load a new model
        self._caffe_net = caffe.Net(self.path(self.deploy_file), file_to_load, caffe.TEST)

        self.loaded_snapshot_epoch = epoch
        self.loaded_snapshot_file = file_to_load

        return self._caffe_net
Example #26
    def test_inference_while_training(self):
        # make sure we can do inference while all GPUs are in use for training
        # if no GPUs, just test inference during a normal training job

        # get number of GPUs
        gpu_count = 1
        if config_value("gpu_list") and config_value("caffe")["cuda_enabled"] and config_value("caffe")["multi_gpu"]:
            gpu_count = len(config_value("gpu_list").split(","))

        # grab an image for testing
        category = self.imageset_paths.keys()[-1]
        image_path = self.imageset_paths[category][-1]
        image_path = os.path.join(self.imageset_folder, image_path)
        with open(image_path, "rb") as infile:
            # StringIO wrapping is needed to simulate POST file upload.
            image_upload = (StringIO(infile.read()), "image.png")

        # create a long-running training job
        job2_id = self.create_model(select_gpu_count=gpu_count, batch_size=10 * gpu_count, train_epochs=1000)
        try:
            while True:
                status = self.model_status(job2_id)
                if status in ["Initialized", "Waiting"]:
                    time.sleep(0.01)
                elif status == "Running":
                    break
                else:
                    raise RuntimeError("job status is %s" % status)

            rv = self.app.post(
                "/models/images/classification/classify_one.json?job_id=%s" % self.model_id,
                data={"image_file": image_upload},
            )
            data = json.loads(rv.data)
            assert rv.status_code == 200, "POST failed with %s" % rv.status_code
        finally:
            self.delete_model(job2_id)
Example #27
    def task_arguments(self, resources):
        args = [config_value("caffe_root")["executable"], "train", "--solver=%s" % self.path(self.solver_file)]

        if "gpus" in resources:
            identifiers = []
            for identifier, value in resources["gpus"]:
                identifiers.append(identifier)
            if len(identifiers) == 1:
                args.append("--gpu=%s" % identifiers[0])
            elif len(identifiers) > 1:
                args.append("--gpus=%s" % ",".join(identifiers))
        if self.pretrained_model:
            args.append("--weights=%s" % self.path(self.pretrained_model))

        return args
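For illustration, with a single GPU resource the list above resolves to something like the following (the executable path is whatever config_value reports; the job path is made up):

# ["/usr/bin/caffe", "train",
#  "--solver=/jobs/20200101-120000-4a3f/solver.prototxt",
#  "--gpu=0"]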
Example #28
    def __init__(self, name):
        """
        Arguments:
        name -- name of this job
        """
        super(Job, self).__init__()

        # create a unique ID
        self._id = '%s-%s' % (time.strftime('%Y%m%d-%H%M%S'), os.urandom(2).encode('hex'))
        self._dir = os.path.join(config_value('jobs_dir'), self._id)
        self._name = name
        self.pickver_job = PICKLE_VERSION
        self.tasks = []
        self.exception = None

        os.mkdir(self._dir)
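For reference, the generated ID is a timestamp plus two random bytes rendered as four hex digits (note that str.encode('hex') is Python 2 only; Example #43 shows a codecs-based alternative):

# time.strftime('%Y%m%d-%H%M%S')  -> e.g. '20200101-120000'
# os.urandom(2).encode('hex')     -> e.g. '4a3f'
# resulting job ID                -> '20200101-120000-4a3f'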
Example #29
def generic_image_model_new():
    """
    Return a form for a new GenericImageModelJob
    """
    form = GenericImageModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = []
    form.previous_networks.choices = get_previous_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    return flask.render_template('models/images/generic/new.html',
                                 form=form,
                                 previous_network_snapshots=prev_network_snapshots,
                                 multi_gpu=config_value('caffe_root')['multi_gpu'],
                                 )
Example #30
def image_classification_model_new():
    """
    Return a form for a new ImageClassificationModelJob
    """
    form = ImageClassificationModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = get_standard_networks()
    form.standard_networks.default = get_default_standard_network()
    form.previous_networks.choices = get_previous_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    return flask.render_template('models/images/classification/new.html',
                                 form=form,
                                 previous_network_snapshots=prev_network_snapshots,
                                 multi_gpu=config_value('caffe_root')['multi_gpu'],
                                 )
Example #33
    def __init__(self, name, workspace):
        """
        Arguments:
        name -- name of this job
        workspace -- the workspace to which the new job belongs
        """
        super(Job, self).__init__()

        # create a unique ID
        self._id = '%s-%s' % (time.strftime('%Y%m%d-%H%M%S'), os.urandom(2).encode('hex'))
        workspace = Organization.objects.get(id=workspace['workspace_id'])
        WorkspaceJob.objects.create(job_id=self._id, workspace=workspace).save()
        self._dir = os.path.join(config_value('jobs_dir'), self._id)
        self._name = name
        self.pickver_job = PICKLE_VERSION
        self.tasks = []
        self.exception = None

        os.mkdir(self._dir)
Example #34
    def load(cls, job_id):
        """
        Loads a Job in the given job_id
        Returns the Job or throws an exception
        """
        from digits.model.tasks import TrainTask

        job_dir = os.path.join(config_value('jobs_dir'), job_id)
        filename = os.path.join(job_dir, cls.SAVE_FILE)
        with open(filename, 'rb') as savefile:
            job = pickle.load(savefile)
            # Reset this on load
            job._dir = job_dir
            for task in job.tasks:
                task.job_dir = job_dir
                if isinstance(task, TrainTask):
                    # can't call this until the job_dir is set
                    task.detect_snapshots()
            return job
Example #37
    def path(self, filename, relative=False):
        """
        Returns a path to the given file

        Arguments:
        filename -- the requested file

        Keyword arguments:
        relative -- If False, return an absolute path to the file
                    If True, return a path relative to the jobs directory
        """
        if not filename:
            return None
        if os.path.isabs(filename):
            path = filename
        else:
            path = os.path.join(self._dir, filename)
            if relative:
                path = os.path.relpath(path, config_value('jobs_dir'))
        return str(path).replace("\\", "/")
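Illustrative behavior, assuming jobs_dir is '/jobs' and the job directory is '/jobs/20200101-120000-4a3f':

# path('model.prototxt')                 -> '/jobs/20200101-120000-4a3f/model.prototxt'
# path('model.prototxt', relative=True)  -> '20200101-120000-4a3f/model.prototxt'
# path('/tmp/weights.caffemodel')        -> '/tmp/weights.caffemodel' (absolute paths pass through)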
Example #38
def image_classification_model_new():
    """
    Return a form for a new ImageClassificationModelJob
    """
    form = ImageClassificationModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = get_standard_networks()
    form.standard_networks.default = get_default_standard_network()
    form.previous_networks.choices = get_previous_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    return flask.render_template(
        "models/images/classification/new.html",
        form=form,
        frameworks=frameworks.get_frameworks(),
        previous_network_snapshots=prev_network_snapshots,
        previous_networks_fullinfo=get_previous_networks_fulldetails(),
        multi_gpu=config_value("caffe_root")["multi_gpu"],
    )
Example #39
    def path(self, filename, relative=False):
        """
        Returns a path to the given file

        Arguments:
        filename -- the requested file

        Keyword arguments:
        relative -- If False, return an absolute path to the file
                    If True, return a path relative to the jobs directory
        """
        if not filename:
            return None
        if os.path.isabs(filename):
            path = filename
        else:
            path = os.path.join(self._dir, filename)
        if relative:
            path = os.path.relpath(path, config_value('jobs_dir'))
        return str(path).replace("\\", "/")
Example #40
    def __init__(self, name, username, persistent=True):
        """
        Arguments:
        name -- name of this job
        username -- creator of this job
        """
        super(Job, self).__init__()

        # create a unique ID
        self._id = '%s-%s' % (time.strftime('%Y%m%d-%H%M%S'), os.urandom(2).encode('hex'))
        self._dir = os.path.join(config_value('jobs_dir'), self._id)
        self._name = name
        self.username = username
        self.pickver_job = PICKLE_VERSION
        self.tasks = []
        self.exception = None
        self._notes = None
        self.event = threading.Event()
        self.persistent = persistent

        os.mkdir(self._dir)
Example #41
def generic_image_model_new():
    """
    Return a form for a new GenericImageModelJob
    """
    form = GenericImageModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = []
    form.previous_networks.choices = get_previous_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    # Is there a request to clone a job with ?clone=<job_id>
    fill_form_if_cloned(form)

    return flask.render_template('models/images/generic/new.html',
                                 form=form,
                                 frameworks=frameworks.get_frameworks(),
                                 previous_network_snapshots=prev_network_snapshots,
                                 previous_networks_fullinfo=get_previous_networks_fulldetails(),
                                 multi_gpu=config_value('caffe_root')['multi_gpu'],
                                 )
Example #43
    def __init__(self, name, username, group='', persistent=True):
        """
        Arguments:
        name -- name of this job
        username -- creator of this job
        """
        super(Job, self).__init__()

        # create a unique ID
        # decode() keeps the hex suffix a plain string on Python 3
        self._id = '%s-%s' % (time.strftime('%Y%m%d-%H%M%S'), codecs.encode(os.urandom(2), 'hex').decode())
        self._dir = os.path.join(config_value('jobs_dir'), self._id)
        self._name = name
        self.group = group
        self.username = username
        self.pickver_job = PICKLE_VERSION
        self.tasks = []
        self.exception = None
        self._notes = None
        self.event = threading.Event()
        self.persistent = persistent

        os.mkdir(self._dir)
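The codecs call above replaces the Python 2-only str.encode('hex') idiom used by the earlier Job variants; a minimal sketch of the difference:

import codecs
import os

raw = os.urandom(2)                   # two random bytes
suffix = codecs.encode(raw, 'hex')    # b'4a3f'; works on Python 2 and 3
# raw.encode('hex') works only on Python 2 (bytes has no .encode on Python 3)
job_suffix = suffix.decode()          # '4a3f' as text for string formatting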
Example #45
def create():
    """
    Create a new ImageClassificationModelJob

    Returns JSON when requested: {job_id,name,status} or {errors:[]}
    """
    form = ImageClassificationModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = get_standard_networks()
    form.standard_networks.default = get_default_standard_network()
    form.previous_networks.choices = get_previous_networks()
    form.pretrained_networks.choices = get_pretrained_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    # Is there a request to clone a job with ?clone=<job_id>
    fill_form_if_cloned(form)

    if not form.validate_on_submit():
        if request_wants_json():
            return flask.jsonify({'errors': form.errors}), 400
        else:
            return flask.render_template('models/images/classification/new.html',
                                         form=form,
                                         frameworks=frameworks.get_frameworks(),
                                         previous_network_snapshots=prev_network_snapshots,
                                         previous_networks_fullinfo=get_previous_networks_fulldetails(),
                                         pretrained_networks_fullinfo=get_pretrained_networks_fulldetails(),
                                         multi_gpu=config_value('caffe')['multi_gpu'],
                                         ), 400

    datasetJob = scheduler.get_job(form.dataset.data)
    if not datasetJob:
        raise werkzeug.exceptions.BadRequest(
            'Unknown dataset job_id "%s"' % form.dataset.data)

    # sweeps will be a list of the permutations of swept fields
    # Get swept learning_rate
    sweeps = [{'learning_rate': v} for v in form.learning_rate.data]
    add_learning_rate = len(form.learning_rate.data) > 1

    # Add swept batch_size
    sweeps = [dict(s.items() + [('batch_size', bs)]) for bs in form.batch_size.data for s in sweeps[:]]
    add_batch_size = len(form.batch_size.data) > 1
    n_jobs = len(sweeps)

    jobs = []
    for sweep in sweeps:
        # Populate the form with swept data to be used in saving and
        # launching jobs.
        form.learning_rate.data = sweep['learning_rate']
        form.batch_size.data = sweep['batch_size']

        # Augment Job Name
        extra = ''
        if add_learning_rate:
            extra += ' learning_rate:%s' % str(form.learning_rate.data[0])
        if add_batch_size:
            extra += ' batch_size:%d' % form.batch_size.data[0]

        job = None
        try:
            job = ImageClassificationModelJob(
                username=utils.auth.get_username(),
                name=form.model_name.data + extra,
                group=form.group_name.data,
                dataset_id=datasetJob.id(),
            )
            # get handle to framework object
            fw = frameworks.get_framework_by_id(form.framework.data)

            pretrained_model = None
            if form.method.data == 'standard':
                found = False

                # can we find it in standard networks?
                network_desc = fw.get_standard_network_desc(form.standard_networks.data)
                if network_desc:
                    found = True
                    network = fw.get_network_from_desc(network_desc)

                if not found:
                    raise werkzeug.exceptions.BadRequest(
                        'Unknown standard model "%s"' % form.standard_networks.data)
            elif form.method.data == 'previous':
                old_job = scheduler.get_job(form.previous_networks.data)
                if not old_job:
                    raise werkzeug.exceptions.BadRequest(
                        'Job not found: %s' % form.previous_networks.data)

                use_same_dataset = (old_job.dataset_id == job.dataset_id)
                network = fw.get_network_from_previous(old_job.train_task().network, use_same_dataset)

                for choice in form.previous_networks.choices:
                    if choice[0] == form.previous_networks.data:
                        epoch = float(flask.request.form['%s-snapshot' % form.previous_networks.data])
                        if epoch == 0:
                            pass
                        elif epoch == -1:
                            pretrained_model = old_job.train_task().pretrained_model
                        else:
                            # verify snapshot exists
                            pretrained_model = old_job.train_task().get_snapshot(epoch, download=True)
                            if pretrained_model is None:
                                raise werkzeug.exceptions.BadRequest(
                                    "For the job %s, selected pretrained_model for epoch %d is invalid!"
                                    % (form.previous_networks.data, epoch))
                            # if a list is returned, the first entry is the weights file; the rest is metadata
                            if isinstance(pretrained_model, list):
                                pretrained_model = pretrained_model[0]

                            if not os.path.exists(pretrained_model):
                                raise werkzeug.exceptions.BadRequest(
                                    "The pretrained model for the selected epoch doesn't exist. "
                                    "It may have been deleted by another user or process. "
                                    "Please restart the server to reload the pretrained model details.")
                            # get logical path
                            pretrained_model = old_job.train_task().get_snapshot(epoch)
                        break

            elif form.method.data == 'pretrained':
                pretrained_job = scheduler.get_job(form.pretrained_networks.data)
                model_def_path = pretrained_job.get_model_def_path()
                weights_path = pretrained_job.get_weights_path()

                network = fw.get_network_from_path(model_def_path)
                pretrained_model = weights_path

            elif form.method.data == 'custom':
                network = fw.get_network_from_desc(form.custom_network.data)
                pretrained_model = form.custom_network_snapshot.data.strip()
            else:
                raise werkzeug.exceptions.BadRequest(
                    'Unrecognized method: "%s"' % form.method.data)

            policy = {'policy': form.lr_policy.data}
            if form.lr_policy.data == 'fixed':
                pass
            elif form.lr_policy.data == 'step':
                policy['stepsize'] = form.lr_step_size.data
                policy['gamma'] = form.lr_step_gamma.data
            elif form.lr_policy.data == 'multistep':
                policy['stepvalue'] = form.lr_multistep_values.data
                policy['gamma'] = form.lr_multistep_gamma.data
            elif form.lr_policy.data == 'exp':
                policy['gamma'] = form.lr_exp_gamma.data
            elif form.lr_policy.data == 'inv':
                policy['gamma'] = form.lr_inv_gamma.data
                policy['power'] = form.lr_inv_power.data
            elif form.lr_policy.data == 'poly':
                policy['power'] = form.lr_poly_power.data
            elif form.lr_policy.data == 'sigmoid':
                policy['stepsize'] = form.lr_sigmoid_step.data
                policy['gamma'] = form.lr_sigmoid_gamma.data
            else:
                raise werkzeug.exceptions.BadRequest(
                    'Invalid learning rate policy')

            if config_value('caffe')['multi_gpu']:
                if form.select_gpus.data:
                    selected_gpus = [str(gpu) for gpu in form.select_gpus.data]
                    gpu_count = None
                elif form.select_gpu_count.data:
                    gpu_count = form.select_gpu_count.data
                    selected_gpus = None
                else:
                    gpu_count = 1
                    selected_gpus = None
            else:
                if form.select_gpu.data == 'next':
                    gpu_count = 1
                    selected_gpus = None
                else:
                    selected_gpus = [str(form.select_gpu.data)]
                    gpu_count = None

            # Set up data augmentation structure
            data_aug = {}
            data_aug['flip'] = form.aug_flip.data
            data_aug['quad_rot'] = form.aug_quad_rot.data
            data_aug['rot'] = form.aug_rot.data
            data_aug['scale'] = form.aug_scale.data
            data_aug['noise'] = form.aug_noise.data
            data_aug['contrast'] = form.aug_contrast.data
            data_aug['whitening'] = form.aug_whitening.data
            data_aug['hsv_use'] = form.aug_hsv_use.data
            data_aug['hsv_h'] = form.aug_hsv_h.data
            data_aug['hsv_s'] = form.aug_hsv_s.data
            data_aug['hsv_v'] = form.aug_hsv_v.data

            # Python Layer File may be on the server or copied from the client.
            fs.copy_python_layer_file(
                bool(form.python_layer_from_client.data),
                job.dir(),
                (flask.request.files[form.python_layer_client_file.name]
                 if form.python_layer_client_file.name in flask.request.files
                 else ''), form.python_layer_server_file.data)

            job.tasks.append(fw.create_train_task(
                job=job,
                dataset=datasetJob,
                train_epochs=form.train_epochs.data,
                snapshot_interval=form.snapshot_interval.data,
                learning_rate=form.learning_rate.data[0],
                lr_policy=policy,
                gpu_count=gpu_count,
                selected_gpus=selected_gpus,
                batch_size=form.batch_size.data[0],
                batch_accumulation=form.batch_accumulation.data,
                val_interval=form.val_interval.data,
                traces_interval=form.traces_interval.data,
                pretrained_model=pretrained_model,
                crop_size=form.crop_size.data,
                use_mean=form.use_mean.data,
                network=network,
                random_seed=form.random_seed.data,
                solver_type=form.solver_type.data,
                rms_decay=form.rms_decay.data,
                shuffle=form.shuffle.data,
                data_aug=data_aug,
            ))

            # Save form data with the job so we can easily clone it later.
            save_form_to_job(job, form)

            jobs.append(job)
            scheduler.add_job(job)
            if n_jobs == 1:
                if request_wants_json():
                    return flask.jsonify(job.json_dict())
                else:
                    return flask.redirect(flask.url_for('digits.model.views.show', job_id=job.id()))

        except:
            if job:
                scheduler.delete_job(job)
            raise

    if request_wants_json():
        return flask.jsonify(jobs=[j.json_dict() for j in jobs])

    # If there are multiple jobs launched, go to the home page.
    return flask.redirect('/')
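The sweep expansion above builds a cross product of the swept fields, so, for example, two learning rates and two batch sizes launch four jobs:

# learning_rate in [0.01, 0.001] and batch_size in [16, 32] expand to:
# [{'learning_rate': 0.01,  'batch_size': 16},
#  {'learning_rate': 0.001, 'batch_size': 16},
#  {'learning_rate': 0.01,  'batch_size': 32},
#  {'learning_rate': 0.001, 'batch_size': 32}]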
Example #46
    def setUpClass(cls):
        super(BaseViewsTest, cls).setUpClass()
        if cls.FRAMEWORK == 'torch' and not config_value('torch_root'):
            raise unittest.SkipTest('Torch not found')
Example #47
    def task_arguments(self, resources, env):
        if config_value('torch_root') == '<PATHS>':
            torch_bin = 'th'
        else:
            torch_bin = os.path.join(config_value('torch_root'), 'bin', 'th')

        dataset_backend = self.dataset.get_backend()
        assert dataset_backend == 'lmdb' or dataset_backend == 'hdf5'

        args = [torch_bin,
                os.path.join(os.path.dirname(os.path.dirname(digits.__file__)), 'tools', 'torch', 'wrapper.lua'),
                'main.lua',
                '--network=%s' % self.model_file.split(".")[0],
                '--epoch=%d' % int(self.train_epochs),
                '--networkDirectory=%s' % self.job_dir,
                '--save=%s' % self.job_dir,
                '--snapshotPrefix=%s' % self.snapshot_prefix,
                '--snapshotInterval=%s' % self.snapshot_interval,
                '--learningRate=%s' % self.learning_rate,
                '--policy=%s' % str(self.lr_policy['policy']),
                '--dbbackend=%s' % dataset_backend
                ]

        if self.batch_size is not None:
            args.append('--batchSize=%d' % self.batch_size)

        if self.use_mean != 'none':
            filename = self.create_mean_file()
            args.append('--mean=%s' % filename)

        if hasattr(self.dataset, 'labels_file'):
            args.append('--labels=%s' % self.dataset.path(self.dataset.labels_file))

        train_feature_db_path = self.dataset.get_feature_db_path(constants.TRAIN_DB)
        train_label_db_path = self.dataset.get_label_db_path(constants.TRAIN_DB)
        val_feature_db_path = self.dataset.get_feature_db_path(constants.VAL_DB)
        val_label_db_path = self.dataset.get_label_db_path(constants.VAL_DB)

        args.append('--train=%s' % train_feature_db_path)
        if train_label_db_path:
            args.append('--train_labels=%s' % train_label_db_path)
        if val_feature_db_path:
            args.append('--validation=%s' % val_feature_db_path)
        if val_label_db_path:
            args.append('--validation_labels=%s' % val_label_db_path)

        # learning rate policy input parameters
        if self.lr_policy['policy'] == 'fixed':
            pass
        elif self.lr_policy['policy'] == 'step':
            args.append('--gamma=%s' % self.lr_policy['gamma'])
            args.append('--stepvalues=%s' % self.lr_policy['stepsize'])
        elif self.lr_policy['policy'] == 'multistep':
            args.append('--stepvalues=%s' % self.lr_policy['stepvalue'])
            args.append('--gamma=%s' % self.lr_policy['gamma'])
        elif self.lr_policy['policy'] == 'exp':
            args.append('--gamma=%s' % self.lr_policy['gamma'])
        elif self.lr_policy['policy'] == 'inv':
            args.append('--gamma=%s' % self.lr_policy['gamma'])
            args.append('--power=%s' % self.lr_policy['power'])
        elif self.lr_policy['policy'] == 'poly':
            args.append('--power=%s' % self.lr_policy['power'])
        elif self.lr_policy['policy'] == 'sigmoid':
            args.append('--stepvalues=%s' % self.lr_policy['stepsize'])
            args.append('--gamma=%s' % self.lr_policy['gamma'])

        if self.shuffle:
            args.append('--shuffle=yes')

        if self.crop_size:
            args.append('--crop=yes')
            args.append('--croplen=%d' % self.crop_size)

        if self.use_mean == 'pixel':
            args.append('--subtractMean=pixel')
        elif self.use_mean == 'image':
            args.append('--subtractMean=image')
        else:
            args.append('--subtractMean=none')

        if self.random_seed is not None:
            args.append('--seed=%s' % self.random_seed)

        if self.solver_type == 'SGD':
            args.append('--optimization=sgd')
        elif self.solver_type == 'NESTEROV':
            args.append('--optimization=nag')
        elif self.solver_type == 'ADAGRAD':
            args.append('--optimization=adagrad')
        elif self.solver_type == 'RMSPROP':
            args.append('--optimization=rmsprop')
        elif self.solver_type == 'ADADELTA':
            args.append('--optimization=adadelta')
        elif self.solver_type == 'ADAM':
            args.append('--optimization=adam')
        else:
            raise ValueError('Unknown solver_type %s' % self.solver_type)

        if self.val_interval > 0:
            args.append('--interval=%s' % self.val_interval)

        if 'gpus' in resources:
            identifiers = []
            for identifier, value in resources['gpus']:
                identifiers.append(identifier)
            # make all selected GPUs visible to the Torch 'th' process.
            # don't make other GPUs visible though since Torch will load
            # CUDA libraries and allocate memory on all visible GPUs by
            # default.
            env['CUDA_VISIBLE_DEVICES'] = ','.join(identifiers)
            # switch to GPU mode
            args.append('--type=cuda')
        else:
            # switch to CPU mode
            args.append('--type=float')

        if self.pretrained_model:
            filenames = self.pretrained_model.split(os.path.pathsep)
            if len(filenames) > 1:
                raise ValueError('Torch does not support multiple pretrained model files')
            args.append('--weights=%s' % self.path(filenames[0]))

        # Augmentations
        assert self.data_aug['flip'] in ['none', 'fliplr', 'flipud', 'fliplrud'], 'Bad or unknown flag "flip"'
        args.append('--augFlip=%s' % self.data_aug['flip'])

        assert self.data_aug['quad_rot'] in ['none', 'rot90', 'rot180', 'rotall'], 'Bad or unknown flag "quad_rot"'
        args.append('--augQuadRot=%s' % self.data_aug['quad_rot'])

        if self.data_aug['rot']:
            args.append('--augRot=%s' % self.data_aug['rot'])

        if self.data_aug['scale']:
            args.append('--augScale=%s' % self.data_aug['scale'])

        if self.data_aug['noise']:
            args.append('--augNoise=%s' % self.data_aug['noise'])

        if self.data_aug['hsv_use']:
            args.append('--augHSVh=%s' % self.data_aug['hsv_h'])
            args.append('--augHSVs=%s' % self.data_aug['hsv_s'])
            args.append('--augHSVv=%s' % self.data_aug['hsv_v'])
        else:
            args.append('--augHSVh=0')
            args.append('--augHSVs=0')
            args.append('--augHSVv=0')

        return args
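
Taken together, the branches above simply translate task attributes into command-line flags for the Torch wrapper. A minimal standalone sketch with hypothetical values (no real config or task object is involved):

# Hypothetical inputs: 'step' LR policy, ADAM solver, 224-pixel crop, GPU mode.
lr_policy = {'policy': 'step', 'gamma': 0.1, 'stepsize': 33}
args = []
args.append('--gamma=%s' % lr_policy['gamma'])
args.append('--stepvalues=%s' % lr_policy['stepsize'])
args.append('--optimization=adam')
args.append('--crop=yes')
args.append('--croplen=%d' % 224)
args.append('--type=cuda')
print(args)
# ['--gamma=0.1', '--stepvalues=33', '--optimization=adam',
#  '--crop=yes', '--croplen=224', '--type=cuda']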
Example #48
    def test_select_gpu(self):
        for index in config_value('gpu_list').split(','):
            yield self.check_select_gpu, index
Example #49
    def test_select_gpus(self):
        # test all possible combinations
        gpu_list = config_value('gpu_list').split(',')
        for i in xrange(len(gpu_list)):
            for combination in itertools.combinations(gpu_list, i + 1):
                yield self.check_select_gpus, combination
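
Example #49 yields one test per non-empty subset of the configured GPUs. A standalone illustration of that itertools.combinations loop, with a made-up GPU list standing in for config_value('gpu_list'):

import itertools

gpu_list = '0,1,2'.split(',')  # stand-in for config_value('gpu_list')
for i in xrange(len(gpu_list)):
    for combination in itertools.combinations(gpu_list, i + 1):
        print(combination)
# prints ('0',), ('1',), ('2',), ('0', '1'), ('0', '2'), ('1', '2'), ('0', '1', '2')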
Example #50
def generic_image_model_create():
    """
    Create a new GenericImageModelJob

    Returns JSON when requested: {job_id,name,status} or {errors:[]}
    """
    form = GenericImageModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = []
    form.previous_networks.choices = get_previous_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    if not form.validate_on_submit():
        if request_wants_json():
            return flask.jsonify({'errors': form.errors}), 400
        else:
            return flask.render_template('models/images/generic/new.html',
                    form = form,
                    previous_network_snapshots = prev_network_snapshots,
                    previous_networks_fullinfo = get_previous_networks_fulldetails(),
                    multi_gpu = config_value('caffe_root')['multi_gpu'],
                    ), 400

    datasetJob = scheduler.get_job(form.dataset.data)
    if not datasetJob:
        raise werkzeug.exceptions.BadRequest(
                'Unknown dataset job_id "%s"' % form.dataset.data)

    job = None
    try:
        job = GenericImageModelJob(
                name        = form.model_name.data,
                dataset_id  = datasetJob.id(),
                )

        # get framework (hard-coded to caffe for now)
        fw = frameworks.get_framework_by_id('caffe')

        pretrained_model = None
        #if form.method.data == 'standard':
        if form.method.data == 'previous':
            old_job = scheduler.get_job(form.previous_networks.data)
            if not old_job:
                raise werkzeug.exceptions.BadRequest(
                        'Job not found: %s' % form.previous_networks.data)

            network = fw.get_network_from_previous(old_job.train_task().network)

            for choice in form.previous_networks.choices:
                if choice[0] == form.previous_networks.data:
                    epoch = float(flask.request.form['%s-snapshot' % form.previous_networks.data])
                    if epoch == 0:
                        pass
                    elif epoch == -1:
                        pretrained_model = old_job.train_task().pretrained_model
                    else:
                        for filename, e in old_job.train_task().snapshots:
                            if e == epoch:
                                pretrained_model = filename
                                break

                        if pretrained_model is None:
                            raise werkzeug.exceptions.BadRequest(
                                    "For the job %s, selected pretrained_model for epoch %d is invalid!"
                                    % (form.previous_networks.data, epoch))
                        if not os.path.exists(pretrained_model):
                            raise werkzeug.exceptions.BadRequest(
                                    "Pretrained_model for the selected epoch doesn't exist. It may have been deleted by another user or process. Please restart the server to reload the pretrained_model details.")
                    break

        elif form.method.data == 'custom':
            network = fw.get_network_from_desc(form.custom_network.data)
            pretrained_model = form.custom_network_snapshot.data.strip()
        else:
            raise werkzeug.exceptions.BadRequest(
                    'Unrecognized method: "%s"' % form.method.data)

        policy = {'policy': form.lr_policy.data}
        if form.lr_policy.data == 'fixed':
            pass
        elif form.lr_policy.data == 'step':
            policy['stepsize'] = form.lr_step_size.data
            policy['gamma'] = form.lr_step_gamma.data
        elif form.lr_policy.data == 'multistep':
            policy['stepvalue'] = form.lr_multistep_values.data
            policy['gamma'] = form.lr_multistep_gamma.data
        elif form.lr_policy.data == 'exp':
            policy['gamma'] = form.lr_exp_gamma.data
        elif form.lr_policy.data == 'inv':
            policy['gamma'] = form.lr_inv_gamma.data
            policy['power'] = form.lr_inv_power.data
        elif form.lr_policy.data == 'poly':
            policy['power'] = form.lr_poly_power.data
        elif form.lr_policy.data == 'sigmoid':
            policy['stepsize'] = form.lr_sigmoid_step.data
            policy['gamma'] = form.lr_sigmoid_gamma.data
        else:
            raise werkzeug.exceptions.BadRequest(
                    'Invalid learning rate policy')

        if config_value('caffe_root')['multi_gpu']:
            if form.select_gpu_count.data:
                gpu_count = form.select_gpu_count.data
                selected_gpus = None
            else:
                selected_gpus = [str(gpu) for gpu in form.select_gpus.data]
                gpu_count = None
        else:
            if form.select_gpu.data == 'next':
                gpu_count = 1
                selected_gpus = None
            else:
                selected_gpus = [str(form.select_gpu.data)]
                gpu_count = None

        job.tasks.append(fw.create_train_task(
                    job_dir         = job.dir(),
                    dataset         = datasetJob,
                    train_epochs    = form.train_epochs.data,
                    snapshot_interval   = form.snapshot_interval.data,
                    learning_rate   = form.learning_rate.data,
                    lr_policy       = policy,
                    gpu_count       = gpu_count,
                    selected_gpus   = selected_gpus,
                    batch_size      = form.batch_size.data,
                    val_interval    = form.val_interval.data,
                    pretrained_model= pretrained_model,
                    crop_size       = form.crop_size.data,
                    use_mean        = bool(form.use_mean.data),
                    network         = network,
                    random_seed     = form.random_seed.data,
                    solver_type     = form.solver_type.data,
                    )
                )

        scheduler.add_job(job)
        if request_wants_json():
            return flask.jsonify(job.json_dict())
        else:
            return flask.redirect(flask.url_for('models_show', job_id=job.id()))

    except:
        if job:
            scheduler.delete_job(job)
        raise
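
The GPU-selection branch near the end of Example #50 always produces exactly one of gpu_count (let the scheduler pick devices) or selected_gpus (explicit device ids). A small sketch of the same decision table, with hypothetical form values:

def pick_gpus(multi_gpu, select_gpu_count=None, select_gpus=None, select_gpu='next'):
    # returns (gpu_count, selected_gpus); exactly one of the two is set
    if multi_gpu:
        if select_gpu_count:
            return select_gpu_count, None
        return None, [str(g) for g in (select_gpus or [])]
    if select_gpu == 'next':
        return 1, None
    return None, [str(select_gpu)]

print(pick_gpus(True, select_gpu_count=2))   # (2, None)
print(pick_gpus(True, select_gpus=[0, 2]))   # (None, ['0', '2'])
print(pick_gpus(False, select_gpu='1'))      # (None, ['1'])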
Example #51
    def get_network_visualization(self, desc):
        """
        return visualization of network
        """
        # save network description to temporary file
        temp_network_handle, temp_network_path = tempfile.mkstemp(suffix='.py')
        os.write(temp_network_handle, desc)
        os.close(temp_network_handle)

        try:  # do this in a try..finally clause to make sure we delete the temp file
            # build command line
            mxnet_bin = config_value('mxnet')['executable']

            args = [mxnet_bin,
                    os.path.join(os.path.dirname(digits.__file__), 'tools', 'mxnet', 'train'),
                    '--network=%s' % os.path.splitext(os.path.basename(temp_network_path))[0],
                    '--networkDirectory=%s' % os.path.dirname(temp_network_path),
                    '--subtractMean=none',  # we are not providing a mean image
                    '--visualizeModel=yes',
                    '--type=float'
                    ]

            # execute command
            p = subprocess.Popen(args,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 close_fds=True,
                                 )

            # TODO: need to include regular expression for MAC color codes
            regex = re.compile(r'\x1b\[[0-9;]*m', re.UNICODE)

            # the network description will be accumulated from the command output
            # when collecting_net_definition==True
            collecting_net_definition = False
            desc = []
            unrecognized_output = []
            while p.poll() is None:
                for line in utils.nonblocking_readlines(p.stdout):
                    if line is not None:
                        # Remove whitespace and color codes. Color codes are
                        # appended to the beginning and end of each line by the
                        # underlying executable. Check the below link for more
                        # information
                        # https://groups.google.com/forum/#!searchin/mxnet7/color$20codes/mxnet7/8O_0lSgSzuA/Ih6wYg9fgcwJ  # noqa
                        line = regex.sub('', line)
                        timestamp, level, message = MxnetTrainTask.preprocess_output_mxnet(line.strip())
                        if message:
                            if message.startswith('Network definition'):
                                collecting_net_definition = not collecting_net_definition
                        else:
                            if collecting_net_definition:
                                desc.append(line)
                            elif len(line):
                                unrecognized_output.append(line)
                    else:
                        time.sleep(0.05)

            if not len(desc):
                # we did not find a network description
                raise NetworkVisualizationError(''.join(unrecognized_output))
            else:
                output = flask.Markup('<pre>')
                for line in desc:
                    output += flask.Markup.escape(line)
                output += flask.Markup('</pre>')
                return output
        finally:
            os.remove(temp_network_path)
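
The regular expression compiled above strips ANSI terminal color escapes from the subprocess output. A standalone demonstration:

import re

regex = re.compile(r'\x1b\[[0-9;]*m', re.UNICODE)
colored = '\x1b[32mNetwork definition\x1b[0m'
print(regex.sub('', colored))  # prints: Network definition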
Example #52
# Copyright (c) 2015-2017, NVIDIA CORPORATION.  All rights reserved.
from __future__ import absolute_import

from .framework import Framework
from .torch_framework import TorchFramework
from digits.config import config_value

__all__ = [
    'Framework',
    'TorchFramework',
]

if config_value('tensorflow')['enabled']:
    from .tensorflow_framework import TensorflowFramework
    __all__.append('TensorflowFramework')

#
#  create framework instances
#

# torch is optional
torch = TorchFramework() if config_value('torch')['enabled'] else None

# tensorflow is optional
tensorflow = TensorflowFramework() if config_value(
    'tensorflow')['enabled'] else None

#
#  utility functions
#
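
The excerpt stops at the 'utility functions' header. For context, a hedged sketch of the lookup helper that callers such as frameworks.get_framework_by_id('caffe') in Example #50 rely on; it builds on the module's torch and tensorflow instances, assumes each Framework exposes a get_id() accessor, and the real DIGITS implementation may differ:

def get_frameworks():
    """
    return the list of instantiated, enabled framework instances
    """
    return [fw for fw in (torch, tensorflow) if fw is not None]


def get_framework_by_id(framework_id):
    """
    return the framework instance whose id matches framework_id,
    or None if no such framework is enabled
    """
    for fw in get_frameworks():
        if fw.get_id() == framework_id:
            return fw
    return None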
Example #53
    def infer_one_image(self, image, snapshot_epoch=None, layers=None, gpu=None):
        """
        Classify an image
        Returns (predictions, visualizations)
            predictions -- an array of [ (label, confidence), ...] for each label, sorted by confidence
            visualizations -- an array of (layer_name, activations, weights) for the specified layers
        Raises InferenceError if something goes wrong

        Arguments:
        image -- a np.array

        Keyword arguments:
        snapshot_epoch -- which snapshot to use
        layers -- which layer activation[s] and weight[s] to visualize
        gpu -- the id of the GPU to use, or None for CPU mode
        """
        temp_image_handle, temp_image_path = tempfile.mkstemp(suffix='.png')
        os.close(temp_image_handle)
        image = PIL.Image.fromarray(image)
        try:
            image.save(temp_image_path, format='png')
        except KeyError:
            error_message = 'Unable to save file to "%s"' % temp_image_path
            self.logger.error(error_message)
            raise digits.inference.errors.InferenceError(error_message)

        if config_value('torch_root') == '<PATHS>':
            torch_bin = 'th'
        else:
            torch_bin = os.path.join(config_value('torch_root'), 'bin', 'th')

        file_to_load = self.get_snapshot(snapshot_epoch)

        args = [torch_bin,
                os.path.join(os.path.dirname(os.path.dirname(digits.__file__)),'tools','torch','wrapper.lua'),
                'test.lua',
                '--image=%s' % temp_image_path,
                '--network=%s' % self.model_file.split(".")[0],
                '--networkDirectory=%s' % self.job_dir,
                '--snapshot=%s' % file_to_load,
                '--allPredictions=yes',
                ]
        if hasattr(self.dataset, 'labels_file'):
            args.append('--labels=%s' % self.dataset.path(self.dataset.labels_file))

        if self.use_mean != 'none':
            filename = self.create_mean_file()
            args.append('--mean=%s' % os.path.join(self.job_dir, constants.MEAN_FILE_IMAGE))

        if self.use_mean == 'pixel':
            args.append('--subtractMean=pixel')
        elif self.use_mean == 'image':
            args.append('--subtractMean=image')
        else:
            args.append('--subtractMean=none')

        if self.crop_size:
            args.append('--crop=yes')
            args.append('--croplen=%d' % self.crop_size)

        if layers == 'all':
            args.append('--visualization=yes')
            args.append('--save=%s' % self.job_dir)

        # Convert them all to strings
        args = [str(x) for x in args]

        # TODO: need to include regular expression for MAC color codes
        regex = re.compile(r'\x1b\[[0-9;]*m', re.UNICODE)
        self.logger.info('%s classify one task started.' % self.get_framework_id())

        unrecognized_output = []
        predictions = []
        self.visualization_file = None

        env = os.environ.copy()

        if gpu is not None:
            args.append('--type=cuda')
            # make only the selected GPU visible
            env['CUDA_VISIBLE_DEVICES'] = "%d" % gpu
        else:
            args.append('--type=float')

        p = subprocess.Popen(args,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                cwd=self.job_dir,
                close_fds=True,
                env=env,
                )

        try:
            while p.poll() is None:
                for line in utils.nonblocking_readlines(p.stdout):
                    if self.aborted.is_set():
                        p.terminate()
                        raise digits.inference.errors.InferenceError('%s classify one task got aborted. error code - %s' % (self.get_framework_id(), p.returncode))

                    if line is not None:
                        # Remove color codes and whitespace
                        line = regex.sub('', line).strip()
                    if line:
                        if not self.process_test_output(line, predictions, 'one'):
                            self.logger.warning('%s classify one task unrecognized input: %s' % (self.get_framework_id(), line.strip()))
                            unrecognized_output.append(line)
                    else:
                        time.sleep(0.05)

        except Exception as e:
            if p.poll() is None:
                p.terminate()
            error_message = ''
            if isinstance(e, digits.inference.errors.InferenceError):
                error_message = str(e)
            else:
                error_message = '%s classify one task failed with error code %s \n %s' % (self.get_framework_id(), p.returncode, str(e))
            self.logger.error(error_message)
            if unrecognized_output:
                unrecognized_output = '\n'.join(unrecognized_output)
                error_message = error_message + unrecognized_output
            raise digits.inference.errors.InferenceError(error_message)

        finally:
            self.after_test_run(temp_image_path)

        if p.returncode != 0:
            error_message = '%s classify one task failed with error code %d' % (self.get_framework_id(), p.returncode)
            self.logger.error(error_message)
            if unrecognized_output:
                unrecognized_output = '\n'.join(unrecognized_output)
                error_message = error_message + unrecognized_output
            raise digits.inference.errors.InferenceError(error_message)
        else:
            self.logger.info('%s classify one task completed.' % self.get_framework_id())

        predictions = {'output': np.array(predictions)}

        visualizations = []

        if layers == 'all' and self.visualization_file:
            vis_db = h5py.File(self.visualization_file, 'r')
            # the HDF5 database is organized as follows:
            # <root>
            # |- layers
            #    |- 1
            #    |  |- name
            #    |  |- activations
            #    |  |- weights
            #    |- 2
            for layer_id, layer in vis_db['layers'].items():
                layer_desc = layer['name'][...].tostring()
                if 'Sequential' in layer_desc or 'Parallel' in layer_desc:
                    # ignore containers
                    continue
                idx = int(layer_id)
                # activations
                if 'activations' in layer:
                    data = np.array(layer['activations'][...])
                    # skip batch dimension
                    if len(data.shape) > 1 and data.shape[0] == 1:
                        data = data[0]
                    vis = utils.image.get_layer_vis_square(data)
                    mean, std, hist = self.get_layer_statistics(data)
                    visualizations.append({
                        'id': idx,
                        'name': layer_desc,
                        'vis_type': 'Activations',
                        'vis': vis,
                        'data_stats': {
                            'shape': data.shape,
                            'mean': mean,
                            'stddev': std,
                            'histogram': hist,
                        },
                    })
                # weights
                if 'weights' in layer:
                    data = np.array(layer['weights'][...])
                    if 'Linear' not in layer_desc:
                        vis = utils.image.get_layer_vis_square(data)
                    else:
                        # Linear (inner product) layers have too many weights
                        # to display
                        vis = None
                    mean, std, hist = self.get_layer_statistics(data)
                    parameter_count = reduce(operator.mul, data.shape, 1)
                    if 'bias' in layer:
                        bias = np.array(layer['bias'][...])
                        parameter_count += reduce(operator.mul, bias.shape, 1)
                    visualizations.append({
                        'id': idx,
                        'name': layer_desc,
                        'vis_type': 'Weights',
                        'vis': vis,
                        'param_count': parameter_count,
                        'data_stats': {
                            'shape': data.shape,
                            'mean': mean,
                            'stddev': std,
                            'histogram': hist,
                        },
                    })
            # sort by layer ID
            visualizations = sorted(visualizations, key=lambda x: x['id'])
        return (predictions, visualizations)
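
A minimal standalone sketch of walking the HDF5 layout described in the comment above (the file name is hypothetical):

import h5py

# hypothetical visualization file produced by the Torch wrapper
with h5py.File('vis.h5', 'r') as vis_db:
    for layer_id, layer in vis_db['layers'].items():
        name = layer['name'][...].tostring()
        print('%s: %s (activations=%s, weights=%s)' % (
            layer_id, name, 'activations' in layer, 'weights' in layer))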
Example #54
    def infer_many_images(self, images, snapshot_epoch=None, gpu=None):
        """
        Returns (labels, results):
        labels -- an array of strings
        results -- a 2D np array:
            [
                [image0_label0_confidence, image0_label1_confidence, ...],
                [image1_label0_confidence, image1_label1_confidence, ...],
                ...
            ]

        Arguments:
        images -- a list of np.arrays

        Keyword arguments:
        snapshot_epoch -- which snapshot to use
        """

        # create a temporary folder to store images and a temporary file
        # to store a list of paths to the images
        temp_dir_path = tempfile.mkdtemp()
        try: # this try...finally clause is used to clean up the temp directory in any case
            temp_imglist_handle, temp_imglist_path = tempfile.mkstemp(dir=temp_dir_path, suffix='.txt')
            for image in images:
                temp_image_handle, temp_image_path = tempfile.mkstemp(
                        dir=temp_dir_path, suffix='.png')
                image = PIL.Image.fromarray(image)
                try:
                    image.save(temp_image_path, format='png')
                except KeyError:
                    error_message = 'Unable to save file to "%s"' % temp_image_path
                    self.logger.error(error_message)
                    raise digits.inference.errors.InferenceError(error_message)
                os.write(temp_imglist_handle, "%s\n" % temp_image_path)
                os.close(temp_image_handle)
            os.close(temp_imglist_handle)

            if config_value('torch_root') == '<PATHS>':
                torch_bin = 'th'
            else:
                torch_bin = os.path.join(config_value('torch_root'), 'bin', 'th')

            file_to_load = self.get_snapshot(snapshot_epoch)

            args = [torch_bin,
                    os.path.join(os.path.dirname(os.path.dirname(digits.__file__)),'tools','torch','wrapper.lua'),
                    'test.lua',
                    '--testMany=yes',
                    '--allPredictions=yes',  # all predictions are grabbed and formatted as required by DIGITS
                    '--image=%s' % str(temp_imglist_path),
                    '--network=%s' % self.model_file.split(".")[0],
                    '--networkDirectory=%s' % self.job_dir,
                    '--snapshot=%s' % file_to_load,
                    ]

            if hasattr(self.dataset, 'labels_file'):
                args.append('--labels=%s' % self.dataset.path(self.dataset.labels_file))

            if self.use_mean != 'none':
                filename = self.create_mean_file()
                args.append('--mean=%s' % os.path.join(self.job_dir, constants.MEAN_FILE_IMAGE))

            if self.use_mean == 'pixel':
                args.append('--subtractMean=pixel')
            elif self.use_mean == 'image':
                args.append('--subtractMean=image')
            else:
                args.append('--subtractMean=none')
            if self.crop_size:
                args.append('--crop=yes')
                args.append('--croplen=%d' % self.crop_size)

            # Convert them all to strings
            args = [str(x) for x in args]

            # TODO: need to include regular expression for MAC color codes
            regex = re.compile(r'\x1b\[[0-9;]*m', re.UNICODE)
            self.logger.info('%s classify many task started.' % self.name())

            env = os.environ.copy()
            if gpu is not None:
                args.append('--type=cuda')
                # make only the selected GPU visible
                env['CUDA_VISIBLE_DEVICES'] = "%d" % gpu
            else:
                args.append('--type=float')

            unrecognized_output = []
            predictions = []
            p = subprocess.Popen(args,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    cwd=self.job_dir,
                    close_fds=True,
                    env=env
                    )

            try:
                while p.poll() is None:
                    for line in utils.nonblocking_readlines(p.stdout):
                        if self.aborted.is_set():
                            p.terminate()
                            raise digits.inference.errors.InferenceError('%s classify many task got aborted. error code - %s' % (self.get_framework_id(), p.returncode))

                        if line is not None:
                            # Remove whitespace and color codes. Color codes are
                            # appended to the beginning and end of each line by
                            # the torch binary, i.e. 'th'. Check the below link
                            # for more information
                            # https://groups.google.com/forum/#!searchin/torch7/color$20codes/torch7/8O_0lSgSzuA/Ih6wYg9fgcwJ
                            line = regex.sub('', line).strip()
                        if line:
                            if not self.process_test_output(line, predictions, 'many'):
                                self.logger.warning('%s classify many task unrecognized input: %s' % (self.get_framework_id(), line.strip()))
                                unrecognized_output.append(line)
                        else:
                            time.sleep(0.05)
            except Exception as e:
                if p.poll() is None:
                    p.terminate()
                error_message = ''
                if isinstance(e, digits.inference.errors.InferenceError):
                    error_message = str(e)
                else:
                    error_message = '%s classify many task failed with error code %s \n %s' % (self.get_framework_id(), p.returncode, str(e))
                self.logger.error(error_message)
                if unrecognized_output:
                    unrecognized_output = '\n'.join(unrecognized_output)
                    error_message = error_message + unrecognized_output
                raise digits.inference.errors.InferenceError(error_message)

            if p.returncode != 0:
                error_message = '%s classify many task failed with error code %d' % (self.get_framework_id(), p.returncode)
                self.logger.error(error_message)
                if unrecognized_output:
                    unrecognized_output = '\n'.join(unrecognized_output)
                    error_message = error_message + unrecognized_output
                raise digits.inference.errors.InferenceError(error_message)
            else:
                self.logger.info('%s classify many task completed.' % self.get_framework_id())
        finally:
            shutil.rmtree(temp_dir_path)

        # return predictions in the same {'output': ...} dictionary format as infer_one_image()
        return {'output': np.array(predictions)}
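
Both inference methods poll utils.nonblocking_readlines(p.stdout) in a loop and sleep when it yields None. A minimal sketch of such a generator (not the DIGITS implementation; assumes a POSIX pipe) that yields complete lines when available, None when the pipe has no data yet, and stops at EOF:

import fcntl
import os


def nonblocking_readlines(f):
    # switch the file descriptor to non-blocking mode
    fd = f.fileno()
    flags = fcntl.fcntl(fd, fcntl.F_GETFL)
    fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK)
    buf = ''
    while True:
        try:
            chunk = os.read(fd, 1024)
        except OSError:  # EAGAIN: no data available right now
            yield None
            continue
        if not chunk:    # EOF: flush any partial last line and stop
            if buf:
                yield buf
            return
        buf += chunk
        while '\n' in buf:
            line, buf = buf.split('\n', 1)
            yield line + '\n'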