def task_arguments(self, **kwargs):
    gpu_id = kwargs.pop('gpu_id', None)

    if config_option('caffe_root') == 'SYS':
        caffe_bin = 'caffe'
    else:
        #caffe_bin = os.path.join(config_option('caffe_root'), 'bin', 'caffe.bin')
        caffe_bin = os.path.join(config_option('caffe_root'), 'build', 'tools', 'caffe.bin')

    args = [caffe_bin, 'train',
            '--solver=%s' % self.path(self.solver_file),
            ]

    # compare against None so that GPU 0 is not silently skipped
    if gpu_id is not None:
        args.append('--gpu=%d' % gpu_id)
    if self.pretrained_model:
        args.append('--weights=%s' % self.path(self.pretrained_model))

    return args
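# Hypothetical usage sketch (the task instance and paths are illustrative,
# not from the source): the returned argv can be handed to subprocess as-is.
#
#   args = task.task_arguments(gpu_id=0)
#   # e.g. ['caffe', 'train', '--solver=/jobs/20150320-101530-a3f9/solver.prototxt', '--gpu=0']
#   subprocess.Popen(args)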
def load(cls, job_id):
    """
    Loads the Job with the given job_id

    Returns the Job (errors while reading the save file propagate as exceptions)
    """
    job_dir = os.path.join(config_option('jobs_dir'), job_id)
    filename = os.path.join(job_dir, cls.SAVE_FILE)
    with open(filename, 'rb') as savefile:
        o = pickle.load(savefile)

    # Reset this on load
    o._dir = job_dir
    return o
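# Hypothetical usage (the job id is illustrative): load() is a classmethod,
# and _dir is always recomputed from the current jobs_dir on load, so saved
# jobs survive a move of the jobs directory.
#
#   job = Job.load('20150320-101530-a3f9')
#   assert job._dir == os.path.join(config_option('jobs_dir'), '20150320-101530-a3f9')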
def setup_logging():
    socketio_logger = logging.getLogger('socketio')
    socketio_logger.addHandler(logging.StreamHandler(sys.stdout))

    # Set custom logger
    logging.setLoggerClass(JobIdLogger)

    formatter = logging.Formatter(
            fmt="%(asctime)s%(job_id)s [%(levelname)-5s] %(message)s",
            datefmt=DATE_FORMAT,
            )

    ### digits logger

    main_logger = logging.getLogger('digits')
    main_logger.setLevel(logging.DEBUG)
    # Log to stdout
    stdoutHandler = logging.StreamHandler(sys.stdout)
    stdoutHandler.setFormatter(formatter)
    stdoutHandler.setLevel(logging.DEBUG)
    main_logger.addHandler(stdoutHandler)

    ### digits.webapp logger

    if config_option('log_file'):
        webapp_logger = logging.getLogger('digits.webapp')
        webapp_logger.setLevel(logging.DEBUG)
        # Log to file
        fileHandler = logging.handlers.RotatingFileHandler(
                config_option('log_file'),
                maxBytes=(1024*1024*10),  # 10 MB
                backupCount=10,
                )
        fileHandler.setFormatter(formatter)
        if config_option('log_level') == 'debug':
            fileHandler.setLevel(logging.DEBUG)
        elif config_option('log_level') == 'info':
            fileHandler.setLevel(logging.INFO)
        elif config_option('log_level') == 'warning':
            fileHandler.setLevel(logging.WARNING)
        elif config_option('log_level') == 'error':
            fileHandler.setLevel(logging.ERROR)
        elif config_option('log_level') == 'critical':
            fileHandler.setLevel(logging.CRITICAL)
        webapp_logger.addHandler(fileHandler)

        ### Useful shortcut for the webapp, which may set job_id
        return JobIdLoggerAdapter(webapp_logger, {})
    else:
        print 'WARNING: log_file config option not found - no log file is being saved'
        return JobIdLoggerAdapter(main_logger, {})
def setup_logging():
    socketio_logger = logging.getLogger('socketio')
    socketio_logger.addHandler(logging.StreamHandler(sys.stdout))

    # Set custom logger
    logging.setLoggerClass(JobIdLogger)

    formatter = logging.Formatter(
            fmt="%(asctime)s%(job_id)s [%(levelname)-5s] %(message)s",
            datefmt=DATE_FORMAT,
            )

    ### digits logger

    main_logger = logging.getLogger('digits')
    main_logger.setLevel(logging.DEBUG)
    # Log to stdout
    stdoutHandler = logging.StreamHandler(sys.stdout)
    stdoutHandler.setFormatter(formatter)
    stdoutHandler.setLevel(logging.DEBUG)
    main_logger.addHandler(stdoutHandler)

    ### digits.webapp logger

    if config_option('log_file'):
        webapp_logger = logging.getLogger('digits.webapp')
        webapp_logger.setLevel(logging.DEBUG)
        # Log to file
        fileHandler = logging.handlers.RotatingFileHandler(
                config_option('log_file'),
                maxBytes=(1024 * 1024 * 10),  # 10 MB
                backupCount=10,
                )
        fileHandler.setFormatter(formatter)
        if config_option('log_level') == 'debug':
            fileHandler.setLevel(logging.DEBUG)
        elif config_option('log_level') == 'info':
            fileHandler.setLevel(logging.INFO)
        elif config_option('log_level') == 'warning':
            fileHandler.setLevel(logging.WARNING)
        elif config_option('log_level') == 'error':
            fileHandler.setLevel(logging.ERROR)
        elif config_option('log_level') == 'critical':
            fileHandler.setLevel(logging.CRITICAL)
        webapp_logger.addHandler(fileHandler)

        ### Useful shortcut for the webapp, which may set job_id
        return JobIdLoggerAdapter(webapp_logger, {})
    else:
        return JobIdLoggerAdapter(main_logger, {})
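# Typical wiring (a sketch, not from the source): call once at startup and
# log through the returned adapter; the %(job_id)s field in the format string
# is assumed to be supplied by JobIdLogger/JobIdLoggerAdapter when a job
# context is set.
#
#   logger = setup_logging()
#   logger.info('server started')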
def __init__(self, name):
    """
    Arguments:
    name -- name of this job
    """
    super(Job, self).__init__()

    # create a unique ID
    self._id = '%s-%s' % (time.strftime('%Y%m%d-%H%M%S'), os.urandom(2).encode('hex'))
    self._dir = os.path.join(config_option('jobs_dir'), self._id)
    self._name = name
    self.pickver_job = PICKLE_VERSION
    self.tasks = []
    self.exception = None

    os.mkdir(self._dir)
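# Illustrative reconstruction of the id format generated above (Python 2,
# since str.encode('hex') was removed in Python 3):
import os
import time

job_id = '%s-%s' % (time.strftime('%Y%m%d-%H%M%S'), os.urandom(2).encode('hex'))
# e.g. '20150320-101530-a3f9' -- also used as the job directory name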
def load(cls, job_id):
    """
    Loads the Job with the given job_id

    Returns the Job or throws an exception
    """
    from digits.model.tasks import TrainTask

    job_dir = os.path.join(config_option('jobs_dir'), job_id)
    filename = os.path.join(job_dir, cls.SAVE_FILE)
    with open(filename, 'rb') as savefile:
        job = pickle.load(savefile)

    # Reset this on load
    job._dir = job_dir
    for task in job.tasks:
        task.job_dir = job_dir
        if isinstance(task, TrainTask):
            # can't call this until the job_dir is set
            task.detect_snapshots()
    return job
def path(self, filename, relative=False):
    """
    Returns a path to the given file

    Arguments:
    filename -- the requested file

    Keyword arguments:
    relative -- If False, return an absolute path to the file
                If True, return a path relative to the jobs directory
    """
    if not filename:
        return None
    if os.path.isabs(filename):
        path = filename
    else:
        path = os.path.join(self._dir, filename)
    if relative:
        path = os.path.relpath(path, config_option('jobs_dir'))
    return str(path)
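# Hypothetical usage (assumes jobs_dir is '/home/user/digits/jobs' and the
# job directory shown is illustrative):
#
#   job.path('solver.prototxt')
#   # -> '/home/user/digits/jobs/20150320-101530-a3f9/solver.prototxt'
#   job.path('solver.prototxt', relative=True)
#   # -> '20150320-101530-a3f9/solver.prototxt'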
def save_prototxt_files(self):
    """
    Save solver, train_val and deploy files to disk
    """
    has_val_set = self.dataset.val_db_task() is not None

    ### Check what has been specified in self.network

    tops = []
    bottoms = {}
    train_data_layer = None
    val_data_layer = None
    hidden_layers = caffe_pb2.NetParameter()
    loss_layers = []
    accuracy_layers = []
    for layer in self.network.layer:
        assert layer.type not in ['MemoryData', 'HDF5Data', 'ImageData'], 'unsupported data layer type'
        if layer.type == 'Data':
            for rule in layer.include:
                if rule.phase == caffe_pb2.TRAIN:
                    assert train_data_layer is None, 'cannot specify two train data layers'
                    train_data_layer = layer
                elif rule.phase == caffe_pb2.TEST:
                    assert val_data_layer is None, 'cannot specify two test data layers'
                    val_data_layer = layer
        elif layer.type == 'SoftmaxWithLoss':
            loss_layers.append(layer)
        elif layer.type == 'Accuracy':
            addThis = True
            if layer.accuracy_param.HasField('top_k'):
                if layer.accuracy_param.top_k >= len(self.get_labels()):
                    self.logger.warning('Removing layer %s because top_k=%s while there are only %s labels in this dataset' % (layer.name, layer.accuracy_param.top_k, len(self.get_labels())))
                    addThis = False
            if addThis:
                accuracy_layers.append(layer)
        else:
            hidden_layers.layer.add().CopyFrom(layer)
            if len(layer.bottom) == 1 and len(layer.top) == 1 and layer.bottom[0] == layer.top[0]:
                pass
            else:
                for top in layer.top:
                    tops.append(top)
                for bottom in layer.bottom:
                    bottoms[bottom] = True

    if train_data_layer is None:
        assert val_data_layer is None, 'cannot specify a test data layer without a train data layer'

    assert len(loss_layers) > 0, 'must specify a loss layer'

    network_outputs = []
    for name in tops:
        if name not in bottoms:
            network_outputs.append(name)
    assert len(network_outputs), 'network must have an output'

    # Update num_output for any output InnerProduct layers automatically
    for layer in hidden_layers.layer:
        if layer.type == 'InnerProduct':
            for top in layer.top:
                if top in network_outputs:
                    layer.inner_product_param.num_output = len(self.get_labels())
                    break

    ### Write train_val file

    train_val_network = caffe_pb2.NetParameter()

    # data layers
    if train_data_layer is not None:
        if train_data_layer.HasField('data_param'):
            assert not train_data_layer.data_param.HasField('source'), "don't set the data_param.source"
            assert not train_data_layer.data_param.HasField('backend'), "don't set the data_param.backend"
        max_crop_size = min(self.dataset.image_dims[0], self.dataset.image_dims[1])
        if self.crop_size:
            assert self.crop_size <= max_crop_size, 'crop_size is larger than the image size'
            train_data_layer.transform_param.crop_size = self.crop_size
        elif train_data_layer.transform_param.HasField('crop_size'):
            cs = train_data_layer.transform_param.crop_size
            if cs > max_crop_size:
                # don't throw an error here
                cs = max_crop_size
            train_data_layer.transform_param.crop_size = cs
            self.crop_size = cs
        train_val_network.layer.add().CopyFrom(train_data_layer)
        train_data_layer = train_val_network.layer[-1]
        if val_data_layer is not None and has_val_set:
            if val_data_layer.HasField('data_param'):
                assert not val_data_layer.data_param.HasField('source'), "don't set the data_param.source"
                assert not val_data_layer.data_param.HasField('backend'), "don't set the data_param.backend"
            if self.crop_size:
                # use our error checking from the train layer
                val_data_layer.transform_param.crop_size = self.crop_size
            train_val_network.layer.add().CopyFrom(val_data_layer)
            val_data_layer = train_val_network.layer[-1]
    else:
        train_data_layer = train_val_network.layer.add(type='Data', name='data')
        train_data_layer.top.append('data')
        train_data_layer.top.append('label')
        train_data_layer.include.add(phase=caffe_pb2.TRAIN)
        train_data_layer.data_param.batch_size = constants.DEFAULT_BATCH_SIZE
        if self.crop_size:
            train_data_layer.transform_param.crop_size = self.crop_size
        if has_val_set:
            val_data_layer = train_val_network.layer.add(type='Data', name='data')
            val_data_layer.top.append('data')
            val_data_layer.top.append('label')
            val_data_layer.include.add(phase=caffe_pb2.TEST)
            val_data_layer.data_param.batch_size = constants.DEFAULT_BATCH_SIZE
            if self.crop_size:
                val_data_layer.transform_param.crop_size = self.crop_size
    train_data_layer.data_param.source = self.dataset.path(self.dataset.train_db_task().db_name)
    train_data_layer.data_param.backend = caffe_pb2.DataParameter.LMDB
    if val_data_layer is not None:
        val_data_layer.data_param.source = self.dataset.path(self.dataset.val_db_task().db_name)
        val_data_layer.data_param.backend = caffe_pb2.DataParameter.LMDB
    if self.use_mean:
        train_data_layer.transform_param.mean_file = self.dataset.path(self.dataset.train_db_task().mean_file)
        if val_data_layer is not None:
            val_data_layer.transform_param.mean_file = self.dataset.path(self.dataset.train_db_task().mean_file)
    if self.batch_size:
        train_data_layer.data_param.batch_size = self.batch_size
        if val_data_layer is not None:
            val_data_layer.data_param.batch_size = self.batch_size
    else:
        if not train_data_layer.data_param.HasField('batch_size'):
            train_data_layer.data_param.batch_size = constants.DEFAULT_BATCH_SIZE
        if val_data_layer is not None and not val_data_layer.data_param.HasField('batch_size'):
            val_data_layer.data_param.batch_size = constants.DEFAULT_BATCH_SIZE

    # hidden layers
    train_val_network.MergeFrom(hidden_layers)

    # output layers
    train_val_network.layer.extend(loss_layers)
    train_val_network.layer.extend(accuracy_layers)

    with open(self.path(self.train_val_file), 'w') as outfile:
        text_format.PrintMessage(train_val_network, outfile)

    ### Write deploy file

    deploy_network = caffe_pb2.NetParameter()

    # input
    deploy_network.input.append('data')
    deploy_network.input_dim.append(1)
    deploy_network.input_dim.append(self.dataset.image_dims[2])
    if self.crop_size:
        deploy_network.input_dim.append(self.crop_size)
        deploy_network.input_dim.append(self.crop_size)
    else:
        deploy_network.input_dim.append(self.dataset.image_dims[0])
        deploy_network.input_dim.append(self.dataset.image_dims[1])

    # hidden layers
    deploy_network.MergeFrom(hidden_layers)

    # output layers
    if loss_layers[-1].type == 'SoftmaxWithLoss':
        prob_layer = deploy_network.layer.add(
                type='Softmax',
                name='prob')
        prob_layer.bottom.append(network_outputs[-1])
        prob_layer.top.append('prob')

    with open(self.path(self.deploy_file), 'w') as outfile:
        text_format.PrintMessage(deploy_network, outfile)

    ### Write solver file

    solver = caffe_pb2.SolverParameter()
    solver.net = self.train_val_file
    # TODO: detect if caffe is built with CPU_ONLY
    gpu_list = config_option('gpu_list')
    if gpu_list and gpu_list != 'NONE':
        solver.solver_mode = caffe_pb2.SolverParameter.GPU
    else:
        solver.solver_mode = caffe_pb2.SolverParameter.CPU
    solver.snapshot_prefix = self.snapshot_prefix

    # Epochs -> Iterations
    train_iter = int(math.ceil(float(self.dataset.train_db_task().entries_count) / train_data_layer.data_param.batch_size))
    solver.max_iter = train_iter * self.train_epochs
    snapshot_interval = self.snapshot_interval * train_iter
    if 0 < snapshot_interval <= 1:
        solver.snapshot = 1  # don't round down
    elif 1 < snapshot_interval < solver.max_iter:
        solver.snapshot = int(snapshot_interval)
    else:
        solver.snapshot = 0  # only take one snapshot at the end

    if self.dataset.val_db_task() and self.val_interval:
        solver.test_iter.append(int(math.ceil(float(self.dataset.val_db_task().entries_count) / val_data_layer.data_param.batch_size)))
        val_interval = self.val_interval * train_iter
        if 0 < val_interval <= 1:
            solver.test_interval = 1  # don't round down
        elif 1 < val_interval < solver.max_iter:
            solver.test_interval = int(val_interval)
        else:
            solver.test_interval = solver.max_iter  # only test once at the end

    # Learning rate
    solver.base_lr = self.learning_rate
    solver.lr_policy = self.lr_policy['policy']
    scale = float(solver.max_iter) / 100.0
    if solver.lr_policy == 'fixed':
        pass
    elif solver.lr_policy == 'step':
        # stepsize = stepsize * scale
        solver.stepsize = int(math.ceil(float(self.lr_policy['stepsize']) * scale))
        solver.gamma = self.lr_policy['gamma']
    elif solver.lr_policy == 'multistep':
        for value in self.lr_policy['stepvalue']:
            # stepvalue = stepvalue * scale
            solver.stepvalue.append(int(math.ceil(float(value) * scale)))
        solver.gamma = self.lr_policy['gamma']
    elif solver.lr_policy == 'exp':
        # gamma = gamma^(1/scale)
        solver.gamma = math.pow(self.lr_policy['gamma'], 1.0 / scale)
    elif solver.lr_policy == 'inv':
        # gamma = gamma / scale
        solver.gamma = self.lr_policy['gamma'] / scale
        solver.power = self.lr_policy['power']
    elif solver.lr_policy == 'poly':
        solver.power = self.lr_policy['power']
    elif solver.lr_policy == 'sigmoid':
        # gamma = -gamma / scale
        solver.gamma = -1.0 * self.lr_policy['gamma'] / scale
        # stepsize = stepsize * scale
        solver.stepsize = int(math.ceil(float(self.lr_policy['stepsize']) * scale))
    else:
        raise Exception('Unknown lr_policy: "%s"' % solver.lr_policy)

    # go with the suggested defaults
    solver.momentum = 0.9
    solver.weight_decay = 0.0005

    # Display 8x per epoch, or once per 5000 images, whichever is more frequent
    solver.display = max(1, min(
            int(math.floor(float(solver.max_iter) / (self.train_epochs * 8))),
            int(math.ceil(5000.0 / train_data_layer.data_param.batch_size))
            ))

    if self.random_seed is not None:
        solver.random_seed = self.random_seed

    with open(self.path(self.solver_file), 'w') as outfile:
        text_format.PrintMessage(solver, outfile)
    self.solver = solver  # save for later

    return True
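# Worked example of the Epochs -> Iterations conversion above, as a
# self-contained sketch with assumed numbers (not values from the source):
import math

entries_count = 50000    # assumed training-set size
batch_size = 16          # assumed batch size
train_epochs = 30        # assumed number of epochs
snapshot_interval = 1.0  # assumed: snapshot once per epoch

train_iter = int(math.ceil(float(entries_count) / batch_size))  # 3125 iterations per epoch
max_iter = train_iter * train_epochs                            # 93750 iterations total
snapshot = int(snapshot_interval * train_iter)                  # snapshot every 3125 iterations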
def save_prototxt_files(self):
    """
    Save solver, train_val and deploy files to disk
    """
    has_val_set = self.dataset.val_db_task() is not None

    ### Check what has been specified in self.network

    tops = []
    bottoms = {}
    train_data_layer = None
    val_data_layer = None
    hidden_layers = caffe_pb2.NetParameter()
    loss_layer = None
    accuracy_layer = None
    for layer in self.network.layer:
        assert layer.type not in ['MemoryData', 'HDF5Data', 'ImageData'], 'unsupported data layer type'
        if layer.type == 'Data':
            for rule in layer.include:
                if rule.phase == caffe_pb2.TRAIN:
                    assert train_data_layer is None, 'cannot specify two train data layers'
                    train_data_layer = layer
                elif rule.phase == caffe_pb2.TEST:
                    assert val_data_layer is None, 'cannot specify two test data layers'
                    val_data_layer = layer
        elif layer.type == 'SoftmaxWithLoss':
            assert loss_layer is None, 'cannot specify two loss layers'
            loss_layer = layer
        elif layer.type == 'Accuracy':
            assert accuracy_layer is None, 'cannot specify two accuracy layers'
            accuracy_layer = layer
        else:
            hidden_layers.layer.add().CopyFrom(layer)
            if len(layer.bottom) == 1 and len(layer.top) == 1 and layer.bottom[0] == layer.top[0]:
                pass
            else:
                for top in layer.top:
                    tops.append(top)
                for bottom in layer.bottom:
                    bottoms[bottom] = True

    assert loss_layer is not None, 'must specify a SoftmaxWithLoss layer'
    assert accuracy_layer is not None, 'must specify an Accuracy layer'

    if not has_val_set:
        self.logger.warning('Discarding Data layer for validation')
        val_data_layer = None

    output_name = None
    for name in tops:
        if name not in bottoms:
            assert output_name is None, 'network cannot have more than one output'
            output_name = name
    assert output_name is not None, 'network must have one output'

    for layer in hidden_layers.layer:
        if output_name in layer.top and layer.type == 'InnerProduct':
            layer.inner_product_param.num_output = len(self.labels)
            break

    if train_data_layer is None:
        assert val_data_layer is None, 'cannot specify a test data layer without a train data layer'

    ### Write train_val file

    train_val_network = caffe_pb2.NetParameter()
    default_batch_size = 16  # XXX Reasonable default?

    # data layers
    if train_data_layer is not None:
        if train_data_layer.HasField('data_param'):
            assert not train_data_layer.data_param.HasField('source'), "don't set the data_param.source"
            assert not train_data_layer.data_param.HasField('backend'), "don't set the data_param.backend"
        max_crop_size = min(self.dataset.image_dims[0], self.dataset.image_dims[1])
        if self.crop_size:
            assert self.crop_size <= max_crop_size, 'crop_size is larger than the image size'
            train_data_layer.transform_param.crop_size = self.crop_size
        elif train_data_layer.transform_param.HasField('crop_size'):
            cs = train_data_layer.transform_param.crop_size
            if cs > max_crop_size:
                # don't throw an error here
                cs = max_crop_size
            train_data_layer.transform_param.crop_size = cs
            self.crop_size = cs
        train_val_network.layer.add().CopyFrom(train_data_layer)
        train_data_layer = train_val_network.layer[-1]
        if val_data_layer is not None and has_val_set:
            if val_data_layer.HasField('data_param'):
                assert not val_data_layer.data_param.HasField('source'), "don't set the data_param.source"
                assert not val_data_layer.data_param.HasField('backend'), "don't set the data_param.backend"
            if self.crop_size:
                # use our error checking from the train layer
                val_data_layer.transform_param.crop_size = self.crop_size
            train_val_network.layer.add().CopyFrom(val_data_layer)
            val_data_layer = train_val_network.layer[-1]
    else:
        train_data_layer = train_val_network.layer.add(type='Data', name='data')
        train_data_layer.top.append('data')
        train_data_layer.top.append('label')
        train_data_layer.include.add(phase=caffe_pb2.TRAIN)
        train_data_layer.data_param.batch_size = default_batch_size
        if self.crop_size:
            train_data_layer.transform_param.crop_size = self.crop_size
        if has_val_set:
            val_data_layer = train_val_network.layer.add(type='Data', name='data')
            val_data_layer.top.append('data')
            val_data_layer.top.append('label')
            val_data_layer.include.add(phase=caffe_pb2.TEST)
            val_data_layer.data_param.batch_size = default_batch_size
            if self.crop_size:
                val_data_layer.transform_param.crop_size = self.crop_size
    train_data_layer.data_param.source = self.dataset.path(self.dataset.train_db_task().db_name)
    train_data_layer.data_param.backend = caffe_pb2.DataParameter.LMDB
    if val_data_layer is not None:
        val_data_layer.data_param.source = self.dataset.path(self.dataset.val_db_task().db_name)
        val_data_layer.data_param.backend = caffe_pb2.DataParameter.LMDB
    if self.use_mean:
        train_data_layer.transform_param.mean_file = self.dataset.path(self.dataset.train_db_task().mean_file)
        if val_data_layer is not None:
            val_data_layer.transform_param.mean_file = self.dataset.path(self.dataset.train_db_task().mean_file)
    if self.batch_size:
        train_data_layer.data_param.batch_size = self.batch_size
        if val_data_layer is not None:
            val_data_layer.data_param.batch_size = self.batch_size
    else:
        if not train_data_layer.data_param.HasField('batch_size'):
            train_data_layer.data_param.batch_size = default_batch_size
        if val_data_layer is not None and not val_data_layer.data_param.HasField('batch_size'):
            val_data_layer.data_param.batch_size = default_batch_size

    # hidden layers
    train_val_network.MergeFrom(hidden_layers)

    # output layers
    if loss_layer is not None:
        train_val_network.layer.add().CopyFrom(loss_layer)
        loss_layer = train_val_network.layer[-1]
    else:
        loss_layer = train_val_network.layer.add(
                type='SoftmaxWithLoss',
                name='loss')
        loss_layer.bottom.append(output_name)
        loss_layer.bottom.append('label')
        loss_layer.top.append('loss')

    if accuracy_layer is not None:
        train_val_network.layer.add().CopyFrom(accuracy_layer)
        accuracy_layer = train_val_network.layer[-1]
    elif self.dataset.val_db_task():
        accuracy_layer = train_val_network.layer.add(
                type='Accuracy',
                name='accuracy')
        accuracy_layer.bottom.append(output_name)
        accuracy_layer.bottom.append('label')
        accuracy_layer.top.append('accuracy')
        accuracy_layer.include.add(phase=caffe_pb2.TEST)

    with open(self.path(self.train_val_file), 'w') as outfile:
        text_format.PrintMessage(train_val_network, outfile)

    ### Write deploy file

    deploy_network = caffe_pb2.NetParameter()

    # input
    deploy_network.input.append('data')
    deploy_network.input_dim.append(1)
    deploy_network.input_dim.append(self.dataset.image_dims[2])
    if self.crop_size:
        deploy_network.input_dim.append(self.crop_size)
        deploy_network.input_dim.append(self.crop_size)
    else:
        deploy_network.input_dim.append(self.dataset.image_dims[0])
        deploy_network.input_dim.append(self.dataset.image_dims[1])

    # hidden layers
    deploy_network.MergeFrom(hidden_layers)

    # output layers
    prob_layer = deploy_network.layer.add(
            type='Softmax',
            name='prob')
    prob_layer.bottom.append(output_name)
    prob_layer.top.append('prob')

    with open(self.path(self.deploy_file), 'w') as outfile:
        text_format.PrintMessage(deploy_network, outfile)

    ### Write solver file

    solver = caffe_pb2.SolverParameter()
    solver.net = self.train_val_file
    if config_option('gpu_list'):
        solver.solver_mode = caffe_pb2.SolverParameter.GPU
    else:
        solver.solver_mode = caffe_pb2.SolverParameter.CPU
    solver.snapshot_prefix = self.snapshot_prefix

    # Epochs -> Iterations
    train_iter = int(math.ceil(float(self.dataset.train_db_task().entries_count) / train_data_layer.data_param.batch_size))
    solver.max_iter = train_iter * self.train_epochs
    solver.snapshot = train_iter
    if self.dataset.val_db_task() and self.val_interval:
        solver.test_iter.append(int(math.ceil(float(self.dataset.val_db_task().entries_count) / val_data_layer.data_param.batch_size)))
        solver.test_interval = train_iter * self.val_interval

    # Learning rate
    solver.base_lr = self.learning_rate
    solver.lr_policy = self.lr_policy['policy']
    scale = float(solver.max_iter) / 100.0
    if solver.lr_policy == 'fixed':
        pass
    elif solver.lr_policy == 'step':
        # stepsize = stepsize * scale
        solver.stepsize = int(math.ceil(float(self.lr_policy['stepsize']) * scale))
        solver.gamma = self.lr_policy['gamma']
    elif solver.lr_policy == 'multistep':
        for value in self.lr_policy['stepvalue']:
            # stepvalue = stepvalue * scale
            solver.stepvalue.append(int(math.ceil(float(value) * scale)))
        solver.gamma = self.lr_policy['gamma']
    elif solver.lr_policy == 'exp':
        # gamma = gamma^(1/scale)
        solver.gamma = math.pow(self.lr_policy['gamma'], 1.0 / scale)
    elif solver.lr_policy == 'inv':
        # gamma = gamma / scale
        solver.gamma = self.lr_policy['gamma'] / scale
        solver.power = self.lr_policy['power']
    elif solver.lr_policy == 'poly':
        solver.power = self.lr_policy['power']
    elif solver.lr_policy == 'sigmoid':
        # gamma = -gamma / scale
        solver.gamma = -1.0 * self.lr_policy['gamma'] / scale
        # stepsize = stepsize * scale
        solver.stepsize = int(math.ceil(float(self.lr_policy['stepsize']) * scale))
    else:
        raise Exception('Unknown lr_policy: "%s"' % solver.lr_policy)

    # go with the suggested defaults
    solver.momentum = 0.9
    solver.weight_decay = 0.0005

    # Display 8x per epoch, or once per 5000 images, whichever is more frequent
    # (guard with max(1, ...) so a tiny max_iter cannot yield display == 0,
    # which would disable progress output entirely)
    solver.display = max(1, min(
            int(math.floor(float(solver.max_iter) / (self.train_epochs * 8))),
            int(math.ceil(5000.0 / train_data_layer.data_param.batch_size))
            ))

    with open(self.path(self.solver_file), 'w') as outfile:
        text_format.PrintMessage(solver, outfile)
    self.solver = solver  # save for later

    return True
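# Worked example of the lr_policy rescaling above: stepsize/stepvalue are
# user-supplied as a fraction of training out of 100 (i.e. a percentage), and
# scale = max_iter/100 converts them to iterations. Assumed numbers only:
import math

max_iter = 93750
scale = float(max_iter) / 100.0  # 937.5 iterations per percent of training

stepsize_pct = 33                                      # drop the LR every 33% of training
stepsize_iters = int(math.ceil(stepsize_pct * scale))  # 30938 iterations
gamma_exp = math.pow(0.95, 1.0 / scale)                # per-iteration decay for the 'exp' policy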