def server_utilization_updater(self):
    """
    Broadcast GPU utilization for every GPU resource over SocketIO

    One device is looked up for each entry in self.resources['gpus'],
    using the positional index (0..N-1) as the device index. The function
    then loops forever, emitting a 'server update' message once per second
    to the 'job_management' room of the '/jobs' namespace.

    Raises:
    RuntimeError -- if device_query.get_device() returns nothing for an index
    """
    from digits.webapp import scheduler, socketio
    from digits import device_query

    # Resolve all devices up front so a missing GPU fails before the loop
    monitored = []
    for gpu_index in range(len(self.resources['gpus'])):
        gpu = device_query.get_device(gpu_index)
        if not gpu:
            raise RuntimeError(
                'Failed to load gpu information for GPU #"%s"' % gpu_index)
        monitored.append((gpu_index, gpu))

    while True:
        data_gpu = []
        for gpu_index, gpu in monitored:
            entry = {'name': gpu.name, 'index': gpu_index}
            nvml_info = device_query.get_nvml_info(gpu_index)
            # NVML data is optional; name/index are always reported
            if nvml_info is not None:
                entry.update(nvml_info)
            data_gpu.append(entry)

        message = {
            'update': 'gpus_utilization',
            'data_gpu': data_gpu,
        }
        socketio.emit('server update',
                      message,
                      namespace='/jobs',
                      room='job_management')
        gevent.sleep(1)
def hw_socketio_updater(self, gpus):
    """
    This thread sends SocketIO messages about hardware utilization
    to connected clients

    Arguments:
    gpus -- a list of identifiers for the GPUs currently being used
            (None is accepted and means no GPU is monitored)

    Raises:
    RuntimeError -- if device_query.get_device() returns nothing for one
                    of the given identifiers
    """
    from digits.webapp import app, socketio

    # Resolve every requested device once, before entering the loop
    monitored = []
    if gpus is not None:
        for gpu_id in gpus:
            gpu = device_query.get_device(gpu_id)
            if not gpu:
                raise RuntimeError('Failed to load gpu information for GPU #"%s"' % gpu_id)
            monitored.append((gpu_id, gpu))

    # this thread continues until killed in after_run()
    while True:
        # CPU (Non-GPU) Info
        data_cpu = {}
        if hasattr(self, "p") and self.p is not None:
            # 'self.p' is the system call object
            pid = self.p.pid
            data_cpu["pid"] = pid
            try:
                ps = psutil.Process(pid)
                if ps.is_running():
                    # psutil 2.x renamed the get_* accessors; support both
                    if psutil.version_info[0] < 2:
                        data_cpu["cpu_pct"] = ps.get_cpu_percent(interval=1)
                        data_cpu["mem_pct"] = ps.get_memory_percent()
                        data_cpu["mem_used"] = ps.get_memory_info().rss
                    else:
                        data_cpu["cpu_pct"] = ps.cpu_percent(interval=1)
                        data_cpu["mem_pct"] = ps.memory_percent()
                        data_cpu["mem_used"] = ps.memory_info().rss
            except psutil.NoSuchProcess:
                # In rare case of instant process crash or PID went zombie
                # (report whatever was gathered so far)
                pass

        # GPU info: name/index always, NVML details when available
        data_gpu = []
        for gpu_id, gpu in monitored:
            entry = {"name": gpu.name, "index": gpu_id}
            nvml_info = device_query.get_nvml_info(gpu_id)
            if nvml_info is not None:
                entry.update(nvml_info)
            data_gpu.append(entry)

        with app.app_context():
            html = flask.render_template(
                "models/gpu_utilization.html",
                data_gpu=data_gpu,
                data_cpu=data_cpu,
            )
            payload = {
                "task": self.html_id(),
                "update": "gpu_utilization",
                "html": html,
            }
            socketio.emit(
                "task update",
                payload,
                namespace="/jobs",
                room=self.job_id,
            )
        gevent.sleep(1)
def gpu_socketio_updater(self, gpus):
    """
    This thread sends SocketIO messages about GPU utilization
    to connected clients

    Arguments:
    gpus -- a list of identifiers for the GPUs currently being used

    Raises:
    RuntimeError -- if no device at all could be loaded for the identifiers
    """
    from digits.webapp import app, socketio

    # Identifiers whose device cannot be loaded are skipped silently;
    # only a completely empty result is treated as an error.
    monitored = []
    for gpu_id in gpus:
        gpu = device_query.get_device(gpu_id)
        if not gpu:
            continue
        monitored.append((gpu_id, gpu))
    if not monitored:
        raise RuntimeError('Failed to load gpu information for "%s"' % gpus)

    # this thread continues until killed in after_run()
    while True:
        data = []
        for gpu_id, gpu in monitored:
            entry = {'name': gpu.name, 'index': gpu_id}
            nvml_info = device_query.get_nvml_info(gpu_id)
            if nvml_info is not None:
                entry.update(nvml_info)
            data.append(entry)

        with app.app_context():
            html = flask.render_template('models/gpu_utilization.html',
                                         data=data)
            payload = {
                'task': self.html_id(),
                'update': 'gpu_utilization',
                'html': html,
            }
            socketio.emit('task update',
                          payload,
                          namespace='/jobs',
                          room=self.job_id)
        gevent.sleep(1)
def gpu_socketio_updater(self, gpus):
    """
    This thread sends SocketIO messages about GPU utilization
    to connected clients

    Arguments:
    gpus -- a list of identifiers for the GPUs currently being used

    Raises:
    RuntimeError -- if none of the identifiers resolves to a device
    """
    from digits.webapp import app, socketio

    # Keep only the identifiers for which a device could be loaded
    resolved = []
    for identifier in gpus:
        found = device_query.get_device(identifier)
        if found:
            resolved.append((identifier, found))

    if len(resolved) == 0:
        raise RuntimeError('Failed to load gpu information for "%s"' % gpus)

    # this thread continues until killed in after_run()
    while True:
        data = []
        for identifier, found in resolved:
            nvml_info = device_query.get_nvml_info(identifier)
            record = {'name': found.name, 'index': identifier}
            # NVML details are merged in only when they are available
            if nvml_info is not None:
                record.update(nvml_info)
            data.append(record)

        with app.app_context():
            html = flask.render_template('models/gpu_utilization.html', data=data)
            message = {
                'task': self.html_id(),
                'update': 'gpu_utilization',
                'html': html,
            }
            socketio.emit('task update', message, namespace='/jobs', room=self.job_id)
        gevent.sleep(1)
def get_gpu_info(self):
    """
    Return a one-shot snapshot of the GPUs listed in self.current_resources

    Returns:
    None if self.current_resources has no 'gpus' entry, otherwise a list
    with one dict per GPU holding 'name', 'index' and, when
    device_query.get_nvml_info() returns data, the NVML fields as well.

    Raises:
    RuntimeError -- if device_query.get_device() returns nothing for one
                    of the identifiers
    """
    if 'gpus' not in self.current_resources:
        return None

    # current_resources['gpus'] holds (identifier, value) pairs;
    # only the identifier is needed here
    gpu_ids = [pair[0] for pair in self.current_resources['gpus']]

    # First resolve every device, so that a missing GPU is reported
    # before any NVML query is made
    resolved = []
    for gpu_id in gpu_ids:
        gpu = device_query.get_device(gpu_id)
        if not gpu:
            raise RuntimeError('Failed to load gpu information for GPU #"%s"' % gpu_id)
        resolved.append((gpu_id, gpu))

    snapshot = []
    for gpu_id, gpu in resolved:
        entry = {'name': gpu.name, 'index': gpu_id}
        nvml_info = device_query.get_nvml_info(gpu_id)
        if nvml_info is not None:
            entry.update(nvml_info)
        snapshot.append(entry)
    return snapshot
def hw_socketio_updater(self, gpus):
    """
    This thread sends SocketIO messages about hardware utilization
    to connected clients

    Arguments:
    gpus -- a list of identifiers for the GPUs currently being used
            (may be None, in which case only process info is reported)

    Raises:
    RuntimeError -- if device_query.get_device() returns nothing for one
                    of the given identifiers
    """
    from digits.webapp import app, socketio

    # Look up each device once; a missing device aborts before the loop
    tracked = []
    if gpus is not None:
        for identifier in gpus:
            found = device_query.get_device(identifier)
            if not found:
                raise RuntimeError(
                    'Failed to load gpu information for GPU #"%s"' % identifier)
            tracked.append((identifier, found))

    # this thread continues until killed in after_run()
    while True:
        # CPU (Non-GPU) Info
        data_cpu = {}
        if hasattr(self, 'p') and self.p is not None:
            # 'self.p' is the system call object
            pid = self.p.pid
            data_cpu['pid'] = pid
            try:
                ps = psutil.Process(pid)
                if ps.is_running():
                    # Old psutil (< 2) exposes get_*-prefixed accessors
                    if psutil.version_info[0] < 2:
                        data_cpu['cpu_pct'] = ps.get_cpu_percent(interval=1)
                        data_cpu['mem_pct'] = ps.get_memory_percent()
                        data_cpu['mem_used'] = ps.get_memory_info().rss
                    else:
                        data_cpu['cpu_pct'] = ps.cpu_percent(interval=1)
                        data_cpu['mem_pct'] = ps.memory_percent()
                        data_cpu['mem_used'] = ps.memory_info().rss
            except psutil.NoSuchProcess:
                # In rare case of instant process crash or PID went zombie
                # (keep whatever was collected before the failure)
                pass

        # GPU info: one record per tracked device
        data_gpu = []
        for identifier, found in tracked:
            record = {'name': found.name, 'index': identifier}
            nvml_info = device_query.get_nvml_info(identifier)
            if nvml_info is not None:
                record.update(nvml_info)
            data_gpu.append(record)

        with app.app_context():
            html = flask.render_template('models/gpu_utilization.html',
                                         data_gpu=data_gpu,
                                         data_cpu=data_cpu)
            message = {
                'task': self.html_id(),
                'update': 'gpu_utilization',
                'html': html,
            }
            socketio.emit('task update',
                          message,
                          namespace='/jobs',
                          room=self.job_id)
        gevent.sleep(1)
class ModelForm(Form):
    """
    Form describing the settings used to create a model

    The validator functions are defined before the fields because several
    fields reference them by name in their ``validators`` lists. Functions
    named ``validate_<fieldname>`` are not referenced explicitly.
    """

    ### Methods

    def selection_exists_in_choices(form, field):
        """Reject the field when its value is not among field.choices."""
        if not any(choice[0] == field.data for choice in field.choices):
            raise validators.ValidationError(
                "Selected job doesn't exist. Maybe it was deleted by another user."
            )

    def validate_NetParameter(form, field):
        """Have the selected framework validate the network description."""
        fw = frameworks.get_framework_by_id(form['framework'].data)
        try:
            # below function raises a BadNetworkException in case of validation error
            fw.validate_network(field.data)
        except frameworks.errors.BadNetworkError as e:
            raise validators.ValidationError('Bad network: %s' % e.message)

    def validate_file_exists(form, field):
        """
        Check that a server-side file name refers to an existing file

        Only applies to a StringField when the "client-side file" box is
        unchecked; an empty file name is accepted.
        """
        from_client = bool(form.python_layer_from_client.data)

        filename = ''
        if not from_client and field.type == 'StringField':
            filename = field.data

        if filename == '':
            return

        if not os.path.isfile(filename):
            raise validators.ValidationError(
                'Server side file, %s, does not exist.' % filename)

    def validate_py_ext(form, field):
        """
        Check that the Python layer file has a .py or .pyc extension

        The file name is taken from the uploaded file for a FileField when
        the "client-side file" box is checked, and from the field data for
        a StringField when it is not. An empty file name is accepted.
        """
        from_client = bool(form.python_layer_from_client.data)

        filename = ''
        if from_client and field.type == 'FileField':
            filename = flask.request.files[field.name].filename
        elif not from_client and field.type == 'StringField':
            filename = field.data

        if filename == '':
            return

        ext = os.path.splitext(filename)[1]
        if ext not in ('.py', '.pyc'):
            raise validators.ValidationError(
                'Python file, %s, needs .py or .pyc extension.' % filename)

    ### Fields

    # The options for this get set in the view (since they are dynamic)
    dataset = utils.forms.SelectField(
        'Select Dataset',
        choices=[],
        tooltip="Choose the dataset to use for this model.")

    python_layer_from_client = utils.forms.BooleanField(
        u'Use client-side file',
        default=False)

    python_layer_client_file = utils.forms.FileField(
        u'Client-side file',
        validators=[validate_py_ext],
        tooltip="Choose a Python file on the client containing layer definitions.")

    python_layer_server_file = utils.forms.StringField(
        u'Server-side file',
        validators=[validate_file_exists, validate_py_ext],
        tooltip="Choose a Python file on the server containing layer definitions.")

    train_epochs = utils.forms.IntegerField(
        'Training epochs',
        validators=[validators.NumberRange(min=1)],
        default=30,
        tooltip="How many passes through the training data?")

    snapshot_interval = utils.forms.FloatField(
        'Snapshot interval (in epochs)',
        default=1,
        validators=[
            validators.NumberRange(min=0),
        ],
        tooltip="How many epochs of training between taking a snapshot?")

    val_interval = utils.forms.FloatField(
        'Validation interval (in epochs)',
        default=1,
        validators=[validators.NumberRange(min=0)],
        tooltip="How many epochs of training between running through one pass of the validation data?"
    )

    random_seed = utils.forms.IntegerField(
        'Random seed',
        validators=[
            validators.NumberRange(min=0),
            validators.Optional(),
        ],
        tooltip="If you provide a random seed, then back-to-back runs with the same model and dataset should give identical results."
    )

    batch_size = utils.forms.MultiIntegerField(
        'Batch size',
        validators=[
            utils.forms.MultiNumberRange(min=1),
            utils.forms.MultiOptional(),
        ],
        tooltip="How many images to process at once. If blank, values are used from the network definition."
    )

    batch_accumulation = utils.forms.IntegerField(
        'Batch Accumulation',
        validators=[
            validators.NumberRange(min=1),
            validators.Optional(),
        ],
        tooltip="Accumulate gradients over multiple batches (useful when you need a bigger batch size for training but it doesn't fit in memory)."
    )

    ### Solver types

    solver_type = utils.forms.SelectField(
        'Solver type',
        choices=[
            ('SGD', 'Stochastic gradient descent (SGD)'),
            ('NESTEROV', "Nesterov's accelerated gradient (NAG)"),
            ('ADAGRAD', 'Adaptive gradient (AdaGrad)'),
            ('RMSPROP', 'RMSprop'),
            ('ADADELTA', 'AdaDelta'),
            ('ADAM', 'Adam'),
        ],
        default='SGD',
        tooltip="What type of solver will be used?",
    )

    def validate_solver_type(form, field):
        """Reject solver types that the selected framework does not support."""
        # Look the framework up by the submitted ID (the field's data), the
        # same way validate_NetParameter does. Passing the field object
        # itself would hand the lookup something that is not an ID.
        fw = frameworks.get_framework_by_id(form.framework.data)
        if fw is not None:
            if not fw.supports_solver_type(field.data):
                raise validators.ValidationError(
                    'Solver type not supported by this framework')

    ### Learning rate

    learning_rate = utils.forms.MultiFloatField(
        'Base Learning Rate',
        default=0.01,
        validators=[
            utils.forms.MultiNumberRange(min=0),
        ],
        tooltip="Affects how quickly the network learns. If you are getting NaN for your loss, you probably need to lower this value."
    )

    lr_policy = wtforms.SelectField(
        'Policy',
        choices=[
            ('fixed', 'Fixed'),
            ('step', 'Step Down'),
            ('multistep', 'Step Down (arbitrary steps)'),
            ('exp', 'Exponential Decay'),
            ('inv', 'Inverse Decay'),
            ('poly', 'Polynomial Decay'),
            ('sigmoid', 'Sigmoid Decay'),
        ],
        default='step')

    lr_step_size = wtforms.FloatField('Step Size', default=33)
    lr_step_gamma = wtforms.FloatField('Gamma', default=0.1)
    lr_multistep_values = wtforms.StringField('Step Values', default="50,85")

    def validate_lr_multistep_values(form, field):
        """For the 'multistep' policy, require comma-separated numbers."""
        if form.lr_policy.data == 'multistep':
            for value in field.data.split(','):
                try:
                    float(value)
                except ValueError:
                    raise validators.ValidationError('invalid value')

    lr_multistep_gamma = wtforms.FloatField('Gamma', default=0.5)
    lr_exp_gamma = wtforms.FloatField('Gamma', default=0.95)
    lr_inv_gamma = wtforms.FloatField('Gamma', default=0.1)
    lr_inv_power = wtforms.FloatField('Power', default=0.5)
    lr_poly_power = wtforms.FloatField('Power', default=3)
    lr_sigmoid_step = wtforms.FloatField('Step', default=50)
    lr_sigmoid_gamma = wtforms.FloatField('Gamma', default=0.1)

    ### Network

    # Use a SelectField instead of a HiddenField so that the default value
    # is used when nothing is provided (through the REST API)
    method = wtforms.SelectField(
        u'Network type',
        choices=[
            ('standard', 'Standard network'),
            ('previous', 'Previous network'),
            ('pretrained', 'Pretrained network'),
            ('custom', 'Custom network'),
        ],
        default='standard',
    )

    ## framework - hidden field, set by Javascript to the selected framework ID
    framework = wtforms.HiddenField(
        'framework',
        validators=[
            validators.AnyOf(
                [fw.get_id() for fw in frameworks.get_frameworks()],
                message='The framework you choose is not currently supported.')
        ],
        default=frameworks.get_frameworks()[0].get_id())

    # The options for this get set in the view (since they are dependent on the data type)
    standard_networks = wtforms.RadioField(
        'Standard Networks',
        validators=[
            validate_required_iff(method='standard'),
        ],
    )

    previous_networks = wtforms.RadioField(
        'Previous Networks',
        choices=[],
        validators=[
            validate_required_iff(method='previous'),
            selection_exists_in_choices,
        ],
    )

    pretrained_networks = wtforms.RadioField(
        'Pretrained Networks',
        choices=[],
        validators=[
            validate_required_iff(method='pretrained'),
            selection_exists_in_choices,
        ],
    )

    custom_network = utils.forms.TextAreaField(
        'Custom Network',
        validators=[
            validate_required_iff(method='custom'),
            validate_NetParameter,
        ],
    )

    custom_network_snapshot = utils.forms.TextField(
        'Pretrained model(s)',
        tooltip="Paths to pretrained model files, separated by '%s'. Only edit this field if you understand how fine-tuning works in caffe or torch."
        % os.path.pathsep)

    def validate_custom_network_snapshot(form, field):
        """For a custom network, require every listed snapshot path to exist."""
        if form.method.data == 'custom':
            for filename in field.data.strip().split(os.path.pathsep):
                # Empty segments (e.g. from a blank field) are ignored
                if filename and not os.path.exists(filename):
                    raise validators.ValidationError(
                        'File "%s" does not exist' % filename)

    # Select one of several GPUs
    # (the memory shown comes from NVML when available, otherwise from the
    # device's totalGlobalMem)
    select_gpu = wtforms.RadioField(
        'Select which GPU you would like to use',
        choices=[('next', 'Next available')] + [(
            index,
            '#%s - %s (%s memory)' % (
                index,
                get_device(index).name,
                sizeof_fmt(
                    get_nvml_info(index)['memory']['total']
                    if get_nvml_info(index) and 'memory' in get_nvml_info(index)
                    else get_device(index).totalGlobalMem)
            ),
        ) for index in config_value('gpu_list').split(',') if index],
        default='next',
    )

    # Select N of several GPUs
    select_gpus = utils.forms.SelectMultipleField(
        'Select which GPU[s] you would like to use',
        choices=[(
            index,
            '#%s - %s (%s memory)' % (
                index,
                get_device(index).name,
                sizeof_fmt(
                    get_nvml_info(index)['memory']['total']
                    if get_nvml_info(index) and 'memory' in get_nvml_info(index)
                    else get_device(index).totalGlobalMem)
            ),
        ) for index in config_value('gpu_list').split(',') if index],
        tooltip="The job won't start until all of the chosen GPUs are available.")

    # XXX For testing
    # The Flask test framework can't handle SelectMultipleFields correctly
    select_gpus_list = wtforms.StringField(
        'Select which GPU[s] you would like to use (comma separated)')

    def validate_select_gpus(form, field):
        """Let the comma-separated select_gpus_list override select_gpus."""
        if form.select_gpus_list.data:
            field.data = form.select_gpus_list.data.split(',')

    # Use next available N GPUs
    select_gpu_count = wtforms.IntegerField(
        'Use this many GPUs (next available)',
        validators=[
            validators.NumberRange(min=1, max=len(
                config_value('gpu_list').split(',')))
        ],
        default=1,
    )

    def validate_select_gpu_count(form, field):
        """Treat the GPU count as optional when specific GPUs were chosen."""
        if field.data is None:
            if form.select_gpus.data:
                # Make this field optional
                field.errors[:] = []
                raise validators.StopValidation()

    model_name = utils.forms.StringField(
        'Model Name',
        validators=[validators.DataRequired()],
        tooltip="An identifier, later used to refer to this model in the Application.")

    # allows shuffling data during training (for frameworks that support this, as indicated by
    # their Framework.can_shuffle_data() method)
    shuffle = utils.forms.BooleanField(
        'Shuffle Train Data',
        default=True,
        tooltip='For every epoch, shuffle the data before training.')
class ModelForm(Form):
    """
    Form describing the settings used to create a model

    Validator functions come first because some fields list them in their
    ``validators`` argument; the ``validate_<fieldname>`` functions are
    placed next to the field they belong to.
    """

    ### Methods

    def selection_exists_in_choices(form, field):
        """Raise a ValidationError unless field.data is one of the choices."""
        matches = [option for option in field.choices if option[0] == field.data]
        if len(matches) == 0:
            raise validators.ValidationError(
                "Selected job doesn't exist. Maybe it was deleted by another user."
            )

    def validate_NetParameter(form, field):
        """Check that the field text parses as a caffe NetParameter."""
        net_param = caffe_pb2.NetParameter()
        try:
            text_format.Merge(field.data, net_param)
        except text_format.ParseError as parse_error:
            raise validators.ValidationError(
                'Not a valid NetParameter: %s' % parse_error)

    ### Fields

    # The options for this get set in the view (since they are dynamic)
    dataset = wtforms.SelectField('Select Dataset', choices=[])

    train_epochs = wtforms.IntegerField(
        'Training epochs',
        validators=[validators.NumberRange(min=1)],
        default=30,
    )

    snapshot_interval = wtforms.FloatField(
        'Snapshot interval (in epochs)',
        default=1,
        validators=[
            validators.NumberRange(min=0),
        ],
    )

    val_interval = wtforms.FloatField(
        'Validation interval (in epochs)',
        default=1,
        validators=[validators.NumberRange(min=0)],
    )

    random_seed = wtforms.IntegerField(
        'Random seed',
        validators=[
            validators.NumberRange(min=0),
            validators.Optional(),
        ],
    )

    batch_size = wtforms.IntegerField(
        'Batch size',
        validators=[
            validators.NumberRange(min=1),
            validators.Optional(),
        ],
    )

    ### Solver types

    solver_type = wtforms.SelectField(
        'Solver type',
        choices=[
            ('SGD', 'Stochastic gradient descent (SGD)'),
            ('ADAGRAD', 'Adaptive gradient (AdaGrad)'),
            ('NESTEROV', "Nesterov's accelerated gradient (NAG)"),
        ],
        default='SGD')

    ### Learning rate

    learning_rate = wtforms.FloatField(
        'Base Learning Rate',
        default=0.01,
        validators=[
            validators.NumberRange(min=0),
        ])

    lr_policy = wtforms.SelectField(
        'Policy',
        choices=[
            ('fixed', 'Fixed'),
            ('step', 'Step Down'),
            ('multistep', 'Step Down (arbitrary steps)'),
            ('exp', 'Exponential Decay'),
            ('inv', 'Inverse Decay'),
            ('poly', 'Polynomial Decay'),
            ('sigmoid', 'Sigmoid Decay'),
        ],
        default='step')

    lr_step_size = wtforms.FloatField('Step Size', default=33)
    lr_step_gamma = wtforms.FloatField('Gamma', default=0.1)
    lr_multistep_values = wtforms.StringField('Step Values', default="50,85")

    def validate_lr_multistep_values(form, field):
        """Under the 'multistep' policy, every comma-separated item must be a number."""
        if form.lr_policy.data != 'multistep':
            return
        for token in field.data.split(','):
            try:
                float(token)
            except ValueError:
                raise validators.ValidationError('invalid value')

    lr_multistep_gamma = wtforms.FloatField('Gamma', default=0.5)
    lr_exp_gamma = wtforms.FloatField('Gamma', default=0.95)
    lr_inv_gamma = wtforms.FloatField('Gamma', default=0.1)
    lr_inv_power = wtforms.FloatField('Power', default=0.5)
    lr_poly_power = wtforms.FloatField('Power', default=3)
    lr_sigmoid_step = wtforms.FloatField('Step', default=50)
    lr_sigmoid_gamma = wtforms.FloatField('Gamma', default=0.1)

    ### Network

    # Use a SelectField instead of a HiddenField so that the default value
    # is used when nothing is provided (through the REST API)
    method = wtforms.SelectField(
        u'Network type',
        choices=[
            ('standard', 'Standard network'),
            ('previous', 'Previous network'),
            ('custom', 'Custom network'),
        ],
        default='standard',
    )

    # The options for this get set in the view (since they are dependent on the data type)
    standard_networks = wtforms.RadioField(
        'Standard Networks',
        validators=[
            validate_required_iff(method='standard'),
        ],
    )

    previous_networks = wtforms.RadioField(
        'Previous Networks',
        choices=[],
        validators=[
            validate_required_iff(method='previous'),
            selection_exists_in_choices,
        ],
    )

    custom_network = wtforms.TextAreaField(
        'Custom Network',
        validators=[
            validate_required_iff(method='custom'),
            validate_NetParameter,
        ])

    custom_network_snapshot = wtforms.TextField('Pretrained model')

    def validate_custom_network_snapshot(form, field):
        """For a custom network, a non-blank snapshot path must exist."""
        if form.method.data != 'custom':
            return
        snapshot_path = field.data.strip()
        if snapshot_path and not os.path.exists(snapshot_path):
            raise validators.ValidationError('File does not exist')

    # Select one of several GPUs
    # (the memory suffix is only shown when NVML reports a 'memory' entry)
    select_gpu = wtforms.RadioField(
        'Select which GPU you would like to use',
        choices=[('next', 'Next available')] + [(
            index,
            '#%s - %s%s' % (
                index,
                get_device(index).name,
                ' (%s memory)' % sizeof_fmt(get_nvml_info(index)['memory']['total'])
                if get_nvml_info(index) and 'memory' in get_nvml_info(index)
                else '',
            ),
        ) for index in config_value('gpu_list').split(',') if index],
        default='next',
    )

    # Select N of several GPUs
    select_gpus = wtforms.SelectMultipleField(
        'Select which GPU[s] you would like to use',
        choices=[(
            index,
            '#%s - %s%s' % (
                index,
                get_device(index).name,
                ' (%s memory)' % sizeof_fmt(get_nvml_info(index)['memory']['total'])
                if get_nvml_info(index) and 'memory' in get_nvml_info(index)
                else '',
            ),
        ) for index in config_value('gpu_list').split(',') if index])

    # XXX For testing
    # The Flask test framework can't handle SelectMultipleFields correctly
    select_gpus_list = wtforms.StringField(
        'Select which GPU[s] you would like to use (comma separated)')

    def validate_select_gpus(form, field):
        """Use the comma-separated select_gpus_list, when given, as the selection."""
        listed = form.select_gpus_list.data
        if listed:
            field.data = listed.split(',')

    # Use next available N GPUs
    select_gpu_count = wtforms.IntegerField(
        'Use this many GPUs (next available)',
        validators=[
            validators.NumberRange(
                min=1,
                max=len(config_value('gpu_list').split(',')))
        ],
        default=1,
    )

    def validate_select_gpu_count(form, field):
        """Drop errors on a missing GPU count when explicit GPUs were selected."""
        if field.data is None and form.select_gpus.data:
            # Make this field optional
            field.errors[:] = []
            raise validators.StopValidation()

    model_name = wtforms.StringField(
        'Model Name',
        validators=[validators.DataRequired()])