def test_bigdl_pytorch_estimator_dataloader_creator(self):
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.bn1 = torch.nn.BatchNorm1d(4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            x = self.dense1(x)
            x = self.bn1(x)
            x = torch.sigmoid(self.dense2(x))
            return x

    model = SimpleModel()
    estimator = Estimator.from_torch(model=model, loss=nn.BCELoss(),
                                     optimizer=Adam(), backend="bigdl")

    def get_dataloader():
        inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
        targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])
        return torch.utils.data.DataLoader(TensorDataset(inputs, targets), batch_size=2)

    estimator.fit(data=get_dataloader, epochs=2,
                  validation_data=get_dataloader,
                  validation_methods=[Accuracy()],
                  checkpoint_trigger=EveryEpoch())
    model = estimator.get_model()
    assert isinstance(model, nn.Module)
def optimize(self, end_trigger=None, checkpoint_trigger=None):
    """
    Run the training loop of this optimizer.

    :param end_trigger: BigDL's Trigger to indicate when to stop the training.
    :param checkpoint_trigger: When to save a checkpoint and evaluate the model.
    """
    if end_trigger is None:
        end_trigger = MaxEpoch(1)
    if checkpoint_trigger is None:
        checkpoint_trigger = EveryEpoch()

    if self.tf_model.val_methods is not None and self.val_rdd is not None:
        self.estimator.train(train_set=self.training_rdd,
                             criterion=IdentityCriterion(),
                             end_trigger=end_trigger,
                             checkpoint_trigger=checkpoint_trigger,
                             validation_set=self.val_rdd,
                             validation_method=self.tf_model.val_methods,
                             batch_size=self.batch_size)
    else:
        self.estimator.train(train_set=self.training_rdd,
                             criterion=IdentityCriterion(),
                             end_trigger=end_trigger,
                             batch_size=self.batch_size)

    self.tf_model.training_helper_layer.get_weights_to_python()
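# A minimal usage sketch for the optimize() above (not part of the original
# snippet). It assumes a scalar TensorFlow `loss` tensor already built on top
# of a TFDataset, as the TFOptimizer constructor elsewhere in this section
# requires; the from_loss factory and trigger classes follow the zoo/bigdl API.
from bigdl.optim.optimizer import Adam, MaxEpoch, EveryEpoch

optimizer = TFOptimizer.from_loss(loss, Adam(1e-3))  # `loss` defined elsewhere
optimizer.optimize()                                 # defaults: MaxEpoch(1), EveryEpoch()
optimizer.optimize(end_trigger=MaxEpoch(5),          # stop after 5 epochs
                   checkpoint_trigger=EveryEpoch())  # checkpoint/validate each epoch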
def test_train_model_with_bn(self):
    class SimpleTorchModel(nn.Module):
        def __init__(self):
            super(SimpleTorchModel, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.bn1 = torch.nn.BatchNorm1d(4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            x = self.dense1(x)
            x = self.bn1(x)
            x = torch.sigmoid(self.dense2(x))
            return x

    torch_model = SimpleTorchModel()
    loss_fn = torch.nn.BCELoss()
    az_model = TorchModel.from_pytorch(torch_model)
    zoo_loss = TorchLoss.from_pytorch(loss_fn)

    inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
    targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])
    train_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
    train_featureset = FeatureSet.pytorch_dataloader(train_loader)
    val_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
    val_featureset = FeatureSet.pytorch_dataloader(val_loader)

    zooOptimizer = Adam()
    estimator = Estimator(az_model, optim_methods=zooOptimizer)
    estimator.train_minibatch(train_featureset, zoo_loss,
                              end_trigger=MaxEpoch(4),
                              checkpoint_trigger=EveryEpoch(),
                              validation_set=val_featureset,
                              validation_method=[Accuracy()])
    trained_model = az_model.to_pytorch()
def optimize(self, end_trigger=None, checkpoint_trigger=None):
    """
    Run the training loop of this optimizer.

    :param end_trigger: BigDL's Trigger to indicate when to stop the training.
    :param checkpoint_trigger: When to save a checkpoint and evaluate the model.
    """
    if end_trigger is None:
        end_trigger = MaxEpoch(1)
    if checkpoint_trigger is None:
        checkpoint_trigger = EveryEpoch()

    if isinstance(self.train_data, FeatureSet):
        if self.train_data.value.getNumOfSlice() != 1:
            if isinstance(checkpoint_trigger, EveryEpoch):
                checkpoint_trigger = ZEveryEpoch()
            elif not isinstance(checkpoint_trigger, ZooTrigger):
                raise Exception("Please use a trigger defined in zoo.util.triggers")

    if self.tf_model.val_methods and self.val_data is not None:
        self.estimator.train_minibatch(train_set=self.train_data,
                                       criterion=self.tf_model.criterion,
                                       end_trigger=end_trigger,
                                       checkpoint_trigger=checkpoint_trigger,
                                       validation_set=self.val_data,
                                       validation_method=self.tf_model.val_methods)
    else:
        self.estimator.train_minibatch(train_set=self.train_data,
                                       criterion=self.tf_model.criterion,
                                       end_trigger=end_trigger,
                                       checkpoint_trigger=checkpoint_trigger)

    self.tf_model.training_helper_layer.get_weights_to_python()
def convert_trigger(trigger):
    if trigger is None:
        return None
    if isinstance(trigger, str):
        if trigger.lower() == "everyepoch":
            return EveryEpoch().get_trigger()
        else:
            raise ValueError("Only 'EveryEpoch', orca triggers and bigdl triggers "
                             "are supported now")
    elif isinstance(trigger, Trigger):
        return trigger.get_trigger()
    else:
        return trigger
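# A short sketch of the three paths through convert_trigger above (added for
# illustration; the orca-style EveryEpoch/Trigger classes are assumed to be the
# ones referenced by the function itself):
assert convert_trigger(None) is None
bigdl_trigger = convert_trigger("everyepoch")  # case-insensitive string form
same = convert_trigger(EveryEpoch())           # orca Trigger -> .get_trigger()
passthrough = convert_trigger(bigdl_trigger)   # anything else is returned unchanged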
def test_bigdl_pytorch_estimator_shard(self):
    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(2, 2)

        def forward(self, x):
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    model = SimpleModel()

    def loss_func(input, target):
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    def transform(df):
        result = {
            "x": [df['user'].to_numpy(), df['item'].to_numpy()],
            "y": df['label'].to_numpy()
        }
        return result

    OrcaContext.pandas_read_backend = "pandas"
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)

    estimator = Estimator.from_torch(model=model, loss=loss_func,
                                     optimizer=SGD(), backend="bigdl")
    estimator.fit(data=data_shard, epochs=4, batch_size=2,
                  validation_data=data_shard,
                  validation_methods=[Accuracy()],
                  checkpoint_trigger=EveryEpoch())
    estimator.evaluate(data_shard, validation_methods=[Accuracy()], batch_size=2)
def __init__(self, loss, optim_method, sess=None, dataset=None, inputs=None,
             grads=None, variables=None, graph=None,
             val_outputs=None, val_labels=None, val_method=None, val_split=0.0,
             tensors_with_value=None, session_config=None):
    '''
    TFOptimizer is used for distributed training of TensorFlow on Spark/BigDL.

    :param loss: The loss tensor of the TensorFlow model, should be a scalar
    :param optim_method: the optimization method to be used,
        such as bigdl.optim.optimizer.Adam
    :param sess: the current TensorFlow Session; if you want to use a pre-trained
        model, you should use the Session to load the pre-trained variables and
        pass it to TFOptimizer.
    '''
    import tensorflow as tf
    from tensorflow.python.util import nest
    from zoo.util.tf import export_tf

    if dataset is None:
        args = TFOptimizer._get_arguments_from_loss(loss, optim_method, sess,
                                                    val_outputs, val_labels, val_method)
        loss, optim_method, sess, dataset, inputs = args[:5]
        grads, variables, graph, val_outputs, val_labels, val_method = args[5:]

    additional_inputs = []
    additional_values = []
    all_required_inputs = _find_placeholders([loss])
    all_required_inputs_names = [v.name for v in all_required_inputs]
    if tensors_with_value:
        for t, v in tensors_with_value.items():
            if t.name in all_required_inputs_names:
                additional_inputs.append(t)
                additional_values.append(v)

    if not isinstance(inputs, list):
        inputs = nest.flatten(inputs)

    self.optim_method = optim_method
    self.sess = sess
    self.dataset = dataset
    self.inputs = inputs + additional_inputs
    self.graph = graph
    self.session_config = session_config

    from zoo.util.tf import process_grad
    grads = [process_grad(grad) for grad in grads]

    if self.dataset.batch_size <= 0:
        raise ValueError("You should set batch_size instead of batch_per_thread for training")

    if val_outputs is not None and val_labels is not None:
        with self.graph.as_default():
            val_labels = [tf.identity(v) for v in val_labels]
        outputs = val_outputs + val_labels + [loss]
    else:
        outputs = [loss]

    self.grads = grads
    self.outputs = outputs

    self.export_dir = tempfile.mkdtemp()
    export_tf(self.sess, self.export_dir,
              inputs=self.inputs,
              outputs=self.grads + self.outputs)

    variable_names = [v.name for v in variables]
    grad_names = [g.name for g in grads]
    output_names = [o.name for o in outputs]

    def to_floats(vs):
        return [float(v) for v in vs]

    meta = {
        "input_names": [i.name for i in self.inputs],
        "output_names": output_names,
        "variables": variable_names,
        "grad_variables": grad_names,
        "default_tensor_values": [to_floats(v) for v in additional_values]
    }

    with open(os.path.join(self.export_dir, "training_meta.json"), "w") as f:
        f.write(json.dumps(meta))

    self.variable_placeholders = []
    with self.graph.as_default():
        assigns = []
        for v in variables:
            p = tf.placeholder(dtype=tf.float32, shape=v.shape)
            a = tf.assign(v, p)
            self.variable_placeholders.append(p)
            assigns.append(a)
        assign = tf.group(*assigns)
    self.assign = assign

    try:
        self.training_helper_layer = TFTrainingHelper(self.export_dir, session_config)
    except Py4JJavaError as e:
        if "expects to be colocated with unknown node" in str(e):
            raise Exception("""
If you are using the embedding layer in tf.keras, then this is a known issue of
TensorFlow, see https://github.com/tensorflow/tensorflow/issues/21889.
Please add zoo.util.tf.variable_creator_scope before model construction.
For example:
from zoo.util.tf import variable_creator_scope
with variable_creator_scope():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(1, 1, input_length=1)])
""")
        else:
            raise e

    data = self.dataset.rdd
    batch_size = self.dataset.batch_size

    def to_sample(t):
        if isinstance(t, list):
            t = tuple(t)
        return Sample.from_ndarray(nest.flatten(t), [np.array([0.0])])

    sample_rdd = data.map(to_sample)

    if val_outputs is not None and val_labels is not None:
        if self.dataset.val_rdd is not None:
            val_rdd = self.dataset.val_rdd.map(to_sample)
            val_method = [TFValidationMethod(m, len(val_outputs), len(val_labels))
                          for m in to_list(val_method)]
            training_rdd = sample_rdd
        elif val_split != 0.0:
            training_rdd, val_rdd = sample_rdd.randomSplit([1 - val_split, val_split])
            val_method = [TFValidationMethod(m, len(val_outputs), len(val_labels))
                          for m in to_list(val_method)]
        else:
            raise ValueError("Validation data is not specified. Please set "
                             "val rdd in TFDataset, or set val_split larger than zero")

        self.optimizer = Optimizer.create(self.training_helper_layer,
                                          training_rdd,
                                          IdentityCriterion(),
                                          batch_size=batch_size,
                                          optim_method=self.optim_method)
        self.optimizer.set_validation(self.dataset.batch_size,
                                      val_rdd,
                                      EveryEpoch(),
                                      val_method)
    else:
        training_rdd = sample_rdd
        self.optimizer = Optimizer.create(self.training_helper_layer,
                                          training_rdd,
                                          IdentityCriterion(),
                                          batch_size=batch_size,
                                          optim_method=self.optim_method)
def __init__(self, loss, optim_method, sess=None,
             val_outputs=None, val_labels=None, val_method=None):
    '''
    TFOptimizer is used for distributed training of TensorFlow on Spark/BigDL.

    :param loss: The loss tensor of the TensorFlow model, should be a scalar
    :param optim_method: the optimization method to be used,
        such as bigdl.optim.optimizer.Adam
    :param sess: the current TensorFlow Session; if you want to use a pre-trained
        model, you should use the Session to load the pre-trained variables and
        pass it to TFOptimizer.
    '''
    import tensorflow as tf
    from zoo.util.tf import export_tf

    self.optim_method = optim_method
    if sess is None:
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
    else:
        self.sess = sess

    grads_vars = tf.train.GradientDescentOptimizer(0).compute_gradients(loss)
    variables = []
    grads = []
    for (grad, var) in grads_vars:
        variables.append(var)
        grads.append(grad)

    self.export_dir = tempfile.mkdtemp()
    all_required_inputs = _find_placeholders([loss])
    self.dataset = tf.get_collection(all_required_inputs[0].name)[0]

    if self.dataset.batch_size <= 0:
        raise ValueError("You should set batch_size instead of batch_per_thread for training")

    self.inputs = self.dataset.tensors
    _check_the_same(all_required_inputs, self.inputs)

    if val_outputs is not None and val_labels is not None:
        outputs = val_outputs + val_labels + [loss]
    else:
        outputs = [loss]

    export_tf(self.sess, self.export_dir,
              inputs=self.inputs,
              outputs=grads + outputs)

    variable_names = [v.name for v in variables]
    grad_names = [g.name for g in grads]
    output_names = [o.name for o in outputs]
    meta = {
        "input_names": [i.name for i in self.inputs],
        "output_names": output_names,
        "variables": variable_names,
        "grad_variables": grad_names
    }

    with open(os.path.join(self.export_dir, "training_meta.json"), "w") as f:
        f.write(json.dumps(meta))

    self.training_helper_layer = TFTrainingHelper(self.export_dir)

    self.variable_placeholders = []
    assigns = []
    for v in variables:
        p = tf.placeholder(dtype=tf.float32, shape=v.shape)
        a = tf.assign(v, p)
        self.variable_placeholders.append(p)
        assigns.append(a)
    self.assign = tf.group(*assigns)

    data = self.dataset.rdd
    batch_size = self.dataset.batch_size
    sample_rdd = data.map(lambda t: Sample.from_ndarray(t, [np.array([0.0])]))

    self.optimizer = Optimizer.create(self.training_helper_layer,
                                      sample_rdd,
                                      IdentityCriterion(),
                                      batch_size=batch_size,
                                      optim_method=self.optim_method)

    if val_outputs is not None and val_labels is not None:
        val_sample_rdd = self.dataset.val_rdd \
            .map(lambda t: Sample.from_ndarray(t, [np.array([0.0])]))
        val_method = TFValidationMethod(val_method, len(val_outputs), len(val_labels))
        self.optimizer.set_validation(self.dataset.batch_size,
                                      val_sample_rdd,
                                      EveryEpoch(),
                                      val_method)
y_valid = Variable(torch.Tensor(train_Y[val_.astype(int), np.newaxis]))

model = MLP(x_train.shape[1], 512, classes, dropout=0.4)
loss_fn = torch.nn.BCELoss()
zooOptimizer = Adam(learningrate=learning_rate)
zooModel = TorchModel.from_pytorch(model)
zooLoss = TorchLoss.from_pytorch(loss_fn)

train_featureSet = get_featureset(x_train, y_train, shuffle=True)
val_featureSet = get_featureset(x_valid, y_valid, shuffle=False)

estimator = Estimator(zooModel, optim_methods=zooOptimizer)
estimator.train(train_featureSet, zooLoss,
                end_trigger=MaxEpoch(epochs),
                checkpoint_trigger=EveryEpoch(),
                validation_set=val_featureSet,
                validation_method=[Accuracy()],
                batch_size=batch_size)

# Predict
def get_rdd(x, y, shuffle=False):
    x = np.split(x.data.numpy(), x.shape[0])
    y = np.split(y.data.numpy(), y.shape[0])
    samples = [Sample.from_ndarray(np.squeeze(x[i]), np.squeeze(y[i]))
               for i in range(len(x))]
    sample_rdd = sc.parallelize(samples)
    return sample_rdd
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--dir', default='/tmp/data', metavar='N',
                        help='the folder to store the MNIST data')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training per executor (default: 256)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing per executor (default: 1000)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='for saving the current model')
    args = parser.parse_args()
    torch.manual_seed(args.seed)

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=False)

    # Init on YARN when HADOOP_CONF_DIR and ZOO_CONDA_NAME are provided.
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=1, conf={"spark.driver.memory": "20g"})
    else:
        num_executors = 2
        num_cores_per_executor = 4
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        zoo_conda_name = os.environ.get('ZOO_CONDA_NAME')  # The name of the created conda-env
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf_dir,
                                conda_name=zoo_conda_name,
                                num_executors=num_executors,
                                executor_cores=num_cores_per_executor,
                                executor_memory="2g",
                                driver_memory="10g",
                                driver_cores=1,
                                conf={"spark.rpc.message.maxSize": "1024",
                                      "spark.task.maxFailures": "1",
                                      "spark.driver.extraJavaOptions":
                                          "-Dbigdl.failure.retryTimes=1"})

    model = LeNet()
    model.train()
    criterion = nn.NLLLoss()
    adam = Adam(args.lr)
    zoo_estimator = Estimator.from_torch(model=model,
                                         optimizer=adam,
                                         loss=criterion,
                                         backend="bigdl")
    from bigdl.optim.optimizer import EveryEpoch
    zoo_estimator.fit(data=train_loader,
                      epochs=args.epochs,
                      validation_data=test_loader,
                      validation_methods=[Accuracy()],
                      checkpoint_trigger=EveryEpoch())
    zoo_estimator.evaluate(data=test_loader,
                           validation_methods=[Accuracy()])
input_shape=input_shape)(both_input)
encode_left = both_feature.index_select(1, 0)
encode_right = both_feature.index_select(1, 1)
distance = autograd.abs(encode_left - encode_right)
predict = Dense(output_dim=NUM_CLASS_LABEL, activation="sigmoid",
                W_regularizer=L2Regularizer(args.penalty_rate))(distance)
siamese_net = Model(input=both_input, output=predict)

# Declare the optimizer, then train and test the model.
optimizer = Optimizer(model=siamese_net,
                      training_rdd=train_rdd,
                      optim_method=Adam(args.learning_rate),
                      criterion=CrossEntropyCriterion(),
                      end_trigger=MaxEpoch(args.num_epoch),
                      batch_size=args.batch_size)
optimizer.set_validation(batch_size=args.batch_size,
                         val_rdd=test_rdd,
                         trigger=EveryEpoch(),
                         val_method=[Top1Accuracy()])

# Set up the training logs, which can be inspected with TensorBoard.
app_name = "logs"
optimizer.set_train_summary(TrainSummary(log_dir=".", app_name=app_name))
optimizer.set_val_summary(ValidationSummary(log_dir=".", app_name=app_name))
optimizer.optimize()
def __init__(self):
    from bigdl.optim.optimizer import EveryEpoch
    self.trigger = EveryEpoch()
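# A plausible surrounding class for the __init__ above, inferred from the
# convert_trigger snippet earlier in this section (the class name, base class,
# and get_trigger() are assumptions, not confirmed by the original snippet):
# an orca-level EveryEpoch that lazily imports and wraps the BigDL trigger, so
# get_trigger() can hand the JVM-side object to the estimator uniformly.
class EveryEpoch(Trigger):
    def __init__(self):
        # local import avoids a name clash with this wrapper class
        from bigdl.optim.optimizer import EveryEpoch
        self.trigger = EveryEpoch()

    def get_trigger(self):
        return self.trigger  # the underlying BigDL trigger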
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Example')
    parser.add_argument('--dir', default='/tmp/data', metavar='N',
                        help='the folder to store the CIFAR10 data')
    parser.add_argument('--batch-size', type=int, default=128, metavar='N',
                        help='input batch size for training per executor (default: 128)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing per executor (default: 1000)')
    parser.add_argument('--epochs', type=int, default=135, metavar='N',
                        help='number of epochs to train (default: 135)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--lrd', type=float, default=0.0, metavar='LRD',
                        help='learning rate decay (default: 0.0)')
    parser.add_argument('--wd', type=float, default=5e-4, metavar='WD',
                        help='weight decay (default: 5e-4)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='momentum',
                        help='momentum (default: 0.9)')
    parser.add_argument('--dampening', type=float, default=0.0, metavar='dampening',
                        help='dampening (default: 0.0)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='for saving the current model')
    args = parser.parse_args()
    torch.manual_seed(args.seed)

    # Prepare and preprocess the data
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),  # zero-pad each side, then randomly crop back to 32x32
        transforms.RandomHorizontalFlip(),  # flip horizontally with probability 0.5
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),  # per-channel (R, G, B) normalization means and stds
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    train_set = datasets.CIFAR10(args.dir, train=True, download=True,
                                 transform=transform_train)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True, num_workers=2)
    test_set = datasets.CIFAR10(args.dir, train=False, transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=args.test_batch_size,
                                              shuffle=False, num_workers=2)

    # Init on YARN when HADOOP_CONF_DIR and ZOO_CONDA_NAME are provided.
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=1, conf={"spark.driver.memory": "20g"})
    else:
        num_executors = 2
        num_cores_per_executor = 4
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        zoo_conda_name = os.environ.get('ZOO_CONDA_NAME')  # The name of the created conda-env
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf_dir,
                                conda_name=zoo_conda_name,
                                num_executors=num_executors,
                                executor_cores=num_cores_per_executor,
                                executor_memory="2g",
                                driver_memory="10g",
                                driver_cores=1,
                                conf={"spark.rpc.message.maxSize": "1024",
                                      "spark.task.maxFailures": "1",
                                      "spark.driver.extraJavaOptions":
                                          "-Dbigdl.failure.retryTimes=1"})

    model = ResNet18()
    model.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = SGD(args.lr, args.lrd, args.wd, args.momentum, args.dampening)
    zoo_model = TorchModel.from_pytorch(model)
    zoo_criterion = TorchLoss.from_pytorch(criterion)
    zoo_estimator = Estimator(zoo_model, optim_methods=optimizer)
    train_featureset = FeatureSet.pytorch_dataloader(train_loader)
    test_featureset = FeatureSet.pytorch_dataloader(test_loader)
    from bigdl.optim.optimizer import MaxEpoch, EveryEpoch
    zoo_estimator.train_minibatch(train_featureset, zoo_criterion,
                                  end_trigger=MaxEpoch(args.epochs),
                                  checkpoint_trigger=EveryEpoch(),
                                  validation_set=test_featureset,
                                  validation_method=[Accuracy()])
def __init__(self, loss, optim_method, sess=None, dataset=None, inputs=None,
             grads=None, variables=None, graph=None,
             val_outputs=None, val_labels=None, val_method=None, val_split=0.0,
             tensors_with_value=None, session_config=None,
             clip_norm=None, clip_value=None, metrics=None):
    '''
    TFOptimizer is used for distributed training of TensorFlow on Spark/BigDL.

    :param loss: The loss tensor of the TensorFlow model, should be a scalar
    :param optim_method: the optimization method to be used,
        such as bigdl.optim.optimizer.Adam
    :param sess: the current TensorFlow Session; if you want to use a pre-trained
        model, you should use the Session to load the pre-trained variables and
        pass it to TFOptimizer.
    '''
    if dataset is None:
        args = TFOptimizer._get_arguments_from_loss(loss, optim_method, sess,
                                                    val_outputs, val_labels, val_method)
        loss, optim_method, sess, dataset, inputs = args[:5]
        grads, variables, graph, val_outputs, val_labels, val_method = args[5:]

    self.optim_method = optim_method
    self.sess = sess
    self.dataset = dataset
    self.graph = graph
    self.clip_norm = clip_norm
    if clip_value is not None and not isinstance(clip_value, tuple):
        raise ValueError("The clip_value argument should be a tuple (min_value, max_value)")
    self.clip_constant = clip_value

    if self.dataset.batch_size <= 0:
        raise ValueError("You should set batch_size instead of batch_per_thread for training")

    if val_method is not None:
        val_methods = to_list(val_method)
        if metrics is None:
            metrics = {}
        for i, method in enumerate(val_methods):
            metrics['bigdl_metric_' + str(i)] = BigDLMetric(method, val_outputs, val_labels)

    self.tf_model = TFModel.create(loss, sess, inputs, grads, variables, graph,
                                   tensors_with_value, session_config, metrics)

    batch_size = self.dataset.batch_size
    sample_rdd = self.dataset.get_training_data()

    if val_split != 0.0:
        training_rdd, val_rdd = sample_rdd.randomSplit([1 - val_split, val_split])
    else:
        training_rdd = sample_rdd
        val_rdd = self.dataset.get_validation_data()

    if self.tf_model.val_methods is not None and val_rdd is not None:
        self.optimizer = Optimizer.create(self.tf_model.training_helper_layer,
                                          training_rdd,
                                          IdentityCriterion(),
                                          batch_size=batch_size,
                                          optim_method=self.optim_method)
        self.optimizer.set_validation(self.dataset.batch_size,
                                      val_rdd,
                                      EveryEpoch(),
                                      self.tf_model.val_methods)
    else:
        self.optimizer = Optimizer.create(self.tf_model.training_helper_layer,
                                          training_rdd,
                                          IdentityCriterion(),
                                          batch_size=batch_size,
                                          optim_method=self.optim_method)

    if self.clip_norm:
        self.optimizer.set_gradclip_l2norm(self.clip_norm)
    if self.clip_constant:
        min_value, max_value = self.clip_constant
        self.optimizer.set_gradclip_const(min_value, max_value)
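# Hedged usage note for the clipping arguments in the constructor above (the
# call shape is an illustration; only the keyword names come from the code):
# clip_norm is an L2-norm threshold and clip_value must be a (min, max) tuple,
# which the constructor maps onto BigDL's set_gradclip_l2norm and
# set_gradclip_const respectively, e.g.
#     TFOptimizer(loss, Adam(1e-3), clip_norm=5.0, clip_value=(-1.0, 1.0))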
def __init__(self, loss, optim_method, sess=None, dataset=None, inputs=None,
             grads=None, variables=None, graph=None,
             val_outputs=None, val_labels=None, val_method=None,
             add_sample_weights_num=0):
    '''
    TFOptimizer is used for distributed training of TensorFlow on Spark/BigDL.

    :param loss: The loss tensor of the TensorFlow model, should be a scalar
    :param optim_method: the optimization method to be used,
        such as bigdl.optim.optimizer.Adam
    :param sess: the current TensorFlow Session; if you want to use a pre-trained
        model, you should use the Session to load the pre-trained variables and
        pass it to TFOptimizer.
    '''
    import tensorflow as tf
    from zoo.util.tf import export_tf

    if dataset is None:
        args = TFOptimizer._get_arguments_from_loss(loss, optim_method, sess,
                                                    val_outputs, val_labels, val_method)
        loss, optim_method, sess, dataset, inputs = args[:5]
        grads, variables, graph, val_outputs, val_labels, val_method = args[5:]

    self.optim_method = optim_method
    self.sess = sess
    self.dataset = dataset
    self.inputs = inputs
    self.graph = graph

    if self.dataset.batch_size <= 0:
        raise ValueError("You should set batch_size instead of batch_per_thread for training")

    if val_outputs is not None and val_labels is not None:
        with self.graph.as_default():
            val_labels = [tf.identity(v) for v in val_labels]
        outputs = val_outputs + val_labels + [loss]
    else:
        outputs = [loss]

    self.export_dir = tempfile.mkdtemp()
    export_tf(self.sess, self.export_dir,
              inputs=self.inputs,
              outputs=grads + outputs)

    variable_names = [v.name for v in variables]
    grad_names = [g.name for g in grads]
    output_names = [o.name for o in outputs]
    meta = {
        "input_names": [i.name for i in self.inputs],
        "output_names": output_names,
        "variables": variable_names,
        "grad_variables": grad_names
    }

    with open(os.path.join(self.export_dir, "training_meta.json"), "w") as f:
        f.write(json.dumps(meta))

    self.variable_placeholders = []
    with self.graph.as_default():
        assigns = []
        for v in variables:
            p = tf.placeholder(dtype=tf.float32, shape=v.shape)
            a = tf.assign(v, p)
            self.variable_placeholders.append(p)
            assigns.append(a)
        assign = tf.group(*assigns)
    self.assign = assign

    self.training_helper_layer = TFTrainingHelper(self.export_dir)

    data = self.dataset.rdd
    batch_size = self.dataset.batch_size
    sample_rdd = data.map(lambda t: Sample.from_ndarray(
        t + [np.array(1.0)] * add_sample_weights_num, [np.array([0.0])]))

    self.optimizer = Optimizer.create(self.training_helper_layer,
                                      sample_rdd,
                                      IdentityCriterion(),
                                      batch_size=batch_size,
                                      optim_method=self.optim_method)

    if val_outputs is not None and val_labels is not None:
        val_sample_rdd = self.dataset.val_rdd \
            .map(lambda t: Sample.from_ndarray(
                t + [np.array(1.0)] * add_sample_weights_num, [np.array([0.0])]))
        val_method = [TFValidationMethod(m, len(val_outputs), len(val_labels))
                      for m in to_list(val_method)]
        self.optimizer.set_validation(self.dataset.batch_size,
                                      val_sample_rdd,
                                      EveryEpoch(),
                                      val_method)