def nd_f1(pred, label, num_class, average="micro"):
    """Evaluate F1 using mx.nd.NDArray

    Parameters
    ----------
    pred : nd.NDArray
        Shape (num, label_num) or (num,)
    label : nd.NDArray
        Shape (num, label_num) or (num,)
    num_class : int
    average : str

    Returns
    -------
    f1 : float
    """
    if pred.dtype != np.float32:
        pred = pred.astype(np.float32)
        label = label.astype(np.float32)
    assert num_class > 1
    assert pred.ndim == label.ndim
    if num_class == 2 and average == "micro":
        tp = nd.sum((pred == 1) * (label == 1)).asscalar()
        fp = nd.sum((pred == 1) * (label == 0)).asscalar()
        fn = nd.sum((pred == 0) * (label == 1)).asscalar()
        precision = float(tp) / (tp + fp)
        recall = float(tp) / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        pred_onehot = nd.one_hot(indices=pred, depth=num_class)
        label_onehot = nd.one_hot(indices=label, depth=num_class)
        tp = pred_onehot * label_onehot
        fp = pred_onehot * (1 - label_onehot)
        fn = (1 - pred_onehot) * label_onehot
        if average == "micro":
            tp = nd.sum(tp).asscalar()
            fp = nd.sum(fp).asscalar()
            fn = nd.sum(fn).asscalar()
            precision = float(tp) / (tp + fp)
            recall = float(tp) / (tp + fn)
            f1 = 2 * (precision * recall) / (precision + recall)
        elif average == "macro":
            if tp.ndim == 3:
                tp = nd.sum(tp, axis=(0, 1))
                fp = nd.sum(fp, axis=(0, 1))
                fn = nd.sum(fn, axis=(0, 1))
            else:
                tp = nd.sum(tp, axis=0)
                fp = nd.sum(fp, axis=0)
                fn = nd.sum(fn, axis=0)
            precision = nd.mean(tp / (tp + fp)).asscalar()
            recall = nd.mean(tp / (tp + fn)).asscalar()
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            raise NotImplementedError
    return f1
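# A minimal usage sketch for nd_f1 above (the toy predictions/labels here are
# assumptions for illustration, not taken from the original source).
import numpy as np
import mxnet as mx
from mxnet import nd

pred = nd.array([1, 0, 1, 1])
label = nd.array([1, 0, 0, 1])
print(nd_f1(pred, label, num_class=2, average="micro"))  # fast binary path
print(nd_f1(pred, label, num_class=2, average="macro"))  # per-class mean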
def evaluate_accuracy(data_iterator, num_examples, batch_size, params, net,
                      pool_type, pool_size, pool_stride, act_type,
                      dilate_size, nf):
    numerator = 0.
    denominator = 0.
    for batch_i, (data, label) in enumerate(data_iterator):
        data = data.as_in_context(ctx).reshape((batch_size, 1, 1, -1))
        label = label.as_in_context(ctx)
        label_one_hot = nd.one_hot(label, 10)  # kept for parity with training; unused below
        output, _ = net(data, params, pool_type=pool_type, pool_size=pool_size,
                        pool_stride=pool_stride, act_type=act_type,
                        dilate_size=dilate_size, nf=nf)
        predictions = nd.argmax(output, axis=1)
        numerator += nd.sum(predictions == label)
        denominator += data.shape[0]
        print('Evaluating accuracy. (complete percent: %.2f/100'
              % (1.0 * batch_i / (num_examples // batch_size) * 100) + ')', end='')
        sys.stdout.write("\r")
    return (numerator / denominator).asscalar()
def generate_learned_samples(self):
    '''
    Draw and generate data.

    Returns:
        `Tuple` data. The shape is ...
        - `mxnet.ndarray` of observed data points in training.
        - `mxnet.ndarray` of supervised data in training.
        - `mxnet.ndarray` of observed data points in test.
        - `mxnet.ndarray` of supervised data in test.
    '''
    for _ in range(self.iter_n):
        row_arr = np.arange(self.__train_observed_arr.shape[0])
        np.random.shuffle(row_arr)

        training_batch_arr = self.__train_observed_arr[row_arr[:self.batch_size]]
        training_batch_arr = mx.ndarray.array(training_batch_arr, ctx=self.__ctx)
        training_batch_arr = self.pre_normalize(training_batch_arr)

        label_key_arr = self.__train_label_arr[row_arr[:self.batch_size]]
        label_key_arr = mx.ndarray.array(label_key_arr, ctx=self.__ctx)
        training_label_arr = nd.one_hot(label_key_arr, self.__label_n)

        test_row_arr = np.arange(self.__test_observed_arr.shape[0])
        np.random.shuffle(test_row_arr)

        test_batch_arr = self.__test_observed_arr[test_row_arr[:self.batch_size]]
        test_batch_arr = mx.ndarray.array(test_batch_arr, ctx=self.__ctx)
        test_batch_arr = self.pre_normalize(test_batch_arr)

        test_label_key_arr = self.__test_label_arr[test_row_arr[:self.batch_size]]
        test_label_key_arr = mx.ndarray.array(test_label_key_arr, ctx=self.__ctx)
        test_label_arr = nd.one_hot(test_label_key_arr, self.__label_n)

        if self.__noiseable_data is not None:
            training_batch_arr = self.__noiseable_data.noise(training_batch_arr)

        yield training_batch_arr, training_label_arr, test_batch_arr, test_label_arr
def forward(self, pred, label):
    label = nd.one_hot(label, self.nclass)
    alpha_p = nd.relu(self.op - pred)
    alpha_n = nd.relu(pred - self.on)
    pred = (label * (alpha_p * (pred - self.delta_p)) +
            (1 - label) * (alpha_n * (pred - self.delta_n))) * self.scale
    return self.loss(pred, label)
def sample_v_given_h(self, h0):
    v1_prob = self.propdown(h0).reshape([-1, self.n_val])
    v1_prob = nd.softmax(v1_prob)
    v1_args = nd.sample_multinomial(v1_prob)
    v1 = nd.one_hot(v1_args, self.n_val)
    return [
        v1_prob.reshape([-1, self.n_node]),
        v1.reshape([-1, self.n_node])
    ]
def get_minibatch(data_iter):
    try:
        batch = data_iter.next()
    except StopIteration:
        data_iter.reset()
        batch = data_iter.next()
    x = batch.data[0]
    x = nd.reshape(x, (x.shape[0], -1))
    y = nd.one_hot(batch.label[0], 10)
    return x, y
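# A minimal sketch of feeding get_minibatch (the iterator setup below is an
# assumption, not from the source): mx.io.NDArrayIter provides the
# .next()/.reset() interface the function relies on.
import mxnet as mx
from mxnet import nd

data = nd.random.uniform(shape=(100, 28, 28))
label = nd.array([i % 10 for i in range(100)])
data_iter = mx.io.NDArrayIter(data, label, batch_size=32)

x, y = get_minibatch(data_iter)
print(x.shape, y.shape)  # (32, 784) and (32, 10)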
def evaluate_accuracy(data_iterator, net):
    numerator = 0.
    denominator = 0.
    loss_avg = 0.
    for i, (data, label) in enumerate(data_iterator):
        data = data.as_in_context(ctx).reshape((-1, 784))
        label = label.as_in_context(ctx)
        label_one_hot = nd.one_hot(label, 10)
        output = net(data)
        loss = cross_entropy(output, label_one_hot)
        predictions = nd.argmax(output, axis=1)
        numerator += nd.sum(predictions == label)
        denominator += data.shape[0]
        loss_avg = loss_avg * i / (i + 1) + nd.mean(loss).asscalar() / (i + 1)
    return (numerator / denominator).asscalar(), loss_avg
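# The snippet above assumes a scratch-built cross_entropy over one-hot labels.
# A plausible minimal definition, consistent with the call site (an assumption,
# not the original project's code):
from mxnet import nd

def cross_entropy(yhat, y_one_hot):
    # yhat: (batch, num_class) probabilities; y_one_hot: same shape.
    # The small epsilon guards against log(0).
    return -nd.sum(y_one_hot * nd.log(yhat + 1e-8), axis=1)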
def loss(self, data, label, train=True):
    data = data.as_in_context(ctx).reshape((data.shape[0], self.num_channel, 1, -1))
    label = label.as_in_context(ctx)
    label_one_hot = nd.one_hot(label, self.model.output_dim)
    if train:
        with autograd.record():
            output, _ = self.model.network(X=data)
            loss = softmax_cross_entropy(output, label_one_hot)
        loss.backward()
        return loss
    else:
        output, _ = self.model.network(X=data)
        loss = softmax_cross_entropy(output, label_one_hot)
        return loss, output
def plot(netG):
    num_image = 8
    for i in range(num_image):
        latent_z = mx.nd.random_normal(0, 1,
                                       shape=(1, latent_z_size, 1, 1), ctx=ctx)
        # np.random.randint's upper bound is exclusive, so use 10 to cover all
        # ten classes (the original upper bound of 9 could never sample class 9).
        y_z = mx.nd.array(np.random.randint(0, 10, size=1), ctx=ctx)
        y_z = nd.one_hot(y_z, depth=10)
        img = netG(latent_z, y_z)
        plt.subplot(2, 4, i + 1)
        visualize(img[0])
    plt.show()
def validate(self):
    self.val_iter.reset()
    val_metrics = self.get_metrics()
    val_loss, val_num = 0, 0
    for i, batch in enumerate(self.val_iter):
        data, label = unpack_batch(batch, self.context)
        output = [self.net(x) for x in data]
        loss = [
            loss_func(o, nd.one_hot(l, self.K))
            for (o, l) in zip(output, label)
        ]
        val_loss += np.sum([nd.sum(L).asnumpy() for L in loss])
        val_num += batch.data[0].shape[0]
        for val_metric in val_metrics:
            val_metric.update(label, output)
    return val_loss / val_num, val_metrics
def predict_nd(self):
    self._random_data()
    if not self.oldversion:
        self._reset_noise()
    prob_list = []
    label_list = []
    for batch_i, (data, label) in enumerate(self.test_data):
        data = data.as_in_context(ctx).reshape((data.shape[0], self.num_channel, 1, -1))
        label = label.as_in_context(ctx)
        label = nd.one_hot(label, self.model.output_dim).asnumpy()[:, 1].tolist()
        output, _ = self.model.network(X=data)
        prob = transform_softmax(output)[:, 1].asnumpy().tolist()
        prob_list.extend(prob)
        label_list.extend(label)
    return prob_list, label_list, output
def load_enum_states(self, fn):
    with open(fn, 'r') as fp:
        lines = fp.readlines()
    n_states = int(lines[0])
    dat_lst = []
    self.prob_states = nd.zeros([n_states], ctx=self.ctx)
    for i in range(1, n_states + 1):
        es = lines[i].strip().split()
        for v in range(self.n_vis):
            dat_lst.append(int(es[0][v]))
        self.prob_states[i - 1] = float(es[1])
        if self.prob_states[i - 1] < 1e-10:
            self.prob_states[i - 1] = 1e-10
    dat_lst = nd.array(dat_lst)
    self.enum_states = nd.one_hot(dat_lst, self.n_val).reshape(
        [-1, self.n_vis * self.n_val]).copyto(self.ctx)
    sys.stderr.write("Exact states info loaded!\n")
    return
def get_data(fn, n_vis, n_val):
    if fn.isdigit():  # random
        num_data = int(fn)
        prob = nd.ones([num_data * n_vis, n_val]) / n_val
        dat_lst = nd.sample_multinomial(prob)
        sys.stderr.write("Generating random data: nv= %d, nd= %d\n" % (n_vis, num_data))
    else:  # read from file
        with open(fn, 'r') as fp:
            lines = fp.readlines()
        es = lines[0].split()
        nl = int(es[0])
        nv = int(es[1])
        dat_lst = []
        sys.stderr.write("Loading data: nv= %d, nd= %d\n" % (nv, nl))
        for l in range(1, nl + 1):
            for i in range(nv):
                dat_lst.append(int(lines[l][i]))
        dat_lst = nd.array(dat_lst)
    data = nd.one_hot(dat_lst, n_val).reshape([-1, n_vis * n_val])
    return data
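# A quick sanity check of get_data's random branch (the toy numbers are
# assumptions by this note, not taken from the source): 100 random
# configurations of 16 units with 2 values each; each unit is one-hot
# encoded over its 2 possible values, hence 32 columns.
data = get_data("100", n_vis=16, n_val=2)
print(data.shape)  # (100, 32)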
def get_input_data(data, vocab_size):
    return [nd.one_hot(X, vocab_size).asnumpy() for X in data]
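# What get_input_data produces on a toy batch (values assumed for
# illustration): iterating `data` walks its first axis, so the result is one
# one-hot numpy matrix per row of `data`.
from mxnet import nd

data = nd.array([[0, 2], [1, 3]])        # two rows of token ids
inputs = get_input_data(data, vocab_size=4)
print(len(inputs), inputs[0].shape)      # 2 rows, each one-hot (2, 4)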
def forward(self, x):
    x = nd.one_hot(x, self.vsize)
    self.h1 = rnn(x, self.h1, self.W1, self.b1)
    return nd.dot(self.h1, self.Wy) + self.by
epochs = 1000
moving_loss = 0
niter = 0
l2_strength = .1
loss_seq_train = []
loss_seq_test = []
acc_seq_train = []
acc_seq_test = []

for e in range(epochs):
    for i, (data, label) in enumerate(train_data):
        data = data.as_in_context(model_ctx).reshape((-1, 784))
        label = label.as_in_context(model_ctx)
        label_one_hot = nd.one_hot(label, 10)
        with autograd.record():
            output = net(data)
            loss = cross_entropy(output, label_one_hot) + l2_strength * penalty_l2(params)
        loss.backward()
        SGD(params, 0.001)

        niter += 1
        moving_loss = 0.99 * moving_loss + .01 * nd.sum(loss).asscalar()
        # Bias-correct the exponential moving average; the denominator must be
        # 1 - 0.99**niter (a power, not a product), else it turns negative.
        est_loss = moving_loss / (1 - 0.99 ** niter)

    test_accuracy, test_loss = evaluate_accuracy(test_data, net)
    train_accuracy, train_loss = evaluate_accuracy(train_data, net)
    # save them for later
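# The loop above also assumes scratch-built penalty_l2 and SGD helpers.
# Minimal sketches consistent with the call sites (assumptions, not the
# original project's definitions):
from mxnet import nd

def penalty_l2(params):
    # Sum of squared entries across all parameter arrays.
    penalty = nd.zeros(shape=(1,))
    for p in params:
        penalty = penalty + nd.sum(p ** 2)
    return penalty

def SGD(params, lr):
    # In-place vanilla gradient step.
    for p in params:
        p[:] = p - lr * p.grad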
def GRU(epoch=100, batch_size=100, save_period=100, load_period=100,
        learning_rate=0.1, ctx=mx.gpu(0)):
    train_data, test_data = FashionMNIST(batch_size)

    # network parameters
    time_step = 28
    num_inputs = 28
    num_hidden = 200
    num_outputs = 10

    path = "weights/FashionMNIST_GRUweights-{}".format(load_period)
    if os.path.exists(path):
        print("loading weights")
        [wxz, wxr, wxh, whz, whr, whh, bz, br, bh, why, by] = nd.load(path)  # weights load
        wxz = wxz.as_in_context(ctx)
        wxr = wxr.as_in_context(ctx)
        wxh = wxh.as_in_context(ctx)  # fixed: this line was a duplicated whz
                                      # assignment, so wxh never reached ctx
        whz = whz.as_in_context(ctx)
        whr = whr.as_in_context(ctx)
        whh = whh.as_in_context(ctx)
        bz = bz.as_in_context(ctx)
        br = br.as_in_context(ctx)
        bh = bh.as_in_context(ctx)
        why = why.as_in_context(ctx)
        by = by.as_in_context(ctx)
        params = [wxz, wxr, wxh, whz, whr, whh, bz, br, bh, why, by]
    else:
        print("initializing weights")
        with ctx:
            wxz = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden, num_inputs))
            wxr = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden, num_inputs))
            wxh = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden, num_inputs))
            whz = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden, num_hidden))
            whr = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden, num_hidden))
            whh = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden, num_hidden))
            bz = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden,))
            br = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden,))
            bh = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden,))
            why = nd.random.normal(loc=0, scale=0.1, shape=(num_outputs, num_hidden))
            by = nd.random.normal(loc=0, scale=0.1, shape=(num_outputs,))
        params = [wxz, wxr, wxh, whz, whr, whh, bz, br, bh, why, by]

    # attach gradients
    for param in params:
        param.attach_grad()

    # GRU cell, unrolled over the time axis of `input`
    def GRU_Cell(input, state):
        for x in input:
            z_t = nd.Activation(nd.FullyConnected(data=x, weight=wxz, no_bias=True, num_hidden=num_hidden) +
                                nd.FullyConnected(data=state, weight=whz, no_bias=True, num_hidden=num_hidden) + bz,
                                act_type="sigmoid")
            r_t = nd.Activation(nd.FullyConnected(data=x, weight=wxr, no_bias=True, num_hidden=num_hidden) +
                                nd.FullyConnected(data=state, weight=whr, no_bias=True, num_hidden=num_hidden) + br,
                                act_type="sigmoid")
            g_t = nd.Activation(nd.FullyConnected(data=x, weight=wxh, no_bias=True, num_hidden=num_hidden) +
                                nd.FullyConnected(data=r_t * state, weight=whh, no_bias=True, num_hidden=num_hidden) + bh,
                                act_type="tanh")
            state = nd.multiply(z_t, state) + nd.multiply(1 - z_t, g_t)
        output = nd.FullyConnected(data=state, weight=why, bias=by, num_hidden=num_outputs)
        output = nd.softmax(data=output)
        return output, state

    def cross_entropy(output, label):
        return -nd.sum(label * nd.log(output), axis=0, exclude=True)

    # Adam optimizer
    state = []
    optimizer = mx.optimizer.Adam(rescale_grad=1, learning_rate=learning_rate)
    for param in params:
        state.append(optimizer.create_state(0, param))

    for i in tqdm(range(1, epoch + 1, 1)):
        for data, label in train_data:
            states = nd.zeros(shape=(data.shape[0], num_hidden), ctx=ctx)
            data = data.as_in_context(ctx)
            data = data.reshape(shape=(-1, time_step, num_inputs))
            data = nd.transpose(data=data, axes=(1, 0, 2))
            label = label.as_in_context(ctx)
            label = nd.one_hot(label, num_outputs)
            with autograd.record():
                outputs, states = GRU_Cell(data, states)
                loss = cross_entropy(outputs, label)  # (batch_size,)
            loss.backward()
            cost = nd.mean(loss).asscalar()
            for j, param in enumerate(params):
                optimizer.update(0, param, param.grad, state[j])

        test_accuracy = evaluate_accuracy(test_data, time_step, num_inputs, num_hidden, GRU_Cell, ctx)
        print(" epoch : {} , last batch cost : {}".format(i, cost))
        print("Test_acc : {0:0.3f}%".format(test_accuracy * 100))

        # weight save
        if i % save_period == 0:
            if not os.path.exists("weights"):
                os.makedirs("weights")
            print("saving weights")
            nd.save("weights/FashionMNIST_GRUweights-{}".format(i), params)

    test_accuracy = evaluate_accuracy(test_data, time_step, num_inputs, num_hidden, GRU_Cell, ctx)
    print("Test_acc : {0:0.3f}%".format(test_accuracy * 100))
    return "optimization completed"
def Train(train, test, Debug, batch_size, lr, smoothing_constant, num_fc1,
          num_fc2, num_outputs, epochs, SNR, sl, pool_type, pool_size,
          pool_stride, params_init=None, period=None):
    num_examples = train.shape[0]

    # Convert the training set into NDArrays
    y = nd.array(~train.sigma.isnull() + 0)
    X = nd.array(
        Normolise(
            train.drop([
                'mass', 'positions', 'gaps', 'max_peak', 'sigma', 'SNR_mf',
                'SNR_mf0'
            ], axis=1)))
    print('Label for training:', y.shape)
    print('Dataset for training:', X.shape, end='\n\n')
    dataset_train = gluon.data.ArrayDataset(X, y)
    train_data = gluon.data.DataLoader(dataset_train, batch_size, shuffle=True,
                                       last_batch='discard')

    y = nd.array(~test.sigma.isnull() + 0)
    X = nd.array(
        Normolise(
            test.drop([
                'mass', 'positions', 'gaps', 'max_peak', 'sigma', 'SNR_mf',
                'SNR_mf0'
            ], axis=1)))
    print('Label for testing:', y.shape)
    print('Dataset for testing:', X.shape, end='\n\n')
    # Build the test set with the data module. (The original comment said the
    # test set is not shuffled, but shuffle=True is actually passed below.)
    dataset_test = gluon.data.ArrayDataset(X, y)
    test_data = gluon.data.DataLoader(dataset_test, batch_size, shuffle=True,
                                      last_batch='discard')

    # Train
    loss_history = []
    loss_v_history = []
    moving_loss_history = []
    test_accuracy_history = []
    train_accuracy_history = []
    # assert period >= batch_size and period % batch_size == 0

    # Initialize parameters
    if params_init:
        print('Loading params...')
        params = params_init
        # [W1, b1, W2, b2, W3, b3, W4, b4, W5, b5, W6, b6, W7, b7] = params
        # # random fc layers
        # weight_scale = .01
        # W5 = nd.random_normal(loc=0, scale=weight_scale, shape=(sl, num_fc1), ctx=ctx)
        # W6 = nd.random_normal(loc=0, scale=weight_scale, shape=(num_fc1, num_fc2), ctx=ctx)
        # W7 = nd.random_normal(loc=0, scale=weight_scale, shape=(num_fc2, num_outputs), ctx=ctx)
        # b5 = nd.random_normal(shape=num_fc1, scale=weight_scale, ctx=ctx)
        # b6 = nd.random_normal(shape=num_fc2, scale=weight_scale, ctx=ctx)
        # b7 = nd.random_normal(shape=num_outputs, scale=weight_scale, ctx=ctx)
        # params = [W1, b1, W2, b2, W3, b3, W4, b4, W5, b5]
        # print('Random the FC1&2-layers...')
        vs = []
        sqrs = []
        for param in params:
            param.attach_grad()
            vs.append(param.zeros_like())
            sqrs.append(param.zeros_like())
    else:
        params, vs, sqrs = init_params(num_fc1=128, num_fc2=64, num_outputs=2, sl=sl)
        print('Initializing weights randomly...')

    # Debug
    if Debug:
        print('Debugging...')
        if params_init:
            params = params_init
        else:
            params, vs, sqrs = init_params(num_fc1=128, num_fc2=64, num_outputs=2, sl=sl)
        for data, _ in train_data:
            data = data.as_in_context(ctx).reshape((batch_size, 1, 1, -1))
            break
        _, _ = net_PLB(data, params, debug=Debug, pool_type=pool_type,
                       pool_size=pool_size, pool_stride=pool_stride)
        print()

    # total_loss = [Total_loss(train_data_10, params, batch_size, num_outputs)]
    t = 0  # Epoch starts from 1.
    print('pool_type: ', pool_type)
    print('pool_size: ', pool_size)
    print('pool_stride: ', pool_stride)
    print('sl: ', sl)

    best_test_acc = 0
    best_params_epoch = 0
    for epoch in range(1, epochs + 1):
        Epoch_loss = []
        # Learning-rate self-decay.
        if epoch > 2:
            # lr *= 0.1
            lr /= (1 + 0.01 * epoch)
        for batch_i, ((data, label), (data_v, label_v)) in enumerate(
                zip(train_data, test_data)):
            data = data.as_in_context(ctx).reshape((batch_size, 1, 1, -1))
            label = label.as_in_context(ctx)
            label_one_hot = nd.one_hot(label, num_outputs)
            with autograd.record():
                output, _ = net_PLB(data, params, pool_type=pool_type,
                                    pool_size=pool_size, pool_stride=pool_stride)
                loss = softmax_cross_entropy(output, label_one_hot)
            loss.backward()
            # print(output)
            # sgd(params, lr, batch_size)
            # Increment t before invoking adam.
            t += 1
            adam(params, vs, sqrs, lr, batch_size, t)

            data_v = data_v.as_in_context(ctx).reshape((batch_size, 1, 1, -1))
            label_v = label_v.as_in_context(ctx)
            label_v_one_hot = nd.one_hot(label_v, num_outputs)
            output_v, _ = net_PLB(data_v, params, pool_type=pool_type,
                                  pool_size=pool_size, pool_stride=pool_stride)
            loss_v = softmax_cross_entropy(output_v, label_v_one_hot)

            #########################
            # Keep a moving average of the losses
            #########################
            curr_loss = nd.mean(loss).asscalar()
            curr_loss_v = nd.mean(loss_v).asscalar()
            moving_loss = (curr_loss if ((batch_i == 0) and (epoch - 1 == 0))
                           else (1 - smoothing_constant) * moving_loss +
                           smoothing_constant * curr_loss)

            loss_history.append(curr_loss)
            loss_v_history.append(curr_loss_v)
            moving_loss_history.append(moving_loss)
            Epoch_loss.append(curr_loss)

            # if batch_i * batch_size % period == 0:
            #     print('Curr_loss: ', curr_loss)
            #     print('Working on epoch %d. Curr_loss: %.5f (complete percent: %.2f/100' % (epoch, curr_loss * 1.0, 1.0 * batch_i / (num_examples // batch_size) * 100) + ')', end='')
            #     sys.stdout.write("\r")

        # print('{"metric": "Training Loss for ALL", "value": %.5f}' % (curr_loss * 1.0))
        # print('{"metric": "Testing Loss for ALL", "value": %.5f}' % (curr_loss_v * 1.0))
        print('{"metric": "Training Loss for SNR=%s", "value": %.5f}' % (str(SNR), curr_loss * 1.0))
        print('{"metric": "Testing Loss for SNR=%s", "value": %.5f}' % (str(SNR), curr_loss_v * 1.0))

        test_accuracy = evaluate_accuracy(test_data, num_examples, batch_size,
                                          params, net_PLB, pool_type=pool_type,
                                          pool_size=pool_size, pool_stride=pool_stride)
        train_accuracy = evaluate_accuracy(train_data, num_examples, batch_size,
                                           params, net_PLB, pool_type=pool_type,
                                           pool_size=pool_size, pool_stride=pool_stride)
        test_accuracy_history.append(test_accuracy)
        train_accuracy_history.append(train_accuracy)
        if test_accuracy >= best_test_acc:
            best_test_acc = test_accuracy
            best_params_epoch = epoch
        # print("Epoch %d, Moving_loss: %.6f, Epoch_loss(mean): %.6f, Train_acc %.4f, Test_acc %.4f" %
        #       (epoch, moving_loss, np.mean(Epoch_loss), train_accuracy, test_accuracy))
        print('{"metric": "Train_acc. for SNR=%s in epoches", "value": %.4f}' % (str(SNR), train_accuracy))
        print('{"metric": "Test_acc. for SNR=%s in epoches", "value": %.4f}' % (str(SNR), test_accuracy))

        yield (params, loss_history, loss_v_history, moving_loss_history,
               test_accuracy_history, train_accuracy_history, best_params_epoch)
def CNN(epoch=100, batch_size=256, save_period=10, load_period=100,
        weight_decay=0.001, learning_rate=0.1, dataset="MNIST", ctx=mx.cpu(0)):

    # batch normalization for fully connected (2D) and 2D-convolutional (4D) inputs
    def BN(X, gamma, beta, momentum=0.9, eps=1e-5, scope_name="", is_training=True):
        if len(X.shape) == 2:
            mean = nd.mean(X, axis=0)
            variance = nd.mean(nd.square(X - mean), axis=0)
            if is_training:
                Normalized_X = (X - mean) / nd.sqrt(variance + eps)
            elif is_training == False and not os.path.exists(path1) and epoch == 0:  # no stored params yet
                Normalized_X = (X - mean) / nd.sqrt(variance + eps)
            else:
                # fixed: the closing parenthesis was misplaced in the original,
                # dividing the running mean instead of the centered input
                Normalized_X = (X - MOVING_MEANS[scope_name]) / nd.sqrt(MOVING_VARS[scope_name] + eps)
            out = gamma * Normalized_X + beta
        # For a (2D) CNN we normalize batch_size * height * width over each
        # channel, so gamma and beta have length equal to the channel count.
        # Referenced by http://gluon.mxnet.io/chapter04_convolutional-neural-networks/cnn-batch-norm-scratch.html
        elif len(X.shape) == 4:
            N, C, H, W = X.shape
            mean = nd.mean(X, axis=(0, 2, 3))  # normalize batch_size * height * width over each channel
            variance = nd.mean(nd.square(X - mean.reshape((1, C, 1, 1))), axis=(0, 2, 3))
            if is_training:
                Normalized_X = (X - mean.reshape((1, C, 1, 1))) / nd.sqrt(variance.reshape((1, C, 1, 1)) + eps)
            elif is_training == False and not os.path.exists(path1) and epoch == 0:  # no stored params yet
                Normalized_X = (X - mean.reshape((1, C, 1, 1))) / nd.sqrt(variance.reshape((1, C, 1, 1)) + eps)
            else:
                Normalized_X = (X - MOVING_MEANS[scope_name].reshape((1, C, 1, 1))) / nd.sqrt(MOVING_VARS[scope_name].reshape((1, C, 1, 1)) + eps)
            out = gamma.reshape((1, C, 1, 1))) if False else gamma.reshape((1, C, 1, 1)) * Normalized_X + beta.reshape((1, C, 1, 1))

        if scope_name not in MOVING_MEANS and scope_name not in MOVING_VARS:
            MOVING_MEANS[scope_name] = mean
            MOVING_VARS[scope_name] = variance
        else:
            MOVING_MEANS[scope_name] = MOVING_MEANS[scope_name] * momentum + mean * (1.0 - momentum)
            MOVING_VARS[scope_name] = MOVING_VARS[scope_name] * momentum + variance * (1.0 - momentum)

        return out

    # data selection
    if dataset == "MNIST":
        train_data, test_data = MNIST(batch_size)
    elif dataset == "CIFAR10":
        train_data, test_data = CIFAR10(batch_size)
    elif dataset == "FashionMNIST":
        train_data, test_data = FashionMNIST(batch_size)
    else:
        return "The dataset does not exist."

    # data structure
    if dataset == "MNIST" or dataset == "FashionMNIST":
        color = 1
    elif dataset == "CIFAR10":
        color = 3
    num_outputs = 10

    if dataset == "MNIST":
        path1 = "weights/MNIST_weights-{}".format(load_period)
        path2 = "weights/MNIST_weights_MEANS-{}".format(load_period)
        path3 = "weights/MNIST_weights_VARS-{}".format(load_period)
    elif dataset == "FashionMNIST":
        path1 = "weights/FashionMNIST_weights-{}".format(load_period)
        path2 = "weights/FashionMNIST_weights_MEANS-{}".format(load_period)
        path3 = "weights/FashionMNIST_weights_VARS-{}".format(load_period)
    elif dataset == "CIFAR10":
        path1 = "weights/CIFAR10_weights-{}".format(load_period)
        path2 = "weights/CIFAR10_weights_MEANS-{}".format(load_period)
        path3 = "weights/CIFAR10_weights_VARS-{}".format(load_period)

    if os.path.exists(path1):
        print("loading weights")
        [W1, B1, gamma1, beta1, W2, B2, gamma2, beta2, W3, B3, gamma3, beta3,
         W4, B4, gamma4, beta4, W5, B5] = nd.load(path1)  # weights load
        MOVING_MEANS = nd.load(path2)
        MOVING_VARS = nd.load(path3)
        for m, v in zip(MOVING_MEANS.values(), MOVING_VARS.values()):
            m.as_in_context(ctx)
            v.as_in_context(ctx)
        W1 = W1.as_in_context(ctx)
        B1 = B1.as_in_context(ctx)
        gamma1 = gamma1.as_in_context(ctx)
        beta1 = beta1.as_in_context(ctx)
        W2 = W2.as_in_context(ctx)
        B2 = B2.as_in_context(ctx)
        gamma2 = gamma2.as_in_context(ctx)
        beta2 = beta2.as_in_context(ctx)
        W3 = W3.as_in_context(ctx)
        B3 = B3.as_in_context(ctx)
        gamma3 = gamma3.as_in_context(ctx)
        beta3 = beta3.as_in_context(ctx)
        W4 = W4.as_in_context(ctx)
        B4 = B4.as_in_context(ctx)
        gamma4 = gamma4.as_in_context(ctx)
        beta4 = beta4.as_in_context(ctx)
        W5 = W5.as_in_context(ctx)
        B5 = B5.as_in_context(ctx)
        params = [W1, B1, gamma1, beta1, W2, B2, gamma2, beta2, W3, B3,
                  gamma3, beta3, W4, B4, gamma4, beta4, W5, B5]
    else:
        print("initializing weights")
        weight_scale = 0.1
        BN_weight_scale = 0.01
        MOVING_MEANS, MOVING_VARS = {}, {}
        with ctx:
            W1 = nd.random.normal(loc=0, scale=weight_scale, shape=(60, color, 3, 3))
            B1 = nd.random.normal(loc=0, scale=weight_scale, shape=60)
            gamma1 = nd.random.normal(shape=60, loc=1, scale=BN_weight_scale)
            beta1 = nd.random.normal(shape=60, scale=BN_weight_scale)

            W2 = nd.random.normal(loc=0, scale=weight_scale, shape=(30, 60, 6, 6))
            B2 = nd.random.normal(loc=0, scale=weight_scale, shape=30)
            gamma2 = nd.random.normal(shape=30, loc=1, scale=BN_weight_scale)
            beta2 = nd.random.normal(shape=30, scale=BN_weight_scale)

            if dataset == "CIFAR10":
                reshape = 750
            elif dataset == "MNIST" or dataset == "FashionMNIST":
                reshape = 480

            W3 = nd.random.normal(loc=0, scale=weight_scale, shape=(120, reshape))
            B3 = nd.random.normal(loc=0, scale=weight_scale, shape=120)
            gamma3 = nd.random.normal(shape=120, loc=1, scale=BN_weight_scale)
            beta3 = nd.random.normal(shape=120, scale=BN_weight_scale)

            W4 = nd.random.normal(loc=0, scale=weight_scale, shape=(64, 120))
            B4 = nd.random.normal(loc=0, scale=weight_scale, shape=64)
            gamma4 = nd.random.normal(shape=64, loc=1, scale=BN_weight_scale)
            beta4 = nd.random.normal(shape=64, scale=BN_weight_scale)

            W5 = nd.random.normal(loc=0, scale=weight_scale, shape=(num_outputs, 64))
            B5 = nd.random.normal(loc=0, scale=weight_scale, shape=num_outputs)

        params = [W1, B1, gamma1, beta1, W2, B2, gamma2, beta2, W3, B3,
                  gamma3, beta3, W4, B4, gamma4, beta4, W5, B5]

    # attach gradients
    for i, param in enumerate(params):
        param.attach_grad()

    # network - similar to LeNet-5
    '''Convolution parameter
    data: (batch_size, channel, height, width)
    weight: (num_filter, channel, kernel[0], kernel[1])
    bias: (num_filter,)
    out: (batch_size, num_filter, out_height, out_width).
    '''
    def network(X, is_training=True, drop_rate=0.0):
        # formula : output_size = ((input - weights + 2*padding) / stride) + 1
        # data size
        # MNIST, FashionMNIST = (batch size, 1, 28, 28)
        # CIFAR = (batch size, 3, 32, 32)
        C_H1 = nd.Activation(data=BN(nd.Convolution(data=X, weight=W1, bias=B1, kernel=(3, 3), stride=(1, 1), num_filter=60),
                                     gamma1, beta1, scope_name="BN1", is_training=is_training), act_type="relu")
        # MNIST : (batch size, 60, 26, 26), CIFAR10 : (batch size, 60, 30, 30)
        P_H1 = nd.Pooling(data=C_H1, pool_type="max", kernel=(2, 2), stride=(2, 2))
        # MNIST : (batch size, 60, 13, 13), CIFAR10 : (batch size, 60, 15, 15)
        C_H2 = nd.Activation(data=BN(nd.Convolution(data=P_H1, weight=W2, bias=B2, kernel=(6, 6), stride=(1, 1), num_filter=30),
                                     gamma2, beta2, scope_name="BN2", is_training=is_training), act_type="relu")
        # MNIST : (batch size, 30, 8, 8), CIFAR10 : (batch size, 30, 10, 10)
        P_H2 = nd.Pooling(data=C_H2, pool_type="max", kernel=(2, 2), stride=(2, 2))
        # MNIST : (batch size, 30, 4, 4), CIFAR10 : (batch size, 30, 5, 5)
        P_H2 = nd.flatten(data=P_H2)
        '''FullyConnected parameter
        data: (batch_size, input_dim)
        weight: (num_hidden, input_dim)
        bias: (num_hidden,)
        out: (batch_size, num_hidden)
        '''
        F_H1 = nd.Activation(BN(nd.FullyConnected(data=P_H2, weight=W3, bias=B3, num_hidden=120),
                                gamma3, beta3, scope_name="BN3", is_training=is_training), act_type="relu")
        F_H1 = nd.Dropout(data=F_H1, p=drop_rate)
        F_H2 = nd.Activation(BN(nd.FullyConnected(data=F_H1, weight=W4, bias=B4, num_hidden=64),
                                gamma4, beta4, scope_name="BN4", is_training=is_training), act_type="relu")
        F_H2 = nd.Dropout(data=F_H2, p=drop_rate)
        softmax_Y = nd.softmax(nd.FullyConnected(data=F_H2, weight=W5, bias=B5, num_hidden=10))
        return softmax_Y

    def cross_entropy(output, label):
        return -nd.sum(label * nd.log(output), axis=1)

    # Adam optimizer
    state = []
    optimizer = mx.optimizer.Adam(rescale_grad=1, learning_rate=learning_rate)
    for i, param in enumerate(params):
        state.append(optimizer.create_state(0, param))

    def SGD(params, lr, wd, bs):
        for param in params:
            param -= ((lr * param.grad) / bs + wd * param)

    for i in tqdm(range(1, epoch + 1, 1)):
        for data, label in train_data:
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            label = nd.one_hot(label, num_outputs)
            with autograd.record():
                output = network(data, is_training=True, drop_rate=0.0)
                # loss definition
                loss = cross_entropy(output, label)  # (batch_size,)
                cost = nd.mean(loss).asscalar()
            loss.backward()
            for j, param in enumerate(params):
                optimizer.update(0, param, param.grad, state[j])
            # SGD(params, learning_rate, weight_decay, batch_size)
        print(" epoch : {} , last batch cost : {}".format(i, cost))

        # weight save
        if i % save_period == 0:
            if not os.path.exists("weights"):
                os.makedirs("weights")
            print("saving weights")
            if dataset == "MNIST":
                nd.save("weights/MNIST_weights-{}".format(i), params)
                nd.save("weights/MNIST_weights_MEANS-{}".format(i), MOVING_MEANS)
                nd.save("weights/MNIST_weights_VARS-{}".format(i), MOVING_VARS)
            elif dataset == "CIFAR10":
                nd.save("weights/CIFAR10_weights-{}".format(i), params)
                nd.save("weights/CIFAR10_weights_MEANS-{}".format(i), MOVING_MEANS)
                nd.save("weights/CIFAR10_weights_VARS-{}".format(i), MOVING_VARS)
            elif dataset == "FashionMNIST":
                nd.save("weights/FashionMNIST_weights-{}".format(i), params)
                nd.save("weights/FashionMNIST_weights_MEANS-{}".format(i), MOVING_MEANS)
                nd.save("weights/FashionMNIST_weights_VARS-{}".format(i), MOVING_VARS)

    test_accuracy = evaluate_accuracy(test_data, network, ctx)
    print("Test_acc : {}".format(test_accuracy))
    return "optimization completed"
def backward(self, out_grads=None):
    # print('in backward')
    assert self.binded and self.params_initialized
    # tmp_ctx = self._ctx_cpu
    tmp_ctx = self._ctx_single_gpu
    fc7_outs = []
    ctx_fc7_max = self.get_ndarray(tmp_ctx, 'ctx_fc7_max',
                                   (self._batch_size, len(self._context)))
    # local_fc7_max = nd.zeros((self.global_label.shape[0], 1), ctx=mx.cpu())
    for i, _module in enumerate(self._arcface_modules):
        _fc7 = _module.get_outputs(merge_multi_context=True)[0]
        fc7_outs.append(_fc7)
        _fc7_max = nd.max(_fc7, axis=1).as_in_context(tmp_ctx)
        ctx_fc7_max[:, i] = _fc7_max

    local_fc7_max = self.get_ndarray(tmp_ctx, 'local_fc7_max', (self._batch_size, 1))
    nd.max(ctx_fc7_max, axis=1, keepdims=True, out=local_fc7_max)
    global_fc7_max = local_fc7_max
    # local_fc7_sum = None
    local_fc7_sum = self.get_ndarray(tmp_ctx, 'local_fc7_sum', (self._batch_size, 1))
    local_fc7_sum[:, :] = 0.0
    for i, _module in enumerate(self._arcface_modules):
        _max = self.get_ndarray2(fc7_outs[i].context, 'fc7_max', global_fc7_max)
        fc7_outs[i] = nd.broadcast_sub(fc7_outs[i], _max)
        fc7_outs[i] = nd.exp(fc7_outs[i])
        _sum = nd.sum(fc7_outs[i], axis=1, keepdims=True).as_in_context(tmp_ctx)
        local_fc7_sum += _sum
    global_fc7_sum = local_fc7_sum

    if self._iter % self._verbose == 0:
        # _ctx = self._context[-1]
        _ctx = self._ctx_cpu
        _probs = []
        for i, _module in enumerate(self._arcface_modules):
            _prob = self.get_ndarray2(_ctx, '_fc7_prob_%d' % i, fc7_outs[i])
            _probs.append(_prob)
        fc7_prob = self.get_ndarray(_ctx, 'test_fc7_prob',
                                    (self._batch_size, self._ctx_num_classes * len(self._context)))
        nd.concat(*_probs, dim=1, out=fc7_prob)
        fc7_pred = nd.argmax(fc7_prob, axis=1)
        local_label = self.global_label - self._local_class_start
        # local_label = self.get_ndarray2(_ctx, 'test_label', local_label)
        _pred = nd.equal(fc7_pred, local_label)
        print('{fc7_acc}', self._iter, nd.mean(_pred).asnumpy()[0])

    # local_fc1_grad = []
    # fc1_grad_ctx = self._ctx_cpu
    fc1_grad_ctx = self._ctx_single_gpu
    local_fc1_grad = self.get_ndarray(fc1_grad_ctx, 'local_fc1_grad',
                                      (self._batch_size, self._emb_size))
    local_fc1_grad[:, :] = 0.0
    loss = nd.zeros(shape=(self._batch_size), ctx=self._ctx_cpu)
    for i, _module in enumerate(self._arcface_modules):
        _sum = self.get_ndarray2(fc7_outs[i].context, 'fc7_sum', global_fc7_sum)
        fc7_outs[i] = nd.broadcast_div(fc7_outs[i], _sum)
        a = i * self._ctx_num_classes
        b = (i + 1) * self._ctx_num_classes
        _label = self.global_label - self._ctx_class_start[i]
        _label = self.get_ndarray2(fc7_outs[i].context, 'label', _label)
        onehot_label = self.get_ndarray(fc7_outs[i].context, 'label_onehot',
                                        (self._batch_size, self._ctx_num_classes))
        nd.one_hot(_label, depth=self._ctx_num_classes,
                   on_value=1.0, off_value=0.0, out=onehot_label)
        # for debug
        loss -= (mx.nd.sum(mx.nd.log(fc7_outs[i]) * onehot_label, axis=1)).as_in_context(self._ctx_cpu)
        fc7_outs[i] -= onehot_label
        _module.backward(out_grads=[fc7_outs[i]])
        print('for debug, fc7 outs max is ', i, mx.nd.max(fc7_outs[i]))
        print('for debug, fc7 outs min is ', i, mx.nd.min(fc7_outs[i]))
        # ctx_fc1_grad = _module.get_input_grads()[0].as_in_context(mx.cpu())
        ctx_fc1_grad = self.get_ndarray2(fc1_grad_ctx, 'ctx_fc1_grad_%d' % i,
                                         _module.get_input_grads()[0])
        local_fc1_grad += ctx_fc1_grad
        print('for debug, global fc1_grad max is ', i, mx.nd.max(ctx_fc1_grad))
        print('for debug, ctx fc1 grad shape, ', ctx_fc1_grad.shape)

    global_fc1_grad = local_fc1_grad
    # global_fc1_grad = mx.nd.clip(local_fc1_grad, a_min=-15, a_max=15)
    print('for debug, after clip global fc1_grad max is ', mx.nd.max(global_fc1_grad))
    self._curr_module.backward(out_grads=[global_fc1_grad])
    # for debug
    return mx.nd.sum(loss)
def muitlclass_logistic_regression(epoch=100, batch_size=10, save_period=10,
                                   load_period=100, weight_decay=0.001,
                                   learning_rate=0.1, dataset="MNIST",
                                   ctx=mx.gpu(0)):
    # data selection
    if dataset == "MNIST":
        train_data, test_data = MNIST(batch_size)
    elif dataset == "CIFAR10":
        train_data, test_data = CIFAR10(batch_size)
    elif dataset == "FashionMNIST":
        train_data, test_data = FashionMNIST(batch_size)
    else:
        return "The dataset does not exist."

    # data structure
    if dataset == "MNIST" or dataset == "FashionMNIST":
        num_inputs = 28 * 28
    elif dataset == "CIFAR10":
        num_inputs = 32 * 32
    num_outputs = 10

    if dataset == "MNIST":
        path = "weights/MNIST_weights-{}".format(load_period)
    elif dataset == "FashionMNIST":
        path = "weights/FashionMNIST_weights-{}".format(load_period)
    elif dataset == "CIFAR10":
        path = "weights/CIFAR10_weights-{}".format(load_period)

    if os.path.exists(path):
        print("loading weights")
        [W, B] = nd.load(path)  # weights load
        W = W.as_in_context(ctx)
        B = B.as_in_context(ctx)
        params = [W, B]
    else:
        print("initializing weights")
        with ctx:
            W = nd.random.normal(loc=0, scale=0.01, shape=(num_inputs, num_outputs))
            B = nd.random.normal(loc=0, scale=0.01, shape=num_outputs)
        params = [W, B]

    # attach gradients
    for i, param in enumerate(params):
        param.attach_grad()

    def network(X):
        Y = nd.dot(X, W) + B
        softmax_Y = nd.softmax(Y)
        return softmax_Y

    def cross_entropy(output, label):
        return -nd.sum(label * nd.log(output), axis=1)

    # Adam optimizer
    state = []
    optimizer = mx.optimizer.Adam(rescale_grad=1, learning_rate=learning_rate)
    for i, param in enumerate(params):
        state.append(optimizer.create_state(0, param))

    def SGD(params, lr, wd, bs):
        for param in params:
            param -= ((lr * param.grad) / bs + wd * param)

    for i in tqdm(range(1, epoch + 1, 1)):
        for data, label in train_data:
            if dataset == "CIFAR10":
                # keep a single channel so the flattened size matches num_inputs
                data = nd.slice_axis(data=data, axis=3, begin=0, end=1)
            data = data.as_in_context(ctx).reshape((-1, num_inputs))
            label = label.as_in_context(ctx)
            label = nd.one_hot(label, num_outputs)
            with autograd.record():
                output = network(data)
                # loss definition
                loss = cross_entropy(output, label)  # (batch_size,)
                cost = nd.mean(loss).asscalar()
            loss.backward()
            for j, param in enumerate(params):
                optimizer.update(0, param, param.grad, state[j])
            # SGD(params, learning_rate, weight_decay, batch_size)
        print(" epoch : {} , last batch cost : {}".format(i, cost))

        # weight save
        if i % save_period == 0:
            if not os.path.exists("weights"):
                os.makedirs("weights")
            print("saving weights")
            if dataset == "MNIST":
                nd.save("weights/MNIST_weights-{}".format(i), params)
            elif dataset == "CIFAR10":
                nd.save("weights/CIFAR10_weights-{}".format(i), params)
            elif dataset == "FashionMNIST":
                nd.save("weights/FashionMNIST_weights-{}".format(i), params)

    test_accuracy = evaluate_accuracy(test_data, num_inputs, network, ctx, dataset)
    print("Test_acc : {}".format(test_accuracy))
    return "optimization completed"
import numpy as np
def clsmap2channel(self, x):
    y = ndarray.one_hot(x, 11)
    y = ndarray.transpose(y, (2, 0, 1))
    return y
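# Shape walk-through for clsmap2channel (the toy input is assumed here): an
# (H, W) class map becomes an 11-channel, channel-first mask via one_hot plus
# transpose.
from mxnet import ndarray

x = ndarray.array([[0, 1], [10, 3]])  # (H=2, W=2) class ids in [0, 10]
y = ndarray.transpose(ndarray.one_hot(x, 11), (2, 0, 1))
print(y.shape)                        # (11, 2, 2)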
def backward(self, out_grads=None):
    # print('in backward')
    assert self.binded and self.params_initialized

    ## ============= forward classifier layer ===========
    fc7_outs = []
    for i, _module in enumerate(self._arcface_modules):
        _fc7 = _module.get_outputs(merge_multi_context=True)[0]
        fc7_outs.append(_fc7)

    ctx_max = map(
        lambda fc7_out: nd.max(fc7_out, axis=1, keepdims=True)
        .as_in_context(self._ctx_single_gpu), fc7_outs)
    local_fc7_max = nd.max(nd.concat(*ctx_max, dim=1), axis=1, keepdims=True)
    fc7_exps = list(
        map(
            lambda fc7_out: nd.exp(fc7_out - local_fc7_max.as_in_context(
                fc7_out.context)), fc7_outs))
    ctx_sum = map(
        lambda fc7_exp: nd.sum(fc7_exp, axis=1, keepdims=True)
        .as_in_context(self._ctx_single_gpu), fc7_exps)
    exp_sum = nd.sum(nd.concat(*ctx_sum, dim=1), axis=1, keepdims=True)
    softmax_outs = list(
        map(
            lambda fc7_exp: nd.broadcast_div(
                fc7_exp, exp_sum.as_in_context(fc7_exp.context)), fc7_exps))

    onehot_device_labels = [
        nd.one_hot((self.global_label).as_in_context(device) -
                   self._ctx_class_start[i],
                   depth=self._ctx_num_classes,
                   on_value=1.0, off_value=0.0)
        for i, device in enumerate(self._context)
    ]

    ## ============= verbose train accuracy and loss ===========
    if self._iter % self._verbose == 0:
        local_label = self.global_label - self._local_class_start
        fc7_pred = self.parall_argmax(softmax_outs, self._ctx_single_gpu)
        # note: the original indexed [0] and so averaged a single entry;
        # assuming parall_argmax returns a (batch,) vector, average them all
        _pred = nd.equal(fc7_pred, local_label).asnumpy()
        loss = self.parall_loss(softmax_outs, onehot_device_labels,
                                self._ctx_single_gpu).asscalar()
        assert not math.isnan(loss)
        self.logger.info('[Iter {}] train acc : {}, total loss : {}'.format(
            self._iter, np.mean(_pred), loss))

    ## ============= backward large weight classifier layer with gradient ===========
    local_fc1_grad = self.get_ndarray_by_shape(
        self._ctx_single_gpu, 'local_fc1_grad',
        (self._batch_size, self._emb_size))
    local_fc1_grad[:, :] = 0.0

    for i, _module in enumerate(self._arcface_modules):
        # softmax minus one-hot label is the cross-entropy gradient w.r.t. fc7
        _module.backward(out_grads=[softmax_outs[i] - onehot_device_labels[i]])
        ctx_fc1_grad = self.get_ndarray_by_v_arr(
            self._ctx_single_gpu, 'ctx_fc1_grad_%d' % i,
            _module.get_input_grads()[0])
        local_fc1_grad += ctx_fc1_grad

    ## ============= backward backbone ===============
    global_fc1_grad = local_fc1_grad
    self._backbone_module.backward(out_grads=[global_fc1_grad])
def forward(self, x):
    x = nd.one_hot(x, self.vsize)
    self.h1 = gru(x, self.h1, *self.a1)
    return nd.dot(self.h1, self.Wy) + self.by
def forward(self, x):
    x = nd.one_hot(x, self.vsize)
    self.s1 = lstm(x, *self.s1, *self.a1)
    return nd.dot(self.s1[0], self.Wy) + self.by
def CNN(epoch=100, batch_size=10, save_period=10, load_period=100,
        weight_decay=0.001, learning_rate=0.1, dataset="MNIST", ctx=mx.cpu(0)):
    # data selection
    if dataset == "MNIST":
        train_data, test_data = MNIST(batch_size)
    elif dataset == "CIFAR10":
        train_data, test_data = CIFAR10(batch_size)
    elif dataset == "FashionMNIST":
        train_data, test_data = FashionMNIST(batch_size)
    else:
        return "The dataset does not exist."

    # data structure
    if dataset == "MNIST" or dataset == "FashionMNIST":
        color = 1
    elif dataset == "CIFAR10":
        color = 3
    num_outputs = 10

    if dataset == "MNIST":
        path = "weights/MNIST_weights-{}".format(load_period)
    elif dataset == "FashionMNIST":
        path = "weights/FashionMNIST_weights-{}".format(load_period)
    elif dataset == "CIFAR10":
        path = "weights/CIFAR10_weights-{}".format(load_period)

    if os.path.exists(path):
        print("loading weights")
        [W1, B1, W2, B2, W3, B3, W4, B4, W5, B5] = nd.load(path)  # weights load
        W1 = W1.as_in_context(ctx)
        B1 = B1.as_in_context(ctx)
        W2 = W2.as_in_context(ctx)
        B2 = B2.as_in_context(ctx)
        W3 = W3.as_in_context(ctx)
        B3 = B3.as_in_context(ctx)
        W4 = W4.as_in_context(ctx)
        B4 = B4.as_in_context(ctx)
        W5 = W5.as_in_context(ctx)
        B5 = B5.as_in_context(ctx)
        params = [W1, B1, W2, B2, W3, B3, W4, B4, W5, B5]
    else:
        print("initializing weights")
        with ctx:
            W1 = nd.random.normal(loc=0, scale=0.1, shape=(60, color, 3, 3))
            B1 = nd.random.normal(loc=0, scale=0.1, shape=60)
            W2 = nd.random.normal(loc=0, scale=0.1, shape=(30, 60, 6, 6))
            B2 = nd.random.normal(loc=0, scale=0.1, shape=30)
            if dataset == "CIFAR10":
                reshape = 750
            elif dataset == "MNIST" or dataset == "FashionMNIST":
                reshape = 480
            W3 = nd.random.normal(loc=0, scale=0.1, shape=(120, reshape))
            B3 = nd.random.normal(loc=0, scale=0.1, shape=120)
            W4 = nd.random.normal(loc=0, scale=0.1, shape=(64, 120))
            B4 = nd.random.normal(loc=0, scale=0.1, shape=64)
            W5 = nd.random.normal(loc=0, scale=0.1, shape=(num_outputs, 64))
            B5 = nd.random.normal(loc=0, scale=0.1, shape=num_outputs)
        params = [W1, B1, W2, B2, W3, B3, W4, B4, W5, B5]

    # attach gradients
    for i, param in enumerate(params):
        param.attach_grad()

    # network - similar to LeNet-5
    '''Convolution parameter
    data: (batch_size, channel, height, width)
    weight: (num_filter, channel, kernel[0], kernel[1])
    bias: (num_filter,)
    out: (batch_size, num_filter, out_height, out_width).
    '''
    def network(X, drop_rate=0.0):
        # formula : output_size = ((input - weights + 2*padding) / stride) + 1
        # data size
        # MNIST, FashionMNIST = (batch size, 1, 28, 28)
        # CIFAR = (batch size, 3, 32, 32)
        C_H1 = nd.Activation(data=nd.Convolution(data=X, weight=W1, bias=B1, kernel=(3, 3), stride=(1, 1), num_filter=60), act_type="relu")
        # MNIST : (batch size, 60, 26, 26), CIFAR10 : (batch size, 60, 30, 30)
        P_H1 = nd.Pooling(data=C_H1, pool_type="max", kernel=(2, 2), stride=(2, 2))
        # MNIST : (batch size, 60, 13, 13), CIFAR10 : (batch size, 60, 15, 15)
        C_H2 = nd.Activation(data=nd.Convolution(data=P_H1, weight=W2, bias=B2, kernel=(6, 6), stride=(1, 1), num_filter=30), act_type="relu")
        # MNIST : (batch size, 30, 8, 8), CIFAR10 : (batch size, 30, 10, 10)
        P_H2 = nd.Pooling(data=C_H2, pool_type="max", kernel=(2, 2), stride=(2, 2))
        # MNIST : (batch size, 30, 4, 4), CIFAR10 : (batch size, 30, 5, 5)
        P_H2 = nd.flatten(data=P_H2)
        '''FullyConnected parameter
        data: (batch_size, input_dim)
        weight: (num_hidden, input_dim)
        bias: (num_hidden,)
        out: (batch_size, num_hidden)
        '''
        F_H1 = nd.Activation(nd.FullyConnected(data=P_H2, weight=W3, bias=B3, num_hidden=120), act_type="sigmoid")
        F_H1 = nd.Dropout(data=F_H1, p=drop_rate)
        F_H2 = nd.Activation(nd.FullyConnected(data=F_H1, weight=W4, bias=B4, num_hidden=64), act_type="sigmoid")
        F_H2 = nd.Dropout(data=F_H2, p=drop_rate)
        softmax_Y = nd.softmax(nd.FullyConnected(data=F_H2, weight=W5, bias=B5, num_hidden=10))
        return softmax_Y

    def cross_entropy(output, label):
        return -nd.sum(label * nd.log(output), axis=1)

    # Adam optimizer
    state = []
    optimizer = mx.optimizer.Adam(rescale_grad=1, learning_rate=learning_rate)
    for i, param in enumerate(params):
        state.append(optimizer.create_state(0, param))

    def SGD(params, lr, wd, bs):
        for param in params:
            param -= ((lr * param.grad) / bs + wd * param)

    for i in tqdm(range(1, epoch + 1, 1)):
        for data, label in train_data:
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            label = nd.one_hot(label, num_outputs)
            with autograd.record():
                output = network(data, drop_rate=0.2)
                # loss definition
                loss = cross_entropy(output, label)  # (batch_size,)
                cost = nd.mean(loss).asscalar()
            loss.backward()
            for j, param in enumerate(params):
                optimizer.update(0, param, param.grad, state[j])
            # SGD(params, learning_rate, weight_decay, batch_size)
        print(" epoch : {} , last batch cost : {}".format(i, cost))

        # weight save
        if i % save_period == 0:
            if not os.path.exists("weights"):
                os.makedirs("weights")
            print("saving weights")
            if dataset == "MNIST":
                nd.save("weights/MNIST_weights-{}".format(i), params)
            elif dataset == "CIFAR10":
                nd.save("weights/CIFAR10_weights-{}".format(i), params)
            elif dataset == "FashionMNIST":
                nd.save("weights/FashionMNIST_weights-{}".format(i), params)

    test_accuracy = evaluate_accuracy(test_data, network, ctx)
    print("Test_acc : {}".format(test_accuracy))
    return "optimization completed"
def backward(self, out_grads=None):
    # print('in backward')
    assert self.binded and self.params_initialized
    # tmp_ctx = self._ctx_cpu
    tmp_ctx = self._ctx_single_gpu
    fc7_outs = []
    ctx_fc7_max = self.get_ndarray(tmp_ctx, 'ctx_fc7_max',
                                   (self._batch_size, len(self._context)))
    # local_fc7_max = nd.zeros((self.global_label.shape[0], 1), ctx=mx.cpu())
    arcface_module_outputs = []
    for i, _module in enumerate(self._arcface_modules):
        # _fc7 = _module.get_outputs(merge_multi_context=True)[0]
        out = _module.get_outputs(merge_multi_context=True)
        # print(out[0].shape)
        # print(out[1].shape)
        arcface_module_outputs.append(out)
        _fc7 = out[0]
        fc7_outs.append(_fc7)
        _fc7_max = nd.max(_fc7, axis=1).as_in_context(tmp_ctx)
        ctx_fc7_max[:, i] = _fc7_max

    local_fc7_max = self.get_ndarray(tmp_ctx, 'local_fc7_max', (self._batch_size, 1))
    nd.max(ctx_fc7_max, axis=1, keepdims=True, out=local_fc7_max)
    global_fc7_max = local_fc7_max
    # local_fc7_sum = None
    local_fc7_sum = self.get_ndarray(tmp_ctx, 'local_fc7_sum', (self._batch_size, 1))
    local_fc7_sum[:, :] = 0.0
    for i, _module in enumerate(self._arcface_modules):
        _max = self.get_ndarray2(fc7_outs[i].context, 'fc7_max', global_fc7_max)
        fc7_outs[i] = nd.broadcast_sub(fc7_outs[i], _max)
        fc7_outs[i] = nd.exp(fc7_outs[i])
        _sum = nd.sum(fc7_outs[i], axis=1, keepdims=True).as_in_context(tmp_ctx)
        local_fc7_sum += _sum
    global_fc7_sum = local_fc7_sum

    if self._iter % self._verbose == 0:
        # _ctx = self._context[-1]
        _ctx = self._ctx_cpu
        _probs = []
        for i, _module in enumerate(self._arcface_modules):
            _prob = self.get_ndarray2(_ctx, '_fc7_prob_%d' % i, fc7_outs[i])
            _probs.append(_prob)
        fc7_prob = self.get_ndarray(_ctx, 'test_fc7_prob',
                                    (self._batch_size, self._ctx_num_classes * len(self._context)))
        nd.concat(*_probs, dim=1, out=fc7_prob)
        fc7_pred = nd.argmax(fc7_prob, axis=1)
        local_label = self.global_label - self._local_class_start
        # local_label = self.get_ndarray2(_ctx, 'test_label', local_label)
        _pred = nd.equal(fc7_pred, local_label)
        print('{fc7_acc}', self._iter, nd.mean(_pred).asnumpy()[0])

    # local_fc1_grad = []
    # fc1_grad_ctx = self._ctx_cpu
    fc1_grad_ctx = self._ctx_single_gpu
    local_fc1_grad = self.get_ndarray(fc1_grad_ctx, 'local_fc1_grad',
                                      (self._batch_size, self._emb_size))
    local_fc1_grad[:, :] = 0.0
    total_eloss = []
    celoss_verbose = 1000
    if self._iter % celoss_verbose == 0:
        fc7_celoss = self.get_ndarray(tmp_ctx, 'test_fc7_celoss', (self._batch_size,))
        fc7_celoss[:] = 0.0

    for i, _module in enumerate(self._arcface_modules):
        _sum = self.get_ndarray2(fc7_outs[i].context, 'fc7_sum', global_fc7_sum)
        fc7_outs[i] = nd.broadcast_div(fc7_outs[i], _sum)
        a = i * self._ctx_num_classes
        b = (i + 1) * self._ctx_num_classes
        _label = self.global_label - self._ctx_class_start[i]
        _label = self.get_ndarray2(fc7_outs[i].context, 'label', _label)
        onehot_label = self.get_ndarray(fc7_outs[i].context, 'label_onehot',
                                        (self._batch_size, self._ctx_num_classes))
        nd.one_hot(_label, depth=self._ctx_num_classes,
                   on_value=1.0, off_value=0.0, out=onehot_label)
        # print(fc7_outs[i].shape, onehot_label.shape)
        if self._iter % celoss_verbose == 0:
            _ce_loss = fc7_outs[i] * onehot_label
            _ce_loss = nd.sum(_ce_loss, axis=1)
            fc7_celoss += _ce_loss.as_in_context(tmp_ctx)
        fc7_outs[i] -= onehot_label

        out = arcface_module_outputs[i]
        out_grads = [fc7_outs[i]]
        for j in range(1, len(out)):
            eloss = out[j]
            # print('eloss%d:' % j, eloss.shape)
            # print(out_grads[0].shape)
            # egrad_shape = (out_grads[0].shape[0], eloss.shape[0])
            egrad_shape = eloss.shape
            egrad = self.get_ndarray(fc7_outs[i].context, 'egrad%d' % j, egrad_shape)
            # egrad[:, :] = 1.0 / egrad_shape[0]
            egrad[:, :] = 1.0
            out_grads.append(egrad)
            if self._iter % self._verbose == 0:
                total_eloss.append(np.mean(eloss.asnumpy()))

        _module.backward(out_grads=out_grads)
        # ctx_fc1_grad = _module.get_input_grads()[0].as_in_context(mx.cpu())
        ctx_fc1_grad = self.get_ndarray2(fc1_grad_ctx, 'ctx_fc1_grad_%d' % i,
                                         _module.get_input_grads()[0])
        local_fc1_grad += ctx_fc1_grad

    if self._iter % self._verbose == 0 and len(total_eloss) > 0:
        print('{eloss}', self._iter, np.mean(total_eloss))
    # if self._iter % self._verbose == 0:
    if self._iter % celoss_verbose == 0:
        ce_loss = nd.log(fc7_celoss) * -1.0
        ce_loss = nd.mean(ce_loss)
        print('CELOSS,%d,%f' % (self._iter, ce_loss.asnumpy()))

    global_fc1_grad = local_fc1_grad
    self._curr_module.backward(out_grads=[global_fc1_grad])
def train():
    for epoch in range(num_epochs):
        btic = time.time()
        i = 0
        # import pdb
        # pdb.set_trace()
        for data, labels in test_data:
            real_label = nd.ones([labels.shape[0], ], ctx=ctx)
            fake_label = nd.zeros([labels.shape[0]], ctx=ctx)
            labels = labels.as_in_context(ctx)
            x = data.as_in_context(ctx)
            y = nd.one_hot(labels, depth=10)
            # z = mx.nd.random_normal(0, 1, shape=(batch_size, latent_z_size, 1, 1), ctx=ctx)
            z = mx.nd.random_normal(0, 1, shape=(labels.shape[0], latent_z_size, 1, 1), ctx=ctx)
            # np.random.randint's upper bound is exclusive, so use 10 so class 9
            # can also be sampled (the original bound of 9 never produced it).
            # y_z = mx.nd.array(np.random.randint(0, 10, size=batch_size), ctx=ctx)
            y_z = mx.nd.array(np.random.randint(0, 10, size=labels.shape[0]), ctx=ctx)
            y_z = nd.one_hot(y_z, depth=10)

            # Train Discriminator
            with autograd.record():
                output = netD(x, y)
                errD_real = loss(output, real_label)
                logging.info(
                    f"YuWang: shapes: x: {x.shape}, y: {y.shape}, out: {output.shape}, real_label: {real_label.shape}")
                fake = netG(z, y_z)
                output = netD(fake.detach(), y_z)
                errD_fake = loss(output, fake_label)
                logging.info(
                    f"YuWang: shapes: out: {output.shape}, real_label: {real_label.shape}, fake_label: {fake_label.shape}, errD_real: {errD_real.shape}, errD_fake: {errD_fake.shape}")
                errD = errD_real + errD_fake
            errD.backward()
            trainerD.step(data.shape[0])

            # Train Generator
            with autograd.record():
                fake = netG(z, y_z)
                output = netD(fake, y_z)
                errG = loss(output, real_label)
            errG.backward()
            trainerG.step(data.shape[0])

            if i % 50 == 0:
                logging.info(f'speed: {batch_size / (time.time() - btic)} samples/s')
                logging.info(
                    f'discriminator loss = {nd.mean(errD).asscalar()}, generator loss = {nd.mean(errG).asscalar()} at iter {i} epoch {epoch}')
            i = i + 1
            btic = time.time()
        if epoch % 5 == 0:
            netD.save_params("netD.params")
            netG.save_params("netG.params")
def get_inputs(data):
    return [nd.one_hot(X, vocab_size) for X in data.T]
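# Shape walk-through for get_inputs (toy values assumed; vocab_size is a
# module-level global in the original snippet): data.T iterates over time
# steps, so each element is one (batch, vocab_size) one-hot matrix.
from mxnet import nd

vocab_size = 5
data = nd.array([[0, 1, 2], [3, 4, 0]])  # (batch=2, num_steps=3)
inputs = get_inputs(data)
print(len(inputs), inputs[0].shape)      # 3 time steps, each (2, 5)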