def run_boston_housing_DistilledSGLD():
    X, Y, X_test, Y_test, X_mean, X_std, Y_mean, Y_std = load_boston_housing()
    print(X.shape, Y.shape, X_test.shape, Y_test.shape)
    minibatch_size = 1
    teacher_noise_precision = 1.25
    teacher_net = get_boston_housing_sym(True, teacher_noise_precision)
    student_net = get_boston_housing_sym(False)
    data_shape = (minibatch_size,) + X.shape[1::]
    teacher_data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
                           'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev())}
    student_data_inputs = {'data': nd.zeros(data_shape, ctx=dev())}
    # 'softmax_label': nd.zeros((minibatch_size, 10), ctx=dev())}
    teacher_initializer = BiasXavier(factor_type="in", magnitude=1)
    student_initializer = BiasXavier(factor_type="in", magnitude=1)
    student_grad_f = lambda student_outputs, teacher_pred: \
        regression_student_grad(student_outputs, teacher_pred, teacher_noise_precision)
    student_exe, student_params, _ = \
        DistilledSGLD(teacher_sym=teacher_net, student_sym=student_net,
                      teacher_data_inputs=teacher_data_inputs,
                      student_data_inputs=student_data_inputs,
                      X=X, Y=Y, X_test=X_test, Y_test=Y_test,
                      X_mean=X_mean, X_std=X_std, Y_mean=Y_mean, Y_std=Y_std,
                      total_iter_num=5000000,
                      teacher_initializer=teacher_initializer,
                      student_initializer=student_initializer,
                      teacher_learning_rate=2E-7, student_learning_rate=1E-2,
                      student_optimizing_algorithm='sgd',
                      teacher_lr_scheduler=mx.lr_scheduler.FactorScheduler(80000, 0.5, 1E-7),
                      student_lr_scheduler=mx.lr_scheduler.FactorScheduler(step=5000, factor=0.8,
                                                                           stop_factor_lr=1E-6),
                      student_grad_f=student_grad_f,
                      teacher_prior_precision=2.5, student_prior_precision=0.001,
                      perturb_deviation=0.05, minibatch_size=minibatch_size,
                      task='boston', dev=dev())
def run_toy_DistilledSGLD(gpu_id=None):
    X, Y, X_test, Y_test = load_toy()
    minibatch_size = 1
    teacher_noise_precision = 1.0
    teacher_net = get_toy_sym(True, teacher_noise_precision)
    student_net = get_toy_sym(False)
    data_shape = (minibatch_size,) + X.shape[1::]
    teacher_data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
                           'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev(gpu_id))}
    student_data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id))}
    teacher_initializer = mx.init.Uniform(0.07)
    student_initializer = mx.init.Uniform(0.07)
    student_grad_f = lambda student_outputs, teacher_pred: \
        regression_student_grad(student_outputs, teacher_pred, teacher_noise_precision)
    student_exe, student_params, _ = \
        DistilledSGLD(teacher_sym=teacher_net, student_sym=student_net,
                      teacher_data_inputs=teacher_data_inputs,
                      student_data_inputs=student_data_inputs,
                      X=X, Y=Y, X_test=X_test, Y_test=Y_test,
                      total_iter_num=80000,
                      teacher_initializer=teacher_initializer,
                      student_initializer=student_initializer,
                      teacher_learning_rate=1E-4, student_learning_rate=0.01,
                      # teacher_lr_scheduler=mx.lr_scheduler.FactorScheduler(100000, 0.5),
                      student_lr_scheduler=mx.lr_scheduler.FactorScheduler(8000, 0.8),
                      student_grad_f=student_grad_f,
                      teacher_prior_precision=0.1, student_prior_precision=0.001,
                      perturb_deviation=0.1, minibatch_size=minibatch_size,
                      task='regression', dev=dev(gpu_id))
def run_toy_SGLD(gpu_id=None):
    """Run SGLD on toy dataset"""
    X, Y, X_test, Y_test = load_toy()
    minibatch_size = 1
    teacher_noise_precision = 1.0 / 9.0
    net = get_toy_sym(True, teacher_noise_precision)
    data_shape = (minibatch_size,) + X.shape[1::]
    data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
                   'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev(gpu_id))}
    initializer = mx.init.Uniform(0.07)
    exe, params, _ = SGLD(sym=net,
                          data_inputs=data_inputs,
                          X=X, Y=Y, X_test=X_test, Y_test=Y_test,
                          total_iter_num=50000,
                          initializer=initializer,
                          learning_rate=1E-4,
                          # lr_scheduler=mx.lr_scheduler.FactorScheduler(100000, 0.5),
                          prior_precision=0.1,
                          burn_in_iter_num=1000,
                          thin_interval=10,
                          task='regression',
                          minibatch_size=minibatch_size,
                          dev=dev(gpu_id))  # disable=unbalanced-tuple-unpacking
def _get_or_reshape(name, shared_data_arrays, arg_shape, arg_type, context, logger):
    """Internal helper to get a memory block or re-use by re-shaping"""
    if name in shared_data_arrays:
        arg_arr = shared_data_arrays[name]
        if np.prod(arg_arr.shape) >= np.prod(arg_shape):
            # nice, we can directly re-use this data blob
            assert arg_arr.dtype == arg_type
            arg_arr = arg_arr.reshape(arg_shape)
        else:
            logger.warning(('bucketing: data "%s" has a shape %s' % (name, arg_shape)) +
                           (', which is larger than already allocated ') +
                           ('shape %s' % (arg_arr.shape,)) +
                           ('. Need to re-allocate. Consider putting ') +
                           ('default_bucket_key to') +
                           (' be the bucket taking the largest input for better ') +
                           ('memory sharing.'))
            arg_arr = nd.zeros(arg_shape, context, dtype=arg_type)
            # replace existing shared array because the new one is bigger
            shared_data_arrays[name] = arg_arr
    else:
        arg_arr = nd.zeros(arg_shape, context, dtype=arg_type)
        shared_data_arrays[name] = arg_arr
    return arg_arr
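# Hedged usage sketch (not part of the original source): exercising _get_or_reshape with a
# hypothetical shared_data_arrays dict. The first call allocates, the second re-uses the
# block via a same-size reshape, and the larger third request falls into the warning /
# re-allocation branch. Assumes mxnet and numpy are importable as below.
import logging
import numpy as np
import mxnet as mx
from mxnet import nd

shared = {}
logger = logging.getLogger(__name__)
a = _get_or_reshape('data', shared, (4, 8), np.float32, mx.cpu(), logger)   # allocate
b = _get_or_reshape('data', shared, (8, 4), np.float32, mx.cpu(), logger)   # re-use via reshape
c = _get_or_reshape('data', shared, (16, 8), np.float32, mx.cpu(), logger)  # re-allocate (warns)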
def main(args): ctx = mx.gpu(args.gpu) args.ctx_num = 1 prop = face_image.load_property(args.data) image_size = prop.image_size print('image_size', image_size) vec = args.model.split(',') prefix = vec[0] epoch = int(vec[1]) print('loading',prefix, epoch) sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) arg_params, aux_params = ch_dev(arg_params, aux_params, ctx) all_layers = sym.get_internals() sym = all_layers['fc1_output'] #model = mx.mod.Module.load(prefix, epoch, context = ctx) model = mx.mod.Module(symbol=sym, context=ctx, label_names = None) #model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))]) model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))]) model.set_params(arg_params, aux_params) path_imgrec = os.path.join(args.data, 'train.rec') path_imgidx = os.path.join(args.data, 'train.idx') imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') # pylint: disable=redefined-variable-type s = imgrec.read_idx(0) header, _ = mx.recordio.unpack(s) assert header.flag>0 print('header0 label', header.label) header0 = (int(header.label[0]), int(header.label[1])) #assert(header.flag==1) imgidx = range(1, int(header.label[0])) stat = [] count = 0 data = nd.zeros( (1 ,3, image_size[0], image_size[1]) ) label = nd.zeros( (1,) ) for idx in imgidx: if len(stat)%100==0: print('processing', len(stat)) s = imgrec.read_idx(idx) header, img = mx.recordio.unpack(s) img = mx.image.imdecode(img) img = nd.transpose(img, axes=(2, 0, 1)) data[0][:] = img #input_blob = np.expand_dims(img.asnumpy(), axis=0) #arg_params["data"] = mx.nd.array(input_blob, ctx) #arg_params["softmax_label"] = mx.nd.empty((1,), ctx) time_now = datetime.datetime.now() #exe = sym.bind(ctx, arg_params ,args_grad=None, grad_req="null", aux_states=aux_params) #exe.forward(is_train=False) #_embedding = exe.outputs[0].asnumpy().flatten() #db = mx.io.DataBatch(data=(data,), label=(label,)) db = mx.io.DataBatch(data=(data,)) model.forward(db, is_train=False) net_out = model.get_outputs()[0].asnumpy() time_now2 = datetime.datetime.now() diff = time_now2 - time_now stat.append(diff.total_seconds()) if len(stat)==args.param1: break stat = stat[10:] print('avg infer time', np.mean(stat))
def train(input_variable, target_variable, encoder, decoder, teacher_forcing_ratio,
          encoder_optimizer, decoder_optimizer, criterion, max_length, ctx):
    with autograd.record():
        loss = F.zeros((1,), ctx=ctx)

        encoder_hidden = encoder.initHidden(ctx)

        input_length = input_variable.shape[0]
        target_length = target_variable.shape[0]

        encoder_outputs, encoder_hidden = encoder(
            input_variable.expand_dims(0), encoder_hidden)

        if input_length < max_length:
            encoder_outputs = F.concat(
                encoder_outputs.flatten(),
                F.zeros((max_length - input_length, encoder.hidden_size), ctx=ctx), dim=0)
        else:
            encoder_outputs = encoder_outputs.flatten()

        decoder_input = F.array([SOS_token], ctx=ctx)
        decoder_hidden = encoder_hidden

        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

        if use_teacher_forcing:
            # Teacher forcing: Feed the target as the next input
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                loss = F.add(loss, criterion(decoder_output, target_variable[di]))
                print(criterion(decoder_output, target_variable[di]))
                decoder_input = target_variable[di]  # Teacher forcing
        else:
            # Without teacher forcing: use its own predictions as the next input
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                topi = decoder_output.argmax(axis=1)

                decoder_input = F.array([topi.asscalar()], ctx=ctx)

                loss = F.add(loss, criterion(decoder_output, target_variable[di]))

                if topi.asscalar() == EOS_token:
                    break

    loss.backward()

    encoder_optimizer.step(1)
    decoder_optimizer.step(1)

    return loss.asscalar() / target_length
def weights_init(layers):
    for layer in layers:
        classname = layer.__class__.__name__
        if hasattr(layer, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1):
            layer.weight.set_data(nd.random.normal(0.0, 0.02, shape=layer.weight.data().shape))
            if hasattr(layer, 'bias') and layer.bias is not None:
                layer.bias.set_data(nd.zeros(layer.bias.data().shape))
        elif classname.find('BatchNorm') != -1:
            # BatchNorm stores its scale/shift as gamma/beta
            layer.gamma.set_data(nd.random.normal(1.0, 0.02, shape=layer.gamma.data().shape))
            layer.beta.set_data(nd.zeros(layer.beta.data().shape))
def get_params():
    W_xh = nd.random_normal(scale=std, shape=(input_dim, hidden_dim), ctx=ctx)
    W_hh = nd.random_normal(scale=std, shape=(hidden_dim, hidden_dim), ctx=ctx)
    b_h = nd.zeros(hidden_dim, ctx=ctx)

    W_hy = nd.random_normal(scale=std, shape=(hidden_dim, output_dim), ctx=ctx)
    b_y = nd.zeros(output_dim, ctx=ctx)

    params = [W_xh, W_hh, b_h, W_hy, b_y]
    for param in params:
        param.attach_grad()
    return params
def run_toy_HMC(gpu_id=None):
    X, Y, X_test, Y_test = load_toy()
    minibatch_size = Y.shape[0]
    noise_precision = 1 / 9.0
    net = get_toy_sym(True, noise_precision)
    data_shape = (minibatch_size,) + X.shape[1::]
    data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
                   'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev(gpu_id))}
    initializer = mx.init.Uniform(0.07)
    sample_pool = HMC(net, data_inputs=data_inputs, X=X, Y=Y, X_test=X_test, Y_test=Y_test,
                      sample_num=300000, initializer=initializer, prior_precision=1.0,
                      learning_rate=1E-3, L=10, dev=dev(gpu_id))
def get_parameters():
    W_xh = nd.random_normal(scale=config.std, shape=(config.input_dim, config.hidden_dim))
    W_hh = nd.random_normal(scale=config.std, shape=(config.hidden_dim, config.hidden_dim))
    b_h = nd.zeros(config.hidden_dim)
    W_hy = nd.random_normal(scale=config.std, shape=(config.hidden_dim, config.output_dim))
    b_y = nd.zeros(config.output_dim)

    parameters = [W_xh, W_hh, b_h, W_hy, b_y]
    for parameter in parameters:
        parameter.attach_grad()
    return parameters
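# Hedged sketch (assumption, not from the source): a single vanilla-RNN step consuming the
# parameters returned by get_parameters(); the `config` fields used above (std, input_dim,
# hidden_dim, output_dim) are assumed to be defined elsewhere.
from mxnet import nd

def rnn_step(x, h, W_xh, W_hh, b_h, W_hy, b_y):
    # new hidden state, then output logits
    h = nd.tanh(nd.dot(x, W_xh) + nd.dot(h, W_hh) + b_h)
    y = nd.dot(h, W_hy) + b_y
    return y, h

# x = nd.random_normal(shape=(2, config.input_dim))
# h0 = nd.zeros((2, config.hidden_dim))
# y, h1 = rnn_step(x, h0, *get_parameters())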
def run_mnist_SGD(training_num=50000, gpu_id=None):
    X, Y, X_test, Y_test = load_mnist(training_num)
    minibatch_size = 100
    net = get_mnist_sym()
    data_shape = (minibatch_size,) + X.shape[1::]
    data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
                   'softmax_label': nd.zeros((minibatch_size,), ctx=dev(gpu_id))}
    initializer = mx.init.Xavier(factor_type="in", magnitude=2.34)
    exe, exe_params, _ = SGD(sym=net, dev=dev(gpu_id), data_inputs=data_inputs,
                             X=X, Y=Y, X_test=X_test, Y_test=Y_test,
                             total_iter_num=1000000,
                             initializer=initializer,
                             lr=5E-6, prior_precision=1.0, minibatch_size=100)
def reset_c2c(self):
    self.select_triplets()
    for identity, v in self.id2range.iteritems():
        _list = range(*v)
        ocontents = []
        for idx in _list:
            s = self.imgrec.read_idx(idx)
            ocontents.append(s)
        embeddings = None
        #print(len(ocontents))
        ba = 0
        while True:
            bb = min(ba + args.batch_size, len(ocontents))
            if ba >= bb:
                break
            _batch_size = bb - ba
            _batch_size2 = max(_batch_size, args.ctx_num)
            data = nd.zeros((_batch_size2, 3, image_size[0], image_size[1]))
            label = nd.zeros((_batch_size2,))
            count = bb - ba
            ii = 0
            for i in xrange(ba, bb):
                header, img = mx.recordio.unpack(ocontents[i])
                img = mx.image.imdecode(img)
                img = nd.transpose(img, axes=(2, 0, 1))
                data[ii][:] = img
                label[ii][:] = header.label
                ii += 1
            while ii < _batch_size2:
                data[ii][:] = data[0][:]
                label[ii][:] = label[0][:]
                ii += 1
            db = mx.io.DataBatch(data=(data,), label=(label,))
            self.mx_model.forward(db, is_train=False)
            net_out = self.mx_model.get_outputs()
            net_out = net_out[0].asnumpy()
            if embeddings is None:
                embeddings = np.zeros((len(ocontents), net_out.shape[1]))
            embeddings[ba:bb, :] = net_out[0:_batch_size, :]
            ba = bb
        embeddings = sklearn.preprocessing.normalize(embeddings)
        embedding = np.mean(embeddings, axis=0, keepdims=True)
        embedding = sklearn.preprocessing.normalize(embedding).flatten()
        sims = np.dot(embeddings, embedding).flatten()
        assert len(sims) == len(_list)
        for i in xrange(len(_list)):
            _idx = _list[i]
            self.idx2cos[_idx] = sims[i]
def run_mnist_SGLD(training_num=50000):
    X, Y, X_test, Y_test = load_mnist(training_num)
    minibatch_size = 100
    net = get_mnist_sym()
    data_shape = (minibatch_size,) + X.shape[1::]
    data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
                   'softmax_label': nd.zeros((minibatch_size,), ctx=dev())}
    initializer = mx.init.Xavier(factor_type="in", magnitude=2.34)
    exe, sample_pool = SGLD(sym=net, dev=dev(), data_inputs=data_inputs,
                            X=X, Y=Y, X_test=X_test, Y_test=Y_test,
                            total_iter_num=1000000,
                            initializer=initializer,
                            learning_rate=4E-6, prior_precision=1.0, minibatch_size=100,
                            thin_interval=100, burn_in_iter_num=1000)
def run_boston_housing_SGLD():
    X, Y, X_test, Y_test = load_boston_housing()
    minibatch_size = 1
    teacher_noise_precision = 1.25
    net = get_boston_housing_sym(True, teacher_noise_precision)
    data_shape = (minibatch_size,) + X.shape[1::]
    data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
                   'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev())}
    initializer = BiasXavier(factor_type="in", magnitude=2.34)
    exe, sample_pool = SGLD(sym=net, dev=dev(), data_inputs=data_inputs,
                            X=X, Y=Y, X_test=X_test, Y_test=Y_test,
                            total_iter_num=1000000,
                            initializer=initializer,
                            learning_rate=5E-10, prior_precision=1.0,
                            minibatch_size=minibatch_size,
                            thin_interval=100, burn_in_iter_num=1000, task='boston')
def orthonormal_VanillaLSTMBuilder(lstm_layers, input_dims, lstm_hiddens, dropout_x=0., dropout_h=0., debug=False): """Build a standard LSTM cell, with variational dropout, with weights initialized to be orthonormal (https://arxiv.org/abs/1312.6120) Parameters ---------- lstm_layers : int Currently only support one layer input_dims : int word vector dimensions lstm_hiddens : int hidden size dropout_x : float dropout on inputs, not used in this implementation, see `biLSTM` below dropout_h : float dropout on hidden states debug : bool set to True to skip orthonormal initialization Returns ------- lstm_cell : VariationalDropoutCell A LSTM cell """ assert lstm_layers == 1, 'only accept one layer lstm' W = orthonormal_initializer(lstm_hiddens, lstm_hiddens + input_dims, debug) W_h, W_x = W[:, :lstm_hiddens], W[:, lstm_hiddens:] b = nd.zeros((4 * lstm_hiddens,)) b[lstm_hiddens:2 * lstm_hiddens] = -1.0 lstm_cell = rnn.LSTMCell(input_size=input_dims, hidden_size=lstm_hiddens, i2h_weight_initializer=mx.init.Constant(np.concatenate([W_x] * 4, 0)), h2h_weight_initializer=mx.init.Constant(np.concatenate([W_h] * 4, 0)), h2h_bias_initializer=mx.init.Constant(b)) wrapper = VariationalDropoutCell(lstm_cell, drop_states=dropout_h) return wrapper
def init_params():
    w = nd.random_normal(scale=1, shape=(num_inputs, 1))
    b = nd.zeros(shape=(1,))
    params = [w, b]
    for param in params:
        param.attach_grad()  # attach gradient buffers so autograd can populate them
    return params
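# Hedged usage sketch (assumption): the linear-regression forward pass and squared loss
# these parameters typically feed; X_batch / y_batch are placeholders and `num_inputs`
# is assumed to be defined above.
from mxnet import nd, autograd

def linreg(X, w, b):
    return nd.dot(X, w) + b

def squared_loss(yhat, y):
    return (yhat - y.reshape(yhat.shape)) ** 2 / 2

w, b = init_params()
X_batch = nd.random_normal(shape=(8, num_inputs))
y_batch = nd.random_normal(shape=(8, 1))
with autograd.record():
    l = squared_loss(linreg(X_batch, w, b), y_batch)
l.backward()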
def try_gpu():
    try:
        ctx = mx.gpu()
        _ = nd.zeros((1,), ctx=ctx)
    except mx.base.MXNetError:
        ctx = mx.cpu()
    return ctx
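# Hedged usage sketch (assumption): pick a context once and allocate everything on it.
from mxnet import nd

ctx = try_gpu()
x = nd.zeros((2, 3), ctx=ctx)
print(x.context)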
def get_feature(name, vid, args): global feature_cache key = (name,vid) if key in feature_cache: return feature_cache[key] input_dir = os.path.join(args.image_dir, name, str(vid)) data = nd.zeros( (1 ,3, image_size[0], image_size[1]) ) F = [] for img in os.listdir(input_dir): img = os.path.join(input_dir, img) img = cv2.imread(img) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) img = np.transpose(img, (2,0,1)) data[0][:] = img db = mx.io.DataBatch(data=(data,)) model.forward(db, is_train=False) net_out = model.get_outputs()[0].asnumpy().flatten() F.append(net_out) F = np.array(F) F = sklearn.preprocessing.normalize(F) feature = np.mean(F, axis=0, keepdims=True) feature = sklearn.preprocessing.normalize(feature).flatten() feature_cache[key] = feature return feature
def calc_sum(matA, matB):
    height, width = matA.shape
    matC = nd.zeros(matA.shape, ctx=matA.context)
    for y in range(height):
        for x in range(width):
            matC[y, x] = matA[y, x] + matB[y, x]
    return matC
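# Hedged sketch (assumption): the element-wise loop above is equivalent to the vectorized
# NDArray addition, which runs on-device without per-element indexing.
from mxnet import nd

A = nd.ones((4, 5))
B = nd.ones((4, 5)) * 2
assert (calc_sum(A, B) - (A + B)).abs().sum().asscalar() == 0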
def try_gpu():
    """If GPU is available, return mx.gpu(0); else return mx.cpu()"""
    try:
        ctx = mx.gpu()
        _ = nd.zeros((1,), ctx=ctx)
    except mx.base.MXNetError:
        ctx = mx.cpu()
    return ctx
def calc_sum(self, matA, matB):
    height, width = matA.shape
    ptrA = self.get_pointer(matA)
    ptrB = self.get_pointer(matB)
    matC = nd.zeros(matA.shape, ctx=matA.context)
    ptrC = self.get_pointer(matC)
    self.fun_calc_sum(ptrA, ptrB, ptrC, width, height)
    return matC
def gan_loss(input, target_is_real):
    if target_is_real:
        target = nd.ones(input.shape, ctx=input.context)
    else:
        target = nd.zeros(input.shape, ctx=input.context)
    # MSE loss for LSGAN
    e = ((input - target) ** 2).mean(axis=0, exclude=True)
    return e
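# Hedged usage sketch (assumption): LSGAN-style targets on a dummy discriminator output.
from mxnet import nd

d_out = nd.random_uniform(shape=(8, 1))             # stand-in discriminator scores
loss_real = gan_loss(d_out, target_is_real=True)    # compared against ones
loss_fake = gan_loss(d_out, target_is_real=False)   # compared against zeros
print(loss_real.mean().asscalar(), loss_fake.mean().asscalar())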
def get_embedding(args, imgrec, id, image_size, model): s = imgrec.read_idx(id) header, _ = mx.recordio.unpack(s) ocontents = [] for idx in xrange(int(header.label[0]), int(header.label[1])): s = imgrec.read_idx(idx) ocontents.append(s) embeddings = None #print(len(ocontents)) ba = 0 while True: bb = min(ba+args.batch_size, len(ocontents)) if ba>=bb: break _batch_size = bb-ba _batch_size2 = max(_batch_size, args.ctx_num) data = nd.zeros( (_batch_size2,3, image_size[0], image_size[1]) ) label = nd.zeros( (_batch_size2,) ) count = bb-ba ii=0 for i in xrange(ba, bb): header, img = mx.recordio.unpack(ocontents[i]) img = mx.image.imdecode(img) img = nd.transpose(img, axes=(2, 0, 1)) data[ii][:] = img label[ii][:] = header.label ii+=1 while ii<_batch_size2: data[ii][:] = data[0][:] label[ii][:] = label[0][:] ii+=1 #db = mx.io.DataBatch(data=(data,), label=(label,)) db = mx.io.DataBatch(data=(data,)) model.forward(db, is_train=False) net_out = model.get_outputs() net_out = net_out[0].asnumpy() if embeddings is None: embeddings = np.zeros( (len(ocontents), net_out.shape[1])) embeddings[ba:bb,:] = net_out[0:_batch_size,:] ba = bb embeddings = sklearn.preprocessing.normalize(embeddings) embedding = np.mean(embeddings, axis=0, keepdims=True) embedding = sklearn.preprocessing.normalize(embedding).flatten() return embedding
def transform_mnist(data, label):
    # transform a batch of examples
    if resize:
        n = data.shape[0]
        new_data = nd.zeros((n, resize, resize, data.shape[3]))
        for i in range(n):
            new_data[i] = image.imresize(data[i], resize, resize)
        data = new_data
    # change data from batch x height x width x channel to batch x channel x height x width
    return nd.transpose(data.astype('float32'), (0, 3, 1, 2)) / 255, label.astype('float32')
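# Hedged usage sketch (assumption): `resize` and `image` are free variables of
# transform_mnist; here they are bound at module level so the call below works.
from mxnet import nd, image

resize = None  # set to e.g. 96 to exercise the imresize branch
batch = nd.random_uniform(shape=(4, 28, 28, 1)) * 255
data, label = transform_mnist(batch, nd.arange(4))
print(data.shape)  # (4, 1, 28, 28): channels-first, scaled to [0, 1]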
def run_mnist_DistilledSGLD(num_training=50000, gpu_id=None): """Run DistilledSGLD on mnist dataset""" X, Y, X_test, Y_test = load_mnist(num_training) minibatch_size = 100 if num_training >= 10000: num_hidden = 800 total_iter_num = 1000000 teacher_learning_rate = 1E-6 student_learning_rate = 0.0001 teacher_prior = 1 student_prior = 0.1 perturb_deviation = 0.1 else: num_hidden = 400 total_iter_num = 20000 teacher_learning_rate = 4E-5 student_learning_rate = 0.0001 teacher_prior = 1 student_prior = 0.1 perturb_deviation = 0.001 teacher_net = get_mnist_sym(num_hidden=num_hidden) logsoftmax = LogSoftmax() student_net = get_mnist_sym(output_op=logsoftmax, num_hidden=num_hidden) data_shape = (minibatch_size,) + X.shape[1::] teacher_data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)), 'softmax_label': nd.zeros((minibatch_size,), ctx=dev(gpu_id))} student_data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)), 'softmax_label': nd.zeros((minibatch_size, 10), ctx=dev(gpu_id))} teacher_initializer = BiasXavier(factor_type="in", magnitude=1) student_initializer = BiasXavier(factor_type="in", magnitude=1) student_exe, student_params, _ = \ DistilledSGLD(teacher_sym=teacher_net, student_sym=student_net, teacher_data_inputs=teacher_data_inputs, student_data_inputs=student_data_inputs, X=X, Y=Y, X_test=X_test, Y_test=Y_test, total_iter_num=total_iter_num, student_initializer=student_initializer, teacher_initializer=teacher_initializer, student_optimizing_algorithm="adam", teacher_learning_rate=teacher_learning_rate, student_learning_rate=student_learning_rate, teacher_prior_precision=teacher_prior, student_prior_precision=student_prior, perturb_deviation=perturb_deviation, minibatch_size=100, dev=dev(gpu_id))
def run_boston_housing_SGD():
    X, Y, X_test, Y_test, X_mean, X_std, Y_mean, Y_std = load_boston_housing()
    minibatch_size = 1
    teacher_noise_precision = 1.25
    net = get_boston_housing_sym(True, teacher_noise_precision)
    data_shape = (minibatch_size,) + X.shape[1::]
    print(data_shape)
    data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
                   'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev())}
    initializer = BiasXavier(factor_type="in", magnitude=1)
    # initializer = mx.init.Normal(sigma=0.01)
    exe, exe_params, _ = SGD(sym=net, dev=dev(), data_inputs=data_inputs,
                             X=X, Y=Y, X_test=X_test, Y_test=Y_test,
                             X_mean=X_mean, X_std=X_std, Y_mean=Y_mean, Y_std=Y_std,
                             total_iter_num=2000000,
                             initializer=initializer,
                             # lr_scheduler=mx.lr_scheduler.FactorScheduler(80000, 0.5),
                             lr=1E-6, prior_precision=1, minibatch_size=minibatch_size,
                             task="boston")
def predict_rnn(rnn, prefix, num_chars, params, hidden_dim, ctx, idx_to_char,
                char_to_idx, get_inputs, is_lstm=False):
    prefix = prefix.lower()
    state_h = nd.zeros(shape=(1, hidden_dim), ctx=ctx)
    if is_lstm:
        state_c = nd.zeros(shape=(1, hidden_dim), ctx=ctx)
    #pdb.set_trace()
    output = [char_to_idx[prefix[0]]]
    for i in range(num_chars + len(prefix)):
        X = nd.array([output[-1]], ctx=ctx)
        if is_lstm:
            Y, state_h, state_c = rnn(get_inputs(X), state_h, state_c, *params)
        else:
            Y, state_h = rnn(get_inputs(X), state_h, *params)
        if i < len(prefix) - 1:
            next_input = char_to_idx[prefix[i + 1]]
        else:
            next_input = int(Y[0].argmax(axis=1).asscalar())
        output.append(next_input)
    return ''.join([idx_to_char[i] for i in output])
def test_token_embedding_manual_extension(initializeidxtovecbyextending, tmpdir): if not initializeidxtovecbyextending: # Load a TokenEmbedding with idx_to_vec already initialized embed_root = str(tmpdir) embed_name = 'my_embed' elem_delim = '\t' pretrain_file = 'my_pretrain_file.txt' _mk_my_pretrain_file( os.path.join(embed_root, embed_name), elem_delim, pretrain_file) pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file) TokEmb = functools.partial(nlp.embedding.TokenEmbedding.from_file, pretrain_file_path, elem_delim, allow_extend=True) else: TokEmb = functools.partial( nlp.embedding.token_embedding.TokenEmbedding, allow_extend=True) # Uninitialized token_embedding._idx_to_vec based token_embedding = TokEmb() token_embedding['hello'] = nd.zeros(shape=(1, 5)) assert np.all(np.isclose(0, token_embedding['hello'].asnumpy())) token_embedding = TokEmb() token_embedding['hello'] = nd.zeros(shape=(5, )) assert np.all(np.isclose(0, token_embedding['hello'].asnumpy())) token_embedding = TokEmb() token_embedding[['hello', 'world']] = nd.zeros(shape=(2, 5)) assert np.all(np.isclose(0, token_embedding['hello'].asnumpy())) assert np.all(np.isclose(0, token_embedding['world'].asnumpy())) with pytest.raises(AssertionError): token_embedding = TokEmb() token_embedding[['hello', 'world']] = nd.zeros(shape=(1, 5)) with pytest.raises(AssertionError): token_embedding = TokEmb() token_embedding[['hello', 'world']] = nd.zeros(shape=(5, ))
def create_state(self, index, weight):
    """Create additional optimizer state such as momentum.

    Parameters
    ----------
    index : int
        The unique index of the weight.
    weight : NDArray
        The weight data
    """
    if self.momentum == 0.0:
        return None
    else:
        return zeros(weight.shape, weight.context, dtype=weight.dtype)
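# Hedged usage sketch (assumption): a minimal optimizer-like holder showing how the
# create_state method above is called once per parameter to build a momentum buffer.
# It re-binds the module-level function defined above as a method.
from mxnet import nd
from mxnet.ndarray import zeros

class _SGDLike(object):
    def __init__(self, momentum):
        self.momentum = momentum
    create_state = create_state  # reuse the method defined above

weight = nd.random_normal(shape=(10, 5))
print(_SGDLike(0.0).create_state(0, weight))        # None: no extra state needed
print(_SGDLike(0.9).create_state(0, weight).shape)  # (10, 5) momentum buffer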
def parse_groundtruth_for_target(labels, box_per_cell, xywh): B,H,W,A,_ = xywh.shape _,maxObjNum,_ = labels.shape #pdb.set_trace() boxMask = nd.zeros( (B,H,W,A,1), ctx = xywh.context ) boxCls = nd.ones_like(boxMask, ctx = xywh.context) * (-1) #default negative label boxObject = nd.zeros((B,H,W,A,1),ctx = xywh.context) boxXYWH = nd.zeros((B,H,W,A,4), ctx = xywh.context) for b in range(B): label = labels[b].asnumpy() validLabel = label[np.where(label[:,1] >-0.5)[0],:] #pdb.set_trace() np.random.shuffle(validLabel) for l in validLabel: cls,x0,y0,x1,y1 = l w = x1 - x0 h = y1 - y0 #find best box for this object indx,indy = int(x0*W), int(y0*H) #position pws, phs = xywh[b,indy, indx, :, -2], xywh[b,indy,indx,:,-1] ious = [] pws = pws.asnumpy() phs = phs.asnumpy() pws, phs = [1,1],[1,1] for pw, ph in zip(pws,phs): intersect = np.minimum(pw,w*W) * np.minimum(ph,h*H) ious.append( intersect / (pw * ph + w * h - intersect) ) #pdb.set_trace() bestbox = int(np.argmax(ious)) boxMask[b,indy,indx,bestbox,:] = 1.0 boxCls[b,indy,indx,bestbox,:] = cls boxObject[b,indy,indx,bestbox,:] = 1.0 # ious[bestbox] tx = x0 * W - indx ty = y0 * H - indy tw,th = math.sqrt(w), math.sqrt(h) #predict sqrt(w) sqrt(h) #pdb.set_trace() boxXYWH[b,indy,indx,bestbox,:] = nd.array([tx,ty,tw,th]) return boxMask, boxCls, boxObject,boxXYWH
def forward(self, input_vec, loss=None, training=True): # print('************* ' + str(input_vec.shape[1]) + ' *************') # print('############# ' + str(input_vec.shape) + ' #############') assert input_vec.shape[1] == self.input_dimension # get inputs for every slot(including global) inputs = {} for slot in self.slots: inputs[slot] = input_vec[:, self.slot_dimension[slot][0]:self.slot_dimension[slot][1]] input_global = [] for seg in self.global_dimension: input_global.append(input_vec[:, seg[0]:seg[1]]) inputs['global'] = nd.concat(*input_global, dim=1) layer = [] # inputs -> first_hidden_layer if (not self.sort_input_vec) and self.state_feature != 'dip': layer.append([]) for slot in self.slots: layer[0].append(self.input_trans[slot](inputs[slot])) layer[0].append(self.input_trans['global'](inputs['global'])) elif self.state_feature == 'dip': sorted_inputs = [] for slot in self.slots: sorted_inputs.append(inputs[slot]) sorted_inputs.append(inputs['global']) layer.append(self.input_trans.forward(sorted_inputs, loss, training=training)) elif self.sort_input_vec: sorted_inputs = [] for slot in self.slots: tmp = inputs[slot][:, :-2].sort(is_ascend=False) if tmp.shape[1] < 20: tmp = nd.concat(tmp, nd.zeros((tmp.shape[0], 20 - tmp.shape[1]), ctx=CTX), dim=1) else: tmp = nd.slice_axis(tmp, axis=1, begin=0, end=20) sorted_inputs.append(nd.concat(tmp, inputs[slot][:, -2:], dim=1)) sorted_inputs.append(inputs['global']) layer.append(self.input_trans.forward(sorted_inputs, loss, training=training)) # hidden_layers for i in range(self.hidden_layers - 1): if self.recurrent_mode is False: # equal to 'layer.append(self.ma_trans[i](layer[-1], loss))' layer.append(self.ma_trans[i](layer[i], loss)) else: layer.append(self.ma_trans(layer[i], loss)) if self.share_last_layer is False: # dropout of last hidden layer for j in range(len(self.slots)): layer[-1][j] = self.local_out_drop_op.forward(layer[-1][j]) layer[-1][-1] = self.global_out_drop_op.forward(layer[-1][-1]) # last_hidden_layer -> outputs outputs = [] slotv_probs = [] slotqs = [] slot_probs = [] top_decision = [] for i in range(len(self.slots) + 1): if self.use_dueling is False: outputs.append(self.output_trans[i](layer[-1][i])) else: if i < len(self.slots): cur_slotv_prob = self.output_trans_local_valueP.forward(layer[-1][i], training=training) cur_slotv_prob = nd.softmax(cur_slotv_prob) else: cur_slotv_prob = self.output_trans_global_valueP.forward(layer[-1][i], training=training) cur_slotv_prob = nd.softmax(cur_slotv_prob) if self.dueling_share_last: if i < len(self.slots): cur_slotq = self.output_trans_local_slotQ.forward(layer[-1][i], training=training) cur_slot_prob = self.output_trans_local_slotP.forward(layer[-1][i], training=training).reshape(-1,1) cur_slotv_prob = cur_slotv_prob*cur_slot_prob # cur_slot_prob = nd.softmax(cur_slot_prob) if self.shared_last_layer_use_bias: cur_slotq = cur_slotq + nd.slice(self.value_bias_local.data(), begin=(i, ), end=(i + 1, )) else: cur_slotq = self.output_trans_global_slotQ.forward(layer[-1][i], training=training) cur_slot_prob = self.output_trans_global_slotP.forward(layer[-1][i], training=training).reshape(-1,1) cur_slotv_prob = cur_slotv_prob*cur_slot_prob # cur_slot_prob = nd.softmax(cur_slot_prob) top_decision.append(cur_slot_prob) else: cur_slotq = self.output_trans_value[i](layer[-1][i]) slotv_probs.append(cur_slotv_prob) slot_probs.append(cur_slot_prob) slotqs.append(cur_slotq) # batch_slotv_probs_list = [] # slot_prob_softmax = nd.softmax(nd.concat(*slot_probs, dim=1)) # slot_prob_split = 
nd.split(slot_prob_softmax, axis=1, num_outputs=len(self.slots)+1) # assert len(slotv_probs) == len(self.slots)+1 # for i in range(len(slotv_probs)): # tmp = slot_prob_split[i].reshape(-1,1)*slotv_probs[i] # batch_slotv_probs_list.append(tmp) batch_slot_prob = nd.softmax(nd.concat(*slot_probs, dim=1)) batch_slot_slotq = nd.concat(*slotqs, dim=1) batch_slotv_prob = nd.softmax(nd.concat(*slotv_probs, dim=1)) batch_top_decision = nd.softmax(nd.concat(*top_decision,dim=1)) # print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@') # print(batch_slotv_prob) # print(batch_slot_prob.shape) # print(batch_slot_slotq.shape) # print(batch_slotv_prob.shape) prob = batch_slotv_prob value = nd.max(batch_slot_slotq, axis=1) top_decision = batch_top_decision # CTname = threading.currentThread().getName() # print(CTname+' top decision is : ') # print(top_decision) return prob, value, top_decision
def train(): """training""" image_pool = ImagePool(pool_size) metric = mx.metric.CustomMetric(facc) stamp = datetime.now().strftime('%Y_%m_%d-%H_%M') logging.basicConfig(level=logging.DEBUG) # define a summary writer that logs data and flushes to the file every 5 seconds sw = SummaryWriter(logdir='%s' % dir_out_sw, flush_secs=5, verbose=False) global_step = 0 for epoch in range(epochs): if epoch == 0: netG.hybridize() netD.hybridize() # sw.add_graph(netG) # sw.add_graph(netD) tic = time.time() btic = time.time() train_data.reset() val_data.reset() iter = 0 for local_step, batch in enumerate(train_data): ############################ # (1) Update D network: maximize log(D(x, y)) + log(1 - D(x, G(x, z))) ########################### tmp = mx.nd.concat(batch.data[0], batch.data[1], batch.data[2], dim=1) tmp = augmenter(tmp, patch_size=128, offset=offset, aug_type=1, aug_methods=aug_methods, random_crop=False) real_in = tmp[:, :1].as_in_context(ctx) real_out = tmp[:, 1:2].as_in_context(ctx) m = tmp[:, 2:3].as_in_context(ctx) # mask fake_out = netG(real_in) * m # loss weight based on mask, applied on L1 loss if no_loss_weights: loss_weight = m else: loss_weight = m.asnumpy() loss_weight[loss_weight == 0] = .1 loss_weight = mx.nd.array(loss_weight, ctx=m.context) fake_concat = image_pool.query(nd.concat(real_in, fake_out, dim=1)) with autograd.record(): # Train with fake image # Use image pooling to utilize history images output = netD(fake_concat) fake_label = nd.zeros(output.shape, ctx=ctx) errD_fake = GAN_loss(output, fake_label) metric.update([ fake_label, ], [ output, ]) # Train with real image real_concat = nd.concat(real_in, real_out, dim=1) output = netD(real_concat) real_label = nd.ones(output.shape, ctx=ctx) errD_real = GAN_loss(output, real_label) errD = (errD_real + errD_fake) * 0.5 errD.backward() metric.update([ real_label, ], [ output, ]) trainerD.step(batch.data[0].shape[0]) ############################ # (2) Update G network: maximize log(D(x, G(x, z))) - lambda1 * L1(y, G(x, z)) ########################### with autograd.record(): fake_out = netG(real_in) fake_concat = nd.concat(real_in, fake_out, dim=1) output = netD(fake_concat) real_label = nd.ones(output.shape, ctx=ctx) errG = GAN_loss(output, real_label) + loss_2nd( real_out, fake_out, loss_weight) * lambda1 errG.backward() trainerG.step(batch.data[0].shape[0]) sw.add_scalar(tag='loss', value=('d_loss', errD.mean().asscalar()), global_step=global_step) sw.add_scalar(tag='loss', value=('g_loss', errG.mean().asscalar()), global_step=global_step) global_step += 1 if epoch + local_step == 0: sw.add_graph((netG)) img_in_list, img_out_list, m_val = val_data.next().data m_val = m_val.as_in_context(ctx) sw.add_image('first_minibatch_train_real', norm3(real_out)) sw.add_image('first_minibatch_val_real', norm3(img_out_list.as_in_context(ctx))) netG.export('%snetG' % dir_out_checkpoints) if local_step == 0: # Log the first batch of images of each epoch (training) sw.add_image('first_minibatch_train_fake', norm3(fake_out * m) * m, epoch) sw.add_image( 'first_minibatch_val_fake', norm3(netG(img_in_list.as_in_context(ctx)) * m_val) * m_val, epoch) # norm3(netG(img_in_list.as_in_context(ctx)) * m_val.as_in_context(ctx)), epoch) if (iter + 1) % 10 == 0: name, acc = metric.get() logging.info('speed: {} samples/s'.format( batch_size / (time.time() - btic))) logging.info( 'discriminator loss = %f, generator loss = %f, binary training acc = %f at iter %d epoch %d' % (nd.mean(errD).asscalar(), nd.mean(errG).asscalar(), acc, iter, epoch)) iter += 
1 btic = time.time() sw.add_scalar(tag='binary_training_acc', value=('acc', acc), global_step=epoch) name, acc = metric.get() metric.reset() fake_val = netG(val_data.data[0][1].as_in_context(ctx)) loss_val = loss_2nd(val_data.data[1][1].as_in_context(ctx), fake_val, val_data.data[2][1].as_in_context(ctx)) * lambda1 sw.add_scalar(tag='loss_val', value=('g_loss', loss_val.mean().asscalar()), global_step=epoch) if (epoch % check_point_interval == 0) | (epoch == epochs - 1): netD.save_params('%snetD-%04d' % (dir_out_checkpoints, epoch)) netG.save_params('%snetG-%04d' % (dir_out_checkpoints, epoch)) logging.info('\nbinary training acc at epoch %d: %s=%f' % (epoch, name, acc)) logging.info('time: %f' % (time.time() - tic)) sw.export_scalars('scalar_dict.json') sw.close()
def forward(self, inputs, loss=None, training=True, commtype='average', topo='FC'): assert len(inputs) == self.slots + 1 local_drop_vec = nd.ones_like(inputs[0]) local_drop_vec = self.local_dropout_op(local_drop_vec) for i in range(self.slots): inputs[i] = inputs[i] * local_drop_vec inputs[-1] = self.global_dropout_op(inputs[-1]) if topo == 'FC': comm_rate = nd.ones(shape=(self.slots + 1, self.slots + 1)) elif topo == 'FUC': comm_rate = nd.zeros(shape=(self.slots + 1, self.slots + 1)) elif topo == 'Master': comm_rate = nd.ones(shape=(self.slots + 1, self.slots + 1)) for i in range(self.slots): for j in range(self.slots): comm_rate[i][j] = 0 if self.use_comm and self.topo_learning_mode: proba = nd.sigmoid(self.topo.data()) if random.random() < 1e-2: print '---------------------------------------------' print proba.asnumpy() print '---------------------------------------------' u_vec = nd.random_uniform(low=1e-5, high=1. - 1e-5, shape=(self.slots + 1, self.slots + 1)) comm_rate = nd.sigmoid(10. * ( nd.log(proba) - nd.log(1. - proba) + nd.log(u_vec) - nd.log(1. - u_vec) )) if loss is not None: loss.append(4e-4 * nd.sum(proba * nd.log(proba) + (1. - proba) * nd.log(1. - proba))) results = [] for i in range(self.slots): results.append(self.local_share_trans.forward(inputs[i], training=training)) results.append(self.global_trans.forward(inputs[-1], training=training)) if commtype == 'average': for i in range(self.slots): tmp = nd.zeros_like(results[i]) norm = nd.zeros_like(comm_rate[0][0]) for j in range(self.slots): if i != j: tmp = tmp + self.local2local_share_comm.forward(nd.concat(inputs[j], dim=1), training=training) * comm_rate[j][i] norm = norm + comm_rate[j][i] # results[i] = results[i] + self.global2local_comm(inputs[-1]) * comm_rate[-1][i] tmp = tmp + self.global2local_comm.forward(nd.concat(inputs[-1], dim=1), training=training) * \ comm_rate[-1][i] norm = norm + comm_rate[-1][i] if nd.sum(norm) > 1e-5: results[i] = results[i] + tmp / norm tmp = nd.zeros_like(results[-1]) norm = nd.zeros_like(comm_rate[0][0]) for j in range(self.slots): tmp = tmp + self.local2global_comm.forward(nd.concat(inputs[j], dim=1), training=training) * \ comm_rate[j][-1] norm = norm + comm_rate[j][-1] if nd.sum(norm) > 1e-5: results[-1] = results[-1] + tmp / norm elif commtype == 'maxpooling': for i in range(self.slots): tmp = [] for j in range(self.slots): if j != i: tmp.append(self.local2local_share_comm.forward(inputs[j], training=training)) tmp.append(self.global2local_comm.forward(inputs[-1], training=training)) for k in range(len(tmp)): tmp[k] = tmp[k].reshape((tmp[k].shape[0], 1, tmp[k].shape[1])) tmp = nd.concat(*tmp, dim=1) maxcomm = nd.max(tmp, axis=1) results[i] = results[i] + maxcomm tmp = [] for i in range(self.slots): tmp.append(self.local2global_comm.forward(inputs[i], training=training)) for k in range(len(tmp)): tmp[k] = tmp[k].reshape((tmp[k].shape[0], 1, tmp[k].shape[1])) tmp = nd.concat(*tmp, dim=1) maxcomm = nd.max(tmp, axis=1) results[-1] = results[-1] + maxcomm return results
def _bind_ith_exec(self, i, data_shapes, label_shapes, shared_group): """Internal utility function to bind the i-th executor. """ shared_exec = None if shared_group is None else shared_group.execs[i] context = self.contexts[i] shared_data_arrays = self.shared_data_arrays[i] input_shapes = dict(data_shapes) if label_shapes is not None: input_shapes.update(dict(label_shapes)) arg_shapes, _, aux_shapes = self.symbol.infer_shape(**input_shapes) assert arg_shapes is not None, "shape inference failed" input_types = {x.name: x.dtype for x in data_shapes} if label_shapes is not None: input_types.update({x.name: x.dtype for x in label_shapes}) arg_types, _, aux_types = self.symbol.infer_type(**input_types) assert arg_types is not None, "type inference failed" arg_arrays = [] grad_arrays = {} if self.for_training else None def _get_or_reshape(name, shared_data_arrays, arg_shape, arg_type, context, logger): """Internal helper to get a memory block or re-use by re-shaping""" if name in shared_data_arrays: arg_arr = shared_data_arrays[name] if np.prod(arg_arr.shape) >= np.prod(arg_shape): # nice, we can directly re-use this data blob assert arg_arr.dtype == arg_type arg_arr = arg_arr.reshape(arg_shape) else: logger.warning(('bucketing: data "%s" has a shape %s' % (name, arg_shape)) + (', which is larger than already allocated ') + ('shape %s' % (arg_arr.shape,)) + ('. Need to re-allocate. Consider putting ') + ('default_bucket_key to') + (' be the bucket taking the largest input for better ') + ('memory sharing.')) arg_arr = nd.zeros(arg_shape, context, dtype=arg_type) # replace existing shared array because the new one is bigger shared_data_arrays[name] = arg_arr else: arg_arr = nd.zeros(arg_shape, context, dtype=arg_type) shared_data_arrays[name] = arg_arr return arg_arr # create or borrow arguments and gradients for j in range(len(self.arg_names)): name = self.arg_names[j] if name in self.param_names: # model parameters if shared_exec is None: arg_arr = nd.zeros(arg_shapes[j], context, dtype=arg_types[j]) if self.grad_req[name] != 'null': grad_arr = nd.zeros(arg_shapes[j], context, dtype=arg_types[j]) grad_arrays[name] = grad_arr else: arg_arr = shared_exec.arg_dict[name] assert arg_arr.shape == arg_shapes[j] assert arg_arr.dtype == arg_types[j] if self.grad_req[name] != 'null': grad_arrays[name] = shared_exec.grad_dict[name] else: # data, label, or states arg_arr = _get_or_reshape(name, shared_data_arrays, arg_shapes[j], arg_types[j], context, self.logger) # data might also need grad if inputs_need_grad is True if self.grad_req[name] != 'null': grad_arrays[name] = _get_or_reshape('grad of ' + name, shared_data_arrays, arg_shapes[j], arg_types[j], context, self.logger) arg_arrays.append(arg_arr) # create or borrow aux variables if shared_exec is None: aux_arrays = [nd.zeros(s, context, dtype=t) for s, t in zip(aux_shapes, aux_types)] else: for j, arr in enumerate(shared_exec.aux_arrays): assert aux_shapes[j] == arr.shape assert aux_types[j] == arr.dtype aux_arrays = shared_exec.aux_arrays[:] executor = self.symbol.bind(ctx=context, args=arg_arrays, args_grad=grad_arrays, aux_states=aux_arrays, grad_req=self.grad_req, shared_exec=shared_exec) # Get the total bytes allocated for this executor return executor
def penalty_l2(params):
    penalty = nd.zeros(shape=1)
    for param in params:
        penalty = penalty + nd.sum(param ** 2)
    return penalty
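# Hedged usage sketch (assumption): weighting the L2 penalty into a training loss inside
# autograd.record(); `lambd` and the stand-in data loss are illustrative only.
from mxnet import nd, autograd

lambd = 0.01
w = nd.random_normal(shape=(5, 1))
b = nd.zeros((1,))
for p in (w, b):
    p.attach_grad()
with autograd.record():
    data_loss = nd.ones((1,))            # placeholder for a real data loss
    total = data_loss + lambd * penalty_l2([w, b])
total.backward()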
def forward(self, input_vec, loss=None): assert input_vec.shape[1] == self.input_dimension # get inputs for every slot(including global) inputs = {} for slot in self.slots: inputs[slot] = input_vec[:, self.slot_dimension[slot][0]:self.slot_dimension[slot][1]] input_global = [] for seg in self.global_dimension: input_global.append(input_vec[:, seg[0]:seg[1]]) inputs['global'] = nd.concat(*input_global, dim=1) layer = [] # inputs -> first_hidden_layer if (not self.sort_input_vec) and self.state_feature != 'dip': layer.append([]) for slot in self.slots: layer[0].append(self.input_trans[slot](inputs[slot])) layer[0].append(self.input_trans['global'](inputs['global'])) elif self.state_feature == 'dip': sorted_inputs = [] for slot in self.slots: sorted_inputs.append(inputs[slot]) sorted_inputs.append(inputs['global']) layer.append(self.input_trans(sorted_inputs, loss)) elif self.sort_input_vec: sorted_inputs = [] for slot in self.slots: tmp = inputs[slot][:, :-2].sort(is_ascend=False) if tmp.shape[1] < 20: tmp = nd.concat(tmp, nd.zeros((tmp.shape[0], 20 - tmp.shape[1]), ctx=CTX), dim=1) else: tmp = nd.slice_axis(tmp, axis=1, begin=0, end=20) sorted_inputs.append(nd.concat(tmp, inputs[slot][:, -2:], dim=1)) sorted_inputs.append(inputs['global']) layer.append(self.input_trans(sorted_inputs, loss)) # hidden_layers for i in range(self.hidden_layers - 1): if self.recurrent_mode is False: # equal to 'layer.append(self.ma_trans[i](layer[-1], loss))' layer.append(self.ma_trans[i](layer[i], loss)) else: layer.append(self.ma_trans(layer[i], loss)) if self.share_last_layer is False: # dropout of last hidden layer for j in range(len(self.slots)): layer[-1][j] = self.local_out_drop_op(layer[-1][j]) layer[-1][-1] = self.global_out_drop_op(layer[-1][-1]) # last_hidden_layer -> outputs outputs = [] for i in range(len(self.slots) + 1): if self.use_dueling is False: outputs.append(self.output_trans[i](layer[-1][i])) else: if i < len(self.slots): tmp_adv = self.output_trans_local_advantage(sorted_inputs[i]) else: tmp_adv = self.output_trans_global_advantage(sorted_inputs[-1]) if self.dueling_share_last: if i < len(self.slots): cur_value = self.output_trans_local_value(layer[-1][i]) if self.shared_last_layer_use_bias: cur_value = cur_value + nd.slice(self.value_bias_local.data(), begin=(i, ), end=(i + 1, )) else: cur_value = self.output_trans_global_value(layer[-1][i]) else: cur_value = self.output_trans_value[i](layer[-1][i]) outputs.append( cur_value + tmp_adv - tmp_adv.mean(axis=1).reshape( (tmp_adv.shape[0], 1)).broadcast_axes(axis=1, size=tmp_adv.shape[1])) else: outputs = [] for i in range(len(self.slots)): output_i = self.output_trans_local(layer[-1][i]) if self.shared_last_layer_use_bias: output_i = output_i + self.output_trans_local_biases[i].data() outputs.append(output_i) outputs.append(self.output_trans_global(layer[-1][-1])) return nd.concat(*outputs, dim=1)
def GRU(epoch = 100 , batch_size=100, save_period=100 , load_period=100 ,learning_rate= 0.1, ctx=mx.gpu(0)): train_data , test_data = FashionMNIST(batch_size) #network parameter time_step = 28 num_inputs = 28 num_hidden = 200 num_outputs = 10 path = "weights/FashionMNIST_GRUweights-{}".format(load_period) if os.path.exists(path): print("loading weights") [wxz, wxr, wxh, whz, whr, whh, bz, br, bh, why, by] = nd.load(path) # weights load wxz = wxz.as_in_context(ctx) wxr = wxr.as_in_context(ctx) whz = whz.as_in_context(ctx) whz = whz.as_in_context(ctx) whr = whr.as_in_context(ctx) whh = whh.as_in_context(ctx) bz = bz.as_in_context(ctx) br = br.as_in_context(ctx) bh = bh.as_in_context(ctx) why = why.as_in_context(ctx) by = by.as_in_context(ctx) params = [wxz , wxr , wxh , whz, whr, whh, bz, br, bh, why , by] else: print("initializing weights") with ctx: wxz = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden, num_inputs)) wxr = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden, num_inputs)) wxh = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden, num_inputs)) whz = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden, num_hidden)) whr = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden, num_hidden)) whh = nd.random.normal(loc=0, scale=0.01, shape=(num_hidden, num_hidden)) bz = nd.random.normal(loc=0,scale=0.01,shape=(num_hidden,)) br = nd.random.normal(loc=0,scale=0.01,shape=(num_hidden,)) bh = nd.random.normal(loc=0,scale=0.01,shape=(num_hidden,)) why = nd.random.normal(loc=0,scale=0.1,shape=(num_outputs , num_hidden)) by = nd.random.normal(loc=0,scale=0.1,shape=(num_outputs,)) params = [wxz , wxr , wxh , whz, whr, whh, bz, br, bh, why , by] # attach gradient!!! for param in params: param.attach_grad() #Fully Neural Network with 1 Hidden layer def GRU_Cell(input, state): for x in input: z_t = nd.Activation(nd.FullyConnected(data=x,weight=wxz,no_bias=True,num_hidden=num_hidden)+ nd.FullyConnected(data=state,weight=whz,no_bias=True,num_hidden=num_hidden)+bz,act_type="sigmoid") r_t = nd.Activation(nd.FullyConnected(data=x,weight=wxr,no_bias=True,num_hidden=num_hidden)+ nd.FullyConnected(data=state,weight=whr,no_bias=True,num_hidden=num_hidden)+br,act_type="sigmoid") g_t = nd.Activation(nd.FullyConnected(data=x,weight=wxh,no_bias=True,num_hidden=num_hidden)+ nd.FullyConnected(data=r_t*state,weight=whh,no_bias=True,num_hidden=num_hidden)+bh,act_type="tanh") state = nd.multiply(z_t,state) + nd.multiply(1-z_t,g_t) output = nd.FullyConnected(data=state, weight=why, bias=by, num_hidden=num_outputs) output = nd.softmax(data=output) return output, state def cross_entropy(output, label): return - nd.sum(label * nd.log(output), axis=0 , exclude=True) #Adam optimizer state=[] optimizer=mx.optimizer.Adam(rescale_grad=1,learning_rate=learning_rate) for param in params: state.append(optimizer.create_state(0,param)) for i in tqdm(range(1,epoch+1,1)): for data,label in train_data: states = nd.zeros(shape=(data.shape[0], num_hidden), ctx=ctx) data = data.as_in_context(ctx) data = data.reshape(shape=(-1,time_step,num_inputs)) data=nd.transpose(data=data,axes=(1,0,2)) label = label.as_in_context(ctx) label = nd.one_hot(label , num_outputs) with autograd.record(): outputs, states = GRU_Cell(data, states) loss = cross_entropy(outputs,label) # (batch_size,) loss.backward() cost = nd.mean(loss).asscalar() for j,param in enumerate(params): optimizer.update(0,param,param.grad,state[j]) test_accuracy = evaluate_accuracy(test_data, time_step, num_inputs, num_hidden, GRU_Cell, ctx) print(" epoch : {} , 
last batch cost : {}".format(i,cost)) print("Test_acc : {0:0.3f}%".format(test_accuracy * 100)) #weight_save if i % save_period==0: if not os.path.exists("weights"): os.makedirs("weights") print("saving weights") nd.save("weights/FashionMNIST_GRUweights-{}".format(i),params) test_accuracy = evaluate_accuracy(test_data, time_step, num_inputs, num_hidden, GRU_Cell, ctx) print("Test_acc : {0:0.3f}%".format(test_accuracy * 100)) return "optimization completed"
def interclass_reset(self): self.seq2 = [] self.oseq2 = [] while len(self.seq2) < self.seq_min_size: self.time_reset() embeddings = None bag_size = self.interclass_bag_size # 3600 batch_size2 = self.batch_size2 # 200 # data = np.zeros( (bag_size,)+self.data_shape ) # label = np.zeros( (bag_size,) ) tag = [] # idx = np.zeros( (bag_size,) ) #print('eval %d images..' % bag_size, self.interclass_oseq_cur) # 3600 0 first time #print('interclass time stat', self.times) if self.interclass_oseq_cur + bag_size > len(self.oseq2): self.interclass_oseq_reset() print('eval %d images..' % bag_size, self.interclass_oseq_cur) self.times[0] += self.time_elapsed() self.time_reset() # print(data.shape) data = nd.zeros(self.provide_data2[0][1]) label = nd.zeros(self.provide_label2[0][1]) ba = 0 all_layers = self.mx_model.symbol.get_internals() if self.model_t is None: symbol_t = all_layers['blockgrad0_output'] self.model_t = mx.mod.Module(symbol=symbol_t, context=self.ctx, label_names=None) self.model_t.bind(data_shapes=self.provide_data2) arg_t, aux_t = self.mx_model.get_params() self.model_t.set_params(arg_t, aux_t) else: arg_t, aux_t = self.mx_model.get_params() self.model_t.set_params(arg_t, aux_t) while True: bb = min(ba + batch_size2, bag_size) if ba >= bb: break # _batch = self.data_iter.next() # _data = _batch.data[0].asnumpy() # print(_data.shape) # _label = _batch.label[0].asnumpy() # data[ba:bb,:,:,:] = _data # label[ba:bb] = _label for i in xrange(ba, bb): _idx = self.oseq2[i + self.interclass_oseq_cur] s = self.imgrec2.read_idx(_idx) header, img = recordio.unpack(s) img = self.imdecode(img) data[i - ba][:] = self.postprocess_data(img) #label[i-ba][:] = header.label #print('header.label', header.label) #print('header.label', header.label.shape) #tag.append((int(header.label), _idx)) #print('header.label',header.label) label0 = header.label if not isinstance(label0, numbers.Number): label0 = label0[0] #print('label0', label0) label[i - ba][:] = label0 tag.append((int(label0), _idx)) # idx[i] = _idx #print('tag:' ,tag) #print(data,label) #db = mx.io.DataBatch(data=(data,), label=(label,)) #self.mx_model.forward(db, is_train=False) #net_out = self.mx_model.get_outputs() #print("self.mx_model",self.mx_model) db = mx.io.DataBatch(data=(data, ), label=(label, )) self.model_t.forward(db, is_train=False) net_out = self.model_t.get_outputs() #print('eval for selecting interclasses',ba,bb) #print(net_out) #print(len(net_out)) #print(net_out[0].asnumpy()) net_out = net_out[0].asnumpy() #print(len(net_out)) #print('net_out', net_out.shape) if embeddings is None: embeddings = np.zeros((bag_size, net_out.shape[1])) #print ("net_out.shape: ", net_out.shape) #print("ba,bb: ", ba,bb) embeddings[ba:bb, :] = net_out ba = bb assert len(tag) == bag_size self.interclass_oseq_cur += bag_size #print("embeddings: ",embeddings) embeddings = sklearn.preprocessing.normalize(embeddings) self.times[1] += self.time_elapsed() self.time_reset() nrof_images_per_class = [1] for i in xrange(1, bag_size): if tag[i][0] == tag[i - 1][0]: nrof_images_per_class[-1] += 1 else: nrof_images_per_class.append(1) id_sel = self.pick_interclass(embeddings, nrof_images_per_class, self.batchsize_id) # shape=(T,3) #print('found interclass', id_sel) #2 if self.images_per_identity == 1: for j in xrange(self.batchsize_id // 3): idsel_0 = tag[id_sel[j] * self.images_per_identity][1] self.seq2.append(idsel_0) else: for j in xrange(self.batchsize_id // 3): idsel_0 = tag[id_sel[j] * self.images_per_identity][1] self.seq2.append(idsel_0) idsel_0 = 
tag[id_sel[j] * self.images_per_identity + 1][1] self.seq2.append(idsel_0) idsel_0 = tag[id_sel[j] * self.images_per_identity + 2][1] self.seq2.append(idsel_0) self.times[2] += self.time_elapsed()
#encoding:utf-8
import sys
sys.path.append('..')

import utils

batch_size = 256
train_data, test_data = utils.load_data_fashion_mnist(batch_size)

from mxnet import ndarray as nd

num_inputs = 28 * 28
num_outputs = 10

num_hidden1 = 256
num_hidden2 = 256
weight_scale = .01

W1 = nd.random_normal(shape=(num_inputs, num_hidden1), scale=weight_scale)
b1 = nd.zeros(num_hidden1)

W2 = nd.random_normal(shape=(num_hidden1, num_hidden2), scale=weight_scale)
b2 = nd.zeros(num_hidden2)

W3 = nd.random_normal(shape=(num_hidden2, num_outputs), scale=weight_scale)
b3 = nd.zeros(num_outputs)

params = [W1, b1, W2, b2, W3, b3]
for param in params:
    param.attach_grad()


def dropout(X, drop_probability):
    keep_probability = 1 - drop_probability
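# Hedged completion (assumption): the dropout body above is truncated here; a standard
# inverted-dropout implementation in the style of the Gluon tutorials would continue
# roughly as the hypothetical dropout_sketch below.
from mxnet import nd

def dropout_sketch(X, drop_probability):
    keep_probability = 1 - drop_probability
    assert 0 <= keep_probability <= 1
    if keep_probability == 0:
        return X.zeros_like()
    mask = nd.random.uniform(0, 1.0, X.shape, ctx=X.context) < keep_probability
    # scale kept activations so the expected activation is unchanged
    return mask * X * (1.0 / keep_probability)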
def reset(self): """Resets the iterator to the beginning of the data.""" if self.first_reset == 1: print("first reset") #all_layers = self.mx_model.symbol.get_internals() # print('all_layers: ',all_layers) if self.model_t is None: vec = self.mx_pretrained.split(',') assert len(vec) > 1 prefix = vec[0] epoch = int(vec[1]) print('loading', prefix, epoch) sym, arg_params, aux_params = mx.model.load_checkpoint( prefix, epoch) all_layers = sym.get_internals() print('all_layers:', all_layers) sym = all_layers['blockgrad1_output'] self.model_t = mx.mod.Module(symbol=sym, context=self.ctx) self.model_t.bind(data_shapes=self.provide_data_mining, label_shapes=self.provide_label_mining) self.model_t.set_params(arg_params, aux_params) ba = 0 tag = [] data = nd.zeros(self.provide_data_mining[0][1]) label = nd.zeros(self.provide_label_mining[0][1]) outfilew = os.path.join(self.bin_dir, "%d_noiselist.txt" % (self.save)) with open(outfilew, 'w') as fp: while True: bb = min(ba + self.batch_size_mining, len(self.oseq)) print("start bb,ba", ba, bb) if ba >= bb: break for i in xrange(ba, bb): _idx = self.oseq[i] s = self.imgrec.read_idx(_idx) header, img = recordio.unpack(s) img = self.imdecode(img) data[i - ba][:] = self.postprocess_data(img) label0 = header.label if not isinstance(label0, numbers.Number): label0 = label0[0] # print('label0', label0) label[i - ba][:] = label0 tag.append((int(label0), _idx)) db = mx.io.DataBatch(data=(data, ), label=(label, )) self.model_t.forward(db, is_train=False) net_out = self.model_t.get_outputs() net_P = mx.nd.softmax(net_out[0], axis=1) net_P = net_P.asnumpy() for ii in range(bb - ba): #print('label:',label[ii]) #print('tag:',tag[ii][0]) P = net_P[ii] #print(P) #print(max(P)) if max(P) < self.threshold: line = '%d %d %s %s\n' % (tag[ii][0], tag[ii][1], max(P), P[tag[ii][0]]) fp.write(line) else: self.seq.append(tag[ii][1]) tag = [] ba = bb self.save += 1 print("Initialize done: ", len(self.oseq), len(self.seq), len(self.oseq) - len(self.seq)) self.first_reset += 1 else: print('call reset()') self.cur = 0 if self.shuffle: random.shuffle(self.seq) self.first_reset += 1
def data_iter():
    idx = list(range(num_example))
    random.shuffle(idx)
    for i in range(0, num_example, batch_size):
        j = nd.array(idx[i:min(i + batch_size, num_example)])
        yield nd.take(X, j), nd.take(Y, j)

for data, label in data_iter():  # data: input features; label: true values
    print(data, label)
    break

# 3. Initialize the model
w = nd.random_normal(shape=(num_inputs, 1))
b = nd.zeros(1)
params = [w, b]
for param in params:
    param.attach_grad()

# 4. Define the model
def net(X):
    return nd.dot(X, w) + b

print(net(data))

# 5. Loss function
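# Hedged sketch (assumption): the squared loss and SGD update that usually complete this
# linear-regression walkthrough after the "5. Loss function" step above.
def square_loss(yhat, y):
    # reshape y to yhat's shape to avoid accidental broadcasting
    return (yhat - y.reshape(yhat.shape)) ** 2

def sgd(params, lr):
    for param in params:
        param[:] = param - lr * param.grad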
def bind(modQ, data_shapes, label_shapes=None, for_training=True, inputs_need_grad=False, force_rebind=False, shared_module=None, grad_req='write'): if force_rebind: modQ._reset_bind() if modQ.binded: modQ.logger.warning('Already binded, ignoring bind()') return modQ.for_training = for_training modQ.inputs_need_grad = inputs_need_grad modQ.binded = True modQ._grad_req = grad_req if not for_training: assert not inputs_need_grad else: pass # this is not True, as some module might not contains a loss function # that consumes the labels # assert label_shapes is not None modQ._data_shapes, modQ._label_shapes = _parse_data_desc( modQ.data_names, modQ.label_names, data_shapes, label_shapes) if shared_module is not None: assert isinstance(shared_module, Module) and \ shared_module.binded and shared_module.params_initialized shared_group = shared_module._exec_group else: shared_group = None modQ._exec_group = DataParallelExecutorGroup( modQ._symbol, modQ._context, modQ._work_load_list, modQ._data_shapes, modQ._label_shapes, modQ._param_names, for_training, inputs_need_grad, shared_group, logger=modQ.logger, fixed_param_names=modQ._fixed_param_names, grad_req=grad_req, state_names=modQ._state_names) modQ._total_exec_bytes = modQ._exec_group._total_exec_bytes if shared_module is not None: modQ.params_initialized = True modQ._arg_params = shared_module._arg_params modQ._aux_params = shared_module._aux_params elif modQ.params_initialized: # if the parameters are already initialized, we are re-binding # so automatically copy the already initialized params modQ._exec_group.set_params(modQ._arg_params, modQ._aux_params) else: assert modQ._arg_params is None and modQ._aux_params is None param_arrays = [ nd.zeros(x[0].shape, dtype=x[0].dtype, ctx=x[0][0].context) for x in modQ._exec_group.param_arrays ] modQ._arg_params = { name: arr for name, arr in zip(modQ._param_names, param_arrays) } aux_arrays = [ nd.zeros(x[0].shape, dtype=x[0].dtype, ctx=x[0][0].context) for x in modQ._exec_group.aux_arrays ] modQ._aux_params = { name: arr for name, arr in zip(modQ._aux_names, aux_arrays) } if shared_module is not None and shared_module.optimizer_initialized: modQ.borrow_optimizer(shared_module)
def main(): parser = argparse.ArgumentParser(description='Script to test the trained network on a game.') parser.add_argument('-r', '--rom', required=False, type=str, default=os.path.join('arena', 'games', 'roms', 'breakout.bin'), help='Path of the ROM File.') parser.add_argument('-v', '--visualization', required=False, type=int, default=0, help='Visualize the runs.') parser.add_argument('--lr', required=False, type=float, default=0.01, help='Learning rate of the AdaGrad optimizer') parser.add_argument('--eps', required=False, type=float, default=0.01, help='Eps of the AdaGrad optimizer') parser.add_argument('--clip-gradient', required=False, type=float, default=None, help='Clip threshold of the AdaGrad optimizer') parser.add_argument('--double-q', required=False, type=bool, default=False, help='Use Double DQN') parser.add_argument('--wd', required=False, type=float, default=0.0, help='Weight of the L2 Regularizer') parser.add_argument('-c', '--ctx', required=False, type=str, default='gpu', help='Running Context. E.g `-c gpu` or `-c gpu1` or `-c cpu`') parser.add_argument('-d', '--dir-path', required=False, type=str, default='', help='Saving directory of model files.') parser.add_argument('--start-eps', required=False, type=float, default=1.0, help='Eps of the epsilon-greedy policy at the beginning') parser.add_argument('--replay-start-size', required=False, type=int, default=50000, help='The step that the training starts') parser.add_argument('--kvstore-update-period', required=False, type=int, default=1, help='The period that the worker updates the parameters from the sever') parser.add_argument('--kv-type', required=False, type=str, default=None, help='type of kvstore, default will not use kvstore, could also be dist_async') args, unknown = parser.parse_known_args() if args.dir_path == '': rom_name = os.path.splitext(os.path.basename(args.rom))[0] args.dir_path = 'dqn-%s' % rom_name ctx = re.findall('([a-z]+)(\d*)', args.ctx) ctx = [(device, int(num)) if len(num) >0 else (device, 0) for device, num in ctx] replay_start_size = args.replay_start_size max_start_nullops = 30 replay_memory_size = 1000000 history_length = 4 rows = 84 cols = 84 q_ctx = mx.Context(*ctx[0]) game = AtariGame(rom_path=args.rom, resize_mode='scale', replay_start_size=replay_start_size, resized_rows=rows, resized_cols=cols, max_null_op=max_start_nullops, replay_memory_size=replay_memory_size, display_screen=args.visualization, history_length=history_length) ##RUN NATURE freeze_interval = 10000 epoch_num = 200 steps_per_epoch = 250000 update_interval = 4 discount = 0.99 eps_start = args.start_eps eps_min = 0.1 eps_decay = (eps_start - 0.1) / 1000000 eps_curr = eps_start freeze_interval /= update_interval minibatch_size = 32 action_num = len(game.action_set) data_shapes = {'data': (minibatch_size, history_length) + (rows, cols), 'dqn_action': (minibatch_size,), 'dqn_reward': (minibatch_size,)} #optimizer = mx.optimizer.create(name='sgd', learning_rate=args.lr,wd=args.wd) optimizer = mx.optimizer.Nop() dqn_output_op = DQNOutputNpyOp() dqn_sym = dqn_sym_nature(action_num, dqn_output_op) qnet = Base(data_shapes=data_shapes, sym=dqn_sym, name='QNet', initializer=DQNInitializer(factor_type="in"), ctx=q_ctx) target_qnet = qnet.copy(name="TargetQNet", ctx=q_ctx) # Create kvstore testShape = (1,1686180*100) testParam = nd.ones(testShape,ctx=q_ctx) testGrad = nd.zeros(testShape,ctx=q_ctx) # Create kvstore if args.kv_type != None: kvType = args.kv_type kvStore = kvstore.create(kvType) #Initialize kvstore for idx,v in 
enumerate(qnet.params.values()): kvStore.init(idx,v); # Set optimizer on kvstore kvStore.set_optimizer(optimizer) kvstore_update_period = args.kvstore_update_period else: updater = mx.optimizer.get_updater(optimizer) # if args.kv_type != None: # kvType = args.kv_type # kvStore = kvstore.create(kvType) # kvStore.init(0,testParam) # testOptimizer = mx.optimizer.Nop() # kvStore.set_optimizer(testOptimizer) # kvstore_update_period = args.kvstore_update_period qnet.print_stat() target_qnet.print_stat() # Begin Playing Game training_steps = 0 total_steps = 0 while(1): time_before_wait = time.time() # kvStore.push(0,testGrad,priority=0) # kvStore.pull(0,testParam,priority=0) # testParam.wait_to_read() for paramIndex in range(len(qnet.params)):#range(6):# k=qnet.params.keys()[paramIndex] kvStore.push(paramIndex,qnet.params_grad[k],priority=-paramIndex) kvStore.pull(paramIndex,qnet.params[k],priority=-paramIndex) for v in qnet.params.values(): v.wait_to_read() logging.info("wait time %f" %(time.time()-time_before_wait)) for epoch in xrange(epoch_num): # Run Epoch steps_left = steps_per_epoch episode = 0 epoch_reward = 0 start = time.time() game.start() while steps_left > 0: # Running New Episode episode += 1 episode_loss = 0.0 episode_q_value = 0.0 episode_update_step = 0 episode_action_step = 0 time_episode_start = time.time() game.begin_episode(steps_left) while not game.episode_terminate: # 1. We need to choose a new action based on the current game status if game.state_enabled and game.replay_memory.sample_enabled: do_exploration = (npy_rng.rand() < eps_curr) eps_curr = max(eps_curr - eps_decay, eps_min) if do_exploration: action = npy_rng.randint(action_num) else: # TODO Here we can in fact play multiple gaming instances simultaneously and make actions for each # We can simply stack the current_state() of gaming instances and give prediction for all of them # We need to wait after calling calc_score(.), which makes the program slow # TODO Profiling the speed of this part! current_state = game.current_state() state = nd.array(current_state.reshape((1,) + current_state.shape), ctx=q_ctx) / float(255.0) qval_npy = qnet.forward(batch_size=1, data=state)[0].asnumpy() action = numpy.argmax(qval_npy) episode_q_value += qval_npy[0, action] episode_action_step += 1 else: action = npy_rng.randint(action_num) # 2. Play the game for a single mega-step (Inside the game, the action may be repeated for several times) game.play(action) total_steps += 1 # 3. 
Update our Q network if we can start sampling from the replay memory # Also, we update every `update_interval` if total_steps % update_interval == 0 and game.replay_memory.sample_enabled: # 3.1 Draw sample from the replay_memory training_steps += 1 episode_update_step += 1 states, actions, rewards, next_states, terminate_flags \ = game.replay_memory.sample(batch_size=minibatch_size) states = nd.array(states, ctx=q_ctx) / float(255.0) next_states = nd.array(next_states, ctx=q_ctx) / float(255.0) actions = nd.array(actions, ctx=q_ctx) rewards = nd.array(rewards, ctx=q_ctx) terminate_flags = nd.array(terminate_flags, ctx=q_ctx) # 3.2 Use the target network to compute the scores and # get the corresponding target rewards if not args.double_q: target_qval = target_qnet.forward(batch_size=minibatch_size, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(target_qval))\ * (1.0 - terminate_flags) * discount else: target_qval = target_qnet.forward(batch_size=minibatch_size, data=next_states)[0] qval = qnet.forward(batch_size=minibatch_size, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(qval))\ * (1.0 - terminate_flags) * discount outputs = qnet.forward(batch_size=minibatch_size,is_train=True, data=states, dqn_action=actions, dqn_reward=target_rewards) qnet.backward(batch_size=minibatch_size) nd.waitall() time_before_update = time.time() if args.kv_type != None: if total_steps % kvstore_update_period == 0: update_to_kvstore(kvStore,qnet.params,qnet.params_grad) else: qnet.update(updater=updater) logging.info("update time %f" %(time.time()-time_before_update)) time_before_wait = time.time() nd.waitall() logging.info("wait time %f" %(time.time()-time_before_wait)) '''nd.waitall() time_before_wait = time.time() kvStore.push(0,testGrad,priority=0) kvStore.pull(0,testParam,priority=0) nd.waitall() logging.info("wait time %f" %(time.time()-time_before_wait))''' # 3.3 Calculate Loss diff = nd.abs(nd.choose_element_0index(outputs[0], actions) - target_rewards) quadratic_part = nd.clip(diff, -1, 1) loss = (0.5 * nd.sum(nd.square(quadratic_part)) + nd.sum(diff - quadratic_part)).asscalar() episode_loss += loss # 3.3 Update the target network every freeze_interval # (We can do annealing instead of hard copy) if training_steps % freeze_interval == 0: qnet.copy_params_to(target_qnet) steps_left -= game.episode_step time_episode_end = time.time() # Update the statistics epoch_reward += game.episode_reward info_str = "Epoch:%d, Episode:%d, Steps Left:%d/%d, Reward:%f, fps:%f, Exploration:%f" \ % (epoch, episode, steps_left, steps_per_epoch, game.episode_reward, game.episode_step / (time_episode_end - time_episode_start), eps_curr) if episode_update_step > 0: info_str += ", Avg Loss:%f/%d" % (episode_loss / episode_update_step, episode_update_step) if episode_action_step > 0: info_str += ", Avg Q Value:%f/%d" % (episode_q_value / episode_action_step, episode_action_step) logging.info(info_str) end = time.time() fps = steps_per_epoch / (end - start) qnet.save_params(dir_path=args.dir_path, epoch=epoch) logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d" % (epoch, fps, epoch_reward / float(episode), episode))
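# The loss bookkeeping above is a Huber-style loss: quadratic for errors inside
# [-1, 1], linear beyond that. A standalone sketch on made-up Q-values, using the
# same (legacy) ndarray operators as the training loop:
from mxnet import ndarray as nd

q_values = nd.array([[0.1, 0.9, 0.3, 0.2],
                     [0.5, 0.4, 0.8, 0.1],
                     [0.2, 0.3, 0.1, 0.7]])   # batch of 3 states, 4 actions
actions = nd.array([1, 2, 3])                 # action chosen for each state
target_rewards = nd.array([1.5, 0.6, 0.9])    # TD targets

diff = nd.abs(nd.choose_element_0index(q_values, actions) - target_rewards)
quadratic_part = nd.clip(diff, -1, 1)
loss = (0.5 * nd.sum(nd.square(quadratic_part)) + nd.sum(diff - quadratic_part)).asscalar()
print(loss)  # 0.22 for these toy numbers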
def bind(self, data_shapes, label_shapes=None, for_training=True, inputs_need_grad=False, force_rebind=False, shared_module=None, grad_req='write'): """Binds the symbols to construct executors. This is necessary before one can perform computation with the module. Parameters ---------- data_shapes : list of (str, tuple) Typically is ``data_iter.provide_data``. label_shapes : list of (str, tuple) Typically is ``data_iter.provide_label``. for_training : bool Default is ``True``. Whether the executors should be bound for training. inputs_need_grad : bool Default is ``False``. Whether the gradients to the input data need to be computed. Typically this is not needed. But this might be needed when implementing composition of modules. force_rebind : bool Default is ``False``. This function does nothing if the executors are already bound. But with this ``True``, the executors will be forced to rebind. shared_module : Module Default is ``None``. This is used in bucketing. When not ``None``, the shared module essentially corresponds to a different bucket -- a module with different symbol but with the same sets of parameters (e.g. unrolled RNNs with different lengths). """ # force rebinding is typically used when one want to switch from # training to prediction phase. if force_rebind: self._reset_bind() if self.binded: self.logger.warning('Already bound, ignoring bind()') return self.for_training = for_training self.inputs_need_grad = inputs_need_grad self.binded = True self._grad_req = grad_req if not for_training: assert not inputs_need_grad else: pass # this is not True, as some module might not contains a loss function # that consumes the labels # assert label_shapes is not None self._data_shapes, self._label_shapes = _parse_data_desc( self.data_names, self.label_names, data_shapes, label_shapes) if shared_module is not None: assert isinstance(shared_module, Module) and \ shared_module.binded and shared_module.params_initialized shared_group = shared_module._exec_group assert len(shared_group.execs) >= len(self._context) else: shared_group = None self._exec_group = DataParallelExecutorGroup( self._symbol, self._context, self._work_load_list, self._data_shapes, self._label_shapes, self._param_names, for_training, inputs_need_grad, shared_group, logger=self.logger, fixed_param_names=self._fixed_param_names, grad_req=grad_req, group2ctxs=self._group2ctxs, state_names=self._state_names) self._total_exec_bytes = self._exec_group._total_exec_bytes if shared_module is not None: self.params_initialized = True self._arg_params = shared_module._arg_params self._aux_params = shared_module._aux_params elif self.params_initialized: # if the parameters are already initialized, we are re-binding # so automatically copy the already initialized params self._exec_group.set_params(self._arg_params, self._aux_params) else: assert self._arg_params is None and self._aux_params is None param_arrays = [ zeros(shape=x[0].shape, dtype=x[0].dtype, stype=x[0].stype) for x in self._exec_group.param_arrays ] self._arg_params = { name: arr for name, arr in zip(self._param_names, param_arrays) } aux_arrays = [ zeros(x[0].shape, dtype=x[0].dtype) for x in self._exec_group.aux_arrays ] self._aux_params = { name: arr for name, arr in zip(self._aux_names, aux_arrays) } if shared_module is not None and shared_module.optimizer_initialized: self.borrow_optimizer(shared_module)
def get_vector(self, i, cnt): i = int(i) vec = nd.zeros((1, cnt)) vec[0][i] = 1 return vec
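# For reference, nd.one_hot produces the same (1, cnt) one-hot row as get_vector
# above; a small sketch assuming a reasonably recent MXNet:
from mxnet import ndarray as nd

def get_vector_builtin(i, cnt):
    # row vector of length cnt with a single 1 at position i
    return nd.one_hot(nd.array([int(i)]), cnt)

print(get_vector_builtin(2, 5))  # [[0. 0. 1. 0. 0.]]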
import time import os, sys import mxnet as mx from mxnet import ndarray as nd from mxnet import autograd from mxnet import gluon import random from data_utils import * mx.random.seed(1) random.seed(1) try: ctx = mx.gpu() _ = nd.zeros((1, ), ctx=ctx) except: ctx = mx.cpu() print('CPU or GPU? : ', ctx) # zero mean and unit variance, as it makes the training process easier def Normolise(data): data_array = np.array(data) data_array_shape = data_array.shape[0] return pd.DataFrame( (data_array - np.mean(data_array, axis=1).reshape(data_array_shape, -1)) / np.std(data_array, axis=1).reshape(data_array_shape, -1), index=data.index)
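# A quick check of the row-wise normalisation helper on a toy DataFrame; numpy and
# pandas are assumed to come in through the wildcard import from data_utils above.
import numpy as np
import pandas as pd

toy = pd.DataFrame([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]], index=['a', 'b'])
normed = Normolise(toy)
print(normed)               # every row now has zero mean and unit variance
print(normed.mean(axis=1))  # ~0 for both rows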
def exchange_rate_model(epoch=1000, time_step=28, day=7, normalization_factor=100, save_period=1000, load_period=1000, learning_rate=0.001, ctx=mx.gpu(0)): ''' 28 time x 1 day ''' #network parameter normalization_factor = normalization_factor time_step = time_step # 28 step day = day # 1 day num_hidden = 300 training, test = JPY_to_KRW(time_step, day, normalization_factor) path = "weights/GRUCell_weights-{}.params".format(load_period) model = GRUCell(num_hidden, day) model.hybridize() # weight initialization if os.path.exists(path): print("loading weights") model.load_params(filename=path, ctx=ctx) # weights load else: print("initializing weights") model.collect_params().initialize(mx.init.Normal(sigma=0.01), ctx=ctx) # weights initialization trainer = gluon.Trainer(model.collect_params(), "rmsprop", {"learning_rate": learning_rate}) for i in tqdm(range(1, epoch + 1, 1)): for data, label in training: states = [nd.zeros(shape=(1, num_hidden), ctx=ctx)] data = data.as_in_context(ctx) label = label.as_in_context(ctx) data = data.reshape(shape=(-1, time_step, day)) data = nd.transpose(data=data, axes=(1, 0, 2)) loss = 0 with autograd.record(): for j in range(time_step): outputs, states = model(data[j], states) loss = loss + gluon.loss.L2Loss()( outputs, label[j].reshape(shape=outputs.shape)) loss.backward() trainer.step(batch_size=1) cost = nd.mean(loss).asscalar() print(" epoch : {} , last batch cost : {}".format(i, cost)) #weight_save if i % save_period == 0: if not os.path.exists("weights"): os.makedirs("weights") print("saving weights") model.save_params("weights/GRUCell_weights-{}.params".format(i)) prediction(test, time_step, day, normalization_factor, num_hidden, model, ctx)
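# The reshape/transpose above turns each flattened sample into a step-major
# sequence so the Python loop can feed one time step at a time; a toy illustration
# of that layout with the same time_step/day sizes:
from mxnet import ndarray as nd

time_step, day = 28, 7
batch = nd.arange(time_step * day).reshape((1, time_step * day))  # one flattened sample
seq = batch.reshape((-1, time_step, day))   # (batch=1, time_step, day)
seq = nd.transpose(seq, axes=(1, 0, 2))     # (time_step, batch, day)
print(seq.shape)     # (28, 1, 7)
print(seq[0].shape)  # (1, 7): the slice handed to the recurrent cell at step j=0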
def generate_learned_samples(self): ''' Draw and generate data. Returns: `Tuple` data. The shape is ... - `mxnet.ndarray` of observed data points in training. - `mxnet.ndarray` of supervised data in training. - `mxnet.ndarray` of observed data points in test. - `mxnet.ndarray` of supervised data in test. ''' for _ in range(self.iter_n): training_batch_arr, test_batch_arr = None, None training_label_arr, test_label_arr = None, None for batch_size in range(self.batch_size): dir_key = np.random.randint(low=0, high=len(self.__training_file_path_list)) training_one_hot_arr = nd.zeros((1, len(self.__training_file_path_list)), ctx=self.__ctx) training_one_hot_arr[0, dir_key] = 1 training_file_path_list = self.__split_at_intervals( self.__training_file_path_list[dir_key], start_pos=0, seq_interval=self.__at_intervals ) training_data_arr, test_data_arr = None, None training_file_key = np.random.randint( low=0, high=len(training_file_path_list) - self.__seq_len ) test_dir_key = np.random.randint(low=0, high=len(self.__test_file_path_list)) test_one_hot_arr = nd.zeros((1, len(self.__test_file_path_list)), ctx=self.__ctx) test_one_hot_arr[0, test_dir_key] = 1 test_file_path_list = self.__split_at_intervals( self.__test_file_path_list[test_dir_key], start_pos=0, seq_interval=self.__at_intervals ) test_file_key = np.random.randint( low=0, high=len(test_file_path_list) - self.__seq_len ) for seq in range(self.__seq_len): seq_training_batch_arr = self.__image_extractor.extract( path=training_file_path_list[training_file_key+seq], ) seq_training_batch_arr = self.pre_normalize(seq_training_batch_arr) seq_training_batch_arr = nd.expand_dims(seq_training_batch_arr, axis=0) seq_test_batch_arr = self.__image_extractor.extract( path=test_file_path_list[test_file_key+seq], ) seq_test_batch_arr = self.pre_normalize(seq_test_batch_arr) seq_test_batch_arr = nd.expand_dims(seq_test_batch_arr, axis=0) if training_data_arr is not None: training_data_arr = nd.concat(training_data_arr, seq_training_batch_arr, dim=0) else: training_data_arr = seq_training_batch_arr if test_data_arr is not None: test_data_arr = nd.concat(test_data_arr, seq_test_batch_arr, dim=0) else: test_data_arr = seq_test_batch_arr training_data_arr = nd.expand_dims(training_data_arr, axis=0) test_data_arr = nd.expand_dims(test_data_arr, axis=0) if training_batch_arr is not None: training_batch_arr = nd.concat(training_batch_arr, training_data_arr, dim=0) else: training_batch_arr = training_data_arr if test_batch_arr is not None: test_batch_arr = nd.concat(test_batch_arr, test_data_arr, dim=0) else: test_batch_arr = test_data_arr if training_label_arr is not None: training_label_arr = nd.concat(training_label_arr, training_one_hot_arr, dim=0) else: training_label_arr = training_one_hot_arr if test_label_arr is not None: test_label_arr = nd.concat(test_label_arr, test_one_hot_arr, dim=0) else: test_label_arr = test_one_hot_arr if self.__noiseable_data is not None: training_batch_arr = self.__noiseable_data.noise(training_batch_arr) yield training_batch_arr, training_label_arr, test_batch_arr, test_label_arr
# trainer for the generator and the discriminator trainerG = gluon.Trainer(netG.collect_params(), 'adam', { 'learning_rate': lr, 'beta1': beta1 }) trainerD = gluon.Trainer(netD.collect_params(), 'adam', { 'learning_rate': lr, 'beta1': beta1 }) from datetime import datetime import time import logging real_label = nd.ones((batch_size, ), ctx=ctx) fake_label = nd.zeros((batch_size, ), ctx=ctx) def facc(label, pred): pred = pred.ravel() label = label.ravel() return ((pred > 0.5) == label).mean() metric = mx.metric.CustomMetric(facc) stamp = datetime.now().strftime('%Y_%m_%d-%H_%M') logging.basicConfig(level=logging.DEBUG) for epoch in range(epochs): tic = time.time()
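# facc above thresholds the discriminator's sigmoid outputs at 0.5 and measures
# agreement with the 0/1 labels; a tiny check on made-up values:
import numpy as np

label = np.array([1.0, 0.0, 1.0, 0.0])
pred = np.array([0.8, 0.3, 0.4, 0.6])
print(facc(label, pred))  # 0.5: two of the four thresholded predictions match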
def select_triplets(self): self.seq = [] while len(self.seq)<self.seq_min_size: self.time_reset() embeddings = None bag_size = self.triplet_bag_size batch_size = self.batch_size #data = np.zeros( (bag_size,)+self.data_shape ) #label = np.zeros( (bag_size,) ) tag = [] #idx = np.zeros( (bag_size,) ) print('eval %d images..'%bag_size, self.triplet_cur) print('triplet time stat', self.times) if self.triplet_cur+bag_size>len(self.triplet_seq): self.triplet_reset() #bag_size = min(bag_size, len(self.triplet_seq)) print('eval %d images..'%bag_size, self.triplet_cur) self.times[0] += self.time_elapsed() self.time_reset() #print(data.shape) data = nd.zeros( self.provide_data[0][1] ) label = None if self.provide_label is not None: label = nd.zeros( self.provide_label[0][1] ) ba = 0 while True: bb = min(ba+batch_size, bag_size) if ba>=bb: break _count = bb-ba #data = nd.zeros( (_count,)+self.data_shape ) #_batch = self.data_iter.next() #_data = _batch.data[0].asnumpy() #print(_data.shape) #_label = _batch.label[0].asnumpy() #data[ba:bb,:,:,:] = _data #label[ba:bb] = _label for i in range(ba, bb): #print(ba, bb, self.triplet_cur, i, len(self.triplet_seq)) _idx = self.triplet_seq[i+self.triplet_cur] s = self.imgrec.read_idx(_idx) header, img = recordio.unpack(s) img = self.imdecode(img) data[i-ba][:] = self.postprocess_data(img) _label = header.label if not isinstance(_label, numbers.Number): _label = _label[0] if label is not None: label[i-ba][:] = _label tag.append( ( int(_label), _idx) ) #idx[i] = _idx db = mx.io.DataBatch(data=(data,)) self.mx_model.forward(db, is_train=False) net_out = self.mx_model.get_outputs() #print('eval for selecting triplets',ba,bb) #print(net_out) #print(len(net_out)) #print(net_out[0].asnumpy()) net_out = net_out[0].asnumpy() #print(net_out) #print('net_out', net_out.shape) if embeddings is None: embeddings = np.zeros( (bag_size, net_out.shape[1])) embeddings[ba:bb,:] = net_out ba = bb assert len(tag)==bag_size self.triplet_cur+=bag_size embeddings = sklearn.preprocessing.normalize(embeddings) self.times[1] += self.time_elapsed() self.time_reset() nrof_images_per_class = [1] for i in range(1, bag_size): if tag[i][0]==tag[i-1][0]: nrof_images_per_class[-1]+=1 else: nrof_images_per_class.append(1) triplets = self.pick_triplets(embeddings, nrof_images_per_class) # shape=(T,3) print('found triplets', len(triplets)) ba = 0 while True: bb = ba+self.per_batch_size//3 if bb>len(triplets): break _triplets = triplets[ba:bb] for i in range(3): for triplet in _triplets: _pos = triplet[i] _idx = tag[_pos][1] self.seq.append(_idx) ba = bb self.times[2] += self.time_elapsed()
def zeros(shape, dtype, ctx): return nd.zeros(shape, dtype=dtype, ctx=ctx)
def hard_mining_reset(self): #import faiss from annoy import AnnoyIndex data = nd.zeros( self.provide_data[0][1] ) label = nd.zeros( self.provide_label[0][1] ) #label = np.zeros( self.provide_label[0][1] ) X = None ba = 0 batch_num = 0 while ba<len(self.oseq): batch_num+=1 if batch_num%10==0: print('loading batch',batch_num, ba) bb = min(ba+self.batch_size, len(self.oseq)) _count = bb-ba for i in range(_count): idx = self.oseq[i+ba] s = self.imgrec.read_idx(idx) header, img = recordio.unpack(s) img = self.imdecode(img) data[i][:] = self.postprocess_data(img) label[i][:] = header.label db = mx.io.DataBatch(data=(data,self.data_extra), label=(label,)) self.mx_model.forward(db, is_train=False) net_out = self.mx_model.get_outputs() embedding = net_out[0].asnumpy() nembedding = sklearn.preprocessing.normalize(embedding) if _count<self.batch_size: nembedding = nembedding[0:_count,:] if X is None: X = np.zeros( (len(self.id2range), nembedding.shape[1]), dtype=np.float32 ) nplabel = label.asnumpy() for i in range(_count): ilabel = int(nplabel[i]) #print(ilabel, ilabel.__class__) X[ilabel] += nembedding[i] ba = bb X = sklearn.preprocessing.normalize(X) d = X.shape[1] t = AnnoyIndex(d, metric='euclidean') for i in range(X.shape[0]): t.add_item(i, X[i]) print('start to build index') t.build(20) print(X.shape) k = self.per_identities self.seq = [] for i in range(X.shape[0]): nnlist = t.get_nns_by_item(i, k) assert nnlist[0]==i for _label in nnlist: assert _label<len(self.id2range) _id = self.header0[0]+_label v = self.id2range[_id] _list = range(*v) if len(_list)<self.images_per_identity: random.shuffle(_list) else: _list = np.random.choice(_list, self.images_per_identity, replace=False) for i in range(self.images_per_identity): _idx = _list[i%len(_list)] self.seq.append(_idx)
def hybrid_forward(self, F, X): # (batch_size, num_channel_prev, h, w, dim_vector) # -->(batch_size,num_capsule_prev,1,1,dim_vector) X = X.reshape((0, -1, 1, 1, 0)) self.num_capsules_prev = X.shape[1] self.batch_size = X.shape[0] # (batch_size,num_capsule_prev,out_channels,1,dim_vector) X_tile = nd.tile(X, reps=(1, 1, self.out_channels, 1, 1)) if self.routing_weight_initial: self.routing_weight = nd.random_normal( shape=(1, self.num_capsules_prev, self.out_channels, self.dim_input_vector, self.dim_vector), name='routing_weight').as_in_context(mx.gpu(0)) self.routing_weight_initial = False # (batch_size,num_capsule_prev,out_channels,dim_input_vector,dim_vector) # (64, 1152, 10, 8, 16) W_tile = nd.tile(self.routing_weight, reps=(self.batch_size, 1, 1, 1, 1)) linear_combination_3d = nd.batch_dot( X_tile.reshape((-1, X_tile.shape[-2], X_tile.shape[-1])), W_tile.reshape((-1, W_tile.shape[-2], W_tile.shape[-1]))) # (64, 1152, 10, 1, 16) linear_combination = linear_combination_3d.reshape( (self.batch_size, self.num_capsules_prev, self.out_channels, 1, self.dim_vector)) # b_ij (1, 1152, 10, 1, 1) priors = nd.zeros((1, self.num_capsules_prev, self.out_channels, 1, 1)) ############################################################################ ## Rounting ## ############################################################################ for iter_index in range(self.num_routing_iter): # NOTE: RoutingAlgorithm-line 4 # b_ij (1, 1152, 10, 1, 1) softmax_prior = nd.softmax(priors, axis=2) # on num_capsule dimension # NOTE: RoutingAlgorithm-line 5 # (64, 1152, 10, 1, 16) # output = torch.mul(softmax_prior, linear_combination) output = softmax_prior * linear_combination # (64, 1, 10, 1, 16) output_sum = output.sum(axis=1, keepdims=True) # s_J # NOTE: RoutingAlgorithm-line 6 # (64, 1, 10, 1, 16) output_squashed = self.squash(output_sum) # v_J # NOTE: RoutingAlgorithm-line 7 # (64, 1152, 10, 1, 16) output_tile = nd.tile(output_squashed, reps=(1, self.num_capsules_prev, 1, 1, 1)) # (64, 1152, 10, 1, 16) x (64, 1152, 10, 1, 16) (transpose on last two axis) # ==> (64, 1152, 10, 1, 1) U_times_v = nd.batch_dot(linear_combination.reshape( (-1, 1, self.dim_vector)), output_tile.reshape( (-1, 1, self.dim_vector)), transpose_b=True) U_times_v = U_times_v.reshape( (self.batch_size, self.num_capsules_prev, self.out_channels, 1, 1)) priors = priors + U_times_v.sum(axis=0).expand_dims(axis=0) return output_squashed # v_J
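# The routing loop calls self.squash, which is not shown in this snippet; a typical
# CapsNet squash nonlinearity looks like the sketch below (an assumption, not
# necessarily the author's exact implementation):
from mxnet import ndarray as nd

def squash(s, axis=-1, eps=1e-9):
    # v = (|s|^2 / (1 + |s|^2)) * (s / |s|): short vectors shrink toward 0,
    # long vectors approach unit length
    squared_norm = nd.sum(nd.square(s), axis=axis, keepdims=True)
    scale = squared_norm / (1.0 + squared_norm) / nd.sqrt(squared_norm + eps)
    return scale * s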
def train(pool_size, epochs, train_data, val_data, ctx, netEn, netDe, netD, netD2, trainerEn, trainerDe, trainerD, trainerD2, lambda1, batch_size, expname, append=True, useAE = False): tp_file = open(expname + "_trainloss.txt", "w") tp_file.close() text_file = open(expname + "_validtest.txt", "w") text_file.close() #netGT, netDT, _, _ = set_test_network(opt.depth, ctx, opt.lr, opt.beta1,opt.ndf, opt.ngf, opt.append) GAN_loss = gluon.loss.SigmoidBinaryCrossEntropyLoss() L1_loss = gluon.loss.L2Loss() image_pool = imagePool.ImagePool(pool_size) metric = mx.metric.CustomMetric(facc) metric2 = mx.metric.CustomMetric(facc) metricMSE = mx.metric.MSE() loss_rec_G = [] loss_rec_D = [] loss_rec_R = [] acc_rec = [] acc2_rec = [] loss_rec_D2 = [] loss_rec_G2 = [] lr = 0.002 #mu = nd.random_normal(loc=0, scale=1, shape=(batch_size/2,64,1,1), ctx=ctx) mu = nd.random.uniform(low= -1, high=1, shape=(batch_size/2,64,1,1),ctx=ctx) #mu = nd.zeros((batch_size/2,64,1,1),ctx=ctx) sigma = nd.ones((64,1,1),ctx=ctx) mu.attach_grad() sigma.attach_grad() stamp = datetime.now().strftime('%Y_%m_%d-%H_%M') logging.basicConfig(level=logging.DEBUG) for epoch in range(epochs): tic = time.time() btic = time.time() train_data.reset() iter = 0 #print('learning rate : '+str(trainerD.learning_rate )) for batch in train_data: ############################ # (1) Update D network: maximize log(D(x, y)) + log(1 - D(x, G(x, z))) ########################### real_in = batch.data[0].as_in_context(ctx) real_out = batch.data[1].as_in_context(ctx) fake_latent= netEn(real_in) #real_latent = nd.random_normal(loc=0, scale=1, shape=fake_latent.shape, ctx=ctx) real_latent = nd.multiply(nd.power(sigma,2),nd.random_normal(loc=0, scale=1, shape=fake_latent.shape, ctx=ctx)) #nd.random.uniform( low=-1, high=1, shape=fake_latent.shape, ctx=ctx) fake_out = netDe(fake_latent) fake_concat = nd.concat(real_in, fake_out, dim=1) if append else fake_out with autograd.record(): # Train with fake image # Use image pooling to utilize history imagesi output = netD(fake_concat) output2 = netD2(fake_latent) fake_label = nd.zeros(output.shape, ctx=ctx) fake_latent_label = nd.zeros(output2.shape, ctx=ctx) noiseshape = (fake_latent.shape[0]/2,fake_latent.shape[1],fake_latent.shape[2],fake_latent.shape[3]) eps2 = nd.multiply(nd.power(sigma,2),nd.random_normal(loc=0, scale=1, shape=fake_latent.shape, ctx=ctx)) #eps2 = nd.random_normal(loc=0, scale=sigma.asscalar(), shape=fake_latent.shape, ctx=ctx) # #eps = nd.random.uniform( low=-1, high=1, shape=noiseshape, ctx=ctx) rec_output = netD(netDe(eps2)) errD_fake = GAN_loss(rec_output, fake_label) errD_fake2 = GAN_loss(output, fake_label) errD2_fake = GAN_loss(output2, fake_latent_label) metric.update([fake_label, ], [output, ]) metric2.update([fake_latent_label, ], [output2, ]) real_concat = nd.concat(real_in, real_out, dim=1) if append else real_out output = netD(real_concat) output2 = netD2(real_latent) real_label = nd.ones(output.shape, ctx=ctx) real_latent_label = nd.ones(output2.shape, ctx=ctx) errD_real = GAN_loss(output, real_label) errD2_real = GAN_loss(output2, real_latent_label) #errD = (errD_real + 0.5*(errD_fake+errD_fake2)) * 0.5 errD = (errD_real + errD_fake) * 0.5 errD2 = (errD2_real + errD2_fake) * 0.5 totalerrD = errD+errD2 totalerrD.backward() #errD2.backward() metric.update([real_label, ], [output, ]) metric2.update([real_latent_label, ], [output2, ]) trainerD.step(batch.data[0].shape[0]) trainerD2.step(batch.data[0].shape[0]) ############################ # (2) Update G network: maximize log(D(x, G(x, 
z))) - lambda1 * L1(y, G(x, z)) ########################### with autograd.record(): sh = fake_latent.shape eps2 = nd.multiply(nd.power(sigma,2),nd.random_normal(loc=0, scale=1, shape=fake_latent.shape, ctx=ctx)) #eps2 = nd.random_normal(loc=0, scale=sigma.asscalar(), shape=fake_latent.shape, ctx=ctx) # #eps = nd.random.uniform( low=-1, high=1, shape=noiseshape, ctx=ctx) rec_output = netD(netDe(eps2)) fake_latent= (netEn(real_in)) output2 = netD2(fake_latent) fake_out = netDe(fake_latent) fake_concat = nd.concat(real_in, fake_out, dim=1) if append else fake_out output = netD(fake_concat) real_label = nd.ones(output.shape, ctx=ctx) real_latent_label = nd.ones(output2.shape, ctx=ctx) errG2 = GAN_loss(rec_output, real_label) errR = L1_loss(real_out, fake_out) * lambda1 errG = 10.0*GAN_loss(output2, real_latent_label)+errG2+errR+nd.mean(nd.power(sigma,2)) errG.backward() if epoch>50: sigma -= lr / sigma.shape[0] * sigma.grad print(sigma) trainerDe.step(batch.data[0].shape[0]) trainerEn.step(batch.data[0].shape[0]) loss_rec_G2.append(nd.mean(errG2).asscalar()) loss_rec_G.append(nd.mean(nd.mean(errG)).asscalar()-nd.mean(errG2).asscalar()-nd.mean(errR).asscalar()) loss_rec_D.append(nd.mean(errD).asscalar()) loss_rec_R.append(nd.mean(errR).asscalar()) loss_rec_D2.append(nd.mean(errD2).asscalar()) _, acc2 = metric2.get() name, acc = metric.get() acc_rec.append(acc) acc2_rec.append(acc2) # Print log infomation every ten batches if iter % 10 == 0: _, acc2 = metric2.get() name, acc = metric.get() logging.info('speed: {} samples/s'.format(batch_size / (time.time() - btic))) #print(errD) logging.info('discriminator loss = %f, D2 loss = %f, generator loss = %f, G2 loss = %f, binary training acc = %f , D2 acc = %f, reconstruction error= %f at iter %d epoch %d' % (nd.mean(errD).asscalar(),nd.mean(errD2).asscalar(), nd.mean(errG-errG2-errR).asscalar(),nd.mean(errG2).asscalar(), acc,acc2,nd.mean(errR).asscalar() ,iter, epoch)) iter = iter + 1 btic = time.time() name, acc = metric.get() _, acc2 = metric2.get() tp_file = open(expname + "_trainloss.txt", "a") tp_file.write(str(nd.mean(errG2).asscalar()) + " " + str( nd.mean(nd.mean(errG)).asscalar() - nd.mean(errG2).asscalar() - nd.mean(errR).asscalar()) + " " + str( nd.mean(errD).asscalar()) + " " + str(nd.mean(errD2).asscalar()) + " " + str(nd.mean(errR).asscalar()) +" "+str(acc) + " " + str(acc2)+"\n") tp_file.close() metric.reset() metric2.reset() train_data.reset() logging.info('\nbinary training acc at epoch %d: %s=%f' % (epoch, name, acc)) logging.info('time: %f' % (time.time() - tic)) if epoch%10 ==0:# and epoch>0: text_file = open(expname + "_validtest.txt", "a") filename = "checkpoints/"+expname+"_"+str(epoch)+"_D.params" netD.save_params(filename) filename = "checkpoints/"+expname+"_"+str(epoch)+"_D2.params" netD2.save_params(filename) filename = "checkpoints/"+expname+"_"+str(epoch)+"_En.params" netEn.save_params(filename) filename = "checkpoints/"+expname+"_"+str(epoch)+"_De.params" netDe.save_params(filename) fake_img1 = nd.concat(real_in[0],real_out[0], fake_out[0], dim=1) fake_img2 = nd.concat(real_in[1],real_out[1], fake_out[1], dim=1) fake_img3 = nd.concat(real_in[2],real_out[2], fake_out[2], dim=1) fake_img4 = nd.concat(real_in[3],real_out[3], fake_out[3], dim=1) val_data.reset() text_file = open(expname + "_validtest.txt", "a") for vbatch in val_data: real_in = vbatch.data[0].as_in_context(ctx) real_out = vbatch.data[1].as_in_context(ctx) fake_latent= netEn(real_in) y = netDe(fake_latent) fake_out = y metricMSE.update([fake_out, ], [real_out, 
]) _, acc2 = metricMSE.get() text_file.write("%s %s %s\n" % (str(epoch), nd.mean(errR).asscalar(), str(acc2))) metricMSE.reset() images = netDe(eps2) fake_img1T = nd.concat(images[0],images[1], images[2], dim=1) fake_img2T = nd.concat(images[3],images[4], images[5], dim=1) fake_img3T = nd.concat(images[6],images[7], images[8], dim=1) fake_img = nd.concat(fake_img1T,fake_img2T, fake_img3T,dim=2) visual.visualize(fake_img) plt.savefig('outputs/'+expname+'_fakes_'+str(epoch)+'.png') text_file.close() # Do 10 iterations of sampler update fake_img1T = nd.concat(real_in[0],real_out[0], fake_out[0], dim=1) fake_img2T = nd.concat(real_in[1],real_out[1], fake_out[1], dim=1) fake_img3T = nd.concat(real_in[2],real_out[2], fake_out[2], dim=1) #fake_img4T = nd.concat(real_in[3],real_out[3], fake_out[3], dim=1) fake_img = nd.concat(fake_img1,fake_img2, fake_img3,fake_img1T,fake_img2T, fake_img3T,dim=2) visual.visualize(fake_img) plt.savefig('outputs/'+expname+'_'+str(epoch)+'.png') '''if epoch > 100: for ep2 in range(10): with autograd.record(): #eps = nd.random_normal(loc=0, scale=1, shape=noiseshape, ctx=ctx) # eps = nd.random.uniform( low=-1, high=1, shape=noiseshape, ctx=ctx) eps2 = nd.random_normal(loc=0, scale=0.02, shape=noiseshape, ctx=ctx) eps2 = nd.tanh(eps2*sigma+mu) eps2 = nd.concat(eps,eps2,dim=0) rec_output = netD(netDe(eps2)) fake_label = nd.zeros(rec_output.shape, ctx=ctx) errGS = GAN_loss(rec_output, fake_label) errGS.backward() mu -= lr / mu.shape[0] * mu.grad sigma -= lr / sigma.shape[0] * sigma.grad print('mu ' + str(mu[0,0,0,0].asnumpy())+ ' sigma '+ str(sigma[0,0,0,0].asnumpy())) ''' images = netDe(eps2) fake_img1T = nd.concat(images[0],images[1], images[2], dim=1) fake_img2T = nd.concat(images[3],images[4], images[5], dim=1) fake_img3T = nd.concat(images[6],images[7], images[8], dim=1) fake_img = nd.concat(fake_img1T,fake_img2T, fake_img3T,dim=2) visual.visualize(fake_img) plt.savefig('outputs/'+expname+'_fakespost_'+str(epoch)+'.png') return([loss_rec_D,loss_rec_G, loss_rec_R, acc_rec, loss_rec_D2, loss_rec_G2, acc2_rec])
def build_graph(self): import mxnet as mx from mxnet import ndarray as nd from mxnet import gluon, autograd import dgl user_ids = list(self.users.index) product_ids = list(self.products.index) user_ids_invmap = {id_: i for i, id_ in enumerate(user_ids)} product_ids_invmap = {id_: i for i, id_ in enumerate(product_ids)} self.user_ids = user_ids self.product_ids = product_ids self.user_ids_invmap = user_ids_invmap self.product_ids_invmap = product_ids_invmap g = dgl.DGLGraph(multigraph=True) g.add_nodes(len(user_ids) + len(product_ids)) # node type node_type = nd.zeros(g.number_of_nodes(), dtype='float32') node_type[:len(user_ids)] = 1 g.ndata['type'] = node_type # user features print('Adding user features...') for user_column in self.users.columns: udata = nd.zeros(g.number_of_nodes(), dtype='int64') # 0 for padding udata[:len(user_ids)] = \ nd.from_numpy(self.users[user_column].cat.codes.values.astype('int64') + 1) g.ndata[user_column] = udata # product genre print('Adding product features...') product_genres = nd.from_numpy( self.products[self.genres].values.copy().astype('float32')) g.ndata['genre'] = nd.zeros((g.number_of_nodes(), len(self.genres))) g.ndata['genre'][len(user_ids):len(user_ids) + len(product_ids)] = product_genres # product year if 'year' in self.products.columns: g.ndata['year'] = nd.zeros(g.number_of_nodes(), dtype='int64') # 0 for padding g.ndata['year'][len(user_ids):len(user_ids) + len(product_ids)] = \ nd.from_numpy(self.products['year'].cat.codes.values.astype('int64') + 1) ''' # product title print('Parsing title...') nlp = stanfordnlp.Pipeline(use_gpu=False, processors='tokenize,lemma') vocab = set() title_words = [] for t in tqdm.tqdm(self.products['title'].values): doc = nlp(t) words = set() for s in doc.sentences: words.update(w.lemma.lower() for w in s.words if not re.fullmatch(r'['+string.punctuation+']+', w.lemma)) vocab.update(words) title_words.append(words) vocab = list(vocab) vocab_invmap = {w: i for i, w in enumerate(vocab)} # bag-of-words g.ndata['title'] = nd.zeros((g.number_of_nodes(), len(vocab))) for i, tw in enumerate(tqdm.tqdm(title_words)): g.ndata['title'][len(user_ids) + i, [vocab_invmap[w] for w in tw]] = 1 self.vocab = vocab self.vocab_invmap = vocab_invmap ''' rating_user_vertices = [ user_ids_invmap[id_] for id_ in self.ratings['user_id'].values ] rating_product_vertices = [ product_ids_invmap[id_] + len(user_ids) for id_ in self.ratings['product_id'].values ] self.rating_user_vertices = rating_user_vertices self.rating_product_vertices = rating_product_vertices g.add_edges(rating_user_vertices, rating_product_vertices, data={ 'inv': nd.zeros(self.ratings.shape[0], dtype='int32'), 'rating': nd.from_numpy( self.ratings['rating'].values.astype('float32')) }) g.add_edges(rating_product_vertices, rating_user_vertices, data={ 'inv': nd.ones(self.ratings.shape[0], dtype='int32'), 'rating': nd.from_numpy( self.ratings['rating'].values.astype('float32')) }) self.g = g g.readonly()
def main(opt): ctx = mx.gpu() if opt.use_gpu else mx.cpu() testclasspaths = [] testclasslabels = [] print('loading test files') filename = '_testlist.txt' with open(opt.dataset + "_" + opt.expname + filename, 'r') as f: for line in f: testclasspaths.append(line.split(' ')[0]) if int(line.split(' ')[1]) == -1: testclasslabels.append(0) else: testclasslabels.append(1) neworder = range(len(testclasslabels)) neworder = shuffle(neworder) c = list(zip(testclasslabels, testclasspaths)) print('shuffling') random.shuffle(c) #testclasslabels, testclasspaths = zip(*c) #testclasslabels = testclasslabels[1:5000] #testclasspaths = testclasspaths[1:5000] ltnt = 512 print('loading pictures') test_data = load_image.load_test_images(testclasspaths, testclasslabels, opt.batch_size, opt.img_wd, opt.img_ht, ctx, opt.noisevar) print('picture loading done') netEn, netDe, netD, netD2, netDS = set_network(opt.depth, ctx, 0, 0, opt.ndf, opt.ngf, opt.append) netEn.load_params('checkpoints/' + opt.expname + '_' + str(opt.epochs) + '_En.params', ctx=ctx) netDe.load_params('checkpoints/' + opt.expname + '_' + str(opt.epochs) + '_De.params', ctx=ctx) netD.load_params('checkpoints/' + opt.expname + '_' + str(opt.epochs) + '_D.params', ctx=ctx) netD2.load_params('checkpoints/' + opt.expname + '_' + str(opt.epochs) + '_D2.params', ctx=ctx) netDS.load_params('checkpoints/' + opt.expname + '_' + str(opt.epochs) + '_SD.params', ctx=ctx) print('Model loading done') lbllist = [] scorelist1 = [] scorelist2 = [] scorelist3 = [] scorelist4 = [] test_data.reset() count = 0 for batch in (test_data): count += 1 print(str(count)) #, end="\r") real_in = batch.data[0].as_in_context(ctx) real_out = batch.data[1].as_in_context(ctx) lbls = batch.label[0].as_in_context(ctx) code = netEn((real_out)) code = code + nd.random.normal( loc=0, scale=0.002, shape=code.shape, ctx=ctx) outnn = (netDe(code)) out_concat = nd.concat(real_out, outnn, dim=1) if opt.append else outnn output4 = nd.mean((netD(out_concat)), (1, 3, 2)).asnumpy() code = netEn(real_in) #code=codet+nd.random.normal(loc=0, scale=0.0000001, shape=code.shape,ctx=ctx) #code2=codet+nd.random.normal(loc=0, scale=0.000001, shape=code.shape,ctx=ctx) #eq_code = heq(code.asnumpy(),2) #code = nd.array(eq_code, ctx=ctx) out = netDe(code) #out2 = netDe(code2) out_concat = nd.concat(real_in, out, dim=1) if opt.append else out output = netD(out_concat) #Denoised image output3 = nd.mean((out - real_out)**2, (1, 3, 2)).asnumpy() #denoised-real output = nd.mean(output, (1, 3, 2)).asnumpy() out_concat = nd.concat(real_out, real_out, dim=1) if opt.append else real_out output2 = netDS(netDe(code)) #Image with no noise output2 = nd.mean(output2, (1, 3, 2)).asnumpy() lbllist = lbllist + list(lbls.asnumpy()) scorelist1 = scorelist1 + list(output) scorelist2 = scorelist2 + list(output2) scorelist3 = scorelist3 + list(output3) scorelist4 = scorelist4 + list(output4) fake_img1 = nd.concat(real_in[0], real_out[0], out[0], outnn[0], dim=1) fake_img2 = nd.concat(real_in[1], real_out[1], out[1], outnn[1], dim=1) fake_img3 = nd.concat(real_in[2], real_out[2], out[2], outnn[2], dim=1) fake_img4 = nd.concat(real_in[3], real_out[3], out[3], outnn[3], dim=1) fake_img = nd.concat(fake_img1, fake_img2, fake_img3, fake_img4, dim=2) #print(np.shape(fake_img)) visual.visualize(fake_img) plt.savefig('outputs/T_' + opt.expname + '_' + str(count) + '.png') if not opt.isvalidation: fpr, tpr, _ = roc_curve(lbllist, scorelist1, 1) roc_auc1 = auc(fpr, tpr) fpr, tpr, _ = roc_curve(lbllist, scorelist2, 1) roc_auc2 = auc(fpr, 
tpr) fpr, tpr, _ = roc_curve(lbllist, scorelist3, 1) roc_auc3 = auc(fpr, tpr) fpr, tpr, _ = roc_curve(lbllist, scorelist4, 1) roc_auc4 = auc(fpr, tpr) plt.gcf().clear() plt.clf() sns.set(color_codes=True) posscores = [ scorelist3[i] for i, v in enumerate(lbllist) if int(v) == 1 ] negscores = [ scorelist3[i] for i, v in enumerate(lbllist) if int(v) == 0 ] #sns.distplot(posscores, hist=False, label="Known Classes" ,rug=True) sns.kdeplot(posscores, label="Known Classes") sns.kdeplot(negscores, label="Unknown Classes") #plt.hold() #sns.distplot(negscores, hist=False, label = "Unknown Classes", rug=True); plt.legend() plt.savefig('outputs/matdist_' + opt.expname + '_.png') plt.gcf().clear() inputT = nd.zeros((ltnt, ltnt, 1, 1), ctx=ctx) for i in range(0, ltnt): inputT[i, i, :, :] = -1 out = netDe(inputT) count = 0 for i in range(int(math.ceil(math.sqrt(ltnt)))): for j in range(int(math.ceil(math.sqrt(ltnt)))): if count < ltnt: plt.subplot(math.ceil(math.sqrt(ltnt)), math.ceil(math.sqrt(ltnt)), count + 1) plt.imshow( ((out[count].asnumpy().transpose(1, 2, 0) + 1.0) * 127.5).astype(np.uint8)) plt.axis('off') count += 1 plt.savefig('outputs/atoms_' + opt.expname + '_.png') plt.gcf().clear() plt.clf() return ([roc_auc1, roc_auc2, roc_auc3, roc_auc4]) else: return ([0, 0, 0, 0]) fakecode = nd.random_normal(loc=0, scale=1, shape=(16, 4096, 1, 1), ctx=ctx) out = netDe(fakecode) fake_img1 = nd.concat(out[0], out[1], out[2], out[3], dim=1) fake_img2 = nd.concat(out[7], out[6], out[5], out[4], dim=1) fake_img3 = nd.concat(out[8], out[9], out[10], out[11], dim=1) fake_img4 = nd.concat(out[15], out[14], out[13], out[12], dim=1) fake_img = nd.concat(fake_img1, fake_img2, fake_img3, fake_img4, dim=2) #print(np.shape(fake_img)) visual.visualize(fake_img) plt.savefig('outputs/fakes_' + opt.expname + '_.png')
def set_ctx(self): try: self.__ctx = mx.gpu() _ = nd.zeros(shape=(1, ), ctx=self.__ctx) except: self.__ctx = mx.cpu()
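# An equivalent device pick without the exception probe, assuming an MXNet version
# that provides mx.context.num_gpus() (1.2 and later); note the try/except form
# above also covers CPU-only builds where GPU allocation itself fails.
import mxnet as mx

ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()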
idx = list(range(num_examples)) # shuffle the index order random.shuffle(idx) for i in range(0, num_examples, batch_size): j = nd.array(idx[i:min(i + batch_size, num_examples)]) yield nd.take(X, j), nd.take(y, j) # for data, label in data_iter(): # print(data, label) # break # initialize the parameters w = nd.random_normal(shape=(num_inputs, 1)) b = nd.zeros([ 1, ]) params = [w, b] # attach gradients to the parameters for param in params: param.attach_grad() # define the model def net(X): return nd.dot(X, w) + b # loss function def sequare_loss(yhat, y):
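# The snippet breaks off at the loss definition. In the from-scratch linear
# regression tutorials this code follows, the squared loss and the SGD step are
# typically written as below (a sketch under that assumption; the names are not
# taken from the original):
def square_loss(yhat, y):
    # reshape y so broadcasting cannot silently produce an (n, n) matrix
    return (yhat - y.reshape(yhat.shape)) ** 2 / 2

def sgd(params, lr):
    # in-place update so the attached gradient buffers stay valid
    for param in params:
        param[:] = param - lr * param.grad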
def train(): image_pool = ImagePool(pool_size) metric = mx.metric.CustomMetric(facc) stamp = datetime.now().strftime('%Y_%m_%d-%H_%M') logging.basicConfig(level=logging.DEBUG) for epoch in range(epochs): tic = time.time() btic = time.time() train_data.reset() iter = 0 for batch in train_data: ############################ # (1) Update D network: maximize log(D(x, y)) + log(1 - D(x, G(x, z))) ########################### real_in = batch.data[0].as_in_context(ctx) real_out = batch.data[1].as_in_context(ctx) fake_out = netG(real_in) fake_concat = image_pool.query(nd.concat(real_in, fake_out, dim=1)) with autograd.record(): # Train with fake image # Use image pooling to utilize history images output = netD(fake_concat) fake_label = nd.zeros(output.shape, ctx=ctx) errD_fake = GAN_loss(output, fake_label) metric.update([ fake_label, ], [ output, ]) # Train with real image real_concat = nd.concat(real_in, real_out, dim=1) output = netD(real_concat) real_label = nd.ones(output.shape, ctx=ctx) errD_real = GAN_loss(output, real_label) errD = (errD_real + errD_fake) * 0.5 errD.backward() metric.update([ real_label, ], [ output, ]) trainerD.step(batch.data[0].shape[0]) ############################ # (2) Update G network: maximize log(D(x, G(x, z))) - lambda1 * L1(y, G(x, z)) ########################### with autograd.record(): fake_out = netG(real_in) fake_concat = nd.concat(real_in, fake_out, dim=1) output = netD(fake_concat) real_label = nd.ones(output.shape, ctx=ctx) errG = GAN_loss( output, real_label) + L1_loss(real_out, fake_out) * lambda1 errG.backward() trainerG.step(batch.data[0].shape[0]) # Print log information every ten batches if iter % 10 == 0: name, acc = metric.get() logging.info('speed: {} samples/s'.format( batch_size / (time.time() - btic))) logging.info( 'discriminator loss = %f, generator loss = %f, binary training acc = %f at iter %d epoch %d' % (nd.mean(errD).asscalar(), nd.mean(errG).asscalar(), acc, iter, epoch)) iter = iter + 1 btic = time.time() name, acc = metric.get() metric.reset() logging.info('\nbinary training acc at epoch %d: %s=%f' % (epoch, name, acc)) logging.info('time: %f' % (time.time() - tic)) # Visualize one generated image for each epoch fake_img = fake_out[0] visualize(fake_img)
from chapter1 import c1_utils from mxnet import ndarray as nd from mxnet import gluon from mxnet import autograd as autograd batch_size = 256 train_data, test_data = c1_utils.load_data_fashion_mnist(batch_size) num_inputs = 28 * 28 num_outputs = 10 num_hidden = 784 weight_scale = .05 W1 = nd.random_normal(shape=(num_inputs, num_hidden), scale=weight_scale) b1 = nd.zeros(num_hidden) W2 = nd.random_normal(shape=(num_hidden, num_outputs), scale=weight_scale) b2 = nd.zeros(num_outputs) params = [W1, b1, W2, b2] for param in params: param.attach_grad() def relu(X): return nd.maximum(X, 0) def net(X):
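# The definition of net is cut off above; a common continuation for this
# from-scratch MLP (an assumption, not the original author's code) flattens the
# input, applies the ReLU hidden layer, and returns the output logits:
def net(X):
    X = X.reshape((-1, num_inputs))
    h1 = relu(nd.dot(X, W1) + b1)
    return nd.dot(h1, W2) + b2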