        self.plot('loss', 'fold-%d-rep-%d-epoch-%d' % (fold, rep, epoch),
                  'Class loss', epoch * 984 + step, loss)


############################################################################
############################################################################
# Change the default progress callback to the Visdom plotter.

openml.extensions.mxnet.config.active = VisdomLinePlotter()

############################################################################
############################################################################
# A sequential network used for classification on the MNIST dataset.

with mxnet.Context(mxnet.gpu(0)):
    model = mxnet.gluon.nn.HybridSequential()
    with model.name_scope():
        model.add(
            mxnet.gluon.nn.HybridLambda(
                lambda F, x: F.reshape(x, shape=(-1, 1, 28, 28))),
            mxnet.gluon.nn.BatchNorm(),
            mxnet.gluon.nn.Conv2D(channels=32, kernel_size=5),
            mxnet.gluon.nn.LeakyReLU(alpha=1e-2),
            mxnet.gluon.nn.MaxPool2D(),
            mxnet.gluon.nn.Conv2D(channels=64, kernel_size=5),
            mxnet.gluon.nn.LeakyReLU(alpha=1e-2),
            mxnet.gluon.nn.MaxPool2D(),
            mxnet.gluon.nn.Flatten(),
            mxnet.gluon.nn.Dense(units=256),
            mxnet.gluon.nn.LeakyReLU(alpha=1e-2),
            mxnet.gluon.nn.Dropout(rate=0.2),
            mxnet.gluon.nn.Dense(units=10))

############################################################################
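# A quick smoke test of the layer stack above (illustrative sketch, not part of
# the original example; the (1, 784) dummy shape is an assumption). The leading
# HybridLambda reshapes the flat input to (-1, 1, 28, 28) before the convolutions.
model.initialize(mxnet.init.Xavier(), ctx=mxnet.gpu(0))
model.hybridize()
dummy = mxnet.nd.zeros((1, 784), ctx=mxnet.gpu(0))
print(model(dummy).shape)  # expected: (1, 10)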
def train(self, train_file: List[str], dev_file: List[str], save_dir, pretrained_embeddings_file=None, min_occur_count=2, lstm_layers=3, word_dims=100, tag_dims=100, dropout_emb=0.33, lstm_hiddens=400, dropout_lstm_input=0.33, dropout_lstm_hidden=0.33, mlp_arc_size=500, mlp_rel_size=100, dropout_mlp=0.33, learning_rate=1e-3, decay=.75, decay_steps=5000, beta_1=.9, beta_2=.9, epsilon=1e-12, num_buckets_train=40, num_buckets_valid=10, train_iters=50000, train_batch_size=5000, dev_batch_size=5000, validate_every=100, save_after=5000, root='root', transfer=None, bert_path=None, debug=False): """Train a deep biaffine dependency parser Parameters ---------- train_file : str path to training set dev_file : str path to dev set save_dir : str a directory for saving model and related meta-data pretrained_embeddings_file : str pre-trained embeddings file, plain text format min_occur_count : int threshold of rare words, which will be replaced with UNKs, lstm_layers : int layers of lstm word_dims : int dimension of word embedding tag_dims : int dimension of tag embedding dropout_emb : float word dropout lstm_hiddens : int size of lstm hidden states dropout_lstm_input : int dropout on x in variational RNN dropout_lstm_hidden : int dropout on h in variational RNN mlp_arc_size : int output size of MLP for arc feature extraction mlp_rel_size : int output size of MLP for rel feature extraction dropout_mlp : float dropout on the output of LSTM learning_rate : float learning rate decay : float see ExponentialScheduler decay_steps : int see ExponentialScheduler beta_1 : float see ExponentialScheduler beta_2 : float see ExponentialScheduler epsilon : float see ExponentialScheduler num_buckets_train : int number of buckets for training data set num_buckets_valid : int number of buckets for dev data set train_iters : int training iterations train_batch_size : int training batch size dev_batch_size : int test batch size validate_every : int validate on dev set every such number of batches save_after : int skip saving model in early epochs root : str token for ROOT debug : bool debug mode Returns ------- DepParser parser itself """ logger = init_logger(save_dir) config = _Config(train_file, dev_file, None, save_dir, pretrained_embeddings_file, min_occur_count, lstm_layers, word_dims, tag_dims, dropout_emb, lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden, mlp_arc_size, mlp_rel_size, dropout_mlp, learning_rate, decay, decay_steps, beta_1, beta_2, epsilon, num_buckets_train, num_buckets_valid, None, train_iters, train_batch_size, 0, debug) if transfer: with open(os.path.join(transfer, 'vocab.pkl'), 'rb') as f: self._vocab = pickle.load(f) self._vocab.append( ParserVocabulary( train_file[-1], pretrained_embeddings_file, min_occur_count, root=root, shared_vocab=self._vocab[0], )) else: for t, d in zip(train_file, dev_file): self._vocab.append( ParserVocabulary( t, pretrained_embeddings_file, min_occur_count, root=root, shared_vocab=None if len(self._vocab) == 0 else self._vocab[0], )) with open(config.save_vocab_path, 'wb') as f: pickle.dump(self._vocab, f) for voc in self._vocab: voc.log_info(logger) with mx.Context(mxnet_prefer_gpu()): data_loaders = [ DataLoader(t, num_buckets_train, vocab, bert=bert_path[0] if bert_path else None) for t, vocab in zip(train_file, self._vocab) ] config.bert_dim = data_loaders[0].bert_dim config.save() self._parser = parser = self.cls_parser( self._vocab, word_dims, tag_dims, dropout_emb, lstm_layers, lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden, mlp_arc_size, 
mlp_rel_size, dropout_mlp, bert=data_loaders[0].bert_dim, debug=debug) if transfer: parser.transfer = True parser.fill(transfer) parser.initialize() scheduler = ExponentialScheduler(learning_rate, decay, decay_steps) optimizer = mx.optimizer.Adam(learning_rate, beta_1, beta_2, epsilon, lr_scheduler=scheduler) trainer = gluon.Trainer(parser.collect_params(), optimizer=optimizer) global_step = 0 best_LF = 0. batch_id = 0 epoch = 1 total_epoch = math.ceil(train_iters / validate_every) logger.info("Epoch {} out of {}".format(epoch, total_epoch)) bar = Progbar(target=min(validate_every, train_iters)) gs = [ dl.get_batches(batch_size=train_batch_size, shuffle=False) for dl in data_loaders ] while global_step < train_iters: arcs_tasks = [] rels_tasks = [] bert_tasks = [] for g in gs: words, bert, tags, arcs, rels = next( g, (None, None, None, None, None)) if words is None: break arcs_tasks.append(arcs) rels_tasks.append(rels) bert_tasks.append(bert) if words is None: gs = [ dl.get_batches(batch_size=train_batch_size, shuffle=False) for dl in data_loaders ] continue with autograd.record(): arc_accuracy, rel_accuracy, loss = parser.forward( words, bert, tags, arcs_tasks, rels_tasks) loss_value = loss.asscalar() loss.backward() trainer.step(train_batch_size) batch_id += 1 try: bar.update(batch_id, exact=[("LR", rel_accuracy, 2), ("loss", loss_value)]) except OverflowError: pass # sometimes loss can be 0 or infinity, crashes the bar global_step += 1 if global_step % validate_every == 0: batch_id = 0 UF, LF, speed = evaluate_joint_official_script( parser, self._vocab, num_buckets_valid, dev_batch_size, dev_file, os.path.join(save_dir, 'dev.predict.conllu'), bert=None if bert_path is None else bert_path[1]) score_str = '' for dataset, lf in zip(dev_file, LF): dataset = os.path.basename(dataset).replace( '.conllu', '') lf = lf * 100 score_str += '{}={:0.1f} '.format(dataset, lf) if transfer: LF = LF[-1] * 100 else: LF = sum(LF) / len(LF) * 100 score_str += '{}={:0.1f} '.format('avg', LF) logger.info(score_str + '%d sents/s' % (speed)) epoch += 1 bar = Progbar(target=min(validate_every, train_iters - global_step)) if global_step > save_after and LF > best_LF: logger.info('- new best score!') best_LF = LF parser.save(config.save_model_path) if global_step < train_iters: logger.info("Epoch {} out of {}".format( epoch, total_epoch)) # When validate_every is too big if not os.path.isfile(config.save_model_path) or best_LF == 0: parser.save(config.save_model_path) return self
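# Hypothetical usage sketch: the docstring above says train() returns the
# DepParser itself, but the surrounding class and its constructor are not shown
# here, so the name `DepParser` and the file paths below are placeholders only.
# parser = DepParser()
# parser.train(train_file=['data/ptb/train.conllu'],
#              dev_file=['data/ptb/dev.conllu'],
#              save_dir='data/model/biaffine',
#              pretrained_embeddings_file='data/embedding/glove.6B.100d.txt')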
model_path = 'data/model/wsj-pos-bebu-ge-fe4'
columns = {0: 'text', 1: 'pos'}
corpus = NLPTaskDataFetcher.fetch_column_corpus('data/wsj-pos', columns,
                                                train_file='train.tsv',
                                                test_file='test.tsv',
                                                dev_file='dev.tsv')
# 2. what tag do we want to predict?
tag_type = 'pos'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
with mx.Context(mxnet_prefer_gpu()):
    embedding_types = [
        WordEmbeddings('data/embedding/glove/glove.6B.100d.txt'),
        BERTEmbeddings([
            'data/embedding/bert_base_sum/wsj.train.bert',
            'data/embedding/bert_base_sum/wsj.dev.bert',
            'data/embedding/bert_base_sum/wsj.test.bert'
        ]),
        CharLMEmbeddings('data/model/lm-news-forward'),
        CharLMEmbeddings('data/model/lm-news-backward'),
    ]
    embeddings = StackedEmbeddings(embeddings=embedding_types)

    # 5. initialize sequence tagger
    tagger = SequenceTagger(hidden_size=256,
def main(): parser = argparse.ArgumentParser( description='Script to test the trained network on a game.') parser.add_argument('-r', '--rom', required=False, type=str, default=os.path.join('arena', 'games', 'roms', 'breakout.bin'), help='Path of the ROM File.') parser.add_argument('-v', '--visualization', required=False, type=int, default=0, help='Visualize the runs.') parser.add_argument('--lr', required=False, type=float, default=0.01, help='Learning rate of the AdaGrad optimizer') parser.add_argument('--eps', required=False, type=float, default=0.01, help='Eps of the AdaGrad optimizer') parser.add_argument('--clip-gradient', required=False, type=float, default=None, help='Clip threshold of the AdaGrad optimizer') parser.add_argument('--double-q', required=False, type=bool, default=False, help='Use Double DQN') parser.add_argument('--wd', required=False, type=float, default=0.0, help='Weight of the L2 Regularizer') parser.add_argument( '-c', '--ctx', required=False, type=str, default='gpu', help='Running Context. E.g `-c gpu` or `-c gpu1` or `-c cpu`') parser.add_argument('-d', '--dir-path', required=False, type=str, default='', help='Saving directory of model files.') parser.add_argument( '--start-eps', required=False, type=float, default=1.0, help='Eps of the epsilon-greedy policy at the beginning') parser.add_argument('--replay-start-size', required=False, type=int, default=50000, help='The step that the training starts') parser.add_argument( '--kvstore-update-period', required=False, type=int, default=1, help='The period that the worker updates the parameters from the sever' ) parser.add_argument( '--kv-type', required=False, type=str, default=None, help= 'type of kvstore, default will not use kvstore, could also be dist_async' ) parser.add_argument('--optimizer', required=False, type=str, default="adagrad", help='type of optimizer') parser.add_argument('--momentum', required=False, type=float, default=None, help='momentum value') args, unknown = parser.parse_known_args() if args.dir_path == '': rom_name = os.path.splitext(os.path.basename(args.rom))[0] args.dir_path = 'dqn-%s-%de_5' % (rom_name, int(args.lr * 10**5)) ctx = re.findall('([a-z]+)(\d*)', args.ctx) ctx = [(device, int(num)) if len(num) > 0 else (device, 0) for device, num in ctx] replay_start_size = args.replay_start_size max_start_nullops = 30 replay_memory_size = 1000000 history_length = 4 rows = 84 cols = 84 q_ctx = mx.Context(*ctx[0]) game = AtariGame(rom_path=args.rom, resize_mode='scale', replay_start_size=replay_start_size, resized_rows=rows, resized_cols=cols, max_null_op=max_start_nullops, replay_memory_size=replay_memory_size, display_screen=args.visualization, history_length=history_length) ##RUN NATURE freeze_interval = 10000 epoch_num = 200 steps_per_epoch = 250000 update_interval = 4 discount = 0.99 eps_start = args.start_eps eps_min = 0.1 eps_decay = (eps_start - 0.1) / 1000000 eps_curr = eps_start freeze_interval /= update_interval minibatch_size = 32 action_num = len(game.action_set) data_shapes = { 'data': (minibatch_size, history_length) + (rows, cols), 'dqn_action': (minibatch_size, ), 'dqn_reward': (minibatch_size, ) } dqn_output_op = DQNOutputNpyOp() dqn_sym = dqn_sym_nature(action_num, dqn_output_op) qnet = Base(data_shapes=data_shapes, sym=dqn_sym, name='QNet', initializer=DQNInitializer(factor_type="in"), ctx=q_ctx) target_qnet = qnet.copy(name="TargetQNet", ctx=q_ctx) use_easgd = False if args.optimizer != "easgd": optimizer = mx.optimizer.create(name='adagrad', learning_rate=args.lr, 
eps=args.eps, clip_gradient=args.clip_gradient, rescale_grad=1.0, wd=args.wd) else: use_easgd = True easgd_beta = 0.9 easgd_p = 4 easgd_alpha = easgd_beta / (args.kvstore_update_period * easgd_p) optimizer = mx.optimizer.Easgd(learning_rate=easgd_alpha) easgd_eta = 0.00025 local_optimizer = mx.optimizer.create(name='adagrad', learning_rate=args.lr, eps=args.eps, clip_gradient=args.clip_gradient, rescale_grad=1.0, wd=args.wd) central_weight = OrderedDict([(n, nd.zeros(v.shape, ctx=q_ctx)) for n, v in qnet.params.items()]) if args.momentum != None: easgd_delta = 0.99 velocity = OrderedDict([(n, nd.zeros(v.shape, ctx=q_ctx)) for n, v in qnet.params.items()]) paramsBackup = OrderedDict([(n, nd.zeros(v.shape, ctx=q_ctx)) for n, v in qnet.params.items()]) # Create kvstore if args.kv_type != None: kvType = args.kv_type kvStore = kvstore.create(kvType) #Initialize kvstore for idx, v in enumerate(qnet.params.values()): kvStore.init(idx, v) if use_easgd == False: # Set optimizer on kvstore kvStore.set_optimizer(optimizer) else: # kvStore.send_updater_to_server(easgd_server_update) kvStore.set_optimizer(optimizer) local_updater = mx.optimizer.get_updater(local_optimizer) kvstore_update_period = args.kvstore_update_period args.dir_path = args.dir_path + "-" + str(kvStore.rank) else: updater = mx.optimizer.get_updater(optimizer) qnet.print_stat() target_qnet.print_stat() # Begin Playing Game training_steps = 0 total_steps = 0 for epoch in xrange(epoch_num): # Run Epoch steps_left = steps_per_epoch episode = 0 epoch_reward = 0 start = time.time() game.start() while steps_left > 0: # Running New Episode episode += 1 episode_loss = 0.0 episode_q_value = 0.0 episode_update_step = 0 episode_action_step = 0 time_episode_start = time.time() game.begin_episode(steps_left) while not game.episode_terminate: # 1. We need to choose a new action based on the current game status if game.state_enabled and game.replay_memory.sample_enabled: do_exploration = (npy_rng.rand() < eps_curr) eps_curr = max(eps_curr - eps_decay, eps_min) if do_exploration: action = npy_rng.randint(action_num) else: # TODO Here we can in fact play multiple gaming instances simultaneously and make actions for each # We can simply stack the current_state() of gaming instances and give prediction for all of them # We need to wait after calling calc_score(.), which makes the program slow # TODO Profiling the speed of this part! current_state = game.current_state() state = nd.array( current_state.reshape((1, ) + current_state.shape), ctx=q_ctx) / float(255.0) qval_npy = qnet.forward(batch_size=1, data=state)[0].asnumpy() action = numpy.argmax(qval_npy) episode_q_value += qval_npy[0, action] episode_action_step += 1 else: action = npy_rng.randint(action_num) # 2. Play the game for a single mega-step (Inside the game, the action may be repeated for several times) game.play(action) total_steps += 1 # 3. 
Update our Q network if we can start sampling from the replay memory # Also, we update every `update_interval` if total_steps % update_interval == 0 and game.replay_memory.sample_enabled: # 3.1 Draw sample from the replay_memory training_steps += 1 episode_update_step += 1 states, actions, rewards, next_states, terminate_flags \ = game.replay_memory.sample(batch_size=minibatch_size) states = nd.array(states, ctx=q_ctx) / float(255.0) next_states = nd.array(next_states, ctx=q_ctx) / float(255.0) actions = nd.array(actions, ctx=q_ctx) rewards = nd.array(rewards, ctx=q_ctx) terminate_flags = nd.array(terminate_flags, ctx=q_ctx) # 3.2 Use the target network to compute the scores and # get the corresponding target rewards if not args.double_q: target_qval = target_qnet.forward( batch_size=minibatch_size, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(target_qval))\ * (1.0 - terminate_flags) * discount else: target_qval = target_qnet.forward( batch_size=minibatch_size, data=next_states)[0] qval = qnet.forward(batch_size=minibatch_size, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(qval))\ * (1.0 - terminate_flags) * discount if args.momentum == None: outputs = qnet.forward(batch_size=minibatch_size, is_train=True, data=states, dqn_action=actions, dqn_reward=target_rewards) qnet.backward(batch_size=minibatch_size) if args.kv_type != None: if total_steps % kvstore_update_period == 0: if use_easgd == False: update_to_kvstore(kvStore, qnet.params, qnet.params_grad) else: for paramIndex in range(len(qnet.params)): k = qnet.params.keys()[paramIndex] kvStore.pull(paramIndex, central_weight[k], priority=-paramIndex) qnet.params[k][:] -= easgd_alpha * ( qnet.params[k] - central_weight[k]) kvStore.push(paramIndex, qnet.params[k], priority=-paramIndex) if use_easgd: if args.momentum == None: for paramIndex in range(len(qnet.params)): k = qnet.params.keys()[paramIndex] '''qnet.params[k][:] += -easgd_eta*nd.clip(qnet.params_grad[k], -args.clip_gradient, args.clip_gradient)''' local_updater(index=paramIndex, grad=qnet.params_grad[k], weight=qnet.params[k]) else: for i, k in enumerate(qnet.params.keys()): paramsBackup[k][:] = qnet.params[k] qnet.params[ k][:] += easgd_delta * velocity[k] outputs = qnet.forward( batch_size=minibatch_size, is_train=True, data=states, dqn_action=actions, dqn_reward=target_rewards) qnet.backward(batch_size=minibatch_size) for i, k in enumerate(qnet.params.keys()): velocity[k][:] = easgd_delta * velocity[ k] - args.lr * qnet.params_grad[k] qnet.params[ k][:] = paramsBackup[k] + velocity[ k] - args.wd * qnet.params[k][:] else: qnet.update(updater=updater) # 3.3 Calculate Loss diff = nd.abs( nd.choose_element_0index(outputs[0], actions) - target_rewards) quadratic_part = nd.clip(diff, -1, 1) loss = (0.5 * nd.sum(nd.square(quadratic_part)) + nd.sum(diff - quadratic_part)).asscalar() episode_loss += loss # 3.3 Update the target network every freeze_interval # (We can do annealing instead of hard copy) if training_steps % freeze_interval == 0: qnet.copy_params_to(target_qnet) steps_left -= game.episode_step time_episode_end = time.time() # Update the statistics epoch_reward += game.episode_reward if args.kv_type != None: info_str = "Node[%d]: " % kvStore.rank else: info_str = "" info_str += "Epoch:%d, Episode:%d, Steps Left:%d/%d, Reward:%f, fps:%f, Exploration:%f" \ % (epoch, episode, steps_left, steps_per_epoch, game.episode_reward, game.episode_step / (time_episode_end - 
time_episode_start), eps_curr) if episode_update_step > 0: info_str += ", Avg Loss:%f/%d" % ( episode_loss / episode_update_step, episode_update_step) if episode_action_step > 0: info_str += ", Avg Q Value:%f/%d" % ( episode_q_value / episode_action_step, episode_action_step) logging.info(info_str) end = time.time() fps = steps_per_epoch / (end - start) qnet.save_params(dir_path=args.dir_path, epoch=epoch) logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d" % (epoch, fps, epoch_reward / float(episode), episode))
def parse_ctx(ctx_args):
    ctx = re.findall('([a-z]+)(\d*)', ctx_args)
    ctx = [(device, int(num)) if len(num) > 0 else (device, 0)
           for device, num in ctx]
    ctx = [mx.Context(*ele) for ele in ctx]
    return ctx
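# Illustrative calls (added for clarity): the regex splits a context string into
# (device, index) pairs, defaulting the device index to 0 when it is omitted.
print(parse_ctx('gpu1'))      # -> [gpu(1)]
print(parse_ctx('gpu0,cpu'))  # -> [gpu(0), cpu(0)]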
def train_semimyo_pose_smooth(args): import re if re.search(r'20161223.2.image-latest.trial-\d+.1', args.root): click.echo('deprecated') return if args.root: if args.log: args.log = os.path.join(args.root, args.log) if args.snapshot: args.snapshot = os.path.join(args.root, args.snapshot) with Context(args.log, parallel=False, mxnet_context=mx.Context(mx.gpu(args.gpu[0]))): logger.info('Args:\n{}', pformat(args)) for i in range(args.num_epoch): path = args.snapshot + '-%04d.params' % (i + 1) if os.path.exists(path): logger.info('Found snapshot {}, exit', path) return dataset = get_dataset(args.dataset, **args.dataset_args) get_crossval_data = getattr(dataset, 'get_%s_data' % args.crossval_type.replace('-', '_')) train, val = get_crossval_data( batch_size=args.batch_size, fold=args.fold, preprocess=args.preprocess, num_mini_batch=args.num_mini_batch, balance_gesture=args.balance_gesture, window=args.window ) logger.info('Train samples: {}', train.num_data) logger.info('Val samples: {}', val.num_data) mod = get_module( args.module, network=args.symbol, adabn=args.adabn, num_adabn_epoch=args.num_adabn_epoch, for_training=True, num_eval_epoch=args.num_eval_epoch, snapshot_period=args.snapshot_period, symbol_kargs=dict( stochastic_input=args.stochastic_input, stochastic_net=args.stochastic_net, window=args.window, batch_size=args.batch_size, num_pose=dataset.num_pose, shared_net=args.shared_net, gesture_net=args.gesture_net, pose_net=args.pose_net, pose_head_net=args.pose_head_net, pose_tail_net=args.pose_tail_net, num_gesture=dataset.num_gesture, num_semg_channel=1, num_semg_row=dataset.num_semg_row, num_semg_col=dataset.num_semg_col, dropout=args.dropout, num_mini_batch=args.num_mini_batch, gesture_loss_weight=args.gesture_loss_weight, pose_loss_weight=args.pose_loss_weight, smooth_loss_weight=args.smooth_loss_weight, cudnn_tune=args.cudnn_tune, num_stochastic_sample=args.num_stochastic_sample, ), context=[mx.gpu(i) for i in args.gpu] ) mod.fit( monitor_pattern=args.monitor_pattern, monitor_interval=args.monitor_interval, train_data=train, eval_data=val, num_epoch=args.num_epoch, num_train=train.num_data, batch_size=args.batch_size, lr_step=args.lr_step, lr_factor=args.lr_factor, lr=args.lr, wd=args.wd, snapshot=args.snapshot, params=args.params, ignore_params=args.ignore_params, fix_params=args.fix_params, decay_all=args.decay_all )
        if math.floor(ans) == ans:
            ans = int(ans)
        return ans
    else:
        return default


context = input('Choose a device(c for cpu, number for gpu(n)):')
if context == 'c':
    context = mxnet.cpu(0)
else:
    context = mxnet.gpu(int(context))

# get trainer
with mxnet.Context(context):
    model_num = input('Which model do you want to train?\n' +
                      '1. timeSVD++\t2. v1\t3. v2\t 4. v3\n')
    bin_cnt = input_param('bin_cnt', 30)
    beta = input_param('beta', .4)
    factor_cnt = input_param('factor_cnt', 10)
    batch_size = input_param('batch_size', 40)

    # timeSVD++
    if model_num == '1':
        model = timeSVDpp_batch.TimeSVDpp(nItems, nUsers, nDays,
                                          average_rating, factor_cnt,
                                          bin_cnt, beta, batch_size)
    # # neuralTimeSVD++ v1
    # if model_num == '2':
    #     trainer = v1. \
    #         Trainer(userItems, rating_cnt, test_userItems, test_rating_cnt,
    #                 user_meanday, nItems, nUsers, nDays, average_rating,
def check_quantized_conv(data_shape, kernel, num_filter, pad, stride, no_bias): with mx.Context('gpu', 0): # run fp32 conv data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32') conv2d = mx.sym.Convolution(data=data, kernel=kernel, num_filter=num_filter, pad=pad, stride=stride, no_bias=no_bias, cudnn_off=False, name='conv2d') arg_shapes, _, _ = conv2d.infer_shape(data=data_shape) arg_names = conv2d.list_arguments() conv_exe_fp32 = conv2d.simple_bind(ctx=mx.current_context(), grad_req='null') conv_exe_fp32.arg_dict[arg_names[0]][:] = mx.nd.random.uniform( low=-127.0, high=127.0, shape=data_shape).astype('int32') conv_exe_fp32.arg_dict[arg_names[1]][:] = mx.nd.random.uniform( low=-127.0, high=127.0, shape=arg_shapes[1]).astype('int32') if not no_bias: conv_exe_fp32.arg_dict[arg_names[2]][:] = mx.nd.random.uniform( low=-127.0, high=127.0, shape=arg_shapes[2]).astype('int32') output = conv_exe_fp32.forward()[0] # run quantized conv qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8') qweight = mx.sym.Variable(name='qweight', dtype='int8') min_data = mx.sym.Variable(name='min_data') max_data = mx.sym.Variable(name='max_data') min_weight = mx.sym.Variable(name='min_weight') max_weight = mx.sym.Variable(name='max_weight') quantized_conv2d = mx.sym.contrib.quantized_conv( data=qdata, weight=qweight, min_data=min_data, max_data=max_data, min_weight=min_weight, max_weight=max_weight, kernel=kernel, num_filter=num_filter, pad=pad, stride=stride, no_bias=no_bias) qarg_names = quantized_conv2d.list_arguments() type_dict = None if not no_bias: type_dict = {qarg_names[2]: 'int8'} conv_exe_int8 = quantized_conv2d.simple_bind( ctx=mx.current_context(), type_dict=type_dict, grad_req='null') conv_exe_int8.arg_dict[qarg_names[0]][:] = conv_exe_fp32.arg_dict[ arg_names[0]].astype('int8') conv_exe_int8.arg_dict[qarg_names[1]][:] = conv_exe_fp32.arg_dict[ arg_names[1]].astype('int8') quantized_range = 127.0 if no_bias: conv_exe_int8.arg_dict[qarg_names[2]][:] = -quantized_range conv_exe_int8.arg_dict[qarg_names[3]][:] = quantized_range conv_exe_int8.arg_dict[qarg_names[4]][:] = -quantized_range conv_exe_int8.arg_dict[qarg_names[5]][:] = quantized_range else: conv_exe_int8.arg_dict[ qarg_names[2]][:] = conv_exe_fp32.arg_dict[ arg_names[2]].astype('int8') conv_exe_int8.arg_dict[qarg_names[3]][:] = -quantized_range conv_exe_int8.arg_dict[qarg_names[4]][:] = quantized_range conv_exe_int8.arg_dict[qarg_names[5]][:] = -quantized_range conv_exe_int8.arg_dict[qarg_names[6]][:] = quantized_range conv_exe_int8.arg_dict[qarg_names[7]][:] = -quantized_range conv_exe_int8.arg_dict[qarg_names[8]][:] = quantized_range qoutput, min_range, max_range = conv_exe_int8.forward() if no_bias: assert_almost_equal(output.asnumpy(), qoutput.asnumpy()) else: # with adding bias, accuracy loss should not be greater than one diff = mx.nd.abs(output - qoutput.astype(output.dtype)) cond = mx.nd.lesser(2, diff).sum().asscalar() assert cond == 0
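# Example invocations (a sketch; the shapes below are assumptions chosen to
# exercise both the bias and no-bias paths, in the spirit of MXNet's
# quantization tests): a 4x4x28x28 NCHW batch, 3x3 kernel, 128 filters,
# padding 1, stride 1. Requires a GPU, as does the function itself.
check_quantized_conv((4, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), True)
check_quantized_conv((4, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), False)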
def check_quantized_fc(data_shape, num_hidden, no_bias, flatten=True): with mx.Context('gpu', 0): data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32') fc_fp32 = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, no_bias=no_bias, flatten=flatten) arg_shapes, _, _ = fc_fp32.infer_shape(data=data_shape) arg_names = fc_fp32.list_arguments() fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null') fc_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform( low=-127.0, high=127.0, shape=data_shape).astype('int32') fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform( low=-127.0, high=127.0, shape=arg_shapes[1]).astype('int32') if not no_bias: fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform( low=-127.0, high=127.0, shape=arg_shapes[2]).astype('int32') output = fc_fp32_exe.forward()[0] qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8') fc_int8 = mx.sym.contrib.quantized_fully_connected( data=qdata, num_hidden=num_hidden, no_bias=no_bias, flatten=flatten) qarg_names = fc_int8.list_arguments() type_dict = {qarg_names[1]: 'int8'} if not no_bias: type_dict.update({qarg_names[2]: 'int8'}) fc_int8_exe = fc_int8.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null') fc_int8_exe.arg_dict[qarg_names[0]][:] = fc_fp32_exe.arg_dict[ arg_names[0]].astype('int8') fc_int8_exe.arg_dict[qarg_names[1]][:] = fc_fp32_exe.arg_dict[ arg_names[1]].astype('int8') quantized_range = 127.0 if no_bias: fc_int8_exe.arg_dict[qarg_names[2]][:] = -quantized_range fc_int8_exe.arg_dict[qarg_names[3]][:] = quantized_range fc_int8_exe.arg_dict[qarg_names[4]][:] = -quantized_range fc_int8_exe.arg_dict[qarg_names[5]][:] = quantized_range else: fc_int8_exe.arg_dict[qarg_names[2]][:] = fc_fp32_exe.arg_dict[ arg_names[2]].astype('int8') fc_int8_exe.arg_dict[qarg_names[3]][:] = -quantized_range fc_int8_exe.arg_dict[qarg_names[4]][:] = quantized_range fc_int8_exe.arg_dict[qarg_names[5]][:] = -quantized_range fc_int8_exe.arg_dict[qarg_names[6]][:] = quantized_range fc_int8_exe.arg_dict[qarg_names[7]][:] = -quantized_range fc_int8_exe.arg_dict[qarg_names[8]][:] = quantized_range qoutput, min_range, max_range = fc_int8_exe.forward() if no_bias: assert_almost_equal(output.asnumpy(), qoutput.asnumpy()) else: # with adding bias, accuracy loss should not be greater than one diff = mx.nd.abs(output - qoutput.astype(output.dtype)) cond = mx.nd.lesser(2, diff).sum().asscalar() assert cond == 0
# Since FancyMLP and Sequential are both subclasses of Block, we can nest them.
class NestMLP(nn.Block):
    def __init__(self, **kwargs):
        super(NestMLP, self).__init__(**kwargs)
        self.net = nn.Sequential()
        self.net.add(nn.Dense(64, activation='relu'),
                     nn.Dense(32, activation='relu'))
        self.dense = nn.Dense(16, activation='relu')

    def forward(self, x):
        return self.dense(self.net(x))


if __name__ == '__main__':
    with mx.Context(mx.gpu()):
        x = nd.random.uniform(shape=(2, 20))

        net1 = MySequential()
        net1.add(nn.Dense(256, activation='relu'))
        net1.add(nn.Dense(10))
        net1.initialize()
        print net1(x)

        net2 = FancyMLP()
        net2.initialize()
        print net2(x)

        # Since FancyMLP and Sequential are both subclasses of Block, we can nest them.
        net3 = nn.Sequential()
        net3.add(NestMLP(), nn.Dense(20), FancyMLP())
        net3.initialize()
    def as_mxnet_context(self):
        _logger.debug("typeid:{}, id:{}".format(self.device_typeid,
                                                self.device_id))
        return mxnet.Context(self.devtype2str[self.device_typeid],
                             self.device_id)
def generate_poisoning_fun(index=0, total_round=5): # load dataset, model structure, parameter dev = mx.gpu(0) batch_size = 1 # data_shape = (batch_size, 2352) data_shape = (batch_size, 3, 28, 28) train_iter = mx.io.ImageRecordIter( path_imgrec="data/cifar10/cifar10_train.rec", data_shape=(3, 28, 28), batch_size=batch_size, # mean_img="data/cifar10/mean.bin", rand_crop=True, rand_mirror=True, round_batch=True) val_iter = mx.io.ImageRecordIter( path_imgrec="data/cifar10/cifar10_val.rec", data_shape=(3, 28, 28), batch_size=batch_size, # mean_img="data/cifar10/mean.bin", rand_crop=True, rand_mirror=True, round_batch=True) all_data = [] all_label = [] poisoning_d = [] poisoning_l = [] poisoing_data_list = [] poisoing_label_list = [] for i in range(index): batch_data = copy.deepcopy(train_iter.next()) poisoning_d.append(batch_data.data[0]) poisoning_l.append(batch_data.label[0] + 1) for j in range(index): with mx.Context(dev): # load original model softmax, arg_params, aux_params = mx.model.load_checkpoint( 'model/cifar10_model', 300) model = mx.mod.Module(softmax, context=dev) model.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label, inputs_need_grad=True) model.set_params(arg_params, aux_params) # define autoencoder de_out = mx.symbol.load('model/cifar10_model-symbol.json') ae_arg_arrays_load = mx.nd.load('model/cifar10_model-0300.params') ae_arg_shapes, ae_output_shapes, ae_aux_shapes = de_out.infer_shape( data=data_shape) ae_grad_arrays = [ mx.nd.zeros(shape, ctx=dev) for shape in ae_arg_shapes ] ae_arg_arrays = [ mx.nd.zeros(shape, ctx=dev) for shape in ae_arg_shapes ] ae_model = de_out.simple_bind(ctx=dev, data=(1, batch_size, 28, 28), grad_req='write') # load pre-trained weight print(len(ae_arg_arrays_load)) for i in range(1, len(ae_arg_arrays_load)): ae_arg_arrays_load[i].copyto(ae_model.arg_arrays[i]) train_iter.reset() dataBatchP = copy.deepcopy(train_iter.next()) poisoning_d[j].copyto(dataBatchP.data[0]) poisoning_l[j].copyto(dataBatchP.label[0]) data = dataBatchP.data[0] num_normal = 1000 attacked_model_lr = 0.005 generative_model_lr = 0.001 model.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', attacked_model_lr), )) # get normal data loss and accuracy loss = 0 for num in range(num_normal): dataBatch = copy.deepcopy(train_iter.next()) label = dataBatch.label[0] model.forward(dataBatch) output = model.get_outputs()[0].asnumpy() loss += CalLogLoss(output, label.asnumpy()) print 'normal data loss: %.4f' % loss val_iter.reset() metric = mx.metric.create('acc') for batch in val_iter: model.forward(batch, is_train=False) model.update_metric(metric, batch.label) print metric.get() # val_iter.reset() # val_acc = model.score(val_iter, 'acc') # print 'Val Acc: %.4f' % val_acc[0][1] #attack with initial data, get normal data loss and accuracy print 'after initial attack' model.forward(dataBatchP, is_train=True) model.backward() model.update() val_iter.reset() metric.reset() for batch in val_iter: model.forward(batch, is_train=False) model.update_metric(metric, batch.label) print metric.get() # val_iter.reset() # val_acc = model.score(val_iter, 'acc') # print 'Val Acc: %.4f' % val_acc[0][1] # re-evaluate normal data loss loss = 0 train_iter.reset() dataBatch = copy.deepcopy(train_iter.next()) for num in range(num_normal): dataBatch = copy.deepcopy(train_iter.next()) label = dataBatch.label[0] model.forward(dataBatch) loss += CalLogLoss(model.get_outputs()[0].asnumpy(), label.asnumpy()) print 'normal data loss: %.4f' % loss 
plt.figure('poisoned data') plt.subplot(1, 5, 1) plt.imshow( (dataBatchP.data[0]).asnumpy()[0].astype(np.uint8).transpose( 1, 2, 0)) # generate poisoned data ae_de_grad = mx.nd.zeros(ae_model.outputs[0].shape, ctx=dev) pre_loss = 0 for round in range(total_round): start = time.time() print 'round %d' % round # update original model train_iter.reset() dataBatch = copy.deepcopy(train_iter.next()) ae_model.arg_dict['data'][:] = dataBatchP.data[0].reshape( (1, 2352)) / 255 ae_model.forward() ae_output = ae_model.outputs[0].asnumpy() label_tmp = copy.deepcopy(dataBatchP.label) dataBatch_tmp = copy.deepcopy( mx.io.DataBatch( [mx.nd.array(ae_output.reshape(data_shape))], label_tmp)) # load pre-trained weight model.set_params(arg_params, aux_params) model.forward(dataBatch_tmp, is_train=True) model.backward() # update attacked model model.update() print 'poisoned network' val_iter.reset() metric.reset() for batch in val_iter: model.forward(batch, is_train=False) model.update_metric(metric, batch.label) print metric.get() # val_iter.reset() # val_acc = model.score(val_iter, 'acc') # print 'Val Acc: %.4f' % val_acc[0][1] if round % 2 == 0: plt.subplot(1, 5, round / 2 + 2) plt.imshow((ae_output.reshape(3, 28, 28) * 255).astype( np.uint8).transpose(1, 2, 0)) # get normal data loss loss = 0 tmpGrad = np.zeros(data.shape) dataBatch = copy.deepcopy(train_iter.next()) for num in range(num_normal): dataBatch = copy.deepcopy(train_iter.next()) label = dataBatch.label[0] model.forward(dataBatch, is_train=True) model.backward() output = model.get_outputs()[0].asnumpy() loss += CalLogLoss(output, label.asnumpy()) tmpGrad += model.get_input_grads()[0].asnumpy() # ae_de_grad[:] = -np.sign(tmpGrad.reshape(1,2352))*1 ae_de_grad[:] = -np.sign(tmpGrad.reshape( 1, 2352)) * np.sign(loss - pre_loss) ae_model.backward([ae_de_grad]) for key in ae_model.arg_dict.keys(): SGD(key, ae_model.arg_dict[key], ae_model.grad_dict[key], generative_model_lr, batch_size) end = time.time() print 'time: %.4f' % (end - start) print 'Update autocoder' print 'normal data loss: %.4f' % loss pre_loss = loss poisoing_data_list.append(ae_output) poisoing_label_list.append(dataBatchP.label[0].asnumpy()) all_data.append(poisoing_data_list) all_label.append(poisoing_label_list) return all_data, all_label
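# Illustrative driver call (the values are assumptions): craft poisoned samples
# from the first 5 training images, refining each one over 5 optimization rounds.
all_data, all_label = generate_poisoning_fun(index=5, total_round=5)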
LATENT_SIZE = 32
BATCH_SIZE = 64
PRINT_EVERY = 100
MAX_ITERATIONS = 1000000
OUT_DIR = pathlib.Path(pathlib.os.environ['LOG']) / 'debug'

dataset = mx.gluon.data.vision.MNIST(
    train=True,
    transform=lambda data, label: (np.round(data.astype(np.float32) / 255),
                                   label))
train_data = mx.gluon.data.DataLoader(dataset,
                                      batch_size=BATCH_SIZE,
                                      shuffle=True)

ctx = [mx.gpu(0)] if USE_GPU else [mx.cpu()]
with mx.Context(ctx[0]):
    variational = AmortizedGammaVariational(LATENT_SIZE, BATCH_SIZE)
    model = DeepLatentGammaModel()
    elbo = ELBO(model, variational)

    variational.hybridize()
    model.hybridize()
    elbo.hybridize()

    variational.initialize(mx.init.Xavier())
    model.initialize(mx.init.Xavier())

    params = model.collect_params()
    params.update(variational.collect_params())
    trainer = gluon.Trainer(params, 'rmsprop', {
        'learning_rate': 0.00001,
import math
import cv2
from multiprocessing import Pool
from itertools import repeat
from itertools import izip
from time import time
import threading

import mxnet as mx

from symbols import (get_PNet, get_RNet, get_ONet, get_gender_attractive_Net,
                     get_smile_Net, get_QNet, get_attractive_Net,
                     get_attractive_small_Net, get_rotation_Net,
                     get_glass_Net, get_true_Net, get_clear_Net)
from helper import (nms, adjust_input, generate_bbox, detect_first_stage,
                    detect_first_stage_warpper, init_executor)
from nms.gpu_nms import *
from config import GPU_ID

first_has_reg = True
has_reg = True
has_landmark = True
mx.Context(mx.gpu(GPU_ID))


class MyThread(threading.Thread):
    def __init__(self, arg):
        super(MyThread, self).__init__()
        self.arg = arg
        self.return_boxes = []

    def run(self):
        self.return_boxes = detect_first_stage_warpper(self.arg)


class MtcnnDetector(object):
    """
        Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Neural Networks
#!/usr/bin/python
import mxnet as mx


def ab():
    a = mx.nd.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    b = mx.nd.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    c = mx.nd.dot(a, b)
    print(c.asnumpy())


print("<===============")
print("Dot product (gpu):")
gpu_device = mx.gpu(0)  # Change this to mx.cpu() in absence of GPUs.
with mx.Context(gpu_device):
    ab()
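# A hedged variant of the snippet above: select the device at runtime so the
# same dot product also runs on machines without a GPU (assumes MXNet >= 1.2
# for mx.context.num_gpus()).
device = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()
with mx.Context(device):
    ab()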
def main(): parser = argparse.ArgumentParser( description='Script to test the trained network on a game.') parser.add_argument('-r', '--rom', required=False, type=str, default=os.path.join('roms', 'breakout.bin'), help='Path of the ROM File.') parser.add_argument('-d', '--dir-path', required=False, type=str, default='', help='Directory path of the model files.') parser.add_argument('-m', '--model_prefix', required=True, type=str, default='QNet', help='Prefix of the saved model file.') parser.add_argument('-t', '--test-steps', required=False, type=int, default=125000, help='Test steps.') parser.add_argument( '-c', '--ctx', required=False, type=str, default='gpu', help='Running Context. E.g `-c gpu` or `-c gpu1` or `-c cpu`') parser.add_argument( '-e', '--epoch-range', required=False, type=str, default='22', help='Epochs to run testing. E.g `-e 0,80`, `-e 0,80,2`') parser.add_argument('-v', '--visualization', required=False, type=int, default=0, help='Visualize the runs.') parser.add_argument('--symbol', required=False, type=str, default="nature", help='type of network, nature or nips') args, unknown = parser.parse_known_args() max_start_nullops = 30 holdout_size = 3200 replay_memory_size = 1000000 exploartion = 0.05 history_length = 4 rows = 84 cols = 84 ctx = re.findall('([a-z]+)(\d*)', args.ctx) ctx = [(device, int(num)) if len(num) > 0 else (device, 0) for device, num in ctx] q_ctx = mx.Context(*ctx[0]) minibatch_size = 32 epoch_range = [int(n) for n in args.epoch_range.split(',')] epochs = range(*epoch_range) game = AtariGame(rom_path=args.rom, history_length=history_length, resize_mode='scale', resized_rows=rows, replay_start_size=4, resized_cols=cols, max_null_op=max_start_nullops, replay_memory_size=replay_memory_size, death_end_episode=False, display_screen=args.visualization) if not args.visualization: holdout_samples = collect_holdout_samples(game, sample_num=holdout_size) action_num = len(game.action_set) data_shapes = { 'data': (minibatch_size, history_length) + (rows, cols), 'dqn_action': (minibatch_size, ), 'dqn_reward': (minibatch_size, ) } if args.symbol == "nature": dqn_sym = dqn_sym_nature(action_num) elif args.symbol == "nips": dqn_sym = dqn_sym_nips(action_num) else: raise NotImplementedError qnet = Base(data_shapes=data_shapes, sym_gen=dqn_sym, name=args.model_prefix, ctx=q_ctx) for epoch in epochs: qnet.load_params(name=args.model_prefix, dir_path=args.dir_path, epoch=epoch) if not args.visualization: avg_q_score = calculate_avg_q(holdout_samples, qnet) avg_reward = calculate_avg_reward(game, qnet, args.test_steps, exploartion) print("Epoch:%d Avg Reward: %f, Avg Q Score:%f" % (epoch, avg_reward, avg_q_score)) else: avg_reward = calculate_avg_reward(game, qnet, args.test_steps, exploartion) print("Epoch:%d Avg Reward: %f" % (epoch, avg_reward))
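# Example invocation (illustrative; the script name and paths are assumptions),
# assembled from the flags defined above -- evaluate saved QNet checkpoints for
# epochs 0, 2, ..., 78 on GPU 1 without visualization:
#   python test_dqn.py -r roms/breakout.bin -d dqn-breakout -m QNet \
#       -e 0,80,2 -c gpu1 --symbol nature -t 125000 -v 0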
    def __init__(
        self,
        sigma: Tensor,
        kernel: Kernel,
        prediction_length: Optional[int] = None,
        context_length: Optional[int] = None,
        num_samples: Optional[int] = None,
        ctx: mx.Context = mx.Context('cpu'),
        float_type: DType = np.float64,
        jitter_method: str = 'iter',
        max_iter_jitter: int = 10,
        neg_tol: float = -1e-8,
        diag_weight: float = 1e-6,
        increase_jitter: int = 10,
        sample_noise: bool = True,
        F=None,
    ) -> None:
        """
        Parameters
        ----------
        sigma
            Noise parameter of shape (batch_size, num_data_points, 1), where
            num_data_points is the number of rows in the Cholesky matrix.
        kernel
            Kernel object.
        prediction_length
            Prediction length.
        context_length
            Training length.
        num_samples
            The number of samples to be drawn.
        ctx
            Determines whether to compute on the cpu or gpu.
        float_type
            Determines whether to use single or double precision.
        jitter_method
            Use the iterative jitter method or eigenvalue decomposition,
            depending on the problem size.
        max_iter_jitter
            Maximum number of iterations for the jitter method to iteratively
            make the matrix positive definite.
        neg_tol
            Parameter in the jitter methods to eliminate matrices with
            diagonal elements smaller than this when checking if a matrix is
            positive definite.
        diag_weight
            Multiple of the mean of the diagonal entries used to initialize
            the jitter.
        increase_jitter
            Multiply the jitter by this amount at each iteration.
        sample_noise
            Boolean to determine whether to add :math:`\sigma^2I` to the
            predictive covariance matrix.
        F
            A module that can either refer to the Symbol API or the NDArray
            API in MXNet.
        """
        assert (prediction_length is None or prediction_length > 0
                ), "The value of `prediction_length` should be > 0"
        assert (context_length is None or context_length > 0
                ), "The value of `context_length` should be > 0"
        assert (num_samples is None or
                num_samples > 0), "The value of `num_samples` should be > 0"

        self.sigma = sigma
        self.kernel = kernel
        self.prediction_length = prediction_length
        self.context_length = (context_length if context_length is not None
                               else prediction_length)
        self.num_samples = num_samples
        self.F = F if F else getF(sigma)
        self.ctx = ctx
        self.float_type = float_type
        self.jitter_method = jitter_method
        self.max_iter_jitter = max_iter_jitter
        self.neg_tol = neg_tol
        self.diag_weight = diag_weight
        self.increase_jitter = increase_jitter
        self.sample_noise = sample_noise
def train(self, base_path: str, learning_rate: float = 0.1, mini_batch_size: int = 32, max_epochs: int = 100, anneal_factor: float = 0.5, patience: int = 2, save_model: bool = True, embeddings_in_memory: bool = True, train_with_dev: bool = False, context: mx.Context = None, show_test=False, cn=False) -> float: """ :param base_path: a folder to store model, log etc. :param learning_rate: :param mini_batch_size: :param max_epochs: :param anneal_factor: :param patience: :param save_model: :param embeddings_in_memory: :param train_with_dev: :return: best dev f1 """ evaluation_method = 'F1' if self.model.tag_type in ['ner', 'np', 'srl']: evaluation_method = 'span-F1' if self.model.tag_type in ['pos', 'upos']: evaluation_method = 'accuracy' print(evaluation_method) os.makedirs(base_path, exist_ok=True) loss_txt = os.path.join(base_path, "loss.txt") open(loss_txt, "w", encoding='utf-8').close() anneal_mode = 'min' if train_with_dev else 'max' train_data = self.corpus.train # if training also uses dev data, include in training set if train_with_dev: train_data.extend(self.corpus.dev) # At any point you can hit Ctrl + C to break out of training early. try: with mx.Context(context if context else mxnet_prefer_gpu()): self.model.initialize() scheduler = ReduceLROnPlateau(lr=learning_rate, verbose=True, factor=anneal_factor, patience=patience, mode=anneal_mode) optimizer = mx.optimizer.SGD(learning_rate=learning_rate, lr_scheduler=scheduler, clip_gradient=5.0) trainer = gluon.Trainer(self.model.collect_params(), optimizer=optimizer) for epoch in range(0, max_epochs): current_loss = 0 if not self.test_mode: random.shuffle(train_data) batches = [ train_data[x:x + mini_batch_size] for x in range(0, len(train_data), mini_batch_size) ] batch_no = 0 for batch in batches: batch = batch batch_no += 1 # if batch_no % 100 == 0: # print("%d of %d (%f)" % (batch_no, len(batches), float(batch_no / len(batches)))) # Step 4. Compute the loss, gradients, and update the parameters by calling optimizer.step() batch.sort(key=lambda x: len(x), reverse=True) with autograd.record(): self.model.embeddings.embed(batch) loss = self.model.neg_log_likelihood( batch, self.model.tag_type) current_loss += loss.sum().asscalar() loss.backward() # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) # optimizer.step() trainer.step(len(batch)) sys.stdout.write( "\r%.2f%%" % (batch_no / float(len(batches)) * 100)) sys.stdout.flush() if not embeddings_in_memory: self.clear_embeddings_in_batch(batch) current_loss /= len(train_data) if not train_with_dev: print('.. evaluating... dev... ') dev_score, dev_fp, dev_result = self.evaluate( self.corpus.dev, base_path, evaluation_method=evaluation_method, embeddings_in_memory=embeddings_in_memory, cn=cn) else: dev_fp = 0 dev_result = '_' # anneal against train loss if training with dev, otherwise anneal against dev score scheduler.step( current_loss) if train_with_dev else scheduler.step( dev_score) # save if model is current best and we use dev data for model selection if save_model and not train_with_dev and dev_score == scheduler.best: self.model.save(base_path) summary = '%d' % epoch + '\t({:%H:%M:%S})'.format(datetime.datetime.now()) \ + '\t%f\t%d\t%f\tDEV %d\t' % ( current_loss, scheduler.num_bad_epochs, learning_rate, dev_fp) + dev_result summary = summary.replace('\n', '') if self.corpus.test and len( self.corpus.test) and show_test: print('test... 
') test_score, test_fp, test_result = self.evaluate( self.corpus.test, base_path, evaluation_method=evaluation_method, embeddings_in_memory=embeddings_in_memory, cn=cn) summary += '\tTEST \t%d\t' % test_fp + test_result with open(loss_txt, "a") as loss_file: loss_file.write('%s\n' % summary) loss_file.close() print(summary) # if we do not use dev data for model selection, save final model if save_model and train_with_dev: self.model.save(base_path) return scheduler.best # return maximum dev f1 except KeyboardInterrupt: print('-' * 89) print('Exiting from training early') print('saving model') self.model.save(base_path + "/final-model") print('done')
def main(): parser = argparse.ArgumentParser(description='Script to test the trained network on a game.') parser.add_argument('-r', '--rom', required=False, type=str, default=os.path.join('arena', 'games', 'roms', 'breakout.bin'), help='Path of the ROM File.') parser.add_argument('-v', '--visualization', required=False, type=int, default=0, help='Visualize the runs.') parser.add_argument('--lr', required=False, type=float, default=0.01, help='Learning rate of the AdaGrad optimizer') parser.add_argument('--eps', required=False, type=float, default=0.01, help='Eps of the AdaGrad optimizer') parser.add_argument('--clip-gradient', required=False, type=float, default=None, help='Clip threshold of the AdaGrad optimizer') parser.add_argument('--double-q', required=False, type=bool, default=False, help='Use Double DQN') parser.add_argument('--wd', required=False, type=float, default=0.0, help='Weight of the L2 Regularizer') parser.add_argument('-c', '--ctx', required=False, type=str, default='gpu', help='Running Context. E.g `-c gpu` or `-c gpu1` or `-c cpu`') parser.add_argument('-d', '--dir-path', required=False, type=str, default='', help='Saving directory of model files.') args = parser.parse_args() if args.dir_path == '': rom_name = os.path.splitext(os.path.basename(args.rom))[0] args.dir_path = 'dqn-%s' % rom_name ctx = re.findall('([a-z]+)(\d*)', args.ctx) ctx = [(device, int(num)) if len(num) >0 else (device, 0) for device, num in ctx] replay_start_size = 50000 max_start_nullops = 30 replay_memory_size = 1000000 history_length = 4 rows = 84 cols = 84 q_ctx = mx.Context(*ctx[0]) game = AtariGame(rom_path=args.rom, resize_mode='scale', replay_start_size=replay_start_size, resized_rows=rows, resized_cols=cols, max_null_op=max_start_nullops, replay_memory_size=replay_memory_size, display_screen=args.visualization, history_length=history_length) ##RUN NATURE freeze_interval = 10000 epoch_num = 200 steps_per_epoch = 250000 update_interval = 4 discount = 0.99 eps_start = 1.0 eps_min = 0.1 eps_decay = (1.0 - 0.1) / 1000000 eps_curr = eps_start freeze_interval /= update_interval minibatch_size = 32 action_num = len(game.action_set) data_shapes = {'data': (minibatch_size, history_length) + (rows, cols), 'dqn_action': (minibatch_size,), 'dqn_reward': (minibatch_size,)} optimizer_params = {'name': 'adagrad', 'learning_rate': args.lr, 'eps': args.eps, 'clip_gradient': args.clip_gradient, 'rescale_grad': 1.0, 'wd': args.wd} dqn_output_op = DQNOutputNpyOp() dqn_sym = dqn_sym_nature(action_num, dqn_output_op) qnet = Critic(data_shapes=data_shapes, sym=dqn_sym, optimizer_params=optimizer_params, name='QNet', initializer=DQNInitializer(factor_type="in"), ctx=q_ctx) target_qnet = qnet.copy(name="TargetQNet", ctx=q_ctx) qnet.print_stat() target_qnet.print_stat() # Begin Playing Game training_steps = 0 total_steps = 0 for epoch in xrange(epoch_num): # Run Epoch steps_left = steps_per_epoch episode = 0 epoch_reward = 0 start = time.time() game.start() while steps_left > 0: # Running New Episode episode += 1 episode_loss = 0.0 episode_q_value = 0.0 episode_update_step = 0 episode_action_step = 0 time_episode_start = time.time() game.begin_episode(steps_left) while not game.episode_terminate: # 1. 
We need to choose a new action based on the current game status if game.state_enabled and game.replay_memory.sample_enabled: do_exploration = (npy_rng.rand() < eps_curr) eps_curr = max(eps_curr - eps_decay, eps_min) if do_exploration: action = npy_rng.randint(action_num) else: # TODO Here we can in fact play multiple gaming instances simultaneously and make actions for each # We can simply stack the current_state() of gaming instances and give prediction for all of them # We need to wait after calling calc_score(.), which makes the program slow # TODO Profiling the speed of this part! current_state = game.current_state() state = nd.array(current_state.reshape((1,) + current_state.shape), ctx=q_ctx) / float(255.0) qval_npy = qnet.calc_score(batch_size=1, data=state)[0].asnumpy() action = numpy.argmax(qval_npy) episode_q_value += qval_npy[0, action] episode_action_step += 1 else: action = npy_rng.randint(action_num) # 2. Play the game for a single mega-step (Inside the game, the action may be repeated for several times) game.play(action) total_steps += 1 # 3. Update our Q network if we can start sampling from the replay memory # Also, we update every `update_interval` if total_steps % update_interval == 0 and game.replay_memory.sample_enabled: # 3.1 Draw sample from the replay_memory training_steps += 1 episode_update_step += 1 states, actions, rewards, next_states, terminate_flags \ = game.replay_memory.sample(batch_size=minibatch_size) states = nd.array(states, ctx=q_ctx) / float(255.0) next_states = nd.array(next_states, ctx=q_ctx) / float(255.0) actions = nd.array(actions, ctx=q_ctx) rewards = nd.array(rewards, ctx=q_ctx) terminate_flags = nd.array(terminate_flags, ctx=q_ctx) # 3.2 Use the target network to compute the scores and # get the corresponding target rewards if not args.double_q: target_qval = target_qnet.calc_score(batch_size=minibatch_size, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(target_qval))\ * (1.0 - terminate_flags) * discount else: target_qval = target_qnet.calc_score(batch_size=minibatch_size, data=next_states)[0] qval = qnet.calc_score(batch_size=minibatch_size, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(qval))\ * (1.0 - terminate_flags) * discount outputs = qnet.fit_target(batch_size=minibatch_size, data=states, dqn_action=actions, dqn_reward=target_rewards) # 3.3 Calculate Loss diff = nd.abs(nd.choose_element_0index(outputs[0], actions) - target_rewards) quadratic_part = nd.clip(diff, -1, 1) loss = (0.5 * nd.sum(nd.square(quadratic_part)) + nd.sum(diff - quadratic_part)).asscalar() episode_loss += loss # 3.3 Update the target network every freeze_interval # (We can do annealing instead of hard copy) if training_steps % freeze_interval == 0: qnet.copy_params_to(target_qnet) steps_left -= game.episode_step time_episode_end = time.time() # Update the statistics epoch_reward += game.episode_reward info_str = "Epoch:%d, Episode:%d, Steps Left:%d/%d, Reward:%f, fps:%f, Exploration:%f" \ % (epoch, episode, steps_left, steps_per_epoch, game.episode_reward, game.episode_step / (time_episode_end - time_episode_start), eps_curr) if episode_update_step > 0: info_str += ", Avg Loss:%f/%d" % (episode_loss / episode_update_step, episode_update_step) if episode_action_step > 0: info_str += ", Avg Q Value:%f/%d" % (episode_q_value / episode_action_step, episode_action_step) logging.info(info_str) end = time.time() fps = steps_per_epoch / (end - start) 
        qnet.save_params(dir_path=args.dir_path, epoch=epoch)
        logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d" %
                     (epoch, fps, epoch_reward / float(episode), episode))
from gluonts.model.gp_forecaster.gaussian_process import GaussianProcess
from gluonts.mx.context import check_gpu_support
from gluonts.mx.kernels import RBFKernel
from gluonts.mx.linalg_util import jitter_cholesky, jitter_cholesky_eig


# This test verifies that both the eigenvalue decomposition and the iterative
# jitter method make a non-positive-definite matrix positive definite so that
# the Cholesky factor can be computed.
# Both gpu and cpu as well as single and double precision are tested.
@pytest.mark.skipif(
    sys.platform == "linux",
    reason="skipping since potrf crashes on mxnet 1.6.0 on linux when matrix is not spd",
)
@pytest.mark.parametrize("ctx", [mx.Context("gpu"), mx.Context("cpu")])
@pytest.mark.parametrize("jitter_method", ["iter", "eig"])
@pytest.mark.parametrize("float_type", [np.float32, np.float64])
def test_jitter_unit(jitter_method, float_type, ctx) -> None:
    # TODO: Enable GPU tests on Jenkins
    if ctx == mx.Context("gpu") and not check_gpu_support():
        return
    matrix = nd.array([[[1, 2], [3, 4]], [[10, 100], [-21.5, 41]]],
                      ctx=ctx,
                      dtype=float_type)
    F = mx.nd
    num_data_points = matrix.shape[1]
    if jitter_method == "eig":
        L = jitter_cholesky_eig(F, matrix, num_data_points, ctx, float_type)
    elif jitter_method == "iter":
        L = jitter_cholesky(F, matrix, num_data_points, ctx, float_type)
def jitter_cholesky(
    F,
    matrix: Tensor,
    num_data_points: Optional[int] = None,
    ctx: mx.Context = mx.Context("cpu"),
    float_type: DType = np.float64,
    max_iter_jitter: int = 10,
    neg_tol: float = -1e-8,
    diag_weight: float = 1e-6,
    increase_jitter: int = 10,
) -> Optional[Tensor]:
    """
    This function applies the jitter method. It iteratively tries to compute
    the Cholesky decomposition and adds a positive tolerance to the diagonal
    that increases at each iteration until the matrix is positive definite or
    the maximum number of iterations has been reached.

    Parameters
    ----------
    matrix
        Kernel matrix of shape (batch_size, num_data_points, num_data_points).
    num_data_points
        Number of rows in the kernel_matrix.
    ctx
        Determines whether to compute on the cpu or gpu.
    float_type
        Determines whether to use single or double precision.
    max_iter_jitter
        Maximum number of iterations for the jitter method to iteratively make
        the matrix positive definite.
    neg_tol
        Parameter in the jitter methods to eliminate matrices with diagonal
        elements smaller than this when checking if a matrix is positive
        definite.
    diag_weight
        Multiple of the mean of the diagonal entries used to initialize the
        jitter.
    increase_jitter
        Multiply the jitter by this amount at each iteration.

    Returns
    -------
    Optional[Tensor]
        The method either fails to make the matrix positive definite within
        the maximum number of iterations and outputs an error or succeeds and
        returns the lower triangular Cholesky factor `L` of shape
        (batch_size, num_data_points, num_data_points)
    """
    num_iter = 0
    diag = batch_diagonal(
        F, matrix, num_data_points, ctx, float_type
    )  # shape (batch_size, num_data_points, 1)
    diag_mean = diag.mean(axis=1).expand_dims(
        axis=2
    )  # shape (batch_size, 1, 1)
    jitter = F.zeros_like(diag)  # shape (batch_size, num_data_points, 1)
    # Ensure that diagonal entries are numerically non-negative, as defined by neg_tol
    # TODO: Add support for symbolic case: Cannot use < operator with symbolic variables
    if F.sum(diag <= neg_tol) > 0:
        raise mx.base.MXNetError(
            " Matrix is not positive definite: negative diagonal elements"
        )
    while num_iter <= max_iter_jitter:
        try:
            L = F.linalg.potrf(
                F.broadcast_add(
                    matrix,
                    F.broadcast_mul(
                        F.eye(num_data_points, ctx=ctx, dtype=float_type),
                        jitter,
                    ),
                )
            )
            # gpu will not throw error but will store nans. If nan, L.sum() = nan
            # so the error tolerance can be large.
            # TODO: Add support for symbolic case: Cannot use <= operator with symbolic variables
            assert F.max(F.abs(L.nansum() - L.sum()) <= 1e-1)
            return L
        except:
            if num_iter == 0:
                # Initialize the jitter: constant jitter per each batch
                jitter = (
                    F.broadcast_mul(diag_mean, F.ones_like(jitter))
                    * diag_weight
                )
            else:
                jitter = jitter * increase_jitter
        finally:
            num_iter += 1
    raise mx.base.MXNetError(
        f" Matrix is not positive definite after the maximum number of iterations = {max_iter_jitter} "
        f"with a maximum jitter = {F.max(jitter)}"
    )
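# Minimal usage sketch (added for illustration), mirroring the unit test shown
# earlier: jitter a non-positive-definite 2x2 matrix on the CPU and recover its
# lower-triangular Cholesky factor. See the skipif note above about potrf on
# mxnet 1.6.0 on linux.
import numpy as np
import mxnet as mx
from mxnet import nd

matrix = nd.array([[[1.0, 2.0], [3.0, 4.0]]], dtype=np.float64)
L = jitter_cholesky(mx.nd, matrix, num_data_points=2,
                    ctx=mx.Context('cpu'), float_type=np.float64)
print(L.asnumpy())  # lower-triangular factor of the jittered matrix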
model = zoo.load_pretrained_resnext_to_unext101_64_4d(ctx=_ctx, migrate_input_norm=False, fine_tune=False) # model=zoo.resume_training_unext101_64_4d(freeze_input_norm = True, # fine_tune = True, # ctx=_ctx, # symb = "unext_resize_ver03_101_64_4_px_global_weight_highest_inv_weight_enp-symbol.json", # parame = "unext_resize_ver03_101_64_4_px_global_weight_highest_inv_weight_enp-0000.params") # model=zoo.resume_training_unext101_64_4d_beyond_word(freeze_input_norm = True, # fine_tune = True, # ctx=_ctx, # symb = "unext101_64_4d_deconv_enp_72000-symbol.json", # parame = "unext101_64_4d_deconv_enp_72000-0000.params") with mx.Context(_ctx): model.hybridize() # model.collect_params().initialize() # sx = mx.sym.var('data') # sym = model(sx) # graph = mx.viz.plot_network(sym) # graph.format = 'tif' # graph.render('model') with warnings.catch_warnings(): warnings.simplefilter("ignore") model.collect_params().initialize() num_epochs = 80 num_steps = len(my_train) test_num_steps = len(my_test) # print(num_steps)
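# The setup above (num_epochs, num_steps, a hybridized model on _ctx) typically precedes
# a standard Gluon training loop. A hedged sketch under the assumption that my_train
# yields (data, label) batches and that a per-pixel softmax cross-entropy loss is used;
# the optimizer and learning rate are placeholders, not the original code:
from mxnet import autograd, gluon

trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': 1e-4})
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(axis=1)  # class axis of (N, C, H, W) outputs

for epoch in range(num_epochs):
    cumulative_loss = 0.0
    for data, label in my_train:
        data = data.as_in_context(_ctx)
        label = label.as_in_context(_ctx)
        with autograd.record():
            output = model(data)
            loss = loss_fn(output, label)
        loss.backward()
        trainer.step(data.shape[0])
        cumulative_loss += loss.mean().asscalar()
    print('epoch %d, mean loss %.4f' % (epoch, cumulative_loss / num_steps))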
def train(self, base_path: str, sequence_length: int, learning_rate: float = 20, mini_batch_size: int = 100, anneal_factor: float = 0.25, patience: int = 10, clip=0.25, max_epochs: int = 10000): number_of_splits = len(self.corpus.train_files) val_data = self._batchify(self.corpus.valid, mini_batch_size) os.makedirs(base_path, exist_ok=True) loss_txt = os.path.join(base_path, 'loss.txt') savefile = os.path.join(base_path, 'best-lm.pt') try: with mx.Context(mxnet_prefer_gpu()): self.model.initialize() best_val_loss = 100000000 scheduler = ReduceLROnPlateau(lr=learning_rate, verbose=True, factor=anneal_factor, patience=patience) optimizer = mx.optimizer.SGD(learning_rate=learning_rate, lr_scheduler=scheduler) trainer = gluon.Trainer(self.model.collect_params(), optimizer=optimizer) for epoch in range(1, max_epochs + 1): print('Split %d' % epoch + '\t - ({:%H:%M:%S})'.format(datetime.datetime.now())) # for group in optimizer.param_groups: # learning_rate = group['lr'] train_slice = self.corpus.get_next_train_slice() train_data = self._batchify(train_slice, mini_batch_size) print('\t({:%H:%M:%S})'.format(datetime.datetime.now())) # go into train mode # self.model.train() # reset variables epoch_start_time = time.time() total_loss = 0 start_time = time.time() hidden = self.model.init_hidden(mini_batch_size) cell = hidden.copy() # not really sure what this does ntokens = len(self.corpus.dictionary) # do batches for batch, i in enumerate( range(0, len(train_data) - 1, sequence_length)): data, targets = self._get_batch( train_data, i, sequence_length) # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. hidden = self._repackage_hidden(hidden) cell = self._repackage_hidden(cell) # self.model.zero_grad() # optimizer.zero_grad() # do the forward pass in the model with autograd.record(): output, rnn_output, hidden, cell = self.model.forward( data, hidden, cell) # try to predict the targets loss = self.loss_function( output.reshape(-1, ntokens), targets).mean() loss.backward() # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. # torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip) trainer.step(mini_batch_size) total_loss += loss.asscalar() if batch % self.log_interval == 0 and batch > 0: cur_loss = total_loss.item() / self.log_interval elapsed = time.time() - start_time print( '| split {:3d} /{:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | ' 'loss {:5.2f} | ppl {:8.2f}'.format( epoch, number_of_splits, batch, len(train_data) // sequence_length, elapsed * 1000 / self.log_interval, cur_loss, self._safe_exp(cur_loss))) total_loss = 0 start_time = time.time() print('epoch {} done! \t({:%H:%M:%S})'.format( epoch, datetime.datetime.now())) scheduler.step(cur_loss) ############################################################################### # TEST ############################################################################### # skip evaluation # val_loss = self.evaluate(val_data, mini_batch_size, sequence_length) # scheduler.step(val_loss) # # # Save the model if the validation loss is the best we've seen so far. 
# if val_loss < best_val_loss: # self.model.save(savefile) # best_val_loss = val_loss # print('best loss so far {:5.2f}'.format(best_val_loss)) val_loss = cur_loss if (self.corpus.current_train_file_index + 1) % 100 == 0 or self.corpus.is_last_slice: self.model.save(savefile) ############################################################################### # print info ############################################################################### print('-' * 89) local_split_number = epoch % number_of_splits if local_split_number == 0: local_split_number = number_of_splits summary = '| end of split {:3d} /{:3d} | epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' \ 'valid ppl {:8.2f} | learning rate {:3.2f}'.format(local_split_number, number_of_splits, epoch, (time.time() - epoch_start_time), val_loss, self._safe_exp(val_loss), learning_rate) with open(loss_txt, "a") as myfile: myfile.write('%s\n' % summary) print(summary) print('-' * 89) except KeyboardInterrupt: print('-' * 89) print('Exiting from training early')
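# The language-model trainer above detaches the hidden and cell states at each batch via
# _repackage_hidden, which is not shown in this excerpt. A minimal sketch of what such a
# helper usually does in Gluon (an assumption, not the original implementation): cutting
# the autograd graph so truncated backpropagation-through-time stops at the batch boundary.
def _repackage_hidden(h):
    """Detach hidden states from the graph they were produced in."""
    if isinstance(h, (list, tuple)):
        return [_repackage_hidden(v) for v in h]
    return h.detach()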
def forward(self, x): return self.relu(self.bn(self.conv(x))) class SwapAxes(nn.Block): def __init__(self, dim1, dim2): super(SwapAxes, self).__init__() self.dim1 = dim1 self.dim2 = dim2 def forward(self, x): return nd.swapaxes(x, self.dim1, self.dim2) with mx.Context(mx.cpu(0)): model = nn.Sequential() model.add( SwapAxes(1, 2), CBR(40, 1), CBR(40), CBR(40), nn.MaxPool1D(2), CBR(80, 1), CBR(80), CBR(80), nn.MaxPool1D(2), CBR(160, 1), CBR(160), CBR(160), CBR(160),
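# The Sequential model above stacks CBR blocks, but CBR's __init__ is not part of this
# excerpt (only a forward of the form relu(bn(conv(x))) is shown). A hedged sketch of a
# Conv1D-BatchNorm-ReLU block consistent with the calls CBR(40, 1) and CBR(40); the
# default kernel size of 3 and the "same" padding are assumptions:
class CBR(nn.Block):
    def __init__(self, channels, kernel_size=3):
        super(CBR, self).__init__()
        self.conv = nn.Conv1D(channels, kernel_size, padding=kernel_size // 2)
        self.bn = nn.BatchNorm()
        self.relu = nn.Activation('relu')

    def forward(self, x):
        return self.relu(self.bn(self.conv(x)))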
def test_amp_conversion(): def check_amp_convert_symbol(): x = mx.sym.var("x") y = mx.sym.var("y") z = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True) siny = mx.sym.sin(y) res = z + siny # Compare symbols with similar computation graphs created using convert_symbol and manually. res_converted = amp.convert_symbol(res, target_dtype="float16", target_dtype_ops=["FullyConnected"], fp32_ops=["sin"]) x_fp16 = mx.sym.amp_cast(x, dtype="float16") y_fp16 = mx.sym.amp_cast(y, dtype="float16") amp_casted_siny = mx.sym.sin(mx.sym.amp_cast(y, dtype="float32")) z = mx.sym.FullyConnected(x_fp16, y_fp16, num_hidden=10, no_bias=True) outs = mx.sym.amp_multicast(z, amp_casted_siny, num_outputs=2) res_expected = outs[0] + outs[1] assert same_symbol_structure(res_converted, res_expected), \ "convert_symbol generating wrong computation graph" # convert_symbol called with incorrect inputs assert_raises(AssertionError, amp.convert_symbol, res, target_dtype="float16", target_dtype_ops=["FullyConnected"], fp32_ops=["elemwise_add"]) assert_raises(AssertionError, amp.convert_symbol, res, target_dtype="float16", target_dtype_ops=["FullyConnected"], fp32_ops=["Activation"], conditional_fp32_ops=[('Activation', 'act_type', ['selu'])]) assert_raises(AssertionError, amp.convert_symbol, res, target_dtype="float16", target_dtype_ops=["Activation"], fp32_ops=["Activation"], conditional_fp32_ops=[('Activation', 'act_type', ['selu'])]) assert_raises(AssertionError, amp.convert_symbol, res, target_dtype="float16", target_dtype_ops=["FullyConnected"], fp32_ops=["FullyConnected"]) # Test for op in conditional ops with condition not satisfied x = mx.sym.var("x") y = mx.sym.var("y") fc_cond = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True) res_converted = amp.convert_symbol(fc_cond, target_dtype="float16", target_dtype_ops=[], fp32_ops=["sin"], conditional_fp32_ops=[("FullyConnected", "no_bias", ["False"])]) res_expected = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True) assert same_symbol_structure(res_converted, res_expected), \ "convert_symbol generating wrong computation graph when conditional ops is used" # Test for op in conditional ops with condition satisfied res_converted = amp.convert_symbol(fc_cond, target_dtype="float16", target_dtype_ops=[], fp32_ops=["sin"], conditional_fp32_ops=[("FullyConnected", "no_bias", ["True"])]) x_fp32 = mx.sym.amp_cast(x, dtype="float32") y_fp32 = mx.sym.amp_cast(y, dtype="float32") res_expected = mx.sym.FullyConnected(x_fp32, y_fp32, num_hidden=10, no_bias=True) assert same_symbol_structure(res_converted, res_expected), \ "convert_symbol generating wrong computation graph when conditional ops used with satisfying condition" # Test with a real world model, default inputs for convert_symbol dir_path = os.path.dirname(os.path.realpath(__file__)) model_path = os.path.join(dir_path, 'model') if not os.path.isdir(model_path): os.mkdir(model_path) prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path) sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) inputs = {} inputs['data'] = mx.nd.ones((1, 3, 224, 224)) inputs.update(arg_params) converted_sym = amp.convert_symbol(sym) exe = converted_sym.simple_bind(mx.gpu(0), data=(1, 3, 224, 224), grad_req='null') exe.forward(is_train=False, **inputs) exe.outputs[0].asnumpy() inputs2 = {} inputs2['data'] = mx.nd.ones((1, 3, 224, 224)) inputs2['fc1_weight'] = inputs['fc1_weight'].astype(np.float16) inputs2['fc1_bias'] = inputs['fc1_bias'].astype(np.float16) # Test with a real world model, 
tweak inputs for convert_symbol converted_sym = amp.convert_symbol(sym, target_dtype="float16", target_dtype_ops=["Convolution"], data_names=["data"], cast_optional_params=True) converted_sym2 = amp.convert_symbol(sym, target_dtype="float16", target_dtype_ops=["Convolution"], data_names=["data"], cast_optional_params=False) exe = converted_sym.simple_bind(mx.gpu(0), data=(1, 3, 224, 224), grad_req='null') exe2 = converted_sym2.simple_bind(mx.gpu(), data=(1, 3, 224, 224), grad_req='null') converted_args = converted_sym.list_arguments() converted_auxs = converted_sym.list_auxiliary_states() for i, key in enumerate(exe.arg_arrays): if converted_args[i] in arg_params: arg_params[converted_args[i]] = arg_params[converted_args[i]].astype(exe.arg_arrays[i].dtype) for i, key in enumerate(exe.aux_arrays): if converted_auxs[i] in aux_params: aux_params[converted_auxs[i]] = aux_params[converted_auxs[i]].astype(exe.aux_arrays[i].dtype) inputs2.update(arg_params) exe.forward(is_train=False, **inputs2) exe.outputs[0].wait_to_read() inputs['fc1_weight'] = inputs['fc1_weight'].astype(np.float16) inputs['fc1_bias'] = inputs['fc1_bias'].astype(np.float16) exe2.forward(is_train=False, **inputs) exe2.outputs[0].wait_to_read() def check_amp_convert_model(): # Test with real world model, default inputs for convert_model dir_path = os.path.dirname(os.path.realpath(__file__)) model_path = os.path.join(dir_path, 'model') if not os.path.isdir(model_path): os.mkdir(model_path) prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path) sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) # Test with real world model, tweak inputs for convert_model result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, arg_params, aux_params, target_dtype="float16", target_dtype_ops=["Convolution"]) mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.gpu()) mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]]) mod.set_params(result_arg_params, result_aux_params) mod.forward(mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))], label=[mx.nd.ones((1,))])) mod.get_outputs()[0].asnumpy() assert mod._arg_params["stage2_unit1_conv2_weight"].dtype == np.float32 # Call convert_model with cast_optional_params set to True result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, arg_params, aux_params, target_dtype="float16", target_dtype_ops=["Convolution"], cast_optional_params=True) mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.gpu()) mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]]) mod.set_params(result_arg_params, result_aux_params) mod.forward(mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))], label=[mx.nd.ones((1,))])) mod.get_outputs()[0].asnumpy() assert mod._arg_params["stage2_unit1_conv2_weight"].dtype == np.float16 def check_amp_convert_hybrid_block(): # Test conversion for hybrid block on CPU model_cpu = get_model("resnet50_v1") model_cpu.collect_params().initialize(ctx=mx.cpu()) model_cpu.hybridize() model_cpu(mx.nd.random.uniform(0, 1, shape=(1, 3, 224, 224), ctx=mx.cpu())) converted_model_cpu = amp.convert_hybrid_block(model_cpu) # Test with real world model, default inputs for convert_hybrid_block model = get_model("resnet50_v1") model.collect_params().initialize(ctx=mx.gpu()) model.hybridize() model(mx.nd.zeros((1, 3, 224, 224))) converted_model = amp.convert_hybrid_block(model) result = 
converted_model.forward(mx.nd.zeros((1, 3, 224, 224), dtype=np.float32)) result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224), dtype=np.float32)) # Test with real world model, tweak inputs for convert_hybrid_block converted_model = amp.convert_hybrid_block(model, target_dtype="float16", target_dtype_ops=["Convolution"]) result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224), dtype=np.float32)) result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224), dtype=np.float32)) # Check symbolic block dir_path = os.path.dirname(os.path.realpath(__file__)) model_path = os.path.join(dir_path, 'model') if not os.path.isdir(model_path): os.mkdir(model_path) prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path) net = SymbolBlock.imports(os.path.join(model_path, "imagenet1k-resnet-18-symbol.json"), input_names=["data", "softmax_label"], param_file=os.path.join(model_path, "imagenet1k-resnet-18-0000.params")) net.collect_params().reset_ctx(ctx=mx.gpu()) net.hybridize() net(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1,))) converted_model = amp.convert_hybrid_block(net) result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1,))) result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1,))) # Check symbolic block, tweaked inputs converted_model = amp.convert_hybrid_block(net, target_dtype="float16", target_dtype_ops=["Convolution"]) result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1, ))) result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1, ))) params = converted_model.collect_params() assert params["stage2_unit1_conv2_weight"].dtype == np.float32 # Pass cast_optional_params as True to convert_hybrid_block converted_model = amp.convert_hybrid_block(net, target_dtype="float16", target_dtype_ops=["Convolution"], cast_optional_params=True) params = converted_model.collect_params() assert params["stage2_unit1_conv2_weight"].dtype == np.float16 with mx.Context(mx.gpu(0)): check_amp_convert_symbol() check_amp_convert_model() check_amp_convert_hybrid_block()
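# The tests above exercise amp.convert_symbol / convert_model / convert_hybrid_block.
# A hedged user-facing sketch of the same conversion path for float16 inference
# (model choice, input shape and GPU availability are illustrative assumptions):
import mxnet as mx
from mxnet.contrib import amp
from mxnet.gluon.model_zoo import vision

ctx = mx.gpu(0)
net = vision.get_model("resnet50_v1", pretrained=True, ctx=ctx)
net.hybridize()
net(mx.nd.zeros((1, 3, 224, 224), ctx=ctx))  # run once to build the cached graph
net_fp16 = amp.convert_hybrid_block(net, target_dtype="float16")
out = net_fp16(mx.nd.zeros((1, 3, 224, 224), ctx=ctx))
print(out.dtype)  # outputs are cast back for downstream float32 consumers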
def train(self, train_file, dev_file, test_file, save_dir, pretrained_embeddings=None, min_occur_count=2, lstm_layers=3, word_dims=100, tag_dims=100, dropout_emb=0.33, lstm_hiddens=400, dropout_lstm_input=0.33, dropout_lstm_hidden=0.33, mlp_arc_size=500, mlp_rel_size=100, dropout_mlp=0.33, learning_rate=2e-3, decay=.75, decay_steps=5000, beta_1=.9, beta_2=.9, epsilon=1e-12, num_buckets_train=40, num_buckets_valid=10, num_buckets_test=10, train_iters=50000, train_batch_size=5000, test_batch_size=5000, validate_every=100, save_after=5000, debug=False): """Train a deep biaffine dependency parser. Parameters ---------- train_file : str path to training set dev_file : str path to dev set test_file : str path to test set save_dir : str a directory for saving model and related meta-data pretrained_embeddings : tuple (embedding_name, source), used for gluonnlp.embedding.create(embedding_name, source) min_occur_count : int threshold of rare words, which will be replaced with UNKs, lstm_layers : int layers of lstm word_dims : int dimension of word embedding tag_dims : int dimension of tag embedding dropout_emb : float word dropout lstm_hiddens : int size of lstm hidden states dropout_lstm_input : int dropout on x in variational RNN dropout_lstm_hidden : int dropout on h in variational RNN mlp_arc_size : int output size of MLP for arc feature extraction mlp_rel_size : int output size of MLP for rel feature extraction dropout_mlp : float dropout on the output of LSTM learning_rate : float learning rate decay : float see ExponentialScheduler decay_steps : int see ExponentialScheduler beta_1 : float see ExponentialScheduler beta_2 : float see ExponentialScheduler epsilon : float see ExponentialScheduler num_buckets_train : int number of buckets for training data set num_buckets_valid : int number of buckets for dev data set num_buckets_test : int number of buckets for testing data set train_iters : int training iterations train_batch_size : int training batch size test_batch_size : int test batch size validate_every : int validate on dev set every such number of batches save_after : int skip saving model in early epochs debug : bool debug mode Returns ------- DepParser parser itself """ logger = init_logger(save_dir) config = _Config(train_file, dev_file, test_file, save_dir, pretrained_embeddings, min_occur_count, lstm_layers, word_dims, tag_dims, dropout_emb, lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden, mlp_arc_size, mlp_rel_size, dropout_mlp, learning_rate, decay, decay_steps, beta_1, beta_2, epsilon, num_buckets_train, num_buckets_valid, num_buckets_test, train_iters, train_batch_size, debug) config.save() self._vocab = vocab = ParserVocabulary(train_file, pretrained_embeddings, min_occur_count) vocab.save(config.save_vocab_path) vocab.log_info(logger) with mx.Context(mxnet_prefer_gpu()): self._parser = parser = BiaffineParser(vocab, word_dims, tag_dims, dropout_emb, lstm_layers, lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden, mlp_arc_size, mlp_rel_size, dropout_mlp, debug) parser.initialize() scheduler = ExponentialScheduler(learning_rate, decay, decay_steps) optimizer = mx.optimizer.Adam(learning_rate, beta_1, beta_2, epsilon, lr_scheduler=scheduler) trainer = gluon.Trainer(parser.collect_params(), optimizer=optimizer) data_loader = DataLoader(train_file, num_buckets_train, vocab) global_step = 0 best_UAS = 0. 
batch_id = 0 epoch = 1 total_epoch = math.ceil(train_iters / validate_every) logger.info('Epoch %d out of %d', epoch, total_epoch) bar = Progbar(target=min(validate_every, data_loader.samples)) while global_step < train_iters: for words, tags, arcs, rels in data_loader.get_batches(batch_size=train_batch_size, shuffle=True): with autograd.record(): arc_accuracy, _, _, loss = parser.forward(words, tags, arcs, rels) loss_value = loss.asscalar() loss.backward() trainer.step(train_batch_size) batch_id += 1 try: bar.update(batch_id, exact=[('UAS', arc_accuracy, 2), ('loss', loss_value)]) except OverflowError: pass # sometimes loss can be 0 or infinity, crashes the bar global_step += 1 if global_step % validate_every == 0: bar = Progbar(target=min(validate_every, train_iters - global_step)) batch_id = 0 UAS, LAS, speed = evaluate_official_script(parser, vocab, num_buckets_valid, test_batch_size, dev_file, os.path.join(save_dir, 'valid_tmp')) logger.info('Dev: UAS %.2f%% LAS %.2f%% %d sents/s', UAS, LAS, speed) epoch += 1 if global_step < train_iters: logger.info('Epoch %d out of %d', epoch, total_epoch) if global_step > save_after and UAS > best_UAS: logger.info('- new best score!') best_UAS = UAS parser.save(config.save_model_path) # When validate_every is too big if not os.path.isfile(config.save_model_path) or best_UAS != UAS: parser.save(config.save_model_path) return self
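# A hedged usage sketch of the train() method documented above. The file paths and the
# embedding source are placeholders; the class name DepParser follows the docstring's
# Returns section and is otherwise an assumption:
parser = DepParser()
parser.train(train_file='data/ptb/train.conllx',
             dev_file='data/ptb/dev.conllx',
             test_file='data/ptb/test.conllx',
             save_dir='model/ptb-biaffine',
             pretrained_embeddings=('glove', 'glove.6B.100d'),
             train_iters=50000,
             validate_every=100)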
def test_amp_conversion(): def check_amp_convert_symbol(): x = mx.sym.var("x") y = mx.sym.var("y") z = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True) siny = mx.sym.sin(y) res = z + siny # Compare symbols with similar computation graphs created using convert_symbol and manually. res_converted = amp.convert_symbol(res, target_dtype="float16", target_dtype_ops=["FullyConnected"], fp32_ops=["sin"]) x_fp16 = mx.sym.amp_cast(x, dtype="float16") y_fp16 = mx.sym.amp_cast(y, dtype="float16") siny = mx.sym.sin(y) z = mx.sym.FullyConnected(x_fp16, y_fp16, num_hidden=10, no_bias=True) amp_casted_z = mx.sym.amp_cast(z, dtype="float32") res_expected = amp_casted_z + siny assert same_symbol_structure(res_converted, res_expected), \ "convert_symbol generating wrong computation graph" # convert_symbol called with incorrect inputs assert_raises(AssertionError, amp.convert_symbol, res, target_dtype="float16", target_dtype_ops=["FullyConnected"], fp32_ops=["elemwise_add"]) assert_raises(AssertionError, amp.convert_symbol, res, target_dtype="float16", target_dtype_ops=["FullyConnected"], fp32_ops=["Activation"], conditional_fp32_ops=[('Activation', 'act_type', ['selu'])]) assert_raises(AssertionError, amp.convert_symbol, res, target_dtype="float16", target_dtype_ops=["Activation"], fp32_ops=["Activation"], conditional_fp32_ops=[('Activation', 'act_type', ['selu'])]) assert_raises(AssertionError, amp.convert_symbol, res, target_dtype="float16", target_dtype_ops=["FullyConnected"], fp32_ops=["FullyConnected"]) # Test for op in conditional ops with condition not satisfied x = mx.sym.var("x") y = mx.sym.var("y") fc_cond = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True) res_converted = amp.convert_symbol(fc_cond, target_dtype="float16", target_dtype_ops=[], fp32_ops=["sin"], conditional_fp32_ops=[ ("FullyConnected", "no_bias", ["False"]) ]) res_expected = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True) assert same_symbol_structure(res_converted, res_expected), \ "convert_symbol generating wrong computation graph when conditional ops is used" # Test for op in conditional ops with condition satisfied res_converted = amp.convert_symbol(fc_cond, target_dtype="float16", target_dtype_ops=[], fp32_ops=["sin"], conditional_fp32_ops=[ ("FullyConnected", "no_bias", ["True"]) ]) x_fp32 = mx.sym.amp_cast(x, dtype="float32") y_fp32 = mx.sym.amp_cast(y, dtype="float32") res_expected = mx.sym.FullyConnected(x_fp32, y_fp32, num_hidden=10, no_bias=True) assert same_symbol_structure(res_converted, res_expected), \ "convert_symbol generating wrong computation graph when conditional ops used with satisfying condition" # Test with a real world model, default inputs for convert_symbol dir_path = os.path.dirname(os.path.realpath(__file__)) model_path = os.path.join(dir_path, 'model') if not os.path.isdir(model_path): os.mkdir(model_path) prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path) sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) inputs = {} inputs['data'] = mx.nd.ones((1, 3, 224, 224)) inputs.update(arg_params) converted_sym = amp.convert_symbol(sym) exe = converted_sym.simple_bind(mx.gpu(0), data=(1, 3, 224, 224), grad_req='null') exe.forward(is_train=False, **inputs) exe.outputs[0].asnumpy() inputs2 = {} inputs2['data'] = mx.nd.ones((1, 3, 224, 224)) inputs2['fc1_weight'] = inputs['fc1_weight'].astype(np.float16) inputs2['fc1_bias'] = inputs['fc1_bias'].astype(np.float16) # Test with a real world model, tweak inputs for convert_symbol converted_sym = 
amp.convert_symbol(sym, target_dtype="float16", target_dtype_ops=["Convolution"], data_names=["data"], cast_optional_params=True) converted_sym2 = amp.convert_symbol(sym, target_dtype="float16", target_dtype_ops=["Convolution"], data_names=["data"], cast_optional_params=False) exe = converted_sym.simple_bind(mx.gpu(0), data=(1, 3, 224, 224), grad_req='null') exe2 = converted_sym2.simple_bind(mx.gpu(), data=(1, 3, 224, 224), grad_req='null') converted_args = converted_sym.list_arguments() converted_auxs = converted_sym.list_auxiliary_states() for i, key in enumerate(exe.arg_arrays): if converted_args[i] in arg_params: arg_params[converted_args[i]] = arg_params[ converted_args[i]].astype(exe.arg_arrays[i].dtype) for i, key in enumerate(exe.aux_arrays): if converted_auxs[i] in aux_params: aux_params[converted_auxs[i]] = aux_params[ converted_auxs[i]].astype(exe.aux_arrays[i].dtype) inputs2.update(arg_params) exe.forward(is_train=False, **inputs2) exe.outputs[0].wait_to_read() inputs['fc1_weight'] = inputs['fc1_weight'].astype(np.float16) inputs['fc1_bias'] = inputs['fc1_bias'].astype(np.float16) exe2.forward(is_train=False, **inputs) exe2.outputs[0].wait_to_read() def check_amp_convert_model(): # Test with real world model, default inputs for convert_model dir_path = os.path.dirname(os.path.realpath(__file__)) model_path = os.path.join(dir_path, 'model') if not os.path.isdir(model_path): os.mkdir(model_path) prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path) sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) # Test with real world model, tweak inputs for convert_model result_sym, result_arg_params, result_aux_params = amp.convert_model( sym, arg_params, aux_params, target_dtype="float16", target_dtype_ops=["Convolution"]) mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.gpu()) mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1, )]]) mod.set_params(result_arg_params, result_aux_params) mod.forward( mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))], label=[mx.nd.ones((1, ))])) mod.get_outputs()[0].asnumpy() assert mod._arg_params["stage2_unit1_conv2_weight"].dtype == np.float32 # Call convert_model with cast_optional_params set to True result_sym, result_arg_params, result_aux_params = amp.convert_model( sym, arg_params, aux_params, target_dtype="float16", target_dtype_ops=["Convolution"], cast_optional_params=True) mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.gpu()) mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1, )]]) mod.set_params(result_arg_params, result_aux_params) mod.forward( mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))], label=[mx.nd.ones((1, ))])) mod.get_outputs()[0].asnumpy() assert mod._arg_params["stage2_unit1_conv2_weight"].dtype == np.float16 def check_amp_convert_hybrid_block(): # Test conversion for hybrid block on CPU model_cpu = get_model("resnet50_v1") model_cpu.collect_params().initialize(ctx=mx.cpu()) model_cpu.hybridize() model_cpu( mx.nd.random.uniform(0, 1, shape=(1, 3, 224, 224), ctx=mx.cpu())) converted_model_cpu = amp.convert_hybrid_block(model_cpu) # Test with real world model, default inputs for convert_hybrid_block model = get_model("resnet50_v1") model.collect_params().initialize(ctx=mx.gpu()) model.hybridize() model(mx.nd.zeros((1, 3, 224, 224))) converted_model = amp.convert_hybrid_block(model) result = converted_model.forward( mx.nd.zeros((1, 
3, 224, 224), dtype=np.float32)) result = converted_model.forward( mx.nd.zeros((1, 3, 224, 224), dtype=np.float32)) # Test with real world model, tweak inputs for convert_hybrid_block converted_model = amp.convert_hybrid_block( model, target_dtype="float16", target_dtype_ops=["Convolution"]) result = converted_model.forward( mx.nd.zeros((1, 3, 224, 224), dtype=np.float32)) result = converted_model.forward( mx.nd.zeros((1, 3, 224, 224), dtype=np.float32)) # Check symbolic block dir_path = os.path.dirname(os.path.realpath(__file__)) model_path = os.path.join(dir_path, 'model') if not os.path.isdir(model_path): os.mkdir(model_path) prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path) net = SymbolBlock.imports( os.path.join(model_path, "imagenet1k-resnet-18-symbol.json"), input_names=["data", "softmax_label"], param_file=os.path.join(model_path, "imagenet1k-resnet-18-0000.params")) net.collect_params().reset_ctx(ctx=mx.gpu()) net.hybridize() net(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1, ))) converted_model = amp.convert_hybrid_block(net) result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1, ))) result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1, ))) # Check symbolic block, tweaked inputs converted_model = amp.convert_hybrid_block( net, target_dtype="float16", target_dtype_ops=["Convolution"]) result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1, ))) result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1, ))) params = converted_model.collect_params() assert params["stage2_unit1_conv2_weight"].dtype == np.float32 # Pass cast_optional_params as True to convert_hybrid_block converted_model = amp.convert_hybrid_block( net, target_dtype="float16", target_dtype_ops=["Convolution"], cast_optional_params=True) params = converted_model.collect_params() assert params["stage2_unit1_conv2_weight"].dtype == np.float16 def check_amp_convert_bucketing_module(): model = train_model(context=mx.current_context()) result_model = amp.convert_bucketing_module(model) val_sent = [] batch_size = 128 invalid_label = -1 num_sentence = 1000 buckets = [5, 10, 20, 30, 40] len_vocab = 50 for _ in range(num_sentence): len_sentence = randint(6, max(buckets) - 1) # leave out the two last buckets empty val_sentence = [] for _ in range(len_sentence): val_sentence.append(randint(1, len_vocab)) val_sent.append(val_sentence) data_val = mx.rnn.BucketSentenceIter(val_sent, batch_size, buckets=buckets, invalid_label=invalid_label) result_model.bind(data_val.provide_data, data_val.provide_label, for_training=False) result_model.score(data_val, mx.metric.Perplexity(invalid_label), batch_end_callback=mx.callback.Speedometer( batch_size, 1)) # AMP conversion with cast_optional_params set to true # Flaky test when cast_optional_params set to True : https://github.com/apache/incubator-mxnet/issues/16030 ''' result_model = amp.convert_bucketing_module(model, cast_optional_params=True) result_model.bind(data_val.provide_data, data_val.provide_label, for_training=False) result_model.score(data_val, mx.metric.Perplexity(invalid_label), batch_end_callback=mx.callback.Speedometer(batch_size, 1)) ''' with mx.Context(mx.gpu(0)): check_amp_convert_symbol() check_amp_convert_model() check_amp_convert_hybrid_block() check_amp_convert_bucketing_module()
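# The checks above cover offline conversion; the same mxnet.contrib.amp module also supports
# mixed-precision training with dynamic loss scaling. A hedged sketch (model, data and
# optimizer settings are illustrative assumptions, and a GPU is assumed as in the tests):
import mxnet as mx
from mxnet import autograd, gluon
from mxnet.contrib import amp

amp.init()  # call before building the network so float16-safe casts are inserted
net = gluon.model_zoo.vision.get_model("resnet18_v1")
net.initialize(ctx=mx.gpu())
net.hybridize()
trainer = gluon.Trainer(net.collect_params(), "sgd", {"learning_rate": 0.1})
amp.init_trainer(trainer)  # enable dynamic loss scaling on this trainer
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

data = mx.nd.random.uniform(shape=(8, 3, 224, 224), ctx=mx.gpu())
label = mx.nd.zeros((8,), ctx=mx.gpu())
with autograd.record():
    out = net(data)
    loss = loss_fn(out, label)
    with amp.scale_loss(loss, trainer) as scaled_loss:
        autograd.backward(scaled_loss)
trainer.step(data.shape[0])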
def main():
    # Initialize problem parameters
    batch_size = 1
    prediction_length = 50
    context_length = 5
    axis = [-5, 5, -3, 3]
    float_type = np.float64
    ctx = mx.Context("gpu")

    num_samples = 3
    ts_idx = 0

    # Initialize test data to generate Gaussian Process from
    lb = -5
    ub = 5
    dx = (ub - lb) / (prediction_length - 1)
    x_test = nd.arange(lb, ub + dx, dx, ctx=ctx, dtype=float_type).reshape(-1, 1)
    x_test = nd.tile(x_test, reps=(batch_size, 1, 1))

    # Define the GP hyper parameters
    amplitude = nd.ones((batch_size, 1, 1), ctx=ctx, dtype=float_type)
    length_scale = math.sqrt(0.4) * nd.ones_like(amplitude)
    sigma = math.sqrt(1e-5) * nd.ones_like(amplitude)

    # Instantiate desired kernel object and compute kernel matrix
    rbf_kernel = RBFKernel(amplitude, length_scale)

    # Generate samples from 0 mean Gaussian process with RBF Kernel and plot it
    gp = GaussianProcess(
        sigma=sigma,
        kernel=rbf_kernel,
        prediction_length=prediction_length,
        context_length=context_length,
        num_samples=num_samples,
        ctx=ctx,
        float_type=float_type,
        sample_noise=False,  # Returns sample without noise
    )

    mean = nd.zeros((batch_size, prediction_length), ctx=ctx, dtype=float_type)
    covariance = rbf_kernel.kernel_matrix(x_test, x_test)
    gp.plot(x_test=x_test, samples=gp.sample(mean, covariance), ts_idx=ts_idx)

    # Generate training set on subset of interval using the sine function
    x_train = nd.array([-4, -3, -2, -1, 1], ctx=ctx, dtype=float_type).reshape(
        context_length, 1
    )
    x_train = nd.tile(x_train, reps=(batch_size, 1, 1))
    y_train = nd.sin(x_train.squeeze(axis=2))

    # Predict exact GP using the GP predictive mean and covariance using the same fixed hyper-parameters
    samples, predictive_mean, predictive_std = gp.exact_inference(
        x_train, y_train, x_test
    )

    assert (
        np.sum(np.isnan(samples.asnumpy())) == 0
    ), "NaNs in predictive samples!"

    gp.plot(
        x_train=x_train,
        y_train=y_train,
        x_test=x_test,
        ts_idx=ts_idx,
        mean=predictive_mean,
        std=predictive_std,
        samples=samples,
        axis=axis,
    )
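# For reference, the RBF (squared exponential) kernel used above has the standard form
# k(x, x') = a * exp(-||x - x'||^2 / (2 * l^2)); whether `amplitude` enters as a or a^2
# depends on the library's parameterization, so treat this NumPy cross-check as a hedged
# sketch rather than an exact reproduction of RBFKernel.kernel_matrix:
import numpy as np

def rbf_kernel_np(x1, x2, amplitude=1.0, length_scale=np.sqrt(0.4)):
    # x1: (n, 1), x2: (m, 1) column vectors -> (n, m) kernel matrix
    sq_dist = (x1 - x2.T) ** 2
    return amplitude * np.exp(-0.5 * sq_dist / length_scale ** 2)

x = np.linspace(-5, 5, 50).reshape(-1, 1)
K = rbf_kernel_np(x, x)
print(K.shape, np.allclose(K, K.T))  # (50, 50) True: the kernel matrix is symmetric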
def test_ndarray_copy():
    c = mx.nd.array(np.random.uniform(-10, 10, (10, 10)))
    d = c.copyto(mx.Context('cpu', 0))
    assert np.sum(np.abs(c.asnumpy() != d.asnumpy())) == 0.0
def generate_poisoning_fun(index=0, total_round=5): # load dataset dev = mx.gpu(1) batch_size = 1 # data_shape = (batch_size, 3072) data_shape = (batch_size, 3, 28, 28) train_iter = mx.io.ImageRecordIter( path_imgrec="data/cifar10/cifar10_train.rec", data_shape=(3, 28, 28), batch_size=batch_size, # mean_img="data/cifar10/mean.bin", rand_crop=True, rand_mirror=True, round_batch=True) val_iter = mx.io.ImageRecordIter( path_imgrec="data/cifar10/cifar10_val.rec", data_shape=(3, 28, 28), batch_size=batch_size, # mean_img="data/cifar10/mean.bin", rand_crop=True, rand_mirror=True, round_batch=True) all_data = [] all_label = [] poisoning_d = [] poisoning_l = [] poisoing_data_list = [] poisoing_label_list = [] for i in range(index): batch_data = copy.deepcopy(train_iter.next()) poisoning_d.append(batch_data.data[0]) poisoning_l.append(batch_data.label[0] + 1) for j in range(index): with mx.Context(dev): # load original model softmax, arg_params, aux_params = mx.model.load_checkpoint( 'model/cifar10_model', 300) model = mx.mod.Module(softmax, context=dev) model.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) model.set_params(arg_params, aux_params) # -------- parameters ---------------- train_iter.reset() dataBatchP = copy.deepcopy(train_iter.next()) dataBatchP.label[0] = dataBatchP.label[0] + 1 dataBatchP.data[0] = poisoning_d[j] num_normal = 10 attacked_model_lr = 0.01 model.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', attacked_model_lr), )) # -----------get normal data loss and accuracy---------- loss = 0 for num in range(num_normal): dataBatch = copy.deepcopy(train_iter.next()) label = dataBatch.label[0] model.forward(dataBatch) output = model.get_outputs()[0].asnumpy() loss += CalLogLoss(output, label.asnumpy()) print 'normal data loss: %.4f' % loss val_iter.reset() val_acc = model.score(val_iter, 'acc') print 'Val Acc: %.4f' % val_acc[0][1] # -----------get loss and accuracy with initial poisoned data---------- # load pre-trained weight model.forward(dataBatchP, is_train=True) model.backward() model.update() val_iter.reset() val_acc = model.score(val_iter, 'acc') print 'Val Acc: %.4f' % val_acc[0][1] # re-evaluate normal data loss loss = 0 train_iter.reset() dataBatch = copy.deepcopy(train_iter.next()) for num in range(num_normal): dataBatch = copy.deepcopy(train_iter.next()) label = dataBatch.label[0] model.forward(dataBatch) loss += CalLogLoss(model.get_outputs()[0].asnumpy(), label.asnumpy()) print 'normal data loss: %.4f' % loss # ---------generate poisoned data------------ plt.figure('poisoned data') # initial poisoned data plt.subplot(1, 5, 1) plt.imshow( (dataBatchP.data[0]).asnumpy()[0].astype(np.uint8).transpose( 1, 2, 0)) # print dataBatchP.data[0].asnumpy()[0] pre_loss = loss for round in range(total_round): start = time.time() print 'round %d' % round # calculate gradient wrt poisoned data dir = np.zeros(data_shape).reshape(1, 2352) label_tmp = copy.deepcopy(dataBatchP.label) for gradient_round in range(data_shape[-1] * data_shape[-2] * data_shape[-3]): data_tmp = copy.deepcopy(dataBatchP.data[0]) data_tmp = data_tmp.asnumpy().reshape(1, 2352) data_tmp[0][gradient_round] += 1 # load pre-trained weight model.set_params(arg_params, aux_params) dataBatch_tmp = copy.deepcopy( mx.io.DataBatch( [mx.nd.array(data_tmp.reshape(1, 3, 28, 28))], label_tmp)) model.forward(dataBatch_tmp, is_train=True) model.backward() # update attacked model model.update() # calculate normal data loss loss = 0 train_iter.reset() dataBatch 
= copy.deepcopy(train_iter.next()) for num in range(num_normal): dataBatch = copy.deepcopy(train_iter.next()) label = dataBatch.label[0] model.forward(dataBatch) output = model.get_outputs()[0].asnumpy() loss += CalLogLoss(output, label.asnumpy()) dir[0][gradient_round] = np.sign(loss - pre_loss) tmp = (dataBatchP.data[0]).asnumpy().reshape(1, 2352) + dir * 10 tmp[tmp > 255] = 255 tmp[tmp < 0] = 0 # print dataBatchP.data[0].asnumpy()[0] dataBatchP.data[0] = mx.nd.array(tmp.reshape(1, 3, 28, 28)) end = time.time() print 'time: %.4f' % (end - start) if round % 4 == 0: plt.subplot(1, 5, round / 2 + 2) plt.imshow(dataBatchP.data[0].asnumpy()[0].astype( np.uint8).transpose(1, 2, 0)) # print dataBatchP.data[0].asnumpy()[0] # make one attack # load pre-trained weight model.set_params(arg_params, aux_params) model.forward(dataBatchP, is_train=True) model.backward() # update attacked model model.update() val_iter.reset() val_acc = model.score(val_iter, 'acc') print 'Val Acc: %.4f' % val_acc[0][1] # re-evaluate normal data loss loss = 0 dataBatch = copy.deepcopy(train_iter.next()) for num in range(num_normal): dataBatch = copy.deepcopy(train_iter.next()) label = dataBatch.label[0] model.forward(dataBatch) output = model.get_outputs()[0].asnumpy() loss += CalLogLoss(output, label.asnumpy()) print 'normal data loss: %.4f' % loss pre_loss = loss poisoing_data_list.append(dataBatchP.data[0].asnumpy()[0] / 255) poisoing_label_list.append(dataBatchP.label[0].asnumpy()) all_data.append(poisoing_data_list) all_label.append(poisoing_label_list) return all_data, all_label
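# CalLogLoss is called throughout the poisoning routine above but is not defined in this
# excerpt. A minimal sketch consistent with how it is used (softmax outputs of shape
# (batch, num_classes) and integer class labels); an assumption, not the original helper:
import numpy as np

def CalLogLoss(output, label, eps=1e-12):
    # Pick the predicted probability of each true class and sum the negative log-likelihoods.
    probs = output[np.arange(output.shape[0]), label.astype(np.int64)]
    return float(-np.sum(np.log(np.clip(probs, eps, 1.0))))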