def __init__(self, hparams=DotDict({
        'model_type': 'transformer',
        'ninp': 128,
        'nhead': 2,
        'nhid': 512,
        'nlayers': 2,
        'tie_layers': True,
        'tie_encoder_decoder': True,
        'dropout': 0.1,
})):
    super(LanguageModelTrainer, self).__init__()
    self.hparams = hparams if isinstance(hparams, DotDict) \
        else DotDict(hparams)

    from utils import get_default_tokenizer
    self.vocab_size = get_default_tokenizer()._tokenizer.get_vocab_size()

    self.model_type = hparams.get('model_type', 'transformer')
    assert self.model_type in ['transformer', 'lstm']

    if self.model_type == 'transformer':
        self.model = TransformerModel(ntoken=self.vocab_size, **hparams)
    else:
        self.model = LSTMModel(ntoken=self.vocab_size, **hparams)

    self.batch_size = hparams.get('batch_size', 64)
    self.bptt = hparams.get('bptt', 128)
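# A minimal usage sketch, not from the original source: assuming DotDict wraps a plain dict
# with attribute access and that the underlying model constructors tolerate the extra keys
# forwarded via **hparams (as the defaults above suggest), the trainer could be built with
# overridden hyperparameters like this. 'model_type': 'lstm' selects the LSTMModel branch.
trainer = LanguageModelTrainer(hparams=DotDict({
    'model_type': 'lstm',
    'ninp': 128,
    'nhid': 512,
    'nlayers': 2,
    'dropout': 0.1,
    'batch_size': 64,   # read back via hparams.get('batch_size', 64)
    'bptt': 128,        # read back via hparams.get('bptt', 128)
}))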
def main():
    # Set hyperparameters.
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = 'models/lstm_model.h5'
    num_words = 40000
    num_label = 2

    # Load and preprocess the dataset.
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Build and compile the model.
    model = LSTMModel(num_words, num_label, embeddings=None).build()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    # Evaluate the best checkpoint.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)
    print('precision: {:.4f}'.format(precision_score(y_test, y_pred, average='binary')))
    print('recall   : {:.4f}'.format(recall_score(y_test, y_pred, average='binary')))
    print('f1       : {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))
## Did it finish?
last_output = " ".join(ss_output.stdout.decode("utf-8").split('\n')[-2:])
if "Saving model in tsv format" not in last_output:  ## won't save a file if it doesn't finish.
    print('Starspace did not complete. PANIC! \nReverting to default initialization.')
    args.init_embed = 0  ## change the parameter to not use Starspace embeddings later.
'''

# Init models, opt and criterion
if args.model == 'FastText':
    print("Using FastText model")
    model = FastText(len(vocab), args.embeddim, len(label_map), args.cuda)
else:
    print("Using LSTM model")
    model = LSTMModel(len(vocab), args.embeddim, args.hiddendim, label_map,
                      args.batchsize, args.cuda)

crit = nn.BCEWithLogitsLoss()

if args.cuda:
    print("Using cuda")
    torch.cuda.device(args.gpu)
    model.cuda()
    crit.cuda()
else:
    print("Using CPU only")

params = list(model.parameters())
opti = torch.optim.Adam(params, lr=args.lr)
print(model)


def evaluate(model, loader, crit, cuda, bs, num_labels, model_type):
    data_size = len(loader)
def create_model(self) -> torch.nn.Module:
    return LSTMModel(self.calculate_input_size(), self.calculate_output_size(),
                     self.helper.opt.state_size, self.helper.opt.n_layers)
# Main loop
while True:
    min_test_loss = 1.e6
    loss = 0.0
    train_loss_seq = []
    test_loss_seq = []

    if model_type == 'Transformer':
        model = TransformerModel(config)
    elif model_type == 'LSTM':
        model = LSTMModel(config)
    if cuda:
        model = model.cuda()

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=config['train']['learning_rate'],
                                 weight_decay=config['train']['weight_decay'])
    criterion = torch.nn.MSELoss()
    optimizer.zero_grad()

    for it in range(n_iter):
        model.train()
        country = random.choice(train_countries)
        inp, target = get_data_tensor(data, country, measure_mode,
                                      output_mode=output_mode, cuda=cuda)
def main():
    # Set hyperparameters.
    batch_size = 128
    epochs = 100
    maxlen = 300
    # model_path = 'models/rnn_model.h5'
    # model_path = 'models/lstm_model.h5'
    # model_path = 'models/CNN_model.h5'
    model_path = 'models/lstm_iniemb_model.h5'
    num_words = 4000
    num_label = 2

    # Load the dataset.
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

    # Preprocess the dataset.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Word embeddings.
    wv = load_fasttext('data/cc.ja.300.vec')
    wv = filter_embeddings(wv, vocab.word_index, num_words)

    # Build the model.
    # model = RNNModel(num_words, num_label, embeddings=None).build()
    model = LSTMModel(num_words, num_label, embeddings=wv).build()
    # model = CNNModel(num_words, num_label, embeddings=None).build()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    # Prediction.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)
    print('precision: {:.4f}'.format(precision_score(y_test, y_pred, average='binary')))
    print('recall: {:.4f}'.format(recall_score(y_test, y_pred, average='binary')))
    print('f1: {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))
def launch(model_params, checkpoint_path, device='cuda'):
    print('model_params:\t', model_params)

    max_length = model_params['bptt']

    tokenizer = get_default_tokenizer()

    eos_token = tokenizer.token_to_id('[SEP]')
    eod_token = tokenizer.token_to_id('[DOC_SEP]')
    vocab_size = tokenizer._tokenizer.get_vocab_size()

    assert eos_token is not None, 'Invalid tokenizer files - EOS token cannot be null'

    # Model
    from models import TransformerModel, LSTMModel

    model_type = model_params.get('model_type', 'transformer')
    assert model_type in ['transformer', 'lstm']

    if model_type == 'transformer':
        model = TransformerModel(ntoken=vocab_size, **model_params)
    else:
        model = LSTMModel(ntoken=vocab_size, **model_params)

    model = model.to(device)

    if checkpoint_path and path.exists(checkpoint_path):
        print(f'Loading checkpoint from {checkpoint_path}')
        checkpoint_state = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint_state)

    @torch.no_grad()
    def _generate(input_ids=None,
                  max_length=max_length,
                  do_sample=True,
                  num_beams=5,
                  temperature=1.3,
                  top_k=50,
                  top_p=1.0,
                  repetition_penalty=1.2,
                  eos_token_ids=[eos_token, eod_token],
                  length_penalty=1.0,
                  num_return_sequences=1,
                  vocab_size=vocab_size):
        pad_token_id = 0
        model.eval()

        batch_size = 1
        cur_len = input_ids.shape[1]

        # Expand input to num beams
        input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len)
        input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len)

        # generated hypotheses
        generated_hyps = [
            BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=False)
            for _ in range(batch_size)
        ]

        # scores for each sentence in the beam
        beam_scores = torch.zeros((batch_size, num_beams),
                                  dtype=torch.float,
                                  device=input_ids.device)
        beam_scores[:, 1:] = -1e9
        beam_scores = beam_scores.view(-1)  # shape (batch_size * num_beams,)

        # cache compute states
        past = None

        # done sentences
        done = [False for _ in range(batch_size)]

        while cur_len < max_length:
            outputs = model(input_ids.t())
            outputs = outputs.permute(1, 0, 2)
            scores = outputs[:, -1, :]

            # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
            if repetition_penalty != 1.0:
                for i in range(batch_size * num_beams):
                    for previous_token in set(input_ids[i].tolist()):
                        # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
                        if scores[i, previous_token] < 0:
                            scores[i, previous_token] *= repetition_penalty
                        else:
                            scores[i, previous_token] /= repetition_penalty

            if do_sample:
                # Temperature (higher temperature => more likely to sample low probability tokens)
                if temperature != 1.0:
                    scores = scores / temperature
                # Top-p/top-k filtering
                scores = top_k_top_p_filtering(
                    scores, top_k=top_k, top_p=top_p,
                    min_tokens_to_keep=2)  # (batch_size * num_beams, vocab_size)
                # Sample 2 next words for each beam (so we have some spare tokens and match output of greedy beam search)
                try:
                    next_words = torch.multinomial(
                        torch.softmax(scores, dim=-1),
                        num_samples=2,
                        replacement=True)  # (batch_size * num_beams, 2)
                except Exception:
                    print((torch.softmax(scores, dim=-1) > 0).sum())
                    raise ValueError()
                # Compute next scores
                _scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
                _scores = torch.gather(_scores, -1, next_words)  # (batch_size * num_beams, 2)
                next_scores = _scores + beam_scores[:, None].expand_as(_scores)  # (batch_size * num_beams, 2)
                # Match shape of greedy beam search
                next_words = next_words.view(batch_size, 2 * num_beams)  # (batch_size, 2 * num_beams)
                next_scores = next_scores.view(batch_size, 2 * num_beams)  # (batch_size, 2 * num_beams)
            else:
                # do greedy beam search
                scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
                assert scores.size() == (batch_size * num_beams, vocab_size)
                # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product)
                _scores = scores + beam_scores[:, None].expand_as(scores)  # (batch_size * num_beams, vocab_size)
                # re-organize to group the beam together (we are keeping top hypothesis across beams)
                _scores = _scores.view(batch_size, num_beams * vocab_size)  # (batch_size, num_beams * vocab_size)
                next_scores, next_words = torch.topk(_scores, 2 * num_beams,
                                                     dim=1, largest=True, sorted=True)

            assert next_scores.size() == next_words.size() == (batch_size, 2 * num_beams)

            # next batch beam content
            # list of (batch_size * num_beams) tuple(next hypothesis score, next word, current position in the batch)
            next_batch_beam = []

            # for each sentence
            for batch_ex in range(batch_size):
                # if we are done with this sentence
                done[batch_ex] = done[batch_ex] or generated_hyps[batch_ex].is_done(
                    next_scores[batch_ex].max().item())
                if done[batch_ex]:
                    next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams)  # pad the batch
                    continue

                # next sentence beam content
                next_sent_beam = []

                # next words for this sentence
                for idx, score in zip(next_words[batch_ex], next_scores[batch_ex]):
                    # get beam and word IDs
                    beam_id = idx // vocab_size
                    word_id = idx % vocab_size

                    # end of sentence, or next word
                    if word_id.item() in eos_token_ids or cur_len + 1 == max_length:
                        generated_hyps[batch_ex].add(
                            input_ids[batch_ex * num_beams + beam_id, :cur_len].clone(),
                            score.item())
                    else:
                        next_sent_beam.append((score, word_id, batch_ex * num_beams + beam_id))

                    # the beam for next step is full
                    if len(next_sent_beam) == num_beams:
                        break

                # update next beam content
                assert len(next_sent_beam) == 0 if cur_len + 1 == max_length else num_beams
                if len(next_sent_beam) == 0:
                    next_sent_beam = [(0, pad_token_id, 0)] * num_beams  # pad the batch
                next_batch_beam.extend(next_sent_beam)
                assert len(next_batch_beam) == num_beams * (batch_ex + 1)

            # sanity check / prepare next batch
            assert len(next_batch_beam) == batch_size * num_beams
            beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
            beam_words = input_ids.new([x[1] for x in next_batch_beam])
            beam_idx = input_ids.new([x[2] for x in next_batch_beam])

            # re-order batch
            input_ids = input_ids[beam_idx, :]
            input_ids = torch.cat([input_ids, beam_words.unsqueeze(1)], dim=-1)

            # re-order internal states
            if past:
                reordered_past = []
                for layer_past in past:
                    # get the correct batch idx from layer past batch dim
                    # batch dim of `past` and `mems` is at 2nd position
                    reordered_layer_past = [
                        layer_past[:, i].unsqueeze(1).clone().detach() for i in beam_idx
                    ]
                    reordered_layer_past = torch.cat(reordered_layer_past, dim=1)
                    # check that shape matches
                    assert reordered_layer_past.shape == layer_past.shape
                    reordered_past.append(reordered_layer_past)
                past = tuple(reordered_past)

            # update current length
            cur_len = cur_len + 1

            # stop when we are done with each sentence
            if all(done):
                break

        # select the best hypotheses
        tgt_len = input_ids.new(batch_size)
        best = []

        for i, hypotheses in enumerate(generated_hyps):
            if len(hypotheses.hyp) == 0:
                continue
            best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1]
            tgt_len[i] = len(best_hyp) + 1  # +1 for the <EOS> symbol
            best.append(best_hyp)

        # generate target batch
        decoded = input_ids.new(batch_size, tgt_len.max().item()).fill_(pad_token_id)
        for i, hypo in enumerate(best):
            decoded[i, :tgt_len[i] - 1] = hypo
            decoded[i, tgt_len[i] - 1] = eos_token_ids[0]

        return decoded

    model_input = LEADING_TEXT

    while True:
        user_prompt = input(' >>> ')
        if user_prompt == 'exit':
            exit()
        else:
            num_return_sequences = 1

            model_input += ' [P0] ' + user_prompt + ' [SEP] [P1] '
            input_ids = tokenizer.encode(model_input).ids
            input_ids = torch.LongTensor(input_ids).unsqueeze(0)
            input_ids = input_ids.to(device)

            output = _generate(input_ids=input_ids,
                               max_length=min(max_length, input_ids.size(1) + 40))

            if num_return_sequences != 1:
                output = output.view(batch_size, num_return_sequences, -1)

            response = tokenizer.decode(output[0].cpu().tolist(), skip_special_tokens=False)

            eod_token = '[DOC_SEP]'
            if eod_token in response:
                response = response[response.index(eod_token):]

            start_token = '[P1]'
            sep_token = '[SEP]'
            if start_token in response:
                start_idx = response.index(start_token) + len(start_token) + 1
                response = response[start_idx:]
            if sep_token in response:
                sep_idx = response.index(sep_token)
                response = response[:sep_idx]

            model_input += response + f' {sep_token} '

            print('Bot: ' + response)
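# Hypothetical invocation of launch() above (names and values are illustrative, not from the
# source): model_params mirrors the keys the function reads ('bptt', 'model_type', plus
# whatever the chosen model constructor expects), and checkpoint_path points at a state_dict
# saved earlier with torch.save(model.state_dict(), ...).
if __name__ == '__main__':
    launch(model_params={'model_type': 'transformer', 'bptt': 128,
                         'ninp': 128, 'nhead': 2, 'nhid': 512, 'nlayers': 2},
           checkpoint_path='checkpoints/lm.pt',
           device='cuda' if torch.cuda.is_available() else 'cpu')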
def main():
    try:
        os.mkdir(args.snapshot_directory)
    except:
        pass

    images = []
    files = os.listdir(args.dataset_path)
    files.sort()
    for filename in files:
        image = np.load(os.path.join(args.dataset_path, filename))
        image = image / 255
        images.append(image)

    images = np.vstack(images)
    images = images.transpose((0, 3, 1, 2)).astype(np.float32)
    train_dev_split = 0.9
    num_images = images.shape[0]
    num_train_images = int(num_images * train_dev_split)
    num_dev_images = num_images - num_train_images
    images_train = images[:num_train_images]
    images_dev = images[num_train_images:]  # dev set is the held-out tail of the array

    xp = np
    using_gpu = args.gpu_device >= 0
    if using_gpu:
        cuda.get_device(args.gpu_device).use()
        xp = cp

    hyperparams = HyperParameters(snapshot_directory=args.snapshot_directory)
    hyperparams.print()

    if hyperparams.use_gru:
        model = GRUModel(hyperparams, snapshot_directory=args.snapshot_directory)
    else:
        model = LSTMModel(hyperparams, snapshot_directory=args.snapshot_directory)

    if using_gpu:
        model.to_gpu()

    dataset = draw.data.Dataset(images_dev)
    iterator = draw.data.Iterator(dataset, batch_size=1)

    cols = hyperparams.generator_generation_steps
    figure = plt.figure(figsize=(8, 4 * cols))
    axis_1 = figure.add_subplot(cols, 3, 1)
    axis_1.set_title("Data")

    axis_rec_array = []
    for n in range(cols):
        axis_rec_array.append(figure.add_subplot(cols, 3, n * 3 + 2))
    axis_rec_array[0].set_title("Reconstruction")

    axis_gen_array = []
    for n in range(cols):
        axis_gen_array.append(figure.add_subplot(cols, 3, n * 3 + 3))
    axis_gen_array[0].set_title("Generation")

    for batch_index, data_indices in enumerate(iterator):
        with chainer.using_config("train", False), chainer.using_config(
                "enable_backprop", False):
            x = dataset[data_indices]
            x = to_gpu(x)
            axis_1.imshow(make_uint8(x[0]))

            r_t_array, x_param = model.sample_image_at_each_step_from_posterior(
                x, zero_variance=args.zero_variance, step_limit=args.step_limit)
            for r_t, axis in zip(r_t_array, axis_rec_array[:-1]):
                r_t = to_cpu(r_t)
                axis.imshow(make_uint8(r_t[0]))
            mu_x, ln_var_x = x_param
            mu_x = to_cpu(mu_x.data)
            axis_rec_array[-1].imshow(make_uint8(mu_x[0]))

            r_t_array, x_param = model.sample_image_at_each_step_from_prior(
                batch_size=1, xp=xp)
            for r_t, axis in zip(r_t_array, axis_gen_array[:-1]):
                r_t = to_cpu(r_t)
                axis.imshow(make_uint8(r_t[0]))
            mu_x, ln_var_x = x_param
            mu_x = to_cpu(mu_x.data)
            axis_gen_array[-1].imshow(make_uint8(mu_x[0]))

            plt.pause(0.01)
def train_and_evaluate(dataset, loss, noise, run=0, num_batch=32, asymmetric=0):
    val_split = 0.1

    if dataset == 'mnist':
        kerasModel = MNISTModel(num_batch=num_batch)
        kerasModel.optimizer = Adagrad()
    elif dataset == 'cifar10_deep':
        kerasModel = CIFAR10Model(num_batch=num_batch, type='deep')
    elif dataset[8:-1] == 'resnet':
        kerasModel = CIFAR10Model(num_batch=num_batch, type=dataset[8:])
    elif dataset == 'cifar100':
        kerasModel = CIFAR100Model(num_batch=num_batch)
    elif dataset == 'imdb':
        kerasModel = IMDBModel(num_batch=num_batch)
        kerasModel.optimizer = Adagrad()
    elif dataset == 'lstm':
        kerasModel = LSTMModel(num_batch=num_batch)
        kerasModel.optimizer = Adagrad(lr=0.001)
    else:
        raise ValueError('No dataset given.')

    # an important data-dependent configuration
    if dataset == 'cifar100':
        filter_outlier = False
    else:
        filter_outlier = True

    # the data, shuffled and split between train and test sets
    print('Loading %s ...' % dataset)
    X_train, X_test, y_train, y_test = kerasModel.get_data()
    print('Done.')

    # apply label noise
    if asymmetric == 0:
        y_train, P = noisify_with_P(y_train, kerasModel.classes, noise, random_state=run)
    elif asymmetric == 1:
        if dataset == 'mnist':
            y_train, P = noisify_mnist_asymmetric(y_train, noise, random_state=run)
        elif dataset == 'cifar100':
            y_train, P = noisify_cifar100_asymmetric(y_train, noise, random_state=run)
        elif dataset[:7] == 'cifar10':
            y_train, P = noisify_cifar10_asymmetric(y_train, noise, random_state=run)
        else:
            # binary classes
            y_train, P = noisify_binary_asymmetric(y_train, noise, random_state=run)

    print('T: \n', P)

    # convert class vectors to binary class matrices
    Y_train = to_categorical(y_train, kerasModel.classes)
    Y_test = to_categorical(y_test, kerasModel.classes)

    # keep track of the best model
    model_file = build_file_name('tmp_model/', dataset, loss, noise, asymmetric, run)

    # this is the case when we post-train changing the loss
    if loss == 'est_backward':
        vanilla_file = build_file_name('tmp_model/', dataset, 'crossentropy',
                                       noise, asymmetric, run)
        if not os.path.isfile(vanilla_file):
            raise ValueError('Need to train with crossentropy first !')

        # first compile the vanilla crossentropy model with the saved weights
        kerasModel.build_model('crossentropy', P=None)
        kerasModel.load_model(vanilla_file)

        # estimate P using all of X_train
        est = NoiseEstimator(classifier=kerasModel, alpha=0.0,
                             filter_outlier=filter_outlier)
        P_est = est.fit(X_train).predict()
        print('Condition number:', np.linalg.cond(P_est))
        print('T estimated: \n', P_est)

        # compile the model with the new estimated loss
        kerasModel.build_model('backward', P=P_est)

    elif loss == 'est_forward':
        vanilla_file = build_file_name('tmp_model/', dataset, 'crossentropy',
                                       noise, asymmetric, run)
        if not os.path.isfile(vanilla_file):
            raise ValueError('Need to train with crossentropy first !')

        # first compile the vanilla crossentropy model with the saved weights
        kerasModel.build_model('crossentropy', P=None)
        kerasModel.load_model(vanilla_file)

        # estimate P using all of X_train
        est = NoiseEstimator(classifier=kerasModel, alpha=0.0,
                             filter_outlier=filter_outlier)
        P_est = est.fit(X_train).predict()
        print('T estimated: \n', P_est)

        # compile the model with the new estimated loss
        kerasModel.build_model('forward', P=P_est)

    else:
        # compile the model
        kerasModel.build_model(loss, P)

    # fit the model
    history = kerasModel.fit_model(model_file, X_train, Y_train,
                                   validation_split=val_split)

    # write the training history to disk
    history_file = build_file_name('history/', dataset, loss, noise, asymmetric, run)
    with open(history_file, 'wb') as f:
        pickle.dump(history, f)
    print('History dumped at ' + str(history_file))

    # test
    score = kerasModel.evaluate_model(X_test, Y_test)

    # clean up saved models, unless it is vanilla crossentropy --to be reused by P_est
    if loss != 'crossentropy':
        os.remove(model_file)

    return score
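# Illustrative call, not part of the original script: train the 'lstm' text model with 20%
# symmetric label noise under the plain crossentropy loss (the 'est_backward' / 'est_forward'
# modes above require this crossentropy run to exist first). The returned score is whatever
# kerasModel.evaluate_model() reports on the clean test set.
score = train_and_evaluate(dataset='lstm', loss='crossentropy', noise=0.2,
                           run=0, num_batch=32, asymmetric=0)
print('test score:', score)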
def create_model(self) -> torch.nn.Module:
    return LSTMModel(self.train_set.in_channels(), self.train_set.out_channels(),
                     self.helper.opt.state_size, self.helper.opt.n_layers)
def main():
    try:
        os.mkdir(args.snapshot_directory)
    except:
        pass

    comm = chainermn.create_communicator()
    device = comm.intra_rank
    cuda.get_device(device).use()
    xp = cp

    images = []
    files = os.listdir(args.dataset_path)
    files.sort()
    subset_size = int(math.ceil(len(files) / comm.size))
    files = deque(files)
    files.rotate(-subset_size * comm.rank)
    files = list(files)[:subset_size]
    for filename in files:
        image = np.load(os.path.join(args.dataset_path, filename))
        image = image / 256
        images.append(image)
    print(comm.rank, files)

    images = np.vstack(images)
    images = images.transpose((0, 3, 1, 2)).astype(np.float32)
    train_dev_split = 0.9
    num_images = images.shape[0]
    num_train_images = int(num_images * train_dev_split)
    num_dev_images = num_images - num_train_images
    images_train = images[:num_train_images]

    # To avoid OpenMPI bug
    # multiprocessing.set_start_method("forkserver")
    # p = multiprocessing.Process(target=print, args=("", ))
    # p.start()
    # p.join()

    hyperparams = HyperParameters()
    hyperparams.chz_channels = args.chz_channels
    hyperparams.generator_generation_steps = args.generation_steps
    hyperparams.generator_share_core = args.generator_share_core
    hyperparams.generator_share_prior = args.generator_share_prior
    hyperparams.generator_share_upsampler = args.generator_share_upsampler
    hyperparams.generator_downsampler_channels = args.generator_downsampler_channels
    hyperparams.inference_share_core = args.inference_share_core
    hyperparams.inference_share_posterior = args.inference_share_posterior
    hyperparams.inference_downsampler_channels = args.inference_downsampler_channels
    hyperparams.batch_normalization_enabled = args.enable_batch_normalization
    hyperparams.use_gru = args.use_gru
    hyperparams.no_backprop_diff_xr = args.no_backprop_diff_xr

    if comm.rank == 0:
        hyperparams.save(args.snapshot_directory)
        hyperparams.print()

    if args.use_gru:
        model = GRUModel(hyperparams, snapshot_directory=args.snapshot_directory)
    else:
        model = LSTMModel(hyperparams, snapshot_directory=args.snapshot_directory)
    model.to_gpu()

    optimizer = AdamOptimizer(model.parameters,
                              lr_i=args.initial_lr,
                              lr_f=args.final_lr,
                              beta_1=args.adam_beta1,
                              communicator=comm)
    if comm.rank == 0:
        optimizer.print()

    num_pixels = images.shape[1] * images.shape[2] * images.shape[3]

    dataset = draw.data.Dataset(images_train)
    iterator = draw.data.Iterator(dataset, batch_size=args.batch_size)

    num_updates = 0

    for iteration in range(args.training_steps):
        mean_kld = 0
        mean_nll = 0
        mean_mse = 0
        start_time = time.time()

        for batch_index, data_indices in enumerate(iterator):
            x = dataset[data_indices]
            x += np.random.uniform(0, 1 / 256, size=x.shape)
            x = to_gpu(x)

            z_t_param_array, x_param, r_t_array = model.sample_z_and_x_params_from_posterior(x)

            loss_kld = 0
            for params in z_t_param_array:
                mean_z_q, ln_var_z_q, mean_z_p, ln_var_z_p = params
                kld = draw.nn.functions.gaussian_kl_divergence(
                    mean_z_q, ln_var_z_q, mean_z_p, ln_var_z_p)
                loss_kld += cf.sum(kld)

            loss_sse = 0
            for r_t in r_t_array:
                loss_sse += cf.sum(cf.squared_error(r_t, x))

            mu_x, ln_var_x = x_param

            loss_nll = cf.gaussian_nll(x, mu_x, ln_var_x)

            loss_nll /= args.batch_size
            loss_kld /= args.batch_size
            loss_sse /= args.batch_size
            loss = args.loss_beta * loss_nll + loss_kld + args.loss_alpha * loss_sse

            model.cleargrads()
            loss.backward(loss_scale=optimizer.loss_scale())
            optimizer.update(num_updates, loss_value=float(loss.array))

            num_updates += 1
            mean_kld += float(loss_kld.data)
            mean_nll += float(loss_nll.data)
            mean_mse += float(loss_sse.data) / num_pixels / (
                hyperparams.generator_generation_steps - 1)

            printr(
                "Iteration {}: Batch {} / {} - loss: nll_per_pixel: {:.6f} - mse: {:.6f} - kld: {:.6f} - lr: {:.4e}"
                .format(iteration + 1, batch_index + 1, len(iterator),
                        float(loss_nll.data) / num_pixels + math.log(256.0),
                        float(loss_sse.data) / num_pixels /
                        (hyperparams.generator_generation_steps - 1),
                        float(loss_kld.data), optimizer.learning_rate))

            if comm.rank == 0 and batch_index > 0 and batch_index % 100 == 0:
                model.serialize(args.snapshot_directory)

        if comm.rank == 0:
            model.serialize(args.snapshot_directory)

        if comm.rank == 0:
            elapsed_time = time.time() - start_time
            print(
                "\r\033[2KIteration {} - loss: nll_per_pixel: {:.6f} - mse: {:.6f} - kld: {:.6f} - lr: {:.4e} - elapsed_time: {:.3f} min"
                .format(iteration + 1,
                        mean_nll / len(iterator) / num_pixels + math.log(256.0),
                        mean_mse / len(iterator), mean_kld / len(iterator),
                        optimizer.learning_rate, elapsed_time / 60))
def main(args):
    print(args)
    startime = time.time()
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

    # Set hyper-parameters.
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = 'models/model_{}.h5'
    num_words = 40000
    num_label = 2

    # Data loading.
    print(return_time(startime), "1. Loading data ...")
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

    # Pre-processing.
    print(return_time(startime), "2. Preprocessing dataset ...")
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Preparing word embedding.
    if args.loadwv:
        print(return_time(startime), "3. Loading word embedding ...")
        wv_path = 'data/wv_{0}_{1}.npy'.format(maxlen, num_words)
        if os.path.exists(wv_path):
            wv = np.load(wv_path)
            print(return_time(startime), "Loaded word embedding successfully!")
        else:
            print(return_time(startime), "Word embedding file doesn't exist")
            exit()
    else:
        print(return_time(startime), "3. Preparing word embedding ...")
        wv = load_fasttext('data/cc.ja.300.vec.gz')
        wv = filter_embeddings(wv, vocab.word_index, num_words)

    # Saving word embedding.
    if args.savewv:
        wv_path = 'data/wv_{0}_{1}.npy'.format(maxlen, num_words)
        np.save(wv_path, wv)
        print(return_time(startime), "Saved word embedding successfully!", wv_path)

    # Build models.
    models = [
        RNNModel(num_words, num_label, embeddings=None).build(),
        LSTMModel(num_words, num_label, embeddings=None).build(),
        CNNModel(num_words, num_label, embeddings=None).build(),
        RNNModel(num_words, num_label, embeddings=wv).build(),
        LSTMModel(num_words, num_label, embeddings=wv).build(),
        CNNModel(num_words, num_label, embeddings=wv).build(),
        CNNModel(num_words, num_label, embeddings=wv, trainable=False).build()
    ]
    model_names = [
        "RNN-None", "LSTM-None", "CNN-None",
        "RNN-wv", "LSTM-wv", "CNN-wv", "CNN-wv-notrain"
    ]

    print(return_time(startime), "4. Start training ...")
    for i, model in enumerate(models):
        print("***********************************")
        print(return_time(startime), "Model:", model_names[i])
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['acc'])

        # Preparing callbacks.
        callbacks = [
            EarlyStopping(patience=3),
            ModelCheckpoint(model_path.format(model_names[i]), save_best_only=True)
        ]

        # Train the model.
        model.fit(x=x_train,
                  y=y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.2,
                  callbacks=callbacks,
                  shuffle=True)

        # Inference.
        model = load_model(model_path.format(model_names[i]))
        api = InferenceAPI(model, vocab, preprocess_dataset)
        y_pred = api.predict_from_sequences(x_test)
        print('precision: {:.4f}'.format(precision_score(y_test, y_pred, average='binary')))
        print('recall   : {:.4f}'.format(recall_score(y_test, y_pred, average='binary')))
        print('f1       : {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))