mean_square_error = tf.reduce_mean(tf.square(output_score - y_))
train_step = tf.train.AdamOptimizer(learning_rate).minimize(mean_square_error)

sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

# DONOTCHANGE: Reserved for nsml use
bind_model(sess=sess, config=config)

# DONOTCHANGE: They are reserved for nsml
if config.pause:
    nsml.paused(scope=locals())

if config.mode == 'train':
    # Load the dataset.
    dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)
    dataset_len = len(dataset)
    if config.validate > 0:
        # Shuffle with a fixed seed and hold out the first `validate_len` samples for validation.
        validate_len = int(dataset_len * config.validate)
        ds_review = np.array([np.array(review) for review, _ in dataset])
        ds_label = np.array([int(label) for _, label in dataset])
        global_perm = np.random.RandomState(seed=777).permutation(dataset_len)
        ds_review, ds_label = ds_review[global_perm], ds_label[global_perm]
        dataset = MovieReviewDataset(remake=True, new_review=ds_review[validate_len:],
model = create_model()
model.compile(loss='mse', optimizer=Adam(), metrics=[])
model.summary()

# DONOTCHANGE: Reserved for nsml use
bind_model(model, config)

# DONOTCHANGE: They are reserved for nsml
if config.pause:
    nsml.paused(scope=locals())

# Used in training mode. (default)
if config.mode == 'train':
    # Load the dataset.
    print("Now Loading Dataset...")
    dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)
    save_data = dataset.save_data
    print("Loading Dataset Done")
    batches_per_epoch = int(len(dataset.labels) / config.batch) + 1

    # Run the training.
    def on_epoch_end(epoch, logs):
        # Reports the values shown in `nsml ps` or on the web TensorBoard.
        nsml.report(summary=True, scope=locals(), epoch=epoch, epoch_total=config.epochs,
                    train__loss=logs['loss'], step=epoch)
        # DONOTCHANGE (You can decide how often you want to save the model)
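    # How a reporting callback like `on_epoch_end` above is typically attached to Keras
    # training (a sketch; this submission's actual fit call is not shown here, and the
    # fit arguments below are assumptions, not the author's code):
    from keras.callbacks import LambdaCallback
    report_callback = LambdaCallback(on_epoch_end=on_epoch_end)
    # model.fit(x_train, y_train, batch_size=config.batch, epochs=config.epochs,
    #           callbacks=[report_callback])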
args.add_argument('--strmaxlen', type=int, default=200)
args.add_argument('--embedding', type=int, default=8)
config = args.parse_args()

learning_rate = 1e-3
grad_clip = True
dropout = 0.2
n_char = 4500
n_embed = 256
n_hidden = 256

if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
    DATASET_PATH = '../sample_data/movie_review/'

if config.mode == 'train':
    dataset = MovieReviewDataset(DATASET_PATH, flip=True)
else:
    dataset = MovieReviewDataset('', build=False, flip=False)

model = Movie(n_char, n_embed, n_hidden, dropout)
model_run = Movie(n_char, n_embed, n_hidden, dropout)
if GPU_NUM:
    model = model.cuda()
    model_run.cuda()
accumulate(model_run, model, 0)

# DONOTCHANGE: Reserved for nsml use
bind_model(model_run, dataset, config)

criterion = nn.MSELoss()
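# `accumulate` is defined elsewhere in this submission and is not shown here. Below is a
# minimal sketch of the usual pattern such a helper implements -- an exponential moving
# average of parameters, where decay=0 (as in the call above) simply copies the training
# model's weights into the running copy. The name `accumulate_sketch` and the exact update
# rule are assumptions, not the author's code.
def accumulate_sketch(model_ema, model, decay=0.999):
    ema_params = dict(model_ema.named_parameters())
    for name, param in model.named_parameters():
        # ema = decay * ema + (1 - decay) * current
        ema_params[name].data.mul_(decay).add_(param.data * (1 - decay))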
# DONOTCHANGE: Reserved for nsml use
bind_model(model, config)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)

# DONOTCHANGE: They are reserved for nsml
if config.pause:
    nsml.paused(scope=locals())

# Used in training mode. (default)
if config.mode == 'train':
    # Load the dataset.
    t0 = time.time()
    dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen, max_size=config.max_dataset)
    print("dataset loaded %.2f s" % (time.time() - t0))

    pin_memory = USE_GPU > 0
    if config.no_eval:
        # Train on the full dataset; skip held-out evaluation.
        train_loader = DataLoader(dataset=dataset, batch_size=config.batch, shuffle=True,
                                  collate_fn=collate_fn, num_workers=2, pin_memory=pin_memory)
        eval_loader = []
    else:
        # Split into train/eval loaders via samplers provided by the dataset.
        train_sampler, eval_sampler = dataset.get_sampler()
        train_loader = DataLoader(dataset=dataset, batch_size=config.batch, sampler=train_sampler,
                                  collate_fn=collate_fn, num_workers=2, pin_memory=pin_memory)
        eval_loader = DataLoader(dataset=dataset, batch_size=config.batch, sampler=eval_sampler,
                                 collate_fn=collate_fn, num_workers=2, pin_memory=pin_memory)
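# `collate_fn` above is defined elsewhere in this submission and is not shown here. A minimal
# sketch of what such a collate function usually does for variable-length, index-encoded
# reviews -- pad to the longest sequence in the batch and stack the labels -- follows; the
# padding value, dtypes, and the name `pad_collate` are assumptions, not the author's code.
import torch

def pad_collate(batch):
    reviews, labels = zip(*batch)
    max_len = max(len(r) for r in reviews)
    padded = torch.zeros(len(reviews), max_len, dtype=torch.long)  # 0 assumed to be the PAD index
    for i, r in enumerate(reviews):
        padded[i, :len(r)] = torch.tensor(r, dtype=torch.long)
    return padded, torch.tensor(labels, dtype=torch.float)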
lgb_model = object
models = (model, lgb_model, vect_word, vect_char)

# DONOTCHANGE: Reserved for nsml use
print("nsml binding...")
bind_model(models, config)

# DONOTCHANGE: They are reserved for nsml
if config.pause:
    nsml.paused(scope=locals())

# Used in training mode. (default)
if config.mode == 'train':
    # Load the dataset.
    print("data loading...")
    dataset = MovieReviewDataset(DATASET_PATH)
    # X_trn, X_val, Y_trn, Y_val = trn_val_seperation(dataset, 144570)
    X_trn, X_val, Y_trn, Y_val = trn_val_seperation(dataset, 3)

    # Fit the vectorizers.
    vect_word, vect_char = vect_fit(X_trn, vect_word, vect_char)

    # Vectorize the text.
    X_trn = vect_transform(X_trn, vect_word, vect_char)
    X_val = vect_transform(X_val, vect_word, vect_char)

    # Build the LightGBM datasets.
    train_data = lgb.Dataset(X_trn, Y_trn)
    valid_data = lgb.Dataset(X_val, Y_val, reference=train_data)
    gc.collect()
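# `vect_fit` and `vect_transform` are defined elsewhere in this submission and are not shown
# here. A minimal sketch of the word+char feature pattern their call signatures suggest is
# below; the use of scipy's sparse hstack and the `_sketch` names are assumptions, not the
# author's code.
from scipy.sparse import hstack

def vect_fit_sketch(texts, vect_word, vect_char):
    # Fit both vectorizers on the training texts and return them.
    vect_word.fit(texts)
    vect_char.fit(texts)
    return vect_word, vect_char

def vect_transform_sketch(texts, vect_word, vect_char):
    # Concatenate word-level and character-level sparse features column-wise.
    return hstack([vect_word.transform(texts), vect_char.transform(texts)]).tocsr()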
        word_embed: we, sent_len: sl, chars: cs, word_len: wl,
        sylls: ss, y_: label, is_training: train
    }

# DONOTCHANGE: Reserved for nsml
if config.pause:
    nsml.paused(scope=locals())

if config.mode == 'train':
    # Load the dataset.
    dataset = MovieReviewDataset(DATASET_PATH, max_sent_len, max_word_len, max_syll_num)
    dataset_len = len(dataset)
    one_batch_size = dataset_len // config.batch
    if dataset_len % config.batch != 0:
        one_batch_size += 1

    if config.debug:
        debugset = MovieReviewDataset(DEBUG3_PATH, max_sent_len, max_word_len, max_syll_num)
        debugset_len = len(debugset)
        one_debug_size = debugset_len // config.batch
        if debugset_len % config.batch != 0:
            one_debug_size += 1

    train_step = 0
    best_ema = 99999.0
# DONOTCHANGE: They are reserved for nsml
if config.pause:
    nsml.paused(scope=locals())

### Training mode
# Used in training mode. (default)
if config.mode == 'train':
    # Load the dataset.
    if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
        DATASET_PATH = '../sample_data/movie_review/'

    corpus = DP.Corpus(DATASET_PATH, total_train)
    print('[*]', 'Load corpus')

    # Load training data
    train_dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen, True, corpus)
    print('[*]', 'Load train dataset')
    train_loader = DataLoader(dataset=train_dataset, batch_size=config.batch, shuffle=True,
                              collate_fn=collate_fn, num_workers=1)
    total_train = len(train_loader)

    # Load validation data
    test_dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen, True, corpus, test=True)
    print('[*]', 'Load test dataset')
# Select model
args.add_argument('--model', type=str, default='SentpieceModel',
                  choices=['SentpieceModel', 'regression', 'classification', 'bilstmwithattn', 'cnntext'])
config = args.parse_args()

print('HAS_DATASET :', HAS_DATASET)
print('IS_ON_NSML :', IS_ON_NSML)
print('DATASET_PATH :', DATASET_PATH)
print(config)

sp = []
wp_vocab = []
preprcess_infer = {}

if config.mode == 'train':
    sp, wp_vocab = build_vocab(config.mode, DATASET_PATH, vocab_size=config.vocab_size)
    dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen,
                                 max_word_len=config.max_words_len,
                                 max_wp_len=config.max_wp_len, n_class=11)
    vocab_size = len(dataset.i2wp)
else:
    vocab_size = 19488  # 19475

model_type = {
    'SentpieceModel': SentpieceModel(vocab_size,
                                     char_emb_size=config.char_embedding,
                                     word_emb_size=config.word_embedding,
                                     hidden_size=config.hidden_dim,
                                     max_wp_len=config.max_wp_len,
                                     max_words_len=config.max_words_len)
}
models = [
    SentpieceModel(vocab_size,
for m in range(num_models):
    models.append(Model(sess, "model" + str(m), config))
tf.global_variables_initializer().run()

# DONOTCHANGE: Reserved for nsml
bind_model(sess=sess, config=config, model=models)

# DONOTCHANGE: Reserved for nsml
if config.pause:
    nsml.paused(scope=locals())

if config.mode == 'train':
    # Load the dataset.
    dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen, is_train=True)
    dataset_len = len(dataset)
    one_batch_size = dataset_len // config.batch
    if dataset_len % config.batch != 0:
        one_batch_size += 1

    # Run training for each epoch.
    for epoch in range(config.epochs):
        # avg_loss = 0.0
        avg_cost_list = np.zeros(len(models))
        for i, (data, labels) in enumerate(_batch_loader(dataset, config.batch)):
            labels = np.reshape(labels, (-1, 1))
            onehot_label = sess.run(
                tf.reshape(tf.one_hot(labels, depth=11, dtype=tf.float32),
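# `_batch_loader`, used above, is a helper defined elsewhere in these submissions. A minimal
# sketch is given below, under the assumption that slicing the dataset returns a
# (reviews, labels) pair for that batch; the name `_batch_loader_sketch` is a hypothetical
# stand-in, not the author's code.
def _batch_loader_sketch(iterable, batch_size):
    # Yield consecutive chunks of `iterable`; the last chunk may be smaller than batch_size.
    length = len(iterable)
    for start in range(0, length, batch_size):
        yield iterable[start:min(start + batch_size, length)]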
model = CNNReg(config.vocasize, config.embedding, config.maxlen, GPU_NUM)
if GPU_NUM:
    model = model.cuda()

# DONOTCHANGE: Reserved for nsml use
bind_model(model, config)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=config.lr)

# DONOTCHANGE: They are reserved for nsml
if config.pause:
    nsml.paused(scope=locals())

# Used in training mode. (default)
if config.mode == 'train':
    dataset = MovieReviewDataset(DATASET_PATH, config.vocasize, config.minlen, config.maxlen)
    # The collate_fn transposes a list of (review, label) pairs into (reviews, labels) batches.
    train_loader = DataLoader(dataset=dataset, batch_size=config.batch, shuffle=True,
                              collate_fn=lambda data: zip(*data), num_workers=2)
    total_batch = len(train_loader)

    # Run training for each epoch.
    for epoch in range(config.epochs):
        avg_loss = 0.0
        for i, (data, labels) in enumerate(train_loader):
            predictions = model(data)
            labels = Variable(torch.from_numpy(np.array(labels)))
            if GPU_NUM:
                labels = labels.cuda()
model = models.Model(inputs=inputs, outputs=[outputs1, outputs2])
model.summary()
model.compile(optimizer=optimizers.Adam(lr=0.001, amsgrad=True, clipvalue=1.0),
              loss=['categorical_crossentropy', 'mse'], metrics=['accuracy'])

# DONOTCHANGE: Reserved for nsml use
bind_model(model)

# DONOTCHANGE: They are reserved for nsml
if config.pause:
    nsml.paused(scope=locals())

# Used in training mode. (default)
if config.mode == 'train':
    # Load the dataset.
    dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)
    dataset_len = len(dataset)
    one_batch_size = dataset_len // config.batch
    if dataset_len % config.batch != 0:
        one_batch_size += 1

    # Run training for each epoch.
    for epoch in range(config.epochs):
        avg_loss = 0.0
        avg_acc = 0.0
        dataset.shuffle()
        for i, (data, labels, sentiments) in enumerate(_batch_loader(dataset, config.batch)):
            # train_on_batch on a two-output model returns
            # [total loss, CE loss, MSE loss, CE accuracy, MSE accuracy].
            loss, ce_loss, mse_loss, ce_acc, mse_acc = model.train_on_batch(data, [sentiments, labels])
            print('Batch : ', i + 1, '/', one_batch_size,
                  ', loss in this minibatch: ', float(loss),
                  ', CE in this minibatch: ', float(ce_loss),
                  ', CE ACC in this minibatch: ', float(ce_acc),