def train_model(args, model_choice):
    """Train and evaluate the CNN once per batch size, logging accuracy and timing.

    For every batch size in a fixed list a fresh ``CNN.cnn(args)`` network is
    created and run for ``args.max_epochs`` epochs.  During epoch 0 the
    network is trained with ``fine_tune=False``; from epoch 1 on it is trained
    with ``fine_tune=True`` and the vectors returned by ``train`` are fed back
    into the embedding model through ``model_update``.  After each training
    pass the test split is evaluated and the preprocessing / inference time
    per batch is measured.

    Only full batches are used: trailing samples that do not fill a complete
    batch are ignored, and a batch size larger than either split is skipped.

    Args:
        args: options object.  This function reads ``ratio``, ``sample``,
            ``use_cuda``, ``max_epochs`` and ``max_len`` and passes the whole
            object on to ``CNN.cnn`` and ``train``.
        model_choice: label used in the log header and in the result file
            names.  The network built here is always ``CNN.cnn``.

    Returns:
        None.  Results are written to
        ``results/<model_choice>_bs<batch_size>_info.txt`` (rewritten after
        every epoch) and ``results/batch-info.txt`` (written once at the end).

    Raises:
        Exception: whatever ``texts_preprocessing`` raises for a training
            batch is re-raised after the diagnostic output is printed.
    """
    # Load the corpus and split it into train/test parts.
    # (Smaller alternative corpus: 'Sentiment140/testdata.manual.2009.06.14.csv')
    filename = 'Sentiment140/training.1600000.processed.noemoticon.csv'
    train_set, test_set = divide_dataset(filename, args.ratio, args.sample)
    print('trainset = %d, testset = %d' % (len(train_set[0]), len(test_set[0])))

    # Load the pickled word2vec model.
    start = time.time()
    model_path = 'new_model.pkl'
    # Pickle data is binary, so the file must be opened with 'rb'; text mode
    # corrupts it on platforms that translate line endings.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(model_path, 'rb') as f:
        model = cPickle.load(f)
    print('loading model successfully. Time spend =  %s' % (time.time() - start))

    # all_info is never reset, so each per-batch-size result file also holds
    # the log of the batch sizes that ran before it.
    all_info = 'Train model %s \n' % model_choice
    xtrain, ytrain = train_set
    xtest, ytest = test_set
    batchinfo = ''

    for batch_size in [128, 256, 512, 1024, 2048, 4096]:
        print('batch_size =  %s' % batch_size)

        # Floor division: the counts are used as range() bounds.  They do not
        # change between epochs, so compute them once per batch size.
        train_num_batch = len(xtrain) // batch_size
        test_num_batch = len(xtest) // batch_size
        if train_num_batch == 0 or test_num_batch == 0:
            # The averages below would divide by zero.
            print('batch_size = %d skipped: not enough samples for one full '
                  'train and test batch' % batch_size)
            continue

        # Every batch size starts from a freshly initialised network.
        net = CNN.cnn(args)
        if args.use_cuda:
            net = net.cuda()

        for epoch in range(args.max_epochs):
            info = 'Epoch %d \n' % epoch

            # ---- training pass ----
            train_loss, train_correct = 0, 0
            for i in range(train_num_batch):
                # get data for each batch
                x = xtrain[i * batch_size:(i + 1) * batch_size]
                y = ytrain[i * batch_size:(i + 1) * batch_size]

                # process data to get word embedding
                try:
                    texts, x = texts_preprocessing(
                        x, model, args.max_len, preprocess_choice="vector")
                except Exception:
                    # Print what was being processed, then re-raise: going on
                    # would train on the raw batch and reuse a stale
                    # (or undefined) `texts`.
                    print('epoch =  %s' % epoch)
                    print(type(x))
                    raise

                # simulate one batch
                if epoch < 1:
                    loss, correct = train(net, (x, y), args, fine_tune=False)
                else:
                    # Fine-tuning also returns the updated input vectors,
                    # which are written back into the embedding model.
                    loss, correct, x = train(net, (x, y), args, fine_tune=True)
                    model = model_update(model, texts, x)

                train_loss += loss
                train_correct += correct

            info += 'Train loss: %.3f | Acc: %.3f%% (%d/%d) \n' % \
                (train_loss / train_num_batch,
                 100.0 * train_correct / train_num_batch / batch_size,
                 train_correct, train_num_batch * batch_size)

            # ---- evaluation pass, timing preprocessing and inference ----
            test_loss, test_correct = 0, 0
            process_time, test_time = 0, 0
            for i in range(test_num_batch):
                # get data for each batch
                x = xtest[i * batch_size:(i + 1) * batch_size]
                y = ytest[i * batch_size:(i + 1) * batch_size]

                # process data
                tick = time.time()
                _, x = texts_preprocessing(
                    x, model, args.max_len, preprocess_choice="vector")
                process_time += (time.time() - tick)

                # simulate one batch
                tick = time.time()
                loss, correct = test(net, (x, y), args.use_cuda)
                test_time += (time.time() - tick)

                test_loss += loss
                test_correct += correct

            avg_process_time = float(process_time) / test_num_batch
            avg_test_time = float(test_time) / test_num_batch

            info += 'Test loss: %.3f | Acc: %.3f%% (%d/%d) \n' % \
                (test_loss / test_num_batch,
                 100.0 * test_correct / test_num_batch / batch_size,
                 test_correct, test_num_batch * batch_size)
            info += 'batch_size = %d, for each batch, avg_process_time = %.6f, avg_test_time = %.6f \n' % \
                (batch_size, avg_process_time, avg_test_time)
            print(info)
            all_info += info

            # Rewrite the result file after every epoch so partial results
            # survive an interrupted run.
            save_file = 'results/%s_bs%d_info.txt' % (model_choice, batch_size)
            with open(save_file, 'w') as f:
                f.write(all_info)

        # Timing summary for this batch size, taken from the last epoch.
        batchinfo += 'batch_size = %d, for each batch, avg_process_time = %.6f, avg_test_time = %.6f \n' % \
            (batch_size, float(process_time) / test_num_batch, float(test_time) / test_num_batch) + \
            ' for each tweet, avg_process_time = %.6f, avg_test_time = %.6f \n' % \
            (float(process_time) / test_num_batch / batch_size,
             float(test_time) / test_num_batch / batch_size)

    with open('results/batch-info.txt', 'w') as f:
        f.write(batchinfo)
def train_model(args, model_choice):
    """Train and evaluate one network over several batch sizes, logging accuracy and timing.

    The vocabulary is loaded from ``vocab.pkl`` when that file exists and is
    otherwise built with ``build_vocab``.  A network selected by
    ``model_choice`` is created once and then, for each batch size in a fixed
    list, trained for ``args.max_epochs`` epochs; after every training pass
    the test split is evaluated and the preprocessing / inference time per
    batch is measured.

    Only full batches are used: trailing samples that do not fill a complete
    batch are ignored, and a batch size larger than either split is skipped.

    Args:
        args: options object.  This function reads ``min_freq``,
            ``use_cuda``, ``max_epochs`` and ``max_len``, sets
            ``args.vocab_size`` to ``len(vocab)``, and passes the whole
            object on to the network constructor and to ``train``.
        model_choice: one of ``"cnn"``, ``"lstm"`` or ``"lstm_attn_cnn"``.
            Any other value prints an error message and returns early.

    Returns:
        None.  Results are written to
        ``results/<name>_bs<batch_size>_info.txt`` (rewritten after every
        epoch) and ``results/<name>_batch-info.txt`` (written once at the
        end), where ``<name>`` is ``model_choice``, prefixed with ``embed_``
        when ``embed_flag`` is true.

    Note:
        ``embed_flag`` is not a parameter; it is read from the enclosing
        scope and forwarded to ``train`` and ``test``.
    """
    # Load the corpus and split it into train/test parts.
    # (Smaller alternative corpus: 'Sentiment140/testdata.manual.2009.06.14.csv')
    filename = 'Sentiment140/training.1600000.processed.noemoticon.csv'
    train_set, test_set = divide_dataset(filename, 0.8, 0.1)
    print('trainset = %d, testset = %d' % (len(train_set[0]), len(test_set[0])))

    # Load the cached vocabulary, or build it from the corpus.
    start = time.time()
    vocab_path = 'vocab.pkl'
    if os.path.exists(vocab_path):
        # Pickle data is binary, so the file must be opened with 'rb'; text
        # mode corrupts it on platforms that translate line endings.
        # NOTE: unpickling can execute arbitrary code -- only load trusted files.
        with open(vocab_path, 'rb') as f:
            vocab = cPickle.load(f)
    else:
        vocab = build_vocab(filename, args.min_freq)
    print('loading model successfully. Time spend =  %s' % (time.time() - start))

    # The network constructors receive the vocabulary size through args.
    args.vocab_size = len(vocab)
    if model_choice == "cnn":
        net = CNN.cnn(args)
    elif model_choice == "lstm":
        net = LSTM.lstm(args)
    elif model_choice == "lstm_attn_cnn":
        net = LSTM_CNN.lstm_attn_cnn(args)
    else:
        print("Wrong model_choice, please correct and try again.")
        return

    # all_info is never reset, so each per-batch-size result file also holds
    # the log of the batch sizes that ran before it.
    all_info = 'Train model %s \n' % model_choice
    if args.use_cuda:
        net = net.cuda()

    xtrain, ytrain = train_set
    xtest, ytest = test_set

    # From here on model_choice is only used to name the output files.
    if embed_flag:
        model_choice = 'embed_' + model_choice
        print('with embedding')
    else:
        print('no embedding')

    batchinfo = ''
    # NOTE(review): `net` is created once above, so every batch size keeps
    # training the same weights rather than starting fresh -- confirm that
    # this is intended.
    for batch_size in [128, 256, 512, 1024]:
        # Floor division: the counts are used as range() bounds.  They do not
        # change between epochs, so compute them once per batch size.
        train_num_batch = len(xtrain) // batch_size
        test_num_batch = len(xtest) // batch_size
        if train_num_batch == 0 or test_num_batch == 0:
            # The averages below would divide by zero.
            print('batch_size = %d skipped: not enough samples for one full '
                  'train and test batch' % batch_size)
            continue

        for epoch in range(args.max_epochs):
            info = 'Epoch %d \n' % epoch

            # ---- training pass ----
            train_loss, train_correct = 0, 0
            for i in range(train_num_batch):
                # get data for each batch
                x = xtrain[i * batch_size:(i + 1) * batch_size]
                y = ytrain[i * batch_size:(i + 1) * batch_size]

                # process data
                x = texts_preprocessing(
                    x, vocab, args.max_len, preprocess_choice="index")

                # simulate one batch
                loss, correct = train(net, (x, y), args, embed_flag)
                train_loss += loss
                train_correct += correct

            info += 'Train loss: %.3f | Acc: %.3f%% (%d/%d) \n' % \
                (train_loss / train_num_batch,
                 100.0 * train_correct / train_num_batch / batch_size,
                 train_correct, train_num_batch * batch_size)

            # ---- evaluation pass, timing preprocessing and inference ----
            test_loss, test_correct = 0, 0
            process_time, test_time = 0, 0
            for i in range(test_num_batch):
                # get data for each batch
                x = xtest[i * batch_size:(i + 1) * batch_size]
                y = ytest[i * batch_size:(i + 1) * batch_size]

                # process data
                tick = time.time()
                x = texts_preprocessing(
                    x, vocab, args.max_len, preprocess_choice="index")
                process_time += (time.time() - tick)

                # simulate one batch
                tick = time.time()
                loss, correct = test(net, (x, y), args.use_cuda, embed_flag)
                test_time += (time.time() - tick)

                test_loss += loss
                test_correct += correct

            info += 'Test loss: %.3f | Acc: %.3f%% (%d/%d) \n' % \
                (test_loss / test_num_batch,
                 100.0 * test_correct / test_num_batch / batch_size,
                 test_correct, test_num_batch * batch_size)
            info += 'batch_size = %d, for each batch, avg_process_time = %.6f, avg_test_time = %.6f \n' % \
                (batch_size, float(process_time) / test_num_batch, float(test_time) / test_num_batch)
            print(info)
            all_info += info

            # Rewrite the result file after every epoch so partial results
            # survive an interrupted run.
            save_file = 'results/%s_bs%d_info.txt' % (model_choice, batch_size)
            with open(save_file, 'w') as f:
                f.write(all_info)

        # Timing summary for this batch size, taken from the last epoch.
        batchinfo += 'batch_size = %d, for each batch, avg_process_time = %.6f, avg_test_time = %.6f \n' % \
            (batch_size, float(process_time) / test_num_batch, float(test_time) / test_num_batch)

    batch_file = 'results/%s_batch-info.txt' % (model_choice)
    with open(batch_file, 'w') as f:
        f.write(batchinfo)