def __init__(
        self,
        path="/home/james/PycharmProjects/flaskChatbot/app/seq2seq_backend/seq2seq_model.ckpt-44000"
):
    self.metadata, self.idx_q, self.idx_a = data.load_data(
        PATH='/home/james/PycharmProjects/flaskChatbot/app/seq2seq_backend/datasets/cornell_corpus/')
    self.path = path
    (trainX, trainY), (testX, testY), (validX, validY) = data_utils.split_dataset(
        self.idx_q, self.idx_a)

    # parameters
    xseq_len = trainX.shape[-1]
    yseq_len = trainY.shape[-1]
    batch_size = 32
    xvocab_size = len(self.metadata['idx2w'])
    yvocab_size = xvocab_size
    emb_dim = 1024

    import seq2seq_wrapper

    self.model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_len,
                                         yseq_len=yseq_len,
                                         xvocab_size=xvocab_size,
                                         yvocab_size=yvocab_size,
                                         ckpt_path='ckpt/cornell_corpus/',
                                         emb_dim=emb_dim,
                                         num_layers=3)
    self.sess = tf.Session()
    saver = tf.train.Saver()
    saver.restore(self.sess, self.path)
def __init__(self, train=False):
    # load data from pickle and npy files
    self.metadata, idx_q, idx_a = data.load_data(PATH='datasets/twitter/')
    (trainX, trainY), (testX, testY), (validX, validY) = data_utils.split_dataset(
        idx_q, idx_a)

    # parameters
    xseq_len = trainX.shape[-1]
    yseq_len = trainY.shape[-1]
    batch_size = 16
    xvocab_size = len(self.metadata['idx2w'])
    yvocab_size = xvocab_size
    emb_dim = 1024

    importlib.reload(seq2seq_wrapper)
    self.model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_len,
                                         yseq_len=yseq_len,
                                         xvocab_size=xvocab_size,
                                         yvocab_size=yvocab_size,
                                         ckpt_path='ckpt/twitter/',
                                         emb_dim=emb_dim,
                                         num_layers=3)
    if train:
        val_batch_gen = data_utils.rand_batch_gen(validX, validY, 32)
        train_batch_gen = data_utils.rand_batch_gen(
            trainX, trainY, batch_size)
        sess = self.model.train(train_batch_gen, val_batch_gen)
    self.sess = self.model.restore_last_session()
def saparate_dataset(uncompressed_dir,
                     split_data_dir,
                     ratio={'train': 0.7, 'dev': 0.15, 'test': 0.15}):
    """
    Separate the files from the original folder into split folders, e.g. train, dev and test.
    The number of files follows the designated ratio, and each file stays in its
    corresponding class folder.

    Args:
        uncompressed_dir: The folder that contains one sub-folder per class
        split_data_dir: The folder in which to store the split folders: train, dev and test

    Returns:
        None
    """
    # Check if the split destination exists
    if not tf.io.gfile.exists(split_data_dir):
        tf.io.gfile.mkdir(split_data_dir)

    # Make a dir for each specified split
    for k in ratio:
        dir_path = os.path.join(split_data_dir, k)
        if not tf.io.gfile.exists(dir_path):
            tf.io.gfile.mkdir(dir_path)

    # Make the shuffle reproducible
    random.seed(523)

    # Walk through the dirs under uncompressed_dir and split their files into the split dirs
    for i, dir_ in enumerate(os.listdir(uncompressed_dir)):
        dir_path = os.path.join(uncompressed_dir, dir_)
        filenames = os.listdir(dir_path)
        sys.stdout.write("\r>>Separating images into sets %d/%d" %
                         (i + 1, len(os.listdir(uncompressed_dir))))
        sys.stdout.flush()
        split_filenames = data_utils.split_dataset(filenames, ratio)
        for split, filelist in split_filenames.items():
            split_class_path = os.path.join(split_data_dir,
                                            os.path.join(split, dir_))
            if not tf.io.gfile.exists(split_class_path):
                tf.io.gfile.mkdir(split_class_path)
            src_path = map(lambda x: os.path.join(dir_path, x), filelist)
            dst_path = map(lambda x: os.path.join(split_class_path, x), filelist)
            for src, dst in zip(src_path, dst_path):
                shutil.copyfile(src, dst)
    sys.stdout.write('\n')
    sys.stdout.flush()
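# A minimal usage sketch for the function above. The directory paths are
# hypothetical and only illustrate the expected layout: one sub-folder per
# class under the uncompressed directory.
saparate_dataset(uncompressed_dir='data/uncompressed',
                 split_data_dir='data/split',
                 ratio={'train': 0.7, 'dev': 0.15, 'test': 0.15})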
def get_model():
    importlib.reload(d_data)
    importlib.reload(IE_data)
    d_metadata, d_idx_q, d_idx_a = d_data.load_data(PATH='../datasets/danny/')
    i_metadata, i_idx_q, i_idx_a = IE_data.load_data(PATH='../datasets/IE/')
    (d_trainX, d_trainY), (d_testX, d_testY), (d_validX, d_validY) = data_utils.split_dataset(d_idx_q, d_idx_a)
    (i_trainX, i_trainY), (i_testX, i_testY), (i_validX, i_validY) = data_utils.split_dataset(i_idx_q, i_idx_a)

    d_model = seq2seq_wrapper.Seq2Seq(
        xseq_len=d_trainX.shape[-1],
        yseq_len=d_trainY.shape[-1],
        xvocab_size=len(d_metadata['idx2w']),
        yvocab_size=len(d_metadata['idx2w']),
        ckpt_path='../ckpt/danny/',
        loss_path='',
        metadata=d_metadata,
        emb_dim=1024,
        num_layers=3
    )
    i_model = seq2seq_wrapper.Seq2Seq(
        xseq_len=i_trainX.shape[-1],
        yseq_len=i_trainY.shape[-1],
        xvocab_size=len(i_metadata['idx2w']),
        yvocab_size=len(i_metadata['idx2w']),
        ckpt_path='../ckpt/IE/',
        loss_path='',
        metadata=i_metadata,
        emb_dim=1024,
        num_layers=3
    )

    d_sess = d_model.restore_last_session()
    i_sess = i_model.restore_last_session()

    return d_model, i_model, d_sess, i_sess, d_metadata, i_metadata
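# A hedged usage sketch: get_model() restores both checkpointed models together
# with their TensorFlow sessions. How the sessions are then used for decoding is
# project-specific, so only the unpacking of the return value is shown here.
d_model, i_model, d_sess, i_sess, d_metadata, i_metadata = get_model()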
def getdata(genre):
    metadata, idx_prev, idx_curr, idx_next = data.load_data(genre, PATH='dataset/')
    (train_prev, train_curr, train_next), (test_prev, test_curr, test_next), (valid_prev, valid_curr, valid_next) \
        = data_utils.split_dataset(idx_prev, idx_curr, idx_next)

    train = defaultdict()
    train['p'] = train_prev
    train['c'] = train_curr
    train['n'] = train_next

    valid = defaultdict()
    valid['p'] = valid_prev
    valid['c'] = valid_curr
    valid['n'] = valid_next

    test = defaultdict()
    test['p'] = test_prev
    test['c'] = test_curr
    test['n'] = test_next

    return train, valid, test, metadata
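# A minimal usage sketch for getdata(). 'lyrics' is a hypothetical genre name
# used only to show the call shape and the 'p'/'c'/'n' (previous/current/next)
# keys of the returned split dictionaries.
train, valid, test, metadata = getdata('lyrics')
prev_batch, curr_batch, next_batch = train['p'], train['c'], train['n']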
import tensorflow as tf
import numpy as np
import data, data_utils
import sys

# gather dataset
data_ctl, idx_words, idx_phonemes = data.load_data()
(trainX, trainY), (testX, testY), (validX, validY) = data_utils.split_dataset(idx_words, idx_phonemes)

# parameters
xseq_len = trainX.shape[-1]
yseq_len = trainY.shape[-1]
batch_size = 128
xvocab_size = len(data_ctl['idx2alpha'].keys())  # 27
yvocab_size = len(data_ctl['idx2pho'].keys())  # 70
emb_dim = 128

'''
 build the graph
'''
tf.reset_default_graph()

enc_ip = [tf.placeholder(dtype=tf.int32,
                         shape=(None,),
                         name='ei_{}'.format(i))
          for i in range(xseq_len)]
# alternatively
# enc_ip = tf.placeholder(shape=[None, xseq_len], dtype=tf.int32, name='enc_ip')
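# Illustrative only (not part of the original graph): with one placeholder per
# timestep as above, a feed_dict maps each 'ei_{i}' placeholder to the column of
# i-th tokens from a padded batch. The random batch below is made up and assumes
# batch-major input of shape (batch_size, xseq_len).
example_batch = np.random.randint(0, xvocab_size, size=(batch_size, xseq_len))
feed_dict = {enc_ip[t]: example_batch[:, t] for t in range(xseq_len)}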
device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu'

# Download cats vs. dogs dataset if needed
data_dir = Path(args.data_dir)
if not (data_dir / 'PetImages').exists():
    data_dir.mkdir(parents=True, exist_ok=True)
    download_and_unzip_from_url(args.dataset_url, data_dir)
dataset_dir = data_dir / 'PetImages'

# Create checkpoints save dir if needed
if not Path(args.checkpoints_dir).exists():
    Path(args.checkpoints_dir).mkdir(parents=True, exist_ok=True)

# Randomly split the cats vs. dogs dataset into train/valid/test portions
rng = np.random.RandomState(args.seed)
imagepaths, classes = split_dataset(dataset_dir, rng, args.p_train, args.p_val, args.p_test)

# Get pytorch datasets and dataloaders
transform = get_transforms()
dataset = get_torch_datasets(transform, imagepaths, data_dir)
batch_size = {
    'train': args.batch_size,
    'test': args.test_batch_size,
    'val': args.test_batch_size
}
loader = get_torch_loaders(dataset, batch_size)

# Some filenames for intermediate checkpointing
phase_1_savename = 'squeezenet_post_output_layer_training.pth'
phase_2_savename = 'squeezenet_post_finetuning.pth'
phase_1_savepath = Path(args.checkpoints_dir) / phase_1_savename
@author: gopal
"""
import tensorflow as tf
import numpy as np
import importlib

import seq2seq_wrapper
from dataset import data
import data_utils

importlib.reload(seq2seq_wrapper)
importlib.reload(data_utils)

# preprocessed data
# load data from pickle and npy files
metadata, idx_p, idx_x, idx_a = data.load_data(PATH='dataset/')
(trainP, trainX, trainA), (testP, testX, testA), (validP, validX, validA) = data_utils.split_dataset(idx_p, idx_x, idx_a)


def length(x):
    # Length of a padded sequence: number of tokens before the first zero entry;
    # if there is no zero padding, the full sequence length is returned.
    for i, token in enumerate(x):
        if token == 0:
            return i
    return len(x)


filter_index_10 = [i for i in range(len(trainX))
                   if length(trainX[i]) == 10 and length(trainP[i]) > 10 and length(trainA[i]) > 10]
filter_index_15 = [i for i in range(len(trainX))
                   if length(trainX[i]) == 15 and length(trainP[i]) > 10 and length(trainA[i]) > 10]
filter_index_07 = [i for i in range(len(trainX))
                   if length(trainX[i]) == 7 and length(trainP[i]) > 10 and length(trainA[i]) > 10]

trainX_filter_10 = trainX[filter_index_10]
trainA_filter_10 = trainA[filter_index_10]
trainP_filter_10 = trainP[filter_index_10]
def main():
    t = time.localtime()
    t_mark = time.strftime("%m-%d %H:%M", t)
    print('\n', t_mark, '\n')
    print('device:', device)

    # ========= Get Parameters =========#
    # train parameters
    parser = argparse.ArgumentParser(description='Vivi')
    parser.add_argument('--dataset', type=str, default='poem_1031k_theme_train')
    parser.add_argument('--epochs', type=int, default=15)
    parser.add_argument('--ckpt_path', type=str, default='')
    parser.add_argument('--val_rate', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=80)
    parser.add_argument('--teacher_forcing_ratio', type=float, default=0.8)
    parser.add_argument('--model_name', type=str, default='Seq2seq_12')
    parser.add_argument('--train_mode', type=str, default='kw2poem')  # nL21L or kw2poem
    parser.add_argument('--note', type=str, default='')
    parser.add_argument('--train_soft', type=bool, default=True)  # Jul12
    parser.add_argument('--template', type=bool, default=False)  # Jul12
    parser.add_argument('--w1', type=float, default=3.)
    parser.add_argument('--w2', type=float, default=0.)
    args = parser.parse_args()

    dataset = args.dataset
    dataset_path = 'resource/dataset/' + dataset + '.txt'
    epochs = args.epochs
    ckpt_path = args.ckpt_path
    val_rate = args.val_rate
    batch_size = args.batch_size
    teacher_forcing_ratio = args.teacher_forcing_ratio
    model_name = args.model_name
    train_mode = args.train_mode
    train_param = vars(args)

    # load model parameters
    checkpoint = None
    if os.path.exists(ckpt_path):
        checkpoint = torch.load(ckpt_path)
        model_param = checkpoint['model_param']
        train_param = checkpoint['train_param']
        last_epoch = checkpoint['epoch']
    else:
        conf = configparser.ConfigParser()
        conf.read('config/config_' + model_name + '.ini')
        model_param_li = conf.items('model_param')
        model_param = {'model_name': model_name}
        for item in model_param_li:
            model_param[item[0]] = item[1]
        last_epoch = 0

    print('train param: ', train_param)
    print('model param: ', model_param)

    # ========= Preparing Data =========#
    # read data
    if model_name == 'BERT':
        pairs = data_utils.read_BERT_train_data(dataset)
    elif train_mode == 'nL21L':
        pairs = data_utils.read_nL21L_train_data(dataset_path)
    else:
        pairs = data_utils.read_train_data(dataset_path)

    # split dataset
    train_pairs, val_pairs = data_utils.split_dataset(pairs, val_rate)  # pairs

    data_path = 'models.' + model_name + '.PoetryData'
    PoetryData = importlib.import_module(data_path)
    train_Dataset = getattr(PoetryData, 'PoetryData')(
        train_pairs,
        src_max_len=int(model_param['input_max_len']),
        tgt_max_len=int(model_param['target_max_len']))
    val_Dataset = getattr(PoetryData, 'PoetryData')(
        val_pairs,
        src_max_len=int(model_param['input_max_len']),
        tgt_max_len=int(model_param['target_max_len']))  # look up the dataset class by reflection and instantiate it

    # wrap the datasets into mini-batches
    train_data = Data.DataLoader(
        dataset=train_Dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,  # Jun16
        collate_fn=PoetryData.paired_collate_fn
        # num_workers=2  # read data with multiple workers, fetching several samples at a time
    )
    val_data = Data.DataLoader(
        dataset=val_Dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,  # Jun16
        collate_fn=PoetryData.paired_collate_fn
        # num_workers=2
    )

    # ========= Preparing Model =========#
    model_path = 'models.' + model_name + '.' + model_name
    Model = importlib.import_module(model_path)  # import the model module
    model = getattr(Model, model_name)(model_param)  # look up the model class by reflection and instantiate it
    print('model:', model)

    optim_path = 'models.' + model_name + '.Optim'
    Optim = importlib.import_module(optim_path)  # optimizer module (file)
    optimizer = Optim.get_optimizer(model, model_param)  # call the module's factory function
    print('optimizer:', optimizer)

    # load model from ckpt
    if os.path.exists(ckpt_path):
        # checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)  # would be a duplicate load; checkpoint was loaded above
        model.load_state_dict(checkpoint['model'])

    # write log head
    save_dir = 'ckpt/' + str(time.strftime(
        "%m%d%H%M%S", t)) + '_' + model_param['model_name'] + '/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    with open(save_dir + 'log', 'a') as f:
        f.write('\n\n' + str(t_mark) + '\nsave dir:' + save_dir + '\n' +
                str(train_param) + '\n' + str(model_param) + '\n')

    print('start training')
    train(train_data, val_data, model, optimizer, batch_size, epochs,
          last_epoch, val_rate, teacher_forcing_ratio, save_dir, t,
          model_param, train_param)
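# A conventional entry point is assumed below so main() can be run as a script;
# the original snippet does not show one, so this guard is an illustrative addition.
if __name__ == '__main__':
    main()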