Example #1
    def __init__(
        self,
        path="/home/james/PycharmProjects/flaskChatbot/app/seq2seq_backend/seq2seq_model.ckpt-44000"
    ):
        self.metadata, self.idx_q, self.idx_a = data.load_data(
            PATH='/home/james/PycharmProjects/flaskChatbot/app/seq2seq_backend/datasets/cornell_corpus/'
        )
        self.path = path
        (trainX, trainY), (testX, testY), (validX, validY) = data_utils.split_dataset(
            self.idx_q, self.idx_a)

        # parameters
        xseq_len = trainX.shape[-1]
        yseq_len = trainY.shape[-1]
        batch_size = 32
        xvocab_size = len(self.metadata['idx2w'])
        yvocab_size = xvocab_size
        emb_dim = 1024

        import seq2seq_wrapper

        self.model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_len,
                                             yseq_len=yseq_len,
                                             xvocab_size=xvocab_size,
                                             yvocab_size=yvocab_size,
                                             ckpt_path='ckpt/cornell_corpus/',
                                             emb_dim=emb_dim,
                                             num_layers=3)
        self.sess = tf.Session()
        saver = tf.train.Saver()
        saver.restore(self.sess, self.path)
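# Note: data_utils.split_dataset(idx_q, idx_a) is called in every seq2seq example here but its
# implementation is not shown. The following is only a minimal sketch of what such a ratio-based
# three-way split is assumed to look like (the name split_dataset_sketch and the 70/15/15 ratio
# are assumptions, not the project's actual code):
def split_dataset_sketch(x, y, ratio=(0.70, 0.15, 0.15)):
    # slice the parallel question/answer arrays into train/test/valid chunks
    data_len = len(x)
    lens = [int(data_len * r) for r in ratio]
    trainX, trainY = x[:lens[0]], y[:lens[0]]
    testX, testY = x[lens[0]:lens[0] + lens[1]], y[lens[0]:lens[0] + lens[1]]
    validX, validY = x[-lens[2]:], y[-lens[2]:]
    return (trainX, trainY), (testX, testY), (validX, validY)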
Example #2
 def __init__(self, train=False):
     # load data from pickle and npy files
     self.metadata, idx_q, idx_a = data.load_data(PATH='datasets/twitter/')
     (trainX, trainY), (testX, testY), (validX, validY) = data_utils.split_dataset(
         idx_q, idx_a)
     # parameters
     xseq_len = trainX.shape[-1]
     yseq_len = trainY.shape[-1]
     batch_size = 16
     xvocab_size = len(self.metadata['idx2w'])
     yvocab_size = xvocab_size
     emb_dim = 1024
     importlib.reload(seq2seq_wrapper)
     self.model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_len,
                                          yseq_len=yseq_len,
                                          xvocab_size=xvocab_size,
                                          yvocab_size=yvocab_size,
                                          ckpt_path='ckpt/twitter/',
                                          emb_dim=emb_dim,
                                          num_layers=3)
     if train:
         val_batch_gen = data_utils.rand_batch_gen(validX, validY, 32)
         train_batch_gen = data_utils.rand_batch_gen(
             trainX, trainY, batch_size)
         sess = self.model.train(train_batch_gen, val_batch_gen)
     self.sess = self.model.restore_last_session()
Example #3
def saparate_dataset(uncompressed_dir,
                     split_data_dir,
                     ratio={
                         'train': 0.7,
                         'dev': 0.15,
                         'test': 0.15
                     }):
    """ Saparate the files from original folder to split folder, eq. train, dev and test. The amount of
        files would follow the designated ratio and stay in corresponding class folder
    Args:
        uncompressed_dir: The folder stores folder named by class
        split_data_dir: The folder to store the saparated folders, train, dev and test
    Returns:
        None
    """
    # Check if split destination exists
    if not tf.io.gfile.exists(split_data_dir):
        tf.io.gfile.mkdir(split_data_dir)

    # Make a directory for each specified split
    for k in ratio:
        dir_path = os.path.join(split_data_dir, k)
        if not tf.io.gfile.exists(dir_path):
            tf.io.gfile.mkdir(dir_path)

    # Make the shuffle reproducible
    random.seed(523)

    # Walk through the class dirs under uncompressed_dir and split their files into the split dirs
    for i, dir_ in enumerate(os.listdir(uncompressed_dir)):
        dir_path = os.path.join(uncompressed_dir, dir_)
        filenames = os.listdir(dir_path)

        sys.stdout.write("\r>>Saparating images into sets %d/%d" %
                         (i + 1, len(os.listdir(uncompressed_dir))))
        sys.stdout.flush()

        split_filenames = data_utils.split_dataset(filenames, ratio)
        for split, filelist in split_filenames.items():
            split_class_path = os.path.join(split_data_dir,
                                            os.path.join(split, dir_))
            if not tf.io.gfile.exists(split_class_path):
                tf.io.gfile.mkdir(split_class_path)
            src_path = map(lambda x: os.path.join(dir_path, x), filelist)
            dst_path = map(lambda x: os.path.join(split_class_path, x),
                           filelist)
            for src, dst in zip(src_path, dst_path):
                shutil.copyfile(src, dst)

    sys.stdout.write('\n')
    sys.stdout.flush()
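# Hedged usage sketch for the helper above; the directory names and ratio below are hypothetical:
#
#   saparate_dataset(uncompressed_dir='data/flower_photos',
#                    split_data_dir='data/flower_photos_split',
#                    ratio={'train': 0.8, 'dev': 0.1, 'test': 0.1})
#
# Each class sub-folder of uncompressed_dir is copied into split_data_dir/<split>/<class>/
# according to the given ratio.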
Example #4
def get_model():
    importlib.reload(d_data)
    importlib.reload(IE_data)

    d_metadata, d_idx_q, d_idx_a = d_data.load_data(PATH='../datasets/danny/')
    i_metadata, i_idx_q, i_idx_a = IE_data.load_data(PATH='../datasets/IE/')

    (d_trainX, d_trainY), (d_testX, d_testY), (d_validX, d_validY) = data_utils.split_dataset(d_idx_q, d_idx_a)
    (i_trainX, i_trainY), (i_testX, i_testY), (i_validX, i_validY) = data_utils.split_dataset(i_idx_q, i_idx_a)

    d_model = seq2seq_wrapper.Seq2Seq(
        xseq_len=d_trainX.shape[-1],
        yseq_len=d_trainY.shape[-1],
        xvocab_size=len(d_metadata['idx2w']),
        yvocab_size=len(d_metadata['idx2w']),
        ckpt_path='../ckpt/danny/',
        loss_path='',
        metadata=d_metadata,
        emb_dim=1024,
        num_layers=3
    )

    i_model = seq2seq_wrapper.Seq2Seq(
        xseq_len=i_trainX.shape[-1],
        yseq_len=i_trainY.shape[-1],
        xvocab_size=len(i_metadata['idx2w']),
        yvocab_size=len(i_metadata['idx2w']),
        ckpt_path='../ckpt/IE/',
        loss_path='',
        metadata=i_metadata,
        emb_dim=1024,
        num_layers=3
    )

    d_sess = d_model.restore_last_session()
    i_sess = i_model.restore_last_session()

    return d_model, i_model, d_sess, i_sess, d_metadata, i_metadata
Example #5
def getdata(genre):
    metadata, idx_prev, idx_curr, idx_next = data.load_data(genre,PATH='dataset/')
    (train_prev, train_curr, train_next), (test_prev, test_curr, test_next), (valid_prev, valid_curr, valid_next) \
        = data_utils.split_dataset(idx_prev, idx_curr, idx_next)
    train = {'p': train_prev, 'c': train_curr, 'n': train_next}
    valid = {'p': valid_prev, 'c': valid_curr, 'n': valid_next}
    test = {'p': test_prev, 'c': test_curr, 'n': test_next}

    return train, valid, test, metadata
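# Hedged usage sketch: 'lyrics' is a hypothetical genre argument for data.load_data; each returned
# dict maps 'p'/'c'/'n' to the previous/current/next line arrays of that split.
#
#   train, valid, test, metadata = getdata('lyrics')
#   print(len(train['p']), len(valid['p']), len(test['p']))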
Example #6
import tensorflow as tf
import numpy as np 
import data, data_utils

import sys

# gather dataset
data_ctl, idx_words, idx_phonemes = data.load_data()
(trainX, trainY), (testX, testY), (validX, validY) = data_utils.split_dataset(idx_words, idx_phonemes)


# parameters 
xseq_len = trainX.shape[-1]
yseq_len = trainY.shape[-1]
batch_size = 128
xvocab_size = len(data_ctl['idx2alpha'].keys())  # 27
yvocab_size = len(data_ctl['idx2pho'].keys())  # 70
emb_dim = 128


'''
 build the graph

'''
tf.reset_default_graph()

enc_ip = [ tf.placeholder(dtype=tf.int32,
                       shape = (None,),
                       name = 'ei_{}'.format(i)) for i in range(xseq_len) ]
# alternatively
#  enc_ip = tf.placeholder(shape=[None,xseq_len], dtype=tf.int32, name='enc_ip')
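# The snippet stops after the encoder inputs. In this legacy TF1 seq2seq style the decoder side is
# usually wired as below; this is a sketch of the common pattern, not code from the original source
# (the placeholder names 'l_{}' and 'GO' are assumptions):
labels = [ tf.placeholder(dtype=tf.int32,
                          shape=(None,),
                          name='l_{}'.format(t)) for t in range(yseq_len) ]
# decoder inputs: a GO symbol (all zeros) followed by the labels shifted right by one step
dec_ip = [ tf.zeros_like(enc_ip[0], dtype=tf.int32, name='GO') ] + labels[:-1]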
Example #7
device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu'

# Download cats vs. dogs dataset if needed
data_dir = Path(args.data_dir)
if not (data_dir / 'PetImages').exists():
    data_dir.mkdir(parents=True, exist_ok=True)
    download_and_unzip_from_url(args.dataset_url, data_dir)
dataset_dir = data_dir / 'PetImages'

# Create checkpoints save dir if needed
if not Path(args.checkpoints_dir).exists():
    Path(args.checkpoints_dir).mkdir(parents=True, exist_ok=True)

# Randomly split the cats vs. dogs dataset into train/valid/test portions
rng = np.random.RandomState(args.seed)
imagepaths, classes = split_dataset(dataset_dir, rng, args.p_train, args.p_val,
                                    args.p_test)

# Get pytorch datasets and dataloaders
transform = get_transforms()
dataset = get_torch_datasets(transform, imagepaths, data_dir)
batch_size = {
    'train': args.batch_size,
    'test': args.test_batch_size,
    'val': args.test_batch_size
}
loader = get_torch_loaders(dataset, batch_size)

# Some filenames for intermediate checkpointing
phase_1_savename = 'squeezenet_post_output_layer_training.pth'
phase_2_savename = 'squeezenet_post_finetuning.pth'
phase_1_savepath = Path(args.checkpoints_dir) / phase_1_savename
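# split_dataset here has a different signature from the seq2seq examples: it walks an image folder
# and returns per-split sample lists plus the class names. A minimal sketch of what it is assumed
# to do (the body below is an assumption, not the project's actual implementation):
from pathlib import Path

def split_dataset_sketch(dataset_dir, rng, p_train, p_val, p_test):
    # collect (image path, class name) pairs from dataset_dir/<class>/*.jpg
    classes = sorted(p.name for p in Path(dataset_dir).iterdir() if p.is_dir())
    samples = [(f, c) for c in classes for f in (Path(dataset_dir) / c).glob('*.jpg')]
    rng.shuffle(samples)  # rng is an np.random.RandomState, so the split is reproducible
    n_train, n_val = int(len(samples) * p_train), int(len(samples) * p_val)
    imagepaths = {
        'train': samples[:n_train],
        'val': samples[n_train:n_train + n_val],
        'test': samples[n_train + n_val:],  # p_test is implied by the remainder
    }
    return imagepaths, classes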
Example #8
"""
@author: gopal
"""
import tensorflow as tf
import numpy as np
import importlib
import seq2seq_wrapper
from dataset import data
import data_utils
importlib.reload(seq2seq_wrapper)
importlib.reload(data_utils)
# preprocessed data


# load data from pickle and npy files
metadata, idx_p, idx_x, idx_a = data.load_data(PATH='dataset/')
(trainP, trainX, trainA), (testP, testX, testA), (validP, validX, validA) = data_utils.split_dataset(idx_p, idx_x, idx_a)

def length(x):
    """Return the unpadded length of x: the index of the first zero (the PAD id),
    or len(x) if the sequence contains no padding."""
    for i, token in enumerate(x):
        if token == 0:
            return i
    return len(x)

filter_index_10 = [i for i in range(len(trainX)) if length(trainX[i])==10 and length(trainP[i])>10 and length(trainA[i])>10]

filter_index_15 = [i for i in range(len(trainX)) if length(trainX[i])==15 and length(trainP[i])>10 and length(trainA[i])>10]
filter_index_07 = [i for i in range(len(trainX)) if length(trainX[i])==7 and length(trainP[i])>10 and length(trainA[i])>10]

trainX_filter_10 = trainX[filter_index_10]
trainA_filter_10 = trainA[filter_index_10]
trainP_filter_10 = trainP[filter_index_10]
Example #9
def main():
    t = time.localtime()
    t_mark = time.strftime("%m-%d %H:%M", t)
    print('\n', t_mark, '\n')
    print('device:', device)

    # ========= Get Parameter =========#
    # train parameters
    parser = argparse.ArgumentParser(description='Vivi')

    parser.add_argument('--dataset',
                        type=str,
                        default='poem_1031k_theme_train')
    parser.add_argument('--epochs', type=int, default=15)
    parser.add_argument('--ckpt_path', type=str, default='')
    parser.add_argument('--val_rate', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=80)
    parser.add_argument('--teacher_forcing_ratio', type=float, default=0.8)
    parser.add_argument('--model_name', type=str, default='Seq2seq_12')
    parser.add_argument('--train_mode', type=str,
                        default='kw2poem')  # nL21L or kw2poem
    parser.add_argument('--note', type=str, default='')

    # note: argparse's type=bool treats any non-empty string (even "False") as True
    parser.add_argument('--train_soft', type=bool, default=True)  # Jul12
    parser.add_argument('--template', type=bool, default=False)  # Jul12
    parser.add_argument('--w1', type=float, default=3.)
    parser.add_argument('--w2', type=float, default=0.)

    args = parser.parse_args()

    dataset = args.dataset
    dataset_path = 'resource/dataset/' + dataset + '.txt'
    epochs = args.epochs
    ckpt_path = args.ckpt_path
    val_rate = args.val_rate
    batch_size = args.batch_size
    teacher_forcing_ratio = args.teacher_forcing_ratio
    model_name = args.model_name
    train_mode = args.train_mode

    train_param = vars(args)

    # load model parameters
    checkpoint = None
    if os.path.exists(ckpt_path):
        checkpoint = torch.load(ckpt_path)
        model_param = checkpoint['model_param']
        train_param = checkpoint['train_param']
        last_epoch = checkpoint['epoch']
    else:
        conf = configparser.ConfigParser()
        conf.read('config/config_' + model_name + '.ini')
        model_param_li = conf.items('model_param')
        model_param = {'model_name': model_name}
        for item in model_param_li:
            model_param[item[0]] = item[1]
        last_epoch = 0

    print('train param: ', train_param)
    print('model param: ', model_param)

    # ========= Preparing Data =========#

    # read data
    if model_name == 'BERT':
        pairs = data_utils.read_BERT_train_data(dataset)
    elif train_mode == 'nL21L':
        pairs = data_utils.read_nL21L_train_data(dataset_path)
    else:
        pairs = data_utils.read_train_data(dataset_path)

    # split dataset
    train_pairs, val_pairs = data_utils.split_dataset(pairs, val_rate)  # pairs

    data_path = 'models.' + model_name + '.PoetryData'
    PoetryData = importlib.import_module(data_path)
    train_Dataset = getattr(PoetryData, 'PoetryData')(
        train_pairs,
        src_max_len=int(model_param['input_max_len']),
        tgt_max_len=int(model_param['target_max_len']))
    val_Dataset = getattr(PoetryData, 'PoetryData')(
        val_pairs,
        src_max_len=int(model_param['input_max_len']),
        tgt_max_len=int(model_param['target_max_len']))  # look up the class via reflection and instantiate it

    # wrap the datasets into mini-batches
    train_data = Data.DataLoader(
        dataset=train_Dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,  # Jun16
        collate_fn=PoetryData.paired_collate_fn
        # num_workers=2  # read data with multiple worker threads, fetching several samples at a time
    )
    val_data = Data.DataLoader(
        dataset=val_Dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,  # Jun16
        collate_fn=PoetryData.paired_collate_fn
        # num_workers=2
    )

    # ========= Preparing Model =========#

    model_path = 'models.' + model_name + '.' + model_name
    Model = importlib.import_module(model_path)  # import the model's module
    model = getattr(Model, model_name)(model_param)  # look up the class via reflection and instantiate it
    print('model:', model)

    optim_path = 'models.' + model_name + '.Optim'
    Optim = importlib.import_module(optim_path)  # the Optim module (file)
    optimizer = Optim.get_optimizer(model, model_param)  # call the module's factory function
    print('optimizer:', optimizer)

    # load model from ckpt
    if os.path.exists(ckpt_path):
        # checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)  # duplicate load, already done above
        model.load_state_dict(checkpoint['model'])

    # write log head
    save_dir = 'ckpt/' + str(time.strftime(
        "%m%d%H%M%S", t)) + '_' + model_param['model_name'] + '/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    with open(save_dir + 'log', 'a') as f:
        f.write('\n\n' + str(t_mark) + '\nsave dir:' + save_dir + '\n' +
                str(train_param) + '\n' + str(model_param) + '\n')

    print('start training')
    train(train_data, val_data, model, optimizer, batch_size, epochs,
          last_epoch, val_rate, teacher_forcing_ratio, save_dir, t,
          model_param, train_param)
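# Hedged usage sketch: assuming this training script is saved as train.py (the file name is an
# assumption) and that a `if __name__ == '__main__': main()` guard follows the excerpt, a typical
# invocation of the argparse interface above would be:
#
#   python train.py --model_name Seq2seq_12 --dataset poem_1031k_theme_train \
#       --epochs 15 --batch_size 80 --val_rate 0.1 --teacher_forcing_ratio 0.8
#
# --ckpt_path defaults to '', so training starts from scratch; pointing it at an existing
# checkpoint resumes training and reuses the saved model_param / train_param from that file.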