def main(args, load_exclude_set, restoreCallback):
    logging.basicConfig(
        filename=0,
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
        datefmt='%H:%M:%S')
    if args.debug:
        debug()
    logging.info(json.dumps(args, indent=2))
    cuda_init(0, args.cuda)

    volatile = Storage()
    volatile.load_exclude_set = load_exclude_set
    volatile.restoreCallback = restoreCallback

    # dataloader and word vector settings
    data_class = LanguageGeneration
    data_arg = Storage()
    data_arg.file_id = args.dataid
    data_arg.tokenizer = args.tokenizer
    data_arg.max_sent_length = args.max_sent_length
    data_arg.convert_to_lower_letter = args.convert_to_lower_letter
    data_arg.min_frequent_vocab_times = args.min_frequent_vocab_times
    data_arg.min_rare_vocab_times = args.min_rare_vocab_times
    wordvec_class = GeneralWordVector

    def load_dataset(data_arg, wvpath, embedding_size):
        wv = wordvec_class(wvpath)
        dm = data_class(**data_arg)
        return dm, wv.load_matrix(embedding_size, dm.frequent_vocab_list)

    # build the data manager and word vector matrix, optionally cached on disk
    if args.cache:
        dm, volatile.wordvec = try_cache(
            load_dataset, (data_arg, args.wvpath, args.embedding_size),
            args.cache_dir,
            data_class.__name__ + "_" + wordvec_class.__name__)
    else:
        dm, volatile.wordvec = load_dataset(data_arg, args.wvpath, args.embedding_size)
    volatile.dm = dm

    param = Storage()
    param.args = args
    param.volatile = volatile

    model = TransformerLM(param)
    if args.mode == "train":
        model.train_process()
    elif args.mode == "test":
        test_res = model.test_process()
        with open("./result.json", "w") as resfile:
            json.dump(test_res, resfile)
    elif args.mode == "load":
        return model
    else:
        raise ValueError("Unknown mode")
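# `Storage` is imported from the local `utils` package and is not part of this
# listing. Judging only from how it is used above (constructed like a dict,
# then read and written through attributes), it is assumed to be a dict with
# attribute access. A minimal sketch under that assumption; the real
# utils.Storage may behave differently:
class _StorageSketch(dict):
    """Hypothetical stand-in for utils.Storage: a dict with attribute access."""
    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError as err:
            raise AttributeError(key) from err

    def __setattr__(self, key, value):
        self[key] = value

# Example: s = _StorageSketch(); s.file_id = "resources://MSCOCO"; s["file_id"]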
def main(args, load_exclude_set, restoreCallback):
    logging.basicConfig(
        filename=0,
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
        datefmt='%H:%M:%S')
    if args.debug:
        debug()
    logging.info(json.dumps(args, indent=2))
    cuda_init(0, args.cuda)

    volatile = Storage()
    volatile.load_exclude_set = load_exclude_set
    volatile.restoreCallback = restoreCallback

    data_class = LanguageGeneration
    data_arg = Storage()
    data_arg.file_id = args.dataid
    data_arg.max_sent_length = args.max_sent_length
    data_arg.convert_to_lower_letter = args.convert_to_lower_letter
    data_arg.pretrained = args.pretrained
    data_arg.tokenizer = args.pretrained_model

    def load_dataset(data_arg):
        # wrap the huggingface GPT2 tokenizer so the dataloader can use it
        tokenizer = PretrainedTokenizer(
            GPT2Tokenizer.from_pretrained(data_arg.tokenizer))
        new_arg = Storage(data_arg.copy())
        new_arg.tokenizer = tokenizer
        dm = data_class(**new_arg)
        return dm

    # build the data manager, optionally cached on disk
    if args.cache:
        dm = try_cache(load_dataset, (data_arg, ), args.cache_dir,
                       data_class.__name__)
    else:
        dm = load_dataset(data_arg)
    volatile.dm = dm

    param = Storage()
    param.args = args
    param.volatile = volatile

    model = GPT2LM(param)
    if args.mode == "train":
        model.train_process()
    elif args.mode == "test":
        test_res = model.test_process()
        with open("./result.json", "w") as resfile:
            json.dump(test_res, resfile)
    else:
        raise ValueError("Unknown mode")
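# `try_cache` also comes from `utils`. Based only on its call sites above
# (try_cache(func, args_tuple, cache_dir, name)), it is assumed to memoize the
# result of func(*args) on disk so repeated runs skip reloading the dataset.
# A rough sketch under that assumption; the real utils.try_cache may key or
# store the cache differently:
import os
import pickle

def _try_cache_sketch(func, args, cache_dir, name):
    """Hypothetical stand-in for utils.try_cache: return a pickled result if it
    exists, otherwise compute it and pickle it into cache_dir under `name`."""
    os.makedirs(cache_dir, exist_ok=True)
    path = os.path.join(cache_dir, name + ".pkl")
    if os.path.exists(path):
        with open(path, "rb") as f:
            return pickle.load(f)
    result = func(*args)
    with open(path, "wb") as f:
        pickle.dump(result, f)
    return result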
def run(*argv):
    import argparse
    import time

    from utils import Storage

    parser = argparse.ArgumentParser(
        description='A language model with GRU. Attention, beamsearch, '
                    'dropout and batchnorm are supported.')
    args = Storage()

    parser.add_argument('--name', type=str, default=None,
        help='The name of your model, used for tensorboard, etc. '
             'Default: runXXXXXX_XXXXXX (initialized by current time)')
    parser.add_argument('--restore', type=str, default=None,
        help='Checkpoints name to load. '
             '"NAME_last" for the last checkpoint of the model named NAME; "NAME_best" means the best checkpoint. '
             'You can also use "last" and "best", which by default refer to the last model you ran. '
             'Attention: "NAME_last" and "NAME_best" are not guaranteed to work when 2 models with the same name run at the same time. '
             '"last" and "best" are not guaranteed to work when 2 models run at the same time. '
             'Default: None (don\'t load anything)')
    parser.add_argument('--mode', type=str, default="train",
        help='"train" or "test". Default: train')
    parser.add_argument('--dh_size', type=int, default=200,
        help='Size of decoder GRU')
    parser.add_argument('--droprate', type=float, default=0,
        help='The probability of an element to be zeroed in dropout. 0 means dropout is not used.')
    parser.add_argument('--decode_mode', type=str,
        choices=['max', 'sample', 'gumbel', 'samplek', 'beam'], default='samplek',
        help='The decode strategy when freerun. Choices: max, sample, gumbel(=sample), '
             'samplek(sample from topk), beam(beamsearch). Default: samplek')
    parser.add_argument('--batchnorm', action='store_true',
        help='Use batchnorm')
    parser.add_argument('--top_k', type=int, default=10,
        help='The top_k when decode_mode == "beam" or "samplek"')
    parser.add_argument('--length_penalty', type=float, default=0.7,
        help='The beamsearch penalty for short sentences. The penalty gets larger as this value becomes smaller.')
    parser.add_argument('--temperature', type=float, default=1,
        help='Temperature. Default: 1')
    parser.add_argument('--dataid', type=str, default='resources://MSCOCO',
        help='Resources/path for data set. Default: resources://MSCOCO')
    parser.add_argument('--epoch', type=int, default=100,
        help="Epoch for training. Default: 100")
    parser.add_argument('--batch_per_epoch', type=int, default=500,
        help="Batches per epoch. Default: 500")
    parser.add_argument('--wvid', type=str, default="resources://Glove300d",
        help="Resources/path for pretrained wordvector. Default: resources://Glove300d")
    parser.add_argument('--out_dir', type=str, default="./output",
        help='Output directory for test output. Default: ./output')
    parser.add_argument('--log_dir', type=str, default="./tensorboard",
        help='Log directory for tensorboard. Default: ./tensorboard')
    parser.add_argument('--model_dir', type=str, default="./model",
        help='Checkpoints directory for model. Default: ./model')
    parser.add_argument('--cache_dir', type=str, default="./cache",
        help='Checkpoints directory for cache. Default: ./cache')
    parser.add_argument('--cpu', action="store_true", help='Use cpu.')
    parser.add_argument('--debug', action='store_true',
        help='Enter debug mode (using ptvsd).')
    parser.add_argument('--cache', action='store_true',
        help='Use cache for speeding up loading data and wordvec. '
             '(It may cause problems when you switch dataset.)')
    parser.add_argument('--seed', type=int, default=0,
        help='Specify random seed. Default: 0')
    parser.add_argument('--lr', type=float, default=1e-3,
        help='Learning rate. Default: 0.001')
    cargs = parser.parse_args(argv)

    # general setting
    args.name = cargs.name or time.strftime("run%Y%m%d_%H%M%S", time.localtime())
    args.restore = cargs.restore
    args.mode = cargs.mode
    args.out_dir = cargs.out_dir
    args.log_dir = cargs.log_dir
    args.model_dir = cargs.model_dir
    args.cache_dir = cargs.cache_dir
    args.debug = cargs.debug
    args.cache = cargs.cache
    args.cuda = not cargs.cpu

    ## dataset settings
    args.dataid = cargs.dataid
    args.tokenizer = "space"
    args.max_sent_length = 50
    args.convert_to_lower_letter = False
    args.min_frequent_vocab_times = 10
    args.min_rare_vocab_times = 0
    args.wvid = cargs.wvid

    ## training settings
    args.epochs = cargs.epoch
    args.lr = cargs.lr
    args.batch_size = 64
    args.batch_num_per_gradient = 4
    args.grad_clip = 5
    args.show_sample = [0]  # show which batch when evaluating at tensorboard
    args.checkpoint_steps = 20
    args.checkpoint_max_to_keep = 5

    ## arguments for restoring checkpoints
    args.restore_optimizer = True
    load_exclude_set = []
    restoreCallback = None

    ## architecture settings
    args.batch_per_epoch = cargs.batch_per_epoch
    args.embedding_size = 300
    args.dh_size = cargs.dh_size
    args.droprate = cargs.droprate
    args.batchnorm = cargs.batchnorm

    ## decoding settings
    args.decode_mode = cargs.decode_mode
    args.top_k = cargs.top_k
    args.length_penalty = cargs.length_penalty
    args.temperature = cargs.temperature

    ## random seed
    args.seed = cargs.seed
    import random
    random.seed(cargs.seed)
    import torch
    torch.manual_seed(cargs.seed)
    import numpy as np
    np.random.seed(cargs.seed)

    from main import main
    main(args, load_exclude_set, restoreCallback)
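# Example usage: run() unpacks *argv into parse_args, so it can be called with
# command-line style tokens. The __main__ guard below is an assumption (such a
# guard or a wrapper script forwarding sys.argv is not shown in this listing),
# as is the flag combination; all flags used here are defined in the parser above.
if __name__ == "__main__":
    import sys
    # e.g. python run.py --name lm_demo --mode train --dataid resources://MSCOCO
    # or   python run.py --name lm_demo --mode test --restore lm_demo_best
    run(*sys.argv[1:])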