def main():
    parser = get_parser()
    opts = parser.parse_args()
    opts.unsuper_num = 968
    os.makedirs("experiments", exist_ok=True)

    if opts.phase == 'train':
        # Create directories
        log_dir = os.path.join("experiments", opts.experiment_name)
        # exist_ok=False to prevent accidentally rerunning training on top of an existing run
        os.makedirs(log_dir, exist_ok=False)
        os.makedirs(os.path.join(log_dir, "samples"), exist_ok=True)
        os.makedirs(os.path.join(log_dir, "checkpoint"), exist_ok=True)
        os.makedirs(os.path.join(log_dir, "results"), exist_ok=True)
        os.makedirs(os.path.join(log_dir, "interps"), exist_ok=True)
        os.makedirs(os.path.join(log_dir, "logs"), exist_ok=True)
        print(f"Training on experiment {opts.experiment_name}...")
        # Dump options
        with open(os.path.join(log_dir, "opts.txt"), "w") as f:
            for key, value in vars(opts).items():
                f.write(str(key) + ": " + str(value) + "\n")
        train(opts)
    elif opts.phase == 'test':
        print(f"Testing on experiment {opts.experiment_name}...")
        test(opts)
    elif opts.phase == 'test_interp':
        print(f"Testing interpolation on experiment {opts.experiment_name}...")
        interp(opts)
    else:
        raise NotImplementedError
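# The resulting opts.txt then holds one "key: value" pair per line, e.g.
# (values illustrative, not from a real run):
#
#   experiment_name: baseline
#   phase: train
#   unsuper_num: 968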
def get_wrapper_parser():
    """Defines the command line interface of the wrapper"""
    import argparse
    parser = get_parser()

    # Allow alternative images (semi-developer)
    parser.add_argument('--image', metavar='IMG', type=str,
                        default=os.path.join(home, "xcpEngine.simg"),
                        help='image name')

    # Options for mapping files and directories into container
    # Update `expected_overlap` variable in merge_help() when adding to this
    g_wrap = parser.add_argument_group(
        'Wrapper options',
        'Standard options that require mapping files into the container')

    # Developer patch/shell options
    g_dev = parser.add_argument_group(
        'Developer options',
        'Tools for testing and debugging xcpEngine')
    g_dev.add_argument('-f', '--patch-xcpEngine', metavar='PATH',
                       type=os.path.abspath,
                       help='working xcpEngine repository')
    g_dev.add_argument('--shell', action='store_true',
                       help='open shell in image instead of running xcpEngine')
    return parser
def main():
    parser = argparse.ArgumentParser(
        description="Generate documentation using the different model options")
    parser.add_argument('-v', "--verbose", action='store_true',
                        help="Set logging level to INFO")
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.INFO)

    options_md = """
# JoliGAN Options

Here are all the available options to call with `train.py`
"""
    options_md += document_parser(opt.get_parser())

    model_parsers = opt.get_models_parsers()
    for name in sorted(model_parsers.keys()):
        parser = model_parsers[name]
        options_md += "\n\n## %s\n\n%s" % (name, document_parser(parser))

    print(options_md)
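# A minimal sketch of what a document_parser() helper could look like
# (hypothetical; the real implementation lives elsewhere in the repo).
# It walks the parser's registered actions -- note that _actions is a
# private argparse attribute -- and renders each option as a markdown bullet.
def document_parser_sketch(parser):
    lines = []
    for action in parser._actions:
        flags = ', '.join(action.option_strings) or action.dest
        lines.append('* `%s`: %s' % (flags, action.help or ''))
    return '\n'.join(lines)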
def get_dup_parser():
    parser = get_parser()
    parser.add_argument('--source', '-s', required=True,
                        help='source path to be analyzed (or source dup path)')
    parser.add_argument('--target', '-t', required=True,
                        help='target path to be analyzed (or target dup path)')
    parser.add_argument('--corpus', '-c', action='store_true', default=False,
                        help='whether the source and target are corpus paths')
    parser.add_argument('--output', '-o', default='linenos.json',
                        help='duplicated linenos output path')
    parser.add_argument('--remove', '-r', action='store_true', default=False,
                        help='whether to remove the duplicated lines')
    return parser
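# Hedged usage sketch for the parser above (the script name and data paths
# are hypothetical; base flags supplied by get_parser() are not shown):
#
#   python dedup.py --source data/train.src --target data/train.tgt \
#       --corpus --output linenos.json --remove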
def get_flen_parser():
    parser = get_parser()
    parser.add_argument('--file-path', '-f', required=True,
                        help='file path')
    parser.add_argument('--split', '-s', action='store_true', default=False)
    parser.add_argument('--max-length', '-l', type=int, default=100,
                        help='max length of sentence')
    return parser
def parse_and_set_global_flags():
    global FLAGS
    parser = options.get_parser()
    parser.parse_args(namespace=FLAGS)

    FLAGS.logdir = util.ensure_absolute_path(
        FLAGS.logdir, root=paths.DATA_ROOT + '/experiments')
    os.makedirs(FLAGS.logdir, exist_ok=True)

    if FLAGS.batch_size_test is None:
        FLAGS.batch_size_test = FLAGS.batch_size

    if FLAGS.checkpoint_dir is None:
        FLAGS.checkpoint_dir = FLAGS.logdir
    os.makedirs(FLAGS.checkpoint_dir, exist_ok=True)
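# Self-contained sketch of the `namespace=` pattern used above: parse_args
# can populate an existing object instead of building a fresh Namespace,
# which is how parse_and_set_global_flags fills the global FLAGS in place.
# (_Flags and the --logdir flag here are illustrative only.)
import argparse

class _Flags:
    pass

_flags = _Flags()
_p = argparse.ArgumentParser()
_p.add_argument('--logdir', default='runs')
_p.parse_args(['--logdir', '/tmp/exp1'], namespace=_flags)
assert _flags.logdir == '/tmp/exp1'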
def visualize_outputs():
    pl.seed_everything(0)
    parser = options.get_parser()
    parser.add_argument('checkpoint_path', type=str)
    parser.add_argument('--output_dir', default='./logs', type=str)
    args = parser.parse_args()

    dataloader = MNISTDataModule(hparams=args)
    dataloader.prepare_data()
    val_dataloader = dataloader.val_dataloader()
    x = get_ordered_batch(val_dataloader)

    model = BetaVAE.load_from_checkpoint(args.checkpoint_path)
    decoder = model.model.decoder
    visualize(model, decoder, x, args.output_dir, latent_dims=args.z_dim)
def get_stat_parser():
    parser = get_parser()
    parser.add_argument('--output-duplicated', '-o', action='store_true',
                        help='output the intermediate result, i.e. the .dup file')
    parser.add_argument('--file-paths', '-fps', metavar='FPS', nargs='+',
                        required=True,
                        help='file paths to be counted')
    return parser
def train():
    pl.seed_everything(0)
    parser = options.get_parser()
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    dataloader = MultiDSpritesDataModule(hparams=args)
    model = MONet(hparams=args)
    model.init_parameters()

    # NOTE: logger_callback is instantiated here but not passed to the trainer below
    logger_callback = LoggerCallback()
    checkpoint_callback = ModelCheckpoint(save_top_k=1,
                                          monitor='train_loss',
                                          mode='min',
                                          prefix=model.__class__.__name__)
    early_stop_callback = EarlyStopping(monitor='train_loss',
                                        min_delta=0.01,
                                        patience=10,
                                        verbose=False,
                                        mode='min')
    trainer = pl.Trainer.from_argparse_args(
        args,
        checkpoint_callback=checkpoint_callback,
        callbacks=[early_stop_callback],
        deterministic=True,
        log_every_n_steps=50,
        log_gpu_memory='min_max',
        # num_sanity_val_steps=0
    )

    start = time.time()
    trainer.fit(model, datamodule=dataloader)
    if not args.fast_dev_run:
        time_elapsed = time.time() - start
        h = time_elapsed // 3600
        m = (time_elapsed // 60) % 60
        s = time_elapsed % 60
        print(f'Training complete in {h:.0f}h {m:.0f}m {s:.0f}s')
        print(f'Lowest train error: {checkpoint_callback.best_model_score:.4f}')
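# Worked example of the elapsed-time arithmetic above: for
# time_elapsed = 3750.0 seconds, 3750 // 3600 == 1 (hours),
# (3750 // 60) % 60 == 2 (minutes), and 3750 % 60 == 30 (seconds),
# so the function prints "Training complete in 1h 2m 30s".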
def get_wrapper_parser():
    """Defines the command line interface of the wrapper"""
    import argparse
    parser = get_parser()

    # Allow alternative images (semi-developer)
    parser.add_argument('--image', metavar='IMG', type=str,
                        default='pennbbl/xcpengine:{}'.format(__version__),
                        help='image name')

    # Options for mapping files and directories into container
    # Update `expected_overlap` variable in merge_help() when adding to this
    g_wrap = parser.add_argument_group(
        'Wrapper options',
        'Standard options that require mapping files into the container')

    # Developer patch/shell options
    g_dev = parser.add_argument_group(
        'Developer options',
        'Tools for testing and debugging xcpEngine')
    g_dev.add_argument('-f', '--patch-xcpEngine', metavar='PATH',
                       type=os.path.abspath,
                       help='working xcpEngine repository')
    g_dev.add_argument('--shell', action='store_true',
                       help='open shell in image instead of running xcpEngine')
    g_dev.add_argument('-e', '--env', action='append', nargs=2,
                       metavar=('ENV_VAR', 'value'),
                       help='Set custom environment variable within container')
    g_dev.add_argument('-u', '--user', action='store',
                       help='Run container as a given user/uid')
    return parser
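# How the repeated `-e ENV_VAR value` pairs above parse (standard argparse
# behavior, demonstrated with a throwaway parser; the variable names are
# illustrative):
import argparse

_p = argparse.ArgumentParser()
_p.add_argument('-e', '--env', action='append', nargs=2)
assert _p.parse_args(['-e', 'OMP_NUM_THREADS', '1', '-e', 'TMPDIR', '/tmp']).env == \
    [['OMP_NUM_THREADS', '1'], ['TMPDIR', '/tmp']]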
def train():
    pl.seed_everything(0)
    parser = options.get_parser()
    parser.set_defaults(input_channels=1, comp_vae_out_channels=1,
                        input_height=28, input_width=28)
    parser.add_argument('--image_type', default=0, type=int)
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    dataloader = MNISTDataModule(hparams=args)
    model = BetaVAE(hparams=args)
    model.init_parameters()

    logger_callback = LoggerCallback()
    checkpoint_callback = ModelCheckpoint(
        save_top_k=1,
        monitor='val_loss',
        mode='min',
        prefix=model.__class__.__name__ + '_'
    )
    trainer = pl.Trainer.from_argparse_args(
        args,
        checkpoint_callback=checkpoint_callback,
        callbacks=[logger_callback],
        num_sanity_val_steps=0
    )

    start = time.time()
    trainer.fit(model, datamodule=dataloader)
    if not args.fast_dev_run:
        time_elapsed = time.time() - start
        h = time_elapsed // 3600
        m = (time_elapsed // 60) % 60
        s = time_elapsed % 60
        print(f'Training complete in {h:.0f}h {m:.0f}m {s:.0f}s')
        print(f'Best val. score: {checkpoint_callback.best_model_score:.4f}')
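# Quick illustration of the parser.set_defaults() override used above
# (standard library only; flag and values illustrative): a later
# set_defaults call wins over the default given at add_argument time,
# which is how this MNIST entry point specializes the shared options
# parser without redefining its flags.
import argparse

_p = argparse.ArgumentParser()
_p.add_argument('--input_height', type=int, default=64)
_p.set_defaults(input_height=28)
assert _p.parse_args([]).input_height == 28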
def get_filter_parser():
    parser = get_parser()
    parser.add_argument('--source-paths', '-sps', nargs='+', required=True,
                        help='source file paths')
    parser.add_argument('--target-paths', '-tps', nargs='+', required=True,
                        help='target file paths')
    parser.add_argument('--source-constraint', '-sc', metavar='EXPR',
                        required=True,
                        help='source constraint, tuple like (1,50)')
    parser.add_argument(
        '--target-constraint', '-tc', metavar='EXPR', default=None,
        help='target constraint, either a tuple like the source constraint '
             'or a ratio like (0.5,1.5), meaning the target length must be '
             'between 0.5 and 1.5 times the source length')
    parser.add_argument(
        '--source-logic-and', action='store_true', default=False,
        help='if set, the criterion is satisfied if and only if all '
             'source paths meet the requirement; otherwise any one of them')
    parser.add_argument(
        '--target-logic-and', action='store_true', default=False,
        help='if set, the criterion is satisfied if and only if all '
             'target paths meet the requirement; otherwise any one of them')
    return parser
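# One way the "(1,50)"-style constraint strings above might be interpreted
# (hypothetical helper; the real parsing code is not part of this snippet):
import ast

def _parse_constraint(expr):
    # literal_eval safely parses the tuple literal without running code
    lo, hi = ast.literal_eval(expr)
    return float(lo), float(hi)

assert _parse_constraint('(1,50)') == (1.0, 50.0)
assert _parse_constraint('(0.5,1.5)') == (0.5, 1.5)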
def get_csgm_parser():
    parser = get_parser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--to-sgm', action='store_true',
                       help='plain2sgm: input plain text, output sgm format file')
    group.add_argument('--from-sgm', action='store_true',
                       help='sgm2plain: input sgm format file, output plain text')
    parser.add_argument('--file-path', '-fp', metavar='FP', required=True,
                        help='file path to be converted')
    parser.add_argument('--save-path', '-o', metavar='FP', default=None,
                        help='file path to save at')
    return parser
def main():
    parser = get_parser()
    parser.add_argument("--file", dest="filename")
    args = parser.parse_args()

    arg1 = InMemStream("echo")
    arg2 = InMemStream()
    arg3 = InMemStream()

    ioloop = tornado.ioloop.IOLoop.current()
    if args.filename == "stdin":
        arg3 = PipeStream(sys.stdin.fileno())
        send_stream(arg1, arg2, arg3, args.host)
        ioloop.start()
    elif args.filename:
        f = os.open(args.filename, os.O_RDONLY)
        arg3 = PipeStream(f)
        ioloop.run_sync(lambda: send_stream(arg1, arg2, arg3, args.host))
    else:
        raise NotImplementedError()
def get_mapping_parser():
    parser = get_parser()
    add_bilingual_args_(parser)
    add_output_args_(parser)
    return parser
import os
import pprint

import numpy as np
import pickle as pk
import tensorflow as tf

from nlp.util import config, ets_reader
from nlp.util import utils as U
from nlp.util.w2vEmbReader import W2VEmbReader

import options


def make_abs(path):
    return os.path.abspath(path)


''' get config '''
parser = options.get_parser()
config_file = 'config/mode.conf'
argv = []  # override config file here
FLAGS = config.get_config(parser=parser, config_file=config_file, argv=argv)
FLAGS.chkpt_dir = make_abs(FLAGS.chkpt_dir)
FLAGS.data_dir = os.path.join(FLAGS.data_dir, FLAGS.item_id)
pprint.pprint(FLAGS)

''' setup checkpoint directory '''
if not os.path.exists(FLAGS.chkpt_dir):
    U.mkdirs(FLAGS.chkpt_dir)
    print('Created checkpoint directory', FLAGS.chkpt_dir)
config.save_local_config(FLAGS)

# ''' setup logger (???) '''
# U.set_logger(FLAGS.chkpt_dir)
def main():
    parser = options.get_parser()
    args, _ = parser.parse_known_args()

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s')
    logging.info(f"host: {socket.gethostname()}")

    if not os.path.isdir(args.output):
        logging.info("Make dir %s" % (args.output))
        os.makedirs(args.output)

    torch.manual_seed(0)
    np.random.seed(0)

    best_acc_path = os.path.join(args.output, "best-ter.pth")  # "./output/conv_lstm.pth"
    log_path = os.path.join(args.output, "log")
    ckpt_path = os.path.join(args.output, "ckpt.pth")
    with open(os.path.join(args.output, 'train_conf.yaml'), 'w') as fo:
        yaml.dump({**vars(args), **{'best_dev_path': best_acc_path}}, fo)

    loader_seeds = list(range(100))
    logging.info(
        f'max input length: {args.max_ilen[-1]}, max output length: {args.max_olen[-1]}')
    batch_milestones = list(
        zip(args.max_ilen, args.max_olen,
            [args.batch_reduce_ratio**i for i in range(1, len(args.max_ilen) + 1)]))

    device, cpu = torch.device('cuda'), torch.device('cpu')
    word2id, id2word = utils.get_word_map(args.wordlist)
    assert len(word2id) == len(id2word), "duplicate words"

    train_data = dataset.Speech(args.train_ark, args.train_scp,
                                args.train_text, args.train_len,
                                args.data_sample_rate, args.lstm_sample_rate,
                                add_delta=True,
                                transform=dataset.ToTensor(device),
                                max_segment_size=args.segment_size,
                                add_spec_aug=(args.add_spec_aug == 1),
                                min_flen_aug=args.min_flen_aug,
                                spec_F=args.spec_F, spec_T=args.spec_T)
    dev_data = dataset.Speech(args.dev_ark, args.dev_scp, args.dev_text,
                              args.dev_len, args.data_sample_rate,
                              args.lstm_sample_rate, add_delta=True,
                              transform=dataset.ToTensor(device),
                              max_segment_size=args.segment_size)

    if args.batch_sampler == 'utt':
        from sampler import BucketBatchSampler
        train_loader = tud.DataLoader(train_data,
                                      batch_sampler=BucketBatchSampler(
                                          shuffle=args.shuffle,
                                          batch_size=args.batch_size,
                                          files=train_data.ark_fns,
                                          lengths=train_data.lengths,
                                          milestones=batch_milestones,
                                          cycle=True,
                                          seeds=list(loader_seeds)),
                                      collate_fn=dataset.collate_fn)
        dev_loader = tud.DataLoader(dev_data,
                                    batch_sampler=BucketBatchSampler(
                                        shuffle=False,
                                        batch_size=args.dev_batch_size,
                                        files=dev_data.ark_fns,
                                        lengths=dev_data.lengths,
                                        milestones=batch_milestones,
                                        cycle=False),
                                    collate_fn=dataset.collate_fn)
    else:
        raise NotImplementedError

    logging.info('Number of GPUs %d' % (torch.cuda.device_count()))
    encoder = SegModel(input_size=train_data.feat_dim,
                       hidden_size=args.n_hidden,
                       n_layers=args.n_layers,
                       bid=True,
                       num_label=len(word2id),
                       segment_size=args.segment_size,
                       segment_ratio=args.segment_ratio,
                       sample_rate=args.lstm_sample_rate,
                       dropout=args.dropout,
                       pooling_type=args.pooling,
                       lambda_emb=args.lambda_emb,
                       penalize_emb=args.penalize_emb,
                       word_bias=(args.word_bias == 1),
                       num_word_samples=args.seg_word_samples).to(device)

    if args.optim == 'Adam':
        optimizer = optim.Adam(encoder.parameters(), lr=args.lr,
                               weight_decay=args.weight_decay)
    elif args.optim == 'SGD':
        optimizer = optim.SGD(encoder.parameters(), lr=args.lr,
                              momentum=args.momentum,
                              nesterov=(args.nesterov == 1),
                              weight_decay=args.weight_decay)
    else:
        raise NotImplementedError('Option for optimizer: Adam|SGD')

    if args.scheduler == 'StepLR':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=args.step_size,
                                              gamma=args.scheduler_gamma)
    elif args.scheduler == 'ReduceLROnPlateau':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', factor=args.scheduler_gamma,
            patience=1, min_lr=1.0e-8, verbose=True)  # threshold=0.001,
    else:
        raise NotImplementedError('Option for scheduler: StepLR|ReduceLROnPlateau')

    logging.info(f"{encoder}, {optimizer}, {scheduler}")

    if args.amp == 1:
        logging.info(f"AMP training, opt level: O1")
        encoder, optimizer = amp.initialize(encoder, optimizer, opt_level="O1")

    ckpt = {
        'epoch': 0,
        'step': 0,
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
        'best_dev_acc': -float('inf'),
        'best_dev_loss': float('inf'),
        'sampler': train_loader.batch_sampler.state_dict(),
        'encoder': encoder.state_dict()
    }
    if args.amp == 1:
        ckpt['amp'] = amp.state_dict()

    if args.load_awe is not None:
        logging.info('Load awe model %s' % (args.load_awe))
        awe_std = torch.load(args.load_awe)
        encoder.lstm.load_state_dict(awe_std['lstm'])
        encoder.pooling.load_state_dict(awe_std['pooling'])
        encoder.fc1[0].load_state_dict(awe_std['fc1'])

    if args.load_emb is not None:
        logging.info('Unit normalizing word embedding matrix')
        logging.info('Load pre-trained embedding %s' % (args.load_emb))
        emb_agwe = torch.FloatTensor(np.load(args.load_emb)).to(device)
        emb_agwe = emb_agwe / (emb_agwe * emb_agwe).sum(dim=-1, keepdim=True).pow(0.5)
        encoder.fc1[-1].weight.data.copy_(emb_agwe)
    else:
        emb_agwe = None

    if os.path.isfile(ckpt_path):
        logging.info('Load checkpoint %s' % (ckpt_path))
        try:
            ckpt = torch.load(ckpt_path)
            for key, value in ckpt['encoder'].items():
                if key.split('.')[0] != 'seg_loss':
                    encoder.state_dict()[key].copy_(value)
            logging.info(f"Loaded encoder parameters")
            optimizer.load_state_dict(ckpt['optimizer'])
            scheduler.load_state_dict(ckpt['scheduler'])
            # load data loader
            train_loader.batch_sampler.load_state_dict(ckpt['sampler'])
            if args.amp == 1:
                amp.load_state_dict(ckpt['amp'])
        except Exception as err:
            logging.info(f"Error loading {ckpt_path}: {err}")

    train_opt = argparse.Namespace(max_grad=args.max_grad,
                                   emb_agwe=emb_agwe,
                                   accum_batch_size=args.accum_batch_size,
                                   log_interval=args.log_interval,
                                   log_path=log_path,
                                   ckpt_path=ckpt_path,
                                   amp=args.amp)
    dev_opt = argparse.Namespace(emb_agwe=emb_agwe)

    for epoch in range(args.epoch):
        if epoch < ckpt['epoch']:
            continue
        logging.info('Epoch %d, lr: %.5f' %
                     (epoch, optimizer.param_groups[0]['lr']))
        train(encoder, optimizer, train_loader, ckpt, train_opt)
        dl, dl_seg, dl_emb, dacc = evaluate(encoder, dev_loader, dev_opt)

        if args.scheduler == 'StepLR':
            scheduler.step()
        elif args.scheduler == 'ReduceLROnPlateau':
            scheduler.step(dacc)
        else:
            raise NotImplementedError('Option for scheduler: StepLR|ReduceLROnPlateau')

        pcont = 'Epoch %d, dev loss: %.3f, seg loss: %.3f, emb loss: %.3f, acc (OOV): %.3f' % (
            epoch, dl, dl_seg, dl_emb, dacc)
        logging.info(pcont)
        open(log_path, 'a+').write(pcont + "\n")

        # if dacc > ckpt['best_dev_acc']:
        #     ckpt['best_dev_acc'] = dacc
        #     torch.save(encoder.state_dict(), open(best_acc_path, 'wb'))

        ckpt['epoch'] = ckpt['epoch'] + 1
        ckpt['encoder'] = encoder.state_dict()
        ckpt['optimizer'] = optimizer.state_dict()
        ckpt['scheduler'] = scheduler.state_dict()
        if args.amp == 1:
            ckpt['amp'] = amp.state_dict()
        if dacc > ckpt['best_dev_acc']:
            ckpt['best_dev_acc'] = dacc
            torch.save(encoder.state_dict(), open(best_acc_path, 'wb'))
            torch.save(ckpt, open(ckpt_path + '.best', 'wb'))
        torch.save(ckpt, open(ckpt_path, 'wb'))
    return
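# Minimal sketch of the save/resume pattern used above, with a toy module
# (the module, optimizer, and /tmp path here are illustrative, not from
# the project): the checkpoint dict bundles epoch counter and state_dicts,
# and resuming simply loads each component back.
import torch
import torch.nn as nn

_net = nn.Linear(4, 2)
_opt = torch.optim.SGD(_net.parameters(), lr=0.1)
_ckpt = {'epoch': 3, 'encoder': _net.state_dict(), 'optimizer': _opt.state_dict()}
torch.save(_ckpt, '/tmp/ckpt_demo.pth')

_resumed = torch.load('/tmp/ckpt_demo.pth')
_net.load_state_dict(_resumed['encoder'])
_opt.load_state_dict(_resumed['optimizer'])
assert _resumed['epoch'] == 3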
        rt_output = decode(rt_feat, audio_input)
    else:
        rt_feat, rt_s_feat = rt_encode(rt_input, audio_prev)
        rt_output = decode(rt_feat, audio_input, s_rt=rt_s_feat)

    d_fake_loss = dis(rt_output, torch.zeros(rt_gt.size(0)).cuda())
    if train:
        optimizer2.zero_grad()
        (d_real_loss + d_fake_loss).backward()
        optimizer2.step()
    return d_real_loss.item() + d_fake_loss.item()


if __name__ == '__main__':
    opt = get_parser()
    utils.init_log_dir(opt)

    train_set, val_set = VoxLmark2rgbDataset(opt, 'train'), VoxLmark2rgbDataset(opt, 'val')
    train_loader = DataLoader(train_set,
                              batch_size=opt.batch_size,
                              shuffle=False,
                              num_workers=opt.num_workers,
                              drop_last=True,
                              collate_fn=PadSequence())
    val_loader = DataLoader(val_set,
                            batch_size=opt.batch_size,
                            shuffle=False,
def main():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)

    args = parser.parse_args()
    print(args)

    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    torch.manual_seed(5)
    if args.cuda:
        torch.backends.cudnn.benchmark = True

    # increase recursion depth
    sys.setrecursionlimit(10000)

    # checkpoint
    checkpoint_dir = os.path.dirname(args.checkpoint)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir, ddi=False)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, \
        'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]

    start_epoch = 0
    caseless = args.caseless
    batch_size = args.batch_size
    num_epoch = args.num_epoch

    # build vocab
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    target_map = ddi2013.target_map

    # get class weights
    _, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    class_weights = torch.Tensor(utils.get_class_weights(train_targets)) if args.class_weight else None

    train_loader, val_loader, test_loader = utils.load_datasets(
        args.processed_dir, args.train_size, args, feature_map, dataloader=True)

    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = RelationTreeModel(vocab_size, tagset_size, args)

    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)

    # load states
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
        start_epoch = checkpoint_file['epoch'] + 1
        model.load_state_dict(checkpoint_file['state_dict'])
        # optimizer.load_state_dict(checkpoint_file['optimizer'])
    else:
        print('no checkpoint file found: {}, train from scratch...'.format(args.load_checkpoint))
        if not args.rand_embedding:
            pretrained_word_embedding, in_doc_word_indices = utils.load_word_embedding(
                args.emb_file, feature_map, args.embedding_dim)
            print(pretrained_word_embedding.size())
            print(vocab_size)
            model.load_pretrained_embedding(pretrained_word_embedding)
            if args.disable_fine_tune:
                model.update_part_embedding(in_doc_word_indices)  # update only non-pretrained words
        model.rand_init(init_embedding=args.rand_embedding)

    # trainer
    trainer = TreeTrainer(args, model, criterion)

    best_f1 = float('-inf')
    if os.path.isfile(args.load_checkpoint):
        dev_prec, dev_rec, dev_f1, _ = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        best_f1 = dev_f1
        print('checkpoint dev_prec: {:.4f}, dev_rec: {:.4f}, dev_f1: {:.4f}, '
              'test_prec: {:.4f}, test_rec: {:.4f}, test_f1: {:.4f}'.format(
                  dev_prec, dev_rec, dev_f1, test_prec, test_rec, test_f1))

    track_list = []
    patience_count = 0
    start_time = time.time()
    q = mp.Queue()

    # set start methods
    try:
        mp.set_start_method('spawn')
    except RuntimeError:
        pass

    for epoch in range(start_epoch, num_epoch):
        epoch_loss = train(train_loader, trainer, epoch)

        # processes = []
        # for rank in range(args.num_processes):
        #     p = mp.Process(target=train, args=(train_loader, trainer, epoch, q))
        #     p.start()
        #     processes.append(p)
        # for p in processes:
        #     p.join()
        #
        # epoch_loss = q.get()

        # update lr
        trainer.lr_step(epoch_loss)

        dev_prec, dev_rec, dev_f1, dev_loss = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        if dev_f1 >= best_f1:
            patience_count = 0
            best_f1 = dev_f1
            track_list.append({
                'epoch': epoch, 'loss': epoch_loss,
                'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1,
                'dev_loss': dev_loss, 'test_prec': test_prec,
                'test_rec': test_rec, 'test_f1': test_f1
            })
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}\tsaving...'.format(
                epoch, epoch_loss, dev_f1, dev_loss, test_f1))
            try:
                utils.save_checkpoint({
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'optimizer': trainer.optimizer.state_dict(),
                    'f_map': feature_map,
                    't_map': target_map,
                }, {'track_list': track_list,
                    'args': vars(args)
                }, args.checkpoint)
            except Exception as inst:
                print(inst)
        else:
            patience_count += 1
            track_list.append({
                'epoch': epoch, 'loss': epoch_loss,
                'dev_prec': dev_prec, 'dev_rec': dev_rec,
                'dev_f1': dev_f1, 'dev_loss': dev_loss
            })
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}'.format(
                epoch, epoch_loss, dev_f1, dev_loss, test_f1))

        print('epoch: {} in {} take: {} s'.format(epoch, args.num_epoch, time.time() - start_time))
        if patience_count >= args.patience:
            break
def main():
    parser = options.get_parser('Generator')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_generation_args(parser)

    args = parser.parse_args()

    model_path = args.load_checkpoint + '.model'
    args_path = args.load_checkpoint + '.json'
    with open(args_path, 'r') as f:
        _args = json.load(f)['args']
    [setattr(args, k, v) for k, v in _args.items()]

    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    print(args)

    if args.cuda:
        torch.backends.cudnn.benchmark = True

    # increase recursion depth
    sys.setrecursionlimit(10000)

    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(
        args.processed_dir, ddi=False)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, \
        'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]

    caseless = args.caseless
    batch_size = args.batch_size

    # build vocab
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    target_map = ddi2013.target_map

    # get class weights
    _, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    class_weights = torch.Tensor(
        utils.get_class_weights(train_targets)) if args.class_weight else None

    # load datasets
    _, _, test_loader = utils.load_datasets(
        args.processed_dir, args.train_size, args, feature_map, dataloader=True)

    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = RelationTreeModel(vocab_size, tagset_size, args)

    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)

    # load states
    assert os.path.isfile(model_path), "Checkpoint not found!"
    print('Loading checkpoint file from {}...'.format(model_path))
    checkpoint_file = torch.load(model_path)
    model.load_state_dict(checkpoint_file['state_dict'])

    # trainer
    trainer = TreeTrainer(args, model, criterion)

    # predict
    y_true, y_pred, treelists, f1_by_len = predict(trainer, test_loader, target_map, cuda=args.cuda)

    # assign words to roots
    for tup, treelist in zip(test_raw_corpus, treelists):
        for t in treelist:
            t.idx = tup.sent[t.idx] if t.idx < len(tup.sent) else None

    # prediction
    print('Predicting...')
    # write result: sent_id|e1|e2|ddi|type
    with open(args.predict_file, 'w') as f:
        for tup, pred in zip(test_raw_corpus, y_pred):
            ddi = 0 if pred == 'null' else 1
            f.write('|'.join([tup.sent_id, tup.e1, tup.e2, str(ddi), pred]))
            f.write('\n')

    def print_info(f, tup, target, pred, root):
        f.write('{}\n'.format(' '.join(tup.sent)))
        f.write('{}\n'.format(' | '.join([tup.sent_id, tup.e1, tup.e2, target, pred])))
        f.write('{}\n\n'.format(root))

    # error analysis
    print('Analyzing...')
    with open(args.error_file, 'w') as f:
        f.write(' | '.join(['sent_id', 'e1', 'e2', 'target', 'pred']))
        f.write('\n')
        for tup, target, pred, treelist in zip(test_raw_corpus, y_true, y_pred, treelists):
            if target != pred:
                print_info(f, tup, target, pred, treelist[-1])

    # attention
    print('Writing attention scores...')
    with open(args.correct_file, 'w') as f:
        f.write(' | '.join(['target', 'sent', 'att_weight']))
        f.write('\n')
        for tup, target, pred, treelist in zip(test_raw_corpus, y_true, y_pred, treelists):
            if target == pred and target != 'null':
                print_info(f, tup, target, pred, treelist[-1])
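# A line of the resulting prediction file then looks like (values purely
# illustrative, following the sent_id|e1|e2|ddi|type layout above):
#
#   DDI-DrugBank.d658.s0|aspirin|warfarin|1|effect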
def main():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)

    args = parser.parse_args()
    print(args)

    args.cuda = not args.disable_cuda and torch.cuda.is_available()

    # checkpoint
    checkpoint_dir = os.path.dirname(args.checkpoint)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, \
        'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]
    test_corpus = [(line.sent, line.type, line.p1, line.p2) for line in test_raw_corpus]

    start_epoch = 0
    caseless = args.caseless
    batch_size = args.batch_size
    num_epoch = args.num_epoch

    # preprocessing
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    # target_map = {c: i for i, c in enumerate(['null', 'true'])}
    target_map = ddi2013.target_map
    train_features, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    val_features, val_targets = utils.build_corpus(val_corpus, feature_map, target_map, caseless)
    test_features, test_targets = utils.build_corpus(test_corpus, feature_map, target_map, caseless)
    class_weights = torch.Tensor(utils.get_class_weights(train_targets)) if args.class_weight else None
    train_loader = utils.construct_bucket_dataloader(train_features, train_targets,
                                                     feature_map['PAD'], batch_size,
                                                     args.position_bound, is_train=True)
    val_loader = utils.construct_bucket_dataloader(val_features, val_targets,
                                                   feature_map['PAD'], batch_size,
                                                   args.position_bound, is_train=False)
    test_loader = utils.construct_bucket_dataloader(test_features, test_targets,
                                                    feature_map['PAD'], batch_size,
                                                    args.position_bound, is_train=False)
    print('Preprocessing done! Vocab size: {}'.format(len(feature_map)))

    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = utils.build_model(args, vocab_size, tagset_size)

    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)

    # load states
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
        start_epoch = checkpoint_file['epoch'] + 1
        model.load_state_dict(checkpoint_file['state_dict'])
        # optimizer.load_state_dict(checkpoint_file['optimizer'])
    else:
        print('no checkpoint file found: {}, train from scratch...'.format(args.load_checkpoint))
        if not args.rand_embedding:
            pretrained_word_embedding, in_doc_word_indices = utils.load_word_embedding(
                args.emb_file, feature_map, args.embedding_dim)
            print(pretrained_word_embedding.size())
            print(vocab_size)
            model.load_pretrained_embedding(pretrained_word_embedding)
            if args.disable_fine_tune:
                model.update_part_embedding(in_doc_word_indices)  # update only non-pretrained words
        model.rand_init(init_embedding=args.rand_embedding)

    # trainer
    trainer = SeqTrainer(args, model, criterion)

    if os.path.isfile(args.load_checkpoint):
        dev_prec, dev_rec, dev_f1, _ = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        print('checkpoint dev_prec: {:.4f}, dev_rec: {:.4f}, dev_f1: {:.4f}, '
              'test_prec: {:.4f}, test_rec: {:.4f}, test_f1: {:.4f}'.format(
                  dev_prec, dev_rec, dev_f1, test_prec, test_rec, test_f1))

    track_list = []
    best_f1 = float('-inf')
    patience_count = 0
    start_time = time.time()

    for epoch in range(start_epoch, num_epoch):
        epoch_loss = train(train_loader, trainer, epoch)

        # update lr
        trainer.lr_step()

        dev_prec, dev_rec, dev_f1, dev_loss = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        if dev_f1 >= best_f1:
            patience_count = 0
            best_f1 = dev_f1
            test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
            track_list.append({
                'epoch': epoch, 'loss': epoch_loss,
                'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1,
                'dev_loss': dev_loss, 'test_prec': test_prec,
                'test_rec': test_rec, 'test_f1': test_f1
            })
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}\tsaving...'.format(
                epoch, epoch_loss, dev_f1, dev_loss, test_f1))
            try:
                utils.save_checkpoint({
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'optimizer': trainer.optimizer.state_dict(),
                    'f_map': feature_map,
                    't_map': target_map,
                }, {'track_list': track_list,
                    'args': vars(args)
                }, args.checkpoint + '_lstm')
            except Exception as inst:
                print(inst)
        else:
            patience_count += 1
            track_list.append({
                'epoch': epoch, 'loss': epoch_loss,
                'dev_prec': dev_prec, 'dev_rec': dev_rec,
                'dev_f1': dev_f1, 'dev_loss': dev_loss
            })
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}'.format(
                epoch, epoch_loss, dev_f1, dev_loss))

        print('epoch: {} in {} take: {} s'.format(epoch, args.num_epoch, time.time() - start_time))
        if patience_count >= args.patience:
            break
def main():
    parser = get_parser()
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16,
    )

    # Set seed
    set_seed(args.seed)

    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    num_labels_old = AutoConfig.from_pretrained(args.model_name_or_path).num_labels
    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels_old,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    if args.model_type == "electra":
        model = ElectraForSequenceClassification.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )

    if num_labels != num_labels_old:
        config.num_labels = num_labels
        model.num_labels = num_labels
        if args.model_type in ["roberta", "bert", "electra"]:
            from transformers.modeling_roberta import RobertaClassificationHead

            model.classifier = (RobertaClassificationHead(config)
                                if args.model_type == "roberta"
                                else nn.Linear(config.hidden_size, config.num_labels))
            for module in model.classifier.modules():
                if isinstance(module, (nn.Linear, nn.Embedding)):
                    # Slightly different from the TF version which uses truncated_normal for initialization
                    # cf https://github.com/pytorch/pytorch/pull/5617
                    module.weight.data.normal_(mean=0.0, std=config.initializer_range)
                if isinstance(module, nn.Linear) and module.bias is not None:
                    module.bias.data.zero_()
        elif args.model_type == "bart":
            from transformers.modeling_bart import BartClassificationHead

            model.classification_head = BartClassificationHead(
                config.d_model,
                config.d_model,
                config.num_labels,
                config.classif_dropout,
            )
            model.model._init_weights(model.classification_head.dense)
            model.model._init_weights(model.classification_head.out_proj)
        elif args.model_type == "xlnet":
            model.logits_proj = nn.Linear(config.d_model, config.num_labels)
            model.transformer._init_weights(model.logits_proj)
        else:
            raise NotImplementedError

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    if args.reinit_pooler:
        if args.model_type in ["bert", "roberta"]:
            encoder_temp = getattr(model, args.model_type)
            encoder_temp.pooler.dense.weight.data.normal_(
                mean=0.0, std=encoder_temp.config.initializer_range)
            encoder_temp.pooler.dense.bias.data.zero_()
            for p in encoder_temp.pooler.parameters():
                p.requires_grad = True
        elif args.model_type in ["xlnet", "bart", "electra"]:
            raise ValueError(f"{args.model_type} does not have a pooler at the end")
        else:
            raise NotImplementedError

    if args.reinit_layers > 0:
        if args.model_type in ["bert", "roberta", "electra"]:
            assert args.reinit_pooler or args.model_type == "electra"
            from transformers.modeling_bert import BertLayerNorm

            encoder_temp = getattr(model, args.model_type)
            for layer in encoder_temp.encoder.layer[-args.reinit_layers:]:
                for module in layer.modules():
                    if isinstance(module, (nn.Linear, nn.Embedding)):
                        # Slightly different from the TF version which uses truncated_normal for initialization
                        # cf https://github.com/pytorch/pytorch/pull/5617
                        module.weight.data.normal_(mean=0.0, std=encoder_temp.config.initializer_range)
                    elif isinstance(module, BertLayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                    if isinstance(module, nn.Linear) and module.bias is not None:
                        module.bias.data.zero_()
        elif args.model_type == "xlnet":
            from transformers.modeling_xlnet import XLNetLayerNorm, XLNetRelativeAttention

            for layer in model.transformer.layer[-args.reinit_layers:]:
                for module in layer.modules():
                    if isinstance(module, (nn.Linear, nn.Embedding)):
                        # Slightly different from the TF version which uses truncated_normal for initialization
                        # cf https://github.com/pytorch/pytorch/pull/5617
                        module.weight.data.normal_(mean=0.0, std=model.transformer.config.initializer_range)
                        if isinstance(module, nn.Linear) and module.bias is not None:
                            module.bias.data.zero_()
                    elif isinstance(module, XLNetLayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                    elif isinstance(module, XLNetRelativeAttention):
                        for param in [
                                module.q, module.k, module.v, module.o, module.r,
                                module.r_r_bias, module.r_s_bias, module.r_w_bias,
                                module.seg_embed,
                        ]:
                            param.data.normal_(mean=0.0, std=model.transformer.config.initializer_range)
        elif args.model_type == "bart":
            for layer in model.model.decoder.layers[-args.reinit_layers:]:
                for module in layer.modules():
                    model.model._init_weights(module)
        else:
            raise NotImplementedError

    if args.mixout > 0:
        from mixout import MixLinear

        for sup_module in model.modules():
            for name, module in sup_module.named_children():
                if isinstance(module, nn.Dropout):
                    module.p = 0.0
                if isinstance(module, nn.Linear):
                    target_state_dict = module.state_dict()
                    bias = True if module.bias is not None else False
                    new_module = MixLinear(module.in_features, module.out_features,
                                           bias, target_state_dict["weight"], args.mixout)
                    new_module.load_state_dict(target_state_dict)
                    setattr(sup_module, name, new_module)
    print(model)

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        logger.info("Saving model checkpoint to %s", args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
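# Generic form of the setattr-based module-swap pattern used for mixout
# above (toy factory argument; MixLinear itself is an external dependency
# not shown here): named_children() yields each child with the attribute
# name its parent holds it under, so setattr replaces it in place.
import torch.nn as nn

def _swap_linears(model, factory):
    for sup_module in model.modules():
        for name, child in sup_module.named_children():
            if isinstance(child, nn.Linear):
                # replace the child on its parent module
                setattr(sup_module, name, factory(child))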
def main():
    parser = options.get_parser('Generator')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_generation_args(parser)

    args = parser.parse_args()
    print(args)

    args.cuda = not args.disable_cuda and torch.cuda.is_available()

    caseless = args.caseless
    batch_size = args.batch_size

    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
    else:
        print('No checkpoint file found: {}'.format(args.load_checkpoint))
        raise OSError

    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir, ddi=True)
    test_corpus = [(line.sent, line.type, line.p1, line.p2) for line in test_raw_corpus]

    # preprocessing
    feature_map = checkpoint_file['f_map']
    target_map = checkpoint_file['t_map']
    test_features, test_targets = utils.build_corpus(test_corpus, feature_map, target_map, caseless)

    # train/val split
    test_loader = utils.construct_bucket_dataloader(test_features, test_targets,
                                                    feature_map['PAD'], batch_size,
                                                    args.position_bound, is_train=False)

    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = utils.build_model(args, vocab_size, tagset_size)

    # loss
    criterion = utils.build_loss(args)

    # load states
    model.load_state_dict(checkpoint_file['state_dict'])

    # trainer
    trainer = SeqTrainer(args, model, criterion)

    if args.cuda:
        model.cuda()

    y_true, y_pred, att_weights = predict(trainer, test_loader, target_map, cuda=args.cuda)

    # prediction
    print('Predicting...')
    assert len(y_pred) == len(test_corpus), 'length of prediction is inconsistent with that of data set'
    # write result: sent_id|e1|e2|ddi|type
    with open(args.predict_file, 'w') as f:
        for tup, pred in zip(test_raw_corpus, y_pred):
            ddi = 0 if pred == 'null' else 1
            f.write('|'.join([tup.sent_id, tup.e1, tup.e2, str(ddi), pred]))
            f.write('\n')

    # error analysis
    print('Analyzing...')
    with open(args.error_file, 'w') as f:
        f.write(' | '.join(['sent_id', 'e1', 'e2', 'target', 'pred']))
        f.write('\n')
        for tup, target, pred, att_weight in zip(test_raw_corpus, y_true, y_pred, att_weights):
            if target != pred:
                size = len(tup.sent)
                f.write('{}\n'.format(' '.join(tup.sent)))
                if args.model != 'InterAttentionLSTM':
                    att_weight = [att_weight]
                for i in range(len(att_weight)):
                    f.write('{}\n'.format(' '.join(map(lambda x: str(round(x, 4)), att_weight[i][:size]))))
                f.write('{}\n\n'.format(' | '.join([tup.sent_id, tup.e1, tup.e2, target, pred])))

    # attention
    print('Writing attention scores...')
    with open(args.att_file, 'w') as f:
        f.write(' | '.join(['target', 'sent', 'att_weight']))
        f.write('\n')
        for tup, target, pred, att_weight in zip(test_raw_corpus, y_true, y_pred, att_weights):
            if target == pred and target != 'null':
                size = len(tup.sent)
                f.write('{}\n'.format(target))
                f.write('{}\n'.format(' '.join(tup.sent)))
                if args.model != 'InterAttentionLSTM':
                    att_weight = [att_weight]
                for i in range(len(att_weight)):
                    f.write('{}\n'.format(' '.join(map(lambda x: str(round(x, 4)), att_weight[i][:size]))))
        ........
        -----END PRIVATE KEY-----""",
        "client_email": "*****@*****.**",
        "client_id": "....",
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://accounts.google.com/o/oauth2/token",
        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
        "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/"
                                "yourworker%40yourproject.iam.gserviceaccount.com"
    }

    def __init__(self, parser, server_name):
        writable_metrics_and_types = {'your model precision': float}
        super(YourParamsSheet, self).__init__(parser,
                                              writable_column_types=writable_metrics_and_types,
                                              experiment_id_column='exp_hash',
                                              server_name=server_name)


if __name__ == '__main__':
    server_name = os.environ.get('SERVERNAME', None)
    params = YourParamsSheet(get_parser(), server_name)
    params.exec_loop(train)
def main():
    # build parser
    parser = options.get_parser('Preprocessor')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)

    args = parser.parse_args()
    print(args)

    # make dirs
    base_dir = os.path.dirname(os.path.realpath(__file__))
    lib_dir = os.path.join(base_dir, 'lib')
    processed_dir = args.processed_dir
    train_dir = os.path.join(processed_dir, 'train')
    val_dir = os.path.join(processed_dir, 'val')
    test_dir = os.path.join(processed_dir, 'test')
    utils.make_dirs([args.processed_dir, lib_dir, train_dir, val_dir, test_dir])

    # preprocess
    train_corpus = ddi2013.preprocess_ddi(os.path.join(args.raw_dir, 'train'), position=True)
    test_corpus = ddi2013.preprocess_ddi(os.path.join(args.raw_dir, 'test'), position=True)

    # get train targets
    input_targets = utils.map_iterable([item.type for item in train_corpus], ddi2013.target_map)

    # train/val split
    train_corpus, _, val_corpus, _ = utils.stratified_shuffle_split(
        train_corpus, input_targets, train_size=args.train_size)

    # write to files
    if not os.path.isdir(args.processed_dir):
        os.mkdir(args.processed_dir)
    ddi2013.write_to_file(train_corpus, os.path.join(args.processed_dir, 'train.ddi'))
    ddi2013.write_to_file(val_corpus, os.path.join(args.processed_dir, 'val.ddi'))
    ddi2013.write_to_file(test_corpus, os.path.join(args.processed_dir, 'test.ddi'))

    # download necessary tools
    download_tagger(lib_dir)
    download_parser(lib_dir)

    # compile the Java helpers
    # TODO: sometimes compile failed
    # NOTE: the CLASSPATH assignment and the javac call must run in the same
    # shell; two separate os.system() calls would not share the variable,
    # which is a likely cause of the intermittent compile failures noted above.
    os.system(
        'CLASSPATH="lib:lib/stanford-parser/stanford-parser.jar:'
        'lib/stanford-parser/stanford-parser-3.5.1-models.jar"'
        ' && javac -cp $CLASSPATH lib/*.java')

    print('=' * 80)
    print('Preprocessing dataset')
    print('=' * 80)

    # java classpath for calling Stanford parser
    classpath = ':'.join([
        lib_dir,
        os.path.join(lib_dir, 'stanford-parser/stanford-parser.jar'),
        os.path.join(lib_dir, 'stanford-parser/stanford-parser-3.5.1-models.jar')
    ])

    # split into separate files
    split(os.path.join(processed_dir, 'train.ddi'), train_dir)
    split(os.path.join(processed_dir, 'val.ddi'), val_dir)
    split(os.path.join(processed_dir, 'test.ddi'), test_dir)

    # parse sentences
    for d in [train_dir, val_dir, test_dir]:
        parse(d, cp=classpath, dep=args.dep, const=args.const)

    # get vocabulary
    build_vocab(glob.glob(os.path.join(processed_dir, '*/*.toks')),
                os.path.join(processed_dir, 'vocab.txt'))
    build_vocab(glob.glob(os.path.join(processed_dir, '*/*.toks')),
                os.path.join(processed_dir, 'vocab-cased.txt'),
                lowercase=False)