CHARS = CharField(fix_length=config['l0'], lower=not config['differ_uppercase'])
LABEL = torchdata.Field(use_vocab=False, sequential=False,
                        preprocessing=lambda x: int(x), is_target=True)
train_dataset, test_dataset = torchdata.TabularDataset.splits(
    path=config['dataset_path'],
    train=config['dataset_train'],
    test=config['dataset_test'],
    format='tsv',
    fields=[('label', LABEL), ('chars', CHARS)])
train_iterator = torchdata.BucketIterator(train_dataset,
                                          batch_size=config['batch_size'],
                                          device=device)
test_iterator = torchdata.BucketIterator(
    test_dataset, batch_size=config['test_batch_size'], device=device)
num_classes, weights = utils.get_weights(
    [e.label for e in train_dataset.examples], config)
alphabet = config['alphabet']
# alphabet.append("'")
CHARS.build_vocab(alphabet)
LABEL.build_vocab(train_dataset)
charCNNModel = CharCNNModel(num_classes, alphabet=alphabet).to(device)
if config['load_model']:
    charCNNModel.load_state_dict(
def __init__(self, model, labeled, unlabeled, batch_size=64, cap=None, resume_from=None): self.model = model self.data_root = envs.DATA_DIR self.device = envs.CUDA_DEVICE self.model.to(self.device) # compute class weights train_set = BasicDS(path=os.path.join(self.data_root, 'train.json'), text_field=TEXT, label_field=LABEL, samples=labeled, cap=cap) test_set = BasicDS(path=os.path.join(self.data_root, 'test.json'), text_field=TEXT, label_field=LABEL, samples=None, cap=cap) infer_set = BasicDS(path=os.path.join(self.data_root, 'train.json'), text_field=TEXT, label_field=LABEL, samples=unlabeled, cap=cap) self.train_iterator = data.BucketIterator( train_set, batch_size=batch_size, device=self.device, shuffle=True, sort_key=lambda x: len(x.text), sort_within_batch=True) self.test_iterator, self.infer_iterator = data.BucketIterator.splits( (test_set, infer_set), batch_size=batch_size, device=self.device, shuffle=False, sort_key=lambda x: len(x.text), sort_within_batch=True) labels = [] for i in range(len(train_set)): labels.append(train_set[i].label) class_weight = compute_class_weight(Counter(labels), num_classes=10, min_count=1) class_weight = torch.Tensor(class_weight).to(self.device) self.criterion = nn.CrossEntropyLoss(class_weight) self.optimizer = optim.Adam(self.model.parameters()) if envs.RESUME_FROM: ckpt = torch.load(os.path.join(envs.EXPT_DIR, envs.RESUME_FROM)) self.model.load_state_dict(ckpt['model']) self.optimizer.load_state_dict(ckpt['optimizer']) for state in self.optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.to(envs.CUDA_DEVICE)
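# Note: `compute_class_weight(Counter(labels), num_classes=10, min_count=1)` in the
# snippet above is a project-specific helper (its signature differs from sklearn's
# compute_class_weight). A minimal sketch of the inverse-frequency weighting it
# plausibly performs -- hypothetical, the real helper may normalise differently:
from collections import Counter


def compute_class_weight(label_counter, num_classes, min_count=1):
    """Weight each class roughly by 1 / frequency, flooring counts at min_count."""
    counts = [max(label_counter.get(c, 0), min_count) for c in range(num_classes)]
    total = sum(counts)
    return [total / (num_classes * c) for c in counts]


# usage sketch: compute_class_weight(Counter([0, 0, 1, 2]), num_classes=3)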
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s %(levelname)s: - %(message)s',
                              datefmt='%Y-%m-%d %H:%M:%S')
args.logger = logger
args.device = torch.device('cuda')

# -- DATA
train_data, dev_data, test_data, SRC, TRG = load_iwslt(args)
tok2i, i2tok, SRC, TRG = load_iwslt_vocab(args, SRC, TRG, args.data_prefix)
SRC = copy.deepcopy(SRC)
for data_ in [train_data, dev_data, test_data]:
    if data_ is not None:
        data_.fields['src'] = SRC
sort_key = lambda x: len(x.src)
trainloader = data.BucketIterator(dataset=train_data, batch_size=args.batch_size,
                                  device=args.device, train=True, repeat=False,
                                  shuffle=True, sort_key=sort_key,
                                  sort_within_batch=True) if train_data is not None else None
validloader = data.BucketIterator(dataset=dev_data, batch_size=args.batch_size,
                                  device=args.device, train=False, repeat=False,
                                  shuffle=True, sort_key=sort_key,
                                  sort_within_batch=True) if dev_data is not None else None
testloader = data.BucketIterator(dataset=test_data, batch_size=args.batch_size,
                                 device=args.device, train=False, repeat=False,
                                 shuffle=False, sort_key=sort_key,
                                 sort_within_batch=True) if test_data is not None else None
args.n_classes = len(TRG.vocab.stoi)

# -- loss
loss_flags = {}
if 'multiset' in args.loss:
    loss_fn = sequential_set_loss
    if not args.transformer_auxiliary_end:
        loss_fn = sequential_set_no_stop_loss
    loss_flags['self_teach_beta'] = float(args.self_teach_beta)
#
#     loss = trainer_G._train_batch(
#         src_seq, src_length.tolist(), tgt_seq, G, teacher_forcing_ratio=0)
#     if step % 100 == 0:
#         print('[step %d] loss_G %.4f' % (epoch * len(train_iter) + step, loss))
# Checkpoint(model=G, optimizer=optim_G, epoch=0, step=0,
#            input_vocab=EN.vocab, output_vocab=EN.vocab).save(opt._load_G_from)

# Train SeqGAN
ALPHA = 0
for epoch in range(100):
    logging.info('[Epoch %d]' % epoch)
    train_iter = data.BucketIterator(dataset=train, batch_size=16,
                                     device=opt.device, sort_within_batch=True,
                                     sort_key=lambda x: len(x.src), repeat=False)
    for step, batch in enumerate(train_iter):
        src_seq = batch.src[0]
        src_length = batch.src[1]
        tgt_seq = src_seq.clone()
        # gold = tgt_seq[:, 1:]

        # reconstruction loss
        # loss_G.reset()
        # decoder_outputs, decoder_hidden, other = G(src_seq, src_length.tolist(), target_variable=None)
        # fake = torch.cat(other[DecoderRNN.KEY_SEQUENCE], dim=1)

        # (1) train D
print("Number of src words (types):", len(src_field.vocab)) print("Number of trg words (types):", len(trg_field.vocab), "\n") print_data_info(train_data, valid_data, test_data, SRC, TRG) # In[19]: batch_size = 36 # In[20]: train_iter = data.BucketIterator(train_data, batch_size=batch_size, train=True, sort_within_batch=True, sort_key=lambda x: (len(x.src), len(x.trg)), repeat=False, device=DEVICE) # In[21]: valid_iter = data.Iterator(valid_data, batch_size=1, train=False, sort=False, repeat=False, device=DEVICE) # In[22]:
def train(args): train_data, val_data, test_data, SRC, TGT = prepare_data(args) BATCH_SIZE = args.batch_size best_bleu_loss = 0 pad_idx = TGT.vocab.stoi["<pad>"] print("Size of source vocabulary:", len(SRC.vocab)) print("Size of target vocabulary:", len(TGT.vocab)) print("FC matrix:", args.hidden_dim, args.ff_dim) print(args.compress) model = transformer.make_model(len(SRC.vocab), len(TGT.vocab), d_model=args.hidden_dim, d_ff=args.ff_dim, N=args.num_blocks, compress=args.compress, compress_att=args.compress_attn, compress_mode=args.compress_mode, num_compress_enc=args.num_enc_blocks_comp, num_compress_dec=args.num_dec_blocks_comp ) model.to(device) if args.load_model: print('load model from [%s]' % args.load_model, file=sys.stderr) params = torch.load(args.load_model, map_location=lambda storage, loc: storage) # TODO args = params['args'] state_dict = params['model'] # opts = params[''] model.load_state_dict(state_dict) criterion = train_utils.LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1) # criterion = nn.NLLLoss(reduction="sum", ignore_index=0) criterion.to(device) train_iter = data.BucketIterator(train_data, batch_size=BATCH_SIZE, train=True, sort_within_batch=True, sort_key=lambda x: (len(x.src), len(x.trg)), repeat=False, device=device) valid_iter = data.Iterator(val_data, batch_size=BATCH_SIZE, train=False, sort=False, repeat=False, device=device) model_opt = opt.WrapperOpt(model.src_embed[0].d_model, 2, 4000, torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)) # train_time = begin_time = time.time() valid_params = (SRC, TGT, valid_iter) print("Number of examples in train: ", BATCH_SIZE * len([_ for _ in train_iter])) print("Number of examples in validation: ", BATCH_SIZE * len([_ for _ in valid_iter])) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print("Number of parameters: ", params) if args.debug: model2 = transformer.make_model(len(SRC.vocab), len(TGT.vocab), d_model=args.hidden_dim, d_ff=args.ff_dim, N=args.num_blocks, compress=True,compress_att=True, compress_mode=args.compress_mode, num_compress_enc=args.num_enc_blocks_comp, num_compress_dec=args.num_dec_blocks_comp) # print("Tranable parameters in fc module ", params2) debug_compress_info(model, model2) exit() os.makedirs(os.path.dirname(args.save_to), exist_ok=True) if args.multi_gpu: devices = list(np.arange(args.num_devices)) model_parallel = nn.DataParallel(model, device_ids=devices) logger_file = {}#Logger(name=args.exp_name) logger_file['bleu'] = [] logger_file['loss'] = [] for epoch in range(args.max_epoch): print("=" * 80) print("Epoch ", epoch + 1) print("=" * 80) print("Train...") if args.multi_gpu: model_parallel.train() train_loss_fn = MultiGPULossCompute(model.generator, criterion, devices=devices, opt=model_opt) train_model = model_parallel else: train_loss_fn = train_utils.LossCompute(model.generator, criterion, model_opt) model.train() _, logger_file = train_utils.run_epoch(args, (train_utils.rebatch(pad_idx, b) for b in train_iter), model_parallel if args.multi_gpu else model, train_loss_fn, valid_params=valid_params, epoch_num=epoch, logger=logger_file) if args.multi_gpu: model_parallel.eval() val_loss_fn = MultiGPULossCompute(model.generator, criterion, devices=devices, opt=model_opt) else: model.eval() val_loss_fn = train_utils.LossCompute(model.generator, criterion, model_opt) print("Validation...") loss, bleu_loss = train_utils.run_epoch(args, 
            (train_utils.rebatch(pad_idx, b) for b in valid_iter),
            model_parallel if args.multi_gpu else model,
            val_loss_fn,
            valid_params=valid_params,
            is_valid=True)

        if bleu_loss > best_bleu_loss:
            best_bleu_loss = bleu_loss
            model_state_dict = model.state_dict()
            model_file = args.save_to + args.exp_name + 'valid.bin'
            checkpoint = {
                'model': model_state_dict,
            }
            print('save model without optimizer [%s]' % model_file, file=sys.stderr)
            torch.save(checkpoint, model_file)

        print()
        print("Validation perplexity ", np.exp(loss))

    with open("./logs/" + args.exp_name, 'wb') as f_out:
        pickle.dump(logger_file, f_out)
def dyn_batch_without_padding(new, i, sofar):
    if args.distillation:
        return sofar + max(len(new.src), len(new.trg), len(new.dec))
    else:
        return sofar + max(len(new.src), len(new.trg))


if args.batch_size == 1:  # speed-test: one sentence per batch.
    batch_size_fn = lambda new, count, sofar: count
else:
    batch_size_fn = dyn_batch_with_padding  # dyn_batch_without_padding

train_real, dev_real = data.BucketIterator.splits(
    (train_data, dev_data),
    batch_sizes=(args.batch_size, args.batch_size),
    device=args.gpu,
    shuffle=False,
    batch_size_fn=batch_size_fn,
    repeat=None if args.mode == 'train' else False)
aux_reals = [data.BucketIterator(dataset,
                                 batch_size=args.batch_size,
                                 device=args.gpu,
                                 train=True,
                                 batch_size_fn=batch_size_fn,
                                 shuffle=False)
             for dataset in aux_data]
logger.info("build the dataset. done!")

# ----------------------------------------------------------------------------------------------------------------- #
# model hyper-params:
logger.info('use default parameters of t2t-base')
hparams = {'d_model': 512, 'd_hidden': 512, 'n_layers': 6,
           'n_heads': 8, 'drop_ratio': 0.1, 'warmup': 16000}  # ~32
args.__dict__.update(hparams)

# ----------------------------------------------------------------------------------------------------------------- #
# show the arg:
hp_str = (f"{args.dataset}_subword_"
          f"{args.d_model}_{args.d_hidden}_{args.n_layers}_{args.n_heads}_"
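# The else-branch above assigns `dyn_batch_with_padding`, which is defined elsewhere
# in that project. A plausible padding-aware sketch (hypothetical names, modelled on
# the common "batch count times longest sequence" token-budget pattern):
max_src_in_batch, max_trg_in_batch = 0, 0


def dyn_batch_with_padding(new, count, sofar):
    """Estimate the padded token count of the batch once `new` is added."""
    global max_src_in_batch, max_trg_in_batch
    if count == 1:  # first example of a fresh batch resets the running maxima
        max_src_in_batch, max_trg_in_batch = 0, 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_trg_in_batch = max(max_trg_in_batch, len(new.trg))
    # padded size = examples so far * longest sequence seen on either side
    return count * max(max_src_in_batch, max_trg_in_batch)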
def __init__(self, emb_dim=50, mbsize=32, custom_data=False, eval=False, train_data_path="", eval_data_file="", checkpoint_path=""): self.TEXT = data.Field(init_token='<start>', eos_token='<eos>', lower=True, tokenize=self._tokenizer, fix_length=16) self.LABEL = data.Field(sequential=False, unk_token=None) self.MAX_CHARS = 20000 self.NLP = en_core_web_sm.load() if not eval: # Only take sentences with length <= 15 f = lambda ex: len(ex.text) <= 15 and ex.label != 'neutral' if custom_data: # create tuples representing the columns fields = [(None, None), ('text', self.TEXT), (None, None), (None, None), ('label', self.LABEL)] # load the dataset in json format train_data, validation_data, test_data = data.TabularDataset.splits( path=train_data_path, train='train_data.csv', validation='validation_data.csv', test='test_data.csv', format='csv', fields=fields, skip_header=True) else: train_data, test_data = datasets.IMDB.splits( self.TEXT, self.LABEL) train_data, validation_data = train_data.split() self.TEXT.build_vocab(train_data, vectors=GloVe('6B', dim=emb_dim)) self.LABEL.build_vocab(train_data) self.n_vocab = len(self.TEXT.vocab.itos) self.emb_dim = emb_dim self.train_iter, self.val_iter, self.test_iter = data.BucketIterator.splits( (train_data, validation_data, test_data), batch_size=mbsize, device=-1, sort_key=lambda x: len(x.text), shuffle=True, repeat=True) self.train_loader = self.train_iter self.test_loader = self.test_iter self.validation_loader = self.val_iter self.train_iter = iter(self.train_iter) self.val_iter = iter(self.val_iter) self.test_iter = iter(self.test_iter) else: self.TEXT = data.Field(init_token='<start>', eos_token='<eos>', lower=True, tokenize=self._tokenizer, fix_length=16) self.TEXT.vocab = self._get_from_checkpoint(checkpoint_path) self.n_vocab = len(self.TEXT.vocab.itos) fields = [('text', self.TEXT)] # load the dataset in json format test_data = data.TabularDataset(path=eval_data_file, format='csv', fields=fields, skip_header=True) self.test_iter = data.BucketIterator( test_data, batch_size=mbsize, device=-1, sort_key=lambda x: len(x.text), shuffle=False, repeat=False) self.test_loader = self.test_iter self.test_iter = iter(self.test_iter)
print('Train Example: {}'.format('\n'.join([
    '{} ---- {}'.format(example.text, example.label)
    for example in train_data.examples[:5]
])))
print('Valid Example: {}'.format('\n'.join([
    '{} ---- {}'.format(example.text, example.label)
    for example in valid_data.examples[:5]
])))
print('Test Example: {}'.format('\n'.join([
    '{} ---- {}'.format(example.text, example.label)
    for example in test_data.examples[:5]
])))

train_iter = data.BucketIterator(dataset=train_data,
                                 batch_size=BATCH_SIZE,
                                 sort_key=lambda x: len(x.text))
valid_iter = data.BucketIterator(dataset=valid_data,
                                 batch_size=BATCH_SIZE,
                                 sort_key=lambda x: len(x.text))
test_iter = data.Iterator(dataset=test_data, batch_size=BATCH_SIZE, sort=False)

# build model
from text_classify.model import RNN, WordAVGModel, TextCNN
from text_classify.transformer import Transformer

embedding_size = TEXT.vocab.vectors.shape[1] if USE_PRE_TRAIN_MODEL else EMBEDDING_SIZE
# model = RNN(input_size=len(TEXT.vocab), embedding_size=embedding_size, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, output_size=len(LABEL.vocab))
# model = TextCNN(input_size=len(TEXT.vocab), embedding_size=embedding_size, output_size=len(LABEL.vocab), pooling_method='avg')
model = WordAVGModel(vocab_size=len(TEXT.vocab),
def __init__(self, args): path = '../data/squad' logging.info( "Preprocessing Data - First Phase :: Reading And Transforming") self.preprocess('{}/{}'.format(path, args.Train_File)) self.preprocess('{}/{}'.format(path, args.Dev_File)) self.RAW = data.RawField() self.RAW.is_target = False self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True) self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize) self.WORD = data.Field(batch_first=True, tokenize=word_tokenize, lower=True, include_lengths=True) self.LABEL = data.Field(sequential=False, unk_token=None, use_vocab=False) dict_fields = { 'qid': ('qid', self.RAW), 'start_idx': ('start_idx', self.LABEL), 'end_idx': ('end_idx', self.LABEL), 'context': [('c_word', self.WORD), ('c_char', self.CHAR)], 'question': [('q_word', self.WORD), ('q_char', self.CHAR)] } logging.info("Preprocessing Data - Second Phase :: To Torchtext") self.train, self.dev = data.TabularDataset.splits(path=path, train=args.Train_File + 'l', \ validation=args.Dev_File + 'l', format='json', fields=dict_fields) if args.Max_Token_Length > 0: self.train.examples = [ e for e in self.train.examples if len(e.c_word) <= args.Max_Token_Length ] logging.info( "Preprocessing Data - Third Phase :: Building Vocabulary") self.CHAR.build_vocab(self.train, self.dev) self.WORD.build_vocab(self.train, self.dev, vectors=GloVe(name='6B', dim=args.Word_Dim)) logging.info("Preprocessing Data - Fourth Phase :: Building Itertors") device = torch.device( "cuda:{}".format(args.GPU) if torch.cuda.is_available() else "cpu") self.train_iter = data.BucketIterator( dataset=self.train, batch_size=args.Batch_Size) # sort_key = lambda x : len(x.c_word) self.dev_iter = data.BucketIterator(dataset=self.dev, batch_size=10)
def getBucketIter(self, dataset, **kwargs):
    if 'device' not in kwargs:
        kwargs = dict(kwargs, device=self.device)
    else:
        kwargs = dict(kwargs)
    return data.BucketIterator(dataset, **kwargs)
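# A self-contained sketch of the bucketing pattern the wrapper above delegates to.
# It assumes the legacy torchtext API (torchtext.data up to 0.8, torchtext.legacy.data
# on 0.9-0.11); the toy fields and sentences are illustrative only.
from torchtext.data import BucketIterator, Dataset, Example, Field

TEXT = Field(tokenize=str.split, lower=True)
LABEL = Field(sequential=False, use_vocab=False)
fields = [('text', TEXT), ('label', LABEL)]
examples = [Example.fromlist([s, y], fields)
            for s, y in [("a short one", 0),
                         ("a somewhat longer example sentence", 1),
                         ("mid length sentence here", 0)]]
toy_set = Dataset(examples, fields)
TEXT.build_vocab(toy_set)

# BucketIterator groups examples of similar length so each batch needs little padding.
toy_iter = BucketIterator(toy_set, batch_size=2,
                          sort_key=lambda x: len(x.text),
                          sort_within_batch=True, shuffle=True)
for batch in toy_iter:
    print(batch.text.shape, batch.label)
    break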
                       fix_length=config['max_seq_length'])
LABEL = torchdata.Field(use_vocab=False, sequential=False,
                        preprocessing=lambda x: int(x), is_target=True)
train_dataset, test_dataset = torchdata.TabularDataset.splits(
    path=config['dataset_path'],
    train=config['dataset_train'],
    test=config['dataset_test'],
    format='tsv',
    fields=[('label', LABEL), ('text', TEXT)])
train_iterator = torchdata.BucketIterator(train_dataset,
                                          batch_size=config['batch_size'],
                                          sort_key=lambda x: len(x.text),
                                          device=device,
                                          sort_within_batch=False)
test_iterator = torchdata.BucketIterator(
    test_dataset,
    batch_size=config['test_batch_size'],
    sort_key=lambda x: len(x.text),
    device=device,
    sort_within_batch=False)
TEXT.build_vocab(train_dataset)
LABEL.build_vocab(train_dataset)
num_classes, weights = get_weights(
    [e.label for e in train_dataset.examples], config)
bert_config = BertConfig(vocab_size_or_config_json_file=32000,
test = data.Dataset(counter_test,
                    fields=[('sentence', TEXT), ('adj', None),
                            ('trigger', TEXT), ('trigger_index', TRIGGERINDEX),
                            ('eep', EEP), ('index', INDEX)])
for_vocab = data.Dataset(for_vocab,
                         fields=[('sentence', TEXT), ('adj', None),
                                 ('trigger', None), ('trigger_index', None),
                                 ('eep', None), ('index', None)])
TEXT.build_vocab(for_vocab, vectors='glove.6B.100d')  # , max_size=30000)
TEXT.vocab.vectors.unk_init = init.xavier_uniform
print(TEXT.vocab.vectors.shape)
print()

train_iter = data.BucketIterator(train, batch_size=64, train=True,
                                 sort_within_batch=True,
                                 sort_key=lambda x: (len(x.sentence)),
                                 repeat=False, device='cuda')
for batch in train_iter:
    print(batch)
    for i in batch.index:
        print(len(counter[i].sentence))
        print(batch.sentence.shape[0])
        assert len(counter[i].sentence) <= batch.sentence.shape[0]
    x = batch.sentence.t()
    adj = []
    trigger = batch.trigger_index.t().flatten()
    count = 0
    for ind in batch.index:
def __init__(self, args): path = './data/squad' dataset_path = path + '/torchtext/' train_examples_path = dataset_path + 'train_examples.pt' dev_examples_path = dataset_path + 'dev_examples.pt' print("preprocessing data files...") if not os.path.exists('{}/{}l'.format(path, args.train_file)): self.preprocess_file('{}/{}'.format(path, args.train_file)) if not os.path.exists('{}/{}l'.format(path, args.dev_file)): self.preprocess_file('{}/{}'.format(path, args.dev_file)) self.RAW = data.RawField() # explicit declaration for torchtext compatibility self.RAW.is_target = False self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True) self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize) self.WORD = data.Field(batch_first=True, tokenize=word_tokenize, lower=True, include_lengths=True) self.LABEL = data.Field(sequential=False, unk_token=None, use_vocab=False) dict_fields = { 'id': ('id', self.RAW), 's_idx': ('s_idx', self.LABEL), 'e_idx': ('e_idx', self.LABEL), 'context': [('c_word', self.WORD), ('c_char', self.CHAR)], 'question': [('q_word', self.WORD), ('q_char', self.CHAR)] } list_fields = [('id', self.RAW), ('s_idx', self.LABEL), ('e_idx', self.LABEL), ('c_word', self.WORD), ('c_char', self.CHAR), ('q_word', self.WORD), ('q_char', self.CHAR)] if os.path.exists(dataset_path): print("loading splits...") train_examples = torch.load(train_examples_path) dev_examples = torch.load(dev_examples_path) self.train = data.Dataset(examples=train_examples, fields=list_fields) self.dev = data.Dataset(examples=dev_examples, fields=list_fields) else: print("building splits...") self.train, self.dev = data.TabularDataset.splits( path=path, train='{}l'.format(args.train_file), validation='{}l'.format(args.dev_file), format='json', fields=dict_fields) os.makedirs(dataset_path) torch.save(self.train.examples, train_examples_path) torch.save(self.dev.examples, dev_examples_path) #cut too long context in the training set for efficiency. # print(self.train.examples[0].c_word) if args.context_threshold > 0: self.train.examples = [ e for e in self.train.examples if len(e.c_word) <= args.context_threshold ] print("building vocab...") self.CHAR.build_vocab(self.train, self.dev) self.WORD.build_vocab(self.train, self.dev, vectors=GloVe(name='6B', dim=args.word_dim)) print("building iterators...") device = torch.device( "cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu") self.train_iter = data.BucketIterator(self.train, batch_size=args.train_batch_size, device=device, repeat=True, shuffle=True, sort_key=lambda x: len(x.c_word)) self.dev_iter = data.BucketIterator(self.dev, batch_size=args.dev_batch_size, device=device, repeat=False, sort_key=lambda x: len(x.c_word))
def predict(): predict_cfg = get_predict_args() device = get_device() print(device) # load checkpoint ckpt_path = find_ckpt_in_directory(predict_cfg.ckpt) ckpt = torch.load(ckpt_path, map_location=device) cfg = ckpt["cfg"] # to know which words to UNK we need to know the Glove vocabulary glove_words = load_glove_words(cfg.word_vectors) # load data sets print("Loading data... ", end="") input_field, label_field, not_in_glove = get_data_fields(glove_words) train_data, dev_data, test_data = SNLI.splits(input_field, label_field) print("Done") print("Words not in glove:", len(not_in_glove)) # build vocabulary (deterministic so no need to load it) input_field.build_vocab(train_data, dev_data, test_data, vectors=None, vectors_cache=None) label_field.build_vocab(train_data) # construct model model = build_model(cfg, input_field.vocab) # load parameters from checkpoint into model print("Loading saved model..") model.load_state_dict(ckpt["model"]) print("Done") train_iter = data.BucketIterator( train_data, batch_size=cfg.batch_size, train=False, repeat=False, device=device if torch.cuda.is_available() else -1) dev_iter = data.BucketIterator( dev_data, batch_size=cfg.batch_size, train=False, repeat=False, device=device if torch.cuda.is_available() else -1) test_iter = data.BucketIterator( test_data, batch_size=cfg.batch_size, train=False, repeat=False, device=device if torch.cuda.is_available() else -1) print_config(cfg) print("Embedding variance:", torch.var(model.embed.weight).item()) model.to(device) print_parameters(model) print(model) # switch model to evaluation mode model.eval() train_iter.init_epoch() dev_iter.init_epoch() test_iter.init_epoch() criterion = nn.CrossEntropyLoss(reduction='sum') print("Starting evaluation..") eval_list = [("train", train_iter), ("dev", dev_iter), ("test", test_iter)] for name, it in eval_list: eval_result = evaluate(model, criterion, it) eval_str = make_kv_string(eval_result) print("# Evaluation {}: {}".format(name, eval_str)) # print dev examples for highscore dev_iter.init_epoch() p2h, h2p, prems, hypos, predictions, targets = extract_attention( model, dev_iter, input_field.vocab, label_field.vocab) np.savez(os.path.join(cfg.save_path, "dev_items"), p2h=p2h, h2p=h2p, prems=prems, hypos=hypos, predictions=predictions, targets=targets) # print dev examples for highscore dev_iter.init_epoch() dev_dir = os.path.join(cfg.save_path, "dev") if not os.path.exists(dev_dir): os.makedirs(dev_dir) print_examples(model, dev_iter, input_field.vocab, label_field.vocab, dev_dir, 0, n=-1)
                                     ('plot_score', PLOT_SCORE),
                                     ('image_score', IMAGE_SCORE),
                                     ('music_score', MUSIC_SCORE),
                                     ('actors_score', ACTORS_SCORE),
                                     ('name0', None)],
                             skip_header=True)

train, val = get_dataset(union_toloka_result_proc_path).split()
golden_train = get_dataset(union_golden_proc_path2)

TEXT.build_vocab(train, max_size=30000)

model_path = "./models/model"
rnn_model = MultiModel(model=BiLSTMClassifier(300, len(TEXT.vocab.stoi), 256, 2).to(device))
# rnn_model.load_state_dict(torch.load(model_path))

batch_size = 32
train_iter, val_iter = data.BucketIterator.splits(
    (train, val), sort_key=lambda x: len(x.text),
    batch_sizes=(batch_size, batch_size), device=device)
golden_iter = data.BucketIterator(golden_train, sort_key=lambda x: len(x.text),
                                  batch_size=batch_size, device=device)

criterion_cls = nn.BCEWithLogitsLoss().to(device)
criterion_scores = nn.MSELoss(reduction='none').to(device)
criterion_scores_l1 = nn.L1Loss(reduction='none').to(device)

rnn_model = MultiModel(model=BiLSTMClassifier(300, len(TEXT.vocab.stoi), 256, 2).to(device))
optimizer = optim.Adam([param for param in rnn_model.model.parameters() if param.requires_grad])

fit(rnn_model, criterion_cls, criterion_scores, optimizer, train_iter,
    epochs_count=30, val_data=val_iter)
torch.save(rnn_model.model.state_dict(), model_path)
do_eval_epoch(rnn_model, None, criterion_scores_l1, val_iter)
sentences = data.Field(lower=True, tokenize=tokenizer)
ans = data.Field(sequential=False)

train, dev, test = datasets.SNLI.splits(sentences, ans)
sentences.build_vocab(train, dev, test, min_freq=3)
ans.build_vocab(train, dev, test)

if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

Batch_Size = 128
test_iter = data.BucketIterator(test, batch_size=Batch_Size, shuffle=False)

n_layer = 1


class My_RNN(nn.Module):
    def __init__(self, embed_dim, hidden_dim, drop_p):
        super(My_RNN, self).__init__()
        self.rnn = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim,
                           num_layers=n_layer, dropout=drop_p, bidirectional=True)

    def forward(self, inputs):
        batch_size = inputs.size()[1]
def get_data_iter():
    # tokenizer used to build the character-level vocab
    def char_vocab_tokenizer(sentence):
        c_lists = [[c for c in word] for word in sentence.strip().split()]
        return list(_flatten(c_lists))

    def tag_tokenizer(x):
        rel = [int(tag) for tag in x.split()]
        return rel

    def _get_dataset(csv_data, char_to_idx, seq, tag, char_, char_len):
        examples = []
        fields = [('Seq', seq), ('Tag', tag), ('Char_', char_), ('Char_len', char_len)]
        for seq, tag in zip(csv_data['Seq'], csv_data['Tag']):
            char_list = [[char_to_idx[c] for c in word] for word in seq.strip().split()]
            char_len_list = [len(word) for word in seq.strip().split()]
            examples.append(
                data.Example.fromlist(
                    [seq, tag, pad_char_list(char_list), char_len_list], fields))
        return examples, fields

    seq = data.Field(sequential=True, use_vocab=True, lower=True)
    tag = data.Field(sequential=True, lower=False, use_vocab=False, tokenize=tag_tokenizer)
    char_ = data.Field(sequential=True, use_vocab=False, batch_first=True)
    char_len = data.Field(sequential=True, use_vocab=False, batch_first=True)
    char_vocab = data.Field(sequential=True, use_vocab=True,
                            tokenize=char_vocab_tokenizer)  # only used to build the character vocabulary

    get_charvocab_fields = [('Seq', char_vocab), ('None', None), ('None', None)]
    train = data.TabularDataset.splits(path='./Dataset',
                                       train='train.csv',
                                       format='csv',
                                       skip_header=True,
                                       fields=get_charvocab_fields)[0]
    char_vocab.build_vocab(train)  # character vocabulary

    # build the Datasets
    train_data = pd.read_csv('./Dataset/train.csv')
    val_data = pd.read_csv('./Dataset/valid.csv')
    test_data = pd.read_csv('./Dataset/test.csv')
    train_dataset = data.Dataset(*_get_dataset(
        train_data, char_vocab.vocab.stoi, seq, tag, char_, char_len))
    val_dataset = data.Dataset(*_get_dataset(val_data, char_vocab.vocab.stoi,
                                             seq, tag, char_, char_len))
    test_dataset = data.Dataset(*_get_dataset(test_data, char_vocab.vocab.stoi,
                                              seq, tag, char_, char_len))

    # build the word vocabulary
    seq.build_vocab(
        train_dataset,
        vectors=torchtext.vocab.Vectors(name='./Dataset/glove.6B.200d.txt'))

    # build the data iterators
    train_iter = data.BucketIterator(train_dataset,
                                     batch_size=1,
                                     shuffle=True,
                                     sort_key=lambda x: len(x.Seq),
                                     device=tc.device('cpu'))
    val_iter, test_iter = data.BucketIterator.splits(
        (val_dataset, test_dataset),
        batch_sizes=(1, 1),
        shuffle=False,
        repeat=False,
        sort=False,
        device=tc.device('cpu'))
    return seq, char_vocab, train_iter, test_iter, val_iter
def load_data(self, train_file, test_file, val_file=None): ''' Loads the data from files Sets up iterators for training, validation and test data Also create vocabulary and word embeddings based on the data Inputs: train_file (String): absolute path to training file test_file (String): absolute path to test file val_file (String): absolute path to validation file ''' # Loading Tokenizer NLP = spacy.load('en') def tokenizer(sent): return list(x.text for x in NLP.tokenizer(sent) if x.text != " ") # Creating Filed for data TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=self.config.max_sen_len) LABEL = data.Field(sequential=False, use_vocab=False) datafields = [("text", TEXT), ("label", LABEL)] # Load data from pd.DataFrame into torchtext.data.Dataset train_df = self.get_pandas_df(train_file) train_examples = [ data.Example.fromlist(i, datafields) for i in train_df.values.tolist() ] train_data = data.Dataset(train_examples, datafields) test_df = self.get_pandas_df(test_file) test_examples = [ data.Example.fromlist(i, datafields) for i in test_df.values.tolist() ] test_data = data.Dataset(test_examples, datafields) # If validation file exists, load it. Otherwise get validation data # from training data if val_file: val_df = self.get_pandas_df(val_file) val_examples = [ data.Example.fromlist(i, datafields) for i in val_df.values.tolist() ] val_data = data.Dataset(val_examples, datafields) else: train_data, val_data = train_data.split(split_ratio=0.8) TEXT.build_vocab(train_data) self.vocab = TEXT.vocab self.train_iterator = data.BucketIterator( (train_data), batch_size=self.config.batch_size, sort_key=lambda x: len(x.text), repeat=False, shuffle=True) self.val_iterator, self.test_iterator = data.BucketIterator.splits( (val_data, test_data), batch_size=self.config.batch_size, sort_key=lambda x: len(x.text), repeat=False, shuffle=False) print("Loaded {} training examples".format(len(train_data))) print("Loaded {} test examples".format(len(test_data))) print("Loaded {} validation examples".format(len(val_data)))
def main(): ############################### # PREPROCESSING ############################### datasets = ["train", "val", "test"] for dataset in datasets: if not os.path.exists(os.path.join("data", dataset + ".tsv")): print("Creating TSV for " + dataset) convert_to_tsv(dataset) print("Creating datasets", end='', flush=True) curr_time = datetime.now() article_field = data.ReversibleField(tensor_type=torch.cuda.LongTensor, lower=True, tokenize=tokenizer_in) summary_field = data.ReversibleField(tensor_type=torch.cuda.LongTensor, lower=True, tokenize=tokenizer_out, init_token='<sos>') train_set = data.TabularDataset(path='./data/train.tsv', format='tsv', fields=[('article', article_field), ('summary', summary_field)]) val_set = data.TabularDataset(path='./data/val.tsv', format='tsv', fields=[('article', article_field), ('summary', summary_field)]) diff_time, curr_time = get_time_diff(curr_time) print(", took {} min".format(diff_time)) print("Building vocabulary and creating batches", end='', flush=True) article_field.build_vocab(train_set, vectors="glove.6B.100d", max_size=encoder_vocab_size) summary_field.build_vocab(train_set, max_size=decoder_vocab_size) train_iter = data.BucketIterator(dataset=train_set, batch_size=batch_size, sort_key=lambda x: len(x.article), repeat=False, device=DEVICE) val_iter = data.BucketIterator(dataset=val_set, batch_size=batch_size, sort_key=lambda x: len(x.article), repeat=False, device=DEVICE) diff_time, curr_time = get_time_diff(curr_time) print(", took {} min".format(diff_time)) ############################### # MODEL CREATION ############################### print("Creating encoder and decoder models", end='', flush=True) encoder = EncoderLSTM(input_size=encoder_vocab_size, embed_size=embed_size, hidden_size=encoder_hidden_size, use_gpu=True, gpu_device=DEVICE, batch_size=batch_size) encoder.embedding.weight.data = article_field.vocab.vectors encoder.cuda(device=DEVICE) decoder = AttnDecoderLSTM(input_size=encoder_vocab_size, embed_size=embed_size, hidden_size=decoder_hidden_size, output_size=decoder_vocab_size, use_gpu=True, gpu_device=DEVICE, batch_size=batch_size) decoder.embedding.weight.data = article_field.vocab.vectors decoder.cuda(device=DEVICE) diff_time, curr_time = get_time_diff(curr_time) print(", took {} min".format(diff_time)) # Loss and SGD optimizers loss_func = nn.NLLLoss(ignore_index=1) # Ignore <pad> token encoder_opt = optim.Adam(encoder.parameters(), lr=lr) decoder_opt = optim.Adam(decoder.parameters(), lr=lr) ############################### # TRAINING ############################### print("Beginning training") tqdm_epoch = tqdm(range(num_epochs), desc="Epoch") for epoch in tqdm_epoch: train_iter.init_epoch() tqdm_batch = tqdm(train_iter, desc="Batch") for b_id, batch in enumerate(tqdm_batch): encoder.batch_size = batch.batch_size # Fixes weird bug where we get batch sizes that are not batch_size decoder.batch_size = batch.batch_size avg_loss = train(batch, encoder, decoder, encoder_opt, decoder_opt, loss_func, teacher_forcing_ratio) ############################### # TESTING ############################### # Load test set print("Loading test set") test_set = data.TabularDataset(path='./data/test.tsv', format='tsv', fields=[('article', article_field), ('summary', summary_field)]) test_iter = data.BucketIterator(dataset=test_set, batch_size=batch_size, sort_key=lambda x: len(x.article), repeat=False, device=DEVICE) print("Evaluating model") evaluate(encoder=encoder, decoder=decoder, dataset=test_iter, rev_field=article_field)
                max_size=30000,
                vectors="glove.6B.300d",
                unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

PAD_INDEX = SRC.vocab.stoi[PAD_TOKEN]
# SOS_INDEX = SRC.vocab.stoi[SOS_TOKEN]
# EOS_INDEX = SRC.vocab.stoi[EOS_TOKEN]
# print(LABEL.vocab.freqs.most_common(10))

#############################
# define iterator
train_iter = data.BucketIterator(train_data,
                                 batch_size=params['BATCH_SIZE'],
                                 device=DEVICE,
                                 sort_within_batch=True,
                                 sort_key=lambda x: len(x.text),
                                 train=True,
                                 repeat=False)
# train_iter = data.Iterator(train_data, batch_size=1, train=False, sort=False, repeat=False, device=DEVICE)
valid_iter = data.Iterator(valid_data, batch_size=1,
                           train=False, sort=False, repeat=False, device=DEVICE)
test_iter = data.Iterator(test_data, batch_size=1,
def load_pairs():
    TEXT1 = data.Field(fix_length=500)
    TEXT2 = data.Field(fix_length=500)
    LABEL = data.Field(sequential=False, is_target=True, use_vocab=False, dtype=torch.float64)
    ID = data.Field(sequential=False, is_target=True, use_vocab=False, dtype=torch.float64)
    ONEHOT = data.Field(sequential=False, is_target=True, use_vocab=False, dtype=torch.float32)

    # TEXT1 is a Field object: what ends up in the data structure is the
    # processed field, not the raw text.
    field = {
        'label': ('label', LABEL),
        'text1': ('text1', TEXT1),
        'text2': ('text2', TEXT2),
        'onehot1': ('onehot1', ONEHOT),
        'onehot2': ('onehot2', ONEHOT)
    }
    field1 = {
        'id': ('id', ID),
        'text': ('text', TEXT1),
        'label': ('label', LABEL),
        'onehot': ('onehot', ONEHOT)
    }

    # train_pairs uses the `field` dict above, so each example has a `text1`
    # attribute processed by TEXT1, and so on.
    train_pairs, valid_pairs = data.TabularDataset.splits(  # split the corpus
        path='./data/',
        train='train_pairs.json',
        validation='val_pairs.json',
        format='json',
        fields=field)
    train_data, test_data = data.TabularDataset.splits(
        path='./data/',
        train='compare_data_5.json',
        test='test_data.json',
        format='json',
        fields=field1)

    vectors = torchtext.vocab.Vectors(name='./data/fasttext.vec')
    # build_vocab builds the corpus vocabulary and loads the word embeddings:
    # vectors for the words present in the corpus are pulled out of the
    # pretrained vectors, which also builds the embedding matrix automatically.
    TEXT1.build_vocab(train_pairs, vectors=vectors)
    TEXT2.build_vocab(train_pairs, vectors=vectors)
    print('Length of TEXT1 Vocabulary:' + str(len(TEXT1.vocab)))
    print('Length of TEXT2 Vocabulary:' + str(len(TEXT2.vocab)))
    print('Dim of TEXT1,TEXT2:', TEXT1.vocab.vectors.size()[1], TEXT2.vocab.vectors.size()[1])

    train_pairs_iter, valid_pairs_iter = data.BucketIterator.splits(
        (train_pairs, valid_pairs), sort=False, batch_size=100,
        repeat=False, shuffle=True, device=torch.device('cuda:0'))
    train_data_iter = data.BucketIterator(train_data, sort=False, batch_size=5,
                                          repeat=False, shuffle=False,
                                          device=torch.device('cuda:0'))
    test_data_iter = data.BucketIterator(test_data, sort=False, batch_size=100,
                                         repeat=False, shuffle=True,
                                         device=torch.device('cuda:0'))
    return train_pairs_iter, valid_pairs_iter, train_data_iter, test_data_iter
    trg_sen_in = batch.trg[0][:, :-1]  # skip eos
    trg_sen = batch.trg[0][:, 1:]      # skip sos
    preds = model(src_sen, batch.src[1].cpu().numpy(), trg_sen_in)
    return src_sen, trg_sen, preds


if __name__ == "__main__":
    eng_field, fren_field, (train, val, test) = load_data()
    model = Seq2Seq_Translation(eng_field, fren_field)

    trg_mask = torch.ones(len(eng_field.vocab))
    trg_mask[eng_field.vocab.stoi["<pad>"]] = 0
    criterion = nn.NLLLoss(weight=trg_mask)
    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, 15)

    train_iter = data.BucketIterator(train, batch_size=64,
                                     sort_key=lambda ex: len(ex.src),
                                     sort_within_batch=True)
    examples = iter(data.BucketIterator(val, batch_size=1, train=False,
                                        shuffle=True, repeat=True))

    for epoch in range(20):
        scheduler.step()
        model.train()
        for i, batch in enumerate(train_iter):
            src_sen, trg_sen, preds = batch_forward(batch)
            loss = criterion(preds.contiguous().view(-1, preds.size(2)),
                             trg_sen.contiguous().view(-1))
            # writer.add_scalar('data/train_loss', loss.data[0], len(train_iter)*epoch + i)
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm(model.parameters(), 5.0)
            optimizer.step()
            if i == len(train_iter) - 1:
                break
def get_iterator(self, dataset):
    return data.BucketIterator(dataset,
                               batch_size=self.params['batch_size'],
                               shuffle=False)
def load_dataset(config, train_pos='train.pos', train_neg='train.neg', dev_pos='dev.pos', dev_neg='dev.neg', test_pos='test.pos', test_neg='test.neg'): root = config.data_path roots = re.split(', +', root) if len(roots) > 1: logger.info("Combining datasets...") files = {'train.pos':[], 'train.neg':[], 'dev.pos':[], \ 'dev.neg':[], 'test.pos':[], 'test.neg':[]} for dir_path in roots: for file in files.keys(): with open(dir_path + file, 'r', encoding='utf8') as f: files[file].extend(f.readlines()) for file, sents in files.items(): with open('./data/style_transfer/%s' % file, 'w', encoding='utf8') as f: for sent in sents: f.write('%s' % sent) root = './data/style_transfer/' TEXT = data.Field(batch_first=True, eos_token='<eos>') dataset_fn = lambda name: data.TabularDataset( path=root + name, format='tsv', fields=[('text', TEXT)] ) train_pos_set, train_neg_set = map(dataset_fn, [train_pos, train_neg]) dev_pos_set, dev_neg_set = map(dataset_fn, [dev_pos, dev_neg]) test_pos_set, test_neg_set = map(dataset_fn, [test_pos, test_neg]) TEXT.build_vocab(train_pos_set, train_neg_set, min_freq=config.min_freq) if config.load_pretrained_embed: start = time.time() vectors=torchtext.vocab.GloVe('6B', dim=config.embed_size, cache=config.pretrained_embed_path) TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim) print('vectors', TEXT.vocab.vectors.size()) print('load embedding took {:.2f} s.'.format(time.time() - start)) vocab = TEXT.vocab dataiter_fn = lambda dataset, train: data.BucketIterator( dataset=dataset, batch_size=config.batch_size, shuffle=train, repeat=train, sort_key=lambda x: len(x.text), sort_within_batch=False, device=config.device ) train_pos_iter, train_neg_iter = map(lambda x: dataiter_fn(x, True), [train_pos_set, train_neg_set]) dev_pos_iter, dev_neg_iter = map(lambda x: dataiter_fn(x, False), [dev_pos_set, dev_neg_set]) test_pos_iter, test_neg_iter = map(lambda x: dataiter_fn(x, False), [test_pos_set, test_neg_set]) train_iters = DatasetIterator(train_pos_iter, train_neg_iter) dev_iters = DatasetIterator(dev_pos_iter, dev_neg_iter) test_iters = DatasetIterator(test_pos_iter, test_neg_iter) return train_iters, dev_iters, test_iters, vocab
for split in ["train", "val", "test"]: my_data[split] = datasets.TranslationDataset(path="data/new_" + split, exts=('.nl', '.amr'), fields=(NL_SRC, AMR_SRC)) MIN_FREQ = 5 NL_SRC.build_vocab(my_data["train"].src, min_freq=MIN_FREQ) AMR_SRC.build_vocab(my_data["train"].trg, min_freq=MIN_FREQ) PAD_INDEX = AMR_SRC.vocab.stoi[PAD_TOKEN] print_data_info(my_data, NL_SRC, AMR_SRC) train_iter = data.BucketIterator(my_data["train"], batch_size=BATCH_SIZE, train=True, sort_within_batch=True, sort_key=lambda x: (len(x.src), len(x.trg)), repeat=False, device=DEVICE) valid_iter = data.Iterator(my_data["val"], batch_size=1, train=False, sort=False, repeat=False, device=DEVICE) model = make_autoencoder(len(NL_SRC.vocab), len(AMR_SRC.vocab), emb_size=500, hidden_size=500,
def main(): args_parser = argparse.ArgumentParser(description='Tuning with graph-based parsing') args_parser.add_argument('--cuda', action='store_true', help='using GPU') args_parser.add_argument('--num_epochs', type=int, default=200, help='Number of training epochs') args_parser.add_argument('--batch_size', type=int, default=64, help='Number of sentences in each batch') args_parser.add_argument('--hidden_size', type=int, default=256, help='Number of hidden units in RNN') args_parser.add_argument('--num_layers', type=int, default=1, help='Number of layers of RNN') args_parser.add_argument('--opt', choices=['adam', 'sgd', 'adamax'], help='optimization algorithm') args_parser.add_argument('--objective', choices=['cross_entropy', 'crf'], default='cross_entropy', help='objective function of training procedure.') args_parser.add_argument('--learning_rate', type=float, default=0.01, help='Learning rate') args_parser.add_argument('--decay_rate', type=float, default=0.05, help='Decay rate of learning rate') args_parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping') args_parser.add_argument('--gamma', type=float, default=0.0, help='weight for regularization') args_parser.add_argument('--epsilon', type=float, default=1e-8, help='epsilon for adam or adamax') args_parser.add_argument('--p_rnn', nargs=2, type=float, default=0.1, help='dropout rate for RNN') args_parser.add_argument('--p_in', type=float, default=0.33, help='dropout rate for input embeddings') args_parser.add_argument('--p_out', type=float, default=0.33, help='dropout rate for output layer') args_parser.add_argument('--schedule', type=int, help='schedule for learning rate decay') args_parser.add_argument('--unk_replace', type=float, default=0., help='The rate to replace a singleton word with UNK') #args_parser.add_argument('--punctuation', nargs='+', type=str, help='List of punctuations') args_parser.add_argument('--word_path', help='path for word embedding dict') args_parser.add_argument('--freeze', action='store_true', help='frozen the word embedding (disable fine-tuning).') # args_parser.add_argument('--char_path', help='path for character embedding dict') args_parser.add_argument('--train') # "data/POS-penn/wsj/split1/wsj1.train.original" args_parser.add_argument('--dev') # "data/POS-penn/wsj/split1/wsj1.dev.original" args_parser.add_argument('--test') # "data/POS-penn/wsj/split1/wsj1.test.original" args_parser.add_argument('--model_path', help='path for saving model file.', default='models/temp') args_parser.add_argument('--model_name', help='name for saving model file.', default='generator') args_parser.add_argument('--seq2seq_save_path', default='checkpoints4/seq2seq_save_model', type=str, help='seq2seq_save_path') args_parser.add_argument('--seq2seq_load_path', default='checkpoints4/seq2seq_save_model', type=str, help='seq2seq_load_path') # args_parser.add_argument('--rl_finetune_seq2seq_save_path', default='models/rl_finetune/seq2seq_save_model', # type=str, help='rl_finetune_seq2seq_save_path') # args_parser.add_argument('--rl_finetune_network_save_path', default='models/rl_finetune/network_save_model', # type=str, help='rl_finetune_network_save_path') # args_parser.add_argument('--rl_finetune_seq2seq_load_path', default='models/rl_finetune/seq2seq_save_model', # type=str, help='rl_finetune_seq2seq_load_path') # args_parser.add_argument('--rl_finetune_network_load_path', default='models/rl_finetune/network_save_model', # type=str, help='rl_finetune_network_load_path') 
args_parser.add_argument('--direct_eval', action='store_true', help='direct eval without generation process') args = args_parser.parse_args() spacy_en = spacy.load('en_core_web_sm') # python -m spacy download en spacy_de = spacy.load('de_core_news_sm') # python -m spacy download en spacy_fr = spacy.load('fr_core_news_sm') # python -m spacy download en SEED = 0 random.seed(SEED) np.random.seed(SEED) torch.manual_seed(SEED) torch.cuda.manual_seed(SEED) device = torch.device('cuda') #torch.device('cuda' if torch.cuda.is_available() else 'cpu') #'cpu' if not torch.cuda.is_available() else 'cuda:0' def tokenizer_en(text): # create a tokenizer function return [tok.text for tok in spacy_en.tokenizer(text)] def tokenizer_de(text): # create a tokenizer function return [tok.text for tok in spacy_de.tokenizer(text)] def tokenizer_fr(text): # create a tokenizer function return [tok.text for tok in spacy_fr.tokenizer(text)] en_field = data.Field(sequential=True, tokenize=tokenizer_en, lower=True, include_lengths=True, batch_first=True) #use_vocab=False fix_length=10 de_field = data.Field(sequential=True, tokenize=tokenizer_de, lower=True, include_lengths=True, batch_first=True) #use_vocab=False fr_field = data.Field(sequential=True, tokenize=tokenizer_fr, lower=True, include_lengths=True, batch_first=True) #use_vocab=False print('begin loading training data-----') # print('time: ', time.asctime( time.localtime(time.time()) )) seq2seq_train_data = MultiSourceTranslationDataset( path='wmt14_3/sample', exts=('.de', '.fr', '.en'), fields=(de_field, fr_field, en_field)) print('begin loading validation data-----') # print('time: ', time.asctime( time.localtime(time.time()) )) seq2seq_dev_data = MultiSourceTranslationDataset( path='wmt14_3/test', exts=('.de', '.fr', '.en'), fields=(de_field, fr_field, en_field)) print('end loading data-----') # print('time: ', time.asctime( time.localtime(time.time()) )) # en_train_data = datasets.TranslationDataset(path='wmt14_3/sample', exts=('.en', '.en'), fields=(en_field, en_field)) # print('end en data-----') # print('time: ', time.asctime( time.localtime(time.time()) )) # de_train_data = datasets.TranslationDataset(path='wmt14_3/sample', exts=('.de', '.de'), fields=(de_field, de_field)) # fr_train_data = datasets.TranslationDataset(path='wmt14_3/sample', exts=('.fr', '.fr'), fields=(fr_field, fr_field)) # en_field.build_vocab(en_train_data, max_size=80000) # ,vectors="glove.6B.100d" # de_field.build_vocab(de_train_data, max_size=80000) # ,vectors="glove.6B.100d" # fr_field.build_vocab(fr_train_data, max_size=80000) # ,vectors="glove.6B.100d" # vocab_thread = 20000+2 # with open(str(vocab_thread)+'_vocab_en.pickle', 'rb') as f: # en_field.vocab = pickle.load(f) # with open(str(vocab_thread)+'_vocab_de.pickle', 'rb') as f: # de_field.vocab = pickle.load(f) # with open(str(vocab_thread)+'_vocab_fr.pickle', 'rb') as f: # fr_field.vocab = pickle.load(f) with open('vocab_en.pickle', 'rb') as f: en_field.vocab = pickle.load(f) with open('vocab_de.pickle', 'rb') as f: de_field.vocab = pickle.load(f) with open('vocab_fr.pickle', 'rb') as f: fr_field.vocab = pickle.load(f) print('end build vocab-----') # print('time: ', time.asctime( time.localtime(time.time()) )) # trg_field.build_vocab(seq2seq_train_data, max_size=80000) # mt_dev shares the fields, so it shares their vocab objects train_iter = data.BucketIterator( dataset=seq2seq_train_data, batch_size=10, sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)), device=device, shuffle=True) # Note that if you are 
    # running on CPU, you must set device to -1; otherwise you can leave it at 0 for GPU.
    dev_iter = data.BucketIterator(
        dataset=seq2seq_dev_data, batch_size=10,
        sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)),
        device=device, shuffle=False)

    num_words_en = len(en_field.vocab.stoi)

    # Pretrain seq2seq model using denoising autoencoder. model name: seq2seq model
    EPOCHS = 100  # 150
    DECAY = 0.97
    # TODO: #len(en_field.vocab.stoi)  # ?? word_embedd ??
    word_dim = 300  # ??
    seq2seq = Seq2seq_Model(EMB=word_dim, HID=args.hidden_size, DPr=0.5,
                            vocab_size1=len(de_field.vocab.stoi),
                            vocab_size2=len(fr_field.vocab.stoi),
                            vocab_size3=len(en_field.vocab.stoi),
                            word_embedd=None, device=device).to(device)  # TODO: random init vocab
    # seq2seq.emb.weight.requires_grad = False
    print(seq2seq)

    loss_seq2seq = torch.nn.CrossEntropyLoss(reduction='none').to(device)
    parameters_need_update = filter(lambda p: p.requires_grad, seq2seq.parameters())
    optim_seq2seq = torch.optim.Adam(parameters_need_update, lr=0.0003)

    seq2seq.load_state_dict(torch.load(args.seq2seq_load_path + '_batch_' + str(2000000) + '.pt'))  # TODO: 10.7
    # torch.save(seq2seq.state_dict(), args.seq2seq_save_path + '_batch_' + str(ii) + '.pt')
    seq2seq.to(device)

    def count_parameters(model: torch.nn.Module):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f'The model has {count_parameters(seq2seq):,} trainable parameters')

    PAD_IDX = en_field.vocab.stoi['<pad>']
    # criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

    ii = 0  # 141500
    if True:  # i%1 == 0:
        seq2seq.eval()
        bleu_ep = 0
        acc_numerator_ep = 0
        acc_denominator_ep = 0
        testi = 0
        for _, batch in enumerate(dev_iter):  # for _ in range(1, num_batches + 1):
            # word, char, pos, heads, types, masks, lengths = conllx_data.get_batch_tensor(
            #     data_dev, batch_size, unk_replace=unk_replace)  # word:(32,50) char:(32,50,35)
            src1, lengths_src1 = batch.src1  # word:(32,50) 150,64
            src2, lengths_src2 = batch.src2  # word:(32,50) 150,64
            trg, lengths_trg = batch.trg
            sel, _ = seq2seq(src1.long().to(device), src2.long().to(device),
                             LEN=max(src1.size()[1], src2.size()[1]))  # TODO:
            sel = sel.detach().cpu().numpy()
            dec_out = trg.cpu().numpy()

            bleus = []
            for j in range(sel.shape[0]):
                bleu = get_bleu(sel[j], dec_out[j], num_words_en)  # sel
                bleus.append(bleu)
                numerator, denominator = get_correct(sel[j], dec_out[j], num_words_en)
                acc_numerator_ep += numerator
                acc_denominator_ep += denominator  # .detach().cpu().numpy() TODO: 10.8
            bleu_bh = np.average(bleus)
            bleu_ep += bleu_bh
            testi += 1
        bleu_ep /= testi  # num_batches
        print('testi: ', testi)
        print('Valid bleu: %.4f%%' % (bleu_ep * 100))
        # print(acc_denominator_ep)
        if acc_denominator_ep > 0:
            print('Valid acc: %.4f%%' % ((acc_numerator_ep * 1.0 / acc_denominator_ep) * 100))
def load_data(opt):
    # do not set fix_length
    TEXT = data.Field(sequential=True, fix_length=opt.max_text_len)  # word- or char-level
    LABEL = data.Field(sequential=False, use_vocab=False)

    # load from word/ or article/
    train_path = opt.data_path + opt.text_type + '/train_set.csv'
    val_path = opt.data_path + opt.text_type + '/val_set.csv'
    test_path = opt.data_path + opt.text_type + '/test_set.csv'
    train_path = 'D:/git/dataset/val_set.csv'
    test_path = 'D:/git/dataset/val_set.csv'
    val_path = 'D:/git/dataset/val_set.csv'

    # aug for data augmentation
    if opt.aug:
        print('make augmentation datasets!')
    train = GrandDataset(train_path, text_field=TEXT, label_field=LABEL,
                         text_type=opt.text_type, test=False, aug=opt.aug)
    val = GrandDataset(val_path, text_field=TEXT, label_field=LABEL,
                       text_type=opt.text_type, test=False)
    test = GrandDataset(test_path, text_field=TEXT, label_field=None,
                        text_type=opt.text_type, test=True)

    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    embedding_path = '{}/{}_{}.txt'.format(opt.embedding_path, opt.text_type, opt.embedding_dim)
    vectors = Vectors(name=embedding_path, cache=cache)
    print('load word2vec vectors from {}'.format(embedding_path))
    vectors.unk_init = init.xavier_uniform_  # how to initialise tokens missing from the pretrained vectors

    # build the vocab
    print('building {} vocabulary......'.format(opt.text_type))
    TEXT.build_vocab(train, val, test, min_freq=5, vectors=vectors)
    # LABEL.build_vocab(train)

    # build the iterators
    # For test_iter, shuffle, sort and repeat must all be False, otherwise torchtext scrambles the sample order.
    # For variable-length inputs, set sort_within_batch=True so each batch is sorted in descending order by sort_key.
    train_iter = data.BucketIterator(dataset=train, batch_size=opt.batch_size,
                                     shuffle=True, sort_within_batch=False,
                                     repeat=False, device=opt.device)
    # val_iter = data.BucketIterator(dataset=val, batch_size=opt.batch_size, sort_within_batch=False, repeat=False,
    #                                device=opt.device)
    # train_iter = data.Iterator(dataset=train, batch_size=opt.batch_size, train=True, repeat=False, device=opt.device)
    val_iter = data.Iterator(dataset=val, batch_size=opt.batch_size, shuffle=False,
                             sort=False, repeat=False, device=opt.device)
    test_iter = data.Iterator(dataset=test, batch_size=opt.batch_size, shuffle=False,
                              sort=False, repeat=False, device=opt.device)

    return train_iter, val_iter, test_iter, len(TEXT.vocab), TEXT.vocab.vectors
def caption_iterator(start_token, end_token, pad_token, train_meta_path, val_1_meta_path, val_2_meta_path, min_freq, batch_size, device, phase, use_categories, use_subs): spacy_en = spacy.load('en') print(f'Preparing dataset for {phase}') def tokenize_en(txt): return [token.text for token in spacy_en.tokenizer(txt)] CAPTION = data.ReversibleField( tokenize='spacy', init_token=start_token, eos_token=end_token, pad_token=pad_token, lower=True, batch_first=True, is_target=True ) INDEX = data.Field( sequential=False, use_vocab=False, batch_first=True ) if use_categories: # preprocessing: if there is no category replace with -1 (unique number) CATEGORY = data.Field( sequential=False, use_vocab=False, batch_first=True, preprocessing=data.Pipeline(lambda x: -1 if len(x) == 0 else int(float(x))) ) # filter the dataset if the a category is missing (31 -> 41 (count = 1 :())) filter_pred = lambda x: vars(x)['category_32'] != -1 and vars(x)['category_32'] != 31 else: CATEGORY = None filter_pred = None if use_subs: SUBS = data.ReversibleField( tokenize='spacy', init_token=start_token, eos_token=end_token, pad_token=pad_token, lower=True, batch_first=True ) else: SUBS = None # the order has to be the same as in the table fields = [ ('video_id', None), ('caption', CAPTION), ('start', None), ('end', None), ('duration', None), ('category_32', CATEGORY), ('subs', SUBS), ('phase', None), ('idx', INDEX), ] dataset = data.TabularDataset( path=train_meta_path, format='tsv', skip_header=True, fields=fields, filter_pred=filter_pred ) CAPTION.build_vocab(dataset.caption, min_freq=min_freq) train_vocab = CAPTION.vocab train_subs_vocab = None if use_subs: SUBS.build_vocab(dataset.subs, min_freq=min_freq) train_subs_vocab = SUBS.vocab if phase == 'val_1': dataset = data.TabularDataset( path=val_1_meta_path, format='tsv', skip_header=True, fields=fields, filter_pred=filter_pred ) elif phase == 'val_2': dataset = data.TabularDataset( path=val_2_meta_path, format='tsv', skip_header=True, fields=fields, filter_pred=filter_pred ) # sort_key = lambda x: data.interleave_keys(len(x.caption), len(x.caption)) sort_key = lambda x: 0 #len(x.caption) datasetloader = data.BucketIterator( dataset, batch_size, sort_key=sort_key, device=device, repeat=False, shuffle=True ) return train_vocab, train_subs_vocab, datasetloader
def train(): # Logger. logger = helpers.get_logger('training') helpers.log_args(logger, args) # Prepare training and testing data. TEXT = data.Field(lower=True, tokenize=helpers.tokenize, batch_first=True) LABEL = data.Field(sequential=False) fields = [('label', LABEL), ('text', TEXT)] train_set = data.TabularDataset(args.train_file, 'csv', fields) logger.info(f'Loaded training data: {args.train_file}') TEXT.build_vocab(train_set, max_size=args.max_size, min_freq=args.min_freq, vectors=args.pretrained_embeddings) LABEL.build_vocab(train_set) train_set, valid_set = helpers.split_data(train_set, fields, args.random_seed, args.valid_split) logger.info(f'Number of training examples: {len(train_set.examples)}') logger.info(f'Number of validation examples: {len(valid_set.examples)}') logger.info(f'Size of vocabulary: {len(TEXT.vocab)}') logger.info(f'Number of labels: {len(LABEL.vocab)}') # Initiate criterion, classifier, and optimizer. classifier = CNNClassifier(vocab_size=len(TEXT.vocab), labelset_size=len(LABEL.vocab), embedding_dim=args.embedding_dim, num_layers=args.num_layers, filter_mapping=eval(args.filter_mapping), dropout_prob=args.dropout_prob, pretrained_embeddings=TEXT.vocab.vectors) if args.cuda: classifier.cuda(device=args.device_id) criterion = nn.NLLLoss() optimizer = optim.Adam(classifier.parameters(), args.learning_rate) iterator = data.BucketIterator(dataset=train_set, batch_size=args.batch_size, sort_key=lambda x: len(x.text), device=args.device_id if args.cuda else -1) patience = args.patience min_valid_loss = None for batch in iterator: optimizer.zero_grad() log_probs = classifier(batch.text) loss = criterion(log_probs, batch.label) if args.beta > 0: loss = loss - args.beta * helpers.calc_entropy(log_probs) loss.backward() optimizer.step() progress, epoch = math.modf(iterator.epoch) if iterator.iterations % args.logging_interval == 0: valid_loss, accuracy = helpers.evaluate( valid_set, args.batch_size, classifier, args.device_id if args.cuda else -1) logger.info(f'Epoch {int(epoch):2} | ' f'progress: {progress:<6.2%} | ' f'training loss: {loss.data[0]:6.4f} | ' f'validation loss: {valid_loss:6.4f} | ' f'validation accuracy: {accuracy:<6.2%} |') classifier.train() if min_valid_loss is None: min_valid_loss = valid_loss if valid_loss < min_valid_loss + args.threshold: patience = args.patience min_valid_loss = min(valid_loss, min_valid_loss) else: patience -= 1 if patience == 0: logger.info( f'Patience of {args.patience} reached, decaying learning rate' ) helpers.decay_learning_rate(optimizer, args.decay_factor) patience = args.patience if epoch == args.num_epochs: break # Optional testing after training is done. if args.test_file is not None: test_set = data.TabularDataset(args.test_file, 'csv', fields) logger.info(f'Loaded testing data {args.test_file}') test_loss, accuracy = helpers.evaluate( test_set, args.batch_size, classifier, args.device_id if args.cuda else -1) logger.info(f'Testing loss: {test_loss:6.4f}') logger.info(f'Testing accuracy: {accuracy:<6.2%}')