def save_small_shard_vocab() -> None:
    """Read the small shard dataset and persist EN/FR vocabulary objects."""
    dataset = ShardedCSVDataset(WMT14_EN_FR_SMALL_TRAIN_SHARD)
    # Row layout: column 0 is the English sentence, column 1 the French one.
    english_vocab = build_vocab(dataset, lambda row: row[0], unk_cutoff=2)
    dataset.reset()  # rewind the shard iterator before the second pass
    french_vocab = build_vocab(dataset, lambda row: row[1], unk_cutoff=2)
    save_vocab(english_vocab, SMALL_TRAIN_EN_VOCAB_FILE)
    save_vocab(french_vocab, SMALL_TRAIN_FR_VOCAB_FILE)
def main(config_file='config/bert_config.json'): """Main method for training. Args: config_file: in config dir """ # 0. Load config and mkdir with open(config_file) as fin: config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d)) get_path(os.path.join(config.model_path, config.experiment_name)) get_path(config.log_path) if config.model_type == 'rnn': # build vocab for rnn build_vocab(file_in=config.all_train_file_path, file_out=os.path.join(config.model_path, 'vocab.txt')) # 1. Load data data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'), max_seq_len=config.max_seq_len, model_type=config.model_type) datasets = data.load_train_and_valid_files( train_file=config.train_file_path, valid_file=config.valid_file_path) train_set, valid_set_train, valid_set_valid = datasets if torch.cuda.is_available(): device = torch.device('cuda') torch.distributed.init_process_group(backend="nccl") sampler_train = DistributedSampler(train_set) else: device = torch.device('cpu') sampler_train = RandomSampler(train_set) data_loader = { 'train': DataLoader(train_set, sampler=sampler_train, batch_size=config.batch_size), 'valid_train': DataLoader(valid_set_train, batch_size=config.batch_size, shuffle=False), 'valid_valid': DataLoader(valid_set_valid, batch_size=config.batch_size, shuffle=False) } # 2. Build model model = MODEL_MAP[config.model_type](config) model.to(device) if torch.cuda.is_available(): model = torch.nn.parallel.DistributedDataParallel( model, find_unused_parameters=True) # 3. Train trainer = Trainer(model=model, data_loader=data_loader, device=device, config=config) best_model_state_dict = trainer.train() # 4. Save model torch.save(best_model_state_dict, os.path.join(config.model_path, 'model.bin'))
    def __init__(self, data_path, train=False, longest_sequence_length=None):
        """Build a dataset from the two style files ``data_path + '.0'/'.1'``.

        Args:
            data_path: path prefix; '.0' (negative) and '.1' (positive)
                suffixed files are loaded.
            train: when True, build the vocabulary file if it is missing.
            longest_sequence_length: optional fixed max length; when None it
                is computed from the data via update_the_max_length().
        """
        data0 = load_sent(data_path + '.0')
        data1 = load_sent(data_path + '.1')
        print(
            f'\n------------------------ Building a Dataset ------------------------'
        )
        print(f'#sents of {data_path}.0 file 0: {len(data0)}'
              )  # list of list of tokenized words
        print(f'#sents of {data_path}.1 file 1: {len(data1)}'
              )  # list of list of tokenized words
        self.data_all = data0 + data1
        # Parallel style labels: data0 is all neg (0), data1 is all pos (1).
        self.style_list = [0 for i in data0] + [
            1 for i in data1
        ]  # data0 is all neg, data1 is all pos
        # sorting all the data according to their seq lengths in descending order
        zip_item = zip(self.data_all, self.style_list)
        sorted_item = sorted(zip_item, key=lambda p: len(p[0]), reverse=True)
        tuple_item = zip(*sorted_item)
        self.data_all, self.style_list = [list(t) for t in tuple_item]
        print(f'len(self.data_all) : {len(self.data_all)}')
        print(f'len(self.style_list): {len(self.style_list)}')
        if train:
            print('\ntrain: True')
            # Only the training run is allowed to (re)build the vocab file.
            if not os.path.isfile(cfg.vocab):
                print(f'{cfg.vocab} does not exist')
                print('Building Vocab...')
                build_vocab(data0 + data1, cfg.vocab)
            else:
                print(f'{cfg.vocab} already exists')
        self.vocab = Vocabulary(cfg.vocab, cfg.embedding_file, cfg.embed_dim)
        print('\nvocabulary size:', self.vocab.size)
        print(
            f'vocabulary embedding matrix shape: {self.vocab.embedding.shape}')
        # print(type(self.vocab.embedding))  # np array
        self.longest_sequence_length = longest_sequence_length
        if longest_sequence_length is None:
            # Derive the max length from the loaded sentences.
            self.update_the_max_length()
        print(f'self.longest_sequence_length: {self.longest_sequence_length}')
        print(
            f'--------------------------------------------------------------------'
        )
def _build_main_vocab(self, min_vocab_count): def token_stream(): for path, lang in self._vocab_files: with open(path, errors='ignore') as file_: content = file_.read() for token in tokenizer.tokenize(content, self._mode): yield token return vocab.build_vocab(token_stream(), min_vocab_count)
    def __init__(self, arg, load_model=False):
        """Set up vocabulary, embeddings, datasets, model and optimizer.

        Args:
            arg: parsed argument namespace with all hyper-parameters.
            load_model: when True, restore a previously saved checkpoint
                via self.load_model().
        """
        self.args = arg
        self.vocab = build_vocab(self.args)
        self.embedding_matrix = build_embedding_matrix(self.args, self.vocab)
        # Train/val/test splits share the same embedding matrix.
        self.dataset_train = Dataset(self.args, 'train', self.embedding_matrix)
        self.dataset_val = Dataset(self.args, 'val', self.embedding_matrix)
        self.dataset_test = Dataset(self.args, 'test', self.embedding_matrix)
        self.model = Model(self.args)
        self.optimizer = optim.Adam(self.model.parameters())
        # Bookkeeping for training progress and best accuracies seen so far.
        self._epoch = 0
        self._iter = 0
        self.max_val_acc = None
        self.max_test_acc = None
        if load_model:
            self.load_model()
def create_datasets():
    """Parse CoNLL-2003 train/valid data and build word/output vocabularies.

    Returns:
        (train_dataset, valid_dataset, word_vocab, output_categories)
    """
    train_ds = conlldataloader.ConllDataSet(constants.CONLL2003_TRAIN)
    valid_ds = conlldataloader.ConllDataSet(constants.CONLL2003_VALID)

    print('processing train dataset')
    train_ds.parse_file()
    print('finished processing train dataset')

    # Vocabularies are built from the training split only.
    print('build training vocab')
    word_vocab = vocab.build_vocab(train_ds.word_list)
    print('done building vocab')

    print('build output vocab')
    output_categories = vocab.build_output_vocab(train_ds.categories)
    print('done building output vocab')

    print('processing valid dataset')
    valid_ds.parse_file()
    print('finished processing valid dataset')

    return train_ds, valid_ds, word_vocab, output_categories
def create_datasets() -> CADECLoadedDatasetType:
    """Parse the SCIERC train/valid datasets and build the vocabularies.

    Returns:
        (train_dataset, valid_dataset, word_vocab, output_categories)
    """
    train_conll, valid_conll, _, _, _, _ = get_constants()
    train_ds = SCIERCDataset(train_conll)
    valid_ds = SCIERCDataset(valid_conll)

    print('processing train dataset')
    train_ds.parse_file()
    print('finished processing train dataset')

    print('processing valid dataset')
    valid_ds.parse_file()
    print('finished processing valid dataset')

    # Vocabularies are built from the training split only.
    print('build training vocab')
    word_vocab = vocab.build_vocab(train_ds.word_list)
    print('done building vocab')

    print('build output vocab')
    output_categories = vocab.build_output_vocab(train_ds.categories)
    print('done building output vocab')

    return train_ds, valid_ds, word_vocab, output_categories
def load_model(): model = keras.models.load_model(BEST_MODEL_FILE, custom_objects={ 'TokenAndPositionEmbedding': TokenAndPositionEmbedding, 'TransformerBlock': TransformerBlock }) return model batch_size = 32 # Create a list all files text_ds, index_to_word, word_to_index = build_vocab(directories, batch_size, vocab_size, maxlen) def tokenizeString(s): return [word_to_index.get(_, 1) for _ in s.split()] class TextGenerator(keras.callbacks.Callback): def __init__(self, max_tokens, start_tokens): self.max_tokens = max_tokens self.start_tokens = start_tokens self.k = 10 def sample_from(self, logits): logits, indices = tf.math.top_k(logits, k=self.k, sorted=True) indices = np.asarray(indices).astype("int32")
def load_words(dataset, cutoff=1):
    """Build and return a vocabulary over *dataset*.

    Args:
        dataset: iterable of samples handed to build_vocab.
        cutoff: minimum-frequency threshold forwarded to build_vocab.
    """
    return build_vocab(dataset, cutoff)
from dataset import load_data, load_data_to_one_str
import torch
from torch.autograd import Variable
from utils import similarity, mean_vectors
from torch.nn import Embedding

# Paths and model hyper-parameters for the LSTM sentence-embedding script.
data = 'data'
emb_dir = 'embedding/'
emb_file = 'wiki.pl'  # Polish fastText-style .vec embeddings
embedding_dim = 300
hidden_dim = 50
sentence_length = 8
vocab_file = 'tmp/vocab.txt'

# Build the vocabulary directly from the embedding .vec file, then load it.
build_vocab([
    emb_dir+emb_file+'.vec'
], 'tmp/vocab.txt')
vocab = Vocab(filename=vocab_file)
emb: Embedding = load_embedding_model(data=data,
                                      emb_dir=emb_dir,
                                      emb_file=emb_file,
                                      vocab=vocab,
                                      input_dim=embedding_dim)
model = LSTMEmbeddings(embedding_dim, hidden_dim, vocab.size(),
                       sentence_length, emb)
# Encode two sample corpora as fixed-length index sequences.
encoded_file_animals = load_data(data+"/animals.txt", vocab, sentence_length)
encoded_file_buildings = load_data(data+"/buildings.txt", vocab, sentence_length)
# encoded_file_batman_beyond = \
#     load_data_to_one_str(data+"/shows/Batman Beyond/batman_beyond_-_1x01_-_rebirth_-_part_1_.vpc.txt", vocab, sentence_length)
#
# encoded_file_batman_animated = \
#     load_data_to_one_str(data + "/shows/Batman: The Animated Series/01. On Leather Wings.txt", vocab, sentence_length)
# encoded_file_dharma_greg = \
train3 = load_sent(args.train + '.3', args.max_train_size) train4 = load_sent(args.train + '.4', args.max_train_size) train5 = load_sent(args.train + '.5', args.max_train_size) print '#sents of training file 0:', len(train0) print '#sents of training file 1:', len(train1) print '#sents of training file 2:', len(train2) print '#sents of training file 3:', len(train3) print '#sents of training file 4:', len(train4) print '#sents of training file 5:', len(train5) # loaded all three datasets here. Train once with 0-1 and once with 0-2 print("=====got here training=====") # grand vocabulary if not os.path.isfile(args.vocab): build_vocab(train0 + train1 + train2 + train3 + train4 + train5, args.vocab) vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb) print 'vocabulary 1 size:', vocab.size # introduce a second input argument, "vocab2" # vocab2 = Vocabulary(args.vocab2, args.embedding, args.dim_emb) # print 'vocabulary 2 size:', vocab2.size if args.dev: dev0 = load_sent(args.dev + '.0') dev1 = load_sent(args.dev + '.1') dev2 = load_sent(args.dev + '.2') dev3 = load_sent(args.dev + '.3') dev4 = load_sent(args.dev + '.4')
from torch.utils.data import DataLoader import torch import torch.nn as nn import vocab from dataset import SNLITrainDataset, SNLIDevDataset import torch.optim as optim from sklearn.metrics import classification_report import numpy as np LABEL_MAP = {'neutral': 0, 'contradiction': 1, 'entailment': 2} VOCAB = vocab.build_vocab() # setting device on GPU if available, else CPU DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print(DEVICE) class Baseline(nn.Module): def __init__(self, embed_size=100, hidden_size=100, projection_dim=200, dropout_rate=0.2): """ Args: embed_size : int Size of input embeddings hidden_size : int Size of the LSTM hidden state projection_dim: int Size of the linear projection layers dropout_rate: float
def main(config_file='config/bert_config.json'): """Main method for training. Args: config_file: in config dir """ global datasets # 0. Load config and mkdir with open(config_file) as fin: config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d)) get_path(os.path.join(config.model_path, config.experiment_name)) get_path(config.log_path) if config.model_type in ['rnn', 'lr', 'cnn']: # build vocab for rnn build_vocab(file_in=config.all_train_file_path, file_out=os.path.join(config.model_path, 'vocab.txt')) # 1. Load data data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'), max_seq_len=config.max_seq_len, model_type=config.model_type, config=config) def load_dataset(): datasets = data.load_train_and_valid_files( train_file=config.train_file_path, valid_file=config.valid_file_path) return datasets if config.serial_load: datasets = SERIAL_EXEC.run(load_dataset) else: datasets = load_dataset() train_set, valid_set_train, valid_set_valid = datasets if torch.cuda.is_available(): device = torch.device('cuda') # device = torch.device('cpu') # torch.distributed.init_process_group(backend="nccl") # sampler_train = DistributedSampler(train_set) sampler_train = RandomSampler(train_set) else: device = torch.device('cpu') sampler_train = RandomSampler(train_set) # TPU device = xm.xla_device() sampler_train = torch.utils.data.distributed.DistributedSampler( train_set, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True) data_loader = { 'train': DataLoader(train_set, sampler=sampler_train, batch_size=config.batch_size), 'valid_train': DataLoader(valid_set_train, batch_size=config.batch_size, shuffle=False), 'valid_valid': DataLoader(valid_set_valid, batch_size=config.batch_size, shuffle=False) } # 2. Build model # model = MODEL_MAP[config.model_type](config) model = WRAPPED_MODEL #load model states. 
# if config.trained_weight: # model.load_state_dict(torch.load(config.trained_weight)) model.to(device) if torch.cuda.is_available(): model = model # model = torch.nn.parallel.DistributedDataParallel( # model, find_unused_parameters=True) # 3. Train trainer = Trainer(model=model, data_loader=data_loader, device=device, config=config) # best_model_state_dict = trainer.train() if config.model_type == 'bert': no_decay = ['bias', 'gamma', 'beta'] optimizer_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.0 }] optimizer = AdamW(optimizer_parameters, lr=config.lr, betas=(0.9, 0.999), weight_decay=1e-8, correct_bias=False) else: # rnn optimizer = Adam(model.parameters(), lr=config.lr) # if config.model_type == 'bert': # scheduler = get_linear_schedule_with_warmup( # optimizer, # num_warmup_steps=config.num_warmup_steps, # num_training_steps=config.num_training_steps) # else: # rnn # scheduler = get_constant_schedule(optimizer) criterion = nn.CrossEntropyLoss() def train_loop_fn(loader): tracker = xm.RateTracker() model.train() for x, batch in enumerate(loader): # batch = tuple(t.to(self.device) for t in batch) output = model(*batch[:-1]) # the last one is label loss = criterion(output, batch[-1]) loss.backward() # xm.optimizer_step(optimizer) # optimizer.zero_grad() tracker.add(FLAGS.batch_size) if (x + 1) % config.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm) # after 梯度累加的基本思想在于,在优化器更新参数前,也就是执行 optimizer.step() 前,进行多次反向传播,是的梯度累计值自动保存在 parameter.grad 中,最后使用累加的梯度进行参数更新。 xm.optimizer_step(optimizer) optimizer.zero_grad() if xm.get_ordinal() == 0: if x % FLAGS.log_steps == 0: print( '[xla:{}]({}) Loss={:.5f} Rate={:.2f} GlobalRate={:.2f} Time={}' .format(xm.get_ordinal(), x, loss.item(), tracker.rate(), 
tracker.global_rate(), time.asctime()), flush=True) def test_loop_fn(loader): total_samples = 0 correct = 0 model.eval() data, pred, target = None, None, None tracker = xm.RateTracker() for x, batch in enumerate(loader): output = model(*batch[:-1]) # the last one is label target = batch[-1] # pred = output.max(1, keepdim=True)[1] # correct += pred.eq(target.view_as(pred)).sum().item() for i in range(len(output)): logits = output[i] pred = int(torch.argmax(logits, dim=-1)) if pred == target[i]: correct += 1 total_samples += len(output) if xm.get_ordinal() == 0: if x % FLAGS.log_steps == 0: print( '[xla:{}]({}) Acc={:.5f} Rate={:.2f} GlobalRate={:.2f} Time={}' .format(xm.get_ordinal(), x, correct * 1.0 / total_samples, tracker.rate(), tracker.global_rate(), time.asctime()), flush=True) accuracy = 100.0 * correct / total_samples if xm.get_ordinal() == 0: print('[xla:{}] Accuracy={:.2f}%'.format(xm.get_ordinal(), accuracy), flush=True) return accuracy, data, pred, target # Train and eval loops accuracy = 0.0 data, pred, target = None, None, None for epoch in range(FLAGS.num_epoch): para_loader = pl.ParallelLoader(data_loader['train'], [device]) train_loop_fn(para_loader.per_device_loader(device)) xm.master_print("Finished training epoch {}".format(epoch)) # para_loader = pl.ParallelLoader(data_loader['valid_train'], [device]) # accuracy_train, data, pred, target = test_loop_fn(para_loader.per_device_loader(device)) para_loader = pl.ParallelLoader(data_loader['valid_valid'], [device]) accuracy_valid, data, pred, target = test_loop_fn( para_loader.per_device_loader(device)) xm.master_print("Finished test epoch {}, valid={:.2f}".format( epoch, accuracy_valid)) if FLAGS.metrics_debug: xm.master_print(met.metrics_report()) # 4. 
Save model # if xm.get_ordinal() == 0: # # if epoch==FLAGS.num_epoch-1: # # WRAPPED_MODEL.to('cpu') # torch.save(WRAPPED_MODEL.state_dict(), os.path.join( # config.model_path, config.experiment_name, # config.model_type + '-' + str(epoch + 1) + '.bin')) # xm.master_print('saved model.') # WRAPPED_MODEL.to(device) return accuracy_valid
print('Loading model from', os.path.join(args.classifier_path, 'model')) model.saver.restore(sess, os.path.join(args.classifier_path, 'model')) else: print('Creating model with fresh parameters.') sess.run(tf.global_variables_initializer()) if not os.path.exists(args.classifier_path): os.makedirs(args.classifier_path) return model if __name__ == '__main__': args = load_arguments() if not os.path.isfile(args.vocab): build_vocab(args.train_path, args.vocab, lang=args.lang) vocab = Vocabulary(args.vocab) print('vocabulary size', vocab.size) loader = ClassificationBatcher(args, vocab) config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: model = create_model(sess, args, vocab) batches = loader.get_batches(mode='train') start_time = time.time() loss = 0.0
def main():
    """Train the image-captioning decoder and greedily caption one val image.

    The three local flags control which stages run: vocabulary construction,
    image encoding, and decoder training.
    """
    construct_vocab = False
    encode_images = False
    train = True

    # Read and Process Raw data
    data = CaptioningData()
    # Finding image files as data
    data.set_all_images(cfg.images_path)
    captions_dict = data.get_captions(cfg.token_file)
    caption_maxlen = data.get_caption_maxlen()

    # Construct vocabulary
    if construct_vocab:
        # get all caption to construct Vocab
        all_captions = data.get_all_captions()
        vocab = build_vocab(vocab_path=cfg.data_path,
                            vocab_name=cfg.vocab_name,
                            captions=all_captions,
                            threshold=2)
    else:
        vocab = load_vocab(vocab_path=cfg.data_path,
                           vocab_name=cfg.vocab_name)
    # print(vocab.word2idx)

    inception_encoding = Encoder()

    # train data
    if train:
        train_images = data.get_train_images(cfg.train_image_files)
        train_pairs = [
            ImgCaptionPair(img_id, captions_dict[img_id])
            for img_id in train_images
        ]

        # Image Encoding: either (re)compute encodings or reuse a cached file.
        if encode_images:
            train_img_encoding = inception_encoding.encode_images(
                file_path=cfg.images_path,
                image_list=train_images,
                encoding_file=cfg.train_img_encoding_file)
        else:
            train_img_encoding = inception_encoding.load_image_encoding(
                encoding_file=cfg.train_img_encoding_file)

        train_data_generator = data_generator(vocab,
                                              train_pairs,
                                              train_img_encoding,
                                              batch_size=1800,
                                              max_len=caption_maxlen)
    # next(g)

    # Decoder model
    decoder = Decoder(vocab_size=len(vocab),
                      embedding_size=300,
                      input_shape=2048,
                      caption_max_len=caption_maxlen)
    decoder_model = decoder.get_model()
    # NOTE(review): resumes from a hard-coded checkpoint; fails if absent.
    decoder_model.load_weights('best_weights.97-0.95.hdf5')

    if train:
        decoder_model.compile(loss='categorical_crossentropy',
                              optimizer=RMSprop(),
                              metrics=['accuracy'])
        # Periodic checkpoint every 30 epochs plus a best-loss checkpoint.
        ckpt = ModelCheckpoint('weights.{epoch:02d}-{loss:.2f}.hdf5',
                               monitor='loss',
                               verbose=0,
                               save_best_only=False,
                               save_weights_only=False,
                               mode='auto',
                               period=30)
        best_ckpt = ModelCheckpoint('best_weights.{epoch:02d}-{loss:.2f}.hdf5',
                                    monitor='loss',
                                    verbose=0,
                                    save_best_only=True,
                                    save_weights_only=False,
                                    mode='auto',
                                    period=1)
        decoder_model.fit_generator(train_data_generator,
                                    steps_per_epoch=30,
                                    epochs=100,
                                    callbacks=[ckpt, best_ckpt])
        decoder_model.save('decoder_model.h5')

    # Greedy decoding demo on a single validation image (index 9).
    img_ids = data.get_val_images(cfg.val_image_files)
    img_name = img_ids[9]
    enc_img = inception_encoding.encode_single_img(file_path=cfg.images_path,
                                                   img_name=img_name)

    caption = ["<start>"]
    while True:
        par_caps = [vocab(i) for i in caption]
        par_caps = sequence.pad_sequences([par_caps],
                                          maxlen=40,
                                          padding='post')
        preds = decoder_model.predict(
            [np.array([enc_img]), np.array(par_caps)])
        # Append the argmax word; stop at <end> or a 40-token cap.
        word_pred = vocab.idx2word[np.argmax(preds[0])]
        caption.append(word_pred)
        if word_pred == "<end>" or len(caption) > 40:
            break

    full_img_path = os.path.join(cfg.images_path, img_name)
    print(captions_dict[img_name])
    print(full_img_path)
    print(' '.join(caption[1:-1]))
model.inputs: batch['inputs'], model.targets: batch['targets'], model.weights: batch['weights'], model.dropout: 1}) n_words += np.sum(batch['weights']) return np.exp(tot_loss / n_words) if __name__ == '__main__': args = load_arguments() if args.train: train = load_sent(args.train) if not os.path.isfile(args.vocab): build_vocab(train, args.vocab) vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb) print 'vocabulary size', vocab.size if args.dev: dev = load_sent(args.dev) if args.test: test = load_sent(args.test) config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: model = create_model(sess, args, vocab) if args.train:
import json

from dataset import CodeDataset
from vocab import build_vocab, write_vocab, Vocab
from nltk.stem import PorterStemmer
import pymongo

client = pymongo.MongoClient('NOM_CLIENT')
db = client.codes
stemmer = PorterStemmer()


def add_voc(vocab):
    """Extend *vocab* with the stemmed CIM and CCAM code identifiers.

    Args:
        vocab: list of vocabulary tokens.

    Returns:
        A new list: *vocab* plus the stemmed keys of data/cim.json and
        data/ccam.json.
    """
    with open('data/cim.json', 'r') as df:
        cim = json.load(df)
    with open('data/ccam.json', 'r') as df:
        ccam = json.load(df)
    vocab = vocab + [stemmer.stem(c) for c in cim.keys()] \
        + [stemmer.stem(c) for c in ccam.keys()]
    return vocab


if __name__ == "__main__":
    dataset_cim = CodeDataset(db.cim)
    dataset_ccam = CodeDataset(db.ccam)
    vocab_cim = build_vocab(dataset_cim)
    vocab_ccam = build_vocab(dataset_ccam)
    write_vocab(vocab_cim + vocab_ccam, "data/vocab.txt")
    # BUG FIX: the original called add_voc(vocab), but no name `vocab` is
    # bound here (only build_vocab/write_vocab/Vocab are imported from the
    # vocab module), so it raised NameError. Pass the combined vocabulary
    # that was just built and written.
    all_vocab = add_voc(vocab_cim + vocab_ccam)
    write_vocab(all_vocab, "data/vocab_all.txt")
print 'Creating model with fresh parameters.' sess.run(tf.global_variables_initializer()) return model if __name__ == '__main__': args = load_arguments() ##### data preparation ##### if args.train: train0 = load_sent(args.train + '.0', args.max_train_size) train1 = load_sent(args.train + '.1', args.max_train_size) print '#sents of training file 0:', len(train0) print '#sents of training file 1:', len(train1) if not os.path.isfile(args.vocab): build_vocab(train0 + train1, args.vocab) vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb) print 'vocabulary size:', vocab.size if args.dev: dev0 = load_sent(args.dev + '.0') dev1 = load_sent(args.dev + '.1') if args.test: test0 = load_sent(args.test + '.0') test1 = load_sent(args.test + '.1') config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess:
def main(config_file='config/bert_config.json'): """Main method for training. Args: config_file: in config dir """ global datasets # 0. Load config and mkdir with open(config_file) as fin: config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d)) get_path(os.path.join(config.model_path, config.experiment_name)) get_path(config.log_path) if config.model_type in ['rnn', 'lr', 'cnn']: # build vocab for rnn build_vocab(file_in=config.all_train_file_path, file_out=os.path.join(config.model_path, 'vocab.txt')) # 1. Load data data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'), model_type=config.model_type, config=config) def load_dataset(): train_dataset, collate_fn = data.load_train_and_valid_files( train_file=config.train_file_path) return train_dataset, collate_fn if config.serial_load: train_set, collate_fn = SERIAL_EXEC.run(load_dataset) else: train_set, collate_fn = load_dataset() if torch.cuda.is_available(): device = torch.device('cuda') sampler_train = RandomSampler(train_set) else: device = torch.device('cpu') sampler_train = RandomSampler(train_set) # TPU device = xm.xla_device() sampler_train = torch.utils.data.distributed.DistributedSampler( train_set, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True) data_loader = { 'train': DataLoader( train_set, batch_size=config.batch_size, sampler=sampler_train, collate_fn=collate_fn, drop_last=True, ) } # 2. Build model # model = MODEL_MAP[config.model_type](config) model = WRAPPED_MODEL #load model states. # if config.trained_weight: # model.load_state_dict(torch.load(config.trained_weight)) model.to(device) if torch.cuda.is_available(): model = model # model = torch.nn.parallel.DistributedDataParallel( # model, find_unused_parameters=True) # # 3. 
Train # trainer = Trainer(model=model, data_loader=data_loader, # device=device, config=config) # # best_model_state_dict = trainer.train() # # if config.model_type == 'bert': # no_decay = ['bias', 'gamma', 'beta'] # optimizer_parameters = [ # {'params': [p for n, p in model.named_parameters() # if not any(nd in n for nd in no_decay)], # 'weight_decay_rate': 0.01}, # {'params': [p for n, p in model.named_parameters() # if any(nd in n for nd in no_decay)], # 'weight_decay_rate': 0.0}] # optimizer = AdamW( # optimizer_parameters, # lr=config.lr, # betas=(0.9, 0.999), # weight_decay=1e-8, # correct_bias=False) # else: # rnn # optimizer = Adam(model.parameters(), lr=config.lr) # if config.model_type == 'bert': # scheduler = get_linear_schedule_with_warmup( # optimizer, # num_warmup_steps=config.num_warmup_steps, # num_training_steps=config.num_training_steps) # else: # rnn # scheduler = get_constant_schedule(optimizer) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=config.lr) scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.997) def train_loop_fn(loader): tracker = xm.RateTracker() model.train() a_score, s_score = 0, 0 for x, batch in enumerate(loader): input_batch, target_batch = batch['input_ids'], batch['labels'] input_batch, target_batch = input_batch.to( device), target_batch.to(device) ind_iter = range(input_batch.shape[0]) index = 0 while index < input_batch.shape[0]: # 2: batch_range = list( islice(ind_iter, index, index + int(config.max_stem_size))) batch_input = torch.zeros( (len(batch_range), config.vocab_size), dtype=float).float().to(device) for i in range(len(batch_range)): batch_input[i, input_batch[batch_range[i]]] = 1.0 batch_target_batch = target_batch[index:index + int(config.max_stem_size)] index += int(config.max_stem_size) optimizer.zero_grad() output = model(batch_input) # output : [batch_size, voc_size], target_batch : [batch_size] (LongTensor, not one-hot) if config.hierarchical_softmax: loss = 
torch.mean( model.hsoftmax(output, batch_target_batch)) else: loss = criterion(output, batch_target_batch) loss.backward() optimizer.step() # drop the learning rate gradually if xm.get_ordinal() == 0: if (x + 1) % 100000 == 0: print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss)) W, WT = model.parameters() weights = W.T.detach().cpu().numpy() dic = data.tokenizer.dictionary vocab = [ key for (key, value ) in sorted(dic.items(), key=lambda x: x[1]) ] vocab = numpy.reshape(numpy.array(vocab), (-1, 1)) w2v = numpy.concatenate((vocab, weights), axis=1) pandas.DataFrame(w2v).to_csv("word2vec.txt", sep=' ', header=None, index=False) with open("word2vec.txt", 'r+', encoding='utf-8') as file: readcontent = file.read( ) # store the read value of exe.txt into file.seek(0, 0) # Takes the cursor to top line file.write( str(len(vocab)) + " " + str(weights.shape[1]) + "\n") # convert int to str since write() deals file.write(readcontent) # torch.save(model, os.path.join(config.model_path, config.experiment_name, 'model.bin')) a_score, s_score = eval( config.analogy_valid_file_path, config.similarity_valid_file_path) print( '[xla:{}]({}) anlogy:{:.6f},sim:{:.6f},Loss={:.5f} Rate={:.2f} GlobalRate={:.2f} Time={}' .format(xm.get_ordinal(), x, a_score, s_score, loss.item(), tracker.rate(), tracker.global_rate(), time.asctime()), flush=True) tracker.add(FLAGS.batch_size) scheduler.step() if xm.get_ordinal() == 0: if (epoch + 1) % 1 == 0 or epoch == int(config.num_epoch) - 1: print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss)) W, WT = model.parameters() weights = W.T.detach().cpu().numpy() dic = data.tokenizer.dictionary vocab = [ key for (key, value) in sorted(dic.items(), key=lambda x: x[1]) ] w2v = numpy.concatenate((vocab, weights), axis=1) pandas.DataFrame(w2v).to_csv("word2vec.txt", sep=' ', header=None, index=False) with open("word2vec.txt", 'r+', encoding='utf-8') as file: readcontent = file.read( ) # store the read value of exe.txt into 
file.seek(0, 0) # Takes the cursor to top line file.write( str(len(vocab)) + " " + str(weights.shape[1]) + "\n") # convert int to str since write() deals file.write(readcontent) # torch.save(model, os.path.join(config.model_path, config.experiment_name, 'model.bin')) a_score, s_score = eval(config.analogy_valid_file_path, config.similarity_valid_file_path) print( '[xla:{}]({}) anlogy:{:.6f},sim:{:.6f},Loss={:.5f} Rate={:.2f} GlobalRate={:.2f} Time={}' .format(xm.get_ordinal(), x, a_score, s_score, loss.item(), tracker.rate(), tracker.global_rate(), time.asctime()), flush=True) return a_score, s_score # Train and eval loops accuracy = 0.0 data, pred, target = None, None, None for epoch in range(FLAGS.num_epoch): para_loader = pl.ParallelLoader(data_loader['train'], [device]) a_score, s_score = train_loop_fn(para_loader.per_device_loader(device)) xm.master_print("Finished training epoch {}".format(epoch)) if FLAGS.metrics_debug: xm.master_print(met.metrics_report()) return a_score, s_score
print 'Creating model with fresh parameters.' sess.run(tf.global_variables_initializer(), feed_dict={model.emb_init: vocab.embedding}) return model if __name__ == '__main__': args = load_arguments() ##### data preparation ##### if args.train: train0 = load_sent(args.train + '.0', args.max_train_size) train1 = load_sent(args.train + '.1', args.max_train_size) print '#sents of training file 0:', len(train0) print '#sents of training file 1:', len(train1) if not os.path.isfile(args.vocab): build_vocab(train0 + train1, args.vocab, args.min_count, args.source) vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb) print 'vocabulary size:', vocab.size if args.dev: dev0 = load_sent(args.dev + '.0') dev1 = load_sent(args.dev + '.1') if args.test: test0 = load_sent(args.test + '.0') test1 = load_sent(args.test + '.1') config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess:
def main(batch_size, embed_size, num_hiddens, num_layers, ln_hidden, ln_output, rec_unit, learning_rate=1e-4, log_step=10, num_epochs=50, save_step=100, ngpu=1): # hyperparameters num_workers = 0 checkpoint_dir = 'checkpoint' # Image Preprocessing transform = { 'train': transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]), 'val': transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]), } # load data vocab = build_vocab(path='relative_captions_shoes.json') train_data, train_loader = data_and_loader( path='relative_captions_shoes.json', mode='train', vocab=vocab, transform=transform['train'], batch_size=batch_size) val_data, val_loader = data_and_loader(path='relative_captions_shoes.json', mode='valid', vocab=vocab, transform=transform['val'], batch_size=batch_size) losses_val = [] losses_train = [] # Build the models initial_step = initial_epoch = 0 encoder = CNN(embed_size) ### embed_size: power of 2 middle = fcNet(embed_size, ln_hidden, ln_output) decoder = RNN(ln_output, num_hiddens, len(vocab), num_layers, rec_unit=rec_unit, drop_out=0.1) # Loss, parameters & optimizer loss_fun = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list( encoder.linear.parameters()) + list(encoder.batchnorm.parameters()) optimizer = torch.optim.Adam(params, lr=learning_rate) if torch.cuda.is_available(): encoder.cuda() decoder.cuda() # Train the Models total_step = len(train_loader) try: for epoch in range(initial_epoch, num_epochs): print('Epoch: {}'.format(epoch)) for step, (images, captions, lengths) in enumerate(train_loader, start=initial_step): # Set mini-batch dataset images = Variable(images) captions = Variable(captions) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # Forward, Backward and Optimize decoder.zero_grad() middle.zero_grad() encoder.zero_grad() if ngpu > 1: # run on multiple GPUs features = 
nn.parallel.data_parallel( encoder, images, range(ngpu)) rnn_input = nn.parallel.data_parallel( middle, features, range(ngpu)) outputs = nn.parallel.data_parallel( decoder, features, range(ngpu)) else: # run on single GPU features = encoder(images) rnn_input = middle(features) outputs = decoder(rnn_input, captions, lengths) train_loss = loss_fun(outputs, targets) losses_train.append(train_loss.item()) train_loss.backward() optimizer.step() # Run validation set and predict if step % log_step == 0: encoder.batchnorm.eval() # run validation set batch_loss_val = [] for val_step, (images, captions, lengths) in enumerate(val_loader): images = Variable(images) captions = Variable(captions) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] #features = encoder(target_images) - encoder(refer_images) features = encoder(images) rnn_input = middle(features) outputs = decoder(rnn_input, captions, lengths) val_loss = loss_fun(outputs, targets) batch_loss_val.append(val_loss.item()) losses_val.append(np.mean(batch_loss_val)) # predict sampled_ids = decoder.sample(rnn_input) sampled_ids = sampled_ids.cpu().data.numpy()[0] sentence = utils.convert_back_to_text(sampled_ids, vocab) print('Sample:', sentence) true_ids = captions.cpu().data.numpy()[0] sentence = utils.convert_back_to_text(true_ids, vocab) print('Target:', sentence) print( 'Epoch: {} - Step: {} - Train Loss: {} - Eval Loss: {}' .format(epoch, step, losses_train[-1], losses_val[-1])) encoder.batchnorm.train() # Save the models if (step + 1) % save_step == 0: save_models(encoder, middle, decoder, optimizer, step, epoch, losses_train, losses_val, checkpoint_dir) dump_losses(losses_train, losses_val, os.path.join(checkpoint_dir, 'losses.pkl')) except KeyboardInterrupt: pass finally: # Do final save utils.save_models(encoder, middle, decoder, optimizer, step, epoch, losses_train, losses_val, checkpoint_dir) utils.dump_losses(losses_train, losses_val, os.path.join(checkpoint_dir, 'losses.pkl'))
logger.info('-----Loading styler model from: %s.-----' % os.path.join(args.styler_path, 'model')) model.saver.restore(sess, os.path.join(args.styler_path, 'model')) else: logger.info('-----Creating styler model with fresh parameters.-----') sess.run(tf.global_variables_initializer()) if not os.path.exists(args.styler_path): os.makedirs(args.styler_path) return model if __name__ == '__main__': args = load_arguments() if not os.path.isfile(args.vocab): build_vocab(args.train_path, args.vocab) vocab = Vocabulary(args.vocab) logger.info('vocabulary size: %d' % vocab.size) # use tensorboard if args.suffix: tensorboard_dir = os.path.join(args.logDir, 'tensorboard', args.suffix) else: tensorboard_dir = os.path.join(args.logDir, 'tensorboard') if not os.path.exists(tensorboard_dir): os.makedirs(tensorboard_dir) write_dict = { 'writer': tf.summary.FileWriter(logdir=tensorboard_dir, filename_suffix=args.suffix), 'step':
def _build_labels_vocab(self):
    """Build the label vocabulary from the language tags of the vocab files.

    Collects the distinct ``lang`` tag of every ``(path, lang)`` entry in
    ``self._vocab_files`` and feeds them to ``vocab.build_vocab`` with no
    minimum-count filtering.
    """
    unique_langs = {lang for _path, lang in self._vocab_files}
    return vocab.build_vocab(unique_langs, min_count=0)
data0 = load_sent(path + 'formal' + suffix) data1 = load_sent(path + 'informal' + suffix) x = data0 + data1 y = [0] * len(data0) + [1] * len(data1) z = sorted(zip(x, y), key=lambda i: len(i[0])) return zip(*z) if __name__ == '__main__': args = load_arguments() if args.train: train_x, train_y = prepare(args.train) if not os.path.isfile(args.vocab): build_vocab(train_x, args.vocab) # prepare vocabulary # we set the embeding dimension # we read a pickel file (presumably with the data?) # randomly initialize the vector # normalize the random vectors # embedings are normalized vocab = Vocabulary(args.vocab) print('vocabulary size', vocab.size) # prepare datasets: # read form file, # zip # order them if args.dev:
def run_model(args):
    """Entry point: run the train / predict / test phases selected in ``args``.

    Args:
        args: parsed command-line namespace. Boolean-ish flags ``train``,
            ``predict`` and ``test`` select which phases run; the remaining
            attributes (file paths, size caps, device id, beam width, ...)
            parameterize each phase.
    """
    # Timestamp shared by log/checkpoint/summary filenames for this run.
    time = datetime.now().timestamp()

    ##### data preparation #####
    if args.train:
        logger, saves_dir = utils.init_logging(args, time)
        print("args: ", args)
        logger.info("args: " + str(args))
        no_of_epochs = args.max_epochs
        train0 = load_sent(args.train + '.0', args.max_train_size,
                           args.max_seq_length, args.sentence_flag)
        train1 = load_sent(args.train + '.1', args.max_train_size,
                           args.max_seq_length, args.sentence_flag)
        print('#sents of training file 0:', len(train0))
        print('#sents of training file 1:', len(train1))
        logger.info('#sents of training file 0: ' + str(len(train0)))
        logger.info('#sents of training file 1: ' + str(len(train1)))
        # Build the vocabulary file only when it is not already on disk.
        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)
        vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
        dev0 = []
        dev1 = []
        if args.dev:
            # -1: no cap on the number of dev sentences loaded.
            dev0 = load_sent(args.dev + '.0', -1, args.max_seq_length,
                             args.sentence_flag)
            dev1 = load_sent(args.dev + '.1', -1, args.max_seq_length,
                             args.sentence_flag)

    if args.predict:
        if args.model_path:
            device = torch.device(
                "cuda:" + str(args.cuda_device)
                if torch.cuda.is_available() else "cpu")
            model = torch.load(args.model_path, map_location=device)
            model.training = False
            output = utils.predict(model, args.predict,
                                   args.target_sentiment, args.beam)
            print(
                f"Input given: {args.predict} \nTarget sentiment: {args.target_sentiment} \nTranslated output: {output}"
            )

    if args.test:
        logger, saves_dir = utils.init_logging(args, time)
        print("args: ", args)
        logger.info("args: " + str(args))
        device = torch.device(
            "cuda:" + str(args.cuda_device)
            if torch.cuda.is_available() else "cpu")
        saves_path = os.path.join(args.saves_path,
                                  utils.get_filename(args, time, ""))
        Path(saves_path).mkdir(parents=True, exist_ok=True)
        model = torch.load(args.model_path, map_location=device)
        model.training = False
        # FIX: the four file handles below were previously opened without
        # ever being closed; ``with`` guarantees they are closed and the
        # output buffers flushed even if a prediction raises.
        with open(args.test + ".0", "r") as file0, \
                open(os.path.join(saves_path, "test_outputs_neg_to_pos"),
                     "w") as out_file_0:
            # Negative -> positive transfer (target sentiment 1).
            for line in file0:
                line = line.strip("\n")
                output = utils.predict(model, line, 1, args.beam)
                out_file_0.write(output + "\n")
        with open(args.test + ".1", "r") as file1, \
                open(os.path.join(saves_path, "test_outputs_pos_to_neg"),
                     "w") as out_file_1:
            # Positive -> negative transfer (target sentiment 0).
            for line in file1:
                line = line.strip("\n")
                output = utils.predict(model, line, 0, args.beam)
                out_file_1.write(output + "\n")

    if args.train:
        summ_filename = 'runs/cross-alignment/' + utils.get_filename(
            args, time, "summary")
        writer = SummaryWriter(summ_filename)
        model = get_model(args, vocab, logger)
        model.train_max_epochs(saves_dir, args, train0, train1, dev0, dev1,
                               vocab, no_of_epochs, writer, time,
                               save_epochs_flag=True)
##### data preparation ##### if args.train or args.latent_train: chosen = args.train if len(args.train) > len(args.latent_train) else \ args.latent_train # train0 = load_sent(chosen + '.0', args.max_train_size) # train1 = load_sent(chosen + '.1', args.max_train_size) train0 = load_sent(chosen + 'formal', args.max_train_size) train1 = load_sent(chosen + 'informal', args.max_train_size) print('#sents of training file 0:', len(train0)) print('#sents of training file 1:', len(train1)) if not os.path.isfile(args.vocab): build_vocab(train0 + train1, args.vocab) vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb) print('vocabulary size:', vocab.size) if args.dev or args.latent_dev: chosen = args.dev if len(args.dev) > len(args.latent_dev) else \ args.latent_dev dev0 = load_sent(chosen + 'formal') dev1 = load_sent(chosen + 'informal') if args.test or args.latent_test: chosen = args.test if len(args.test) > len(args.latent_test) else \ args.latent_test test0 = load_sent(chosen + 'formal') test1 = load_sent(chosen + 'informal')
def main(args):
    """Train an encoder-decoder image-captioning model.

    Args:
        args: parsed command-line namespace providing data paths, model
            hyper-parameters (embed/hidden sizes, resnet/rnn variants),
            and training options (batch size, epochs, checkpointing).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Training on {device}")
    if not os.path.exists(args.models_dir):
        os.makedirs(args.models_dir)

    if args.build_vocab:
        # Build the vocabulary from the caption annotations and pickle it.
        print(
            f"Building vocabulary from captions at {args.captions_json} and with count threshold={args.threshold}"
        )
        vocab_object = build_vocab(args.captions_json, args.threshold)
        with open(args.vocab_path, "wb") as vocab_f:
            pickle.dump(vocab_object, vocab_f)
        print(
            f"Saved the vocabulary object to {args.vocab_path}, total size={len(vocab_object)}"
        )
    else:
        # Reuse a previously pickled vocabulary from disk.
        with open(args.vocab_path, 'rb') as f:
            vocab_object = pickle.load(f)
        print(
            f"Loaded the vocabulary object from {args.vocab_path}, total size={len(vocab_object)}"
        )

    if args.glove_embed_path is not None:
        with open(args.glove_embed_path, 'rb') as f:
            glove_embeddings = pickle.load(f)
        print(
            f"Loaded the glove embeddings from {args.glove_embed_path}, total size={len(glove_embeddings)}"
        )
        # We are using 300d glove embeddings
        args.embed_size = 300
        # One embedding row per vocabulary word; words missing from GloVe
        # are initialized with random normal vectors (scale 0.6).
        weights_matrix = np.zeros((len(vocab_object), args.embed_size))
        for word, index in vocab_object.word2index.items():
            if word in glove_embeddings:
                weights_matrix[index] = glove_embeddings[word]
            else:
                weights_matrix[index] = np.random.normal(
                    scale=0.6, size=(args.embed_size, ))
        weights_matrix = torch.from_numpy(weights_matrix).float().to(device)
    else:
        # No pretrained embeddings: Decoder builds its own embedding table.
        weights_matrix = None

    # Resize + random-crop augmentation with ImageNet mean/std normalization.
    img_transforms = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomCrop((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    train_dataset = cocoDataset(args.image_root, args.captions_json,
                                vocab_object, img_transforms)
    train_dataloader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        collate_fn=collate_fn)

    encoder = Encoder(args.resnet_size, (3, 224, 224),
                      args.embed_size).to(device)
    decoder = Decoder(args.rnn_type, weights_matrix, len(vocab_object),
                      args.embed_size, args.hidden_size).to(device)

    # Only the encoder's projection layer is trained here; the decoder
    # trains its RNN + output layer (and embeddings when not using GloVe).
    encoder_learnable = list(encoder.linear.parameters())
    decoder_learnable = list(decoder.rnn.parameters()) + list(
        decoder.linear.parameters())
    if args.glove_embed_path is None:
        decoder_learnable = decoder_learnable + list(
            decoder.embedding.parameters())
    criterion = nn.CrossEntropyLoss()
    params = encoder_learnable + decoder_learnable
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    start_epoch = 0
    if args.ckpt_path is not None:
        # Resume model + optimizer state from a saved checkpoint.
        model_ckpt = torch.load(args.ckpt_path)
        start_epoch = model_ckpt['epoch'] + 1
        prev_loss = model_ckpt['loss']
        encoder.load_state_dict(model_ckpt['encoder'])
        decoder.load_state_dict(model_ckpt['decoder'])
        optimizer.load_state_dict(model_ckpt['optimizer'])
        print(
            f"Loaded model and optimizer state from {args.ckpt_path}; start epoch at {start_epoch}; prev loss={prev_loss}"
        )

    total_examples = len(train_dataloader)
    for epoch in range(start_epoch, args.num_epochs):
        for i, (images, captions, lengths) in enumerate(train_dataloader):
            images = images.to(device)
            captions = captions.to(device)
            # Pack targets so padded positions are excluded from the loss.
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True).data
            image_embeddings = encoder(images)
            outputs = decoder(image_embeddings, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()
            if i % args.log_interval == 0:
                loss_val = "{:.4f}".format(loss.item())
                # Perplexity reported as exp(cross-entropy loss).
                perplexity_val = "{:5.4f}".format(np.exp(loss.item()))
                print(
                    f"epoch=[{epoch}/{args.num_epochs}], iteration=[{i}/{total_examples}], loss={loss_val}, perplexity={perplexity_val}"
                )
        # Checkpoint model, optimizer, and last batch loss after each epoch.
        torch.save(
            {
                'epoch': epoch,
                'encoder': encoder.state_dict(),
                'decoder': decoder.state_dict(),
                'optimizer': optimizer.state_dict(),
                'loss': loss
            },
            os.path.join(args.models_dir,
                         'model-after-epoch-{}.ckpt'.format(epoch)))