torch.cuda.manual_seed(args.seed) if args.temperature < 1e-3: parser.error("--temperature has to be greater or equal 1e-3") with open(args.checkpoint, 'rb') as f: model = torch.load(f) model.eval() if args.cuda: model.cuda() else: model.cpu() corpus = data.Corpus( args.data, read_test=True) # reading test just to full construct the vocabulary ntokens = len(corpus.dictionary) eos_ind = corpus.dictionary.word2idx['<eos>'] hidden = model.init_hidden(1) input = Variable(torch.ones(1, 1).mul(eos_ind).long(), volatile=True) # input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True) if args.cuda: input.data = input.data.cuda() output, hidden = model(input, hidden) with open(args.outf, 'w') as outf: after_word = False
# Set the random seed manually for reproducibility. np.random.seed(args.seed) torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: print( "WARNING: You have a CUDA device, so you should probably run with --cuda" ) else: torch.cuda.manual_seed_all(args.seed) ############################################################################### # Load data ############################################################################### corpus = data.Corpus(args.data, args.train_override, args.valid_override) eval_batch_size = 10 test_batch_size = 1 train_data = batchify(corpus.train, args.batch_size, args) val_data = batchify(corpus.valid, eval_batch_size, args) test_data = batchify(corpus.test, test_batch_size, args) ############################################################################### # Build the model ############################################################################### ntokens = len(corpus.dictionary) if args.continue_train: model = torch.load(os.path.join(args.save, 'model.pt')) else:
def get_batch(source, i, args, seq_len=None, evaluation=False):
    """Slice a (data, target) training pair out of a batchified token stream.

    source: batchified token tensor (time-major).
    i: starting time index of the slice.
    args: must provide ``bptt`` (fallback length) and, when evaluation is
        False, ``replacing_prob`` and ``cuda``.
    seq_len: explicit slice length; defaults to args.bptt, clipped so the
        one-step-ahead target stays inside ``source``.
    evaluation: when True, return only (data, target); the data Variable is
        marked volatile (legacy no-grad mode).

    Returns (data, target) in evaluation mode, otherwise (data, target, perm)
    where perm is a keep-mask drawn per element: 1 with probability
    1 - args.replacing_prob, and the first timestep is always kept.
    """
    seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i)
    org_subsource = source[i:i+seq_len+1]
    data = Variable(org_subsource[:-1], volatile=evaluation)
    # Targets are the inputs shifted one step ahead, flattened for the loss.
    target = Variable(org_subsource[1:].view(-1))
    if evaluation:
        return data, target
    # Bernoulli keep-mask; a 1 keeps the original token (applied by caller).
    perm = torch.Tensor(data.size()).float().bernoulli_(1 - args.replacing_prob)
    perm[0] = 1  # never replace the first timestep
    if args.cuda:
        perm = perm.cuda()
    return data, target, perm


if __name__ == '__main__':
    import data
    import argparse

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--cuda', action='store_true', help='use CUDA')
    parser.add_argument('--jumbling_prob', type=float, default=0.2,
                        help='probability for consecutive words to be jumbled (0 = no jumbling)')
    # BUG FIX: get_batch reads args.replacing_prob, which the original parser
    # never defined (it only had --jumbling_prob), so this demo crashed with
    # AttributeError before reaching get_batch's mask logic.
    parser.add_argument('--replacing_prob', type=float, default=0.2,
                        help='probability for a word to be replaced (0 = no replacing)')
    args = parser.parse_args()

    corpus = data.Corpus('data/penn/')
    # NOTE(review): batchify is not defined in this fragment — presumably
    # imported from a utils module in the full file; confirm.
    test_data = batchify(corpus.test, 8, args)
    # BUG FIX: get_batch returns three values in training mode, not four;
    # the original unpacked a nonexistent fourth `changed` value
    # (ValueError). Also avoid rebinding the `data` module name.
    batch_data, targets, perm_data = get_batch(test_data, 1, args, seq_len=10)
    print(batch_data)
    print(targets)
    print(perm_data)
help='model checkpoint to use') parser.add_argument('--seed', type=int, default=1111, help='random seed') parser.add_argument('--cuda', default=False, help='use CUDA') parser.add_argument('--wsj10', default=False, help='use WSJ10') args = parser.parse_args() args.bptt = 70 # Set the random seed manually for reproducibility. torch.manual_seed(args.seed) # Load model with open(args.checkpoint, 'rb') as f: model, _, _ = torch.load(f) torch.cuda.manual_seed(args.seed) model.cpu() if args.cuda: model.cuda() # Load data import hashlib fn = args.data + 'corpus.{}.data'.format( hashlib.md5(args.data.encode()).hexdigest()) print('Loading cached dataset...') corpus = torch.load(fn) dictionary = corpus.dictionary corpus = data.Corpus(args.data, extend=True) corpus.dictionary = dictionary generate(model, corpus, args.cuda, prt=True)
print( classification_report(numpy.asarray(y_true), numpy.asarray(y_preds), target_names=target_names)) return 100. * n_correct / n_total if __name__ == "__main__": dictionary = helper.load_object(args.save_path + 'dictionary.p') embeddings_index = helper.load_word_embeddings(args.word_vectors_directory, args.word_vectors_file, dictionary.word2idx) tasks = [] if 'quora' in args.task: quora_dev = data.Corpus(args.data + 'quora/', dictionary) quora_dev.parse('dev.txt', 'quora', args.tokenize, is_test_corpus=True) print('quora dev set size = ', len(quora_dev.data)) tasks.append(('quora', 2)) if 'snli' in args.task: snli_dev = data.Corpus(args.data + 'snli/', dictionary) snli_dev.parse('dev.txt', 'snli', args.tokenize, is_test_corpus=True) print('snli dev set size = ', len(snli_dev.data)) tasks.append(('snli', 3)) if 'multinli' in args.task: # test matched part multinli_dev_matched = data.Corpus(args.data + 'multinli/', dictionary) multinli_dev_matched.parse('dev_matched.txt', 'multinli',
help='path to save the final model') args = parser.parse_args() # Set the random seed manually for reproducibility. torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: print("WARNING: You have a CUDA device, so you should probably run with --cuda") else: torch.cuda.manual_seed(args.seed) # Seed the RNG on the GPU if we are using it ############################################################################### # Load data ############################################################################### print("Loading data") corpus = data.Corpus(args.data) # Load the data from the disk # Starting from sequential data, batchify arranges the dataset into columns. # For instance, with the alphabet as the sequence and batch size 4, we'd get # ┌ a g m s ┐ # │ b h n t │ # │ c i o u │ # │ d j p v │ # │ e k q w │ # └ f l r x ┘. # These columns are treated as independent by the model, which means that the # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient # batch processing. def batchify(data, bsz): nbatch = data.size(0) // bsz # Work out how cleanly we can divide the dataset into bsz parts.
# Set the random seed manually for reproducibility. numpy.random.seed(args.seed) torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: print("WARNING: You have a CUDA device, so you should probably run with --cuda") else: torch.cuda.manual_seed(args.seed) ############################################################################### # Load data ############################################################################### # load train and dev dataset train_corpus = data.Corpus(args.tokenize) train_corpus.parse(args.data + 'train.txt', args.max_example) print('train set size = ', len(train_corpus.data)) dev_corpus = data.Corpus(args.tokenize) dev_corpus.parse(args.data + 'dev.txt', args.max_example) print('development set size = ', len(dev_corpus.data)) dictionary = data.Dictionary() dictionary.build_dict(train_corpus, args.max_words) # save the dictionary object to use during testing helper.save_object(dictionary, args.save_path + 'dictionary.p') print('vocabulary size = ', len(dictionary)) # ############################################################################### # # Build the model # ###############################################################################
def model_load(fn):
    """Load model, criterion and optimizer state from checkpoint file *fn*.

    Mutates the module-level ``model``, ``criterion`` and ``optimizer``
    names in place (hence the ``global`` declaration).
    """
    global model, criterion, optimizer
    with open(fn, 'rb') as f:
        model, criterion, optimizer = torch.load(f)


import os
import hashlib

# Cache the tokenized corpus on disk, keyed by a hash of the data path,
# so repeated runs skip the expensive tokenization pass.
fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
if os.path.exists(fn):
    print('Loading cached dataset...')
    corpus = torch.load(fn)
else:
    print('Producing dataset...')
    corpus = data.Corpus(args.data)
    torch.save(corpus, fn)

eval_batch_size = 10
test_batch_size = 1
train_data = batchify(corpus.train, args.batch_size, args)
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Load OE data
###############################################################################

print('Producing dataset...')
# NOTE(review): this chunk ends here; presumably an `else` branch for the
# non-character corpus follows in the full file — confirm.
if args.wikitext_char:
    oe_corpus = data.CorpusWikiTextChar('data/wikitext-2', corpus.dictionary)
) else: torch.cuda.manual_seed(args.seed) if args.temperature < 1e-3: parser.error("--temperature has to be greater or equal 1e-3") with open(args.checkpoint, 'rb') as f: model = torch.load(f) if args.cuda: model.cuda() else: model.cpu() corpus = data.Corpus(args.data, False) ntokens = len(corpus.dictionary) print 'vocab size: ', ntokens hidden = model.init_hidden(1) input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True) if args.cuda: input.data = input.data.cuda() with open(args.outf, 'w') as outf: for i in range(args.words): output, hidden = model(input, hidden) word_weights = output.squeeze().data.div(args.temperature).exp().cpu() word_idx = torch.multinomial(word_weights, 1)[0] input.data.fill_(word_idx) word = corpus.dictionary.idx2word[word_idx]
args = parser.parse_args() # Set the random seed manually for reproducibility. torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: print("WARNING: You have a CUDA device, so you should probably run with --cuda") else: torch.cuda.manual_seed(args.seed) ############################################################################### # Load data ############################################################################### args.data = './data/'+args.lang corpus = data.Corpus(args.data, args.lang) # Starting from sequential data, batchify arranges the dataset into columns. # For instance, with the alphabet as the sequence and batch size 4, we'd get # ┌ a g m s ┐ # │ b h n t │ # │ c i o u │ # │ d j p v │ # │ e k q w │ # └ f l r x ┘. # These columns are treated as independent by the model, which means that the # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient # batch processing. def batchify_user(data, bsz, start, end): if end==start:
# Set the random seed manually for reproducibility. torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: print( "WARNING: You have a CUDA device, so you should probably run with --cuda" ) device = torch.device("cuda" if args.cuda else "cpu") ############################################################################### # Load data ############################################################################### corpus = data.Corpus("./data/wikitext-2") # Starting from sequential data, batchify arranges the dataset into columns. # For instance, with the alphabet as the sequence and batch size 4, we'd get # ┌ a g m s ┐ # │ b h n t │ # │ c i o u │ # │ d j p v │ # │ e k q w │ # └ f l r x ┘. # These columns are treated as independent by the model, which means that the # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient # batch processing. def batchify(data, bsz):
import data
from utils import batchify, get_batch, repackage_hidden

# Path to the dataset root.
# BUG FIX: the original assigned the path to the name `data`, rebinding the
# imported `data` module to a string, so `data.Corpus(...)` raised
# AttributeError at runtime. Use a distinct name for the path instead.
DATA_PATH = '/data'
corpus = data.Corpus(DATA_PATH, preproc=True)
"WARNING: You have a CUDA device, so you should probably run with --cuda" ) else: torch.cuda.manual_seed(args.seed) print('\ncommand-line params : {0}\n'.format(sys.argv[1:])) print('{0}\n'.format(args)) sparsity_list = [0.00075] coherent_list = [1.0] ############################################################################### # Load data ############################################################################### # load train and dev dataset train_corpus = data.Corpus(args.tokenize) train_corpus_temp = data.Corpus(args.tokenize) dev_corpus = data.Corpus(args.tokenize) test_corpus = data.Corpus(args.tokenize) ori_train_size = -1 task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task] for task in task_names: if 'IMDB' in task: ############################################################################### # Load Learning to Skim paper's Pickle file ############################################################################### train_d, dev_d, test_d = helper.get_splited_imdb_data( args.output_base_path + task + '/' + 'imdb.p', SAG=args.SAG) train_corpus_temp.parse(train_d, task, args.max_example) dev_corpus.parse(dev_d, task, args.max_example)
model = MatchTensor(dictionary, embeddings_index, args) if 'CUDA_VISIBLE_DEVICES' in os.environ: cuda_visible_devices = [ int(x) for x in os.environ['CUDA_VISIBLE_DEVICES'].split(',') ] if len(cuda_visible_devices) > 1: model = torch.nn.DataParallel(model, device_ids=cuda_visible_devices) if args.cuda: model = model.cuda() helper.load_model_states_from_checkpoint( model, os.path.join(args.save_path, 'model_best.pth.tar'), 'state_dict') print('Model, embedding index and dictionary loaded.') model.eval() test_corpus = data.Corpus(args.data + 'session_with_clicks_v5/', 'session_test.txt', dictionary, args.max_query_length, args.max_doc_length, is_test_corpus=True) print('Test set size = ', len(test_corpus.data)) test_batches = helper.batchify(test_corpus.data, args.batch_size) print('Number of test batches = ', len(test_batches)) test_ranking(model, test_batches)
# Load the forward and reverse language models from their checkpoints.
with open(args.checkpoint, 'rb') as f:
    model = torch.load(f)
with open(args.checkpoint2, 'rb') as f:
    model_rev = torch.load(f)
model.eval()
model_rev.eval()

if args.cuda:
    model.cuda()
    model_rev.cuda()
else:
    model.cpu()
    model_rev.cpu()

# Forward and reversed test corpora; each carries its own dictionary.
corpus = data.Corpus(phase="Test", flag="all_book")
# NOTE(review): phase="Test " has a trailing space, unlike the call above —
# looks like a typo; confirm whether data.Corpus strips or keys on this string.
corpus_rev = data.Corpus(phase="Test ", flag="all_book_re")
ntokens = len(corpus.dictionary)
ntokens_rev = len(corpus_rev.dictionary)

hidden = model.init_hidden(1)
hidden_rev = model_rev.init_hidden(1)

# Seed each generator with a random token id (legacy volatile Variables).
input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True)  # NOTE(review): shadows builtin `input`
input_rev = Variable(torch.rand(1, 1).mul(ntokens_rev).long(), volatile=True)
if args.cuda:
    input.data = input.data.cuda()
    input_rev.data = input_rev.data.cuda()
def main(): # if output directory doesn't exist, create it if not os.path.exists(args.save_path): os.makedirs(args.save_path) # set the random seed manually for reproducibility. numpy.random.seed(args.seed) torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: print( "WARNING: You have a CUDA device, so you should probably run with --cuda" ) else: torch.cuda.manual_seed(args.seed) print('\ncommand-line params : {0}\n'.format(sys.argv[1:])) print('{0}\n'.format(args)) ############################################################################### # Load data ############################################################################### dictionary = data.Dictionary() tasks = [] train_dict, dev_dict = {}, {} if 'quora' in args.task: print('**Task name : Quora**') # load quora dataset quora_train = data.Corpus(args.data, dictionary) quora_train.parse('quora/train.txt', 'quora', args.tokenize, args.max_example) print('Found {} pairs of train sentences.'.format(len( quora_train.data))) quora_dev = data.Corpus(args.data, dictionary) quora_dev.parse('quora/dev.txt', 'quora', args.tokenize) print('Found {} pairs of dev sentences.'.format(len(quora_dev.data))) quora_test = data.Corpus(args.data, dictionary) quora_test.parse('quora/test.txt', 'quora', args.tokenize) print('Found {} pairs of test sentences.'.format(len(quora_test.data))) tasks.append(('quora', 2)) train_dict['quora'] = quora_train dev_dict['quora'] = quora_dev if 'snli' in args.task: print('**Task name : SNLI**') # load snli dataset snli_train = data.Corpus(args.data, dictionary) snli_train.parse('snli/train.txt', 'snli', args.tokenize, args.max_example) print('Found {} pairs of train sentences.'.format(len( snli_train.data))) snli_dev = data.Corpus(args.data, dictionary) snli_dev.parse('snli/dev.txt', 'snli', args.tokenize) print('Found {} pairs of dev sentences.'.format(len(snli_dev.data))) snli_test = data.Corpus(args.data, dictionary) snli_test.parse('snli/test.txt', 'snli', 
args.tokenize) print('Found {} pairs of test sentences.'.format(len(snli_test.data))) tasks.append(('snli', 3)) train_dict['snli'] = snli_train dev_dict['snli'] = snli_dev if 'multinli' in args.task: print('**Task name : Multi-NLI**') # load multinli dataset multinli_train = data.Corpus(args.data, dictionary) multinli_train.parse('multinli/train.txt', 'multinli', args.tokenize, args.max_example) print('Found {} pairs of train sentences.'.format( len(multinli_train.data))) multinli_dev = data.Corpus(args.data, dictionary) multinli_dev.parse('multinli/dev_matched.txt', 'multinli', args.tokenize) multinli_dev.parse('multinli/dev_mismatched.txt', 'multinli', args.tokenize) print('Found {} pairs of dev sentences.'.format(len( multinli_dev.data))) multinli_test = data.Corpus(args.data, dictionary) multinli_test.parse('multinli/test_matched.txt', 'multinli', args.tokenize) multinli_test.parse('multinli/test_mismatched.txt', 'multinli', args.tokenize) print('Found {} pairs of test sentences.'.format( len(multinli_test.data))) tasks.append(('multinli', 3)) train_dict['multinli'] = multinli_train dev_dict['multinli'] = multinli_dev if 'allnli' in args.task: print('**Task name : AllNLI**') # load allnli dataset allnli_train = data.Corpus(args.data, dictionary) allnli_train.parse('snli/train.txt', 'snli', args.tokenize, args.max_example) allnli_train.parse('multinli/train.txt', 'multinli', args.tokenize, args.max_example) print('Found {} pairs of train sentences.'.format( len(allnli_train.data))) allnli_dev = data.Corpus(args.data, dictionary) allnli_dev.parse('snli/dev.txt', 'snli', args.tokenize) allnli_dev.parse('multinli/dev_matched.txt', 'multinli', args.tokenize) allnli_dev.parse('multinli/dev_mismatched.txt', 'multinli', args.tokenize) print('Found {} pairs of dev sentences.'.format(len(allnli_dev.data))) allnli_test = data.Corpus(args.data, dictionary) allnli_test.parse('snli/test.txt', 'snli', args.tokenize) allnli_test.parse('multinli/test_matched.txt', 'multinli', 
args.tokenize) allnli_test.parse('multinli/test_mismatched.txt', 'multinli', args.tokenize) print('Found {} pairs of test sentences.'.format(len( allnli_test.data))) tasks.append(('allnli', 3)) train_dict['allnli'] = allnli_train dev_dict['allnli'] = allnli_dev print('\nvocabulary size = ', len(dictionary)) # save the dictionary object to use during testing helper.save_object(dictionary, args.save_path + 'dictionary.p') embeddings_index = helper.load_word_embeddings(args.word_vectors_directory, args.word_vectors_file, dictionary.word2idx) print('number of OOV words = ', len(dictionary) - len(embeddings_index)) # ############################################################################### # # Build the model # ############################################################################### if not tasks: return model = MultitaskDomainAdapter(dictionary, embeddings_index, args, tasks) print(model) optim_fn, optim_params = helper.get_optimizer(args.optimizer) optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()), **optim_params) best_accuracy = 0 # for training on multiple GPUs. 
use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use if 'CUDA_VISIBLE_DEVICES' in os.environ: cuda_visible_devices = [ int(x) for x in os.environ['CUDA_VISIBLE_DEVICES'].split(',') ] if len(cuda_visible_devices) > 1: model = torch.nn.DataParallel(model, device_ids=cuda_visible_devices) if args.cuda: model = model.cuda() if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] best_accuracy = checkpoint['best_acc'] model.load_state_dict(checkpoint['state_dict']['model']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) # ############################################################################### # # Train the model # ############################################################################### train = Train(model, optimizer, dictionary, embeddings_index, args, best_accuracy) train.set_train_dev_corpus(train_dict, dev_dict) train.train_epochs(args.start_epoch, args.epochs)
torch.save([model, criterion, optimizer], f) def model_load(fn): global model, criterion, optimizer with open(fn, 'rb') as f: model, criterion, optimizer = torch.load(f) import os import hashlib #fn = 'corpus.{}.data'.format(hashlib.md5(os.path.dirname(args.train_data).encode()).hexdigest()) #if os.path.exists(fn): # print('Loading cached dataset...') # corpus = torch.load(fn) #else: print('Producing dataset...') corpus = data.Corpus(train_path=args.train_data, dev_path=args.valid_data, test_path=args.test_data, output=args.vocab_file) #corpus = data.Corpus(args.data) #torch.save(corpus, fn) eval_batch_size = 10 test_batch_size = 1 train_data = batchify(corpus.train, args.batch_size, args) val_data = batchify(corpus.valid, eval_batch_size, args) test_data = batchify(corpus.test, test_batch_size, args) ############################################################################### # Build the model ############################################################################### from splitcross import SplitCrossEntropyLoss criterion = None
numpy.random.seed(args.seed) torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: print( "WARNING: You have a CUDA device, so you should probably run with --cuda" ) else: torch.cuda.manual_seed(args.seed) ############################################################################### # Load data ############################################################################### dictionary = data.Dictionary() train_corpus = data.Corpus(args.data, 'session_train.txt', dictionary, args.max_length) dev_corpus = data.Corpus(args.data, 'session_dev.txt', dictionary, args.max_length) print('Train set size = ', len(train_corpus.data)) print('Dev set size = ', len(dev_corpus.data)) print('Vocabulary size = ', len(dictionary)) # save the dictionary object to use during testing helper.save_object(dictionary, args.save_path + 'dictionary.p') # embeddings_index = helper.load_word_embeddings(args.word_vectors_directory, args.word_vectors_file) # helper.save_word_embeddings('../data/glove/', 'glove.840B.300d.q2q.txt', embeddings_index, dictionary.idx2word) embeddings_index = helper.load_word_embeddings(args.word_vectors_directory, 'glove.840B.300d.q2q.txt') print('Number of OOV words = ', len(dictionary) - len(embeddings_index))
USE_CUDA = torch.cuda.is_available() MODEL_CHECKPOINT = "models/2019-03-26T12-20-25/model-LSTM-emsize-50-nhid_128-nlayers_6-batch_size_20-epoch_25.pt" WORDS_TO_GEN = 100 TEMPRATURE = 1 SWITCH_WORDS = False SPEECH_FILE = os.path.join("data", CORPUS_NAME, "Clinton_2016-07-28.txt") # with open(MODEL_CHECKPOINT, 'rb') as f: model = torch.load(f) if USE_CUDA: model.cuda() else: model.cpu() corpus = data.Corpus(CORPUS_NAME) glove_embedding = glove.GloveEmbedding(corpus.vocabulary) ntokens = corpus.vocabulary.num_words hidden = model.init_hidden(1) input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True) if USE_CUDA: input.data = input.data.cuda() words = '' # read speech file for initialization if SPEECH_FILE is not None: speech_for_gen = torch.LongTensor(30) with open(SPEECH_FILE, 'r', encoding="utf8") as f: token = 0 for line in f:
print('loading selector') helper.load_model( selector, args.output_base_path + args.task + '/' + args.selector_file_name, 'selector', args.cuda) if args.load_model == 1 or args.load_model == 2: print('loading classifier') helper.load_model( model, args.output_base_path + args.task + '/' + args.classifier_file_name, 'state_dict', args.cuda) print('vocabulary size = ', len(dictionary)) task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task] for task in task_names: test_corpus = data.Corpus(args.tokenize) if 'IMDB' in args.task: ############################################################################### # Load Learning to Skim paper's Pickle file ############################################################################### # train_d, dev_d, test_d = helper.get_splited_imdb_data(args.output_base_path+'data/'+'imdb.p') train_d, dev_d, test_d = helper.get_splited_imdb_data( args.output_base_path + task + '/' + 'imdb.p', SAG=args.SAG) test_corpus.parse(test_d, task, args.max_example) elif task == 'multinli' and args.test != 'train': for partition in ['_matched', '_mismatched']: test_corpus.parse( args.data + task + '/' + args.test + partition + '.txt', task, args.max_example) print('[' + partition[1:] + '] dataset size = ',
import w_hw3.utils as utils

start = time.time()  # wall-clock start, reported in the summary line below

# prepare
eval_batch_size = 10
args = utils.get_args_parser()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
device = utils.check_device(args)

###############################################################################
# Load the best saved model.
###############################################################################
with open(os.path.join(args.save, 'model.pt'), 'rb') as f:
    rnn_model = torch.load(f)
# Re-compact the RNN weights into one contiguous chunk after deserialization.
rnn_model.rnn.flatten_parameters()

criterion = nn.CrossEntropyLoss()

# Rebuild the evaluation corpus with the dictionary saved at training time,
# so token ids line up with the loaded model's embedding table.
test_corpus = data.Corpus()
test_corpus.load_dictionary(os.path.join(args.save, 'rnn_model_dict'))
test_corpus.set_test(os.path.join(args.data, 'test.txt'), test_size=args.test_size)
test_data = utils.batchify(test_corpus.test, eval_batch_size, device)

test_loss = utils.evaluate(test_data, rnn_model, test_corpus, criterion, args, eval_batch_size)
print('=' * 89)
# Perplexity is exp of the average cross-entropy loss.
print('| End of training | time: %5.2f s | test loss %5.2f | test ppl %8.2f' %
      ((time.time() - start), test_loss, math.exp(test_loss)))
# Use gpu or cpu to train use_gpu = True if use_gpu: torch.cuda.set_device(args.gpu_id) device = torch.device(args.gpu_id) else: device = torch.device("cpu") #print(device) # load data train_batch_size = args.train_batch_size eval_batch_size = args.eval_batch_size batch_size = {'train': train_batch_size, 'valid': eval_batch_size} data_loader = data.Corpus("../data/ptb", batch_size, args.max_sql) voc_size = data_loader.voc_size # WRITE CODE HERE within two '#' bar ######################################## # Build LMModel model (bulid your language model here) # choose model according to console inputs if args.network_type == constants.network_self or args.network_type == constants.network_layer_norm: model = model_self.LMModel(device, args.network_type, voc_size, 50) elif args.network_type == constants.network_attention: model = model_attention.LMModel(device, voc_size, 50) elif args.network_type == constants.network_attention_self or args.network_type == constants.network_attention_self_matrix \ or args.network_type == constants.network_attention_self_modified or args.network_type == constants.network_attention_self_matrix_modified: model = model_attention_self.LMModel(device, args.network_type, voc_size, 50, 50, 2, args.max_sql) else:
# parser.add_argument('--save', type=str, default='model.pt', # help='path to save the final model') # parser.add_argument('--onnx-export', type=str, default='', # help='path to export the final model in onnx format') args = parser.parse_args() TAG_CLASS = 3 if torch.cuda.is_available() and not args.no_cuda: device = torch.device('cuda') print("using CUDA") else: device = torch.device('cpu') print('using CPU') corpus = d.Corpus(args.data, device, args.batch_size, args.seq_len) dict = corpus.dictionary num_of_train_batches = corpus.total_num_of_train_batches def accuracy(pred, target): mask = target != 2 total_num = mask.sum() p, i = pred.max(2) num_correct = (target[mask] == i[mask]).sum() return num_correct.item(), total_num.item() #### train #### model = LSTMTagger(args.emsize, args.nhid, args.batch_size, len(dict), TAG_CLASS, args.nlayers, args.bidirect,
dictionary = helper.load_object(args.save_path + 'dictionary.p') embeddings_index = helper.load_word_embeddings(args.word_vectors_directory, args.word_vectors_file, dictionary.word2idx) model = CNN_ARC_II(dictionary, embeddings_index, args) if 'CUDA_VISIBLE_DEVICES' in os.environ: cuda_visible_devices = [ int(x) for x in os.environ['CUDA_VISIBLE_DEVICES'].split(',') ] if len(cuda_visible_devices) > 1: model = torch.nn.DataParallel(model, device_ids=cuda_visible_devices) if args.cuda: model = model.cuda() checkpoint = helper.load_from_checkpoint( os.path.join(args.save_path, 'model_best.pth.tar'), args.cuda) model.load_state_dict(checkpoint['state_dict']) model.eval() test_corpus = data.Corpus(args.tokenize, args.max_query_length, args.max_doc_length) test_corpus.parse(args.data + 'test.txt', args.max_example) print('test set size = ', len(test_corpus.data)) test_batches = helper.batchify(test_corpus.data, args.batch_size) print('number of test batches = ', len(test_batches)) test_ranking(model, test_batches)
# print("args.emsize", args.emsize) # print("args.nhid", args.nhid) # print("args.nlayers", args.nlayers) # print("args.lr", args.lr) # print("args.clip", args.clip) # print("args.epochs", args.epochs) # print("args.batch_size", args.batch_size) # print("args.bptt", args.bptt) # print("args.seed", args.seed) # print("args.cuda", args.cuda) # print("args.log_interval", args.log_interval) # print("args.save", args.save) # print("args.dropout", args.dropout) # print("args.tied", args.tied) corpus = data.Corpus(args.data, args.brown) def batchify(data, bsz): nbatch = data.size(0) // bsz data = data.narrow(0, 0, nbatch * bsz) data = data.view(bsz, -1).t().contiguous() if args.cuda: data = data.cuda() return data eval_batch_size = 10 print("Batchifying data...") train_data = batchify(corpus.train, args.batch_size) print(train_data.size())
# Echo the effective hyper-parameters (Python 2 print statements).
print "--nlayers\t{}\n--lr\t\t{}".format(nlayers, args.lr)
print "--clip\t\t{}\n--epochs\t{}".format(args.clip, args.epochs)
print "--batch-size\t{}\n--bptt\t\t{}".format(args.batch_size, args.bptt)
print "--seed\t\t{}\n--log-interval\t{}".format(args.seed, args.log_interval)
print "--voc-size\t{}\n--L2\t\t{}".format(args.voc_size, args.L2)
print "--dropout\t{}\n--cell\t\t{}".format(args.dropout, args.cell)
print "\n" + "=" * 50 + "\n"

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)

###############################################################################
# Load data
###############################################################################
corpus = data.Corpus(args.data, args.voc_size)


def batchify(data, bsz):
    """Reshape a 1-D token stream into bsz equal-length column streams."""
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data


eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
# Set the random seed manually for reproducibility. torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: print( "WARNING: You have a CUDA device, so you should probably run with --cuda" ) else: torch.cuda.manual_seed(args.seed) ############################################################################### # Load data ############################################################################### print("\nLoading...\n") #corpus_words = data.Corpus(args.data, to_sentence = False) corpus = data.Corpus(args.data, to_sentence=True) def pad(tensor, length): return torch.cat([ tensor, tensor.new(length - tensor.size(0), *tensor.size()[1:]).zero_() ]) def batch_sents(data, bsz, nsen=1): """ Returns a dictionary of batched inputs, targets and sequence lengths of sentences in each batch inp: [max(seq_lens), nbatch]
device=device) emb_s.normal_(0, 0.05) torch.save(emb_s, emb_s_file) ############################################################################### # Load data ############################################################################### fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest()) #if os.path.exists(fn): if False: print('Loading cached dataset...') corpus = torch.load(fn) print('Loading end') else: print('Producing dataset...') corpus = data.Corpus(args.data, sememe) torch.save(corpus, fn) print('Producing end') # Starting from sequential data, batchify arranges the dataset into columns. # For instance, with the alphabet as the sequence and batch size 4, we'd get # ┌ a g m s ┐ # │ b h n t │ # │ c i o u │ # │ d j p v │ # │ e k q w │ # └ f l r x ┘. # These columns are treated as independent by the model, which means that the # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient # batch processing.
format=log_format, datefmt='%m/%d %I:%M:%S %p') fh = logging.FileHandler(os.path.join(args.save, 'log.txt')) fh.setFormatter(logging.Formatter(log_format)) logging.getLogger().addHandler(fh) # Set the random seed manually for reproducibility. np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.set_device(args.gpu) cudnn.benchmark = True cudnn.enabled = True torch.cuda.manual_seed_all(args.seed) corpus = data.Corpus(args.data) eval_batch_size = 10 test_batch_size = 1 train_data = batchify(corpus.train, args.batch_size, args) val_data = batchify(corpus.valid, eval_batch_size, args) test_data = batchify(corpus.test, test_batch_size, args) ntokens = len(corpus.dictionary) if args.continue_train: model = torch.load(os.path.join(args.save, 'model.pt')) else: genotype = eval("genotypes.%s" % args.arch) model = model.RNNModel(ntokens, args.emsize, args.nhid,
#print(vec[vocab["apple"],:]) print(type(vocab), type(vec)) ############## # unk use average # N used for number in the treebanl --> take average of 1-9 as embedding for that # for words like bread-butter --> split on "-" and average # otherwise if word not in embeddings use average data = d_read.Corpus("/language-modeling-nlp1/data/penn") train_data = data.train valid_data = data.valid test_data = data.test dims = 50 mean_vec = torch.mean(vec, 0).view(1, dims) vocab_tb = data.dictionary.word2idx.keys() numvec = vec[vocab["0"], :].view(1, dims) numvec = torch.cat((vec[vocab["1"], :].view(1, dims), numvec), 0) numvec = torch.cat((vec[vocab["2"], :].view(1, dims), numvec), 0) numvec = torch.cat((vec[vocab["3"], :].view(1, dims), numvec), 0) numvec = torch.cat((vec[vocab["4"], :].view(1, dims), numvec), 0) numvec = torch.cat((vec[vocab["5"], :].view(1, dims), numvec), 0)