def set_transformer_model(self):
    '''
    Loads the base transformer model.

    Args:
        transformer_config_path: config path (YAML) of the transformer
        transformer_weights_path: optional; if given, loads the weights as well

    Returns:
        None
    '''
    # load base transformer model from config
    with open(self.args.transformer_config_path, 'r') as file:
        config = yaml.load(file, yaml.FullLoader)
    model_config = TransformerConfig(config)
    input_dim = config['transformer']['input_dim']
    dr = model_config.downsample_rate
    hidden_size = model_config.hidden_size
    output_attention = False
    base_transformer_model = TransformerModel(
        model_config, input_dim, output_attentions=output_attention).to('cpu')

    # load weights
    if self.args.transformer_weights_path:
        ckpt = torch.load(self.args.transformer_weights_path, map_location='cpu')
        base_transformer_model.load_state_dict(ckpt['Transformer'])

    self.base_transformer_model = base_transformer_model
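# Hedged usage sketch (assumption, not taken from the original project): the
# method above expects a YAML file with a 'transformer' section plus whatever
# fields TransformerConfig reads, and an optional checkpoint whose state dict
# is stored under the key 'Transformer'. A minimal config might look like:
#
#   transformer:
#     input_dim: 80        # hypothetical feature dimension
#     hidden_size: 768     # hypothetical
#     downsample_rate: 1   # hypothetical
#
# self.args.transformer_config_path = 'transformer_config.yaml'
# self.args.transformer_weights_path = 'transformer_ckpt.pt'  # or None
# self.set_transformer_model()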
def main(): print("Generating data...", end="") voc_size = args.vocab_sz inp = np.arange(2, voc_size, 2) tgt = np.arange(3, voc_size, 2) data_x, data_y = get_numbers(inp, tgt) train_len = int(len(data_x) * 0.9) train_x, val_x = data_x[:train_len], data_x[train_len:] train_y, val_y = data_y[:train_len], data_y[train_len:] print("Done") print("Setting model...", end="") model = TransformerModel( input_sz=voc_size, output_sz=voc_size, d_model=args.d_model, nhead=args.n_head, num_encoder_layers=args.n_encoder_layers, num_decoder_layers=args.n_decoder_layers, dim_feedforward=args.dim_feedforward, dropout=args.dropout, ) if args.load_dir != ".": model.load_state_dict(flow.load(args.load_dir)) model = to_cuda(model) criterion = to_cuda(nn.CrossEntropyLoss()) optimizer = flow.optim.Adam(model.parameters(), lr=args.lr) print("Done") print("Training...") min_loss = 100 for i in range(1, args.n_epochs + 1): epoch_loss = train(model, criterion, optimizer, train_x, train_y) epoch_loss_val = validation(model, criterion, val_x, val_y) print("epoch: {} train loss: {}".format(i, epoch_loss)) print("epoch: {} val loss: {}".format(i, epoch_loss_val)) if epoch_loss < min_loss: if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) else: shutil.rmtree(args.save_dir) assert not os.path.exists(args.save_dir) os.mkdir(args.save_dir) flow.save(model.state_dict(), args.save_dir) if i % 3 == 2: print(test(model, test_times=10))
def main(model_name=None, hidden=64, nlayers=1):
    voc_size = 10000
    inp = arange(2, voc_size, 2)
    tgt = arange(3, voc_size, 2)
    batch_size = 128
    epochs = 30
    dataset = NumberLoader(inp, tgt)
    train_len = int(len(dataset) * 0.9)
    val_len = len(dataset) - train_len
    train_set, val_set = random_split(dataset, [train_len, val_len])
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=1)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True, num_workers=1)
    model = TransformerModel(voc_size, voc_size, hidden=hidden, nlayers=nlayers)
    if model_name is not None:
        model.load_state_dict(load(model_name))
    model = model.cuda()
    # optimizer = optim.SGD(model.parameters(), lr=0.5)
    optimizer = optim.Adam(model.parameters())
    # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
    criterion = nn.CrossEntropyLoss()
    best_loss = 100
    for i in range(epochs):
        epoch_loss = train(model, criterion, optimizer, train_loader)
        epoch_loss_val = validation(model, criterion, val_loader)
        # scheduler.step()
        print("epoch: {} train loss: {}".format(i, epoch_loss))
        print("epoch: {} val loss: {}".format(i, epoch_loss_val))
        if epoch_loss_val < best_loss:
            best_loss = epoch_loss_val
            model_name = "model/model_{0:.5f}.pt".format(epoch_loss_val)
            save(model.state_dict(), model_name)
    return model_name
def main():
    voc_size = args.vocab_sz
    print("Setting model...", end="")
    model = TransformerModel(
        input_sz=voc_size,
        output_sz=voc_size,
        d_model=args.d_model,
        nhead=args.n_head,
        num_encoder_layers=args.n_encoder_layers,
        num_decoder_layers=args.n_decoder_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
    )
    model.load_state_dict(flow.load(args.load_dir))
    model = to_cuda(model)
    print("Done")

    print("Inference:")
    num = args.input_start
    if num % 2 != 0:
        print("The input number must be an even number.")
        return
    if num > args.vocab_sz - MAX_LEN * 2:
        print("The input sequence may be out of range.")
        return
    input_nums = [num + i * 2 for i in range(MAX_LEN)]
    src = to_cuda(flow.tensor(input_nums)).unsqueeze(1)
    pred = [0]
    for i in range(MAX_LEN):
        inp = to_cuda(flow.tensor(pred)).unsqueeze(1)
        output = model(src, inp)
        out_num = output.argmax(2)[-1].numpy()[0]
        pred.append(out_num)
    print("input:", input_nums)
    print("pred:", pred)
vocab_to_int = vocab["vocab_to_int"]
int_to_vocab = vocab["int_to_vocab"]
ntokens = len(vocab_to_int)
emsize = 512
nhid = 512
nlayers = 4
nhead = 4
dropout = 0.2
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
model_save_path = "./models/transformer/lm-siamzone-v4-space-342.pkl"
model.load_state_dict(
    torch.load(model_save_path, map_location=torch.device("cpu")))
model.eval()
print("Model initialized")


def top_k_top_p_filtering(logits, top_k, top_p, temperature,
                          filter_value=-float("Inf")):
    # Hugging Face script to apply top-k and nucleus (top-p) sampling
    logits = logits / temperature
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
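        # Hedged completion (assumption): the snippet is cut off here in the
        # source; the standard Hugging Face top-k / nucleus filtering continues
        # roughly as follows.
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value  # drop everything outside the top k
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        # remove tokens with cumulative probability above top_p, but always
        # keep at least the single most probable token
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits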
def main(args):
    random_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with --cuda")
    device = torch.device("cuda" if args.cuda else "cpu")

    corpus = data.Corpus(args.data)
    ntokens = len(corpus.dictionary)
    print('loaded dictionary')

    if args.model == 'Transformer':
        model = TransformerModel(
            ntokens, args.emsize, args.nhead, args.nhid, args.nlayers,
            args.dropout).to(device)
    else:
        model = RNNModel(
            args.model, ntokens, args.emsize, args.nhid, args.nlayers,
            args.dropout, args.tied).to(device)
    checkpoint = torch.load(args.checkpoint)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    print('loaded model')

    is_transformer_model = hasattr(
        model, 'model_type') and model.model_type == 'Transformer'
    if not is_transformer_model:
        hidden = model.init_hidden(1)
    input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

    with open(args.outf, 'w') as outf:
        with torch.no_grad():  # no tracking history
            for i in range(args.words):
                if is_transformer_model:
                    output = model(input, False)
                    word_weights = output[-1].squeeze().div(
                        args.temperature).exp().cpu()
                    word_idx = torch.multinomial(word_weights, 1)[0]
                    word_tensor = torch.Tensor([[word_idx]]).long().to(device)
                    input = torch.cat([input, word_tensor], 0)
                else:
                    output, hidden = model(input, hidden)
                    word_weights = output.squeeze().div(args.temperature).exp().cpu()
                    word_idx = torch.multinomial(word_weights, 1)[0]
                    input.fill_(word_idx)

                word = corpus.dictionary.idx2word[word_idx]
                outf.write(word + ('\n' if i % 20 == 19 else ' '))

                if i % args.log_interval == 0:
                    print('| Generated {}/{} words'.format(i, args.words))
    validation='wiki.valid.tokens', test='wiki.test.tokens')

# build the vocabulary from the training set
TEXT.build_vocab(train_txt)

model = TransformerModel(len(TEXT.vocab.stoi), ninp=200, nhead=2, nhid=200,
                         nlayers=2, dropout=0.2).to(device)

# load the trained parameters into the model
# checkpoint = torch.load('datasets/models/best_model.pth.tar')
checkpoint = torch.load('temp/models/best_model.pth.tar')
model.load_state_dict(checkpoint['state_dict'])

# the known (seed) sequence
history = 'it seems'
h = []
for w in history.split():
    h.append([TEXT.vocab.stoi[w]])

while True:
    # convert the list to a tensor and compute the model output
    output = model(torch.tensor(h).to(device))
    # get the ids of the 10 most probable next words
    idxs = output[-1].argsort(descending=True).view(-1)[:10]
    # randomly pick one of them and append its id (randint is inclusive, so 0..9)
    r = random.randint(0, 9)
    h.append([idxs[r].item()])
def main(args):
    random_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with --cuda")
    device = torch.device("cuda" if args.cuda else "cpu")

    corpus = data.Corpus(args.data)
    ntokens = len(corpus.dictionary)
    word2idx = corpus.dictionary.word2idx
    idx2word = corpus.dictionary.idx2word
    args.vocab_size = len(word2idx)
    print('loaded dictionary')

    if args.model == 'Transformer':
        model = TransformerModel(
            ntokens, args.emsize, args.nhead, args.nhid, args.nlayers,
            args.dropout).to(device)
    else:
        model = RNNModel(
            args.model, ntokens, args.emsize, args.nhid, args.nlayers,
            args.dropout, args.tied).to(device)
    checkpoint = torch.load(args.checkpoint)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    is_transformer_model = hasattr(
        model, 'model_type') and model.model_type == 'Transformer'
    print('loaded model')

    input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

    # use only the most common sentence-initial words from the training corpus
    # as starting words (heuristic from the baseline)
    most_common_first_words_ids = [
        i[0] for i in Counter(corpus.train.tolist()).most_common()
        if idx2word[i[0]][0].isupper()][:200]
    # most_common_first_words = [corpus.dictionary.idx2word[i]
    #                            for i in most_common_first_words_ids]

    # private message (binary code)
    bit_stream = open(args.bit_stream_path, 'r').readline()
    outfile = open(args.save_path + 'generated' + str(args.bit_num) + '_bit.txt', 'w')
    bitfile = open(args.save_path + 'bitfile_' + str(args.bit_num) + '_bit.txt', 'w')
    bit_index = random.randint(0, len(word2idx))
    soft = torch.nn.Softmax(0)

    for uter_id, uter in tqdm.tqdm(
            enumerate(range(args.utterances_to_generate))):
        # with torch.no_grad():  # no tracking history
        input_ = torch.LongTensor([random.choice(
            most_common_first_words_ids)]).unsqueeze(0).to(device)
        if not is_transformer_model:
            hidden = model.init_hidden(1)
            output, hidden = model(input_, hidden)
        gen = np.random.choice(len(corpus.dictionary), 1,
                               p=np.array(soft(output.reshape(-1)).tolist()) /
                               sum(soft(output.reshape(-1)).tolist()))[0]
        gen_res = list()
        gen_res.append(idx2word[gen])
        bit = ""
        for word_id, word in enumerate(range(args.len_of_generation - 2)):
            if is_transformer_model:
                raise NotImplementedError  # steganographic generation is only implemented for the RNN models
            else:
                output, hidden = model(input_, hidden)
            p = output.reshape(-1)
            sorted_, indices = torch.sort(p, descending=True)
            words_prob = [(j, i) for i, j in
                          zip(sorted_[:2**int(args.bit_num)].tolist(),
                              indices[:2**int(args.bit_num)].tolist())]
            # build a Huffman code over the top-2^bit_num candidate words
            nodes = createNodes([item[1] for item in words_prob])
            root = createHuffmanTree(nodes)
            codes = huffmanEncoding(nodes, root)
            for i in range(2**int(args.bit_num)):
                if bit_stream[bit_index:bit_index + i + 1] in codes:
                    code_index = codes.index(
                        bit_stream[bit_index:bit_index + i + 1])
                    gen = words_prob[code_index][0]
                    test_data = np.int32(gen)
                    gen_res.append(idx2word[gen])
                    if idx2word[gen] in ['\n', '', "<eos>"]:
                        break
                    bit += bit_stream[bit_index: bit_index + i + 1]
                    bit_index = bit_index + i + 1
                    break
        gen_sen = ' '.join(
            [word for word in gen_res if word not in ["\n", "", "<eos>"]])
        outfile.write(gen_sen + "\n")
        bitfile.write(bit)
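# Hedged illustration (assumption): createNodes / createHuffmanTree /
# huffmanEncoding are project-local helpers that are not shown here. A compact
# stand-in that produces one prefix-free bit-string per candidate probability
# could look like this (hypothetical, not the original implementation):
import heapq
import itertools

def huffman_codes(probs):
    """Greedy Huffman coding over candidate word probabilities."""
    tie = itertools.count()  # tie-breaker so the heap never compares dicts
    heap = [(p, next(tie), {i: ""}) for i, p in enumerate(probs)]
    heapq.heapify(heap)
    while len(heap) > 1:
        p1, _, c1 = heapq.heappop(heap)
        p2, _, c2 = heapq.heappop(heap)
        merged = {i: "0" + c for i, c in c1.items()}
        merged.update({i: "1" + c for i, c in c2.items()})
        heapq.heappush(heap, (p1 + p2, next(tie), merged))
    return [heap[0][2][i] for i in range(len(probs))]

# Example: huffman_codes([0.5, 0.25, 0.125, 0.125])
# -> shorter codes for more probable words, e.g. ['0', '10', '110', '111']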
#     (1.0, 0.0),
#     (0.0, -1.0),
#     (0.0, 1.0),
#     (0.0, 2.0),
#     (0.0, 5.0),
# ]
a = args.alpha
b = args.beta

encoder = TransformerModel(unidirectional=False)
decoder = TransformerLMHeadModel()

logger.info(f"Start training of alpha={a} beta={b}")

states = torch.load("../TSP/TSP-best.th")
encoder.load_state_dict(states["encoder"])
decoder.load_state_dict(states["decoder"])

device = torch.device("cuda")
encoder = encoder.to(device)
decoder = decoder.to(device)

num_epochs = 10
num_gradients_accumulation = 1
num_train_optimization_steps = len(
    train_dataset) * num_epochs // batch_size // num_gradients_accumulation

param_optimizer = list(encoder.named_parameters()) + list(
    decoder.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
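# Hedged continuation sketch (assumption): the no_decay list above is normally
# used to split parameters into weight-decayed and non-decayed groups before
# building the optimizer; the decay value and optimizer choice below are
# hypothetical, not taken from the original script.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
# e.g. optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-5)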
import torch

from dataset import TedDataset
from tqdm import tqdm

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load tokenizers
de_tokenizer = WordpieceTokenizer('de').load_model()
en_tokenizer = WordpieceTokenizer('en').load_model()

model = TransformerModel(d_model=512,
                         num_heads=8,
                         num_encoders=6,
                         num_decoders=6,
                         in_vocab_size=len(de_tokenizer),
                         out_vocab_size=len(en_tokenizer)).to(device)
model.load_state_dict(torch.load("./outputs/model-epoch10.pt"))
model.eval()


def translate(inputs):
    input_len = len(inputs)
    inputs = torch.tensor([
        de_tokenizer.transform(input, max_length=50) for input in inputs
    ]).cuda()
    outputs = torch.tensor([[2]] * input_len).cuda()  # 2 means SOS token
    for i in range(50):
        prediction = model(inputs, outputs)
        prediction = torch.argmax(prediction, dim=-1)[:, -1]  # get final token
        outputs = torch.cat((outputs, prediction.view(-1, 1)), dim=-1)
    outputs = outputs.tolist()
    cleanoutput = []
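    # Hedged completion (assumption): the snippet ends here in the source. One
    # plausible way to finish is to cut each hypothesis at the first EOS token
    # and detokenize; the EOS id (3) and the decode() method name are
    # hypothetical, not taken from the original WordpieceTokenizer.
    EOS_ID = 3
    for out in outputs:
        out = out[1:]                      # drop the leading SOS token
        if EOS_ID in out:
            out = out[:out.index(EOS_ID)]  # keep only tokens before EOS
        cleanoutput.append(en_tokenizer.decode(out))
    return cleanoutput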
def main():
    # from pathlib import Path
    # print("File Path:", Path(__file__).absolute())
    # print("Directory Path:", Path().absolute())

    args = get_args()
    args.n_gpu = 1

    # noisy_sents_1 = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
    # clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))
    # noisy_sents_2 = read_strings(os.path.join(args.data_dir, "train_data", "train_corpus"))
    #
    # noisy_sents = noisy_sents_1 + noisy_sents_2
    # noise_space_ratio = []
    #
    # for sentence in noisy_sents:
    #     noise_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # clean_space_ratio = []
    # for sentence in clean_sents:
    #     clean_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # print("noise_space_ratio: {}, clean_space_ratio: {}".format(sum(noise_space_ratio) / len(noise_space_ratio),
    #                                                             sum(clean_space_ratio) / len(clean_space_ratio)))

    # ##########
    # ## for local
    # args.num_workers = 0
    # args.train_batch_size = 4
    # args.eval_batch_size = 4
    # args.eval_interval = 10
    # ##########

    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    if args.load_vocab != "":
        tokenizer.load(args.load_vocab)
        args.vocab_size = tokenizer.__len__()

    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    eos_setting = args.eos_setting
    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode != 'test' and args.averaging != "":
        sess = 't0005/rush1-3/37'
        checkpoints = ["4500", "6500", "7500", "8000"]

        nsml.load(checkpoint=checkpoints[0], session=sess)
        args.vocab_size = tokenizer.__len__()
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        params = model.named_parameters()
        new_dict_params = dict(params)

        for checkpoint in checkpoints:
            bind_nsml(model, tokenizer, args, eos=eos_setting)
            nsml.load(checkpoint=checkpoint, session=sess)
            for name, param in params:
                new_dict_params[name] += param / len(checkpoints)

        model.load_state_dict(new_dict_params, strict=False)
        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.save('best')

    elif args.mode == 'eval':
        print("I'm in EVAL")

        checkpoint = 'best'
        sess = 't0005/rush1-3/507'
        nsml.load(checkpoint=checkpoint, session=sess)
        args.vocab_size = tokenizer.__len__()

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        model.eval()
        # noisy_sents = open("./naver_data_clean.txt", "r", encoding='utf-8').read().splitlines()
        noisy_sents = read_strings(
            os.path.join(args.data_dir, "train_data", "train_corpus"))
        valid_noisy = noisy_sents[:1000]

        prediction = correct_beam(model, tokenizer, valid_noisy, args,
                                  eos=True, length_limit=0.15)

        for i, pred in enumerate(prediction[:1000]):
            print("noisy_input: {}, pred: {}".format(valid_noisy[i], pred))

        # bind_txt(prediction)
        # nsml.save('prediction')

        # with open('naver_data_clean_again.txt', 'w', encoding='utf-8') as f:
        #     for i, pred in enumerate(prediction):
        #         if i % 500 == 0:
        #             print(i)
        #         f.write("%s\n" % pred)

    ## only works with the char tokenizer
    ## TODO: kobert tokenizer, different vocab size if needed
    elif args.mode != 'test' and args.resubmit != "":
        checkpoint = 'best'
        sess = 't0005/rush1-3/' + args.resubmit
        print(sess)

        model = None
        tokenizer = CharTokenizer([])
        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        args.vocab_size = len(tokenizer)
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        bind_nsml(model, tokenizer, args, eos=eos_setting)

        ########## testing loaded model & tokenizer ###############
        # model.eval()
        # noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
        # valid_noisy = noisy_sents[-10:]
        #
        # prediction = correct(model, tokenizer, valid_noisy, args, eos=True, length_limit=0.1)
        #
        # for pred in prediction:
        #     print(pred)
        ##################

        nsml.save("best")

    else:
        # train_data, valid_data = None, None
        if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
            if args.mode == "train":
                # noisy_sents = open("./noisy_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # clean_sents = open("./clean_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # sents_annotation = ['None'] * len(noisy_sents)
                noisy_sents = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_annotation"))
                clean_sents = read_strings(
                    os.path.join(args.data_dir, "train_label"))

            if args.mode == "semi-train":
                noisy_sents = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_annotation"))
                clean_sents = read_strings(
                    os.path.join(args.data_dir, "train_label"))

                checkpoint = 'generated_data'
                sess = 't0005/rush1-1/' + str(args.semi_dataset)
                # five copy
                # sess = 't0005/rush1-1/209'
                # one copy
                # sess = 't0005/rush1-1/224'
                semi_noisy_sents, semi_clean_sents = load_generated_data(
                    checkpoint=checkpoint, session=sess)
                semi_sents_annotation = ['None'] * len(semi_noisy_sents)

            if args.mode == "pretrain":
                print("PRETRAIN MODE ON!!")
                noisy_sents = read_strings(
                    os.path.join('sejong_corpus', args.noisy_file))
                clean_sents = read_strings(
                    os.path.join('sejong_corpus', args.clean_file))
                # checkpoint = 'generated_data'
                # sess = 't0005/rush1-1/113'
                # noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
                sents_annotation = ['None'] * len(noisy_sents)
            error_type_counter = Counter()

            for annotation in sents_annotation:
                error_type_counter += Counter(annotation.split(','))

            print(error_type_counter)

            # version with noise cleaning
            # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean}
            #          for noisy, clean in zip(noisy_sents, clean_sents)]

            # original version
            if args.mode == "semi-train":
                pairs = [{
                    "noisy": noisy,
                    "clean": clean,
                    "annotation": annot
                } for noisy, clean, annot in zip(
                    noisy_sents, clean_sents, sents_annotation)]
                semi_pairs = [{
                    "noisy": noisy,
                    "clean": clean,
                    "annotation": annot
                } for noisy, clean, annot in zip(
                    semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

                train_data = pairs[:-args.num_val_data] + semi_pairs
                valid_data = pairs[-args.num_val_data:]
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")

                train_sents = [x['noisy'] for x in train_data
                               ] + [x['clean'] for x in train_data]
                tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
                bind_nsml(model, tokenizer, args, eos=eos_setting)

            else:
                pairs = [{
                    "noisy": noisy,
                    "clean": clean,
                    "annotation": annot
                } for noisy, clean, annot in zip(
                    noisy_sents, clean_sents, sents_annotation)]

                train_data, valid_data = train_test_split(
                    pairs, test_size=args.val_ratio,
                    random_state=args.seed)  # test (validation) split: about 1000
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")
                # print("validation: ", valid_data)

                train_sents = [x['noisy'] for x in train_data
                               ] + [x['clean'] for x in train_data]
                # train_sents = [x['clean'] for x in train_data]

            if args.load_model != "" and args.mode == "train":  # load pretrained model
                print("load pretrained model")
                model.load_state_dict(
                    torch.load(args.load_model, map_location=args.device))

                if args.freeze:
                    model.token_embeddings.weight.requires_grad = False
                    model.decoder_embeddings.weight.requires_grad = False

            if args.tokenizer == 'char' and args.load_vocab == "":
                tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
                print(f'tokenizer loaded from strings. len={len(tokenizer)}.')

            bind_nsml(model, tokenizer, args, eos=eos_setting)

            if args.tokenizer == 'char' and tokenizer is not None:
                tokenizer.save('vocab.txt')

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model, dim=1)

        if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
            train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)
def predict(dn, rn):
    dir_name_format = "../data/{dn}-{rn}-raw"
    dir_name = dir_name_format.format(dn=dn, rn=rn)
    input_path = os.path.join(dir_name, "src-test.txt")
    if not os.path.isfile(input_path):
        print(f"File {input_path} does not exist.")
        return
    output_filename = f"prediction-{dn}-{rn}.txt"
    output_path = os.path.join(outputDir, output_filename)
    if os.path.isfile(output_path):
        print(f"File {output_path} already exists.")
        return

    # preprocess: map each source sequence to token indices
    preprocess = IndexedInputTargetTranslationDataset.preprocess(source_dictionary)
    # postprocess: map output indices back to a sentence
    postprocess = lambda x: ''.join(
        [token for token in target_dictionary.tokenize_indexes(x)
         if token != END_TOKEN and token != START_TOKEN and token != PAD_TOKEN])
    device = torch.device(f'cuda:{args.device}'
                          if torch.cuda.is_available() and not args.no_cuda else 'cpu')

    print('Building model...')
    model = TransformerModel(source_dictionary.vocabulary_size,
                             target_dictionary.vocabulary_size,
                             config['d_model'],
                             config['nhead'],
                             config['nhid'],
                             config['nlayers'])
    model.eval()
    checkpoint_filepath = checkpoint_path
    checkpoint = torch.load(checkpoint_filepath, map_location='cpu')
    model.load_state_dict(checkpoint)

    translator = Translator(
        model=model,
        beam_size=args.beam_size,
        max_seq_len=args.max_seq_len,
        trg_bos_idx=target_dictionary.token_to_index(START_TOKEN),
        trg_eos_idx=target_dictionary.token_to_index(END_TOKEN)
    ).to(device)

    from utils.pipe import PAD_INDEX

    def pad_src(batch):
        sources_lengths = [len(sources) for sources in batch]
        sources_max_length = max(sources_lengths)
        sources_padded = [sources + [PAD_INDEX] * (sources_max_length - len(sources))
                          for sources in batch]
        sources_tensor = torch.tensor(sources_padded)
        return sources_tensor

    def process(seq):
        seq = seq.strip()

        def is_proof(name):
            return name.count("balance") > 0 or name.count("one") > 0

        if is_proof(data_name) and not is_proof(dn):
            seq += ",$,1"
            global is_proof_process
            if is_proof_process:
                print("processing")
                is_proof_process = False
        return seq

    batch_size = args.bs
    print(f"Output to {output_path}:")
    with open(output_path, 'w', encoding='utf-8') as outFile:
        with open(input_path, 'r', encoding='utf-8') as inFile:
            seqs = []
            for seq in tqdm(inFile):
                seq = process(seq)
                src_seq = preprocess(seq)
                seqs.append(src_seq)
                if len(seqs) >= batch_size:
                    pred_seq = translator.translate_sentence(pad_src(seqs).to(device))
                    pred_line = [postprocess(pred) for pred in pred_seq]
                    # print(pred_line)
                    outFile.writelines([p.strip() + '\n' for p in pred_line])
                    seqs.clear()
                # endif
            # endfor
            if seqs:  # last batch
                pred_seq = translator.translate_sentence(pad_src(seqs).to(device))
                pred_line = [postprocess(pred).replace(START_TOKEN, '').replace(END_TOKEN, '')
                             for pred in pred_seq]
                # print(pred_line)
                outFile.writelines([p.strip() + '\n' for p in pred_line])
                seqs.clear()
        # endwith
    # endwith
    print(f'[Info] {input_path} Finished.')
        if epoch_loss_val < best_loss:
            best_loss = epoch_loss_val
            model_name = "model/model_{0:.5f}.pt".format(epoch_loss_val)
            save(model.state_dict(), model_name)
    return model_name


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='A PyTorch Transformer Language Model for Predicting Odd Numbers')
    parser.add_argument('--test_model', type=str, help='the model file to load')
    parser.add_argument('--train_model', type=str, help='the model file to load')
    args = parser.parse_args()
    hidden = 128
    nlayers = 2
    if args.test_model is None:
        if args.train_model is not None:
            model_name = main(args.train_model, hidden=hidden, nlayers=nlayers)
        else:
            model_name = main(hidden=hidden, nlayers=nlayers)
    else:
        model_name = args.test_model
    model = TransformerModel(10000, 10000, hidden=hidden, nlayers=nlayers)
    model.load_state_dict(load(model_name))
    test(model, test_times=10)
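# Hedged sketch (assumption): test() is defined elsewhere in this project. For
# the odd-number toy task it presumably picks random even input sequences and
# greedily decodes the odd continuation; the sequence length, start token id,
# and return format below are hypothetical.
import random
import torch

def test_sketch(model, test_times=10, max_len=3, voc_size=10000):
    model.eval()
    results = []
    with torch.no_grad():
        for _ in range(test_times):
            start = random.randrange(2, voc_size - 2 * max_len, 2)
            src = torch.tensor([[start + 2 * i] for i in range(max_len)])
            pred = [0]  # assumed start-of-sequence token
            for _ in range(max_len):
                inp = torch.tensor(pred).unsqueeze(1)
                out = model(src, inp)
                pred.append(int(out.argmax(2)[-1].item()))
            results.append((src.squeeze(1).tolist(), pred[1:]))
    return results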
def main():
    ### settings
    args = set_args()
    save_path = args.save_path
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    logger.info(args)

    ### prepare data
    train_dataset = COCOMultiLabel(args, train=True, image_path=args.image_path)
    test_dataset = COCOMultiLabel(args, train=False, image_path=args.image_path)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers,
                              pin_memory=True,
                              shuffle=True,
                              drop_last=True,
                              collate_fn=my_collate)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             num_workers=args.num_workers,
                             pin_memory=True,
                             shuffle=False,
                             drop_last=False,
                             collate_fn=my_collate)

    ## prepare models
    encoder = CNN_Encoder().cuda()
    decoder = TransformerModel(args).cuda()

    ## load different weights for training or evaluation-only
    if args.use_eval:
        weights_dic = torch.load(args.use_model)
        encoder.load_state_dict(
            convert_weights(weights_dic['encoder_state_dict']))
        decoder.load_state_dict(
            convert_weights(weights_dic['decoder_state_dict']))
    else:
        encoder.load_state_dict(
            convert_weights(torch.load(args.encoder_weights)))

    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=args.encoder_lr)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=args.decoder_lr)

    ## whether to use DataParallel
    if torch.cuda.device_count() > 1:
        encoder = nn.DataParallel(encoder)
        decoder = nn.DataParallel(decoder)

    ## set hinge loss function
    loss_hinge = torch.nn.HingeEmbeddingLoss(margin=args.C,
                                             size_average=None,
                                             reduce=None,
                                             reduction='mean')

    ## if only evaluation, return
    if args.use_eval:
        f1 = test(args, encoder, decoder, test_loader, args.threshold, 1)
        return

    ## training stage
    highest_f1 = 0
    epochs_without_improve = 0
    for epoch in range(args.epochs):
        ## train and test
        train(args, encoder, decoder, train_loader, encoder_optimizer,
              decoder_optimizer, epoch, loss_hinge)
        f1 = test(args, encoder, decoder, test_loader, args.threshold, epoch)

        ### save parameters
        save_dict = {
            'encoder_state_dict': encoder.state_dict(),
            'decoder_state_dict': decoder.state_dict(),
            'epoch': epoch,
            'f1': f1,
            'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
            'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
            'epochs_without_improve': epochs_without_improve
        }

        ### save models
        torch.save(save_dict,
                   args.save_path + "/checkpoint_" + timestr + '.pt.tar')
        if f1 > highest_f1:
            torch.save(
                save_dict,
                args.save_path + "/BEST_checkpoint_" + timestr + '.pt.tar')
            logger.info("Now the highest f1 is {}, it was {}".format(
                100 * f1, 100 * highest_f1))
            highest_f1 = f1
            epochs_without_improve = 0
        else:
            epochs_without_improve += 1
            if epochs_without_improve == 3:
                adjust_learning_rate(decoder_optimizer, args.coeff)
                adjust_learning_rate(encoder_optimizer, args.coeff)
                epochs_without_improve = 0  # reset the counter after decaying the learning rate
parser.add_argument('--pretrain_emb_path', type=str, default=hp.pretrain_emb_path)
parser.add_argument('--pretrain_cnn_path', type=str, default=hp.pretrain_cnn_path)
parser.add_argument('--pretrain_model_path', type=str, default=hp.pretrain_model_path)

args = parser.parse_args()
# mirror the parsed arguments onto the global hyper-parameter object
for k, v in vars(args).items():
    setattr(hp, k, v)

pretrain_emb = align_word_embedding(hp.word_dict_pickle_path, hp.pretrain_emb_path,
                                    hp.ntoken, hp.nhid) if hp.load_pretrain_emb else None
pretrain_cnn = torch.load(hp.pretrain_cnn_path) if hp.load_pretrain_cnn else None

model = TransformerModel(hp.ntoken, hp.ninp, hp.nhead, hp.nhid, hp.nlayers,
                         hp.batch_size, dropout=0.2, pretrain_cnn=pretrain_cnn,
                         pretrain_emb=pretrain_emb,
                         freeze_cnn=hp.freeze_cnn).to(device)

if hp.load_pretrain_model:
    model.load_state_dict(torch.load(hp.pretrain_model_path))

optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                             lr=hp.lr, weight_decay=1e-6)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, hp.scheduler_decay)

if hp.label_smoothing:
    criterion = LabelSmoothingLoss(hp.ntoken, smoothing=0.1)
else:
    criterion = nn.CrossEntropyLoss(ignore_index=hp.ntoken - 1)

now_time = str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime(time.time())))
log_dir = 'models/{name}'.format(name=hp.name)
writer = SummaryWriter(log_dir=log_dir)
log_path = os.path.join(log_dir, 'train.log')