def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M")

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    # train_data, valid_data = None, None
    if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
        if args.mode == "train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

        if args.mode == "semi-train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/' + str(args.semi_dataset)
            # five copies
            # sess = 't0005/rush1-1/209'
            # one copy
            # sess = 't0005/rush1-1/224'
            semi_noisy_sents, semi_clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            semi_sents_annotation = ['None'] * len(semi_noisy_sents)

        if args.mode == "pretrain":
            print("PRETRAIN MODE ON!!")
            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/113'
            noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            sents_annotation = ['None'] * len(noisy_sents)

        error_type_counter = Counter()
        for annotation in sents_annotation:
            error_type_counter += Counter(annotation.split(','))
        print(error_type_counter)

        # noise-cleaning version
        # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]

        # original version
        if args.mode == "semi-train":
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                     for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]
            semi_pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                          for noisy, clean, annot in zip(semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

            train_data = pairs[:-args.num_val_data] + semi_pairs
            valid_data = pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)
        else:
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                     for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

            train_data, valid_data = pairs[:-args.num_val_data], pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            # tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)

            # to load a pretrained model
            nsml.load(checkpoint='best', session='t0005/rush1-2/79')
            # print(tokenizer.vocab)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
        train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)
from data_loader import read_strings
import os
from nsml import DATASET_PATH
import random

# noise(), save_generated_data() and load_generated_data() are project helpers defined elsewhere in the repo.

data_dir = os.path.join(DATASET_PATH, 'train')

# How many copies of the data to generate, i.e. how many times noise is re-applied to the clean train_label data.
num_copy = 5

clean_sents = read_strings(os.path.join(data_dir, "train_label")) * num_copy
noisy_sents = noise(clean_sents)

shuffle_idxs = list(range(len(clean_sents)))
random.shuffle(shuffle_idxs)

noise_sents_shuf = []
clean_sents_shuf = []
for i in shuffle_idxs:
    noise_sents_shuf.append(noisy_sents[i])
    clean_sents_shuf.append(clean_sents[i])

save_generated_data(noise_sents_shuf, clean_sents_shuf)

# Put the name of the currently running session here, 't0005/rush1-1/(session number)',
# to check that the dataset generated at that location loads back correctly.
checkpoint = 'generated_data'
sess = 't0005/rush1-1/230'
loaded_noisy_sents, loaded_clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)

print(loaded_noisy_sents[:10])
print(loaded_clean_sents[:10])
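# save_generated_data() and load_generated_data() are not defined in this script. Below is a minimal
# sketch of one way they could work, assuming the standard NSML bind pattern (nsml.bind(save=..., load=...)
# with callbacks that receive a checkpoint directory). The file names, the _bind_generated_data helper,
# and the exact behavior are assumptions for illustration, not the project's actual implementation.
import os
import nsml


def _bind_generated_data(buffer):
    # buffer: dict holding the 'noisy' and 'clean' sentence lists, shared by the save/load callbacks.
    def save(dir_name, *args, **kwargs):
        # One sentence per line, noisy and clean kept in separate files inside the checkpoint directory.
        with open(os.path.join(dir_name, 'noisy.txt'), 'w', encoding='utf-8') as f:
            f.write('\n'.join(buffer['noisy']))
        with open(os.path.join(dir_name, 'clean.txt'), 'w', encoding='utf-8') as f:
            f.write('\n'.join(buffer['clean']))

    def load(dir_name, *args, **kwargs):
        with open(os.path.join(dir_name, 'noisy.txt'), 'r', encoding='utf-8') as f:
            buffer['noisy'] = f.read().splitlines()
        with open(os.path.join(dir_name, 'clean.txt'), 'r', encoding='utf-8') as f:
            buffer['clean'] = f.read().splitlines()

    nsml.bind(save=save, load=load)


def save_generated_data(noisy_sents, clean_sents):
    # Save the generated pairs as an NSML checkpoint named 'generated_data' in the current session.
    _bind_generated_data({'noisy': noisy_sents, 'clean': clean_sents})
    nsml.save('generated_data')


def load_generated_data(checkpoint, session):
    # Load the pairs back from a checkpoint of another session, e.g. 't0005/rush1-1/230'.
    buffer = {}
    _bind_generated_data(buffer)
    nsml.load(checkpoint=checkpoint, session=session)
    return buffer['noisy'], buffer['clean']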
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    tokenizer = CharTokenizer([])

    configuration = BertConfig(vocab_size=args.vocab_size)
    model = BertForMaskedLM(configuration).to(args.device)
    '''
    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    '''
    logger.info(f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M")

    bind_nsml(model, tokenizer, args)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode == "train" or args.mode == "pretrain":
        if args.mode == "train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

        if args.mode == "pretrain":
            print("PRETRAIN MODE ON!!")
            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/113'
            noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            sents_annotation = ['None'] * len(noisy_sents)

        error_type_counter = Counter()
        for annotation in sents_annotation:
            error_type_counter += Counter(annotation.split(','))
        print(error_type_counter)

        # noise-cleaning version
        # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]

        # original version
        pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                 for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

        train_data, valid_data = pairs[:-args.num_val_data], pairs[-args.num_val_data:]
        logger.info(f"# of train data: {len(train_data)}")
        logger.info(f"# of valid data: {len(valid_data)}")

        train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
        tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
        bind_nsml(model, tokenizer, args)

    # to load a pretrained model
    # nsml.load(checkpoint='best', session='t0005/rush1-1/177')

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train" or args.mode == "pretrain":
        train(model, tokenizer, train_data, valid_data, args)
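# The train() routine used above is defined elsewhere in the repo. The snippet below is only a hedged
# sketch of how masked-LM inputs for BertForMaskedLM could be built from character-token ids (mask a
# fraction of positions and ignore the rest in the loss); make_mlm_batch, the mask-token id handling
# and the masking ratio are illustrative assumptions, not the project's actual training code.
import torch


def make_mlm_batch(token_ids, mask_token_id, mask_prob=0.15, ignore_index=-100):
    # token_ids: LongTensor of shape (batch, seq_len). Returns (input_ids, labels) for BertForMaskedLM.
    labels = token_ids.clone()
    masked = torch.rand(token_ids.shape) < mask_prob  # sample the positions to mask
    labels[~masked] = ignore_index                    # unmasked positions do not contribute to the loss
    input_ids = token_ids.clone()
    input_ids[masked] = mask_token_id
    return input_ids, labels

# With a recent transformers version, the batch would then be consumed as
#   loss = model(input_ids=input_ids, labels=labels).loss
# before the usual backward/optimizer step.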
def main():
    # from pathlib import Path
    # print("File Path:", Path(__file__).absolute())
    # print("Directory Path:", Path().absolute())
    args = get_args()
    args.n_gpu = 1

    # noisy_sents_1 = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
    # clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))
    # noisy_sents_2 = read_strings(os.path.join(args.data_dir, "train_data", "train_corpus"))
    #
    # noisy_sents = noisy_sents_1 + noisy_sents_2
    # noise_space_ratio = []
    #
    # for sentence in noisy_sents:
    #     noise_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # clean_space_ratio = []
    # for sentence in clean_sents:
    #     clean_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # print("noise_space_ratio: {}, clean_space_ratio: {}".format(sum(noise_space_ratio) / len(noise_space_ratio),
    #                                                             sum(clean_space_ratio) / len(clean_space_ratio)))

    # ##########
    # ## for local runs
    # args.num_workers = 0
    # args.train_batch_size = 4
    # args.eval_batch_size = 4
    # args.eval_interval = 10
    # ##########

    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    if args.load_vocab != "":
        tokenizer.load(args.load_vocab)
        args.vocab_size = len(tokenizer)

    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M")

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode != 'test' and args.averaging != "":
        # Average the weights of several checkpoints from one session into a single model.
        sess = 't0005/rush1-3/37'
        checkpoints = ["4500", "6500", "7500", "8000"]

        nsml.load(checkpoint=checkpoints[0], session=sess)
        args.vocab_size = len(tokenizer)
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        # Accumulate the running average of every parameter over the listed checkpoints.
        new_dict_params = {name: torch.zeros_like(param) for name, param in model.named_parameters()}

        for checkpoint in checkpoints:
            bind_nsml(model, tokenizer, args, eos=eos_setting)
            nsml.load(checkpoint=checkpoint, session=sess)
            for name, param in model.named_parameters():
                new_dict_params[name] += param.data / len(checkpoints)

        model.load_state_dict(new_dict_params, strict=False)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.save('best')

    elif args.mode == 'eval':
        print("I'm in EVAL")

        checkpoint = 'best'
        sess = 't0005/rush1-3/507'
        nsml.load(checkpoint=checkpoint, session=sess)
        args.vocab_size = len(tokenizer)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)
        model.eval()

        # noisy_sents = open("./naver_data_clean.txt", "r", encoding='utf-8').read().splitlines()
        noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_corpus"))
        valid_noisy = noisy_sents[:1000]

        prediction = correct_beam(model, tokenizer, valid_noisy, args, eos=True, length_limit=0.15)

        for i, pred in enumerate(prediction[:1000]):
            print("noisy_input: {}, pred: {}".format(valid_noisy[i], pred))

        # bind_txt(prediction)
        # nsml.save('prediction')

        # with open('naver_data_clean_again.txt', 'w', encoding='utf-8') as f:
        #     for i, pred in enumerate(prediction):
        #         if i % 500 == 0:
        #             print(i)
        #         f.write("%s\n" % pred)

    # Only works with the char tokenizer.
    # TODO: handle the kobert tokenizer, which needs a different vocab size.
    elif args.mode != 'test' and args.resubmit != "":
        checkpoint = 'best'
        sess = 't0005/rush1-3/' + args.resubmit
        print(sess)

        model = None
        tokenizer = CharTokenizer([])
        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        args.vocab_size = len(tokenizer)
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)
        bind_nsml(model, tokenizer, args, eos=eos_setting)

        ########## testing the loaded model & tokenizer ##########
        # model.eval()
        # noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
        # valid_noisy = noisy_sents[-10:]
        #
        # prediction = correct(model, tokenizer, valid_noisy, args, eos=True, length_limit=0.1)
        #
        # for pred in prediction:
        #     print(pred)
        ##########################################################

        nsml.save("best")

    else:
        # train_data, valid_data = None, None
        if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
            if args.mode == "train":
                # noisy_sents = open("./noisy_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # clean_sents = open("./clean_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # sents_annotation = ['None'] * len(noisy_sents)
                noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
                clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

            if args.mode == "semi-train":
                noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
                clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

                checkpoint = 'generated_data'
                sess = 't0005/rush1-1/' + str(args.semi_dataset)
                # five copies
                # sess = 't0005/rush1-1/209'
                # one copy
                # sess = 't0005/rush1-1/224'
                semi_noisy_sents, semi_clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
                semi_sents_annotation = ['None'] * len(semi_noisy_sents)

            if args.mode == "pretrain":
                print("PRETRAIN MODE ON!!")
                noisy_sents = read_strings(os.path.join('sejong_corpus', args.noisy_file))
                clean_sents = read_strings(os.path.join('sejong_corpus', args.clean_file))
                # checkpoint = 'generated_data'
                # sess = 't0005/rush1-1/113'
                # noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
                sents_annotation = ['None'] * len(noisy_sents)

            error_type_counter = Counter()
            for annotation in sents_annotation:
                error_type_counter += Counter(annotation.split(','))
            print(error_type_counter)

            # noise-cleaning version
            # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]

            # original version
            if args.mode == "semi-train":
                pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                         for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]
                semi_pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                              for noisy, clean, annot in zip(semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

                train_data = pairs[:-args.num_val_data] + semi_pairs
                valid_data = pairs[-args.num_val_data:]
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")

                train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
                tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
                bind_nsml(model, tokenizer, args, eos=eos_setting)
            else:
                pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                         for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

                train_data, valid_data = train_test_split(pairs, test_size=args.val_ratio,
                                                          random_state=args.seed)  # validation set: about 1000 examples
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")
                # print("validation: ", valid_data)

                train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
                # train_sents = [x['clean'] for x in train_data]

                if args.load_model != "" and args.mode == "train":  # load a pretrained model
                    print("load pretrained model")
                    model.load_state_dict(torch.load(args.load_model, map_location=args.device))

                    if args.freeze:
                        model.token_embeddings.weight.requires_grad = False
                        model.decoder_embeddings.weight.requires_grad = False

                if args.tokenizer == 'char' and args.load_vocab == "":
                    tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
                    print(f'tokenizer loaded from strings. len={len(tokenizer)}.')
                    bind_nsml(model, tokenizer, args, eos=eos_setting)

                if args.tokenizer == 'char' and tokenizer is not None:
                    tokenizer.save('vocab.txt')

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model, dim=1)

        if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
            train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)
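# CharTokenizer is defined elsewhere in the repo; the sketch below is a minimal, hypothetical
# character-level tokenizer meant only to illustrate the interface the scripts above rely on
# (construction from a vocab list, from_strings(), len(), save(), load()). The special tokens and
# the vocab-file format are assumptions, not the project's actual implementation.
from collections import Counter


class CharTokenizer:
    def __init__(self, vocab):
        # vocab: list of characters; <pad> and <unk> are prepended as special tokens in this sketch.
        self.vocab = ['<pad>', '<unk>'] + list(vocab)
        self.char2idx = {c: i for i, c in enumerate(self.vocab)}

    def __len__(self):
        return len(self.vocab)

    @classmethod
    def from_strings(cls, strings, vocab_size):
        # Keep the most frequent characters, up to vocab_size minus the special tokens.
        counter = Counter(ch for s in strings for ch in s)
        most_common = [c for c, _ in counter.most_common(max(vocab_size - 2, 0))]
        return cls(most_common)

    def __call__(self, text):
        # Map characters to ids, falling back to <unk> for out-of-vocabulary characters.
        return [self.char2idx.get(ch, self.char2idx['<unk>']) for ch in text]

    def save(self, path):
        with open(path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(self.vocab))

    def load(self, path):
        with open(path, 'r', encoding='utf-8') as f:
            self.vocab = f.read().splitlines()
        self.char2idx = {c: i for i, c in enumerate(self.vocab)}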