def main():
    args = get_args()
    args.n_gpu = 1
    set_seed(args)

    # Construct tokenizer
    tokenizer = CharTokenizer([])
    tokenizer.load(args.load_vocab)
    args.vocab_size = len(tokenizer)
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    # GPU setting
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Construct model
    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    # Load data
    noisy_sents = read_strings(os.path.join('sejong_corpus', args.noisy_file))
    clean_sents = read_strings(os.path.join('sejong_corpus', args.clean_file))
    sents_annotation = ['None'] * len(noisy_sents)
    pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
             for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

    # Train-validation split
    train_data, valid_data = train_test_split(
        pairs, test_size=args.val_ratio, random_state=args.seed)  # test: about 1000
    logger.info(f"# of train data: {len(train_data)}")
    logger.info(f"# of valid data: {len(valid_data)}")

    train(model, tokenizer, train_data, valid_data, args, eos=args.eos_setting)
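# The scripts in this file only exercise a small CharTokenizer surface:
# CharTokenizer([...]), .load(path), .save(path), len(tokenizer), and
# CharTokenizer.from_strings(strings, vocab_size). The class below is a
# minimal illustrative sketch of that interface, NOT the project's actual
# implementation; the class name and vocab file format are assumptions.
from collections import Counter


class CharTokenizerSketch:
    PAD, UNK = "<pad>", "<unk>"

    def __init__(self, vocab):
        self.vocab = list(vocab) if vocab else [self.PAD, self.UNK]
        self.token_to_id = {tok: i for i, tok in enumerate(self.vocab)}

    def __len__(self):
        return len(self.vocab)

    @classmethod
    def from_strings(cls, strings, vocab_size):
        # Keep the most frequent characters, reserving two slots for PAD/UNK.
        counter = Counter(ch for s in strings for ch in s)
        chars = [ch for ch, _ in counter.most_common(max(vocab_size - 2, 0))]
        return cls([cls.PAD, cls.UNK] + chars)

    def save(self, path):
        with open(path, "w", encoding="utf-8") as f:
            f.write("\n".join(self.vocab))

    def load(self, path):
        with open(path, encoding="utf-8") as f:
            self.vocab = f.read().split("\n")
        self.token_to_id = {tok: i for i, tok in enumerate(self.vocab)}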
def read_langs(lang1, lang2, term="txt", reverse=False, char_output=False):
    print("Reading lines...")

    # Read the file and split into lines.
    # Attach the path here for the source and target language dataset;
    # this creates the file path which is used below.
    filename = '%s/train/%s-%s.%s' % (opt.main_data_dir, lang1, lang2, term)

    # lines contains the data in the form of a list
    lines = open(filename).read().strip().split('\n')

    # Split every line into pairs
    pairs = [[s for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        if char_output:
            output_lang = CharTokenizer(vocab_file='')
        else:
            output_lang = Lang(lang2)

    return input_lang, output_lang, pairs
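# Illustrative only: read_langs() above expects one "source<TAB>target" pair
# per line in %s/train/%s-%s.txt. A tiny in-memory example of that layout
# (the sentences themselves are made up):
sample_lines = [
    "je suis etudiant\ti am a student",
    "il a faim\the is hungry",
]
sample_pairs = [line.split("\t") for line in sample_lines]
print(sample_pairs)  # [['je suis etudiant', 'i am a student'], ['il a faim', 'he is hungry']]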
def __init__(self, args):
    super(ParallelTraining, self).__init__()
    if args.tokenizer == 'char':
        _tokenizer = CharTokenizer()
    else:
        print('use BPE 1000')
        _tokenizer = HuggingFaceTokenizer()  # use BPE-1000

    audio_feature = args.audio_feat
    if args.concat:
        audio_feature *= 3

    self.tokenizer = _tokenizer
    self.loss_fn = RNNTLoss(blank=0)
    self.model = Transducer(
        audio_feature,
        _tokenizer.vocab_size,
        args.vocab_dim,  # vocab embedding dim
        args.h_dim,      # hidden dim
        args.layers,
        pred_num_layers=args.pred_layers,
        dropout=args.dropout)
    self.latest_alignment = None
    self.steps = 0
    self.epoch = 0
    self.args = args
    self.best_wer = 1000
def main(args):
    # config
    config = TrainConfig(**args)
    config = config._replace(
        log_dir=config.log_dir.format(config.tokenizer),
        summary_dir=config.summary_dir.format(config.tokenizer),
        # checkpoint_dir=config.checkpoint_dir.format(config.tokenizer),
    )
    set_seed(config.seed)
    os.makedirs(config.log_dir, exist_ok=True)
    os.makedirs(config.summary_dir, exist_ok=True)
    # os.makedirs(config.checkpoint_dir, exist_ok=True)

    # logger
    logger = get_logger(log_path=os.path.join(config.log_dir, "logs.txt"))
    logger.info(config)

    # Create the basic modules (vocab, tokenizer)
    tokenizer_dir = os.path.join(config.resource_dir, config.tokenizer)
    logger.info(f"get vocab and tokenizer from {tokenizer_dir}")
    vocab = Vocab(os.path.join(tokenizer_dir, "tok.vocab"))
    if config.tokenizer.startswith("mecab-"):
        tokenizer = MeCabTokenizer(os.path.join(tokenizer_dir, "tok.json"))
    elif config.tokenizer.startswith("sp-"):
        tokenizer = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))
    elif config.tokenizer.startswith("mecab_sp-"):
        mecab = MeCabTokenizer(os.path.join(tokenizer_dir, "tok.json"))
        sp = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))
        tokenizer = MeCabSentencePieceTokenizer(mecab, sp)
    elif config.tokenizer.startswith("char-"):
        tokenizer = CharTokenizer()
    elif config.tokenizer.startswith("word-"):
        tokenizer = WordTokenizer()
    elif config.tokenizer.startswith("jamo-"):
        tokenizer = JamoTokenizer()
    else:
        raise ValueError("Wrong tokenizer name.")

    # Prepare the data to feed the model
    # label-to-index
    label_to_index = {"0": 0, "1": 1}
    # Train
    logger.info(f"read training data from {config.train_path}")
    train_sentence_as, train_sentence_bs, train_labels = load_data(
        config.train_path, label_to_index)
    # Dev
    logger.info(f"read dev data from {config.dev_path}")
    dev_sentence_as, dev_sentence_bs, dev_labels = load_data(
        config.dev_path, label_to_index)
    # Test
    logger.info(f"read test data from {config.test_path}")
    test_sentence_as, test_sentence_bs, test_labels = load_data(
        config.test_path, label_to_index)

    # Build dataloaders from the data
    # Train
    logger.info("create data loader using training data")
    train_dataset = PAWSDataset(train_sentence_as, train_sentence_bs, train_labels,
                                vocab, tokenizer, config.max_sequence_length)
    train_random_sampler = RandomSampler(train_dataset)
    train_data_loader = DataLoader(train_dataset,
                                   sampler=train_random_sampler,
                                   batch_size=config.batch_size)
    # Dev
    logger.info("create data loader using dev data")
    dev_dataset = PAWSDataset(dev_sentence_as, dev_sentence_bs, dev_labels,
                              vocab, tokenizer, config.max_sequence_length)
    dev_data_loader = DataLoader(dev_dataset, batch_size=1024)
    # Test
    logger.info("create data loader using test data")
    test_dataset = PAWSDataset(test_sentence_as, test_sentence_bs, test_labels,
                               vocab, tokenizer, config.max_sequence_length)
    test_data_loader = DataLoader(test_dataset, batch_size=1024)

    # Prepare the Summary Writer
    summary_writer = SummaryWriter(log_dir=config.summary_dir)

    # Prepare the model
    logger.info("initialize model and convert bert pretrained weight")
    bert_config = BertConfig.from_json_file(
        os.path.join(config.resource_dir, config.tokenizer, config.bert_config_file_name))
    model = PAWSModel(bert_config, config.dropout_prob)
    model.bert = load_pretrained_bert(
        bert_config,
        os.path.join(config.resource_dir, config.tokenizer, config.pretrained_bert_file_name))

    trainer = Trainer(config, model, train_data_loader, dev_data_loader,
                      test_data_loader, logger, summary_writer)
    trainer.train()
def main(cli_args):
    # Read from config file and make args
    with open("./tasks/korquad/config.json", "r") as f:
        args = AttrDict(json.load(f))
    args.seed = cli_args.seed
    args.tokenizer = cli_args.tokenizer
    args.output_dir = args.output_dir.format(args.tokenizer)
    args.resource_dir = cli_args.resource_dir
    args.data_dir = cli_args.data_dir
    logger.info(f"Training/evaluation parameters {args}")

    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    init_logger()
    set_seed(args.seed)
    logging.getLogger("transformers.data.metrics.squad_metrics").setLevel(
        logging.WARN)  # Reduce model loading logs

    # custom tokenizers
    tokenizer_dir = os.path.join(args.resource_dir, args.tokenizer)
    logger.info(f"get vocab and tokenizer from {tokenizer_dir}")
    if args.tokenizer.startswith("mecab-"):
        custom_tokenizer = MeCabTokenizer(os.path.join(tokenizer_dir, "tok.json"))
    elif args.tokenizer.startswith("sp-"):
        custom_tokenizer = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))
    elif args.tokenizer.startswith("mecab_sp-"):
        mecab = MeCabTokenizer(os.path.join(tokenizer_dir, "tok.json"))
        sp = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))
        custom_tokenizer = MeCabSentencePieceTokenizer(mecab, sp)
    elif args.tokenizer.startswith("char-"):
        custom_tokenizer = CharTokenizer()
    elif args.tokenizer.startswith("word-"):
        custom_tokenizer = WordTokenizer()
    elif args.tokenizer.startswith("jamo-"):
        custom_tokenizer = JamoTokenizer()
    else:
        raise ValueError("Wrong tokenizer name.")

    # Load pretrained model and tokenizer
    config = BertConfig.from_json_file(
        os.path.join(args.resource_dir, args.tokenizer, args.bert_config_file_name))
    tokenizer = BertTokenizer(os.path.join(tokenizer_dir, "tok.vocab"), custom_tokenizer)
    model = KorQuADModel(config)
    model.bert = load_pretrained_bert(
        config,
        os.path.join(args.resource_dir, args.tokenizer, args.pretrained_bert_file_name))

    # GPU or CPU
    args.device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(args.device)
    logger.info(f"Training/evaluation parameters {args}")

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(f" global_step = {global_step}, average loss = {tr_loss}")

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval:
        checkpoints = list(
            os.path.dirname(c) for c in sorted(
                glob.glob(args.output_dir + "/**/" + "pytorch_model.bin", recursive=True)))
        if not args.eval_all_checkpoints:
            checkpoints = checkpoints[-1:]
        else:
            logging.getLogger("transformers.configuration_utils").setLevel(
                logging.WARN)  # Reduce model loading logs
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce model loading logs

        logger.info(f"Evaluate the following checkpoints: {checkpoints}")

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1]
            model = KorQuADModel.from_pretrained(checkpoint)
            model.to(args.device)

            result = evaluate(args, model, tokenizer, prefix=global_step)
            result = dict((k + (f"_{global_step}" if global_step else ""), v)
                          for k, v in result.items())
            results.update(result)

        output_dir = os.path.join(args.output_dir, "eval")
        with open(os.path.join(output_dir, "eval_result.txt"), "w", encoding="utf-8") as f:
            official_eval_results = eval_during_train(args)
            for key in sorted(official_eval_results.keys()):
                logger.info(f" {key} = {official_eval_results[key]}")
                f.write(f" {key} = {official_eval_results[key]}\n")
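# The two scripts above repeat the same prefix-based tokenizer dispatch.
# A sketch of how it could be factored into one helper; it assumes the
# tokenizer classes are importable from the project's tokenizer module
# (module path and class availability are assumptions, not confirmed here).
import os


def build_custom_tokenizer(name, tokenizer_dir):
    if name.startswith("mecab_sp-"):
        mecab = MeCabTokenizer(os.path.join(tokenizer_dir, "tok.json"))
        sp = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))
        return MeCabSentencePieceTokenizer(mecab, sp)
    if name.startswith("mecab-"):
        return MeCabTokenizer(os.path.join(tokenizer_dir, "tok.json"))
    if name.startswith("sp-"):
        return SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))
    if name.startswith("char-"):
        return CharTokenizer()
    if name.startswith("word-"):
        return WordTokenizer()
    if name.startswith("jamo-"):
        return JamoTokenizer()
    raise ValueError("Wrong tokenizer name.")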
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M")

    eos_setting = args.eos_setting
    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    #train_data, valid_data = None, None
    if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
        if args.mode == "train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

        if args.mode == "semi-train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/' + str(args.semi_dataset)
            # five copy
            #sess = 't0005/rush1-1/209'
            # one copy
            #sess = 't0005/rush1-1/224'
            semi_noisy_sents, semi_clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            semi_sents_annotation = ['None'] * len(semi_noisy_sents)

        if args.mode == "pretrain":
            print("PRETRAIN MODE ON!!")
            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/113'
            noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            sents_annotation = ['None'] * len(noisy_sents)

        error_type_counter = Counter()
        for annotation in sents_annotation:
            error_type_counter += Counter(annotation.split(','))
        print(error_type_counter)

        # cleaning-noise version
        # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
        # original version
        if args.mode == "semi-train":
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                     for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]
            semi_pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                          for noisy, clean, annot in zip(semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

            train_data = pairs[:-args.num_val_data] + semi_pairs
            valid_data = pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)
        else:
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                     for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

            train_data, valid_data = pairs[:-args.num_val_data], pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            #tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)

            ## to load pretrained model
            nsml.load(checkpoint='best', session='t0005/rush1-2/79')
            #print(tokenizer.vocab)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
        train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    tokenizer = CharTokenizer([])

    configuration = BertConfig(vocab_size=args.vocab_size)
    model = BertForMaskedLM(configuration).to(args.device)
    '''
    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    '''
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    bind_nsml(model, tokenizer, args)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode == "train" or args.mode == "pretrain":
        if args.mode == "train":
            noisy_sents = read_strings(
                os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(
                os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(
                os.path.join(args.data_dir, "train_label"))

        if args.mode == "pretrain":
            print("PRETRAIN MODE ON!!")
            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/113'
            noisy_sents, clean_sents = load_generated_data(
                checkpoint=checkpoint, session=sess)
            sents_annotation = ['None'] * len(noisy_sents)

        error_type_counter = Counter()
        for annotation in sents_annotation:
            error_type_counter += Counter(annotation.split(','))
        print(error_type_counter)

        # cleaning-noise version
        # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
        # original version
        pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                 for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

        train_data, valid_data = pairs[:-args.num_val_data], pairs[-args.num_val_data:]
        logger.info(f"# of train data: {len(train_data)}")
        logger.info(f"# of valid data: {len(valid_data)}")

        train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
        tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
        bind_nsml(model, tokenizer, args)

    ## to load pretrained model
    #nsml.load(checkpoint='best', session='t0005/rush1-1/177')

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train" or args.mode == "pretrain":
        train(model, tokenizer, train_data, valid_data, args)
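# Note on the script above: the BertForMaskedLM embedding size is fixed by
# args.vocab_size at construction time, while the CharTokenizer is rebuilt
# later with CharTokenizer.from_strings(...). An illustrative check (the
# vocab_size value is made up) that the two must agree:
from transformers import BertConfig, BertForMaskedLM

vocab_size = 1500  # illustrative; in the script this should equal len(tokenizer)
mlm_model = BertForMaskedLM(BertConfig(vocab_size=vocab_size))
emb = mlm_model.get_input_embeddings().weight
assert emb.shape[0] == vocab_size  # embedding rows = tokenizer vocabulary size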
import argparse

import nsml

from tokenizer import CharTokenizer
from model import TransformerModel
from train import bind_nsml

parser = argparse.ArgumentParser()
parser.add_argument('--session', type=str, default="")
parser.add_argument('--checkpoint', type=str, default="best")
args = parser.parse_args()
args.tokenizer = 'char'

session = 't0005/rush1-3/' + args.session
checkpoint = args.checkpoint
print(f'session: {session}\ncheckpoint: {checkpoint}')

# Load the tokenizer (and args) from the saved NSML checkpoint before
# rebuilding the model with the restored vocabulary size.
model = None
tokenizer = CharTokenizer([])
bind_nsml(model, tokenizer)
nsml.load(checkpoint=checkpoint, session=session)

args.vocab_size = len(tokenizer)
print(f'vocab_size: {args.vocab_size}')

model = TransformerModel(
    vocab_size=args.vocab_size,
    hidden_size=args.hidden_size,
    num_attention_heads=args.num_attention_heads,
    num_encoder_layers=args.num_encoder_layers,
    num_decoder_layers=args.num_decoder_layers,
    intermediate_size=args.intermediate_size,
    dropout=args.dropout,
)
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    eos_setting = args.eos_setting
    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    #train_data, valid_data = None, None
    if args.mode == "train":
        noisy_sents_labeled = read_strings(
            os.path.join(args.data_dir, "train_data", "train_data"))
        sents_annotation_labeled = read_strings(
            os.path.join(args.data_dir, "train_data", "train_annotation"))
        clean_sents_labeled = read_strings(
            os.path.join(args.data_dir, "train_label"))
        noisy_sents = read_strings(
            os.path.join(args.data_dir, "train_data", "train_corpus"))

        pairs = noisy_sents
        pairs_labeled = clean_sents_labeled

        train_data = pairs + noisy_sents_labeled[:-args.num_val_data] + pairs_labeled[:-args.num_val_data]
        valid_data = pairs_labeled[-args.num_val_data:]
        logger.info(f"# of train data: {len(train_data)}")
        logger.info(f"# of valid data: {len(valid_data)}")

        train_sents = [x for x in train_data]

        if args.tokenizer == 'char':
            tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            print("===vocab size: ", len(tokenizer))
            args.vocab_size = len(tokenizer)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train":
        train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)
best_checkpoint = os.path.join(eval_args.name, 'amp_checkpoint.pt')
if not os.path.exists(best_checkpoint):
    raise ValueError('Not found')

checkpoint = torch.load(best_checkpoint, map_location='cpu')
with open(os.path.join(eval_args.name, 'vars.json'), 'r') as f:
    params = json.load(f)
print('Checkpoint at epoch %d ' % checkpoint['epoch'])

args = Struct(**params)
window_size = eval_args.window_size
window_stride = 0.01
sd.default.samplerate = 16000
duration = 60  # seconds

if args.tokenizer == 'char':
    _tokenizer = CharTokenizer()
else:
    _tokenizer = HuggingFaceTokenizer()  # use BPE-400
    print('use bpe')

model = Transducer(
    args.audio_feat,
    _tokenizer.vocab_size,
    args.vocab_dim,  # vocab embedding dim
    args.h_dim,      # hidden dim
    args.layers,
    pred_num_layers=args.pred_layers,
    dropout=args.dropout).cpu()

if args.audio_feat > 80:
    args.audio_feat = args.audio_feat // 3
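# Usage sketch for the recording settings above (sd.default.samplerate and
# duration): capture `duration` seconds of 16 kHz mono audio with
# sounddevice before handing it to the Transducer. Feature extraction and
# decoding are omitted; this is illustrative, not the original script.
import sounddevice as sd

sd.default.samplerate = 16000
duration = 60  # seconds
recording = sd.rec(int(duration * sd.default.samplerate), channels=1, dtype='float32')
sd.wait()  # block until the recording is finished
print(recording.shape)  # (duration * samplerate, 1)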
def main():
    # from pathlib import Path
    # print("File Path:", Path(__file__).absolute())
    # print("Directory Path:", Path().absolute())
    args = get_args()
    args.n_gpu = 1

    # noisy_sents_1 = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
    # clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))
    # noisy_sents_2 = read_strings(os.path.join(args.data_dir, "train_data", "train_corpus"))
    #
    # noisy_sents = noisy_sents_1 + noisy_sents_2
    # noise_space_ratio = []
    #
    # for sentence in noisy_sents:
    #     noise_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # clean_space_ratio = []
    # for sentence in clean_sents:
    #     clean_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # print("noise_space_ratio: {}, clean_space_ratio: {}".format(sum(noise_space_ratio) / len(noise_space_ratio),
    #                                                             sum(clean_space_ratio) / len(clean_space_ratio)))

    # ##########
    # ## for local
    # args.num_workers = 0
    # args.train_batch_size = 4
    # args.eval_batch_size = 4
    # args.eval_interval = 10
    # ##########

    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    if args.load_vocab != "":
        tokenizer.load(args.load_vocab)
        args.vocab_size = tokenizer.__len__()

    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    eos_setting = args.eos_setting
    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode != 'test' and args.averaging != "":
        sess = 't0005/rush1-3/37'
        checkpoints = ["4500", "6500", "7500", "8000"]

        nsml.load(checkpoint=checkpoints[0], session=sess)
        args.vocab_size = tokenizer.__len__()
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        # Average the parameters of the selected checkpoints.
        new_dict_params = {name: torch.zeros_like(param)
                           for name, param in model.named_parameters()}
        for checkpoint in checkpoints:
            bind_nsml(model, tokenizer, args, eos=eos_setting)
            nsml.load(checkpoint=checkpoint, session=sess)
            for name, param in model.named_parameters():
                new_dict_params[name] += param.data / len(checkpoints)

        model.load_state_dict(new_dict_params, strict=False)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.save('best')

    elif args.mode == 'eval':
        print("I'm in EVAL")

        checkpoint = 'best'
        sess = 't0005/rush1-3/507'
        nsml.load(checkpoint=checkpoint, session=sess)
        args.vocab_size = tokenizer.__len__()

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        model.eval()
        #noisy_sents = open("./naver_data_clean.txt", "r", encoding='utf-8').read().splitlines()
        noisy_sents = read_strings(
            os.path.join(args.data_dir, "train_data", "train_corpus"))
        valid_noisy = noisy_sents[:1000]

        prediction = correct_beam(model, tokenizer, valid_noisy, args,
                                  eos=True, length_limit=0.15)

        for i, pred in enumerate(prediction[:1000]):
            print("noisy_input: {}, pred: {}".format(valid_noisy[i], pred))

        # bind_txt(prediction)
        # nsml.save('prediction')

        # with open('naver_data_clean_again.txt', 'w', encoding='utf-8') as f:
        #     for i, pred in enumerate(prediction):
        #         if i % 500 == 0:
        #             print(i)
        #         f.write("%s\n" % pred)

    ## only works with the char tokenizer
    ## TODO: kobert tokenizer, different vocab size if it is needed
    elif args.mode != 'test' and args.resubmit != "":
        checkpoint = 'best'
        sess = 't0005/rush1-3/' + args.resubmit
        print(sess)

        model = None
        tokenizer = CharTokenizer([])
        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        args.vocab_size = len(tokenizer)
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)
        bind_nsml(model, tokenizer, args, eos=eos_setting)

        ########## testing loaded model & tokenizer ###############
        # model.eval()
        # noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
        # valid_noisy = noisy_sents[-10:]
        #
        # prediction = correct(model, tokenizer, valid_noisy, args, eos=True, length_limit=0.1)
        #
        # for pred in prediction:
        #     print(pred)
        ##################

        nsml.save("best")

    else:
        #train_data, valid_data = None, None
        if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
            if args.mode == "train":
                # noisy_sents = open("./noisy_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # clean_sents = open("./clean_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # sents_annotation = ['None'] * len(noisy_sents)
                noisy_sents = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_annotation"))
                clean_sents = read_strings(
                    os.path.join(args.data_dir, "train_label"))

            if args.mode == "semi-train":
                noisy_sents = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_annotation"))
                clean_sents = read_strings(
                    os.path.join(args.data_dir, "train_label"))

                checkpoint = 'generated_data'
                sess = 't0005/rush1-1/' + str(args.semi_dataset)
                # five copy
                #sess = 't0005/rush1-1/209'
                # one copy
                #sess = 't0005/rush1-1/224'
                semi_noisy_sents, semi_clean_sents = load_generated_data(
                    checkpoint=checkpoint, session=sess)
                semi_sents_annotation = ['None'] * len(semi_noisy_sents)

            if args.mode == "pretrain":
                print("PRETRAIN MODE ON!!")
                noisy_sents = read_strings(
                    os.path.join('sejong_corpus', args.noisy_file))
                clean_sents = read_strings(
                    os.path.join('sejong_corpus', args.clean_file))
                # checkpoint = 'generated_data'
                # sess = 't0005/rush1-1/113'
                # noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
                sents_annotation = ['None'] * len(noisy_sents)

            error_type_counter = Counter()
            for annotation in sents_annotation:
                error_type_counter += Counter(annotation.split(','))
            print(error_type_counter)

            # cleaning-noise version
            # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
            # original version
            if args.mode == "semi-train":
                pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                         for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]
                semi_pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                              for noisy, clean, annot in zip(semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

                train_data = pairs[:-args.num_val_data] + semi_pairs
                valid_data = pairs[-args.num_val_data:]
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")

                train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
                tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
                bind_nsml(model, tokenizer, args, eos=eos_setting)
            else:
                pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                         for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

                train_data, valid_data = train_test_split(
                    pairs, test_size=args.val_ratio, random_state=args.seed)  # test: about 1000
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")
                # print("validation: ", valid_data)

                train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
                # train_sents = [x['clean'] for x in train_data]

                if args.load_model != "" and args.mode == "train":  # Load pretrained model
                    print("load pretrained model")
                    model.load_state_dict(
                        torch.load(args.load_model, map_location=args.device))

                    if args.freeze:
                        model.token_embeddings.weight.requires_grad = False
                        model.decoder_embeddings.weight.requires_grad = False

                if args.tokenizer == 'char' and args.load_vocab == "":
                    tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
                    print(f'tokenizer loaded from strings. len={len(tokenizer)}.')

                bind_nsml(model, tokenizer, args, eos=eos_setting)

                if args.tokenizer == 'char' and tokenizer is not None:
                    tokenizer.save('vocab.txt')

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model, dim=1)

        if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
            train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)
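# The averaging branch above pulls checkpoints through NSML sessions. For
# reference, the same idea as a standalone sketch over state-dict files on
# disk; paths, the helper name, and usage are illustrative, not from the
# original project, and floating-point tensors are assumed.
import torch


def average_state_dicts(checkpoint_paths):
    avg_state = None
    for path in checkpoint_paths:
        state = torch.load(path, map_location="cpu")
        if avg_state is None:
            avg_state = {k: v / len(checkpoint_paths) for k, v in state.items()}
        else:
            for k, v in state.items():
                avg_state[k] += v / len(checkpoint_paths)
    return avg_state

# model.load_state_dict(average_state_dicts(["ckpt_4500.pt", "ckpt_6500.pt"]), strict=False)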
import os
from collections import Counter, defaultdict
from statistics import mean, stdev

import torch
from torch.utils.data import DataLoader, SequentialSampler
from sklearn.model_selection import train_test_split

from dataset import TextDataset, collate_fn
from data_loader import read_strings
from evaluation import gleu, gleu_one
from tokenization_kobert import KoBertTokenizer
from tokenizer import CharTokenizer
from model import TransformerModel
from train import bind_nsml

args = get_args()
args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = CharTokenizer([])
model = TransformerModel(
    vocab_size=args.vocab_size,
    hidden_size=args.hidden_size,
    num_attention_heads=args.num_attention_heads,
    num_encoder_layers=args.num_encoder_layers,
    num_decoder_layers=args.num_decoder_layers,
    intermediate_size=args.intermediate_size,
    dropout=args.dropout,
).to(args.device)

checkpoint = 'best'
sess = 't0005/rush1-3/507'
bind_nsml(model, tokenizer, args)