def get_dataset(self):
    """
    Returns train and test Dataset objects built from the MNIST data,
    batched with the configured batch_size.
    """
    train, val, test = self.read_mnist(self.mnist_folder)
    # Create a tf Dataset for each split (the validation split is read but not used here).
    train_data = utils.convert_to_dataset(train, self.batch_size)
    test_data = utils.convert_to_dataset(test, self.batch_size)
    return train_data, test_data
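# Hedged sketch (not the original utils module) of what the convert_to_dataset
# helper used above might look like: it assumes each MNIST split is an
# (images, labels) tuple of NumPy arrays and wraps it in a batched
# tf.data.Dataset. The optional shuffle flag is an illustrative addition.
import tensorflow as tf

def convert_to_dataset(split, batch_size, shuffle=False):
    """Wrap an (images, labels) tuple into a batched tf.data.Dataset."""
    images, labels = split
    dataset = tf.data.Dataset.from_tensor_slices((images, labels))
    if shuffle:
        # Shuffle the whole split before batching.
        dataset = dataset.shuffle(buffer_size=len(labels))
    return dataset.batch(batch_size)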
def build_vocab(config):
    """
    Build the vocabulary using the cohesion scores computed above.
    """
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)
    # LTokenizer segments words based on cohesion scores: a Korean eojeol is viewed as
    # 'noun/verb/adjective/adverb' (L part) + 'particles, etc.' (R part), and the score
    # of the semantically central L part is what drives the segmentation.

    # Field tokenizes words and converts them to tensors.
    # Details on the various Field parameters: https://torchtext.readthedocs.io/en/latest/data.html
    kor = ttd.Field(tokenize=tokenizer.tokenize,
                    lower=True,
                    batch_first=True)

    # English is tokenized with spacy; every sequence starts with <sos> and ends with <eos>.
    eng = ttd.Field(tokenize='spacy',
                    init_token='<sos>',
                    eos_token='<eos>',
                    lower=True,
                    batch_first=True)

    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'train.csv')
    train_data = pd.read_csv(train_file, encoding='utf-8')
    train_data = convert_to_dataset(train_data, kor, eng)

    print(f'Build vocabulary using torchtext . . .')

    # Build separate vocabularies from the loaded data: Korean tokens for kor, English tokens for eng.
    kor.build_vocab(train_data, max_size=config.kor_vocab)
    eng.build_vocab(train_data, max_size=config.eng_vocab)

    # Print the number of unique tokens.
    print(f'Unique tokens in Korean vocabulary: {len(kor.vocab)}')
    print(f'Unique tokens in English vocabulary: {len(eng.vocab)}')

    # Print the most frequently used Korean/English words.
    print(f'Most commonly used Korean words are as follows:')
    print(kor.vocab.freqs.most_common(20))

    print(f'Most commonly used English words are as follows:')
    print(eng.vocab.freqs.most_common(20))

    # Save the Korean/English vocabs (Field objects) with pickle.
    with open('pickles/kor.pickle', 'wb') as kor_file:
        pickle.dump(kor, kor_file)

    with open('pickles/eng.pickle', 'wb') as eng_file:
        pickle.dump(eng, eng_file)
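# Hedged sketch (an assumption, not the repository's actual helper) of the
# convert_to_dataset call used above: it turns the pandas DataFrame into a
# legacy-torchtext Dataset. The assumption that the first column holds the
# Korean sentence and the second the English one is illustrative.
from torchtext import data as ttd

def convert_to_dataset(data, kor, eng):
    fields = [('kor', kor), ('eng', eng)]
    # Build one torchtext Example per DataFrame row.
    examples = [ttd.Example.fromlist(row.tolist(), fields)
                for _, row in data.iterrows()]
    return ttd.Dataset(examples, fields)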
def build_vocab(config):
    """
    Build vocabulary used to convert input sentence into word indices using soynlp and spacy tokenizer
    Args:
        config: configuration containing various options
    Returns:
    """
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    # include lengths of the source sentences to use pack pad sequence
    kor = ttd.Field(tokenize=tokenizer.tokenize,
                    lower=True,
                    include_lengths=True)

    eng = ttd.Field(tokenize='spacy',
                    init_token='<sos>',
                    eos_token='<eos>',
                    lower=True)

    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'train.csv')
    train_data = pd.read_csv(train_file, encoding='utf-8')
    train_data = convert_to_dataset(train_data, kor, eng)

    print(f'Build vocabulary using torchtext . . .')

    kor.build_vocab(train_data, max_size=config.kor_vocab)
    eng.build_vocab(train_data, max_size=config.eng_vocab)

    print(f'Unique tokens in Korean vocabulary: {len(kor.vocab)}')
    print(f'Unique tokens in English vocabulary: {len(eng.vocab)}')

    print(f'Most commonly used Korean words are as follows:')
    print(kor.vocab.freqs.most_common(20))

    print(f'Most commonly used English words are as follows:')
    print(eng.vocab.freqs.most_common(20))

    with open('pickles/kor.pickle', 'wb') as kor_file:
        pickle.dump(kor, kor_file)

    with open('pickles/eng.pickle', 'wb') as eng_file:
        pickle.dump(eng, eng_file)
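# Hedged usage sketch (not in the original source) of how the Fields pickled by
# build_vocab above might be consumed later. Because kor was built with
# include_lengths=True, each batch.kor is a (padded_tensor, lengths) tuple,
# ready for packed padded sequences. The batch size and the 'kor' field name
# used in sort_key are illustrative assumptions.
import pickle
import torch
from torchtext import data as ttd

def make_train_iterator(train_data, batch_size=128):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # sort_within_batch keeps source lengths ordered inside each batch,
    # which packed padded sequences expect by default.
    return ttd.BucketIterator(train_data,
                              batch_size=batch_size,
                              sort_key=lambda x: len(x.kor),
                              sort_within_batch=True,
                              device=device)

kor = pickle.load(open('pickles/kor.pickle', 'rb'))  # reload the saved Fields
eng = pickle.load(open('pickles/eng.pickle', 'rb'))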
def build_vocab(config):
    """
    Build vocab used to convert Korean input sentence into word indices using soynlp tokenizer
    Args:
        config: configuration object containing various options
    Returns:
    """
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    # To use packed padded sequences, tell the model how long the actual sequences are by 'include_lengths=True'
    text = ttd.Field(tokenize=tokenizer.tokenize, include_lengths=True)
    label = ttd.LabelField(dtype=torch.float)

    data_dir = Path().cwd() / 'data'
    train_txt = os.path.join(data_dir, 'train.txt')
    train_data = pd.read_csv(train_txt, sep='\t')
    train_data, valid_data = train_test_split(train_data,
                                              test_size=0.3,
                                              random_state=32)
    train_data = convert_to_dataset(train_data, text, label)

    print(f'Building vocabulary using torchtext . . .')

    text.build_vocab(train_data, max_size=config.vocab_size)
    label.build_vocab(train_data)

    print(f'Unique tokens in TEXT vocabulary: {len(text.vocab)}')
    print(f'Unique tokens in LABEL vocabulary: {len(label.vocab)}')

    print(f'Most commonly used words are as follows:')
    print(text.vocab.freqs.most_common(20))

    file_text = open('pickles/text.pickle', 'wb')
    pickle.dump(text, file_text)

    file_label = open('pickles/label.pickle', 'wb')
    pickle.dump(label, file_label)
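# Hedged sketch (not the repository's model) of what the include_lengths=True
# setting above enables downstream: an encoder can pack the padded batch so the
# RNN skips <pad> positions. Layer sizes, the class name, and pad_idx are
# illustrative assumptions.
import torch
import torch.nn as nn

class PackedEncoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=100, hid_dim=256, pad_idx=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(emb_dim, hid_dim)

    def forward(self, text, text_lengths):
        # text: [seq_len, batch]; text_lengths comes from the (tensor, lengths)
        # tuple that a Field with include_lengths=True yields per batch.
        embedded = self.embedding(text)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), enforce_sorted=False)
        packed_output, (hidden, cell) = self.rnn(packed)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output)
        return output, hidden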
def main():
    args = init_args()
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) \
            and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        os.environ['MASTER_ADDR'] = args.MASTER_ADDR
        os.environ['MASTER_PORT'] = args.MASTER_PORT
        torch.distributed.init_process_group(backend='nccl',
                                             rank=args.local_rank,
                                             world_size=1)
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    # Not using 16-bit (fp16) training
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: False",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))

    # Set seed
    set_seed(args)

    # Prepare task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % args.task_name)

    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels(args.tagging_schema)
    num_labels = len(label_list)
    normal_labels = processor.get_normal_labels(args.tagging_schema)
    num_normal_labels = len(normal_labels)
    sent_labels = ABSAProcessor.get_sentiment_labels()
    num_sent_labels = len(sent_labels)

    # Initialize the pre-trained model
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                              cache_dir='./cache')
    config.absa_type = args.absa_type
    config.tfm_mode = args.tfm_mode
    config.fix_tfm = args.fix_tfm
    config.num_normal_labels = num_normal_labels
    config.num_sent_labels = num_sent_labels
    config.ts_vocab = {label: i for i, label in enumerate(label_list)}
    config.ote_vocab = {label: i for i, label in enumerate(normal_labels)}
    config.sent_vocab = {label: i for i, label in enumerate(sent_labels)}
    config.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    config.output_hidden_states = True
    config.model_name_or_path = args.model_name_or_path

    if args.gen_adv_from_path:
        # Generate adversarial examples for each data split
        modes = ['train', 'dev', 'test']
        for mode in modes:
            model = model_class.from_pretrained(args.gen_adv_from_path).to(args.device)
            train_dataset, train_evaluate_label_ids, examples, imp_words = load_and_cache_examples(
                args, args.task_name, tokenizer, mode=mode, model=model)
            adversary = Adversary(args, model)
            adv_examples = []
            sz = 64
            # Process the examples in chunks of sz
            for _ in trange(len(examples) // sz + 1):
                if len(examples) == 0:
                    continue
                adv_examples.extend(
                    adversary.generate_adv_examples(examples[:sz],
                                                    imp_words[:sz], tokenizer))
                examples = examples[sz:]
                imp_words = imp_words[sz:]
            adv_dataset = convert_to_dataset(args, adv_examples, tokenizer)
            output_dir = f'{args.task_name}_adv'
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            torch.save(adv_dataset, f'{output_dir}/{mode}.pth')
            torch.save(adv_examples, f'{output_dir}/{mode}-examples.pth')
        exit(0)

    if args.load_model:
        print('Loading model from:', args.load_model)
        model = model_class.from_pretrained(args.load_model, config=config)
    else:
        model = model_class.from_pretrained(args.model_name_or_path,
                                            config=config,
                                            cache_dir='./cache')
        print('Loading model from:', args.model_name_or_path)

    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Training
    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.mkdir(args.output_dir)

        # Store model configuration with the results
        shutil.copyfile('absa_layer.py', args.output_dir + '/absa_layer.py')
        # Store training configuration
        shutil.copyfile('train.sh', args.output_dir + '/train.sh')
        if args.do_adv:
            # Store adversarial training configuration
            shutil.copyfile('main.py', args.output_dir + '/main.py')

        train_dataset, train_evaluate_label_ids, examples, imp_words = load_and_cache_examples(
            args, args.task_name, tokenizer, mode='train', model=model)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)

        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load the fine-tuned model and vocabulary
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Validation
    results = {}
    best_f1 = -999999.0
    best_checkpoint = None
    checkpoints = []
    if args.eval_all_checkpoints:
        checkpoints = os.listdir(args.output_dir)
        checkpoints.sort()
    logger.info("Perform validation on the following checkpoints: %s",
                checkpoints)
    test_results = {}
    steps = []
    for checkpoint in checkpoints:
        global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
        if checkpoint.split('-')[0] != 'checkpoint':
            continue
        if args.pred_checkpoint and args.pred_checkpoint != global_step:
            continue
        steps.append(global_step)
        set_seed(args)
        model = model_class.from_pretrained(f'{args.output_dir}/{checkpoint}')
        model.to(args.device)
        dev_result = evaluate(args,
                              model,
                              tokenizer,
                              mode='dev',
                              prefix=global_step)

        # Use micro-f1 as the criterion for model selection
        if int(global_step) > 1000 and dev_result['micro-f1'] > best_f1:
            best_f1 = dev_result['micro-f1']
            best_checkpoint = checkpoint
        dev_result = dict(
            (k + '_{}'.format(global_step), v) for k, v in dev_result.items())
        results.update(dev_result)

        test_result = evaluate(args,
                               model,
                               tokenizer,
                               mode='test',
                               prefix=global_step)
        test_result = dict(
            (k + '_{}'.format(global_step), v) for k, v in test_result.items())
        test_results.update(test_result)

    best_ckpt_string = "\nThe best checkpoint is %s" % best_checkpoint
    logger.info(best_ckpt_string)

    dev_f1_values, dev_loss_values = [], []
    for k in results:
        v = results[k]
        if 'micro-f1' in k:
            dev_f1_values.append((k, v))
        if 'eval_loss' in k:
            dev_loss_values.append((k, v))

    test_f1_values, test_loss_values = [], []
    for k in test_results:
        v = test_results[k]
        if 'micro-f1' in k:
            test_f1_values.append((k, v))
        if 'eval_loss' in k:
            test_loss_values.append((k, v))

    log_file_path = '%s/log.txt' % args.output_dir
    log_file = open(log_file_path, 'a')
    log_file.write("\tValidation:\n")
    for (test_f1_k, test_f1_v), (test_loss_k, test_loss_v), (dev_f1_k, dev_f1_v), (dev_loss_k, dev_loss_v) in zip(
            test_f1_values, test_loss_values, dev_f1_values, dev_loss_values):
        global_step = int(test_f1_k.split('_')[-1])
        if not args.overfit and global_step <= 1000:
            continue
        print('test-%s: %.5lf, test-%s: %.5lf, dev-%s: %.5lf, dev-%s: %.5lf' %
              (test_f1_k, test_f1_v, test_loss_k, test_loss_v, dev_f1_k,
               dev_f1_v, dev_loss_k, dev_loss_v))
        validation_string = '\t\tdev-%s: %.5lf, dev-%s: %.5lf' % (
            dev_f1_k, dev_f1_v, dev_loss_k, dev_loss_v)
        log_file.write(validation_string + '\n')

    n_times = args.max_steps // args.save_steps + 1
    for step in steps:
        log_file.write('\tStep %s:\n' % step)
        precision = test_results['precision_%s' % step]
        recall = test_results['recall_%s' % step]
        micro_f1 = test_results['micro-f1_%s' % step]
        macro_f1 = test_results['macro-f1_%s' % step]
        log_file.write(
            '\t\tprecision: %.4lf, recall: %.4lf, micro-f1: %.4lf, macro-f1: %.4lf\n'
            % (precision, recall, micro_f1, macro_f1))
    log_file.write("\tBest checkpoint: %s\n" % best_checkpoint)
    log_file.write('******************************************\n')
    log_file.close()
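# Hedged sketch (not shown in the original file) of the set_seed helper that
# main() calls above: the usual pattern from the HuggingFace example scripts,
# seeding Python, NumPy, and PyTorch (including all GPUs when n_gpu > 0).
# The args.seed attribute is an assumption, following that convention.
import random
import numpy as np
import torch

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)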