Example #1
    def get_dataset(self):
        """
        Read the MNIST data and return tf.data.Dataset objects
        for the train and test splits, batched with self.batch_size.
        """

        train, val, test = self.read_mnist(self.mnist_folder)

        # Create a tf.data.Dataset for the train and test splits (the validation split is not used here).
        train_data = utils.convert_to_dataset(train, self.batch_size)

        test_data = utils.convert_to_dataset(test, self.batch_size)

        return train_data, test_data
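A minimal sketch of what a helper like utils.convert_to_dataset could look like here, assuming each split is a (features, labels) tuple of numpy arrays; the helper name and call signature come from this example, the body is an assumption:

import tensorflow as tf

def convert_to_dataset(split, batch_size):
    # `split` is assumed to be a (features, labels) pair of numpy arrays.
    features, labels = split
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    # Shuffle the samples, then group them into batches of `batch_size`.
    return dataset.shuffle(buffer_size=10000).batch(batch_size)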
Example #2
def build_vocab(config):
    """
    Build the vocabulary using the cohesion scores obtained above.
    """
    with open('pickles/tokenizer.pickle', 'rb') as pickle_tokenizer:
        cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)
    # Among the available tokenizers we use LTokenizer, which segments words by cohesion score.
    # A Korean eojeol is viewed as an L part ('noun/verb/adjective/adverb') plus an R part
    # ('particles, etc.'), and the score is derived for the semantically central L part.

    # A Field tokenizes words and converts them into tensors.
    # The various Field parameters are documented at https://torchtext.readthedocs.io/en/latest/data.html
    kor = ttd.Field(tokenize=tokenizer.tokenize, lower=True, batch_first=True)

    # English is tokenized with spacy. The first token is always set to <sos> and the last token to <eos>.
    eng = ttd.Field(tokenize='spacy',
                    init_token='<sos>',
                    eos_token='<eos>',
                    lower=True,
                    batch_first=True)

    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'train.csv')
    train_data = pd.read_csv(train_file, encoding='utf-8')
    train_data = convert_to_dataset(train_data, kor, eng)

    print('Building vocabulary using torchtext . . .')

    # Build the Korean vocabulary from the Korean tokens and the English vocabulary from the English tokens in the loaded data.
    kor.build_vocab(train_data, max_size=config.kor_vocab)
    eng.build_vocab(train_data, max_size=config.eng_vocab)

    # Print the number of unique tokens.
    print(f'Unique tokens in Korean vocabulary: {len(kor.vocab)}')
    print(f'Unique tokens in English vocabulary: {len(eng.vocab)}')

    # Print the most frequently used Korean/English words.
    print('Most commonly used Korean words are as follows:')
    print(kor.vocab.freqs.most_common(20))
    print('Most commonly used English words are as follows:')
    print(eng.vocab.freqs.most_common(20))

    # Save the generated Korean/English vocab objects with pickle.
    with open('pickles/kor.pickle', 'wb') as kor_file:
        pickle.dump(kor, kor_file)

    with open('pickles/eng.pickle', 'wb') as eng_file:
        pickle.dump(eng, eng_file)
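A minimal sketch of reading the pickled Field objects back for later use; the file names match this example, everything else is an assumption:

import pickle

# Load the Field objects saved by build_vocab; kor.vocab and eng.vocab
# then expose the token-to-index mappings built above.
with open('pickles/kor.pickle', 'rb') as kor_file:
    kor = pickle.load(kor_file)

with open('pickles/eng.pickle', 'rb') as eng_file:
    eng = pickle.load(eng_file)

print(len(kor.vocab.stoi), len(eng.vocab.stoi))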
Example #3
def build_vocab(config):
    """
    Build the vocabulary used to convert input sentences into word indices, using the soynlp and spacy tokenizers
    Args:
        config: configuration containing various options

    Returns:
        None; the fitted kor and eng fields are pickled under pickles/

    """
    with open('pickles/tokenizer.pickle', 'rb') as pickle_tokenizer:
        cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    # include the lengths of the source sentences so packed padded sequences can be used
    kor = ttd.Field(tokenize=tokenizer.tokenize,
                    lower=True,
                    include_lengths=True)

    eng = ttd.Field(tokenize='spacy',
                    init_token='<sos>',
                    eos_token='<eos>',
                    lower=True)

    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'train.csv')
    train_data = pd.read_csv(train_file, encoding='utf-8')
    train_data = convert_to_dataset(train_data, kor, eng)

    print('Building vocabulary using torchtext . . .')

    kor.build_vocab(train_data, max_size=config.kor_vocab)
    eng.build_vocab(train_data, max_size=config.eng_vocab)

    print(f'Unique tokens in Korean vocabulary: {len(kor.vocab)}')
    print(f'Unique tokens in English vocabulary: {len(eng.vocab)}')

    print('Most commonly used Korean words are as follows:')
    print(kor.vocab.freqs.most_common(20))

    print('Most commonly used English words are as follows:')
    print(eng.vocab.freqs.most_common(20))

    with open('pickles/kor.pickle', 'wb') as kor_file:
        pickle.dump(kor, kor_file)

    with open('pickles/eng.pickle', 'wb') as eng_file:
        pickle.dump(eng, eng_file)
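include_lengths=True above makes the iterator yield (tensor, lengths) pairs for the source side. A minimal sketch of how those lengths typically feed torch.nn.utils.rnn.pack_padded_sequence in an encoder; the sizes and names below are illustrative assumptions, not taken from this repository:

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

embedding = nn.Embedding(num_embeddings=10000, embedding_dim=256)
rnn = nn.GRU(input_size=256, hidden_size=512)

def encode(src, src_lengths):
    # src: [src_len, batch_size] token indices; src_lengths: [batch_size]
    embedded = embedding(src)
    # Pack so the RNN skips the <pad> positions entirely.
    packed = pack_padded_sequence(embedded, src_lengths.cpu(), enforce_sorted=False)
    packed_outputs, hidden = rnn(packed)
    # Unpack back to a padded tensor of shape [src_len, batch_size, hidden_size].
    outputs, _ = pad_packed_sequence(packed_outputs)
    return outputs, hidden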
Example #4
def build_vocab(config):
    """
    Build the vocab used to convert Korean input sentences into word indices, using the soynlp tokenizer
    Args:
        config: configuration object containing various options

    Returns:
        None; the fitted text and label fields are pickled under pickles/

    """

    with open('pickles/tokenizer.pickle', 'rb') as pickle_tokenizer:
        cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    # To use packed padded sequences, tell the model how long the actual sequences are via include_lengths=True
    text = ttd.Field(tokenize=tokenizer.tokenize, include_lengths=True)
    label = ttd.LabelField(dtype=torch.float)

    data_dir = Path().cwd() / 'data'
    train_txt = os.path.join(data_dir, 'train.txt')

    train_data = pd.read_csv(train_txt, sep='\t')
    train_data, valid_data = train_test_split(train_data,
                                              test_size=0.3,
                                              random_state=32)
    train_data = convert_to_dataset(train_data, text, label)

    print('Building vocabulary using torchtext . . .')
    text.build_vocab(train_data, max_size=config.vocab_size)
    label.build_vocab(train_data)

    print(f'Unique tokens in TEXT vocabulary: {len(text.vocab)}')
    print(f'Unique tokens in LABEL vocabulary: {len(label.vocab)}')

    print('Most commonly used words are as follows:')
    print(text.vocab.freqs.most_common(20))

    with open('pickles/text.pickle', 'wb') as file_text:
        pickle.dump(text, file_text)

    with open('pickles/label.pickle', 'wb') as file_label:
        pickle.dump(label, file_label)
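A minimal sketch of pairing these fields with a BucketIterator, which sorts examples inside each batch so the (tensor, lengths) pairs produced by include_lengths=True are ready for packing; the batch size and the .text attribute name are assumptions:

import torch
import torchtext.data as ttd

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# `train_data` is the torchtext Dataset built above.
train_iterator = ttd.BucketIterator(
    train_data,
    batch_size=32,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,  # keep each batch sorted by length for packing
    device=device)

for batch in train_iterator:
    text, text_lengths = batch.text  # include_lengths=True yields a pair
    break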
Example #5
def main():
    args = init_args()
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        os.environ['MASTER_ADDR'] = args.MASTER_ADDR
        os.environ['MASTER_PORT'] = args.MASTER_PORT
        torch.distributed.init_process_group(backend='nccl',
                                             rank=args.local_rank,
                                             world_size=1)
        args.n_gpu = 1

    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    # not using 16-bits training
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: False",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))

    # Set seed
    set_seed(args)

    # Prepare task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % args.task_name)
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels(args.tagging_schema)
    num_labels = len(label_list)
    normal_labels = processor.get_normal_labels(args.tagging_schema)
    num_normal_labels = len(normal_labels)
    sent_labels = ABSAProcessor.get_sentiment_labels()
    num_sent_labels = len(sent_labels)

    # initialize the pre-trained model
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                              cache_dir='./cache')

    config.absa_type = args.absa_type
    config.tfm_mode = args.tfm_mode
    config.fix_tfm = args.fix_tfm
    config.num_normal_labels = num_normal_labels
    config.num_sent_labels = num_sent_labels
    config.ts_vocab = {label: i for i, label in enumerate(label_list)}
    config.ote_vocab = {label: i for i, label in enumerate(normal_labels)}
    config.sent_vocab = {label: i for i, label in enumerate(sent_labels)}
    config.device = ("cuda" if torch.cuda.is_available() and not args.no_cuda
                     else "cpu")
    config.output_hidden_states = True
    config.model_name_or_path = args.model_name_or_path

    if args.gen_adv_from_path:
        # Generate adversarial examples
        modes = ['train', 'dev', 'test']
        for mode in modes:
            model = model_class.from_pretrained(args.gen_adv_from_path).to(
                args.device)
            train_dataset, train_evaluate_label_ids, examples, imp_words = load_and_cache_examples(
                args, args.task_name, tokenizer, mode=mode, model=model)
            adversary = Adversary(args, model)
            adv_examples = []
            sz = 64
            for _ in trange(len(examples) // sz + 1):
                if len(examples) == 0:
                    break  # all examples consumed; remaining iterations would be no-ops
                adv_examples.extend(
                    adversary.generate_adv_examples(examples[:sz],
                                                    imp_words[:sz], tokenizer))
                examples = examples[sz:]
                imp_words = imp_words[sz:]
            adv_dataset = convert_to_dataset(args, adv_examples, tokenizer)
            output_dir = f'{args.task_name}_adv'
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            torch.save(adv_dataset, f'{output_dir}/{mode}.pth')
            torch.save(adv_examples, f'{output_dir}/{mode}-examples.pth')
        exit(0)

    if args.load_model:
        print('Loading model from:', args.load_model)
        model = model_class.from_pretrained(args.load_model, config=config)
    else:
        model = model_class.from_pretrained(args.model_name_or_path,
                                            config=config,
                                            cache_dir='./cache')
        print('Loading model from:', args.model_name_or_path)

    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Training
    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        # Store model configuration with results
        shutil.copyfile('absa_layer.py', args.output_dir + '/absa_layer.py')
        # Store training configuration
        shutil.copyfile('train.sh', args.output_dir + '/train.sh')
        if args.do_adv:
            # Store adv training config
            shutil.copyfile('main.py', args.output_dir + '/main.py')

        train_dataset, train_evaluate_label_ids, examples, imp_words = load_and_cache_examples(
            args, args.task_name, tokenizer, mode='train', model=model)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)

        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        # save the model configuration
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Validation
    results = {}
    best_f1 = -999999.0
    best_checkpoint = None
    checkpoints = []
    if args.eval_all_checkpoints:
        checkpoints = os.listdir(args.output_dir)
        checkpoints.sort()
    logger.info("Perform validation on the following checkpoints: %s",
                checkpoints)
    test_results = {}
    steps = []
    for checkpoint in checkpoints:
        global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
        if checkpoint.split('-')[0] != 'checkpoint':
            continue
        if args.pred_checkpoint and args.pred_checkpoint != global_step:
            continue
        steps.append(global_step)
        set_seed(args)
        model = model_class.from_pretrained(f'{args.output_dir}/{checkpoint}')
        model.to(args.device)
        dev_result = evaluate(args,
                              model,
                              tokenizer,
                              mode='dev',
                              prefix=global_step)

        # regard the micro-f1 as the criteria of model selection
        if int(global_step) > 1000 and dev_result['micro-f1'] > best_f1:
            best_f1 = dev_result['micro-f1']
            best_checkpoint = checkpoint
        dev_result = dict(
            (k + '_{}'.format(global_step), v) for k, v in dev_result.items())
        results.update(dev_result)

        test_result = evaluate(args,
                               model,
                               tokenizer,
                               mode='test',
                               prefix=global_step)
        test_result = dict(
            (k + '_{}'.format(global_step), v) for k, v in test_result.items())
        test_results.update(test_result)

    best_ckpt_string = "\nThe best checkpoint is %s" % best_checkpoint
    logger.info(best_ckpt_string)
    dev_f1_values, dev_loss_values = [], []
    for k in results:
        v = results[k]
        if 'micro-f1' in k:
            dev_f1_values.append((k, v))
        if 'eval_loss' in k:
            dev_loss_values.append((k, v))
    test_f1_values, test_loss_values = [], []
    for k in test_results:
        v = test_results[k]
        if 'micro-f1' in k:
            test_f1_values.append((k, v))
        if 'eval_loss' in k:
            test_loss_values.append((k, v))
    log_file_path = '%s/log.txt' % args.output_dir
    log_file = open(log_file_path, 'a')
    log_file.write("\tValidation:\n")
    for (test_f1_k, test_f1_v), (test_loss_k, test_loss_v), \
        (dev_f1_k, dev_f1_v), (dev_loss_k, dev_loss_v) in zip(
            test_f1_values, test_loss_values, dev_f1_values, dev_loss_values):
        global_step = int(test_f1_k.split('_')[-1])
        if not args.overfit and global_step <= 1000:
            continue
        print('test-%s: %.5lf, test-%s: %.5lf, dev-%s: %.5lf, dev-%s: %.5lf' %
              (test_f1_k, test_f1_v, test_loss_k, test_loss_v, dev_f1_k,
               dev_f1_v, dev_loss_k, dev_loss_v))
        validation_string = '\t\tdev-%s: %.5lf, dev-%s: %.5lf' % (
            dev_f1_k, dev_f1_v, dev_loss_k, dev_loss_v)
        log_file.write(validation_string + '\n')

    n_times = args.max_steps // args.save_steps + 1  # note: computed but never used below
    for step in steps:
        log_file.write('\tStep %s:\n' % step)
        precision = test_results['precision_%s' % step]
        recall = test_results['recall_%s' % step]
        micro_f1 = test_results['micro-f1_%s' % step]
        macro_f1 = test_results['macro-f1_%s' % step]
        log_file.write(
            '\t\tprecision: %.4lf, recall: %.4lf, micro-f1: %.4lf, macro-f1: %.4lf\n'
            % (precision, recall, micro_f1, macro_f1))
    log_file.write("\tBest checkpoint: %s\n" % best_checkpoint)
    log_file.write('******************************************\n')
    log_file.close()
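The set_seed helper called in main is not shown in this snippet; a minimal sketch of the usual pattern in scripts like this (args.n_gpu is used above, while the args.seed attribute is an assumption):

import random
import numpy as np
import torch

def set_seed(args):
    # Seed every RNG in play so training runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)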