Example #1
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

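    # Choose the tokenizer: a character-level tokenizer (whose vocabulary is
    # built later from the training sentences) or the pretrained KoBERT
    # subword tokenizer from 'monologg/kobert'.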
    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M")

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    #train_data, valid_data = None, None
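    # Load training pairs according to the run mode: "train" reads the NSML
    # dataset from disk, "semi-train" additionally pulls generated pseudo-parallel
    # data from a previous session, and "pretrain" uses generated data only.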
    if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
        if args.mode == "train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

        if args.mode == "semi-train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/'+str(args.semi_dataset)
            # five copies
            #sess = 't0005/rush1-1/209'
            # one copy
            #sess = 't0005/rush1-1/224'
            semi_noisy_sents, semi_clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            semi_sents_annotation = ['None'] * len(semi_noisy_sents)

        if args.mode == "pretrain":
            print("PRETRAIN MODE ON!!")
            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/113'
            noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            sents_annotation = ['None']*len(noisy_sents)

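        # Tally the comma-separated error-type labels from the annotation file
        # to get a quick overview of the noise types in the training data.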
        error_type_counter = Counter()

        for annotation in sents_annotation:
            error_type_counter += Counter(annotation.split(','))

        print(error_type_counter)

        # noise-cleaning version:
        # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
        # original version:

        if args.mode == "semi-train":
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot} for noisy, clean, annot in
                     zip(noisy_sents, clean_sents, sents_annotation)]
            semi_pairs = [{"noisy": noisy, "clean": clean, "annotation": annot} for noisy, clean, annot in
                     zip(semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

            train_data = pairs[:-args.num_val_data] + semi_pairs
            valid_data = pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)

        else:
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot} for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

            train_data, valid_data = pairs[:-args.num_val_data], pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            #tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)



        ## to load pretrained model
        nsml.load(checkpoint='best', session='t0005/rush1-2/79')
        #print(tokenizer.vocab)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
        train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)
Example #2
from data_loader import read_strings
import os
from nsml import DATASET_PATH
import random

data_dir = os.path.join(DATASET_PATH, 'train')

## Decide how many copies of the data to make, i.e. how many times noise generation
## from the clean train_label data is repeated (see the illustrative noise_sketch at the end of this example).
num_copy = 5

clean_sents = read_strings(os.path.join(data_dir, "train_label"))*num_copy
noisy_sents = noise(clean_sents)

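# Shuffle the noisy/clean pairs with one shared index permutation so the
# alignment between the two lists is preserved.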
shuffle_idxs = list(range(len(clean_sents)))
random.shuffle(shuffle_idxs)

noise_sents_shuf = []
clean_sents_shuf = []
for i in shuffle_idxs:
    noise_sents_shuf.append(noisy_sents[i])
    clean_sents_shuf.append(clean_sents[i])

save_generated_data(noise_sents_shuf, clean_sents_shuf)

## Put the name of the session you are currently running, in the form 't0005/rush1-1/(session number)'.
## This lets you verify that the dataset generated at that location loads correctly.
checkpoint = 'generated_data'
sess = 't0005/rush1-1/230'
loaded_noisy_sents, loaded_clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)

print(loaded_noisy_sents[:10])
print(loaded_clean_sents[:10])
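
# Note: noise() used above is a project helper whose exact noise model is
# defined elsewhere in this repository. Purely as an illustrative sketch (the
# operations below are assumptions, not the project's actual implementation),
# a character-level noising function could look like this:
def noise_sketch(sentences, p=0.1):
    """Randomly drop characters and insert spurious spaces; illustration only."""
    noised = []
    for sent in sentences:
        out = []
        for ch in sent:
            r = random.random()
            if r < p / 2:
                continue              # drop this character
            out.append(ch)
            if r > 1 - p / 2:
                out.append(' ')       # insert a spurious space
        noised.append(''.join(out))
    return noised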
Example #3
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

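    # This variant trains a masked language model (BertForMaskedLM) on top of
    # the character tokenizer; the encoder-decoder TransformerModel is kept
    # commented out below for reference.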
    tokenizer = CharTokenizer([])
    configuration = BertConfig(vocab_size=args.vocab_size)
    model = BertForMaskedLM(configuration).to(args.device)
    '''
    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    '''
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    bind_nsml(model, tokenizer, args)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode == "train" or args.mode == "pretrain":
        if args.mode == "train":
            noisy_sents = read_strings(
                os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(
                os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(
                os.path.join(args.data_dir, "train_label"))

        if args.mode == "pretrain":
            print("PRETRAIN MODE ON!!")
            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/113'
            noisy_sents, clean_sents = load_generated_data(
                checkpoint=checkpoint, session=sess)
            sents_annotation = ['None'] * len(noisy_sents)

        error_type_counter = Counter()

        for annotation in sents_annotation:
            error_type_counter += Counter(annotation.split(','))

        print(error_type_counter)

        # noise-cleaning version:
        # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
        # original version:
        pairs = [{
            "noisy": noisy,
            "clean": clean,
            "annotation": annot
        } for noisy, clean, annot in zip(noisy_sents, clean_sents,
                                         sents_annotation)]
        train_data, valid_data = pairs[:-args.num_val_data], pairs[
            -args.num_val_data:]
        logger.info(f"# of train data: {len(train_data)}")
        logger.info(f"# of valid data: {len(valid_data)}")

        train_sents = [x['noisy']
                       for x in train_data] + [x['clean'] for x in train_data]
        tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
        bind_nsml(model, tokenizer, args)

        ## to load pretrained model
        #nsml.load(checkpoint='best', session='t0005/rush1-1/177')

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train" or args.mode == "pretrain":
        train(model, tokenizer, train_data, valid_data, args)
Example #4
def main():
    # from pathlib import Path
    # print("File      Path:", Path(__file__).absolute())
    # print("Directory Path:", Path().absolute())

    args = get_args()
    args.n_gpu = 1

    # noisy_sents_1 = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
    # clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))
    # noisy_sents_2 = read_strings(os.path.join(args.data_dir, "train_data", "train_corpus"))
    #
    # noisy_sents = noisy_sents_1 + noisy_sents_2
    # noise_space_ratio = []
    #
    # for sentence in noisy_sents:
    #     noise_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # clean_space_ratio = []
    # for sentence in clean_sents:
    #     clean_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # print("noise_space_ratio: {}, clean_space_ratio: {}".format(sum(noise_space_ratio) / len(noise_space_ratio),
    #                                                             sum(clean_space_ratio) / len(clean_space_ratio)))

    # ##########
    # ##for local
    # args.num_workers=0
    # args.train_batch_size = 4
    # args.eval_batch_size = 4
    # args.eval_interval = 10
    # ##########

    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    if args.load_vocab != "":
        tokenizer.load(args.load_vocab)
        args.vocab_size = tokenizer.__len__()

    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

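    # Checkpoint averaging: load several checkpoints from one NSML session,
    # average their parameters into a single model, and save the result as 'best'.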
    if args.mode != 'test' and args.averaging != "":
        sess = 't0005/rush1-3/37'
        checkpoints = ["4500", "6500", "7500", "8000"]

        nsml.load(checkpoint=checkpoints[0], session=sess)
        args.vocab_size = tokenizer.__len__()
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        # Average the parameters of the listed checkpoints. model.named_parameters()
        # returns a generator, so it must be re-created for each checkpoint rather
        # than reused after it has been exhausted.
        new_dict_params = {
            name: torch.zeros_like(param.data)
            for name, param in model.named_parameters()
        }

        for checkpoint in checkpoints:
            bind_nsml(model, tokenizer, args, eos=eos_setting)
            nsml.load(checkpoint=checkpoint, session=sess)
            for name, param in model.named_parameters():
                new_dict_params[name] += param.data / len(checkpoints)

        model.load_state_dict(new_dict_params, strict=False)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.save('best')

    elif args.mode == 'eval':
        print("I'm in EVAL")

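        # Restore the saved checkpoint (weights and vocabulary), rebuild the model
        # with the loaded vocabulary size, and run beam-search correction on a
        # slice of train_corpus, printing each noisy input next to its prediction.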
        checkpoint = 'best'
        sess = 't0005/rush1-3/507'
        nsml.load(checkpoint=checkpoint, session=sess)
        args.vocab_size = tokenizer.__len__()

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        model.eval()
        #noisy_sents = open("./naver_data_clean.txt", "r", encoding='utf-8').read().splitlines()
        noisy_sents = read_strings(
            os.path.join(args.data_dir, "train_data", "train_corpus"))
        valid_noisy = noisy_sents[:1000]

        prediction = correct_beam(model,
                                  tokenizer,
                                  valid_noisy,
                                  args,
                                  eos=True,
                                  length_limit=0.15)

        for i, pred in enumerate(prediction[:1000]):
            print("noisy_input: {}, pred: {}".format(valid_noisy[i], pred))

        # bind_txt(prediction)
        # nsml.save('prediction')

        # with open('naver_data_clean_again.txt', 'w',encoding='utf-8') as f:
        #     for i, pred in enumerate(prediction):
        #         if i%500==0: print(i)
        #         f.write("%s\n" % pred)

    ## only works with the char tokenizer
    ## TODO: support the KoBERT tokenizer and a different vocab size if needed
    elif args.mode != 'test' and args.resubmit != "":
        checkpoint = 'best'
        sess = 't0005/rush1-3/' + args.resubmit
        print(sess)

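        # Re-submission path: load the tokenizer first to recover the vocab size,
        # rebuild the model with that vocabulary, reload the trained weights, and
        # save them again under the 'best' checkpoint.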
        model = None
        tokenizer = CharTokenizer([])
        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        args.vocab_size = len(tokenizer)
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        ########## testing loaded model & tokenizer ###############

        # model.eval()
        # noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
        # valid_noisy = noisy_sents[-10:]
        #
        # prediction = correct(model, tokenizer, valid_noisy, args, eos=True, length_limit=0.1)
        #
        # for pred in prediction:
        #     print(pred)

        ##################

        nsml.save("best")

    else:
        #train_data, valid_data = None, None
        if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
            if args.mode == "train":
                # noisy_sents = open("./noisy_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # clean_sents = open("./clean_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # sents_annotation = ['None'] * len(noisy_sents)
                noisy_sents = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(
                    os.path.join(args.data_dir, "train_data",
                                 "train_annotation"))
                clean_sents = read_strings(
                    os.path.join(args.data_dir, "train_label"))

            if args.mode == "semi-train":
                noisy_sents = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(
                    os.path.join(args.data_dir, "train_data",
                                 "train_annotation"))
                clean_sents = read_strings(
                    os.path.join(args.data_dir, "train_label"))

                checkpoint = 'generated_data'
                sess = 't0005/rush1-1/' + str(args.semi_dataset)
                # five copies
                #sess = 't0005/rush1-1/209'
                # one copy
                #sess = 't0005/rush1-1/224'
                semi_noisy_sents, semi_clean_sents = load_generated_data(
                    checkpoint=checkpoint, session=sess)
                semi_sents_annotation = ['None'] * len(semi_noisy_sents)

            if args.mode == "pretrain":
                print("PRETRAIN MODE ON!!")
                noisy_sents = read_strings(
                    os.path.join('sejong_corpus', args.noisy_file))
                clean_sents = read_strings(
                    os.path.join('sejong_corpus', args.clean_file))
                # checkpoint = 'generated_data'
                # sess = 't0005/rush1-1/113'
                # noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
                sents_annotation = ['None'] * len(noisy_sents)

            error_type_counter = Counter()

            for annotation in sents_annotation:
                error_type_counter += Counter(annotation.split(','))

            print(error_type_counter)

            # noise-cleaning version:
            # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
            # original version:

            if args.mode == "semi-train":
                pairs = [{
                    "noisy": noisy,
                    "clean": clean,
                    "annotation": annot
                }
                         for noisy, clean, annot in zip(
                             noisy_sents, clean_sents, sents_annotation)]
                semi_pairs = [{
                    "noisy": noisy,
                    "clean": clean,
                    "annotation": annot
                } for noisy, clean, annot in zip(
                    semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

                train_data = pairs[:-args.num_val_data] + semi_pairs
                valid_data = pairs[-args.num_val_data:]
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")

                train_sents = [x['noisy'] for x in train_data
                               ] + [x['clean'] for x in train_data]
                tokenizer = CharTokenizer.from_strings(train_sents,
                                                       args.vocab_size)
                bind_nsml(model, tokenizer, args, eos=eos_setting)

            else:
                pairs = [{
                    "noisy": noisy,
                    "clean": clean,
                    "annotation": annot
                }
                         for noisy, clean, annot in zip(
                             noisy_sents, clean_sents, sents_annotation)]

                train_data, valid_data = train_test_split(
                    pairs, test_size=args.val_ratio,
                    random_state=args.seed)  # test: about 1000
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")

                # print("validation: ", valid_data)

                train_sents = [x['noisy'] for x in train_data
                               ] + [x['clean'] for x in train_data]
                # train_sents = [x['clean'] for x in train_data]

                if args.load_model != "" and args.mode == "train":  # Load pretrained model
                    print("load pretrained model")
                    model.load_state_dict(
                        torch.load(args.load_model, map_location=args.device))

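                    # Optionally freeze the encoder and decoder embedding tables
                    # of the loaded pretrained model during fine-tuning.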
                    if args.freeze:
                        model.token_embeddings.weight.requires_grad = False
                        model.decoder_embeddings.weight.requires_grad = False

                if args.tokenizer == 'char' and args.load_vocab == "":
                    tokenizer = CharTokenizer.from_strings(
                        train_sents, args.vocab_size)
                    print(
                        f'tokenizer loaded from strings. len={len(tokenizer)}.'
                    )

                bind_nsml(model, tokenizer, args, eos=eos_setting)

                if args.tokenizer == 'char' and tokenizer is not None:
                    tokenizer.save('vocab.txt')

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model, dim=1)

        if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
            train(model,
                  tokenizer,
                  train_data,
                  valid_data,
                  args,
                  eos=eos_setting)