def main():
    args = get_args()
    args.n_gpu = 1

    set_seed(args)

    # Construct tokenizer
    tokenizer = CharTokenizer([])
    tokenizer.load(args.load_vocab)
    args.vocab_size = len(tokenizer)

    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    # GPU setting
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Construct model
    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    # Load data
    noisy_sents = read_strings(os.path.join('sejong_corpus', args.noisy_file))
    clean_sents = read_strings(os.path.join('sejong_corpus', args.clean_file))
    sents_annotation = ['None'] * len(noisy_sents)

    pairs = [{
        "noisy": noisy,
        "clean": clean,
        "annotation": annot
    } for noisy, clean, annot in zip(noisy_sents, clean_sents,
                                     sents_annotation)]

    # Train-validation split
    train_data, valid_data = train_test_split(
        pairs, test_size=args.val_ratio,
        random_state=args.seed)  # test: about 1000
    logger.info(f"# of train data: {len(train_data)}")
    logger.info(f"# of valid data: {len(valid_data)}")

    train(model, tokenizer, train_data, valid_data, args, eos=args.eos_setting)
Example #2
def read_langs(lang1, lang2, term="txt", reverse=False, char_output=False):
    print("Reading lines...")

    # Read the file and split into lines

    # Attach the path here for the source and target language dataset
    filename = '%s/train/%s-%s.%s' % (opt.main_data_dir, lang1, lang2, term)
    # This creates the file path, which is used below

    # Read the file into a list of lines (context manager closes the file)
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    # Split every line into pairs
    pairs = [l.split('\t') for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)

        if char_output:
            output_lang = CharTokenizer(vocab_file='')
        else:
            output_lang = Lang(lang2)

    return input_lang, output_lang, pairs
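
A minimal usage sketch (not in the original); it assumes opt.main_data_dir contains train/eng-fra.txt, and the language codes are placeholders:

# Hypothetical invocation of read_langs; 'eng'/'fra' are placeholder codes.
input_lang, output_lang, pairs = read_langs('eng', 'fra', term='txt')
print('%d sentence pairs; first: %s' % (len(pairs), pairs[0]))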
Example #3
    def __init__(self, args):
        super(ParallelTraining, self).__init__()
        if args.tokenizer == 'char':
            _tokenizer = CharTokenizer()
        else:
            print('use BPE 1000')
            _tokenizer = HuggingFaceTokenizer()  # use BPE-1000
        audio_feature = args.audio_feat
        if args.concat:
            audio_feature *= 3

        self.tokenizer = _tokenizer
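        # RNN-T loss; blank=0 reserves vocabulary index 0 for the blank label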
        self.loss_fn = RNNTLoss(blank=0)
        self.model = Transducer(
            audio_feature,
            _tokenizer.vocab_size,
            args.vocab_dim,  # vocab embedding dim
            args.h_dim,  # hidden dim
            args.layers,
            pred_num_layers=args.pred_layers,
            dropout=args.dropout)
        self.latest_alignment = None
        self.steps = 0
        self.epoch = 0
        self.args = args
        self.best_wer = 1000
Example #4
def main(args):
    # config
    config = TrainConfig(**args)
    config = config._replace(
        log_dir=config.log_dir.format(config.tokenizer),
        summary_dir=config.summary_dir.format(config.tokenizer),
        # checkpoint_dir=config.checkpoint_dir.format(config.tokenizer),
    )
    set_seed(config.seed)

    os.makedirs(config.log_dir, exist_ok=True)
    os.makedirs(config.summary_dir, exist_ok=True)
    # os.makedirs(config.checkpoint_dir, exist_ok=True)

    # logger
    logger = get_logger(log_path=os.path.join(config.log_dir, "logs.txt"))
    logger.info(config)

    # Create the basic modules (vocab, tokenizer)
    tokenizer_dir = os.path.join(config.resource_dir, config.tokenizer)
    logger.info(f"get vocab and tokenizer from {tokenizer_dir}")
    vocab = Vocab(os.path.join(tokenizer_dir, "tok.vocab"))
    if config.tokenizer.startswith("mecab-"):
        tokenizer = MeCabTokenizer(os.path.join(tokenizer_dir, "tok.json"))
    elif config.tokenizer.startswith("sp-"):
        tokenizer = SentencePieceTokenizer(
            os.path.join(tokenizer_dir, "tok.model"))
    elif config.tokenizer.startswith("mecab_sp-"):
        mecab = MeCabTokenizer(os.path.join(tokenizer_dir, "tok.json"))
        sp = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))
        tokenizer = MeCabSentencePieceTokenizer(mecab, sp)
    elif config.tokenizer.startswith("char-"):
        tokenizer = CharTokenizer()
    elif config.tokenizer.startswith("word-"):
        tokenizer = WordTokenizer()
    elif config.tokenizer.startswith("jamo-"):
        tokenizer = JamoTokenizer()
    else:
        raise ValueError("Wrong tokenizer name.")

    # Prepare the data to feed the model
    # label-to-index
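    # PAWS is a binary paraphrase-identification task, hence exactly two classes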
    label_to_index = {"0": 0, "1": 1}
    # Train
    logger.info(f"read training data from {config.train_path}")
    train_sentence_as, train_sentence_bs, train_labels = load_data(
        config.train_path, label_to_index)
    # Dev
    logger.info(f"read dev data from {config.dev_path}")
    dev_sentence_as, dev_sentence_bs, dev_labels = load_data(
        config.dev_path, label_to_index)
    # Test
    logger.info(f"read test data from {config.test_path}")
    test_sentence_as, test_sentence_bs, test_labels = load_data(
        config.test_path, label_to_index)

    # Build dataloaders from the data
    # Train
    logger.info("create data loader using training data")
    train_dataset = PAWSDataset(train_sentence_as, train_sentence_bs,
                                train_labels, vocab, tokenizer,
                                config.max_sequence_length)
    train_random_sampler = RandomSampler(train_dataset)
    train_data_loader = DataLoader(train_dataset,
                                   sampler=train_random_sampler,
                                   batch_size=config.batch_size)
    # Dev
    logger.info("create data loader using dev data")
    dev_dataset = PAWSDataset(dev_sentence_as, dev_sentence_bs, dev_labels,
                              vocab, tokenizer, config.max_sequence_length)
    dev_data_loader = DataLoader(dev_dataset, batch_size=1024)
    # Test
    logger.info("create data loader using test data")
    test_dataset = PAWSDataset(test_sentence_as, test_sentence_bs, test_labels,
                               vocab, tokenizer, config.max_sequence_length)
    test_data_loader = DataLoader(test_dataset, batch_size=1024)

    # Prepare the Summary Writer
    summary_writer = SummaryWriter(log_dir=config.summary_dir)

    # Code that prepares the model
    logger.info("initialize model and convert bert pretrained weight")
    bert_config = BertConfig.from_json_file(
        os.path.join(config.resource_dir, config.tokenizer,
                     config.bert_config_file_name))
    model = PAWSModel(bert_config, config.dropout_prob)
    model.bert = load_pretrained_bert(
        bert_config,
        os.path.join(config.resource_dir, config.tokenizer,
                     config.pretrained_bert_file_name))

    trainer = Trainer(config, model, train_data_loader, dev_data_loader,
                      test_data_loader, logger, summary_writer)
    trainer.train()
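
The if/elif prefix dispatch above (repeated in Example #5) can also be written as a lookup table; a sketch under that assumption, where get_tokenizer is a hypothetical helper and the tokenizer classes are the ones this example already uses:

# Hypothetical table-driven equivalent of the tokenizer dispatch above.
TOKENIZER_FACTORIES = {
    "mecab-": lambda d: MeCabTokenizer(os.path.join(d, "tok.json")),
    "sp-": lambda d: SentencePieceTokenizer(os.path.join(d, "tok.model")),
    "mecab_sp-": lambda d: MeCabSentencePieceTokenizer(
        MeCabTokenizer(os.path.join(d, "tok.json")),
        SentencePieceTokenizer(os.path.join(d, "tok.model"))),
    "char-": lambda d: CharTokenizer(),
    "word-": lambda d: WordTokenizer(),
    "jamo-": lambda d: JamoTokenizer(),
}

def get_tokenizer(name, tokenizer_dir):
    for prefix, factory in TOKENIZER_FACTORIES.items():
        if name.startswith(prefix):
            return factory(tokenizer_dir)
    raise ValueError("Wrong tokenizer name.")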
Example #5
def main(cli_args):
    # Read from config file and make args
    with open("./tasks/korquad/config.json", "r") as f:
        args = AttrDict(json.load(f))
    args.seed = cli_args.seed
    args.tokenizer = cli_args.tokenizer
    args.output_dir = args.output_dir.format(args.tokenizer)
    args.resource_dir = cli_args.resource_dir
    args.data_dir = cli_args.data_dir
    logger.info(f"Training/evaluation parameters {args}")

    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    init_logger()
    set_seed(args.seed)

    logging.getLogger("transformers.data.metrics.squad_metrics").setLevel(
        logging.WARN)  # Reduce model loading logs

    # custom tokenizers
    tokenizer_dir = os.path.join(args.resource_dir, args.tokenizer)
    logger.info(f"get vocab and tokenizer from {tokenizer_dir}")
    if args.tokenizer.startswith("mecab-"):
        custom_tokenizer = MeCabTokenizer(
            os.path.join(tokenizer_dir, "tok.json"))
    elif args.tokenizer.startswith("sp-"):
        custom_tokenizer = SentencePieceTokenizer(
            os.path.join(tokenizer_dir, "tok.model"))
    elif args.tokenizer.startswith("mecab_sp-"):
        mecab = MeCabTokenizer(os.path.join(tokenizer_dir, "tok.json"))
        sp = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))
        custom_tokenizer = MeCabSentencePieceTokenizer(mecab, sp)
    elif args.tokenizer.startswith("char-"):
        custom_tokenizer = CharTokenizer()
    elif args.tokenizer.startswith("word-"):
        custom_tokenizer = WordTokenizer()
    elif args.tokenizer.startswith("jamo-"):
        custom_tokenizer = JamoTokenizer()
    else:
        raise ValueError("Wrong tokenizer name.")

    # Load pretrained model and tokenizer
    config = BertConfig.from_json_file(
        os.path.join(args.resource_dir, args.tokenizer,
                     args.bert_config_file_name))
    tokenizer = BertTokenizer(os.path.join(tokenizer_dir, "tok.vocab"),
                              custom_tokenizer)
    model = KorQuADModel(config)
    model.bert = load_pretrained_bert(
        config,
        os.path.join(args.resource_dir, args.tokenizer,
                     args.pretrained_bert_file_name))
    # GPU or CPU
    args.device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(args.device)

    logger.info(f"Training/evaluation parameters {args}")

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(f" global_step = {global_step}, average loss = {tr_loss}")

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval:
        checkpoints = list(
            os.path.dirname(c) for c in sorted(
                glob.glob(args.output_dir + "/**/" + "pytorch_model.bin",
                          recursive=True)))
        if not args.eval_all_checkpoints:
            checkpoints = checkpoints[-1:]
        else:
            logging.getLogger("transformers.configuration_utils").setLevel(
                logging.WARN)  # Reduce model loading logs
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce model loading logs

        logger.info(f"Evaluate the following checkpoints: {checkpoints}")

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1]
            model = KorQuADModel.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=global_step)
            result = dict((k + (f"_{global_step}" if global_step else ""), v)
                          for k, v in result.items())
            results.update(result)

        output_dir = os.path.join(args.output_dir, "eval")
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "eval_result.txt"),
                  "w",
                  encoding="utf-8") as f:
            official_eval_results = eval_during_train(args)
            for key in sorted(official_eval_results.keys()):
                logger.info(f"  {key} = {official_eval_results[key]}")
                f.write(f" {key} = {official_eval_results[key]}\n")
Example #6
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M")

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    #train_data, valid_data = None, None
    if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
        if args.mode == "train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

        if args.mode == "semi-train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/'+str(args.semi_dataset)
            # five copy
            #sess = 't0005/rush1-1/209'
            # one copy
            #sess = 't0005/rush1-1/224'
            semi_noisy_sents, semi_clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            semi_sents_annotation = ['None'] * len(semi_noisy_sents)

        if args.mode == "pretrain":
            print("PRETRAIN MODE ON!!")
            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/113'
            noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            sents_annotation = ['None']*len(noisy_sents)

        error_type_counter = Counter()

        for annotation in sents_annotation:
            error_type_counter += Counter(annotation.split(','))

        print(error_type_counter)

        # noise-cleaning version
        # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
        # original version

        if args.mode == "semi-train":
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot} for noisy, clean, annot in
                     zip(noisy_sents, clean_sents, sents_annotation)]
            semi_pairs = [{"noisy": noisy, "clean": clean, "annotation": annot} for noisy, clean, annot in
                     zip(semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

            train_data = pairs[:-args.num_val_data]+semi_pairs
            valid_data = pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)

        else:
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot} for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

            train_data, valid_data = pairs[:-args.num_val_data], pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            #tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)



        ## to load pretrained model
        nsml.load(checkpoint='best', session='t0005/rush1-2/79')
        #print(tokenizer.vocab)

    if args.n_gpu > 1:
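        # dim=1: DataParallel scatters inputs along dim 1, which suggests sequence-first (length, batch) tensors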
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
        train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)
Example #7
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    tokenizer = CharTokenizer([])
    configuration = BertConfig(vocab_size=args.vocab_size)
    model = BertForMaskedLM(configuration).to(args.device)
    '''
    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    '''
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    bind_nsml(model, tokenizer, args)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode == "train" or args.mode == "pretrain":
        if args.mode == "train":
            noisy_sents = read_strings(
                os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(
                os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(
                os.path.join(args.data_dir, "train_label"))

        if args.mode == "pretrain":
            print("PRETRAIN MODE ON!!")
            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/113'
            noisy_sents, clean_sents = load_generated_data(
                checkpoint=checkpoint, session=sess)
            sents_annotation = ['None'] * len(noisy_sents)

        error_type_counter = Counter()

        for annotation in sents_annotation:
            error_type_counter += Counter(annotation.split(','))

        print(error_type_counter)

        # noise-cleaning version
        # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
        # original version
        pairs = [{
            "noisy": noisy,
            "clean": clean,
            "annotation": annot
        } for noisy, clean, annot in zip(noisy_sents, clean_sents,
                                         sents_annotation)]
        train_data, valid_data = pairs[:-args.num_val_data], pairs[-args.num_val_data:]
        logger.info(f"# of train data: {len(train_data)}")
        logger.info(f"# of valid data: {len(valid_data)}")

        train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
        tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
        bind_nsml(model, tokenizer, args)

        ## to load pretrained model
        #nsml.load(checkpoint='best', session='t0005/rush1-1/177')

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train" or args.mode == "pretrain":
        train(model, tokenizer, train_data, valid_data, args)
Example #8
from tokenizer import CharTokenizer
from model import TransformerModel
from train import bind_nsml

parser = argparse.ArgumentParser()
parser.add_argument('--session', type=str, default="")
parser.add_argument('--checkpoint', type=str, default="best")
args = parser.parse_args()
args.tokenizer = 'char'

session = 't0005/rush1-3/' + args.session
checkpoint = args.checkpoint
print(f'session: {session}\ncheckpoint: {checkpoint}')

model = None
tokenizer = CharTokenizer([])

bind_nsml(model, tokenizer)
nsml.load(checkpoint=checkpoint, session=session)

args.vocab_size = len(tokenizer)
print(f'vocab_size: {args.vocab_size}')

model = TransformerModel(
    vocab_size=args.vocab_size,
    hidden_size=args.hidden_size,
    num_attention_heads=args.num_attention_heads,
    num_encoder_layers=args.num_encoder_layers,
    num_decoder_layers=args.num_decoder_layers,
    intermediate_size=args.intermediate_size,
    dropout=args.dropout,
)
Example #9
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    #train_data, valid_data = None, None
    if args.mode == "train":
        noisy_sents_labeled = read_strings(
            os.path.join(args.data_dir, "train_data", "train_data"))
        sents_annotation_labeled = read_strings(
            os.path.join(args.data_dir, "train_data", "train_annotation"))
        clean_sents_labeled = read_strings(
            os.path.join(args.data_dir, "train_label"))
        noisy_sents = read_strings(
            os.path.join(args.data_dir, "train_data", "train_corpus"))

        pairs = noisy_sents
        pairs_labeled = clean_sents_labeled

        train_data = pairs + noisy_sents_labeled[:-args.num_val_data] + pairs_labeled[:-args.num_val_data]
        valid_data = pairs_labeled[-args.num_val_data:]
        logger.info(f"# of train data: {len(train_data)}")
        logger.info(f"# of valid data: {len(valid_data)}")

        train_sents = list(train_data)

        if args.tokenizer == 'char':
            tokenizer = CharTokenizer.from_strings(train_sents,
                                                   args.vocab_size)
        print("===vocab size: ", len(tokenizer))
        args.vocab_size = len(tokenizer)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train":
        train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)
Example #10
best_checkpoint = os.path.join(eval_args.name, 'amp_checkpoint.pt')
if not os.path.exists(best_checkpoint):
    raise ValueError('Checkpoint not found: %s' % best_checkpoint)

checkpoint = torch.load(best_checkpoint, map_location='cpu')
with open(os.path.join(eval_args.name, 'vars.json'), 'r') as f:
    params = json.load(f)
print('Checkpoint at epoch %d' % checkpoint['epoch'])
args = Struct(**params)
window_size = eval_args.window_size
window_stride = 0.01
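# sounddevice (sd) microphone-capture defaults; 16 kHz presumably matches the training features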
sd.default.samplerate = 16000
duration = 60  # seconds

if args.tokenizer == 'char':
    _tokenizer = CharTokenizer()
else:
    _tokenizer = HuggingFaceTokenizer()  # use BPE-400
    print('use bpe')

model = Transducer(
    args.audio_feat,
    _tokenizer.vocab_size,
    args.vocab_dim,  # vocab embedding dim
    args.h_dim,  # hidden dim
    args.layers,
    pred_num_layers=args.pred_layers,
    dropout=args.dropout).cpu()

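# Presumably undoes the 3x frame concatenation used at training time (cf. args.concat in Example #3)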
if args.audio_feat > 80:
    args.audio_feat = args.audio_feat // 3
Example #11
def main():
    # from pathlib import Path
    # print("File      Path:", Path(__file__).absolute())
    # print("Directory Path:", Path().absolute())

    args = get_args()
    args.n_gpu = 1

    # noisy_sents_1 = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
    # clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))
    # noisy_sents_2 = read_strings(os.path.join(args.data_dir, "train_data", "train_corpus"))
    #
    # noisy_sents = noisy_sents_1 + noisy_sents_2
    # noise_space_ratio = []
    #
    # for sentence in noisy_sents:
    #     noise_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # clean_space_ratio = []
    # for sentence in clean_sents:
    #     clean_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # print("noise_space_ratio: {}, clean_space_ratio: {}".format(sum(noise_space_ratio) / len(noise_space_ratio),
    #                                                             sum(clean_space_ratio) / len(clean_space_ratio)))

    # ##########
    # ##for local
    # args.num_workers=0
    # args.train_batch_size = 4
    # args.eval_batch_size = 4
    # args.eval_interval = 10
    # ##########

    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    if args.load_vocab != "":
        tokenizer.load(args.load_vocab)
        args.vocab_size = len(tokenizer)

    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode != 'test' and args.averaging != "":
        sess = 't0005/rush1-3/37'
        checkpoints = ["4500", "6500", "7500", "8000"]

        nsml.load(checkpoint=checkpoints[0], session=sess)
        args.vocab_size = len(tokenizer)
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        # Average the parameters of the listed checkpoints. The accumulator
        # starts at zero so each checkpoint contributes 1/len(checkpoints);
        # calling model.named_parameters() afresh in each loop avoids
        # exhausting a shared generator.
        new_dict_params = {
            name: torch.zeros_like(param)
            for name, param in model.named_parameters()
        }

        for checkpoint in checkpoints:
            bind_nsml(model, tokenizer, args, eos=eos_setting)
            nsml.load(checkpoint=checkpoint, session=sess)
            for name, param in model.named_parameters():
                new_dict_params[name] += param.data / len(checkpoints)

        model.load_state_dict(new_dict_params, strict=False)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.save('best')

    elif args.mode == 'eval':
        print("I'm in EVAL")

        checkpoint = 'best'
        sess = 't0005/rush1-3/507'
        nsml.load(checkpoint=checkpoint, session=sess)
        args.vocab_size = len(tokenizer)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        model.eval()
        #noisy_sents = open("./naver_data_clean.txt", "r", encoding='utf-8').read().splitlines()
        noisy_sents = read_strings(
            os.path.join(args.data_dir, "train_data", "train_corpus"))
        valid_noisy = noisy_sents[:1000]

        prediction = correct_beam(model,
                                  tokenizer,
                                  valid_noisy,
                                  args,
                                  eos=True,
                                  length_limit=0.15)

        for i, pred in enumerate(prediction[:1000]):
            print("noisy_input: {}, pred: {}".format(valid_noisy[i], pred))

        # bind_txt(prediction)
        # nsml.save('prediction')

        # with open('naver_data_clean_again.txt', 'w',encoding='utf-8') as f:
        #     for i, pred in enumerate(prediction):
        #         if i%500==0: print(i)
        #         f.write("%s\n" % pred)

    ## only works with the char tokenizer
    ## TODO: kobert tokenizer, different vocab size if needed
    elif args.mode != 'test' and args.resubmit != "":
        checkpoint = 'best'
        sess = 't0005/rush1-3/' + args.resubmit
        print(sess)

        model = None
        tokenizer = CharTokenizer([])
        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        args.vocab_size = len(tokenizer)
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        ########## testing loaded model & tokenizer ###############

        # model.eval()
        # noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
        # valid_noisy = noisy_sents[-10:]
        #
        # prediction = correct(model, tokenizer, valid_noisy, args, eos=True, length_limit=0.1)
        #
        # for pred in prediction:
        #     print(pred)

        ##################

        nsml.save("best")

    else:
        #train_data, valid_data = None, None
        if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
            if args.mode == "train":
                # noisy_sents = open("./noisy_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # clean_sents = open("./clean_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # sents_annotation = ['None'] * len(noisy_sents)
                noisy_sents = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(
                    os.path.join(args.data_dir, "train_data",
                                 "train_annotation"))
                clean_sents = read_strings(
                    os.path.join(args.data_dir, "train_label"))

            if args.mode == "semi-train":
                noisy_sents = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(
                    os.path.join(args.data_dir, "train_data",
                                 "train_annotation"))
                clean_sents = read_strings(
                    os.path.join(args.data_dir, "train_label"))

                checkpoint = 'generated_data'
                sess = 't0005/rush1-1/' + str(args.semi_dataset)
                # five copy
                #sess = 't0005/rush1-1/209'
                # one copy
                #sess = 't0005/rush1-1/224'
                semi_noisy_sents, semi_clean_sents = load_generated_data(
                    checkpoint=checkpoint, session=sess)
                semi_sents_annotation = ['None'] * len(semi_noisy_sents)

            if args.mode == "pretrain":
                print("PRETRAIN MODE ON!!")
                noisy_sents = read_strings(
                    os.path.join('sejong_corpus', args.noisy_file))
                clean_sents = read_strings(
                    os.path.join('sejong_corpus', args.clean_file))
                # checkpoint = 'generated_data'
                # sess = 't0005/rush1-1/113'
                # noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
                sents_annotation = ['None'] * len(noisy_sents)

            error_type_counter = Counter()

            for annotation in sents_annotation:
                error_type_counter += Counter(annotation.split(','))

            print(error_type_counter)

            # noise-cleaning version
            # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
            # original version

            if args.mode == "semi-train":
                pairs = [{
                    "noisy": noisy,
                    "clean": clean,
                    "annotation": annot
                }
                         for noisy, clean, annot in zip(
                             noisy_sents, clean_sents, sents_annotation)]
                semi_pairs = [{
                    "noisy": noisy,
                    "clean": clean,
                    "annotation": annot
                } for noisy, clean, annot in zip(
                    semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

                train_data = pairs[:-args.num_val_data] + semi_pairs
                valid_data = pairs[-args.num_val_data:]
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")

                train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
                tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
                bind_nsml(model, tokenizer, args, eos=eos_setting)

            else:
                pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                         for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

                train_data, valid_data = train_test_split(
                    pairs, test_size=args.val_ratio,
                    random_state=args.seed)  # test: about 1000
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")

                # print("validation: ", valid_data)

                train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
                # train_sents = [x['clean'] for x in train_data]

                if args.load_model != "" and args.mode == "train":  # Load pretrained model
                    print("load pretrained model")
                    model.load_state_dict(
                        torch.load(args.load_model, map_location=args.device))

                    if args.freeze:
                        model.token_embeddings.weight.requires_grad = False
                        model.decoder_embeddings.weight.requires_grad = False

                if args.tokenizer == 'char' and args.load_vocab == "":
                    tokenizer = CharTokenizer.from_strings(
                        train_sents, args.vocab_size)
                    print(
                        f'tokenizer loaded from strings. len={len(tokenizer)}.'
                    )

                bind_nsml(model, tokenizer, args, eos=eos_setting)

                if args.tokenizer == 'char' and tokenizer is not None:
                    tokenizer.save('vocab.txt')

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model, dim=1)

        if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
            train(model,
                  tokenizer,
                  train_data,
                  valid_data,
                  args,
                  eos=eos_setting)
Example #12
import torch

from torch.utils.data import DataLoader, SequentialSampler
from tokenizer import CharTokenizer
from model import TransformerModel
from train import bind_nsml, get_args  # assumed module for these, mirroring the imports in Example #8
from dataset import TextDataset, collate_fn
from data_loader import read_strings
import os
from evaluation import gleu, gleu_one

from collections import Counter, defaultdict
from statistics import mean, stdev
from tokenization_kobert import KoBertTokenizer

from sklearn.model_selection import train_test_split

args = get_args()
args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = CharTokenizer([])

model = TransformerModel(
    vocab_size=args.vocab_size,
    hidden_size=args.hidden_size,
    num_attention_heads=args.num_attention_heads,
    num_encoder_layers=args.num_encoder_layers,
    num_decoder_layers=args.num_decoder_layers,
    intermediate_size=args.intermediate_size,
    dropout=args.dropout,
).to(args.device)

checkpoint = 'best'
sess = 't0005/rush1-3/507'

bind_nsml(model, tokenizer, args)