Example #1
def getModel(modelname):
    global current_modelname, current_modeltime, current_model, current_tokenizer
    if current_modelname != modelname or current_modeltime != getModelModificationTime(
            modelname):
        app.logger.info(f"loading {modelname}")
        current_modelname = modelname
        current_modeltime = getModelModificationTime(modelname)

        # Code used before adding a models directory to support multiple models (which may have different vocab)
        # parser.add_argument('--tokenizer_path', default='cache/vocab_processed.txt', type=str, required=False, help='vocab file path')
        # tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
        tokenizer_path = f'models/{modelname}/cache/vocab_processed.txt'
        current_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=tokenizer_path)
        app.logger.info(f"tokenizer loaded from {tokenizer_path}")

        # Code used before adding a models directory to support multiple models
        # parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='model path')
        # model = GPT2LMHeadModel.from_pretrained(args.model_path)
        model_path = f'models/{modelname}/final_model'
        current_model = GPT2LMHeadModel.from_pretrained(model_path)
        current_model.to(device)
        current_model.eval()
        app.logger.info(f"model loaded from {model_path}")

    return current_model, current_tokenizer
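A minimal usage sketch for the cache above. `getModelModificationTime` and the `current_*` globals are not shown in the excerpt; the versions below are assumptions (an `os.path.getmtime` wrapper and `None`-initialized cache), and 'novel' is a hypothetical model name:

import os

current_modelname = current_modeltime = current_model = current_tokenizer = None  # assumed cache init

def getModelModificationTime(modelname):
    # Assumed helper: the mtime changes when the model is re-saved, which
    # invalidates the cache inside getModel.
    return os.path.getmtime(f'models/{modelname}/final_model')

# First call loads from disk; later calls return the cached pair until the file changes.
model, tokenizer = getModel('novel')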
Example #2
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--tokenizer_path',
                        default='cache/vocab.txt',
                        type=str,
                        required=False,
                        help='vocabulary file to use')
    parser.add_argument('--raw_data_path',
                        default='data/',
                        type=str,
                        required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at word level')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path

    # if raw:
    #     print('building files')
    #     build_files(raw_data_path=raw_data_path, tokenized_data_path=tokenized_data_path, full_tokenizer=full_tokenizer,
    #                 num_pieces=num_pieces)
    #     print('files built')
    raw_data_files = [
        join(raw_data_path, f) for f in listdir(raw_data_path)
        if isfile(join(raw_data_path, f))
    ]
    random.shuffle(raw_data_files)
    each_size = len(raw_data_files) // 8
    split_raw_data_files = []
    for i in range(8):
        split_raw_data_files.append(raw_data_files[i * each_size:(i + 1) *
                                                   each_size])

    def tokenization(index, file_shards):
        # Each spawned process tokenizes its own shard of the file list.
        for file_path in file_shards[index]:
            get_tokenization(file_path, tokenized_data_path, full_tokenizer)

    xmp.spawn(tokenization,
              args=(split_raw_data_files, ),
              nprocs=8,
              start_method='fork')
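One caveat in the split above: `len(raw_data_files) // 8` truncates, so up to seven trailing files are silently dropped whenever the file count is not a multiple of 8. A remainder-preserving sketch (round-robin assignment is safe here because the list was just shuffled):

def split_evenly(items, n):
    # Round-robin sharding: shard sizes differ by at most one, nothing is dropped.
    return [items[i::n] for i in range(n)]

shards = split_evenly(list(range(10)), 8)
# -> [[0, 8], [1, 9], [2], [3], [4], [5], [6], [7]]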
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary file to use')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw data first')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report loss every this many steps; set to a multiple of gradient_accumulation')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the training-data window')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed precision')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length', default=20, type=int, required=False, help='minimum article length to include')
    parser.add_argument('--n_ctx', default=50, type=int, required=False, help='training sample length')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='pretrained model to start training from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='TensorBoard directory')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at word level')
    parser.add_argument('--bpe_token', action='store_true', help='use BPE subword tokenizer')
    parser.add_argument('--padding', action='store_true', help='pad inputs to fixed length')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")
    parser.add_argument('--max_steps_perEpoch_perPiece', default=1000000, type=int, required=False)
    parser.add_argument('--steps_savemodel', default=10000, type=int, required=False, help='save the model every this many steps')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    num_pieces = args.num_pieces
    min_length = args.min_length
    n_ctx = args.n_ctx
    padding = args.padding
    print('building files')
    build_files(data_path=raw_data_path, tokenized_data_path=tokenized_data_path, num_pieces=num_pieces,
                full_tokenizer=full_tokenizer, min_length=min_length, n_ctx=n_ctx, padding=padding)
    print('files built')
Example #4
def __init__(self, model_path, tokenizer_path):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    model.eval()
    tokenizer = tokenization_bert_word_level.BertTokenizer(
        vocab_file=tokenizer_path)
    vocab = Gpt2Vocab(tokenizer)
    self.device = device
    self.model = model
    self.vocab = vocab
    self.tokenizer = tokenizer
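The enclosing class is not named in this excerpt; a construction sketch with `Generator` as a hypothetical stand-in and placeholder paths:

# Hypothetical class name and paths, for illustration only.
gen = Generator(model_path='model/final_model',
                tokenizer_path='cache/vocab_processed.txt')
print(gen.device)  # 'cuda' when a GPU is available, otherwise 'cpu'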
Example #5
def tokenizer_test():
    segment = False
    if segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file='./data/text.data/vocab_processed.txt')
    full_tokenizer.max_len = 100
    line = '你还不了解我,蛛哥就知道,我很快就毛事了,只是被好朋友误会有点不好受'
    line1 = full_tokenizer.tokenize(line)
    print(line1)
    ids = full_tokenizer.convert_tokens_to_ids(line1)
    print(ids)
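To make the token-to-id mapping concrete, a toy round trip with a throwaway vocabulary (hypothetical entries, reusing the same `tokenization_bert` import; the real run above uses vocab_processed.txt):

import tempfile

# Hypothetical 5-token vocab, only to illustrate the token -> id mapping.
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
    f.write('\n'.join(['[UNK]', '[CLS]', '[SEP]', '你', '好']))
toy_tokenizer = tokenization_bert.BertTokenizer(vocab_file=f.name)
tokens = toy_tokenizer.tokenize('你好')             # ['你', '好']
print(toy_tokenizer.convert_tokens_to_ids(tokens))  # [3, 4]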
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/model_config.json',
                        type=str,
                        required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab.txt',
                        type=str,
                        required=False,
                        help='vocabulary file to use')
    parser.add_argument('--raw_data_path',
                        default='data/',
                        type=str,
                        required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw data first')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=2000,
                        type=int,
                        required=False,
                        help='number of warmup steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='report loss every this many steps')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='stride of the training-data window')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed precision')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=100,
                        type=int,
                        required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='model output directory')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='pretrained model to start training from')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at word level')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # don't enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir

    # if raw:
    #     print('building files')
    #     build_files(raw_data_path=raw_data_path, tokenized_data_path=tokenized_data_path, full_tokenizer=full_tokenizer,
    #                 num_pieces=num_pieces)
    #     print('files built')
    raw_data_files = [
        join(raw_data_path, f) for f in listdir(raw_data_path)
        if isfile(join(raw_data_path, f))
    ]
    random.shuffle(raw_data_files)

    def train_model(index):
        device = xm.xla_device()
        torch.manual_seed(0)

        if not os.path.exists(tokenized_data_path):
            os.mkdir(tokenized_data_path)
        if not args.pretrained_model:
            model = transformers.modeling_gpt2.GPT2LMHeadModel(
                config=model_config)
        else:
            model = transformers.modeling_gpt2.GPT2LMHeadModel(
                config=model_config)
            model.load_state_dict(torch.load(output_dir + 'final_model'))
        model.train()
        model.to(device)
        multi_gpu = False
        full_len = 0
        # print('calculating total steps')
        # for i in tqdm(range(num_pieces)):
        #     with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
        #         full_len += len([int(item) for item in f.read().strip().split()])
        # total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
        # print('total steps = {}'.format(total_steps))

        optimizer = transformers.AdamW(model.parameters(),
                                       lr=lr,
                                       correct_bias=True)
        # scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps)
        # if fp16:
        #     try:
        #         from apex import amp
        #     except ImportError:
        #         raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        #     model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

        # if torch.cuda.device_count() > 1:
        #     print("Let's use", torch.cuda.device_count(), "GPUs!")
        #     model = DataParallel(model)
        #     multi_gpu = True
        if xm.is_master_ordinal():
            print('starting training')

        doc_size = 10
        raw_data_batch_len = len(raw_data_files) // doc_size
        for epoch in range(epochs):
            if xm.is_master_ordinal():
                print('epoch {}'.format(epoch + 1))
                now = datetime.now()
                print('time: {}'.format(now))
            for batch_len in range(raw_data_batch_len):
                train_dataset = TextDataset(
                    raw_data_files[batch_len * doc_size:(batch_len + 1) *
                                   doc_size], tokenized_data_path,
                    full_tokenizer, n_ctx)

                train_sampler = torch.utils.data.distributed.DistributedSampler(
                    train_dataset,
                    num_replicas=xm.xrt_world_size(),
                    rank=xm.get_ordinal(),
                    shuffle=True)

                # Creates dataloaders, which load data in batches
                # Note: test loader is not shuffled or sampled
                train_loader = torch.utils.data.DataLoader(
                    train_dataset,
                    batch_size=batch_size,
                    sampler=train_sampler,
                    num_workers=8,
                    drop_last=True)

                # tokens = get_tokenization(raw_data_file, tokenized_data_path, full_tokenizer)
                # if tokens is None:
                #     continue
                # start_point = 0
                # samples = []
                # while start_point < len(tokens) - n_ctx:
                #     samples.append(tokens[start_point: start_point + n_ctx])
                #     start_point += stride
                # if start_point < len(tokens):
                #     samples.append(tokens[len(tokens) - n_ctx:])
                # random.shuffle(samples)
                para_train_loader = pl.ParallelLoader(
                    train_loader, [device]).per_device_loader(device)
                running_loss = 0
                for step, batch_inputs in enumerate(para_train_loader):

                    # for step in range(len(samples) // batch_size):

                    #  prepare data
                    # batch = samples[step * batch_size: (step + 1) * batch_size]
                    # batch_labels = []
                    # batch_inputs = []
                    # for ids in batch:
                    #     int_ids_for_labels = [int(x) for x in ids]
                    #     int_ids_for_inputs = [int(x) for x in ids]
                    #     batch_labels.append(int_ids_for_labels)
                    #     batch_inputs.append(int_ids_for_inputs)
                    # print(batch_inputs)
                    batch_inputs = batch_inputs.to(device)

                    # print(batch_labels.size(), batch_inputs.size())
                    #  forward pass
                    outputs = model.forward(input_ids=batch_inputs,
                                            labels=batch_inputs)
                    loss, logits = outputs[:2]

                    #  get loss
                    # if multi_gpu:
                    #     loss = loss.mean()
                    # if gradient_accumulation > 1:
                    #     loss = loss / gradient_accumulation

                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                    xm.optimizer_step(optimizer)

                    # if (step + 1) % gradient_accumulation == 0:
                    #     running_loss += loss.item()
                    # optimizer.step()
                    # xm.optimizer_step(optimizer)
                    # optimizer.zero_grad()
                    # scheduler.step()
                    if xm.is_master_ordinal():
                        running_loss += loss.item()
                        if (step + 1) % log_step == 0:
                            print(
                                'now time: {}:{}. Step {}/{} of piece {}/{} epoch {}, loss {}'
                                .format(datetime.now().hour,
                                        datetime.now().minute, (step + 1),
                                        len(para_train_loader), batch_len + 1,
                                        raw_data_batch_len, epoch + 1,
                                        running_loss / log_step))
                            running_loss = 0
                xm.save(model.state_dict(), output_dir + 'final_model')

                if xm.is_master_ordinal():
                    gc.collect()

    xmp.spawn(train_model, args=(), nprocs=8, start_method='fork')
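For orientation, the `xmp.spawn` pattern this example is built around, reduced to its smallest form (a sketch assuming `torch_xla` is installed and 8 TPU cores are available):

import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp

def _mp_fn(index):
    # Each spawned process receives its ordinal (index) and gets its own XLA device.
    device = xm.xla_device()
    if xm.is_master_ordinal():
        print('world size:', xm.xrt_world_size())

xmp.spawn(_mp_fn, args=(), nprocs=8, start_method='fork')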
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--device", default="0,1,2,3", type=str, required=False, help="设置使用哪些显卡"
    )
    parser.add_argument(
        "--model_config",
        default="config/model_config_small.json",
        type=str,
        required=False,
        help="选择模型参数",
    )
    parser.add_argument(
        "--tokenizer_path",
        default="cache/vocab_small.txt",
        type=str,
        required=False,
        help="选择词库",
    )
    parser.add_argument(
        "--raw_data_path",
        default="data/train.json",
        type=str,
        required=False,
        help="原始训练语料",
    )
    parser.add_argument(
        "--tokenized_data_path",
        default="data/tokenized/",
        type=str,
        required=False,
        help="tokenized语料存放位置",
    )
    parser.add_argument("--raw", action="store_true", help="是否先做tokenize")
    parser.add_argument("--epochs", default=5, type=int, required=False, help="训练循环")
    parser.add_argument(
        "--batch_size", default=8, type=int, required=False, help="训练batch size"
    )
    parser.add_argument("--lr", default=1.5e-4, type=float, required=False, help="学习率")
    parser.add_argument(
        "--warmup_steps", default=2000, type=int, required=False, help="warm up步数"
    )
    parser.add_argument(
        "--log_step",
        default=1,
        type=int,
        required=False,
        help="多少步汇报一次loss,设置为gradient accumulation的整数倍",
    )
    parser.add_argument(
        "--stride", default=768, type=int, required=False, help="训练时取训练数据的窗口步长"
    )
    parser.add_argument(
        "--gradient_accumulation", default=1, type=int, required=False, help="梯度积累"
    )
    parser.add_argument("--fp16", action="store_true", help="混合精度")
    parser.add_argument("--fp16_opt_level", default="O1", type=str, required=False)
    parser.add_argument("--max_grad_norm", default=1.0, type=float, required=False)
    parser.add_argument(
        "--num_pieces", default=100, type=int, required=False, help="将训练语料分成多少份"
    )
    parser.add_argument(
        "--min_length", default=128, type=int, required=False, help="最短收录文章长度"
    )
    parser.add_argument(
        "--output_dir", default="model/", type=str, required=False, help="模型输出路径"
    )
    parser.add_argument(
        "--pretrained_model", default="", type=str, required=False, help="模型训练起点路径"
    )
    parser.add_argument(
        "--writer_dir",
        default="tensorboard_summary/",
        type=str,
        required=False,
        help="Tensorboard路径",
    )
    parser.add_argument("--segment", action="store_true", help="中文以词为单位")
    parser.add_argument("--bpe_token", action="store_true", help="subword")
    parser.add_argument(
        "--encoder_json",
        default="tokenizations/encoder.json",
        type=str,
        help="encoder.json",
    )
    parser.add_argument(
        "--vocab_bpe", default="tokenizations/vocab.bpe", type=str, help="vocab.bpe"
    )

    args = parser.parse_args()
    print("args:\n" + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config
    )
    print("config:\n" + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("using device:", device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # don't enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print("building files")
        build_files(
            data_path=raw_data_path,
            tokenized_data_path=tokenized_data_path,
            num_pieces=num_pieces,
            full_tokenizer=full_tokenizer,
            min_length=min_length,
        )
        print("files built")

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model
        )
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print("number of parameters: {}".format(num_parameters))

    multi_gpu = False
    full_len = 0
    print("calculating total steps")
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + "tokenized_train_{}.txt".format(i), "r") as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print("total steps = {}".format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps
    )
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(",")])
        multi_gpu = True
    print("starting training")
    overall_step = 0
    running_loss = 0
    saving_time = datetime.now()
    for epoch in range(epochs):
        print("epoch {}".format(epoch + 1))
        now = datetime.now()
        print("time: {}".format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(
                tokenized_data_path + "tokenized_train_{}.txt".format(i), "r"
            ) as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point : start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx :])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last

                #  prepare data
                batch = samples[step * batch_size : (step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm
                        )
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                #  optimizer step
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar(
                        "loss", loss.item() * gradient_accumulation, overall_step
                    )
                    print(
                        "now time: {}:{}. Step {} of piece {} of epoch {}, loss {}".format(
                            datetime.now().hour,
                            datetime.now().minute,
                            step + 1,
                            piece_num,
                            epoch + 1,
                            running_loss
                            * gradient_accumulation
                            / (log_step / gradient_accumulation),
                        )
                    )
                    running_loss = 0
                delta_time = datetime.now() - saving_time
                if delta_time.seconds > 1800:
                    print("saving model for epoch {}".format(epoch + 1))
                    if not os.path.exists(
                        output_dir + "model_epoch{}".format(epoch + 1)
                    ):
                        os.mkdir(output_dir + "model_epoch{}".format(epoch + 1))
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(
                        output_dir + "model_epoch{}".format(epoch + 1)
                    )
                    saving_time = datetime.now()
                overall_step += 1
            piece_num += 1

        print("saving model for epoch {}".format(epoch + 1))
        if not os.path.exists(output_dir + "model_epoch{}".format(epoch + 1)):
            os.mkdir(output_dir + "model_epoch{}".format(epoch + 1))
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(output_dir + "model_epoch{}".format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print("epoch {} finished".format(epoch + 1))

        then = datetime.now()
        print("time: {}".format(then))
        print("time for one epoch: {}".format(then - now))

    print("training finished")
    if not os.path.exists(output_dir + "final_model"):
        os.mkdir(output_dir + "final_model")
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(output_dir + "final_model")
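The inner sampling loop above slices one long token stream into `n_ctx`-sized windows every `stride` tokens, plus a tail window so the end of the stream is kept. The same logic as a standalone sketch:

def window_samples(tokens, n_ctx, stride):
    # n_ctx-sized windows every `stride` tokens, then one tail window covering
    # the final n_ctx tokens (mirrors the while-loop in the example).
    samples = []
    start = 0
    while start < len(tokens) - n_ctx:
        samples.append(tokens[start:start + n_ctx])
        start += stride
    if start < len(tokens):
        samples.append(tokens[-n_ctx:])
    return samples

assert window_samples(list(range(10)), n_ctx=4, stride=3) == [
    [0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]]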
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--length',
                        default=-1,
                        type=int,
                        required=False,
                        help='generation length')
    parser.add_argument('--temperature',
                        default=1,
                        type=float,
                        required=False,
                        help='sampling temperature; higher is more random')
    parser.add_argument('--topk',
                        default=8,
                        type=int,
                        required=False,
                        help='top-k: sample from the k most likely tokens')
    parser.add_argument('--topp',
                        default=0,
                        type=float,
                        required=False,
                        help='top-p: nucleus cumulative probability')
    # parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
    #                     help='model config path')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='vocab file path')
    parser.add_argument('--model_path',
                        default='model/final_model',
                        type=str,
                        required=False,
                        help='model path')
    parser.add_argument('--save_path',
                        default='generated/',
                        type=str,
                        required=False,
                        help='directory for the generated files')
    parser.add_argument('--articles_per_title',
                        default=5,
                        type=int,
                        required=False,
                        help='how many articles to generate per title')
    parser.add_argument('--titles',
                        default='萧炎',
                        type=str,
                        required=False,
                        help='list of titles as one space-separated string')
    parser.add_argument('--titles_file',
                        default='',
                        type=str,
                        required=False,
                        help='file with one title per line; overrides --titles if set')
    parser.add_argument('--no_wordpiece',
                        action='store_true',
                        help='skip WordPiece tokenization')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at word level')
    parser.add_argument('--repetition_penalty',
                        default=1.0,
                        type=float,
                        required=False)

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses
    length = args.length
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty

    titles = args.titles.split()  # list of titles to generate from
    if args.titles_file:
        with open(args.titles_file, 'r') as f:
            titles = [line.strip('\n') for line in f.readlines()]
    articles_per_title = args.articles_per_title  # how many articles to generate per title
    save_path = args.save_path  # where to save the generated files

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    n_ctx = model.config.n_ctx

    if not os.path.exists(save_path):
        os.mkdir(save_path)
    if length == -1:
        length = model.config.n_ctx

    for i, title in enumerate(titles):
        for j in range(articles_per_title):
            with open(save_path + str(i) + '-' + str(j) + '.txt', 'w') as f:
                context_tokens = tokenizer.convert_tokens_to_ids(
                    tokenizer.tokenize(title))
                generated = 0
                out = sample_sequence(n_ctx=n_ctx,
                                      model=model,
                                      length=length,
                                      context=context_tokens,
                                      tokenizer=tokenizer,
                                      temperature=temperature,
                                      top_k=topk,
                                      top_p=topp,
                                      repitition_penalty=repetition_penalty,
                                      device=device)
                out = out.tolist()[0]

                generated += 1
                text = tokenizer.convert_ids_to_tokens(out)

                for k, item in enumerate(text[:-1]):  # ensure spaces around English words
                    if is_word(item) and is_word(text[k + 1]):
                        text[k] = item + ' '

                for k, item in enumerate(text):
                    if item == '[MASK]' or item == '[UNK]':
                        text[k] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[k] = '\n'

                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                # text = ''.join(text.split('\n')[:-1])
                print(text)
                f.write(text + '\n')
                print("=" * 80)
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='gpt2/config.json',
                        type=str,
                        required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='vocabulary file to use')
    parser.add_argument('--raw_data_path',
                        default='data/train.json',
                        type=str,
                        required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw data first')
    parser.add_argument('--epochs',
                        default=150,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=1,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=100,
                        type=int,
                        required=False,
                        help='number of warmup steps')
    # parser.add_argument('--log_step', default=2, type=int, required=False, help='report loss every this many steps; set to a multiple of gradient_accumulation')
    parser.add_argument('--stride',
                        default=384,
                        type=int,
                        required=False,
                        help='stride of the training-data window')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed precision')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--output_dir',
                        default='model_classfier/',
                        type=str,
                        required=False,
                        help='model output directory')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='pretrained model to start training from')
    # parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='TensorBoard directory')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at word level')
    parser.add_argument('--bpe_token', action='store_true', help='use BPE subword tokenizer')
    parser.add_argument('--encoder_json',
                        default="tokenizations/encoder.json",
                        type=str,
                        help="encoder.json")
    parser.add_argument('--vocab_bpe',
                        default="tokenizations/vocab.bpe",
                        type=str,
                        help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses

    model_config = transformers.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    # tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    # log_step = args.log_step
    # stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    # fp16 = args.fp16  # don't enable on GPUs without half-precision support
    # fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    # num_pieces = args.num_pieces
    # min_length = args.min_length
    output_dir = args.output_dir
    # tb_writer = SummaryWriter(log_dir=args.writer_dir)
    # assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        resources, resources_id, input_question_list, max_aq_len = build_files(
            data_path=raw_data_path, full_tokenizer=full_tokenizer)
        print('files built')
    # Note: the rest of this function assumes --raw was passed, since resources,
    # resources_id and input_question_list are only built above.
    input_ids = []
    # labels = []
    for i in range(len(resources_id)):
        inputsss, _ = sliding_window(max_len=512,
                                     resources=resources_id[i],
                                     stride=512 - 128)
        input_ids.append(inputsss)
        # labels = labels + [choices['label']] * len(inputsss)
    print('sliding built')
    if True:  # shuffle
        index = list(range(len(input_ids)))
        random.shuffle(index)
        input_ids = [input_ids[i] for i in index]
        input_question_list = [input_question_list[i] for i in index]  # keep questions aligned with inputs

    val_rate = 0.1
    split = int((1 - val_rate) * len(input_ids))
    val_input_ids = input_ids[split:]
    val_input_question_list = input_question_list[split:]

    input_ids = input_ids[:split]
    input_question_list = input_question_list[:split]

    # train_dataset = my_dataset(x=input_ids, y=labels, token_type_ids=token_type_ids)
    # train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=1)

    # if not args.pretrained_model:
    #     model = transformers.models.gpt2.GPT2LMHeadModel(config=model_config)
    # else:
    #     model = transformers.models.gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)

    model = modelMy(args, device)

    model.to(device)
    # old_parameter = model.f**k.weight.clone()
    # num_parameters = 0
    # parameters = model.parameters()
    # for parameter in parameters:
    #     num_parameters += parameter.numel()
    # print('number of parameters: {}'.format(num_parameters))
    # param_optimizer = [p for n, p in model.named_parameters() if p.requires_grad]
    multi_gpu = False
    print('calculating total steps')
    # for i in tqdm(range(num_pieces)):
    #     with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
    #         full_len += len([int(item) for item in f.read().strip().split()])

    optimizer = transformers.optimization.AdamW(model.parameters(),
                                                lr=lr,
                                                weight_decay=0.01,
                                                correct_bias=True)
    scheduler = transformers.optimization.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=1500,
        num_training_steps=args.epochs * len(input_ids))
    # scheduler = transformers.optimization.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps = args.)
    # from pytorch_pretrained_bert.optimization import BertAdam
    # optimizer = BertAdam(model.parameters(),
    #                      lr=0.1,
    #                      warmup=0.1,
    #                      t_total=100)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    print('starting training')
    overall_step = 0
    running_loss = 0
    best_loss = 9999999
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        # x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        acc_s = 0
        piece_num = 0

        model.train()
        for step in range(len(input_ids)):  # paper by paper
            # print ("step:{}".format(step))
            # if (overall_step + 2) % gradient_accumulation == 0:
            #     break
            batch_inputs = input_ids[step]
            batch_inputs = torch.tensor(batch_inputs).long().to(
                device).unsqueeze(0)
            batch_questions = [
                z['Question_token'] for z in input_question_list[step][:]
            ]
            batch_questions = torch.tensor(batch_questions).long().to(
                device).unsqueeze(0)
            batch_choices = [
                z['Choices_token'] for z in input_question_list[step][:]
            ]
            batch_choices = torch.tensor(batch_choices).long().to(
                device).unsqueeze(0)
            batch_labels = [z['Goal'] for z in input_question_list[step][:]]
            batch_labels = torch.tensor(batch_labels).long().to(
                device).unsqueeze(0)
            #  forward pass
            outputs = model.forward(inputs=batch_inputs,
                                    questions=batch_questions,
                                    choices=batch_choices,
                                    labels=batch_labels)
            loss, pred, acc = outputs
            acc_s += (acc.cpu())
            running_loss += loss.item()
            #  get loss
            if multi_gpu:
                loss = loss.mean()
            if gradient_accumulation > 1:
                loss = loss / gradient_accumulation

            #  loss backward
            loss.backward()
            if (overall_step + 1) % gradient_accumulation == 0:
                optimizer.step()
                optimizer.zero_grad()  # clear gradients after each update
                scheduler.step()

            overall_step += 1
        piece_num += 1
        running_loss = running_loss / len(input_ids)
        print('now time: {}:{}. epoch {}, loss {}, acc {:.6f}'.format(
            datetime.now().hour,
            datetime.now().minute, epoch + 1, running_loss * 1000,
            acc_s / len(input_ids)
            # acc_s / 16
            # acc_s / 800
        ))
        #---------------------------------
        running_loss = running_loss * gradient_accumulation / len(resources)
        if running_loss < best_loss:
            best_loss = running_loss
            print('saving model for epoch {}'.format(epoch + 1))
            if not os.path.exists(output_dir +
                                  'model_epoch{}'.format(epoch + 1)):
                os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir + 'loss.best', optimizer,
                                          epoch)
        running_loss = 0

        model.eval()
        val_accs = 0
        for stepp in range(len(val_input_ids)):
            batch_inputs = val_input_ids[stepp]
            batch_inputs = torch.tensor(batch_inputs).long().to(
                device).unsqueeze(0)
            batch_questions = [
                z['Question_token'] for z in val_input_question_list[stepp][:]
            ]
            batch_questions = torch.tensor(batch_questions).long().to(
                device).unsqueeze(0)
            batch_choices = [
                z['Choices_token'] for z in val_input_question_list[stepp][:]
            ]
            batch_choices = torch.tensor(batch_choices).long().to(
                device).unsqueeze(0)
            batch_labels = [
                z['Goal'] for z in val_input_question_list[stepp][:]
            ]
            batch_labels = torch.tensor(batch_labels).long().to(
                device).unsqueeze(0)
            #  forward pass
            outputs = model.forward(inputs=batch_inputs,
                                    questions=batch_questions,
                                    choices=batch_choices,
                                    labels=batch_labels,
                                    training=False)
            loss, pred, acc = outputs
            val_accs += (acc)
        print('validation acc {}'.format(val_accs / (len(val_input_ids))))
        # print('validation acc {}'.format(val_accs))
        print('epoch {} finished'.format(epoch + 1))
        #------------------
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final.best', optimizer, epoch)
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='device for generation')
    parser.add_argument('--length',
                        default=-1,
                        type=int,
                        required=False,
                        help='generation length')
    parser.add_argument('--batch_size',
                        default=1,
                        type=int,
                        required=False,
                        help='generation batch size')
    parser.add_argument('--nsamples',
                        default=10,
                        type=int,
                        required=False,
                        help='number of samples to generate')
    parser.add_argument('--temperature',
                        default=1,
                        type=float,
                        required=False,
                        help='sampling temperature')
    parser.add_argument('--topk',
                        default=8,
                        type=int,
                        required=False,
                        help='top-k: sample from the k most likely tokens')
    parser.add_argument('--topp',
                        default=0,
                        type=float,
                        required=False,
                        help='top-p cumulative probability')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json',
                        type=str,
                        required=False,
                        help='model config')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='vocab file path')
    parser.add_argument('--model_path',
                        default='model/final_model',
                        type=str,
                        required=False,
                        help='model path')
    parser.add_argument('--prefix',
                        default='萧炎',
                        type=str,
                        required=False,
                        help='prefix to start generation from')
    parser.add_argument('--no_wordpiece',
                        action='store_true',
                        help='skip WordPiece tokenization')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at word level')
    parser.add_argument('--fast_pattern',
                        action='store_true',
                        help='use the faster generation path')
    parser.add_argument('--save_samples', action='store_true', help='save generated samples')
    parser.add_argument('--save_samples_path',
                        default='.',
                        type=str,
                        required=False,
                        help="保存样本的路径")
    parser.add_argument('--repetition_penalty',
                        default=1.0,
                        type=float,
                        required=False)

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    n_ctx = model.config.n_ctx

    if length == -1:
        length = model.config.n_ctx
    if args.save_samples:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/samples.txt',
                            'w',
                            encoding='utf8')
    while True:
        raw_text = args.prefix
        context_tokens = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(nsamples // batch_size):
            out = generate(n_ctx=n_ctx,
                           model=model,
                           context=context_tokens,
                           length=length,
                           is_fast_pattern=args.fast_pattern,
                           tokenizer=tokenizer,
                           temperature=temperature,
                           top_k=topk,
                           top_p=topp,
                           repitition_penalty=repetition_penalty,
                           device=device)
            for i in range(batch_size):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out)
                for k, item in enumerate(text[:-1]):  # ensure spaces around English words
                    if is_word(item) and is_word(text[k + 1]):
                        text[k] = item + ' '
                for k, item in enumerate(text):
                    if item == '[MASK]':
                        text[k] = ''
                    elif item == '[CLS]':
                        text[k] = '\n\n'
                    elif item == '[SEP]':
                        text[k] = '\n'
                info = "=" * 40 + " SAMPLE " + str(
                    generated) + " " + "=" * 40 + "\n"
                print(info)
                text = ''.join(text).replace('##', '').replace('[UNK]',
                                                               ' ').strip()
                print(text)
                if args.save_samples:
                    samples_file.write(info)
                    samples_file.write(text)
                    samples_file.write('\n')
                    samples_file.write('=' * 90)
                    samples_file.write('\n' * 2)
        print("=" * 80)
        if generated == nsamples:
            # close file when finish writing.
            if args.save_samples:
                samples_file.close()
            break
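Note: the generate helper called above is defined elsewhere in the repo. At its core it samples one token at a time after filtering the next-token logits by top_k/top_p. A minimal sketch of that filtering step for a 1-D logits tensor (illustrative, not necessarily the repo's exact code):

import torch
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('inf')):
    # top-k: keep only the k highest-scoring tokens
    if top_k > 0:
        threshold = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < threshold] = filter_value
    # top-p (nucleus): keep the smallest set of tokens whose cumulative
    # probability exceeds top_p
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_mask = cumulative_probs > top_p
        sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
        sorted_mask[..., 0] = 0  # always keep the single most likely token
        logits[sorted_indices[sorted_mask]] = filter_value
    return logits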
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus first')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=64, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report loss every this many steps; set to a multiple of gradient accumulation')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the window over the training data')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation')
    parser.add_argument('--fp16', action='store_true', help='mixed precision')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='minimum article length to include')
    parser.add_argument('--max_length', default=256, type=int, required=False, help='maximum article length to include')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output path')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='pretrained model to start training from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='Tensorboard path')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at the word level')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")
    parser.add_argument('--max_steps_perEpoch_perPiece', default=1000000, type=int, required=False)
    parser.add_argument('--steps_savemodel', default=10000, type=int, required=False, help='save the model every this many steps')
    parser.add_argument('--padding', action='store_true', help='pad inputs to fixed length')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    #os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program may use

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    padding = args.padding
    max_length = args.max_length
    #tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))
    multi_gpu = False
    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    #scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
    #                                                      t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    print('starting training')
    step_loss = 0
    running_loss = 10
    loss_ = 10
    data_iter = iterData(args.tokenized_data_path, rate=1.0, batch_size=batch_size, epochs=epochs)  # avoid shadowing the built-in iter
    step = 0
    epoch0 = -1
    while True:
        data = next(data_iter)
        if data == '__STOP__':
            break
        epoch, epochs, idx_file, nb_files, batch_inputs = data
        random.shuffle(batch_inputs)
        batch_inputs = torch.tensor(batch_inputs).long().to(device)
        #  forward pass
        outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
        loss, logits = outputs[:2]
        #  get loss
        if multi_gpu:
            loss = loss.mean()
        if gradient_accumulation > 1:
            loss = loss / gradient_accumulation
        #  loss backward
        if fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        #  optimizer step
        if (step + 1) % gradient_accumulation == 0:
            running_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()
            step_loss += 1
            #scheduler.step()
        if (step + 1) % log_step == 0:
            loss_ = running_loss * gradient_accumulation / (log_step / gradient_accumulation)
            print('now time: {}:{}. step: {}, progress-innerEpoch: {}/{}, progress-outerEpoch: {}/{}, loss {}'.format(
                    datetime.now().hour,
                    datetime.now().minute,
                    step+1,
                    idx_file+1,
                    nb_files,
                    epoch + 1,
                    epochs,
                    loss_))
            running_loss = 0
        if step % args.steps_savemodel == 0:
            print('saving model for epoch {}'.format(epoch + 1))
            output_dir_ = output_dir + 'model_epoch{}_step{}_loss-{}'.format(epoch + 1, step, '%0.2f' % loss_)
            if not os.path.exists(output_dir_):
                os.mkdir(output_dir_)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir_)
        step += 1
        if epoch != epoch0:
            if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
                os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
            epoch0 = epoch
            print('epoch {} finished'.format(epoch + 1))
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
    print('training finished')
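Note: iterData is not shown in this example. From the call site above it must yield (epoch, epochs, idx_file, nb_files, batch_inputs) tuples followed by a final '__STOP__' sentinel. A minimal sketch of a generator with that contract, matching the variant Example #11 expects (an assumption, not the fork's actual implementation):

import os
import random

def iterData(path_data, rate=1.0, batch_size=8, epochs=1, n_ctx=1024):
    # rate is accepted for signature compatibility but ignored in this sketch
    files = sorted(os.listdir(path_data))
    for epoch in range(epochs):
        for idx_file, name in enumerate(files):
            with open(os.path.join(path_data, name)) as f:
                tokens = [int(t) for t in f.read().split()]
            # cut the token stream into fixed-length samples and batch them
            samples = [tokens[i:i + n_ctx]
                       for i in range(0, len(tokens) - n_ctx, n_ctx)]
            random.shuffle(samples)
            for i in range(0, len(samples) - batch_size + 1, batch_size):
                yield epoch, epochs, idx_file, len(files), samples[i:i + batch_size]
    yield '__STOP__'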
Example #12
File: train.py  Project: tinda/GPT2-Chinese
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json',
                        type=str,
                        required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='vocabulary file')
    parser.add_argument('--raw_data_path',
                        default='data/train.json',
                        type=str,
                        required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus first')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=2000,
                        type=int,
                        required=False,
                        help='number of warmup steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='report loss every this many steps; set to a multiple of gradient accumulation')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='stride of the window over the training data')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation')
    parser.add_argument('--fp16', action='store_true', help='mixed precision')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=100,
                        type=int,
                        required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length',
                        default=128,
                        type=int,
                        required=False,
                        help='minimum article length to include')
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='model output path')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='pretrained model to start training from')
    parser.add_argument('--writer_dir',
                        default='tensorboard_summary/',
                        type=str,
                        required=False,
                        help='Tensorboard path')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at the word level')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json',
                        default="tokenizations/encoder.json",
                        type=str,
                        help="encoder.json")
    parser.add_argument('--vocab_bpe',
                        default="tokenizations/vocab.bpe",
                        type=str,
                        help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program may use

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer,
                    min_length=min_length)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last

                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_inputs)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar('loss',
                                         loss.item() * gradient_accumulation,
                                         overall_step)
                    print(
                        'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(
                            datetime.now().hour,
                            datetime.now().minute, step + 1, piece_num,
                            epoch + 1, running_loss * gradient_accumulation /
                            (log_step / gradient_accumulation)))
                    running_loss = 0
                overall_step += 1
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
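Note: transformers.WarmupLinearSchedule used above comes from the old pytorch-transformers API; in current transformers releases the equivalent is get_linear_schedule_with_warmup (Example #17 below already uses it). The drop-in replacement, assuming a recent transformers version:

from transformers import get_linear_schedule_with_warmup

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,    # linear ramp-up from 0 to lr
    num_training_steps=total_steps)   # then linear decay back to 0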
Example #13
        s.update(ts)

    #import pdb;pdb.set_trace()
    _ = list(map(process, lines))
    keys = [word for word in s]
    with open(output_path, 'w') as fh:
        keys = [
            key + '\n'
            for key in ['[SEP]', '[PAD]', '[CLS]', '[MASK]', '[UNK]'] + keys
        ]
        fh.writelines(keys)


#generate_vocab('data/train.json', 'cache/vocab_wiki_small_new.txt')

full_tokenizer = tokenization_bert.BertTokenizer(
    vocab_file='cache/vocab_wiki_small.txt')


def tokenize_list(word_list):
    for word in word_list:
        print(full_tokenizer.convert_tokens_to_ids(word))


tokenize_list(['中国', '政府', '今天', '猫'])
#with open('/home/t-linan/projects/GPT2-Chinese/data/tokenized/tokenized_train_0.txt', 'r') as fh:
with open('data/tokenized/tokenized_train_0_False.txt', 'r') as fh:
    line = fh.readlines()[0]
numbers = line.strip().split()
print(len(numbers))
print(sum([int(number) == 4 for number in numbers]))
"""
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='4,5,6,7',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json',
                        type=str,
                        required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='vocabulary file')
    parser.add_argument('--raw_data_path',
                        default='data/train.json',
                        type=str,
                        required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus first')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=2000,
                        type=int,
                        required=False,
                        help='number of warmup steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='report loss every this many steps; set to a multiple of gradient accumulation')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='stride of the window over the training data')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation')
    parser.add_argument('--fp16', action='store_true', help='mixed precision')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=100,
                        type=int,
                        required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length',
                        default=128,
                        type=int,
                        required=False,
                        help='minimum article length to include')
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='model output path')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='pretrained model to start training from')
    parser.add_argument('--writer_dir',
                        default='tensorboard_summary/',
                        type=str,
                        required=False,
                        help='Tensorboard path')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at the word level')
    parser.add_argument('--do_train', action='store_true', help='do train')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json',
                        default="tokenizations/encoder.json",
                        type=str,
                        help="encoder.json")
    parser.add_argument('--vocab_bpe',
                        default="tokenizations/vocab.bpe",
                        type=str,
                        help="vocab.bpe")
    parser.add_argument('--max_steps_perEpoch_perPiece',
                        default=1000000,
                        type=int,
                        required=False)
    parser.add_argument('--steps_savemodel',
                        default=10000,
                        type=int,
                        required=False,
                        help='save the model every this many steps')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    #os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program may use

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer,
                    min_length=min_length)
        print('files built')
    if not args.do_train:
        return
    trainfiles = os.listdir(args.tokenized_data_path)
    num_pieces = len(trainfiles)
    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    '''
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))
    '''
    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    #scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
    #                                                      t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model, device_ids=[i for i in range(4)])
        multi_gpu = True
    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        data_iter = iterData(path_data=tokenized_data_path)  # avoid shadowing the built-in iter
        while True:
            data = next(data_iter)
            if data == '__STOP__':
                break
            piece_num, _, samples = data
            random.shuffle(samples)
            nb_steps = len(samples) // batch_size
            for step in range(nb_steps):  # drop last
                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_inputs)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    #scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar('loss',
                                         loss.item() * gradient_accumulation,
                                         overall_step)
                    print(
                        'now time: {}:{}. Step {} (total {}) of piece {} (total {})  of epoch {}, loss {}'
                        .format(
                            datetime.now().hour,
                            datetime.now().minute, step + 1, nb_steps,
                            piece_num, num_pieces, epoch + 1,
                            running_loss * gradient_accumulation /
                            (log_step / gradient_accumulation)))
                    running_loss = 0
                if overall_step % args.steps_savemodel == 0:
                    print('saving model for epoch {}'.format(epoch + 1))
                    if not os.path.exists(output_dir + 'model_epoch{}_step{}'.
                                          format(epoch + 1, overall_step)):
                        os.mkdir(output_dir + 'model_epoch{}_step{}'.format(
                            epoch + 1, overall_step))
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    model_to_save.save_pretrained(
                        output_dir +
                        'model_epoch{}_step{}'.format(epoch + 1, overall_step))
                overall_step += 1
            piece_num += 1
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))

        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
    print('training finished')
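Note: the save pattern in these training loops (unwrap DataParallel's .module, mkdir, save_pretrained) repeats several times and can be factored into one helper. A small sketch (the helper name is ours, not the repo's):

import os

def save_checkpoint(model, ckpt_dir):
    # DataParallel wraps the real model in .module; unwrap before saving
    model_to_save = model.module if hasattr(model, 'module') else model
    os.makedirs(ckpt_dir, exist_ok=True)
    model_to_save.save_pretrained(ckpt_dir)

# usage: save_checkpoint(model, output_dir + 'final_model')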
Example #15
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--path_generateConfig',
                        default='config.json',
                        type=str,
                        required=False,
                        help='generation config file')
    parser.add_argument('--path_texts',
                        default='texts.txt',
                        type=str,
                        required=False,
                        help='file with one prefix text per line')
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='device(s) to generate on')
    parser.add_argument('--length',
                        default=50,
                        type=int,
                        required=False,
                        help='generation length')
    parser.add_argument('--batch_size',
                        default=1,
                        type=int,
                        required=False,
                        help='generation batch size')
    parser.add_argument('--nsamples',
                        default=10,
                        type=int,
                        required=False,
                        help='number of samples to generate')
    parser.add_argument('--temperature',
                        default=1,
                        type=float,
                        required=False,
                        help='sampling temperature')
    parser.add_argument('--topk',
                        default=8,
                        type=int,
                        required=False,
                        help='top-k: sample from the k most likely tokens')
    parser.add_argument('--topp',
                        default=0,
                        type=float,
                        required=False,
                        help='top-p: nucleus cumulative probability')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json',
                        type=str,
                        required=False,
                        help='model config')
    parser.add_argument('--tokenizer_path',
                        default='data/vocab.txt',
                        type=str,
                        required=False,
                        help='vocabulary path')
    parser.add_argument('--model_path',
                        default='model/model-test/model_epoch2931/',
                        type=str,
                        required=False,
                        help='model path')
    parser.add_argument('--prefix',
                        default='萧炎',
                        type=str,
                        required=False,
                        help='prefix to start the generated text with')
    parser.add_argument('--no_wordpiece',
                        action='store_true',
                        help='do not use WordPiece tokenization')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at the word level')
    parser.add_argument('--fast_pattern',
                        default=True,
                        action='store_true',
                        help='use the faster generation method')
    parser.add_argument('--save_samples',
                        default=True,
                        action='store_true',
                        help='save the generated samples')
    parser.add_argument('--save_samples_path',
                        default='./test/',
                        type=str,
                        required=False,
                        help='path to save samples to')
    parser.add_argument('--repetition_penalty',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--use_gpu', default=False, help='whether to use the GPU')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    if os.path.exists(args.path_generateConfig):
        with open(args.path_generateConfig, 'r') as f:
            config = json.load(f)
        args.nsamples = config['nsamples']
        args.model_config = config['model_config']
        args.tokenizer_path = config['tokenizer_path']
        args.model_path = config['model_path']
        args.save_samples_path = config['save_samples_path']
    if os.path.exists(args.path_texts):
        with open(args.path_texts, 'r') as f:
            texts = f.read().strip().split('\n')
    else:
        texts = [args.prefix]
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program may use
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty
    if args.use_gpu:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    else:
        device = 'cpu'
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    params = list(model.parameters())
    k = 0
    for i in params:
        l = 1
        # print("structure of this layer: " + str(list(i.size())))
        for j in i.size():
            l *= j
        # print("parameter count of this layer: " + str(l))
        k = k + l
    print("total number of parameters: " + str(k))

    n_ctx = model.config.n_ctx

    if length == -1:
        length = model.config.n_ctx
    for prefix in texts:
        if args.save_samples:
            if not os.path.exists(args.save_samples_path):
                os.makedirs(args.save_samples_path)
            # open the samples file only when samples are actually being saved
            samples_file = open(
                os.path.join(args.save_samples_path,
                             'samples_' + prefix + '.txt'), 'w')
            print(
                os.path.join(args.save_samples_path,
                             'samples_' + prefix + '.txt'))
        while True:
            raw_text = prefix
            context_tokens = tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(raw_text))
            generated = 0
            for _ in range(nsamples // batch_size):
                out = generate(n_ctx=n_ctx,
                               model=model,
                               context=context_tokens,
                               length=length,
                               is_fast_pattern=args.fast_pattern,
                               tokenizer=tokenizer,
                               temperature=temperature,
                               top_k=topk,
                               top_p=topp,
                               repitition_penalty=repetition_penalty,
                               device=device)
                for _ in range(batch_size):
                    generated += 1
                    text = tokenizer.convert_ids_to_tokens(out)
                    for idx, item in enumerate(text[:-1]):  # make sure English words stay space-separated
                        if is_word(item) and is_word(text[idx + 1]):
                            text[idx] = item + ' '
                    for idx, item in enumerate(text):
                        if item == '[MASK]':
                            text[idx] = ''
                        elif item == '[CLS]':
                            text[idx] = '\n\n'
                        elif item == '[SEP]':
                            text[idx] = '\n'
                    info = "=" * 40 + " SAMPLE " + str(
                        generated) + " " + "=" * 40 + "\n"
                    print(info)
                    text = ''.join(text).replace('##', '').strip()
                    # print(text)
                    print(text.split('\n')[0])
                    if args.save_samples:
                        samples_file.write(info)
                        samples_file.write(text.split('\n')[0])
                        samples_file.write('\n')
                        samples_file.write('=' * 90)
                        samples_file.write('\n' * 2)
            print("=" * 80)
            if generated == nsamples:
                # close file when finish writing.
                if args.save_samples:
                    samples_file.close()
                break
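Note: Example #15 overrides its arguments from a JSON file when --path_generateConfig exists; the keys it reads are exactly the five accessed above. A minimal script that writes a matching config.json (the values shown are placeholders taken from the defaults):

import json

config = {
    'nsamples': 10,
    'model_config': 'config/model_config_small.json',
    'tokenizer_path': 'data/vocab.txt',
    'model_path': 'model/model-test/model_epoch2931/',
    'save_samples_path': './test/',
}
with open('config.json', 'w') as f:
    json.dump(config, f, indent=2)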
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--length',
                        default=-1,
                        type=int,
                        required=False,
                        help='generation length')
    parser.add_argument('--temperature',
                        default=1,
                        type=float,
                        required=False,
                        help='sampling temperature; higher is more random')
    parser.add_argument('--topk',
                        default=8,
                        type=int,
                        required=False,
                        help='top-k: sample from the k most likely tokens')
    parser.add_argument('--topp',
                        default=0,
                        type=float,
                        required=False,
                        help='top-p: nucleus cumulative probability')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json',
                        type=str,
                        required=False,
                        help='model config path')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='vocabulary path')
    parser.add_argument('--model_path',
                        default='model/final_model',
                        type=str,
                        required=False,
                        help='model path')
    parser.add_argument('--save_path',
                        default='generated/',
                        type=str,
                        required=False,
                        help='path to store the generated files')
    parser.add_argument('--articles_per_title',
                        default=5,
                        type=int,
                        required=False,
                        help='how many articles to generate per title')
    parser.add_argument('--titles',
                        default='蕭炎',
                        type=str,
                        required=False,
                        help='list of titles as a single space-separated string')
    parser.add_argument('--titles_file',
                        default='',
                        type=str,
                        required=False,
                        help='file with one title per line; if set, --titles is ignored')
    parser.add_argument('--no_wordpiece',
                        action='store_true',
                        help='do not use WordPiece tokenization')
    parser.add_argument('--segment', action='store_true', help='segment Chinese at the word level')
    parser.add_argument('--repetition_penalty',
                        default=1.0,
                        type=float,
                        required=False)

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program may use
    length = args.length
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty

    titles = args.titles.split()  # list of titles to generate articles for
    if args.titles_file:
        with open(args.titles_file, 'r') as f:
            titles = [line.strip('\n') for line in f.readlines()]
    articles_per_title = args.articles_per_title  # how many articles per title
    save_path = args.save_path  # where to save the output

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    n_ctx = model.config.n_ctx

    if not os.path.exists(save_path):
        os.mkdir(save_path)
    if length == -1:
        length = model.config.n_ctx

    for i, title in enumerate(titles):
        for j in range(articles_per_title):
            with open(save_path + str(i) + '-' + str(j) + '.txt', 'w') as f:
                context_tokens = tokenizer.convert_tokens_to_ids(
                    tokenizer.tokenize(title))
                generated = 0
                out = sample_sequence(n_ctx=n_ctx,
                                      model=model,
                                      length=length,
                                      context=context_tokens,
                                      tokenizer=tokenizer,
                                      temperature=temperature,
                                      top_k=topk,
                                      top_p=topp,
                                      repitition_penalty=repetition_penalty,
                                      device=device)
                out = out.tolist()[0]

                generated += 1
                text = tokenizer.convert_ids_to_tokens(out)

                # use idx here: the enclosing loop's i indexes the title and is
                # reused in the output filename, so it must not be clobbered
                for idx, item in enumerate(text[:-1]):  # make sure English words stay space-separated
                    if is_word(item) and is_word(text[idx + 1]):
                        text[idx] = item + ' '

                for idx, item in enumerate(text):
                    if item == '[MASK]':
                        text[idx] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[idx] = '\n'

                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                # text = ''.join(text.split('\n')[:-1])
                print(text)
                f.write(text + '\n')
                print("=" * 80)
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='GPUs to use, comma-separated')
    parser.add_argument('--model_config',
                        type=str,
                        required=False,
                        help='path to the model config file')
    parser.add_argument('--tokenizer_path',
                        type=str,
                        required=True,
                        help='path to the vocabulary file')
    parser.add_argument('--raw_data_path',
                        type=str,
                        required=True,
                        help='path to the training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw',
                        action='store_true',
                        help='tokenize the raw corpus first')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='batch size')
    parser.add_argument('--lr',
                        default=3e-5,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=0.1,
                        type=float,
                        required=False,
                        help='warmup steps as a fraction of total steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='loss logging interval; must be a multiple of gradient accumulation')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='stride of the window over the training corpus')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation')
    parser.add_argument('--fp16', action='store_true', help='use half-precision floats')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=100,
                        type=int,
                        required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length',
                        default=1,
                        type=int,
                        required=False,
                        help='minimum article length; shorter articles are dropped')
    parser.add_argument('--output_dir', type=str, required=True, help='model output path')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='pretrained model to start from')
    parser.add_argument('--writer_dir',
                        default='tensorboard_summary/',
                        type=str,
                        required=False,
                        help='Tensorboard output path')
    parser.add_argument('--segment', action='store_true', help='segment text at the word level')
    parser.add_argument('--bpe_token',
                        action='store_true',
                        help='use Byte Pair Encoding')
    parser.add_argument('--encoder_json',
                        default='tokenizations/encoder.json',
                        type=str,
                        help='encoder.json')
    parser.add_argument('--vocab_bpe',
                        default='tokenizations/vocab.bpe',
                        type=str,
                        help='vocab.bpe')
    parser.add_argument('--timezone',
                        default=8,
                        type=int,
                        help='timezone offset in hours, default GMT+8')
    parser.add_argument('--epoch_save',
                        default=1,
                        type=int,
                        help='save weights every this many epochs')

    args = parser.parse_args()
    print(f'Arguments: {args.__repr__()}')

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    # select which GPUs to use
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device

    model_config = transformers.GPT2Config.from_json_file(args.model_config)
    print(f'Config:\n{model_config.to_json_string()}')

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path,
            do_lower_case=False,
            do_basic_tokenize=False)
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Using Device: {device.upper()}')

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    # do not enable on GPUs without half-precision support
    fp16 = args.fp16
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tz = args.timezone
    strlen = lambda n: len(str(n))
    get_time = lambda: datetime.utcnow() + timedelta(hours=tz)
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    os.makedirs(output_dir, exist_ok=True)

    if raw:
        print('Building from Raw Data')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    tokenizer=full_tokenizer,
                    min_length=min_length)

    if not args.pretrained_model:
        model = transformers.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)

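    # Assumption: with exactly two GPUs, the transformer blocks are split
    # across devices (model parallelism); parallelize() places each listed
    # block index on the corresponding GPU.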
    if torch.cuda.device_count() == 2:
        device_map = {
            0: [0, 1, 2, 3, 4],
            1: [5, 6, 7, 8, 9, 10, 11],
        }
        model.parallelize(device_map)
        # model.parallelize()
        print('Model Parallelism!')

    model.train()
    if torch.cuda.device_count() < 2:
        model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print(f'Number of Parameters: {num_parameters}')

    multi_gpu = False
    full_len = 0
    print('Calculating Total Steps')
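    # Each piece file holds one flat stream of token ids; every `stride` tokens
    # yields one training window, so the optimizer step count is roughly
    # full_len / stride * epochs / batch_size / gradient_accumulation.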
    for i in tqdm(range(num_pieces)):
        _fpath = os.path.join(tokenized_data_path, f'tokenized_train_{i}.txt')
        with open(_fpath, 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    warmup_steps = int(total_steps * warmup_steps)
    print(f'Total Steps: {total_steps}')

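    # AdamW with a linear schedule: the learning rate ramps up over the warmup
    # steps, then decays linearly to zero over the remaining training steps.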
    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps)

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                'Please install apex from https://www.github.com/nvidia/apex to use fp16 training.'
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    # if torch.cuda.device_count() > 1:
    #     print(f'Using {torch.cuda.device_count()} GPUs')
    #     model = DataParallel(
    #         model, device_ids=[int(i) for i in args.device.split(',')])
    #     model.to(f'cuda:{model.device_ids[0]}')
    #     multi_gpu = True

    with TimeCost('Training'):
        print('Training Begin')
        overall_step = 0
        running_loss = 0

        for epoch in range(epochs):
            now = get_time()
            print(f'Epoch {epoch + 1} - Time: {now}')
            x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
            random.shuffle(x)
            piece_num = 0
            for i in x:
                _fpath = os.path.join(tokenized_data_path,
                                      f'tokenized_train_{i}.txt')
                with open(_fpath, 'r') as f:
                    line = f.read().strip()
                tokens = line.split()
                tokens = [int(token) for token in tokens]
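                # Slice the token stream into overlapping n_ctx-length windows,
                # stepping by `stride`; the final window is aligned to the end
                # of the stream so no trailing tokens are dropped.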
                start_point = 0
                samples = []
                while start_point < len(tokens) - n_ctx:
                    samples.append(tokens[start_point:start_point + n_ctx])
                    start_point += stride
                if start_point < len(tokens):
                    idx = len(tokens) - n_ctx
                    samples.append(tokens[idx:])
                print(f'Piece {i} Sample Size: {len(samples)}')
                random.shuffle(samples)
                # Drop the final step if it cannot fill a complete batch
                _steps = len(samples) // batch_size
                # If the number of samples is smaller than the batch size there
                # would be no steps to train on at all; keeping num_pieces from
                # getting too large also avoids this problem
                _steps = max(_steps, 1)

                for step in range(_steps):
                    # prepare data
                    batch = samples[step * batch_size:(step + 1) * batch_size]
                    batch_inputs = []
                    for ids in batch:
                        int_ids = [int(x) for x in ids]
                        batch_inputs.append(int_ids)
                    _device = ('cuda:0' if torch.cuda.device_count() > 1
                               else device)
                    batch_inputs = torch.tensor(batch_inputs).long().to(_device)

                    # forward pass
                    outputs = model.forward(input_ids=batch_inputs,
                                            labels=batch_inputs)
                    loss, _ = outputs[:2]

                    # get loss
                    if multi_gpu:
                        loss = loss.mean()
                    if gradient_accumulation > 1:
                        loss = loss / gradient_accumulation

                    # loss backward
                    if fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                            torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer), max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       max_grad_norm)

                    # optimizer step
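                    # Gradients from `gradient_accumulation` micro-batches are
                    # summed before each weight update, emulating a larger
                    # effective batch size.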
                    if (overall_step + 1) % gradient_accumulation == 0:
                        running_loss += loss.item()
                        optimizer.step()
                        optimizer.zero_grad()
                        scheduler.step()
                    if (overall_step + 1) % log_step == 0:
                        tb_writer.add_scalar(
                            'loss',
                            loss.item() * gradient_accumulation, overall_step)
                        ts = get_time().strftime('%H:%M:%S')
                        display_loss = running_loss * gradient_accumulation
                        display_loss /= log_step / gradient_accumulation
                        print(
                            f'Time {ts} - '
                            f'Epoch {epoch + 1:{strlen(epochs)}d}/{epochs} - '
                            f'Step {step + 1:{strlen(_steps)}d}/{_steps} - '
                            f'Piece {piece_num + 1:{strlen(num_pieces)}d}/{num_pieces} - '
                            f'Loss {display_loss:.4f}')
                        running_loss = 0
                    overall_step += 1
                piece_num += 1

            if (epoch + 1) % args.epoch_save == 0:
                print(f'Saving Model of Epoch {epoch + 1}')
                model_output_dir = os.path.join(output_dir,
                                                f'model_epoch{epoch + 1}')
                os.makedirs(model_output_dir, exist_ok=True)
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                model_to_save.save_pretrained(model_output_dir)

            then = get_time()
            print(f'Epoch {epoch + 1} Finished - Time: {then}')
            delta = (then - now).total_seconds()
            mm, ss = delta // 60, delta % 60
            hh, mm = mm // 60, mm % 60
            print(
                f'Time Cost of the Epoch {epoch + 1} - {hh:.0f}:{mm:.0f}:{ss:.2f}'
            )

        print('Training Done')
    model_output_dir = os.path.join(output_dir, 'final_model')
    os.makedirs(model_output_dir, exist_ok=True)
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(model_output_dir)
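
    # A minimal sketch (not part of the original script): the weights saved to
    # `final_model` can be reloaded with the same `transformers` API used above.
    # reloaded = transformers.GPT2LMHeadModel.from_pretrained(model_output_dir)
    # reloaded.to(device)
    # reloaded.eval()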
Example #18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json',
                        type=str,
                        required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='vocabulary file')
    parser.add_argument('--raw_data_path',
                        default='data/train.json',
                        type=str,
                        required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw data first')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=2000,
                        type=int,
                        required=False,
                        help='number of warm-up steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='report the loss every this many steps')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='window stride over the training data')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed precision')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=100,
                        type=int,
                        required=False,
                        help='number of pieces to split the corpus into')
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='model output path')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='path to the model to start from')
    parser.add_argument('--segment', action='store_true', help='tokenize Chinese at the word level')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡
    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir

    if raw:
        print('building files')
        build_files(raw_data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    num_pieces=num_pieces)
        print('files built')

    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
            config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(),
                                           lr=lr,
                                           correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):

                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
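                # Labels are an identical copy of the inputs: GPT2LMHeadModel
                # shifts them internally, so this computes the standard
                # next-token cross-entropy loss.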
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_labels)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (step + 1) % log_step == 0:
                    print(
                        'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(
                            datetime.now().hour,
                            datetime.now().minute,
                            (step + 1) // gradient_accumulation, piece_num,
                            epoch + 1,
                            running_loss * gradient_accumulation / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
Example #19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='gpt2/config.json', type=str, required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw data first')
    parser.add_argument('--epochs', default=100, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=10000, type=int, required=False, help='number of warm-up steps')
    # restored: args.log_step is read below, so this argument must not stay commented out
    parser.add_argument('--log_step', default=2, type=int, required=False,
                        help='report the loss every this many steps; set to a multiple of gradient accumulation')
    parser.add_argument('--stride', default=768, type=int, required=False, help='window stride over the training data')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed precision')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the corpus into')
    parser.add_argument('--min_length', default=1, type=int, required=False, help='minimum article length to include')
    parser.add_argument('--output_dir', default='model_classfier/', type=str, required=False, help='model output path')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path to the model to start from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='TensorBoard path')
    parser.add_argument('--segment', action='store_true', help='tokenize Chinese at the word level')
    parser.add_argument('--bpe_token', action='store_true', help='use subword BPE tokenization')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡

    model_config = transformers.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    log_step = args.log_step
    gradient_accumulation = args.gradient_accumulation
    output_dir = args.output_dir
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    print('building files')
    resources, resources_id, input_question_list, max_aq_len = build_files(
        data_path=raw_data_path, full_tokenizer=full_tokenizer)
    print('files built')
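    # Split each tokenized resource into overlapping windows; sliding_window is
    # assumed to be defined elsewhere in this file (512-token windows with a
    # stride of 384 here).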
    input_ids = []
    # labels = []
    for i in range(len(resources_id)):
        windows, _ = sliding_window(max_len=512, resources=resources_id[i], stride=384)
        input_ids.append(windows)
        # labels = labels + [choices['label']] * len(windows)
    print('sliding windows built')

    val_input_ids = input_ids
    val_input_question_list = input_question_list

    # if False:  # shuffle
    #     index = [i for i in range(len(token_type_ids))]
    #     random.shuffle(index)
    #     new_input_ids = [input_ids[i] for i in index]
    #     new_labels = [labels[i] for i in index]
    #     new_token_type_ids = [token_type_ids[i] for i in index]
    #     input_ids = new_input_ids
    #     labels = new_labels
    #     token_type_ids = new_token_type_ids
    # train_dataset = my_dataset(x=input_ids, y=labels, token_type_ids=token_type_ids)
    # train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=1)

    # if not args.pretrained_model:
    #     model = transformers.models.gpt2.GPT2LMHeadModel(config=model_config)
    # else:
    #     model = transformers.models.gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)

    model = modelMy(args, device)
    model.load_pretrained(output_dir + 'loss.best')
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    print('running inference')
    model.eval()
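    # Inference loop: score each resource's answer choices one at a time;
    # modelMy (defined elsewhere) is assumed to return one predicted choice
    # index per question.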
    pred_list = []
    for idx in range(len(val_input_ids)):
        batch_inputs = torch.tensor(val_input_ids[idx]).long().to(device).unsqueeze(0)
        batch_questions = [z['Question_token'] for z in val_input_question_list[idx]]
        batch_questions = torch.tensor(batch_questions).long().to(device).unsqueeze(0)
        batch_choices = [z['Choices_token'] for z in val_input_question_list[idx]]
        batch_choices = torch.tensor(batch_choices).long().to(device).unsqueeze(0)
        #  forward pass
        outputs = model.forward(inputs=batch_inputs, questions=batch_questions,
                                choices=batch_choices, labels=None, training=False)
        pred_list = pred_list + outputs.squeeze(0).tolist()

    # Persist the predictions to disk
    with open('data/answer.data', 'wb') as f:
        pickle.dump(pred_list, f)

    import csv
    headers = ['id', 'label']
    rows = []
    n_id = 101
    for choice in pred_list:
        # Map the predicted class index (0-3) to its answer letter
        charr = 'ABCD'[choice]
        rows.append([n_id, charr])
        n_id += 1
    with open('submit.csv', 'w', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(rows)