예제 #1
0
def eval_one_epoch(dataloader, model, eval_loss, eval_steps,
                   data_process_func):
    """Run one evaluation pass over ``dataloader`` without gradients.

    Accumulates loss into the running ``eval_loss``/``eval_steps`` totals
    and logs a running perplexity (exp of the running mean loss) per batch.

    Returns:
        (per-batch losses, per-batch running perplexities,
         updated eval_loss, updated eval_steps).
    """
    cuda_logger, eval_logger = loggers.cuda_logger, loggers.validation_logger
    loss_history = []
    perplexity_history = []
    for step, raw in enumerate(dataloader):
        started = time.time()
        batch = data_process_func(raw)
        shapes = {k: v.shape for k, v in batch.items()}
        log_info(
            cuda_logger,
            'Allocated batches {}, {}'.format(cuda_mem_in_mb(), shapes))
        # No gradients needed for evaluation.
        with torch.no_grad():
            batch_loss = get_model_output(model, batch)[0].mean().item()
        eval_loss += batch_loss
        eval_steps += 1
        # Perplexity of the running mean loss so far.
        running_perplexity = torch.exp(
            torch.tensor(eval_loss / eval_steps)).item()
        perplexity_history.append(running_perplexity)
        loss_history.append(batch_loss)
        log_info(
            eval_logger, '{} Iter Loss {} Perplexity {} Time {}'.format(
                step, batch_loss, running_perplexity,
                time.time() - started))
    return loss_history, perplexity_history, eval_loss, eval_steps
예제 #2
0
def gpt2_eval(gpt2, model, dataset, batch_size=32, data_func=lambda x: x):
    """Score ``dataset`` under both ``model`` and a reference ``gpt2``.

    Builds an unshuffled DataLoader and delegates to gpt2_eval_one_epoch,
    packaging its four score arrays into a dict.
    """
    sample_logger = loggers.sample_logger
    loader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        collate_fn=lambda x: x)
    prod, avg, g2_prod, g2_avg = gpt2_eval_one_epoch(loader, gpt2, model,
                                                     data_func)
    result = {
        're_prod': prod,
        're_avg': avg,
        'gpt2_prod': g2_prod,
        'gpt2_avg': g2_avg
    }
    log_info(sample_logger, 'Total ratio {}'.format(result))
    return result
예제 #3
0
def gpt2_eval_one_epoch(dataloader, gpt2, model, data_func):
    """Score every item under both ``model`` and the reference ``gpt2``.

    For each item, the per-token probabilities (``probs[i][1]``) are reduced
    to two scalars — an average-based score and a product/log-based score —
    prefixed with the item's ``idx`` values, and collected into four lists.

    Returns:
        Tuple of np.ndarray: (ratio_prod, ratio_avg, gpt2_prod, gpt2_avg);
        each row is the idx values followed by the score.
    """
    sample_logger = loggers.sample_logger
    ratio_prod, ratio_avg = [], []
    gpt2_prod, gpt2_avg = [], []
    for step, raw in enumerate(dataloader):
        # step_time = time.time()
        # print(raw)
        data = data_func(raw)
        # print(data)
        # Per-token probabilities from the model under evaluation.
        probs = get_seq_prob(model, data, data_func=process_re_data)
        # print(probs)
        for i in range(len(probs)):
            # ep = np.concatenate((prob[0], prob[1]))
            idx = data['idx'][i]
            ep = probs[i][1]
            # log of the mean token probability
            prob_avg = np.log(np.mean(ep)).item()
            # NOTE(review): this is the MEAN of log-probs while the gpt2
            # loop below uses the SUM (a true log-product); also .item() is
            # applied here but not below — confirm the asymmetry is intended.
            prob_prod = np.mean(np.log(ep)).item()
            # print(prob_avg, prob_prod, type(prob_avg), np.array(prob_avg), np.array(idx), idx, type(idx))
            ratio_avg.append(np.append(np.array(idx), prob_avg))
            ratio_prod.append(np.append(np.array(idx), prob_prod))
        # print(probs)
        # Same scoring pass, now with the reference gpt2 model.
        probs = get_seq_prob(gpt2, data, data_func=process_re_data)
        # print(probs)

        for i in range(len(probs)):
            # ep = np.concatenate((prob[0], prob[1]))
            idx = data['idx'][i]
            ep = probs[i][1]
            prob_avg = np.log(np.mean(ep))
            # Sum of logs == log of the product of token probabilities.
            prob_prod = np.sum(np.log(ep))
            gpt2_avg.append(np.append(np.array(idx), prob_avg))
            gpt2_prod.append(np.append(np.array(idx), prob_prod))
        # dl = number of items scored in this batch; used to log only the
        # scores appended during this iteration.
        dl = len(probs)
        log_info(
            sample_logger,
            'RE Sample {} ratio prod {}, {}, ratio mean {}, {}'.format(
                dl, [x[-1] for x in ratio_prod[-dl:]],
                [x[-1]
                 for x in gpt2_prod[-dl:]], [x[-1] for x in ratio_avg[-dl:]],
                [x[-1] for x in gpt2_avg[-dl:]]))
    return np.array(ratio_prod), np.array(ratio_avg), np.array(
        gpt2_prod), np.array(gpt2_avg)
예제 #4
0
def gpt2_model_eval(config, index):
    """Run the gpt2-comparison evaluation for one loader and save results.

    Moves the model and the reference gpt2 to the main device, runs
    gpt2_eval, and, when a save path is configured, writes the ratio
    arrays to ``<save dir>/log/<index>/gpt2_ratios.pt``.

    Args:
        config: dict keyed by ConfigEnums members.
        index: loader index, used to namespace the log directory.

    Returns:
        (the model from config, -1) — the -1 is a placeholder loss.
    """
    from global_constants import ConfigEnums, main_device
    ce = ConfigEnums
    save_path = config[ce.save_path]
    config[ce.model] = config[ce.model].to(main_device)
    config[ce.gpt2] = config[ce.gpt2].to(main_device)
    final_logger = loggers.final_logger
    eval_params = get_params(config, gpt2_eval)
    ratios = gpt2_eval(**eval_params)
    if save_path is not None:
        log_path = list(os.path.split(save_path)[:-1])
        log_path.append('log')
        log_path.append(str(index) + '/')
        log_path = '/'.join(log_path)
        # Fix: os.mkdir fails when the intermediate 'log' directory does
        # not exist yet; makedirs creates the whole chain and exist_ok
        # makes reruns idempotent.
        os.makedirs(log_path, exist_ok=True)
        log_info(final_logger, 'saving ratios')
        torch.save(ratios, log_path + 'gpt2_ratios.pt')
        log_info(final_logger, 'All saved')
    return config[ce.model], -1
예제 #5
0
def evaluate(model, dataset, batch_size, epochs, data_func=lambda x: x):
    """Evaluate ``model`` for ``epochs`` passes over ``dataset``.

    Loss and step counts accumulate across epochs via eval_one_epoch, so
    the final perplexity reflects the overall mean loss.

    Returns:
        (final perplexity tensor, per-batch perplexities, per-batch losses).
    """
    validation_logger = loggers.validation_logger
    eval_loss = 0
    eval_steps = 0
    all_losses = []
    all_perplexities = []
    model.eval()
    for e in range(epochs):
        loader = DataLoader(dataset,
                            shuffle=True,
                            batch_size=batch_size,
                            collate_fn=lambda x: x)
        iters_per_epoch = len(loader)
        epoch_losses, epoch_perp, eval_loss, eval_steps = eval_one_epoch(
            loader, model, eval_loss, eval_steps, data_func)
        all_losses.extend(epoch_losses)
        all_perplexities.extend(epoch_perp)
        # Slice out this epoch's losses for the per-epoch summary.
        recent = all_losses[e * iters_per_epoch:]
        log_info(validation_logger,
                 '----------------------------------------------------')
        log_info(
            validation_logger,
            'Epoch {}, Mean Loss {}, Min Loss {}, Accum Loss {}'.format(
                e, np.mean(recent), np.min(recent), eval_loss / eval_steps))
    eval_loss /= eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))
    log_info(validation_logger, 'Final perplexity {}'.format(perplexity))
    return perplexity, torch.tensor(all_perplexities), torch.tensor(
        all_losses)
예제 #6
0
def train_one_epoch(dataloader, model, optimizer, scheduler, data_process_func,
                    tok):
    """Run one training epoch: forward, backward, clip, step, log.

    Batches for which ``data_process_func`` returns None are skipped.
    ``tok`` is accepted for interface compatibility but unused here.

    Returns:
        (list of per-iteration loss values, last loss tensor or None).
    """
    cuda_logger, train_logger = loggers.cuda_logger, loggers.train_logger
    batch_losses = []
    last_loss = None
    for step, raw in enumerate(dataloader):
        iter_start = time.time()
        batch = data_process_func(raw)
        if batch is None:
            log_info(cuda_logger, 'Empty data {} Iter'.format(step))
            continue
        shapes = {k: v.shape for k, v in batch.items()}
        log_info(
            cuda_logger,
            'Allocated batches {}, {}'.format(cuda_mem_in_mb(), shapes))
        last_loss = get_model_output(model, batch)[0].mean()
        loss_value = last_loss.item()
        last_loss.backward()
        # Clip gradients before stepping to keep training stable.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        batch_losses.append(loss_value)
        log_info(
            train_logger,
            '{} Iter Loss {} Time {}'.format(step, loss_value,
                                             time.time() - iter_start))

    return batch_losses, last_loss
예제 #7
0
def main(config_file='model_config.json'):
    """Load a JSON config and run start_func once per configured model.

    List-valued config fields define a sweep: each must have the same
    length, and the i-th run receives the i-th element of every list
    (scalar fields are shared across runs).

    Args:
        config_file: path to the JSON config; a missing file yields an
            empty config and a single run.

    Raises:
        ValueError: if list-valued config fields have mismatched lengths.
    """
    import libs
    os.chdir('/'.join(os.path.abspath(__file__).split('/')[:-1]))
    libs.log_info(libs.loggers.prepare_logger,
                  'Using config {}'.format(config_file))
    # Bug fix: the original opened the file unconditionally, so the
    # exists/isfile fallback to {} was dead code (open() raised
    # FileNotFoundError first). Check before opening.
    if os.path.exists(config_file) and os.path.isfile(config_file):
        with open(config_file, 'r') as f:
            config = json.load(f)
    else:
        config = {}
    models = None
    for k, v in config.items():
        if isinstance(v, list):
            if models is None:
                models = len(v)
            elif models != len(v):
                raise ValueError('Config field {} has wrong length'.format(k))
    models = models if models is not None else 1
    for i in range(models):
        new_config = {
            k: (v[i] if isinstance(v, list) else v)
            for k, v in config.items()
        }
        start_func(new_config)
예제 #8
0
def eval_prob_one_epoch(dataloader,
                        gpt2,
                        model,
                        length,
                        num_samples,
                        data_process_func,
                        tokenizer=None):
    """Sample sentences per entity pair and score the survivors with gpt2.

    For every (e1, e2) pair in the batches: draw up to ``num_samples``
    sequences from ``model`` (in chunks of at most 32), keep only samples
    whose decoded text contains both decoded entities, then compute their
    log-probabilities and losses under ``gpt2``.

    Returns:
        pd.DataFrame with columns
        ['e1', 'e2', 'sent', 'log_prod_prob', 'loss', 'sample_sent'].
    """
    result = pd.DataFrame(
        columns=['e1', 'e2', 'sent', 'log_prod_prob', 'loss', 'sample_sent'])
    sample_logger = loggers.sample_logger
    # Sampling is chunked to at most max_sample sequences at a time;
    # `saps` holds chunk sizes summing to num_samples.
    max_sample = 32
    divs = num_samples // max_sample
    saps = [max_sample] * divs
    if sum(saps) < num_samples:
        saps.append(num_samples - divs * max_sample)
    for step, raw in enumerate(dataloader):
        data = data_process_func(raw)
        if data is None:
            continue
        for i in range(len(data['e1'])):
            step_time = time.time()
            e1, e2 = data['e1'][i], data['e2'][i]
            # Decoded entity strings, used below to filter generated samples.
            e1l, e2l = tokenizer.decode(e1.tolist()), tokenizer.decode(
                e2.tolist())
            sents = []
            sent = []
            gen_time = time.time()
            print('sampling {}, {}'.format(e1l, e2l))
            for ns in saps:
                # print(length, ns)
                sent_temp = sample_sequence_entity(model,
                                                   length,
                                                   e1,
                                                   e2,
                                                   num_samples=ns,
                                                   top_k=5)
                if sent_temp is None:
                    continue
                sent_temp = sent_temp.cpu()
                sent.append(sent_temp)
            print('gen_time: {}'.format(time.time() - gen_time))
            # print(sent)
            eval_time = time.time()
            # Keep only samples whose decoded text mentions both entities.
            for s in sent:
                for l in range(s.shape[0]):
                    sl = tokenizer.decode(s[l].tolist())
                    if e1l in sl and e2l in sl:
                        sents.append(s[l])
            # NOTE: `sl` is reused here as the surviving-sample count.
            sl = len(sents)
            idx = data['idx'][i]
            res_data = {
                'e1': [idx[0]] * sl,
                'e2': [idx[1]] * sl,
                'sent': sents,
                'log_prod_prob': [],
                'loss': [],
                'sample_sent': [idx[2]] * sl
            }
            if sl > 0:
                # Score surviving samples with gpt2, again in chunks.
                divs = sl // max_sample
                paps = [max_sample] * divs
                if sum(paps) < sl:
                    paps.append(sl - divs * max_sample)
                for j, pap in enumerate(paps):
                    temp_data = {
                        'e1': [e1] * pap,
                        'e2': [e2] * pap,
                        'sent': sents[j * max_sample:j * max_sample + pap],
                        'idx': [idx] * pap
                    }
                    probs = get_seq_prob(gpt2,
                                         temp_data,
                                         data_func=process_re_data)
                    res_data['log_prod_prob'].extend(get_column(probs, 1))
                    res_data['loss'].extend(get_column(probs, 2))

            result = pd.concat([result, pd.DataFrame(res_data)])
            print('eval_time: {}'.format(time.time() - eval_time))
            log_info(
                sample_logger, 'Sampled {} sents for e1 {}, e2 {}'.format(
                    len(sents), tokenizer.decode(e1.tolist()),
                    tokenizer.decode(e2.tolist())))
            print('tot time: {}, avg: {}'.format(time.time() - step_time,
                                                 (time.time() - step_time) /
                                                 num_samples))
    return result
예제 #9
0
def start_func(config):
    """Drive one configuration: load model/tokenizer, build datasets, and
    invoke the configured mode's function once per data loader.

    The string-keyed config is narrowed to ConfigEnums members; the mode
    (TrainModesEnums) decides which fields are required and which function
    runs; the trained model and its loss are threaded from one loader
    iteration into the next.
    """
    from global_constants import data_process_func
    from global_constants import ModelEnums, DatasetEnums, TrainModesEnums, ConfigEnums
    me, de, tme, ce = ModelEnums, DatasetEnums, TrainModesEnums, ConfigEnums
    # Keep only recognized keys, remapped from names to enum members.
    config = {ce[k]: v for k, v in config.items() if k in ce.__members__}
    # print(config)
    mode = tme[get_config(config, ce.mode)]
    fields = mode.value.fields
    con = {k: get_config(config, k) for k in fields}
    # print(con)
    model_type = me[con[ce.model]]
    load_path = get_config(con, ce.load_path)
    save_path = get_config(con, ce.save_path)

    if save_path is not None:
        if save_path[-1] != '/':
            save_path += '/'
        log_path = list(os.path.split(save_path)[:-1])
        log_path.append('log/')
        log_path = '/'.join(log_path)
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        initial_loggers(log_path)

    prepare_logger, cuda_logger, final_logger = loggers.prepare_logger, loggers.cuda_logger, loggers.final_logger
    json_encoder = json.JSONEncoder(ensure_ascii=False, indent=2)
    log_info(
        prepare_logger, 'config loaded:\n' +
        json_encoder.encode({k.name: v
                             for k, v in con.items()}))

    log_info(prepare_logger, 'loading models: ' + load_path)

    tok = tfm.GPT2Tokenizer.from_pretrained(load_path)
    log_info(prepare_logger, 'model loaded')
    log_info(cuda_logger,
             'avaliable cudas {}'.format(torch.cuda.device_count()))
    # log_info(prepare_logger, 'start training:\n\tepochs: {}\n\tbatch_len: {}\n\tbatch_size: {}'.format(
    #     con[ce.epochs], con[ce.batch_len], con[ce.batch_size]))

    # gpu = GPUtil.getGPUs()[0]
    # log_info(cuda_logger, 'GPU Free {} Used {} Total {}'.format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryTotal))
    log_info(cuda_logger, 'Start cuda memory {}'.format(cuda_mem_in_mb()))
    log_info(cuda_logger, 'Allocated model {}'.format(cuda_mem_in_mb()))
    model = model_type.value.from_pretrained(load_path)

    dataset_type = de[con[ce.dataset_type]]
    dataset_class = dataset_type.value.class_type
    # Select the data-processing function for this mode/model/dataset combo.
    con[ce.data_func] = data_process_func[mode][model_type] \
        [dataset_type](max_len=con[ce.max_len], batch_size=con[ce.batch_size] if ce.batch_size in con else 1)
    con[ce.dataset_type] = dataset_class
    con[ce.tokenizer] = tok
    con[ce.model] = model
    if ce.gpt2 in con:
        con[ce.gpt2] = tfm.GPT2LMHeadModel.from_pretrained(con[ce.gpt2])
    method = mode.value.func

    con[ce.idx_file] = open(con[ce.idx_path], 'r')
    if ce.ent_file in dataset_type.value.fields:
        con[ce.ent_file] = open(con[ce.ent_path], 'r')
    if ce.sent_file in dataset_type.value.fields:
        con[ce.sent_file] = open(con[ce.sent_path], 'r')

    dataset_parameters = {k.name: con[k] for k in dataset_type.value.fields}
    ids = con[ce.ids]
    if ids == '':
        ids = None

    if ids is not None:
        with open(ids, 'r') as f:
            ids = json.load(f)
        # Split ids evenly across the configured number of loaders.
        ids = np.array_split(ids, con[ce.loaders])
        ids = [x.tolist() for x in ids]
    loaders = []
    for i in range(con[ce.loaders]):
        # NOTE(review): if ids is still None here, ids[i] raises —
        # presumably ids is always configured when loaders > 0; verify.
        # NOTE(review): other dataset_parameters keys are k.name strings;
        # using the enum member ce.ids as a key looks inconsistent unless
        # ConfigEnums is string-valued — confirm against its definition.
        dataset_parameters[ce.ids] = ids[i]
        loaders.append(dataset_type(**dataset_parameters))

    first_len = loaders[0].get_loaded_length()[0]
    all_len = sum([x.get_loaded_length()[0] for x in loaders])
    # The evaluation set takes ids immediately after the training range.
    dataset_parameters[ce.ids] = list(
        range(all_len, all_len + con[ce.eval_len] * first_len))
    con[ce.eval_set] = dataset_type(**dataset_parameters)

    for i in range(con[ce.loaders]):
        new_con = dict(con)
        new_con[ce.dataset] = loaders[i]
        if new_con[ce.dataset] is None:
            break
        new_con[ce.epoch_iter] = len(new_con[ce.dataset]) // (
            new_con[ce.batch_size] if ce.batch_size in new_con else 1)
        # Run the mode's function; carry the trained model and loss forward
        # into the next loader's run.
        new_model, loss = method(new_con, i)
        con[ce.model] = new_model
        con[ce.prev_eval_loss] = loss
예제 #10
0
def single_train(config, index):
    """Train the configured model, evaluate it, and save loss artifacts.

    Trains on the configured dataset, evaluates on the held-out set, and
    when a save path is configured writes the model/tokenizer (if
    ``save_model``) plus the loss/perplexity tensors under
    ``<save dir>/log/<index>/``.

    Args:
        config: dict keyed by ConfigEnums members.
        index: loader index, used to namespace the log directory.

    Returns:
        (trained model, mean evaluation loss tensor).
    """
    from global_constants import ConfigEnums, main_device
    ce = ConfigEnums
    save_path = config[ce.save_path]
    save_model = config[ce.save_model]

    # Checkpoints during training are only written when the model itself
    # is configured to be saved.
    config[ce.save_path] = config[ce.save_path] if config[
        ce.save_model] else None
    config[ce.model] = config[ce.model].to(main_device)

    final_logger = loggers.final_logger
    train_params = get_params(config, train)
    new_model, train_losses = train(**train_params)
    new_model = get_module_from_parallel(new_model)
    # NOTE(review): ce.evalset here vs ce.eval_set written elsewhere —
    # confirm both names refer to the same ConfigEnums member.
    config[ce.dataset] = config[ce.evalset]
    eval_params = get_params(config, evaluate)
    perplexity, perplexities, eval_losses = evaluate(**eval_params)
    # Loss-based rejection of the new model is currently disabled, so
    # `refuse` stays False and the trained model is always accepted.
    refuse = False
    loss = torch.mean(eval_losses)
    log_info(final_logger, 'final mean loss {}'.format(loss))
    if save_path is not None:
        if save_model and not refuse:
            new_model = get_module_from_parallel(new_model)
            tokenizer = get_module_from_parallel(config[ce.tokenizer])
            log_info(final_logger, 'saving trained models: ' + save_path)
            new_model.save_pretrained(save_path)
            tokenizer.save_pretrained(save_path)
        log_path = list(os.path.split(save_path)[:-1])
        log_path.append('log')
        log_path.append(str(index) + '/')
        log_path = '/'.join(log_path)
        # Fix: os.mkdir fails when the intermediate 'log' directory does
        # not exist; makedirs creates the whole chain and exist_ok makes
        # reruns idempotent (consistent with gpt2_model_eval).
        os.makedirs(log_path, exist_ok=True)
        log_info(final_logger, 'saving training losses')
        torch.save(train_losses, log_path + 'train_losses.pt')
        log_info(final_logger, 'saving evaluation losses')
        torch.save(eval_losses, log_path + 'eval_losses.pt')
        torch.save(perplexity, log_path + 'perplexity.pt')
        torch.save(perplexities, log_path + 'perplexities.pt')
        log_info(final_logger,
                 'mean eval losses {}'.format(torch.mean(eval_losses)))
        log_info(final_logger, 'All saved')
    return new_model, loss
예제 #11
0
def train(model,
          dataset,
          batch_size,
          epochs,
          epoch_iter,
          learning_rate=1e-2,
          weight_decay=1e-4,
          save_path=None,
          from_checkpoint=False,
          continue_train=False,
          tokenizer=None,
          data_func=lambda x: x):
    """Train ``model`` on ``dataset`` with AdamW and a linear warmup schedule.

    Args:
        model: model to train (wrapped in nn.DataParallel internally).
        dataset: training dataset, fed through a fresh DataLoader per epoch.
        batch_size: DataLoader batch size.
        epochs: number of epochs to run.
        epoch_iter: expected iterations per epoch; sets the scheduler
            horizon and the per-epoch loss slice used for logging.
        learning_rate: AdamW learning rate.
        weight_decay: decay applied to all params except bias/LayerNorm.
        save_path: directory for model/tokenizer/checkpoint saves, or None.
        from_checkpoint: resume state from ``save_path + 'checkpoint.pt'``.
        continue_train: when resuming, reduce the remaining epoch count.
        tokenizer: optional tokenizer, saved alongside the model.
        data_func: batch preprocessing function passed to train_one_epoch.

    Returns:
        (DataParallel-wrapped model, 1-D tensor of per-iteration losses).
    """
    loss_logger, train_logger = loggers.loss_logger, loggers.train_logger
    # Bias and LayerNorm weights are conventionally excluded from decay.
    no_decay = ['bias', 'LayerNorm.weight']

    optimizer_params = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = tfm.AdamW(optimizer_params, lr=learning_rate)
    scheduler = tfm.get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=100,
                                                    num_training_steps=epochs *
                                                    epoch_iter)
    losses = []
    if from_checkpoint:
        # NOTE(review): the unpacked names (*_state, mini_epoch) imply
        # load_checkpoint returns state dicts, yet the save below stores
        # the live model/optimizer/scheduler objects and no 'mini_epoch'
        # key — confirm save_checkpoint/load_checkpoint agree on payload.
        epoch, mini_epoch, model_state, optimizer_state, scheduler_state, loss = load_checkpoint(
            save_path + 'checkpoint.pt')
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
        scheduler.load_state_dict(scheduler_state)
        if continue_train:
            epochs = epochs - epoch + 1
    model = nn.DataParallel(model)
    model.train()
    for e in range(epochs):
        data_loader = DataLoader(dataset,
                                 shuffle=False,
                                 batch_size=batch_size,
                                 collate_fn=lambda x: x)
        epoch_start = time.time()
        loss_value, loss = train_one_epoch(data_loader,
                                           model,
                                           optimizer,
                                           scheduler,
                                           data_process_func=data_func,
                                           tok=tokenizer)
        losses.extend(loss_value)
        if save_path is not None:
            get_module_from_parallel(model).save_pretrained(save_path)
            if tokenizer is not None:
                tokenizer.save_pretrained(save_path)
            check_point = {
                'model': model,
                'epoch': e,
                'optimizer': optimizer,
                'scheduler': scheduler,
                'loss': loss
            }
            save_checkpoint(save_path + 'checkpoint.pt', check_point)
            log_info(loss_logger, 'saved models for in epoch {}'.format(e))
        # Slice out this epoch's losses for the per-epoch summary.
        loss_seg = losses[e * epoch_iter:]
        log_info(train_logger, '-' * 50)
        log_info(
            train_logger, 'Epoch {}, Mean Loss {}, Min Loss {}'.format(
                e, np.mean(loss_seg), np.min(loss_seg)))
        time_diff = time.time() - epoch_start
        log_info(
            train_logger, 'Time {}, Epoch Time {}, Avg Iter Time {}'.format(
                datetime.now().strftime("%d/%m/%Y %H:%M:%S"), time_diff,
                time_diff / epoch_iter))

    return model, torch.tensor(losses)