Example #1
def test_play_the_game_less_badly():
    bad_seeds_01_env = BadSeeds01(seed_count=5,
                                  bad_seed_count=3,
                                  max_episode_length=2 * 2 + 3 * 3 + 1)

    # measure the good seeds twice
    # measure the bad seeds three times
    for time_i, seed_i in enumerate(
            concatv(
                take(
                    n=2 * len(bad_seeds_01_env.good_seeds),
                    seq=cycle(bad_seeds_01_env.good_seed_indices),
                ),
                take(
                    n=3 * len(bad_seeds_01_env.bad_seeds),
                    seq=cycle(bad_seeds_01_env.bad_seed_indices),
                ),
            )):
        next_state, terminal, reward = bad_seeds_01_env.execute(actions=seed_i)
        assert next_state[time_i, seed_i] != 0.0
        assert terminal is False
        assert reward == 0.0

    # measure the first good seed again
    next_state, terminal, reward = bad_seeds_01_env.execute(
        actions=bad_seeds_01_env.good_seed_indices[0])
    assert next_state[-1, bad_seeds_01_env.good_seed_indices[0]] != 0.0
    assert terminal is True
    # reward is the number of times the least-measured seed was measured
    assert reward == 2.0
Example #2
def test_play_the_game_less_badly():
    bad_seeds_03_env = BadSeeds03(seed_count=5,
                                  bad_seed_count=3,
                                  max_episode_length=3 + 2 * 2 + 3 * 3 + 1)

    # measure the good seeds twice
    # measure the bad seeds three times
    for time_i, seed_i in enumerate(
            concatv(
                take(
                    n=2 * len(bad_seeds_03_env.good_seeds),
                    seq=cycle(bad_seeds_03_env.good_seed_indices),
                ),
                take(
                    n=3 * len(bad_seeds_03_env.bad_seeds),
                    seq=cycle(bad_seeds_03_env.bad_seed_indices),
                ),
            )):
        time_i += 3  # skip the first 3 time steps, which BadSeeds03 fills with initial measurements of every seed
        next_state, terminal, reward = bad_seeds_03_env.execute(actions=seed_i)
        assert bad_seeds_03_env.history_array[time_i, seed_i] != 0.0
        assert terminal is False
        assert reward == 0.0

    measurement_counts, measured_seed_counts = count_measurements(
        bad_seeds_03_env.history_array)
    expected_measurement_counts = np.zeros_like(measurement_counts)
    expected_measurement_counts[0, bad_seeds_03_env.good_seed_indices] = 5
    expected_measurement_counts[0, bad_seeds_03_env.bad_seed_indices] = 6
    assert np.all(measurement_counts == expected_measurement_counts)

    # measure the first good seed again
    next_state, terminal, reward = bad_seeds_03_env.execute(
        actions=bad_seeds_03_env.good_seed_indices[0])

    print(f"history:\n{bad_seeds_03_env.history_array}")
    measurement_counts, measured_seed_counts = count_measurements(
        bad_seeds_03_env.history_array)
    print(f"measurement_counts: {measurement_counts}")

    assert next_state[-1, bad_seeds_03_env.good_seed_indices[0]] != 0.0
    assert terminal is True
    # reward is the number of times the least-measured seed was measured
    assert reward == 6.0

    expected_measurement_counts[0, bad_seeds_03_env.good_seed_indices[0]] += 1
    assert np.all(measurement_counts == expected_measurement_counts)
Example #3
from pyrsistent import pbag
from toolz import concat, concatv, groupby


def optimize_steps(steps):
    """
    Optimize steps.

    Currently only optimizes per step type. See the :func:`_optimizer`
    decorator for more information on how to register an optimizer.

    :param pbag steps: Collection of steps.
    :return: a pbag of steps.
    """
    def grouping_fn(step):
        step_type = type(step)
        if step_type in _optimizers:
            return step_type
        else:
            return "unoptimizable"

    steps_by_type = groupby(grouping_fn, steps)
    unoptimizable = steps_by_type.pop("unoptimizable", [])
    omg_optimized = concat(_optimizers[step_type](steps)
                           for step_type, steps in steps_by_type.items())
    return pbag(concatv(omg_optimized, unoptimizable))
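
The docstring above refers to an `_optimizer` decorator and a module-level `_optimizers` mapping that this snippet does not include. Below is a minimal sketch, assuming the registry simply maps step types to optimizer functions; the step type `CreateNode` and the optimizer body are hypothetical and only illustrate how such a decorator could be wired up.

from collections import namedtuple

# Hypothetical registry: maps a step type to the function that optimizes
# a whole group of steps of that type.
_optimizers = {}


def _optimizer(step_type):
    """Register `fn` as the optimizer for groups of `step_type` steps."""
    def decorator(fn):
        _optimizers[step_type] = fn
        return fn
    return decorator


# A hypothetical step type with a duplicate-collapsing optimizer.
CreateNode = namedtuple("CreateNode", ["name"])


@_optimizer(CreateNode)
def _optimize_create_nodes(steps):
    """Drop duplicate CreateNode steps, keeping one of each."""
    return set(steps)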
Example #4
def test_concatv():
    assert list(concatv([], [], [])) == []
    assert (list(take(5, concatv(['a', 'b'], range(1000000000)))) ==
            ['a', 'b', 0, 1, 2])
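
The second assertion only passes because `concatv` concatenates lazily, yielding from each iterable in turn without materializing it. A standalone illustration of the same behavior:

from itertools import islice

from toolz import concatv

# Prepending two items to a huge range stays cheap because nothing is
# materialized until the result is iterated.
lazy_seq = concatv(['a', 'b'], range(10**9))
print(list(islice(lazy_seq, 5)))  # ['a', 'b', 0, 1, 2]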
Example #5
import logging

from toolz import itertoolz

from msvdd_bloc import regexes, tokenize
from msvdd_bloc.resumes import education
from msvdd_bloc.resumes import parse_utils

LOGGER = logging.getLogger(__name__)

#######################
## CRF-BASED PARSING ##
#######################

FIELD_SEP_TEXTS = {
    sep
    for sep in itertoolz.concatv(
        education.constants.FIELD_SEPS,
        education.constants.FIELD_SEP_DTS,
        education.constants.FIELD_SEP_SMS,
        education.constants.LEFT_BRACKETS,
        education.constants.RIGHT_BRACKETS,
    )
}
ITEM_SEP_TEXTS = set(education.constants.FIELD_SEP_SMS)
INSTITUTION_TEXTS = {
    "university",
    "college",
    "institute",
    "department",
    "dept.",
    "high",
    "school",
    "academy",
}
Example #6
import logging

from toolz import itertoolz

from msvdd_bloc import regexes, tokenize
from msvdd_bloc.resumes import constants
from msvdd_bloc.resumes import parse_utils
from msvdd_bloc.resumes import work


LOGGER = logging.getLogger(__name__)

FIELD_SEP_TEXTS = {
    sep for sep in itertoolz.concatv(
        work.constants.FIELD_SEPS,
        work.constants.FIELD_SEP_DTS,
        work.constants.FIELD_SEP_SMS,
        constants.LEFT_BRACKETS,
        constants.RIGHT_BRACKETS,
    )
}
COMPANY_TEXTS = set(
    text.lower()
    for text in work.constants.COMPANY_TYPES + work.constants.COMPANY_MODIFIERS
)
POSITION_TEXTS = set(
    text.lower()
    for text in work.constants.POSITION_LEVELS + work.constants.POSITION_TYPES
)


def parse_lines(lines, tagger=None):
Example #7
def main():
    # parameters
    data_dir = '../input/'
    bert_model = '../bert-large-wwm-uncased'  # replace this path with your own large WWM model
    #     bert_model = 'bert-base-uncased'
    #     bert_model = './oldtoxic'  # weights pretrained on the old Toxic data; download: https://www.kaggle.com/qinhui1999/old-toxic-bert-v2
    task_name = 'MyPro'
    output_dir = 'checkpoints/'
    model_save_pth = 'checkpoints/bert_large_wwm.pth'
    max_seq_length = 220
    do_train = True
    do_eval = True
    do_lower_case = True
    train_batch_size = 56
    eval_batch_size = 200
    learning_rate = 1e-5
    num_train_epochs = 1
    warmup_proportion = 0.05
    no_cuda = False
    local_rank = -1
    seed = 42
    gradient_accumulation_steps = 8
    optimize_on_cpu = False
    fp16 = False
    save_checkpoints_steps = 50000
    loss_scale = 128

    # processor that prepares the model inputs; the ones on GitHub are mostly for English
    processors = {'mypro': MyPro}

    if local_rank == -1 or no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend='nccl')
        if fp16:
            logger.info(
                "16-bits training currently not supported in distributed training"
            )
            fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(local_rank != -1))

    if gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(gradient_accumulation_steps))

    train_batch_size = int(train_batch_size / gradient_accumulation_steps)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    if not do_train and not do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(output_dir) and os.listdir(output_dir):
        # raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir))
        print('The checkpoint directory already exists...')
    else:
        os.makedirs(output_dir, exist_ok=True)

    task_name = task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(bert_model,
                                              do_lower_case=do_lower_case)
    # print("tokenizer",tokenizer)
    train_examples = None
    num_train_steps = None
    if do_train:
        train_examples = processor.get_train_examples(data_dir)
        num_train_steps = int(
            len(train_examples) / train_batch_size /
            gradient_accumulation_steps * num_train_epochs)

    # Prepare model
    # model = BertForSequenceClassification.from_pretrained(bert_model, num_labels=2,
    #             cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank))
    model = ToxicModel(bert_model, device)
    # You can unfreeze the last layer of bert by calling set_trainable(model.bert.encoder.layer[23], True)
    # set_trainable(model.bert, False)
    # freeze the embedding layer
    #     set_trainable(model.bert.embeddings, False)
    # set_trainable(model.bert.encoder.layer[11], True)
    # set_trainable(model.head, True)
    # model.load_state_dict(torch.load('checkpoints/bert_classification_2epoch.pth')['state_dict'])
    if fp16:
        model.half()
    model.to(device)
    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if fp16:
        param_optimizer = [
            (n, param.clone().detach().to('cpu').float().requires_grad_())
            for n, param in model.named_parameters()
        ]
    elif optimize_on_cpu:
        param_optimizer = [(n,
                            param.clone().detach().to('cpu').requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    t_total = num_train_steps
    if local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total)

    global_step = 0
    if do_train:

        if os.path.exists('train.token_new_cleaned_wwm.npy'):
            train_features = np.load('train.token_new_cleaned_wwm.npy',
                                     allow_pickle=True)
        else:
            parallel = Parallel(300, backend="multiprocessing", verbose=5)
            train_features = list(
                concatv(*parallel(
                    delayed(convert_examples_to_features)(
                        example, label_list, max_seq_length, tokenizer)
                    for example in list(partition_all(300, train_examples)))))
            train_features = np.asarray(train_features)
            np.save('train.token_new_cleaned_wwm', train_features)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        torch.cuda.empty_cache()
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)

        print('y_aux', np.asarray([f.y_aux for f in train_features]).shape)
        all_label_ids = torch.tensor(np.hstack([
            np.asarray([f.label_id for f in train_features]),
            np.asarray([f.y_aux for f in train_features])
        ]),
                                     dtype=torch.float32)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(
            train_data,
            sampler=train_sampler,
            batch_size=train_batch_size,
            num_workers=2,
            pin_memory=True,
        )

        #model.load_state_dict(torch.load('checkpoints/bert_large_wwm.pth')['state_dict'])
        # model.load_state_dict(torch.load('checkpoints/0_80000_iterations.pth')['state_dict'])

        model.train()
        best_score = 0
        flags = 0
        torch.cuda.empty_cache()
        ''' 
        model.load_state_dict(torch.load('checkpoints/0_20000_iterations.pth')['model'])
        optimizer.load_state_dict(torch.load('checkpoints/0_20000_iterations.pth')['optimizer'])
        old_iter = int(torch.load('checkpoints/0_20000_iterations.pth')['iteration'])
        '''
        old_iter = -1

        for i_epoch in trange(int(num_train_epochs), desc="Epoch"):
            torch.cuda.empty_cache()
            iteration = 0  # counter
            save_point = save_checkpoints_steps  # 10000
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                if iteration <= old_iter:
                    iteration += 1
                    continue
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                torch.cuda.empty_cache()
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if fp16 and loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * loss_scale
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                loss.backward()

                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16 or optimize_on_cpu:
                        if fp16 and loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / loss_scale
                        is_nan = set_optimizer_params_grad(
                            param_optimizer,
                            model.named_parameters(),
                            test_nan=True)
                        if is_nan:
                            logger.info(
                                "FP16 TRAINING: Nan in gradients, reducing loss scaling"
                            )
                            loss_scale = loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(
                            model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                # Save model
                if iteration % save_point == 0 and iteration > 0:
                    checkpoint = {
                        'iteration': iteration,
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }

                    checkpoint_path = os.path.join(
                        output_dir,
                        '{}_{}_iterations.pth'.format(i_epoch, iteration))

                    torch.save(checkpoint, checkpoint_path)
                    logging.info('Model saved to {}'.format(checkpoint_path))
                    val(model, processor, data_dir, max_seq_length,
                        eval_batch_size, label_list, tokenizer, device)

                iteration += 1

    checkpoint = {
        'state_dict': model.state_dict(),
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(checkpoint, model_save_pth)
    val(model, processor, data_dir, max_seq_length, eval_batch_size,
        label_list, tokenizer, device)

    test(model, processor, data_dir, max_seq_length, eval_batch_size,
         label_list, tokenizer, device)