def get_init_args_dir(init_args_dir):
    """
    to simplify reporting we allow three ways of providing init_args_dir
    :param init_args_dir:
    :return:
    """
    if os.path.isfile(
            init_args_dir
    ):  # , "ERROR {} not found to reload checkpoint".format(init_args_dir)
        _dir = init_args_dir
    elif os.path.isfile(os.path.join(CHECKPOINT_BERT_DIR, init_args_dir)):
        printing(
            "MODEL init {} not found as a file so trying the second template (relative to CHECKPOINT_BERT_DIR) ",
            var=[init_args_dir],
            verbose=1,
            verbose_level=1)
        _dir = os.path.join(CHECKPOINT_BERT_DIR, init_args_dir)
    else:
        printing(
            "MODEL init {} not found as a file directly nor under CHECKPOINT_BERT_DIR so using the third template (model id) ",
            var=[init_args_dir],
            verbose=1,
            verbose_level=1)
        match = re.match("(.*-model_[0-9]+).*", init_args_dir)
        assert match is not None, "ERROR : template {} not found in {}".format(
            "([.*]-model_[0-9]+).*", init_args_dir)
        _dir = os.path.join(CHECKPOINT_BERT_DIR, match.group(1),
                            init_args_dir + "-args.json")
        assert os.path.isfile(
            _dir), "ERROR : {} does not exist (based on param {}) ".format(
                _dir, init_args_dir)
    return _dir
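# Minimal standalone sketch (not from the original repo) of the third fallback above :
# a made-up model id is reduced to its run directory with the same regex template.
import re

_demo_init_args_dir = "myrun-model_3-epoch_7"  # hypothetical model id
_demo_match = re.match("(.*-model_[0-9]+).*", _demo_init_args_dir)
assert _demo_match is not None
print(_demo_match.group(1))  # "myrun-model_3"
# the args file would then be looked up at
# <CHECKPOINT_BERT_DIR>/myrun-model_3/myrun-model_3-epoch_7-args.json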
def printout_allocated_gpu_memory(verbose, comment):

    if verbose == "gpu":
        try:
            printing("GPU {} {}",var=[comment, torch.cuda.memory_allocated()], verbose=verbose, verbose_level="gpu")
        except Exception as e:
            print(e)
def get_dataset_label(dataset_dir_ls, default):
    if dataset_dir_ls is None:
        return None

    if REPO_DATASET.get(dataset_dir_ls[0], None) is None:
        try:
            label = "|".join(
                [get_code_data(path) for _, path in enumerate(dataset_dir_ls)])
        except Exception:
            printing(
                "REPORT : dataset names of directories {} not found as UD treebanks so using default ",
                var=[dataset_dir_ls],
                verbose=0,
                verbose_level=1)
            label = "|".join([
                REPO_DATASET.get(path, "{}_{}".format(default, i))
                for i, path in enumerate(dataset_dir_ls)
            ])
    else:
        label = "|".join([
            REPO_DATASET.get(path, "{}_{}".format(default, i))
            for i, path in enumerate(dataset_dir_ls)
        ])

    return label
def write_args(dir,
               model_id,
               checkpoint_dir=None,
               hyperparameters=None,
               info_checkpoint=None,
               verbose=1):

    args_dir = os.path.join(dir, "{}-args.json".format(model_id))
    if os.path.isfile(args_dir):
        info = "updated"
        args = json.load(open(args_dir, "r"))
        args["checkpoint_dir"] = checkpoint_dir
        args["info_checkpoint"] = info_checkpoint
        json.dump(args, open(args_dir, "w"))
    else:
        assert hyperparameters is not None, "REPORT : args.json created for the first time : hyperparameters dic required "
        #assert info_checkpoint is None, "REPORT : args. created for the first time : no checkpoint yet "
        info = "new"
        json.dump(
            OrderedDict([("checkpoint_dir", checkpoint_dir),
                         ("hyperparameters", hyperparameters),
                         ("info_checkpoint", info_checkpoint)]),
            open(args_dir, "w"))
    printing("MODEL args.json {} written {} ".format(info, args_dir),
             verbose_level=1,
             verbose=verbose)
    return args_dir
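# Hedged usage sketch (paths and hyperparameters are made up) mirroring the two branches
# of write_args above : a first write creates the args file, a later write updates it.
import json, os, tempfile
from collections import OrderedDict

_demo_dir = tempfile.mkdtemp()
_demo_args_path = os.path.join(_demo_dir, "demo-args.json")
json.dump(OrderedDict([("checkpoint_dir", None),
                       ("hyperparameters", {"lr": 5e-5}),
                       ("info_checkpoint", None)]),
          open(_demo_args_path, "w"))
_demo_args = json.load(open(_demo_args_path, "r"))
_demo_args["checkpoint_dir"] = "/tmp/demo-checkpoint.pt"  # placeholder path
json.dump(_demo_args, open(_demo_args_path, "w"))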
def data_gen_dummy(V,
                   batch,
                   nbatches,
                   sent_len=9,
                   word_len=5,
                   verbose=0,
                   seed=None):
    "Generate random data for a src-tgt copy task."
    if seed is not None:
        np.random.seed(seed)
    for i in tqdm(range(nbatches),
                  disable=disable_tqdm_level(verbose, verbose_level=2)):
        data = torch.from_numpy(
            np.random.randint(low=2, high=V, size=(batch, sent_len, word_len)))
        data[:, :, 0] = 2
        # we force padding in the dummy model
        data[:, :, -1] = 1
        data[:, :, -2] = 1
        printing("DATA dummy {} ",
                 var=(data),
                 verbose=verbose,
                 verbose_level=5)
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        yield MaskBatch(src, tgt, pad=1)
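# Standalone sketch of one dummy batch as generated above (sizes are illustrative) :
# ids are drawn in [2, V), position 0 is forced to the start symbol 2 and the last two
# character positions are forced to the padding id 1.
import numpy as np
import torch

np.random.seed(0)
_demo_data = torch.from_numpy(np.random.randint(low=2, high=50, size=(2, 9, 5)))
_demo_data[:, :, 0] = 2
_demo_data[:, :, -1] = 1
_demo_data[:, :, -2] = 1
print(_demo_data.shape)  # torch.Size([2, 9, 5])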
def align_bpe(n_bpe_target_minus_source,
              source_aligned,
              source_aligned_index,
              target_aligned,
              target_aligned_index,
              n_masks_to_add,
              src_token_len,
              bert_tokenizer,
              mask_token,
              mode="dummy",
              index_src=None,
              index_target=None,
              verbose=0):
    """
    align bpe of a given token using mode
    :return:
    """
    assert mode in ["dummy"]
    # dummy means padding the shorter side with MASK (source side) or NULL_STR (target side) when needed
    if n_bpe_target_minus_source > 0:
        assert index_src is not None
        source_aligned_index.extend(
            [index_src for _ in range(n_bpe_target_minus_source)])
        source_aligned.extend(
            bert_tokenizer.convert_tokens_to_ids(
                [mask_token for _ in range(n_bpe_target_minus_source)]))

    elif n_bpe_target_minus_source < 0:
        assert index_target is not None
        # we add a NULL_STR (to be predicted) and index it as the former bpe token
        target_aligned_index.extend(
            [index_target for _ in range(-n_bpe_target_minus_source)])
        target_aligned.extend(
            bert_tokenizer.convert_tokens_to_ids(
                [NULL_STR for _ in range(-n_bpe_target_minus_source)]))

    n_masks_to_add.append(n_bpe_target_minus_source)
    n_masks_to_add.extend([-1 for _ in range(src_token_len - 1)])

    if verbose == "reader":
        printing(
            "SRC appending word bpe align : {}\nTARGET appending word bpe align : {} \nN_MASKS------------ : {}",
            var=[[mask_token for _ in range(n_bpe_target_minus_source)]
                 if n_bpe_target_minus_source > 0 else "",
                 [NULL_STR for _ in range(-n_bpe_target_minus_source)]
                 if n_bpe_target_minus_source < 0 else "",
                 [n_bpe_target_minus_source] +
                 [-1 for _ in range(src_token_len - 1)]],
            verbose_level="reader",
            verbose=verbose)

    return source_aligned, source_aligned_index, target_aligned, target_aligned_index, n_masks_to_add
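# Hedged usage sketch of align_bpe above : the tokenizer is a toy stand-in
# (only convert_tokens_to_ids is needed) and the ids are made up.
class _ToyTokenizer:
    _vocab = {"[MASK]": 103}

    def convert_tokens_to_ids(self, tokens):
        return [self._vocab[tok] for tok in tokens]

_src_ids, _src_idx, _tgt_ids, _tgt_idx, _n_masks = [], [], [], [], []
# the target word has 2 more bpe pieces than the source word,
# so the source side is padded with 2 mask tokens pointing at source index 0
align_bpe(n_bpe_target_minus_source=2,
          source_aligned=_src_ids, source_aligned_index=_src_idx,
          target_aligned=_tgt_ids, target_aligned_index=_tgt_idx,
          n_masks_to_add=_n_masks, src_token_len=1,
          bert_tokenizer=_ToyTokenizer(), mask_token="[MASK]",
          index_src=0)
print(_src_ids, _src_idx, _n_masks)  # [103, 103] [0, 0] [2]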
def sanity_check_loss_poneration(ponderation_dic, verbose=1):
    if isinstance(ponderation_dic, dict):
        for task in TASKS_PARAMETER:
            assert task in ponderation_dic, "ERROR : task {} has no ponderation defined while it should ".format(
                task)
    elif isinstance(ponderation_dic, str):
        assert ponderation_dic in MULTI_TASK_LOSS_PONDERATION_PREDEFINED_MODE, "ERROR ponderation {} should be in {}".format(
            ponderation_dic, MULTI_TASK_LOSS_PONDERATION_PREDEFINED_MODE)
        printing("WARNING : COULD NOT SANITY CHECK ponderation_dic {} ",
                 var=[ponderation_dic],
                 verbose=verbose,
                 verbose_level=1)
    else:
        raise (Exception("ponderation_dic is neither string or dict {}".format(
            ponderation_dic)))
def get_new_shard(shard_path, n_shards, rand=True, verbose=1):
    # pick a new file randomly

    assert rand

    i_shard = random.choice(range(n_shards))

    path = os.path.join(shard_path, "train_{}.conll".format(i_shard))

    assert os.path.isfile(path), "ERROR {}".format(path)

    printing("INFO : picking shard {} ",
             var=[path],
             verbose=verbose,
             verbose_level=1)
    return [path]
def get_perf_rate(metric, score_dic, n_tokens_dic, agg_func, task, verbose=1):
    """
    provides metric : the confusion matrix standart rates for the given task
    :param metric:
    :param score_dic: two level dictionay : first level for agg_func second
    for prediciton class based on CLASS_PER_TASK and task
    :param agg_func:
    :return: rate, denumerator of the rate (if means like f1 : returns all )
    """
    pdb.set_trace()
    if metric in ["recall-{}".format(task), "f1-{}".format(task), "accuracy-{}".format(task)]:

        positive_obs = n_tokens_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][1]]
        recall = score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][1]] / positive_obs \
            if positive_obs > 0 else None
        if positive_obs == 0:
            printing("WARNING : no positive observation were seen ", verbose=verbose, verbose_level=1)
        if metric == "recall-{}".format(task):
            return recall, positive_obs
    if metric in ["precision-{}".format(task), "f1-{}".format(task), "accuracy-{}".format(task)]:
        #positive_prediction = n_tokens_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][0]] - score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][0]] \
        #                      + score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][1]]
        positive_prediction = n_tokens_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes_pred_field"][1]]
        precision = score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][1]] / positive_prediction if positive_prediction > 0 else None
        if metric == "precision-{}".format(task):
            return precision, positive_prediction
    if metric in ["tnr-{}".format(task), "accuracy-{}".format(task), "f1-{}".format(task)]:
        negative_obs = n_tokens_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][0]]
        if metric == "tnr-{}".format(task):
            return score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][0]] / negative_obs if negative_obs>0 else None, \
                   negative_obs
    if metric == "f1-{}".format(task):
        if recall is not None and precision is not None and recall>0 and precision>0:
            return hmean([recall, precision]), negative_obs + positive_obs
        else:
            return None, negative_obs + positive_obs

    if metric in ["npv-{}".format(task)]:
        negative_prediction = n_tokens_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes_pred_field"][0]]
        return score_dic[agg_func][
                   TASKS_PARAMETER[task]["predicted_classes"][0]] / negative_prediction if negative_prediction > 0 else None, \
               negative_prediction
    if metric == "accuracy-{}".format(task):
        accuracy = (score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][0]] + score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][1]]) / (positive_obs + negative_obs) if positive_obs > 0 and negative_obs > 0 else None
        return accuracy, positive_obs + negative_obs

    raise(Exception("metric {} not supported".format(metric)))
def setup_repoting_location(root_dir_checkpoints,
                            model_suffix="",
                            shared_id=None,
                            data_sharded=None,
                            verbose=1):
    """
    create an id for a model and locations for checkpoints, dictionaries, tensorboard logs, data
    :param model_suffix:
    :param verbose:
    :return:
    """
    model_local_id = str(uuid4())[:5]
    if shared_id is not None:
        if len(shared_id) > 0:
            model_local_id = shared_id + "-" + model_local_id
    if model_suffix != "":
        model_local_id += "-" + model_suffix
    model_location = os.path.join(root_dir_checkpoints, model_local_id)
    dictionaries = os.path.join(root_dir_checkpoints, model_local_id,
                                "dictionaries")
    tensorboard_log = os.path.join(root_dir_checkpoints, model_local_id,
                                   "tensorboard")
    end_predictions = os.path.join(root_dir_checkpoints, model_local_id,
                                   "predictions")

    os.mkdir(model_location)

    if data_sharded is None:
        data_sharded = os.path.join(root_dir_checkpoints, model_local_id,
                                    "shards")
        os.mkdir(data_sharded)
    else:
        assert os.path.isdir(
            data_sharded), "ERROR data_sharded not dir {} ".format(
                data_sharded)
        printing("INFO DATA already sharded in {}",
                 var=[data_sharded],
                 verbose=verbose,
                 verbose_level=1)
    printing("CHECKPOINTING model location:{}",
             var=[model_location],
             verbose=verbose,
             verbose_level=1)
    printing("CHECKPOINTING model ID:{}",
             var=[model_local_id],
             verbose=verbose,
             verbose_level=1)
    os.mkdir(dictionaries)
    os.mkdir(tensorboard_log)
    os.mkdir(end_predictions)
    printing(
        "CHECKPOINTING \n- {} for checkpoints \n- {} for dictionaries \n- {} for predictions \n- {} for sharded data ",
        var=[model_location, dictionaries, end_predictions, data_sharded],
        verbose_level=1,
        verbose=verbose)
    return model_local_id, model_location, dictionaries, tensorboard_log, end_predictions, data_sharded
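# Standalone sketch (throwaway root directory) of the layout created above :
# a short uuid-based model id with one sub-directory per artefact type.
import os, tempfile
from uuid import uuid4

_demo_root = tempfile.mkdtemp()
_demo_model_id = "demo-" + str(uuid4())[:5]
_demo_model_location = os.path.join(_demo_root, _demo_model_id)
os.mkdir(_demo_model_location)
for _sub in ("dictionaries", "tensorboard", "predictions", "shards"):
    os.mkdir(os.path.join(_demo_model_location, _sub))
print(sorted(os.listdir(_demo_model_location)))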
def print_align_bpe(source_preprocessed, gold, input_alignement_with_raw,
                    labels_n_mask_prediction, verbose, verbose_level):
    if labels_n_mask_prediction is None:
        labels_n_mask_prediction = [[None for _ in range(len(sent))]
                                    for sent in input_alignement_with_raw]
    if isinstance(verbose, int) or verbose == "alignement":
        if verbose == "alignement" or verbose >= verbose_level:
            assert len(source_preprocessed) == len(gold), ""
            assert len(input_alignement_with_raw) == len(gold), ""
            for sent_src, sent_gold, index_match_with_src, append_masks in zip(
                    source_preprocessed, gold, input_alignement_with_raw,
                    labels_n_mask_prediction):
                assert len(sent_src) == len(sent_gold)
                for src, gold_tok, index, masks in zip(sent_src, sent_gold,
                                                       index_match_with_src,
                                                       append_masks):
                    printing("{}:{} --> {} (n_masks {})",
                             var=[index, src, gold_tok, masks],
                             verbose=1,
                             verbose_level=1)
def get_optimizer(parameters,
                  lr,
                  optimizer="adam",
                  betas=None,
                  weight_decay=None,
                  verbose=1):

    assert optimizer in AVAILABLE_OPTIMIZER, "ERROR optimizers supported are {} ".format(
        AVAILABLE_OPTIMIZER)

    if optimizer == "adam":
        if betas is None:
            # betas = (0.9, 0.9)
            print("DEFAULT betas:", betas)
        if weight_decay is None:
            weight_decay = 0
        opt = torch.optim.Adam(parameters,
                               lr=lr,
                               betas=betas,
                               eps=1e-9,
                               weight_decay=weight_decay)

    elif optimizer == "SGD":
        assert betas is None, "ERROR "
        opt = torch.optim.SGD(parameters, lr=lr)

    elif optimizer == "bahdanu-adadelta":
        assert betas is None, "ERROR betas not supported for optimizer {}".format(
            optimizer)
        opt = torch.optim.Adadelta(parameters, eps=10e-6, rho=0.95)

    elif optimizer == "AdamW":
        opt = AdamW(parameters, lr=lr, weight_decay=weight_decay)

    printing("TRAINING : optimizer {} has been reloaded with lr {} betas {} ",
             var=[optimizer, lr, betas],
             verbose=verbose,
             verbose_level=2)
    return opt
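# Standalone sketch of the "adam" branch above with its defaults filled in
# (the model is a toy module, the learning rate is illustrative).
import torch

_demo_model = torch.nn.Linear(10, 2)
_demo_opt = torch.optim.Adam(_demo_model.parameters(), lr=5e-5,
                             betas=(0.9, 0.9), eps=1e-9, weight_decay=0)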
def build_shard(dir_shard,
                dir_file,
                n_sent_max_per_file,
                format="conll",
                dry_run=False,
                verbose=1):

    onlyfiles = [f for f in listdir(dir_shard) if isfile(join(dir_shard, f))]
    if len(onlyfiles) > 0:
        n_shards = len(onlyfiles)
        n_sents = 0
        for file in onlyfiles:
            n_sents += count_conll_n_sent(os.path.join(dir_shard, file))

        printing("INFO : shards already filled in {} files {} sentences total",
                 var=[n_shards, n_sents],
                 verbose=1,
                 verbose_level=1)
        return dir_shard, n_shards, n_sents

    assert format in "conll"
    assert len(dir_file
               ) == 1, "ONLY 1 set of simultaneous task supported for sharding"
    printing("STARTING SHARDING {} of {} ".format(dir_shard, dir_file),
             verbose=verbose,
             verbose_level=1)
    dir_file = dir_file[0]
    n_sents = count_conll_n_sent(dir_file)
    n_shards = n_sents // n_sent_max_per_file

    if n_shards == 0:
        printing(
            "INFO SHARDING : n_sent_max_per_file is larger than the number of sentences in {} so building only 1 shard",
            var=[dir_file],
            verbose=verbose,
            verbose_level=1)
        n_shards += 1
    split_randomly(n_shards, dir_shard, dir_file, n_sents, dry_run=dry_run)
    sys.stdout.flush()

    printing(
        "INFO SHARD : {} sentences written, split into {} files with "
        "on average {} sentences per file, written to {}",
        var=[n_sents, n_shards, n_sent_max_per_file, dir_shard],
        verbose=verbose,
        verbose_level=1)

    return dir_shard, n_shards, n_sents
def get_normalized_token(norm_field,
                         n_exception,
                         verbose,
                         predict_mode_only=False):

    match = re.match("^Norm=([^|]+)|.+", norm_field)

    try:
        assert match.group(
            1
        ) is not None, " ERROR : not normalization found for norm_field {} ".format(
            norm_field)
        normalized_token = match.group(1)

    except Exception:
        match_double_bar = re.match("^Norm=([|]+)|.+", norm_field)

        if match_double_bar.group(1) is not None:
            match = match_double_bar
            n_exception += 1
            printing("Exception handled we match with {}".format(
                match_double_bar.group(1)),
                     verbose=verbose,
                     verbose_level=2)
            normalized_token = match.group(1)

        else:
            exc = Exception(
                "Failed to handle exception with | on field {} ".format(
                    norm_field))
            if not predict_mode_only:
                raise (exc)
            else:
                print("REPLACING with UNK", exc)
                normalized_token = "UNK"

    return normalized_token, n_exception
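# Standalone sketch of the parsing above : the norm_field value is made up but follows
# the "Norm=<token>|<other attributes>" layout expected by the regex.
import re

_demo_norm_field = "Norm=hello|Gold=helo"
_demo_match = re.match("^Norm=([^|]+)|.+", _demo_norm_field)
print(_demo_match.group(1))  # "hello"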
def log_data_src_label_pred(src_detokenized_dic, predict_detokenize_dic,
                            label_detokenized_dic, tasks, verbose,
                            verbose_level):

    if isinstance(verbose, int) or verbose == "alignment":
        if verbose == "alignment" or verbose >= verbose_level:
            for task in [_task for _tasks in tasks for _task in _tasks]:
                input_name = TASKS_PARAMETER[task]["input"]
                label_name_ls = TASKS_PARAMETER[task]["label"]

                for ind_src_sent, src_sent in enumerate(
                        src_detokenized_dic[input_name]):
                    print("      ")
                    for label in label_name_ls:
                        try:
                            assert len(predict_detokenize_dic[task + "-" + label][0][ind_src_sent]) == len(label_detokenized_dic[label][ind_src_sent]), \
                                "ERROR pred {} label {} ".format(predict_detokenize_dic[task + "-" + label][ind_src_sent], label_detokenized_dic[label][ind_src_sent])
                            assert len(src_detokenized_dic[input_name]
                                       [ind_src_sent]) == len(
                                           label_detokenized_dic[label]
                                           [ind_src_sent]), "ERROR "
                            for ind_src, src in enumerate(src_sent):
                                to_print = "SRC : {} ,    ".format(
                                    src) + " ".join([
                                        "PRED:{}  GOLD:{} (label {})".format(
                                            predict_detokenize_dic[task + "-" +
                                                                   label][0]
                                            [ind_src_sent][ind_src],
                                            label_detokenized_dic[label]
                                            [ind_src_sent][ind_src], label)
                                        for label in label_name_ls
                                    ])
                                printing(to_print, verbose=1, verbose_level=1)
                        except Exception as e:
                            print("ERROR : not aligned labels so cannot log ",
                                  e)
def train_predict_eval(args, verbose=0):

    init_seed(args)
    if args.bert_model in BERT_MODEL_DIC:
        model_dir = BERT_MODEL_DIC[args.bert_model]["model"] if args.bert_model else None
        encoder = BERT_MODEL_DIC[args.bert_model]["encoder"] if args.bert_model else None
    else:
        model_dir = None
        encoder = "AutoModel"

    if args.init_args_dir is not None:
        args_checkpoint = json.load(open(args.init_args_dir, "r"))
        args.bert_model = args_checkpoint["hyperparameters"]["bert_model"]

    # if model referenced BERT_MODEL_DIC : using tokenizer directory otherwise loading from hugging face
    if args.bert_model in BERT_MODEL_DIC:
        tokenizer = eval(BERT_MODEL_DIC[args.bert_model]["tokenizer"]) if args.bert_model else None
        voc_tokenizer = BERT_MODEL_DIC[args.bert_model]["vocab"] if args.bert_model else None
        vocab_size = BERT_MODEL_DIC[args.bert_model].get("vocab_size") if args.bert_model else None
    else:
        print("TOKENIZER Model not in BERT_MODEL_DIC so loading tokenizer from hugging face")
        tokenizer = AutoTokenizer
        voc_tokenizer = args.bert_model
        vocab_size = None

    null_token_index = vocab_size
    description = "grid"

    # We checkpoint the model only if early_stoppin_metric gets better ;
    # early_stoppin_metric is chosen in relation to the first task defined in the list
    early_stoppin_metric, subsample_early_stoping_metric_val = get_early_stopping_metric(tasks=args.tasks,early_stoppin_metric=None, verbose=verbose)

    printing("INFO : tasks is {} so setting early_stoppin_metric to {} ", var=[args.tasks, early_stoppin_metric],
             verbose=verbose, verbose_level=1)

    printing("INFO : model {} batch_update_train {} batch_size {} ",
             var=[args.model_id_pref, args.batch_update_train, args.batch_size],
             verbose=verbose, verbose_level=1)

    run(args=args, voc_tokenizer=voc_tokenizer, vocab_size=vocab_size, model_dir=model_dir,
        report_full_path_shared=args.overall_report_dir,
        description=description, null_token_index=null_token_index, null_str=NULL_STR,
        model_suffix="{}".format(args.model_id_pref), debug=False,
        random_iterator_train=True,  bucket_test=False, compute_intersection_score_test=True,
        n_observation_max_per_epoch_train=args.n_iter_max_train if not args.demo_run else 2,
        n_observation_max_per_epoch_dev_test=50000 if not args.demo_run else 2,
        early_stoppin_metric=early_stoppin_metric,
        subsample_early_stoping_metric_val=subsample_early_stoping_metric_val,
        saving_every_epoch=args.saving_every_n_epoch, run_mode="train" if args.train else "test",
        auxilliary_task_norm_not_norm=True, tokenizer=tokenizer, max_token_per_batch=300,
        name_with_epoch=args.name_inflation, encoder=encoder, report=True, verbose=verbose)

    printing("MODEL {} trained and evaluated", var=[args.model_id_pref], verbose_level=1, verbose=verbose)
def get_early_stopping_metric(tasks,
                              verbose,
                              main_task=None,
                              early_stoppin_metric=None,
                              subsample_early_stoping_metric_val=None):
    """
    getting early stopping metric and evaluation subsample
    if early_stoppin_metric is None : uses first eval_metrics stated in TASKS_PARAMETER of the first task of the list passed in args.tasks
    :return:
    """
    if main_task is None:
        printing(
            "INFO : no main task provided, defaulting to the first task of the first list in {} ",
            var=[tasks],
            verbose=verbose,
            verbose_level=1)
        if isinstance(tasks[0], list):
            main_task = tasks[0][0]
        else:
            main_task = tasks[0]

    if early_stoppin_metric is None:
        early_stoppin_metric = TASKS_PARAMETER[main_task]["eval_metrics"][0][0]

        printing(
            "INFO : early_stoppin_metric defaults to {} , the first eval metric of the main task in TASKS_PARAMETER",
            var=[early_stoppin_metric],
            verbose=verbose,
            verbose_level=1)

    if subsample_early_stoping_metric_val is None:
        get_subsample = TASKS_PARAMETER[main_task].get("default-subsample")
        if get_subsample is None:
            get_subsample = "all"
            printing(
                "INFO : early stopping subsample set to default {} as no default-subsample was found in {}",
                var=["all", TASKS_PARAMETER[main_task]],
                verbose=verbose,
                verbose_level=1)
        subsample_early_stoping_metric_val = get_subsample
        assert subsample_early_stoping_metric_val in TASKS_PARAMETER[main_task][
            "subsample-allowed"], "ERROR task {} : subsample {} not in the allowed list".format(
                main_task, subsample_early_stoping_metric_val)
    #sanity_check_early_stop_metric(early_stoppin_metric, TASKS_PARAMETER, tasks)

    return early_stoppin_metric, subsample_early_stoping_metric_val
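# Standalone sketch of the defaulting logic above, with a toy TASKS_PARAMETER-like
# entry (the task name and metric are made up).
_demo_tasks_parameter = {"normalize": {"eval_metrics": [["accuracy-normalize"]],
                                       "default-subsample": "all",
                                       "subsample-allowed": ["all"]}}
_demo_main_task = "normalize"
_demo_metric = _demo_tasks_parameter[_demo_main_task]["eval_metrics"][0][0]
_demo_subsample = _demo_tasks_parameter[_demo_main_task].get("default-subsample", "all")
print(_demo_metric, _demo_subsample)  # accuracy-normalize all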
def log_warning(counting_failure_parralel_bpe_batch, data_label, batch_i,
                batch, noisy_under_splitted, skipping_batch_n_to_1, aligned,
                noisy_over_splitted, skip_1_t_n, skipping_evaluated_batch,
                verbose):
    printing("WARNING {} aignement failure caused by parallel ",
             var=[counting_failure_parralel_bpe_batch],
             verbose=verbose,
             verbose_level=1)
    printing(
        "WARNING on {} : out of {} batches of X sentences each, {} were skipped ({} batches aligned ; {} with at least 1 over-split sentence ; {} with at least 1 under-split sentence {} ; sentences skipped as n-to-1 : {}) ",
        var=[
            data_label, batch_i, noisy_under_splitted + skipping_batch_n_to_1,
            aligned, noisy_over_splitted, noisy_under_splitted,
            "SKIPPED" if skip_1_t_n else "", skipping_batch_n_to_1
        ],
        verbose=verbose,
        verbose_level=1)
    printing("WARNING on {} ON THE EVALUATION SIDE we skipped extra {} batch ",
             var=[data_label, skipping_evaluated_batch],
             verbose_level=1,
             verbose=verbose)
def input_normalization_processing(task_normalize_is, batch,
                                   norm_2_noise_training, norm_2_noise_eval):
    norm2noise_bool = False
    if (norm_2_noise_training is not None
            or norm_2_noise_eval) and task_normalize_is:
        portion_norm2noise = norm_2_noise_training if norm_2_noise_training is not None else 1.
        norm_2_noise_training = portion_norm2noise is not None
        rand = np.random.uniform(low=0, high=1, size=1)[0]
        norm2noise_bool = portion_norm2noise >= rand
        if norm2noise_bool:
            batch_raw_input = preprocess_batch_string_for_bert(
                batch.raw_output)
            printing("WARNING : input is gold norm",
                     verbose_level=2,
                     verbose=1)
        else:
            printing("WARNING : input is input", verbose_level=2, verbose=1)
            batch_raw_input = preprocess_batch_string_for_bert(batch.raw_input)
    else:
        printing("WARNING : input is input ", verbose_level=2, verbose=1)
        batch_raw_input = preprocess_batch_string_for_bert(batch.raw_input)
    return batch_raw_input, norm2noise_bool, norm_2_noise_training
def logging_processing_data(_verbose, verbose, verbose_level, batch_raw_input,
                            input_tokens_tensor, batch_raw_output,
                            output_tokens_tensor, inp_bpe_tokenized,
                            out_bpe_tokenized):
    printing("DATA : pre-tokenized input {} ",
             var=[batch_raw_input],
             verbose_level=verbose_level,
             verbose=_verbose)
    printing("DATA : BPEtokenized input ids {}",
             var=[input_tokens_tensor],
             verbose_level=3,
             verbose=verbose)

    printing("DATA : pre-tokenized output {} ",
             var=[batch_raw_output],
             verbose_level=verbose_level,
             verbose=_verbose)
    printing("DATA : BPE tokenized output ids  {}",
             var=[output_tokens_tensor],
             verbose_level=4,
             verbose=verbose)
    # BPE
    printing("DATA : BPE tokenized input  {}",
             var=[inp_bpe_tokenized],
             verbose_level=4,
             verbose=_verbose)
    printing("DATA : BPE tokenized output  {}",
             var=[out_bpe_tokenized],
             verbose_level=4,
             verbose=_verbose)
def get_indexes(list_pretokenized_str,
                tokenizer,
                verbose,
                use_gpu,
                word_norm_not_norm=None):
    """
    from pretokenized string : it will bpe-tokenize it using BERT 'tokenizer'
    and then convert it to token ids
    :param list_pretokenized_str:
    :param tokenizer:
    :param verbose:
    :param use_gpu:
    :return:
    """
    all_tokenized_ls = [
        tokenizer.tokenize_origin(inp, ) for inp in list_pretokenized_str
    ]
    tokenized_ls = [tup[0] for tup in all_tokenized_ls]

    aligned_index = [tup[1] for tup in all_tokenized_ls]
    segments_ids = [[0 for _ in range(len(tokenized))]
                    for tokenized in tokenized_ls]

    printing("DATA : bpe tokenized {} , {} {} ",
             var=[tokenized_ls,
                  len(tokenized_ls),
                  len(tokenized_ls[0])],
             verbose=verbose,
             verbose_level="raw_data")
    printing("DATA : bpe tokenized {} , {} {} ",
             var=[tokenized_ls,
                  len(tokenized_ls),
                  len(tokenized_ls[0])],
             verbose=verbose,
             verbose_level="alignement")
    ids_ls = [tokenizer.convert_tokens_to_ids(inp) for inp in tokenized_ls]
    max_sent_len = max([len(inp) for inp in tokenized_ls])
    ids_padded = [
        inp + [PAD_ID_BERT for _ in range(max_sent_len - len(inp))]
        for inp in ids_ls
    ]
    aligned_index_padded = [[e for e in inp] +
                            [1000 for _ in range(max_sent_len - len(inp))]
                            for inp in aligned_index]
    segments_padded = [
        inp + [PAD_ID_BERT for _ in range(max_sent_len - len(inp))]
        for inp in segments_ids
    ]

    if word_norm_not_norm is not None:
        mask = mask_group(word_norm_not_norm,
                          bpe_aligned_index=aligned_index_padded)
    else:
        mask = [[1 for _ in inp] + [0 for _ in range(max_sent_len - len(inp))]
                for inp in segments_ids]
    mask = torch.LongTensor(mask)
    tokens_tensor = torch.LongTensor(ids_padded)
    segments_tensors = torch.LongTensor(segments_padded)
    if use_gpu:
        mask = mask.cuda()
        tokens_tensor = tokens_tensor.cuda()
        segments_tensors = segments_tensors.cuda()

    printing("DATA {}", var=[tokens_tensor], verbose=verbose, verbose_level=3)

    sanity_check_data_len(tokens_tensor,
                          segments_tensors,
                          tokenized_ls,
                          aligned_index,
                          raising_error=True)

    return tokens_tensor, segments_tensors, tokenized_ls, aligned_index_padded, mask
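# Standalone sketch of the padding step above : id sequences are padded to the longest
# sentence and a matching attention mask is built (ids and pad id 0 are illustrative).
import torch

_demo_ids_ls = [[101, 7592, 102], [101, 7592, 2088, 999, 102]]
_demo_max_len = max(len(inp) for inp in _demo_ids_ls)
_demo_ids_padded = [inp + [0] * (_demo_max_len - len(inp)) for inp in _demo_ids_ls]
_demo_mask = [[1] * len(inp) + [0] * (_demo_max_len - len(inp)) for inp in _demo_ids_ls]
print(torch.LongTensor(_demo_ids_padded).shape, torch.LongTensor(_demo_mask).shape)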
def outputing_raw_data_from_iterator(words, word_norm, chars, chars_norm,
                                     word_norm_not_norm, pos, verbose,
                                     print_raw, normalization, char_dictionary,
                                     word_dictionary, word_norm_dictionary,
                                     pos_dictionary):
    """
    printing real data on the fly for debugging, data sanity check, ...
    TODO : may factorize a few things here
    :param words:
    :param word_norm:
    :param chars:
    :param chars_norm:
    :param word_norm_not_norm:
    :param pos:
    :param verbose:
    :param print_raw:
    :param normalization:
    :param char_dictionary:
    :param word_dictionary:
    :param word_norm_dictionary:
    :param pos_dictionary:
    :return:
    """
    _verbose = verbose if isinstance(verbose, int) else 0
    if print_raw:
        _verbose = 5

    if _verbose >= 5:
        if word_norm_not_norm is not None:
            character_display = [
                " ".join([
                    char_dictionary.get_instance(chars[sent, word_ind, char_i])
                    for char_i in range(chars.size(2))
                ]) + " | NORM : {} |SENT {} WORD {}| ".format(
                    word_norm_not_norm[sent, word_ind], sent, word_ind)
                for ind_sent, sent in enumerate(range(chars.size(0)))
                for ind_w, word_ind in enumerate(range(chars.size(1)))
            ]
        else:
            character_display = [
                " ".join([
                    char_dictionary.get_instance(chars[sent, word_ind, char_i])
                    for char_i in range(chars.size(2))
                ]) for ind_sent, sent in enumerate(range(chars.size(0)))
                for ind_w, word_ind in enumerate(range(chars.size(1)))
            ]

        if word_norm is not None:
            assert word_norm_dictionary is not None
            word_norm_display = " ".join([
                word_norm_dictionary.get_instance(word_norm[sent, word_ind])
                for word_ind in range(word_norm.size(1))
                for sent in range(word_norm.size(0))
            ])
        else:
            print("No word level normalized word (only char)")
            word_norm_display = ["NONE"]

        word_display = [
            word_dictionary.get_instance(words[batch, word_ind]) + " "
            for batch in range(chars.size(0))
            for word_ind in range(chars.size(1))
        ]

        if pos_dictionary is not None:
            pos_display = [
                pos_dictionary.get_instance(pos[batch, 0]) + " "
                for batch in range(chars.size(0))
            ]
        else:
            pos_display = None

    else:
        word_display = []
        character_display = []
        pos_display = []
    if not normalization and chars is not None:
        chars_norm = chars.clone()

    # TODO add word_norm
    if _verbose >= 5:
        if word_norm_not_norm is not None:
            character_norm_display = [
                " ".join([
                    char_dictionary.get_instance(chars_norm[sent, word_ind,
                                                            char_i])
                    for char_i in range(chars_norm.size(2))
                ]) + "|  NORM : {} |SENT {} WORD {}| \n ".format(
                    word_norm_not_norm[sent, word_ind], sent, word_ind)
                for ind_sent, sent in enumerate(range(chars_norm.size(0)))
                for ind_w, word_ind in enumerate(range(chars_norm.size(1)))
            ]
        else:
            character_norm_display = [
                " ".join([
                    char_dictionary.get_instance(chars_norm[sent, word_ind,
                                                            char_i])
                    for char_i in range(chars_norm.size(2))
                ]) for ind_sent, sent in enumerate(range(chars_norm.size(0)))
                for ind_w, word_ind in enumerate(range(chars_norm.size(1)))
            ]
        printing(
            "Feeding source characters {} \n ------ Target characters {}  "
            "(NB : the character vocabulary is the same at input and output)",
            var=(character_display, character_norm_display),
            verbose=_verbose,
            verbose_level=5)
        printing("Feeding source words {} ",
                 var=[word_display],
                 verbose=_verbose,
                 verbose_level=5)
        printing("Feeding Word normalized (word level) {}",
                 var=[word_norm_display],
                 verbose=_verbose,
                 verbose_level=5)
        printing("Feeding source pos {} ",
                 var=[pos_display],
                 verbose=_verbose,
                 verbose_level=5)
        if chars is not None and chars_norm is not None:
            printing("TYPE {} char before batch chars_norm {} ",
                     var=(chars.is_cuda, chars_norm.is_cuda),
                     verbose=verbose,
                     verbose_level=5)
def make_bert_multitask(pretrained_model_dir,
                        tasks,
                        num_labels_per_task,
                        init_args_dir,
                        mask_id,
                        encoder=None,
                        args=None):
    assert num_labels_per_task is not None and isinstance(num_labels_per_task, dict), \
        "ERROR : num_labels_per_task {} should be a dictionary".format(num_labels_per_task)
    assert isinstance(tasks, list) and len(
        tasks) >= 1, "ERROR tasks {} should be a list of len >=1".format(tasks)

    if init_args_dir is None:
        if pretrained_model_dir is None:

            pretrained_model_dir = args.bert_model
        # assert args.output_attentions is None or not args.output_attentions, "ERROR not supported "

        multitask_wrapper = BertMultiTask

        def get_state_dict_mapping(model):
            if model.startswith("xlm") or model.startswith(
                    "rob") or model.startswith("camembert"):
                return {
                    "roberta":
                    "encoder",  # "lm_head":,
                    "lm_head.decoder":
                    "head.mlm.predictions.decoder",
                    "lm_head.dense":
                    "head.mlm.predictions.transform.dense",
                    "lm_head.bias":
                    "head.mlm.predictions.bias",
                    "lm_head.layer_norm":
                    "head.mlm.predictions.transform.LayerNorm"
                }
            elif model.startswith("bert") or model.startswith(
                    "cahya") or model.startswith("KB"):
                return {"bert": "encoder", "cls": "head.mlm"}
            elif model.startswith("asafaya"):
                return {"bert": "encoder", "cls": "head.mlm"}
            else:
                raise (Exception(
                    f"model {model} not supported by {multitask_wrapper} : a state_dict key mapping needs to be defined"
                ))

        state_dict_mapping = get_state_dict_mapping(args.bert_model)

        model = multitask_wrapper.from_pretrained(
            pretrained_model_dir,
            tasks=tasks,
            mask_id=mask_id,
            output_attentions=args.output_attentions,
            output_hidden_states=args.output_all_encoded_layers,
            output_hidden_states_per_head=args.output_hidden_states_per_head,
            hard_skip_attention_layers=args.hard_skip_attention_layers,
            hard_skip_all_layers=args.hard_skip_all_layers,
            hard_skip_dense_layers=args.hard_skip_dense_layers,
            num_labels_per_task=num_labels_per_task,
            mapping_keys_state_dic=
            state_dict_mapping,  #DIR_2_STAT_MAPPING[pretrained_model_dir],
            encoder=eval(encoder) if encoder is not None else BertModel,
            dropout_classifier=args.dropout_classifier,
            hidden_dropout_prob=args.hidden_dropout_prob,
            random_init=args.random_init,
            load_params_only_ls=None,
            not_load_params_ls=args.not_load_params_ls)

    elif init_args_dir is not None:
        assert pretrained_model_dir is not None, "ERROR model_dir is needed here for reloading"
        init_args_dir = get_init_args_dir(init_args_dir)
        args_checkpoint = json.load(open(init_args_dir, "r"))
        assert "checkpoint_dir" in args_checkpoint, "ERROR checkpoint_dir not in {} ".format(
            args_checkpoint)

        checkpoint_dir = args_checkpoint["checkpoint_dir"]
        assert os.path.isfile(
            checkpoint_dir), "ERROR checkpoint {} not found ".format(
                checkpoint_dir)

        # redefining model and reloading
        def get_config_bert(bert_model, config_file_name="bert_config.json"):
            model_dir = BERT_MODEL_DIC[bert_model]["model"]
            #tempdir = tempfile.mkdtemp()
            #print("extracting archive file {} to temp dir {}".format(model_dir, tempdir))
            #with tarfile.open(model_dir, 'r:gz') as archive:
            #    archive.extractall(tempdir)
            #serialization_dir = tempdir
            serialization_dir = None
            config_file = os.path.join(model_dir, config_file_name)
            try:
                assert os.path.isfile(
                    config_file
                ), "ERROR {} not a file , extracted from {} : dir includes {} ".format(
                    config_file, model_dir,
                    [x[0] for x in os.walk(serialization_dir)])
            except Exception as e:
                config_file = os.path.join(model_dir, "config.json")
                assert os.path.isfile(config_file), "ERROR {} not found".format(config_file)
            return config_file

        config_file = get_config_bert(
            args_checkpoint["hyperparameters"]["bert_model"])
        encoder = eval(BERT_MODEL_DIC[args_checkpoint["hyperparameters"]
                                      ["bert_model"]]["encoder"])
        config = BertConfig(
            config_file,
            output_attentions=args.output_attentions,
            output_hidden_states=args.output_all_encoded_layers,
            output_hidden_states_per_head=args.output_hidden_states_per_head)
        # hard-coded to the multilingual BERT vocabulary size
        config.vocab_size = 119547

        model = BertMultiTask(
            config=config,
            tasks=[
                task for tasks in args_checkpoint["hyperparameters"]["tasks"]
                for task in tasks
            ],
            num_labels_per_task=args_checkpoint["info_checkpoint"]
            ["num_labels_per_task"],
            encoder=encoder,
            mask_id=mask_id)
        printing("MODEL : loading model from checkpoint {}",
                 var=[checkpoint_dir],
                 verbose=1,
                 verbose_level=1)
        model.load_state_dict(
            torch.load(checkpoint_dir,
                       map_location=lambda storage, loc: storage))
        model.append_extra_heads_model(downstream_tasks=tasks,
                                       num_labels_dic_new=num_labels_per_task)
    else:
        raise (Exception(
            "only one of pretrained_model_dir checkpoint_dir can be defined "))

    return model
def focused_masking(masking_strategy, input_tokens_tensor,
                    output_tokens_tensor_aligned, dropout_input_bpe,
                    mask_token_index, sep_token_index, use_gpu, epoch, n_epoch,
                    portion_mask, input_mask, tokenizer, verbose):

    if masking_strategy in ["mlm", "mlm_need_norm"]:

        dropout = 0.15
        assert dropout_input_bpe == 0., "in masking_strategy mlm dropout_input_bpe should be 0 : dropout is hardcoded to {}".format(
            dropout)
        # standart_mlm means : standard MLM prediction
        standart_mlm = True
        # unmask_loss : bool, do we also compute the loss on tokens other than the MASKed ones
        unmask_loss = portion_mask
        if masking_strategy == "mlm_need_norm":
            # in the mlm_need_norm strategy : portion_mask% of the time we learn as a standard mlm, the rest
            # of the time we do the same but only on need_norm tokens (masking them)
            standart_mlm = np.random.random() < portion_mask
            # we force unmask_loss to 0
            unmask_loss = 0
        if standart_mlm:
            # standard mlm
            input_tokens_tensor, mask_dropout, dropout_applied = dropout_input_tensor(
                input_tokens_tensor,
                mask_token_index,
                sep_token_index=sep_token_index,
                applied_dropout_rate=0.8,
                dropout=dropout)
        elif masking_strategy == "mlm_need_norm" and not standart_mlm:
            # todo : factorize
            feeding_the_model_with_label = output_tokens_tensor_aligned.clone()
            # we only learn on tokens that are different from gold
            feeding_the_model_with_label[input_tokens_tensor ==
                                         output_tokens_tensor_aligned] = -1
            if np.random.random() < 0.85:
                # 85% of the time we mask the tokens as in standard mlm
                input_tokens_tensor[
                    input_tokens_tensor !=
                    output_tokens_tensor_aligned] = mask_token_index
            else:
                # within the 15% rest : 50% of the time we replace by random 50% we keep
                if np.random.random() < 0.5:
                    permute = (torch.randperm(
                        torch.tensor(len(tokenizer.vocab) - 2)
                    )[:len(input_tokens_tensor[
                        input_tokens_tensor != output_tokens_tensor_aligned])]
                               + 1)
                    permute[permute == sep_token_index] = sep_token_index + 10
                    permute[permute ==
                            mask_token_index] = mask_token_index + 10
                    permute[permute == 0] = 53
                    if use_gpu:
                        permute = permute.cuda()
                    input_tokens_tensor[input_tokens_tensor !=
                                        output_tokens_tensor_aligned] = permute
            mask_dropout = (
                input_tokens_tensor == output_tokens_tensor_aligned)

        if standart_mlm and not dropout_applied:
            random_bpe_instead = np.random.random() < 0.5
            if random_bpe_instead:
                permute = (
                    torch.randperm(torch.tensor(len(tokenizer.vocab) - 2))
                    [:len(input_tokens_tensor[mask_dropout == 0])] + 1)
                permute[permute == sep_token_index] = sep_token_index + 10
                permute[permute == mask_token_index] = mask_token_index + 10
                permute[permute == 0] = 53
                if use_gpu:
                    permute = permute.cuda()

                input_tokens_tensor[mask_dropout == 0] = permute

        if unmask_loss:
            print(
                "WARNING : unmask_loss is {} : 0 means optimizing only on the MASKed tokens, "
                "> 0 means also optimizing on extra tokens sampled based on dropout_adapted".format(
                    unmask_loss))
            power = 3
            capped = 0.5
            dropout_adapted = min(((epoch + 1) / n_epoch)**power, capped)
            printing(
                "LABEL NOT MASKING {}/1 of gold labels with power {} and capped {}"
                .format(dropout_adapted, power, capped),
                verbose=verbose,
                verbose_level=2)
            _, mask_losses = dropout_input_tensor(
                input_tokens_tensor,
                mask_token_index,
                sep_token_index=sep_token_index,
                apply_dropout=False,
                dropout=dropout_adapted)
            # we backpropagate only on tokens that receive a mask (MLM objective) +
            # some extra ones that we control with dropout_adapted
            mask_loss = mask_dropout * mask_losses
        else:
            mask_loss = mask_dropout
        feeding_the_model_with_label = output_tokens_tensor_aligned.clone()
        feeding_the_model_with_label[mask_loss != 0] = -1
        # half the time we actually mask those tokens, otherwise we predict
    elif masking_strategy in ["norm_mask", "norm_mask_variable"]:
        if masking_strategy == "norm_mask_variable":
            # args.portion_mask = min(((epoch + 1) / n_epoch), 0.6)
            portion_mask = 1 - (epoch + 1) / n_epoch  # , 0.6))
        mask_normed = np.random.random() < portion_mask
        feeding_the_model_with_label = output_tokens_tensor_aligned.clone()
        if mask_normed:
            print("MASKING NORMED in mode {} portion mask {}".format(
                masking_strategy, portion_mask))
            feeding_the_model_with_label[input_tokens_tensor ==
                                         output_tokens_tensor_aligned] = -1
            if np.random.random() < 0.5:
                # half the time we mask not to make the model only normalizing
                input_tokens_tensor[
                    input_tokens_tensor !=
                    output_tokens_tensor_aligned] = mask_token_index
    else:
        feeding_the_model_with_label = output_tokens_tensor_aligned.clone()
        # TODO -- handle loggin of output_tokens_tensor_aligned everywhere
        printing("MASK mask:{} \nMASK input:{} \nMASK output:{}",
                 var=[
                     input_mask, input_tokens_tensor,
                     feeding_the_model_with_label
                 ],
                 verbose_level="raw_data",
                 verbose=verbose)

    return input_tokens_tensor, feeding_the_model_with_label
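# Standalone sketch of the mlm-style corruption used above (vocabulary size, mask id
# and corruption rates are illustrative) : ~15% of positions are corrupted, the loss
# is restricted to those positions, and corrupted positions are mostly masked,
# otherwise replaced by a random id.
import torch

torch.manual_seed(0)
_vocab_size, _mask_id = 1000, 103
_input_ids = torch.randint(5, _vocab_size, (2, 8))
_labels = _input_ids.clone()
_corrupt = torch.rand(_input_ids.shape) < 0.15
_labels[~_corrupt] = -1                    # loss only on corrupted positions
_use_mask = torch.rand(_input_ids.shape) < 0.8
_input_ids[_corrupt & _use_mask] = _mask_id
_random_ids = torch.randint(5, _vocab_size, _input_ids.shape)
_input_ids[_corrupt & ~_use_mask] = _random_ids[_corrupt & ~_use_mask]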
def args_preprocessing(args, verbose=1):
    """
    sanity checking , changing types of arguments and parsing arguments
    """

    args.tasks = [task_simul.split(",") for task_simul in args.tasks]

    if args.hard_skip_dense_layers is None or args.hard_skip_dense_layers == "None":
        args.hard_skip_dense_layers = []
    else:
        args.hard_skip_dense_layers = args.hard_skip_dense_layers.split(",")
        assert len(args.hard_skip_dense_layers) > 0

    if args.hard_skip_attention_layers is None or args.hard_skip_attention_layers == "None":
        args.hard_skip_attention_layers = []
    else:
        args.hard_skip_attention_layers = args.hard_skip_attention_layers.split(
            ",")
        assert len(args.hard_skip_attention_layers) > 0

    if args.hard_skip_all_layers is None or args.hard_skip_all_layers == "None":
        args.hard_skip_all_layers = []
    else:
        args.hard_skip_all_layers = args.hard_skip_all_layers.split(",")
        assert len(args.hard_skip_all_layers) > 0

    if args.prune_heads is not None and args.prune_heads != "None":
        prune_heads_ls = args.prune_heads.split(",")[:-1]
        assert len(prune_heads_ls) > 0
        for layer in prune_heads_ls:
            parsed_layer_to_prune = layer.split("-")
            assert parsed_layer_to_prune[
                0] == "prune_heads", f"ERROR {parsed_layer_to_prune} layer arg: {layer} prune_heads_ls {prune_heads_ls} args.prune_heads {args.prune_heads}"
            assert parsed_layer_to_prune[
                1] == "layer", f"ERROR {parsed_layer_to_prune}"
            assert parsed_layer_to_prune[
                3] == "heads", f"ERROR {parsed_layer_to_prune}"
            try:
                int(parsed_layer_to_prune[2])
                heads = parsed_layer_to_prune[4]
                head_index_ls = heads.split("_")
                heads_ls = [int(index) for index in head_index_ls]
            except Exception as e:
                print(f"Error parsing prune_heads argument {e}")

    if isinstance(args.schedule_lr, str) and args.schedule_lr == "None":
        args.schedule_lr = None

    if args.batch_size != "flexible":
        args.batch_size = int(args.batch_size)

    if args.low_memory_foot_print_batch_mode is not None and args.low_memory_foot_print_batch_mode != "flexible_forward_batch_size":
        args.low_memory_foot_print_batch_mode = int(
            args.low_memory_foot_print_batch_mode)
    low_memory_foot_print_batch_mode_available = [
        0, 1, "flexible_forward_batch_size"
    ]

    if args.not_load_params_ls is not None:
        args.not_load_params_ls = args.not_load_params_ls.split(",")[:-1]
    assert args.low_memory_foot_print_batch_mode is None or args.low_memory_foot_print_batch_mode in low_memory_foot_print_batch_mode_available, "ERROR args.low_memory_foot_print_batch_mode {} should be in {}".format(
        args.low_memory_foot_print_batch_mode,
        low_memory_foot_print_batch_mode_available)

    if args.low_memory_foot_print_batch_mode:
        args.batch_update_train = args.batch_size
        args.batch_size = "flexible" if args.low_memory_foot_print_batch_mode == "flexible_forward_batch_size" else 2
        printing(
            "INFO : args.low_memory_foot_print_batch_mode {} "
            "so setting batch_size to {} and args.batch_update_train {}",
            var=[
                args.low_memory_foot_print_batch_mode, args.batch_size,
                args.batch_update_train
            ],
            verbose=verbose,
            verbose_level=1)

        if args.low_memory_foot_print_batch_mode != "flexible_forward_batch_size":
            assert args.batch_update_train % args.batch_size == 0 and \
                args.batch_update_train // args.batch_size > 0, \
                "ERROR batch_update_train {} should be a positive multiple of batch_size {}".format(
                    args.batch_update_train, args.batch_size)
        printing(
            "INFO iterator : updating with {} equivalent batch size : forward pass is {} batch size",
            var=[args.batch_update_train, args.batch_size],
            verbose=verbose,
            verbose_level=1)
    else:
        args.batch_update_train = args.batch_size
    params = vars(args)
    args.lr = parse_argument_dictionary(params["lr"], hyperparameter="lr")

    if args.test_paths is not None:
        args.test_paths = [
            test_path_task.split(",") for test_path_task in args.test_paths
        ]

    if args.dev_path is not None:
        args.dev_path = [
            dev_path_task.split(",") for dev_path_task in args.dev_path
        ]

    if args.ponderation_per_layer is not None:
        args.ponderation_per_layer = parse_argument_dictionary(
            params["ponderation_per_layer"],
            hyperparameter="ponderation_per_layer")
    if args.norm_order_per_layer is not None:
        args.norm_order_per_layer = parse_argument_dictionary(
            params["norm_order_per_layer"],
            hyperparameter="norm_order_per_layer")

    if args.test_paths is not None:
        assert isinstance(args.test_paths, list) and isinstance(
            args.test_paths[0], list), "ERROR args.test_paths should be a list"
    # 1 simultaneous set of tasks per training dataset
    assert len(args.tasks) == len(
        args.train_path
    ), "ERROR args.tasks is {} but train paths are {}".format(
        args.tasks, args.train_path)

    assert args.penalization_mode in AVAILALE_PENALIZATION_MODE, "ERROR args.penalization_mode {} should be in {}".format(
        args.penalization_mode, AVAILALE_PENALIZATION_MODE)

    if args.multi_task_loss_ponderation is not None:
        argument_as_string = args.multi_task_loss_ponderation
        assert args.tasks is not None
        tasks = [task for tasks in args.tasks for task in tasks]
        # should add test on task X label calling task setting
        for task in tasks:
            if task != "all":
                for label in TASKS_PARAMETER[task]["label"]:
                    pattern = "{}-{}=([^=]*),".format(task, label)
                    match = re.search(pattern, argument_as_string)
                    assert match is not None, "ERROR : pattern {} not found for task {} in argument_as_string {}  ".format(
                        pattern, task, argument_as_string)

    if args.bert_model is not None:
        try:
            assert args.bert_model in BERT_MODEL_DIC, "ERROR args.bert_model {} should be in {}".format(
                args.bert_model, BERT_MODEL_DIC.keys())
        except Exception as e:
            print(f"Will load model and tokenization from transformers {e}")

    return args
def from_bpe_token_to_str(bpe_tensor,
                          topk,
                          pred_mode,
                          null_token_index,
                          null_str,
                          task,
                          tokenizer=None,
                          bpe_tensor_src=None,
                          pos_dictionary=None,
                          label="normalize",
                          label_dictionary=None,
                          mask_index=None,
                          get_bpe_string=False,
                          verbose=1):
    """
    it actually supports not only bpe tokens but also pos tokens
    pred_mode also allows handling gold data (which only has 2 dims and not 3)
    :param bpe_tensor:
    :param topk: int : number of top predictions : arranged as all the top-1, then all the top-2, then all the top-3...
    :param pred_mode: bool
    :return:
    """
    assert label is not None or get_bpe_string, \
        "ERROR : task {} get_string {} : one of them should be defined or True".format(label, get_bpe_string)
    if task == "mlm" and pred_mode:
        assert bpe_tensor_src is not None and mask_index is not None, "ERROR bpe_tensor_src is needed to get not-predicted token as well as mask_index "
        predictions_topk_ls = [[[
            bpe_tensor[sent, word, top].item() if bpe_tensor_src[sent,
                                                                 word].item()
            == mask_index else bpe_tensor_src[sent, word].item()
            for word in range(bpe_tensor.size(1))
        ] for sent in range(bpe_tensor.size(0))] for top in range(topk)]
    else:
        predictions_topk_ls = [[[
            bpe_tensor[sent, word,
                       top].item() if pred_mode else bpe_tensor[sent,
                                                                word].item()
            for word in range(bpe_tensor.size(1))
        ] for sent in range(bpe_tensor.size(0))] for top in range(topk)]

    # here all labels that require the tokenizer (should factorize it in some way)
    if get_bpe_string:  #label in ["normalize", "mwe_prediction", "input_masked"] or
        assert tokenizer is not None
        # requires task-specific handling here : for mlm the only predictions we are interested in are the masked positions
        # RM , special_extra_token=null_token_index, special_token_string=null_str
        sent_ls_top = [[
            tokenizer.convert_ids_to_tokens(sent_bpe)
            for sent_bpe in predictions_topk
        ] for predictions_topk in predictions_topk_ls]

        printing("DATA : bpe string again {}",
                 var=[sent_ls_top],
                 verbose=verbose,
                 verbose_level="raw_data")
    else:
        dictionary = label_dictionary

        if label_dictionary == "index":
            sent_ls_top = [[[token_ind for token_ind in sent_bpe]
                            for sent_bpe in predictions_topk]
                           for predictions_topk in predictions_topk_ls]
        else:
            try:
                sent_ls_top = [[[
                    dictionary.instances[token_ind -
                                         1] if token_ind > 0 else "UNK"
                    for token_ind in sent_bpe
                ] for sent_bpe in predictions_topk]
                               for predictions_topk in predictions_topk_ls]
            # adding more information about the exe
            except Exception as e:
                print(
                    "{} : dictionary : {} and prediction {} (POS specificity was removed )"
                    .format(e, dictionary.instances, predictions_topk_ls))
                raise (e)

    if not pred_mode:
        sent_ls_top = sent_ls_top[0]

    return sent_ls_top
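# Minimal usage sketch (hypothetical dictionary and indices, not part of the original module) :
# defining it here only documents the expected shapes ; call it manually to check them.
def _example_from_bpe_token_to_str():
    import torch
    from collections import namedtuple

    FakeDictionary = namedtuple("FakeDictionary", "instances")
    pos_dictionary = FakeDictionary(instances=["NOUN", "VERB", "DET"])  # prediction index 0 maps to "UNK"
    # 1 sentence, 3 words, topk=1 : values are 1-based indices into `instances`
    predictions = torch.tensor([[[1], [2], [3]]])
    out = from_bpe_token_to_str(predictions, topk=1, pred_mode=True,
                                null_token_index=0, null_str="<null>", task="pos",
                                label="pos", label_dictionary=pos_dictionary)
    # out == [[["NOUN", "VERB", "DET"]]] : topk x sentences x words
    return out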
def write_conll(format, dir_normalized, dir_original, src_text_ls, text_decoded_ls,
                src_text_pos, pred_pos_ls, tasks, inverse=False,permuting_mode=None, cp_paste=False, sep_token=None, cls_token=None,
                ind_batch=0, new_file=False, cut_sent=False, verbose=0):
    assert format in ["conll"]
    #assert len(tasks) == 1, "ERROR : only supported so far 1 task at a time"

    if tasks[0] == "normalize":
        src_ls = src_text_ls
        pred_ls = text_decoded_ls
        if text_decoded_ls is None:
            assert permuting_mode is not None or cp_paste
            pred_ls = src_text_ls
    elif tasks[0] == "pos":
        src_ls = src_text_pos
        pred_ls = pred_pos_ls
    if format == "conll":
        mode_write = "w" if new_file else "a"
        if new_file:
            printing("CREATING NEW FILE (io_/dat/normalized_writer) : {} ", var=[dir_normalized], verbose=verbose, verbose_level=1)
        with open(dir_normalized, mode_write) as norm_file:
            with open(dir_original, mode_write) as original:
                len_original = 0
                for ind_sent, (original_sent, normalized_sent) in enumerate(zip(src_ls, pred_ls)):
                    try:
                        assert len(original_sent) == len(normalized_sent), "WARNING : (writer) original_sent len {} {} \n  " \
                                                                           "normalized_sent len {} {} ".format(len(original_sent), original_sent, len(normalized_sent), normalized_sent)
                    except AssertionError as e:
                        print(e)
                        if len(original_sent) > len(normalized_sent):
                            normalized_sent.extend(["UNK" for _ in range(len(original_sent)-len(normalized_sent))])
                            print("WARNING (writer) : original larger than prediction : so appending UNK token for writing")
                        else:
                            print("WARNING (writer) : original smaller than prediction ! ")

                    norm_file.write("#\n")
                    original.write("#\n")
                    norm_file.write("#sent_id = {} \n".format(ind_sent+ind_batch+1))
                    original.write("#sent_id = {} \n".format(ind_sent+ind_batch+1))
                    ind_adjust = 0

                    if permuting_mode == "sample_mode":
                        noise_level_sentence = np.random.random(1)[0]

                    for ind, (original_token, normalized_token) in enumerate(zip(original_sent,
                                                                                 normalized_sent)):
                        # WE REMOVE SPECIAL TOKENS ONLY IF THEY APPEAR AT THE BEGINNING OR AT THE END
                        # on the source token !! (it tells us when we stop) (we never want to use gold information)
                        max_len_word = max(len(original_token), len_original)
                        if (original_token in SPECIAL_TOKEN_LS or original_token in [cls_token, sep_token]) and (ind+1 == len(original_sent) or ind == 0):
                            ind_adjust = 1
                            continue

                        if permuting_mode == "sample_mode":
                            # for 20% of sentences we apply an 80% noise level ; in the other 80% of cases only a 20% noise level
                            rand_word = np.random.random(1)[0]
                            threshold_word = 0.8 if noise_level_sentence < 0.2 else 0.2
                            if rand_word < threshold_word:
                                permuting_mode = np.random.choice(["permute", "double", "random_replace",
                                                                   "multiply_last", "double_last","remove",
                                                                   "remove_last", "z_replace_s"])
                            #print("PERMUTATION is ", permuting_mode, rand_word, APPLY_PERMUTE_WORD,noise_level_sentence)

                        else:
                            rand_word = None

                        # TODO : when we want simultaneous training : assert src_pos and src_norm are the same
                        #   --> assert pred_pos and pred_norm have the same length (number of words) and write
                        if tasks[0] == "normalize":
                            if inverse:
                                assert not cp_paste
                                _original_token = normalized_token
                                _normalized_token = original_token

                            else:
                                _original_token = original_token
                                _normalized_token = normalized_token
                                if permuting_mode is not None:
                                    assert not cp_paste
                                    # rule one
                                    #print("ORIGINAL TOKEN", original_token)
                                    if ( _original_token == _normalized_token or _original_token.lower() == _normalized_token.lower())\
                                        and not (original_token.startswith("#") or original_token.startswith("@")):
                                        # rule 1
                                        if permuting_mode == "z_replace_s" and len(original_token) > 1:
                                            if original_token.endswith("s"):
                                                _original_token = original_token[:-1] + "z"
                                            else:
                                                permuting_mode = np.random.choice(["permute", "double",
                                                                                   "random_replace",
                                                                                   "remove", "remove_last",
                                                                                   "multiply_last","double_last",
                                                                                    "z_replace_s"])

                                        if permuting_mode == "permute" and len(original_token) > 1:
                                            start_index = 0 if not (original_token.startswith("#") or original_token.startswith("@")) else 1
                                            to_permute = np.random.randint(start_index, len(original_token)-1)
                                            second_letter = original_token[to_permute+1]
                                            first_letter = original_token[to_permute]
                                            list_original_token = list(original_token)
                                            #pdb.set_trace()
                                            list_original_token[to_permute] = second_letter
                                            list_original_token[to_permute+1] = first_letter
                                            _original_token = "".join(list_original_token)
                                        # rule 2
                                        if (permuting_mode == "double" or permuting_mode == "remove") and len(original_token) > 1:
                                            start_index = 0
                                            to_double = np.random.randint(start_index, len(original_token)-1)
                                            first_letter = original_token[to_double]
                                            list_original_token = list(original_token)
                                            #pdb.set_trace()
                                            if permuting_mode == "double":
                                                list_original_token = list_original_token[:to_double] + [first_letter] + list_original_token[to_double:]
                                            else:
                                                list_original_token = list_original_token[:to_double] + list_original_token[to_double+1:]

                                            _original_token = "".join(list_original_token)

                                        if permuting_mode == "remove_last" and len(original_token) > 1:
                                            _original_token = _original_token[:-1]
                                        if permuting_mode == "double_last" and len(original_token) > 1:
                                            _original_token = _original_token+_original_token[-1]
                                        if permuting_mode == "random_replace" and len(original_token) > 1:
                                            start_index = 0
                                            to_replace = np.random.randint(start_index, len(original_token) - 1)
                                            random_letter = np.random.choice(list("abcdefghijklmnopqrstuvwxyz"))
                                            first_letter = original_token[to_replace]
                                            list_original_token = list(original_token)
                                            # pdb.set_trace()

                                            list_original_token[to_replace] = random_letter

                                            _original_token = "".join(list_original_token)



                                        #print("NEW TOKEN", permuting_mode, _original_token)

                                    #pdb.set_trace()

                            if cp_paste:
                                _normalized_token = _original_token

                            norm_file.write("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\tNorm={}|\n".format(ind + 1 - ind_adjust,
                                                                                              _original_token,
                                                                                              ind - ind_adjust if ind - ind_adjust > 0 else 0,
                                                                                              _normalized_token))
                        if tasks[0] == "pos":
                            norm_file.write("{}\t{}\t_\t{}\t_\t_\t{}\t_\t_\tNorm=()|\n".format(ind + 1 - ind_adjust,
                                                                                               original_token,
                                                                                               normalized_token,
                                                                                               ind-ind_adjust if ind - ind_adjust > 0 else 0
                                                                                               ))
                        original.write("{}\t{}\t_\t_\t_\t_\t_\t_\t{}\t_\n".format(ind+1,
                                                                                  original_token,
                                                                                  ind - ind_adjust if ind - ind_adjust > 0 else 0))

                        if cut_sent:
                            if ind > 50:
                                break
                    norm_file.write("\n")
                    original.write("\n")
            printing("WRITING predicted batch of {} original and {} normalized",
                     var=[dir_original, dir_normalized], verbose=verbose, verbose_level="raw_data")

    return max_len_word
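# For reference, the normalization task above writes one CoNLL-like 10-column line per token, e.g. (illustrative values) :
#   "1\tsooo\t_\t_\t_\t_\t0\t_\t_\tNorm=so|"
# while the parallel "original" file keeps the raw token with empty prediction columns :
#   "1\tsooo\t_\t_\t_\t_\t_\t_\t0\t_"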
def parse_argument_dictionary(argument_as_string,
                              logits_label=None,
                              hyperparameter="multi_task_loss_ponderation",
                              verbose=1):
    """
    All arguments that are meant to be defined as dictionaries are passed to the Argument Parser as strings
    following the template 'key1=value1,key2=value2,' (matched with "{}=([^=]*),".format(sub))
    All the dictionary arguments are listed in DIC_ARGS
    """
    assert hyperparameter in DIC_ARGS, "ERROR : hyperparameter {} not supported, should be in {}".format(hyperparameter, DIC_ARGS)
    if argument_as_string in MULTI_TASK_LOSS_PONDERATION_PREDEFINED_MODE:
        return argument_as_string
    else:
        dic = OrderedDict()
        if hyperparameter == "multi_task_loss_ponderation":
            assert logits_label is not None
            for task in logits_label:
                # useless (I think)
                if task == "parsing":
                    for sub in ["parsing-heads", "parsing-types"]:
                        pattern = "{}=([^=]*),".format(sub)
                        match = re.search(pattern, argument_as_string)
                        assert match is not None, "ERROR : pattern {} not found for task {} in argument_as_string {}  ".format(
                            pattern, task, argument_as_string)
                        dic[sub] = eval(match.group(1))
                # useless (I think)
                elif task == "normalize":
                    for sub in ["normalize", "append_masks"]:
                        pattern = "{}=([^=]*),".format(sub)
                        match = re.search(pattern, argument_as_string)
                        if sub == "normalize":
                            assert match is not None, "ERROR : pattern {} not found for task {} " \
                                                      "in argument_as_string {}  ".format( pattern, task, argument_as_string)
                            dic[sub] = eval(match.group(1))
                        else:
                            if match is not None:
                                dic[sub] = eval(match.group(1))
                # all cases should be in this one
                if task != "all" and task != "parsing":

                    pattern = "{}=([^=]*),".format(task)
                    match = re.search(pattern, argument_as_string)
                    assert match is not None, "ERROR : pattern {} not found for task {} in argument_as_string {}  ".format(
                        pattern, task, argument_as_string)
                    dic[task] = eval(match.group(1))

            printing("SANITY CHECK : multi_task_loss_ponderation {} ",
                     var=[argument_as_string],
                     verbose_level=3,
                     verbose=verbose)

        elif hyperparameter in [
                "lr", "norm_order_per_layer", "ponderation_per_layer"
        ]:
            # to handle several optimizers
            try:
                assert isinstance(eval(argument_as_string), float)
                return eval(argument_as_string)
            except Exception as e:
                print("Exception", hyperparameter, e)
                argument_as_string = argument_as_string.split(",")
                for arg in argument_as_string[:-1]:
                    # DIFFERENCE WITH ABOVE IS THE COMMA
                    pattern = "([^=]*)=([^=]*)"
                    match = re.search(pattern, arg)
                    assert match is not None, "ERROR : pattern {} not found in argument_as_string {}  ".format(
                        pattern, arg)
                    if hyperparameter in ["lr"]:
                        dic[match.group(1)] = float(match.group(2))
                    elif hyperparameter in ["norm_order_per_layer"]:
                        if match.group(2) != "fro":
                            dic[match.group(1)] = float(match.group(2))
                        else:
                            dic[match.group(1)] = match.group(2)
                    elif hyperparameter in ["ponderation_per_layer"]:
                        dic[match.group(1)] = float(match.group(2))

        return dic
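# Illustrative sketch (hypothetical parameter-group names, not part of the original module) of the
# 'key1=value1,key2=value2,' convention parsed above, e.g. for a per-group learning rate ;
# note the trailing comma, which the parsing above relies on.
def _example_parse_lr_dictionary():
    # a plain float string is simply returned as a float
    flat_lr = parse_argument_dictionary("0.0001", hyperparameter="lr")
    # a dictionary-as-string is parsed into an OrderedDict keyed by group name
    lr_per_group = parse_argument_dictionary("bert=0.00001,classifier=0.0001,", hyperparameter="lr")
    # flat_lr == 0.0001 ; lr_per_group == OrderedDict([("bert", 1e-05), ("classifier", 0.0001)])
    return flat_lr, lr_per_group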
def write_conll_multitask(format, dir_pred, dir_original, src_text_ls,
                          pred_per_task, tasks, task_parameters, cp_paste=False, gold=False,
                          all_indexes=None, sep_token=None, cls_token=None,
                          ind_batch=0, new_file=False, cut_sent=False, verbose=0):

    assert format in ["conll"]
    max_len_word = None
    writing_top = 1
    # assert each task predicts as many samples per batch
    pred_task_len_former = -1
    task_former = ""

    # assertion on number of samples predicted
    for task_label in pred_per_task:

        pred_task_len = len(pred_per_task[task_label]) if gold else len(pred_per_task[task_label][writing_top-1])
        _task = re.match("(.*)-(.*)", task_label)
        if _task is not None:  # , "ERROR writer could not match {}".format(task_label)
            task = _task.group(1)
        else:
            task = task_label
        if pred_task_len_former > 0:
            assert pred_task_len == pred_task_len_former, \
                "ERROR {} and {} task ".format(task_former, task_label)
            if not gold:
                assert pred_task_len == len(src_text_ls[task_parameters[task]["input"]]), "ERROR  src len {} and pred len {} ".format(len(src_text_ls[task_parameters[task]["input"]]),pred_task_len)
            # we check also other input length
            if src_text_ls.get("input_masked") is not None:
                assert pred_task_len == len(src_text_ls["input_masked"])
            if src_text_ls.get("wordpieces_inputs_words") is not None:
                assert pred_task_len == len(src_text_ls["wordpieces_inputs_words"]), "ERROR mismatch source " \
                                                                            "wordpieces_inputs_words {}  " \
                                                                            "and prediction {} ".format(src_text_ls, pred_per_task[task_label])
            if src_text_ls.get("wordpieces_inputs_raw_tokens") is not None:
                assert pred_task_len == len(src_text_ls["wordpieces_inputs_raw_tokens"]), \
                                    "ERROR mismatch source wordpieces_inputs_" \
                                    "raw_tokens {} and prediction {} ".format(src_text_ls, pred_per_task[task_label])
            try:
                assert pred_task_len == all_indexes.shape[0], "ERROR mismatch index {}  and all_indexes {} : pred {}".format(pred_task_len, all_indexes.shape[0], pred_per_task[task_label])
            except:
                pdb.set_trace()
        pred_task_len_former = pred_task_len

        task_former = task_label
        if format == "conll":
            mode_write = "w" if new_file else "a"
        if new_file:
            printing("CREATING NEW FILE (io_/dat/normalized_writer) : {} ", var=[dir_pred], verbose=verbose,
                     verbose_level=1)

    pos_label = "pos-pos" if not gold else "pos"
    types_label = "parsing-types" if not gold else "types"
    heads_label = "parsing-heads" if not gold else "heads"
    n_masks_mwe_label = "n_masks_mwe-n_masks_mwe" if not gold else "n_masks_mwe"
    mwe_detection_label = "mwe_detection-mwe_detection" if not gold else "mwe_detection"

    with open(dir_pred, mode_write) as norm_file:
        with open(dir_original, mode_write) as original:
            len_original = 0
            for ind_sent in range(all_indexes.shape[0]):
                pred_sent = OrderedDict()
                # NB : length assertion for each input-output (correcting if possible)
                # TODO standardize !!  INCONSISTENCIES WHEN GOLD IS TRUE vs FALSE ; IF GOLD : pred_per_task is indexed by labels (no 1-1 relation to task and src !)
                for task_label_or_gold_label in pred_per_task:
                    #task, _, label_processed = get_task_name_based_on_logit_label(task_label, label_processed)
                    if gold:
                        pred_sent[task_label_or_gold_label] = pred_per_task[task_label_or_gold_label][ind_sent]
                    else:
                        pred_sent[task_label_or_gold_label] = pred_per_task[task_label_or_gold_label][writing_top-1][ind_sent]
                    try:
                        # TODO : standardize (this first if is needed because we handle at the same time gold data indexed by label and predictions labelled by task+label)
                        if gold:
                            try:
                                src = src_text_ls[LABEL_PARAMETER[task_label_or_gold_label]["default_input"]][ind_sent]
                            except Exception as e:
                                src = src_text_ls["input_masked"][ind_sent]
                        else:
                            _task = re.match("(.*)-(.*)", task_label_or_gold_label)
                            assert _task is not None#, "ERROR writer could not match {}".format(task_label)
                            _label = _task.group(2)
                            _task = _task.group(1)
                            src = src_text_ls[TASKS_PARAMETER[_task]["input"]][ind_sent]

                        assert len(src) == len(pred_sent[task_label_or_gold_label]),"WARNING : (writer) task {} original_sent len {} {} \n  predicted sent len {} {}".format(task_label_or_gold_label, len(src), src,len(pred_sent[task_label_or_gold_label]), pred_sent[task_label_or_gold_label])
                    except AssertionError as e:
                        print(e)
                        pdb.set_trace()
                        if len(src) > len(pred_sent[task_label_or_gold_label]):
                            pred_sent[task_label_or_gold_label].extend(["UNK" for _ in range(len(src)-len(pred_sent[task_label_or_gold_label]))])
                            print("WARNING (writer) : original larger than prediction : so appending UNK token for writing")
                        else:
                            print("WARNING (writer) : original smaller than prediction for ")

                norm_file.write("#\n")
                original.write("#\n")
                norm_file.write("#sent_id = {} \n".format(ind_sent+ind_batch+1))
                original.write("#sent_id = {} \n".format(ind_sent+ind_batch+1))
                ind_adjust = 0

                #for ind, original_token in enumerate(original_sent):
                last_mwe_index = -1
                adjust_mwe = 0
                for ind in all_indexes[ind_sent, :]:
                    # WE REMOVE SPECIAL TOKENS ONLY IF THEY APPEAR AT THE BEGINNING OR AT THE END
                    # on the source token !! (it tells us when we stop) (we never want to use gold information)
                    if "-" in ind and ind != "-1":
                        matching_mwe_ind = re.match("([0-9]+)-([0-9]+)", str(ind))
                        assert matching_mwe_ind is not None, "ERROR ind is {} : could not found mwe index".format(ind)
                        last_mwe_index = int(matching_mwe_ind.group(2))
                        ind_mwe = int(matching_mwe_ind.group(1))

                        original_token = src_text_ls["wordpieces_inputs_raw_tokens"][ind_sent][ind_mwe] if mwe_detection_label in pred_per_task or "wordpieces_inputs_words" in pred_per_task or n_masks_mwe_label in pred_per_task else "NOT_NEEDED"
                        adjust_mwe += (last_mwe_index-ind_mwe)
                        #assert ind_adjust == 0, "ERROR not supported"

                        mwe_meta = "Norm={}|mwe_detection={}|n_masks_mwe={}".format("_", pred_sent[mwe_detection_label][ind_mwe] if mwe_detection_label in pred_per_task else "_",
                                                                                    pred_sent[n_masks_mwe_label][ind_mwe] if n_masks_mwe_label in pred_per_task else "_")

                        norm_file.write("{index}\t{original}\t_\t{pos}\t_\t_\t{dep}\t_\t{types}\t{norm}\n".format(index=ind, original=original_token, pos="_", types="_", dep="_", norm=mwe_meta))
                        original.write("{}\t{}\t_\t_\t_\t_\t_\t_\t{}\t_\n".format(ind, original_token, "_"))
                        continue
                    else:
                        ind = int(ind)
                        try:
                            if "normalize" in [task for _tasks in tasks for task in _tasks]:

                                original_token = src_text_ls["wordpiece_words_src_aligned_with_norm"][ind_sent][ind]
                                original_pretokenized_field = "wordpiece_words_src_aligned_with_norm"
                            else:
                                original_token = src_text_ls["wordpieces_inputs_words"][ind_sent][ind]
                                original_pretokenized_field = "wordpieces_inputs_words"
                        except Exception as e:
                            original_token = src_text_ls["input_masked"][ind_sent][ind]
                            original_pretokenized_field = "input_masked"
                        # asserting that we have everything together on the source side
                        if ind > last_mwe_index:
                            if src_text_ls.get("wordpieces_inputs_raw_tokens") is not None:
                                try:
                                    assert src_text_ls[original_pretokenized_field][ind_sent][ind] == src_text_ls["wordpieces_inputs_raw_tokens"][ind_sent][ind-adjust_mwe], \
                                    "ERROR sequence {} on non-mwe tokens : raw and tokenized " \
                                    "should be same but are raw {} tokenized {}".format(original_pretokenized_field, src_text_ls["wordpieces_inputs_raw_tokens"][ind_sent][ind],
                                                                                        src_text_ls[original_pretokenized_field][ind_sent][ind+adjust_mwe])
                                except:
                                    print("WARNING sanity checking input failed (nomalized_writer) (might be due to dropout) {}".format(e))
                    max_len_word = max(len(original_token), len_original)
                    #if original_token in SPECIAL_TOKEN_LS and (ind+1 == len(original_sent) or ind == 0):
                    if (original_token in SPECIAL_TOKEN_LS or original_token in [cls_token, sep_token]):
                        # ind 0 is skipped because it corresponds to CLS
                        ind_adjust = 1
                        continue

                    pos = pred_sent[pos_label][ind] if pos_label in pred_per_task else "_"
                    types = pred_sent[types_label][ind] if types_label in pred_per_task else "_"
                    heads = pred_sent[heads_label][ind] if heads_label in pred_per_task else ind - 1

                    tenth_col = "Norm={}|mwe_detection={}|n_masks_mwe={}".format(pred_sent["normalize"][ind] if "normalize" in pred_per_task else "_",
                                                                                 pred_sent[mwe_detection_label][ind-adjust_mwe] if mwe_detection_label in pred_per_task else "_",
                                                                                 pred_sent[n_masks_mwe_label][ind-adjust_mwe] if n_masks_mwe_label in pred_per_task else "_")

                    norm_file.write("{index}\t{original}\t_\t{pos}\t_\t_\t{dep}\t_\t{types}\t{norm}\n".format(index=ind, original=original_token, pos=pos, types=types, dep=heads, norm=tenth_col))
                    original.write("{}\t{}\t_\t_\t_\t_\t_\t_\t{}\t_\n".format(ind, original_token, ind-1))
                    if cut_sent:
                        if ind > 50:
                            break
                        print("CUTTING SENT index {}>50 ".format(ind))
                norm_file.write("\n")
                original.write("\n")
        printing("WRITING predicted batch of {} original and {} normalized", var=[dir_original, dir_pred], verbose=verbose, verbose_level=2)
    assert max_len_word is not None, "ERROR : something went wrong in the writer"
    return max_len_word
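# NB (illustrative note) : in write_conll_multitask, prediction dictionaries are keyed by "task-label"
# strings (e.g. "pos-pos", "parsing-heads") while gold dictionaries are keyed by the bare label
# (e.g. "pos", "heads"), which is why the *_label variables above switch on the `gold` flag.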
def run(args,
        n_observation_max_per_epoch_train,
        vocab_size,
        model_dir,
        voc_tokenizer,
        auxilliary_task_norm_not_norm,
        null_token_index,
        null_str,
        tokenizer,
        n_observation_max_per_epoch_dev_test=None,
        run_mode="train",
        dict_path=None,
        end_predictions=None,
        report=True,
        model_suffix="",
        description="",
        saving_every_epoch=10,
        model_location=None,
        model_id=None,
        report_full_path_shared=None,
        skip_1_t_n=False,
        heuristic_test_ls=None,
        remove_mask_str_prediction=False,
        inverse_writing=False,
        extra_label_for_prediction="",
        random_iterator_train=True,
        bucket_test=False,
        must_get_norm_test=True,
        early_stoppin_metric=None,
        subsample_early_stoping_metric_val=None,
        compute_intersection_score_test=True,
        threshold_edit=3,
        name_with_epoch=False,
        max_token_per_batch=200,
        encoder=None,
        debug=False,
        verbose=1):
    """
    Wrapper for training/prediction/evaluation

    2 modes : train (trains using the train and dev iterators, with a final test on test_path)
              test : only evaluates at the end : requires all directories to already exist
    :return:
    """
    assert run_mode in ["train", "test"
                        ], "ERROR run mode {} corrupted ".format(run_mode)
    input_level_ls = ["wordpiece"]
    assert early_stoppin_metric is not None and subsample_early_stoping_metric_val is not None, "ERROR : early_stoppin_metric and subsample_early_stoping_metric_val should both be defined"
    if n_observation_max_per_epoch_dev_test is None:
        n_observation_max_per_epoch_dev_test = n_observation_max_per_epoch_train
    printing("MODEL : RUNNING IN {} mode",
             var=[run_mode],
             verbose=verbose,
             verbose_level=1)
    printing(
        "WARNING : casing was set to {} (this should be consistent at train and test)",
        var=[args.case],
        verbose=verbose,
        verbose_level=2)

    if len(args.tasks) == 1:
        printing("INFO : MODEL : 1 set of simultaneous tasks {}".format(
            args.tasks),
                 verbose=verbose,
                 verbose_level=1)

    if run_mode == "test":
        assert args.test_paths is not None and isinstance(
            args.test_paths, list)
    if run_mode == "train":
        printing("CHECKPOINTING info : "
                 "saving model every {}",
                 var=saving_every_epoch,
                 verbose=verbose,
                 verbose_level=1)

    use_gpu = use_gpu_(use_gpu=None, verbose=verbose)

    def get_commit_id():
        repo = git.Repo(os.path.dirname(os.path.realpath(__file__)),
                        search_parent_directories=True)
        git_commit_id = str(repo.head.commit)  # object.hexsha
        return git_commit_id

    if verbose > 1:
        print(f"GIT ID : {get_commit_id()}")

    train_data_label = get_dataset_label(args.train_path, default="train")

    iter_train = 0
    iter_dev = 0
    row = None
    writer = None

    printout_allocated_gpu_memory(verbose, "{} starting all".format(model_id))

    if run_mode == "train":
        if os.path.isdir(args.train_path[0]) and len(args.train_path) == 1:
            data_sharded = args.train_path[0]
            printing(
                "INFO args.train_path is directory so not rebuilding shards",
                verbose=verbose,
                verbose_level=1)
        elif os.path.isdir(args.train_path[0]):
            raise (Exception(
                "{} is a directory but train_path has {} entries, which is not supported".
                format(args.train_path[0], len(args.train_path))))
        else:
            data_sharded = None
        assert model_location is None and model_id is None, "ERROR : model_location and model_id should be None since we are creating a new model "

        model_id, model_location, dict_path, tensorboard_log, end_predictions, data_sharded \
            = setup_repoting_location(model_suffix=model_suffix, data_sharded=data_sharded,
                                      root_dir_checkpoints=CHECKPOINT_BERT_DIR,
                                      shared_id=args.overall_label, verbose=verbose)
        hyperparameters = get_hyperparameters_dict(
            args,
            args.case,
            random_iterator_train,
            seed=args.seed,
            verbose=verbose,
            dict_path=dict_path,
            model_id=model_id,
            model_location=model_location)
        args_dir = write_args(model_location,
                              model_id=model_id,
                              hyperparameters=hyperparameters,
                              verbose=verbose)

        if report:
            if report_full_path_shared is not None:
                tensorboard_log = os.path.join(report_full_path_shared,
                                               "tensorboard")
            printing("tensorboard --logdir={} --host=localhost --port=1234 ",
                     var=[tensorboard_log],
                     verbose_level=1,
                     verbose=verbose)
            writer = SummaryWriter(log_dir=tensorboard_log)
            if writer is not None:
                writer.add_text("INFO-ARGUMENT-MODEL-{}".format(model_id),
                                str(hyperparameters), 0)
    else:
        args_checkpoint = json.load(open(args.init_args_dir, "r"))
        dict_path = args_checkpoint["hyperparameters"]["dict_path"]
        assert dict_path is not None and os.path.isdir(
            dict_path), "ERROR {} ".format(dict_path)
        end_predictions = args.end_predictions
        assert end_predictions is not None and os.path.isdir(
            end_predictions), "ERROR end_predictions"
        model_location = args_checkpoint["hyperparameters"]["model_location"]
        model_id = args_checkpoint["hyperparameters"]["model_id"]
        assert model_location is not None and model_id is not None, "ERROR model_location model_id "
        args_dir = os.path.join(model_location,
                                "{}-args.json".format(model_id))

        printing(
            "CHECKPOINTING : starting writing log \ntensorboard --logdir={} --host=localhost --port=1234 ",
            var=[os.path.join(model_id, "tensorboard")],
            verbose_level=1,
            verbose=verbose)

    # build or load dictionaries
    _dev_path = args.dev_path if args.dev_path is not None else args.train_path
    word_dictionary, word_norm_dictionary, char_dictionary, pos_dictionary, \
    xpos_dictionary, type_dictionary = \
        conllu_data.load_dict(dict_path=dict_path,
                              train_path=args.train_path if run_mode == "train" else None,
                              dev_path=args.dev_path if run_mode == "train" else None,
                              test_path=None,
                              word_embed_dict={},
                              dry_run=False,
                              expand_vocab=False,
                              word_normalization=True,
                              force_new_dic=True if run_mode == "train" else False,
                              tasks=args.tasks,
                              pos_specific_data_set=args.train_path[1] if len(args.tasks) > 1 and len(args.train_path)>1 and "pos" in args.tasks else None,
                              case=args.case,
                              # if neither normalize, pos nor parsing is in tasks, we don't need to fill the dictionaries
                              do_not_fill_dictionaries=len(set(["normalize", "pos", "parsing"])&set([task for tasks in args.tasks for task in tasks])) == 0,
                              add_start_char=1 if run_mode == "train" else None,
                              verbose=verbose)
    # we flatten the tasks
    printing("DICTIONARY CREATED/LOADED", verbose=verbose, verbose_level=1)
    num_labels_per_task, task_to_label_dictionary = get_vocab_size_and_dictionary_per_task(
        [task for tasks in args.tasks for task in tasks],
        vocab_bert_wordpieces_len=vocab_size,
        pos_dictionary=pos_dictionary,
        type_dictionary=type_dictionary,
        task_parameters=TASKS_PARAMETER)
    voc_pos_size = num_labels_per_task["pos"] if "pos" in args.tasks else None
    if voc_pos_size is not None:
        printing("MODEL : voc_pos_size defined as {}",
                 var=voc_pos_size,
                 verbose_level=1,
                 verbose=verbose)
    printing("MODEL init...", verbose=verbose, verbose_level=1)
    if verbose > 1:
        print("DEBUG : TOKENIZER :voc_tokenizer from_pretrained",
              voc_tokenizer)
    #pdb.set_trace()
    #voc_tokenizer = "bert-base-multilingual-cased"
    tokenizer = tokenizer.from_pretrained(
        voc_tokenizer,
        do_lower_case=args.case == "lower",
        shuffle_bpe_embedding=args.shuffle_bpe_embedding)
    mask_id = tokenizer.convert_tokens_to_ids(
        tokenizer.mask_token)  #convert_tokens_to_ids([MASK_BERT])[0]
    printout_allocated_gpu_memory(verbose,
                                  "{} loading model ".format(model_id))
    model = get_model_multi_task_bert(args=args,
                                      model_dir=model_dir,
                                      encoder=encoder,
                                      num_labels_per_task=num_labels_per_task,
                                      mask_id=mask_id)

    def prune_heads(prune_heads):
        if prune_heads is not None:
            pune_heads_ls = prune_heads.split(",")[:-1]
            assert len(pune_heads_ls) > 0
            for layer in pune_heads_ls:
                parsed_layer_to_prune = layer.split("-")
                assert parsed_layer_to_prune[0] == "prune_heads"
                assert parsed_layer_to_prune[1] == "layer"
                assert parsed_layer_to_prune[3] == "heads"
                heads = parsed_layer_to_prune[4]
                head_index_ls = heads.split("_")
                heads_ls = [int(index) for index in head_index_ls]
                print(
                    f"MODEL : pruning layer {parsed_layer_to_prune[2]} heads {heads_ls}"
                )
                model.encoder.encoder.layer[int(
                    parsed_layer_to_prune[2])].attention.prune_heads(heads_ls)

    if args.prune_heads is not None and args.prune_heads != "None":
        print(f"INFO : args.prune_heads {args.prune_heads}")
        prune_heads(args.prune_heads)
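    # NB (illustrative value) : given the parsing above, args.prune_heads is expected to look like
    # "prune_heads-layer-2-heads-0_3_7," i.e. comma-terminated "prune_heads-layer-<layer>-heads-<i>_<j>_..." segments ;
    # the trailing comma matters since split(",")[:-1] drops the last element.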

    if use_gpu:
        model.to("cuda")
        printing("MODEL TO CUDA", verbose=verbose, verbose_level=1)
    printing("MODEL model.config {} ",
             var=[model.config],
             verbose=verbose,
             verbose_level=1)
    printout_allocated_gpu_memory(verbose, "{} model loaded".format(model_id))
    model_origin = OrderedDict()
    pruning_mask = OrderedDict()
    printout_allocated_gpu_memory(verbose, "{} model cuda".format(model_id))
    for name, param in model.named_parameters():
        model_origin[name] = param.detach().clone()
        printout_allocated_gpu_memory(verbose, "{} param cloned ".format(name))
        if args.penalization_mode == "pruning":
            abs = torch.abs(param.detach().flatten())
            median_value = torch.median(abs)
            pruning_mask[name] = (abs > median_value).float()
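            # the binary mask keeps parameters whose magnitude is above the per-tensor median,
            # i.e. roughly the top half of weights ; it is passed to epoch_run below
            # (presumably to restrict the penalization to those weights)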
        printout_allocated_gpu_memory(
            verbose, "{} pruning mask loaded".format(model_id))

    printout_allocated_gpu_memory(verbose, "{} model clone".format(model_id))

    inv_word_dic = word_dictionary.instance2index
    # load , mask, bucket and index data

    assert tokenizer is not None, "ERROR : tokenizer is None , voc_tokenizer failed to be loaded {}".format(
        voc_tokenizer)
    if run_mode == "train":
        time_load_readers_train_start = time.time()
        if not args.memory_efficient_iterator:

            data_sharded, n_shards, n_sent_dataset_total_train = None, None, None
            args_load_batcher_shard_data = None
            printing("INFO : starting loading readers",
                     verbose=verbose,
                     verbose_level=1)
            readers_train = readers_load(
                datasets=args.train_path,
                tasks=args.tasks,
                word_dictionary=word_dictionary,
                bert_tokenizer=tokenizer,
                word_dictionary_norm=word_norm_dictionary,
                char_dictionary=char_dictionary,
                pos_dictionary=pos_dictionary,
                xpos_dictionary=xpos_dictionary,
                type_dictionary=type_dictionary,
                word_decoder=True,
                run_mode=run_mode,
                add_start_char=1,
                add_end_char=1,
                symbolic_end=1,
                symbolic_root=1,
                bucket=True,
                must_get_norm=True,
                input_level_ls=input_level_ls,
                verbose=verbose)
            n_sent_dataset_total_train = readers_train[list(
                readers_train.keys())[0]][3]
            printing("INFO : done with sharding",
                     verbose=verbose,
                     verbose_level=1)
        else:
            printing("INFO : building/loading shards ",
                     verbose=verbose,
                     verbose_level=1)
            data_sharded, n_shards, n_sent_dataset_total_train = build_shard(
                data_sharded,
                args.train_path,
                n_sent_max_per_file=N_SENT_MAX_CONLL_PER_SHARD,
                verbose=verbose)

        time_load_readers_dev_start = time.time()
        time_load_readers_train = time.time() - time_load_readers_train_start
        readers_dev_ls = []
        dev_data_label_ls = []
        printing("INFO : g readers for dev", verbose=verbose, verbose_level=1)
        printout_allocated_gpu_memory(
            verbose, "{} reader train loaded".format(model_id))
        for dev_path in args.dev_path:
            dev_data_label = get_dataset_label(dev_path, default="dev")
            dev_data_label_ls.append(dev_data_label)
            readers_dev = readers_load(
                datasets=dev_path,
                tasks=args.tasks,
                word_dictionary=word_dictionary,
                word_dictionary_norm=word_norm_dictionary,
                char_dictionary=char_dictionary,
                pos_dictionary=pos_dictionary,
                xpos_dictionary=xpos_dictionary,
                bert_tokenizer=tokenizer,
                type_dictionary=type_dictionary,
                word_decoder=True,
                run_mode=run_mode,
                add_start_char=1,
                add_end_char=1,
                symbolic_end=1,
                symbolic_root=1,
                bucket=False,
                must_get_norm=True,
                input_level_ls=input_level_ls,
                verbose=verbose) if args.dev_path is not None else None
            readers_dev_ls.append(readers_dev)
        printout_allocated_gpu_memory(verbose,
                                      "{} reader dev loaded".format(model_id))

        time_load_readers_dev = time.time() - time_load_readers_dev_start
        # report reader loading times
        printing("TIME : {} ",
                 var=[
                     OrderedDict([
                         ("time_load_readers_train",
                          "{:0.4f} min".format(time_load_readers_train / 60)),
                         ("time_load_readers_dev",
                          "{:0.4f} min".format(time_load_readers_dev / 60))
                     ])
                 ],
                 verbose=verbose,
                 verbose_level=2)

        early_stoping_val_former = 1000
        # training starts when epoch is 1
        #args.epochs += 1
        #assert args.epochs >= 1, "ERROR need at least 2 epochs (1 eval , 1 train 1 eval"
        flexible_batch_size = False

        if args.optimizer == "AdamW":
            model, optimizer, scheduler = apply_fine_tuning_strategy(
                model=model,
                fine_tuning_strategy=args.fine_tuning_strategy,
                lr_init=args.lr,
                betas=(0.9, 0.99),
                epoch=0,
                weight_decay=args.weight_decay,
                optimizer_name=args.optimizer,
                t_total=n_sent_dataset_total_train / args.batch_update_train *
                args.epochs if n_sent_dataset_total_train /
                args.batch_update_train * args.epochs > 1 else 5,
                verbose=verbose)

        try:
            for epoch in range(args.epochs):
                if args.memory_efficient_iterator:
                    # we start each epoch with a new shard every time !
                    training_file = get_new_shard(data_sharded, n_shards)
                    printing(
                        "INFO Memory efficient iterator triggered (only build for train data , starting with {}",
                        var=[training_file],
                        verbose=verbose,
                        verbose_level=1)
                    args_load_batcher_shard_data = {
                        "word_dictionary": word_dictionary,
                        "tokenizer": tokenizer,
                        "word_norm_dictionary": word_norm_dictionary,
                        "char_dictionary": char_dictionary,
                        "pos_dictionary": pos_dictionary,
                        "xpos_dictionary": xpos_dictionary,
                        "type_dictionary": type_dictionary,
                        "use_gpu": use_gpu,
                        "norm_not_norm": auxilliary_task_norm_not_norm,
                        "word_decoder": True,
                        "add_start_char": 1,
                        "add_end_char": 1,
                        "symbolic_end": 1,
                        "symbolic_root": 1,
                        "bucket": True,
                        "max_char_len": 20,
                        "must_get_norm": True,
                        "use_gpu_hardcoded_readers": False,
                        "bucketing_level": "bpe",
                        "input_level_ls": ["wordpiece"],
                        "auxilliary_task_norm_not_norm":
                        auxilliary_task_norm_not_norm,
                        "random_iterator_train": random_iterator_train
                    }

                    readers_train = readers_load(
                        datasets=args.train_path if
                        not args.memory_efficient_iterator else training_file,
                        tasks=args.tasks,
                        word_dictionary=word_dictionary,
                        bert_tokenizer=tokenizer,
                        word_dictionary_norm=word_norm_dictionary,
                        char_dictionary=char_dictionary,
                        pos_dictionary=pos_dictionary,
                        xpos_dictionary=xpos_dictionary,
                        type_dictionary=type_dictionary,
                        word_decoder=True,
                        run_mode=run_mode,
                        add_start_char=1,
                        add_end_char=1,
                        symbolic_end=1,
                        symbolic_root=1,
                        bucket=True,
                        must_get_norm=True,
                        input_level_ls=input_level_ls,
                        verbose=verbose)

                checkpointing_model_data = (epoch % saving_every_epoch == 0
                                            or epoch == (args.epochs - 1))
                # build iterator on the loaded data
                printout_allocated_gpu_memory(
                    verbose, "{} loading batcher".format(model_id))

                if args.batch_size == "flexible":
                    flexible_batch_size = True

                    printing(
                        "INFO : args.batch_size {} so updating it based on mean value {}",
                        var=[
                            args.batch_size,
                            update_batch_size_mean(readers_train)
                        ],
                        verbose=verbose,
                        verbose_level=1)
                    args.batch_size = update_batch_size_mean(readers_train)

                    if args.batch_update_train == "flexible":
                        args.batch_update_train = args.batch_size
                    printing(
                        "TRAINING : backward pass every {} step of size {} in average",
                        var=[
                            int(args.batch_update_train // args.batch_size),
                            args.batch_size
                        ],
                        verbose=verbose,
                        verbose_level=1)
                    try:
                        assert isinstance(args.batch_update_train // args.batch_size, int)\
                           and args.batch_update_train // args.batch_size > 0, \
                            "ERROR batch_size {} should be a multiple of {} ".format(args.batch_update_train, args.batch_size)
                    except Exception as e:
                        print("WARNING {}".format(e))
                batchIter_train = data_gen_multi_task_sampling_batch(
                    tasks=args.tasks,
                    readers=readers_train,
                    batch_size=readers_train[list(readers_train.keys())[0]][4],
                    max_token_per_batch=max_token_per_batch
                    if flexible_batch_size else None,
                    word_dictionary=word_dictionary,
                    char_dictionary=char_dictionary,
                    pos_dictionary=pos_dictionary,
                    word_dictionary_norm=word_norm_dictionary,
                    get_batch_mode=random_iterator_train,
                    print_raw=False,
                    dropout_input=0.0,
                    verbose=verbose)

                # -|-|-
                printout_allocated_gpu_memory(
                    verbose, "{} batcher train loaded".format(model_id))
                batchIter_dev_ls = []
                batch_size_DEV = 1

                if verbose > 1:
                    print(
                        "WARNING : batch_size for final eval was hardcoded and set to {}"
                        .format(batch_size_DEV))
                for readers_dev in readers_dev_ls:
                    batchIter_dev = data_gen_multi_task_sampling_batch(
                        tasks=args.tasks,
                        readers=readers_dev,
                        batch_size=batch_size_DEV,
                        word_dictionary=word_dictionary,
                        char_dictionary=char_dictionary,
                        pos_dictionary=pos_dictionary,
                        word_dictionary_norm=word_norm_dictionary,
                        get_batch_mode=False,
                        print_raw=False,
                        dropout_input=0.0,
                        verbose=verbose) if args.dev_path is not None else None
                    batchIter_dev_ls.append(batchIter_dev)

                model.train()
                printout_allocated_gpu_memory(
                    verbose, "{} batcher dev loaded".format(model_id))
                if args.optimizer != "AdamW":

                    model, optimizer, scheduler = apply_fine_tuning_strategy(
                        model=model,
                        fine_tuning_strategy=args.fine_tuning_strategy,
                        lr_init=args.lr,
                        betas=(0.9, 0.99),
                        weight_decay=args.weight_decay,
                        optimizer_name=args.optimizer,
                        t_total=n_sent_dataset_total_train /
                        args.batch_update_train *
                        args.epochs if n_sent_dataset_total_train /
                        args.batch_update_train * args.epochs > 1 else 5,
                        epoch=epoch,
                        verbose=verbose)
                printout_allocated_gpu_memory(
                    verbose, "{} optimizer loaded".format(model_id))
                loss_train = None

                if epoch >= 0:
                    printing("TRAINING : training on GET_BATCH_MODE ",
                             verbose=verbose,
                             verbose_level=2)
                    printing(
                        "TRAINING {} training 1 'epoch' = {} observation size args.batch_"
                        "update_train (foward {} batch_size {} backward  "
                        "(every int(args.batch_update_train//args.batch_size) step if {})) ",
                        var=[
                            model_id, n_observation_max_per_epoch_train,
                            args.batch_size, args.batch_update_train,
                            args.low_memory_foot_print_batch_mode
                        ],
                        verbose=verbose,
                        verbose_level=1)
                    loss_train, iter_train, perf_report_train, _ = epoch_run(
                        batchIter_train,
                        tokenizer,
                        args=args,
                        model_origin=model_origin,
                        pruning_mask=pruning_mask,
                        task_to_label_dictionary=task_to_label_dictionary,
                        data_label=train_data_label,
                        model=model,
                        dropout_input_bpe=args.dropout_input_bpe,
                        writer=writer,
                        iter=iter_train,
                        epoch=epoch,
                        writing_pred=epoch == (args.epochs - 1),
                        dir_end_pred=end_predictions,
                        optimizer=optimizer,
                        use_gpu=use_gpu,
                        scheduler=scheduler,
                        predict_mode=(epoch - 1) % 5 == 0,
                        skip_1_t_n=skip_1_t_n,
                        model_id=model_id,
                        reference_word_dic={"InV": inv_word_dic},
                        null_token_index=null_token_index,
                        null_str=null_str,
                        norm_2_noise_eval=False,
                        early_stoppin_metric=None,
                        n_obs_max=n_observation_max_per_epoch_train,
                        data_sharded_dir=data_sharded,
                        n_shards=n_shards,
                        n_sent_dataset_total=n_sent_dataset_total_train,
                        args_load_batcher_shard_data=args_load_batcher_shard_data,
                        memory_efficient_iterator=args.memory_efficient_iterator,
                        verbose=verbose)

                else:
                    printing(
                        "TRAINING : skipping training for this epoch so as to start by evaluating on the dev datasets",
                        verbose=verbose,
                        verbose_level=1)
                printout_allocated_gpu_memory(
                    verbose, "{} epoch train done".format(model_id))
                model.eval()

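                # Evaluate on the dev sets during the first epochs and every third epoch afterwards.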
                if args.dev_path is not None and (epoch % 3 == 0
                                                  or epoch <= 6):
                    if verbose > 1:
                        print("RUNNING DEV on ITERATION MODE")
                    early_stoping_val_ls = []
                    loss_dev_ls = []
                    for i_dev, batchIter_dev in enumerate(batchIter_dev_ls):
                        loss_dev, iter_dev, perf_report_dev, early_stoping_val = epoch_run(
                            batchIter_dev,
                            tokenizer,
                            args=args,
                            epoch=epoch,
                            model_origin=model_origin,
                            pruning_mask=pruning_mask,
                            task_to_label_dictionary=task_to_label_dictionary,
                            iter=iter_dev,
                            use_gpu=use_gpu,
                            model=model,
                            writer=writer,
                            optimizer=None,
                            writing_pred=True,  #epoch == (args.epochs - 1),
                            dir_end_pred=end_predictions,
                            predict_mode=True,
                            data_label=dev_data_label_ls[i_dev],
                            null_token_index=null_token_index,
                            null_str=null_str,
                            model_id=model_id,
                            skip_1_t_n=skip_1_t_n,
                            dropout_input_bpe=0,
                            reference_word_dic={"InV": inv_word_dic},
                            norm_2_noise_eval=False,
                            early_stoppin_metric=early_stoppin_metric,
                            subsample_early_stoping_metric_val=subsample_early_stoping_metric_val,
                            #case=case,
                            n_obs_max=n_observation_max_per_epoch_dev_test,
                            verbose=verbose)

                        printing(
                            "TRAINING : loss train: {} dev[{}]: {} for epoch {} out of {}",
                            var=[
                                loss_train, i_dev, loss_dev, epoch, args.epochs
                            ],
                            verbose=1,
                            verbose_level=1)
                        printing("PERFORMANCE : epoch {} DEV {} : {} ",
                                 var=[epoch, i_dev + 1, perf_report_dev],
                                 verbose=verbose,
                                 verbose_level=1)
                        early_stoping_val_ls.append(early_stoping_val)
                        loss_dev_ls.append(loss_dev)

                else:
                    if verbose > 1:
                        print("NO DEV EVAL")
                    loss_dev, iter_dev, perf_report_dev = None, 0, None
                    # keep the downstream checkpointing logic well-defined when no dev eval ran this epoch
                    early_stoping_val_ls = [None]
                # NB : early_stoping_val is based on first dev set
                printout_allocated_gpu_memory(
                    verbose, "{} epoch dev done".format(model_id))

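                # Save a checkpoint either systematically (checkpointing_model_data) or when the
                # early-stopping metric (computed on the first dev set) improves.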
                early_stoping_val = early_stoping_val_ls[0]
                if checkpointing_model_data or early_stoping_val < early_stoping_val_former:
                    if early_stoping_val is not None:
                        _epoch = "best" if early_stoping_val < early_stoping_val_former else epoch
                    else:
                        if verbose > 1:
                            print(
                                'WARNING early_stoping_val is None so saving based on checkpointing_model_data only'
                            )
                        _epoch = epoch
                    # model_id possibly enriched with epoch information when name_with_epoch is set
                    _model_id = get_name_model_id_with_extra_name(
                        epoch=epoch,
                        _epoch=_epoch,
                        name_with_epoch=name_with_epoch,
                        model_id=model_id)
                    checkpoint_dir = os.path.join(
                        model_location, "{}-checkpoint.pt".format(_model_id))

                    if _epoch == "best":
                        printing(
                            "CHECKPOINT : SAVING BEST MODEL {} (epoch:{}) (new loss is {} former was {})"
                            .format(checkpoint_dir, epoch, early_stoping_val,
                                    early_stoping_val_former),
                            verbose=verbose,
                            verbose_level=1)
                        last_checkpoint_dir_best = checkpoint_dir
                        early_stoping_val_former = early_stoping_val
                        best_epoch = epoch
                        best_loss = early_stoping_val
                    else:
                        printing(
                            "CHECKPOINT : NOT SAVING BEST MODEL : new loss {} did not beat best loss so far {}"
                            .format(early_stoping_val,
                                    early_stoping_val_former),
                            verbose_level=1,
                            verbose=verbose)
                    last_model = ""
                    if epoch == (args.epochs - 1):
                        last_model = "last"
                    printing("CHECKPOINT : epoch {} saving {} model {} ",
                             var=[epoch, last_model, checkpoint_dir],
                             verbose=verbose,
                             verbose_level=1)
                    torch.save(model.state_dict(), checkpoint_dir)

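                    # Record the checkpoint path and training metadata in the model's args.json.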
                    args_dir = write_args(
                        dir=model_location,
                        checkpoint_dir=checkpoint_dir,
                        hyperparameters=hyperparameters
                        if name_with_epoch else None,
                        model_id=_model_id,
                        info_checkpoint=OrderedDict([
                            ("epochs", epoch + 1),
                            ("batch_size", args.batch_size
                             if not args.low_memory_foot_print_batch_mode else
                             args.batch_update_train),
                            ("train_path", train_data_label),
                            ("dev_path", dev_data_label_ls),
                            ("num_labels_per_task", num_labels_per_task)
                        ]),
                        verbose=verbose)

            if row is not None and update_status is not None:
                update_status(row=row, value="training-done", verbose=1)
        except Exception as e:
            if row is not None and update_status is not None:
                update_status(row=row, value="ERROR", verbose=1)
            raise e

    # reloading last (best) checkpoint
    if run_mode in ["train", "test"] and args.test_paths is not None:
        report_all = []
        if run_mode == "train" and args.epochs > 0:
            if use_gpu:
                model.load_state_dict(torch.load(last_checkpoint_dir_best))
                model = model.cuda()
                printout_allocated_gpu_memory(
                    verbose, "{} after reloading model".format(model_id))
            else:
                model.load_state_dict(
                    torch.load(last_checkpoint_dir_best,
                               map_location=lambda storage, loc: storage))
            printing(
                "MODEL : RELOADING best model of epoch {} with loss {} based on {}({}) metric (from checkpoint {})",
                var=[
                    best_epoch, best_loss, early_stoppin_metric,
                    subsample_early_stoping_metric_val,
                    last_checkpoint_dir_best
                ],
                verbose=verbose,
                verbose_level=1)

        model.eval()

        printout_allocated_gpu_memory(verbose,
                                      "{} starting test".format(model_id))
        for test_path in args.test_paths:
            assert len(test_path) == len(
                args.tasks), "ERROR test_path {} args.tasks {}".format(
                    test_path, args.tasks)
            for test, task_to_eval in zip(test_path, args.tasks):
                label_data = get_dataset_label([test], default="test")
                if len(extra_label_for_prediction) > 0:
                    label_data += "-" + extra_label_for_prediction

                if args.shuffle_bpe_embedding and args.test_mode_no_shuffle_embedding:
                    printing(
                        "TOKENIZER : as args.shuffle_bpe_embedding is {} and test_mode_no_shuffle_embedding is {} : reloading tokenizer without embedding shuffling",
                        var=[
                            args.shuffle_bpe_embedding,
                            args.test_mode_no_shuffle_embedding
                        ],
                        verbose=1,
                        verbose_level=1)
                    tokenizer = tokenizer.from_pretrained(
                        voc_tokenizer,
                        do_lower_case=args.case == "lower",
                        shuffle_bpe_embedding=False)
                readers_test = readers_load(
                    datasets=[test],
                    tasks=[task_to_eval],
                    word_dictionary=word_dictionary,
                    word_dictionary_norm=word_norm_dictionary,
                    char_dictionary=char_dictionary,
                    pos_dictionary=pos_dictionary,
                    xpos_dictionary=xpos_dictionary,
                    type_dictionary=type_dictionary,
                    bert_tokenizer=tokenizer,
                    word_decoder=True,
                    run_mode=run_mode,
                    add_start_char=1,
                    add_end_char=1,
                    symbolic_end=1,
                    symbolic_root=1,
                    bucket=bucket_test,
                    input_level_ls=input_level_ls,
                    must_get_norm=must_get_norm_test,
                    verbose=verbose)

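                # Single evaluation setting: no test heuristic, no gold-error injection, no norm2noise.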
                heuristics_zip = [None]
                gold_error_or_not_zip = [False]
                norm2noise_zip = [False]

                if heuristic_test_ls is None:
                    assert len(gold_error_or_not_zip) == len(
                        heuristics_zip) and len(heuristics_zip) == len(
                            norm2noise_zip)

                batch_size_TEST = 1
                if verbose > 1:
                    print(
                        "WARNING : batch_size for final eval was hardcoded and set to {}"
                        .format(batch_size_TEST))
                for (heuristic_test, gold_error,
                     norm_2_noise_eval) in zip(heuristics_zip,
                                               gold_error_or_not_zip,
                                               norm2noise_zip):

                    assert heuristic_test is None and not gold_error and not norm_2_noise_eval

                    batchIter_test = data_gen_multi_task_sampling_batch(
                        tasks=[task_to_eval],
                        readers=readers_test,
                        batch_size=batch_size_TEST,
                        word_dictionary=word_dictionary,
                        char_dictionary=char_dictionary,
                        pos_dictionary=pos_dictionary,
                        word_dictionary_norm=word_norm_dictionary,
                        get_batch_mode=False,
                        dropout_input=0.0,
                        verbose=verbose)
                    try:
                        loss_test, iter_test, perf_report_test, _ = epoch_run(
                            batchIter_test,
                            tokenizer,
                            args=args,
                            iter=iter_dev,
                            use_gpu=use_gpu,
                            model=model,
                            task_to_label_dictionary=task_to_label_dictionary,
                            writer=None,
                            writing_pred=True,
                            optimizer=None,
                            args_dir=args_dir,
                            model_id=model_id,
                            dir_end_pred=end_predictions,
                            skip_1_t_n=skip_1_t_n,
                            predict_mode=True,
                            data_label=label_data,
                            epoch="LAST",
                            extra_label_for_prediction=label_data,
                            null_token_index=null_token_index,
                            null_str=null_str,
                            log_perf=False,
                            dropout_input_bpe=0,
                            norm_2_noise_eval=norm_2_noise_eval,
                            compute_intersection_score=compute_intersection_score_test,
                            remove_mask_str_prediction=remove_mask_str_prediction,
                            reference_word_dic={"InV": inv_word_dic},
                            threshold_edit=threshold_edit,
                            verbose=verbose,
                            n_obs_max=n_observation_max_per_epoch_dev_test)
                        if verbose > 1:
                            print("LOSS TEST", loss_test)
                    except Exception as e:
                        print(
                            "ERROR (epoch_run test) {} test_path {}, heuristic {}, gold error {}, norm2noise {}"
                            .format(e, test, heuristic_test, gold_error,
                                    norm_2_noise_eval))
                        raise e
                    print("PERFORMANCE TEST on data  {} is {} ".format(
                        label_data, perf_report_test))
                    print("DATA WRITTEN {}".format(end_predictions))
                    if writer is not None:
                        writer.add_text(
                            "Accuracy-{}-{}-{}".format(model_id, label_data,
                                                       run_mode),
                            "After {} epochs with {} : performance is \n {} ".
                            format(args.epochs, description,
                                   str(perf_report_test)), 0)
                    else:
                        printing(
                            "WARNING : could not add accuracy to tensorboard cause writer was found None",
                            verbose=verbose,
                            verbose_level=2)
                    report_all.extend(perf_report_test)
                    printout_allocated_gpu_memory(
                        verbose, "{} test done".format(model_id))
    else:
        printing("ERROR : EVALUATION none cause {} empty or run_mode {} ",
                 var=[args.test_paths, run_mode],
                 verbose_level=1,
                 verbose=verbose)

    if writer is not None:
        writer.close()
        printing("tensorboard --logdir={} --host=localhost --port=1234 ",
                 var=[tensorboard_log],
                 verbose_level=1,
                 verbose=verbose)
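    # Write the per-model report and, when a shared report path is given, append the results to the overall report.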

    report_dir = os.path.join(model_location, model_id + "-report.json")
    if report_full_path_shared is not None:
        report_full_dir = os.path.join(report_full_path_shared,
                                       args.overall_label + "-report.json")
        if os.path.isfile(report_full_dir):
            report = json.load(open(report_full_dir, "r"))
        else:
            report = []
            printing("REPORT = creating overall report at {} ",
                     var=[report_dir],
                     verbose=verbose,
                     verbose_level=1)
        report.extend(report_all)
        json.dump(report, open(report_full_dir, "w"))
        printing("{} {} ",
                 var=[REPORT_FLAG_DIR_STR, report_full_dir],
                 verbose=0,
                 verbose_level=0)

    json.dump(report_all, open(report_dir, "w"))
    printing("REPORTING TO {}".format(report_dir),
             verbose=verbose,
             verbose_level=1)
    if report_full_path_shared is None:
        printing("WARNING ; report_full_path_shared is None",
                 verbose=verbose,
                 verbose_level=1)
        printing("{} {} ",
                 var=[REPORT_FLAG_DIR_STR, report_dir],
                 verbose=verbose,
                 verbose_level=0)

    return model