def get_init_args_dir(init_args_dir):
    """
    To simplify reporting we allow three ways of providing init_args_dir:
    a full path to the args file, a path relative to CHECKPOINT_BERT_DIR,
    or a bare model id matched against the checkpoint naming template.
    :param init_args_dir:
    :return: the resolved args file path
    """
    if os.path.isfile(init_args_dir):
        _dir = init_args_dir
    elif os.path.isfile(os.path.join(CHECKPOINT_BERT_DIR, init_args_dir)):
        printing("MODEL init {} not found as a file so using second template ",
                 var=[init_args_dir], verbose=1, verbose_level=1)
        _dir = os.path.join(CHECKPOINT_BERT_DIR, init_args_dir)
    else:
        printing("MODEL init {} not found as a file or as a sub-file so using third template ",
                 var=[init_args_dir], verbose=1, verbose_level=1)
        match = re.match("(.*-model_[0-9]+).*", init_args_dir)
        assert match is not None, "ERROR : template {} not found in {}".format("(.*-model_[0-9]+).*", init_args_dir)
        _dir = os.path.join(CHECKPOINT_BERT_DIR, match.group(1), init_args_dir + "-args.json")
    assert os.path.isfile(_dir), "ERROR : {} does not exist (based on param {}) ".format(_dir, init_args_dir)
    return _dir
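
# Illustrative sketch (hypothetical model id) of the third fallback: the
# "(.*-model_[0-9]+).*" template recovers the checkpoint sub-directory from a bare id:
#
#   >>> import re
#   >>> re.match("(.*-model_[0-9]+).*", "a1b2c-model_3-epoch_10").group(1)
#   'a1b2c-model_3'
#
# so "a1b2c-model_3-epoch_10" resolves to
# CHECKPOINT_BERT_DIR/a1b2c-model_3/a1b2c-model_3-epoch_10-args.json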
def printout_allocated_gpu_memory(verbose, comment):
    if verbose == "gpu":
        try:
            printing("GPU {} {}", var=[comment, torch.cuda.memory_allocated()],
                     verbose=verbose, verbose_level="gpu")
        except Exception as e:
            print(e)
def get_dataset_label(dataset_dir_ls, default):
    if dataset_dir_ls is None:
        return None
    if REPO_DATASET.get(dataset_dir_ls[0], None) is None:
        try:
            label = "|".join([get_code_data(path) for _, path in enumerate(dataset_dir_ls)])
        except Exception:
            printing("REPORT : dataset name of directory {} not found as UD so using default ",
                     var=[dataset_dir_ls], verbose=0, verbose_level=1)
            label = "|".join([REPO_DATASET.get(path, "{}_{}".format(default, i))
                              for i, path in enumerate(dataset_dir_ls)])
    else:
        label = "|".join([REPO_DATASET.get(path, "{}_{}".format(default, i))
                          for i, path in enumerate(dataset_dir_ls)])
    return label
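
# Usage sketch (hypothetical paths and REPO_DATASET content): when the first path is
# known to REPO_DATASET, unknown ones fall back to "<default>_<i>":
#
#   REPO_DATASET = {"/data/ewt.conll": "ewt"}
#   get_dataset_label(["/data/ewt.conll", "/data/other.conll"], default="dev")
#   # -> "ewt|dev_1"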
def write_args(dir, model_id, checkpoint_dir=None, hyperparameters=None, info_checkpoint=None, verbose=1):
    args_dir = os.path.join(dir, "{}-args.json".format(model_id))
    if os.path.isfile(args_dir):
        info = "updated"
        args = json.load(open(args_dir, "r"))
        args["checkpoint_dir"] = checkpoint_dir
        args["info_checkpoint"] = info_checkpoint
        json.dump(args, open(args_dir, "w"))
    else:
        assert hyperparameters is not None, "REPORT : args.json created for the first time : hyperparameters dic required "
        # assert info_checkpoint is None, "REPORT : args.json created for the first time : no checkpoint yet "
        info = "new"
        json.dump(OrderedDict([("checkpoint_dir", checkpoint_dir),
                               ("hyperparameters", hyperparameters),
                               ("info_checkpoint", info_checkpoint)]),
                  open(args_dir, "w"))
    printing("MODEL args.json {} written {} ".format(info, args_dir), verbose_level=1, verbose=verbose)
    return args_dir
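
# Shape of the resulting <model_id>-args.json (sketch): a flat JSON object written
# once with the full hyperparameters, then updated in place on checkpointing:
#
#   {"checkpoint_dir": null,
#    "hyperparameters": {"lr": 5e-05, "...": "..."},
#    "info_checkpoint": null}
#
# later calls only overwrite "checkpoint_dir" and "info_checkpoint".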
def data_gen_dummy(V, batch, nbatches, sent_len=9, word_len=5, verbose=0, seed=None):
    "Generate random data for a src-tgt copy task."
    if seed is not None:
        np.random.seed(seed)
    for i in tqdm(range(nbatches), disable=disable_tqdm_level(verbose, verbose_level=2)):
        data = torch.from_numpy(np.random.randint(low=2, high=V, size=(batch, sent_len, word_len)))
        data[:, :, 0] = 2
        # we force padding in the dummy model
        data[:, :, -1] = 1
        data[:, :, -2] = 1
        printing("DATA dummy {} ", var=[data], verbose=verbose, verbose_level=5)
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        yield MaskBatch(src, tgt, pad=1)
def align_bpe(n_bpe_target_minus_source, source_aligned, source_aligned_index,
              target_aligned, target_aligned_index, n_masks_to_add, src_token_len,
              bert_tokenizer, mask_token, mode="dummy", index_src=None,
              index_target=None, verbose=0):
    """
    Align the bpe units of a given token using `mode`.
    :return:
    """
    assert mode in ["dummy"]
    # dummy means appending with SPACE or MASK when needed
    if n_bpe_target_minus_source > 0:
        assert index_src is not None
        source_aligned_index.extend([index_src for _ in range(n_bpe_target_minus_source)])
        source_aligned.extend(bert_tokenizer.convert_tokens_to_ids(
            [mask_token for _ in range(n_bpe_target_minus_source)]))
    elif n_bpe_target_minus_source < 0:
        assert index_target is not None
        # we add a NULL_STR (to be predicted) and index it as the former bpe token
        target_aligned_index.extend([index_target for _ in range(-n_bpe_target_minus_source)])
        target_aligned.extend(bert_tokenizer.convert_tokens_to_ids(
            [NULL_STR for _ in range(-n_bpe_target_minus_source)]))
    n_masks_to_add.append(n_bpe_target_minus_source)
    n_masks_to_add.extend([-1 for _ in range(src_token_len - 1)])
    if verbose == "reader":
        printing("SRC appending word bpe align : {}\nTARGET appending word bpe align : {} \nN_MASKS------------ : {}",
                 var=[[mask_token for _ in range(n_bpe_target_minus_source)] if n_bpe_target_minus_source > 0 else "",
                      [NULL_STR for _ in range(-n_bpe_target_minus_source)] if n_bpe_target_minus_source < 0 else "",
                      [n_bpe_target_minus_source] + [-1 for _ in range(src_token_len - 1)]],
                 verbose_level="reader", verbose=verbose)
    return source_aligned, source_aligned_index, target_aligned, target_aligned_index, n_masks_to_add
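
# Alignment sketch: for a source word split into src_token_len=3 bpe units whose
# target side has 2 extra units (n_bpe_target_minus_source=+2), two mask tokens are
# appended on the source side and n_masks_to_add grows by [2, -1, -1]; with a
# negative difference, NULL_STR placeholders are appended on the target side instead.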
def sanity_check_loss_poneration(ponderation_dic, verbose=1):
    if isinstance(ponderation_dic, dict):
        for task in TASKS_PARAMETER:
            assert task in ponderation_dic, "ERROR : task {} is not related to a ponderation while it should ".format(task)
    elif isinstance(ponderation_dic, str):
        assert ponderation_dic in MULTI_TASK_LOSS_PONDERATION_PREDEFINED_MODE, \
            "ERROR ponderation {} should be in {}".format(ponderation_dic, MULTI_TASK_LOSS_PONDERATION_PREDEFINED_MODE)
        printing("WARNING : COULD NOT SANITY CHECK ponderation_dic {} ",
                 var=[ponderation_dic], verbose=verbose, verbose_level=1)
    else:
        raise Exception("ponderation_dic is neither a string nor a dict {}".format(ponderation_dic))
def get_new_shard(shard_path, n_shards, rand=True, verbose=1):
    # pick a new shard file randomly
    assert rand
    i_shard = random.choice(range(n_shards))
    path = os.path.join(shard_path, "train_{}.conll".format(i_shard))
    assert os.path.isfile(path), "ERROR {}".format(path)
    printing("INFO : picking shard {} ", var=[path], verbose=verbose, verbose_level=1)
    return [path]
def get_perf_rate(metric, score_dic, n_tokens_dic, agg_func, task, verbose=1):
    """
    Provides the standard confusion-matrix rates for the given task.
    :param metric:
    :param score_dic: two-level dictionary : first level for agg_func, second for prediction class based on CLASS_PER_TASK and task
    :param agg_func:
    :return: rate, denominator of the rate (for means like f1 : returns all observations)
    """
    if metric in ["recall-{}".format(task), "f1-{}".format(task), "accuracy-{}".format(task)]:
        positive_obs = n_tokens_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][1]]
        recall = score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][1]] / positive_obs \
            if positive_obs > 0 else None
        if positive_obs == 0:
            printing("WARNING : no positive observations were seen ", verbose=verbose, verbose_level=1)
        if metric == "recall-{}".format(task):
            return recall, positive_obs
    if metric in ["precision-{}".format(task), "f1-{}".format(task), "accuracy-{}".format(task)]:
        # positive_prediction = n_tokens_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][0]] - score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][0]] \
        #                       + score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][1]]
        positive_prediction = n_tokens_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes_pred_field"][1]]
        precision = score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][1]] / positive_prediction \
            if positive_prediction > 0 else None
        if metric == "precision-{}".format(task):
            return precision, positive_prediction
    if metric in ["tnr-{}".format(task), "accuracy-{}".format(task), "f1-{}".format(task)]:
        negative_obs = n_tokens_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][0]]
        if metric == "tnr-{}".format(task):
            return score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][0]] / negative_obs if negative_obs > 0 else None, \
                   negative_obs
        if metric == "f1-{}".format(task):
            if recall is not None and precision is not None and recall > 0 and precision > 0:
                return hmean([recall, precision]), negative_obs + positive_obs
            else:
                return None, negative_obs + positive_obs
    if metric in ["npv-{}".format(task)]:
        negative_prediction = n_tokens_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes_pred_field"][0]]
        return score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][0]] / negative_prediction if negative_prediction > 0 else None, \
               negative_prediction
    if metric == "accuracy-{}".format(task):
        accuracy = (score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][0]]
                    + score_dic[agg_func][TASKS_PARAMETER[task]["predicted_classes"][1]]) / (positive_obs + negative_obs) \
            if positive_obs > 0 and negative_obs > 0 else None
        return accuracy, positive_obs + negative_obs
    raise Exception("metric {} not supported".format(metric))
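
# Worked example (hypothetical counts) for a binary task whose predicted_classes
# are ["negative", "positive"]:
#
#   score_dic    = {"sum": {"negative": 40, "positive": 30}}  # correct predictions per observed class
#   n_tokens_dic = {"sum": {"negative": 50, "positive": 50}}  # observations per class
#
#   recall   = 30 / 50 = 0.6            # correct positives / positive observations
#   tnr      = 40 / 50 = 0.8            # correct negatives / negative observations
#   accuracy = (40 + 30) / 100 = 0.7
#
# precision additionally needs the "predicted_classes_pred_field" counts, i.e. how
# many tokens were *predicted* positive.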
def setup_repoting_location(root_dir_checkpoints, model_suffix="", shared_id=None, data_sharded=None, verbose=1):
    """
    Create an id for a model and the locations for its checkpoints, dictionaries,
    tensorboard logs and data.
    :param model_suffix:
    :param verbose:
    :return:
    """
    model_local_id = str(uuid4())[:5]
    if shared_id is not None:
        if len(shared_id) > 0:
            model_local_id = shared_id + "-" + model_local_id
    if model_suffix != "":
        model_local_id += "-" + model_suffix
    model_location = os.path.join(root_dir_checkpoints, model_local_id)
    dictionaries = os.path.join(root_dir_checkpoints, model_local_id, "dictionaries")
    tensorboard_log = os.path.join(root_dir_checkpoints, model_local_id, "tensorboard")
    end_predictions = os.path.join(root_dir_checkpoints, model_local_id, "predictions")
    os.mkdir(model_location)
    if data_sharded is None:
        data_sharded = os.path.join(root_dir_checkpoints, model_local_id, "shards")
        os.mkdir(data_sharded)
    else:
        assert os.path.isdir(data_sharded), "ERROR data_sharded not dir {} ".format(data_sharded)
        printing("INFO DATA already sharded in {}", var=[data_sharded], verbose=verbose, verbose_level=1)
    printing("CHECKPOINTING model location:{}", var=[model_location], verbose=verbose, verbose_level=1)
    printing("CHECKPOINTING model ID:{}", var=[model_local_id], verbose=verbose, verbose_level=1)
    os.mkdir(dictionaries)
    os.mkdir(tensorboard_log)
    os.mkdir(end_predictions)
    printing("CHECKPOINTING \n- {} for checkpoints \n- {} for dictionaries created \n- {} predictions {} ",
             var=[model_location, dictionaries, end_predictions, data_sharded],
             verbose_level=1, verbose=verbose)
    return model_local_id, model_location, dictionaries, tensorboard_log, end_predictions, data_sharded
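
# Resulting layout (sketch, for a hypothetical id "ab12c-run"):
#
#   <root_dir_checkpoints>/ab12c-run/
#       dictionaries/
#       tensorboard/
#       predictions/
#       shards/        # only created when data_sharded is not provided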
def print_align_bpe(source_preprocessed, gold, input_alignement_with_raw, labels_n_mask_prediction, verbose, verbose_level):
    if labels_n_mask_prediction is None:
        labels_n_mask_prediction = [[None for _ in range(len(sent))] for sent in input_alignement_with_raw]
    if isinstance(verbose, int) or verbose == "alignement":
        if verbose == "alignement" or verbose >= verbose_level:
            assert len(source_preprocessed) == len(gold)
            assert len(input_alignement_with_raw) == len(gold)
            for sent_src, sent_gold, index_match_with_src, append_masks in zip(
                    source_preprocessed, gold, input_alignement_with_raw, labels_n_mask_prediction):
                assert len(sent_src) == len(sent_gold)
                for src, gold_tok, index, masks in zip(sent_src, sent_gold, index_match_with_src, append_masks):
                    printing("{}:{} --> {} (n_masks {})", var=[index, src, gold_tok, masks],
                             verbose=1, verbose_level=1)
def get_optimizer(parameters, lr, optimizer="adam", betas=None, weight_decay=None, verbose=1):
    assert optimizer in AVAILABLE_OPTIMIZER, "ERROR optimizers supported are {} ".format(AVAILABLE_OPTIMIZER)
    if optimizer == "adam":
        if betas is None:
            betas = (0.9, 0.9)
            print("DEFAULT betas:", betas)
        if weight_decay is None:
            weight_decay = 0
        opt = torch.optim.Adam(parameters, lr=lr, betas=betas, eps=1e-9, weight_decay=weight_decay)
    elif optimizer == "SGD":
        assert betas is None, "ERROR betas not supported for optimizer {}".format(optimizer)
        opt = torch.optim.SGD(parameters, lr=lr)
    elif optimizer == "bahdanu-adadelta":
        assert betas is None, "ERROR betas not supported for optimizer {}".format(optimizer)
        opt = torch.optim.Adadelta(parameters, eps=10e-6, rho=0.95)
    elif optimizer == "AdamW":
        if weight_decay is None:
            weight_decay = 0
        opt = AdamW(parameters, lr=lr, weight_decay=weight_decay)
    printing("TRAINING : optimizer {} has been reloaded with lr {} betas {} ",
             var=[optimizer, lr, betas], verbose=verbose, verbose_level=2)
    return opt
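
# Minimal usage sketch (assuming torch is installed and AVAILABLE_OPTIMIZER
# contains "adam"):
#
#   import torch
#   model = torch.nn.Linear(10, 2)
#   opt = get_optimizer(model.parameters(), lr=5e-5, optimizer="adam")
#   # equivalent to torch.optim.Adam(..., lr=5e-5, betas=(0.9, 0.9), eps=1e-9, weight_decay=0)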
def build_shard(dir_shard, dir_file, n_sent_max_per_file, format="conll", dry_run=False, verbose=1):
    onlyfiles = [f for f in listdir(dir_shard) if isfile(join(dir_shard, f))]
    if len(onlyfiles) > 0:
        n_shards = len(onlyfiles)
        n_sents = 0
        for file in onlyfiles:
            n_sents += count_conll_n_sent(os.path.join(dir_shard, file))
        printing("INFO : shards already filled in {} files {} sentences total",
                 var=[n_shards, n_sents], verbose=1, verbose_level=1)
        return dir_shard, n_shards, n_sents
    assert format == "conll"
    assert len(dir_file) == 1, "ONLY 1 set of simultaneous tasks supported for sharding"
    printing("STARTING SHARDING {} of {} ".format(dir_shard, dir_file), verbose=verbose, verbose_level=1)
    dir_file = dir_file[0]
    n_sents = count_conll_n_sent(dir_file)
    n_shards = n_sents // n_sent_max_per_file
    if n_shards == 0:
        printing("INFO SHARDING : n_sent_max_per_file is larger than the number of sentences in {} so only building 1 shard",
                 var=[dir_file], verbose=verbose, verbose_level=1)
        n_shards += 1
    split_randomly(n_shards, dir_shard, dir_file, n_sents, dry_run=dry_run)
    sys.stdout.flush()
    printing("INFO SHARD n_sent written {} split in {} files with "
             "on average {} sent per file written to {}",
             var=[n_sents, n_shards, n_sent_max_per_file, dir_shard],
             verbose=verbose, verbose_level=1)
    return dir_shard, n_shards, n_sents
def get_normalized_token(norm_field, n_exception, verbose, predict_mode_only=False):
    match = re.match("^Norm=([^|]+)|.+", norm_field)
    try:
        assert match.group(1) is not None, " ERROR : no normalization found for norm_field {} ".format(norm_field)
        normalized_token = match.group(1)
    except Exception:
        match_double_bar = re.match("^Norm=([|]+)|.+", norm_field)
        if match_double_bar.group(1) is not None:
            match = match_double_bar
            n_exception += 1
            printing("Exception handled we match with {}".format(match_double_bar.group(1)),
                     verbose=verbose, verbose_level=2)
            normalized_token = match.group(1)
        else:
            exc = Exception("Failed to handle exception with | on field {} ".format(norm_field))
            if not predict_mode_only:
                raise exc
            else:
                print("REPLACING with UNK", exc)
                normalized_token = "UNK"
    return normalized_token, n_exception
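
# Example of the field format this parses: the 10th CoNLL column is expected to
# carry "Norm=<token>|...":
#
#   >>> import re
#   >>> re.match("^Norm=([^|]+)|.+", "Norm=hello|mwe_detection=_").group(1)
#   'hello'
#
# the fallback pattern "^Norm=([|]+)|.+" covers the degenerate case where the
# normalized token is itself made of "|" characters.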
def log_data_src_label_pred(src_detokenized_dic, predict_detokenize_dic, label_detokenized_dic, tasks, verbose, verbose_level):
    if isinstance(verbose, int) or verbose == "alignment":
        if verbose == "alignment" or verbose >= verbose_level:
            for task in [_task for _tasks in tasks for _task in _tasks]:
                input_name = TASKS_PARAMETER[task]["input"]
                label_name_ls = TASKS_PARAMETER[task]["label"]
                for ind_src_sent, src_sent in enumerate(src_detokenized_dic[input_name]):
                    print(" ")
                    for label in label_name_ls:
                        try:
                            assert len(predict_detokenize_dic[task + "-" + label][0][ind_src_sent]) == len(label_detokenized_dic[label][ind_src_sent]), \
                                "ERROR pred {} label {} ".format(predict_detokenize_dic[task + "-" + label][ind_src_sent],
                                                                 label_detokenized_dic[label][ind_src_sent])
                            assert len(src_detokenized_dic[input_name][ind_src_sent]) == len(label_detokenized_dic[label][ind_src_sent]), "ERROR "
                            for ind_src, src in enumerate(src_sent):
                                to_print = "SRC : {} , ".format(src) + " ".join([
                                    "PRED:{} GOLD:{} (label {})".format(
                                        predict_detokenize_dic[task + "-" + label][0][ind_src_sent][ind_src],
                                        label_detokenized_dic[label][ind_src_sent][ind_src],
                                        label) for label in label_name_ls])
                                printing(to_print, verbose=1, verbose_level=1)
                        except Exception as e:
                            print("ERROR : labels not aligned so cannot log ", e)
def train_predict_eval(args, verbose=0):
    init_seed(args)
    if args.bert_model in BERT_MODEL_DIC:
        model_dir = BERT_MODEL_DIC[args.bert_model]["model"] if args.bert_model else None
        encoder = BERT_MODEL_DIC[args.bert_model]["encoder"] if args.bert_model else None
    else:
        model_dir = None
        encoder = "AutoModel"
    if args.init_args_dir is not None:
        args_checkpoint = json.load(open(args.init_args_dir, "r"))
        args.bert_model = args_checkpoint["hyperparameters"]["bert_model"]
    # if the model is referenced in BERT_MODEL_DIC : use its tokenizer directory, otherwise load from Hugging Face
    if args.bert_model in BERT_MODEL_DIC:
        tokenizer = eval(BERT_MODEL_DIC[args.bert_model]["tokenizer"]) if args.bert_model else None
        voc_tokenizer = BERT_MODEL_DIC[args.bert_model]["vocab"] if args.bert_model else None
        vocab_size = BERT_MODEL_DIC[args.bert_model].get("vocab_size") if args.bert_model else None
    else:
        print("TOKENIZER Model not in BERT_MODEL_DIC so loading tokenizer from Hugging Face")
        tokenizer = AutoTokenizer
        voc_tokenizer = args.bert_model
        vocab_size = None
    null_token_index = vocab_size
    description = "grid"
    # We checkpoint the model only if early_stoppin_metric gets better;
    # early_stoppin_metric is chosen in relation to the first task defined in the list
    early_stoppin_metric, subsample_early_stoping_metric_val = get_early_stopping_metric(
        tasks=args.tasks, early_stoppin_metric=None, verbose=verbose)
    printing("INFO : tasks is {} so setting early_stoppin_metric to {} ",
             var=[args.tasks, early_stoppin_metric], verbose=verbose, verbose_level=1)
    printing("INFO : model {} batch_update_train {} batch_size {} ",
             var=[args.model_id_pref, args.batch_update_train, args.batch_size],
             verbose=verbose, verbose_level=1)
    run(args=args, voc_tokenizer=voc_tokenizer, vocab_size=vocab_size, model_dir=model_dir,
        report_full_path_shared=args.overall_report_dir, description=description,
        null_token_index=null_token_index, null_str=NULL_STR,
        model_suffix="{}".format(args.model_id_pref), debug=False,
        random_iterator_train=True, bucket_test=False, compute_intersection_score_test=True,
        n_observation_max_per_epoch_train=args.n_iter_max_train if not args.demo_run else 2,
        n_observation_max_per_epoch_dev_test=50000 if not args.demo_run else 2,
        early_stoppin_metric=early_stoppin_metric,
        subsample_early_stoping_metric_val=subsample_early_stoping_metric_val,
        saving_every_epoch=args.saving_every_n_epoch,
        run_mode="train" if args.train else "test",
        auxilliary_task_norm_not_norm=True, tokenizer=tokenizer, max_token_per_batch=300,
        name_with_epoch=args.name_inflation, encoder=encoder, report=True, verbose=verbose)
    printing("MODEL {} trained and evaluated", var=[args.model_id_pref], verbose_level=1, verbose=verbose)
def get_early_stopping_metric(tasks, verbose, main_task=None, early_stoppin_metric=None, subsample_early_stoping_metric_val=None):
    """
    Get the early-stopping metric and its evaluation subsample.
    If early_stoppin_metric is None : uses the first eval_metrics stated in TASKS_PARAMETER
    for the first task of the list passed in args.tasks.
    :return:
    """
    if main_task is None:
        printing("INFO : default main task provided is the first of the first list {} ",
                 var=[tasks], verbose=verbose, verbose_level=1)
        if isinstance(tasks[0], list):
            main_task = tasks[0][0]
        else:
            main_task = tasks[0]
    if early_stoppin_metric is None:
        early_stoppin_metric = TASKS_PARAMETER[main_task]["eval_metrics"][0][0]
        printing("INFO : default early_stoppin_metric is {} : the first one possible in TASKS_PARAMETER",
                 var=[early_stoppin_metric], verbose=verbose, verbose_level=1)
    if subsample_early_stoping_metric_val is None:
        get_subsample = TASKS_PARAMETER[main_task].get("default-subsample")
        if get_subsample is None:
            get_subsample = "all"
            printing("INFO : early stopping subsample is set to default {} as not found in {}",
                     var=["all", TASKS_PARAMETER[main_task]], verbose=verbose, verbose_level=1)
        subsample_early_stoping_metric_val = get_subsample
    assert subsample_early_stoping_metric_val in TASKS_PARAMETER[main_task]["subsample-allowed"], \
        "ERROR task {} : subsample {} not allowed".format(main_task, subsample_early_stoping_metric_val)
    # sanity_check_early_stop_metric(early_stoppin_metric, TASKS_PARAMETER, tasks)
    return early_stoppin_metric, subsample_early_stoping_metric_val
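
# Behaviour sketch with a hypothetical TASKS_PARAMETER entry:
#
#   TASKS_PARAMETER = {"pos": {"eval_metrics": [["accuracy-pos"]],
#                              "default-subsample": "all",
#                              "subsample-allowed": ["all"]}}
#
#   get_early_stopping_metric(tasks=[["pos"]], verbose=1)
#   # -> ("accuracy-pos", "all")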
def log_warning(counting_failure_parralel_bpe_batch, data_label, batch_i, batch, noisy_under_splitted,
                skipping_batch_n_to_1, aligned, noisy_over_splitted, skip_1_t_n, skipping_evaluated_batch, verbose):
    printing("WARNING {} alignment failures caused by parallel ",
             var=[counting_failure_parralel_bpe_batch], verbose=verbose, verbose_level=1)
    printing("WARNING on {} : Out of {} batches of X sentences each, {} skipped "
             "({} batches aligned ; {} with at least 1 sentence noisy MORE SPLITTED ; "
             "{} with LESS SPLITTED {} + SENT with skipped_1_to_n : {}) ",
             var=[data_label, batch_i, noisy_under_splitted + skipping_batch_n_to_1, aligned,
                  noisy_over_splitted, noisy_under_splitted,
                  "SKIPPED" if skip_1_t_n else "", skipping_batch_n_to_1],
             verbose=verbose, verbose_level=1)
    printing("WARNING on {} ON THE EVALUATION SIDE we skipped extra {} batch ",
             var=[data_label, skipping_evaluated_batch], verbose_level=1, verbose=verbose)
def input_normalization_processing(task_normalize_is, batch, norm_2_noise_training, norm_2_noise_eval):
    norm2noise_bool = False
    if (norm_2_noise_training is not None or norm_2_noise_eval) and task_normalize_is:
        portion_norm2noise = norm_2_noise_training if norm_2_noise_training is not None else 1.
        norm_2_noise_training = portion_norm2noise is not None
        rand = np.random.uniform(low=0, high=1, size=1)[0]
        norm2noise_bool = portion_norm2noise >= rand
        if norm2noise_bool:
            batch_raw_input = preprocess_batch_string_for_bert(batch.raw_output)
            printing("WARNING : input is gold norm", verbose_level=2, verbose=1)
        else:
            printing("WARNING : input is input", verbose_level=2, verbose=1)
            batch_raw_input = preprocess_batch_string_for_bert(batch.raw_input)
    else:
        printing("WARNING : input is input ", verbose_level=2, verbose=1)
        batch_raw_input = preprocess_batch_string_for_bert(batch.raw_input)
    return batch_raw_input, norm2noise_bool, norm_2_noise_training
def logging_processing_data(_verbose, verbose, verbose_level, batch_raw_input, input_tokens_tensor,
                            batch_raw_output, output_tokens_tensor, inp_bpe_tokenized, out_bpe_tokenized):
    printing("DATA : pre-tokenized input {} ", var=[batch_raw_input], verbose_level=verbose_level, verbose=_verbose)
    printing("DATA : BPE tokenized input ids {}", var=[input_tokens_tensor], verbose_level=3, verbose=verbose)
    printing("DATA : pre-tokenized output {} ", var=[batch_raw_output], verbose_level=verbose_level, verbose=_verbose)
    printing("DATA : BPE tokenized output ids {}", var=[output_tokens_tensor], verbose_level=4, verbose=verbose)
    # BPE
    printing("DATA : BPE tokenized input {}", var=[inp_bpe_tokenized], verbose_level=4, verbose=_verbose)
    printing("DATA : BPE tokenized output {}", var=[out_bpe_tokenized], verbose_level=4, verbose=_verbose)
def get_indexes(list_pretokenized_str, tokenizer, verbose, use_gpu, word_norm_not_norm=None):
    """
    From pretokenized strings : bpe-tokenize them using the BERT `tokenizer`
    and convert them to token ids.
    :param list_pretokenized_str:
    :param tokenizer:
    :param verbose:
    :param use_gpu:
    :return:
    """
    all_tokenized_ls = [tokenizer.tokenize_origin(inp) for inp in list_pretokenized_str]
    tokenized_ls = [tup[0] for tup in all_tokenized_ls]
    aligned_index = [tup[1] for tup in all_tokenized_ls]
    segments_ids = [[0 for _ in range(len(tokenized))] for tokenized in tokenized_ls]
    printing("DATA : bpe tokenized {} , {} {} ",
             var=[tokenized_ls, len(tokenized_ls), len(tokenized_ls[0])],
             verbose=verbose, verbose_level="raw_data")
    printing("DATA : bpe tokenized {} , {} {} ",
             var=[tokenized_ls, len(tokenized_ls), len(tokenized_ls[0])],
             verbose=verbose, verbose_level="alignement")
    ids_ls = [tokenizer.convert_tokens_to_ids(inp) for inp in tokenized_ls]
    max_sent_len = max([len(inp) for inp in tokenized_ls])
    ids_padded = [inp + [PAD_ID_BERT for _ in range(max_sent_len - len(inp))] for inp in ids_ls]
    aligned_index_padded = [[e for e in inp] + [1000 for _ in range(max_sent_len - len(inp))]
                            for inp in aligned_index]
    segments_padded = [inp + [PAD_ID_BERT for _ in range(max_sent_len - len(inp))] for inp in segments_ids]
    if word_norm_not_norm is not None:
        mask = mask_group(word_norm_not_norm, bpe_aligned_index=aligned_index_padded)
    else:
        mask = [[1 for _ in inp] + [0 for _ in range(max_sent_len - len(inp))] for inp in segments_ids]
    mask = torch.LongTensor(mask)
    tokens_tensor = torch.LongTensor(ids_padded)
    segments_tensors = torch.LongTensor(segments_padded)
    if use_gpu:
        mask = mask.cuda()
        tokens_tensor = tokens_tensor.cuda()
        segments_tensors = segments_tensors.cuda()
    printing("DATA {}", var=[tokens_tensor], verbose=verbose, verbose_level=3)
    sanity_check_data_len(tokens_tensor, segments_tensors, tokenized_ls, aligned_index, raising_error=True)
    return tokens_tensor, segments_tensors, tokenized_ls, aligned_index_padded, mask
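
# Padding sketch (hypothetical 2-sentence batch): sentences are padded to the
# longest bpe length with PAD_ID_BERT, the alignment index with 1000, and the
# attention mask with 0:
#
#   tokenized_ls         = [["hel", "##lo"], ["hi"]]
#   aligned_index_padded -> [[0, 0], [0, 1000]]
#   mask                 -> [[1, 1], [1, 0]]    (when word_norm_not_norm is None)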
def outputing_raw_data_from_iterator(words, word_norm, chars, chars_norm, word_norm_not_norm, pos,
                                     verbose, print_raw, normalization, char_dictionary,
                                     word_dictionary, word_norm_dictionary, pos_dictionary):
    """
    Print real data on the fly for debugging, data sanity checks, ...
    TODO : may factorize a few things here
    """
    _verbose = verbose if isinstance(verbose, int) else 0
    if print_raw:
        _verbose = 5
    if _verbose >= 5:
        if word_norm_not_norm is not None:
            character_display = [
                " ".join([char_dictionary.get_instance(chars[sent, word_ind, char_i])
                          for char_i in range(chars.size(2))])
                + " | NORM : {} |SENT {} WORD {}| ".format(word_norm_not_norm[sent, word_ind], sent, word_ind)
                for ind_sent, sent in enumerate(range(chars.size(0)))
                for ind_w, word_ind in enumerate(range(chars.size(1)))]
        else:
            character_display = [
                " ".join([char_dictionary.get_instance(chars[sent, word_ind, char_i])
                          for char_i in range(chars.size(2))])
                for ind_sent, sent in enumerate(range(chars.size(0)))
                for ind_w, word_ind in enumerate(range(chars.size(1)))]
        if word_norm is not None:
            assert word_norm_dictionary is not None
            word_norm_display = " ".join([word_norm_dictionary.get_instance(word_norm[sent, word_ind])
                                          for word_ind in range(word_norm.size(1))
                                          for sent in range(word_norm.size(0))])
        else:
            print("No word level normalized word (only char)")
            word_norm_display = ["NONE"]
        word_display = [word_dictionary.get_instance(words[batch, word_ind]) + " "
                        for batch in range(chars.size(0)) for word_ind in range(chars.size(1))]
        if pos_dictionary is not None:
            pos_display = [pos_dictionary.get_instance(pos[batch, 0]) + " " for batch in range(chars.size(0))]
        else:
            pos_display = None
    else:
        word_display = []
        character_display = []
        pos_display = []
    if not normalization and chars is not None:
        chars_norm = chars.clone()
        # TODO add word_norm
    if _verbose >= 5:
        if word_norm_not_norm is not None:
            character_norm_display = [
                " ".join([char_dictionary.get_instance(chars_norm[sent, word_ind, char_i])
                          for char_i in range(chars_norm.size(2))])
                + "| NORM : {} |SENT {} WORD {}| \n ".format(word_norm_not_norm[sent, word_ind], sent, word_ind)
                for ind_sent, sent in enumerate(range(chars_norm.size(0)))
                for ind_w, word_ind in enumerate(range(chars_norm.size(1)))]
        else:
            character_norm_display = [
                " ".join([char_dictionary.get_instance(chars_norm[sent, word_ind, char_i])
                          for char_i in range(chars_norm.size(2))])
                for ind_sent, sent in enumerate(range(chars_norm.size(0)))
                for ind_w, word_ind in enumerate(range(chars_norm.size(1)))]
        printing("Feeding source characters {} \n ------ Target characters {} "
                 "(NB : the character vocabulary is the same at input and output)",
                 var=(character_display, character_norm_display), verbose=_verbose, verbose_level=5)
        printing("Feeding source words {} ", var=[word_display], verbose=_verbose, verbose_level=5)
        printing("Feeding Word normalized (word level) {}", var=[word_norm_display], verbose=_verbose, verbose_level=5)
        printing("Feeding source pos {} ", var=[pos_display], verbose=_verbose, verbose_level=5)
    if chars is not None and chars_norm is not None:
        printing("TYPE {} char before batch chars_norm {} ", var=(chars.is_cuda, chars_norm.is_cuda),
                 verbose=verbose, verbose_level=5)
def make_bert_multitask(pretrained_model_dir, tasks, num_labels_per_task, init_args_dir, mask_id, encoder=None, args=None):
    assert num_labels_per_task is not None and isinstance(num_labels_per_task, dict), \
        "ERROR : num_labels_per_task {} should be a dictionary".format(num_labels_per_task)
    assert isinstance(tasks, list) and len(tasks) >= 1, "ERROR tasks {} should be a list of len >=1".format(tasks)
    if init_args_dir is None:
        if pretrained_model_dir is None:
            pretrained_model_dir = args.bert_model
        # assert args.output_attentions is None or not args.output_attentions, "ERROR not supported "
        multitask_wrapper = BertMultiTask

        def get_state_dict_mapping(model):
            if model.startswith("xlm") or model.startswith("rob") or model.startswith("camembert"):
                return {"roberta": "encoder",
                        "lm_head.decoder": "head.mlm.predictions.decoder",
                        "lm_head.dense": "head.mlm.predictions.transform.dense",
                        "lm_head.bias": "head.mlm.predictions.bias",
                        "lm_head.layer_norm": "head.mlm.predictions.transform.LayerNorm"}
            elif model.startswith("bert") or model.startswith("cahya") or model.startswith("KB"):
                return {"bert": "encoder", "cls": "head.mlm"}
            elif model.startswith("asafaya"):
                return {"bert": "encoder", "cls": "head.mlm"}
            else:
                raise Exception(f"{model} not supported by {multitask_wrapper} : a state dict mapping needs to be defined")

        state_dict_mapping = get_state_dict_mapping(args.bert_model)
        model = multitask_wrapper.from_pretrained(
            pretrained_model_dir, tasks=tasks, mask_id=mask_id,
            output_attentions=args.output_attentions,
            output_hidden_states=args.output_all_encoded_layers,
            output_hidden_states_per_head=args.output_hidden_states_per_head,
            hard_skip_attention_layers=args.hard_skip_attention_layers,
            hard_skip_all_layers=args.hard_skip_all_layers,
            hard_skip_dense_layers=args.hard_skip_dense_layers,
            num_labels_per_task=num_labels_per_task,
            mapping_keys_state_dic=state_dict_mapping,  # DIR_2_STAT_MAPPING[pretrained_model_dir]
            encoder=eval(encoder) if encoder is not None else BertModel,
            dropout_classifier=args.dropout_classifier,
            hidden_dropout_prob=args.hidden_dropout_prob,
            random_init=args.random_init,
            load_params_only_ls=None,
            not_load_params_ls=args.not_load_params_ls)
    elif init_args_dir is not None:
        assert pretrained_model_dir is not None, "ERROR model_dir is needed here for reloading"
        init_args_dir = get_init_args_dir(init_args_dir)
        args_checkpoint = json.load(open(init_args_dir, "r"))
        assert "checkpoint_dir" in args_checkpoint, "ERROR checkpoint_dir not in {} ".format(args_checkpoint)
        checkpoint_dir = args_checkpoint["checkpoint_dir"]
        assert os.path.isfile(checkpoint_dir), "ERROR checkpoint {} not found ".format(checkpoint_dir)

        # redefining model and reloading
        def get_config_bert(bert_model, config_file_name="bert_config.json"):
            model_dir = BERT_MODEL_DIC[bert_model]["model"]
            config_file = os.path.join(model_dir, config_file_name)
            if not os.path.isfile(config_file):
                config_file = os.path.join(model_dir, "config.json")
            assert os.path.isfile(config_file), "ERROR {} not found in {} ".format(config_file, model_dir)
            return config_file

        config_file = get_config_bert(args_checkpoint["hyperparameters"]["bert_model"])
        encoder = eval(BERT_MODEL_DIC[args_checkpoint["hyperparameters"]["bert_model"]]["encoder"])
        config = BertConfig(config_file,
                            output_attentions=args.output_attentions,
                            output_hidden_states=args.output_all_encoded_layers,
                            output_hidden_states_per_head=args.output_hidden_states_per_head)
        # config.vocab_size = 119547
        model = BertMultiTask(config=config,
                              tasks=[task for tasks in args_checkpoint["hyperparameters"]["tasks"] for task in tasks],
                              num_labels_per_task=args_checkpoint["info_checkpoint"]["num_labels_per_task"],
                              encoder=encoder, mask_id=mask_id)
        printing("MODEL : loading model from checkpoint {}", var=[checkpoint_dir], verbose=1, verbose_level=1)
        model.load_state_dict(torch.load(checkpoint_dir, map_location=lambda storage, loc: storage))
        model.append_extra_heads_model(downstream_tasks=tasks, num_labels_dic_new=num_labels_per_task)
    else:
        raise Exception("only one of pretrained_model_dir / checkpoint_dir can be defined ")
    return model
def focused_masking(masking_strategy, input_tokens_tensor, output_tokens_tensor_aligned, dropout_input_bpe,
                    mask_token_index, sep_token_index, use_gpu, epoch, n_epoch, portion_mask,
                    input_mask, tokenizer, verbose):
    if masking_strategy in ["mlm", "mlm_need_norm"]:
        dropout = 0.15
        assert dropout_input_bpe == 0., "in args.masking_strategy mlm we hardcode dropout to {}".format(dropout)
        # standart_mlm means : standard MLM prediction
        standart_mlm = True
        # unmask_loss : do we also optimize on tokens other than the MASKed ones
        unmask_loss = portion_mask
        if masking_strategy == "mlm_need_norm":
            # mlm_need_norm strategy : a portion_mask fraction of the time we learn as a standard mlm,
            # the rest of the time we do the same but only on need_norm tokens (masking them)
            standart_mlm = np.random.random() < portion_mask
            # we force unmask_loss to 0
            unmask_loss = 0
        if standart_mlm:
            # standard mlm
            input_tokens_tensor, mask_dropout, dropout_applied = dropout_input_tensor(
                input_tokens_tensor, mask_token_index, sep_token_index=sep_token_index,
                applied_dropout_rate=0.8, dropout=dropout)
        elif masking_strategy == "mlm_need_norm" and not standart_mlm:
            # todo : factorize
            feeding_the_model_with_label = output_tokens_tensor_aligned.clone()
            # we only learn on tokens that are different from gold
            feeding_the_model_with_label[input_tokens_tensor == output_tokens_tensor_aligned] = -1
            if np.random.random() < 0.85:
                # 85% of the time we mask the tokens as standard mlm
                input_tokens_tensor[input_tokens_tensor != output_tokens_tensor_aligned] = mask_token_index
            else:
                # within the 15% rest : 50% of the time we replace by a random token, 50% we keep as is
                if np.random.random() < 0.5:
                    permute = (torch.randperm(torch.tensor(len(tokenizer.vocab) - 2))
                               [:len(input_tokens_tensor[input_tokens_tensor != output_tokens_tensor_aligned])] + 1)
                    permute[permute == sep_token_index] = sep_token_index + 10
                    permute[permute == mask_token_index] = mask_token_index + 10
                    permute[permute == 0] = 53
                    if use_gpu:
                        permute = permute.cuda()
                    input_tokens_tensor[input_tokens_tensor != output_tokens_tensor_aligned] = permute
            mask_dropout = (input_tokens_tensor == output_tokens_tensor_aligned)
        if standart_mlm and not dropout_applied:
            random_bpe_instead = np.random.random() < 0.5
            if random_bpe_instead:
                permute = (torch.randperm(torch.tensor(len(tokenizer.vocab) - 2))
                           [:len(input_tokens_tensor[mask_dropout == 0])] + 1)
                permute[permute == sep_token_index] = sep_token_index + 10
                permute[permute == mask_token_index] = mask_token_index + 10
                permute[permute == 0] = 53
                if use_gpu:
                    permute = permute.cuda()
                input_tokens_tensor[mask_dropout == 0] = permute
        if unmask_loss:
            print("WARNING : unmask_loss is {} : 0 means only optimizing on the MASK , > 0 means also optimizing "
                  "on some other tokens sampled based on dropout_adapted".format(unmask_loss))
            power = 3
            capped = 0.5
            dropout_adapted = min(((epoch + 1) / n_epoch) ** power, capped)
            printing("LABEL NOT MASKING {}/1 of gold labels with power {} and capped {}".format(dropout_adapted, power, capped),
                     verbose=verbose, verbose_level=2)
            _, mask_losses = dropout_input_tensor(input_tokens_tensor, mask_token_index,
                                                  sep_token_index=sep_token_index,
                                                  apply_dropout=False, dropout=dropout_adapted)
            # we backpropagate only on tokens that receive a mask (MLM objective) +
            # some extra ones that we control with dropout_adapted
            mask_loss = mask_dropout * mask_losses
        else:
            mask_loss = mask_dropout
        feeding_the_model_with_label = output_tokens_tensor_aligned.clone()
        feeding_the_model_with_label[mask_loss != 0] = -1
    elif masking_strategy in ["norm_mask", "norm_mask_variable"]:
        if masking_strategy == "norm_mask_variable":
            # args.portion_mask = min(((epoch + 1) / n_epoch), 0.6)
            portion_mask = 1 - (epoch + 1) / n_epoch
        mask_normed = np.random.random() < portion_mask
        feeding_the_model_with_label = output_tokens_tensor_aligned.clone()
        if mask_normed:
            print("MASKING NORMED in mode {} portion mask {}".format(masking_strategy, portion_mask))
            feeding_the_model_with_label[input_tokens_tensor == output_tokens_tensor_aligned] = -1
            if np.random.random() < 0.5:
                # half the time we actually mask those tokens, otherwise we predict them unmasked
                # (so that the model does not only learn to normalize)
                input_tokens_tensor[input_tokens_tensor != output_tokens_tensor_aligned] = mask_token_index
    else:
        feeding_the_model_with_label = output_tokens_tensor_aligned.clone()
    # TODO -- handle logging of output_tokens_tensor_aligned everywhere
    printing("MASK mask:{} \nMASK input:{} \nMASK output:{}",
             var=[input_mask, input_tokens_tensor, feeding_the_model_with_label],
             verbose_level="raw_data", verbose=verbose)
    return input_tokens_tensor, feeding_the_model_with_label
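
# Note : taken together the "mlm" branch appears to reproduce the BERT-style corruption
# scheme : roughly 80% of the selected positions are replaced by the mask token
# (applied_dropout_rate=0.8 in dropout_input_tensor), and when no dropout was applied
# the remainder is split between a random vocabulary id and the unchanged token
# (random_bpe_instead, p=0.5).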
def args_preprocessing(args, verbose=1):
    """
    Sanity checking, changing types of arguments and parsing arguments.
    """
    args.tasks = [task_simul.split(",") for task_simul in args.tasks]
    if args.hard_skip_dense_layers is None or args.hard_skip_dense_layers == "None":
        args.hard_skip_dense_layers = []
    else:
        args.hard_skip_dense_layers = args.hard_skip_dense_layers.split(",")
        assert len(args.hard_skip_dense_layers) > 0
    if args.hard_skip_attention_layers is None or args.hard_skip_attention_layers == "None":
        args.hard_skip_attention_layers = []
    else:
        args.hard_skip_attention_layers = args.hard_skip_attention_layers.split(",")
        assert len(args.hard_skip_attention_layers) > 0
    if args.hard_skip_all_layers is None or args.hard_skip_all_layers == "None":
        args.hard_skip_all_layers = []
    else:
        args.hard_skip_all_layers = args.hard_skip_all_layers.split(",")
        assert len(args.hard_skip_all_layers) > 0
    if args.prune_heads is not None and args.prune_heads != "None":
        prune_heads_ls = args.prune_heads.split(",")[:-1]
        assert len(prune_heads_ls) > 0
        for layer in prune_heads_ls:
            parsed_layer_to_prune = layer.split("-")
            assert parsed_layer_to_prune[0] == "prune_heads", \
                f"ERROR {parsed_layer_to_prune} layer arg: {layer} prune_heads_ls {prune_heads_ls} args.prune_heads {args.prune_heads}"
            assert parsed_layer_to_prune[1] == "layer", f"ERROR {parsed_layer_to_prune}"
            assert parsed_layer_to_prune[3] == "heads", f"ERROR {parsed_layer_to_prune}"
            try:
                int(parsed_layer_to_prune[2])
                heads = parsed_layer_to_prune[4]
                head_index_ls = heads.split("_")
                heads_ls = [int(index) for index in head_index_ls]
            except Exception as e:
                print(f"Error parsing prune_heads argument {e}")
    if isinstance(args.schedule_lr, str) and args.schedule_lr == "None":
        args.schedule_lr = None
    if args.batch_size != "flexible":
        args.batch_size = int(args.batch_size)
    if args.low_memory_foot_print_batch_mode is not None and args.low_memory_foot_print_batch_mode != "flexible_forward_batch_size":
        args.low_memory_foot_print_batch_mode = int(args.low_memory_foot_print_batch_mode)
    low_memory_foot_print_batch_mode_available = [0, 1, "flexible_forward_batch_size"]
    if args.not_load_params_ls is not None:
        args.not_load_params_ls = args.not_load_params_ls.split(",")[:-1]
    assert args.low_memory_foot_print_batch_mode is None or args.low_memory_foot_print_batch_mode in low_memory_foot_print_batch_mode_available, \
        "ERROR args.low_memory_foot_print_batch_mode {} should be in {}".format(
            args.low_memory_foot_print_batch_mode, low_memory_foot_print_batch_mode_available)
    if args.low_memory_foot_print_batch_mode:
        args.batch_update_train = args.batch_size
        args.batch_size = "flexible" if args.low_memory_foot_print_batch_mode == "flexible_forward_batch_size" else 2
        printing("INFO : args.low_memory_foot_print_batch_mode {} "
                 "so setting batch_size to {} and args.batch_update_train {}",
                 var=[args.low_memory_foot_print_batch_mode, args.batch_size, args.batch_update_train],
                 verbose=verbose, verbose_level=1)
        if args.low_memory_foot_print_batch_mode != "flexible_forward_batch_size":
            assert isinstance(args.batch_update_train // args.batch_size, int) and args.batch_update_train // args.batch_size > 0, \
                "ERROR batch_update_train {} should be a positive multiple of batch_size {} ".format(args.batch_update_train, args.batch_size)
            printing("INFO iterator : updating with {} equivalent batch size : forward pass is {} batch size",
                     var=[args.batch_update_train, args.batch_size], verbose=verbose, verbose_level=1)
    else:
        args.batch_update_train = args.batch_size
    params = vars(args)
    args.lr = parse_argument_dictionary(params["lr"], hyperparameter="lr")
    if args.test_paths is not None:
        args.test_paths = [test_path_task.split(",") for test_path_task in args.test_paths]
    if args.dev_path is not None:
        args.dev_path = [dev_path_task.split(",") for dev_path_task in args.dev_path]
    if args.ponderation_per_layer is not None:
        args.ponderation_per_layer = parse_argument_dictionary(params["ponderation_per_layer"],
                                                               hyperparameter="ponderation_per_layer")
    if args.norm_order_per_layer is not None:
        args.norm_order_per_layer = parse_argument_dictionary(params["norm_order_per_layer"],
                                                              hyperparameter="norm_order_per_layer")
    if args.test_paths is not None:
        assert isinstance(args.test_paths, list) and isinstance(args.test_paths[0], list), \
            "ERROR args.test_paths should be a list of lists"
    # 1 simultaneous set of tasks per training dataset
    assert len(args.tasks) == len(args.train_path), \
        "ERROR args.tasks is {} but train paths are {}".format(args.tasks, args.train_path)
    assert args.penalization_mode in AVAILALE_PENALIZATION_MODE, \
        "ERROR args.penalization_mode {} should be in {}".format(args.penalization_mode, AVAILALE_PENALIZATION_MODE)
    if args.multi_task_loss_ponderation is not None:
        argument_as_string = args.multi_task_loss_ponderation
        assert args.tasks is not None
        tasks = [task for tasks in args.tasks for task in tasks]
        # should add test on task X label calling task setting
        for task in tasks:
            if task != "all":
                for label in TASKS_PARAMETER[task]["label"]:
                    pattern = "{}-{}=([^=]*),".format(task, label)
                    match = re.search(pattern, argument_as_string)
                    assert match is not None, \
                        "ERROR : pattern {} not found for task {} in argument_as_string {} ".format(pattern, task, argument_as_string)
    if args.bert_model is not None:
        try:
            assert args.bert_model in BERT_MODEL_DIC, \
                "ERROR args.bert_model {} should be in {}".format(args.bert_model, BERT_MODEL_DIC.keys())
        except Exception as e:
            print(f"Will load model and tokenization from transformers {e}")
    return args
def from_bpe_token_to_str(bpe_tensor, topk, pred_mode, null_token_index, null_str, task,
                          tokenizer=None, bpe_tensor_src=None, pos_dictionary=None,
                          label="normalize", label_dictionary=None, mask_index=None,
                          get_bpe_string=False, verbose=1):
    """
    Convert a tensor of bpe token ids (or pos/label ids) back to strings.
    pred_mode also allows handling gold data (which has only 2 dims, not 3).
    :param bpe_tensor:
    :param topk: int : number of top predictions : arranged as all the top1, all the 2nd, all the third...
    :param pred_mode: bool
    :return:
    """
    assert label is not None or get_bpe_string, \
        "ERROR : task {} get_string {} : one of them should be defined or True".format(label, get_bpe_string)
    if task == "mlm" and pred_mode:
        assert bpe_tensor_src is not None and mask_index is not None, \
            "ERROR bpe_tensor_src is needed to get non-predicted tokens as well as mask_index "
        predictions_topk_ls = [[[bpe_tensor[sent, word, top].item()
                                 if bpe_tensor_src[sent, word].item() == mask_index
                                 else bpe_tensor_src[sent, word].item()
                                 for word in range(bpe_tensor.size(1))]
                                for sent in range(bpe_tensor.size(0))]
                               for top in range(topk)]
    else:
        predictions_topk_ls = [[[bpe_tensor[sent, word, top].item() if pred_mode else bpe_tensor[sent, word].item()
                                 for word in range(bpe_tensor.size(1))]
                                for sent in range(bpe_tensor.size(0))]
                               for top in range(topk)]
    # here all labels that require the tokenizer (should factorize it in some way)
    if get_bpe_string:
        # label in ["normalize", "mwe_prediction", "input_masked"]
        assert tokenizer is not None
        # requires task-specific handling here : for mlm the only predictions we are interested in are
        # the masked ones ; special_extra_token=null_token_index, special_token_string=null_str
        sent_ls_top = [[tokenizer.convert_ids_to_tokens(sent_bpe) for sent_bpe in predictions_topk]
                       for predictions_topk in predictions_topk_ls]
        printing("DATA : bpe string again {}", var=[sent_ls_top], verbose=verbose, verbose_level="raw_data")
    else:
        dictionary = label_dictionary
        if label_dictionary == "index":
            sent_ls_top = [[[token_ind for token_ind in sent_bpe] for sent_bpe in predictions_topk]
                           for predictions_topk in predictions_topk_ls]
        else:
            try:
                sent_ls_top = [[[dictionary.instances[token_ind - 1] if token_ind > 0 else "UNK"
                                 for token_ind in sent_bpe]
                                for sent_bpe in predictions_topk]
                               for predictions_topk in predictions_topk_ls]
            except Exception as e:
                # adding more information about the exception
                print("{} : dictionary : {} and prediction {} (POS specificity was removed )".format(
                    e, dictionary.instances, predictions_topk_ls))
                raise e
    if not pred_mode:
        sent_ls_top = sent_ls_top[0]
    return sent_ls_top
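
# Output layout sketch : predictions are grouped by top-k rank first, i.e. for
# topk=2 and a batch of 2 sentences the result is
#
#   [[sent0_top1, sent1_top1],
#    [sent0_top2, sent1_top2]]
#
# where each sent*_top* is a list of one string (or id) per bpe position ; when
# pred_mode is False only the first group is returned.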
def write_conll(format, dir_normalized, dir_original, src_text_ls, text_decoded_ls, src_text_pos,
                pred_pos_ls, tasks, inverse=False, permuting_mode=None, cp_paste=False,
                sep_token=None, cls_token=None, ind_batch=0, new_file=False, cut_sent=False, verbose=0):
    assert format in ["conll"]
    # assert len(tasks) == 1, "ERROR : only 1 task at a time supported so far"
    if tasks[0] == "normalize":
        src_ls = src_text_ls
        pred_ls = text_decoded_ls
        if text_decoded_ls is None:
            assert permuting_mode is not None or cp_paste
            pred_ls = src_text_ls
    elif tasks[0] == "pos":
        src_ls = src_text_pos
        pred_ls = pred_pos_ls
    if format == "conll":
        mode_write = "w" if new_file else "a"
        if new_file:
            printing("CREATING NEW FILE (io_/dat/normalized_writer) : {} ", var=[dir_normalized],
                     verbose=verbose, verbose_level=1)
        with open(dir_normalized, mode_write) as norm_file:
            with open(dir_original, mode_write) as original:
                len_original = 0
                for ind_sent, (original_sent, normalized_sent) in enumerate(zip(src_ls, pred_ls)):
                    try:
                        assert len(original_sent) == len(normalized_sent), \
                            "WARNING : (writer) original_sent len {} {} \n " \
                            "normalized_sent len {} {} ".format(len(original_sent), original_sent,
                                                                len(normalized_sent), normalized_sent)
                    except AssertionError as e:
                        print(e)
                        if len(original_sent) > len(normalized_sent):
                            normalized_sent.extend(["UNK" for _ in range(len(original_sent) - len(normalized_sent))])
                            print("WARNING (writer) : original larger than prediction : so appending UNK tokens for writing")
                        else:
                            print("WARNING (writer) : original smaller than prediction ! ")
                    norm_file.write("#\n")
                    original.write("#\n")
                    norm_file.write("#sent_id = {} \n".format(ind_sent + ind_batch + 1))
                    original.write("#sent_id = {} \n".format(ind_sent + ind_batch + 1))
                    ind_adjust = 0
                    if permuting_mode == "sample_mode":
                        noise_level_sentence = np.random.random(1)[0]
                    for ind, (original_token, normalized_token) in enumerate(zip(original_sent, normalized_sent)):
                        # WE REMOVE SPECIAL TOKENS ONLY IF THEY APPEAR AT THE BEGINNING OR AT THE END,
                        # based on the source token (it tells us when we stop : we never want to use gold information)
                        max_len_word = max(len(original_token), len_original)
                        if (original_token in SPECIAL_TOKEN_LS or original_token in [cls_token, sep_token]) \
                                and (ind + 1 == len(original_sent) or ind == 0):
                            ind_adjust = 1
                            continue
                        if permuting_mode == "sample_mode":
                            # for 20% of the sentences we apply an 80% word-level noise rate,
                            # for the other 80% of sentences only a 20% rate
                            rand_word = np.random.random(1)[0]
                            threshold_word = 0.8 if noise_level_sentence < 0.2 else 0.2
                            if rand_word < threshold_word:
                                permuting_mode = np.random.choice(["permute", "double", "random_replace", "multiply_last",
                                                                   "double_last", "remove", "remove_last", "z_replace_s"])
                            else:
                                rand_word = None
                        # TODO : when we want simultaneous training : assert src_pos and src_norm are the same
                        #  --> assert pred_pos and pred_norm have the same length (number of words) and write
                        if tasks[0] == "normalize":
                            if inverse:
                                assert not cp_paste
                                _original_token = normalized_token
                                _normalized_token = original_token
                            else:
                                _original_token = original_token
                                _normalized_token = normalized_token
                            if permuting_mode is not None:
                                assert not cp_paste
                                if (_original_token == _normalized_token
                                        or _original_token.lower() == _normalized_token.lower()) \
                                        and not (original_token.startswith("#") or original_token.startswith("@")):
                                    # rule 1
                                    if permuting_mode == "z_replace_s" and len(original_token) > 1:
                                        if original_token.endswith("s"):
                                            _original_token = original_token[:-1] + "z"
                                        else:
                                            permuting_mode = np.random.choice(["permute", "double", "random_replace",
                                                                               "remove", "remove_last", "multiply_last",
                                                                               "double_last", "z_replace_s"])
                                    if permuting_mode == "permute" and len(original_token) > 1:
                                        start_index = 0 if not (original_token.startswith("#") or original_token.startswith("@")) else 1
                                        to_permute = np.random.randint(start_index, len(original_token) - 1)
                                        second_letter = original_token[to_permute + 1]
                                        first_letter = original_token[to_permute]
                                        list_original_token = list(original_token)
                                        list_original_token[to_permute] = second_letter
                                        list_original_token[to_permute + 1] = first_letter
                                        _original_token = "".join(list_original_token)
                                    # rule 2
                                    if (permuting_mode == "double" or permuting_mode == "remove") and len(original_token) > 1:
                                        start_index = 0
                                        to_double = np.random.randint(start_index, len(original_token) - 1)
                                        first_letter = original_token[to_double]
                                        list_original_token = list(original_token)
                                        if permuting_mode == "double":
                                            list_original_token = list_original_token[:to_double] + [first_letter] + list_original_token[to_double:]
                                        else:
                                            # remove the sampled letter
                                            list_original_token = list_original_token[:to_double] + list_original_token[to_double + 1:]
                                        _original_token = "".join(list_original_token)
                                    if permuting_mode == "remove_last" and len(original_token) > 1:
                                        _original_token = _original_token[:-1]
                                    if permuting_mode == "double_last" and len(original_token) > 1:
                                        _original_token = _original_token + _original_token[-1]
                                    if permuting_mode == "random_replace" and len(original_token) > 1:
                                        start_index = 0
                                        to_replace = np.random.randint(start_index, len(original_token) - 1)
                                        random_letter = np.random.choice(list("abcdefghijklmnopqrstuvwxyz"))
                                        list_original_token = list(original_token)
                                        list_original_token[to_replace] = random_letter
                                        _original_token = "".join(list_original_token)
                            if cp_paste:
                                _normalized_token = _original_token
                            norm_file.write("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\tNorm={}|\n".format(
                                ind + 1 - ind_adjust, _original_token,
                                ind - ind_adjust if ind - ind_adjust > 0 else 0, _normalized_token))
                        if tasks[0] == "pos":
                            norm_file.write("{}\t{}\t_\t{}\t_\t_\t{}\t_\t_\tNorm=()|\n".format(
                                ind + 1 - ind_adjust, original_token, normalized_token,
                                ind - ind_adjust if ind - ind_adjust > 0 else 0))
                        original.write("{}\t{}\t_\t_\t_\t_\t_\t_\t{}\t_\n".format(
                            ind + 1, original_token, ind - ind_adjust if ind - ind_adjust > 0 else 0))
                        if cut_sent:
                            if ind > 50:
                                break
                    norm_file.write("\n")
                    original.write("\n")
        printing("WRITING predicted batch of {} original and {} normalized",
                 var=[dir_original, dir_normalized], verbose=verbose, verbose_level="raw_data")
    return max_len_word
def parse_argument_dictionary(argument_as_string, logits_label=None, hyperparameter="multi_task_loss_ponderation", verbose=1):
    """
    All arguments that are meant to be defined as dictionaries are passed to the
    ArgumentParser as strings following the template 'key1=value1,key2=value2,'
    (matched with "{}=([^=]*),".format(sub)).
    All the dictionary arguments are listed in DIC_ARGS.
    """
    assert hyperparameter in DIC_ARGS, "ERROR : hyperparameter {} not supported (not in DIC_ARGS)".format(hyperparameter)
    if argument_as_string in MULTI_TASK_LOSS_PONDERATION_PREDEFINED_MODE:
        return argument_as_string
    else:
        dic = OrderedDict()
        if hyperparameter == "multi_task_loss_ponderation":
            assert logits_label is not None
            for task in logits_label:
                # useless (I think)
                if task == "parsing":
                    for sub in ["parsing-heads", "parsing-types"]:
                        pattern = "{}=([^=]*),".format(sub)
                        match = re.search(pattern, argument_as_string)
                        assert match is not None, \
                            "ERROR : pattern {} not found for task {} in argument_as_string {} ".format(pattern, task, argument_as_string)
                        dic[sub] = eval(match.group(1))
                # useless (I think)
                elif task == "normalize":
                    for sub in ["normalize", "append_masks"]:
                        pattern = "{}=([^=]*),".format(sub)
                        match = re.search(pattern, argument_as_string)
                        if sub == "normalize":
                            assert match is not None, \
                                "ERROR : pattern {} not found for task {} in argument_as_string {} ".format(pattern, task, argument_as_string)
                            dic[sub] = eval(match.group(1))
                        else:
                            if match is not None:
                                dic[sub] = eval(match.group(1))
                # all cases should be in this one
                if task != "all" and task != "parsing":
                    pattern = "{}=([^=]*),".format(task)
                    match = re.search(pattern, argument_as_string)
                    assert match is not None, \
                        "ERROR : pattern {} not found for task {} in argument_as_string {} ".format(pattern, task, argument_as_string)
                    dic[task] = eval(match.group(1))
            printing("SANITY CHECK : multi_task_loss_ponderation {} ", var=[argument_as_string], verbose_level=3, verbose=verbose)
        elif hyperparameter in ["lr", "norm_order_per_layer", "ponderation_per_layer"]:
            # to handle several optimizers
            try:
                assert isinstance(eval(argument_as_string), float)
                return eval(argument_as_string)
            except Exception as e:
                print("Exception", hyperparameter, e)
            argument_as_string = argument_as_string.split(",")
            for arg in argument_as_string[:-1]:
                # DIFFERENCE WITH ABOVE IS THE COMMA
                pattern = "([^=]*)=([^=]*)"
                match = re.search(pattern, arg)
                assert match is not None, "ERROR : pattern {} not found in argument_as_string {} ".format(pattern, arg)
                if hyperparameter in ["lr"]:
                    dic[match.group(1)] = float(match.group(2))
                elif hyperparameter in ["norm_order_per_layer"]:
                    if match.group(2) != "fro":
                        dic[match.group(1)] = float(match.group(2))
                    else:
                        dic[match.group(1)] = match.group(2)
                elif hyperparameter in ["ponderation_per_layer"]:
                    dic[match.group(1)] = float(match.group(2))
        return dic
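
# Parsing sketch for the "lr" case : a plain float string is returned as a float,
# while "key=value," pairs become an OrderedDict (note the trailing comma, which
# the splitting relies on):
#
#   parse_argument_dictionary("0.0001", hyperparameter="lr")
#   # -> 0.0001
#   parse_argument_dictionary("encoder=1e-05,classifier=0.001,", hyperparameter="lr")
#   # -> OrderedDict([('encoder', 1e-05), ('classifier', 0.001)])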
def write_conll_multitask(format, dir_pred, dir_original, src_text_ls,
                          pred_per_task, tasks, task_parameters, cp_paste=False,
                          gold=False, all_indexes=None, sep_token=None,
                          cls_token=None, ind_batch=0, new_file=False,
                          cut_sent=False, verbose=0):

    assert format in ["conll"]
    max_len_word = None
    writing_top = 1
    # sanity check : each task must predict the same number of samples per batch
    pred_task_len_former = -1
    task_former = ""
    for task_label in pred_per_task:
        pred_task_len = len(pred_per_task[task_label]) if gold \
            else len(pred_per_task[task_label][writing_top - 1])
        _task = re.match("(.*)-(.*)", task_label)
        if _task is not None:
            task = _task.group(1)
        else:
            task = task_label
        if pred_task_len_former > 0:
            assert pred_task_len == pred_task_len_former, \
                "ERROR : tasks {} and {} predicted different numbers of samples".format(task_former, task_label)
        if not gold:
            assert pred_task_len == len(src_text_ls[task_parameters[task]["input"]]), \
                "ERROR src len {} and pred len {} ".format(
                    len(src_text_ls[task_parameters[task]["input"]]), pred_task_len)
        # we also check the other input lengths
        if src_text_ls.get("input_masked") is not None:
            assert pred_task_len == len(src_text_ls["input_masked"])
        if src_text_ls.get("wordpieces_inputs_words") is not None:
            assert pred_task_len == len(src_text_ls["wordpieces_inputs_words"]), \
                "ERROR mismatch source wordpieces_inputs_words {} and prediction {} ".format(
                    src_text_ls, pred_per_task[task_label])
        if src_text_ls.get("wordpieces_inputs_raw_tokens") is not None:
            assert pred_task_len == len(src_text_ls["wordpieces_inputs_raw_tokens"]), \
                "ERROR mismatch source wordpieces_inputs_raw_tokens {} and prediction {} ".format(
                    src_text_ls, pred_per_task[task_label])
        assert pred_task_len == all_indexes.shape[0], \
            "ERROR mismatch index {} and all_indexes {} : pred {}".format(
                pred_task_len, all_indexes.shape[0], pred_per_task[task_label])
        pred_task_len_former = pred_task_len
        task_former = task_label

    if format == "conll":
        mode_write = "w" if new_file else "a"
        if new_file:
            printing("CREATING NEW FILE (io_/dat/normalized_writer) : {} ",
                     var=[dir_pred], verbose=verbose, verbose_level=1)
        # when gold is True, pred_per_task is indexed by plain label names,
        # otherwise by task+label names
        pos_label = "pos-pos" if not gold else "pos"
        types_label = "parsing-types" if not gold else "types"
        heads_label = "parsing-heads" if not gold else "heads"
        n_masks_mwe_label = "n_masks_mwe-n_masks_mwe" if not gold else "n_masks_mwe"
        mwe_detection_label = "mwe_detection-mwe_detection" if not gold else "mwe_detection"

        with open(dir_pred, mode_write) as norm_file:
            with open(dir_original, mode_write) as original:
                len_original = 0
                for ind_sent in range(all_indexes.shape[0]):
                    pred_sent = OrderedDict()
                    # NB : length assertion for each input-output (correcting if possible)
                    # TODO : standardize ! There are inconsistencies between gold=True and
                    # gold=False : when gold, pred_per_task is indexed by labels
                    # (no one-to-one relation to task and src)
                    for task_label_or_gold_label in pred_per_task:
                        if gold:
                            pred_sent[task_label_or_gold_label] = pred_per_task[task_label_or_gold_label][ind_sent]
                        else:
                            pred_sent[task_label_or_gold_label] = pred_per_task[task_label_or_gold_label][writing_top - 1][ind_sent]
                        try:
                            # TODO : standardize (this first branch is needed because we
                            # handle gold data indexed by label and predictions indexed
                            # by task+label at the same time)
                            if gold:
                                try:
                                    src = src_text_ls[LABEL_PARAMETER[task_label_or_gold_label]["default_input"]][ind_sent]
                                except Exception:
                                    src = src_text_ls["input_masked"][ind_sent]
                            else:
                                _task = re.match("(.*)-(.*)", task_label_or_gold_label)
                                assert _task is not None, "ERROR writer could not match {}".format(task_label_or_gold_label)
                                _label = _task.group(2)
                                _task = _task.group(1)
                                src = src_text_ls[TASKS_PARAMETER[_task]["input"]][ind_sent]
                            assert len(src) == len(pred_sent[task_label_or_gold_label]), \
                                "WARNING : (writer) task {} original sent len {} {} \n predicted sent len {} {}".format(
                                    task_label_or_gold_label, len(src), src,
                                    len(pred_sent[task_label_or_gold_label]),
                                    pred_sent[task_label_or_gold_label])
                        except AssertionError as e:
                            print(e)
                            if len(src) > len(pred_sent[task_label_or_gold_label]):
                                pred_sent[task_label_or_gold_label].extend(
                                    ["UNK" for _ in range(len(src) - len(pred_sent[task_label_or_gold_label]))])
                                print("WARNING (writer) : original larger than prediction : appending UNK tokens for writing")
                            else:
                                print("WARNING (writer) : original smaller than prediction")

                    norm_file.write("#\n")
                    original.write("#\n")
                    norm_file.write("#sent_id = {} \n".format(ind_sent + ind_batch + 1))
                    original.write("#sent_id = {} \n".format(ind_sent + ind_batch + 1))
                    ind_adjust = 0
                    last_mwe_index = -1
                    adjust_mwe = 0
                    for ind in all_indexes[ind_sent, :]:
                        # we remove special tokens only if they appear at the beginning
                        # or at the end, based on the source tokens (they tell us when
                        # to stop) : we never want to use gold information here
                        if "-" in ind and ind != "-1":
                            matching_mwe_ind = re.match("([0-9]+)-([0-9]+)", str(ind))
                            assert matching_mwe_ind is not None, \
                                "ERROR ind is {} : could not find mwe index".format(ind)
                            last_mwe_index = int(matching_mwe_ind.group(2))
                            ind_mwe = int(matching_mwe_ind.group(1))
                            original_token = src_text_ls["wordpieces_inputs_raw_tokens"][ind_sent][ind_mwe] \
                                if mwe_detection_label in pred_per_task \
                                or "wordpieces_inputs_words" in pred_per_task \
                                or n_masks_mwe_label in pred_per_task else "NOT_NEEDED"
                            adjust_mwe += (last_mwe_index - ind_mwe)
                            mwe_meta = "Norm={}|mwe_detection={}|n_masks_mwe={}".format(
                                "_",
                                pred_sent[mwe_detection_label][ind_mwe] if mwe_detection_label in pred_per_task else "_",
                                pred_sent[n_masks_mwe_label][ind_mwe] if n_masks_mwe_label in pred_per_task else "_")
                            norm_file.write("{index}\t{original}\t_\t{pos}\t_\t_\t{dep}\t_\t{types}\t{norm}\n".format(
                                index=ind, original=original_token, pos="_", types="_",
                                dep="_", norm=mwe_meta))
                            original.write("{}\t{}\t_\t_\t_\t_\t_\t_\t{}\t_\n".format(ind, original_token, "_"))
                            continue
                        else:
                            ind = int(ind)
                        try:
                            if "normalize" in [task for _tasks in tasks for task in _tasks]:
                                original_token = src_text_ls["wordpiece_words_src_aligned_with_norm"][ind_sent][ind]
                                original_pretokenized_field = "wordpiece_words_src_aligned_with_norm"
                            else:
                                original_token = src_text_ls["wordpieces_inputs_words"][ind_sent][ind]
                                original_pretokenized_field = "wordpieces_inputs_words"
                        except Exception as e:
                            original_token = src_text_ls["input_masked"][ind_sent][ind]
                            original_pretokenized_field = "input_masked"
                        # asserting that the source sides are consistent with each other
                        if ind > last_mwe_index:
                            if src_text_ls.get("wordpieces_inputs_raw_tokens") is not None:
                                try:
                                    assert src_text_ls[original_pretokenized_field][ind_sent][ind] == src_text_ls["wordpieces_inputs_raw_tokens"][ind_sent][ind - adjust_mwe], \
                                        "ERROR sequence {} on non-mwe tokens : raw and tokenized should be the same but raw is {} tokenized is {}".format(
                                            original_pretokenized_field,
                                            src_text_ls["wordpieces_inputs_raw_tokens"][ind_sent][ind],
                                            src_text_ls[original_pretokenized_field][ind_sent][ind + adjust_mwe])
                                except AssertionError as e_assert:
                                    print("WARNING sanity checking input failed (normalized_writer) (might be due to dropout) {}".format(e_assert))
                        max_len_word = max(len(original_token), len_original)
                        if original_token in SPECIAL_TOKEN_LS or original_token in [cls_token, sep_token]:
                            # index 0 is skipped because it corresponds to CLS
                            ind_adjust = 1
                            continue
                        pos = pred_sent[pos_label][ind] if pos_label in pred_per_task else "_"
                        types = pred_sent[types_label][ind] if types_label in pred_per_task else "_"
                        heads = pred_sent[heads_label][ind] if heads_label in pred_per_task else ind - 1
                        tenth_col = "Norm={}|mwe_detection={}|n_masks_mwe={}".format(
                            pred_sent["normalize"][ind] if "normalize" in pred_per_task else "_",
                            pred_sent[mwe_detection_label][ind - adjust_mwe] if mwe_detection_label in pred_per_task else "_",
                            pred_sent[n_masks_mwe_label][ind - adjust_mwe] if n_masks_mwe_label in pred_per_task else "_")
                        norm_file.write("{index}\t{original}\t_\t{pos}\t_\t_\t{dep}\t_\t{types}\t{norm}\n".format(
                            index=ind, original=original_token, pos=pos, types=types,
                            dep=heads, norm=tenth_col))
                        original.write("{}\t{}\t_\t_\t_\t_\t_\t_\t{}\t_\n".format(ind, original_token, ind - 1))
                        if cut_sent:
                            if ind > 50:
                                print("CUTTING SENT index {}>50 ".format(ind))
                                break
                    norm_file.write("\n")
                    original.write("\n")
    printing("WRITING predicted batch of {} original and {} normalized",
             var=[dir_original, dir_pred], verbose=verbose, verbose_level=2)
    assert max_len_word is not None, "ERROR : something went wrong in the writer"
    return max_len_word
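# Illustrative sketch (comment only) of the 10-column, tab-separated rows written
# above, following the CoNLL-U-like layout
# "{index}\t{original}\t_\t{pos}\t_\t_\t{dep}\t_\t{types}\t{norm}".
# Tokens, tags and indices are made up :
#
#   #sent_id = 1
#   1    so      _   ADV  _  _  0  _  root  Norm=so|mwe_detection=_|n_masks_mwe=_
#   2    cooool  _   ADJ  _  _  1  _  mod   Norm=cool|mwe_detection=_|n_masks_mwe=_
#
# MWE ranges keep their source index (e.g. "3-4") and are written with "_" in the
# POS/head/type columns, as in the mwe_meta branch of the writer.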
def run(args, n_observation_max_per_epoch_train, vocab_size, model_dir,
        voc_tokenizer, auxilliary_task_norm_not_norm, null_token_index, null_str,
        tokenizer, n_observation_max_per_epoch_dev_test=None, run_mode="train",
        dict_path=None, end_predictions=None, report=True, model_suffix="",
        description="", saving_every_epoch=10, model_location=None, model_id=None,
        report_full_path_shared=None, skip_1_t_n=False, heuristic_test_ls=None,
        remove_mask_str_prediction=False, inverse_writing=False,
        extra_label_for_prediction="", random_iterator_train=True, bucket_test=False,
        must_get_norm_test=True, early_stoppin_metric=None,
        subsample_early_stoping_metric_val=None, compute_intersection_score_test=True,
        threshold_edit=3, name_with_epoch=False, max_token_per_batch=200,
        encoder=None, debug=False, verbose=1):
    """
    Wrapper for training/prediction/evaluation

    2 modes :
    - train : train with the train and dev iterators, then evaluate on test_path
    - test : only evaluate ; requires all directories to exist already
    :return: model
    """
    assert run_mode in ["train", "test"], "ERROR : run_mode {} not supported ".format(run_mode)
    input_level_ls = ["wordpiece"]
    assert early_stoppin_metric is not None and subsample_early_stoping_metric_val is not None, \
        "ERROR : early_stoppin_metric and subsample_early_stoping_metric_val must both be defined"
    if n_observation_max_per_epoch_dev_test is None:
        n_observation_max_per_epoch_dev_test = n_observation_max_per_epoch_train
    printing("MODEL : RUNNING IN {} mode", var=[run_mode], verbose=verbose, verbose_level=1)
    printing("WARNING : casing was set to {} (this should be consistent at train and test)",
             var=[args.case], verbose=verbose, verbose_level=2)
    if len(args.tasks) == 1:
        printing("INFO : MODEL : 1 set of simultaneous tasks {}".format(args.tasks),
                 verbose=verbose, verbose_level=1)
    if run_mode == "test":
        assert args.test_paths is not None and isinstance(args.test_paths, list)
    if run_mode == "train":
        printing("CHECKPOINTING info : saving model every {} epochs", var=saving_every_epoch,
                 verbose=verbose, verbose_level=1)
    use_gpu = use_gpu_(use_gpu=None, verbose=verbose)

    def get_commit_id():
        repo = git.Repo(os.path.dirname(os.path.realpath(__file__)),
                        search_parent_directories=True)
        git_commit_id = str(repo.head.commit)  # object.hexsha
        return git_commit_id

    if verbose > 1:
        print(f"GIT ID : {get_commit_id()}")

    train_data_label = get_dataset_label(args.train_path, default="train")
    iter_train = 0
    iter_dev = 0
    row = None
    writer = None
    printout_allocated_gpu_memory(verbose, "{} starting all".format(model_id))

    if run_mode == "train":
        if os.path.isdir(args.train_path[0]) and len(args.train_path) == 1:
            data_sharded = args.train_path[0]
            printing("INFO : args.train_path is a directory so not rebuilding shards",
                     verbose=verbose, verbose_level=1)
        elif os.path.isdir(args.train_path[0]):
            raise Exception("{} is a directory but {} train paths were provided : "
                            "not supported".format(args.train_path[0], len(args.train_path)))
        else:
            data_sharded = None
        assert model_location is None and model_id is None, \
            "ERROR : creating a new model : model_location and model_id must both be None"
        model_id, model_location, dict_path, tensorboard_log, end_predictions, data_sharded = \
            setup_repoting_location(model_suffix=model_suffix, data_sharded=data_sharded,
                                    root_dir_checkpoints=CHECKPOINT_BERT_DIR,
                                    shared_id=args.overall_label, verbose=verbose)
        hyperparameters = get_hyperparameters_dict(args, args.case, random_iterator_train,
                                                   seed=args.seed, verbose=verbose,
                                                   dict_path=dict_path, model_id=model_id,
                                                   model_location=model_location)
        args_dir = write_args(model_location, model_id=model_id,
                              hyperparameters=hyperparameters, verbose=verbose)
        if report:
            if report_full_path_shared is not None:
                tensorboard_log = os.path.join(report_full_path_shared, "tensorboard")
            printing("tensorboard --logdir={} --host=localhost --port=1234 ",
                     var=[tensorboard_log], verbose_level=1, verbose=verbose)
            writer = SummaryWriter(log_dir=tensorboard_log)
            if writer is not None:
                writer.add_text("INFO-ARGUMENT-MODEL-{}".format(model_id),
                                str(hyperparameters), 0)
    else:
        args_checkpoint = json.load(open(args.init_args_dir, "r"))
        dict_path = args_checkpoint["hyperparameters"]["dict_path"]
        assert dict_path is not None and os.path.isdir(dict_path), "ERROR {} ".format(dict_path)
        end_predictions = args.end_predictions
        assert end_predictions is not None and os.path.isdir(end_predictions), "ERROR end_predictions"
        model_location = args_checkpoint["hyperparameters"]["model_location"]
        model_id = args_checkpoint["hyperparameters"]["model_id"]
        assert model_location is not None and model_id is not None, "ERROR model_location model_id "
        args_dir = os.path.join(model_location, "{}-args.json".format(model_id))
        printing("CHECKPOINTING : starting writing log \n"
                 "tensorboard --logdir={} --host=localhost --port=1234 ",
                 var=[os.path.join(model_id, "tensorboard")], verbose_level=1, verbose=verbose)

    # build or load the dictionaries
    _dev_path = args.dev_path if args.dev_path is not None else args.train_path
    word_dictionary, word_norm_dictionary, char_dictionary, pos_dictionary, \
        xpos_dictionary, type_dictionary = \
        conllu_data.load_dict(dict_path=dict_path,
                              train_path=args.train_path if run_mode == "train" else None,
                              dev_path=args.dev_path if run_mode == "train" else None,
                              test_path=None, word_embed_dict={}, dry_run=False,
                              expand_vocab=False, word_normalization=True,
                              force_new_dic=True if run_mode == "train" else False,
                              tasks=args.tasks,
                              pos_specific_data_set=args.train_path[1]
                              if len(args.tasks) > 1 and len(args.train_path) > 1 and "pos" in args.tasks else None,
                              case=args.case,
                              # if neither normalize, pos nor parsing is in tasks,
                              # we don't need to fill the dictionaries
                              do_not_fill_dictionaries=len(set(["normalize", "pos", "parsing"]) & set([task for tasks in args.tasks for task in tasks])) == 0,
                              add_start_char=1 if run_mode == "train" else None,
                              verbose=verbose)
    # we flatten the tasks
    printing("DICTIONARY CREATED/LOADED", verbose=verbose, verbose_level=1)
    num_labels_per_task, task_to_label_dictionary = get_vocab_size_and_dictionary_per_task(
        [task for tasks in args.tasks for task in tasks],
        vocab_bert_wordpieces_len=vocab_size, pos_dictionary=pos_dictionary,
        type_dictionary=type_dictionary, task_parameters=TASKS_PARAMETER)
    voc_pos_size = num_labels_per_task["pos"] if "pos" in args.tasks else None
    if voc_pos_size is not None:
        printing("MODEL : voc_pos_size defined as {}", var=voc_pos_size,
                 verbose_level=1, verbose=verbose)
    printing("MODEL init...", verbose=verbose, verbose_level=1)
    if verbose > 1:
        print("DEBUG : TOKENIZER : loading voc_tokenizer with from_pretrained", voc_tokenizer)
    tokenizer = tokenizer.from_pretrained(voc_tokenizer, do_lower_case=args.case == "lower",
                                          shuffle_bpe_embedding=args.shuffle_bpe_embedding)
    mask_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    printout_allocated_gpu_memory(verbose, "{} loading model ".format(model_id))
    model = get_model_multi_task_bert(args=args, model_dir=model_dir, encoder=encoder,
                                      num_labels_per_task=num_labels_per_task,
                                      mask_id=mask_id)

    def prune_heads(prune_heads):
        """Parse a trailing-comma-separated string of items shaped like
        'prune_heads-layer-<i>-heads-<h1_h2_...>' and prune the listed
        attention heads from encoder layer <i>."""
        if prune_heads is not None:
            prune_heads_ls = prune_heads.split(",")[:-1]
            assert len(prune_heads_ls) > 0
            for layer in prune_heads_ls:
                parsed_layer_to_prune = layer.split("-")
                assert parsed_layer_to_prune[0] == "prune_heads"
                assert parsed_layer_to_prune[1] == "layer"
                assert parsed_layer_to_prune[3] == "heads"
                heads = parsed_layer_to_prune[4]
                head_index_ls = heads.split("_")
                heads_ls = [int(index) for index in head_index_ls]
                print(f"MODEL : pruning layer {parsed_layer_to_prune[2]} heads {heads_ls}")
                model.encoder.encoder.layer[int(parsed_layer_to_prune[2])].attention.prune_heads(heads_ls)

    if args.prune_heads is not None and args.prune_heads != "None":
        print(f"INFO : args.prune_heads {args.prune_heads}")
        prune_heads(args.prune_heads)
    if use_gpu:
        model.to("cuda")
        printing("MODEL TO CUDA", verbose=verbose, verbose_level=1)
    printing("MODEL model.config {} ", var=[model.config], verbose=verbose, verbose_level=1)
    printout_allocated_gpu_memory(verbose, "{} model loaded".format(model_id))
    model_origin = OrderedDict()
    pruning_mask = OrderedDict()
    printout_allocated_gpu_memory(verbose, "{} model cuda".format(model_id))
    for name, param in model.named_parameters():
        model_origin[name] = param.detach().clone()
        printout_allocated_gpu_memory(verbose, "{} param cloned ".format(name))
        if args.penalization_mode == "pruning":
            # keep, per tensor, the half of the weights with largest magnitude
            abs_param = torch.abs(param.detach().flatten())
            median_value = torch.median(abs_param)
            pruning_mask[name] = (abs_param > median_value).float()
            printout_allocated_gpu_memory(verbose, "{} pruning mask loaded".format(model_id))
    printout_allocated_gpu_memory(verbose, "{} model clone".format(model_id))
    inv_word_dic = word_dictionary.instance2index
    # load, mask, bucket and index the data
    assert tokenizer is not None, \
        "ERROR : tokenizer is None , voc_tokenizer failed to be loaded {}".format(voc_tokenizer)
    if run_mode == "train":
        time_load_readers_train_start = time.time()
        if not args.memory_efficient_iterator:
            data_sharded, n_shards, n_sent_dataset_total_train = None, None, None
            args_load_batcher_shard_data = None
            printing("INFO : starting loading readers", verbose=verbose, verbose_level=1)
            readers_train = readers_load(datasets=args.train_path, tasks=args.tasks,
                                         word_dictionary=word_dictionary,
                                         bert_tokenizer=tokenizer,
                                         word_dictionary_norm=word_norm_dictionary,
                                         char_dictionary=char_dictionary,
                                         pos_dictionary=pos_dictionary,
                                         xpos_dictionary=xpos_dictionary,
                                         type_dictionary=type_dictionary,
                                         word_decoder=True, run_mode=run_mode,
                                         add_start_char=1, add_end_char=1, symbolic_end=1,
                                         symbolic_root=1, bucket=True, must_get_norm=True,
                                         input_level_ls=input_level_ls, verbose=verbose)
            n_sent_dataset_total_train = readers_train[list(readers_train.keys())[0]][3]
            printing("INFO : done loading readers", verbose=verbose, verbose_level=1)
        else:
            printing("INFO : building/loading shards ", verbose=verbose, verbose_level=1)
            data_sharded, n_shards, n_sent_dataset_total_train = build_shard(
                data_sharded, args.train_path,
                n_sent_max_per_file=N_SENT_MAX_CONLL_PER_SHARD, verbose=verbose)
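        # Note (illustrative, based on how the values are used here) : build_shard is
        # expected to split the training conll file(s) into shards of at most
        # N_SENT_MAX_CONLL_PER_SHARD sentences and return
        # (shard_dir, n_shards, n_sent_total) ; with args.memory_efficient_iterator on,
        # each training epoch then reads one shard at a time via get_new_shard() below
        # instead of loading the whole dataset into memory at once.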
        time_load_readers_dev_start = time.time()
        time_load_readers_train = time.time() - time_load_readers_train_start
        readers_dev_ls = []
        dev_data_label_ls = []
        printing("INFO : building readers for dev", verbose=verbose, verbose_level=1)
        printout_allocated_gpu_memory(verbose, "{} reader train loaded".format(model_id))
        for dev_path in args.dev_path:
            dev_data_label = get_dataset_label(dev_path, default="dev")
            dev_data_label_ls.append(dev_data_label)
            readers_dev = readers_load(datasets=dev_path, tasks=args.tasks,
                                       word_dictionary=word_dictionary,
                                       word_dictionary_norm=word_norm_dictionary,
                                       char_dictionary=char_dictionary,
                                       pos_dictionary=pos_dictionary,
                                       xpos_dictionary=xpos_dictionary,
                                       bert_tokenizer=tokenizer,
                                       type_dictionary=type_dictionary,
                                       word_decoder=True, run_mode=run_mode,
                                       add_start_char=1, add_end_char=1, symbolic_end=1,
                                       symbolic_root=1, bucket=False, must_get_norm=True,
                                       input_level_ls=input_level_ls,
                                       verbose=verbose) if args.dev_path is not None else None
            readers_dev_ls.append(readers_dev)
        printout_allocated_gpu_memory(verbose, "{} reader dev loaded".format(model_id))
        time_load_readers_dev = time.time() - time_load_readers_dev_start
        printing("TIME : {} ",
                 var=[OrderedDict([("time_load_readers_train",
                                    "{:0.4f} min".format(time_load_readers_train / 60)),
                                   ("time_load_readers_dev",
                                    "{:0.4f} min".format(time_load_readers_dev / 60))])],
                 verbose=verbose, verbose_level=2)

        early_stoping_val_former = 1000
        flexible_batch_size = False
        if args.optimizer == "AdamW":
            model, optimizer, scheduler = apply_fine_tuning_strategy(
                model=model, fine_tuning_strategy=args.fine_tuning_strategy,
                lr_init=args.lr, betas=(0.9, 0.99), epoch=0,
                weight_decay=args.weight_decay, optimizer_name=args.optimizer,
                t_total=n_sent_dataset_total_train / args.batch_update_train * args.epochs
                if n_sent_dataset_total_train / args.batch_update_train * args.epochs > 1 else 5,
                verbose=verbose)
        try:
            for epoch in range(args.epochs):
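                # Per-epoch flow (summary of the code below) : (1) reload the train
                # reader (from a fresh shard if the memory-efficient iterator is on),
                # (2) build the train/dev batch iterators, (3) run one training pass
                # with epoch_run, (4) when epoch % 3 == 0 or epoch <= 6, evaluate on
                # each dev set, (5) checkpoint when scheduled or when the
                # early-stopping metric improves.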
                if args.memory_efficient_iterator:
                    # we start each epoch with a new shard
                    training_file = get_new_shard(data_sharded, n_shards)
                    printing("INFO : memory-efficient iterator triggered (only built for "
                             "train data), starting with {}",
                             var=[training_file], verbose=verbose, verbose_level=1)
                    args_load_batcher_shard_data = {
                        "word_dictionary": word_dictionary, "tokenizer": tokenizer,
                        "word_norm_dictionary": word_norm_dictionary,
                        "char_dictionary": char_dictionary,
                        "pos_dictionary": pos_dictionary,
                        "xpos_dictionary": xpos_dictionary,
                        "type_dictionary": type_dictionary, "use_gpu": use_gpu,
                        "norm_not_norm": auxilliary_task_norm_not_norm,
                        "word_decoder": True, "add_start_char": 1, "add_end_char": 1,
                        "symbolic_end": 1, "symbolic_root": 1, "bucket": True,
                        "max_char_len": 20, "must_get_norm": True,
                        "use_gpu_hardcoded_readers": False, "bucketing_level": "bpe",
                        "input_level_ls": ["wordpiece"],
                        "auxilliary_task_norm_not_norm": auxilliary_task_norm_not_norm,
                        "random_iterator_train": random_iterator_train
                    }
                readers_train = readers_load(
                    datasets=args.train_path if not args.memory_efficient_iterator else training_file,
                    tasks=args.tasks, word_dictionary=word_dictionary,
                    bert_tokenizer=tokenizer, word_dictionary_norm=word_norm_dictionary,
                    char_dictionary=char_dictionary, pos_dictionary=pos_dictionary,
                    xpos_dictionary=xpos_dictionary, type_dictionary=type_dictionary,
                    word_decoder=True, run_mode=run_mode, add_start_char=1,
                    add_end_char=1, symbolic_end=1, symbolic_root=1, bucket=True,
                    must_get_norm=True, input_level_ls=input_level_ls, verbose=verbose)
                checkpointing_model_data = (epoch % saving_every_epoch == 0
                                            or epoch == (args.epochs - 1))
                # build the batch iterator on the loaded data
                printout_allocated_gpu_memory(verbose, "{} loading batcher".format(model_id))
                if args.batch_size == "flexible":
                    flexible_batch_size = True
                    printing("INFO : args.batch_size was {} so updating it based on mean value {}",
                             var=[args.batch_size, update_batch_size_mean(readers_train)],
                             verbose=verbose, verbose_level=1)
                    args.batch_size = update_batch_size_mean(readers_train)
                    if args.batch_update_train == "flexible":
                        args.batch_update_train = args.batch_size
                printing("TRAINING : backward pass every {} steps of average size {}",
                         var=[int(args.batch_update_train // args.batch_size), args.batch_size],
                         verbose=verbose, verbose_level=1)
                try:
                    assert isinstance(args.batch_update_train // args.batch_size, int) \
                        and args.batch_update_train // args.batch_size > 0, \
                        "ERROR : batch_update_train {} should be a positive multiple " \
                        "of batch_size {} ".format(args.batch_update_train, args.batch_size)
                except Exception as e:
                    print("WARNING {}".format(e))
                batchIter_train = data_gen_multi_task_sampling_batch(
                    tasks=args.tasks, readers=readers_train,
                    batch_size=readers_train[list(readers_train.keys())[0]][4],
                    max_token_per_batch=max_token_per_batch if flexible_batch_size else None,
                    word_dictionary=word_dictionary, char_dictionary=char_dictionary,
                    pos_dictionary=pos_dictionary, word_dictionary_norm=word_norm_dictionary,
                    get_batch_mode=random_iterator_train, print_raw=False,
                    dropout_input=0.0, verbose=verbose)
                printout_allocated_gpu_memory(verbose, "{} batcher train loaded".format(model_id))
                batchIter_dev_ls = []
                batch_size_DEV = 1
                if verbose > 1:
                    print("WARNING : batch_size for dev eval was hardcoded and set to {}".format(batch_size_DEV))
                for readers_dev in readers_dev_ls:
                    batchIter_dev = data_gen_multi_task_sampling_batch(
                        tasks=args.tasks, readers=readers_dev, batch_size=batch_size_DEV,
                        word_dictionary=word_dictionary, char_dictionary=char_dictionary,
                        pos_dictionary=pos_dictionary,
                        word_dictionary_norm=word_norm_dictionary,
                        get_batch_mode=False, print_raw=False, dropout_input=0.0,
                        verbose=verbose) if args.dev_path is not None else None
                    batchIter_dev_ls.append(batchIter_dev)
                model.train()
                printout_allocated_gpu_memory(verbose, "{} batcher dev loaded".format(model_id))
                if args.optimizer != "AdamW":
                    model, optimizer, scheduler = apply_fine_tuning_strategy(
                        model=model, fine_tuning_strategy=args.fine_tuning_strategy,
                        lr_init=args.lr, betas=(0.9, 0.99),
                        weight_decay=args.weight_decay, optimizer_name=args.optimizer,
                        t_total=n_sent_dataset_total_train / args.batch_update_train * args.epochs
                        if n_sent_dataset_total_train / args.batch_update_train * args.epochs > 1 else 5,
                        epoch=epoch, verbose=verbose)
                printout_allocated_gpu_memory(verbose, "{} optimizer loaded".format(model_id))
                loss_train = None
                if epoch >= 0:
                    printing("TRAINING : training in GET_BATCH_MODE ",
                             verbose=verbose, verbose_level=2)
                    printing("TRAINING {} : 1 'epoch' = {} observations max "
                             "(forward batch size {} ; backward every batch_update_train {} "
                             "observations when low_memory_foot_print_batch_mode is {})",
                             var=[model_id, n_observation_max_per_epoch_train,
                                  args.batch_size, args.batch_update_train,
                                  args.low_memory_foot_print_batch_mode],
                             verbose=verbose, verbose_level=1)
                    loss_train, iter_train, perf_report_train, _ = epoch_run(
                        batchIter_train, tokenizer, args=args, model_origin=model_origin,
                        pruning_mask=pruning_mask,
                        task_to_label_dictionary=task_to_label_dictionary,
                        data_label=train_data_label, model=model,
                        dropout_input_bpe=args.dropout_input_bpe, writer=writer,
                        iter=iter_train, epoch=epoch,
                        writing_pred=epoch == (args.epochs - 1),
                        dir_end_pred=end_predictions, optimizer=optimizer,
                        use_gpu=use_gpu, scheduler=scheduler,
                        predict_mode=(epoch - 1) % 5 == 0, skip_1_t_n=skip_1_t_n,
                        model_id=model_id, reference_word_dic={"InV": inv_word_dic},
                        null_token_index=null_token_index, null_str=null_str,
                        norm_2_noise_eval=False, early_stoppin_metric=None,
                        n_obs_max=n_observation_max_per_epoch_train,
                        data_sharded_dir=data_sharded, n_shards=n_shards,
                        n_sent_dataset_total=n_sent_dataset_total_train,
                        args_load_batcher_shard_data=args_load_batcher_shard_data,
                        memory_efficient_iterator=args.memory_efficient_iterator,
                        verbose=verbose)
                else:
                    printing("TRAINING : skipping training to start by evaluating "
                             "on the dev datasets", verbose=verbose, verbose_level=1)
                printout_allocated_gpu_memory(verbose, "{} epoch train done".format(model_id))
                model.eval()
                if args.dev_path is not None and (epoch % 3 == 0 or epoch <= 6):
                    if verbose > 1:
                        print("RUNNING DEV in ITERATION MODE")
                    early_stoping_val_ls = []
                    loss_dev_ls = []
                    for i_dev, batchIter_dev in enumerate(batchIter_dev_ls):
                        loss_dev, iter_dev, perf_report_dev, early_stoping_val = epoch_run(
                            batchIter_dev, tokenizer, args=args, epoch=epoch,
                            model_origin=model_origin, pruning_mask=pruning_mask,
                            task_to_label_dictionary=task_to_label_dictionary,
                            iter=iter_dev, use_gpu=use_gpu, model=model, writer=writer,
                            optimizer=None, writing_pred=True,
                            dir_end_pred=end_predictions, predict_mode=True,
                            data_label=dev_data_label_ls[i_dev],
                            null_token_index=null_token_index, null_str=null_str,
                            model_id=model_id, skip_1_t_n=skip_1_t_n,
                            dropout_input_bpe=0,
                            reference_word_dic={"InV": inv_word_dic},
                            norm_2_noise_eval=False,
                            early_stoppin_metric=early_stoppin_metric,
                            subsample_early_stoping_metric_val=subsample_early_stoping_metric_val,
                            n_obs_max=n_observation_max_per_epoch_dev_test,
                            verbose=verbose)
                        printing("TRAINING : loss train:{} dev {}:{} for epoch {} out of {}",
                                 var=[loss_train, i_dev, loss_dev, epoch, args.epochs],
                                 verbose=1, verbose_level=1)
                        printing("PERFORMANCE {} DEV {} {} ",
                                 var=[epoch, i_dev + 1, perf_report_dev],
                                 verbose=verbose, verbose_level=1)
                        early_stoping_val_ls.append(early_stoping_val)
                        loss_dev_ls.append(loss_dev)
                else:
                    if verbose > 1:
                        print("NO DEV EVAL")
                    loss_dev, iter_dev, perf_report_dev = None, 0, None
                printout_allocated_gpu_memory(verbose, "{} epoch dev done".format(model_id))
                # NB : early_stoping_val is based on the first dev set
                early_stoping_val = early_stoping_val_ls[0]
                if checkpointing_model_data or early_stoping_val < early_stoping_val_former:
                    if early_stoping_val is not None:
                        _epoch = "best" if early_stoping_val < early_stoping_val_former else epoch
                    else:
                        if verbose > 1:
                            print("WARNING : early_stoping_val is None so saving based "
                                  "on checkpointing_model_data only")
                        _epoch = epoch
                    # model_id possibly enriched with epoch information if name_with_epoch
                    _model_id = get_name_model_id_with_extra_name(
                        epoch=epoch, _epoch=_epoch, name_with_epoch=name_with_epoch,
                        model_id=model_id)
                    checkpoint_dir = os.path.join(model_location,
                                                  "{}-checkpoint.pt".format(_model_id))
                    if _epoch == "best":
                        printing("CHECKPOINT : SAVING BEST MODEL {} (epoch:{}) "
                                 "(new loss is {}, former was {})".format(
                                     checkpoint_dir, epoch, early_stoping_val,
                                     early_stoping_val_former),
                                 verbose=verbose, verbose_level=1)
                        last_checkpoint_dir_best = checkpoint_dir
                        early_stoping_val_former = early_stoping_val
                        best_epoch = epoch
                        best_loss = early_stoping_val
                    else:
                        printing("CHECKPOINT : NOT SAVING BEST MODEL : new loss {} did "
                                 "not beat former best {}".format(early_stoping_val,
                                                                  early_stoping_val_former),
                                 verbose_level=1, verbose=verbose)
                    last_model = ""
                    if epoch == (args.epochs - 1):
                        last_model = "last"
                    printing("CHECKPOINT : epoch {} saving {} model {} ",
                             var=[epoch, last_model, checkpoint_dir],
                             verbose=verbose, verbose_level=1)
                    torch.save(model.state_dict(), checkpoint_dir)
                    args_dir = write_args(
                        dir=model_location, checkpoint_dir=checkpoint_dir,
                        hyperparameters=hyperparameters if name_with_epoch else None,
                        model_id=_model_id,
                        info_checkpoint=OrderedDict([
                            ("epochs", epoch + 1),
                            ("batch_size", args.batch_size
                             if not args.low_memory_foot_print_batch_mode
                             else args.batch_update_train),
                            ("train_path", train_data_label),
                            ("dev_path", dev_data_label_ls),
                            ("num_labels_per_task", num_labels_per_task)]),
                        verbose=verbose)
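                    # Illustrative shape of the payload written to <model_id>-args.json
                    # at checkpoint time (values made up ; see write_args above) :
                    #   {"checkpoint_dir": ".../<model_id>-checkpoint.pt",
                    #    "hyperparameters": {...} or None,
                    #    "info_checkpoint": {"epochs": 12, "batch_size": 16,
                    #                        "train_path": "<train label>",
                    #                        "dev_path": ["<dev label>"],
                    #                        "num_labels_per_task": {"pos": 18}}}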
            if row is not None and update_status is not None:
                update_status(row=row, value="training-done", verbose=1)
        except Exception as e:
            if row is not None and update_status is not None:
                update_status(row=row, value="ERROR", verbose=1)
            raise e

    # reloading the last (best) checkpoint and evaluating on the test sets
    report_all = []
    if run_mode in ["train", "test"] and args.test_paths is not None:
        if run_mode == "train" and args.epochs > 0:
            if use_gpu:
                model.load_state_dict(torch.load(last_checkpoint_dir_best))
                model = model.cuda()
                printout_allocated_gpu_memory(verbose, "{} after reloading model".format(model_id))
            else:
                model.load_state_dict(torch.load(last_checkpoint_dir_best,
                                                 map_location=lambda storage, loc: storage))
            printing("MODEL : RELOADING best model of epoch {} with loss {} based on "
                     "{}({}) metric (from checkpoint {})",
                     var=[best_epoch, best_loss, early_stoppin_metric,
                          subsample_early_stoping_metric_val, last_checkpoint_dir_best],
                     verbose=verbose, verbose_level=1)
        model.eval()
        printout_allocated_gpu_memory(verbose, "{} starting test".format(model_id))
        for test_path in args.test_paths:
            assert len(test_path) == len(args.tasks), \
                "ERROR test_path {} args.tasks {}".format(test_path, args.tasks)
            for test, task_to_eval in zip(test_path, args.tasks):
                label_data = get_dataset_label([test], default="test")
                if len(extra_label_for_prediction) > 0:
                    label_data += "-" + extra_label_for_prediction
                if args.shuffle_bpe_embedding and args.test_mode_no_shuffle_embedding:
                    printing("TOKENIZER : as args.shuffle_bpe_embedding is {} and "
                             "args.test_mode_no_shuffle_embedding is {} : reloading the "
                             "tokenizer without embedding shuffling",
                             var=[args.shuffle_bpe_embedding,
                                  args.test_mode_no_shuffle_embedding],
                             verbose=1, verbose_level=1)
                    tokenizer = tokenizer.from_pretrained(voc_tokenizer,
                                                          do_lower_case=args.case == "lower",
                                                          shuffle_bpe_embedding=False)
                readers_test = readers_load(datasets=[test], tasks=[task_to_eval],
                                            word_dictionary=word_dictionary,
                                            word_dictionary_norm=word_norm_dictionary,
                                            char_dictionary=char_dictionary,
                                            pos_dictionary=pos_dictionary,
                                            xpos_dictionary=xpos_dictionary,
                                            type_dictionary=type_dictionary,
                                            bert_tokenizer=tokenizer, word_decoder=True,
                                            run_mode=run_mode, add_start_char=1,
                                            add_end_char=1, symbolic_end=1,
                                            symbolic_root=1, bucket=bucket_test,
                                            input_level_ls=input_level_ls,
                                            must_get_norm=must_get_norm_test,
                                            verbose=verbose)
                heuristics_zip = [None]
                gold_error_or_not_zip = [False]
                norm2noise_zip = [False]
                if heuristic_test_ls is None:
                    assert len(gold_error_or_not_zip) == len(heuristics_zip) \
                        and len(heuristics_zip) == len(norm2noise_zip)
                batch_size_TEST = 1
                if verbose > 1:
                    print("WARNING : batch_size for final eval was hardcoded and "
                          "set to {}".format(batch_size_TEST))
                for (heuristic_test, gold_error, norm_2_noise_eval) in zip(heuristics_zip, gold_error_or_not_zip, norm2noise_zip):
                    assert heuristic_test is None and not gold_error and not norm_2_noise_eval
                    batchIter_test = data_gen_multi_task_sampling_batch(
                        tasks=[task_to_eval], readers=readers_test,
                        batch_size=batch_size_TEST, word_dictionary=word_dictionary,
                        char_dictionary=char_dictionary, pos_dictionary=pos_dictionary,
                        word_dictionary_norm=word_norm_dictionary, get_batch_mode=False,
                        dropout_input=0.0, verbose=verbose)
                    try:
                        loss_test, iter_test, perf_report_test, _ = epoch_run(
                            batchIter_test, tokenizer, args=args, iter=iter_dev,
                            use_gpu=use_gpu, model=model,
                            task_to_label_dictionary=task_to_label_dictionary,
                            writer=None, writing_pred=True, optimizer=None,
                            args_dir=args_dir, model_id=model_id,
                            dir_end_pred=end_predictions, skip_1_t_n=skip_1_t_n,
                            predict_mode=True, data_label=label_data, epoch="LAST",
                            extra_label_for_prediction=label_data,
                            null_token_index=null_token_index, null_str=null_str,
                            log_perf=False, dropout_input_bpe=0,
                            norm_2_noise_eval=norm_2_noise_eval,
                            compute_intersection_score=compute_intersection_score_test,
                            remove_mask_str_prediction=remove_mask_str_prediction,
                            reference_word_dic={"InV": inv_word_dic},
                            threshold_edit=threshold_edit, verbose=verbose,
                            n_obs_max=n_observation_max_per_epoch_dev_test)
                        if verbose > 1:
                            print("LOSS TEST", loss_test)
                    except Exception as e:
                        print("ERROR (epoch_run test) {} test_path {} , heuristic {} , "
                              "gold error {} , norm2noise {} ".format(
                                  e, test, heuristic_test, gold_error, norm_2_noise_eval))
                        raise e
                    print("PERFORMANCE TEST on data {} is {} ".format(label_data, perf_report_test))
                    print("DATA WRITTEN {}".format(end_predictions))
                    if writer is not None:
                        writer.add_text("Accuracy-{}-{}-{}".format(model_id, label_data, run_mode),
                                        "After {} epochs with {} : performance is \n {} ".format(
                                            args.epochs, description, str(perf_report_test)), 0)
                    else:
                        printing("WARNING : could not add accuracy to tensorboard "
                                 "because writer was found None",
                                 verbose=verbose, verbose_level=2)
                    report_all.extend(perf_report_test)
                    printout_allocated_gpu_memory(verbose, "{} test done".format(model_id))
    else:
        printing("WARNING : EVALUATION skipped because test_paths {} is empty or run_mode is {} ",
                 var=[args.test_paths, run_mode], verbose_level=1, verbose=verbose)

    if writer is not None:
        writer.close()
        printing("tensorboard --logdir={} --host=localhost --port=1234 ",
                 var=[tensorboard_log], verbose_level=1, verbose=verbose)

    report_dir = os.path.join(model_location, model_id + "-report.json")
    if report_full_path_shared is not None:
        report_full_dir = os.path.join(report_full_path_shared,
                                       args.overall_label + "-report.json")
        if os.path.isfile(report_full_dir):
            report_overall = json.load(open(report_full_dir, "r"))
        else:
            report_overall = []
            printing("REPORT : creating overall report at {} ", var=[report_dir],
                     verbose=verbose, verbose_level=1)
        report_overall.extend(report_all)
        json.dump(report_overall, open(report_full_dir, "w"))
        printing("{} {} ", var=[REPORT_FLAG_DIR_STR, report_full_dir],
                 verbose=0, verbose_level=0)
    json.dump(report_all, open(report_dir, "w"))
    printing("REPORTING TO {}".format(report_dir), verbose=verbose, verbose_level=1)
    if report_full_path_shared is None:
        printing("WARNING : report_full_path_shared is None", verbose=verbose, verbose_level=1)
        printing("{} {} ", var=[REPORT_FLAG_DIR_STR, report_dir],
                 verbose=verbose, verbose_level=0)
    return model
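# Outputs produced by run() (schematic paths, assuming a train run with report=True) :
#   <model_location>/<model_id>-args.json                    : hyperparameters + checkpoint info
#   <model_location>/<_model_id>-checkpoint.pt               : best/last model weights
#   <model_location>/<model_id>-report.json                  : per-task performance report
#   <report_full_path_shared>/<overall_label>-report.json    : shared overall report, if enabled
#   <tensorboard_log>/                                       : TensorBoard event files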