def pred_word_to_list(pred_word, special_symb_ls):
    index_special_ls = []

    pred_word = [pred_word]
    ind_pred_word = 0
    counter = 0
    while True:
        counter += 1
        index_special_ls = []
        _pred_word = pred_word[ind_pred_word]
        # Look for all special characters (only the first match is used)
        for special_symb in special_symb_ls:
            index_special_ls.append(_pred_word.find(special_symb))
        indexes = np.argsort(index_special_ls)
        index_special_char = -1
        # Get the index and symbol of the first special character found; -1 if none
        for ind, a in enumerate(indexes):
            if index_special_ls[a] >= 0:
                special_symb = special_symb_ls[a]
                index_special_char = index_special_ls[a]
                break
            if ind == len(indexes) - 1:
                index_special_char = -1
                special_symb = ""
                break
        # if a special character was found
        if index_special_char >= 0:
            starting_seq = [_pred_word[:index_special_char]
                            ] if index_special_char > 0 else []
            middle = [
                _pred_word[index_special_char:index_special_char +
                           len(special_symb)]
            ]
            end_seq = [_pred_word[index_special_char + len(special_symb):]]
            if len(end_seq[0].strip()) == 0:
                end_seq = []
            _pred_word_ls = starting_seq + middle + end_seq
            pred_word[ind_pred_word] = _pred_word_ls[0]
            if len(_pred_word_ls) > 0:
                pred_word.extend(_pred_word_ls[1:])
            ind_pred_word += 1
            if len(starting_seq) > 0:
                ind_pred_word += 1
        else:
            ind_pred_word += 1
        if ind_pred_word >= len(pred_word):
            break

    new_word = []
    # turn the split into a list of characters (keeping special symbols whole)
    for word in pred_word:
        if word in special_symb_ls:
            new_word.append(word)
        else:
            new_word.extend(list(word))

    return new_word
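
# Editorial usage sketch (not part of the original module): split a predicted
# word into characters while keeping each multi-character special symbol as a
# single item. "[MASK]" is a hypothetical symbol here; numpy must be imported
# as np, as the function above already requires.
example_split = pred_word_to_list("foo[MASK]bar", ["[MASK]"])
print(example_split)  # -> ['f', 'o', 'o', '[MASK]', 'b', 'a', 'r']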
Example #2
    def __init__(
            self, config,
            **kwargs):  # tasks, num_labels_per_task, mask_id, encoder_class):
        super(BertMultiTask, self).__init__(config)
        # encoder_class is only BertModel or RobertaModel
        # arguments specific to BertMultiTask can be passed either in config or in kwargs
        encoder_class = kwargs["encoder"]
        tasks = kwargs["tasks"] if "tasks" in kwargs else config.tasks
        num_labels_per_task = kwargs["num_labels_per_task"] if "num_labels_per_task" in kwargs else config.num_labels_per_task
        mask_id = kwargs["mask_id"] if "mask_id" in kwargs else config.mask_id
        config.dropout_classifier = kwargs.get("dropout_classifier", 0.1)

        if "parsing" in tasks:
            config.graph_head_hidden_size_mlp_arc = kwargs.get(
                "graph_head_hidden_size_mlp_arc", 500)
            config.graph_head_hidden_size_mlp_rel = kwargs.get(
                "graph_head_hidden_size_mlp_rel", 200)
        self.encoder = encoder_class(config)
        print("BertMultitask instantiated with {} encoder".format(
            self.encoder.__class__.__name__))
        self.config = config
        assert isinstance(num_labels_per_task, dict)
        assert isinstance(tasks, list) and len(
            tasks) >= 1, "config.tasks should be a list of len >=1"
        self.head = nn.ModuleDict()
        self.mask_index_bert = mask_id
        self.tasks = tasks
        self.tasks_available = tasks  # all tasks available in the model (not only the ones used in a given run, i.e. self.tasks)
        self.task_parameters = TASKS_PARAMETER
        self.layer_wise_attention = None
        self.labels_supported = [
            label for task in tasks
            for label in self.task_parameters[task]["label"]
        ]
        self.sanity_checking_num_labels_per_task(num_labels_per_task, tasks,
                                                 self.task_parameters)
        self.num_labels_dic = num_labels_per_task

        for task in TASKS_PARAMETER:
            if task in tasks:
                num_label = get_key_name_num_label(task, self.task_parameters)
                if not self.task_parameters[task]["num_labels_mandatory"]:
                    # in this case we need to define and load MLM head of the model
                    self.head[task] = eval(self.task_parameters[task]["head"])(
                        config
                    )  #, self.encoder.embeddings.word_embeddings.weight)
                else:
                    self.head[task] = eval(self.task_parameters[task]["head"])(
                        config, num_labels=self.num_labels_dic[num_label])
            else:
                # we define empty heads for downstream use
                self.head[task] = None
Example #3
    def get_loss(loss_func,
                 label,
                 num_label_dic,
                 labels,
                 logits_dict,
                 task,
                 logit_label,
                 head_label=None):
        if label not in ["heads", "types"]:
            try:
                loss = loss_func(
                    logits_dict[logit_label].view(-1,
                                                  num_label_dic[logit_label]),
                    labels.view(-1))
            except Exception as e:
                print("ERROR task {} num_label {} , labels {} ".format(
                    task, num_label_dic, labels.view(-1)))
                raise e

        elif label == "heads":
            # trying an alternative formulation for the loss
            loss = CrossEntropyLoss(
                ignore_index=LABEL_PARAMETER[label]["pad_value"],
                reduction="sum")(logits_dict[logit_label].view(
                    -1, logits_dict[logit_label].size(2)), labels.view(-1))
            # another possibility is log softmax followed by an L1 loss (leads to different results)

        elif label == "types":
            assert head_label is not None, "ERROR head_label should be passed"
            # gold label after removing 0 gold
            gold = labels[head_label != LABEL_PARAMETER["heads"]["pad_value"]]
            # pred logits (after removing -1) on the gold heads
            pred = logits_dict["parsing-types"][(
                head_label != LABEL_PARAMETER["heads"]["pad_value"]
            ).nonzero()[:, 0], (
                head_label != LABEL_PARAMETER["heads"]["pad_value"]
            ).nonzero()[:, 1], head_label[
                head_label != LABEL_PARAMETER["heads"]["pad_value"]]]
            # remark : in the way it's coded for paring : the padding is already removed (so ignore index is null)
            loss = loss_func(pred, gold)

        return loss
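
# Editorial sketch of the advanced indexing used for the "types" loss above,
# with hypothetical shapes (not the model's real tensors): for every non-padded
# gold arc (dependent, head) we pick the type logits predicted at that head.
import torch

type_logits = torch.randn(2, 5, 5, 3)             # (batch, dependent, head, n_types)
gold_heads = torch.tensor([[1, 2, -1, -1, -1],
                           [3, -1, -1, -1, -1]])  # -1 plays the role of the pad value
keep = (gold_heads != -1).nonzero()               # (batch, dependent) indices of real arcs
picked = type_logits[keep[:, 0], keep[:, 1], gold_heads[gold_heads != -1]]
print(picked.shape)                               # torch.Size([3, 3]): one type distribution per gold arc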
def preprocess_batch_string_for_bert(batch,
                                     start_token,
                                     end_token,
                                     rp_space=False):
    """
    adding starting and ending token in raw sentences
    :param batch:
    :return:
    """
    for i in range(len(batch)):
        batch[i][0] = start_token
        batch[i][-1] = end_token
        if rp_space:
            batch[i] = rp_space_func(batch[i])
        batch[i] = " ".join(batch[i])
    return batch
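
# Editorial usage sketch: the function overwrites the first and last token of
# each (already tokenized) sentence with the start/end tokens and joins the
# tokens into one string. The token values below are purely illustrative.
batch_example = [["<dummy>", "How", "are", "you", "<dummy>"]]
print(preprocess_batch_string_for_bert(batch_example, "[CLS]", "[SEP]"))
# -> ['[CLS] How are you [SEP]']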
Example #5
 def sanity_test_parsing_label(labels, output_tokens_tensor_new,
                               input_alignement_with_raw, cumulate_shift):
     for sent in range(labels.size(0)):
         ind_max = len(cumulate_shift[sent]) - 1
         for _ in range(5):
             ind = np.random.choice(range(ind_max))
             # the new label must equal the old one at the corresponding position, plus 1, plus the number of non-first-BPE tokens (in the original indexing of the label)
             if output_tokens_tensor_new[sent][ind] not in [
                     ROOT_HEADS_INDEX + 1, END_HEADS_INDEX,
                     PAD_ID_LOSS_STANDART
             ]:
                 try:
                     assert output_tokens_tensor_new[sent][ind] == labels[sent, int(input_alignement_with_raw[sent][ind])]+CLS_ADJUST+cumulate_shift[sent][labels[sent, int(input_alignement_with_raw[sent][ind])]], \
                     "ERROR sent {} ind word {} " \
                     "new {} and old {} cumulted {} ".format(sent, ind, output_tokens_tensor_new[sent][ind],
                                                         labels[sent, input_alignement_with_raw[sent][ind]], cumulate_shift[sent][ind])
                 except AssertionError as e:
                     print(e)
                     pdb.set_trace()
def main(args, dict_path, model_dir):

    encoder = BERT_MODEL_DIC[args.bert_model]["encoder"]
    vocab_size = BERT_MODEL_DIC[args.bert_model]["vocab_size"]
    voc_tokenizer = BERT_MODEL_DIC[args.bert_model]["vocab"]

    tokenizer = eval(BERT_MODEL_DIC[args.bert_model]["tokenizer"])
    random.seed(args.seed)

    if args.model_id_pref is None:
        run_id = str(uuid4())[:4]
    else:
        run_id = args.model_id_pref + "1"

    if args.init_args_dir is None:
        dict_path += "/" + run_id
        os.mkdir(dict_path)
    tokenizer = tokenizer.from_pretrained(voc_tokenizer,
                                          do_lower_case=args.case == "lower",
                                          shuffle_bpe_embedding=False)
    mask_id = tokenizer.encode([
        "[MASK]"
    ])[0] if args.bert_model == "bert_base_multilingual_cased" else None

    _dev_path = args.dev_path if args.dev_path is not None else args.train_path
    word_dictionary, word_norm_dictionary, char_dictionary, pos_dictionary, \
    xpos_dictionary, type_dictionary = \
        conllu_data.load_dict(dict_path=dict_path,
                              train_path=args.train_path if args.init_args_dir is None else None,
                              dev_path=args.dev_path if args.init_args_dir is None else None,
                              test_path=args.test_paths if args.init_args_dir is None else None,
                              word_embed_dict={},
                              dry_run=False,
                              expand_vocab=False,
                              word_normalization=True,
                              force_new_dic=False,
                              tasks=args.tasks,
                              pos_specific_data_set=None,
                              #pos_specific_data_set=args.train_path[1] if len(args.tasks) > 1 and len(
                              #    args.train_path) > 1 and "pos" in args.tasks else None,
                              case=args.case,
                              # if not normalize pos or parsing in tasks we don't need dictionary
                              do_not_fill_dictionaries=len(set(["normalize", "pos", "parsing"]) & set(
                                  [task for tasks in args.tasks for task in tasks])) == 0,
                              add_start_char=True if args.init_args_dir is None else None,
                              verbose=1)

    num_labels_per_task, task_to_label_dictionary = get_vocab_size_and_dictionary_per_task(
        [task for tasks in args.tasks for task in tasks],
        vocab_bert_wordpieces_len=vocab_size,
        pos_dictionary=pos_dictionary,
        type_dictionary=type_dictionary,
        task_parameters=TASKS_PARAMETER)

    model = make_bert_multitask(
        args=args,
        pretrained_model_dir=model_dir,
        init_args_dir=args.init_args_dir,
        tasks=[task for tasks in args.tasks for task in tasks],
        mask_id=mask_id,
        encoder=encoder,
        num_labels_per_task=num_labels_per_task)

    def get_n_params(model):
        pp = 0
        for p in list(model.parameters()):
            nn = 1
            for s in list(p.size()):

                nn = nn * s
            pp += nn
        return pp
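    # Editorial note: PyTorch offers an equivalent one-liner for the count
    # computed by get_n_params above:
    #     total = sum(p.numel() for p in model.parameters())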

    param = get_n_params(model)
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])


    data = ["I am here", "How are you"]
    model.eval()
    n_obs = args.n_sent
    max_len = args.max_seq_len
    lang_ls = args.raw_text_code
    data_all, y_all = load_lang_ls(DATA_UD_RAW, lang_ls=lang_ls)

    reg = linear_model.LogisticRegression()
    X_train = OrderedDict()
    X_test = OrderedDict()
    y_train = OrderedDict()
    y_test = OrderedDict()
    # just to get the keys
    layer_head_att = get_hidden_representation(data,
                                               model,
                                               tokenizer,
                                               max_len=max_len,
                                               output_dic=False,
                                               pad_below_max_len=True)
    layer_head_att = layer_head_att[0]
    report_ls = []
    accuracy_dic = OrderedDict()
    sampling = args.sampling
    for ind, layer_head in enumerate(list(layer_head_att.keys())):
        report = OrderedDict()
        accuracy_ls = []
        layer_head = list(
            layer_head_att.keys())[len(list(layer_head_att.keys())) - ind - 1]
        for _ in range(sampling):
            sample_ind = random.sample(population=range(len(data_all)),
                                       k=n_obs)
            sample_ind_test = random.sample(population=range(len(data_all)),
                                            k=n_obs)

            data = data_all[sample_ind]
            y = y_all[sample_ind]

            all = get_hidden_representation(data,
                                            model,
                                            tokenizer,
                                            max_len=max_len,
                                            output_dic=False,
                                            pad_below_max_len=True)

            layer_head_att = all[0]

            def reshape_x(z):
                return np.array(z.view(z.size(0) * z.size(1), -1))

            def reshape_y(z, n_seq):
                """Repeat each element of z n_seq times."""
                new_z = []
                for _z in z:
                    new_z.extend([_z for _ in range(n_seq)])
                return np.array(new_z)
                #return np.array(z.view(z.size(0), -1).transpose(1, 0))
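            # Editorial example (hypothetical values): reshape_y(np.array([0, 1]), n_seq=3)
            # returns array([0, 0, 0, 1, 1, 1]); each sentence-level label is repeated
            # once per (padded) token position so that it lines up with reshape_x.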

            #X_train[layer_head] = np.array(layer_head_att[layer_head].view(layer_head_att[layer_head].size(0), -1).transpose(1,0))
            X_train[layer_head] = reshape_x(layer_head_att[layer_head])
            y_train[layer_head] = reshape_y(y, max_len)
            #y_train[layer_head] = y

            reg.fit(X=X_train[layer_head], y=y_train[layer_head])

            # test
            data_test = data_all[sample_ind_test]
            layer_head_att_test = get_hidden_representation(
                data_test,
                model,
                tokenizer,
                max_len=max_len,
                output_dic=False,
                pad_below_max_len=True)
            X_test[layer_head] = reshape_x(layer_head_att_test[layer_head])
            y_test[layer_head] = reshape_y(y_all[sample_ind_test], max_len)

            y_pred = reg.predict(X_test[layer_head])

            accuracy = np.sum(y_test[layer_head] == y_pred) / len(y_test[layer_head])
            accuracy_ls.append(accuracy)

        accuracy_dic[layer_head] = np.mean(accuracy_ls)
        layer = layer_head.split("-")[0]
        if layer not in accuracy_dic:
            accuracy_dic[layer] = []
        accuracy_dic[layer].append(np.mean(accuracy_ls))

        print(
            f"Regression {layer_head} Accuracy test {np.mean(accuracy_ls)} on {n_obs * max_len}"
            f" word sample from {len(lang_ls)} languages task {args.tasks} args {'/'.join(args.init_args_dir.split('/')[-2:]) if args.init_args_dir is not None else None} "
            f"bert {args.bert_model} random init {args.random_init} std {np.std(accuracy_ls)} sampling {len(accuracy_ls)}=={sampling}"
        )

        #report["model_type"] = args.bert_model if args.init_args_dir is None else args.tasks[0][0]+"-tune"
        #report["accuracy"] = np.mean(accuracy_ls)
        #report["sampling"] = len(accuracy_ls)
        #report["std"] = np.std(accuracy_ls)
        #report["n_sent"] = n_obs
        #report["n_obs"] = n_obs*max_len

        report = report_template(
            metric_val="accuracy",
            subsample=",".join(lang_ls),
            info_score_val=sampling,
            score_val=np.mean(accuracy_ls),
            n_sents=n_obs,
            avg_per_sent=np.std(accuracy_ls),
            n_tokens_score=n_obs * max_len,
            model_full_name_val=run_id,
            task="attention_analysis",
            evaluation_script_val="exact_match",
            model_args_dir=args.init_args_dir
            if args.init_args_dir is not None else args.random_init,
            token_type="word",
            report_path_val=None,
            data_val=layer_head)
        report_ls.append(report)

        # break

    for key in accuracy_dic:
        print(
            f"Summary {key} {np.mean(accuracy_dic[key])} model word sample from {len(lang_ls)} languages task {args.tasks} args {'/'.join(args.init_args_dir.split('/')[-2:]) if args.init_args_dir is not None else None} "
            f"bert {args.bert_model} random init {args.random_init} std {np.std(accuracy_ls)} sampling {len(accuracy_ls)}=={sampling}"
        )

    if args.report_dir is None:
        report_dir = PROJECT_PATH + f"/../../analysis/attention_analysis/report/{run_id}-report"
        os.mkdir(report_dir)
    else:
        report_dir = args.report_dir
    assert os.path.isdir(report_dir)
    with open(report_dir + "/report.json", "w") as f:
        json.dump(report_ls, f)
    overall_report = args.overall_report_dir + "/" + args.overall_label + "-grid-report.json"
    with open(overall_report, "r") as g:
        report_all = json.load(g)
        report_all.extend(report_ls)
    with open(overall_report, "w") as file:
        json.dump(report_all, file)

    print("{} {} ".format(REPORT_FLAG_DIR_STR, overall_report))
Example #7
    def forward(self,
                input_ids_dict,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                head_masks=None):
        if labels is None:
            labels = OrderedDict()
        sequence_output_dict = OrderedDict()
        logits_dict = OrderedDict()
        loss_dict = OrderedDict()
        # sanity check: all provided labels should be in self.labels_supported
        for label, value in labels.items():
            assert label in self.labels_supported, "label {} in {} not supported".format(
                label, self.labels_supported)

        # task_wise layer attention
        printout_allocated_gpu_memory(1, " foward starting ")
        for input_name, input_tensors in input_ids_dict.items():
            # not able to output all layers anymore
            sequence_output = self.encoder(
                input_tensors,
                token_type_ids=None,
                attention_mask=attention_mask[input_name])
            try:
                assert len(sequence_output) == 2 + int(self.config.output_attentions) + int(self.config.output_hidden_states), \
                    f"ERROR should be {2 + int(self.config.output_attentions) + int(self.config.output_hidden_states_per_head) + int(self.config.output_hidden_states)} : check that you are not also outputting all hidden states"
                # to remove : int(self.config.output_hidden_states_per_head)
            except Exception as e:
                print(f"Exception on output sequence {e}")
                assert len(sequence_output) == 2
            logits = sequence_output[0]

            sequence_output_dict[input_name] = logits
            printout_allocated_gpu_memory(1, " forward pass bert")

        for task in self.tasks:
            # we don't use a mask for parsing heads (cf. test performed below: the -1 already ignores the heads we don't want)
            # NB : head_masks for parsing only applies to heads, not types
            head_masks_task = None  # head_masks.get(task, None) if task != "parsing" else None
            # NB : head_mask here means masks specific to the module's attention heads (nothing related to parsing!)
            assert self.task_parameters[task]["input"] in sequence_output_dict, \
                "ERROR input {} of task {} was not found in input_ids_dict {}" \
                " and therefore not in sequence_output_dict {} ".format(self.task_parameters[task]["input"],
                                                                        task, input_ids_dict.keys(),
                                                                        sequence_output_dict.keys())

            if self.head[task].__class__.__name__ != BertOnlyMLMHead.__name__:  # i.e. not isinstance(self.head[task], BertOnlyMLMHead)

                logits_dict[task] = self.head[task](
                    sequence_output_dict[self.task_parameters[task]["input"]],
                    head_mask=head_masks_task)
            else:
                logits_dict[task] = self.head[task](
                    sequence_output_dict[self.task_parameters[task]["input"]])
            # test performed : (logits_dict[task][0][1,2,:20]==float('-inf'))==(labels["parsing_heads"][1,:20]==-1)
            # handle several labels at output (e.g  parsing)

            printout_allocated_gpu_memory(1, " forward pass head {}".format(task))

            logits_dict = self.rename_multi_modal_task_logits(
                labels=self.task_parameters[task]["label"],
                task=task,
                logits_dict=logits_dict,
                task_parameters=self.task_parameters)

            printout_allocated_gpu_memory(1, "after renaming")

            for logit_label in logits_dict:

                label = re.match("(.*)-(.*)", logit_label)
                assert label is not None, "ERROR logit_label {}".format(
                    logit_label)
                label = label.group(2)
                if label in self.task_parameters[task]["label"]:
                    _labels = None
                    if self.task_parameters[task]["input"] == "input_masked":
                        _labels = labels.get(label)
                        if _labels is not None:
                            _labels = _labels.clone()
                            _labels[input_ids_dict["input_masked"] != self.
                                    mask_index_bert] = PAD_ID_LOSS_STANDART
                    else:
                        _labels = labels.get(label)
                    printout_allocated_gpu_memory(
                        1, " get label head {}".format(logit_label))
                    if _labels is not None:
                        #print("LABEL label {} {}".format(label, _labels))
                        loss_dict[logit_label] = self.get_loss(
                            loss_func=self.task_parameters[task]["loss"],
                            label=label,
                            num_label_dic=self.num_labels_dic,
                            labels=_labels,
                            logits_dict=logits_dict,
                            task=task,
                            logit_label=logit_label,
                            head_label=labels["heads"]
                            if label == "types" else None)
                    printout_allocated_gpu_memory(1,
                                                  " get loss {}".format(task))
                printout_allocated_gpu_memory(1, " putting to cpu {}".format(logit_label))
        # third output is for potential attention weights
        output = (
            logits_dict,
            loss_dict,
        )

        output = output + sequence_output[2:]

        return output
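
# Editorial note on the output of forward above: a tuple
# (logits_dict, loss_dict, *extras), where the optional extras are whatever the
# encoder returned beyond its first two outputs (e.g. attentions and/or hidden
# states when the corresponding config flags are set).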
Example #8
def write_conll_multitask(format, dir_pred, dir_original, src_text_ls,
                          pred_per_task, tasks, task_parameters, cp_paste=False, gold=False,
                          all_indexes=None, sep_token=None, cls_token=None,
                          ind_batch=0, new_file=False, cut_sent=False, verbose=0):

    assert format in ["conll"]
    max_len_word = None
    writing_top = 1
    # assert that each task predicts the same number of samples per batch
    pred_task_len_former = -1
    task_former = ""

    # assertion on number of samples predicted
    for task_label in pred_per_task:

        pred_task_len = len(pred_per_task[task_label]) if gold else len(pred_per_task[task_label][writing_top-1])
        _task = re.match("(.*)-(.*)", task_label)
        if _task is not None:  # , "ERROR writer could not match {}".format(task_label)
            task = _task.group(1)
        else:
            task = task_label
        if pred_task_len_former > 0:
            assert pred_task_len == pred_task_len_former, \
                "ERROR {} and {} task ".format(task_former, task_label)
            if not gold:
                assert pred_task_len == len(src_text_ls[task_parameters[task]["input"]]), "ERROR  src len {} and pred len {} ".format(len(src_text_ls[task_parameters[task]["input"]]),pred_task_len)
            # we check also other input length
            if src_text_ls.get("input_masked") is not None:
                assert pred_task_len == len(src_text_ls["input_masked"])
            if src_text_ls.get("wordpieces_inputs_words") is not None:
                assert pred_task_len == len(src_text_ls["wordpieces_inputs_words"]), "ERROR mismatch source " \
                                                                            "wordpieces_inputs_words {}  " \
                                                                            "and prediction {} ".format(src_text_ls, pred_per_task[task_label])
            if src_text_ls.get("wordpieces_inputs_raw_tokens") is not None:
                assert pred_task_len == len(src_text_ls["wordpieces_inputs_raw_tokens"]), \
                                    "ERROR mismatch source wordpieces_inputs_" \
                                    "raw_tokens {} and prediction {} ".format(src_text_ls, pred_per_task[task_label])
            assert pred_task_len == all_indexes.shape[0], "ERROR mismatch index {} and all_indexes {} : pred {}".format(pred_task_len, all_indexes.shape[0], pred_per_task[task_label])
        pred_task_len_former = pred_task_len

        task_former = task_label
        if format == "conll":
            mode_write = "w" if new_file else "a"
        if new_file:
            printing("CREATING NEW FILE (io_/dat/normalized_writer) : {} ", var=[dir_pred], verbose=verbose,
                     verbose_level=1)

    pos_label = "pos-pos" if not gold else "pos"
    types_label = "parsing-types" if not gold else "types"
    heads_label = "parsing-heads" if not gold else "heads"
    n_masks_mwe_label = "n_masks_mwe-n_masks_mwe" if not gold else "n_masks_mwe"
    mwe_detection_label = "mwe_detection-mwe_detection" if not gold else "mwe_detection"

    with open(dir_pred, mode_write) as norm_file:
        with open(dir_original, mode_write) as original:
            len_original = 0
            for ind_sent in range(all_indexes.shape[0]):
                pred_sent = OrderedDict()
                # NB : length assertion for each input-output pair (correcting if possible)
                # TODO standardize !! INCONSISTENCIES WHEN GOLD IS TRUE VS FALSE: IF GOLD, pred_per_task is indexed by labels (no 1-1 relation to task and src!)
                for task_label_or_gold_label in pred_per_task:
                    #task, _, label_processed = get_task_name_based_on_logit_label(task_label, label_processed)
                    if gold:
                        pred_sent[task_label_or_gold_label] = pred_per_task[task_label_or_gold_label][ind_sent]
                    else:
                        pred_sent[task_label_or_gold_label] = pred_per_task[task_label_or_gold_label][writing_top-1][ind_sent]
                    try:
                        # TODO : standardize (the first if is needed because we handle both gold data indexed by label and predictions labelled by task+label)
                        if gold:
                            try:
                                src = src_text_ls[LABEL_PARAMETER[task_label_or_gold_label]["default_input"]][ind_sent]
                            except Exception as e:
                                src = src_text_ls["input_masked"][ind_sent]
                        else:
                            _task = re.match("(.*)-(.*)", task_label_or_gold_label)
                            assert _task is not None, "ERROR writer could not match {}".format(task_label_or_gold_label)
                            _label = _task.group(2)
                            _task = _task.group(1)
                            src = src_text_ls[TASKS_PARAMETER[_task]["input"]][ind_sent]

                        assert len(src) == len(pred_sent[task_label_or_gold_label]),"WARNING : (writer) task {} original_sent len {} {} \n  predicted sent len {} {}".format(task_label_or_gold_label, len(src), src,len(pred_sent[task_label_or_gold_label]), pred_sent[task_label_or_gold_label])
                    except AssertionError as e:
                        print(e)
                        if len(src) > len(pred_sent[task_label_or_gold_label]):
                            pred_sent[task_label_or_gold_label].extend(["UNK" for _ in range(len(src)-len(pred_sent[task_label_or_gold_label]))])
                            print("WARNING (writer) : original larger than prediction : appending UNK tokens for writing")
                        else:
                            print("WARNING (writer) : original smaller than prediction for {}".format(task_label_or_gold_label))

                norm_file.write("#\n")
                original.write("#\n")
                norm_file.write("#sent_id = {} \n".format(ind_sent+ind_batch+1))
                original.write("#sent_id = {} \n".format(ind_sent+ind_batch+1))
                ind_adjust = 0

                #for ind, original_token in enumerate(original_sent):
                last_mwe_index = -1
                adjust_mwe = 0
                for ind in all_indexes[ind_sent, :]:
                    # WE REMOVE SPECIAL TOKENS ONLY IF THEY APPEAR AT THE BEGINNING OR AT THE END
                    # based on the source token !! (it tells us when we stop) (we never want to use gold information)
                    if "-" in ind and ind != "-1":
                        matching_mwe_ind = re.match("([0-9]+)-([0-9]+)", str(ind))
                        assert matching_mwe_ind is not None, "ERROR ind is {} : could not found mwe index".format(ind)
                        last_mwe_index = int(matching_mwe_ind.group(2))
                        ind_mwe = int(matching_mwe_ind.group(1))

                        original_token = src_text_ls["wordpieces_inputs_raw_tokens"][ind_sent][ind_mwe] if mwe_detection_label in pred_per_task or "wordpieces_inputs_words" in pred_per_task or n_masks_mwe_label in pred_per_task else "NOT_NEEDED"
                        adjust_mwe += (last_mwe_index-ind_mwe)
                        #assert ind_adjust == 0, "ERROR not supported"

                        mwe_meta = "Norm={}|mwe_detection={}|n_masks_mwe={}".format("_", pred_sent[mwe_detection_label][ind_mwe] if mwe_detection_label in pred_per_task else "_",
                                                                                    pred_sent[n_masks_mwe_label][ind_mwe] if n_masks_mwe_label in pred_per_task else "_")

                        norm_file.write("{index}\t{original}\t_\t{pos}\t_\t_\t{dep}\t_\t{types}\t{norm}\n".format(index=ind, original=original_token, pos="_", types="_", dep="_", norm=mwe_meta))
                        original.write("{}\t{}\t_\t_\t_\t_\t_\t_\t{}\t_\n".format(ind, original_token, "_"))
                        continue
                    else:
                        ind = int(ind)
                        try:
                            if "normalize" in [task for _tasks in tasks for task in _tasks]:

                                original_token = src_text_ls["wordpiece_words_src_aligned_with_norm"][ind_sent][ind]
                                original_pretokenized_field = "wordpiece_words_src_aligned_with_norm"
                            else:
                                original_token = src_text_ls["wordpieces_inputs_words"][ind_sent][ind]
                                original_pretokenized_field = "wordpieces_inputs_words"
                        except Exception as e:
                            original_token = src_text_ls["input_masked"][ind_sent][ind]
                            original_pretokenized_field = "input_masked"
                        # asserting that we have everything together on the source side
                        if ind > last_mwe_index:
                            if src_text_ls.get("wordpieces_inputs_raw_tokens") is not None:
                                try:
                                    assert src_text_ls[original_pretokenized_field][ind_sent][ind] == src_text_ls["wordpieces_inputs_raw_tokens"][ind_sent][ind-adjust_mwe], \
                                    "ERROR sequence {} on non-mwe tokens : raw and tokenized " \
                                    "should be same but are raw {} tokenized {}".format(original_pretokenized_field, src_text_ls["wordpieces_inputs_raw_tokens"][ind_sent][ind],
                                                                                        src_text_ls[original_pretokenized_field][ind_sent][ind+adjust_mwe])
                                except AssertionError as e:
                                    print("WARNING sanity checking input failed (normalized_writer) (might be due to dropout) {}".format(e))
                    max_len_word = max(len(original_token), max_len_word if max_len_word is not None else 0)
                    #if original_token in SPECIAL_TOKEN_LS and (ind+1 == len(original_sent) or ind == 0):
                    if (original_token in SPECIAL_TOKEN_LS or original_token in [cls_token, sep_token]):
                        # ind 0 is skipped because it corresponds to CLS
                        ind_adjust = 1
                        continue

                    pos = pred_sent[pos_label][ind] if pos_label in pred_per_task else "_"
                    types = pred_sent[types_label][ind] if types_label in pred_per_task else "_"
                    heads = pred_sent[heads_label][ind] if heads_label in pred_per_task else ind - 1

                    tenth_col = "Norm={}|mwe_detection={}|n_masks_mwe={}".format(pred_sent["normalize"][ind] if "normalize" in pred_per_task else "_",
                                                                                 pred_sent[mwe_detection_label][ind-adjust_mwe] if mwe_detection_label in pred_per_task else "_",
                                                                                 pred_sent[n_masks_mwe_label][ind-adjust_mwe] if n_masks_mwe_label in pred_per_task else "_")

                    norm_file.write("{index}\t{original}\t_\t{pos}\t_\t_\t{dep}\t_\t{types}\t{norm}\n".format(index=ind, original=original_token, pos=pos, types=types, dep=heads, norm=tenth_col))
                    original.write("{}\t{}\t_\t_\t_\t_\t_\t_\t{}\t_\n".format(ind, original_token, ind-1))
                    if cut_sent and ind > 50:
                        print("CUTTING SENT index {} > 50 ".format(ind))
                        break
                norm_file.write("\n")
                original.write("\n")
        printing("WRITING predicted batch of {} original and {} normalized", var=[dir_original, dir_pred], verbose=verbose, verbose_level=2)
    assert max_len_word is not None, "ERROR : something went wrong in the writer"
    return max_len_word
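
# Editorial illustration of one line written to dir_pred by the writer above
# (tab-separated, CoNLL-like; the token and values are hypothetical):
# 3	books	_	NOUN	_	_	2	_	obj	Norm=_|mwe_detection=_|n_masks_mwe=_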
Example #9
def overall_word_level_metric_measure(task_label,
                                      pred_label,
                                      gold_sent_ls_dict,
                                      pred_sent_ls_topk_dict,
                                      topk,
                                      metric="exact_match",
                                      samples=None,
                                      src_detokenized=None,
                                      reference_word_dic=None,
                                      compute_intersection_score=True,
                                      mask_token=None,
                                      cls_token=None,
                                      sep_token=None,
                                      agg_func_ls=None):
    """
    'metric' based on a word level comparison of (pred,gold) : e.g : exact_match , edit
    'agg_func' based on a aggregation func to get the overall batch score : e.g : sum
    :param metric:
    :param agg_func:
    :return batch : score, number of token measured
    """
    if samples is None:
        samples = ["all"]
    if agg_func_ls is None:
        agg_func_ls = ["sum"]
    if task_label == "parsing_types":
        assert "parsing_heads" in gold_sent_ls_dict and "parsing_heads" in pred_sent_ls_topk_dict, \
            "ERROR : to compute the score of parsing_types : parsing_heads is required "
    assert isinstance(samples, list)
    assert len(set(samples) & set(AVAILABLE_EVALUATION_SAMPLE_FILTER)) > 0, \
        "ERROR : one of the samples in {} not supported {}".format(samples, AVAILABLE_EVALUATION_SAMPLE_FILTER)

    assert isinstance(agg_func_ls, list)

    pred_sent_ls_topk = pred_sent_ls_topk_dict[pred_label]
    gold_sent_ls = gold_sent_ls_dict[task_label]

    assert len(pred_sent_ls_topk) == topk, \
        "ERROR topk {} not consistent with prediction list of len {}".format(topk, len(pred_sent_ls_topk))
    overall_score_ls_sent = []
    intersected_samples = []

    if compute_intersection_score:
        intersected_samples, sample_to_intersesct = get_intersection_score(
            samples)

    overall_filter_ls = {
        sample: []
        for sample in samples + intersected_samples
    }

    skipping_sent = 0

    for gold_ind_sent, gold_sent in enumerate(gold_sent_ls):
        try:
            assert len(gold_sent) == len(
                pred_sent_ls_topk[0][gold_ind_sent]
            ), "ERROR : gold_sent : {} len {} and pred_sent_ls_topk[0][gold_ind_sent] {} len {} ".format(
                gold_sent, len(gold_sent), pred_sent_ls_topk[0][gold_ind_sent],
                len(pred_sent_ls_topk[0][gold_ind_sent]))
            # WARNING : this might not be true in POS mode for some cases (when mask bpe is used)
        except Exception as e:
            raise (e)
            print("ERROR (scoring/report) on task {} ".format(task_label), e)
            if len(gold_sent) > len(pred_sent_ls_topk[0][gold_ind_sent]):
                counter = 0
                n_to_solve = len(gold_sent) - len(
                    pred_sent_ls_topk[0][gold_ind_sent])
                for ind in range(n_to_solve):
                    counter += gold_sent[-n_to_solve:][ind] == "_PAD_POS"
                if n_to_solve == counter:
                    gold_sent = gold_sent[:-n_to_solve]
                    src_detokenized[gold_ind_sent] = src_detokenized[
                        gold_ind_sent][:-n_to_solve]
                    print(
                        "WARNING : we handled mismatch between pred len/src and gold len by cutting it based on "
                        "GOLD padding (SHOULD BE RAISED IN TASK POS)")
                    # NB : this should be handle properly :
                    #   the detokenization has a problem when dropout_bpe_mask is not null
                else:
                    if pred_sent_ls_topk[0][gold_ind_sent][-1] == "[SEP]":
                        pred_sent_ls_topk[0][gold_ind_sent] = pred_sent_ls_topk[
                            0][gold_ind_sent] + [
                                "[SEP]" for _ in range(
                                    len(gold_sent) -
                                    len(pred_sent_ls_topk[0][gold_ind_sent]))
                            ]
                        print("APPENDING pred_sent_ls_topk[0] {} with {} ".
                              format(
                                  len(gold_sent) -
                                  len(pred_sent_ls_topk[0][gold_ind_sent]),
                                  pred_sent_ls_topk[0][gold_ind_sent]))
                        assert len(gold_sent) == len(
                            pred_sent_ls_topk[0][gold_ind_sent])
                    else:
                        print(
                            Exception(
                                "ERROR {} : could not handled mismatch between pred {} len/src {} "
                                "and gold len by cutting it based on GOLD padding (SHOULD BE RAISED IN TASK POS)"
                                .format(e, gold_sent,
                                        pred_sent_ls_topk[0][gold_ind_sent])))
                        skipping_sent += len(gold_sent_ls)
                        overall_score_ls_sent = [[0]]
                        break
            else:
                skipping_sent += len(gold_sent_ls)
                overall_score_ls_sent = [[0]]

                break
        if src_detokenized is not None and samples[0] != "all" and len(samples) > 1:
            # otherwise we don't need src_detokenized
            assert len(gold_sent) == len(
                src_detokenized[gold_ind_sent]
            ), "ERROR src_detokenized {} and gold_sent_ls for sent {} have different length ".format(
                gold_sent, src_detokenized[gold_ind_sent])

        score_sent = []
        filter_sent = {_sample: [] for _sample in samples}

        for ind_word in range(len(gold_sent)):
            gold_token = gold_sent[ind_word]
            topk_word_pred = [
                pred_sent_ls_topk[top][gold_ind_sent][ind_word]
                for top in range(topk)
            ]
            # handle the LAS specificity: a type is correct iff its label is correct and its head is correct
            if task_label == "types":
                gold_token_to_score = gold_token \
                    if gold_sent_ls_dict["heads"][gold_ind_sent][ind_word] == pred_sent_ls_topk_dict["parsing-heads"][0][gold_ind_sent][ind_word] else "ZERO-ING-SCORE-TYPES-AS-HEADS-IS-UNCORRECT"

            else:
                gold_token_to_score = gold_token
            score_sent.append(
                word_level_scoring(metric=metric,
                                   gold=gold_token_to_score,
                                   topk_pred=topk_word_pred,
                                   topk=topk))
            for ind_sample, _sample in enumerate(samples):
                try:

                    src = src_detokenized[gold_ind_sent][
                        ind_word] if _sample != "all" and not _sample.startswith(
                            "n_masks") else None
                except Exception as e:
                    print("ERROR (scoring/report) handling src {} index ({},{}) ".format(src_detokenized, gold_ind_sent, ind_word), e)
                    src = None
                filter_sent[_sample].append(
                    word_level_filter(sample=_sample,
                                      gold=gold_token,
                                      topk_pred=topk_word_pred,
                                      topk=topk,
                                      src=src,
                                      is_mwe=gold_sent_ls_dict[task_label]
                                      [gold_ind_sent][ind_word],
                                      word_reference_dic_ls=reference_word_dic,
                                      mask_token=mask_token,
                                      cls_token=cls_token,
                                      sep_token=sep_token))
            if compute_intersection_score:
                for ind_sample, _sample in enumerate(sample_to_intersesct):
                    for ind_sample_2 in range(ind_sample):
                        inter = _sample + "-n-" + sample_to_intersesct[
                            ind_sample_2]
                        if filter_sent.get(inter, None) is None:
                            filter_sent[inter] = []
                        filter_sent[inter].append(
                            word_level_filter(
                                sample=_sample,
                                sample_2=sample_to_intersesct[ind_sample_2],
                                gold=gold_token,
                                topk_pred=topk_word_pred,
                                topk=topk,
                                src=src_detokenized[gold_ind_sent][ind_word],
                                word_reference_dic_ls=reference_word_dic,
                                is_mwe=gold_sent_ls_dict[task_label]
                                [gold_ind_sent][ind_word],
                                cls_token=cls_token,
                                sep_token=sep_token,
                                mask_token=mask_token))
        if compute_intersection_score:
            for _sample in samples + intersected_samples:
                overall_filter_ls[_sample].append(filter_sent[_sample])
        else:
            for _sample in samples:
                overall_filter_ls[_sample].append(filter_sent[_sample])
        overall_score_ls_sent.append(score_sent)

    result = {agg_func: {} for agg_func in agg_func_ls}

    for agg_func in agg_func_ls:
        for sample in samples + intersected_samples:
            try:

                result[agg_func][sample] = {
                    "score":
                    agg_func_batch_score(
                        overall_ls_sent_score=overall_score_ls_sent,
                        agg_func=agg_func,
                        overall_filter=overall_filter_ls[sample]),
                    "agg_func":
                    agg_func,
                    "metric":
                    "exact_match",
                    "n_tokens":
                    agg_func_batch_score(
                        overall_ls_sent_score=overall_score_ls_sent,
                        overall_filter=overall_filter_ls[sample],
                        agg_func="n_tokens"),
                    "n_sents":
                    agg_func_batch_score(
                        overall_ls_sent_score=overall_score_ls_sent,
                        overall_filter=overall_filter_ls[sample],
                        agg_func="n_sents")
                }

            except Exception as e:
                print(e)
                pdb.set_trace()

    return result, skipping_sent, samples + intersected_samples
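
# Editorial note on the structure of `result` returned above (illustrative,
# assuming agg_func_ls=["sum"] and samples=["all"]):
# {"sum": {"all": {"score": <aggregated batch score>,
#                  "agg_func": "sum",
#                  "metric": "exact_match",
#                  "n_tokens": <number of tokens scored>,
#                  "n_sents": <number of sentences scored>}}}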
Example #10
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
                        **kwargs):
        r"""Instantiate a pretrained pytorch model from a pre-trained model configuration.

        The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with ``model.train()``

        The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model.
        It is up to you to train those weights with a downstream fine-tuning task.

        The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded.

        Parameters:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
                - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from the saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force (re-)downloading the model weights and configuration files, overriding the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it is loaded) and to initialize the model (e.g. ``output_attention=True``). They behave differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Examples::

            model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
        config = kwargs.pop('config', None)
        state_dict = kwargs.pop('state_dict', None)
        cache_dir = kwargs.pop('cache_dir', None)
        from_tf = kwargs.pop('from_tf', False)
        force_download = kwargs.pop('force_download', False)
        proxies = kwargs.pop('proxies', None)
        output_loading_info = kwargs.pop('output_loading_info', False)
        random_init = kwargs.pop("random_init", False)

        kwargs_config = kwargs.copy()

        mapping_keys_state_dic = kwargs.pop("mapping_keys_state_dic", None)
        kwargs_config.pop("mapping_keys_state_dic", None)
        if config is None:
            config, model_kwargs = cls.config_class.from_pretrained(
                pretrained_model_name_or_path,
                *model_args,
                cache_dir=cache_dir,
                return_unused_kwargs=True,
                force_download=force_download,
                **kwargs_config)
        else:
            model_kwargs = kwargs

        # Load model
        if pretrained_model_name_or_path is not None:
            if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
                archive_file = cls.pretrained_model_archive_map[
                    pretrained_model_name_or_path]
            elif os.path.isdir(pretrained_model_name_or_path):
                if from_tf and os.path.isfile(
                        os.path.join(pretrained_model_name_or_path,
                                     TF_WEIGHTS_NAME + ".index")):
                    # Load from a TF 1.0 checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path,
                                                TF_WEIGHTS_NAME + ".index")
                elif from_tf and os.path.isfile(
                        os.path.join(pretrained_model_name_or_path,
                                     TF2_WEIGHTS_NAME)):
                    # Load from a TF 2.0 checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path,
                                                TF2_WEIGHTS_NAME)
                elif os.path.isfile(
                        os.path.join(pretrained_model_name_or_path,
                                     WEIGHTS_NAME)):
                    # Load from a PyTorch checkpoint
                    archive_file = os.path.join(pretrained_model_name_or_path,
                                                WEIGHTS_NAME)
                else:
                    raise EnvironmentError(
                        "Error no file named {} found in directory {} or `from_tf` set to False"
                        .format([
                            WEIGHTS_NAME, TF2_WEIGHTS_NAME,
                            TF_WEIGHTS_NAME + ".index"
                        ], pretrained_model_name_or_path))
            elif os.path.isfile(pretrained_model_name_or_path):
                archive_file = pretrained_model_name_or_path
            else:
                assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(
                    pretrained_model_name_or_path)
                archive_file = pretrained_model_name_or_path + ".index"

            # redirect to the cache, if necessary
            try:
                resolved_archive_file = cached_path(
                    archive_file,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies)
            except EnvironmentError:
                if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
                    msg = "Couldn't reach server at '{}' to download pretrained weights.".format(
                        archive_file)
                else:
                    msg = "Model name '{}' was not found in model name list ({}). " \
                        "We assumed '{}' was a path or url to model weight files named one of {} but " \
                        "couldn't find any such file at this path or url.".format(
                            pretrained_model_name_or_path,
                            ', '.join(cls.pretrained_model_archive_map.keys()),
                            archive_file,
                            [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME])
                raise EnvironmentError(msg)

            if resolved_archive_file == archive_file:
                logger.info("loading weights file {}".format(archive_file))
            else:
                logger.info("loading weights file {} from cache at {}".format(
                    archive_file, resolved_archive_file))
        else:
            resolved_archive_file = None

        # Instantiate model.

        model = cls(config, *model_args, **model_kwargs)

        if state_dict is None and not from_tf:
            state_dict = torch.load(resolved_archive_file, map_location='cpu')

        missing_keys = []
        unexpected_keys = []
        error_msgs = []

        if from_tf:
            if resolved_archive_file.endswith('.index'):
                # Load from a TensorFlow 1.X checkpoint - provided by original authors
                model = cls.load_tf_weights(
                    model, config,
                    resolved_archive_file[:-6])  # Remove the '.index'
            else:
                # Load from our TensorFlow 2.0 checkpoints
                try:
                    from transformers import load_tf2_checkpoint_in_pytorch_model
                    model = load_tf2_checkpoint_in_pytorch_model(
                        model, resolved_archive_file, allow_missing_keys=True)
                except ImportError as e:
                    logger.error(
                        "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
                        "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
                    )
                    raise e
        else:
            # Convert old format to new format if needed from a PyTorch state_dict
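            # (older checkpoints converted from TensorFlow stored LayerNorm parameters
            #  as 'gamma'/'beta'; the corresponding PyTorch modules expect 'weight'/'bias')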
            old_keys = []
            new_keys = []
            for key in state_dict.keys():
                new_key = None
                if 'gamma' in key:
                    new_key = key.replace('gamma', 'weight')
                if 'beta' in key:
                    new_key = key.replace('beta', 'bias')
                if new_key:
                    old_keys.append(key)
                    new_keys.append(new_key)
            for old_key, new_key in zip(old_keys, new_keys):
                state_dict[new_key] = state_dict.pop(old_key)

            # copy state_dict so _load_from_state_dict can modify it
            metadata = getattr(state_dict, '_metadata', None)
            state_dict = state_dict.copy()
            if metadata is not None:
                state_dict._metadata = metadata
            #assert mapping_keys_state_dic is not None, "ERROR did not found mapping dicts for {} ".format(pretrained_model_name_or_path)
            #mapping_keys_state_dic = {"roberta": "encoder", "lm_head": "head.mlm"}
            if mapping_keys_state_dic is not None:
                assert isinstance(mapping_keys_state_dic, dict), \
                    "ERROR: mapping_keys_state_dic should be a dict, got {}".format(type(mapping_keys_state_dic))
                print(
                    "INFO : from_pretrained (assuming an original Google/pretrained checkpoint): "
                    "renaming some state_dict keys with mapping {}".format(
                        mapping_keys_state_dic))

                state_dict = cls.adapt_state_dic_to_multitask(
                    state_dict, keys_mapping=mapping_keys_state_dic)

            def load(module, prefix=''):
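                # Recursively loads `state_dict` into `module` and its children
                # (mirroring torch.nn.Module._load_from_state_dict). Children whose
                # qualified name matches a regex in `not_load_params_ls` are skipped
                # and keep their random initialization; the per-prefix metadata lookup
                # is bypassed in favor of a fixed version tag.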
                local_metadata = {"version": 1}
                if not prefix.startswith("head") or prefix.startswith(
                        "head.mlm"):
                    assert len(
                        missing_keys
                    ) == 0, "ERROR {} missing keys in state_dict {}".format(
                        prefix, missing_keys)
                else:
                    if len(missing_keys) > 0:
                        print(
                            "WARNING {} missing keys in state_dict {}".format(
                                prefix, missing_keys))
                module._load_from_state_dict(state_dict, prefix,
                                             local_metadata, True,
                                             missing_keys, unexpected_keys,
                                             error_msgs)
                for name, child in module._modules.items():
                    not_load_params_ls = kwargs.get("not_load_params_ls") or []
                    assert isinstance(
                        not_load_params_ls, list
                    ), f"Argument error not_load_params_ls should be a list but is {not_load_params_ls}"
                    matching_not_load = []
                    for pattern in not_load_params_ls:
                        matching = re.match(pattern, prefix + name)
                        if matching is not None:
                            matching_not_load.append(matching)
                    if len(matching_not_load) > 0:
                        # at least one pattern in not_load_params_ls matched --> this parameter should NOT be loaded
                        print("MATCH not loading : {} parameters {} ".format(
                            prefix + name, not_load_params_ls))
                    if child is not None and len(matching_not_load) == 0:
                        print("MODEL loading : child {} full {} ".format(
                            name, prefix + name + '.'))
                        load(child, prefix + name + '.')
                    else:
                        print(
                            "MODEL not loading : child {} matching_not_load {} "
                            .format(child, matching_not_load))

            # Make sure we are able to load base models as well as derived models (with heads)
            start_prefix = ''
            model_to_load = model
            if not hasattr(model, cls.base_model_prefix) and any(
                    s.startswith(cls.base_model_prefix)
                    for s in state_dict.keys()):
                start_prefix = cls.base_model_prefix + '.'
            if hasattr(model, cls.base_model_prefix) and not any(
                    s.startswith(cls.base_model_prefix)
                    for s in state_dict.keys()):
                model_to_load = getattr(model, cls.base_model_prefix)
            if not random_init:
                load(model_to_load, prefix=start_prefix)
            else:
                print("WARNING : RANDOM INTIALIZATION OF BERTMULTITASK")

            if len(missing_keys) > 0:
                logger.info(
                    "Weights of {} not initialized from pretrained model: {}".
                    format(model.__class__.__name__, missing_keys))
            if len(unexpected_keys) > 0:
                logger.info(
                    "Weights from pretrained model not used in {}: {}".format(
                        model.__class__.__name__, unexpected_keys))
            if len(error_msgs) > 0:
                raise RuntimeError(
                    'Error(s) in loading state_dict for {}:\n\t{}'.format(
                        model.__class__.__name__, "\n\t".join(error_msgs)))

        if hasattr(model, 'tie_weights'):
            model.tie_weights()  # make sure word embedding weights are still tied

        # Set model in evaluation mode to deactivate dropout modules by default
        model.eval()

        if output_loading_info:
            loading_info = {
                "missing_keys": missing_keys,
                "unexpected_keys": unexpected_keys,
                "error_msgs": error_msgs
            }
            return model, loading_info

        return model
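
# --- Hedged usage sketch (editor's addition, not part of the original snippet) ---
# Illustrates one plausible call to the overridden from_pretrained above, assuming
# `BertMultiTask` (defined earlier in this document) is the class exposing it and
# that a Hugging Face BertModel is used as encoder. The task name, label count and
# key mapping below are illustrative assumptions, not values from the original code.
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = BertMultiTask.from_pretrained(
    "bert-base-cased",
    encoder=BertModel,
    tasks=["parsing"],
    num_labels_per_task={"parsing": 40},
    mask_id=tokenizer.mask_token_id,
    # rename checkpoint keys so they match the multitask module layout (assumed mapping)
    mapping_keys_state_dic={"bert": "encoder", "cls.predictions": "head.mlm"},
    not_load_params_ls=[],   # regex patterns of parameters to leave randomly initialized
    random_init=False,       # True skips weight loading entirely
)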
예제 #11
0
def readers_load(datasets,
                 tasks,
                 word_dictionary,
                 word_dictionary_norm,
                 char_dictionary,
                 pos_dictionary,
                 xpos_dictionary,
                 type_dictionary,
                 bert_tokenizer,
                 word_decoder=False,
                 must_get_norm=True,
                 bucket=True,
                 input_level_ls=None,
                 run_mode="train",
                 add_start_char=1,
                 add_end_char=1,
                 symbolic_end=True,
                 symbolic_root=True,
                 verbose=1):

    readers = {}
    simultanuous_training = False  # deprecated
    assert "all" not in tasks, "ERROR : 'all' not supported yet (problematic for simultaneous training)"
    if "all" not in tasks and not simultanuous_training:
        try:
            assert len(tasks) == len(datasets), "ERROR : as simultanuous_training is {} : " \
                                                "we need 1 dataset per task but have only {} for task {} ".format(simultanuous_training, datasets, tasks)

        except Exception as e:
            datasets = [datasets[0] for _ in tasks]
            # SHOULD NOT DO THAT !!
            print("WARNING : duplicating readers", e)

    elif not simultanuous_training:
        assert len(
            tasks) == 1, "ERROR : if 'all' is provided it should be the only task"
        printing("TRAINING : MultiTask Iterator with task 'all' ",
                 verbose_level=1,
                 verbose=verbose)
    elif simultanuous_training:
        printing(
            "TRAINING : Training simultaneously on tasks provided in {} (datasets should have all required labels)",
            verbose_level=1,
            verbose=verbose)
        raise Exception("Not supported yet --> should handle the loop")

    for simul_task, data in zip(tasks, datasets):
        normalization_in_reader = does_one_task_require_normalization(
            simul_task)
        # 1 reader per simultaneously trained task
        readers[",".join(simul_task)] = conllu_data.read_data_to_variable(
            data,
            word_dictionary,
            char_dictionary,
            pos_dictionary,
            xpos_dictionary,
            type_dictionary,
            word_decoder=word_decoder,
            symbolic_end=symbolic_end,
            symbolic_root=symbolic_root,
            dry_run=0,
            normalization=normalization_in_reader,
            bucket=bucket,
            add_start_char=add_start_char,
            add_end_char=add_end_char,
            tasks=simul_task,
            max_char_len=None,
            must_get_norm=must_get_norm,
            bert_tokenizer=bert_tokenizer,
            input_level_ls=input_level_ls,
            run_mode=run_mode,
            word_norm_dictionary=word_dictionary_norm,
            pad_id=bert_tokenizer.convert_tokens_to_ids(
                bert_tokenizer.pad_token),
            verbose=verbose)

    return readers
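
# --- Hedged usage sketch (editor's addition) ---
# readers_load pairs one dataset with each group of simultaneously trained task(s)
# and returns a dict keyed by ",".join(task_group). The dictionaries and the
# tokenizer below stand in for objects built elsewhere in the project, and the
# file paths and task names are hypothetical.
readers = readers_load(
    datasets=["./data/normalize.train.conllu", "./data/pos.train.conllu"],
    tasks=[["normalize"], ["pos"]],
    word_dictionary=word_dictionary,
    word_dictionary_norm=word_dictionary_norm,
    char_dictionary=char_dictionary,
    pos_dictionary=pos_dictionary,
    xpos_dictionary=xpos_dictionary,
    type_dictionary=type_dictionary,
    bert_tokenizer=bert_tokenizer,
    word_decoder=True,
    input_level_ls=["wordpiece"],
    run_mode="train",
    verbose=1)
# readers["normalize"] and readers["pos"] now each hold one conllu reader.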
예제 #12
0
                               xpos_dictionary=xpos_dictionary,
                               type_dictionary=type_dictionary,
                               use_gpu=None,
                               norm_not_norm=True,
                               word_decoder=word_decoder,
                               bucket=False,
                               add_start_char=1,
                               add_end_char=1,
                               symbolic_end=True,
                               symbolic_root=True,
                               verbose=1)
        iterator_multi = data_gen_multi_task_sampling_batch(
            tasks=tasks,
            readers=readers,
            batch_size=1,
            word_dictionary=word_dictionary,
            char_dictionary=char_dictionary,
            pos_dictionary=pos_dictionary,
            word_dictionary_norm=word_dictionary_norm,
            print_raw=True,
            get_batch_mode=False,
            verbose=1)

        while True:
            try:
                batch = next(iterator_multi)
            except StopIteration as e:
                print(e)
                break
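
# --- Hedged note (editor's addition) ---
# The manual next()/StopIteration loop above can also be written as a plain for
# loop over the generator returned by data_gen_multi_task_sampling_batch; the
# behaviour is equivalent as long as the iterator signals exhaustion with
# StopIteration.
# for batch in iterator_multi:
#     ...  # consume the multi-task batch here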