예제 #1
0
    def __init__(self, model_archived_file: str, cuda_device: str = "cpu"):

        tar = tarfile.open(model_archived_file)
        tar.extractall()
        folder_name = tar.getnames()[0]
        tar.close()

        f = open(folder_name + "/config.conf", 'rb')
        self.conf = pickle.load(
            f)  # variables come out in the order you put them in
        # default batch size for conf is `10`
        f.close()
        device = torch.device(cuda_device)
        self.conf.device = device
        self.model = NNCRF(self.conf, print_info=False)
        self.model.load_state_dict(
            torch.load(folder_name + "/lstm_crf.m", map_location=device))
        self.model.eval()

        if self.conf.context_emb != ContextEmb.none:
            if cuda_device == "cpu":
                cuda_device = -1
            else:
                cuda_device = int(cuda_device.split(":")[1])
            self.elmo = load_elmo(cuda_device)
예제 #2
0
def evaluate_model(config: Config, model: NNCRF, batch_insts_ids, name: str,
                   insts: List[Instance]):
    ## evaluation
    metrics = np.asarray([0, 0, 0], dtype=int)
    batch_id = 0
    batch_size = config.batch_size
    for batch in batch_insts_ids:
        one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) *
                                batch_size]
        sorted_batch_insts = sorted(one_batch_insts,
                                    key=lambda inst: len(inst.input.words),
                                    reverse=True)
        batch_max_scores, batch_max_ids = model.decode(batch)
        metrics += evaluate_num(sorted_batch_insts, batch_max_ids, batch[-1],
                                batch[1], config.idx2labels)
        batch_id += 1
    p, total_predict, total_entity = metrics[0], metrics[1], metrics[2]
    precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore = 2.0 * precision * recall / (
        precision + recall) if precision != 0 or recall != 0 else 0
    print("[%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" %
          (name, precision, recall, fscore),
          flush=True)
    return [precision, recall, fscore]
예제 #3
0
def evaluate_model(config: Config,
                   model: NNCRF,
                   batch_insts_ids,
                   name: str,
                   insts: List[Instance],
                   print_each_type_metric: bool = False):
    ## evaluation
    p_dict, total_predict_dict, total_entity_dict = Counter(), Counter(
    ), Counter()
    batch_id = 0
    batch_size = config.batch_size
    with torch.no_grad():
        for batch in batch_insts_ids:
            one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) *
                                    batch_size]
            batch_max_scores, batch_max_ids = model.decode(**batch)
            batch_p, batch_predict, batch_total = evaluate_batch_insts(
                one_batch_insts, batch_max_ids, batch["labels"],
                batch["word_seq_lens"], config.idx2labels)
            p_dict += batch_p
            total_predict_dict += batch_predict
            total_entity_dict += batch_total
            batch_id += 1
    if print_each_type_metric:
        for key in total_entity_dict:
            precision_key, recall_key, fscore_key = get_metric(
                p_dict[key], total_entity_dict[key], total_predict_dict[key])
            print(
                f"[{key}] Prec.: {precision_key:.2f}, Rec.: {recall_key:.2f}, F1: {fscore_key:.2f}"
            )

    total_p = sum(list(p_dict.values()))
    total_predict = sum(list(total_predict_dict.values()))
    total_entity = sum(list(total_entity_dict.values()))
    precision, recall, fscore = get_metric(total_p, total_entity,
                                           total_predict)
    print(colored(
        f"[{name} set Total] Prec.: {precision:.2f}, Rec.: {recall:.2f}, F1: {fscore:.2f}",
        'blue'),
          flush=True)

    return [precision, recall, fscore]
예제 #4
0
def test_model(config: Config, test_insts):
    model_name = "model_files/lstm_{}_crf_{}_{}_dep_{}_elmo_{}_lr_{}.m".format(
        config.hidden_dim,
        config.dataset, config.train_num, config.context_emb.name,
        config.optimizer.lower(), config.learning_rate)
    res_name = "results/lstm_{}_crf_{}_{}_dep_{}_elmo_{}_lr_{}.results".format(
        config.hidden_dim,
        config.dataset, config.train_num, config.context_emb.name,
        config.optimizer.lower(), config.learning_rate)

    model = NNCRF(config)
    model.load_state_dict(torch.load(model_name))
    model.eval()
    test_batches = batching_list_instances(config, test_insts)
    evaluate_model(config, model, test_batches, "test", test_insts)
    write_results(res_name, test_insts)
예제 #5
0
class NERPredictor:
    """
    Usage:
    sentence = "This is a sentence"
    model_path = "model_files.tar.gz"
    model = Predictor(model_path)
    prediction = model.predict(sentence)
    """
    def __init__(self, model_archived_file: str, cuda_device: str = "cpu"):

        tar = tarfile.open(model_archived_file)
        tar.extractall()
        folder_name = tar.getnames()[0]
        tar.close()

        f = open(folder_name + "/config.conf", 'rb')
        self.conf = pickle.load(
            f)  # variables come out in the order you put them in
        # default batch size for conf is `10`
        f.close()
        device = torch.device(cuda_device)
        self.conf.device = device
        self.model = NNCRF(self.conf, print_info=False)
        self.model.load_state_dict(
            torch.load(folder_name + "/lstm_crf.m", map_location=device))
        self.model.eval()

        if self.conf.context_emb != ContextEmb.none:
            if cuda_device == "cpu":
                cuda_device = -1
            else:
                cuda_device = int(cuda_device.split(":")[1])
            self.elmo = load_elmo(cuda_device)

    def predict_insts(self, batch_insts_ids: Tuple) -> List[List[str]]:
        batch_max_scores, batch_max_ids = self.model.decode(batch_insts_ids)
        predictions = []
        for idx in range(len(batch_max_ids)):
            length = batch_insts_ids[1][idx]
            prediction = batch_max_ids[idx][:length].tolist()
            prediction = prediction[::-1]
            prediction = [self.conf.idx2labels[l] for l in prediction]
            predictions.append(prediction)
        return predictions

    def sent_to_insts(self, sentence: str) -> List[Instance]:
        words = sentence.split()
        return [Instance(Sentence(words))]

    def sents_to_insts(self, sentences: List[str]) -> List[Instance]:
        insts = []
        for sentence in sentences:
            words = sentence.split()
            insts.append(Instance(Sentence(words)))
        return insts

    def create_batch_data(self, insts: List[Instance]):
        return simple_batching(self.conf, insts)

    def predict(self, sentences: Union[str, List[str]]):

        sents = [sentences] if isinstance(sentences, str) else sentences
        insts = self.sents_to_insts(sents)
        self.conf.map_insts_ids(insts)
        if self.conf.context_emb != ContextEmb.none:
            read_parse_write(self.elmo, insts)
        test_batches = self.create_batch_data(insts)
        predictions = self.predict_insts(test_batches)
        if len(predictions) == 1:
            return predictions[0]
        else:
            return predictions
예제 #6
0
def train_model(config: Config, epoch: int, train_insts: List[Instance],
                dev_insts: List[Instance], test_insts: List[Instance]):
    ### Data Processing Info
    train_num = len(train_insts)
    print("number of instances: %d" % (train_num))
    print(colored("[Shuffled] Shuffle the training instance ids", "red"))
    random.shuffle(train_insts)

    batched_data = batching_list_instances(config, train_insts)
    dev_batches = batching_list_instances(config, dev_insts)
    test_batches = batching_list_instances(config, test_insts)

    if config.embedder_type == "normal":
        model = NNCRF(config)
        optimizer = get_optimizer(config, model)
        scheduler = None
    else:
        print(
            colored(
                f"[Model Info]: Working with transformers package from huggingface with {config.embedder_type}",
                'red'))
        print(
            colored(
                f"[Optimizer Info]: You should be aware that you are using the optimizer from huggingface.",
                'red'))
        print(
            colored(
                f"[Optimizer Info]: Change the optimier in transformers_util.py if you want to make some modifications.",
                'red'))
        model = TransformersCRF(config)
        optimizer, scheduler = get_huggingface_optimizer_and_scheduler(
            config,
            model,
            num_training_steps=len(batched_data) * epoch,
            weight_decay=0.0,
            eps=1e-8,
            warmup_step=0)
        print(
            colored(f"[Optimizer Info] Modify the optimizer info as you need.",
                    'red'))
        print(optimizer)

    model.to(config.device)

    best_dev = [-1, 0]
    best_test = [-1, 0]

    model_folder = config.model_folder
    res_folder = "results"
    if os.path.exists("model_files/" + model_folder):
        raise FileExistsError(
            f"The folder model_files/{model_folder} exists. Please either delete it or create a new one "
            f"to avoid override.")
    model_path = f"model_files/{model_folder}/lstm_crf.m"
    config_path = f"model_files/{model_folder}/config.conf"
    res_path = f"{res_folder}/{model_folder}.results"
    print("[Info] The model will be saved to: %s.tar.gz" % (model_folder))
    os.makedirs(f"model_files/{model_folder}",
                exist_ok=True)  ## create model files. not raise error if exist
    os.makedirs(res_folder, exist_ok=True)
    no_incre_dev = 0
    print(
        colored(
            f"[Train Info] Start training, you have set to stop if performace not increase for {config.max_no_incre} epochs",
            'red'))
    for i in tqdm(range(1, epoch + 1), desc="Epoch"):
        epoch_loss = 0
        start_time = time.time()
        model.zero_grad()
        if config.optimizer.lower() == "sgd":
            optimizer = lr_decay(config, optimizer, i)
        for index in tqdm(np.random.permutation(len(batched_data)),
                          desc="--training batch",
                          total=len(batched_data)):
            model.train()
            loss = model(**batched_data[index])
            epoch_loss += loss.item()
            loss.backward()
            if config.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               config.max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            model.zero_grad()
            if scheduler is not None:
                scheduler.step()
        end_time = time.time()
        print("Epoch %d: %.5f, Time is %.2fs" %
              (i, epoch_loss, end_time - start_time),
              flush=True)

        model.eval()
        dev_metrics = evaluate_model(config, model, dev_batches, "dev",
                                     dev_insts)
        test_metrics = evaluate_model(config, model, test_batches, "test",
                                      test_insts)
        if dev_metrics[2] > best_dev[0]:
            print("saving the best model...")
            no_incre_dev = 0
            best_dev[0] = dev_metrics[2]
            best_dev[1] = i
            best_test[0] = test_metrics[2]
            best_test[1] = i
            torch.save(model.state_dict(), model_path)
            # Save the corresponding config as well.
            f = open(config_path, 'wb')
            pickle.dump(config, f)
            f.close()
            write_results(res_path, test_insts)
        else:
            no_incre_dev += 1
        model.zero_grad()
        if no_incre_dev >= config.max_no_incre:
            print(
                "early stop because there are %d epochs not increasing f1 on dev"
                % no_incre_dev)
            break

    print("Archiving the best Model...")
    with tarfile.open(f"model_files/{model_folder}/{model_folder}.tar.gz",
                      "w:gz") as tar:
        tar.add(f"model_files/{model_folder}",
                arcname=os.path.basename(model_folder))

    print("Finished archiving the models")

    print("The best dev: %.2f" % (best_dev[0]))
    print("The corresponding test: %.2f" % (best_test[0]))
    print("Final testing.")
    model.load_state_dict(torch.load(model_path))
    model.eval()
    evaluate_model(config, model, test_batches, "test", test_insts)
    write_results(res_path, test_insts)
예제 #7
0
def learn_from_insts(config: Config, epoch: int, train_insts, dev_insts,
                     test_insts):
    # train_insts: List[Instance], dev_insts: List[Instance], test_insts: List[Instance], batch_size: int = 1
    model = NNCRF(config)
    optimizer = get_optimizer(config, model)
    train_num = len(train_insts)
    print("number of instances: %d" % (train_num))
    print(colored("[Shuffled] Shuffle the training instance ids", "red"))
    random.shuffle(train_insts)

    batched_data = batching_list_instances(config, train_insts)
    dev_batches = batching_list_instances(config, dev_insts)
    test_batches = batching_list_instances(config, test_insts)

    best_dev = [-1, 0]
    best_test = [-1, 0]

    model_folder = "model_files"
    res_folder = "results"

    model_name = model_folder + "/lstm_{}_crf_{}_{}_dep_{}_elmo_{}_lr_{}.m".format(
        config.hidden_dim,
        config.dataset, config.train_num, config.context_emb.name,
        config.optimizer.lower(), config.learning_rate)
    res_name = res_folder + "/lstm_{}_crf_{}_{}_dep_{}_elmo_{}_lr_{}.results".format(
        config.hidden_dim,
        config.dataset, config.train_num, config.context_emb.name,
        config.optimizer.lower(), config.learning_rate)
    print("[Info] The model will be saved to: %s" % (model_name))
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    if not os.path.exists(res_folder):
        os.makedirs(res_folder)

    for i in range(1, epoch + 1):
        epoch_loss = 0
        start_time = time.time()
        model.zero_grad()
        if config.optimizer.lower() == "sgd":
            optimizer = lr_decay(config, optimizer, i)
        for index in np.random.permutation(len(batched_data)):
            # for index in range(len(batched_data)):
            model.train()
            batch_word, batch_wordlen, batch_context_emb, batch_char, batch_charlen, batch_label = batched_data[
                index]
            loss = model.neg_log_obj(batch_word, batch_wordlen,
                                     batch_context_emb, batch_char,
                                     batch_charlen, batch_label)
            epoch_loss += loss.item()
            loss.backward()
            # # torch.nn.utils.clip_grad_norm_(model.parameters(), config.clip) ##clipping the gradient
            optimizer.step()
            model.zero_grad()

        end_time = time.time()
        print("Epoch %d: %.5f, Time is %.2fs" %
              (i, epoch_loss, end_time - start_time),
              flush=True)

        model.eval()
        dev_metrics = evaluate_model(config, model, dev_batches, "dev",
                                     dev_insts)
        test_metrics = evaluate_model(config, model, test_batches, "test",
                                      test_insts)
        if dev_metrics[2] > best_dev[0]:
            print("saving the best model...")
            best_dev[0] = dev_metrics[2]
            best_dev[1] = i
            best_test[0] = test_metrics[2]
            best_test[1] = i
            torch.save(model.state_dict(), model_name)
            write_results(res_name, test_insts)
        model.zero_grad()

    print("The best dev: %.2f" % (best_dev[0]))
    print("The corresponding test: %.2f" % (best_test[0]))
    print("Final testing.")
    model.load_state_dict(torch.load(model_name))
    model.eval()
    evaluate_model(config, model, test_batches, "test", test_insts)
    write_results(res_name, test_insts)