Example #1
File: run.py Project: yhcc/fastNLP
def predict():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "label2id.pkl")
    test_args["num_classes"] = len(index2label)

    # load dev data
    dev_data = load_pickle(pickle_path, "data_dev.pkl")

    # Rebuild the same model architecture used in training
    model = AdvSeqLabel(test_args)

    # Load the trained parameters into the model
    ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
    print("model loaded!")

    # Tester
    test_args["evaluator"] = SeqLabelEvaluator()
    tester = SeqLabelTester(**test_args.data)

    # Start testing
    tester.test(model, dev_data)
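predict() leans on module-level globals (cfgfile, pickle_path) defined elsewhere in run.py. A minimal driver sketch, assuming the fastNLP 0.x names used above are already imported; both paths are placeholders, not from the source:

# Hypothetical module-level setup for predict(); paths are placeholders.
cfgfile = "./data_for_tests/config"  # must contain a [POS_test] section
pickle_path = "./save/"              # holds word2id.pkl, label2id.pkl, data_dev.pkl

if __name__ == "__main__":
    predict()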
Example #2
File: run.py Project: yhcc/fastNLP
def infer():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "label2id.pkl")
    test_args["num_classes"] = len(index2label)

    # Rebuild the same model architecture used in training
    model = AdvSeqLabel(test_args)

    try:
        ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
        print("model loaded!")
    except Exception:
        print("cannot load model!")
        raise

    # Data Loader
    infer_data = SeqLabelDataSet(load_func=BaseLoader.load_lines)
    infer_data.load(data_infer_path,
                    vocabs={"word_vocab": word2index},
                    infer=True)
    print('data loaded')

    # Inference interface; a distinct name avoids shadowing the infer() function
    inferrer = SeqLabelInfer(pickle_path)
    results = inferrer.predict(model, infer_data)

    print(results)
    print("Inference finished!")
Example #3
def mock_cws():
    os.makedirs("mock", exist_ok=True)
    text = ["这是最好的基于深度学习的中文分词系统。", "大王叫我来巡山。", "我党多年来致力于改善人民生活水平。"]

    word2id = Vocabulary()
    word_list = list("".join(text))  # character-level vocabulary
    word2id.update(word_list)
    save_pickle(word2id, "./mock/", "word2id.pkl")

    class2id = Vocabulary(need_default=False)
    label_list = ['B', 'M', 'E', 'S']
    class2id.update(label_list)
    save_pickle(class2id, "./mock/", "label2id.pkl")

    model_args = {
        "vocab_size": len(word2id),
        "word_emb_dim": 50,
        "rnn_hidden_units": 50,
        "num_classes": len(class2id)
    }
    config_file = """
    [test_section]
    vocab_size = {}
    word_emb_dim = 50
    rnn_hidden_units = 50
    num_classes = {}
    """.format(len(word2id), len(class2id))
    with open("mock/test.cfg", "w", encoding="utf-8") as f:
        f.write(config_file)

    model = AdvSeqLabel(model_args)
    ModelSaver("mock/cws_basic_model_v_0.pkl").save_pytorch(model)
Example #4
def mock_pos_tag():
    os.makedirs("mock", exist_ok=True)
    text = ["这是最好的基于深度学习的中文分词系统。", "大王叫我来巡山。", "我党多年来致力于改善人民生活水平。"]

    vocab = Vocabulary()
    word_list = list("".join(text))  # character-level vocabulary
    vocab.update(word_list)
    save_pickle(vocab, "./mock/", "word2id.pkl")

    idx2label = Vocabulary(need_default=False)
    label_list = ['B-n', 'M-v', 'E-nv', 'S-adj', 'B-v', 'M-vn', 'S-adv']
    idx2label.update(label_list)
    save_pickle(idx2label, "./mock/", "label2id.pkl")

    model_args = {
        "vocab_size": len(vocab),
        "word_emb_dim": 50,
        "rnn_hidden_units": 50,
        "num_classes": len(idx2label)
    }
    config_file = """
        [test_section]
        vocab_size = {}
        word_emb_dim = 50
        rnn_hidden_units = 50
        num_classes = {}
        """.format(len(vocab), len(idx2label))
    with open("mock/test.cfg", "w", encoding="utf-8") as f:
        f.write(config_file)

    model = AdvSeqLabel(model_args)
    ModelSaver("mock/pos_tag_model_v_0.pkl").save_pytorch(model)
Example #5
File: run.py Project: yhcc/fastNLP
def train():
    # Config Loader
    train_args = ConfigSection()
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {
        "train": train_args,
        "test": test_args
    })

    print("loading data set...")
    data = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
    data.load(cws_data_path)
    data_train, data_dev = data.split(ratio=0.3)
    train_args["vocab_size"] = len(data.word_vocab)
    train_args["num_classes"] = len(data.label_vocab)
    print("vocab size={}, num_classes={}".format(len(data.word_vocab),
                                                 len(data.label_vocab)))

    change_field_is_target(data_dev, "truth", True)
    save_pickle(data_dev, "./save/", "data_dev.pkl")
    save_pickle(data.word_vocab, "./save/", "word2id.pkl")
    save_pickle(data.label_vocab, "./save/", "label2id.pkl")

    # Trainer
    trainer = SeqLabelTrainer(epochs=train_args["epochs"],
                              batch_size=train_args["batch_size"],
                              validate=train_args["validate"],
                              use_cuda=train_args["use_cuda"],
                              pickle_path=train_args["pickle_path"],
                              save_best_dev=True,
                              print_every_step=10,
                              model_name="trained_model.pkl",
                              evaluator=SeqLabelEvaluator())

    # Model
    model = AdvSeqLabel(train_args)
    try:
        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
        print("model parameter loaded!")
    except Exception:
        print("No saved model. Continue.")

    # Start training
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
    saver = ModelSaver("./save/trained_model.pkl")
    saver.save_pytorch(model)
    print("Model saved!")
Example #6
def train():
    # Config Loader
    train_args = ConfigSection()
    test_args = ConfigSection()
    ConfigLoader("good_path").load_config(cfgfile, {
        "train": train_args,
        "test": test_args
    })

    # Data Loader
    loader = TokenizeDatasetLoader(cws_data_path)
    train_data = loader.load_pku()

    # Preprocessor
    preprocessor = SeqLabelPreprocess()
    data_train, data_dev = preprocessor.run(train_data,
                                            pickle_path=pickle_path,
                                            train_dev_split=0.3)
    train_args["vocab_size"] = preprocessor.vocab_size
    train_args["num_classes"] = preprocessor.num_classes

    # Trainer
    trainer = SeqLabelTrainer(**train_args.data)

    # Model
    model = AdvSeqLabel(train_args)
    try:
        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
        print("model parameter loaded!")
    except Exception:
        print("No saved model. Continue.")

    # Start training
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
    saver = ModelSaver("./save/saved_model.pkl")
    saver.save_pytorch(model)
    print("Model saved!")
Example #7
def train(checkpoint=None):
    # load config
    train_param = ConfigSection()
    model_param = ConfigSection()
    ConfigLoader().load_config(cfgfile, {
        "train": train_param,
        "model": model_param
    })
    print("config loaded")

    # Data Loader
    dataset = ZhConllPOSReader().load("/home/hyan/train.conllx")
    print(dataset)
    print("dataset transformed")

    dataset.rename_field("tag", "truth")

    # "filed" (sic) is the parameter's actual spelling in this fastNLP
    # version; SeqLenProcessor below uses the corrected "field" form
    vocab_proc = VocabIndexerProcessor("words",
                                       new_added_filed_name="word_seq")
    tag_proc = VocabIndexerProcessor("truth")
    seq_len_proc = SeqLenProcessor(field_name="word_seq",
                                   new_added_field_name="word_seq_origin_len",
                                   is_input=True)

    vocab_proc(dataset)
    tag_proc(dataset)
    seq_len_proc(dataset)

    dataset.set_input("word_seq", "word_seq_origin_len", "truth")
    dataset.set_target("truth", "word_seq_origin_len")

    print("processors defined")

    # dataset.set_is_target(tag_ids=True)
    model_param["vocab_size"] = vocab_proc.get_vocab_size()
    model_param["num_classes"] = tag_proc.get_vocab_size()
    print("vocab_size={}  num_classes={}".format(model_param["vocab_size"],
                                                 model_param["num_classes"]))

    # define a model
    if checkpoint is None:
        # pre_trained = load_tencent_embed("/home/zyfeng/data/char_tencent_embedding.pkl", vocab_proc.vocab.word2idx)
        pre_trained = None
        model = AdvSeqLabel(model_param,
                            id2words=tag_proc.vocab.idx2word,
                            emb=pre_trained)
        print(model)
    else:
        model = torch.load(checkpoint)

    # call trainer to train
    trainer = Trainer(dataset,
                      model,
                      loss=None,
                      metrics=SpanFPreRecMetric(
                          tag_proc.vocab,
                          pred="predict",
                          target="truth",
                          seq_lens="word_seq_origin_len"),
                      dev_data=dataset,
                      metric_key="f",
                      use_tqdm=True,
                      use_cuda=True,
                      print_every=5,
                      n_epochs=6,
                      save_path="./save")
    trainer.train(load_best_model=True)

    # save model & pipeline
    model_proc = ModelProcessor(model,
                                seq_len_field_name="word_seq_origin_len")
    id2tag = Index2WordProcessor(tag_proc.vocab, "predict", "tag")

    pp = Pipeline([vocab_proc, seq_len_proc, model_proc, id2tag])
    save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_proc.vocab}
    torch.save(save_dict, "model_pp.pkl")
    print("pipeline saved")

    torch.save(model, "./save/best_model.pkl")
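Because model_pp.pkl bundles the preprocessing steps together with the model, inference needs only torch.load plus a DataSet carrying a "words" field. A minimal reload sketch; the DataSet import path and constructor form are assumptions about this fastNLP version:

# Sketch: reload the saved pipeline and tag new text (illustrative).
import torch
from fastNLP import DataSet  # import path assumed for this version

saved = torch.load("model_pp.pkl")
pp = saved["pipeline"]

test_ds = DataSet({"words": [list("大王叫我来巡山。")]})  # character tokens
test_ds = pp(test_ds)  # applies indexing, seq-len, model forward, id->tag in order
print(test_ds)         # the new "tag" field holds the predicted labels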
Example #8
def train(train_data_path, dev_data_path, checkpoint=None, save=None):
    # load config
    train_param = ConfigSection()
    model_param = ConfigSection()
    ConfigLoader().load_config(cfgfile, {
        "train": train_param,
        "model": model_param
    })
    print("config loaded")

    # Data Loader
    print("loading training set...")
    dataset = ConllxDataLoader().load(train_data_path, return_dataset=True)
    print("loading dev set...")
    dev_data = ConllxDataLoader().load(dev_data_path, return_dataset=True)
    print(dataset)
    print("================= dataset ready =====================")

    dataset.rename_field("tag", "truth")
    dev_data.rename_field("tag", "truth")

    vocab_proc = VocabIndexerProcessor("words",
                                       new_added_filed_name="word_seq")
    tag_proc = VocabIndexerProcessor("truth", is_input=True)
    seq_len_proc = SeqLenProcessor(field_name="word_seq",
                                   new_added_field_name="word_seq_origin_len",
                                   is_input=True)
    set_input_proc = SetInputProcessor("word_seq", "word_seq_origin_len")

    vocab_proc(dataset)
    tag_proc(dataset)
    seq_len_proc(dataset)

    # index dev set
    word_vocab, tag_vocab = vocab_proc.vocab, tag_proc.vocab
    dev_data.apply(lambda ins: [word_vocab.to_index(w) for w in ins["words"]],
                   new_field_name="word_seq")
    dev_data.apply(lambda ins: [tag_vocab.to_index(w) for w in ins["truth"]],
                   new_field_name="truth")
    dev_data.apply(lambda ins: len(ins["word_seq"]),
                   new_field_name="word_seq_origin_len")

    # set input & target
    dataset.set_input("word_seq", "word_seq_origin_len", "truth")
    dev_data.set_input("word_seq", "word_seq_origin_len", "truth")
    dataset.set_target("truth", "word_seq_origin_len")
    dev_data.set_target("truth", "word_seq_origin_len")

    # dataset.set_is_target(tag_ids=True)
    model_param["vocab_size"] = vocab_proc.get_vocab_size()
    model_param["num_classes"] = tag_proc.get_vocab_size()
    print("vocab_size={}  num_classes={}".format(model_param["vocab_size"],
                                                 model_param["num_classes"]))

    # define a model
    if checkpoint is None:
        # pre_trained = load_tencent_embed("/home/zyfeng/data/char_tencent_embedding.pkl", vocab_proc.vocab.word2idx)
        pre_trained = None
        model = AdvSeqLabel(model_param, id2words=None, emb=pre_trained)
        print(model)
    else:
        model = torch.load(checkpoint)

    # call trainer to train
    trainer = Trainer(dataset,
                      model,
                      loss=None,
                      metrics=SpanFPreRecMetric(
                          tag_proc.vocab,
                          pred="predict",
                          target="truth",
                          seq_lens="word_seq_origin_len"),
                      dev_data=dev_data,
                      metric_key="f",
                      use_tqdm=True,
                      use_cuda=True,
                      print_every=10,
                      n_epochs=20,
                      save_path=save)
    trainer.train(load_best_model=True)

    # save model & pipeline
    model_proc = ModelProcessor(model,
                                seq_len_field_name="word_seq_origin_len")
    id2tag = Index2WordProcessor(tag_proc.vocab, "predict", "tag")

    pp = Pipeline(
        [vocab_proc, seq_len_proc, set_input_proc, model_proc, id2tag])
    save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_proc.vocab}
    torch.save(save_dict, os.path.join(save, "model_pp.pkl"))
    print("pipeline saved")