def infer():
    # load dataset
    print("Loading data...")
    word_vocab = load_pickle(save_dir, "word2id.pkl")
    label_vocab = load_pickle(save_dir, "label2id.pkl")
    print("vocabulary size:", len(word_vocab))
    print("number of classes:", len(label_vocab))

    infer_data = TextClassifyDataSet(load_func=ClassDataSetLoader.load)
    infer_data.load(train_data_dir, vocabs={"word_vocab": word_vocab, "label_vocab": label_vocab})

    model_args = ConfigSection()
    model_args["vocab_size"] = len(word_vocab)
    model_args["num_classes"] = len(label_vocab)
    ConfigLoader.load_config(config_dir, {"text_class_model": model_args})

    # construct model
    print("Building model...")
    cnn = CNNText(model_args)

    # load trained parameters into the model
    ModelLoader.load_pytorch(cnn, os.path.join(save_dir, model_name))
    print("model loaded!")

    infer = ClassificationInfer(pickle_path=save_dir)
    results = infer.predict(cnn, infer_data)
    print(results)
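# The `load_pickle` helper used above is not shown in this snippet. A minimal
# sketch, assuming it simply unpickles `<pickle_path>/<file_name>` and returns
# the object (the library's actual implementation may differ):
import os
import pickle


def load_pickle(pickle_path, file_name):
    """Load and return the object pickled in `pickle_path/file_name`."""
    with open(os.path.join(pickle_path, file_name), "rb") as f:
        return pickle.load(f)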
def train():
    train_args, model_args = ConfigSection(), ConfigSection()
    ConfigLoader.load_config(config_dir, {"text_class": train_args})

    # load dataset
    print("Loading data...")
    ds_loader = ClassDatasetLoader(train_data_dir)
    data = ds_loader.load()
    print(data[0])

    # pre-process data
    pre = ClassPreprocess()
    data_train = pre.run(data, pickle_path=save_dir)
    print("vocabulary size:", pre.vocab_size)
    print("number of classes:", pre.num_classes)

    model_args["num_classes"] = pre.num_classes
    model_args["vocab_size"] = pre.vocab_size

    # construct model
    print("Building model...")
    model = CNNText(model_args)
    # ConfigSaver().save_config(config_dir, {"text_class_model": model_args})

    # train
    print("Training...")
    # option 1: build the trainer directly from the config section
    # trainer = ClassificationTrainer(train_args)
    # option 2: pass the hyper-parameters explicitly
    trainer = ClassificationTrainer(epochs=train_args["epochs"],
                                    batch_size=train_args["batch_size"],
                                    validate=train_args["validate"],
                                    use_cuda=train_args["use_cuda"],
                                    pickle_path=save_dir,
                                    save_best_dev=train_args["save_best_dev"],
                                    model_name=model_name,
                                    loss=Loss("cross_entropy"),
                                    optimizer=Optimizer("SGD", lr=0.001, momentum=0.9))
    trainer.train(model, data_train)

    print("Training finished!")

    saver = ModelSaver(os.path.join(save_dir, model_name))
    saver.save_pytorch(model)
    print("Model saved!")
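# For reference, a hypothetical config section that would satisfy the keys this
# script reads from `train_args`. The section name ("text_class") and key names
# come from the code above; the values are illustrative only:
#
#   [text_class]
#   epochs = 10
#   batch_size = 32
#   validate = true
#   use_cuda = false
#   save_best_dev = true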
def load(self, model_name, config_file="config", section_name="model"):
    """
    Load a pre-trained FastNLP model together with additional data.

    :param model_name: str, the name of a FastNLP model.
    :param config_file: str, the name of the config file which stores the
        initialization information of the model. (default: "config")
    :param section_name: str, the name of the corresponding section in the
        config file. (default: "model")
    """
    assert type(model_name) is str
    if model_name not in FastNLP_MODEL_COLLECTION:
        raise ValueError("No FastNLP model named {}.".format(model_name))

    if not self.model_exist(model_dir=self.model_dir):
        self._download(model_name, FastNLP_MODEL_COLLECTION[model_name]["url"])

    model_class = self._get_model_class(FastNLP_MODEL_COLLECTION[model_name]["class"])
    print("Restore model class {}".format(str(model_class)))

    model_args = ConfigSection()
    ConfigLoader.load_config(os.path.join(self.model_dir, config_file), {section_name: model_args})
    print("Restore model hyper-parameters {}".format(str(model_args.data)))

    # fetch dictionary size and number of labels from pickle files
    self.word_vocab = load_pickle(self.model_dir, "word2id.pkl")
    model_args["vocab_size"] = len(self.word_vocab)
    self.label_vocab = load_pickle(self.model_dir, "label2id.pkl")
    model_args["num_classes"] = len(self.label_vocab)

    # construct the model
    model = model_class(model_args)
    print("Model constructed.")

    # TODO: make this step framework-independent
    ModelLoader.load_pytorch(model, os.path.join(self.model_dir, FastNLP_MODEL_COLLECTION[model_name]["pickle"]))
    print("Model weights loaded.")

    self.model = model
    self.infer_type = FastNLP_MODEL_COLLECTION[model_name]["type"]
    print("Inference ready.")
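# `_get_model_class` is not shown here. One plausible sketch: resolve the
# class-name string stored in FastNLP_MODEL_COLLECTION through an explicit
# registry. The registry key and this lookup strategy are assumptions, not
# necessarily how the library implements it.
def _get_model_class(self, model_class_name):
    registry = {"cnn_text_class": CNNText}  # hypothetical key -> class mapping
    try:
        return registry[model_class_name]
    except KeyError:
        raise ValueError("Unknown model class {}.".format(model_class_name))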
def train():
    train_args, model_args = ConfigSection(), ConfigSection()
    ConfigLoader.load_config(config_dir, {"text_class": train_args})

    # load dataset
    print("Loading data...")
    data = TextClassifyDataSet(load_func=ClassDataSetLoader.load)
    data.load(train_data_dir)

    print("vocabulary size:", len(data.word_vocab))
    print("number of classes:", len(data.label_vocab))
    save_pickle(data.word_vocab, save_dir, "word2id.pkl")
    save_pickle(data.label_vocab, save_dir, "label2id.pkl")

    model_args["num_classes"] = len(data.label_vocab)
    model_args["vocab_size"] = len(data.word_vocab)

    # construct model
    print("Building model...")
    model = CNNText(model_args)

    # train
    print("Training...")
    trainer = ClassificationTrainer(epochs=train_args["epochs"],
                                    batch_size=train_args["batch_size"],
                                    validate=train_args["validate"],
                                    use_cuda=train_args["use_cuda"],
                                    pickle_path=save_dir,
                                    save_best_dev=train_args["save_best_dev"],
                                    model_name=model_name,
                                    loss=Loss("cross_entropy"),
                                    optimizer=Optimizer("SGD", lr=0.001, momentum=0.9))
    trainer.train(model, data)

    print("Training finished!")

    saver = ModelSaver(os.path.join(save_dir, model_name))
    saver.save_pytorch(model)
    print("Model saved!")
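# A sketch of the `save_pickle` helper called above, assuming it is the mirror
# of `load_pickle`: pickle `obj` into `<pickle_path>/<file_name>`, creating the
# directory if needed (again, the actual implementation may differ):
import os
import pickle


def save_pickle(obj, pickle_path, file_name):
    """Pickle `obj` into `pickle_path/file_name`."""
    if not os.path.exists(pickle_path):
        os.makedirs(pickle_path)
    with open(os.path.join(pickle_path, file_name), "wb") as f:
        pickle.dump(obj, f)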
datadir = "/home/yfshao/UD_English-EWT"
cfgfile = './cfg.cfg'
train_data_name = "en_ewt-ud-train.conllu"
dev_data_name = "en_ewt-ud-dev.conllu"
emb_file_name = '/home/yfshao/glove.6B.100d.txt'
processed_datadir = './save'

# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
model_args = ConfigSection()
optim_args = ConfigSection()
ConfigLoader.load_config(cfgfile, {
    "train": train_args,
    "test": test_args,
    "model": model_args,
    "optim": optim_args
})


# Data Loader
def save_data(dirpath, **kwargs):
    import _pickle
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)
    for name, data in kwargs.items():
        with open(os.path.join(dirpath, name + '.pkl'), 'wb') as f:
            _pickle.dump(data, f)
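# A hypothetical counterpart to `save_data`, shown only to illustrate the
# round trip; no `load_data` appears in the original script.
def load_data(dirpath, *names):
    import _pickle
    loaded = {}
    for name in names:
        with open(os.path.join(dirpath, name + '.pkl'), 'rb') as f:
            loaded[name] = _pickle.load(f)
    return loaded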