def infer():
    # load dataset
    print("Loading data...")
    word_vocab = load_pickle(save_dir, "word2id.pkl")
    label_vocab = load_pickle(save_dir, "label2id.pkl")
    print("vocabulary size:", len(word_vocab))
    print("number of classes:", len(label_vocab))

    infer_data = TextClassifyDataSet(load_func=ClassDataSetLoader.load)
    infer_data.load(train_data_dir, vocabs={"word_vocab": word_vocab, "label_vocab": label_vocab})

    model_args = ConfigSection()
    model_args["vocab_size"] = len(word_vocab)
    model_args["num_classes"] = len(label_vocab)
    ConfigLoader.load_config(config_dir, {"text_class_model": model_args})

    # construct model
    print("Building model...")
    cnn = CNNText(model_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(cnn, os.path.join(save_dir, model_name))
    print("model loaded!")

    infer = ClassificationInfer(pickle_path=save_dir)
    results = infer.predict(cnn, infer_data)
    print(results)

def train_test():
    # Config Loader
    train_args = ConfigSection()
    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args})

    # Data Loader
    loader = TokenizeDatasetLoader(cws_data_path)
    train_data = loader.load_pku()

    # Preprocessor
    p = SeqLabelPreprocess()
    data_train = p.run(train_data, pickle_path=pickle_path)
    train_args["vocab_size"] = p.vocab_size
    train_args["num_classes"] = p.num_classes

    # Trainer
    trainer = SeqLabelTrainer(**train_args.data)

    # Model
    model = SeqLabeling(train_args)

    # Start training
    trainer.train(model, data_train)
    print("Training finished!")

    # Saver
    saver = ModelSaver("./data_for_tests/saved_model.pkl")
    saver.save_pytorch(model)
    print("Model saved!")

    del model, trainer, loader

    # Define the same model
    model = SeqLabeling(train_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
    print("model loaded!")

    # Load test configuration
    test_args = ConfigSection()
    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})

    # Tester
    tester = SeqLabelTester(**test_args.data)

    # Start testing
    tester.test(model, data_train)

    # print test results
    print(tester.show_metrics())
    print("model tested!")

def train_test():
    # Config Loader
    train_args = ConfigSection()
    ConfigLoader().load_config(config_path, {"POS_infer": train_args})

    # define dataset
    data_train = TokenizeDataSetLoader().load(cws_data_path)
    word_vocab = Vocabulary()
    label_vocab = Vocabulary()
    data_train.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
    data_train.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab)
    data_train.set_origin_len("word_seq")
    data_train.rename_field("label_seq", "truth").set_target(truth=False)
    train_args["vocab_size"] = len(word_vocab)
    train_args["num_classes"] = len(label_vocab)

    save_pickle(word_vocab, pickle_path, "word2id.pkl")
    save_pickle(label_vocab, pickle_path, "label2id.pkl")

    # Trainer
    trainer = SeqLabelTrainer(**train_args.data)

    # Model
    model = SeqLabeling(train_args)

    # Start training
    trainer.train(model, data_train)

    # Saver
    saver = ModelSaver("./save/saved_model.pkl")
    saver.save_pytorch(model)

    del model, trainer

    # Define the same model
    model = SeqLabeling(train_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./save/saved_model.pkl")

    # Load test configuration
    test_args = ConfigSection()
    ConfigLoader().load_config(config_path, {"POS_infer": test_args})
    test_args["evaluator"] = SeqLabelEvaluator()

    # Tester
    tester = SeqLabelTester(**test_args.data)

    # Start testing
    data_train.set_target(truth=True)
    tester.test(model, data_train)

def train():
    train_args, model_args = ConfigSection(), ConfigSection()
    ConfigLoader.load_config(config_dir, {"text_class": train_args})

    # load dataset
    print("Loading data...")
    ds_loader = ClassDatasetLoader(train_data_dir)
    data = ds_loader.load()
    print(data[0])

    # pre-process data
    pre = ClassPreprocess()
    data_train = pre.run(data, pickle_path=save_dir)
    print("vocabulary size:", pre.vocab_size)
    print("number of classes:", pre.num_classes)

    model_args["num_classes"] = pre.num_classes
    model_args["vocab_size"] = pre.vocab_size

    # construct model
    print("Building model...")
    model = CNNText(model_args)

    # ConfigSaver().save_config(config_dir, {"text_class_model": model_args})

    # train
    print("Training...")
    # 1
    # trainer = ClassificationTrainer(train_args)

    # 2
    trainer = ClassificationTrainer(epochs=train_args["epochs"],
                                    batch_size=train_args["batch_size"],
                                    validate=train_args["validate"],
                                    use_cuda=train_args["use_cuda"],
                                    pickle_path=save_dir,
                                    save_best_dev=train_args["save_best_dev"],
                                    model_name=model_name,
                                    loss=Loss("cross_entropy"),
                                    optimizer=Optimizer("SGD", lr=0.001, momentum=0.9))
    trainer.train(model, data_train)
    print("Training finished!")

    saver = ModelSaver(os.path.join(save_dir, model_name))
    saver.save_pytorch(model)
    print("Model saved!")

def infer():
    # Load infer configuration, the same as test
    test_args = ConfigSection()
    ConfigLoader().load_config(config_path, {"POS_infer": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "label2id.pkl")
    test_args["num_classes"] = len(index2label)

    # Define the same model
    model = SeqLabeling(test_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
    print("model loaded!")

    # Load infer data
    infer_data = SeqLabelDataSet(load_func=BaseLoader.load)
    infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True)

    # inference
    infer = SeqLabelInfer(pickle_path)
    results = infer.predict(model, infer_data)
    print(results)

def foo():
    loader = TokenizeDatasetLoader("./data_for_tests/cws_pku_utf_8")
    train_data = loader.load_pku()

    train_args = ConfigSection()
    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args})

    # Preprocessor
    p = SeqLabelPreprocess()
    train_data = p.run(train_data)
    train_args["vocab_size"] = p.vocab_size
    train_args["num_classes"] = p.num_classes

    model = SeqLabeling(train_args)

    valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True,
                  "save_loss": True, "batch_size": 8, "pickle_path": "./data_for_tests/",
                  "use_cuda": True}
    validator = SeqLabelTester(**valid_args)

    print("start validation.")
    validator.test(model, train_data)
    print(validator.show_metrics())

def infer():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "label2id.pkl")
    test_args["num_classes"] = len(index2label)

    # Define the same model
    model = AdvSeqLabel(test_args)
    try:
        ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
        print('model loaded!')
    except Exception as e:
        print('cannot load model!')
        raise

    # Data Loader
    infer_data = SeqLabelDataSet(load_func=BaseLoader.load_lines)
    infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True)
    print('data loaded')

    # Inference interface
    infer = SeqLabelInfer(pickle_path)
    results = infer.predict(model, infer_data)
    print(results)
    print("Inference finished!")

def predict():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "label2id.pkl")
    test_args["num_classes"] = len(index2label)

    # load dev data
    dev_data = load_pickle(pickle_path, "data_dev.pkl")

    # Define the same model
    model = AdvSeqLabel(test_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
    print("model loaded!")

    # Tester
    test_args["evaluator"] = SeqLabelEvaluator()
    tester = SeqLabelTester(**test_args.data)

    # Start testing
    tester.test(model, dev_data)

def infer():
    # Load infer configuration, the same as test
    test_args = ConfigSection()
    ConfigLoader("config.cfg").load_config(config_dir, {"POS_infer": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "id2class.pkl")
    test_args["num_classes"] = len(index2label)

    # Define the same model
    model = SeqLabeling(test_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, os.path.join(pickle_path, model_name))
    print("model loaded!")

    # Data Loader
    raw_data_loader = BaseLoader(data_infer_path)
    infer_data = raw_data_loader.load_lines()

    # Inference interface
    infer = SeqLabelInfer(pickle_path)
    results = infer.predict(model, infer_data)
    for res in results:
        print(res)
    print("Inference finished!")

def train_test():
    # Config Loader
    train_args = ConfigSection()
    ConfigLoader().load_config(config_path, {"POS_infer": train_args})

    # define dataset
    data_train = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
    data_train.load(cws_data_path)
    train_args["vocab_size"] = len(data_train.word_vocab)
    train_args["num_classes"] = len(data_train.label_vocab)

    save_pickle(data_train.word_vocab, pickle_path, "word2id.pkl")
    save_pickle(data_train.label_vocab, pickle_path, "label2id.pkl")

    # Trainer
    trainer = SeqLabelTrainer(**train_args.data)

    # Model
    model = SeqLabeling(train_args)

    # Start training
    trainer.train(model, data_train)

    # Saver
    saver = ModelSaver("./save/saved_model.pkl")
    saver.save_pytorch(model)

    del model, trainer

    # Define the same model
    model = SeqLabeling(train_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./save/saved_model.pkl")

    # Load test configuration
    test_args = ConfigSection()
    ConfigLoader().load_config(config_path, {"POS_infer": test_args})
    test_args["evaluator"] = SeqLabelEvaluator()

    # Tester
    tester = SeqLabelTester(**test_args.data)

    # Start testing
    change_field_is_target(data_train, "truth", True)
    tester.test(model, data_train)

def _get_section(self, sect_name):
    """Get the section with the given section name.

    :param sect_name: str, the name of the section to load.
    :return: ConfigSection, the loaded section.
    """
    sect = ConfigSection()
    ConfigLoader().load_config(self.file_path, {sect_name: sect})
    return sect

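# Usage sketch for _get_section (illustrative only; assumes an object whose
# self.file_path points at an existing config file containing a "train" section —
# the variable names below are hypothetical):
#
#     train_sect = saver._get_section("train")
#     print(train_sect.data)  # ConfigSection exposes its key/value pairs via .data
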
def load(self, model_name, config_file="config", section_name="model"):
    """Load a pre-trained FastNLP model together with additional data.

    :param model_name: str, the name of a FastNLP model.
    :param config_file: str, the name of the config file which stores the
        initialization information of the model. (default: "config")
    :param section_name: str, the name of the corresponding section in the
        config file. (default: "model")
    """
    assert type(model_name) is str
    if model_name not in FastNLP_MODEL_COLLECTION:
        raise ValueError("No FastNLP model named {}.".format(model_name))

    if not self.model_exist(model_dir=self.model_dir):
        self._download(model_name, FastNLP_MODEL_COLLECTION[model_name]["url"])

    model_class = self._get_model_class(FastNLP_MODEL_COLLECTION[model_name]["class"])
    print("Restore model class {}".format(str(model_class)))

    model_args = ConfigSection()
    ConfigLoader.load_config(os.path.join(self.model_dir, config_file), {section_name: model_args})
    print("Restore model hyper-parameters {}".format(str(model_args.data)))

    # fetch dictionary size and number of labels from pickle files
    self.word_vocab = load_pickle(self.model_dir, "word2id.pkl")
    model_args["vocab_size"] = len(self.word_vocab)
    self.label_vocab = load_pickle(self.model_dir, "label2id.pkl")
    model_args["num_classes"] = len(self.label_vocab)

    # Construct the model
    model = model_class(model_args)
    print("Model constructed.")

    # To do: framework independent
    ModelLoader.load_pytorch(model, os.path.join(self.model_dir, FastNLP_MODEL_COLLECTION[model_name]["pickle"]))
    print("Model weights loaded.")

    self.model = model
    self.infer_type = FastNLP_MODEL_COLLECTION[model_name]["type"]

    print("Inference ready.")

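# Minimal usage sketch for load() above (a sketch under assumptions: "cws_basic_model"
# is a hypothetical key in FastNLP_MODEL_COLLECTION used only for illustration, and
# model_dir either contains the model files or the model can be downloaded):
#
#     nlp = FastNLP(model_dir="./fastnlp_models")
#     nlp.load("cws_basic_model", config_file="config", section_name="model")
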
def train():
    train_args, model_args = ConfigSection(), ConfigSection()
    ConfigLoader.load_config(config_dir, {"text_class": train_args})

    # load dataset
    print("Loading data...")
    data = TextClassifyDataSet(load_func=ClassDataSetLoader.load)
    data.load(train_data_dir)

    print("vocabulary size:", len(data.word_vocab))
    print("number of classes:", len(data.label_vocab))
    save_pickle(data.word_vocab, save_dir, "word2id.pkl")
    save_pickle(data.label_vocab, save_dir, "label2id.pkl")

    model_args["num_classes"] = len(data.label_vocab)
    model_args["vocab_size"] = len(data.word_vocab)

    # construct model
    print("Building model...")
    model = CNNText(model_args)

    # train
    print("Training...")
    trainer = ClassificationTrainer(epochs=train_args["epochs"],
                                    batch_size=train_args["batch_size"],
                                    validate=train_args["validate"],
                                    use_cuda=train_args["use_cuda"],
                                    pickle_path=save_dir,
                                    save_best_dev=train_args["save_best_dev"],
                                    model_name=model_name,
                                    loss=Loss("cross_entropy"),
                                    optimizer=Optimizer("SGD", lr=0.001, momentum=0.9))
    trainer.train(model, data)
    print("Training finished!")

    saver = ModelSaver(os.path.join(save_dir, model_name))
    saver.save_pytorch(model)
    print("Model saved!")

def train():
    # Config Loader
    train_args = ConfigSection()
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"train": train_args, "test": test_args})

    print("loading data set...")
    data = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
    data.load(cws_data_path)
    data_train, data_dev = data.split(ratio=0.3)
    train_args["vocab_size"] = len(data.word_vocab)
    train_args["num_classes"] = len(data.label_vocab)
    print("vocab size={}, num_classes={}".format(len(data.word_vocab), len(data.label_vocab)))

    change_field_is_target(data_dev, "truth", True)
    save_pickle(data_dev, "./save/", "data_dev.pkl")
    save_pickle(data.word_vocab, "./save/", "word2id.pkl")
    save_pickle(data.label_vocab, "./save/", "label2id.pkl")

    # Trainer
    trainer = SeqLabelTrainer(epochs=train_args["epochs"],
                              batch_size=train_args["batch_size"],
                              validate=train_args["validate"],
                              use_cuda=train_args["use_cuda"],
                              pickle_path=train_args["pickle_path"],
                              save_best_dev=True,
                              print_every_step=10,
                              model_name="trained_model.pkl",
                              evaluator=SeqLabelEvaluator())

    # Model
    model = AdvSeqLabel(train_args)
    try:
        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
        print('model parameter loaded!')
    except Exception as e:
        print("No saved model. Continue.")
        pass

    # Start training
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
    saver = ModelSaver("./save/trained_model.pkl")
    saver.save_pytorch(model)
    print("Model saved!")

def test_case_ConfigLoader(self):

    def read_section_from_config(config_path, section_name):
        # read one section from a configparser-style file, parsing each value as JSON
        section_dict = {}
        if not os.path.exists(config_path):
            raise FileNotFoundError("config file {} NOT found.".format(config_path))
        cfg = configparser.ConfigParser()
        cfg.read(config_path)
        if section_name not in cfg:
            raise AttributeError("config file {} does NOT have section {}".format(config_path, section_name))
        gen_sec = cfg[section_name]
        for s in gen_sec.keys():
            try:
                val = json.loads(gen_sec[s])
                section_dict[s] = val
            except Exception as e:
                raise AttributeError("json can NOT load {} in section {}, config file {}".format(s, section_name, config_path))
        return section_dict

    test_arg = ConfigSection()
    ConfigLoader().load_config(os.path.join("./test/loader", "config"), {"test": test_arg})

    section = read_section_from_config(os.path.join("./test/loader", "config"), "test")

    for sec in section:
        if (sec not in test_arg) or (section[sec] != test_arg[sec]):
            raise AttributeError("ERROR")

    for sec in test_arg.__dict__.keys():
        if (sec not in section) or (section[sec] != test_arg[sec]):
            raise AttributeError("ERROR")

    try:
        not_exist = test_arg["NOT EXIST"]
    except Exception as e:
        pass

    print("pass config test!")

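# For reference, the config files read above are plain configparser files whose
# values must be JSON-parsable (see json.loads in read_section_from_config).
# A sketch of such a section follows; the keys shown are only examples, borrowed
# from the arguments used elsewhere in these scripts:
#
#     [test]
#     epochs = 5
#     batch_size = 32
#     use_cuda = false
#     save_best_dev = true
#     pickle_path = "./save/"
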
def train():
    # Config Loader
    train_args = ConfigSection()
    test_args = ConfigSection()
    ConfigLoader("good_path").load_config(cfgfile, {"train": train_args, "test": test_args})

    # Data Loader
    loader = TokenizeDatasetLoader(cws_data_path)
    train_data = loader.load_pku()

    # Preprocessor
    preprocessor = SeqLabelPreprocess()
    data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
    train_args["vocab_size"] = preprocessor.vocab_size
    train_args["num_classes"] = preprocessor.num_classes

    # Trainer
    trainer = SeqLabelTrainer(**train_args.data)

    # Model
    model = AdvSeqLabel(train_args)
    try:
        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
        print('model parameter loaded!')
    except Exception as e:
        print("No saved model. Continue.")
        pass

    # Start training
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
    saver = ModelSaver("./save/saved_model.pkl")
    saver.save_pytorch(model)
    print("Model saved!")

def infer():
    # Load infer configuration, the same as test
    test_args = ConfigSection()
    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "id2class.pkl")
    test_args["num_classes"] = len(index2label)

    # Define the same model
    model = SeqLabeling(test_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
    print("model loaded!")

    # Data Loader
    raw_data_loader = BaseLoader(data_infer_path)
    infer_data = raw_data_loader.load_lines()
    """
    Transform strings into a list of lists of strings:
    [
        [word_11, word_12, ...],
        [word_21, word_22, ...],
        ...
    ]
    In this case, each line in "people_infer.txt" is already a sentence,
    so load_lines() just splits them.
    """

    # Inference interface
    infer = Predictor(pickle_path)
    results = infer.predict(model, infer_data)
    print(results)
    print("Inference finished!")

def infer():
    # Load infer configuration, the same as test
    test_args = ConfigSection()
    ConfigLoader().load_config(config_dir, {"POS_infer": test_args})

    # fetch dictionary size and number of labels from pickle files
    word_vocab = load_pickle(pickle_path, "word2id.pkl")
    label_vocab = load_pickle(pickle_path, "label2id.pkl")
    test_args["vocab_size"] = len(word_vocab)
    test_args["num_classes"] = len(label_vocab)
    print("vocabularies loaded")

    # Define the same model
    model = SeqLabeling(test_args)
    print("model defined")

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, os.path.join(pickle_path, model_name))
    print("model loaded!")

    # Data Loader
    infer_data = SeqLabelDataSet(load_func=BaseLoader.load)
    infer_data.load(data_infer_path,
                    vocabs={"word_vocab": word_vocab, "label_vocab": label_vocab},
                    infer=True)
    print("data set prepared")

    # Inference interface
    infer = SeqLabelInfer(pickle_path)
    results = infer.predict(model, infer_data)
    for res in results:
        print(res)
    print("Inference finished!")

def test_training():
    # Config Loader
    trainer_args = ConfigSection()
    model_args = ConfigSection()
    ConfigLoader().load_config(config_dir, {
        "test_seq_label_trainer": trainer_args,
        "test_seq_label_model": model_args})

    data_set = TokenizeDataSetLoader().load(data_path)
    word_vocab = Vocabulary()
    label_vocab = Vocabulary()
    data_set.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
    data_set.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab)
    data_set.set_origin_len("word_seq")
    data_set.rename_field("label_seq", "truth").set_target(truth=False)
    data_train, data_dev = data_set.split(0.3, shuffle=True)
    model_args["vocab_size"] = len(word_vocab)
    model_args["num_classes"] = len(label_vocab)

    save_pickle(word_vocab, pickle_path, "word2id.pkl")
    save_pickle(label_vocab, pickle_path, "label2id.pkl")

    trainer = SeqLabelTrainer(
        epochs=trainer_args["epochs"],
        batch_size=trainer_args["batch_size"],
        validate=False,
        use_cuda=False,
        pickle_path=pickle_path,
        save_best_dev=trainer_args["save_best_dev"],
        model_name=model_name,
        optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
    )

    # Model
    model = SeqLabeling(model_args)

    # Start training
    trainer.train(model, data_train, data_dev)

    # Saver
    saver = ModelSaver(os.path.join(pickle_path, model_name))
    saver.save_pytorch(model)

    del model, trainer

    # Define the same model
    model = SeqLabeling(model_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, os.path.join(pickle_path, model_name))

    # Load test configuration
    tester_args = ConfigSection()
    ConfigLoader().load_config(config_dir, {"test_seq_label_tester": tester_args})

    # Tester
    tester = SeqLabelTester(batch_size=4,
                            use_cuda=False,
                            pickle_path=pickle_path,
                            model_name="seq_label_in_test.pkl",
                            evaluator=SeqLabelEvaluator())

    # Start testing with validation data
    data_dev.set_target(truth=True)
    tester.test(model, data_dev)

def train_and_test():
    # Config Loader
    trainer_args = ConfigSection()
    model_args = ConfigSection()
    ConfigLoader().load_config(config_dir, {
        "test_seq_label_trainer": trainer_args,
        "test_seq_label_model": model_args})

    data_set = SeqLabelDataSet()
    data_set.load(data_path)
    train_set, dev_set = data_set.split(0.3, shuffle=True)
    model_args["vocab_size"] = len(data_set.word_vocab)
    model_args["num_classes"] = len(data_set.label_vocab)

    save_pickle(data_set.word_vocab, pickle_path, "word2id.pkl")
    save_pickle(data_set.label_vocab, pickle_path, "label2id.pkl")

    trainer = SeqLabelTrainer(
        epochs=trainer_args["epochs"],
        batch_size=trainer_args["batch_size"],
        validate=False,
        use_cuda=trainer_args["use_cuda"],
        pickle_path=pickle_path,
        save_best_dev=trainer_args["save_best_dev"],
        model_name=model_name,
        optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
    )

    # Model
    model = SeqLabeling(model_args)

    # Start training
    trainer.train(model, train_set, dev_set)
    print("Training finished!")

    # Saver
    saver = ModelSaver(os.path.join(pickle_path, model_name))
    saver.save_pytorch(model)
    print("Model saved!")

    del model, trainer

    change_field_is_target(dev_set, "truth", True)

    # Define the same model
    model = SeqLabeling(model_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, os.path.join(pickle_path, model_name))
    print("model loaded!")

    # Load test configuration
    tester_args = ConfigSection()
    ConfigLoader().load_config(config_dir, {"test_seq_label_tester": tester_args})

    # Tester
    tester = SeqLabelTester(batch_size=4,
                            use_cuda=False,
                            pickle_path=pickle_path,
                            model_name="seq_label_in_test.pkl",
                            evaluator=SeqLabelEvaluator())

    # Start testing with validation data
    tester.test(model, dev_set)
    print("model tested!")

def test_case_1(self):
    config_file_dir = "test/loader/"
    config_file_name = "config"
    config_file_path = os.path.join(config_file_dir, config_file_name)

    tmp_config_file_path = os.path.join(config_file_dir, "tmp_config")

    with open(config_file_path, "r") as f:
        lines = f.readlines()

    standard_section = ConfigSection()
    t_section = ConfigSection()
    ConfigLoader().load_config(config_file_path, {"test": standard_section, "t": t_section})

    config_saver = ConfigSaver(config_file_path)

    section = ConfigSection()
    section["doubles"] = 0.8
    section["tt"] = 0.5
    section["test"] = 105
    section["str"] = "this is a str"

    test_case_2_section = section
    test_case_2_section["double"] = 0.5

    for k in section.__dict__.keys():
        standard_section[k] = section[k]

    config_saver.save_config_file("test", section)
    config_saver.save_config_file("another-test", section)
    config_saver.save_config_file("one-another-test", section)
    config_saver.save_config_file("test-case-2", section)

    test_section = ConfigSection()
    at_section = ConfigSection()
    another_test_section = ConfigSection()
    one_another_test_section = ConfigSection()
    a_test_case_2_section = ConfigSection()

    ConfigLoader().load_config(config_file_path, {
        "test": test_section,
        "another-test": another_test_section,
        "t": at_section,
        "one-another-test": one_another_test_section,
        "test-case-2": a_test_case_2_section})

    assert test_section == standard_section
    assert at_section == t_section
    assert another_test_section == section
    assert one_another_test_section == section
    assert a_test_case_2_section == test_case_2_section

    config_saver.save_config_file("test", section)

    with open(config_file_path, "w") as f:
        f.writelines(lines)

    with open(tmp_config_file_path, "w") as f:
        f.write('[test]\n')
        f.write('this is a faulty example\n')

    tmp_config_saver = ConfigSaver(tmp_config_file_path)
    try:
        tmp_config_saver._read_section()
    except Exception as e:
        pass
    os.remove(tmp_config_file_path)

    try:
        tmp_config_saver = ConfigSaver("file-NOT-exist")
    except Exception as e:
        pass

def train_and_test():
    # Config Loader
    trainer_args = ConfigSection()
    model_args = ConfigSection()
    ConfigLoader("config.cfg").load_config(config_dir, {
        "test_seq_label_trainer": trainer_args,
        "test_seq_label_model": model_args})

    # Data Loader
    pos_loader = POSDatasetLoader(data_path)
    train_data = pos_loader.load_lines()

    # Preprocessor
    p = SeqLabelPreprocess()
    data_train, data_dev = p.run(train_data, pickle_path=pickle_path, train_dev_split=0.5)
    model_args["vocab_size"] = p.vocab_size
    model_args["num_classes"] = p.num_classes

    # Trainer: two definition styles
    # 1
    # trainer = SeqLabelTrainer(trainer_args.data)

    # 2
    trainer = SeqLabelTrainer(
        epochs=trainer_args["epochs"],
        batch_size=trainer_args["batch_size"],
        validate=trainer_args["validate"],
        use_cuda=trainer_args["use_cuda"],
        pickle_path=pickle_path,
        save_best_dev=trainer_args["save_best_dev"],
        model_name=model_name,
        optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
    )

    # Model
    model = SeqLabeling(model_args)

    # Start training
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
    saver = ModelSaver(os.path.join(pickle_path, model_name))
    saver.save_pytorch(model)
    print("Model saved!")

    del model, trainer, pos_loader

    # Define the same model
    model = SeqLabeling(model_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, os.path.join(pickle_path, model_name))
    print("model loaded!")

    # Load test configuration
    tester_args = ConfigSection()
    ConfigLoader("config.cfg").load_config(config_dir, {"test_seq_label_tester": tester_args})

    # Tester
    tester = SeqLabelTester(save_output=False,
                            save_loss=False,
                            save_best_dev=False,
                            batch_size=4,
                            use_cuda=False,
                            pickle_path=pickle_path,
                            model_name="seq_label_in_test.pkl",
                            print_every_step=1)

    # Start testing with validation data
    tester.test(model, data_dev)

    # print test results
    print(tester.show_metrics())
    print("model tested!")

datadir = "/home/yfshao/UD_English-EWT"
cfgfile = './cfg.cfg'
train_data_name = "en_ewt-ud-train.conllu"
dev_data_name = "en_ewt-ud-dev.conllu"
emb_file_name = '/home/yfshao/glove.6B.100d.txt'
processed_datadir = './save'

# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
model_args = ConfigSection()
optim_args = ConfigSection()
ConfigLoader.load_config(cfgfile, {"train": train_args, "test": test_args,
                                   "model": model_args, "optim": optim_args})


# Data Loader
def save_data(dirpath, **kwargs):
    import _pickle
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)
    for name, data in kwargs.items():
        with open(os.path.join(dirpath, name + '.pkl'), 'wb') as f:
            _pickle.dump(data, f)

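# Usage sketch for save_data above (the dataset variables are hypothetical; any
# keyword arguments work, each value being pickled to <dirpath>/<name>.pkl):
#
#     save_data(processed_datadir, train_data=train_data, dev_data=dev_data)
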
                                       dim=attention_unit,
                                       num_vec=attention_hops)
        self.mlp = MLP(size_layer=[lstm_hidden_size * 2 * attention_hops, nfc, class_num])

    def forward(self, x):
        x_emb = self.embedding(x)
        output = self.lstm(x_emb)
        after_attention, penalty = self.attention(output, x)
        after_attention = after_attention.view(after_attention.size(0), -1)
        output = self.mlp(after_attention)
        return output

    def loss(self, predict, ground_truth):
        print("predict:%s; g:%s" % (str(predict.size()), str(ground_truth.size())))
        print(ground_truth)
        return F.cross_entropy(predict, ground_truth)


train_args = ConfigSection()
ConfigLoader("good path").load_config('config.cfg', {"train": train_args})
train_args['vocab'] = len(word2index)

trainer = ClassificationTrainer(**train_args.data)

# for k in train_args.__dict__.keys():
#     print(k, train_args[k])
model = SELF_ATTENTION_YELP_CLASSIFICATION(train_args)
trainer.train(model, train_data, dev_data)