def test_fit(self):
    """Smoke-test ``nagisa.fit`` on the bundled sample datasets.

    Runs training twice: once with only the mandatory arguments, and once
    with every optional keyword (dictionary file, embedding file, newline
    marker, field delimiter) supplied explicitly.  The test passes if
    neither call raises.
    """
    # test_11: minimal invocation — train/dev/test files plus a model name.
    nagisa.fit(
        train_file="nagisa/data/sample_datasets/sample.train",
        dev_file="nagisa/data/sample_datasets/sample.dev",
        test_file="nagisa/data/sample_datasets/sample.test",
        model_name="sample",
    )
    # test_12: full invocation exercising all optional keyword arguments.
    # "EOS" is the sentence-boundary marker and "\t" the word/tag delimiter
    # used by the sample dataset format.
    nagisa.fit(train_file="nagisa/data/sample_datasets/sample.train",
               dev_file="nagisa/data/sample_datasets/sample.dev",
               test_file="nagisa/data/sample_datasets/sample.test",
               dict_file="nagisa/data/sample_datasets/sample.dict",
               emb_file="nagisa/data/sample_datasets/sample.emb",
               model_name="sample",
               newline="EOS",
               delimiter="\t")
def main():
    """Split ``kwdlc.txt`` into train/dev/test sets and train a NER model.

    The corpus is shuffled with a fixed seed, split roughly 80/10/10,
    written to ``kwdlc.{train,dev,test}``, and then passed to
    ``nagisa.fit`` to produce ``kwdlc_ner_model``.
    """
    random.seed(1234)  # fixed seed so the split is reproducible

    # preprocess: load parallel token/label sequences
    fn_in = "kwdlc.txt"
    X, Y = nagisa.utils.load_file(fn_in)

    indice = list(range(len(X)))
    random.shuffle(indice)

    num_train = int(0.8 * len(indice))
    num_dev = int(0.1 * len(indice))
    # BUG FIX: the original also computed num_test = int(0.1 * len(indice))
    # and sliced test data to that length, silently dropping up to two
    # samples whenever len(indice) is not a multiple of 10.  The test split
    # now takes everything remaining after train and dev.
    train_idx = indice[:num_train]
    dev_idx = indice[num_train:num_train + num_dev]
    test_idx = indice[num_train + num_dev:]

    train_X = [X[i] for i in train_idx]
    train_Y = [Y[i] for i in train_idx]
    dev_X = [X[i] for i in dev_idx]
    dev_Y = [Y[i] for i in dev_idx]
    test_X = [X[i] for i in test_idx]
    test_Y = [Y[i] for i in test_idx]

    # write the three splits in the format nagisa.fit expects
    fn_out_train = "kwdlc.train"
    fn_out_dev = "kwdlc.dev"
    fn_out_test = "kwdlc.test"
    write_file(fn_out_train, train_X, train_Y)
    write_file(fn_out_dev, dev_X, dev_Y)
    write_file(fn_out_test, test_X, test_Y)

    # start training
    fn_out_model = "kwdlc_ner_model"
    nagisa.fit(train_file=fn_out_train,
               dev_file=fn_out_dev,
               test_file=fn_out_test,
               model_name=fn_out_model)
import nagisa

# Training writes three model artifacts next to the model_name prefix:
# jp/sample.vocabs, jp/sample.params, jp/sample.hp.
nagisa.fit(
    train_file="jp/sample.train",
    dev_file="jp/sample.dev",
    test_file="jp/sample.test",
    model_name="jp/sample",
)

# Rebuild the tagger from the saved artifacts.
sample_tagger = nagisa.Tagger(
    vocabs="jp/sample.vocabs",
    params="jp/sample.params",
    hp="jp/sample.hp",
)

# Tag one sentence with the freshly trained model.
text = "福岡・博多の観光情報"
words = sample_tagger.tagging(text)
print(words)
#> 福岡/PROPN ・/SYM 博多/PROPN の/ADP 観光/NOUN 情報/NOUN
# NOTE(review): this chunk begins mid-definition — the `def` line of the
# function owning this `with` block is outside the visible source.  Judging
# from the __main__ calls below, the function is write_file(fn_in, fn_out),
# and `data` is presumably parsed from fn_in earlier in its body (each item
# a (words, postags) pair per sentence) — confirm against the full file.
    with open(fn_out, "w") as f:
        for words, postags in data:
            # One "word<TAB>postag" line per token.
            for word, postag in zip(words, postags):
                f.write("\t".join([word, postag]) + "\n")
            # Sentence boundary marker expected by nagisa's data loader.
            f.write("EOS\n")


if __name__ == "__main__":
    # files: UD Japanese-GSD CoNLL-U inputs and nagisa-format outputs
    fn_in_train = "UD_Japanese-GSD/ja_gsd-ud-train.conllu"
    fn_in_dev = "UD_Japanese-GSD/ja_gsd-ud-dev.conllu"
    fn_in_test = "UD_Japanese-GSD/ja_gsd-ud-test.conllu"
    fn_out_train = "ja_gsd_ud.train"
    fn_out_dev = "ja_gsd_ud.dev"
    fn_out_test = "ja_gsd_ud.test"
    fn_out_model = "ja_gsd_ud"

    # write files for nagisa (convert CoNLL-U to word<TAB>tag / EOS format)
    write_file(fn_in_train, fn_out_train)
    write_file(fn_in_dev, fn_out_dev)
    write_file(fn_in_test, fn_out_test)

    # start training
    nagisa.fit(train_file=fn_out_train,
               dev_file=fn_out_dev,
               test_file=fn_out_test,
               model_name=fn_out_model)
import nagisa # After finish training, save the three model files (*.vocabs, *.params, *.hp). nagisa.fit(train_file="cantonese/train.txt", dev_file="cantonese/dev.txt", test_file="cantonese/test.txt", model_name="cantonese/model") # Build the tagger by loading the trained model files. sample_tagger = nagisa.Tagger(vocabs='cantonese/model.vocabs', params='cantonese/model.params', hp='cantonese/model.hp') text = "我唔系随便嘅人,我随便起来唔系人" words = sample_tagger.tagging(text) print(words) #> 福岡/PROPN ・/SYM 博多/PROPN の/ADP 観光/NOUN 情報/NOUN
import nagisa

# Train a NER model on the pre-split KWDLC corpus.  nagisa.fit saves the
# resulting model files under the model_name prefix
# (data/kwdlc_ner_model.*).
nagisa.fit(train_file="data/kwdlc.train",
           dev_file="data/kwdlc.dev",
           test_file="data/kwdlc.test",
           model_name="data/kwdlc_ner_model")