def generate(conf_path, n, epoch, prefix_words, ignore_words):
    with open(conf_path) as f:
        conf = yaml.safe_load(f)  # safe_load avoids constructing arbitrary Python objects

    print("== initialize tokenizer ==")
    token_files = glob.glob(conf["input_token_files"])
    tokenizer = create_tokenizer(token_files, num_words=conf["num_vocab"])
    print("output vocab size:", tokenizer.num_words)
    print("| + <UNK> token")
    inverse_vocab = {idx: w for w, idx in tokenizer.word_index.items()}

    print("load model")
    print("> create instance")
    model = ThreadTitleGenerator(**conf["model_params"])
    print("> load model")
    model.load(conf["model_path"], epoch)
    print("> print summary")
    model.print_summary()

    print("generate words!")
    end_token_idx = tokenizer.word_index[END_TOKEN]
    prefix_tokens = [tokenizer.word_index[t] for t in [START_TOKEN] + prefix_words]
    # indices to suppress during generation, plus the <UNK> index
    ignore_idx = [tokenizer.word_index[t] for t in ignore_words] + [conf["num_vocab"] + 1]
    ret = model.gen_nbest(prefix_tokens, end_token_idx, ignore_idx, n=n)
    print(ret)

    print("convert to readable tokens")
    for tokens, prob in ret:
        title = [inverse_vocab.get(idx, "???") for idx in tokens]
        print(" ".join(title))
        print(prob)

    K.clear_session()
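# Hypothetical example call (all argument values below are placeholders, not
# the project's real config path, epoch, or token lists):
#
#   generate("config.yaml", n=5, epoch=10,
#            prefix_words=["some", "prefix"], ignore_words=["spam"])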
def train(conf_path):
    with open(conf_path) as f:
        conf = yaml.safe_load(f)  # safe_load avoids constructing arbitrary Python objects

    print("== initialize tokenizer ==")
    token_files = glob.glob(conf["input_token_files"])
    tokenizer = create_tokenizer(token_files, num_words=conf["num_vocab"])
    print("output vocab size:", tokenizer.num_words)
    print("| + <UNK> token")

    print("== load input tokens ==")
    input_sentences = []
    for fpath in token_files:
        with open(fpath) as f:
            for line in f:
                input_sentences.append(line.rstrip())
    src_X, dst_X = texts_to_sequences_with_unk_seq(tokenizer, input_sentences)
    X, y = create_training_data(src_X, dst_X, conf["max_title_tokens"])

    print("build model")
    print("> create instance")
    model = ThreadTitleGenerator(**conf["model_params"])
    print("> build model")
    model.build_model(conf["word2vec_model_path"], tokenizer, conf["max_title_tokens"])
    print("> print summary")
    model.print_summary()

    print("train model")
    model.train(X, y, conf["model_path"])
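# Hypothetical config illustrating the keys that train() and generate() read
# from the YAML file; every value below is a placeholder, not the project's
# real setting.
example_conf = {
    "input_token_files": "data/tokens/*.txt",    # glob pattern of tokenized title files
    "num_vocab": 20000,                          # tokenizer vocabulary size
    "max_title_tokens": 32,                      # maximum title length in tokens
    "word2vec_model_path": "models/word2vec.bin",
    "model_path": "models/title_generator",
    "model_params": {},                          # kwargs forwarded to ThreadTitleGenerator(...)
}
# A file produced with yaml.safe_dump(example_conf, open("config.yaml", "w"))
# would have the shape these functions expect.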
                type=int, required=True, help="Number of epochs to train for")
args = vars(ap.parse_args())

dataset = np.array(load_saved_lines('eng-german-both.pkl'))
train = np.array(load_saved_lines('eng-german-train.pkl'))
for i in range(5):
    print(train[i])
dev = np.array(load_saved_lines('eng-german-dev.pkl'))
for i in range(5):
    print(dev[i])
print('[INFO] Training set size: {:d}'.format(len(train)))
print('[INFO] Dev set size: {:d}'.format(len(dev)))

eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = sentence_length(dataset[:, 0])
print('[INFO] English Vocab size: {:d}'.format(eng_vocab_size))
print('[INFO] English Max length: {:d}'.format(eng_length))

ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = sentence_length(dataset[:, 1])
print('[INFO] Ger Vocab size: {:d}'.format(ger_vocab_size))
print('[INFO] Ger Max length: {:d}'.format(ger_length))

print('[INFO] Defining model...')
model = baseline_model(eng_vocab_size, ger_vocab_size, eng_length, ger_length, 256)
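# Minimal sketches (assumptions, not the project's own code) of the two
# helpers used above, based on the standard Keras text Tokenizer.
from keras.preprocessing.text import Tokenizer


def create_tokenizer(lines):
    """Fit a word-level tokenizer on an iterable of sentences."""
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


def sentence_length(lines):
    """Return the token count of the longest sentence."""
    return max(len(line.split()) for line in lines)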