import os import json from itertools import tee from collections import Counter import codecs import numpy as np from utilities.utilities import IterableSentences, tokenize, get_logger from configs.config import TOKEN_MIN_FREQUENCY EOS_SYMBOL = '$$$' EMPTY_TOKEN = '###' _logger = get_logger(__name__) def get_tokens_voc(tokenized_dialog_lines): """ :param tokenized_dialog_lines: generator for the efficient use of RAM """ token_counter = Counter() for line in tokenized_dialog_lines: for token in line: token_counter.update([token]) token_voc = [token for token, count in token_counter.most_common() if count >= TOKEN_MIN_FREQUENCY] token_voc.append(EMPTY_TOKEN) return set(token_voc)
import sys import os from itertools import tee sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from lib.dialog_processor import get_processed_dialog_lines_and_index_to_token from configs.config import CORPUS_PATH_EN, CORPUS_PATH_DE, PROCESSED_CORPUS_PATH_EN, PROCESSED_CORPUS_PATH_DE, TOKEN_INDEX_PATH_EN, TOKEN_INDEX_PATH_DE, W2V_PARAMS_EN, W2V_PARAMS_DE from lib.w2v_model import w2v from lib.nn_model.model import get_nn_model from lib.nn_model.train import train_model from lib.nn_model.train import train_model_new from utilities.utilities import get_logger _logger = get_logger(__name__) def learn(): # preprocess the dialog and get index for its vocabulary # processed_dialog_lines, index_to_token = \ # get_processed_dialog_lines_and_index_to_token(CORPUS_PATH, PROCESSED_CORPUS_PATH, TOKEN_INDEX_PATH) processed_dialog_lines_en, processed_dialog_lines_de, index_to_token_en, index_to_token_de = \ get_processed_dialog_lines_and_index_to_token(CORPUS_PATH_EN, CORPUS_PATH_DE, PROCESSED_CORPUS_PATH_EN, PROCESSED_CORPUS_PATH_DE, TOKEN_INDEX_PATH_EN, TOKEN_INDEX_PATH_DE) # dualize iterator # dialog_lines_for_w2v, dialog_lines_for_nn = tee(processed_dialog_lines) dialog_lines_for_w2v_en, dialog_lines_for_nn_en = tee( processed_dialog_lines_en) dialog_lines_for_w2v_de, dialog_lines_for_nn_de = tee(