def learn():
    """Run the English/German training pipeline from corpora to trained model.

    The steps run in this order: preprocess both corpora and obtain a token
    index per language, duplicate each stream of processed lines with ``tee``
    (one copy for word2vec, one for the network), obtain a word2vec model per
    language, build the network sized by the German token index, and finally
    pass everything to ``train_model``.
    """
    # preprocess both dialog corpora and get the index for each vocabulary
    lines_en, lines_de, index_to_token_en, index_to_token_de = (
        get_processed_dialog_lines_and_index_to_token(
            CORPUS_PATH_EN,
            CORPUS_PATH_DE,
            PROCESSED_CORPUS_PATH_EN,
            PROCESSED_CORPUS_PATH_DE,
            TOKEN_INDEX_PATH_EN,
            TOKEN_INDEX_PATH_DE,
        )
    )

    # duplicate each iterator: word2vec and the network both read the lines
    w2v_lines_en, nn_lines_en = tee(lines_en)
    w2v_lines_de, nn_lines_de = tee(lines_de)
    _logger.info('-----')

    # use the gensim realisation of word2vec instead of keras embeddings
    # due to extra flexibility
    w2v_model_en = w2v.get_dialogs_model(W2V_PARAMS_EN, w2v_lines_en)
    w2v_model_de = w2v.get_dialogs_model(W2V_PARAMS_DE, w2v_lines_de)
    _logger.info('-----')

    # the network is sized by the German token index only
    nn_model = get_nn_model(token_dict_size=len(index_to_token_de))
    _logger.info('-----')

    train_model(
        nn_model,
        w2v_model_en,
        w2v_model_de,
        nn_lines_en,
        nn_lines_de,
        index_to_token_en,
        index_to_token_de,
    )
def learn():
    """Train the network on the paired English and German corpora.

    Preprocesses both corpora, obtains one word2vec model per language from a
    ``tee``-ed copy of the processed lines, creates the network with a token
    dictionary size equal to the length of the German token index, and then
    calls ``train_model`` with the models, the remaining line iterators and
    both token indexes.
    """
    # preprocess the dialogs and get the index for each vocabulary
    corpus_args = (
        CORPUS_PATH_EN, CORPUS_PATH_DE,
        PROCESSED_CORPUS_PATH_EN, PROCESSED_CORPUS_PATH_DE,
        TOKEN_INDEX_PATH_EN, TOKEN_INDEX_PATH_DE,
    )
    processed_en, processed_de, index_to_token_en, index_to_token_de = \
        get_processed_dialog_lines_and_index_to_token(*corpus_args)

    # dualize the iterators: one branch feeds word2vec, the other the network
    en_for_w2v, en_for_nn = tee(processed_en)
    de_for_w2v, de_for_nn = tee(processed_de)
    _logger.info('-----')

    # use the gensim realisation of word2vec instead of keras embeddings
    # due to extra flexibility
    w2v_model_en = w2v.get_dialogs_model(W2V_PARAMS_EN, en_for_w2v)
    w2v_model_de = w2v.get_dialogs_model(W2V_PARAMS_DE, de_for_w2v)
    _logger.info('-----')

    german_vocab_size = len(index_to_token_de)
    nn_model = get_nn_model(token_dict_size=german_vocab_size)
    _logger.info('-----')

    train_model(nn_model, w2v_model_en, w2v_model_de,
                en_for_nn, de_for_nn,
                index_to_token_en, index_to_token_de)
Exemplo n.º 3
0
def learn():
    """Train the network, optionally seeding its embeddings from word2vec.

    The corpus is preprocessed and validation lines are loaded from
    ``TEST_DATASET_PATH``. When ``INITIALIZE_WORD_EMBEDDINGS_WITH_WORD2VEC``
    is truthy, the processed lines are split with ``tee_nobuffer``, a word2vec
    model is obtained from one branch and converted to a matrix that is passed
    to ``get_nn_model``; otherwise ``None`` is passed in place of the matrix
    and the processed lines go to training unsplit.
    """
    # preprocess the dialog and get the index for its vocabulary
    corpus_lines, index_to_token = get_processed_dialog_lines_and_index_to_token(
        CORPUS_PATH, PROCESSED_CORPUS_PATH, TOKEN_INDEX_PATH)

    validation_lines = get_lines_for_validation(TEST_DATASET_PATH, index_to_token)

    if not INITIALIZE_WORD_EMBEDDINGS_WITH_WORD2VEC:
        # no word2vec initialisation: train on the lines as they are
        training_lines = corpus_lines
        embedding_matrix = None
    else:
        # dualize the iterator: one branch for word2vec, one for the network
        lines_for_w2v, training_lines = tee_nobuffer(corpus_lines)
        _logger.info('-----')

        # use the gensim implementation of word2vec instead of keras
        # embeddings due to extra flexibility
        w2v_model = w2v.get_dialogs_model(W2V_PARAMS, lines_for_w2v)
        _logger.info('-----')
        embedding_matrix = transform_w2v_model_to_matrix(w2v_model, index_to_token)

    nn_model = get_nn_model(len(index_to_token), embedding_matrix)
    _logger.info('-----')

    train_model(nn_model, training_lines, validation_lines, index_to_token)
Exemplo n.º 4
0
def predict():
    """Prompt for sentences on stdin and run ``predict_sentence`` on each one.

    The corpus is preprocessed to obtain the token index, a word2vec model is
    obtained from the processed lines, and the network is built with a token
    dictionary size equal to the length of that index. The function then loops
    forever reading input with ``raw_input`` (a Python 2 builtin); there is no
    exit condition inside the loop.
    """
    # preprocess the dialog and get the index for its vocabulary
    processed_dialog_lines, index_to_token = \
        get_processed_dialog_lines_and_index_to_token(CORPUS_PATH, PROCESSED_CORPUS_PATH, TOKEN_INDEX_PATH)

    # Only word2vec reads the lines here. Requesting two tee branches and
    # never consuming the second would force tee to keep every line already
    # read by word2vec buffered for as long as the prompt loop below runs.
    # Asking for a single branch hands the callee the same kind of iterator
    # without that retained buffer.
    (dialog_lines_for_w2v,) = tee(processed_dialog_lines, 1)
    _logger.info('-----')

    # use the gensim realisation of word2vec instead of keras embeddings
    # due to extra flexibility
    w2v_model = w2v.get_dialogs_model(W2V_PARAMS, dialog_lines_for_w2v)
    _logger.info('-----')

    nn_model = get_nn_model(token_dict_size=len(index_to_token))

    while True:
        input_sentence = raw_input('> ')
        predict_sentence(input_sentence, nn_model, w2v_model, index_to_token)
Exemplo n.º 5
0
def learn():
    """Train the network on the corpus with a word2vec model alongside it.

    Preprocesses the corpus, loads validation lines from
    ``SMALL_TEST_DATASET_PATH``, duplicates the processed lines with ``tee``
    so that word2vec and the network each get their own iterator, builds the
    network with ``vocab_size`` equal to the length of the token index, and
    calls ``train_model``.
    """
    # preprocess the dialog and get the index for its vocabulary
    corpus_lines, index_to_token = get_processed_dialog_lines_and_index_to_token(
        CORPUS_PATH,
        PROCESSED_CORPUS_PATH,
        TOKEN_INDEX_PATH,
    )

    validation_lines = get_lines_for_validation(
        SMALL_TEST_DATASET_PATH,
        index_to_token,
    )

    # dualize the iterator: one branch for word2vec, one for the network
    lines_for_w2v, lines_for_nn = tee(corpus_lines)
    _logger.info('-----')

    # use the gensim realisation of word2vec instead of keras embeddings
    # due to extra flexibility
    w2v_model = w2v.get_dialogs_model(W2V_PARAMS, lines_for_w2v)
    _logger.info('-----')

    vocabulary_length = len(index_to_token)
    nn_model = get_nn_model(vocab_size=vocabulary_length)
    _logger.info('-----')

    train_model(
        nn_model,
        w2v_model,
        lines_for_nn,
        validation_lines,
        index_to_token,
    )
Exemplo n.º 6
0
def learn():
    """Train the network; word2vec initialisation is controlled by a flag.

    After preprocessing the corpus and loading validation lines from
    ``TEST_DATASET_PATH``, the function checks
    ``INITIALIZE_WORD_EMBEDDINGS_WITH_WORD2VEC``. If it is truthy, the lines
    are split with ``tee_nobuffer`` and a word2vec model is turned into a
    matrix for ``get_nn_model``; if not, the matrix argument is ``None`` and
    the processed lines are used for training directly.
    """
    # preprocess the dialog and get the index for its vocabulary
    processed, index_to_token = get_processed_dialog_lines_and_index_to_token(
        CORPUS_PATH, PROCESSED_CORPUS_PATH, TOKEN_INDEX_PATH
    )
    validation_set = get_lines_for_validation(TEST_DATASET_PATH, index_to_token)

    # defaults for the case without word2vec initialisation
    nn_input_lines = processed
    w2v_matrix = None

    if INITIALIZE_WORD_EMBEDDINGS_WITH_WORD2VEC:
        # dualize the iterator: one branch for word2vec, one for the network
        w2v_input_lines, nn_input_lines = tee_nobuffer(processed)
        _logger.info('-----')

        # use the gensim implementation of word2vec instead of keras
        # embeddings due to extra flexibility
        trained_w2v = w2v.get_dialogs_model(W2V_PARAMS, w2v_input_lines)
        _logger.info('-----')
        w2v_matrix = transform_w2v_model_to_matrix(trained_w2v, index_to_token)

    nn_model = get_nn_model(len(index_to_token), w2v_matrix)
    _logger.info('-----')

    train_model(nn_model, nn_input_lines, validation_set, index_to_token)