Example #1
# Stdlib import needed by this snippet; the remaining names (_logger,
# file_buffered_tee, the transform_* helpers and the sequence-length
# constants) come from the surrounding module.
from itertools import islice

def transform_lines_to_nn_input(tokenized_dialog_lines, token_to_index):
    """
    Splits lines (IterableSentences) and generates numpy arrays of token ids suitable for training.
    Doesn't store all lines in memory.
    """
    x_data_iterator, y_data_iterator, iterator_for_len_calc = file_buffered_tee(tokenized_dialog_lines, 3)

    _logger.info('Iterating through lines to get number of elements in the dataset')
    n_dialogs = sum(1 for _ in iterator_for_len_calc)

    # Even-indexed lines are the inputs, odd-indexed lines are the responses,
    # so each (x, y) training pair consumes two dialog lines.
    x_data_iterator = islice(x_data_iterator, 0, None, 2)
    y_data_iterator = islice(y_data_iterator, 1, None, 2)
    n_dialogs //= 2  # integer division keeps the count usable as an array size

    y_data_iterator, y_data_iterator_for_context = file_buffered_tee(y_data_iterator)
    x_data_iterator = _get_x_data_iterator_with_context(x_data_iterator, y_data_iterator_for_context)

    _logger.info('Iterating through lines to get input matrix')
    x_ids = transform_contexts_to_token_ids(
        x_data_iterator, token_to_index, INPUT_SEQUENCE_LENGTH, INPUT_CONTEXT_SIZE, max_contexts_num=n_dialogs)

    _logger.info('Iterating through lines to get output matrix')
    y_ids = transform_lines_to_token_ids(
        y_data_iterator, token_to_index, OUTPUT_SEQUENCE_LENGTH, n_dialogs, add_start_end=True)
    return x_ids, y_ids, n_dialogs
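
Every example on this page relies on file_buffered_tee, which behaves like itertools.tee but buffers the shared stream through a file on disk, so the copies can be consumed independently without holding the whole dataset in memory. The helper itself is not shown on this page; below is a minimal sketch of the idea, assuming the items are JSON-serializable lines (the JSON-lines encoding and temp-file handling are illustrative assumptions, not the library's actual internals).

import json
import tempfile

def file_buffered_tee(iterable, n=2):
    # Spill the stream to a temp file once (one JSON document per line),
    # then hand out n independent readers over that file.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as tmp:
        for item in iterable:
            tmp.write(json.dumps(item) + '\n')
        path = tmp.name

    def reader():
        with open(path) as fh:
            for line in fh:
                yield json.loads(line)

    # Cleanup of the temp file is omitted for brevity.
    return tuple(reader() for _ in range(n))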
Example #2
# gensim import needed by this snippet; _logger, MIN_WORD_FREQ, _WORKERS_NUM,
# get_w2v_params_str and file_buffered_tee come from the surrounding module.
from gensim.models import Word2Vec

def _train_model(tokenized_lines, voc_size, vec_size, window_size, skip_gram):
    _logger.info(
        'Word2Vec model will be trained now. It can take a while, so relax and have fun.'
    )

    params_str = get_w2v_params_str(voc_size, vec_size, window_size, skip_gram)
    _logger.info('Parameters for training: %s' % params_str)

    model = Word2Vec(window=window_size,
                     size=vec_size,
                     max_vocab_size=voc_size,
                     min_count=MIN_WORD_FREQ,
                     workers=_WORKERS_NUM,
                     sg=skip_gram)

    tokenized_lines_for_voc, tokenized_lines_for_train = file_buffered_tee(
        tokenized_lines)

    model.build_vocab(tokenized_lines_for_voc)
    model.train(tokenized_lines_for_train)

    # forget the original vectors and only keep the normalized ones = saves lots of memory
    # https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec.init_sims
    model.init_sims(replace=True)

    return model
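
A caveat if you run this against a current gensim: the keyword arguments above date from gensim 3.x. In gensim 4.x, size was renamed to vector_size, train() must be given explicit total_examples and epochs, and init_sims() is deprecated (vector norms are computed lazily on model.wv). A rough 4.x equivalent of the training calls, under the same module-level constants:

from gensim.models import Word2Vec

model = Word2Vec(window=window_size,
                 vector_size=vec_size,  # named 'size' in gensim 3.x
                 max_vocab_size=voc_size,
                 min_count=MIN_WORD_FREQ,
                 workers=_WORKERS_NUM,
                 sg=skip_gram)
model.build_vocab(tokenized_lines_for_voc)
# Note: for more than one epoch the corpus must be restartable,
# not a one-shot generator.
model.train(tokenized_lines_for_train,
            total_examples=model.corpus_count,
            epochs=model.epochs)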
Example #3
# Imports needed by this snippet (Python 2: imap lives in itertools);
# file_buffered_tee comes from the surrounding module.
from itertools import imap
from operator import itemgetter

def get_dialog_lines_and_conditions(dialog_lines, text_field_name, condition_field_name):
    """
    Splits one dialog_lines generator into two generators - one for conditions and one for dialog lines
    """
    conditions_iter, dialog_lines_iter = file_buffered_tee(
        imap(lambda line: [line[condition_field_name], line[text_field_name]], dialog_lines))
    conditions_iter = imap(itemgetter(0), conditions_iter)
    dialog_lines_iter = imap(itemgetter(1), dialog_lines_iter)
    return dialog_lines_iter, conditions_iter
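
For reference, a hypothetical call site (the field names and the consumer are illustrative): both returned iterators yield one element per dialog line, in the original order, so they can be zipped back together downstream.

from itertools import izip  # Python 2; use the built-in zip on Python 3

lines_iter, conditions_iter = get_dialog_lines_and_conditions(
    dialog_lines, text_field_name='text', condition_field_name='condition')
for text, condition in izip(lines_iter, conditions_iter):
    handle_line(text, condition)  # hypothetical consumer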
Example #4
# Python 3 variant; itemgetter is the only import this snippet needs.
from operator import itemgetter

def get_dialog_lines_and_conditions(dialog_lines, text_field_name,
                                    condition_field_name):
    """
    Splits one dialog_lines generator into two materialized lists - one for conditions and one for dialog lines
    """
    conditions_iter, dialog_lines_iter = file_buffered_tee(
        [[line[condition_field_name], line[text_field_name]]
         for line in dialog_lines])
    conditions_iter = list(map(itemgetter(0), conditions_iter))
    dialog_lines_iter = list(map(itemgetter(1), dialog_lines_iter))
    return dialog_lines_iter, conditions_iter
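
Unlike the previous variant, this one materializes everything: the list comprehension drains dialog_lines before file_buffered_tee is even called, and the list(map(...)) wrappers load both outputs into memory, trading the streaming benefit for re-iterability and len(). If laziness matters on Python 3, map is already an iterator, so dropping the list() calls restores the behavior of the imap version:

conditions_iter = map(itemgetter(0), conditions_iter)       # lazy on Python 3
dialog_lines_iter = map(itemgetter(1), dialog_lines_iter)   # lazy on Python 3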