Example #1
import sys


def main():
    # Command-line arguments: model number and the name of the dataset split to translate.
    model = int(sys.argv[1])
    file_name = sys.argv[2]
    # Load the English sentences and their phoneme-sequence targets for the chosen split.
    val_inp = text_retrieve(file_name + '.en')
    val_tar = text_retrieve(file_name + '.phone')
    inp_lines, tar_lines, pred_lines = [], [], []
    for i in range(len(val_inp)):
        inp = str(val_inp[i])
        tar = str(val_tar[i])
        # Skip sentences that the model fails to translate instead of aborting the whole run.
        try:
            pred = translate(inp, model)
        except Exception:
            continue
        print(i)
        print('Input sentence: ', preprocess_inp_tar(inp))
        print('Target sentence: ', preprocess_inp_tar(tar))
        print('Predicted sentence: ', pred)
        print()
        inp_lines.append(preprocess_inp_tar(inp))
        tar_lines.append(preprocess_inp_tar(tar))
        pred_lines.append(pred)
    # Join the collected sentences and write the inputs, targets and predictions to disk.
    inp_text = lines_to_text(inp_lines, '\n')
    tar_text = lines_to_text(tar_lines, '\n')
    pred_text = lines_to_text(pred_lines, '\n')
    text_save(inp_text,
              'model_' + str(model) + '/predictions/' + file_name + '_inp.txt')
    text_save(tar_text,
              'model_' + str(model) + '/predictions/' + file_name + '_tar.txt')
    text_save(
        pred_text,
        'model_' + str(model) + '/predictions/' + file_name + '_pred.txt')
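The helpers used above (text_retrieve, translate, preprocess_inp_tar, lines_to_text, text_save) are defined elsewhere in the project. As a rough sketch of the file I/O side only, the three text helpers might look like the code below; the exact signatures and any base-directory handling in the real project may differ.

def text_retrieve(file_name):
    # Read a text file and return its contents as a list of lines.
    with open(file_name, 'r', encoding='utf-8') as f:
        return f.read().split('\n')


def lines_to_text(lines, separator):
    # Join a list of sentences into a single string using the given separator.
    return separator.join(lines)


def text_save(text, path):
    # Write a string to the given path.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)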
Example #2
import sys

import tensorflow as tf


def main():
    print()
    # Model number from the command line.
    model = int(sys.argv[1])
    # Load the gloss (input) and English (target) splits from the 'spt-tokenized' directory.
    train_inp = text_retrieve('spt-tokenized/train.gloss')
    val_inp = text_retrieve('spt-tokenized/val.gloss')
    test_inp = text_retrieve('spt-tokenized/test.gloss')
    train_tar = text_retrieve('spt-tokenized/train.en')
    val_tar = text_retrieve('spt-tokenized/val.en')
    test_tar = text_retrieve('spt-tokenized/test.en')
    print('No. of original sentences in Training set: ', len(train_inp))
    print('No. of original sentences in Validation set: ', len(val_inp))
    print('No. of original sentences in Test set: ', len(test_inp))
    print()
    # Rebuild each split, keeping sentence pairs within the max_length limit.
    max_length = 40
    train_inp, train_tar = create_new_dataset(train_inp, train_tar, max_length)
    val_inp, val_tar = create_new_dataset(val_inp, val_tar, max_length)
    test_inp, test_tar = create_new_dataset(test_inp, test_tar, max_length)
    print('No. of new sentences in Training set: ', len(train_inp))
    print('No. of new sentences in Validation set: ', len(val_inp))
    print('No. of new sentences in Test set: ', len(test_inp))
    print()
    # Fit tokenizers on each side and convert every split to integer sequences.
    inp_lang, train_inp, val_inp, test_inp = tokenize(train_inp, val_inp, test_inp, max_length)
    tar_lang, train_tar, val_tar, test_tar = tokenize(train_tar, val_tar, test_tar, max_length)
    print('Input Vocabulary size: ', len(inp_lang.word_index) + 1)
    print('Target Vocabulary size: ', len(tar_lang.word_index) + 1)
    print()
    batch_size = 128
    # Persist the word/index lookups so they can be reused at inference time.
    save_file(inp_lang.word_index, 'model_' + str(model) + '/utils/inp-word-index')
    save_file(inp_lang.index_word, 'model_' + str(model) + '/utils/inp-index-word')
    save_file(tar_lang.word_index, 'model_' + str(model) + '/utils/tar-word-index')
    save_file(tar_lang.index_word, 'model_' + str(model) + '/utils/tar-index-word')
    # Collect the hyperparameters and bookkeeping values used during training and testing.
    parameters = {'inp_vocab_size': len(inp_lang.word_index) + 1, 'tar_vocab_size': len(tar_lang.word_index) + 1,
                  'emb_size': 512, 'rnn_size': 512, 'batch_size': batch_size, 'epochs': 30,
                  'train_steps_per_epoch': len(train_inp) // batch_size, 'rate': 0.3,
                  'val_steps_per_epoch': len(val_inp) // batch_size, 'test_steps': len(test_inp) // batch_size,
                  'max_length': max_length, 'model': model}
    save_file(parameters, 'model_' + str(model) + '/utils/parameters')
    print()
    print('No. of Training steps per epoch: ', parameters['train_steps_per_epoch'])
    print('No. of Validation steps per epoch: ', parameters['val_steps_per_epoch'])
    print('No. of Testing steps: ', parameters['test_steps'])
    print()
    # Build shuffled, fixed-size batches for training, validation and testing.
    train_dataset = tf.data.Dataset.from_tensor_slices((train_inp, train_tar)).shuffle(len(train_inp))
    train_dataset = train_dataset.batch(batch_size, drop_remainder=True)
    val_dataset = tf.data.Dataset.from_tensor_slices((val_inp, val_tar)).shuffle(len(val_inp))
    val_dataset = val_dataset.batch(batch_size, drop_remainder=True)
    test_dataset = tf.data.Dataset.from_tensor_slices((test_inp, test_tar)).shuffle(len(test_inp))
    test_dataset = test_dataset.batch(batch_size, drop_remainder=True)
    print('Model training started')
    print()
    model_training(train_dataset, val_dataset, parameters)
    model_testing(test_dataset, parameters)
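The tokenize helper called above is not shown in this example. A minimal sketch of how it might be implemented with the Keras Tokenizer and post-padding is given below; the function name and signature follow the call site, but the implementation itself is an assumption.

import tensorflow as tf


def tokenize(train_lines, val_lines, test_lines, max_length):
    # Fit a word-level tokenizer on the training split only.
    lang = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang.fit_on_texts(train_lines)
    # Convert every split to integer sequences and pad them to max_length.
    padded = []
    for lines in (train_lines, val_lines, test_lines):
        sequences = lang.texts_to_sequences(lines)
        padded.append(tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=max_length, padding='post'))
    return lang, padded[0], padded[1], padded[2]

The '+ 1' added to len(word_index) when computing the vocabulary sizes accounts for the padding index 0, which the Keras Tokenizer reserves and never assigns to a word.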
Example #3
def main():
    print()
    # Load the raw gloss (input) and English (target) splits.
    train_inp = text_retrieve('train.gloss')
    val_inp = text_retrieve('val.gloss')
    test_inp = text_retrieve('test.gloss')
    train_tar = text_retrieve('train.en')
    val_tar = text_retrieve('val.en')
    test_tar = text_retrieve('test.en')
    print('No. of original sentences in Training set: ', len(train_inp))
    print('No. of original sentences in Validation set: ', len(val_inp))
    print('No. of original sentences in Test set: ', len(test_inp))
    print()
    # Build the tokenizer over all splits with a 4,000-token vocabulary and a 40-token length limit.
    vocab_size = 4000
    max_length = 40
    tokenizer(train_inp, train_tar, val_inp, val_tar, test_inp, test_tar,
              vocab_size, max_length)
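The tokenizer call above performs the actual vocabulary construction. Assuming the project uses SentencePiece (the 'spt-tokenized' paths in Example #2 hint at this, but it is not confirmed here), such a helper could look roughly like the sketch below; the corpus file name, output directory, shared vocabulary for both sides and the default unigram model type are all assumptions.

import os

import sentencepiece as spm


def tokenizer(train_inp, train_tar, val_inp, val_tar, test_inp, test_tar,
              vocab_size, max_length):
    # Write the training sentences to a temporary corpus file for SentencePiece.
    with open('sp-corpus.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(train_inp + train_tar))
    # Train a SentencePiece model with the requested vocabulary size.
    spm.SentencePieceTrainer.train(input='sp-corpus.txt', model_prefix='spt',
                                   vocab_size=vocab_size)
    sp = spm.SentencePieceProcessor(model_file='spt.model')
    # Encode each split as subword pieces, truncated to max_length tokens.
    os.makedirs('spt-tokenized', exist_ok=True)
    splits = [('train.gloss', train_inp), ('val.gloss', val_inp), ('test.gloss', test_inp),
              ('train.en', train_tar), ('val.en', val_tar), ('test.en', test_tar)]
    for name, lines in splits:
        encoded = [' '.join(sp.encode(line, out_type=str)[:max_length]) for line in lines]
        with open('spt-tokenized/' + name, 'w', encoding='utf-8') as f:
            f.write('\n'.join(encoded))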
Example #4
from random import shuffle


def main():
    print()
    loc = '/home/preetham/Documents/Preetham/masters-thesis/'
    files = text_retrieve('files_list.txt')
    print('No. of files in original dataset: ', len(files))
    print()
    shuffle(files)
    # Quick-test split: the first 1,000 shuffled files for training and the first 20 for validation and testing.
    train, val, test = files[:1000], files[:20], files[:20]
    print('No. of files in training dataset: ', len(train))
    print('No. of files in validation dataset: ', len(val))
    print('No. of files in testing dataset: ', len(test))
    print()
    # Load the target word index saved by the grapheme-to-phoneme model.
    inp_word_index = open_file(
        'results/grapheme-to-phoneme/luong/model_7/utils/tar-word-index.pkl')
    # Take a single small batch to sanity-check the decoder pre-net's output shapes.
    start_index = 0
    batch_size = 8
    train = train[start_index:start_index + batch_size]
    train_batch_inp, train_batch_tar = create_batch(train, inp_word_index)
    # Instantiate the decoder pre-net (256 units, dropout rate 0.1) and run one slice of the target batch through it in inference mode.
    dec_pre_net = DecoderPreNet(256, 0.1)
    print(train_batch_tar.shape)
    print(train_batch_tar[:, :, 0].shape)
    x = dec_pre_net(train_batch_tar[:, 0], False)
    print(x.shape)
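DecoderPreNet is defined elsewhere in the project. Judging from its constructor arguments (256 units, dropout rate 0.1) and the call pattern layer(inputs, training), it resembles a Tacotron-style decoder pre-net; the Keras layer below is a minimal sketch under that assumption, not the project's actual implementation.

import tensorflow as tf


class DecoderPreNet(tf.keras.layers.Layer):
    """Two fully connected ReLU layers with dropout, applied to each decoder input frame."""

    def __init__(self, units, rate):
        super().__init__()
        self.dense_1 = tf.keras.layers.Dense(units, activation='relu')
        self.dense_2 = tf.keras.layers.Dense(units, activation='relu')
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training):
        # Dropout is only active when training=True, matching the call in the example above.
        x = self.dropout(self.dense_1(x), training=training)
        x = self.dropout(self.dense_2(x), training=training)
        return x

Under this sketch, an input of shape (batch_size, feature_dim) maps to an output of shape (batch_size, units), which is what the final print(x.shape) in the example reports.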