# parser.add_argument('--encoding_output_folder', type=str, help='Will be used for training')

# Parse the command-line arguments; `parser` is built earlier in this file.
arg_p = parser.parse_args()

print('Building vocabulary...')

build_vocabulary(arg_p.training_filename)

print('Vectorization...')

DATA_LOADER = LazyDataLoader(arg_p.training_filename)

# statistics() returns three values; only the record count is needed here,
# so the two max-length values are discarded.
_, _, training_records_count = DATA_LOADER.statistics()

# TOKEN_INDICES = get_token_indices()

chars, c_table = get_chars_and_ctable()

inputs = []
targets = []
print('Generating data...')
# Drain the loader one record at a time; `_` because the index is unused.
# NOTE(review): no padding happens here despite what the old comment claimed --
# samples are appended exactly as the loader yields them.
for _ in tqdm(range(training_records_count), desc='Generating inputs and targets'):
    x_, y_ = DATA_LOADER.next()
    inputs.append(x_)
    targets.append(y_)

# Persist the (inputs, targets) pair as one compressed archive for the
# downstream training step.
np.savez_compressed('/tmp/x_y.npz', inputs=inputs, targets=targets)

print('Done... File is /tmp/x_y.npz')
# Read the CLI options configured by `parser` earlier in this file.
arg_p = parser.parse_args()

print("Building vocabulary...")

build_vocabulary(arg_p.training_filename)

print("Vectorization...")

DATA_LOADER = LazyDataLoader(arg_p.training_filename)

# Only the third statistic (record count) matters for the loop below.
_, _, training_records_count = DATA_LOADER.statistics()

# TOKEN_INDICES = get_token_indices()

chars, c_table = get_chars_and_ctable()

print("Generating data...")
inputs = []
targets = []
progress = tqdm(range(training_records_count),
                desc='Generating inputs and targets')
# Collect every (input, target) sample the loader can produce.
for _ in progress:
    sample_in, sample_out = DATA_LOADER.next()
    inputs.append(sample_in)
    targets.append(sample_out)

# Write both arrays into a single compressed .npz for the training stage.
np.savez_compressed('/tmp/x_y.npz', inputs=inputs, targets=targets)

print("Done... File is /tmp/x_y.npz")
# --- Second vectorization variant: optional noise injection and input reversal ---
print('Vectorization...')

# NOTE(review): unlike the loader above, this one is constructed without a
# filename -- presumably it falls back to a default training file; confirm.
DATA_LOADER = LazyDataLoader()

INPUT_MAX_LEN, OUTPUT_MAX_LEN, TRAINING_SIZE = DATA_LOADER.statistics()

TOKEN_INDICES = get_TOKEN_INDICES()

chars, c_table = get_chars_and_ctable()

questions = []
expected = []
print('Generating data...')
# Build (question, answer) pairs until TRAINING_SIZE samples are collected.
# The loop body continues beyond this chunk (expected/answer handling not visible).
while len(questions) < TRAINING_SIZE:
    x, y = DATA_LOADER.next()
    # NOTE(review): the original comment claimed padding to MAXLEN, but no
    # padding occurs in this visible span -- confirm where padding happens.
    q = x
    query = q
    ans = y

    if ADD_NOISE_TO_DATA:
        # Corrupt the query with character-level noise per NOISE_PROBS;
        # the second return value (noise type) is discarded.
        #print('Old query =', query, end='  |   ')
        query, _ = add_noise_to_data(input_str=query, probs=NOISE_PROBS, vocabulary=chars)
        #print('Query =', query, '  |   Noise type =', noise_type)

    if INVERT:
        # Reverse the input sequence (a common seq2seq trick).
        query = query[::-1]

    questions.append(query)