Example #1
def load_data_from_jsons_stream(FLAGS,
                                input_data_file,
                                input_kb_file,
                                output_vab,
                                output_all_vab,
                                gen_cat,
                                cat_files,
                                self_play_start_turn=None):
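    """Streaming loader: yields one reorganized example per (dialogue, kb) pair."""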
    vocal_map = {}
    sent_tokenize = nltk.sent_tokenize

    for raw_data, raw_kb in tqdm(load_and_drop_stream(
            input_data_file,
            input_kb_file,
            drop_incorrect=not FLAGS.keep_incorrect,
            verbose=FLAGS.verbose),
                                 desc='processing stream'):
        # has to be there no matter what
        if raw_kb is not None:
            processed_kb, vocal_map = process_kb([raw_kb],
                                                 vocal_map,
                                                 stream=True)
        else:
            processed_kb = [['no_res']]
        # if dialogue, everything will be there.
        # if context, only intents, actions, vocal_map will be there
        result = process_main_data([raw_data],
                                   sent_tokenize,
                                   word_tokenize,
                                   vocal_map,
                                   stream=True,
                                   input_type=FLAGS.input_type,
                                   self_play_start_turn=self_play_start_turn)
        intents, actions, expected_actions, dialogues, vocal_map, boundaries1, boundaries2, cats = result
        frequency_cutoff = FLAGS.word_cutoff
        # 3 is the number of special tokens
        # if FLAGS.verbose: print 'vocabulary before cutoff', len(vocal_map) + 3
        # vocal_map = write_vocabulary(output_vab, output_all_vab, vocal_map,
        #                             frequency_cutoff, FLAGS.keep_non_ascii)
        # print("CC")
        # print(vocal_map)
        if gen_cat:
            if FLAGS.verbose:
                print('writing category')
            write_cat(cat_files, cats)

        if FLAGS.verbose:
            print(
                'frequency_cutoff= {0}, vocabulary after cutoff'.format(
                    frequency_cutoff), len(vocal_map))
        data = reorganize_data(intents, actions, expected_actions, dialogues,
                               processed_kb, boundaries1, boundaries2)[0]
        yield data
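
A minimal call sketch for the streaming loader above; the flag object, file paths, and argument values are illustrative assumptions, not values taken from the snippet:

from argparse import Namespace

# Hypothetical flag values mirroring only the attributes the function reads.
flags = Namespace(keep_incorrect=False, verbose=True,
                  input_type='dialogue', word_cutoff=10)

stream = load_data_from_jsons_stream(
    flags,
    'train_data.json',    # hypothetical dialogue file
    'train_kb.json',      # hypothetical knowledge-base file
    output_vab=None,      # vocabulary writing is commented out in this variant
    output_all_vab=None,
    gen_cat=False,
    cat_files=None)

for example in stream:
    pass  # one reorganized example is yielded per (dialogue, kb) pair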
Example #2
def load_data_from_jsons(FLAGS, input_data_file, input_kb_file, output_vab,
                         output_all_vab, gen_cat, cat_files):
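    """Loads the data and KB files, writes the vocabulary, and returns the reorganized data."""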
    vocal_map = {}
    sent_tokenize = nltk.sent_tokenize

    raw_data, raw_kb = load_and_drop(input_data_file,
                                     input_kb_file,
                                     drop_incorrect=not FLAGS.keep_incorrect,
                                     verbose=FLAGS.verbose)
    # has to be there no matter what
    if FLAGS.verbose:
        print('processing kb')
    processed_kb, vocal_map = process_kb(raw_kb, vocal_map)
    # if dialogue, everything will be there.
    # if context, only intents, actions, vocal_map will be there
    if FLAGS.verbose:
        print('processing data')
    result = process_main_data(raw_data,
                               sent_tokenize,
                               word_tokenize,
                               vocal_map,
                               input_type=FLAGS.input_type)
    intents, actions, expected_actions, dialogues, vocal_map, boundaries1, boundaries2, cats = result
    frequency_cutoff = FLAGS.word_cutoff
    # 3 is the number of special tokens
    if FLAGS.verbose:
        print('vocabulary before cutoff', len(vocal_map) + 3)
    vocal_map = write_vocabulary(output_vab, output_all_vab, vocal_map,
                                 frequency_cutoff, FLAGS.keep_non_ascii)
    if gen_cat:
        if FLAGS.verbose:
            print('writing category')
        write_cat(cat_files, cats)

    if FLAGS.verbose:
        print(
            'frequency_cutoff= {0}, vocabulary after cutoff'.format(
                frequency_cutoff), len(vocal_map))
    data = reorganize_data(intents, actions, expected_actions, dialogues,
                           processed_kb, boundaries1, boundaries2)
    return data
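
A minimal call sketch for the batch loader above; all paths and flag values are assumptions made for illustration:

from argparse import Namespace

flags = Namespace(keep_incorrect=False, verbose=True,
                  input_type='dialogue', word_cutoff=10,
                  keep_non_ascii=False)

data = load_data_from_jsons(
    flags,
    input_data_file='train_data.json',   # hypothetical
    input_kb_file='train_kb.json',       # hypothetical
    output_vab='vocab.txt',              # hypothetical
    output_all_vab='full.vocab',         # hypothetical
    gen_cat=True,
    cat_files=['firstname.cat', 'lastname.cat',
               'flight.cat', 'status.cat'])  # hypothetical category outputs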
Example #3
def main(FLAGS):
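    """Runs the full preprocessing pipeline and writes the requested output files."""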
    all_jobs = process_job_type(FLAGS.job_type, FLAGS.input_type)
    output_dir = FLAGS.output_dir
    if FLAGS.verbose:
        print('all_jobs', all_jobs)
        print('input_type', FLAGS.input_type)
        print('output_dir', output_dir)
        print('data_file', FLAGS.data_file)
        print('kb_file', FLAGS.kb_file)
        print('output_prefix', FLAGS.output_prefix)
        print('skip_standardize', FLAGS.skip_standardize)
        print('keep_incorrect', FLAGS.keep_incorrect)
        print('word_cutoff', FLAGS.word_cutoff)
        print('gen_voc', FLAGS.gen_voc)

    if not tf.io.gfile.isdir(output_dir):
        gfile.MkDir(output_dir)

    input_data_file = FLAGS.data_file
    input_kb_file = FLAGS.kb_file
    if len(FLAGS.output_prefix.strip()) == 0:
        FLAGS.output_prefix = ''
    # output_vab = output_dir + '/{0}.vocab'.format(FLAGS.output_prefix)
    output_vab = output_dir + '/vocab.txt'
    output_all_vab = output_dir + '/{0}.full.vocab'.format(FLAGS.output_prefix)
    all_token_file = output_dir + '/{0}.special.vocab'.format(
        FLAGS.output_prefix)
    first_name_cats_file = output_dir + '/{0}.firstname.cat'.format(
        FLAGS.output_prefix)
    last_name_cats_file = output_dir + '/{0}.lastname.cat'.format(
        FLAGS.output_prefix)
    flight_cats_file = output_dir + '/{0}.flight.cat'.format(
        FLAGS.output_prefix)
    status_cats_file = output_dir + '/{0}.status.cat'.format(
        FLAGS.output_prefix)

    output_data_pattern = output_dir + '/{0}data'
    output_kb_pattern = output_dir + '/{0}kb'

    nltk_path = FLAGS.ntlk_data
    nltk.data.path.append(nltk_path)
    sent_tokenize = nltk.sent_tokenize

    vocal_map = {}
    # load data and do standardization
    if not FLAGS.skip_standardize:
        raw_data, raw_kb = standardize_and_drop(
            input_data_file,
            input_kb_file,
            drop_incorrect=not FLAGS.keep_incorrect,
            verbose=FLAGS.verbose)
    else:
        raw_data, raw_kb = load_and_drop(
            input_data_file,
            input_kb_file,
            drop_incorrect=not FLAGS.keep_incorrect,
            verbose=FLAGS.verbose)
    # has to be there no matter what
    if FLAGS.verbose: print('processing kb')
    processed_kb, vocal_map = process_kb(raw_kb, vocal_map)
    # if dialogue, everything will be there.
    # if context, only intents, actions, vocal_map will be there
    if FLAGS.verbose: print('processing data')
    result = process_main_data(raw_data,
                               sent_tokenize,
                               word_tokenize,
                               vocal_map,
                               input_type=FLAGS.input_type)
    intents, actions, expected_actions, dialogues, vocal_map, boundaries1, boundaries2, cats = result
    frequency_cutoff = FLAGS.word_cutoff
    # 3 is the number of special tokens
    if FLAGS.verbose: print('vocabulary before cutoff', len(vocal_map) + 3)
    if not FLAGS.gen_voc:
        # if we choose not to generate vocabulary file, we set output_vab to None
        output_vab = None
    if not FLAGS.gen_voc_map:
        output_all_vab = None
    vocal_map = write_vocabulary(output_vab, output_all_vab, vocal_map,
                                 frequency_cutoff, FLAGS.keep_non_ascii)
    if FLAGS.gen_cat:
        if FLAGS.verbose: print('writing category')
        cat_files = [
            first_name_cats_file, last_name_cats_file, flight_cats_file,
            status_cats_file
        ]
        write_cat(cat_files, cats)

    if FLAGS.verbose:
        print(
            'frequency_cutoff= {0}, vocabulary after cutoff'.format(
                frequency_cutoff), len(vocal_map))
    data = reorganize_data(intents, actions, expected_actions, dialogues,
                           processed_kb, boundaries1, boundaries2)

    if 'train' in all_jobs:
        if FLAGS.verbose:
            print('writing train data')
        write_data(data, output_data_pattern.format(FLAGS.output_prefix + '.'),
                   output_kb_pattern.format(FLAGS.output_prefix + '.'))
    if 'eval' in all_jobs:
        if FLAGS.verbose:
            print('writing eval data')
        write_data(data,
                   output_data_pattern.format(FLAGS.output_prefix + '.eval.'),
                   output_kb_pattern.format(FLAGS.output_prefix + '.eval.'))
    if 'infer' in all_jobs:
        if FLAGS.verbose: print('writing infer data')
        write_completion(
            data,
            output_data_pattern.format(FLAGS.output_prefix + '.infer.src.'),
            output_data_pattern.format(FLAGS.output_prefix + '.infer.tar.'),
            output_kb_pattern.format(FLAGS.output_prefix + '.infer.'))
    if 'sp-train' in all_jobs:
        if FLAGS.verbose: print('writing self play training data')
        write_self_play(
            data,
            output_data_pattern.format(FLAGS.output_prefix + '.selfplay.'),
            output_kb_pattern.format(FLAGS.output_prefix + '.selfplay.'))
    if 'sp-eval' in all_jobs:
        if FLAGS.verbose: print('writing self play eval data')
        write_self_play(
            data,
            output_data_pattern.format(FLAGS.output_prefix +
                                       '.selfplay.eval.'),
            output_kb_pattern.format(FLAGS.output_prefix + '.selfplay.eval.'))

    if FLAGS.gen_special_token:
        # write all token file.
        f_tokens = gfile.Open(all_token_file, 'w')
        for token in list(list_of_action_tokens_except_name):
            f_tokens.write(token + '\n')
        f_tokens.close()
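
A minimal sketch of driving main() above with an argparse-built flag object; the original module presumably defines these flags elsewhere, so the names below simply mirror the attributes main() reads and every default value is an assumption:

import argparse

def build_flags():
    # Flag names mirror the FLAGS attributes accessed in main(); defaults are assumptions.
    parser = argparse.ArgumentParser()
    parser.add_argument('--job_type', default='train')
    parser.add_argument('--input_type', default='dialogue')
    parser.add_argument('--output_dir', default='./out')
    parser.add_argument('--data_file', default='train_data.json')
    parser.add_argument('--kb_file', default='train_kb.json')
    parser.add_argument('--output_prefix', default='train')
    parser.add_argument('--word_cutoff', type=int, default=10)
    parser.add_argument('--ntlk_data', default='./nltk_data')
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--skip_standardize', action='store_true')
    parser.add_argument('--keep_incorrect', action='store_true')
    parser.add_argument('--keep_non_ascii', action='store_true')
    parser.add_argument('--gen_voc', action='store_true')
    parser.add_argument('--gen_voc_map', action='store_true')
    parser.add_argument('--gen_cat', action='store_true')
    parser.add_argument('--gen_special_token', action='store_true')
    return parser.parse_args()

if __name__ == '__main__':
    main(build_flags())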