def main(argv=None):
    """Converts a dataset to tfrecords."""
    args = commandLineParser.parse_args()

    if os.path.isdir(args.destination_dir):
        print 'destination directory exists. Exiting...'
        return
    else:
        os.makedirs(args.destination_dir)

    if not os.path.isdir('CMDs'):
        os.makedirs('CMDs')

    with open('CMDs/step_preprocess_data.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Load responses and prompts as sequences of word ids
    responses, _ = load_text(args.input_data_path, args.input_wlist_path)
    prompts, _ = load_text(args.input_prompt_path, args.input_wlist_path)

    # Load the raw prompt lines (topic strings)
    with open(args.input_prompt_path, 'r') as file:
        topics = [line.replace('\n', '') for line in file.readlines()]

    # Get the unique set of topics and their counts, sorted by descending count
    unique_topics, topic_counts = np.unique(topics, return_counts=True)
    topics = unique_topics[np.flip(np.argsort(topic_counts), 0)]
    topic_counts = np.flip(np.sort(topic_counts), 0)

    # Create dictionary for topics mapping sentence to topic id
    # Also create file of sorted topics and unigrams file
    # Unigram file later used for training
    topic_dict = {}
    with open(os.path.join(args.destination_dir, 'unigrams.txt'),
              'w') as ufile:
        with open(os.path.join(args.destination_dir, 'sorted_topics.txt'),
                  'w') as tfile:
            for i, (topic, count) in enumerate(zip(topics, topic_counts)):
                topic_dict[topic] = i
                ufile.write(str(i) + ',' + str(int(count)) + '\n')
                tfile.write(topic + '\n')

    # Load up the grades and speakers
    grades = np.loadtxt(args.input_grade_path)
    with open(args.input_spkr_path, 'r') as file:
        speakers = np.asarray(
            [line.replace('\n', '') for line in file.readlines()])

    # Create a list of topic IDs for every response
    with open(args.input_prompt_path, 'r') as file:
        q_ids = np.asarray(
            [topic_dict[line.replace('\n', '')] for line in file.readlines()])

    ### Split data into train and validation data sets
    n = len(responses)
    train_size = int(n * (1.0 - args.valid_fraction))
    valid_size = n - train_size

    print 'Total dataset size', n, 'Train dataset size', train_size, 'Valid dataset size', valid_size

    # Fix the random seed so the train/validation split is reproducible
    np.random.seed(1000)

    permutation = np.random.choice(np.arange(n), n, replace=False)
    index_train = permutation[:train_size]
    index_valid = permutation[train_size:]

    trn_responses = responses[index_train]
    trn_prompts = prompts[index_train]
    trn_q_ids = q_ids[index_train]
    trn_speakers = speakers[index_train]
    trn_grades = grades[index_train]

    valid_responses = responses[index_valid]
    valid_prompts = prompts[index_valid]
    valid_q_ids = q_ids[index_valid]
    valid_speakers = speakers[index_valid]
    valid_grades = grades[index_valid]

    # Create the training TF Record file
    filename = 'relevance.train.tfrecords'
    print 'Writing', filename
    writer = tf.python_io.TFRecordWriter(
        os.path.join(args.destination_dir, filename))
    for response, prompt, q_id, grd, spkr in zip(trn_responses, trn_prompts,
                                                 trn_q_ids, trn_grades,
                                                 trn_speakers):
        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'targets': tfrecord_utils.float_feature([1.0]),
                    'grade': tfrecord_utils.float_feature([grd]),
                    'spkr': tfrecord_utils.bytes_feature([spkr]),
                    'q_id': tfrecord_utils.int64_feature([q_id])
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    'response': tfrecord_utils.int64_feature_list(response),
                    'prompt': tfrecord_utils.int64_feature_list(prompt)
                }))
        writer.write(example.SerializeToString())
    writer.close()

    # Create the validation TF Record file
    filename = 'relevance.valid.tfrecords'
    print 'Writing', filename
    writer = tf.python_io.TFRecordWriter(
        os.path.join(args.destination_dir, filename))
    for response, prompt, q_id, grd, spkr in zip(valid_responses,
                                                 valid_prompts, valid_q_ids,
                                                 valid_grades, valid_speakers):
        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'targets': tfrecord_utils.float_feature([1.0]),
                    'grade': tfrecord_utils.float_feature([grd]),
                    'spkr': tfrecord_utils.bytes_feature([spkr]),
                    'q_id': tfrecord_utils.int64_feature([q_id])
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    'response': tfrecord_utils.int64_feature_list(response),
                    'prompt': tfrecord_utils.int64_feature_list(prompt)
                }))
        writer.write(example.SerializeToString())
    writer.close()
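
# The tfrecord_utils helpers used above are defined in a separate module.
# A minimal sketch of what such a module might contain, built on the standard
# tf.train proto wrappers (the exact signatures are assumptions inferred from
# how the helpers are called in these scripts):
#
#     def float_feature(values):
#         return tf.train.Feature(float_list=tf.train.FloatList(value=values))
#
#     def bytes_feature(values):
#         return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))
#
#     def int64_feature(values):
#         return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
#
#     def int64_feature_list(values):
#         return tf.train.FeatureList(
#             feature=[int64_feature([v]) for v in values])
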
def main(args):
    """Converts a dataset to tfrecords."""

    if os.path.isdir(args.destination_dir):
        usr_response = raw_input(
            "Destination directory exists. Type 'y' if you want to overwrite it: "
        )
        if usr_response != 'y':
            print('destination directory {} exists.\nExiting...'.format(
                args.destination_dir))
            exit()
    else:
        os.makedirs(args.destination_dir)

    shutil.copyfile(args.input_wlist_path,
                    os.path.join(args.destination_dir, 'input.wlist.index'))

    # Get the paths to the relevant files
    responses_path = os.path.join(args.data_dir, args.responses_file)
    prompts_path = os.path.join(args.data_dir, args.prompts_file)
    grades_path = os.path.join(args.data_dir, args.grades_file)
    speakers_path = os.path.join(args.data_dir, args.speakers_file)
    predictions_path = os.path.join(args.data_dir, args.predictions_file)
    targets_path = os.path.join(args.data_dir, args.targets_file)
    dirich_params_path = os.path.join(args.data_dir, args.dirich_params_file)

    required_files = [
        responses_path, prompts_path, grades_path, speakers_path,
        predictions_path, targets_path, dirich_params_path
    ]

    # Assert the required files exist
    for path in required_files:
        if not os.path.isfile(path):
            print('File: {} does not exist. Exiting...'.format(path))
            exit()

    # Cache the command:
    if not os.path.isdir(os.path.join(args.destination_dir, 'CMDs')):
        os.makedirs(os.path.join(args.destination_dir, 'CMDs'))
    with open(os.path.join(args.destination_dir, 'CMDs/preprocessing.cmd'),
              'a') as cmd_cache:
        cmd_cache.write(' '.join(sys.argv) + '\n')
        cmd_cache.write('--------------------------------\n')

    # Load responses and prompts as sequences of word ids
    responses, _ = load_text(responses_path,
                             args.input_wlist_path,
                             strip_start_end=args.remove_sentence_tags)
    prompts, _ = load_text(prompts_path,
                           args.input_wlist_path,
                           strip_start_end=args.remove_sentence_tags)

    # Load up the speakers and grades
    with open(grades_path, 'r') as file:
        grades = np.asarray(
            [line.replace('\n', '') for line in file.readlines()])
    with open(speakers_path, 'r') as file:
        speakers = np.asarray(
            [line.replace('\n', '') for line in file.readlines()])
    # Load the teacher predictions
    predictions = np.loadtxt(predictions_path, dtype=np.float32)
    # Load true targets
    targets = np.loadtxt(targets_path, dtype=np.float32)
    # Load max-likelihood Dirichlet parameters
    alphas = np.loadtxt(dirich_params_path, dtype=np.float32)

    # Create or load the topic ID dictionary:
    if args.sorted_topics_path == '':
        # Generate and save new topic ID mapping
        topic_dict = generate_topic_dict(prompts_path, args.destination_dir)
    else:
        # Load a pre-existing topic ID mapping
        topic_dict = load_topic_dict(args.sorted_topics_path)
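    # (generate_topic_dict and load_topic_dict are defined elsewhere; they
    # mirror the inline sorted-topics logic used in the other two snippets in
    # this file.)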

    # Create a list of topic IDs for every response
    with open(prompts_path, 'r') as file:
        # Use id -1 if topic not in topic_dict (not present in training set)
        q_ids = np.asarray([
            topic_dict.get(line.replace('\n', ''), -1)
            for line in file.readlines()
        ])

    if args.preprocessing_type == 'train':
        ### Split data into train and validation data sets
        n = len(responses)
        train_size = int(n * (1.0 - args.valid_fraction))
        valid_size = n - train_size

        print('Total dataset size', n, 'Train dataset size', train_size,
              'Valid dataset size', valid_size)

        np.random.seed(args.rand_seed)

        permutation = np.random.choice(np.arange(n), n, replace=False)
        index_train = permutation[:train_size]
        index_valid = permutation[train_size:]

        trn_responses = responses[index_train]
        trn_prompts = prompts[index_train]
        trn_q_ids = q_ids[index_train]
        trn_speakers = speakers[index_train]
        trn_grades = grades[index_train]
        trn_targets = targets[index_train]
        trn_predictions = predictions[index_train]
        trn_alphas = alphas[index_train]

        print("Number training examples: {}".format(len(trn_responses)))

        valid_responses = responses[index_valid]
        valid_prompts = prompts[index_valid]
        valid_q_ids = q_ids[index_valid]
        valid_speakers = speakers[index_valid]
        valid_grades = grades[index_valid]
        valid_targets = targets[index_valid]
        valid_predictions = predictions[index_valid]
        valid_alphas = alphas[index_valid]

        print("Number validation examples: {}".format(len(valid_responses)))

        # Create the training TF Record file
        write_to_tfrecords('relevance.train.tfrecords',
                           args.destination_dir,
                           trn_responses,
                           trn_prompts,
                           trn_q_ids,
                           trn_grades,
                           trn_speakers,
                           trn_targets,
                           trn_predictions,
                           trn_alphas,
                           debug=args.debug)

        # Create the validation TF Record file
        write_to_tfrecords('relevance.valid.tfrecords',
                           args.destination_dir,
                           valid_responses,
                           valid_prompts,
                           valid_q_ids,
                           valid_grades,
                           valid_speakers,
                           valid_targets,
                           valid_predictions,
                           valid_alphas,
                           debug=args.debug)

        # Write a metadata file for convenience:
        with open(os.path.join(args.destination_dir, 'dataset_meta.txt'),
                  'w') as meta_file:
            meta_string = 'num_examples_train:\t{}\nnum_examples_valid:\t{}\nnum_unique_topics:\t{}'.format(
                len(trn_responses), len(valid_responses), len(topic_dict))
            meta_file.write(meta_string)

    elif args.preprocessing_type == 'test':
        write_to_tfrecords('relevance.test.tfrecords',
                           args.destination_dir,
                           responses,
                           prompts,
                           q_ids,
                           grades,
                           speakers,
                           targets,
                           predictions,
                           alphas,
                           debug=args.debug)

        # Write a metadata file for convenience:
        with open(os.path.join(args.destination_dir, 'dataset_meta.txt'),
                  'w') as meta_file:
            meta_string = 'num_examples:\t{}\nnum_unique_topics:\t{}'.format(
                len(responses), len(topic_dict))
            meta_file.write(meta_string)

    print('Finished')
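
# write_to_tfrecords is defined elsewhere in this module; below is a minimal
# sketch of what it presumably does, mirroring the inline writer loops in the
# other two snippets. It assumes the module-level imports (os, numpy as np,
# tensorflow as tf, tfrecord_utils); the 'teacher_pred'/'alphas' feature
# names, the string-valued grade encoding and the debug handling are
# assumptions, not the original implementation.
def write_to_tfrecords(filename, destination_dir, responses, prompts, q_ids,
                       grades, speakers, targets, predictions, alphas,
                       debug=False):
    print('Writing', filename)
    writer = tf.python_io.TFRecordWriter(
        os.path.join(destination_dir, filename))
    for response, prompt, q_id, grd, spkr, tgt, pred, alpha in zip(
            responses, prompts, q_ids, grades, speakers, targets, predictions,
            alphas):
        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'targets': tfrecord_utils.float_feature(
                        np.atleast_1d(tgt).tolist()),
                    'grade': tfrecord_utils.bytes_feature([grd]),
                    'spkr': tfrecord_utils.bytes_feature([spkr]),
                    'q_id': tfrecord_utils.int64_feature([int(q_id)]),
                    'teacher_pred': tfrecord_utils.float_feature(
                        np.atleast_1d(pred).tolist()),
                    'alphas': tfrecord_utils.float_feature(
                        np.atleast_1d(alpha).tolist())
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    'response': tfrecord_utils.int64_feature_list(response),
                    'prompt': tfrecord_utils.int64_feature_list(prompt)
                }))
        if debug:
            print(example)
        writer.write(example.SerializeToString())
    writer.close()
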
def main(argv=None):
    """Converts a dataset to tfrecords."""
    args = commandLineParser.parse_args()

    if not os.path.isdir(args.destination_dir):
        os.makedirs(args.destination_dir)

    if not os.path.isdir('CMDs'):
        os.makedirs('CMDs')

    with open('CMDs/step_preprocess_test_data.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Load responses and prompts as sequences of word ids
    responses, _ = load_text(args.input_data_path, args.input_wlist_path)
    prompts, _ = load_text(args.input_prompt_path, args.input_wlist_path)

    # Load up the grades, targets and speakers
    grades = np.loadtxt(args.input_grade_path)
    targets = np.loadtxt(args.input_tgt_path, dtype=np.float32)
    with open(args.input_spkr_path, 'r') as file:
        speakers = np.asarray(
            [line.replace('\n', '') for line in file.readlines()])

    # Load the sorted topics and (re)construct the topic dict mapping each
    # prompt word sequence to its q_id
    topic_dict = {}
    with open(args.sorted_topics_path, 'r') as tfile:
        for i, topic in enumerate(tfile.readlines()):
            topic_dict[topic.replace('\n', '')] = i

    # Load up the prompts as sequences of words and convert them to q_ids.
    # Fall back to q_id -1 for every response if a prompt is not present in
    # the sorted topics file (i.e. an unseen topic).
    try:
        with open(args.input_prompt_path, 'r') as file:
            q_ids = np.asarray([
                topic_dict[line.replace('\n', '')]
                for line in file.readlines()
            ])
    except KeyError:
        with open(args.input_prompt_path, 'r') as file:
            q_ids = np.asarray([-1 for line in file.readlines()])

    # Create the TF Record file
    filename = args.name + '.tfrecords'
    print 'Writing', filename

    writer = tf.python_io.TFRecordWriter(
        os.path.join(args.destination_dir, filename))
    for response, prompt, q_id, grd, spkr, tgt in zip(responses, prompts,
                                                      q_ids, grades, speakers,
                                                      targets):
        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'targets': tfrecord_utils.float_feature([tgt]),
                    'grade': tfrecord_utils.float_feature([grd]),
                    'spkr': tfrecord_utils.bytes_feature([spkr]),
                    'q_id': tfrecord_utils.int64_feature([q_id])
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    'response': tfrecord_utils.int64_feature_list(response),
                    'prompt': tfrecord_utils.int64_feature_list(prompt)
                }))
        writer.write(example.SerializeToString())
    writer.close()
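
# Not part of the original scripts: a minimal sketch of how the records
# written above could be parsed back by the consuming code, using the TF 1.x
# sequence-example parser. Feature names and dtypes mirror the writer loops
# above; the function and dataset names are illustrative only.
def parse_relevance_example(serialized_example):
    context_features = {
        'targets': tf.FixedLenFeature([], tf.float32),
        'grade': tf.FixedLenFeature([], tf.float32),
        'spkr': tf.FixedLenFeature([], tf.string),
        'q_id': tf.FixedLenFeature([], tf.int64)
    }
    sequence_features = {
        'response': tf.FixedLenSequenceFeature([], tf.int64),
        'prompt': tf.FixedLenSequenceFeature([], tf.int64)
    }
    context, sequences = tf.parse_single_sequence_example(
        serialized_example,
        context_features=context_features,
        sequence_features=sequence_features)
    return context, sequences

# Example usage with tf.data:
#   dataset = tf.data.TFRecordDataset(
#       os.path.join(args.destination_dir, filename))
#   dataset = dataset.map(parse_relevance_example)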