def main(argv=None):
    """Converts a dataset to tfrecords."""
    args = commandLineParser.parse_args()

    if os.path.isdir(args.destination_dir):
        print 'destination directory exists. Exiting...'
        exit()
    else:
        os.makedirs(args.destination_dir)

    # Cache the command that produced this run
    if not os.path.isdir('CMDs'):
        os.makedirs('CMDs')
    with open('CMDs/step_preprocess_data.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Load responses and prompts as sequences of word ids
    responses, _ = load_text(args.input_data_path, args.input_wlist_path)
    prompts, _ = load_text(args.input_prompt_path, args.input_wlist_path)

    # Load up the prompts as sequences of words
    with open(args.input_prompt_path, 'r') as f:
        topics = [line.replace('\n', '') for line in f.readlines()]

    # Get the unique set of topics and their counts (sorted by descending count)
    unique_topics, topic_counts = np.unique(topics, return_counts=True)
    topics = unique_topics[np.flip(np.argsort(topic_counts), 0)]
    topic_counts = np.flip(np.sort(topic_counts), 0)

    # Create a dictionary mapping each topic sentence to a topic id.
    # Also write the sorted-topics file and the unigrams file
    # (the unigram file is later used for training).
    topic_dict = {}
    with open(os.path.join(args.destination_dir, 'unigrams.txt'), 'w') as ufile:
        with open(os.path.join(args.destination_dir, 'sorted_topics.txt'), 'w') as tfile:
            for i, (topic, count) in enumerate(zip(topics, topic_counts)):
                topic_dict[topic] = i
                ufile.write(str(i) + ',' + str(int(count)) + '\n')
                tfile.write(topic + '\n')

    # Load up the grades and speakers
    grades = np.loadtxt(args.input_grade_path)
    with open(args.input_spkr_path, 'r') as f:
        speakers = np.asarray(
            [line.replace('\n', '') for line in f.readlines()])

    # Create a list of topic ids for every response
    with open(args.input_prompt_path, 'r') as f:
        q_ids = np.asarray(
            [topic_dict[line.replace('\n', '')] for line in f.readlines()])

    # Split the data into train and validation sets
    n = len(responses)
    train_size = int(n * (1.0 - args.valid_fraction))
    valid_size = n - train_size
    print 'Total dataset size', n, 'Train dataset size', train_size, 'Valid dataset size', valid_size

    np.random.seed(1000)
    permutation = np.random.choice(np.arange(n), n, replace=False)
    index_train = permutation[:train_size]
    index_valid = permutation[train_size:]

    trn_responses = responses[index_train]
    trn_prompts = prompts[index_train]
    trn_q_ids = q_ids[index_train]
    trn_speakers = speakers[index_train]
    trn_grades = grades[index_train]

    valid_responses = responses[index_valid]
    valid_prompts = prompts[index_valid]
    valid_q_ids = q_ids[index_valid]
    valid_speakers = speakers[index_valid]
    valid_grades = grades[index_valid]

    # Create the training TF Record file
    filename = 'relevance.train.tfrecords'
    print 'Writing', filename
    writer = tf.python_io.TFRecordWriter(
        os.path.join(args.destination_dir, filename))
    for response, prompt, q_id, grd, spkr in zip(
            trn_responses, trn_prompts, trn_q_ids, trn_grades, trn_speakers):
        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'targets': tfrecord_utils.float_feature([1.0]),
                    'grade': tfrecord_utils.float_feature([grd]),
                    'spkr': tfrecord_utils.bytes_feature([spkr]),
                    'q_id': tfrecord_utils.int64_feature([q_id])
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    'response': tfrecord_utils.int64_feature_list(response),
                    'prompt': tfrecord_utils.int64_feature_list(prompt)
                }))
        writer.write(example.SerializeToString())
    writer.close()

    # Create the validation TF Record file
    filename = 'relevance.valid.tfrecords'
    print 'Writing', filename
    writer = tf.python_io.TFRecordWriter(
        os.path.join(args.destination_dir, filename))
    for response, prompt, q_id, grd, spkr in zip(
            valid_responses, valid_prompts, valid_q_ids, valid_grades, valid_speakers):
        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'targets': tfrecord_utils.float_feature([1.0]),
                    'grade': tfrecord_utils.float_feature([grd]),
                    'spkr': tfrecord_utils.bytes_feature([spkr]),
                    'q_id': tfrecord_utils.int64_feature([q_id])
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    'response': tfrecord_utils.int64_feature_list(response),
                    'prompt': tfrecord_utils.int64_feature_list(prompt)
                }))
        writer.write(example.SerializeToString())
    writer.close()
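

# The tfrecord_utils helpers used above are not shown in this section. The
# sketch below shows what they are assumed to look like, built from the
# standard tf.train proto wrappers; the names and signatures are inferred
# from the call sites above and may differ from the real module.
def float_feature(values):
    # Wrap a list of floats in a tf.train.Feature.
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))


def int64_feature(values):
    # Wrap a list of ints in a tf.train.Feature.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))


def bytes_feature(values):
    # Wrap a list of byte strings in a tf.train.Feature.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))


def int64_feature_list(values):
    # Wrap a sequence of ints as a tf.train.FeatureList, one feature per step.
    return tf.train.FeatureList(feature=[int64_feature([v]) for v in values])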
def main(args):
    """Converts a dataset to tfrecords."""
    if os.path.isdir(args.destination_dir):
        usr_response = raw_input(
            "Destination directory exists. Type 'y' if you want to overwrite it: ")
        if usr_response != 'y':
            print('destination directory {} exists.\nExiting...'.format(
                args.destination_dir))
            exit()
    else:
        os.makedirs(args.destination_dir)
    shutil.copyfile(args.input_wlist_path,
                    os.path.join(args.destination_dir, 'input.wlist.index'))

    # Get the paths to the relevant files
    responses_path = os.path.join(args.data_dir, args.responses_file)
    prompts_path = os.path.join(args.data_dir, args.prompts_file)
    grades_path = os.path.join(args.data_dir, args.grades_file)
    speakers_path = os.path.join(args.data_dir, args.speakers_file)
    predictions_path = os.path.join(args.data_dir, args.predictions_file)
    targets_path = os.path.join(args.data_dir, args.targets_file)
    dirich_params_path = os.path.join(args.data_dir, args.dirich_params_file)

    required_files = [
        responses_path, prompts_path, grades_path, speakers_path,
        predictions_path, targets_path, dirich_params_path
    ]

    # Assert that the required files exist
    for path in required_files:
        if not os.path.isfile(path):
            print("File: {} doesn't exist. Exiting...".format(path))
            exit()

    # Cache the command that produced this run
    if not os.path.isdir(os.path.join(args.destination_dir, 'CMDs')):
        os.makedirs(os.path.join(args.destination_dir, 'CMDs'))
    with open(os.path.join(args.destination_dir, 'CMDs/preprocessing.cmd'),
              'a') as cmd_cache:
        cmd_cache.write(' '.join(sys.argv) + '\n')
        cmd_cache.write('--------------------------------\n')

    # Load responses and prompts as sequences of word ids
    responses, _ = load_text(responses_path, args.input_wlist_path,
                             strip_start_end=args.remove_sentence_tags)
    prompts, _ = load_text(prompts_path, args.input_wlist_path,
                           strip_start_end=args.remove_sentence_tags)

    # Load up the grades and speakers
    with open(grades_path, 'r') as f:
        grades = np.asarray(
            [line.replace('\n', '') for line in f.readlines()])
    with open(speakers_path, 'r') as f:
        speakers = np.asarray(
            [line.replace('\n', '') for line in f.readlines()])

    # Load the teacher predictions
    predictions = np.loadtxt(predictions_path, dtype=np.float32)

    # Load the true targets
    targets = np.loadtxt(targets_path, dtype=np.float32)

    # Load the max-likelihood Dirichlet parameters
    alphas = np.loadtxt(dirich_params_path, dtype=np.float32)

    # Create or load the topic ID dictionary
    if args.sorted_topics_path == '':
        # Generate and save a new topic ID mapping
        topic_dict = generate_topic_dict(prompts_path, args.destination_dir)
    else:
        # Load a pre-existing topic ID mapping
        topic_dict = load_topic_dict(args.sorted_topics_path)

    # Create a list of topic ids for every response
    with open(prompts_path, 'r') as f:
        # Use id -1 if a topic is not in topic_dict (not present in the training set)
        q_ids = np.asarray([
            topic_dict.get(line.replace('\n', ''), -1)
            for line in f.readlines()
        ])

    if args.preprocessing_type == 'train':
        # Split the data into train and validation sets
        n = len(responses)
        train_size = int(n * (1.0 - args.valid_fraction))
        valid_size = n - train_size
        print('Total dataset size', n, 'Train dataset size', train_size,
              'Valid dataset size', valid_size)

        np.random.seed(args.rand_seed)
        permutation = np.random.choice(np.arange(n), n, replace=False)
        index_train = permutation[:train_size]
        index_valid = permutation[train_size:]

        trn_responses = responses[index_train]
        trn_prompts = prompts[index_train]
        trn_q_ids = q_ids[index_train]
        trn_speakers = speakers[index_train]
        trn_grades = grades[index_train]
        trn_targets = targets[index_train]
        trn_predictions = predictions[index_train]
        trn_alphas = alphas[index_train]
        print("Number of training examples: {}".format(len(trn_responses)))

        valid_responses = responses[index_valid]
        valid_prompts = prompts[index_valid]
        valid_q_ids = q_ids[index_valid]
        valid_speakers = speakers[index_valid]
        valid_grades = grades[index_valid]
        valid_targets = targets[index_valid]
        valid_predictions = predictions[index_valid]
        valid_alphas = alphas[index_valid]
        print("Number of validation examples: {}".format(len(valid_responses)))

        # Create the training TF Record file
        write_to_tfrecords('relevance.train.tfrecords', args.destination_dir,
                           trn_responses, trn_prompts, trn_q_ids, trn_grades,
                           trn_speakers, trn_targets, trn_predictions,
                           trn_alphas, debug=args.debug)
        # Create the validation TF Record file
        write_to_tfrecords('relevance.valid.tfrecords', args.destination_dir,
                           valid_responses, valid_prompts, valid_q_ids,
                           valid_grades, valid_speakers, valid_targets,
                           valid_predictions, valid_alphas, debug=args.debug)

        # Write a metadata file for convenience
        with open(os.path.join(args.destination_dir, 'dataset_meta.txt'),
                  'w') as meta_file:
            meta_string = ('num_examples_train:\t{}\nnum_examples_valid:\t{}\n'
                           'num_unique_topics:\t{}').format(
                               len(trn_responses), len(valid_responses),
                               len(topic_dict))
            meta_file.write(meta_string)
    elif args.preprocessing_type == 'test':
        write_to_tfrecords('relevance.test.tfrecords', args.destination_dir,
                           responses, prompts, q_ids, grades, speakers,
                           targets, predictions, alphas, debug=args.debug)

        # Write a metadata file for convenience
        with open(os.path.join(args.destination_dir, 'dataset_meta.txt'),
                  'w') as meta_file:
            meta_string = 'num_examples:\t{}\nnum_unique_topics:\t{}'.format(
                len(responses), len(topic_dict))
            meta_file.write(meta_string)
    print('Finished')
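

# generate_topic_dict and load_topic_dict are not shown in this section.
# Below is a minimal sketch of both, inferred from the inline topic-dict
# logic of the older script above (same sorted_topics.txt / unigrams.txt
# formats); the real implementations may differ.
def generate_topic_dict(prompts_path, destination_dir):
    """Map each unique prompt to an id, most frequent prompt first.

    Writes sorted_topics.txt (one prompt per line, line number = id) and
    unigrams.txt ('id,count' per line), mirroring the older script above.
    """
    with open(prompts_path, 'r') as f:
        topics = [line.replace('\n', '') for line in f.readlines()]
    unique_topics, topic_counts = np.unique(topics, return_counts=True)
    order = np.flip(np.argsort(topic_counts), 0)  # descending frequency
    topic_dict = {}
    with open(os.path.join(destination_dir, 'unigrams.txt'), 'w') as ufile, \
         open(os.path.join(destination_dir, 'sorted_topics.txt'), 'w') as tfile:
        for i, idx in enumerate(order):
            topic_dict[unique_topics[idx]] = i
            ufile.write(str(i) + ',' + str(int(topic_counts[idx])) + '\n')
            tfile.write(unique_topics[idx] + '\n')
    return topic_dict


def load_topic_dict(sorted_topics_path):
    """Reload the prompt-to-id mapping written by generate_topic_dict."""
    topic_dict = {}
    with open(sorted_topics_path, 'r') as tfile:
        for i, topic in enumerate(tfile.readlines()):
            topic_dict[topic.replace('\n', '')] = i
    return topic_dict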
def main(argv=None):
    """Converts a dataset to tfrecords."""
    args = commandLineParser.parse_args()

    if not os.path.isdir(args.destination_dir):
        os.makedirs(args.destination_dir)

    # Cache the command that produced this run
    if not os.path.isdir('CMDs'):
        os.makedirs('CMDs')
    with open('CMDs/step_preprocess_test_data.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Load responses and prompts as sequences of word ids
    responses, _ = load_text(args.input_data_path, args.input_wlist_path)
    prompts, _ = load_text(args.input_prompt_path, args.input_wlist_path)

    # Load up the grades, targets and speakers
    grades = np.loadtxt(args.input_grade_path)
    targets = np.loadtxt(args.input_tgt_path, dtype=np.float32)
    with open(args.input_spkr_path, 'r') as f:
        speakers = np.asarray(
            [line.replace('\n', '') for line in f.readlines()])

    # Load the sorted topics and reconstruct the topic dict,
    # mapping each prompt word sequence to its q_id
    topic_dict = {}
    with open(args.sorted_topics_path, 'r') as tfile:
        for i, topic in enumerate(tfile.readlines()):
            topic_dict[topic.replace('\n', '')] = i

    # Load up the prompts as sequences of words and convert them to q_ids;
    # fall back to -1 for every prompt if any prompt is missing from the dict
    try:
        with open(args.input_prompt_path, 'r') as f:
            q_ids = np.asarray(
                [topic_dict[line.replace('\n', '')] for line in f.readlines()])
    except KeyError:
        with open(args.input_prompt_path, 'r') as f:
            q_ids = np.asarray([-1 for line in f.readlines()])

    # Create the test TF Record file
    filename = args.name + '.tfrecords'
    print 'Writing', filename
    writer = tf.python_io.TFRecordWriter(
        os.path.join(args.destination_dir, filename))
    for response, prompt, q_id, grd, spkr, tgt in zip(
            responses, prompts, q_ids, grades, speakers, targets):
        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'targets': tfrecord_utils.float_feature([tgt]),
                    'grade': tfrecord_utils.float_feature([grd]),
                    'spkr': tfrecord_utils.bytes_feature([spkr]),
                    'q_id': tfrecord_utils.int64_feature([q_id])
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    'response': tfrecord_utils.int64_feature_list(response),
                    'prompt': tfrecord_utils.int64_feature_list(prompt)
                }))
        writer.write(example.SerializeToString())
    writer.close()
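

# For reference, a minimal sketch of how the records written above can be
# read back with the TF 1.x API; the feature spec mirrors the SequenceExample
# layout used by all three scripts. This reader is an illustration, not part
# of the original pipeline.
def parse_relevance_record(serialized_example):
    # Fixed-length context features: one scalar per example.
    context_features = {
        'targets': tf.FixedLenFeature([1], tf.float32),
        'grade': tf.FixedLenFeature([1], tf.float32),
        'spkr': tf.FixedLenFeature([1], tf.string),
        'q_id': tf.FixedLenFeature([1], tf.int64)
    }
    # Variable-length sequences of word ids.
    sequence_features = {
        'response': tf.FixedLenSequenceFeature([], tf.int64),
        'prompt': tf.FixedLenSequenceFeature([], tf.int64)
    }
    context, sequences = tf.parse_single_sequence_example(
        serialized_example,
        context_features=context_features,
        sequence_features=sequence_features)
    return context, sequences

# Example usage with the tf.data pipeline:
#   dataset = tf.data.TFRecordDataset(path).map(parse_relevance_record)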