def write_to_tfrecords(filename, destination_dir, responses, prompts, q_ids, grades, speakers, targets,
                       predictions, debug=False):
    """Write a dataset, including teacher predictions, to a single TFRecord file.

    One ``tf.train.SequenceExample`` is written per example. Its context holds
    the features ``targets``, ``grade``, ``teacher_pred``, ``spkr``, ``q_id``
    and ``example_idx``; its feature lists hold ``response`` and ``prompt``.

    Args:
        filename: name of the TFRecord file to create inside ``destination_dir``.
        destination_dir: directory the file is written to.
        responses: per-example sequences passed to ``int64_feature_list``
            (presumably word ids - confirm against the caller).
        prompts: per-example sequences passed to ``int64_feature_list``.
        q_ids: per-example integer ids stored under ``q_id``.
        grades: per-example grades; each is converted with ``float()``.
        speakers: per-example speaker ids stored as a bytes feature.
        targets: per-example target values stored as a float feature.
        predictions: per-example iterables of teacher predictions; each is
            converted with ``list()`` and stored under ``teacher_pred``.
        debug: when True, print every example before it is written.

    Raises:
        AssertionError: if the input sequences do not all have the same length.
    """
    # Check that all the input lists are of equal lengths
    assert len({len(responses), len(prompts), len(q_ids), len(grades),
                len(speakers), len(targets), len(predictions)}) == 1, \
        'all input sequences must have the same length'

    # Create the training TF Record file
    print('Writing: ', filename)
    writer = tf.python_io.TFRecordWriter(os.path.join(destination_dir, filename))
    try:
        rows = zip(responses, prompts, q_ids, grades, speakers, targets, predictions)
        for idx, (response, prompt, q_id, grd, spkr, tgt, example_pred) in enumerate(rows):
            example = tf.train.SequenceExample(
                context=tf.train.Features(feature={
                    'targets': tfrecord_utils.float_feature([tgt]),
                    'grade': tfrecord_utils.float_feature([float(grd)]),
                    'teacher_pred': tfrecord_utils.float_feature(list(example_pred)),
                    'spkr': tfrecord_utils.bytes_feature([spkr]),
                    'q_id': tfrecord_utils.int64_feature([q_id]),
                    # Stores the example number for easy back-reference to txt
                    # files even when examples get shuffled (0 indexed)
                    'example_idx': tfrecord_utils.int64_feature([idx]),
                }),
                feature_lists=tf.train.FeatureLists(feature_list={
                    'response': tfrecord_utils.int64_feature_list(response),
                    'prompt': tfrecord_utils.int64_feature_list(prompt),
                }))
            if debug:
                # Print out the data that is going to be saved:
                print("-----------------\n", "EXAMPLE: \n",
                      "Response: {}\nPrompt: {}\nQ_id: {}\n\ntarget: {}\ngrade: {}\n,teacher_pred: {}\nexample_num: {}\n\n".format(
                          response, prompt, q_id, tgt, grd, example_pred, idx))
            writer.write(example.SerializeToString())
    finally:
        # Close the writer even if building or serialising an example fails,
        # so the file handle is not leaked.
        writer.close()
    return
def _write_relevance_split(path, responses, prompts, q_ids, grades, speakers):
    """Write one data split to the TFRecord file at ``path``; every target is 1.0."""
    writer = tf.python_io.TFRecordWriter(path)
    try:
        for response, prompt, q_id, grd, spkr in zip(responses, prompts, q_ids, grades, speakers):
            example = tf.train.SequenceExample(
                context=tf.train.Features(
                    feature={
                        'targets': tfrecord_utils.float_feature([1.0]),
                        'grade': tfrecord_utils.float_feature([grd]),
                        'spkr': tfrecord_utils.bytes_feature([spkr]),
                        'q_id': tfrecord_utils.int64_feature([q_id])
                    }),
                feature_lists=tf.train.FeatureLists(
                    feature_list={
                        'response': tfrecord_utils.int64_feature_list(response),
                        'prompt': tfrecord_utils.int64_feature_list(prompt)
                    }))
            writer.write(example.SerializeToString())
    finally:
        # Close the writer even when an example fails to serialise.
        writer.close()


def main(argv=None):
    """Converts a dataset to tfrecords.

    Reads responses, prompts, grades and speakers from the paths given on the
    command line, assigns a topic id to every distinct prompt (most frequent
    prompt gets id 0), writes ``unigrams.txt`` and ``sorted_topics.txt`` to the
    destination directory, then randomly splits the data and writes
    ``relevance.train.tfrecords`` and ``relevance.valid.tfrecords``.

    If the destination directory already exists nothing is written and the
    function returns, so earlier output is not overwritten.

    Args:
        argv: unused; arguments are taken from ``commandLineParser``.
    """
    args = commandLineParser.parse_args()
    if os.path.isdir(args.destination_dir):
        # The message announces an exit, so actually stop here instead of
        # falling through and overwriting the existing files.
        print('destination directory exists. Exiting...')
        return
    os.makedirs(args.destination_dir)

    # Keep a log of the commands used to produce the data
    if not os.path.isdir('CMDs'):
        os.makedirs('CMDs')
    with open('CMDs/step_preprocess_data.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Load responses and prompts as sequences of word ids
    responses, _ = load_text(args.input_data_path, args.input_wlist_path)
    prompts, _ = load_text(args.input_prompt_path, args.input_wlist_path)

    # Load up the prompts as sequences of words
    with open(args.input_prompt_path, 'r') as prompt_file:
        prompt_lines = [line.replace('\n', '') for line in prompt_file.readlines()]

    # Get unique set of topics and topic counts (and sort them, most frequent first)
    unique_topics, topic_counts = np.unique(prompt_lines, return_counts=True)
    topics = unique_topics[np.flip(np.argsort(topic_counts), 0)]
    topic_counts = np.flip(np.sort(topic_counts), 0)

    # Create dictionary for topics mapping sentence to topic id
    # Also create file of sorted topics and unigrams file
    # Unigram file later used for training
    topic_dict = {}
    unigrams_path = os.path.join(args.destination_dir, 'unigrams.txt')
    sorted_topics_path = os.path.join(args.destination_dir, 'sorted_topics.txt')
    with open(unigrams_path, 'w') as ufile, open(sorted_topics_path, 'w') as tfile:
        for i, (topic, count) in enumerate(zip(topics, topic_counts)):
            topic_dict[topic] = i
            ufile.write(str(i) + ',' + str(int(count)) + '\n')
            tfile.write(topic + '\n')

    # Load up the grades and speakers
    grades = np.loadtxt(args.input_grade_path)
    with open(args.input_spkr_path, 'r') as spkr_file:
        speakers = np.asarray(
            [line.replace('\n', '') for line in spkr_file.readlines()])

    # Create a list of topic IDs for every response (reuses the prompt lines
    # read above rather than reading the prompt file a second time)
    q_ids = np.asarray([topic_dict[line] for line in prompt_lines])

    ### Split data into train and validation data sets
    n = len(responses)
    train_size = int(n * (1.0 - args.valid_fraction))
    valid_size = n - train_size

    print('Total dataset size {} Train dataset size {} Valid dataset size {}'.format(
        n, train_size, valid_size))

    # Fixed seed so the train/validation split is reproducible between runs
    np.random.seed(1000)
    permutation = np.random.choice(np.arange(n), n, replace=False)
    index_train = permutation[:train_size]
    index_valid = permutation[train_size:]

    # Create the training TF Record file
    filename = 'relevance.train.tfrecords'
    print('Writing {}'.format(filename))
    _write_relevance_split(
        os.path.join(args.destination_dir, filename),
        responses[index_train], prompts[index_train], q_ids[index_train],
        grades[index_train], speakers[index_train])

    # Create the validation TF Record file
    filename = 'relevance.valid.tfrecords'
    print('Writing {}'.format(filename))
    _write_relevance_split(
        os.path.join(args.destination_dir, filename),
        responses[index_valid], prompts[index_valid], q_ids[index_valid],
        grades[index_valid], speakers[index_valid])
def main(argv=None):
    """Converts a dataset to tfrecords.

    Reads responses, prompts, grades, targets and speakers from the paths
    given on the command line, maps every prompt to a topic id using the
    sorted-topics file (line number = id), and writes all examples to
    ``<args.name>.tfrecords`` in the destination directory.

    If any prompt is missing from the sorted-topics file, every example gets
    the q_id ``-1`` instead.

    Args:
        argv: unused; arguments are taken from ``commandLineParser``.
    """
    args = commandLineParser.parse_args()
    if not os.path.isdir(args.destination_dir):
        os.makedirs(args.destination_dir)

    # Keep a log of the commands used to produce the data
    if not os.path.isdir('CMDs'):
        os.makedirs('CMDs')
    with open('CMDs/step_preprocess_test_data.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Load responses and prompts as sequences of word ids
    responses, _ = load_text(args.input_data_path, args.input_wlist_path)
    prompts, _ = load_text(args.input_prompt_path, args.input_wlist_path)

    # Load up the grades, targets and speakers
    grades = np.loadtxt(args.input_grade_path)
    targets = np.loadtxt(args.input_tgt_path, dtype=np.float32)
    with open(args.input_spkr_path, 'r') as spkr_file:
        speakers = np.asarray(
            [line.replace('\n', '') for line in spkr_file.readlines()])

    # Load up sorted topics and (re)construct the topic dict so that each
    # prompt word sequence maps to its q_id (the topic's line number)
    with open(args.sorted_topics_path, 'r') as tfile:
        topic_dict = {
            topic.replace('\n', ''): i
            for i, topic in enumerate(tfile.readlines())
        }

    # Load up the prompts as sequences of words and convert to q_id
    with open(args.input_prompt_path, 'r') as prompt_file:
        prompt_lines = [line.replace('\n', '') for line in prompt_file.readlines()]
    try:
        q_ids = np.asarray([topic_dict[line] for line in prompt_lines])
    except KeyError:
        # At least one prompt is not a known topic: fall back to -1 for every
        # example. Only KeyError is caught so that unrelated failures (and
        # Ctrl-C) are no longer silently swallowed.
        q_ids = np.asarray([-1] * len(prompt_lines))

    # Create the training TF Record file
    filename = args.name + '.tfrecords'
    print('Writing {}'.format(filename))

    writer = tf.python_io.TFRecordWriter(
        os.path.join(args.destination_dir, filename))
    try:
        for response, prompt, q_id, grd, spkr, tgt in zip(
                responses, prompts, q_ids, grades, speakers, targets):
            example = tf.train.SequenceExample(
                context=tf.train.Features(
                    feature={
                        'targets': tfrecord_utils.float_feature([tgt]),
                        'grade': tfrecord_utils.float_feature([grd]),
                        'spkr': tfrecord_utils.bytes_feature([spkr]),
                        'q_id': tfrecord_utils.int64_feature([q_id])
                    }),
                feature_lists=tf.train.FeatureLists(
                    feature_list={
                        'response': tfrecord_utils.int64_feature_list(response),
                        'prompt': tfrecord_utils.int64_feature_list(prompt)
                    }))
            writer.write(example.SerializeToString())
    finally:
        # Close the writer even when an example fails to serialise.
        writer.close()
def write_to_tfrecords(filename, destination_dir, responses, prompts, q_ids, grades, speakers, targets=1.0,
                       debug=False):
    """Write a dataset to a single TFRecord file.

    One ``tf.train.SequenceExample`` is written per example. Its context holds
    the features ``targets``, ``grade``, ``spkr`` and ``q_id``; its feature
    lists hold ``response`` and ``prompt``.

    Args:
        filename: name of the TFRecord file to create inside ``destination_dir``.
        destination_dir: directory the file is written to.
        responses: per-example sequences passed to ``int64_feature_list``
            (presumably word ids - confirm against the caller).
        prompts: per-example sequences passed to ``int64_feature_list``.
        q_ids: per-example integer ids stored under ``q_id``.
        grades: per-example grades; each is converted with ``float()``.
        speakers: per-example speaker ids stored as a bytes feature.
        targets: either a single real number, which is used as the target of
            every example, or a sized sequence with one target per example.
        debug: when True, print the input lengths and every example before it
            is written.

    Raises:
        AssertionError: if the input sequences do not all have the same
            length, or ``targets`` is a sequence of a different length.
    """
    import numbers  # local import: the rest of the file may not import it

    lengths = [len(responses), len(prompts), len(q_ids), len(grades), len(speakers)]
    if debug:
        # Formerly unconditional TEMP output; only shown when debugging.
        for length in lengths:
            print(length)

    # Check that all the input lists are of equal lengths
    assert len(set(lengths)) == 1

    if isinstance(targets, numbers.Real):
        # A single number: make each target this value
        targets = [float(targets)] * len(responses)
    else:
        # Any sized sequence (list, tuple, numpy array, ...) is accepted as
        # long as it supplies exactly one target per example.
        assert len(targets) == len(responses)

    # Create the training TF Record file
    print('Writing: ', filename)

    writer = tf.python_io.TFRecordWriter(
        os.path.join(destination_dir, filename))
    try:
        for response, prompt, q_id, grd, spkr, tgt in zip(
                responses, prompts, q_ids, grades, speakers, targets):
            if debug:
                # Print out the data that is going to be saved:
                print(
                    "-----------------\n", "EXAMPLE: \n",
                    "Response: {}\nPrompt: {}\nQ_id: {}\n\ntarget: {}\ngrade: {}\n\n"
                    .format(response, prompt, q_id, tgt, grd))

            example = tf.train.SequenceExample(
                context=tf.train.Features(
                    feature={
                        'targets': tfrecord_utils.float_feature([tgt]),
                        'grade': tfrecord_utils.float_feature([float(grd)]),
                        'spkr': tfrecord_utils.bytes_feature([spkr]),
                        'q_id': tfrecord_utils.int64_feature([q_id])
                    }),
                feature_lists=tf.train.FeatureLists(
                    feature_list={
                        'response': tfrecord_utils.int64_feature_list(response),
                        'prompt': tfrecord_utils.int64_feature_list(prompt)
                    }))
            writer.write(example.SerializeToString())
    finally:
        # Close the writer even if building or serialising an example fails,
        # so the file handle is not leaked.
        writer.close()
    return