def main():
    """Convert the dataset into train/validation TFRecord shards.

    Reads image paths via ``permutate`` (which also yields a held-out
    test set, used here as the validation split), shuffles both splits
    reproducibly, writes TFRecord shards for each, and emits a label
    file mapping serial ids back to class names.

    Raises:
        ValueError: if ``FLAGS.tfrecord_filename`` or ``FLAGS.dataset_dir``
            is empty.
    """
    # ---- sanity checks on required flags -------------------------------
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    # Skip the (expensive) conversion when the shards already exist.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        # print() form: the Python 2 print statement is a syntax error on
        # Python 3, which other variants in this file already target.
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    # permutate() returns the training photo paths, the sorted class names
    # and a pre-built test set (used below as the validation split).
    # NOTE(review): the 10000/400 arguments are undocumented magic numbers --
    # presumably split sizes; confirm against permutate()'s signature.
    photo_filenames, class_names, test_set = permutate(FLAGS.dataset_dir, 10000, 400)

    # Map each class name to a stable integer id for the TFRecord labels.
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Deterministic shuffles so runs are reproducible given the same seed.
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    random.shuffle(test_set)

    training_filenames = photo_filenames
    validation_filenames = test_set

    # Convert the training and validation sets into sharded TFRecords.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file (serial id -> class name).
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main():
    """Convert FLAGS.dataset_dir photos into TFRecord shards in FLAGS.tfrecord_dir.

    Creates the output directory if needed, writes the label file, then
    splits the photos into train/validation by ``FLAGS.validation_size``
    and converts both splits (optionally in simulate mode).

    Raises:
        ValueError: if ``tfrecord_filename``, ``dataset_dir`` or
            ``tfrecord_dir`` is empty.
    """
    # ---- flag checks ---------------------------------------------------
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.')
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    if not FLAGS.tfrecord_dir:
        raise ValueError(
            'tfrecord_dir is empty. Please state a tfrecord_dir argument.')
    if not os.path.exists(FLAGS.tfrecord_dir):
        os.makedirs(FLAGS.tfrecord_dir)

    # Bail out early when the shards were already produced.
    if _dataset_exists(dataset_dir=FLAGS.tfrecord_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        # print() form: the Python 2 print statement is a syntax error on
        # Python 3, which other variants in this file already target.
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    # Photo paths and sorted class ids parsed from the subdirectories.
    photo_filenames, class_ids = _get_filenames_and_classes(FLAGS.dataset_dir)

    # Map each class id to a serial integer for the TFRecord labels.
    class_ids_to_serial = dict(zip(class_ids, range(len(class_ids))))

    # Label file (serial -> class id). Written to dataset_dir rather than
    # tfrecord_dir -- presumably intentional; TODO confirm.
    serial_to_class_ids = dict(zip(range(len(class_ids)), class_ids))
    write_label_file(serial_to_class_ids, FLAGS.dataset_dir)

    # Number of validation examples to hold out.
    num_validation = int(float(FLAGS.validation_size) * len(photo_filenames))

    # Reproducible shuffle, then carve off the validation prefix.
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # Convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_ids_to_serial,
                     dataset_dir=FLAGS.tfrecord_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards,
                     simulate=FLAGS.simulate)
    _convert_dataset('validation', validation_filenames, class_ids_to_serial,
                     dataset_dir=FLAGS.tfrecord_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards,
                     simulate=FLAGS.simulate)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main():
    """Build train/validation TFRecords from pre-split photo lists.

    This variant's ``_get_filenames_and_classes`` already partitions the
    photos into train and validation lists, so no shuffling or slicing
    happens here -- the driver just wires both splits through the
    conversion and writes the labels file.
    """
    # Guard clauses: both flags are mandatory.
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    # Nothing to do when the shards are already on disk.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    # The parser hands back both splits plus the sorted class names.
    training_filenames, validation_filenames, class_names = \
        _get_filenames_and_classes(FLAGS.dataset_dir)

    # class name -> serial id, used when encoding labels.
    class_names_to_ids = {name: idx for idx, name in enumerate(class_names)}

    # Convert the training split first, then validation.
    for split_name, filenames in (('train', training_filenames),
                                  ('validation', validation_filenames)):
        _convert_dataset(split_name, filenames, class_names_to_ids,
                         dataset_dir=FLAGS.dataset_dir,
                         tfrecord_filename=FLAGS.tfrecord_filename,
                         _NUM_SHARDS=FLAGS.num_shards)

    # Reverse mapping (id -> class name) for readers.
    labels_to_class_names = {idx: name for idx, name in enumerate(class_names)}
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main(argv): if not FLAGS.tfrecord_filename: raise ValueError( "tfrecord_filename is empty. Please state a tfrecord_filename argument." ) if not FLAGS.dataset_dir: raise ValueError( "dataset_dir is empty. Please state a dataset_dir argument.") if _dataset_exists( dataset_dir=FLAGS.dataset_dir, _NUM_SHARDS=FLAGS.num_shards, output_filename=FLAGS.tfrecord_filename, ): return None photo_filenames, class_names = _get_filenames_and_classes( FLAGS.dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) num_validation = int(FLAGS.validation_size * len(photo_filenames)) random.seed(FLAGS.random_seed) random.shuffle(photo_filenames) training_filenames = photo_filenames[num_validation:] validation_filenames = photo_filenames[:num_validation] _convert_dataset( "train", training_filenames, class_names_to_ids, dataset_dir=FLAGS.dataset_dir, tfrecord_filename=FLAGS.tfrecord_filename, _NUM_SHARDS=FLAGS.num_shards, ) _convert_dataset( "validation", validation_filenames, class_names_to_ids, dataset_dir=FLAGS.dataset_dir, tfrecord_filename=FLAGS.tfrecord_filename, _NUM_SHARDS=FLAGS.num_shards, ) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) write_label_file(labels_to_class_names, FLAGS.dataset_dir)
def main(): if not FLAGS.tfrecord_filename: raise ValueError( 'tfrecord_filename is empty. Please state a tfrecord_filename argument.' ) #Check if there is a dataset directory entered if not FLAGS.dataset_dir: raise ValueError( 'dataset_dir is empty. Please state a dataset_dir argument.') #If the TFRecord files already exist in the directory, then exit without creating the files again if _dataset_exists(dataset_dir=FLAGS.dataset_dir, _NUM_SHARDS=FLAGS.num_shards, output_filename=FLAGS.tfrecord_filename): print('Dataset files already exist. Exiting without re-creating them.') return None photo_filenames, class_names = _get_filenames_and_classes( FLAGS.dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) num_validation = int(FLAGS.validation_size * len(photo_filenames)) random.seed(FLAGS.random_seed) random.shuffle(photo_filenames) training_filenames = photo_filenames[num_validation:] validation_filenames = photo_filenames[:num_validation] _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir=FLAGS.dataset_dir, tfrecord_filename=FLAGS.tfrecord_filename, _NUM_SHARDS=FLAGS.num_shards) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir=FLAGS.dataset_dir, tfrecord_filename=FLAGS.tfrecord_filename, _NUM_SHARDS=FLAGS.num_shards) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) write_label_file(labels_to_class_names, FLAGS.dataset_dir) write_data_summary(num_validation, len(photo_filenames), FLAGS.dataset_dir)
def main(): #=============CHECKS============== #Check if there is a tfrecord_filename entered if not FLAGS.tfrecord_filename: raise ValueError( 'tfrecord_filename is empty. Please state a tfrecord_filename argument.' ) #If the TFRecord files already exist in the directory, then exit without creating the files again if _dataset_exists(dataset_file=FLAGS.dataset_file, _NUM_SHARDS=FLAGS.num_shards, output_filename=FLAGS.tfrecord_filename): print('Dataset files already exist. Exiting without re-creating them.') return None #==========END OF CHECKS============ #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories. photo_filenames, class_names = _get_filenames_and_classes( FLAGS.dataset_file) #Refer each of the class name to a specific integer number for predictions later class_ids = [1 if label == "normal" else 0 for label in class_names] #Find the number of validation examples we need num_validation = int(FLAGS.validation_size * len(photo_filenames)) training_filenames = photo_filenames[num_validation:] validation_filenames = photo_filenames[:num_validation] training_labels = class_ids[num_validation:] validation_labels = class_ids[:num_validation] # First, convert the training and validation sets. _convert_dataset('train', training_filenames, training_labels, dataset_file=FLAGS.dataset_file, tfrecord_filename=FLAGS.tfrecord_filename, _NUM_SHARDS=FLAGS.num_shards) _convert_dataset('validation', validation_filenames, validation_labels, dataset_file=FLAGS.dataset_file, tfrecord_filename=FLAGS.tfrecord_filename, _NUM_SHARDS=FLAGS.num_shards) print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main(): #==============================================================CHECKS========================================================================== #Check if there is a tfrecord_filename entered if not FLAGS.tfrecord_filename: raise ValueError( 'tfrecord_filename is empty. Please state a tfrecord_filename argument.' ) #Check if there is a dataset directory entered if not FLAGS.dataset_dir: raise ValueError( 'dataset_dir is empty. Please state a dataset_dir argument.') #If the TFRecord files already exist in the directory, then exit without creating the files again if _dataset_exists(dataset_dir=FLAGS.dataset_dir, _NUM_SHARDS=FLAGS.num_shards, output_filename=FLAGS.tfrecord_filename): print('Dataset files already exist. Exiting without re-creating them.') return None #==============================================================END OF CHECKS=================================================================== rank = MPI.COMM_WORLD.Get_rank() name = MPI.Get_processor_name() size = MPI.COMM_WORLD.Get_size() #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories. photo_filenames, class_names = _get_filenames_and_classes( FLAGS.dataset_dir) #Refer each of the class name to a specific integer number for predictions later class_names_to_ids = dict(zip(class_names, range(len(class_names)))) if rank == 0: all_dat = master() else: #config = tf.ConfigProto() #config.intra_op_parallelism_threads = 8 #config.inter_op_parallelism_threads = 4 #worker_sess = tf.Session(config=config) worker(class_names_to_ids)
def main():
    """Convert photos to TFRecord shards and save the validation list as CSV.

    The validation filenames (with their class, i.e. the parent directory
    name) are written to ``./data/validate.csv`` before both splits are
    converted and the labels file is produced.

    Raises:
        ValueError: if ``FLAGS.tfrecord_filename`` or ``FLAGS.dataset_dir``
            is empty.
    """
    # ---- flag checks ---------------------------------------------------
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    # Existing shards are never re-created.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    photo_filenames, class_names = _get_filenames_and_classes(FLAGS.dataset_dir)
    print('{}; {}'.format(len(photo_filenames), len(class_names)))

    # class name -> integer id used to encode labels.
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    # Deterministic shuffle, then split off the validation prefix.
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # Save the validation image list to a CSV file. Assumes ./data exists
    # -- TODO confirm, otherwise this open() raises.
    # NOTE(review): the header names one column but each row has two fields
    # (filename, class); left unchanged so downstream readers keep working.
    with open("./data/validate.csv", 'w') as f:
        f.write('IMAGE_NAME\n')
        for path in validation_filenames:  # renamed from `file` (shadowed builtin)
            filename = os.path.basename(path)
            # The class is the name of the file's parent directory.
            class_name = os.path.basename(os.path.dirname(path))
            f.write(str(filename) + ',' + str(class_name) + '\n')

    # Convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file (id -> class name).
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main():
    """Convert the photo dataset into train/validation TFRecord shards.

    Raises:
        ValueError: if ``FLAGS.tfrecord_filename`` or ``FLAGS.dataset_dir``
            is empty.
    """
    # ---- flag checks ---------------------------------------------------
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    # Existing shards are never re-created.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    # Photo paths and sorted class names from the subdirectories; this
    # variant also passes the TFRecord name through to the parser.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir, output_filename=FLAGS.tfrecord_filename)
    # (Commented-out UECFOOD256 "category.txt" remapping and a pdb
    # breadcrumb were removed here -- restore from history if needed.)

    # class name -> integer id used to encode labels.
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Number of validation examples to hold out.
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    # Reproducible shuffle before splitting.
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # Convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file (id -> class name).
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main():
    """Convert photos to TFRecords with a per-class (stratified) validation split.

    Unlike the global-split variants, validation examples are drawn per
    class so every class contributes ``FLAGS.validation_size`` of its own
    files to the validation set.

    Raises:
        ValueError: if ``FLAGS.dataset_dir`` is empty.
    """
    # tfrecord_filename is deliberately not validated in this variant
    # (the original check was commented out).
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    # Existing shards are never re-created.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    photo_filenames, class_names = _get_filenames_and_classes(FLAGS.dataset_dir)

    # class name -> integer id used to encode labels.
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    training_filenames = []
    validation_filenames = []

    # Reproducible shuffle so the per-class head/tail split is stable.
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)

    # Stratified split: hold out validation_size of each class's files.
    # (The unused global `num_validation` computation and a dead
    # triple-quoted copy of the old global-split code were removed.)
    # NOTE(review): matching by `fil.startswith(class_)` assumes returned
    # paths begin with the class name -- confirm against
    # _get_filenames_and_classes.
    for class_ in class_names:
        files_matching_class = [
            fil for fil in photo_filenames if fil.startswith(class_)
        ]
        val_samples_cnt = int(len(files_matching_class) * FLAGS.validation_size)
        training_filenames.extend(files_matching_class[val_samples_cnt:])
        validation_filenames.extend(files_matching_class[:val_samples_cnt])

    print("Training files size", len(training_filenames))
    print("Validation files size", len(validation_filenames))

    # Convert both splits into the requested output directory.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards,
                     output_dir=FLAGS.output_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards,
                     output_dir=FLAGS.output_dir)

    # Finally, write the labels file (id -> class name).
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
#==============================================================CHECKS========================================================================== #Check if there is a tfrecord_filename entered if not FLAGS.tfrecord_filename: raise ValueError( 'tfrecord_filename is empty. Please state a tfrecord_filename argument.' ) #Check if there is a dataset directory entered if not FLAGS.dataset_dir: raise ValueError( 'dataset_dir is empty. Please state a dataset_dir argument.') #If the TFRecord files already exist in the directory, then exit without creating the files again if _dataset_exists(dataset_dir=FLAGS.dataset_dir, _NUM_SHARDS=FLAGS.num_shards, output_filename=FLAGS.tfrecord_filename): print('Dataset files already exist. Exiting without re-creating them.') #==============================================================END OF CHECKS=================================================================== #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories. photo_filenames, class_names = _get_filenames_and_classes(FLAGS.dataset_dir) #Refer each of the class name to a specific integer number for predictions later class_names_to_ids = dict(zip(class_names, range(len(class_names)))) #Find the number of validation examples we need num_validation = int(FLAGS.validation_size * len(photo_filenames)) # Divide the training datasets into train and test:
def main():
    """Convert photos to sharded TFRecords plus per-image id mapping files."""
    # The dataset directory is the only mandatory flag in this variant.
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    # Skip everything when the shards already exist.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    photo_filenames, class_names = _get_filenames_and_classes(FLAGS.dataset_dir)

    # Sequential 1-based ids; a <fileid, filename> CSV mapping is stored
    # alongside the TFRecords.
    photo_fileids = list(range(1, len(photo_filenames) + 1))

    # class name -> integer id for label encoding.
    class_names_to_ids = {cls: i for i, cls in enumerate(class_names)}

    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    print(f'\nrandom seed partition = {FLAGS.random_seed}')

    # Shuffle the filenames reproducibly; the ids stay in their original
    # order and are paired with the shuffled names positionally.
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)

    training_filenames, validation_filenames = (
        photo_filenames[num_validation:], photo_filenames[:num_validation])
    training_fileids, validation_fileids = (
        photo_fileids[num_validation:], photo_fileids[:num_validation])

    # Convert the training split and persist its id mapping.
    _convert_dataset('train', training_filenames, training_fileids,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     _NUM_SHARDS=FLAGS.num_shards)
    write_image_ids_file(training_filenames, training_fileids,
                         FLAGS.dataset_dir, IMAGE_IDS_FILENAME)

    # The validation split may be empty; only convert when it is not.
    if num_validation > 0:
        _convert_dataset('validation', validation_filenames,
                         validation_fileids, class_names_to_ids,
                         dataset_dir=FLAGS.dataset_dir,
                         _NUM_SHARDS=FLAGS.num_shards)
        write_image_ids_file(validation_filenames, validation_fileids,
                             FLAGS.dataset_dir,
                             IMAGE_IDS_VALIDATION_TMP_FILENAME)

    # Finally, write the labels file (id -> class name).
    labels_to_class_names = {i: cls for i, cls in enumerate(class_names)}
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the dataset!')
def main():
    """Convert pos/neg clip photos (listed in label files) to TFRecords.

    Training and validation filenames come from ``root_dir/train_llbl.txt``
    and ``root_dir/test_llbl.txt``; each line is ``"<name> <classid>"``
    with class id 1 = pos and 0 = neg.

    Raises:
        ValueError: if a required flag is empty, or a listing contains an
            unexpected class id.
    """
    # ---- flag checks ---------------------------------------------------
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.')
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    # Never re-create existing shards.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        # print() form: the Python 2 print statement is a syntax error on
        # Python 3, which other variants in this file already target.
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    # Fixed binary label map for this dataset.
    class_names_to_ids = {'pos': 1, 'neg': 0}

    pos_dir = root_dir + "training_clips/split/photos/pos/"
    neg_dir = root_dir + "training_clips/split/photos/neg/"

    def _read_listing(list_path):
        # Parse "<name> <classid>" lines into full photo paths.
        filenames = []
        # `with` guarantees the file is closed even on a parse error
        # (the original used open()/close() with no try/finally).
        with open(list_path, 'r') as listing:
            for line in listing:
                name = line.split(" ")[0]
                classid = int(line.split(" ")[1])
                if classid == 1:
                    filenames.append(pos_dir + name)
                elif classid == 0:
                    filenames.append(neg_dir + name)
                else:
                    # Previously an unknown id crashed later with a
                    # confusing NameError on an unbound variable; fail
                    # loudly and descriptively here instead.
                    raise ValueError('unexpected class id %d in %s'
                                     % (classid, list_path))
        return filenames

    training_filenames = _read_listing(root_dir + "train_llbl.txt")
    validation_filenames = _read_listing(root_dir + "test_llbl.txt")

    # Reproducible shuffles.
    random.seed(FLAGS.random_seed)
    random.shuffle(training_filenames)
    random.shuffle(validation_filenames)

    # Convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main():
    """Convert list-defined train/validation images into TFRecord shards.

    Image paths and labels for both splits come from the list files
    ``FLAGS.train_list`` and ``FLAGS.val_list`` (resolved through
    ``read_label_file``), so no directory scan or in-process split is
    performed.

    Raises:
        ValueError: if ``FLAGS.tfrecord_filename`` or ``FLAGS.dataset_dir``
            is empty.
    """
    # ---- flag checks ---------------------------------------------------
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    # Existing shards are never re-created.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        # print() form: the Python 2 print statement is a syntax error on
        # Python 3, which other variants in this file already target.
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    # Image/label pairs for each split come straight from the list files.
    # (A large block of commented-out directory-scan/split code was removed.)
    train_image, train_label = _get_image_label(
        read_label_file(FLAGS.dataset_dir, FLAGS.train_list))
    logging.debug("train_image: %s, train_label: %s", train_image[:10],
                  train_label[:10])
    val_image, val_label = _get_image_label(
        read_label_file(FLAGS.dataset_dir, FLAGS.val_list))
    logging.debug("val_image: %s, val_label: %s", val_image[:10],
                  val_label[:10])

    # Seed kept for reproducibility of any randomness inside the
    # conversion helpers -- presumably used downstream; TODO confirm.
    random.seed(FLAGS.random_seed)

    # Convert the training and validation sets.
    _convert_dataset('train', train_image, train_label,
                     dataset_dir=FLAGS.dataset_dir,
                     write_dir=FLAGS.write_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation', val_image, val_label,
                     dataset_dir=FLAGS.dataset_dir,
                     write_dir=FLAGS.write_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main():
    """MPI-parallel TFRecord conversion driver.

    Rank 0 plans the shard/file mappings via ``_convert_dataset``; the
    mappings are broadcast so every rank writes its assigned shard with
    ``_write_dataset``. Every rank then (re-)writes the labels file.
    """
    # -------- flag checks --------
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    # Skip conversion entirely when the shards already exist.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    # Photo paths and sorted class names from the dataset subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    # class name -> integer id for label encoding.
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Size of the validation split.
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    # Reproducible shuffle, then slice off the validation prefix.
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    if comm.rank == 0:
        # Only rank 0 computes the mappings; each entry is unpacked below
        # as (rank, outfile, filename).
        train_file_mappings = _convert_dataset(
            'train', training_filenames, class_names_to_ids,
            dataset_dir=FLAGS.dataset_dir,
            tfrecord_filename=FLAGS.tfrecord_filename,
            _NUM_SHARDS=FLAGS.num_shards)
        val_file_mappings = _convert_dataset(
            'validation', validation_filenames, class_names_to_ids,
            dataset_dir=FLAGS.dataset_dir,
            tfrecord_filename=FLAGS.tfrecord_filename,
            _NUM_SHARDS=FLAGS.num_shards)
    else:
        # Non-root ranks receive the mappings via bcast below.
        train_file_mappings = None
        val_file_mappings = None

    # Every rank receives the training mapping from rank 0.
    train_file_mappings = comm.bcast(train_file_mappings, root=0)
    rank_files = []
    outfile = None
    # Collect the training files assigned to this rank.
    for rank, o, file in train_file_mappings:
        if comm.rank == rank:
            rank_files.append(file)
            outfile = o
    print("rank: %d, %s, outfile = %s" % (comm.rank, gethostname(), outfile))
    # Write this rank's training shard, if it was assigned one.
    if outfile != None:
        _write_dataset(rank_files, outfile, class_names_to_ids)

    # Same sequence for the validation mapping.
    val_file_mappings = comm.bcast(val_file_mappings, root=0)
    rank_files = []
    outfile = None
    for rank, o, file in val_file_mappings:
        # NOTE(review): the validation assignment matches on
        # `rank % FLAGS.num_shards == comm.rank`, unlike the training
        # loop's direct `comm.rank == rank` -- confirm this asymmetry is
        # intentional.
        if rank % FLAGS.num_shards == comm.rank:
            rank_files.append(file)
            outfile = o
    print("val rank: %d, %s, outfile = %s" % (comm.rank, gethostname(), outfile))
    # Write this rank's validation shard, if any.
    if outfile != None:
        _write_dataset(rank_files, outfile, class_names_to_ids)

    # Finally, write the labels file (id -> class name); note every rank
    # performs this write to FLAGS.dataset_dir.
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('Finished converting the %s dataset for %s rank %d' % (FLAGS.tfrecord_filename, gethostname(), comm.rank))