def main():
    """Convert an image directory into train/validation TFRecord shards."""
    # ----- argument validation -----
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    # Skip the (expensive) conversion when the shards are already on disk.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    # ----- end of checks -----

    # Photo paths plus the sorted class names found in the subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    # class name -> integer label used inside the TFRecords
    class_names_to_ids = {name: idx for idx, name in enumerate(class_names)}

    # Deterministically shuffle, then carve off the validation slice.
    split_at = int(FLAGS.validation_size * len(photo_filenames))
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    validation_filenames = photo_filenames[:split_at]
    training_filenames = photo_filenames[split_at:]

    # Convert the two splits.
    for split_name, filenames in (('train', training_filenames),
                                  ('validation', validation_filenames)):
        _convert_dataset(split_name, filenames, class_names_to_ids,
                         dataset_dir=FLAGS.dataset_dir,
                         tfrecord_filename=FLAGS.tfrecord_filename,
                         _NUM_SHARDS=FLAGS.num_shards)

    # Labels file maps id -> human-readable name (underscores become spaces).
    labels_to_class_names = {idx: name.replace("_", " ")
                             for idx, name in enumerate(class_names)}
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main():
    """Build train/validation TFRecord shards under FLAGS.tfrecord_dir.

    Fixes relative to the original:
      * Python 2 ``print`` statements replaced with ``print()`` calls — the
        statement form is a SyntaxError under Python 3, while the call form
        with one argument behaves identically on Python 2.
      * The tfrecord_dir check carried a copy-pasted comment that referred
        to the dataset directory.
    """
    # ---------------- argument checks ----------------
    if not FLAGS.tfrecord_filename:
        raise ValueError('tfrecord_filename is empty. Please state a tfrecord_filename argument.')
    if not FLAGS.dataset_dir:
        raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')
    # Check if there is a TFRecord output directory entered
    if not FLAGS.tfrecord_dir:
        raise ValueError('tfrecord_dir is empty. Please state a tfrecord_dir argument.')
    if not os.path.exists(FLAGS.tfrecord_dir):
        os.makedirs(FLAGS.tfrecord_dir)

    # If the TFRecord files already exist, exit without re-creating them.
    if _dataset_exists(dataset_dir=FLAGS.tfrecord_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    # ------------- end of argument checks -------------

    # Photo paths plus the sorted class ids parsed from the subdirectories.
    photo_filenames, class_ids = _get_filenames_and_classes(FLAGS.dataset_dir)

    # Map each class id to a serial integer used for prediction labels.
    class_ids_to_serial = dict(zip(class_ids, range(len(class_ids))))

    # Write the labels file (serial -> class id).
    serial_to_class_ids = dict(zip(range(len(class_ids)), class_ids))
    write_label_file(serial_to_class_ids, FLAGS.dataset_dir)

    # Number of validation examples (validation_size may arrive as a string).
    num_validation = int(float(FLAGS.validation_size) * len(photo_filenames))

    # Deterministic shuffle, then split into train / validation.
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # Convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_ids_to_serial,
                     dataset_dir=FLAGS.tfrecord_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards,
                     simulate=FLAGS.simulate)
    _convert_dataset('validation', validation_filenames, class_ids_to_serial,
                     dataset_dir=FLAGS.tfrecord_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards,
                     simulate=FLAGS.simulate)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main(argv):
    """Create train/validation TFRecords and the labels file for a dataset."""
    # Refuse to run without the two required flags.
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            "tfrecord_filename is empty. Please state a tfrecord_filename argument."
        )
    if not FLAGS.dataset_dir:
        raise ValueError(
            "dataset_dir is empty. Please state a dataset_dir argument.")

    # Nothing to do when the shards are already present (silent exit).
    if _dataset_exists(
            dataset_dir=FLAGS.dataset_dir,
            _NUM_SHARDS=FLAGS.num_shards,
            output_filename=FLAGS.tfrecord_filename,
    ):
        return None

    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    # class name -> integer id embedded in the records
    class_names_to_ids = {name: idx for idx, name in enumerate(class_names)}

    # Reproducible shuffle before carving off the validation slice.
    cut = int(FLAGS.validation_size * len(photo_filenames))
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    validation_filenames, training_filenames = (photo_filenames[:cut],
                                                photo_filenames[cut:])

    for split, names in (("train", training_filenames),
                         ("validation", validation_filenames)):
        _convert_dataset(
            split,
            names,
            class_names_to_ids,
            dataset_dir=FLAGS.dataset_dir,
            tfrecord_filename=FLAGS.tfrecord_filename,
            _NUM_SHARDS=FLAGS.num_shards,
        )

    # Finally, write the labels file (id -> class name).
    labels_to_class_names = {idx: name for idx, name in enumerate(class_names)}
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)
def main():
    """Shard a labelled image directory into train/validation TFRecords."""
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    # A dataset directory is mandatory as well.
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    # Bail out early when the shards already exist on disk.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)
    # Assign each class name a stable integer id.
    class_names_to_ids = {name: i for i, name in enumerate(class_names)}

    total = len(photo_filenames)
    num_validation = int(FLAGS.validation_size * total)

    # Deterministic shuffle, then split off the validation head.
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    validation_filenames = photo_filenames[:num_validation]
    training_filenames = photo_filenames[num_validation:]

    for split_name, split_files in (('train', training_filenames),
                                    ('validation', validation_filenames)):
        _convert_dataset(split_name, split_files, class_names_to_ids,
                         dataset_dir=FLAGS.dataset_dir,
                         tfrecord_filename=FLAGS.tfrecord_filename,
                         _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file and a short dataset summary.
    labels_to_class_names = {i: name for i, name in enumerate(class_names)}
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)
    write_data_summary(num_validation, total, FLAGS.dataset_dir)
def main():
    """Convert a binary (normal / not-normal) dataset into TFRecords.

    NOTE(review): unlike the sibling converters this one performs no shuffle
    before splitting, so the validation slice is simply the first
    `validation_size` fraction of whatever order `_get_filenames_and_classes`
    returns — confirm that ordering is acceptable.
    """
    # ----- checks -----
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    # Skip the conversion when the output shards already exist.
    if _dataset_exists(dataset_file=FLAGS.dataset_file,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    # ----- end of checks -----

    # Photo paths and the per-photo class labels.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_file)
    # Binary label: "normal" -> 1, anything else -> 0.
    class_ids = [int(label == "normal") for label in class_names]

    # Leading slice is validation, the rest is training; labels are sliced
    # identically so each filename keeps its label.
    num_validation = int(FLAGS.validation_size * len(photo_filenames))
    validation_filenames = photo_filenames[:num_validation]
    training_filenames = photo_filenames[num_validation:]
    validation_labels = class_ids[:num_validation]
    training_labels = class_ids[num_validation:]

    # Convert the two splits.
    _convert_dataset('train', training_filenames, training_labels,
                     dataset_file=FLAGS.dataset_file,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation', validation_filenames, validation_labels,
                     dataset_file=FLAGS.dataset_file,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main():
    """MPI entry point: rank 0 coordinates, every other rank is a worker."""
    # ----- checks -----
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    # Shards already on disk -> nothing to do.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    # ----- end of checks -----

    # MPI topology for this process.
    rank = MPI.COMM_WORLD.Get_rank()
    name = MPI.Get_processor_name()
    size = MPI.COMM_WORLD.Get_size()

    # Photo paths plus sorted class names from the subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)
    # class name -> integer id
    class_names_to_ids = {cls: i for i, cls in enumerate(class_names)}

    if rank == 0:
        # The master hands out work items and gathers the results.
        all_dat = master()
    else:
        # Workers consume items until the master tells them to stop.
        # (A per-worker TF session configuration was prototyped here and
        # remains disabled.)
        worker(class_names_to_ids)
def main():
    """Shard the dataset into TFRecords and dump the validation list to CSV."""
    # ----- checks -----
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    # ----- end of checks -----

    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)
    print('{}; {}'.format(len(photo_filenames), len(class_names)))

    # class name -> integer id for the records
    class_names_to_ids = {c: i for i, c in enumerate(class_names)}

    # Reproducible shuffle, then slice off the validation set.
    num_validation = int(FLAGS.validation_size * len(photo_filenames))
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    validation_filenames = photo_filenames[:num_validation]
    training_filenames = photo_filenames[num_validation:]

    # Persist the validation image list as CSV.
    # NOTE(review): the header names a single column but each row carries two
    # fields (filename, class) — confirm downstream readers expect this.
    with open("./data/validate.csv", 'w') as f:
        f.write('IMAGE_NAME\n')
        for path in validation_filenames:
            _, leaf = os.path.split(path)
            parent_class = os.path.basename(os.path.dirname(path))
            f.write(str(leaf) + ',' + str(parent_class) + '\n')

    # Convert the two splits.
    for split, names in (('train', training_filenames),
                         ('validation', validation_filenames)):
        _convert_dataset(split, names, class_names_to_ids,
                         dataset_dir=FLAGS.dataset_dir,
                         tfrecord_filename=FLAGS.tfrecord_filename,
                         _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file (id -> class name).
    labels_to_class_names = {i: c for i, c in enumerate(class_names)}
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main():
    """Build train/validation TFRecords; the file lister is output-aware."""
    # ----- checks -----
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    # ----- end of checks -----

    # Photo paths plus sorted class names; this variant also passes the
    # output filename so the lister can take it into account.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir, output_filename=FLAGS.tfrecord_filename)

    # Some datasets (e.g. UECFOOD256) ship a "category.txt" file mapping the
    # numeric class directories to human-readable names; remapping support
    # was prototyped here but is currently disabled.

    # class name -> integer id for predictions later
    class_names_to_ids = {c: i for i, c in enumerate(class_names)}

    # Deterministic shuffle, then split off the validation slice.
    num_validation = int(FLAGS.validation_size * len(photo_filenames))
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    validation_filenames = photo_filenames[:num_validation]
    training_filenames = photo_filenames[num_validation:]

    # Convert both splits.
    for split, names in (('train', training_filenames),
                         ('validation', validation_filenames)):
        _convert_dataset(split, names, class_names_to_ids,
                         dataset_dir=FLAGS.dataset_dir,
                         tfrecord_filename=FLAGS.tfrecord_filename,
                         _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file (id -> class name).
    labels_to_class_names = {i: c for i, c in enumerate(class_names)}
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main():
    """Convert pre-split train/validation directories into TFRecord shards.

    Bug fix relative to the original: the validation set used to get its own
    independently derived class-name -> id mapping. When the two directories
    did not contain the identical, identically sorted set of classes, the
    same class name could end up with *different* integer labels in the
    training and validation records. The training mapping is now the single
    source of truth, and an unknown validation class raises instead of being
    silently mislabelled.
    """
    # ----- checks -----
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    if not FLAGS.training_dataset_dir:
        raise ValueError(
            'training_dataset_dir is empty. Please state a training_dataset_dir argument.'
        )
    if not FLAGS.validation_dataset_dir:
        raise ValueError(
            'validation_dataset_dir is empty. Please state a validation_dataset_dir argument.'
        )
    # Exit without re-creating existing shard files.
    if _dataset_exists(dataset_dir=FLAGS.training_dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    # ----- end of checks -----

    # Photo paths and sorted class names for each split.
    training_filenames, training_class_names = _get_filenames_and_classes(
        FLAGS.training_dataset_dir)
    validation_filenames, validation_class_names = _get_filenames_and_classes(
        FLAGS.validation_dataset_dir)

    # Single authoritative class-name -> integer-id mapping (from training).
    class_names_to_ids = dict(
        zip(training_class_names, range(len(training_class_names))))

    # Every validation class must already be known to the training mapping.
    unknown = [c for c in validation_class_names
               if c not in class_names_to_ids]
    if unknown:
        raise ValueError(
            'validation classes missing from training set: %s' % unknown)

    # Convert both splits with the shared mapping.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.training_dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.validation_dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    # Labels files (class directory names are numeric in this dataset, hence
    # the int() keys).
    labels_to_class_names = dict(
        zip(list(map(int, training_class_names)), training_class_names))
    write_label_file(labels_to_class_names, FLAGS.training_dataset_dir)
    labels_to_class_names = dict(
        zip(list(map(int, validation_class_names)), validation_class_names))
    write_label_file(labels_to_class_names, FLAGS.validation_dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def main():
    """Create TFRecords with a stratified (per-class) train/validation split."""
    # ----- checks -----
    # The tfrecord_filename check is intentionally disabled in this variant,
    # although the flag is still used below.
    # if not FLAGS.tfrecord_filename:
    #     raise ValueError('tfrecord_filename is empty. Please state a tfrecord_filename argument.')
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    # ----- end of checks -----

    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)
    # class name -> integer id
    class_names_to_ids = {c: i for i, c in enumerate(class_names)}

    # Kept for parity with the other converters; the split below is per-class.
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    training_filenames = []
    validation_filenames = []
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)

    # Stratified split: take validation_size of each class separately.
    # NOTE(review): matching with `startswith(cls)` assumes the returned
    # paths are relative and begin with the class-directory name — confirm
    # against _get_filenames_and_classes.
    for cls in class_names:
        cls_files = [p for p in photo_filenames if p.startswith(cls)]
        n_val = int(len(cls_files) * FLAGS.validation_size)
        training_filenames.extend(cls_files[n_val:])
        validation_filenames.extend(cls_files[:n_val])
    print("Training files size", len(training_filenames))
    print("Validation files size", len(validation_filenames))

    # (A global, non-stratified split was used here previously.)

    # Convert the two splits.
    for split, names in (('train', training_filenames),
                         ('validation', validation_filenames)):
        _convert_dataset(split, names, class_names_to_ids,
                         dataset_dir=FLAGS.dataset_dir,
                         tfrecord_filename=FLAGS.tfrecord_filename,
                         _NUM_SHARDS=FLAGS.num_shards,
                         output_dir=FLAGS.output_dir)

    # Finally, write the labels file (id -> class name).
    labels_to_class_names = {i: c for i, c in enumerate(class_names)}
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
# NOTE(review): truncated fragment — this span begins mid-function (no
# enclosing `def` is visible) and ends mid-call at `_convert_dataset('train',`;
# the rest of the call and the function tail are missing from this chunk.
# Left byte-identical pending recovery of the complete definition.
#Check if there is a dataset directory entered if not FLAGS.dataset_dir: raise ValueError( 'dataset_dir is empty. Please state a dataset_dir argument.') #If the TFRecord files already exist in the directory, then exit without creating the files again if _dataset_exists(dataset_dir=FLAGS.dataset_dir, _NUM_SHARDS=FLAGS.num_shards, output_filename=FLAGS.tfrecord_filename): print('Dataset files already exist. Exiting without re-creating them.') #==============================================================END OF CHECKS=================================================================== #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories. photo_filenames, class_names = _get_filenames_and_classes(FLAGS.dataset_dir) #Refer each of the class name to a specific integer number for predictions later class_names_to_ids = dict(zip(class_names, range(len(class_names)))) #Find the number of validation examples we need num_validation = int(FLAGS.validation_size * len(photo_filenames)) # Divide the training datasets into train and test: random.seed(FLAGS.random_seed) random.shuffle(photo_filenames) training_filenames = photo_filenames[num_validation:] validation_filenames = photo_filenames[:num_validation] # First, convert the training and validation sets. _convert_dataset('train',
def main():
    """Shard a dataset into TFRecords and record a <fileid, filename> mapping."""
    # A dataset directory is required.
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    # Already converted -> nothing to do.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    # Serial ids (1-based) paired positionally with the shuffled filename
    # list below; that pairing is what write_image_ids_file persists.
    photo_fileids = list(range(1, len(photo_filenames) + 1))

    # class name -> integer label
    class_names_to_ids = {c: i for i, c in enumerate(class_names)}

    num_validation = int(FLAGS.validation_size * len(photo_filenames))
    print(f'\nrandom seed partition = {FLAGS.random_seed}')

    # Deterministic shuffle of the filenames, then split both lists the
    # same way so each (id, filename) pair stays aligned.
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]
    training_fileids = photo_fileids[num_validation:]
    validation_fileids = photo_fileids[:num_validation]

    # Convert the training split and persist its id mapping.
    _convert_dataset('train', training_filenames, training_fileids,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     _NUM_SHARDS=FLAGS.num_shards)
    write_image_ids_file(training_filenames, training_fileids,
                         FLAGS.dataset_dir, IMAGE_IDS_FILENAME)

    # The validation split is optional (validation_size may be 0).
    if num_validation > 0:
        _convert_dataset('validation', validation_filenames,
                         validation_fileids, class_names_to_ids,
                         dataset_dir=FLAGS.dataset_dir,
                         _NUM_SHARDS=FLAGS.num_shards)
        write_image_ids_file(validation_filenames, validation_fileids,
                             FLAGS.dataset_dir,
                             IMAGE_IDS_VALIDATION_TMP_FILENAME)

    # Finally, write the labels file (id -> class name).
    labels_to_class_names = {i: c for i, c in enumerate(class_names)}
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the dataset!')
def main():
    """Build TFRecords from pre-defined train/test split listing files.

    Fixes relative to the original:
      * Python 2 ``print`` statements converted to ``print()`` calls (the
        statement form is a SyntaxError on Python 3; the call form with one
        argument behaves identically on Python 2).
      * A line whose class id was neither 0 nor 1 used to silently reuse
        ``full_name`` from the previous iteration (or raise NameError on the
        first line); it now raises ValueError.
      * The two identical parsing loops are factored into one helper, and
        the listing files are closed via context managers.
    """

    def _read_split(list_path, pos_dir, neg_dir):
        """Return full photo paths for a 'name classid' listing file."""
        paths = []
        with open(list_path, 'r') as listing:
            for line in listing:
                name = line.split(" ")[0]
                classid = int(line.split(" ")[1])
                if classid == 1:
                    paths.append(pos_dir + name)
                elif classid == 0:
                    paths.append(neg_dir + name)
                else:
                    raise ValueError(
                        'unexpected class id %d in %s' % (classid, list_path))
        return paths

    # ----- checks -----
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    # Shards already on disk -> nothing to do.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    # ----- end of checks -----

    # The listing is still parsed for side-effect parity; the class ids are
    # fixed for this binary problem.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)
    class_names_to_ids = {'pos': 1, 'neg': 0}

    pos_dir = root_dir + "training_clips/split/photos/pos/"
    neg_dir = root_dir + "training_clips/split/photos/neg/"

    # Train/validation membership comes from the split listing files.
    training_filenames = _read_split(root_dir + "train_llbl.txt",
                                     pos_dir, neg_dir)
    validation_filenames = _read_split(root_dir + "test_llbl.txt",
                                       pos_dir, neg_dir)

    # Deterministic shuffle within each split.
    random.seed(FLAGS.random_seed)
    random.shuffle(training_filenames)
    random.shuffle(validation_filenames)

    # Convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
def master():
    """MPI master: runs the conversion, then farms work items out to workers.

    Returns the list of results received back from the worker ranks.

    NOTE(review): assumes module-level ``Work``, ``WORKTAG``, ``DIETAG`` and
    mpi4py (``MPI``) are defined elsewhere in this file. The shape of the
    values returned by ``_convert_dataset`` cannot be confirmed from here —
    the disabled code below unpacks them as (rank, outfile, file) triples,
    while ``.update()`` treats them as dicts; verify against
    ``_convert_dataset``.
    """
    # Cloud credentials for any GCS access performed downstream.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '/home/agravat/key.json'

    # Get a list of photo_filenames and sorted class names from the
    # dataset subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)
    """
    cloud_filenames, cloud_class_names = get_cloud_filenames_and_classes("gs://agravat-demo/images")
    for f in cloud_filenames:
        print(f)
    """
    # Refer each class name to a specific integer for predictions later.
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Number of validation examples we need.
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    # Deterministic shuffle, then train/validation split.
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # Convert both splits; the returned mappings drive the worker loop below.
    train_file_mappings = _convert_dataset(
        'train', training_filenames, class_names_to_ids,
        dataset_dir=FLAGS.dataset_dir,
        tfrecord_filename=FLAGS.tfrecord_filename,
        _NUM_SHARDS=FLAGS.num_shards)
    val_file_mappings = _convert_dataset(
        'validation', validation_filenames, class_names_to_ids,
        dataset_dir=FLAGS.dataset_dir,
        tfrecord_filename=FLAGS.tfrecord_filename,
        _NUM_SHARDS=FLAGS.num_shards)
    file_mappings = train_file_mappings
    file_mappings.update(val_file_mappings)

    rank_files = []
    outfile = "out.tfrecord"
    """
    for rank,o,file in train_file_mappings:
        if comm.rank == rank:
            rank_files.append(file)
            outfile = o
    #print("rank %d len files = %d, outfile = %s\n" % (comm.rank, len(rank_files), outfile))
    """
    #print("rank: %d, %s, outfile = %s" % (comm.rank, gethostname(), outfile))

    # Finally, write the labels file.
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    # NOTE(review): the continuation line of this disabled print arrived
    # uncommented (and with unbalanced parentheses) in the mangled source;
    # it must originally have been part of the comment.
    #print('Finished converting the %s dataset for %s rank %d' %
    #      (FLAGS.tfrecord_filename, gethostname(), comm.rank))

    all_data = []
    size = MPI.COMM_WORLD.Get_size()
    current_work = Work(train_file_mappings)
    comm = MPI.COMM_WORLD
    status = MPI.Status()

    # This is the loop where the master distributes all the work based on
    # the number of workers that are available: seed each worker with one
    # initial item.
    for i in range(1, size):
        # the master gets the next element in the list
        anext = current_work.get_next_item()
        if not anext:
            break
        # master sends the element to a worker
        comm.send(obj=anext, dest=i, tag=WORKTAG)

    # Fallback when there are more work items than workers: hand out the
    # remainder as results come back.
    while 1:
        # get the next work item but we break if there are none
        anext = current_work.get_next_item()
        if not anext:
            break
        # get the result from any worker
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG,
                         status=status)
        #print("more work rank %d, host %s data %s" % (comm.rank, gethostname(), data))
        # add the processed result to the list of results
        all_data.append(data)
        # send another work item to the worker who completed the last task
        print("spillover %d %s" % (comm.rank, gethostname()))
        comm.send(obj=anext, dest=status.Get_source(), tag=WORKTAG)

    # Get the outstanding results back from the workers.
    for i in range(1, size):
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG)
        print("recieved from %d" % (i))
        all_data.append(data)

    # End the tasks: tell every worker to shut down.
    for i in range(1, size):
        comm.send(obj=None, dest=i, tag=DIETAG)

    return all_data
def main():
    """MPI-parallel TFRecord conversion: rank 0 plans the shard work, then
    every rank writes the shard files assigned to it via ``_write_dataset``.

    NOTE(review): assumes a module-level mpi4py communicator ``comm`` and a
    ``_write_dataset`` helper defined elsewhere in this file.
    """
    # ----- checks -----
    # Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    # Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    # If the TFRecord files already exist, exit without creating them again.
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    # ----- end of checks -----

    # Photo paths plus a sorted list of class names from the subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    # Refer each class name to a specific integer for predictions later.
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Number of validation examples we need.
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    # Deterministic shuffle, then split into train and validation.
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    if comm.rank == 0:
        # Rank 0 plans both splits; the returned mappings are unpacked as
        # (rank, outfile, file) triples below — confirm against
        # _convert_dataset.
        train_file_mappings = _convert_dataset(
            'train', training_filenames, class_names_to_ids,
            dataset_dir=FLAGS.dataset_dir,
            tfrecord_filename=FLAGS.tfrecord_filename,
            _NUM_SHARDS=FLAGS.num_shards)
        val_file_mappings = _convert_dataset(
            'validation', validation_filenames, class_names_to_ids,
            dataset_dir=FLAGS.dataset_dir,
            tfrecord_filename=FLAGS.tfrecord_filename,
            _NUM_SHARDS=FLAGS.num_shards)
        #file_mappings.extend(val_file_mappings)
    else:
        train_file_mappings = None
        val_file_mappings = None

    # Every rank receives the full training work list...
    train_file_mappings = comm.bcast(train_file_mappings, root=0)

    # ...and keeps only the files assigned to its own rank.
    rank_files = []
    outfile = None
    for rank, o, file in train_file_mappings:
        if comm.rank == rank:
            rank_files.append(file)
            outfile = o
    #print("rank %d len files = %d, outfile = %s\n" % (comm.rank, len(rank_files), outfile))
    print("rank: %d, %s, outfile = %s" % (comm.rank, gethostname(), outfile))

    # Write this rank's training shard (skip if nothing was assigned).
    if outfile != None:
        _write_dataset(rank_files, outfile, class_names_to_ids)

    # Same broadcast/filter/write cycle for the validation split.
    val_file_mappings = comm.bcast(val_file_mappings, root=0)
    rank_files = []
    outfile = None
    # NOTE(review): the selection condition here differs from the training
    # loop above (`rank % FLAGS.num_shards == comm.rank` vs
    # `comm.rank == rank`) — confirm this asymmetry is intentional.
    for rank, o, file in val_file_mappings:
        #if comm.rank == 29:
        #print("rank %d, shards = %d, comm rank = %d, mod = %d" % (rank, FLAGS.num_shards, comm.rank, rank % FLAGS.num_shards))
        if rank % FLAGS.num_shards == comm.rank:
            rank_files.append(file)
            outfile = o
    #print("rannk: %d, file = %s" % (
    #pprint("rank: %s, %d, %s, %s" % (comm.rank, a,b,c))
    print("val rank: %d, %s, outfile = %s" % (comm.rank, gethostname(),
                                              outfile))

    # Write this rank's validation shard.
    if outfile != None:
        _write_dataset(rank_files, outfile, class_names_to_ids)

    # Finally, write the labels file (id -> class name).
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('Finished converting the %s dataset for %s rank %d' %
          (FLAGS.tfrecord_filename, gethostname(), comm.rank))