def load_data(base_dir, min_samples, images_per_class=None):
    """Load shuffled train/dev/test file DataFrames rooted at *base_dir*.

    Parameters
    ----------
    base_dir : str
        Directory containing 'train', 'dev', and 'test' subdirectories.
    min_samples : int
        Classes with fewer than this many training samples are pruned.
    images_per_class : int, optional
        If given, the training set is resampled (with replacement) so
        every surviving class has exactly this many rows.

    Returns
    -------
    (train, dev, test) : tuple of pandas.DataFrame
        Dev and test are restricted to the classes present in train and
        returned without the temporary 'keep' column.

    Raises
    ------
    SystemExit
        If images_per_class exceeds the size of the majority class.
    """
    # BUG FIX: use the function parameters; the original read the
    # module-level `args.base_dir` / `args.min_samples`, which broke any
    # caller passing different values.
    train = build_files_dataframe(os.path.join(base_dir, 'train'))
    print(train.head())
    train = prune_file_list(train, label_col='label',
                            min_samples=min_samples)
    train = train.sample(frac=1).reset_index(drop=True)

    # Classes that survived pruning; dev/test are filtered to these.
    classes = np.unique(train['label'])

    # We either get None or an integer specifying the target number of
    # samples per class.
    if images_per_class:
        counts = {c: len(train[train['label'] == c]) for c in classes}
        if images_per_class > max(counts.values()):
            # Fatal: raising SystemExit preserves the original
            # print-and-exit() semantics but exits non-zero with the
            # message on stderr.
            raise SystemExit(
                "[FATAL] The number of images per class requested is larger than "
                " the number of samples in the majority class. This is a fatal error!"
            )
        # BUG FIX: resample each class subset separately. The original
        # resampled the entire `train` frame once per class, producing
        # random class proportions instead of a balanced set.
        train = pd.concat(
            resample(train[train['label'] == c],
                     replace=True,
                     n_samples=images_per_class)
            for c in classes
        )

    # Load the dev set, shuffle it, and keep only rows whose label
    # survived training-set pruning.
    dev = build_files_dataframe(os.path.join(base_dir, 'dev'))
    print(dev.head())
    dev = dev.sample(frac=1).reset_index(drop=True)
    return_cols = list(dev.columns)
    dev['keep'] = dev['label'].apply(lambda x: x in classes)
    dev = dev[dev['keep'] == True]

    # Same treatment for the test set.
    test = build_files_dataframe(os.path.join(base_dir, 'test'))
    print(test.head())
    test = test.sample(frac=1).reset_index(drop=True)
    return_cols = list(test.columns)
    test['keep'] = test['label'].apply(lambda x: x in classes)
    test = test[test['keep'] == True]

    # Selecting return_cols drops the temporary 'keep' column.
    return train, dev[return_cols], test[return_cols]
def load_dataframes(data_dir, min_samples):
    """Load shuffled train and dev file DataFrames from *data_dir*.

    Parameters
    ----------
    data_dir : str
        Directory containing 'train' and 'dev' subdirectories.
    min_samples : int
        Classes with fewer than this many training samples are pruned.

    Returns
    -------
    (train, dev) : tuple of pandas.DataFrame
        Dev is restricted to the classes present in train.
    """
    train = build_files_dataframe(os.path.join(data_dir, 'train'))
    train = prune_file_list(train, 'label', min_samples)

    dev = build_files_dataframe(os.path.join(data_dir, 'dev'))
    dev_cols = list(dev.columns)

    # Restrict dev to the label set that survived pruning.
    classes = np.unique(train['label'])
    dev['keep'] = dev['label'].apply(lambda x: x in classes)
    dev = dev[dev['keep'] == True]

    # Shuffle both splits.
    train = train.sample(frac=1).reset_index(drop=True)
    dev = dev.sample(frac=1).reset_index(drop=True)

    # BUG FIX: `dev_cols` was computed but never used, so the temporary
    # 'keep' column leaked into the returned frame. Select the original
    # columns on return (matching load_data's behavior).
    return train, dev[dev_cols]
    # Tail of the CLI argument parser: the enclosing `def` (presumably
    # get_args, with `ap = argparse.ArgumentParser()` and the --base_dir
    # option referenced below) begins before this chunk — not visible here.
    ap.add_argument('--backbone', required=True, type=str)
    ap.add_argument('--pooling', required=True, type=str)
    ap.add_argument('--output_dir', required=True, type=str)
    ap.add_argument('--min_samples', required=True, type=int)
    ap.add_argument('--cores', required=True, type=int)
    ap.add_argument('--save_features', action='store_true')
    return ap.parse_args()


if __name__ == "__main__":
    args = get_args()

    # Load images and remove the classes with
    # too few examples.
    train = build_files_dataframe(os.path.join(args.base_dir, 'train'))
    train = prune_file_list(data=train, label_col='label',
                            min_samples=args.min_samples)
    n_classes = train['label'].nunique()
    print("We have {} classes.".format(n_classes))

    # Setup output directory (created recursively if missing).
    create_directory(args.output_dir, recursive=True)

    # Build the model and import the correct pre-processing
    # function. Each model uses a different function.
    # Maybe they're the same under the hood because
    # they are all trained with imagenet (something to look
    # into).
    # NOTE(review): the script appears to continue past this chunk —
    # `model` and `preprocess_input` are presumably used below.
    model, preprocess_input = model_factory(args.backbone, args.pooling)