def _create_dataset(uri, batch_size, shuffle, no_image_normalization,
                    cache_dir, overwrite_cache, create_cache_explicitly,
                    prepare_data_iterator):
    """Build a lightweight dataset descriptor with a lazy data-iterator factory.

    Args:
        uri: Location of the CSV dataset; stored on the result as ``uri`` and
            forwarded to ``data_iterator_csv_dataset``.
        batch_size: Batch size forwarded to the iterator factories.
        shuffle: Shuffle flag forwarded to the iterator factories.
        no_image_normalization: When truthy, ``dataset.normalize`` is False.
        cache_dir: Directory used for the file cache. ``''`` is treated the
            same as ``None`` (no cache directory).
        overwrite_cache: When truthy, an existing cache is not reused.
        create_cache_explicitly: When truthy (and ``cache_dir`` is set), the
            cache is created eagerly here by iterating the CSV dataset once.
        prepare_data_iterator: When falsy, no iterator factory is prepared and
            ``dataset.data_iterator`` is ``None``.

    Returns:
        An object with attributes ``uri``, ``normalize`` and ``data_iterator``
        (a zero-argument callable creating the iterator, or ``None``).

    Raises:
        OSError: If the cache directory cannot be created.
    """
    class Dataset:
        pass

    def _ensure_dir(path):
        # Create the directory including missing parents.  ``exist_ok`` is
        # avoided to stay Python 2 compatible; only the "already exists"
        # case is tolerated, any other failure is re-raised.
        try:
            os.makedirs(path)
        except OSError:
            if not os.path.isdir(path):
                raise

    dataset = Dataset()
    dataset.uri = uri
    dataset.normalize = not no_image_normalization

    if prepare_data_iterator:
        if cache_dir == '':
            cache_dir = None

        if cache_dir and create_cache_explicitly:
            # (Re)build the cache when it is missing, empty, or an overwrite
            # was requested.
            if (not os.path.exists(cache_dir)
                    or not os.listdir(cache_dir)
                    or overwrite_cache):
                _ensure_dir(cache_dir)
                logger.log(99, 'Creating cache data for "' + uri + '"')
                # Iterating the CSV dataset once with a cache_dir set is what
                # populates the cache; the data itself is discarded.
                with data_iterator_csv_dataset(uri, batch_size, shuffle,
                                               normalize=False,
                                               cache_dir=cache_dir) as di:
                    index = 0
                    while index < di.size:
                        progress('', (1.0 * di.position) / di.size)
                        di.next()
                        index += batch_size
            # dataset.normalize is read when the factory is called, not here.
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, normalize=dataset.normalize))
        elif (not cache_dir or overwrite_cache
                or not os.path.exists(cache_dir)
                or not os.listdir(cache_dir)):
            # No usable cache: read from the CSV (filling cache_dir if given).
            if cache_dir:
                _ensure_dir(cache_dir)
            dataset.data_iterator = (lambda: data_iterator_csv_dataset(
                uri, batch_size, shuffle, normalize=dataset.normalize,
                cache_dir=cache_dir))
        else:
            # A non-empty cache directory already exists: read from it.
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, normalize=dataset.normalize))
    else:
        dataset.data_iterator = None
    return dataset
def _create_dataset(uri, batch_size, shuffle, no_image_normalization,
                    cache_dir, overwrite_cache, create_cache_explicitly,
                    prepare_data_iterator):
    """Build a lightweight dataset descriptor with a lazy data-iterator factory.

    Args:
        uri: Location of the CSV dataset; stored on the result as ``uri`` and
            forwarded to ``data_iterator_csv_dataset``.
        batch_size: Batch size forwarded to the iterator factories.
        shuffle: Shuffle flag forwarded to the iterator factories.
        no_image_normalization: When truthy, ``dataset.normalize`` is False.
        cache_dir: Directory used for the file cache. ``''`` is treated the
            same as ``None`` (no cache directory).
        overwrite_cache: When truthy, an existing cache is not reused.
        create_cache_explicitly: When truthy (and ``cache_dir`` is set), the
            cache is created eagerly here by iterating the CSV dataset once.
        prepare_data_iterator: When falsy, no iterator factory is prepared and
            ``dataset.data_iterator`` is ``None``.

    Returns:
        An object with attributes ``uri``, ``normalize`` and ``data_iterator``
        (a zero-argument callable creating the iterator, or ``None``).

    Raises:
        OSError: If the cache directory cannot be created.
    """
    class Dataset:
        pass

    def _ensure_dir(path):
        # Create the directory including missing parents.  ``exist_ok`` is
        # avoided to stay Python 2 compatible; only the "already exists"
        # case is tolerated, any other failure is re-raised.
        try:
            os.makedirs(path)
        except OSError:
            if not os.path.isdir(path):
                raise

    dataset = Dataset()
    dataset.uri = uri
    dataset.normalize = not no_image_normalization

    if prepare_data_iterator:
        if cache_dir == '':
            cache_dir = None

        if cache_dir and create_cache_explicitly:
            # (Re)build the cache when the directory is missing or an
            # overwrite was requested.
            if not os.path.exists(cache_dir) or overwrite_cache:
                _ensure_dir(cache_dir)
                logger.info('Creating cache data for "' + uri + '"')
                # Iterating the CSV dataset once with a cache_dir set is what
                # populates the cache; the data itself is discarded.
                with data_iterator_csv_dataset(uri, batch_size, shuffle,
                                               normalize=False,
                                               cache_dir=cache_dir) as di:
                    index = 0
                    while index < di.size:
                        progress('', (1.0 * di.position) / di.size)
                        di.next()
                        index += batch_size
            # dataset.normalize is read when the factory is called, not here.
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, normalize=dataset.normalize))
        elif not cache_dir or overwrite_cache or not os.path.exists(cache_dir):
            # No usable cache: read from the CSV (filling cache_dir if given).
            if cache_dir:
                _ensure_dir(cache_dir)
            dataset.data_iterator = (lambda: data_iterator_csv_dataset(
                uri, batch_size, shuffle, normalize=dataset.normalize,
                cache_dir=cache_dir))
        else:
            # The cache directory already exists: read from it.
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, normalize=dataset.normalize))
    else:
        dataset.data_iterator = None
    return dataset
def _create_dataset(uri, batch_size, shuffle, no_image_normalization, cache_dir, overwrite_cache, create_cache_explicitly, prepare_data_iterator):
    """Build a dataset descriptor with a lazy data-iterator factory.

    Args:
        uri: Location of the CSV dataset; stored as ``dataset.uri`` and passed
            to ``data_iterator_csv_dataset``.
        batch_size: Batch size passed to the iterator factories.
        shuffle: Shuffle flag passed to the iterator factories.
        no_image_normalization: When truthy, ``dataset.normalize`` is False.
        cache_dir: Cache directory; ``''`` is treated like ``None``.
        overwrite_cache: When truthy, an existing cache is not reused.
        create_cache_explicitly: When truthy (or whenever a communicator is
            present), the cache is created eagerly in this function.
        prepare_data_iterator: When falsy, ``dataset.data_iterator`` is None.

    Returns:
        An object with attributes ``uri``, ``normalize`` and
        ``data_iterator`` (a zero-argument callable, or ``None``).

    Note:
        Calls ``sys.exit(-1)`` when a cache would have to be created
        implicitly while a communicator is present.
    """
    class Dataset:
        pass
    dataset = Dataset()
    dataset.uri = uri
    dataset.normalize = not no_image_normalization

    # NOTE(review): presumably falsy when not running under MPI -- confirm
    # against current_communicator().
    comm = current_communicator()

    # use same random state for each process until slice is called
    rng = numpy.random.RandomState(0)
    # The in-memory cache is only enabled when there is no communicator or it
    # reports a size of exactly one.
    use_memory_cache = comm.size == 1 if comm else True

    if prepare_data_iterator:
        if cache_dir == '':
            cache_dir = None

        # Disable implicit cache creation when MPI is available.
        if cache_dir and (create_cache_explicitly or comm):
            # The presence of the index file is used as the "cache exists"
            # test for this branch.
            cache_index = os.path.join(cache_dir, "cache_index.csv")
            if not os.path.exists(cache_index) or overwrite_cache:
                # Only one process writes the cache files.
                if single_or_rankzero():
                    logger.log(99, 'Creating cache data for "' + uri + '"')

                    try:
                        os.makedirs(cache_dir)
                    except OSError:
                        pass  # python2 does not support exists_ok arg

                    # Entering and leaving the context is all that is done
                    # here; the iterator is never advanced explicitly.
                    # NOTE(review): presumably the cache is written as a side
                    # effect of constructing/closing the iterator -- confirm.
                    with data_iterator_csv_dataset(uri, batch_size, shuffle, rng=rng, normalize=False, cache_dir=cache_dir, with_memory_cache=False) as di:
                        pass

            # Fresh generator so the factory below does not use the one that
            # was handed to the cache-creating iterator above.
            rng = numpy.random.RandomState(0)
            # rng and dataset.normalize are looked up when the factory is
            # called, not when it is defined.
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, rng=rng, normalize=dataset.normalize, with_memory_cache=use_memory_cache))
        elif not cache_dir or overwrite_cache or not os.path.exists(cache_dir) or len(os.listdir(cache_dir)) == 0:
            # Reached only when no cache_dir is given, or when comm is falsy
            # and explicit creation was not requested.
            if comm:
                logger.critical(
                    'Implicit cache creation does not support with MPI')
                import sys
                sys.exit(-1)
            else:
                if cache_dir:
                    try:
                        os.makedirs(cache_dir)
                    except OSError:
                        pass  # python2 does not support exists_ok arg
                dataset.data_iterator = (lambda: data_iterator_csv_dataset(
                    uri, batch_size, shuffle, rng=rng, normalize=dataset.normalize, cache_dir=cache_dir))
        else:
            # A non-empty cache directory already exists: read from it.
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, rng=rng, normalize=dataset.normalize, with_memory_cache=use_memory_cache))
    else:
        dataset.data_iterator = None
    return dataset
def loadData(batch_size):
    """Return a shuffled, normalized data iterator for the experience data.

    The iterator reads from the CSV file (with ``./cache`` as its cache
    directory) when no cache is available, and from ``./cache`` otherwise.
    The iterator's variable names are printed before it is returned.

    Args:
        batch_size: Batch size passed to the iterator factory.

    Returns:
        The iterator created by ``data_iterator_csv_dataset`` or
        ``data_iterator_cache``.
    """
    cache_dir = "./cache"
    # An existing but empty directory holds no cache yet (e.g. a previous run
    # failed right after creating it), so treat it like a missing one instead
    # of handing an empty directory to data_iterator_cache.
    if not os.path.isdir(cache_dir) or not os.listdir(cache_dir):
        if not os.path.isdir(cache_dir):
            os.mkdir(cache_dir)
        dataset = data_iterator_csv_dataset(
            "../AIStudy.BoardGame/experience.csv",
            batch_size,
            shuffle=True,
            normalize=True,
            cache_dir=cache_dir)
    else:
        dataset = data_iterator_cache(
            cache_dir, batch_size, shuffle=True, normalize=True)
    print(dataset.variables)
    return dataset
def test_sliced_data_iterator_race_condition(num_of_slices, size, batch_size, shuffle):
    """Read past the end of a sliced cache iterator without errors.

    A cache directory of ``size`` samples is generated, slice 1 of
    ``num_of_slices`` is taken from a cache iterator, and ``size + 5``
    batches are pulled from the slice.  The test passes when no call raises.

    Args:
        num_of_slices: Number of slices the iterator is divided into.
        size: Number of samples passed to ``generate_cache_dir``.
        batch_size: Batch size of the cache iterator.
        shuffle: Whether the cache iterator shuffles; previously ignored in
            favour of a hard-coded ``True``.
    """
    from nnabla.utils.data_iterator import data_iterator_cache

    with generate_cache_dir(size) as cache_dir:
        rng = np.random.RandomState(313)
        iterator = data_iterator_cache(cache_dir, batch_size, shuffle=shuffle)
        try:
            sliced_it = iterator.slice(rng, num_of_slices, 1)
            for _ in range(size + 5):
                sliced_it.next()
        finally:
            # Close even when next() raises, so the iterator is shut down
            # before the temporary cache directory is cleaned up.
            iterator.close()
def imagenet_iterator(config, comm, train=True):
    """Create the ImageNet data iterator selected by ``config``.

    Args:
        config: Nested mapping.  Reads ``config['dataset']['dali']`` and
            ``config['train']['batch_size']``; with DALI also
            ``config['dataset']['path']`` / ``['val_path']`` and
            ``['dali_threads']``; without DALI ``config['dataset']['cache_dir']``.
        comm: Communicator object providing ``rank`` and ``n_procs``.
        train: Selects ``path`` (truthy) or ``val_path`` (falsy) for the DALI
            pipeline and is forwarded to it.

    Returns:
        A ``DaliIterator`` with ``size`` and ``batch_size`` set when DALI is
        enabled, otherwise a shuffled, normalized cache iterator.
    """
    if not config['dataset']['dali']:
        return data_iterator_cache(config['dataset']['cache_dir'],
                                   config['train']['batch_size'],
                                   shuffle=True,
                                   normalize=True)

    # Training and validation differ only in which dataset path is read.
    path_key = 'path' if train else 'val_path'
    pipeline = DataPipeline(config['dataset'][path_key],
                            config['train']['batch_size'],
                            config['dataset']['dali_threads'],
                            comm.rank,
                            num_gpus=comm.n_procs,
                            seed=1,
                            train=train)

    wrapped = dali_iterator.DaliIterator(pipeline)
    # Per-process share of the epoch, rounded up.
    wrapped.size = np.ceil(pipeline.epoch_size("Reader") / comm.n_procs)
    wrapped.batch_size = config['train']['batch_size']
    return wrapped
def forward_command(args):
    """Run the configured executors over a dataset and write a result CSV.

    The dataset given by ``args.dataset`` must be a ``.csv`` file or a
    ``.cache`` directory.  For every sample, the original input row is
    written to ``args.outdir/args.outfile`` followed by the network outputs.

    Args:
        args: Parsed command-line arguments.  Reads ``config``, ``param``,
            ``batch_size``, ``dataset``, ``replace_path``, ``outdir`` and
            ``outfile``.

    Returns:
        True on success; False when an executor refers to an unknown network
        or the dataset has an unsupported extension.
    """
    import copy

    callback.update_status(args)

    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)
    batch_size = args.batch_size
    if batch_size < 1:
        batch_size = None

    class ForwardConfig:
        pass
    # The class object itself (not an instance) is used as the attribute bag.
    config = ForwardConfig
    info = load.load(files, prepare_data_iterator=False, batch_size=batch_size)
    config.global_config = info.global_config

    config.executors = info.executors.values()

    config.networks = []
    for e in config.executors:
        if e.network.name in info.networks.keys():
            config.networks.append(info.networks[e.network.name])
        else:
            # Report the executor at hand; `config` has no `executor`
            # attribute, so the previous message raised AttributeError.
            logger.critical('Network {} is not found.'.format(
                e.network.name))
            return False

    # Normalize only if the matching dataset and every executor allow it.
    normalize = True
    for d in info.datasets.values():
        if d.uri == args.dataset or d.cache_dir == args.dataset:
            normalize = d.normalize
    for e in config.executors:
        normalize = normalize and not e.no_image_normalization

    # Maps the iterator's sample position to the index of its row in `rows`.
    orders = {}
    extension = os.path.splitext(args.dataset)[1]
    # With CSV
    if extension == '.csv':
        data_iterator = (lambda: data_iterator_csv_dataset(
            uri=args.dataset,
            batch_size=config.networks[0].batch_size,
            shuffle=False,
            normalize=normalize,
            with_memory_cache=False,
            with_file_cache=False))

        # load dataset as csv
        filereader = FileReader(args.dataset)
        with filereader.open(textmode=True, encoding='utf-8-sig') as f:
            rows = [row for row in csv.reader(f)]
        row0 = rows.pop(0)
        if args.replace_path:
            root_path = os.path.dirname(args.dataset)
            root_path = os.path.abspath(root_path.replace('/|\\', os.path.sep))
        else:
            root_path = '.'
        rows = [row for row in rows if len(row)]
        # Columns whose header starts with '#' and numeric cells are kept as
        # they are; every other cell goes through compute_full_path.
        rows = [
            [x if row0[i][0] == '#' or is_float(x)
             else compute_full_path(root_path, x)
             for i, x in enumerate(row)]
            for row in rows]
        for i in range(len(rows)):
            orders[i] = i
    # With Cache
    elif extension == '.cache':
        data_iterator = (lambda: data_iterator_cache(
            uri=args.dataset,
            batch_size=config.networks[0].batch_size,
            shuffle=False,
            normalize=normalize))

        # Get original CSV
        original_csv = os.path.join(args.dataset, 'original.csv')
        try:
            # load dataset as csv
            filereader = FileReader(original_csv)
            with filereader.open(textmode=True, encoding='utf-8-sig') as f:
                rows = [row for row in csv.reader(f)]
            row0 = rows.pop(0)
            root_path = '.'
            rows = [
                [x if is_float(x) else compute_full_path(root_path, x)
                 for x in row]
                for row in rows]
        except Exception:
            # NOTE(review): if reading failed before `rows`/`row0` were
            # assigned, the code below still raises NameError.
            print('Cannot open', original_csv)

        # Get original Data order.
        order_csv = os.path.join(args.dataset, 'order.csv')
        try:
            filereader = FileReader(order_csv)
            with filereader.open(textmode=True) as f:
                for original, shuffled in [[int(x) for x in row] for row in csv.reader(f)]:
                    orders[original] = shuffled
        except Exception:
            # Without order information fall back to the identity mapping.
            print('Cannot open', order_csv)
            for i in range(len(rows)):
                orders[i] = i
    else:
        print('Unsupported extension "{}" in "{}".'.format(
            extension, args.dataset))
        # Neither `rows` nor `data_iterator` exists here; stop instead of
        # failing below with a NameError.
        return False

    callback.update_status(('data.max', len(rows)))
    callback.update_status(('data.current', 0))
    callback.update_status('processing', True)

    result_csv_filename = os.path.join(args.outdir, args.outfile)
    with open(result_csv_filename, 'w', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        with data_iterator() as di:
            index = 0
            while index < di.size:
                data = di.next()
                result, outputs = _forward(
                    args, index, config, data, di.variables)
                if index == 0:
                    # First batch: extend the header with the output columns
                    # and write it once.
                    for name, dim in zip(result.names, result.dims):
                        if dim == 1:
                            # `e` is the last executor visited by the loops
                            # above.
                            if e.repeat_evaluation_type == "std":
                                name = "Uncertainty(Std)"
                            row0.append(name)
                        else:
                            for d in range(dim):
                                row0.append(name + '__' + str(d))
                    writer.writerow(row0)
                for i, output in enumerate(outputs):
                    # Outputs beyond the number of input rows are dropped.
                    if index + i < len(rows):
                        row = copy.deepcopy(rows[orders[index + i]])
                        row.extend(output)
                        writer.writerow(row)
                index += len(outputs)

                callback.update_status(
                    ('data.current', min([index, len(rows)])))
                callback.update_forward_time()
                callback.update_status()

                logger.log(
                    99, 'data {} / {}'.format(min([index, len(rows)]), len(rows)))

    callback.process_evaluation_result(args.outdir, result_csv_filename)

    logger.log(99, 'Forward Completed.')
    progress(None)

    callback.update_status(('output_result.csv_header', ','.join(row0)))
    callback.update_status(('output_result.column_num', len(row0)))
    callback.update_status(('output_result.data_num', len(rows)))
    callback.update_status('finished')

    return True
'vision', 'imagenet')) from tiny_imagenet_data import data_iterator_tiny_imagenet with data_iterator_tiny_imagenet(args.batch_size, 'train') as di: test_data_iterator(di, args) elif args.uri == 'TINY_IMAGENET_VAL': sys.path.append( os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'vision', 'imagenet')) from tiny_imagenet_data import data_iterator_tiny_imagenet with data_iterator_tiny_imagenet(args.batch_size, 'val') as di: test_data_iterator(di, args) else: if os.path.splitext(args.uri)[1].lower() == '.cache': from nnabla.utils.data_iterator import data_iterator_cache with data_iterator_cache(uri=args.uri, batch_size=args.batch_size, shuffle=args.shuffle, with_memory_cache=args.memory_cache, normalize=args.normalize) as di: test_data_iterator(di, args) else: from nnabla.utils.data_iterator import data_iterator_csv_dataset with data_iterator_csv_dataset(uri=args.uri, batch_size=args.batch_size, shuffle=args.shuffle, normalize=args.normalize, with_memory_cache=args.memory_cache, with_file_cache=args.file_cache, cache_dir=args.output) as di: test_data_iterator(di, args)
def data_iterator_imagenet(batch_size, cache_dir, rng=None):
    """Create a shuffled, non-normalized cache iterator over ``cache_dir``.

    Args:
        batch_size: Batch size passed to ``data_iterator_cache``.
        cache_dir: Cache directory passed to ``data_iterator_cache``.
        rng: Random generator forwarded unchanged (``None`` by default).

    Returns:
        Whatever ``data_iterator_cache`` returns for these settings.
    """
    options = dict(shuffle=True, normalize=False, rng=rng)
    return data_iterator_cache(cache_dir, batch_size, **options)