def __init__(self, input_file, split_field): """Provide input information for splits.""" raw_df = next(load_data([input_file], shard_size=None)) self.splits = raw_df[split_field].values
def featurize(self, input_files, data_dir, shard_size=8192, num_shards_per_batch=24, worker_pool=None, logging=True, debug=False): """Featurize provided files and write to specified location.""" ############################################################## TIMING time1 = time.time() ############################################################## TIMING log("Loading raw samples now.", self.verbosity) log("shard_size: %d" % shard_size, self.verbosity) log("num_shards_per_batch: %d" % num_shards_per_batch, self.verbosity) # Allow users to specify a single file for featurization if not isinstance(input_files, list): input_files = [input_files] if not os.path.exists(data_dir): os.makedirs(data_dir) # Construct partial function to write datasets. if not len(input_files): return None input_type = get_input_type(input_files[0]) if logging: mp.log_to_stderr() if worker_pool is None: if logging: worker_pool = LoggingPool(processes=1) else: worker_pool = mp.Pool(processes=1) log("Spawning workers now.", self.verbosity) metadata_rows = [] data_iterator = it.izip( it.repeat((self, shard_size, input_type, data_dir)), enumerate(load_data(input_files, shard_size, self.verbosity))) # Turns out python map is terrible and exhausts the generator as given. # Solution seems to be to to manually pull out N elements from iterator, # then to map on only those N elements. BLECH. Python should do a better # job here. num_batches = 0 ############################################################## TIMING time2 = time.time() log("TIMING: pre-map featurization took %0.3f s" % (time2-time1)) ############################################################## TIMING while True: log("About to start processing next batch of shards", self.verbosity) ############################################################## TIMING time1 = time.time() ############################################################## TIMING iterator = itertools.islice(data_iterator, num_shards_per_batch) if not debug: batch_metadata = worker_pool.map( featurize_map_function, iterator) else: batch_metadata = [] for elt in iterator: batch_metadata.append(featurize_map_function(elt)) ############################################################## TIMING time2 = time.time() log("TIMING: map call on batch took %0.3f s" % (time2-time1), self.verbosity) ############################################################## TIMING if batch_metadata: metadata_rows.extend([elt for elt in batch_metadata if elt is not None]) num_batches += 1 log("Featurized %d datapoints\n" % (shard_size * num_shards_per_batch * num_batches), self.verbosity) else: break ############################################################## TIMING time1 = time.time() ############################################################## TIMING # TODO(rbharath): This whole bit with metadata_rows is an awkward way of # creating a Dataset. Is there a more elegant solutions? dataset = Dataset(data_dir=data_dir, metadata_rows=metadata_rows, reload=reload, verbosity=self.verbosity) ############################################################## TIMING time2 = time.time() print("TIMING: dataset construction took %0.3f s" % (time2-time1), self.verbosity) ############################################################## TIMING return dataset
import itertools as it
import multiprocessing as mp
import os
import time

# `load_data`, `log`, `get_input_type`, `LoggingPool`, `Dataset`, and
# `featurize_map_function` are helpers assumed to be defined elsewhere in
# this module.


def __init__(self, input_file, split_field, verbose=False):
  """Provide input information for splits."""
  raw_df = next(load_data([input_file], shard_size=None))
  self.splits = raw_df[split_field].values
  self.verbose = verbose
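
# A minimal, hedged sketch (not in the original source) of how the `splits`
# column stored above might be consumed downstream. The "train"/"valid"/
# "test" labels and the helper name are assumptions for illustration only.
def _split_indices_example(splits):
  """Group row indices by their split label (hypothetical helper)."""
  train, valid, test = [], [], []
  for i, label in enumerate(splits):
    if label == "train":
      train.append(i)
    elif label == "valid":
      valid.append(i)
    else:  # anything else is treated as test here (assumption)
      test.append(i)
  return train, valid, test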
def featurize(self, input_files, data_dir, shard_size=8192,
              num_shards_per_batch=24, worker_pool=None,
              logging=True, debug=False, reload=True):
  """Featurize provided files and write to specified location."""
  ############################################################## TIMING
  time1 = time.time()
  ############################################################## TIMING
  log("Loading raw samples now.", self.verbosity)
  log("shard_size: %d" % shard_size, self.verbosity)
  log("num_shards_per_batch: %d" % num_shards_per_batch, self.verbosity)
  # Allow users to specify a single file for featurization.
  if not isinstance(input_files, list):
    input_files = [input_files]
  if not os.path.exists(data_dir):
    os.makedirs(data_dir)
  # Bail out early if there is nothing to featurize.
  if not len(input_files):
    return None
  input_type = get_input_type(input_files[0])
  if logging:
    mp.log_to_stderr()
  if worker_pool is None:
    if logging:
      worker_pool = LoggingPool(processes=1)
    else:
      worker_pool = mp.Pool(processes=1)
  log("Spawning workers now.", self.verbosity)
  metadata_rows = []
  data_iterator = it.izip(
      it.repeat((self, shard_size, input_type, data_dir)),
      enumerate(load_data(input_files, shard_size, self.verbosity)))
  # Python's map() exhausts any generator it is handed, so we manually pull
  # num_shards_per_batch elements at a time off the iterator with islice()
  # and map over only those elements.
  num_batches = 0
  ############################################################## TIMING
  time2 = time.time()
  log("TIMING: pre-map featurization took %0.3f s" % (time2 - time1),
      self.verbosity)
  ############################################################## TIMING
  while True:
    log("About to start processing next batch of shards", self.verbosity)
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    iterator = it.islice(data_iterator, num_shards_per_batch)
    if not debug:
      batch_metadata = worker_pool.map(featurize_map_function, iterator)
    else:
      batch_metadata = []
      for elt in iterator:
        batch_metadata.append(featurize_map_function(elt))
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: map call on batch took %0.3f s" % (time2 - time1),
        self.verbosity)
    ############################################################## TIMING
    if batch_metadata:
      metadata_rows.extend([elt for elt in batch_metadata if elt is not None])
      num_batches += 1
      log("Featurized %d datapoints\n" %
          (shard_size * num_shards_per_batch * num_batches), self.verbosity)
    else:
      break
  ############################################################## TIMING
  time1 = time.time()
  ############################################################## TIMING
  # TODO(rbharath): This whole bit with metadata_rows is an awkward way of
  # creating a Dataset. Is there a more elegant solution?
  # NOTE: `reload` was an undefined name in the original body; it is exposed
  # above as a keyword argument (the True default is an assumption).
  dataset = Dataset(data_dir=data_dir,
                    metadata_rows=metadata_rows,
                    reload=reload,
                    verbosity=self.verbosity)
  ############################################################## TIMING
  time2 = time.time()
  log("TIMING: dataset construction took %0.3f s" % (time2 - time1),
      self.verbosity)
  ############################################################## TIMING
  return dataset
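
# A hedged usage sketch (not in the original source). Assumes the method
# above lives on a featurizer class, here called `DataFeaturizer`; the class
# name and constructor arguments are hypothetical.
#
#   featurizer = DataFeaturizer(...)      # hypothetical constructor
#   dataset = featurizer.featurize(
#       "data/molecules.csv",             # a single file is wrapped in a list
#       data_dir="/tmp/featurized",
#       shard_size=4096,                  # smaller shards trade speed for memory
#       debug=True)                       # serial map; easier to step through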