def __call__(self, *args, **kwargs):
    import partd

    # Store shuffle data on disk, in a user-supplied temporary
    # directory if one was given.
    if self.tempdir:
        file = partd.File(dir=self.tempdir)
    else:
        file = partd.File()

    # Optionally buffer the on-disk store through an in-memory dict.
    if self.buffer:
        return partd.PandasBlocks(partd.Buffer(partd.Dict(), file))
    else:
        return partd.PandasBlocks(file)
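# A minimal usage sketch (not from the original source), assuming partd and
# pandas are installed. The object returned above accepts {key: DataFrame}
# appends and concatenates the accumulated frames on get(); the key and
# column names here are made up.
import pandas as pd
import partd

store = partd.PandasBlocks(partd.Buffer(partd.Dict(), partd.File()))
store.append({'part-0': pd.DataFrame({'x': [1, 2]})})
store.append({'part-0': pd.DataFrame({'x': [3]})})
df = store.get('part-0')  # a single DataFrame with x == [1, 2, 3]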
def groupby(self, grouper, npartitions=None, blocksize=2**20):
    """ Group collection by key function

    Note that this requires a full dataset read, serialization and
    shuffle.  This is expensive.  If possible you should use ``foldby``.

    >>> b = from_sequence(range(10))
    >>> dict(b.groupby(lambda x: x % 2 == 0))  # doctest: +SKIP
    {True: [0, 2, 4, 6, 8], False: [1, 3, 5, 7, 9]}

    See Also
    --------
    Bag.foldby
    """
    if npartitions is None:
        npartitions = self.npartitions
    token = tokenize(self, grouper, npartitions, blocksize)

    import partd
    p = ('partd-' + token,)
    # Prefer a Snappy-compressed store; fall back to a plain file store
    # when the snappy bindings are missing (partd then has no ``Snappy``
    # attribute).
    try:
        dsk1 = {p: (partd.Python, (partd.Snappy, partd.File()))}
    except AttributeError:
        dsk1 = {p: (partd.Python, partd.File())}

    # Partition data on disk
    name = 'groupby-part-{0}-{1}'.format(funcname(grouper), token)
    dsk2 = dict(((name, i), (partition, grouper, (self.name, i),
                             npartitions, p, blocksize))
                for i in range(self.npartitions))

    # Barrier
    barrier_token = 'groupby-barrier-' + token

    def barrier(args):
        return 0

    dsk3 = {barrier_token: (barrier, list(dsk2))}

    # Collect groups
    name = 'groupby-collect-' + token
    dsk4 = dict(((name, i), (collect, grouper, i, p, barrier_token))
                for i in range(npartitions))

    return type(self)(merge(self.dask, dsk1, dsk2, dsk3, dsk4), name,
                      npartitions)
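# A quick usage sketch for the groupby above, via the public dask.bag API;
# group ordering in the result is not guaranteed.
import dask.bag as db

b = db.from_sequence(range(10), npartitions=2)
result = dict(b.groupby(lambda x: x % 2).compute())
# e.g. {0: [0, 2, 4, 6, 8], 1: [1, 3, 5, 7, 9]}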
def __call__(self, *args, **kwargs):
    import tempfile

    import partd

    path = tempfile.mkdtemp(suffix=".partd", dir=self.tempdir)

    # Look up the requested compression wrapper on partd.compressed;
    # the attribute is absent when the backing library is not installed.
    try:
        partd_compression = (
            getattr(partd.compressed, self.compression)
            if self.compression
            else None
        )
    except AttributeError as e:
        raise ImportError(
            "Not able to import and load {0} as compression algorithm. "
            "Please check if the library is installed and supported by "
            "partd.".format(self.compression)
        ) from e
    file = partd.File(path)
    partd.file.cleanup_files.append(path)

    # Envelope the partd file with compression, if set and available
    if partd_compression:
        file = partd_compression(file)
    if self.buffer:
        return partd.PandasBlocks(partd.Buffer(partd.Dict(), file))
    else:
        return partd.PandasBlocks(file)
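# A small sketch of the compression lookup performed above, assuming partd
# is installed. partd.compressed exposes wrappers such as ZLib and BZ2
# (Snappy and Blosc only appear when the matching libraries are importable);
# the name "ZLib" below stands in for the configured compression value.
import partd

wrapper = getattr(partd.compressed, "ZLib")
file = wrapper(partd.File())           # zlib-compressed on-disk store
file.append({'key': b'some bytes'})
assert file.get('key') == b'some bytes'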
# Module-level setup assumed by the helpers below.
import logging
import os

import partd

logger = logging.getLogger(__name__)


def set_asset_data(gazu_project_id, gazu_asset_id, avalon_asset_id):
    # Store the Zou id / Avalon id key-value pair for the asset.

    # Set the directory where partd stores its data
    base_directory = os.environ["DATA_PATH"]
    data_directory = os.path.join(base_directory, "data")
    directory = os.path.join(data_directory, gazu_project_id)

    # Create the data directory for the project if it doesn't exist.
    if not os.path.exists(directory):
        if not os.path.exists(data_directory):
            os.mkdir(data_directory)
        os.mkdir(directory)

    # Init partd
    p = partd.File(directory)

    # Check if the asset is already stored and delete it if it is.
    # (We're making the assumption that the ids supplied to us are unique.)
    if p.get(gazu_asset_id):
        p.delete(gazu_asset_id)
        logger.info("Deleting: {0}".format(gazu_asset_id))

    # Encode and store the data as utf-8 bytes
    value = bytes(str(avalon_asset_id), "utf-8")
    key_values = {gazu_asset_id: value}
    p.append(key_values)
def get_asset_data(gazu_project_id, gazu_asset_id):
    # Look up the Zou id / Avalon id key-value pair for the asset.

    # Set the directory where partd stores its data
    base_directory = os.environ["DATA_PATH"]
    directory = os.path.join(base_directory, "data", gazu_project_id)

    # Init partd
    p = partd.File(directory)

    if not p.get(gazu_asset_id):
        return False
    else:
        # Decode the Avalon asset id stored by partd
        project_data = bytes.decode(p.get(gazu_asset_id), "utf-8")
        return project_data
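# Hedged round-trip sketch for the two helpers above; DATA_PATH and all ids
# are made-up demo values, and the chosen path must be writable.
import os

os.environ["DATA_PATH"] = "/tmp/zou_avalon_demo"
os.makedirs(os.environ["DATA_PATH"], exist_ok=True)

set_asset_data("project-1", "asset-42", "avalon-id-123")
assert get_asset_data("project-1", "asset-42") == "avalon-id-123"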
def get_project_data(project_id):
    # Look up the Zou id / Avalon id key-value pair for the project.

    # Set the directory where partd stores its data
    directory = os.path.join(os.environ["DATA_PATH"], "data", project_id)

    # Init partd
    p = partd.Pickle(partd.File(directory))

    if not p.get(project_id):
        return False
    else:
        # Get the Avalon project id and collection from partd
        project_info = p.get(project_id)
        project_data = {
            "id": project_info[0],
            "collection": project_info[1]
        }
        return project_data
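# Hedged counterpart sketch: get_project_data above unpacks a two-element
# value, so the write side presumably stores [avalon_project_id, collection]
# as a list; this helper and its name are assumptions, not original code.
def set_project_data(project_id, avalon_project_id, collection):
    directory = os.path.join(os.environ["DATA_PATH"], "data", project_id)
    if not os.path.exists(directory):
        os.makedirs(directory)

    p = partd.Pickle(partd.File(directory))
    # partd appends accumulate, so this assumes the key is not yet stored.
    p.append({project_id: [avalon_project_id, collection]})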
def __call__(self, *args, **kwargs):
    import partd

    # Same factory as above, without tempdir support: partd.File()
    # creates its own temporary directory.
    if self.buffer:
        return partd.PandasBlocks(partd.Buffer(partd.Dict(), partd.File()))
    else:
        return partd.PandasBlocks(partd.File())
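# Sketch of the Buffer fallback used by these factories: partd.Buffer keeps
# data in the fast in-memory store (Dict) and spills to the slow on-disk
# store (File) as memory fills; available_memory is partd's byte budget
# parameter, and the value below is an arbitrary example.
import partd

buf = partd.Buffer(partd.Dict(), partd.File(), available_memory=100e6)
buf.append({'k': b'abc'})
assert buf.get('k') == b'abc'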