from collections import defaultdict
from itertools import chain
from os import listdir

def post_build(self):
    def csv_map():
        """Map input files to the output CSVs corresponding to them.

        Return {path sha1: [file names (minus '.csv' extension)]}.

        This saves a lot of globbing later, which can add up to hours
        over the course of tens of thousands of files, depending on IO
        speed. An alternative approach might be a radix tree of
        folders: less RAM, more IO. Try that and bench it sometime.

        """
        ret = defaultdict(list)
        for csv_name in listdir(self._temp_folder):
            if csv_name.endswith('.csv'):
                path_hash, content_hash, ext = csv_name.split('.')
                # Removing ".csv" saves at least 2MB per worker on 700K files:
                ret[path_hash].append(csv_name[:-4])
        return ret

    self._csv_map = csv_map()
    self._overrides, self._overriddens, self._parents, self._children = condense_global(
        self._temp_folder,
        chain.from_iterable(self._csv_map.itervalues()))
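# For illustration, a standalone sketch of the mapping csv_map builds. The
# folder contents and hashes below are made up; it assumes the real file
# names look like '<path sha1>.<content sha1>.csv', as the split above
# implies, and it reuses the defaultdict import above.
def _demo_csv_map(csv_names):
    ret = defaultdict(list)
    for csv_name in csv_names:
        if csv_name.endswith('.csv'):
            path_hash, content_hash, ext = csv_name.split('.')
            ret[path_hash].append(csv_name[:-4])
    return ret

# _demo_csv_map(['deadbeef.1111.csv', 'deadbeef.2222.csv',
#                'cafef00d.3333.csv', 'notes.txt'])
# => {'deadbeef': ['deadbeef.1111', 'deadbeef.2222'],
#     'cafef00d': ['cafef00d.3333']}
# 'notes.txt' is skipped because it lacks the '.csv' suffix.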
def post_build(self):
    self._overrides, self._overriddens, self._parents, self._children = condense_global(
        self._temp_folder)