Exemplo n.º 1
0
    def post_build(self):
        """Index the temp folder's CSVs by path hash and condense them.

        Stores the index on ``self._csv_map`` and the four values returned by
        ``condense_global`` on ``self._overrides``, ``self._overriddens``,
        ``self._parents`` and ``self._children``.

        NOTE(review): ``condense_global`` is defined outside this excerpt, so
        the meaning of its four results is not documented here.

        """
        def group_csvs_by_path_hash():
            """Group the temp folder's CSV names by the input file they came from.

            Return {path sha1: [file names (minus '.csv' extension)]}.

            Listing the folder once up front avoids a lot of globbing later,
            which can add up to hours over tens of thousands of files,
            depending on IO speed. A radix tree of folders is a possible
            alternative (less RAM, more IO) that has not been benchmarked.

            """
            grouped = defaultdict(list)
            for file_name in listdir(self._temp_folder):
                if not file_name.endswith('.csv'):
                    continue
                # The name must have exactly three dot-separated parts; any
                # other shape makes this unpacking raise ValueError.
                path_hash, _content_hash, _ext = file_name.split('.')
                # Dropping ".csv" saves at least 2MB per worker on 700K files:
                grouped[path_hash].append(file_name[:-len('.csv')])
            return grouped

        self._csv_map = group_csvs_by_path_hash()
        # Flatten the per-path lists lazily into one stream of CSV names.
        all_csv_stems = chain.from_iterable(self._csv_map.itervalues())
        (self._overrides,
         self._overriddens,
         self._parents,
         self._children) = condense_global(self._temp_folder, all_csv_stems)
Exemplo n.º 2
0
    def post_build(self):
        """Build the path-hash-to-CSV index, then run the global condense step.

        Sets ``self._csv_map`` to the index and unpacks the four values
        returned by ``condense_global`` into ``self._overrides``,
        ``self._overriddens``, ``self._parents`` and ``self._children``.

        NOTE(review): ``condense_global`` lives outside this excerpt; what the
        four results contain cannot be confirmed from here.

        """
        def index_csvs():
            """Map input files to the output CSVs corresponding to them.

            Return {path sha1: [file names (minus '.csv' extension)]}.

            Reading the directory once here saves a lot of globbing later,
            which can add up to hours over the course of tens of thousands of
            files, depending on IO speed. An untested alternative is a radix
            tree of folders: less RAM, more IO.

            """
            by_path_hash = defaultdict(list)
            csv_names = (entry for entry in listdir(self._temp_folder)
                         if entry.endswith('.csv'))
            for entry in csv_names:
                # Unpacking into three names raises ValueError unless the
                # file name has exactly three dot-separated parts.
                path_hash, _, _ = entry.split('.')
                # Storing the name without ".csv" saves at least 2MB per
                # worker on 700K files:
                stem = entry[:-4]
                by_path_hash[path_hash].append(stem)
            return by_path_hash

        self._csv_map = index_csvs()
        # One lazy iterable over every CSV name across all path hashes.
        csv_stems = chain.from_iterable(self._csv_map.itervalues())
        condensed = condense_global(self._temp_folder, csv_stems)
        (self._overrides,
         self._overriddens,
         self._parents,
         self._children) = condensed
Exemplo n.º 3
0
 def post_build(self):
     """Run ``condense_global`` on the temp folder and keep its four results.

     The results are unpacked, in order, into ``self._overrides``,
     ``self._overriddens``, ``self._parents`` and ``self._children``.

     NOTE(review): ``condense_global`` is defined outside this excerpt, so
     what each result holds cannot be confirmed from here.
     """
     condensed = condense_global(self._temp_folder)
     # Exactly four values are expected; any other count raises on unpacking.
     (self._overrides,
      self._overriddens,
      self._parents,
      self._children) = condensed
Exemplo n.º 4
0
 def post_build(self):
     """Store the four values ``condense_global`` returns for the temp folder.

     They are assigned, in order, to ``self._overrides``,
     ``self._overriddens``, ``self._parents`` and ``self._children``.

     NOTE(review): ``condense_global`` is not visible in this excerpt; the
     contents of the four results should be checked against its definition.
     """
     temp_folder = self._temp_folder
     results = condense_global(temp_folder)
     # Unpacking requires exactly four items in ``results``.
     (self._overrides,
      self._overriddens,
      self._parents,
      self._children) = results