示例#1
0
    def hadoopIterate(self, inputHdfsDirectory, outputHdfsDirectory, iteration=0, numberOfReducers=1, cmdenv=None, loggingLevel=logging.WARNING, overwrite=True, verbose=True):
        """Run a Hadoop iteration, wait for it to finish, and collect the results.

        This function builds and submits mapper and reducer scripts
        containing the MapReduceApplication as a serialized class,
        which are unserialized remotely.

        @type inputHdfsDirectory: string
        @param inputHdfsDirectory: The name of the HDFS directory to use as input.  It should contain SequenceFiles generated by C{hadoopPopulate}.
        @type outputHdfsDirectory: string
        @param outputHdfsDirectory: The name of the HDFS directory to use as output.  If it exists and C{overwrite} is True, it will be overwritten.
        @type iteration: int
        @param iteration: The iteration number.
        @type numberOfReducers: int
        @param numberOfReducers: Desired number of reducers.
        @type cmdenv: dict or None
        @param cmdenv: Environment variables to pass to the mapper and reducer processes.
        @type loggingLevel: logging level
        @param loggingLevel: The level of log output that will go to Hadoop's standard error.
        @type overwrite: bool
        @param overwrite: If C{outputHdfsDirectory} exists and this is True, the contents will be overwritten.
        @type verbose: bool
        @param verbose: If True, let Hadoop print its output to C{sys.stdout}.
        @rtype: 2-tuple of list, dict
        @return: List of output records and dictionary of output key-value pairs.
        @raise IOError: If any I/O related error occurs, this function raises an error.
        """

        self._hadoopCheck()

        if overwrite:
            # Only attempt removal if the directory exists ("fs -test -e"
            # returns 0 on existence).
            returncode, stdout, stderr = self._hadoopCall(None, "fs", "-test", "-e", outputHdfsDirectory)
            if returncode == 0:
                returncode, stdout, stderr = self._hadoopCall(None, "fs", "-rmr", outputHdfsDirectory)
                if returncode != 0:
                    raise IOError("Could not remove path \"%s\": %s" % (outputHdfsDirectory, stderr))

        # Select the application for this iteration; iterations beyond the
        # configured list reuse the last application.
        if iteration < len(self.mapRedApps):
            mapRedApp = self.mapRedApps[iteration]
            mapRedAppSerialized = self.mapRedAppsSerialized[iteration]
        else:
            mapRedApp = self.mapRedApps[-1]
            mapRedAppSerialized = self.mapRedAppsSerialized[-1]

        gatherOutput = mapRedApp.gatherOutput
        imports = mapRedApp.imports
        if imports is None:
            imports = {}
        namespace = self._buildNamespace(imports)

        files = mapRedApp.files
        if files is None:
            files = []
        if cmdenv is None:
            cmdenv = {}

        self.done = False
        # Snapshot the metadata so the remote tasks (and the final
        # endIteration call) all see a consistent starting state.
        startMetadata = copy.deepcopy(self.metadata)

        overrideAttributes = {"metadata": startMetadata, "iteration": iteration}
        overrideAttributesString = pickle.dumps(overrideAttributes, protocol=PICKLE_PROTOCOL)

        templatePath = os.path.join(os.path.split(__file__)[0], "MapReduceTemplate.py")
        if not os.path.exists(templatePath):
            raise IOError("Could not find %s in the Augustus distribution" % templatePath)
        # Use "with" so the file handles are closed promptly instead of leaked.
        with open(templatePath) as templateFile:
            template = templateFile.read()

        applicationPath = os.path.join(os.path.split(__file__)[0], "MapReduceApplication.py")
        if not os.path.exists(applicationPath):
            raise IOError("Could not find %s in the Augustus distribution" % applicationPath)
        with open(applicationPath) as applicationFile:
            application = applicationFile.read()

        def writeScript(entryPoint):
            """Write a self-contained mapper or reducer script to a named
            temporary file and return its file name.

            C{entryPoint} is the Controller method the script invokes at the
            end: "mapper" or "reducer".  The generated text is identical for
            both roles except for this final call, so one generator serves
            both (previously this code was duplicated verbatim)."""
            script = tempfile.NamedTemporaryFile(delete=False)
            try:
                script.write(template)
                script.write(application)

                # Reproduce the application's declared imports in the remote
                # script: None -> plain import, string -> import-as, mapping ->
                # from-import-as, other iterable -> from-import.
                for name, value in imports.items():
                    if value is None:
                        script.write("import %s%s" % (name, os.linesep))
                    elif isinstance(value, basestring):
                        script.write("import %s as %s%s" % (name, value, os.linesep))
                    elif hasattr(value, "items"):
                        for n, v in value.items():
                            script.write("from %s import %s as %s%s" % (name, n, v, os.linesep))
                    else:
                        for v in value:
                            script.write("from %s import %s%s" % (name, v, os.linesep))

                # Hadoop streaming reserves stdout for data; divert prints to
                # stderr and configure remote logging.
                script.write("sys.stdout = sys.stderr%s" % os.linesep)
                script.write("logging.basicConfig(level=%d)%s" % (loggingLevel, os.linesep))
                script.write("logger = logging.getLogger(\"%s\")%s" % (self.loggerName, os.linesep))

                # Ship the pickled attribute overrides inside the script itself.
                script.write("overrideAttributesString = %r%s" % (overrideAttributesString, os.linesep))
                script.write("overrideAttributes = pickle.loads(overrideAttributesString)%s" % os.linesep)
                script.write("overrideAttributes[\"emit\"] = emit%s" % os.linesep)
                script.write("overrideAttributes[\"logger\"] = logging.getLogger(\"%s\")%s" % (mapRedApp.loggerName, os.linesep))

                # A do-nothing stand-in for PerformanceTable: remote tasks are
                # not profiled.
                script.write("class FakePerformanceTable(object):%s" % os.linesep)
                script.write("    def __init__(self):%s" % os.linesep)
                script.write("        pass%s" % os.linesep)
                script.write("    def __repr__(self):%s" % os.linesep)
                script.write("        return \"<FakePerformanceTable at 0x%%x>\" %% id(self)%s" % os.linesep)
                for stub in ("absorb(self, performanceTable)", "begin(self, key)", "end(self, key)", "pause(self, key)", "unpause(self, key)", "block(self)", "unblock(self)"):
                    script.write("    def %s:%s" % (stub, os.linesep))
                    script.write("        pass%s" % os.linesep)
                script.write("overrideAttributes[\"performanceTable\"] = FakePerformanceTable()%s" % os.linesep)

                script.write("mapRedAppSerialized = %r%s" % (mapRedAppSerialized, os.linesep))
                script.write("appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, globals())%s" % os.linesep)

                script.write("controller = Controller(appClass())%s" % os.linesep)
                script.write("controller.%s()%s" % (entryPoint, os.linesep))
            finally:
                script.close()
            return script.name

        mapperScriptName = writeScript("mapper")
        reducerScriptName = writeScript("reducer")

        if verbose:
            stdout = None
        else:
            stdout = subprocess.PIPE

        # Interleave the "-file"/"-cmdenv" flags with their values.
        fileargs = ["-file"] * (2 * len(files))
        fileargs[1::2] = files

        envargs = ["-cmdenv"] * (2 * len(cmdenv))
        envargs[1::2] = ["%s=%s" % (n, v) for n, v in cmdenv.items()]

        try:
            process = subprocess.Popen([self.HADOOP_EXECUTABLE, "jar", self.HADOOP_STREAMING_JAR, "-D", "mapred.reduce.tasks=%d" % numberOfReducers, "-D", "stream.map.output=typedbytes", "-D", "stream.reduce.input=typedbytes", "-D", "stream.reduce.output=typedbytes", "-inputformat", "org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat", "-outputformat", "org.apache.hadoop.mapred.SequenceFileOutputFormat", "-input", "%s/*" % inputHdfsDirectory, "-output", outputHdfsDirectory, "-mapper", mapperScriptName, "-reducer", reducerScriptName, "-file", mapperScriptName, "-file", reducerScriptName] + fileargs + envargs, stdout=stdout)
            process.wait()
        finally:
            # Remove the temporary scripts even if submission raised.
            for scriptName in (mapperScriptName, reducerScriptName):
                try:
                    os.remove(scriptName)
                except OSError:
                    pass

        if process.returncode != 0:
            raise RuntimeError("Hadoop streaming failed")

        if gatherOutput:
            outputRecords, outputKeyValues = self.hadoopGather(outputHdfsDirectory)

            overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": None, "logger": logging.getLogger(mapRedApp.loggerName)}

            appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace)
            appInstance = appClass()

            # Let the application decide whether the iterations have converged.
            if appInstance.endIteration(outputRecords, outputKeyValues):
                self.done = True

            self.metadata = appInstance.metadata

            return outputRecords, outputKeyValues

        else:
            # Nothing gathered: keep the metadata snapshot unchanged.
            self.metadata = startMetadata
            return [], {}
示例#2
0
    def hadoopIterate(self,
                      inputHdfsDirectory,
                      outputHdfsDirectory,
                      iteration=0,
                      numberOfReducers=1,
                      cmdenv=None,
                      loggingLevel=logging.WARNING,
                      overwrite=True,
                      verbose=True):
        """Run a Hadoop iteration, wait for it to finish, and collect the results.

        This function builds and submits mapper and reducer scripts
        containing the MapReduceApplication as a serialized class,
        which are unserialized remotely.

        @type inputHdfsDirectory: string
        @param inputHdfsDirectory: The name of the HDFS directory to use as input.  It should contain SequenceFiles generated by C{hadoopPopulate}.
        @type outputHdfsDirectory: string
        @param outputHdfsDirectory: The name of the HDFS directory to use as output.  If it exists and C{overwrite} is True, it will be overwritten.
        @type iteration: int
        @param iteration: The iteration number.
        @type numberOfReducers: int
        @param numberOfReducers: Desired number of reducers.
        @type cmdenv: dict or None
        @param cmdenv: Environment variables to pass to the mapper and reducer processes.
        @type loggingLevel: logging level
        @param loggingLevel: The level of log output that will go to Hadoop's standard error.
        @type overwrite: bool
        @param overwrite: If C{outputHdfsDirectory} exists and this is True, the contents will be overwritten.
        @type verbose: bool
        @param verbose: If True, let Hadoop print its output to C{sys.stdout}.
        @rtype: 2-tuple of list, dict
        @return: List of output records and dictionary of output key-value pairs.
        @raise IOError: If any I/O related error occurs, this function raises an error.
        """

        self._hadoopCheck()

        if overwrite:
            # "fs -test -e" returns 0 when the path exists; only then remove.
            returncode, stdout, stderr = self._hadoopCall(
                None, "fs", "-test", "-e", outputHdfsDirectory)
            if returncode == 0:
                returncode, stdout, stderr = self._hadoopCall(
                    None, "fs", "-rmr", outputHdfsDirectory)
                if returncode != 0:
                    raise IOError("Could not remove path \"%s\": %s" %
                                  (outputHdfsDirectory, stderr))

        # Iterations beyond the configured list fall back to the last app.
        if iteration < len(self.mapRedApps):
            mapRedApp = self.mapRedApps[iteration]
            mapRedAppSerialized = self.mapRedAppsSerialized[iteration]
        else:
            mapRedApp = self.mapRedApps[-1]
            mapRedAppSerialized = self.mapRedAppsSerialized[-1]

        gatherOutput = mapRedApp.gatherOutput
        imports = mapRedApp.imports
        if imports is None:
            imports = {}
        namespace = self._buildNamespace(imports)

        files = mapRedApp.files
        if files is None:
            files = []
        if cmdenv is None:
            cmdenv = {}

        self.done = False
        # Deep-copy so remote tasks and endIteration see one consistent
        # metadata snapshot.
        startMetadata = copy.deepcopy(self.metadata)

        overrideAttributes = {
            "metadata": startMetadata,
            "iteration": iteration
        }
        overrideAttributesString = pickle.dumps(overrideAttributes,
                                                protocol=PICKLE_PROTOCOL)

        templatePath = os.path.join(
            os.path.split(__file__)[0], "MapReduceTemplate.py")
        if not os.path.exists(templatePath):
            raise IOError("Could not find %s in the Augustus distribution" %
                          templatePath)
        # Read via "with" so the handles are closed rather than leaked.
        with open(templatePath) as templateFile:
            templateSource = templateFile.read()

        applicationPath = os.path.join(
            os.path.split(__file__)[0], "MapReduceApplication.py")
        if not os.path.exists(applicationPath):
            raise IOError("Could not find %s in the Augustus distribution" %
                          applicationPath)
        with open(applicationPath) as applicationFile:
            applicationSource = applicationFile.read()

        def makeScript(controllerMethod):
            """Generate one streaming script (mapper or reducer) into a named
            temporary file and return its path.

            The mapper and reducer scripts are identical except for the final
            C{controller.<method>()} call, so a single generator replaces the
            formerly duplicated code.  C{controllerMethod} is "mapper" or
            "reducer"."""
            script = tempfile.NamedTemporaryFile(delete=False)
            try:
                script.write(templateSource)
                script.write(applicationSource)

                # Emit the application's import statements: None -> import,
                # string -> import-as, mapping -> from-import-as, iterable ->
                # from-import.
                for name, value in imports.items():
                    if value is None:
                        script.write("import %s%s" % (name, os.linesep))
                    elif isinstance(value, basestring):
                        script.write("import %s as %s%s" %
                                     (name, value, os.linesep))
                    elif hasattr(value, "items"):
                        for n, v in value.items():
                            script.write("from %s import %s as %s%s" %
                                         (name, n, v, os.linesep))
                    else:
                        for v in value:
                            script.write("from %s import %s%s" %
                                         (name, v, os.linesep))

                # Streaming owns stdout for data; route prints to stderr.
                script.write("sys.stdout = sys.stderr%s" % os.linesep)
                script.write("logging.basicConfig(level=%d)%s" %
                             (loggingLevel, os.linesep))
                script.write("logger = logging.getLogger(\"%s\")%s" %
                             (self.loggerName, os.linesep))

                # Embed the pickled overrides in the script body.
                script.write("overrideAttributesString = %r%s" %
                             (overrideAttributesString, os.linesep))
                script.write(
                    "overrideAttributes = pickle.loads(overrideAttributesString)%s" %
                    os.linesep)
                script.write("overrideAttributes[\"emit\"] = emit%s" %
                             os.linesep)
                script.write(
                    "overrideAttributes[\"logger\"] = logging.getLogger(\"%s\")%s" %
                    (mapRedApp.loggerName, os.linesep))

                # No-op PerformanceTable: remote tasks are not profiled.
                script.write("class FakePerformanceTable(object):%s" %
                             os.linesep)
                script.write("    def __init__(self):%s" % os.linesep)
                script.write("        pass%s" % os.linesep)
                script.write("    def __repr__(self):%s" % os.linesep)
                script.write(
                    "        return \"<FakePerformanceTable at 0x%%x>\" %% id(self)%s"
                    % os.linesep)
                for stub in ("absorb(self, performanceTable)",
                             "begin(self, key)", "end(self, key)",
                             "pause(self, key)", "unpause(self, key)",
                             "block(self)", "unblock(self)"):
                    script.write("    def %s:%s" % (stub, os.linesep))
                    script.write("        pass%s" % os.linesep)
                script.write(
                    "overrideAttributes[\"performanceTable\"] = FakePerformanceTable()%s"
                    % os.linesep)

                script.write("mapRedAppSerialized = %r%s" %
                             (mapRedAppSerialized, os.linesep))
                script.write(
                    "appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, globals())%s"
                    % os.linesep)

                script.write("controller = Controller(appClass())%s" %
                             os.linesep)
                script.write("controller.%s()%s" %
                             (controllerMethod, os.linesep))
            finally:
                script.close()
            return script.name

        mapperScriptName = makeScript("mapper")
        reducerScriptName = makeScript("reducer")

        if verbose:
            stdout = None
        else:
            stdout = subprocess.PIPE

        # Interleave "-file"/"-cmdenv" flags with their values.
        fileargs = ["-file"] * (2 * len(files))
        fileargs[1::2] = files

        envargs = ["-cmdenv"] * (2 * len(cmdenv))
        envargs[1::2] = ["%s=%s" % (n, v) for n, v in cmdenv.items()]

        try:
            process = subprocess.Popen([
                self.HADOOP_EXECUTABLE, "jar", self.HADOOP_STREAMING_JAR, "-D",
                "mapred.reduce.tasks=%d" % numberOfReducers, "-D",
                "stream.map.output=typedbytes", "-D",
                "stream.reduce.input=typedbytes", "-D",
                "stream.reduce.output=typedbytes", "-inputformat",
                "org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat",
                "-outputformat",
                "org.apache.hadoop.mapred.SequenceFileOutputFormat", "-input",
                "%s/*" % inputHdfsDirectory, "-output", outputHdfsDirectory,
                "-mapper", mapperScriptName, "-reducer", reducerScriptName,
                "-file", mapperScriptName, "-file", reducerScriptName
            ] + fileargs + envargs,
                                       stdout=stdout)
            process.wait()
        finally:
            # Always delete the temporary scripts, even on submission failure.
            for scriptName in (mapperScriptName, reducerScriptName):
                try:
                    os.remove(scriptName)
                except OSError:
                    pass

        if process.returncode != 0:
            raise RuntimeError("Hadoop streaming failed")

        if gatherOutput:
            outputRecords, outputKeyValues = self.hadoopGather(
                outputHdfsDirectory)

            overrideAttributes = {
                "metadata": startMetadata,
                "iteration": iteration,
                "emit": None,
                "logger": logging.getLogger(mapRedApp.loggerName)
            }

            appClass = unserializeClass(mapRedAppSerialized,
                                        MapReduceApplication,
                                        overrideAttributes, namespace)
            appInstance = appClass()

            # The application decides whether the iterations have converged.
            if appInstance.endIteration(outputRecords, outputKeyValues):
                self.done = True

            self.metadata = appInstance.metadata

            return outputRecords, outputKeyValues

        else:
            # Nothing gathered: restore the unmodified metadata snapshot.
            self.metadata = startMetadata
            return [], {}
示例#3
0
    def iterate(self, inputData, iteration=0, sort=False, parallel=False, numberOfMappers=1, numberOfReducers=1, frozenClass=True):
        """Run a pure-Python map-reduce iteration.

        @type inputData: list of Python objects
        @param inputData: The objects to use as a data stream.
        @type iteration: int
        @param iteration: The iteration number.
        @type sort: bool
        @param sort: If True, perform a sorting step between the mapper and the reducer.
        @type parallel: bool
        @param parallel: If True, run the independent mappers and independent reducers as distinct threads.
        @type numberOfMappers: int
        @param numberOfMappers: Requested number of mappers.  Input data will be divided evenly among them.
        @type numberOfReducers: int
        @param numberOfReducers: Requested number of reducers.
        @type frozenClass: bool
        @param frozenClass: If True, practice serializing and unserializing the class to ensure the independence of the mappers and the reducers.  If False, skip this performance-limiting step.
        @rtype: 2-tuple of list, dict
        @return: List of output records and dictionary of output key-value pairs.
        """

        overheadPerformanceTable = PerformanceTable()
        self._performanceTables.append(overheadPerformanceTable)
        overheadPerformanceTable.begin("MapReduce.iterate")

        if iteration < len(self.mapRedApps):
            mapRedApp = self.mapRedApps[iteration]
            mapRedAppSerialized = self.mapRedAppsSerialized[iteration]
        else:
            mapRedApp = self.mapRedApps[-1]
            mapRedAppSerialized = self.mapRedAppsSerialized[-1]

        gatherOutput = mapRedApp.gatherOutput
        imports = mapRedApp.imports
        if imports is None:
            imports = {}
        namespace = self._buildNamespace(imports)

        self.logger.info("Start iteration %d with %d input records and %d metadata keys.", iteration, len(inputData), len(self.metadata))
        
        self.done = False
        overheadPerformanceTable.begin("copy metadata")
        startMetadata = copy.deepcopy(self.metadata)
        overheadPerformanceTable.end("copy metadata")

        intermediateData = {}
        dataLock = threading.Lock()
        def emit(appself, key, record):
            with dataLock:
                newTuple = copy.deepcopy((key, record))

                if key in intermediateData:
                    self.logger.debug("    key \"%s\": %r", key, record)
                    intermediateData[key].append(newTuple)

                else:
                    self.logger.debug("New key \"%s\": %r", key, record)
                    intermediateData[key] = [newTuple]

        if parallel:
            mapperThreads = []

        recordsPerMapper = int(math.ceil(len(inputData) / float(numberOfMappers)))
        for number in xrange(numberOfMappers):
            subData = inputData[(number * recordsPerMapper):((number + 1) * recordsPerMapper)]

            performanceTable = PerformanceTable()
            self._performanceTables.append(performanceTable)

            overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": emit, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName)}

            if frozenClass:
                overheadPerformanceTable.begin("unfreeze mapper")
                appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace)
                overheadPerformanceTable.end("unfreeze mapper")
                controller = self.Controller(appClass())
            else:
                appInstance = mapRedApp()
                overrideAttributes["emit"] = types.MethodType(overrideAttributes["emit"], appInstance)
                appInstance.__dict__.update(overrideAttributes)
                controller = self.Controller(appInstance)

            if parallel:
                self.logger.info("Starting mapper %d in parallel with %d input records.", number, len(subData))
                mapperThreads.append(threading.Thread(target=controller.mapper, name=("Mapper_%03d" % number), args=(subData,)))
            else:
                self.logger.info("Starting mapper %d in series with %d input records.", number, len(subData))
                overheadPerformanceTable.pause("MapReduce.iterate")
                controller.mapper(subData)
                overheadPerformanceTable.unpause("MapReduce.iterate")

        if parallel:
            overheadPerformanceTable.pause("MapReduce.iterate")
            for thread in mapperThreads:
                thread.start()
            for thread in mapperThreads:
                thread.join()
            overheadPerformanceTable.unpause("MapReduce.iterate")

        self.logger.info("All mappers finished.")

        if sort:
            self.logger.info("Sorting %d intermediate values.", sum(len(x) for x in intermediateData.values()))
            overheadPerformanceTable.begin("sort intermediate data")
            for value in intermediateData.values():
                value.sort()
            overheadPerformanceTable.end("sort intermediate data")
        else:
            self.logger.info("Leaving %d intermediate values in the order in which they were generated.", sum(len(x) for x in intermediateData.values()))

        overheadPerformanceTable.begin("load balance")

        lengths = [(key, len(intermediateData[key])) for key in intermediateData]
        lengths.sort(lambda a, b: cmp(b[1], a[1]))

        assignments = [[] for x in xrange(numberOfReducers)]
        workload = [0] * numberOfReducers
        for key, length in lengths:
            index = min((w, i) for i, w in enumerate(workload))[1]   # this is argmin(workload)
            assignments[index].append(key)
            workload[index] += length
        
        if self.logger.isEnabledFor(logging.INFO):
            for i, (a, w) in enumerate(zip(assignments, workload)):
                if len(a) > 10:
                    self.logger.info("Assigning %d keys (%d total records) to reducer %d.", len(a), w, i)
                else:
                    self.logger.info("Assigning keys %s (%d total records) to reducer %d.", ", ".join("\"%s\"" % k for k in a), w, i)

        overheadPerformanceTable.end("load balance")

        outputRecords = []
        outputKeyValues = {}
        dataLock = threading.Lock()
        def emit(appself, key, record):
            if key is None:
                self.logger.debug("OutputRecord: %r", record)
                outputRecords.append(record)
            else:
                with dataLock:
                    if key in outputKeyValues:
                        raise RuntimeError("Two reducers are trying to write to the same metadata key: \"%s\"" % key)
                    else:
                        self.logger.debug("OutputKeyValue \"%s\": %r", key, record)
                        outputKeyValues[key] = record

        if parallel:
            reducerThreads = []

        for number in xrange(numberOfReducers):
            subData = []
            for key in assignments[number]:
                subData.extend(intermediateData[key])

            performanceTable = PerformanceTable()
            self._performanceTables.append(performanceTable)

            overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": emit, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName)}
            if frozenClass:
                overheadPerformanceTable.begin("unfreeze reducer")
                appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace)
                overheadPerformanceTable.end("unfreeze reducer")
                controller = self.Controller(appClass())
            else:
                appInstance = mapRedApp()
                overrideAttributes["emit"] = types.MethodType(overrideAttributes["emit"], appInstance)
                appInstance.__dict__.update(overrideAttributes)
                controller = self.Controller(appInstance)            
            
            if parallel:
                self.logger.info("Starting reducer %d in parallel with %d input records.", number, len(subData))
                reducerThreads.append(threading.Thread(target=controller.reducer, name=("Reducer_%03d" % number), args=(subData,)))
            else:
                self.logger.info("Starting reducer %d in series with %d input records.", number, len(subData))
                overheadPerformanceTable.pause("MapReduce.iterate")
                controller.reducer(subData)
                overheadPerformanceTable.unpause("MapReduce.iterate")

        if parallel:
            overheadPerformanceTable.pause("MapReduce.iterate")
            for thread in reducerThreads:
                thread.start()
            for thread in reducerThreads:
                thread.join()
            overheadPerformanceTable.unpause("MapReduce.iterate")

        self.logger.info("All reducers finished.")
        self.logger.info("Finished iteration %s with %d output records and %d metadata keys.", iteration, len(outputRecords), len(outputKeyValues))

        if gatherOutput:
            performanceTable = PerformanceTable()
            self._performanceTables.append(performanceTable)

            overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": None, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName)}

            if frozenClass:
                overheadPerformanceTable.begin("unfreeze endIteration")
                appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace)
                overheadPerformanceTable.end("unfreeze endIteration")
                appInstance = appClass()
            else:
                appInstance = mapRedApp()
                appInstance.__dict__.update(overrideAttributes)

            overheadPerformanceTable.pause("MapReduce.iterate")
            if appInstance.endIteration(outputRecords, outputKeyValues):
                self.done = True
            overheadPerformanceTable.unpause("MapReduce.iterate")

            self.metadata = appInstance.metadata

            overheadPerformanceTable.end("MapReduce.iterate")
            return outputRecords, outputKeyValues

        else:
            self.metadata = startMetadata
            overheadPerformanceTable.end("MapReduce.iterate")
            return [], {}
# ---- Example #4 ----
    def iterate(self,
                inputData,
                iteration=0,
                sort=False,
                parallel=False,
                numberOfMappers=1,
                numberOfReducers=1,
                frozenClass=True):
        """Run a pure-Python map-reduce iteration.

        @type inputData: list of Python objects
        @param inputData: The objects to use as a data stream.
        @type iteration: int
        @param iteration: The iteration number.  Iterations beyond the last
            configured application reuse the final application.
        @type sort: bool
        @param sort: If True, perform a sorting step between the mapper and the reducer.
        @type parallel: bool
        @param parallel: If True, run the independent mappers and independent reducers as distinct threads.
        @type numberOfMappers: int
        @param numberOfMappers: Requested number of mappers.  Input data will be divided evenly among them.
        @type numberOfReducers: int
        @param numberOfReducers: Requested number of reducers.
        @type frozenClass: bool
        @param frozenClass: If True, practice serializing and unserializing the class to ensure the independence of the mappers and the reducers.  If False, skip this performance-limiting step.
        @rtype: 2-tuple of list, dict
        @return: List of output records and dictionary of output key-value pairs.
        @raise RuntimeError: If two reducers emit to the same metadata key.
        """

        overheadPerformanceTable = PerformanceTable()
        self._performanceTables.append(overheadPerformanceTable)
        overheadPerformanceTable.begin("MapReduce.iterate")

        # Iterations beyond the configured list fall back to the last application.
        if iteration < len(self.mapRedApps):
            mapRedApp = self.mapRedApps[iteration]
            mapRedAppSerialized = self.mapRedAppsSerialized[iteration]
        else:
            mapRedApp = self.mapRedApps[-1]
            mapRedAppSerialized = self.mapRedAppsSerialized[-1]

        gatherOutput = mapRedApp.gatherOutput
        imports = mapRedApp.imports
        if imports is None:
            imports = {}
        namespace = self._buildNamespace(imports)

        self.logger.info(
            "Start iteration %d with %d input records and %d metadata keys.",
            iteration, len(inputData), len(self.metadata))

        self.done = False

        # Deep-copy so that mappers/reducers cannot mutate self.metadata in place;
        # self.metadata is only updated at the end of the iteration.
        overheadPerformanceTable.begin("copy metadata")
        startMetadata = copy.deepcopy(self.metadata)
        overheadPerformanceTable.end("copy metadata")

        def buildController(overrideAttributes, unfreezeLabel):
            # Instantiate the application and wrap it in a Controller.  With
            # frozenClass, the serialized class is thawed (proving that mappers
            # and reducers are independent of local state); otherwise the class
            # is instantiated directly, in which case the raw emit function
            # must be explicitly bound to the instance as a method.
            if frozenClass:
                overheadPerformanceTable.begin(unfreezeLabel)
                appClass = unserializeClass(mapRedAppSerialized,
                                            MapReduceApplication,
                                            overrideAttributes, namespace)
                overheadPerformanceTable.end(unfreezeLabel)
                return self.Controller(appClass())
            else:
                appInstance = mapRedApp()
                overrideAttributes["emit"] = types.MethodType(
                    overrideAttributes["emit"], appInstance)
                appInstance.__dict__.update(overrideAttributes)
                return self.Controller(appInstance)

        intermediateData = {}
        dataLock = threading.Lock()

        def emit(appself, key, record):
            # Mapper-phase emit: group (key, record) pairs by key.  The pair is
            # deep-copied so later mutation by the application cannot corrupt
            # the intermediate data; the lock protects the shared dict when
            # mappers run as threads.
            with dataLock:
                newTuple = copy.deepcopy((key, record))

                if key in intermediateData:
                    self.logger.debug("    key \"%s\": %r", key, record)
                    intermediateData[key].append(newTuple)

                else:
                    self.logger.debug("New key \"%s\": %r", key, record)
                    intermediateData[key] = [newTuple]

        if parallel:
            mapperThreads = []

        # Divide the input evenly; the last mapper may receive a short slice.
        recordsPerMapper = int(
            math.ceil(len(inputData) / float(numberOfMappers)))
        for number in xrange(numberOfMappers):
            subData = inputData[(number * recordsPerMapper):((number + 1) *
                                                             recordsPerMapper)]

            performanceTable = PerformanceTable()
            self._performanceTables.append(performanceTable)

            overrideAttributes = {
                "metadata": startMetadata,
                "iteration": iteration,
                "emit": emit,
                "performanceTable": performanceTable,
                "logger": logging.getLogger(mapRedApp.loggerName)
            }
            controller = buildController(overrideAttributes, "unfreeze mapper")

            if parallel:
                self.logger.info(
                    "Starting mapper %d in parallel with %d input records.",
                    number, len(subData))
                mapperThreads.append(
                    threading.Thread(target=controller.mapper,
                                     name=("Mapper_%03d" % number),
                                     args=(subData, )))
            else:
                self.logger.info(
                    "Starting mapper %d in series with %d input records.",
                    number, len(subData))
                # Application time is charged to its own performanceTable, not
                # to the framework-overhead table.
                overheadPerformanceTable.pause("MapReduce.iterate")
                controller.mapper(subData)
                overheadPerformanceTable.unpause("MapReduce.iterate")

        if parallel:
            overheadPerformanceTable.pause("MapReduce.iterate")
            for thread in mapperThreads:
                thread.start()
            for thread in mapperThreads:
                thread.join()
            overheadPerformanceTable.unpause("MapReduce.iterate")

        self.logger.info("All mappers finished.")

        if sort:
            self.logger.info("Sorting %d intermediate values.",
                             sum(len(x) for x in intermediateData.values()))
            overheadPerformanceTable.begin("sort intermediate data")
            for value in intermediateData.values():
                value.sort()
            overheadPerformanceTable.end("sort intermediate data")
        else:
            self.logger.info(
                "Leaving %d intermediate values in the order in which they were generated.",
                sum(len(x) for x in intermediateData.values()))

        overheadPerformanceTable.begin("load balance")

        # Greedy load balancing: hand out keys, largest record count first,
        # each to the reducer with the least work so far.
        lengths = [(key, len(intermediateData[key]))
                   for key in intermediateData]
        lengths.sort(key=lambda keyLength: keyLength[1], reverse=True)

        assignments = [[] for x in xrange(numberOfReducers)]
        workload = [0] * numberOfReducers
        for key, length in lengths:
            index = min(xrange(numberOfReducers),
                        key=workload.__getitem__)  # argmin(workload)
            assignments[index].append(key)
            workload[index] += length

        if self.logger.isEnabledFor(logging.INFO):
            for i, (a, w) in enumerate(zip(assignments, workload)):
                if len(a) > 10:
                    self.logger.info(
                        "Assigning %d keys (%d total records) to reducer %d.",
                        len(a), w, i)
                else:
                    self.logger.info(
                        "Assigning keys %s (%d total records) to reducer %d.",
                        ", ".join("\"%s\"" % k for k in a), w, i)

        overheadPerformanceTable.end("load balance")

        outputRecords = []
        outputKeyValues = {}
        dataLock = threading.Lock()

        def emit(appself, key, record):
            # Reducer-phase emit (shadows the mapper-phase emit above): a None
            # key appends an output record; any other key writes a metadata
            # key-value pair, which at most one reducer may claim.
            if key is None:
                self.logger.debug("OutputRecord: %r", record)
                outputRecords.append(record)
            else:
                with dataLock:
                    if key in outputKeyValues:
                        raise RuntimeError(
                            "Two reducers are trying to write to the same metadata key: \"%s\""
                            % key)
                    else:
                        self.logger.debug("OutputKeyValue \"%s\": %r", key,
                                          record)
                        outputKeyValues[key] = record

        if parallel:
            reducerThreads = []

        for number in xrange(numberOfReducers):
            # Each reducer sees all records for each of its assigned keys.
            subData = []
            for key in assignments[number]:
                subData.extend(intermediateData[key])

            performanceTable = PerformanceTable()
            self._performanceTables.append(performanceTable)

            overrideAttributes = {
                "metadata": startMetadata,
                "iteration": iteration,
                "emit": emit,
                "performanceTable": performanceTable,
                "logger": logging.getLogger(mapRedApp.loggerName)
            }
            controller = buildController(overrideAttributes, "unfreeze reducer")

            if parallel:
                self.logger.info(
                    "Starting reducer %d in parallel with %d input records.",
                    number, len(subData))
                reducerThreads.append(
                    threading.Thread(target=controller.reducer,
                                     name=("Reducer_%03d" % number),
                                     args=(subData, )))
            else:
                self.logger.info(
                    "Starting reducer %d in series with %d input records.",
                    number, len(subData))
                overheadPerformanceTable.pause("MapReduce.iterate")
                controller.reducer(subData)
                overheadPerformanceTable.unpause("MapReduce.iterate")

        if parallel:
            overheadPerformanceTable.pause("MapReduce.iterate")
            for thread in reducerThreads:
                thread.start()
            for thread in reducerThreads:
                thread.join()
            overheadPerformanceTable.unpause("MapReduce.iterate")

        self.logger.info("All reducers finished.")
        self.logger.info(
            "Finished iteration %s with %d output records and %d metadata keys.",
            iteration, len(outputRecords), len(outputKeyValues))

        if gatherOutput:
            # endIteration runs on a bare application instance (no Controller)
            # with emit disabled; it may declare the whole job done.
            performanceTable = PerformanceTable()
            self._performanceTables.append(performanceTable)

            overrideAttributes = {
                "metadata": startMetadata,
                "iteration": iteration,
                "emit": None,
                "performanceTable": performanceTable,
                "logger": logging.getLogger(mapRedApp.loggerName)
            }

            if frozenClass:
                overheadPerformanceTable.begin("unfreeze endIteration")
                appClass = unserializeClass(mapRedAppSerialized,
                                            MapReduceApplication,
                                            overrideAttributes, namespace)
                overheadPerformanceTable.end("unfreeze endIteration")
                appInstance = appClass()
            else:
                appInstance = mapRedApp()
                appInstance.__dict__.update(overrideAttributes)

            overheadPerformanceTable.pause("MapReduce.iterate")
            if appInstance.endIteration(outputRecords, outputKeyValues):
                self.done = True
            overheadPerformanceTable.unpause("MapReduce.iterate")

            # endIteration may have updated the metadata; adopt its version.
            self.metadata = appInstance.metadata

            overheadPerformanceTable.end("MapReduce.iterate")
            return outputRecords, outputKeyValues

        else:
            self.metadata = startMetadata
            overheadPerformanceTable.end("MapReduce.iterate")
            return [], {}