Example #1
    def start(self):
        jvm = self.gateway.jvm

        self.job.addMrGeoProperties()
        dpf_properties = jvm.DataProviderFactory.getConfigurationFromProviders()

        for prop in dpf_properties:
            self.job.setSetting(prop, dpf_properties[prop])

        if self.job.isDebug():
            master = "local"
        elif self.job.isSpark():
            # TODO:  get the master for spark
            master = ""
        elif self.job.isYarn():
            master = "yarn-client"
        else:
            cpus = (multiprocessing.cpu_count() // 4) * 3  # integer math: roughly three quarters of the cores
            if cpus < 2:
                master = "local"
            else:
                master = "local[" + str(cpus) + "]"

        set_field(
            self.job, "jars",
            jvm.StringUtils.concatUnique(
                jvm.DependencyLoader.getAndCopyDependencies(
                    "org.mrgeo.mapalgebra.MapAlgebra", None),
                jvm.DependencyLoader.getAndCopyDependencies(
                    jvm.MapOpFactory.getMapOpClassNames(), None)))

        conf = jvm.MrGeoDriver.prepareJob(self.job)

        # need to override the yarn mode to "yarn-client" for python
        if self.job.isYarn():
            conf.set("spark.master", "yarn-client")

            mem = jvm.SparkUtils.humantokb(conf.get("spark.executor.memory"))
            workers = int(
                conf.get("spark.executor.instances")) + 1  # one for the driver

            conf.set("spark.executor.memory",
                     jvm.SparkUtils.kbtohuman(long(mem / workers), "m"))

        # for a in conf.getAll():
        #     print(a._1(), a._2())

        # jsc = jvm.JavaSparkContext(master, appName, sparkHome, jars)
        jsc = jvm.JavaSparkContext(conf)
        self.sparkContext = jsc.sc()
        self.sparkPyContext = SparkContext(master=master,
                                           appName=self.job.name(),
                                           jsc=jsc,
                                           gateway=self.gateway)
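
A quick standalone sketch of the memory split in the yarn branch above: the configured spark.executor.memory is divided evenly among the requested executors plus one extra share for the driver. The helper name is hypothetical, and MrGeo's humantokb/kbtohuman conversions are left out:

    def split_executor_memory_kb(total_kb, executor_instances):
        workers = executor_instances + 1  # one extra share for the driver
        return total_kb // workers

    # e.g. 8 GB configured across 3 executors -> 2 GB per executor (and driver)
    assert split_executor_memory_kb(8 * 1024 * 1024, 3) == 2 * 1024 * 1024
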
Example #2
    def _create_job(self):
        if not self._job:
            jvm = self._get_jvm()
            java_import(jvm, "org.mrgeo.job.*")

            appname = "PyMrGeo"

            self._job = jvm.JobArguments()
            java_gateway.set_field(self._job, "name", appname)

            # Yarn is the default
            self.useyarn()
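
set_field is pymrgeo's own java_gateway helper for writing a Java field that has no public setter. A minimal sketch of how that can be done through a py4j gateway using plain Java reflection (the gateway setup is an assumption, not part of the snippet above):

    from py4j.java_gateway import JavaGateway

    gateway = JavaGateway()  # assumes a py4j GatewayServer is already running
    job = gateway.jvm.org.mrgeo.job.JobArguments()

    # reflective equivalent of set_field(job, "name", "PyMrGeo")
    field = job.getClass().getDeclaredField("name")
    field.setAccessible(True)  # the field is not public
    field.set(job, "PyMrGeo")
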
Example #3
    def start(self, context=None):

        if not context:
            jvm = self._get_jvm()
            job = self._get_job()

            job.addMrGeoProperties()
            dpf_properties = jvm.DataProviderFactory.getConfigurationFromProviders()

            for prop in dpf_properties:
                job.setSetting(prop, dpf_properties[prop])

            if job.isYarn():
                job.loadYarnSettings()

            java_gateway.set_field(job, "jars",
                      jvm.StringUtils.concatUnique(
                          jvm.DependencyLoader.getAndCopyDependencies("org.mrgeo.mapalgebra.MapAlgebra", None),
                          jvm.DependencyLoader.getAndCopyDependencies(jvm.MapOpFactory.getMapOpClassNames(), None)))

            conf = jvm.MrGeoDriver.prepareJob(job)

            if job.isYarn():
                # need to override the yarn mode to "yarn-client" for python
                conf.set("spark.master", "yarn-client")

                if not conf.getBoolean("spark.dynamicAllocation.enabled", False):
                    conf.set("spark.executor.instances", str(job.executors()))

                conf.set("spark.executor.cores", str(job.cores()))

                # in yarn-cluster, this is the total memory in the cluster, but here in yarn-client, it is
                # the memory per executor.  Go figure!
                mem = job.executorMemKb()

                overhead = conf.getInt("spark.yarn.executor.memoryOverhead", 384)
                if (mem * 0.1) > overhead:
                    overhead = mem * 0.1

                if overhead < 384:
                    overhead = 384

                mem -= (overhead * 2)  # overhead is 1x for driver and 1x for application master (am)
                conf.set("spark.executor.memory", jvm.SparkUtils.kbtohuman(long(mem), "m"))


            jsc = jvm.JavaSparkContext(conf)
            jsc.setCheckpointDir(jvm.HadoopFileUtils.createJobTmp(jsc.hadoopConfiguration()).toString())
            self.sparkContext = jsc.sc()
        else:
            self.sparkContext = context
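
The YARN memory arithmetic above boils down to: take the largest of the configured overhead, 10% of executor memory, and the 384 floor, then carve two overhead shares (driver plus application master) out of executor memory. A standalone sketch of the same calculation (function name hypothetical; mem and overhead are assumed to share a unit, as the snippet treats them):

    def effective_executor_memory(mem, configured_overhead=384):
        # at least the configured value, at least 10% of mem, never below 384
        overhead = max(configured_overhead, mem * 0.1, 384)
        # one overhead share for the driver, one for the application master
        return int(mem - overhead * 2)
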
Example #4
    def _create_job(self):
        jvm = self.gateway.jvm
        java_import(jvm, "org.mrgeo.data.DataProviderFactory")
        java_import(jvm, "org.mrgeo.job.*")
        java_import(jvm, "org.mrgeo.utils.DependencyLoader")
        java_import(jvm, "org.mrgeo.utils.StringUtils")

        appname = "PyMrGeo"

        self.job = jvm.JobArguments()
        set_field(self.job, "name", appname)

        # Yarn is the default
        self.useyarn()
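
The java_import calls only affect name resolution on the Python side of the py4j gateway: after the import, a class can be looked up on jvm without its full package prefix. A minimal illustration, assuming a running gateway:

    from py4j.java_gateway import JavaGateway, java_import

    gateway = JavaGateway()  # assumes a py4j GatewayServer is already running
    jvm = gateway.jvm

    java_import(jvm, "org.mrgeo.job.*")
    job = jvm.JobArguments()  # same as jvm.org.mrgeo.job.JobArguments()
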
Example #5
    def start(self):
        jvm = self.gateway.jvm

        self.job.addMrGeoProperties()
        dpf_properties = jvm.DataProviderFactory.getConfigurationFromProviders()

        for prop in dpf_properties:
            self.job.setSetting(prop, dpf_properties[prop])

        if self.job.isDebug():
            master = "local"
        elif self.job.isSpark():
            # TODO:  get the master for spark
            master = ""
        elif self.job.isYarn():
            master = "yarn-client"
        else:
            cpus = (multiprocessing.cpu_count() // 4) * 3  # integer math: roughly three quarters of the cores
            if cpus < 2:
                master = "local"
            else:
                master = "local[" + str(cpus) + "]"

        set_field(self.job, "jars",
                  jvm.StringUtils.concatUnique(
                      jvm.DependencyLoader.getAndCopyDependencies("org.mrgeo.mapalgebra.MapAlgebra", None),
                      jvm.DependencyLoader.getAndCopyDependencies(jvm.MapOpFactory.getMapOpClassNames(), None)))

        conf = jvm.MrGeoDriver.prepareJob(self.job)

        # need to override the yarn mode to "yarn-client" for python
        if self.job.isYarn():
            conf.set("spark.master", "yarn-client")

            if not conf.getBoolean("spark.dynamicAllocation.enabled", False):
                mem = jvm.SparkUtils.humantokb(conf.get("spark.executor.memory"))
                workers = int(conf.get("spark.executor.instances")) + 1  # one for the driver

                conf.set("spark.executor.memory", jvm.SparkUtils.kbtohuman(long(mem / workers), "m"))

        # for a in conf.getAll():
        #     print(a._1(), a._2())

        # jsc = jvm.JavaSparkContext(master, appName, sparkHome, jars)
        jsc = jvm.JavaSparkContext(conf)
        self.sparkContext = jsc.sc()
        self.sparkPyContext = SparkContext(master=master, appName=self.job.name(), jsc=jsc, gateway=self.gateway)
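
The local fallback above sizes the master string to roughly three quarters of the machine's cores, dropping to a single-threaded "local" when that leaves fewer than two. The same choice as a standalone function (name hypothetical):

    import multiprocessing

    def local_master():
        cpus = (multiprocessing.cpu_count() // 4) * 3
        return "local" if cpus < 2 else "local[%d]" % cpus
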
Example #6
    def start(self):
        if self._started:
            # print("MrGeo is already started")
            return

        jvm = self._get_jvm()
        job = self._get_job()

        job.addMrGeoProperties()
        dpf_properties = jvm.DataProviderFactory.getConfigurationFromProviders()

        for prop in dpf_properties:
            job.setSetting(prop, dpf_properties[prop])

        jvm.DependencyLoader.setPrintMissingDependencies(False)
        jvm.DependencyLoader.resetMissingDependencyList()

        java_gateway.set_field(
            job, "jars",
            jvm.StringUtils.concatUnique(
                jvm.DependencyLoader.getAndCopyDependencies(
                    "org.mrgeo.mapalgebra.MapAlgebra", None),
                jvm.DependencyLoader.getAndCopyDependencies(
                    jvm.MapOpFactory.getMapOpClassNames(), None)))

        conf = jvm.MrGeoDriver.prepareJob(job)

        jvm.DependencyLoader.printMissingDependencies()

        if self._localGateway:
            if job.isYarn():
                job.loadYarnSettings()

                # need to override the yarn mode to "yarn-client" for python
                conf.set("spark.master", "yarn-client")

                if not conf.getBoolean("spark.dynamicAllocation.enabled",
                                       False):
                    conf.set("spark.executor.instances", str(job.executors()))

                conf.set("spark.executor.cores", str(job.cores()))

                # in yarn-cluster, this is the total memory in the cluster, but here in yarn-client, it is
                # the memory per executor.  Go figure!
                mem = job.executorMemKb()

                overhead = conf.getInt("spark.yarn.executor.memoryOverhead",
                                       384)
                if (mem * 0.1) > overhead:
                    overhead = mem * 0.1

                if overhead < 384:
                    overhead = 384

                mem -= overhead * 2  # overhead is 1x for driver and 1x for application master (am)
                conf.set("spark.executor.memory",
                         jvm.SparkUtils.kbtohuman(long(mem), "m"))

            jsc = jvm.JavaSparkContext(conf)
            jsc.setCheckpointDir(
                jvm.HadoopFileUtils.createJobTmp(
                    jsc.hadoopConfiguration()).toString())
            self.sparkContext = jsc.sc()

        self._started = True
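
The _started flag makes start() idempotent: only the first call pays the dependency-copying and Spark setup cost, and later calls return immediately. The guard pattern in isolation:

    class OneShotService(object):
        def __init__(self):
            self._started = False

        def start(self):
            if self._started:
                return  # already started; nothing to do
            # ... expensive one-time setup ...
            self._started = True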