Example #1
    def _run_task(self, task_id):
        task = self.__scheduled_tasks[task_id]

        logger.info('[pid %s] Running   %s', os.getpid(), task_id)
        try:
            # Verify that all the dependencies are fulfilled!
            ok = True
            for task_2 in task.deps():
                if not task_2.complete():
                    ok = False
                    missing_dep = task_2

            if not ok:
                # TODO: possibly try to re-add the task as pending
                raise RuntimeError('Unfulfilled dependency %r at run time!\nPrevious tasks: %r' % (missing_dep.task_id, self._previous_tasks))

            task.run()
            expl = json.dumps(task.on_success())
            logger.info('[pid %s] Done      %s', os.getpid(), task_id)
            status = DONE

        except KeyboardInterrupt:
            raise
        except Exception as ex:
            status = FAILED
            logger.exception("[pid %s] Error while running %s" % (os.getpid(), task))
            expl = task.on_failure(ex)
            receiver = interface.get_config().get('core', 'error-email', None)
            sender = interface.get_config().get('core', 'email-sender', notifications.DEFAULT_CLIENT_EMAIL)
            logger.info("[pid %s] Sending error email to %r", os.getpid(), receiver)
            notifications.send_email("Luigi: %s FAILED" % task, expl, sender, (receiver,))

        self.__scheduler.add_task(self.__id, task_id, status=status, expl=expl, runnable=None)
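
The failure branch above reads both addresses from the [core] section of the client configuration. A minimal sketch of that section, assuming luigi's INI-style client config (file name and addresses are illustrative, not from the source):

    # client.cfg (hypothetical example)
    [core]
    error-email: oncall@example.com
    email-sender: luigi-client@example.com

Note that unlike Examples #3 and #4 below, this variant does not guard against a None receiver before calling send_email.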
Example #2
    def add(self, task):
        try:
            task_id = task.task_id

            if task_id in self.__scheduled_tasks:
                return  # already scheduled
            logger.debug("Checking if %s is complete" % task_id)
            is_complete = False
            try:
                is_complete = task.complete()
                if is_complete not in (True, False):
                    raise Exception("Return value of Task.complete() must be boolean (was %r)" % is_complete)
            except KeyboardInterrupt:
                raise
            except:
                msg = "Will not schedule %s or any dependencies due to error in complete() method:" % (task,)
                logger.warning(msg, exc_info=1)  # like logger.exception but with WARNING level
                receiver = interface.get_config().get('core', 'error-email', None)
                sender = interface.get_config().get('core', 'email-sender', notifications.DEFAULT_CLIENT_EMAIL)
                logger.info("Sending warning email to %r" % receiver)
                notifications.send_email(
                    subject="Luigi: %s failed scheduling" % (task,),
                    message="%s:\n%s" % (msg, traceback.format_exc()),
                    sender=sender,
                    recipients=(receiver,))
                return
                # abort, i.e. don't schedule any subtasks of a task with
                # failing complete()-method since we don't know if the task
                # is complete and subtasks might not be desirable to run if
                # they have already run before

            if is_complete:
                # Not submitting dependencies of finished tasks
                self.__scheduler.add_task(self.__id, task_id, status=DONE, runnable=False)

            elif task.run == NotImplemented:
                self.__scheduled_tasks[task_id] = task
                self.__scheduler.add_task(self.__id, task_id, status=PENDING, runnable=False)
                logger.warning('Task %s is not complete and run() is not implemented. Probably a missing external dependency.', task_id)
            else:
                self.__scheduled_tasks[task_id] = task
                deps = [d.task_id for d in task.deps()]
                self.__scheduler.add_task(self.__id, task_id, status=PENDING, deps=deps, runnable=True)
                logger.info('Scheduled %s' % task_id)

                for task_2 in task.deps():
                    self.add(task_2)  # Schedule stuff recursively
        except KeyboardInterrupt:
            raise
        except:
            logger.exception("Luigi unexpected framework error while scheduling %s" % task)
            receiver = interface.get_config().get('core', 'error-email', None)
            sender = interface.get_config().get('core', 'email-sender', notifications.DEFAULT_CLIENT_EMAIL)
            notifications.send_email(
                subject="Luigi: Framework error while scheduling %s" % (task,),
                message="Luigi framework error:\n%s" % traceback.format_exc(),
                recipients=(receiver,),
                sender=sender)
Example #3
    def add(self, task):
        try:
            task_id = task.task_id

            if task_id in self.__scheduled_tasks:
                return  # already scheduled
            logger.debug("Checking %s" % task_id)
            if task.complete():
                # Not submitting dependencies of finished tasks
                self.__scheduler.add_task(self.__id,
                                          task_id,
                                          status=DONE,
                                          runnable=False)

            elif task.run == NotImplemented:
                self.__scheduled_tasks[task_id] = task
                self.__scheduler.add_task(self.__id,
                                          task_id,
                                          status=PENDING,
                                          runnable=False)
                logger.warning(
                    'Task %s is not complete and run() is not implemented. Probably a missing external dependency.',
                    task_id)
            else:
                self.__scheduled_tasks[task_id] = task
                deps = [d.task_id for d in task.deps()]
                self.__scheduler.add_task(self.__id,
                                          task_id,
                                          status=PENDING,
                                          deps=deps,
                                          runnable=True)
                logger.info('Scheduled %s' % task_id)

                for task_2 in task.deps():
                    self.add(task_2)  # Schedule stuff recursively
        except KeyboardInterrupt:
            raise
        except:
            logger.exception("Error while trying to schedule %s" % task)

            if not sys.stdout.isatty():
                receiver = interface.get_config().get('core', 'error-email',
                                                      None)
                if receiver is not None:
                    email_body = "Scheduling error:\n%s" % traceback.format_exc()
                    sender = interface.get_config().get(
                        'core', 'email-sender',
                        'luigi-client@%s' % socket.getfqdn())
                    logger.info("Sending error email to %r" % receiver)
                    send_email("Luigi: %s FAILED SCHEDULING" % task,
                               email_body, sender, (receiver, ))
            sys.exit(1)  # can't allow task to run without its dependencies resolved
Example #4
    def _run_task(self, task_id):
        task = self.__scheduled_tasks[task_id]

        logger.info('[pid %s] Running   %s', os.getpid(), task_id)
        try:
            # Verify that all the dependencies are fulfilled!
            ok = True
            for task_2 in task.deps():
                if not task_2.complete():
                    ok = False
                    missing_dep = task_2

            if not ok:
            # TODO: possibly try to re-add the task as pending
                raise RuntimeError(
                    'Unfulfilled dependency %r at run time!\nPrevious tasks: %r'
                    % (missing_dep.task_id, self._previous_tasks))

            task.run()
            expl = json.dumps(task.on_success())
            logger.info('[pid %s] Done      %s', os.getpid(), task_id)
            status = DONE

        except KeyboardInterrupt:
            raise
        except Exception as ex:
            status = FAILED
            logger.exception("[pid %s] Error while running %s" %
                             (os.getpid(), task))
            expl = task.on_failure(ex)
            if not sys.stdout.isatty():
                receiver = interface.get_config().get('core', 'error-email',
                                                      None)
                if receiver is not None:
                    sender = interface.get_config().get(
                        'core', 'email-sender',
                        'luigi-client@%s' % socket.getfqdn())
                    logger.info("[pid %s] Sending error email to %r",
                                os.getpid(), receiver)
                    send_email("Luigi: %s FAILED" % task, expl, sender,
                               (receiver, ))

        self.__scheduler.add_task(self.__id,
                                  task_id,
                                  status=status,
                                  expl=expl,
                                  runnable=None)
Example #5
def _create_scheduler():
    config = interface.get_config()
    retry_delay = config.getfloat('scheduler', 'retry-delay', 900.0)
    remove_delay = config.getfloat('scheduler', 'remove-delay', 600.0)
    worker_disconnect_delay = config.getfloat('scheduler',
                                              'worker-disconnect-delay', 60.0)
    return scheduler.CentralPlannerScheduler(retry_delay, remove_delay,
                                             worker_disconnect_delay)
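
All three delays fall back to the defaults shown (900, 600 and 60 seconds) when unset. A sketch of the [scheduler] overrides this function reads (the values are illustrative):

    # client.cfg (hypothetical example)
    [scheduler]
    retry-delay: 300.0
    remove-delay: 1200.0
    worker-disconnect-delay: 120.0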
Example #6
File: hdfs.py Project: g152xx/luigi
def use_cdh4_syntax():
    """
    CDH4 (hadoop 2+) has a slightly different syntax for interacting with
    hdfs via the command line. The default version is CDH4, but one can
    override this setting with "cdh3" in the hadoop section of the config in
    order to use the old syntax
    """
    import interface
    return interface.get_config().get("hadoop", "version", "cdh4").lower() == "cdh4"
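
Per the docstring, the old syntax is opted into through the [hadoop] section; a minimal sketch of that override (the file name is illustrative):

    # client.cfg (hypothetical example)
    [hadoop]
    version: cdh3

Any value other than "cdh4" (compared case-insensitively) makes use_cdh4_syntax() return False.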
Example #7
def use_cdh4_syntax():
    """
    CDH4 (hadoop 2+) has a slightly different syntax for interacting with
    hdfs via the command line. The default version is CDH4, but one can
    override this setting with "cdh3" in the hadoop section of the config in
    order to use the old syntax
    """
    import interface
    return interface.get_config().get("hadoop", "version",
                                      "cdh4").lower() == "cdh4"
Example #8
    def add(self, task):
        try:
            task_id = task.task_id

            if task_id in self.__scheduled_tasks:
                return  # already scheduled
            logger.debug("Checking %s" % task_id)
            if task.complete():
                # Not submitting dependencies of finished tasks
                self.__scheduler.add_task(self.__id, task_id, status=DONE, runnable=False)

            elif task.run == NotImplemented:
                self.__scheduled_tasks[task_id] = task
                self.__scheduler.add_task(self.__id, task_id, status=PENDING, runnable=False)
                logger.warning(
                    "Task %s is is not complete and run() is not implemented. Probably a missing external dependency.",
                    task_id,
                )
            else:
                self.__scheduled_tasks[task_id] = task
                deps = [d.task_id for d in task.deps()]
                self.__scheduler.add_task(self.__id, task_id, status=PENDING, deps=deps, runnable=True)
                logger.info("Scheduled %s" % task_id)

                for task_2 in task.deps():
                    self.add(task_2)  # Schedule stuff recursively
        except KeyboardInterrupt:
            raise
        except:
            expl = traceback.format_exc()  # format_exc() takes an optional depth limit, not a traceback

            logger.error(expl)
            logger.error("Error while trying to schedule %s" % task)

            if not sys.stdout.isatty():
                receiver = interface.get_config().get("core", "error-email", None)
                if receiver is not None:
                    sender = interface.get_config().get("core", "email-sender", "luigi-client@%s" % socket.getfqdn())
                    logger.info("Sending error email to %r" % receiver)
                    send_email("Luigi: %s FAILED SCHEDULING" % task, expl, sender, (receiver,))
            sys.exit(1)  # can't allow task to run without its dependencies resolved
Example #9
    def _run_task(self, task_id):
        task = self.__scheduled_tasks[task_id]

        logger.info("[pid %s] Running   %s", os.getpid(), task_id)
        try:
            # Verify that all the dependencies are fulfilled!
            ok = True
            for task_2 in task.deps():
                if not task_2.complete():
                    ok = False
                    missing_dep = task_2

            if not ok:
            # TODO: possibly try to re-add the task as pending
                raise RuntimeError(
                    "Unfulfilled dependency %r at run time!\nPrevious tasks: %r"
                    % (missing_dep.task_id, self._previous_tasks)
                )

            task.run()
            expl = json.dumps(task.on_success())
            logger.info("[pid %s] Done      %s", os.getpid(), task_id)
            status = DONE

        except KeyboardInterrupt:
            raise
        except Exception as ex:
            status = FAILED
            expl = json.dumps(task.on_failure(ex, traceback.format_exc()))
            logger.error(expl)
            logger.exception("[pid %s] Error while running %s" % (os.getpid(), task))

            if not sys.stdout.isatty():
                receiver = interface.get_config().get("core", "error-email", None)
                if receiver is not None:
                    sender = interface.get_config().get("core", "email-sender", "luigi-client@%s" % socket.getfqdn())
                    logger.info("[pid %s] Sending error email to %r", os.getpid(), receiver)
                    send_email("Luigi: %s FAILED" % task, expl, sender, (receiver,))

        self.__scheduler.add_task(self.__id, task_id, status=status, expl=expl, runnable=None)
Example #10
def get_whoops_defaults(config=None):
    """Reads defaults from a client configuration file and fails if not."""
    config = config or interface.get_config()
    try:
        return {
            "host": config.get("hdfs", "namenode_host"),
            "port": config.get("hdfs", "namenode_port")
        }
    except:
        raise RuntimeError("You must specify namenode_host and namenode_port "
                           "in the [hdfs] section of your luigi config in "
                           "order to use luigi's whoops support without a "
                           "fully-qualified url")
Example #11
def runSimulator(input_file):

    env = simpy.Environment()

    debug = False

    hosts, links, flows, routers = interface.get_config(env, input_file, debug)

    monitor = Monitor(env, links, flows)

    # Run the simulation
    env.run(10 * 1000)

    # Graph the results
    # show_results(monitor)

    # Export the results to output.xlsx
    export_results(monitor)
Example #12
 def __init__(self):
     import interface
     config = interface.get_config()
     streaming_jar = config.get('hadoop', 'streaming-jar')
     super(DefaultHadoopJobRunner, self).__init__(streaming_jar=streaming_jar)
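
Here 'streaming-jar' has no default argument, so the [hadoop] section must supply it; a sketch of that entry (the jar path is an illustrative assumption):

    # client.cfg (hypothetical example)
    [hadoop]
    streaming-jar: /usr/lib/hadoop/contrib/streaming/hadoop-streaming.jar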
Example #13
def _create_scheduler():
    config = interface.get_config()
    retry_delay = config.getfloat('scheduler', 'retry-delay', 900.0)
    remove_delay = config.getfloat('scheduler', 'remove-delay', 600.0)
    worker_disconnect_delay = config.getfloat('scheduler', 'worker-disconnect-delay', 60.0)
    return scheduler.CentralPlannerScheduler(retry_delay, remove_delay, worker_disconnect_delay)
Example #14
 def __init__(self):
     import interface
     config = interface.get_config()
     streaming_jar = config.get('hadoop', 'streaming-jar')
     super(DefaultHadoopJobRunner,
           self).__init__(streaming_jar=streaming_jar)
Example #15
    def run_job(self, job):
        packages = [
            luigi
        ] + self.modules + job.extra_modules() + list(_attached_packages)

        # find the module containing the job
        packages.append(__import__(job.__module__, None, None, 'dummy'))

        # find the path to our runner.py
        runner_path = mrrunner.__file__
        # assume source is next to compiled
        if runner_path.endswith("pyc"):
            runner_path = runner_path[:-3] + "py"

        base_tmp_dir = interface.get_config().get('core', 'tmp-dir',
                                                  '/tmp/luigi')
        self.tmp_dir = os.path.join(
            base_tmp_dir, 'hadoop_job_%016x' % random.getrandbits(64))
        logger.debug("Tmp dir: %s", self.tmp_dir)
        os.makedirs(self.tmp_dir)

        # build arguments
        map_cmd = 'python mrrunner.py map'
        cmb_cmd = 'python mrrunner.py combiner'
        red_cmd = 'python mrrunner.py reduce'

        # replace output with a temporary work directory
        output_final = job.output().path
        output_tmp_fn = (output_final + '-temp-' +
                         datetime.datetime.now().isoformat().replace(':', '-'))
        tmp_target = luigi.hdfs.HdfsTarget(output_tmp_fn, is_tmp=True)

        arglist = ['hadoop', 'jar', self.streaming_jar]

        # 'libjars' is a generic option, so place it first
        libjars = list(self.libjars)  # copy so we can append jars fetched from hdfs

        for libjar in self.libjars_in_hdfs:
            subprocess.call(['hadoop', 'fs', '-get', libjar, self.tmp_dir])
            libjars.append(os.path.join(self.tmp_dir,
                                        os.path.basename(libjar)))

        if libjars:
            arglist += ['-libjars', ','.join(libjars)]

        # Add static files and directories
        extra_files = get_extra_files(job.extra_files())

        files = []
        for src, dst in extra_files:
            dst_tmp = '%s_%09d' % (dst.replace('/', '_'),
                                   random.randint(0, 999999999))
            files += ['%s#%s' % (src, dst_tmp)]
            # -files doesn't support subdirectories, so we need to create the dst_tmp -> dst manually
            job._add_link(dst_tmp, dst)

        if files:
            arglist += ['-files', ','.join(files)]

        jobconfs = job.jobconfs()

        for k, v in self.jobconfs.iteritems():
            jobconfs.append('%s=%s' % (k, v))

        for conf in jobconfs:
            arglist += ['-D', conf]

        arglist += self.streaming_args

        arglist += ['-mapper', map_cmd, '-reducer', red_cmd]
        if job.combiner != NotImplemented:
            arglist += ['-combiner', cmb_cmd]
        files = [
            runner_path, self.tmp_dir + '/packages.tar',
            self.tmp_dir + '/job-instance.pickle'
        ]

        for f in files:
            arglist += ['-file', f]

        if self.output_format:
            arglist += ['-outputformat', self.output_format]
        if self.input_format:
            arglist += ['-inputformat', self.input_format]

        for target in luigi.task.flatten(job.input_hadoop()):
            assert isinstance(target, luigi.hdfs.HdfsTarget)
            arglist += ['-input', target.path]

        assert isinstance(job.output(), luigi.hdfs.HdfsTarget)
        arglist += ['-output', output_tmp_fn]

        # submit job
        create_packages_archive(packages, self.tmp_dir + '/packages.tar')

        job._dump(self.tmp_dir)

        run_and_track_hadoop_job(arglist)

        # rename temporary work directory to given output
        tmp_target.move(output_final, fail_if_exists=True)
        self.finish()
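
For orientation, the arglist assembled above corresponds to a hadoop streaming invocation of roughly this shape (all paths and conf values are made up, and which flags appear depends on the branches taken):

    hadoop jar /path/to/hadoop-streaming.jar \
        -libjars extra.jar \
        -files /src/lookup.txt#lookup.txt_000000042 \
        -D mapred.job.name=example \
        -mapper 'python mrrunner.py map' -reducer 'python mrrunner.py reduce' \
        -combiner 'python mrrunner.py combiner' \
        -file mrrunner.py -file packages.tar -file job-instance.pickle \
        -input /data/input -output /data/output-temp-<timestamp>

The real argument order follows the code: -libjars first, then -files, the -D pairs, any streaming args, the mapper/reducer/combiner commands, the -file entries, optional format classes, and finally -input/-output.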
Example #16
    def add(self, task):
        if not isinstance(task, Task):
            raise TaskException('Can not schedule non-task %s' % task)

        if not task.initialized():
            # we can't get the repr of it since it's not initialized...
            raise TaskException(
                'Task of class %s not initialized. Did you override __init__ and forget to call super(...).__init__?'
                % task.__class__.__name__)

        try:
            task_id = task.task_id

            if task_id in self.__scheduled_tasks:
                return  # already scheduled
            logger.debug("Checking if %s is complete" % task_id)
            is_complete = False
            try:
                is_complete = task.complete()
                if is_complete not in (True, False):
                    raise Exception(
                        "Return value of Task.complete() must be boolean (was %r)"
                        % is_complete)
            except KeyboardInterrupt:
                raise
            except:
                msg = "Will not schedule %s or any dependencies due to error in complete() method:" % (
                    task, )
                logger.warning(
                    msg,
                    exc_info=1)  # like logger.exception but with WARNING level
                receiver = interface.get_config().get('core', 'error-email',
                                                      None)
                sender = interface.get_config().get(
                    'core', 'email-sender', notifications.DEFAULT_CLIENT_EMAIL)
                logger.info("Sending warning email to %r" % receiver)
                notifications.send_email(
                    subject="Luigi: %s failed scheduling" % (task, ),
                    message="%s:\n%s" % (msg, traceback.format_exc()),
                    sender=sender,
                    recipients=(receiver, ))
                return
                # abort, i.e. don't schedule any subtasks of a task with
                # failing complete()-method since we don't know if the task
                # is complete and subtasks might not be desirable to run if
                # they have already run before

            if is_complete:
                # Not submitting dependencies of finished tasks
                self.__scheduler.add_task(self.__id,
                                          task_id,
                                          status=DONE,
                                          runnable=False)

            elif task.run == NotImplemented:
                self.__scheduled_tasks[task_id] = task
                self.__scheduler.add_task(self.__id,
                                          task_id,
                                          status=PENDING,
                                          runnable=False)
                logger.warning(
                    'Task %s is not complete and run() is not implemented. Probably a missing external dependency.',
                    task_id)
            else:
                self.__scheduled_tasks[task_id] = task
                deps = task.deps()
                for d in deps:
                    if isinstance(d, Target):
                        raise Exception(
                            'requires() can not return Target objects. Wrap it in an ExternalTask class'
                        )
                    elif not isinstance(d, Task):
                        raise Exception('requires() must return Task objects')
                deps = [d.task_id for d in task.deps()]
                self.__scheduler.add_task(self.__id,
                                          task_id,
                                          status=PENDING,
                                          deps=deps,
                                          runnable=True)
                logger.info('Scheduled %s' % task_id)

                for task_2 in task.deps():
                    self.add(task_2)  # Schedule stuff recursively
        except KeyboardInterrupt:
            raise
        except:
            logger.exception(
                "Luigi unexpected framework error while scheduling %s" % task)
            receiver = interface.get_config().get('core', 'error-email', None)
            sender = interface.get_config().get(
                'core', 'email-sender', notifications.DEFAULT_CLIENT_EMAIL)
            notifications.send_email(
                subject="Luigi: Framework error while scheduling %s" %
                (task, ),
                message="Luigi framework error:\n%s" % traceback.format_exc(),
                recipients=(receiver, ),
                sender=sender)
Example #17
    def run_job(self, job):
        packages = [luigi] + self.modules + job.extra_modules() + list(_attached_packages)

        # find the module containing the job
        packages.append(__import__(job.__module__, None, None, 'dummy'))

        # find the path to our runner.py
        runner_path = mrrunner.__file__
        # assume source is next to compiled
        if runner_path.endswith("pyc"):
            runner_path = runner_path[:-3] + "py"

        base_tmp_dir = interface.get_config().get('core', 'tmp-dir', '/tmp/luigi')
        self.tmp_dir = os.path.join(base_tmp_dir, 'hadoop_job_%016x' % random.getrandbits(64))
        logger.debug("Tmp dir: %s", self.tmp_dir)
        os.makedirs(self.tmp_dir)

        # build arguments
        map_cmd = 'python mrrunner.py map'
        cmb_cmd = 'python mrrunner.py combiner'
        red_cmd = 'python mrrunner.py reduce'

        # replace output with a temporary work directory
        output_final = job.output().path
        output_tmp_fn = output_final + '-temp-' + datetime.datetime.now().isoformat().replace(':', '-')
        tmp_target = luigi.hdfs.HdfsTarget(output_tmp_fn, is_tmp=True)

        arglist = ['hadoop', 'jar', self.streaming_jar]

        # 'libjars' is a generic option, so place it first
        libjars = list(self.libjars)  # copy so we can append jars fetched from hdfs

        for libjar in self.libjars_in_hdfs:
            subprocess.call(['hadoop', 'fs', '-get', libjar, self.tmp_dir])
            libjars.append(os.path.join(self.tmp_dir, os.path.basename(libjar)))

        if libjars:
            arglist += ['-libjars', ','.join(libjars)]

        # Add static files and directories
        extra_files = get_extra_files(job.extra_files())

        files = []
        for src, dst in extra_files:
            dst_tmp = '%s_%09d' % (dst.replace('/', '_'), random.randint(0, 999999999))
            files += ['%s#%s' % (src, dst_tmp)]
            # -files doesn't support subdirectories, so we need to create the dst_tmp -> dst manually
            job._add_link(dst_tmp, dst)

        if files:
            arglist += ['-files', ','.join(files)]

        jobconfs = job.jobconfs()

        for k, v in self.jobconfs.iteritems():
            jobconfs.append('%s=%s' % (k, v))

        for conf in jobconfs:
            arglist += ['-D', conf]

        arglist += self.streaming_args

        arglist += ['-mapper', map_cmd, '-reducer', red_cmd]
        if job.combiner != NotImplemented:
            arglist += ['-combiner', cmb_cmd]
        files = [runner_path, self.tmp_dir + '/packages.tar', self.tmp_dir + '/job-instance.pickle']

        for f in files:
            arglist += ['-file', f]

        if self.output_format:
            arglist += ['-outputformat', self.output_format]
        if self.input_format:
            arglist += ['-inputformat', self.input_format]

        for target in luigi.task.flatten(job.input_hadoop()):
            assert isinstance(target, luigi.hdfs.HdfsTarget)
            arglist += ['-input', target.path]

        assert isinstance(job.output(), luigi.hdfs.HdfsTarget)
        arglist += ['-output', output_tmp_fn]

        # submit job
        create_packages_archive(packages, self.tmp_dir + '/packages.tar')

        job._dump(self.tmp_dir)

        self.run_and_track_hadoop_job(arglist)

        # rename temporary work directory to given output
        tmp_target.move(output_final, fail_if_exists=True)
        self.finish()