Example #1
File: worker.py Project: xbaro/codalab
    def run(task_id, task_args):
        """
        Performs a Run.

        task_id: The tracking ID for this task.
        task_args: The input arguments for this task.
        """
        run_id = task_args['bundle_id']
        execution_time_limit = task_args['execution_time_limit']
        container = task_args['container_name']
        reply_to_queue_name = task_args['reply_to']
        is_predict_step = task_args.get("predict", False)
        queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                     config.getAzureServiceBusKey(),
                                     config.getAzureServiceBusIssuer(),
                                     config.get_service_bus_shared_access_key_name(),
                                     config.get_service_bus_shared_access_key_value(),
                                     reply_to_queue_name)
        root_dir = None
        current_dir = os.getcwd()
        temp_dir = config.getLocalRoot()
        try:
            running_processes = subprocess.check_output(["fuser", temp_dir])
        except subprocess.CalledProcessError:
            running_processes = ''
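
The try/except above works because fuser exits with a non-zero status when no process is using the given path, which subprocess.check_output surfaces as CalledProcessError. A stand-alone sketch of that check (the helper name is illustrative, not from the project):

import subprocess

def list_processes_using(path):
    """Return fuser's output for path, or '' when no process is using it."""
    try:
        # fuser exits non-zero when nothing accesses the path; check_output
        # reports that as CalledProcessError.
        return subprocess.check_output(["fuser", path])
    except subprocess.CalledProcessError:
        return ''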
Example #2
    def run(task_id, task_args):
        """
        Performs a Run.

        task_id: The tracking ID for this task.
        task_args: The input arguments for this task.
        """
        run_id = task_args['bundle_id']
        execution_time_limit = task_args['execution_time_limit']
        container = task_args['container_name']
        reply_to_queue_name = task_args['reply_to']
        is_predict_step = task_args.get("predict", False)
        queue = AzureServiceBusQueue(
            config.getAzureServiceBusNamespace(),
            config.getAzureServiceBusKey(), config.getAzureServiceBusIssuer(),
            config.get_service_bus_shared_access_key_name(),
            config.get_service_bus_shared_access_key_value(),
            reply_to_queue_name)
        root_dir = None
        current_dir = os.getcwd()
        temp_dir = config.getLocalRoot()
        try:
            running_processes = subprocess.check_output(["fuser", temp_dir])
        except subprocess.CalledProcessError:
            running_processes = ''
Example #3
def main():
    """
    Set up the worker and start it.
    """
    config = WorkerConfig()

    logging.config.dictConfig(config.getLoggerDictConfig())

    # queue to listen to for notifications of tasks to perform
    queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                 config.getAzureServiceBusKey(),
                                 config.getAzureServiceBusIssuer(),
                                 config.getAzureServiceBusQueue())
    # map task type to function to accomplish the task
    vtable = {'run': get_run_func(config)}
    # create and start the worker
    worker = BaseWorker(queue, vtable, logger)
    logger.info("Starting compute worker.")
    worker.start()
Example #4
def main():
    """
    Set up the worker and start it.
    """
    config = WorkerConfig()

    logging.config.dictConfig(config.getLoggerDictConfig())

    # queue to listen to for notifications of tasks to perform
    queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                 config.getAzureServiceBusKey(),
                                 config.getAzureServiceBusIssuer(),
                                 config.getAzureServiceBusQueue())
    # map task type to function to accomplish the task
    vtable = {
        'run' : get_run_func(config)
    }
    # create and start the worker
    worker = BaseWorker(queue, vtable, logger)
    logger.info("Starting compute worker.")
    worker.start()
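
The vtable above maps a task-type string to the function that handles it. BaseWorker itself is not shown in these examples; a minimal sketch of the dispatch idea it presumably relies on (names here are illustrative):

def dispatch(vtable, task_type, task_id, task_args):
    """Look up the handler registered for task_type and invoke it."""
    handler = vtable.get(task_type)
    if handler is None:
        raise ValueError("No handler registered for task type: %r" % (task_type,))
    return handler(task_id, task_args)

# e.g. dispatch(vtable, 'run', task_id, task_args) calls the function
# returned by get_run_func(config).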
Example #5
    def run(task_id, task_args):
        """
        Performs a Run.

        task_id: The tracking ID for this task.
        task_args: The input arguments for this task.
        """
        run_id = task_args['bundle_id']
        execution_time_limit = task_args['execution_time_limit']
        container = task_args['container_name']
        reply_to_queue_name = task_args['reply_to']
        queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                     config.getAzureServiceBusKey(),
                                     config.getAzureServiceBusIssuer(),
                                     reply_to_queue_name)
        root_dir = None
        current_dir = os.getcwd()

        try:
            _send_update(queue, task_id, 'running')
            # Create temporary directory for the run
            root_dir = tempfile.mkdtemp(dir=config.getLocalRoot())
            # Fetch and stage the bundles
            blob_service = BlobService(config.getAzureStorageAccountName(),
                                       config.getAzureStorageAccountKey())
            bundles = getBundle(root_dir, blob_service, container, run_id, 'run')
            # Verify we have an input folder: create one if it's not in the bundle.
            input_rel_path = join('run', 'input')
            if input_rel_path not in bundles:
                input_dir = join(root_dir, 'run', 'input')
                if os.path.exists(input_dir) == False:
                    os.mkdir(input_dir)
            # Verify we have a program
            prog_rel_path = join('run', 'program')
            if prog_rel_path not in bundles:
                raise Exception("Program bundle is not available.")
            prog_info = bundles[prog_rel_path]
            if prog_info is None:
                raise Exception("Program metadata is not available.")
            prog_cmd = ""
            if 'command' in prog_info:
                prog_cmd = prog_info['command'].strip()
            if len(prog_cmd) <= 0:
                raise Exception("Program command is not specified.")
            # Create output folder
            output_dir = join(root_dir, 'run', 'output')
            if os.path.exists(output_dir) == False:
                os.mkdir(output_dir)
            # Create temp folder
            temp_dir = join(root_dir, 'run', 'temp')
            if os.path.exists(temp_dir) == False:
                os.mkdir(temp_dir)
            # Report the list of folders and files staged
            #
            # Invoke custom evaluation program
            run_dir = join(root_dir, 'run')
            os.chdir(run_dir)
            logger.debug("Execution directory: %s", run_dir)
            # Update command-line with the real paths
            logger.debug("CMD: %s", prog_cmd)
            prog_cmd = prog_cmd.replace("$program", join('.', 'program')) \
                                .replace("$input", join('.', 'input')) \
                                .replace("$output", join('.', 'output')) \
                                .replace("$tmp", join('.', 'temp')) \
                                .replace("/", os.path.sep) \
                                .replace("\\", os.path.sep)
            logger.debug("Invoking program: %s", prog_cmd)
            stdout_file = join(run_dir, 'stdout.txt')
            stderr_file = join(run_dir, 'stderr.txt')
            startTime = time.time()
            exit_code = None
            timed_out = False

            with open(stdout_file, "wb") as out, open(stderr_file, "wb") as err:
                evaluator_process = Popen(prog_cmd.split(' '), stdout=out, stderr=err)

                while exit_code is None:
                    exit_code = evaluator_process.poll()

                    # time in seconds
                    if exit_code is None and time.time() - startTime > execution_time_limit:
                        evaluator_process.kill()
                        exit_code = -1
                        logger.info("Killed process for running too long!")
                        err.write("Execution time limit exceeded!")
                        timed_out = True
                        break
                    else:
                        time.sleep(.1)

            logger.debug("Exit Code: %d", exit_code)
            endTime = time.time()
            elapsedTime = endTime - startTime
            prog_status = {
                'exitCode': exit_code,
                'elapsedTime': elapsedTime
            }
            with open(join(output_dir, 'metadata'), 'w') as f:
                f.write(yaml.dump(prog_status, default_flow_style=False))
            # Upload stdout and stderr files
            stdout_id = "%s/stdout.txt" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, stdout_id, stdout_file)
            stderr_id = "%s/stderr.txt" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, stderr_id, stderr_file)

            # check if timed out AFTER output files are written! If we exit sooner, no output is written
            if timed_out:
                raise Exception("Execution time limit exceeded!")

            private_dir = join(output_dir, 'private')
            if os.path.exists(private_dir):
                logger.debug("Packing private results...")
                private_output_file = join(root_dir, 'run', 'private_output.zip')
                shutil.make_archive(os.path.splitext(private_output_file)[0], 'zip', output_dir)
                private_output_id = "%s/private_output.zip" % (os.path.splitext(run_id)[0])
                _upload(blob_service, container, private_output_id, private_output_file)
                shutil.rmtree(private_dir)

            # Pack results and send them to Blob storage
            logger.debug("Packing results...")
            output_file = join(root_dir, 'run', 'output.zip')
            shutil.make_archive(os.path.splitext(output_file)[0], 'zip', output_dir)
            output_id = "%s/output.zip" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, output_id, output_file)

            _send_update(queue, task_id, 'finished')
        except Exception:
            logger.exception("Run task failed (task_id=%s).", task_id)
            _send_update(queue, task_id, 'failed')

        # Comment out this block during development to inspect the raw folder outputs.
        if root_dir is not None:
            # Try cleaning-up temporary directory
            try:
                os.chdir(current_dir)
                shutil.rmtree(root_dir)
            except:
                logger.exception("Unable to clean-up local folder %s (task_id=%s)", root_dir, task_id)
Example #6
    def run(task_id, task_args):
        """
        Performs a Run.

        task_id: The tracking ID for this task.
        task_args: The input arguments for this task.
        """
        run_id = task_args['bundle_id']
        execution_time_limit = task_args['execution_time_limit']
        container = task_args['container_name']
        reply_to_queue_name = task_args['reply_to']
        is_predict_step = task_args.get("predict", False)
        queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                     config.getAzureServiceBusKey(),
                                     config.getAzureServiceBusIssuer(),
                                     reply_to_queue_name)
        root_dir = None
        current_dir = os.getcwd()

        try:
            _send_update(queue, task_id, 'running')
            # Create temporary directory for the run
            root_dir = tempfile.mkdtemp(dir=config.getLocalRoot())
            # Fetch and stage the bundles
            blob_service = BlobService(config.getAzureStorageAccountName(),
                                       config.getAzureStorageAccountKey())
            bundles = getBundle(root_dir, blob_service, container, run_id, 'run')
            # Verify we have an input folder: create one if it's not in the bundle.
            input_rel_path = join('run', 'input')
            if input_rel_path not in bundles:
                input_dir = join(root_dir, 'run', 'input')
                if os.path.exists(input_dir) == False:
                    os.mkdir(input_dir)
            # Verify we have a program
            prog_rel_path = join('run', 'program')
            if prog_rel_path not in bundles:
                raise Exception("Program bundle is not available.")

            prog_info = bundles[prog_rel_path]
            if prog_info is None:
                raise Exception("Program metadata is not available.")

            prog_cmd_list = []
            if 'command' in prog_info:
                if isinstance(prog_info['command'], type([])):
                    prog_cmd_list = [_.strip() for _ in prog_info['command']]
                else:
                    prog_cmd_list = [prog_info['command'].strip()]
            if len(prog_cmd_list) <= 0:
                raise Exception("Program command is not specified.")

            # Create output folder
            output_dir = join(root_dir, 'run', 'output')
            if os.path.exists(output_dir) == False:
                os.mkdir(output_dir)
            # Create temp folder
            temp_dir = join(root_dir, 'run', 'temp')
            if os.path.exists(temp_dir) == False:
                os.mkdir(temp_dir)
            # Report the list of folders and files staged
            #
            # Invoke custom evaluation program
            run_dir = join(root_dir, 'run')
            os.chdir(run_dir)
            os.environ["PATH"] += os.pathsep + run_dir + "/program"
            logger.debug("Execution directory: %s", run_dir)

            if is_predict_step:
                stdout_file_name = 'prediction_stdout_file.txt'
                stderr_file_name = 'prediction_stderr_file.txt'
            else:
                stdout_file_name = 'stdout.txt'
                stderr_file_name = 'stderr.txt'

            stdout_file = join(run_dir, stdout_file_name)
            stderr_file = join(run_dir, stderr_file_name)
            stdout = open(stdout_file, "a+")
            stderr = open(stderr_file, "a+")
            prog_status = []

            for prog_cmd_counter, prog_cmd in enumerate(prog_cmd_list):
                # Update command-line with the real paths
                logger.debug("CMD: %s", prog_cmd)
                prog_cmd = prog_cmd.replace("$program", join('.', 'program')) \
                                    .replace("$input", join('.', 'input')) \
                                    .replace("$output", join('.', 'output')) \
                                    .replace("$tmp", join('.', 'temp')) \
                                    .replace("/", os.path.sep) \
                                    .replace("\\", os.path.sep)
                logger.debug("Invoking program: %s", prog_cmd)

                startTime = time.time()
                exit_code = None
                timed_out = False

                evaluator_process = Popen(prog_cmd.split(' '), stdout=stdout, stderr=stderr, env=os.environ)

                logger.debug("Started process, pid=%s" % evaluator_process.pid)

                time_difference = time.time() - startTime
                signal.signal(signal.SIGALRM, alarm_handler)
                signal.alarm(int(math.fabs(math.ceil(execution_time_limit - time_difference))))

                exit_code = None

                logger.debug("Checking process, exit_code = %s" % exit_code)

                try:
                    while exit_code == None:
                        time.sleep(1)
                        exit_code = evaluator_process.poll()
                except (ValueError, OSError):
                    pass # tried to communicate with dead process
                except ExecutionTimeLimitExceeded:
                    exit_code = -1
                    logger.info("Killed process for running too long!")
                    stderr.write("Execution time limit exceeded!")
                    evaluator_process.kill()
                    timed_out = True

                signal.alarm(0)

                logger.debug("Exit Code: %d", exit_code)

                endTime = time.time()
                elapsedTime = endTime - startTime

                if len(prog_cmd_list) == 1:
                    # Overwrite prog_status array with dict
                    prog_status = {
                        'exitCode': exit_code,
                        'elapsedTime': elapsedTime
                    }
                else:
                    # Otherwise we're running multiple commands (multi-track), so append to the list
                    prog_status.append({
                        'exitCode': exit_code,
                        'elapsedTime': elapsedTime
                    })
                with open(join(output_dir, 'metadata'), 'w') as f:
                    f.write(yaml.dump(prog_status, default_flow_style=False))

            stdout.close()
            stderr.close()

            logger.debug("Saving output files")
            stdout_id = "%s/%s" % (os.path.splitext(run_id)[0], stdout_file_name)
            _upload(blob_service, container, stdout_id, stdout_file)
            stderr_id = "%s/%s" % (os.path.splitext(run_id)[0], stderr_file_name)
            _upload(blob_service, container, stderr_id, stderr_file)

            private_dir = join(output_dir, 'private')
            if os.path.exists(private_dir):
                logger.debug("Packing private results...")
                private_output_file = join(root_dir, 'run', 'private_output.zip')
                shutil.make_archive(os.path.splitext(private_output_file)[0], 'zip', output_dir)
                private_output_id = "%s/private_output.zip" % (os.path.splitext(run_id)[0])
                _upload(blob_service, container, private_output_id, private_output_file)
                shutil.rmtree(private_dir)

            # Pack results and send them to Blob storage
            logger.debug("Packing results...")
            output_file = join(root_dir, 'run', 'output.zip')
            shutil.make_archive(os.path.splitext(output_file)[0], 'zip', output_dir)
            output_id = "%s/output.zip" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, output_id, output_file)

            # Check whether the output folder contains an HTML file; if so, upload it as detailed_results.html
            # traverse root directory, and list directories as dirs and files as files
            html_found = False
            for root, dirs, files in os.walk(output_dir):
                if not (html_found):
                    path = root.split('/')
                    for file in files:
                        file_to_upload = os.path.join(root,file)
                        file_ext = os.path.splitext(file_to_upload)[1]
                        if file_ext.lower() ==".html":
                            html_file_id = "%s/html/%s" % (os.path.splitext(run_id)[0],"detailed_results.html")
                            _upload(blob_service, container, html_file_id, file_to_upload, "html")
                            html_found = True

            # check if timed out AFTER output files are written! If we exit sooner, no output is written
            if timed_out:
                logger.exception("Run task timed out (task_id=%s).", task_id)
                _send_update(queue, task_id, 'failed')
            elif exit_code != 0:
                logger.exception("Run task exit code non-zero (task_id=%s).", task_id)
                _send_update(queue, task_id, 'failed', extra={'traceback': open(stderr_file).read()})
            else:
                _send_update(queue, task_id, 'finished')
        except Exception:
            logger.exception("Run task failed (task_id=%s).", task_id)
            _send_update(queue, task_id, 'failed', extra={'traceback': traceback.format_exc()})

        # Comment out this block during development to inspect the raw folder outputs.
        if root_dir is not None:
            # Try cleaning-up temporary directory
            try:
                os.chdir(current_dir)
                shutil.rmtree(root_dir)
            except:
                logger.exception("Unable to clean-up local folder %s (task_id=%s)", root_dir, task_id)

        # Cleanup any stuck processes or old files
        temp_dir = config.getLocalRoot()

        # Cleanup dir
        for the_file in os.listdir(temp_dir):
            file_path = os.path.join(temp_dir, the_file)
            if os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)

        # Kill running processes in the temp dir
        call(["fuser", "-f", temp_dir])
Example #7
    def run(task_id, task_args):
        """
        Performs a Run.

        task_id: The tracking ID for this task.
        task_args: The input arguments for this task.
        """
        run_id = task_args['bundle_id']
        execution_time_limit = task_args['execution_time_limit']
        container = task_args['container_name']
        reply_to_queue_name = task_args['reply_to']
        queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                     config.getAzureServiceBusKey(),
                                     config.getAzureServiceBusIssuer(),
                                     reply_to_queue_name)
        root_dir = None
        current_dir = os.getcwd()

        try:
            _send_update(queue, task_id, 'running')
            # Create temporary directory for the run
            root_dir = tempfile.mkdtemp(dir=config.getLocalRoot())
            # Fetch and stage the bundles
            blob_service = BlobService(config.getAzureStorageAccountName(),
                                       config.getAzureStorageAccountKey())
            bundles = getBundle(root_dir, blob_service, container, run_id,
                                'run')
            # Verify we have an input folder: create one if it's not in the bundle.
            input_rel_path = join('run', 'input')
            if input_rel_path not in bundles:
                input_dir = join(root_dir, 'run', 'input')
                if os.path.exists(input_dir) == False:
                    os.mkdir(input_dir)
            # Verify we have a program
            prog_rel_path = join('run', 'program')
            if prog_rel_path not in bundles:
                raise Exception("Program bundle is not available.")
            prog_info = bundles[prog_rel_path]
            if prog_info is None:
                raise Exception("Program metadata is not available.")
            prog_cmd = ""
            if 'command' in prog_info:
                prog_cmd = prog_info['command'].strip()
            if len(prog_cmd) <= 0:
                raise Exception("Program command is not specified.")
            # Create output folder
            output_dir = join(root_dir, 'run', 'output')
            if os.path.exists(output_dir) == False:
                os.mkdir(output_dir)
            # Create temp folder
            temp_dir = join(root_dir, 'run', 'temp')
            if os.path.exists(temp_dir) == False:
                os.mkdir(temp_dir)
            # Report the list of folders and files staged
            #
            # Invoke custom evaluation program
            run_dir = join(root_dir, 'run')
            os.chdir(run_dir)
            os.environ["PATH"] += os.pathsep + run_dir + "/program"
            logger.debug("Execution directory: %s", run_dir)
            # Update command-line with the real paths
            logger.debug("CMD: %s", prog_cmd)
            prog_cmd = prog_cmd.replace("$program", join('.', 'program')) \
                                .replace("$input", join('.', 'input')) \
                                .replace("$output", join('.', 'output')) \
                                .replace("$tmp", join('.', 'temp')) \
                                .replace("/", os.path.sep) \
                                .replace("\\", os.path.sep)
            logger.debug("Invoking program: %s", prog_cmd)
            stdout_file = join(run_dir, 'stdout.txt')
            stderr_file = join(run_dir, 'stderr.txt')
            startTime = time.time()
            exit_code = None
            timed_out = False

            with open(stdout_file, "wb") as out, open(stderr_file,
                                                      "wb") as err:
                evaluator_process = Popen(prog_cmd.split(' '),
                                          stdout=out,
                                          stderr=err)

                while exit_code is None:
                    exit_code = evaluator_process.poll()

                    # time in seconds
                    if exit_code is None and time.time(
                    ) - startTime > execution_time_limit:
                        exit_code = -1
                        logger.info("Killed process for running too long!")
                        err.write("Execution time limit exceeded!")
                        evaluator_process.kill()
                        timed_out = True
                        break
                    else:
                        time.sleep(.1)

            logger.debug("Exit Code: %d", exit_code)
            endTime = time.time()
            elapsedTime = endTime - startTime
            prog_status = {'exitCode': exit_code, 'elapsedTime': elapsedTime}
            with open(join(output_dir, 'metadata'), 'w') as f:
                f.write(yaml.dump(prog_status, default_flow_style=False))
            # Upload stdout and stderr files
            stdout_id = "%s/stdout.txt" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, stdout_id, stdout_file)
            stderr_id = "%s/stderr.txt" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, stderr_id, stderr_file)

            private_dir = join(output_dir, 'private')
            if os.path.exists(private_dir):
                logger.debug("Packing private results...")
                private_output_file = join(root_dir, 'run',
                                           'private_output.zip')
                shutil.make_archive(
                    os.path.splitext(private_output_file)[0], 'zip',
                    output_dir)
                private_output_id = "%s/private_output.zip" % (
                    os.path.splitext(run_id)[0])
                _upload(blob_service, container, private_output_id,
                        private_output_file)
                shutil.rmtree(private_dir)

            # Pack results and send them to Blob storage
            logger.debug("Packing results...")
            output_file = join(root_dir, 'run', 'output.zip')
            shutil.make_archive(
                os.path.splitext(output_file)[0], 'zip', output_dir)
            output_id = "%s/output.zip" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, output_id, output_file)

            # Check whether the output folder contains an HTML file; if so, upload it as detailed_results.html
            # traverse root directory, and list directories as dirs and files as files
            html_found = False
            for root, dirs, files in os.walk(output_dir):
                if not (html_found):
                    path = root.split('/')
                    for file in files:
                        file_to_upload = os.path.join(root, file)
                        file_ext = os.path.splitext(file_to_upload)[1]
                        if file_ext.lower() == ".html":
                            html_file_id = "%s/html/%s" % (os.path.splitext(
                                run_id)[0], "detailed_results.html")
                            print "file_to_upload:%s" % file_to_upload
                            _upload(blob_service, container, html_file_id,
                                    file_to_upload, "html")
                            html_found = True

            # check if timed out AFTER output files are written! If we exit sooner, no output is written
            if timed_out:
                logger.exception("Run task failed (task_id=%s).", task_id)
                _send_update(queue, task_id, 'failed')
            else:
                _send_update(queue, task_id, 'finished')
        except Exception:
            logger.exception("Run task failed (task_id=%s).", task_id)
            _send_update(queue, task_id, 'failed')

        # Comment out this block during development to inspect the raw folder outputs.
        if root_dir is not None:
            # Try cleaning-up temporary directory
            try:
                os.chdir(current_dir)
                shutil.rmtree(root_dir)
            except:
                logger.exception(
                    "Unable to clean-up local folder %s (task_id=%s)",
                    root_dir, task_id)
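
Every run() variant expands the same $program/$input/$output/$tmp placeholders with a chain of str.replace calls. The same substitution pulled out into a helper for clarity (the helper itself is not part of the project):

import os
from os.path import join

def expand_placeholders(cmd):
    """Rewrite a bundle command with the real relative paths."""
    for placeholder, path in (("$program", join('.', 'program')),
                              ("$input", join('.', 'input')),
                              ("$output", join('.', 'output')),
                              ("$tmp", join('.', 'temp'))):
        cmd = cmd.replace(placeholder, path)
    # Normalize both separator styles to the local one.
    return cmd.replace("/", os.path.sep).replace("\\", os.path.sep)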
Example #8
    def run(task_id, task_args):
        """
        Performs a Run.

        task_id: The tracking ID for this task.
        task_args: The input arguments for this task.
        """
        run_id = task_args['bundle_id']
        container = task_args['container_name']
        reply_to_queue_name = task_args['reply_to']
        queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                     config.getAzureServiceBusKey(),
                                     config.getAzureServiceBusIssuer(),
                                     reply_to_queue_name)
        root_dir = None
        current_dir = os.getcwd()
        try:
            _send_update(queue, task_id, 'running')

            # Create temporary directory for the run
            root_dir = tempfile.mkdtemp(dir=config.getLocalRoot())
            # Fetch and stage the bundles
            blob_service = BlobService(config.getAzureStorageAccountName(),
                                       config.getAzureStorageAccountKey())
            bundles = getBundle(root_dir, blob_service, container, run_id,
                                'run')
            # Verify we have an input folder: create one if it's not in the bundle.
            input_rel_path = join('run', 'input')
            if input_rel_path not in bundles:
                input_dir = join(root_dir, 'run', 'input')
                if os.path.exists(input_dir) == False:
                    os.mkdir(input_dir)
            # Verify we have a program
            prog_rel_path = join('run', 'program')
            if prog_rel_path not in bundles:
                raise Exception("Program bundle is not available.")
            prog_info = bundles[prog_rel_path]
            if prog_info is None:
                raise Exception("Program metadata is not available.")
            prog_cmd = ""
            if 'command' in prog_info:
                prog_cmd = prog_info['command'].strip()
            if len(prog_cmd) <= 0:
                raise Exception("Program command is not specified.")
            # Create output folder
            output_dir = join(root_dir, 'run', 'output')
            if os.path.exists(output_dir) == False:
                os.mkdir(output_dir)
            # Create temp folder
            temp_dir = join(root_dir, 'run', 'temp')
            if os.path.exists(temp_dir) == False:
                os.mkdir(temp_dir)
            # Report the list of folders and files staged
            #
            # Invoke custom evaluation program
            run_dir = join(root_dir, 'run')
            os.chdir(run_dir)
            logger.debug("Execution directory: %s", run_dir)
            # Update command-line with the real paths
            logger.debug("CMD: %s", prog_cmd)
            prog_cmd = prog_cmd.replace("$program", join('.', 'program')) \
                                .replace("$input", join('.', 'input')) \
                                .replace("$output", join('.', 'output')) \
                                .replace("$tmp", join('.', 'temp')) \
                                .replace("/", os.path.sep) \
                                .replace("\\", os.path.sep)
            logger.debug("Invoking program: %s", prog_cmd)
            stdout_file = join(run_dir, 'stdout.txt')
            stderr_file = join(run_dir, 'stderr.txt')
            startTime = time.time()
            exitCode = os.system(prog_cmd + ' >' + stdout_file + ' 2>' +
                                 stderr_file)  # Run it!
            logger.debug("Exit Code: %d", exitCode)
            endTime = time.time()
            elapsedTime = endTime - startTime
            prog_status = {'exitCode': exitCode, 'elapsedTime': elapsedTime}
            with open(join(output_dir, 'metadata'), 'w') as f:
                f.write(yaml.dump(prog_status, default_flow_style=False))
            # Upload stdout and stderr files
            stdout_id = "%s/stdout.txt" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, stdout_id, stdout_file)
            stderr_id = "%s/stderr.txt" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, stderr_id, stderr_file)
            # Pack results and send them to Blob storage
            logger.debug("Packing results...")
            output_file = join(root_dir, 'run', 'output.zip')
            shutil.make_archive(
                os.path.splitext(output_file)[0], 'zip', output_dir)
            output_id = "%s/output.zip" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, output_id, output_file)

            _send_update(queue, task_id, 'finished')
        except Exception:
            logger.exception("Run task failed (task_id=%s).", task_id)
            _send_update(queue, task_id, 'failed')

        if root_dir is not None:
            # Try cleaning-up temporary directory
            try:
                os.chdir(current_dir)
                shutil.rmtree(root_dir)
            except:
                logger.exception(
                    "Unable to clean-up local folder %s (task_id=%s)",
                    root_dir, task_id)
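
Examples #8 and #10 record the value returned by os.system() as the exit code; on POSIX that value is the raw wait status, so the child's actual exit code has to be decoded from it. A short sketch of that decoding (the shell command is purely illustrative):

import os

status = os.system("exit 3")            # illustrative command only
if os.WIFEXITED(status):
    exit_code = os.WEXITSTATUS(status)  # 3 on POSIX
else:
    exit_code = -1                      # terminated by a signal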
Example #9
    def run(task_id, task_args):
        """
        Performs a Run.

        task_id: The tracking ID for this task.
        task_args: The input arguments for this task.
        """
        run_id = task_args['bundle_id']
        execution_time_limit = task_args['execution_time_limit']
        container = task_args['container_name']
        reply_to_queue_name = task_args['reply_to']
        is_predict_step = task_args.get("predict", False)
        queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                     config.getAzureServiceBusKey(),
                                     config.getAzureServiceBusIssuer(),
                                     reply_to_queue_name)
        root_dir = None
        current_dir = os.getcwd()

        try:
            _send_update(queue, task_id, 'running')
            # Create temporary directory for the run
            root_dir = tempfile.mkdtemp(dir=config.getLocalRoot())
            # Fetch and stage the bundles
            blob_service = BlobService(config.getAzureStorageAccountName(),
                                       config.getAzureStorageAccountKey())
            bundles = getBundle(root_dir, blob_service, container, run_id,
                                'run')
            # Verify we have an input folder: create one if it's not in the bundle.
            input_rel_path = join('run', 'input')
            if input_rel_path not in bundles:
                input_dir = join(root_dir, 'run', 'input')
                if os.path.exists(input_dir) == False:
                    os.mkdir(input_dir)
            # Verify we have a program
            prog_rel_path = join('run', 'program')
            if prog_rel_path not in bundles:
                raise Exception("Program bundle is not available.")

            prog_info = bundles[prog_rel_path]
            if prog_info is None:
                raise Exception("Program metadata is not available.")

            prog_cmd_list = []
            if 'command' in prog_info:
                if isinstance(prog_info['command'], type([])):
                    prog_cmd_list = [_.strip() for _ in prog_info['command']]
                else:
                    prog_cmd_list = [prog_info['command'].strip()]
            if len(prog_cmd_list) <= 0:
                raise Exception("Program command is not specified.")

            # Create output folder
            output_dir = join(root_dir, 'run', 'output')
            if os.path.exists(output_dir) == False:
                os.mkdir(output_dir)
            # Create temp folder
            temp_dir = join(root_dir, 'run', 'temp')
            if os.path.exists(temp_dir) == False:
                os.mkdir(temp_dir)
            # Report the list of folders and files staged
            #
            # Invoke custom evaluation program
            run_dir = join(root_dir, 'run')
            os.chdir(run_dir)
            os.environ["PATH"] += os.pathsep + run_dir + "/program"
            logger.debug("Execution directory: %s", run_dir)

            if is_predict_step:
                stdout_file_name = 'prediction_stdout_file.txt'
                stderr_file_name = 'prediction_stderr_file.txt'
            else:
                stdout_file_name = 'stdout.txt'
                stderr_file_name = 'stderr.txt'

            stdout_file = join(run_dir, stdout_file_name)
            stderr_file = join(run_dir, stderr_file_name)
            stdout = open(stdout_file, "a+")
            stderr = open(stderr_file, "a+")
            prog_status = []

            for prog_cmd_counter, prog_cmd in enumerate(prog_cmd_list):
                # Update command-line with the real paths
                logger.debug("CMD: %s", prog_cmd)
                prog_cmd = prog_cmd.replace("$program", join('.', 'program')) \
                                    .replace("$input", join('.', 'input')) \
                                    .replace("$output", join('.', 'output')) \
                                    .replace("$tmp", join('.', 'temp')) \
                                    .replace("/", os.path.sep) \
                                    .replace("\\", os.path.sep)
                logger.debug("Invoking program: %s", prog_cmd)

                startTime = time.time()
                exit_code = None
                timed_out = False

                evaluator_process = Popen(prog_cmd.split(' '),
                                          stdout=stdout,
                                          stderr=stderr,
                                          env=os.environ)

                logger.debug("Started process, pid=%s" % evaluator_process.pid)

                time_difference = time.time() - startTime
                signal.signal(signal.SIGALRM, alarm_handler)
                signal.alarm(
                    int(
                        math.fabs(
                            math.ceil(execution_time_limit -
                                      time_difference))))

                exit_code = None

                logger.debug("Checking process, exit_code = %s" % exit_code)

                try:
                    while exit_code == None:
                        time.sleep(1)
                        exit_code = evaluator_process.poll()
                except (ValueError, OSError):
                    pass  # tried to communicate with dead process
                except ExecutionTimeLimitExceeded:
                    exit_code = -1
                    logger.info("Killed process for running too long!")
                    stderr.write("Execution time limit exceeded!")
                    evaluator_process.kill()
                    timed_out = True

                signal.alarm(0)

                logger.debug("Exit Code: %d", exit_code)

                endTime = time.time()
                elapsedTime = endTime - startTime

                if len(prog_cmd_list) == 1:
                    # Overwrite prog_status array with dict
                    prog_status = {
                        'exitCode': exit_code,
                        'elapsedTime': elapsedTime
                    }
                else:
                    # Otherwise we're running multiple commands (multi-track), so append to the list
                    prog_status.append({
                        'exitCode': exit_code,
                        'elapsedTime': elapsedTime
                    })
                with open(join(output_dir, 'metadata'), 'w') as f:
                    f.write(yaml.dump(prog_status, default_flow_style=False))

            stdout.close()
            stderr.close()

            logger.debug("Saving output files")
            stdout_id = "%s/%s" % (os.path.splitext(run_id)[0],
                                   stdout_file_name)
            _upload(blob_service, container, stdout_id, stdout_file)
            stderr_id = "%s/%s" % (os.path.splitext(run_id)[0],
                                   stderr_file_name)
            _upload(blob_service, container, stderr_id, stderr_file)

            private_dir = join(output_dir, 'private')
            if os.path.exists(private_dir):
                logger.debug("Packing private results...")
                private_output_file = join(root_dir, 'run',
                                           'private_output.zip')
                shutil.make_archive(
                    os.path.splitext(private_output_file)[0], 'zip',
                    output_dir)
                private_output_id = "%s/private_output.zip" % (
                    os.path.splitext(run_id)[0])
                _upload(blob_service, container, private_output_id,
                        private_output_file)
                shutil.rmtree(private_dir)

            # Pack results and send them to Blob storage
            logger.debug("Packing results...")
            output_file = join(root_dir, 'run', 'output.zip')
            shutil.make_archive(
                os.path.splitext(output_file)[0], 'zip', output_dir)
            output_id = "%s/output.zip" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, output_id, output_file)

            # Check whether the output folder contains an HTML file; if so, upload it as detailed_results.html
            # traverse root directory, and list directories as dirs and files as files
            html_found = False
            for root, dirs, files in os.walk(output_dir):
                if not (html_found):
                    path = root.split('/')
                    for file in files:
                        file_to_upload = os.path.join(root, file)
                        file_ext = os.path.splitext(file_to_upload)[1]
                        if file_ext.lower() == ".html":
                            html_file_id = "%s/html/%s" % (os.path.splitext(
                                run_id)[0], "detailed_results.html")
                            _upload(blob_service, container, html_file_id,
                                    file_to_upload, "html")
                            html_found = True

            # check if timed out AFTER output files are written! If we exit sooner, no output is written
            if timed_out:
                logger.exception("Run task timed out (task_id=%s).", task_id)
                _send_update(queue, task_id, 'failed')
            elif exit_code != 0:
                logger.exception("Run task exit code non-zero (task_id=%s).",
                                 task_id)
                _send_update(queue,
                             task_id,
                             'failed',
                             extra={'traceback': open(stderr_file).read()})
            else:
                _send_update(queue, task_id, 'finished')
        except Exception:
            logger.exception("Run task failed (task_id=%s).", task_id)
            _send_update(queue,
                         task_id,
                         'failed',
                         extra={'traceback': traceback.format_exc()})

        # Comment out this block during development to inspect the raw folder outputs.
        if root_dir is not None:
            # Try cleaning-up temporary directory
            try:
                os.chdir(current_dir)
                shutil.rmtree(root_dir)
            except:
                logger.exception(
                    "Unable to clean-up local folder %s (task_id=%s)",
                    root_dir, task_id)

        # Cleanup any stuck processes or old files
        temp_dir = config.getLocalRoot()

        # Cleanup dir
        for the_file in os.listdir(temp_dir):
            file_path = os.path.join(temp_dir, the_file)
            if os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)

        # Kill running processes in the temp dir
        call(["fuser", "-f", temp_dir])
Example #10
    def run(task_id, task_args):
        """
        Performs a Run.

        task_id: The tracking ID for this task.
        task_args: The input arguments for this task.
        """
        run_id = task_args['bundle_id']
        container = task_args['container_name']
        reply_to_queue_name = task_args['reply_to']
        queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                     config.getAzureServiceBusKey(),
                                     config.getAzureServiceBusIssuer(),
                                     reply_to_queue_name)
        root_dir = None
        current_dir = os.getcwd()
        try:
            _send_update(queue, task_id, 'running')

            # Create temporary directory for the run
            root_dir = tempfile.mkdtemp(dir=config.getLocalRoot())
            # Fetch and stage the bundles
            blob_service = BlobService(config.getAzureStorageAccountName(),
                                       config.getAzureStorageAccountKey())
            bundles = getBundle(root_dir, blob_service, container, run_id, 'run')
            # Verify we have an input folder: create one if it's not in the bundle.
            input_rel_path = join('run', 'input')
            if input_rel_path not in bundles:
                input_dir = join(root_dir, 'run', 'input')
                if os.path.exists(input_dir) == False:
                    os.mkdir(input_dir)
            # Verify we have a program
            prog_rel_path = join('run', 'program')
            if prog_rel_path not in bundles:
                raise Exception("Program bundle is not available.")
            prog_info = bundles[prog_rel_path]
            if prog_info is None:
                raise Exception("Program metadata is not available.")
            prog_cmd = ""
            if 'command' in prog_info:
                prog_cmd = prog_info['command'].strip()
            if len(prog_cmd) <= 0:
                raise Exception("Program command is not specified.")
            # Create output folder
            output_dir = join(root_dir, 'run', 'output')
            if os.path.exists(output_dir) == False:
                os.mkdir(output_dir)
            # Create temp folder
            temp_dir = join(root_dir, 'run', 'temp')
            if os.path.exists(temp_dir) == False:
                os.mkdir(temp_dir)
            # Report the list of folders and files staged
            #
            # Invoke custom evaluation program
            run_dir = join(root_dir, 'run')
            os.chdir(run_dir)
            logger.debug("Execution directory: %s" % run_dir)
            # Update command-line with the real paths
            logger.debug("CMD: %s" % prog_cmd)
            prog_cmd = prog_cmd.replace("$program", join('.', 'program')) \
                                .replace("$input", join('.', 'input')) \
                                .replace("$output", join('.', 'output')) \
                                .replace("$tmp", join('.', 'temp')) \
                                .replace("/", os.path.sep) \
                                .replace("\\", os.path.sep)
            logger.debug("Invoking program: %s", prog_cmd)
            stdout_file = join(run_dir, 'stdout.txt')
            stderr_file = join(run_dir, 'stderr.txt')
            startTime = time.time()
            exitCode = os.system(prog_cmd + ' >' + stdout_file + ' 2>' + stderr_file) # Run it!
            logger.debug("Exit Code: %d" % exitCode)
            endTime = time.time()
            elapsedTime = endTime - startTime
            prog_status = {
                'exitCode': exitCode,
                'elapsedTime': elapsedTime
            }
            with open(join(output_dir, 'metadata'), 'w') as f:
                f.write(yaml.dump(prog_status, default_flow_style=False))
            # Upload stdout and stderr files
            stdout_id = "%s/stdout.txt" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, stdout_id, stdout_file)
            stderr_id = "%s/stderr.txt" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, stderr_id, stderr_file)
            # Pack results and send them to Blob storage
            logger.debug("Packing results...")
            output_file = join(root_dir, 'run', 'output.zip')
            shutil.make_archive(os.path.splitext(output_file)[0], 'zip', output_dir)
            output_id = "%s/output.zip" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, output_id, output_file)

            _send_update(queue, task_id, 'finished')
        except Exception:
            logger.exception("Run task failed (task_id=%s).", task_id)
            _send_update(queue, task_id, 'failed')

        if root_dir is not None:
            # Try cleaning-up temporary directory
            try:
                os.chdir(current_dir)
                shutil.rmtree(root_dir)
            except:
                logger.exception("Unable to clean-up local folder %s (task_id=%s)", root_dir, task_id)