Example #1
    def add_job(self, job_dict):
        job = models.StitchJob()
        # TODO: job.id is the same as file_id
        job.id = str(time.time())
        job.src_filename = str(job_dict.get("src_filename", ""))
        job.src_file_id = ""
        job.dst_dir = str(job_dict.get("dst_dir", ""))
        job.dst_format = str(job_dict.get("dst_format", "flv"))
        job.segments = str(job_dict.get("segments", ""))
        job.map_filename = str(job_dict.get("map_filename", ""))
        job.map_file_id = ""

        print "src_filename = %s, dst_dir = %s" % (job.src_filename, job.dst_dir)
        if job.src_filename == "" or job.dst_dir == "":
            return None

        print "add_job, job_id: %s" % (job.id) 
        try:
            job_db_operator.add(job)
            job_manager = JobManager(job)
            self._job_managers.append(job_manager)
            return job.id
        except Exception as e:
            # TODO: make a response
            print(e)
        return None
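A hypothetical call to the method above; the scheduler object and the dictionary values are illustrative assumptions, while the keys match those read by add_job:
job_id = scheduler.add_job({
    "src_filename": "input.mp4",
    "dst_dir": "/data/output",
    "dst_format": "flv",
    "segments": "0-10,20-30",
    "map_filename": "input.map",
})
if job_id is None:
    print("add_job rejected the request")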
Example #2
    def handle_failed_result(self, task_id):
        """save the failed status result

        Args:
            task_id (str): the result under this task_id
        """
        from job_manager import JobManager
        logger = Logger().get()
        logger.debug(
            f"save_result: task_id: {task_id} status: {Status.Failed}")
        ResourceManager().save_result(task_id, [])
        ResourceManager().update_task_status(task_id, Status.Failed)
        JobManager().finish_task(task_id)
        try:
            container = self.id_to_task_container[task_id][1]
        except Exception as e:
            logger.info(
                f"exception while finding the container corresponding to {task_id}, status: {Status.Failed}; maybe the container was forcibly killed earlier, {e}"
            )
        else:
            self.id_to_task_container.pop(task_id, None)
            container.stop()
            container.remove()
            logger.debug(
                f"task {task_id}: command did not run normally; container stopped and removed")
Example #3
def dist_train(trainer,
               paddle_job):
    if os.getenv("RUNNING_ON_CLOUD", "NO") == "NO":
        job_manager = JobManager(paddle_job)
        if not job_manager.submit():
            print("submit Paddle Job failed.")
        else:
            print("submit Paddle Job succeeded.")
    else:
        trainer()
Example #4
def simulate(cluster_manager):
    # NUM_NODES = 50
    NUM_NODES = 20
    # NUM_TASKS = 18000
    NUM_JOBS = 40
    JOB_ARRIVAL_DURATION = 1000
    MACHINE_SPEC = ResourceVec(16, 64, 3000, 5)

    # a, m = 3., 2.  # shape and mode of distribution
    # durations = np.round((np.random.pareto(a, NUM_TASKS) + 1) * m)

    # Initialize cluster
    cluster = cluster_manager()

    # Add nodes
    for _ in range(NUM_NODES):
        cluster.addNode(MACHINE_SPEC)

    # Create jobs
    arrival_times = np.sort(
        np.round(
            np.random.uniform(low=0, high=JOB_ARRIVAL_DURATION,
                              size=NUM_JOBS)))
    jobs = []
    for i in range(NUM_JOBS):
        duration = 260
        numTasks = 30
        taskResources = randomResource(MACHINE_SPEC)
        jobs.append(JobManager(duration, numTasks, taskResources))

    print("{0} {1} {0}".format("=" * 15, cluster.name))
    print("Starting simulation.")

    time = 0
    jobIdx = 0
    # Run simulation
    while cluster.hasUncompletedJobs() or jobIdx < len(jobs):
        while jobIdx < len(jobs) and arrival_times[jobIdx] == time:
            job = jobs[jobIdx]
            cluster.assignJob(job)
            jobIdx += 1

        if time % 100 == 0:
            print(f'{time}s\n{cluster.status()}')
        cluster.tick()
        time += 1

    print(f'All jobs completed in {time}s.')

    job_durations = [job.endTime - job.startTime for job in jobs]
    print(job_durations)
    print(sum(job_durations) / len(job_durations))

    return time
Example #5
def trigger_job(instances_db,
                job_id,
                jobs_db,
                botleague_liaison_host,
                docker_tag=None,
                job_type=JOB_TYPE_EVAL):
    docker_tag = docker_tag or 'deepdriveio/problem-worker-test'
    eval_mgr = JobManager(jobs_db=jobs_db, instances_db=instances_db)
    eval_mgr.check_for_finished_jobs()
    test_job = Box(botleague_liaison_host=botleague_liaison_host,
                   status=JOB_STATUS_CREATED,
                   id=job_id,
                   job_type=job_type,
                   eval_spec=Box(docker_tag=docker_tag,
                                 eval_id=utils.generate_rand_alphanumeric(32),
                                 eval_key='fake_eval_key',
                                 seed=1,
                                 problem='domain_randomization',
                                 pull_request=None,
                                 max_seconds=20))

    # Make a copy of prod instances
    prod_instances_db = get_worker_instances_db(force_firestore_db=True)
    for inst in prod_instances_db.where('id', '>', ''):
        instances_db.set(inst.id, inst)

    try:
        eval_mgr.jobs_db.set(job_id, test_job)
        new_jobs, exceptions = eval_mgr.assign_jobs()
        assert not exceptions
        if new_jobs:
            # We don't actually start instances but we act like we did.
            assert new_jobs[0].status == JOB_STATUS_CREATED or \
                   new_jobs[0].instance_id

            if 'instance_id' in new_jobs[0]:
                instance_meta = eval_mgr.instances_db.get(
                    new_jobs[0].instance_id)

                # We have real instance meta, but the job was inserted into a
                # test collection that the instance is not watching, so the
                # job will not actually run.
                assert instance_meta.status == INSTANCE_STATUS_USED
        else:
            log.warning('Test did not find an instance to run. TODO: use'
                        ' test instance data.')
    finally:
        if jobs_db is not None:
            jobs_db.delete_all_test_data()
        if instances_db is not None:
            instances_db.delete_all_test_data()
Example #6
def main():
    job_manager = JobManager()

    if '--check-for-finished-jobs' in sys.argv:
        job_manager.check_for_finished_jobs()

    def loop_fn():
        ping_cronitor('run')
        # ci_mgr.run()
        job_manager.run()
        ping_cronitor('complete')

    SingletonLoop(loop_name=constants.JOB_LOOP_ID,
                  fn=loop_fn).run()
Example #7
    def setUp(self):

        print("Test with command\n%s\n" % self.command_line_input)
        print("Submitting job...")
        now = datetime.now()
        dummy_job = Job(self.job_name, self.comments, now)
        test_task = Task(self.location, self.tool_type,
                         self.command_line_input)
        dummy_job.tasks = [test_task]
        job_id, tasks_id = JobManager().submit_job(dummy_job)

        self.job_id = job_id
        self.tasks_id = tasks_id

        # Truncate to millisecond precision, presumably so that later
        # comparisons match the datastore's stored resolution.
        ms_without_ns = int(now.microsecond / 1000) * 1000
        self.now = now.replace(microsecond=ms_without_ns)
Example #8
    def test_GT_1507_class_job_manager(self):
        importDataDict = {
            TEST_showImageUrl: TEST_showImageUrl,
            TEST_showObjectUrl: TEST_showObjectUrl
        }
        threadJobObj = ThreadJob(backgroundFunction, TEST_channelName,
                                 TEST_openDataUrl, importDataDict,
                                 TEST_serviceName)
        threadJobObj.start()
        manager = JobManager()
        jobId = manager.startJob(threadJobObj)
        self.assertEqual(len(jobId), 12)
        self.assertEqual(type(manager.getJob(jobId)), dict)
        manager.stopJob(jobId)
        self.assertEqual(threadJobObj.done, True)
        self.assertEqual(type(manager.getJobs()), list)
        threadJobObj.stop()
Example #9
def init_server(args):
    global environ
    global job_manager
    global measurement_manager

    base_path = args.config_path
    modules = args.modules
    master = args.master

    modules = ['%s/%s' % (base_path, module) for module in modules]
    environ = load_environ('%s/config.json' % (base_path), modules)

    if master != '':
        environ['master'] = master

    job_manager = JobManager(environ)
    job_manager.register_completion_cb(job_completed)
    measurement_manager = MeasurementManager(environ)
Example #10
    def kill_task(self, task_id):
        """try to kill and remove the container correspoding to the given task_id, if succeed, update the status at RM
    
        Args:
            task_id (Task): The id of the task

        """
        logger = Logger().get()
        try:
            container = self.id_to_task_container[task_id][1]
            container.stop()
            container.remove()
            self.id_to_task_container.pop(task_id, None)
            ResourceManager().update_task_status(task_id, Status.Killed)
            from job_manager import JobManager
            JobManager().finish_task(task_id)
        except Exception as e:
            logger.error(
                f"failed to kill the container for {task_id}; maybe it does not exist or was already killed, exception: {e}"
            )
Example #11
    def __init__(self):
        self.manager = JobManager()
        self.base = automap_base()
        self.engine = create_engine(settings.CONNECTION_STR)
        self.base.prepare(self.engine, reflect=True)
        self.Session = sessionmaker(bind=self.engine)
Example #12
        # (snippet begins mid-method) collect non-dunder, non-callable
        # instance attributes so they can be persisted
        state_variables = {
            key: value
            for key, value in self.__dict__.items()
            if not key.startswith('__') and not callable(value)
        }
        with open(SERVER_STATE_FILE, 'w') as state_file:
            dump(state_variables, state_file, skipkeys=True)


# Called on first run of the server to initialise first pull job
def init():
    # Commented out for front end deployment to AWS
    job_manager.add_job(datetime.datetime.now(), PULL_LAND_REGISTRY_JOB, '')


Base.metadata.create_all(database_engine)
job_manager = JobManager(1, 'JobsManagerThread', 1, database_engine)

if '-i' in sys.argv:
    init()

server_state = ServerState()

external_stylesheets = ['']

# Reflect the dataset table from the database
Table('core_dataset',
      Base.metadata,
      autoload=True,
      autoload_with=database_engine,
      keep_existing=False,
      extend_existing=True)
Example #13
    def __init__(self, *args, **kwargs):
        super(StrategyAgent.TrainBehaviour, self).__init__()
        self.job_manager = JobManager(
            workers=[f'strategy_agent_worker1@{domain}', f'strategy_agent_worker2@{domain}'])
Example #14
def get_job_list():
    """Test command: curl --request GET http://localhost:5000/job-list/json
    """
    job_list = ResourceManager().get_job_list()
    print(JobManager().job_metadata)
    return {"Job_List": job_list}
Example #15
def submit_job_through_job_manager(job):
    JobManager().submit_job(job)
Example #16
    def handle_successful_result(self, task_id):
        """save the successful status result

        Args:
            task_id (str): the result under this task_id
        """
        from job_manager import JobManager
        logger = Logger().get()
        logger.debug(
            f"save_result: task_id: {task_id} status: {Status.Successful}")
        try:
            output_path = self.id_to_task_container[task_id][0].output
            container = self.id_to_task_container[task_id][1]
        except Exception as e:
            logger.info(
                f"exception while finding the container corresponding to {task_id}, status: {Status.Successful}; maybe the container was forcibly killed earlier, {e}"
            )
            ResourceManager().save_result(task_id, [])
            ResourceManager().update_task_status(task_id, Status.Killed)
        else:
            self.id_to_task_container.pop(task_id, None)
            try:
                # get result from container
                container.exec_run("tar -cvf {}.docker {}".format(
                    task_id, " ".join(output_path)))
                bits, stat = container.get_archive(f"{task_id}.docker")
                path = f"/tmp/Felucca/result/{task_id}"
                if not os.path.exists(path):
                    os.makedirs(path)
                with open(f"{path}/{task_id}.tar", "wb+") as tar_file:
                    for chunk in bits:
                        tar_file.write(chunk)
                # extract result tar file
                result_tar = tarfile.open(f"{path}/{task_id}.tar", "r")
                result_tar.extractall(path)
                result_tar.close()
                result_tar = tarfile.open(f"{path}/{task_id}.docker", "r")
                result_tar.extractall(path)
                result_tar.close()
                # delete temp tar files after extraction
                os.remove(f"{path}/{task_id}.tar")
                os.remove(f"{path}/{task_id}.docker")
                logger.debug(f"for task:{task_id} execute and exit normally")
            except Exception as e:
                logger.error(
                    f"exception while copying the result out of the container and extracting the files for {task_id}, {e}"
                )
                ResourceManager().save_result(task_id, [])
                ResourceManager().update_task_status(task_id, Status.Failed)
                container.stop()
                container.remove()
            else:
                for index in range(len(output_path)):
                    output_path[index] = os.path.join(path, output_path[index])
                    print(output_path[index])
                ResourceManager().save_result(task_id, output_path)
                ResourceManager().mark_task_as_finished(task_id)
                JobManager().finish_task(task_id)
                container.stop()
                container.remove()
Example #17
import logging
import os

import tornado.web

from datetime import datetime
from job_manager import JobManager

# Configure logging
log_format = "%(asctime)s  %(name)8s  %(levelname)5s  %(message)s"
logging.basicConfig(
    level=logging.INFO,
    handlers=[logging.FileHandler("test.log"),
              logging.StreamHandler()],
    format=log_format,
)
logger = logging.getLogger("main")

# Instantiate JobManager instance
jm = JobManager()


class BaseHandler(tornado.web.RequestHandler):
    def set_default_headers(self):
        self.set_header("Access-Control-Allow-Origin", "*")
        self.set_header(
            "Access-Control-Allow-Headers",
            "Origin, X-Requested-With, Content-Type, Accept, Authorization")
        self.set_header("Access-Control-Allow-Methods",
                        " POST, PUT, DELETE, OPTIONS, GET")

    def options(self):
        self.set_status(204)
        self.finish()
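A minimal sketch of wiring a BaseHandler subclass into a Tornado application; PingHandler, the route, and the port are illustrative assumptions, not part of the source:
import tornado.ioloop

# Hypothetical handler that inherits the CORS defaults from BaseHandler above.
class PingHandler(BaseHandler):
    def get(self):
        self.write({"status": "ok"})

app = tornado.web.Application([(r"/ping", PingHandler)])
app.listen(8888)
tornado.ioloop.IOLoop.current().start()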
Example #18
def kill_job(job_id):
    JobManager().kill_job(job_id)
    return {"Status": "ok"}
Example #19
    def setUp(self):
        # Use "test" database for unit tests instead of "felucca"
        self.resource_manager = ResourceManager("test")
        self.job_manager = JobManager()
        self.job_manager.db_name = "test"
Example #20
def get_result():
    status = request.form['status']
    ExecutionManager().save_result(request.form['task_id'], status)
    JobManager().finish_task(request.form['task_id'])
    return {'is_received': True}
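A hypothetical client-side report matching the handler above, assuming it is a Flask view registered at POST /result on localhost:5000 (route, host, and field values are assumptions):
import requests

# The form fields mirror the request.form keys read by get_result above.
resp = requests.post(
    "http://localhost:5000/result",
    data={"task_id": "task-123", "status": "Successful"},
)
assert resp.json()["is_received"] is True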
Example #21
TITLE_KEYWORDS = [
    'engineer', 'software-engineer', 'dataengineer', 'data-engineer', 'data'
]

# a job's description must contain at least one keyword from every group
MUST_HAVE_KEYWORD_GROUPS = [['python', 'python3']]

# the job will record all nice-to-have keywords found
NICE_TO_HAVE_KEYWORDS = [
    'pandas', 'webscraping', 'dash', 'scrapy', 'etl', 'pipeline'
]

# Starting URLs for the Indeed site; job titles, experience level, etc. can be added
INDEED_STARTING_URLS = [
    "https://www.indeed.com/jobs?q=data+engineer&jt=fulltime&explvl=entry_level",
    "https://www.indeed.com/jobs?q=software+engineer&jt=fulltime&explvl=entry_level",
]

# number of job tabs to open at a time
JOB_TAB_AMOUNT = 5

# resume version you are sending out
RESUME = 'V1.00'

if __name__ == '__main__':
    user = JobManager(TITLE_KEYWORDS, MUST_HAVE_KEYWORD_GROUPS,
                      NICE_TO_HAVE_KEYWORDS, INDEED_STARTING_URLS,
                      JOB_TAB_AMOUNT, RESUME)
    print(user)
    user.start()
Example #22
def thread_update_kernel(BASE_IMAGE="seipharos/pharos:latest"):
    JobManager().kill_all_jobs()
    ExecutionManager().update_kernel(BASE_IMAGE)