Example #1
def setup_position_scan_files(job, pos_scan_str=None):
  """
  Takes a model.Job entity and creates the appropriate files on each
  server in CRUNCHING_HOSTS, ready to run a FoldX PositionScan job.

  Optionally, pos_scan_str can be provided (e.g. a <PositionScan>
  string generated by chain2pos_scan_str).
  """
  
  if not pos_scan_str:
    pos_scan_str = chain2pos_scan_str(job.params.chain_pssm, \
                                      job.params.pdbfile, \
                                      mutation_set=job.params.mutation_set)
  
  job_sub_path = pjoin(job.path, pos_scan_str.replace(',', '-'))
  
  if job.params.predict_water:
    water = "-PREDICT"
  else:
    water = "-CRYSTAL" 
  run_txt = POSITION_SCAN_TEMPLATE % {'position_scan':pos_scan_str, 'water':water}

  # fill out the qsub script with variables
  callback_url = get_callback_url(job, part=pos_scan_str)
  qsub_script = FOLDX_SCRIPT_TEMPLATE % {'foldx':FOLDX_BIN,
                                         'callback_url':callback_url,
                                         'job_uuid':job.uuid,
                                         'part':pos_scan_str}

  for host in CRUNCHING_HOSTS:
    with fabsettings(hide('stderr', 'stdout'),
                     host_string=host,
                     key_filename=SSH_KEYS):
      
      with fabsettings(warn_only=True):
        if not exists(job.path):
          run('mkdir %s' % (job.path))
        run('mkdir %s' % (job_sub_path))
      
      remote_pdb_path = pjoin(job.path, "TEMPLATE.pdb")
      if not exists(remote_pdb_path):
        put(job.params.pdbfile.fullpath(), \
            remote_pdb_path)
      
      run("ln -s %s %s" % (remote_pdb_path, \
                           pjoin(job_sub_path, "TEMPLATE.pdb")))
      run("ln -s %s %s" % (ROTABASE_PATH, \
                           pjoin(job_sub_path, "rotabase.txt")))
      put(StringIO(run_txt), pjoin(job_sub_path,'run.txt'))
      put(StringIO(LIST_TXT), pjoin(job_sub_path,'list.txt'))
      put(StringIO(qsub_script), pjoin(job_sub_path,'foldx.qsub'))

      fabric.network.disconnect_all()
  
  return job_sub_path
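The FoldX examples (#1, #6, #7 and #11) rely on several module-level names that are never shown. Below is a minimal sketch of the imports and constants they appear to assume; treating fabsettings as an alias for Fabric 1.x's settings is an assumption, all values are placeholders, and the project-specific templates and helpers (POSITION_SCAN_TEMPLATE, FOLDX_SCRIPT_TEMPLATE, LIST_TXT, chain2pos_scan_str, get_callback_url) are not sketched.

# Sketch only: assumed imports/constants for the FoldX examples (Fabric 1.x era).
import os
import datetime
from os.path import join as pjoin          # assumption: pjoin is os.path.join
from StringIO import StringIO              # Python 2-era code; use io.StringIO on Python 3
import fabric.network
from fabric.api import run, put, get, cd, hide, settings as fabsettings
from fabric.contrib.files import exists

# Placeholder values -- the real ones come from the project's configuration.
CRUNCHING_HOSTS = ['compute01.example.org']
SSH_KEYS = ['~/.ssh/id_rsa']
FOLDX_BIN = '/usr/local/bin/foldx'
ROTABASE_PATH = '/usr/local/share/foldx/rotabase.txt'
QSUB_SLEEP = 2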
Example #2
def wfdeploy(branch='master'):
    local('git add -p')
    with fabsettings(warn_only=True):
        result = local('git commit')
    local("git push origin %s" % branch)
    code_dir = '~/webapps/ckwilcox_com'
    with cd(code_dir):
        run("git pull origin %s" % branch)
Example #3
def wfdeploy(branch='master'):
    local('git add -p')
    with fabsettings(warn_only=True):
        result = local('git commit')
    local("git push origin %s" % branch)
    code_dir = '~/webapps/piles_app/piles_io/'
    with cd(code_dir):
        run("git pull origin %s" % branch)
        run("../apache2/bin/restart")
Example #4
def kill_remote_job(self, task_data=None, **kwargs):
    from ..models import Job

    _init_fabric_env()

    environment = task_data.get('environment', {})
    job_id = task_data.get('job_id')
    job = Job.objects.get(id=job_id)
    master_ip = job.compute_resource.host
    gateway = job.compute_resource.gateway_server
    queue_type = job.compute_resource.queue_type
    private_key = job.compute_resource.private_key
    remote_username = job.compute_resource.extra.get('username', None)

    working_dir = job.abs_path_on_compute
    kill_script_path = join(working_dir, 'kill_job.sh')

    message = "No message."
    try:
        with fabsettings(gateway=gateway,
                         host_string=master_ip,
                         user=remote_username,
                         key=private_key):
            with cd(working_dir):
                with shell_env(**environment):
                    # if queue_type == 'slurm':
                    #     result = run(f"scancel {job.remote_id}")
                    # else:
                    #     result = run(f"kill {job.remote_id}")

                    result = run(f"{kill_script_path} kill")
                    job_killed = result.succeeded

    except BaseException as e:
        if hasattr(e, 'message'):
            message = e.message

        self.update_state(state=states.FAILURE, meta=message)
        raise e

    task_data.update(result=result)

    return task_data
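Examples #4, #5, #8, #9 and #10 call self.update_state(state=states.FAILURE, ...), i.e. they are written as bound Celery tasks. A minimal skeleton of how such a task might be declared is sketched below; the use of shared_task and the surrounding module layout are assumptions, not shown in the examples themselves.

# Sketch only: a bound Celery task skeleton matching the call style used above.
from celery import shared_task, states

@shared_task(bind=True)  # bind=True exposes the task instance as `self`
def kill_remote_job(self, task_data=None, **kwargs):
    try:
        ...  # remote work via Fabric, as in Example #4
    except BaseException as e:
        # record the failure on the task before re-raising, as the examples do
        self.update_state(state=states.FAILURE, meta=str(e))
        raise
    return task_data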
Example #5
def poll_job_ps(self, task_data=None, **kwargs):
    from ..models import Job

    job_id = task_data.get('job_id')
    job = Job.objects.get(id=job_id)
    master_ip = job.compute_resource.host
    gateway = job.compute_resource.gateway_server
    _init_fabric_env()
    private_key = job.compute_resource.private_key
    remote_username = job.compute_resource.extra.get('username', None)

    message = "No message."
    try:
        with fabsettings(gateway=gateway,
                         host_string=master_ip,
                         user=remote_username,
                         key=private_key):
            with shell_env():
                result = run(f"ps - u {remote_username} -o pid | "
                             f"tr -d ' ' | "
                             f"grep '^{job.remote_id}$'")
                job_is_not_running = not result.succeeded

    except BaseException as e:
        if hasattr(e, 'message'):
            message = e.message

        self.update_state(state=states.FAILURE, meta=message)
        raise e

    # grab the Job from the database again to minimise the race condition
    # where the status was updated while we were ssh'ing in and running 'ps'
    job = Job.objects.get(id=job_id)
    if not job.done and job_is_not_running:
        job.status = Job.STATUS_FAILED
        job.save()

        index_remote_files.apply_async(args=(dict(job_id=job_id), ))

    task_data.update(result=result)

    return task_data
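For result.succeeded to come back False here (rather than the whole run aborting when grep finds no matching process), Fabric's warn_only behaviour has to be enabled somewhere, presumably inside _init_fabric_env(), which these examples never show. Below is only a guess at what that helper might set, using real Fabric 1.x env flags.

# Sketch only: a guess at _init_fabric_env(); the actual implementation is not shown.
from fabric.api import env

def _init_fabric_env():
    env.warn_only = True         # let failing commands set result.succeeded=False instead of aborting
    env.abort_on_prompts = True  # fail fast rather than hanging on interactive prompts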
Example #6
def get_results(job):
  """
  Pulls raw results files back from the compute node(s).
  """
  for host in CRUNCHING_HOSTS:
    with fabsettings(hide('stderr', 'stdout'), host_string=host):
      result = get(job.path, settings.RESULTS_ROOT)
      """
      with cd(job.path):
        # equivalent to "../"+{uuid}, but safer not to assume
        #reljobdir = os.path.join("..", \
        #          os.path.basename( \
        #           os.path.dirname(job.path)))
        reljobdir = "../"+job.uuid
        tarball = uuid+".tar.bz2"
        result = run("%s %s %s" % (TAR_COMMAND, tarball, reljobdir))
      """
  if result:
    return os.path.join(settings.RESULTS_ROOT, job.uuid)
  else:
    return None
Example #7
def check_job_part_done(job, part, filename="FINISHED"):
  """
  Looks for the existence of the file "FINISHED" (or whatever filename
  is specified) in the job/part directory. Returns True if it exists, else False.
  
  Used as a secondary check in case the job_complete webhook fails to notify 
  the server that the job has finished.
  """
  # TODO: rather than just check for finish, also periodically call
  #       another function that looks at the number of lines in each
  #       interface/<job_uuid>/*/energies_*_TEMPLATE.txt
  #       file and presents a progress bar to the user
  for host in CRUNCHING_HOSTS:
    with fabsettings(hide('stderr', 'stdout'), host_string=host):
      finpath = os.path.join(job.path, part, filename)
      log.debug("Checking for file: " + finpath)
      if exists(finpath):
        log.debug(finpath + " found. Job done !")
        return True
      else:
        log.debug(finpath + " not found. Job isn't finished.")
        return False
Example #8
def start_job(self, task_data=None, **kwargs):
    from ..models import Job

    if task_data is None:
        raise InvalidTaskError("task_data is None")

    job_id = task_data.get('job_id')
    job = Job.objects.get(id=job_id)
    result = task_data.get('result')
    master_ip = job.compute_resource.host
    gateway = job.compute_resource.gateway_server

    webhook_notify_url = ''
    # secret = None

    environment = task_data.get('environment', {})
    job_auth_header = task_data.get('job_auth_header', '')
    # environment.update(JOB_ID=job_id)
    _init_fabric_env()
    private_key = job.compute_resource.private_key
    remote_username = job.compute_resource.extra.get('username', None)

    job_script_template_vars = dict(environment)
    job_script_template_vars['JOB_AUTH_HEADER'] = job_auth_header
    job_script = BytesIO(
        render_to_string('job_scripts/run_job.sh',
                         context=job_script_template_vars).encode('utf-8'))
    kill_script = BytesIO(
        render_to_string('job_scripts/kill_job.sh',
                         context=job_script_template_vars).encode('utf-8'))
    curl_headers = BytesIO(b"%s\n" % job_auth_header.encode('utf-8'))
    config_json = BytesIO(json.dumps(job.params).encode('utf-8'))

    remote_id = None
    message = "Failure, without exception."
    try:
        with fabsettings(
                gateway=gateway,
                host_string=master_ip,
                user=remote_username,
                key=private_key,
                # key_filename=expanduser("~/.ssh/id_rsa"),
        ):
            working_dir = job.abs_path_on_compute
            input_dir = join(working_dir, 'input')
            output_dir = join(working_dir, 'output')
            job_script_path = join(input_dir, 'run_job.sh')
            kill_script_path = join(working_dir, 'kill_job.sh')
            for d in [working_dir, input_dir, output_dir]:
                result = run(f'mkdir -p {d} && chmod 700 {d}')
            result = put(job_script, job_script_path, mode=0o700)
            result = put(kill_script, kill_script_path, mode=0o700)
            result = put(curl_headers,
                         join(working_dir, '.private_request_headers'),
                         mode=0o600)
            result = put(config_json,
                         join(input_dir, 'pipeline_config.json'),
                         mode=0o600)
            with cd(working_dir):
                with shell_env(**environment):
                    # NOTE: We can't sbatch the run_job.sh script due to
                    #       the local aria2c RPC daemon launched by laxydl
                    #       In the future, we may have a DataTransferHost where
                    #       the data staging steps run, then we could launch
                    #       run_job.sh via sbatch.
                    # if job.compute_resource.queue_type == 'slurm':
                    #     result = run(f"sbatch --parsable "
                    #                  f'--job-name="laxy:{job_id}" '
                    #                  f"--output output/run_job.out "
                    #                  f"{job_script_path} "
                    #                  f" >>slurm.jids")
                    #     remote_id = run(str("head -1 slurm.jids"))

                    # The job script is always run locally on the compute
                    # node (not sbatched), but will itself send jobs
                    # to the queue.
                    result = run(f"nohup bash -l -c '"
                                 f"{job_script_path} & "
                                 f"echo $! >>job.pids"
                                 f"' >output/run_job.out")
                    remote_id = run(str("head -1 job.pids"))

        succeeded = result.succeeded
    except BaseException as e:
        succeeded = False
        if hasattr(e, 'message'):
            message = e.message
        if hasattr(e, '__traceback__'):
            tb = e.__traceback__
            message = '%s - Traceback: %s' % (message, ''.join(
                traceback.format_list(traceback.extract_tb(tb))))
        else:
            message = repr(e)

    if not succeeded and job.compute_resource.disposable:
        job.compute_resource.dispose()

    job_status = Job.STATUS_RUNNING if succeeded else Job.STATUS_FAILED
    job = Job.objects.get(id=job_id)
    job.status = job_status
    job.remote_id = remote_id
    job.save()

    # if webhook_notify_url:
    #     job_status = Job.STATUS_STARTING if succeeded else Job.STATUS_FAILED
    #     resp = request_with_retries(
    #         'PATCH', callback_url,
    #         json={'status': job_status},
    #         headers={'Authorization': secret},
    #     )

    if not succeeded:
        self.update_state(state=states.FAILURE, meta=message)
        raise Exception(message)
        # raise Ignore()

    task_data.update(result=result)

    return task_data
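The trickiest line in Example #8 is the detached launch: nohup plus a login shell, with the job script backgrounded and its PID appended to job.pids so it can be read back as remote_id. Pulled out on its own for readability (a sketch only, assuming the Fabric 1.x run/cd helpers and the same paths as the example):

# Sketch only: detach a long-running script over SSH and capture its PID.
with cd(working_dir):
    run("nohup bash -l -c '"
        "input/run_job.sh & "     # background the job script inside the login shell
        "echo $! >>job.pids"      # record the PID of the backgrounded script
        "' >output/run_job.out")  # bash -c exits right after the echo, so run() returns promptly
    remote_id = run("head -1 job.pids")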
Example #9
def estimate_job_tarball_size(self, task_data=None, **kwargs):

    if task_data is None:
        raise InvalidTaskError("task_data is None")

    from ..models import Job

    _init_fabric_env()

    environment = task_data.get('environment', {})
    job_id = task_data.get('job_id')
    job = Job.objects.get(id=job_id)
    master_ip = job.compute_resource.host
    gateway = job.compute_resource.gateway_server
    queue_type = job.compute_resource.queue_type
    private_key = job.compute_resource.private_key
    remote_username = job.compute_resource.extra.get('username', None)

    job_path = job.abs_path_on_compute

    message = "No message."
    task_result = dict()
    try:
        with fabsettings(gateway=gateway,
                         host_string=master_ip,
                         user=remote_username,
                         key=private_key):
            with cd(job_path):
                with shell_env(**environment):
                    # if queue_type == 'slurm':
                    #     result = run(f"scancel {job.remote_id}")
                    # else:
                    #     result = run(f"kill {job.remote_id}")

                    # NOTE: If running tar -czf is too slow / too much extra I/O load,
                    #       we could use the placeholder heuristic of
                    #       f`du -bc --max-depth=0 "{job_path}"` * 0.66 for RNAsik runs,
                    #       stored in job metadata. Or add proper sizes to every File.metadata
                    #       and derive it from a query.

                    result = run(
                        f'tar -czf - --directory "{job_path}" . | wc --bytes')
                    if result.succeeded:
                        tarball_size = int(result.stdout.strip())
                        with transaction.atomic():
                            job = Job.objects.get(id=job_id)
                            job.params['tarball_size'] = tarball_size
                            job.save()

                        task_result['tarball_size'] = tarball_size
                    else:
                        task_result['stdout'] = result.stdout.strip()
                        task_result['stderr'] = result.stderr.strip()

    except BaseException as e:
        if hasattr(e, 'message'):
            message = e.message

        self.update_state(state=states.FAILURE, meta=message)
        raise e

    task_data.update(result=task_result)

    return task_data
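The NOTE in Example #9 also mentions a cheaper du-based heuristic (on-disk size times 0.66) as an alternative to streaming the whole tarball through wc. A sketch of what that could look like, reusing the du invocation named in the comment; the 0.66 factor is the comment's RNAsik-specific guess, not a general rule.

# Sketch only: estimate tarball size from on-disk size instead of compressing everything.
result = run(f'du -bc --max-depth=0 "{job_path}" | tail -1 | cut -f1')
if result.succeeded:
    estimated_tarball_size = int(int(result.stdout.strip()) * 0.66)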
Example #10
def index_remote_files(self, task_data=None, **kwargs):
    if task_data is None:
        raise InvalidTaskError("task_data is None")

    job_id = task_data.get('job_id')
    job = Job.objects.get(id=job_id)
    clobber = task_data.get('clobber', False)

    compute_resource = job.compute_resource
    if compute_resource is not None:
        master_ip = compute_resource.host
        gateway = compute_resource.gateway_server
    else:
        logger.info(f"Not indexing files for {job_id}, no compute_resource.")
        return task_data

    job.log_event('JOB_INFO', 'Indexing all files (backend task)')

    environment = task_data.get('environment', {})
    # environment.update(JOB_ID=job_id)
    _init_fabric_env()
    private_key = job.compute_resource.private_key
    remote_username = job.compute_resource.extra.get('username', None)

    compute_id = job.compute_resource.id
    message = "No message."

    def create_update_file_objects(remote_path,
                                   fileset=None,
                                   prefix_path='',
                                   location_base=''):
        """
        Returns a list of (unsaved) File objects from a recursive 'find'
        of a remote directory. If a file of the same path exists in the FileSet,
        update the file object location (if unset) rather than create a new one.

        :param fileset:
        :type fileset:
        :param prefix_path:
        :type prefix_path:
        :param remote_path: Path on the remote server.
        :type remote_path: str
        :param location_base: Prefix of the location URL (e.g. sftp://127.0.0.1/XxX/)
        :type location_base: str
        :return: A list of File objects
        :rtype: List[File]
        """

        with cd(remote_path):
            filepaths = remote_list_files('.')
            urls = [(f'{location_base}/{fpath}', fpath) for fpath in filepaths]

            file_objs = []
            for location, filepath in urls:
                fname = Path(filepath).name
                fpath = Path(prefix_path) / Path(filepath).parent

                f = None  # avoid a NameError when no fileset is given
                if fileset:
                    f = fileset.get_file_by_path(Path(fpath) / Path(fname))

                if not f:
                    f = File(location=location,
                             owner=job.owner,
                             name=fname,
                             path=fpath)
                elif not f.location:
                    f.location = location
                    f.owner = job.owner

                file_objs.append(f)

        return file_objs

    try:
        with fabsettings(
                gateway=gateway,
                host_string=master_ip,
                user=remote_username,
                key=private_key,
                # key_filename=expanduser("~/.ssh/id_rsa"),
        ):
            working_dir = job.abs_path_on_compute
            input_dir = os.path.join(working_dir, 'input')
            output_dir = os.path.join(working_dir, 'output')

            output_files = create_update_file_objects(
                output_dir,
                fileset=job.output_files,
                prefix_path='output',
                location_base=laxy_sftp_url(job, 'output'),
            )
            job.output_files.path = 'output'
            job.output_files.owner = job.owner

            if clobber:
                job.output_files.remove(job.output_files, delete=True)

            job.output_files.add(output_files)

            # TODO: This should really be done at job start, or once input data
            #       has been staged on the compute node.
            input_files = create_update_file_objects(
                input_dir,
                fileset=job.input_files,
                prefix_path='input',
                location_base=laxy_sftp_url(job, 'input'))
            job.input_files.path = 'input'
            job.input_files.owner = job.owner

            if clobber:
                job.input_files.remove(job.input_files, delete=True)

            job.input_files.add(input_files)

        succeeded = True
    except BaseException as e:
        succeeded = False
        if hasattr(e, 'message'):
            message = e.message

        self.update_state(state=states.FAILURE, meta=message)
        raise e

    # job_status = Job.STATUS_RUNNING if succeeded else Job.STATUS_FAILED
    # job = Job.objects.get(id=job_id)
    # job.status = job_status
    # job.save()

    # if not succeeded:
    #     self.update_state(state=states.FAILURE, meta=message)
    #     raise Exception(message)
    #     # raise Ignore()

    return task_data
Example #11
def run_position_scan_parallel(job, threads=None):
  pos_scan_str = chain2pos_scan_str(job.params.chain_pssm, \
                                     job.params.pdbfile, \
                                     mutation_set=job.params.mutation_set)
   
  positions = pos_scan_str.split(',')
  
  # this re-globs positions to match the number of 'threads'
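  # (a simpler chunking sketch is shown after this example)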
  globbed_ps = []
  if threads and threads <= len(positions):
    c = 0
    ps = ""
    globsize = len(positions) // threads  # chunk size (integer division)
    while 1:
      if positions:
        ps += positions.pop()+","
      else:
        break
      if (c % globsize == 0):
        ps=ps[:-1]
        globbed_ps.append(ps)
        ps = ""
      c +=1
    if ps: globbed_ps.append(ps)
    positions = globbed_ps
  ####

  for pos in positions:
    # setup remote files on cluster
    job_sub_path = setup_position_scan_files(job, pos_scan_str=pos)
    # update the job record so it knows about this 'sub-job'
    job.running_parts.append(pos)
    
    for host in CRUNCHING_HOSTS:
      with fabsettings(hide('stderr', 'stdout'), host_string=host):

        with cd(job_sub_path):
          # stdin, stdout and stderr must all be redirected somewhere
          # (/dev/null or a file) so that the shell can be properly
          # backgrounded and detached; otherwise run() becomes blocking
          """
          cmd = 'nohup %(foldx)s -runfile run.txt >foldx.out 2>foldx.err </dev/null; \
                 curl "%(url)s" &>callback.out </dev/null &' % \
                 {'foldx':FOLDX_BIN, 
                  'url':callback_url}
          """
          # submit a job to the remote queueing system
          # note: the sleep seems to be required otherwise
          #       some jobs fail to queue - I guess fabric is killing the
          #       connection too quickly
          cmd = "qsub foldx.qsub & sleep %i" % (QSUB_SLEEP)

          result = run(cmd)
          log.debug(result)
          
          # test: this detaches and is non-blocking 
          #result = run("nohup yes >& /dev/null < /dev/null &")
          
  fabric.network.disconnect_all()
  job.start_date = datetime.datetime.now()
  job.save()
  
  return job.uuid
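The re-globbing loop in Example #11 is simply splitting the comma-separated positions into roughly 'threads' chunks so that several FoldX runs can be queued in parallel. The same intent is easier to see as a small helper; this is a sketch, not part of the original code.

# Sketch only: split the position list into at most `threads` comma-joined chunks.
def chunk_positions(positions, threads):
    if not threads or threads > len(positions):
        return positions
    size = -(-len(positions) // threads)  # ceiling division
    return [",".join(positions[i:i + size])
            for i in range(0, len(positions), size)]

# e.g. chunk_positions(['p1', 'p2', 'p3'], 2) -> ['p1,p2', 'p3']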