Example #1
    def submit_combine(self):
        job_ids = self.job_list
        if job_ids is not None and len(job_ids) > 20:
            print("WARNING: boto3 cannot support waiting for more than 20 jobs.")
            print("Please wait for the reading to finish, then run again with the")
            print("`combine` option.")
            return

        # Get environment variables
        environment_vars = get_environment()

        job_name = '%s_combine_reading_results' % self.basename
        command_list = get_batch_command(
            ['python', '-m', 'indra.tools.reading.assemble_reading_stmts_aws',
             self.basename, '-r'] + self.readers,
            purpose='pmid_reading',
            project=self.project_name
            )
        logger.info('Command list: %s' % str(command_list))
        kwargs = {'jobName': job_name, 'jobQueue': self._job_queue,
                  'jobDefinition': self._job_def,
                  'containerOverrides': {'environment': environment_vars,
                                         'command': command_list,
                                         'memory': 60000, 'vcpus': 1}}
        if job_ids:
            kwargs['dependsOn'] = job_ids
        batch_client = boto3.client('batch')
        batch_client.submit_job(**kwargs)
        logger.info("submitted...")
        return
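
The 20-job cutoff in Example #1 mirrors the AWS Batch limit of 20 entries in a job's dependsOn list, which is what actually makes the combine job wait for the reading jobs. A minimal, self-contained sketch of that dependency mechanism (the queue name, job definition, and command below are placeholders, not values from the code above):

import boto3

def submit_dependent_job(job_ids, job_name='combine_example'):
    # Hypothetical queue, job definition, and command, for illustration only.
    kwargs = {'jobName': job_name,
              'jobQueue': 'example_queue',
              'jobDefinition': 'example_jobdef',
              'containerOverrides': {'command': ['echo', 'combining']}}
    if job_ids:
        # AWS Batch allows at most 20 entries in dependsOn, hence the guard
        # at the top of submit_combine above.
        kwargs['dependsOn'] = job_ids  # e.g. [{'jobId': '...'}, ...]
    return boto3.client('batch').submit_job(**kwargs)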
Example #2
def submit_combine(basename, readers, job_ids=None, project_name=None):
    if job_ids is not None and len(job_ids) > 20:
        print("WARNING: boto3 cannot support waiting for more than 20 jobs.")
        print("Please wait for the reading to finish, then run again with the")
        print("`combine` option.")
        return

    # Get environment variables
    environment_vars = get_environment()

    job_name = '%s_combine_reading_results' % basename
    command_list = get_batch_command(
        ['python', '-m', 'indra.tools.reading.assemble_reading_stmts_aws',
         basename, '-r'] + readers,
        purpose='pmid_reading',
        project=project_name)
    logger.info('Command list: %s' % str(command_list))
    kwargs = {
        'jobName': job_name,
        'jobQueue': 'run_reach_queue',
        'jobDefinition': 'run_reach_jobdef',
        'containerOverrides': {
            'environment': environment_vars,
            'command': command_list,
            'memory': 60000,
            'vcpus': 1
        }
    }
    if job_ids:
        kwargs['dependsOn'] = job_ids
    batch_client = boto3.client('batch')
    batch_client.submit_job(**kwargs)
    logger.info("submitted...")
Example #3
def submit_reading(basename,
                   pmid_list_filename,
                   readers,
                   start_ix=None,
                   end_ix=None,
                   pmids_per_job=3000,
                   num_tries=2,
                   force_read=False,
                   force_fulltext=False,
                   project_name=None):
    # Upload the pmid_list to Amazon S3
    pmid_list_key = 'reading_results/%s/pmids' % basename
    s3_client = boto3.client('s3')
    s3_client.upload_file(pmid_list_filename, bucket_name, pmid_list_key)

    # If no end index is specified, read all the PMIDs
    if end_ix is None:
        with open(pmid_list_filename, 'rt') as f:
            lines = f.readlines()
            end_ix = len(lines)

    if start_ix is None:
        start_ix = 0

    # Get environment variables
    environment_vars = get_environment()

    # Iterate over the list of PMIDs and submit the job in chunks
    batch_client = boto3.client('batch')
    job_list = []
    for job_start_ix in range(start_ix, end_ix, pmids_per_job):
        job_end_ix = job_start_ix + pmids_per_job
        if job_end_ix > end_ix:
            job_end_ix = end_ix
        job_name = '%s_%d_%d' % (basename, job_start_ix, job_end_ix)
        command_list = get_batch_command(
            ['python', '-m',
             'indra.tools.reading.pmid_reading.read_pmids_aws',
             basename, '/tmp', '16', str(job_start_ix), str(job_end_ix),
             '-r'] + readers,
            purpose='pmid_reading',
            project=project_name)
        if force_read:
            command_list.append('--force_read')
        if force_fulltext:
            command_list.append('--force_fulltext')
        logger.info('Commands list: %s' % str(command_list))
        job_info = batch_client.submit_job(
            jobName=job_name,
            jobQueue='run_reach_queue',
            jobDefinition='run_reach_jobdef',
            containerOverrides={
                'environment': environment_vars,
                'command': command_list
            },
            retryStrategy={'attempts': num_tries})
        logger.info("submitted...")
        job_list.append({'jobId': job_info['jobId']})
    return job_list
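
Because submit_reading returns its job ids in exactly the {'jobId': ...} form that submit_combine hands to dependsOn, the two module-level functions above chain naturally. A sketch with placeholder file, basename, reader, and project values:

readers = ['reach', 'sparser']
job_ids = submit_reading('run_2019_01_01', 'pmid_list.txt', readers,
                         pmids_per_job=2000, project_name='example_project')
submit_combine('run_2019_01_01', readers, job_ids=job_ids,
               project_name='example_project')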
Example #4
    def submit_reading(self,
                       input_fname,
                       start_ix,
                       end_ix,
                       ids_per_job,
                       num_tries=2):
        # stash this for later.
        self.ids_per_job = ids_per_job

        # Upload the pmid_list to Amazon S3
        id_list_key = 'reading_results/%s/%s' % (self.basename,
                                                 self._s3_input_name)
        s3_client = boto3.client('s3')
        s3_client.upload_file(input_fname, bucket_name, id_list_key)

        # If no end index is specified, read all the PMIDs
        if end_ix is None:
            with open(input_fname, 'rt') as f:
                lines = f.readlines()
                end_ix = len(lines)

        if start_ix is None:
            start_ix = 0

        # Get environment variables
        environment_vars = get_environment()

        # Iterate over the list of PMIDs and submit the job in chunks
        batch_client = boto3.client('batch', region_name='us-east-1')
        job_list = []
        for job_start_ix in range(start_ix, end_ix, ids_per_job):
            job_end_ix = job_start_ix + ids_per_job
            if job_end_ix > end_ix:
                job_end_ix = end_ix
            job_name, cmd = self._make_command(job_start_ix, job_end_ix)
            command_list = get_batch_command(cmd,
                                             purpose=self._purpose,
                                             project=self.project_name)
            logger.info('Command list: %s' % str(command_list))
            job_info = batch_client.submit_job(
                jobName=job_name,
                jobQueue=self._job_queue,
                jobDefinition=self._job_def,
                containerOverrides={
                    'environment': environment_vars,
                    'command': command_list
                },
                retryStrategy={'attempts': num_tries})
            logger.info("submitted...")
            job_list.append({'jobId': job_info['jobId']})
        self.job_list = job_list
        return job_list
Example #5
def submit_db_reading(basename,
                      id_list_filename,
                      readers,
                      start_ix=None,
                      end_ix=None,
                      pmids_per_job=3000,
                      num_tries=2,
                      force_read=False,
                      force_fulltext=False,
                      read_all_fulltext=False,
                      project_name=None,
                      max_reach_input_len=None,
                      max_reach_space_ratio=None):
    # Upload the pmid_list to Amazon S3
    pmid_list_key = 'reading_inputs/%s/id_list' % basename
    s3_client = boto3.client('s3')
    s3_client.upload_file(id_list_filename, bucket_name, pmid_list_key)

    # If no end index is specified, read all the PMIDs
    if end_ix is None:
        with open(id_list_filename, 'rt') as f:
            lines = f.readlines()
            end_ix = len(lines)

    if start_ix is None:
        start_ix = 0

    # Get environment variables
    environment_vars = get_environment()

    # Fix reader options
    if 'all' in readers:
        readers = ['reach', 'sparser']

    if force_read:
        mode = 'all'
    else:
        mode = 'unread-all'

    # Iterate over the list of PMIDs and submit the job in chunks
    batch_client = boto3.client('batch', region_name='us-east-1')
    job_list = []
    for job_start_ix in range(start_ix, end_ix, pmids_per_job):
        job_end_ix = job_start_ix + pmids_per_job
        if job_end_ix > end_ix:
            job_end_ix = end_ix
        job_name = '%s_%d_%d' % (basename, job_start_ix, job_end_ix)
        command_list = get_batch_command(
            ['python', '-m', 'indra.tools.reading.db_reading.read_db_aws',
             basename, '/tmp', mode, '32', str(job_start_ix),
             str(job_end_ix), '-r'] + readers,
            purpose='db_reading',
            project=project_name)
        if force_fulltext:
            command_list.append('--force_fulltext')
        if read_all_fulltext:
            command_list.append('--read_all_fulltext')
        # The batch command must be a list of strings; cast numeric options.
        if max_reach_input_len is not None:
            command_list += ['--max_reach_input_len',
                             str(max_reach_input_len)]
        if max_reach_space_ratio is not None:
            command_list += ['--max_reach_space_ratio',
                             str(max_reach_space_ratio)]
        logger.info('Command list: %s' % str(command_list))
        job_info = batch_client.submit_job(
            jobName=job_name,
            jobQueue='run_db_reading_queue',
            jobDefinition='run_db_reading_jobdef',
            containerOverrides={
                'environment': environment_vars,
                'command': command_list
            },
            retryStrategy={'attempts': num_tries})
        logger.info("submitted...")
        job_list.append({'jobId': job_info['jobId']})
    return job_list
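
Every submitter above slices the id list the same way: range(start_ix, end_ix, per_job), with the last chunk clipped to end_ix. A small worked example of the resulting chunk boundaries (the helper name is made up for illustration):

def chunk_bounds(start_ix, end_ix, per_job):
    # Mirror the chunking loop used by the submit functions above.
    for job_start in range(start_ix, end_ix, per_job):
        yield job_start, min(job_start + per_job, end_ix)

# 7500 ids at 3000 per job -> three jobs.
assert list(chunk_bounds(0, 7500, 3000)) == [(0, 3000), (3000, 6000),
                                             (6000, 7500)]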
Example #6
    def submit_jobs(self, *args, num_tries=1, stagger=0):
        """Submit all the jobs to batch.

        Parameters
        ----------
        num_tries : int
            The number of times a job may be attempted.
        stagger : float
            The number of seconds to wait between job submissions.

        Returns
        -------
        job_lists : dict{queue_name: list[str]}
            A dict of lists of job id strings, keyed by the name of each queue
            used.
        """
        # Get environment variables
        environment_vars = get_environment()

        # Iterate over the list of PMIDs and submit the job in chunks
        batch_client = boto3.client('batch', region_name='us-east-1')

        # Check to see if we've already been given a signal to quit.
        if self.running is None:
            self.running = True
        elif not self.running:
            return None

        self.set_monitors_submitting(True)
        try:
            for job_args in self._iter_job_args(*args):
                # Check for a stop signal
                if not self.running:
                    logger.info("Running was switched off, discontinuing...")
                    break

                cmd_iter = self._iter_job_queue_def_commands(*job_args)
                for job_name, cmd, job_def, job_queue in cmd_iter:
                    command_list = get_batch_command(cmd,
                                                     purpose=self._purpose,
                                                     project=self.project_name)
                    logger.info('Command list: %s' % str(command_list))

                    # Submit the job.
                    kwargs = {}
                    if self.job_timeout_override is not None:
                        kwargs['timeout'] = \
                            {'attemptDurationSeconds': self.job_timeout_override}
                    job_info = batch_client.submit_job(
                        jobName=job_name,
                        jobQueue=job_queue,
                        jobDefinition=job_def,
                        containerOverrides={
                            'environment': environment_vars,
                            'command': command_list
                        },
                        retryStrategy={'attempts': num_tries},
                        **kwargs)

                    # Record the job id.
                    logger.info("submitted...")
                    self.job_lists[job_queue].append(
                        {k: job_info[k]
                         for k in ['jobId', 'jobName']})
                    logger.info("Sleeping for %d seconds..." % stagger)
                    sleep(stagger)
        finally:
            self.set_monitors_submitting(False)
        return self.job_lists
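
The dict returned by submit_jobs maps each job queue to the submitted job records, which can be fed straight into boto3's describe_jobs for polling. A sketch, assuming a hypothetical submitter object exposing this method (the positional arguments it needs depend on the subclass):

import boto3

batch = boto3.client('batch', region_name='us-east-1')
job_lists = submitter.submit_jobs(num_tries=2, stagger=5)  # hypothetical submitter
for queue, jobs in job_lists.items():
    job_ids = [j['jobId'] for j in jobs]
    if job_ids:
        # describe_jobs accepts up to 100 job ids per call.
        for job in batch.describe_jobs(jobs=job_ids[:100])['jobs']:
            print(queue, job['jobName'], job['status'])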
Example #7
    def submit_reading(self,
                       input_fname,
                       start_ix,
                       end_ix,
                       ids_per_job,
                       num_tries=1,
                       stagger=0):
        """Submit a batch of reading jobs

        Parameters
        ----------
        input_fname : str
            The name of the file containing the ids to be read.
        start_ix : int
            The line index of the first item in the list to read.
        end_ix : int
            The line index of the last item in the list to be read.
        ids_per_job : int
            The number of ids to be given to each job.
        num_tries : int
            The number of times a job may be attempted.
        stagger : float
            The number of seconds to wait between job submissions.

        Returns
        -------
        job_list : list[str]
            A list of job id strings.
        """
        self.job_list = []

        # stash this for later.
        self.ids_per_job = ids_per_job

        # Upload the pmid_list to Amazon S3
        id_list_key = 'reading_results/%s/%s' % (self.basename,
                                                 self._s3_input_name)
        s3_client = boto3.client('s3')
        s3_client.upload_file(input_fname, bucket_name, id_list_key)

        # If no end index is specified, read all the PMIDs
        if end_ix is None:
            with open(input_fname, 'rt') as f:
                lines = f.readlines()
                end_ix = len(lines)

        if start_ix is None:
            start_ix = 0

        # Get environment variables
        environment_vars = get_environment()

        # Iterate over the list of PMIDs and submit the job in chunks
        batch_client = boto3.client('batch', region_name='us-east-1')

        # Check to see if we've already been given a signal to quit.
        if self.running is None:
            self.running = True
        elif not self.running:
            return None

        for job_start_ix in range(start_ix, end_ix, ids_per_job):

            # Check for a stop signal
            if not self.running:
                logger.info("Running was switched off, discontinuing...")
                break

            # Generate the command for this batch.
            job_end_ix = job_start_ix + ids_per_job
            if job_end_ix > end_ix:
                job_end_ix = end_ix
            job_name, cmd = self._make_command(job_start_ix, job_end_ix)
            command_list = get_batch_command(cmd,
                                             purpose=self._purpose,
                                             project=self.project_name)
            logger.info('Command list: %s' % str(command_list))

            # Submit the job.
            job_info = batch_client.submit_job(
                jobName=job_name,
                jobQueue=self._job_queue,
                jobDefinition=self._job_def,
                containerOverrides={
                    'environment': environment_vars,
                    'command': command_list
                },
                retryStrategy={'attempts': num_tries})

            # Record the job id.
            logger.info("submitted...")
            self.job_list.append({'jobId': job_info['jobId']})
            logger.info("Sleeping for %d seconds..." % stagger)
            sleep(stagger)

        return self.job_list
Example #8
    def submit_reading(self, input_fname, start_ix, end_ix, ids_per_job,
                       num_tries=1, stagger=0):
        """Submit a batch of reading jobs

        Parameters
        ----------
        input_fname : str
            The name of the file containing the ids to be read.
        start_ix : int
            The line index of the first item in the list to read.
        end_ix : int
            The line index of the last item in the list to be read.
        ids_per_job : int
            The number of ids to be given to each job.
        num_tries : int
            The number of times a job may be attempted.
        stagger : float
            The number of seconds to wait between job submissions.

        Returns
        -------
        job_list : list[str]
            A list of job id strings.
        """
        self.job_list = []

        # stash this for later.
        self.ids_per_job = ids_per_job

        # Upload the pmid_list to Amazon S3
        id_list_key = 'reading_results/%s/%s' % (self.basename,
                                                 self._s3_input_name)
        s3_client = boto3.client('s3')
        s3_client.upload_file(input_fname, bucket_name, id_list_key)

        # If no end index is specified, read all the PMIDs
        if end_ix is None:
            with open(input_fname, 'rt') as f:
                lines = f.readlines()
                end_ix = len(lines)

        if start_ix is None:
            start_ix = 0

        # Get environment variables
        environment_vars = get_environment()

        # Iterate over the list of PMIDs and submit the job in chunks
        batch_client = boto3.client('batch', region_name='us-east-1')

        # Check to see if we've already been given a signal to quit.
        if self.running is None:
            self.running = True
        elif not self.running:
            return None

        for job_start_ix in range(start_ix, end_ix, ids_per_job):

            # Check for a stop signal
            if not self.running:
                logger.info("Running was switched off, discontinuing...")
                break

            # Generate the command for this batch.
            job_end_ix = job_start_ix + ids_per_job
            if job_end_ix > end_ix:
                job_end_ix = end_ix
            job_name, cmd = self._make_command(job_start_ix, job_end_ix)
            command_list = get_batch_command(cmd, purpose=self._purpose,
                                             project=self.project_name)
            logger.info('Command list: %s' % str(command_list))

            # Submit the job.
            job_info = batch_client.submit_job(
                jobName=job_name,
                jobQueue=self._job_queue,
                jobDefinition=self._job_def,
                containerOverrides={
                    'environment': environment_vars,
                    'command': command_list},
                retryStrategy={'attempts': num_tries}
            )

            # Record the job id.
            logger.info("submitted...")
            self.job_list.append({'jobId': job_info['jobId']})
            logger.info("Sleeping for %d seconds..." % stagger)
            sleep(stagger)

        return self.job_list
Example #9
    def submit_jobs(self, *args, num_tries=1, stagger=0):
        """Submit all the jobs to batch.

        Parameters
        ----------
        num_tries : int
            The number of times a job may be attempted.
        stagger : float
            The number of seconds to wait between job submissions.

        Returns
        -------
        job_lists : dict{queue_name: list[str]}
            A dict of lists of job id strings, keyed by the name of each queue
            used.
        """
        # Get environment variables
        environment_vars = get_environment()

        # Iterate over the list of PMIDs and submit the job in chunks
        batch_client = boto3.client('batch', region_name='us-east-1')

        # Check to see if we've already been given a signal to quit.
        if self.running is None:
            self.running = True
        elif not self.running:
            return None

        self.set_monitors_submitting(True)
        try:
            for ix1, job_args in enumerate(self._iter_job_args(*args)):
                # Check for a stop signal
                if not self.running:
                    logger.info("Running was switched off, discontinuing...")
                    break

                cmd_iter = self._iter_job_queue_def_commands(*job_args)
                for ix2, (job_name, cmd, job_def, job_queue) in enumerate(cmd_iter):
                    # If a job cap is set, wait until the number of
                    # outstanding jobs drops below it before submitting
                    # more. The wait time increases exponentially.
                    if self.max_jobs is not None:
                        counts = self.get_job_counts_by_status()
                        not_done = counts['pre'] + counts['running']
                        wait = 10
                        n_sleeps = 0
                        while not_done > self.max_jobs:
                            sleep_time = wait * (2**n_sleeps)
                            logger.info(f"Waiting {sleep_time} seconds: max "
                                        f"jobs is {self.max_jobs} and "
                                        f"{not_done} are not yet done.")
                            sleep(sleep_time)
                            n_sleeps += 1
                            counts = self.get_job_counts_by_status()
                            not_done = counts['pre'] + counts['running']

                    # Build the command.
                    command_list = get_batch_command(cmd, purpose=self._purpose,
                                                     project=self.project_name)
                    logger.info('Command list: %s' % str(command_list))

                    # Add job-identifying environment variables. Build a new
                    # list rather than appending to environment_vars, which
                    # would otherwise accumulate duplicate entries across
                    # submissions.
                    updated_environment_vars = environment_vars + [
                        {'name': 'JOB_GROUP_ID', 'value': str(ix1)},
                        {'name': 'JOB_ID', 'value': str(ix2)},
                    ]
                    # Submit the job.
                    kwargs = {}
                    if self.job_timeout_override is not None:
                        kwargs['timeout'] = \
                            {'attemptDurationSeconds': self.job_timeout_override}
                    job_info = batch_client.submit_job(
                        jobName=job_name,
                        jobQueue=job_queue,
                        jobDefinition=job_def,
                        containerOverrides={
                            'environment': updated_environment_vars,
                            'command': command_list},
                        retryStrategy={'attempts': num_tries},
                        **kwargs
                    )

                    # Record the job id.
                    logger.info("submitted...")
                    self.job_lists[job_queue].append(
                        {k: job_info[k] for k in ['jobId', 'jobName']}
                    )
                    logger.info("Sleeping for %d seconds..." % stagger)
                    sleep(stagger)
        finally:
            self.set_monitors_submitting(False)
        return self.job_lists
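
Example #9 throttles on get_job_counts_by_status, which is not shown above. A rough sketch of what such a count could look like using boto3's list_jobs (the queue name is a placeholder, and only the first page of results is counted here):

import boto3

def count_jobs_by_status(job_queue='example_queue'):
    batch = boto3.client('batch', region_name='us-east-1')
    counts = {'pre': 0, 'running': 0}
    # States that precede RUNNING count as 'pre'.
    for state in ('SUBMITTED', 'PENDING', 'RUNNABLE', 'STARTING'):
        resp = batch.list_jobs(jobQueue=job_queue, jobStatus=state)
        counts['pre'] += len(resp['jobSummaryList'])
    resp = batch.list_jobs(jobQueue=job_queue, jobStatus='RUNNING')
    counts['running'] = len(resp['jobSummaryList'])
    return counts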