Example #1
    def create_new_core(self, dataset_id):
        '''
        Executes the Solr create_core command.
        '''
        logger.info('\n\nAttempt to create a Solr core.')
        cmd = 'sudo -u solr /opt/solr/bin/solr create_core -c {d}'.format(
            d=dataset_id)
        run_shell_command(cmd)
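
Every snippet on this page delegates to a run_shell_command helper defined elsewhere in the codebase. A minimal sketch of such a helper, assuming it wraps subprocess, returns the raw (stdout, stderr) bytes, and raises on a non-zero exit code; only the name and this calling convention are guaranteed by the examples, so the body below is an assumption:

import shlex
import subprocess

def run_shell_command(cmd):
    # Assumed implementation: the examples only rely on this helper returning
    # (stdout, stderr) as bytes and raising if the command fails.
    proc = subprocess.Popen(shlex.split(cmd),
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        raise Exception('Command failed: {c}\nSTDERR: {s}'.format(
            c=cmd, s=stderr.decode('utf-8')))
    return stdout, stderr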
Example #2
    def _download_file(self, url, output):
        '''
        Download a file located at `url` into a filepath given by the
        `output` arg.
        '''
        download_cmd_template = 'wget {url} -O {output_file}'
        try:
            run_shell_command(
                download_cmd_template.format(url=url, output_file=output))
        except Exception as ex:
            logger.info('Failed at downloading from {u}'.format(u=url))
            raise
Example #3
def check_if_container_running(container_id):
    '''
    Queries the status of a docker container to see if it is still running.
    Returns True if running, False if exited.
    '''
    field = '.State.Status'
    cmd = DOCKER_INSPECT_CMD.format(container_id=container_id, field=field)
    logger.info('Inspect Docker container with: {cmd}'.format(cmd=cmd))
    try:
        stdout, stderr = run_shell_command(cmd)
    except Exception as ex:
        logger.error('Caught an exception when checking for running container.'
            ' This can be caused by a race condition if the timestamp on the'
            ' ExecutedOperation is not committed to the database before the second'
            ' request is issued. '
        )
        return False
    stdout = stdout.decode('utf-8').strip()
    if stdout == DOCKER_EXITED_FLAG:
        return False
    elif stdout == DOCKER_RUNNING_FLAG:
        return True
    else:
        logger.info('Received a container status of: {status}'.format(
            status=stdout
        ))
        # TODO: inform admins so we can track this case.
        # returning True here (in this potential edge case) makes the container
        # essentially permanent until we resolve its status.
        return True
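
Several of the Docker examples on this page reference module-level constants (DOCKER_INSPECT_CMD, DOCKER_RUNNING_FLAG, DOCKER_EXITED_FLAG) defined elsewhere in the source module. A plausible set of definitions, assuming the standard docker inspect Go-template syntax and Docker's documented container states; note that the comment in Example #7 shows quoted inspect output, so the real template's quoting may differ:

# Assumed definitions; the real module defines these elsewhere.
# After .format(field='.State.Status', container_id='abc'), this yields:
#   docker inspect --format {{.State.Status}} abc
DOCKER_INSPECT_CMD = 'docker inspect --format {{{{{field}}}}} {container_id}'
DOCKER_RUNNING_FLAG = 'running'
DOCKER_EXITED_FLAG = 'exited'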
Example #4
def check_image_exists(img_str):
    '''
    Returns True if the Docker image given by `img_str` can be found
    in the remote registry, False otherwise.
    '''
    logger.info('Check if {img} exists.'.format(img=img_str))
    manifest_cmd = 'docker manifest inspect {img}'.format(img=img_str)
    try:
        stdout, stderr = run_shell_command(manifest_cmd)
        logger.info('Successfully found Docker image')
        return True
    except Exception as ex:
        logger.info('Docker image lookup failed.')
        return False
Example #5
def pull_image(remote_container_url):
    '''
    Provided with a fully qualified Docker image
    url, pull the image to this machine.
    '''
    pull_cmd = 'docker pull {x}'.format(x=remote_container_url)
    try:
        stdout, stderr = run_shell_command(pull_cmd)
        logger.info('Successfully pulled Docker image')
    except Exception as ex:
        logger.error('Docker pull failed.')
        raise
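
A brief usage sketch combining Examples #4 and #5 (the image name below is hypothetical):

img = 'docker.io/my-org/my-tool:v1'  # hypothetical image name
if check_image_exists(img):
    pull_image(img)
else:
    logger.error('Image {img} is not available in the registry.'.format(img=img))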
Example #6
def get_logs(container_id):
    '''
    Queries the logs from a given container
    '''
    log_cmd = 'docker logs {id}'.format(id=container_id)
    logger.info('Query Docker logs with: {cmd}'.format(cmd=log_cmd))
    try:
        stdout, stderr = run_shell_command(log_cmd)
        logger.info('Successfully queried container logs: {id}.'.format(id=container_id))
        return stdout.decode('utf-8')
    except Exception as ex:
        logger.error('Query of container logs did not succeed.')
        return ''
Example #7
def get_timestamp_as_datetime(container_id, field):
    '''
    Queries a timestamp field (e.g. `.State.StartedAt`) of a docker
    container and parses it into a datetime instance.
    Returns None if the timestamp could not be parsed.
    '''
    cmd = DOCKER_INSPECT_CMD.format(container_id=container_id, field=field)
    logger.info('Inspect Docker container with: {cmd}'.format(cmd=cmd))
    stdout, stderr = run_shell_command(cmd)

    # the timestamp given by Docker looks like: b'"2020-09-28T17:51:52.393865325Z"\n'
    # so we need to convert to a string (from bytes), strip the end-line and the
    # surrounding quotes, and drop the fractional seconds, which are too precise
    # for strptime's microsecond field anyway...
    time_str = stdout.decode('utf-8').strip().strip('"').split('.')[0].rstrip('Z')
    try:
        t = datetime.datetime.strptime(time_str, '%Y-%m-%dT%H:%M:%S')
        return t
    except ValueError as ex:
        logger.error('Could not parse a timestamp from the Docker inspect command.'
            ' The timestamp string was: {s}. The raw string was: {r}'.format(
                s=time_str,
                r=stdout.decode('utf-8')
            )
        )
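
A usage sketch: the `.State.StartedAt` and `.State.FinishedAt` fields of docker inspect hold the container's start and stop timestamps (the container id below is hypothetical):

container_id = 'abc123'  # hypothetical container id
started = get_timestamp_as_datetime(container_id, '.State.StartedAt')
finished = get_timestamp_as_datetime(container_id, '.State.FinishedAt')
if started and finished:
    logger.info('Container ran for {d}'.format(d=finished - started))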
Example #8
def check_container_exit_code(container_id):
    '''
    Queries the status of a docker container to get the exit code.
    Note that running containers will give an exit code of zero, so this
    should NOT be used to see if a container is still running.
    Returns None if a non-integer exit code is encountered.
    '''
    field = '.State.ExitCode'
    cmd = DOCKER_INSPECT_CMD.format(container_id=container_id, field=field)
    logger.info('Inspect Docker container with: {cmd}'.format(cmd=cmd))
    stdout, stderr = run_shell_command(cmd)
    logger.info('Results of inspect:\n\nSTDOUT: {stdout}\n\nSTDERR: {stderr}'.format(
        stdout = stdout,
        stderr = stderr
    ))
    try:
        exit_code = int(stdout)
        return exit_code
    except ValueError as ex:
        logger.error('Received non-integer exit code from container: {id}'.format(
            id=container_id
        ))
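
The inspection helpers above compose into a simple polling loop; a minimal sketch, assuming the container has already been launched (the id below is hypothetical):

import time

container_id = 'abc123'  # hypothetical container id
while check_if_container_running(container_id):
    time.sleep(5)
if check_container_exit_code(container_id) != 0:
    logger.error('Job failed. Logs:\n{lg}'.format(lg=get_logs(container_id)))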
Example #9
    def index_example_file(self, dataset_id, filepath):
        '''
        Indexes a file into the core
        '''
        cmd = '/opt/solr/bin/post -c {d} {f}'.format(d=dataset_id, f=filepath)
        run_shell_command(cmd)
Example #10
def remove_container(container_id):
    '''
    Removes the docker container given by `container_id`.
    '''
    rm_cmd = 'docker rm {id}'.format(id=container_id)
    logger.info('Remove Docker container with: {cmd}'.format(cmd=rm_cmd))
    stdout, stderr = run_shell_command(rm_cmd)
    logger.info('Successfully removed container: {id}.'.format(id=container_id))
Example #11
    def prepare(self):
        '''
        Handles prep of the dataset. Does NOT index!
        '''
        tmp_dir = self._create_tmp_dir()

        ann_df = self._get_sample_annotations(tmp_dir)
        pheno_df = self._get_phenotype_data(tmp_dir)

        # Derive a subject ID so we can merge the sample-level table with the patient-level data
        ann_df['subject_id'] = ann_df['SAMPID'].apply(
            lambda x: '-'.join(x.split('-')[:2]))

        # In the phenotypes file, sex is 2=F, 1=M
        pheno_df['_SEX'] = pheno_df['SEX'].apply(lambda x: 'M'
                                                 if x == 1 else 'F')
        merged_ann = pd.merge(ann_df,
                              pheno_df,
                              left_on='subject_id',
                              right_on='SUBJID')

        # remap the column names and drop the others
        merged_ann.rename(columns=self.COLUMN_MAPPING, inplace=True)
        merged_ann = merged_ann[self.COLUMN_MAPPING.values()]
        merged_ann = merged_ann.set_index('sample_id')

        final_ann = pd.DataFrame()
        counts_output_path = os.path.join(
            self.ROOT_DIR,
            self.COUNT_OUTPUT_FILE_TEMPLATE.format(tag=self.TAG,
                                                   date=self.date_str))
        with pd.HDFStore(counts_output_path) as hdf_out:
            for i, (tissue,
                    tissue_subdf) in enumerate(merged_ann.groupby('tissue')):
                logger.info('Handling tissue {t}'.format(t=tissue))
                try:
                    url = self.TISSUE_TO_FILE_MAP[tissue]
                except KeyError as ex:
                    logger.info(
                        'No file exists in the map for {t}. Skipping.'.format(
                            t=tissue))
                    continue
                output_file = '{d}/f{i}.gct.gz'.format(d=tmp_dir, i=i)
                self._download_file(url, output_file)
                run_shell_command('gunzip {f}'.format(f=output_file))
                output_file = output_file[:-3]

                # the GCT-format file has two header lines. The third line has the usual
                # column headers
                counts = pd.read_table(output_file,
                                       sep='\t',
                                       skiprows=2,
                                       header=0,
                                       index_col=1)
                counts.drop(['Description'], axis=1, inplace=True)
                counts.drop(['id'], axis=1, inplace=True)

                # As of this writing, there are alternate ENSG Ids that are suffixed with _PAR_Y
                # to denote features that are on the regions of chrY which are identical to those
                # on chrX.
                # https://www.gencodegenes.org/pages/faq.html (search "PAR_Y")
                # We drop those here.
                # It appears the mapping does not assign counts to these regions anyway, since the rows are all
                # zeros (while the canonical transcript is generally non-zero)
                idx_par = pd.Series(
                    [x.endswith('_PAR_Y') for x in counts.index])
                counts = counts.loc[~idx_par.values]

                # Remove the version from the ENSG gene ID
                counts.index = [x.split('.')[0] for x in counts.index]

                samples_in_matrix = counts.columns
                tissue_subdf = tissue_subdf.loc[samples_in_matrix]
                final_ann = pd.concat([final_ann, tissue_subdf], axis=0)

                group_id = RnaSeqMixin.create_python_compatible_id(
                    tissue) + '/ds'
                hdf_out.put(group_id, counts)

        final_ann.to_csv(os.path.join(
            self.ROOT_DIR,
            self.ANNOTATION_OUTPUT_FILE_TEMPLATE.format(tag=self.TAG,
                                                        date=self.date_str)),
                         sep=',',
                         index_label='sample_id')
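
For reference, a sketch of reading the per-tissue count matrices back out of the HDF5 file written above (the path below is a hypothetical placeholder):

import pandas as pd

counts_output_path = '/path/to/gtex_counts.hd5'  # hypothetical path
with pd.HDFStore(counts_output_path, mode='r') as hdf_in:
    for key in hdf_in.keys():
        counts = hdf_in.get(key)
        print(key, counts.shape)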
Example #12
    def run(self, executed_op, op_data, validated_inputs):
        logger.info('Running in local Docker mode.')
        logger.info('Executed op type: %s' % type(executed_op))
        logger.info('Executed op ID: %s' % str(executed_op.id))
        logger.info('Op data: %s' % op_data)
        logger.info(validated_inputs)

        # the UUID identifying the execution of this operation:
        execution_uuid = str(executed_op.id)

        # get the operation dir so we can look at which converters and command to use:
        op_dir = os.path.join(settings.OPERATION_LIBRARY_DIR,
                              str(op_data['id']))

        # To avoid conflicts or corruption of user data, we run each operation in its
        # own sandbox. We must first copy over their files to that sandbox dir.
        execution_dir = os.path.join(settings.OPERATION_EXECUTION_DIR,
                                     execution_uuid)
        make_local_directory(execution_dir)

        # convert the user inputs into args compatible with command-line usage.
        # For instance, a differential gene expression analysis requires one to
        # specify the samples that are in each group. To do this, the Operation
        # requires that two ObservationSet instances be submitted as arguments.
        # The "translator" will take the ObservationSet data structures and turn
        # them into something that the call will use, e.g. making a CSV list to
        # submit as one of the args, like:
        # docker run <image> run_something.R -a sampleA,sampleB -b sampleC,sampleD
        arg_dict = self._map_inputs(op_dir, validated_inputs, execution_dir)

        logger.info('After mapping the user inputs, we have the'
                    ' following structure: {d}'.format(d=arg_dict))

        # Construct the command that will be run in the container:
        entrypoint_file_path = os.path.join(op_dir, self.ENTRYPOINT_FILE)
        if not os.path.exists(entrypoint_file_path):
            logger.error(
                'Could not find the required entrypoint file at {p}.'
                ' Something must have corrupted the operation directory.'.
                format(p=entrypoint_file_path))
            raise Exception('The repository must have been corrupted.'
                            ' Failed to find the entrypoint file.'
                            ' Check dir at: {d}'.format(d=op_dir))
        entrypoint_cmd = self._get_entrypoint_command(entrypoint_file_path,
                                                      arg_dict)

        image_str = get_image_name_and_tag(op_data['repo_name'],
                                           op_data['git_hash'])

        cmd = self.DOCKER_RUN_CMD.format(
            container_name=execution_uuid,
            execution_mount=settings.OPERATION_EXECUTION_DIR,
            work_dir=settings.OPERATION_EXECUTION_DIR,
            job_dir=execution_dir,
            docker_image=image_str,
            cmd=entrypoint_cmd)
        try:
            run_shell_command(cmd)
            executed_op.job_id = execution_uuid
            executed_op.save()
        except Exception as ex:
            logger.info('Failed when running shell command: {c}'.format(c=cmd))
            logger.info('Exception was: {ex}'.format(ex=ex))
            # if an exception is raised when issuing the Docker run
            # command, then the job has failed. This error is likely
            # not due to user error, but something with the issuing
            # command or allocating appropriate Docker resources.
            executed_op.job_failed = True
            executed_op.execution_stop_datetime = datetime.datetime.now()
            executed_op.status = ExecutedOperation.ADMIN_NOTIFIED
            executed_op.save()
            alert_admins(str(ex))
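
The run method fills a DOCKER_RUN_CMD template defined on the class. A hypothetical template consistent with the placeholders used above; every flag shown is a real docker run option, but how the real template wires work_dir and job_dir is an assumption:

# Assumed class attribute; the real template is defined on the class.
DOCKER_RUN_CMD = ('docker run -d'
                  ' --name {container_name}'
                  ' -v {execution_mount}:{execution_mount}'
                  ' --workdir {work_dir}'
                  ' --env JOB_DIR={job_dir}'
                  ' {docker_image} {cmd}')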