Example No. 1
    def run_bfc(self, ctx, params):
        """
        BFC (Bloom Filter) error-correcting app for sequencing errors in Illumina short reads.
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportBFCResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: results
        #BEGIN run_bfc

        log('Running run_bfc with params=')
        pprint(params)
        bfc_cmd = [self.BFC]
        shared_dir = "/kb/module/work/tmp"

        # validate parameters
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'input_reads_upa' not in params:
            raise ValueError('input_reads_upa parameter is required')
        if 'output_reads_name' not in params:
            raise ValueError('output_reads_name parameter is required')

        if params.get('drop_unique_kmer_reads'):
            bfc_cmd.append('-1')

        if params.get('est_genome_size'):
            if 'est_genome_size_units' not in params:
                raise ValueError('est_genome_size_units must be set')
            if params['est_genome_size_units'] not in [
                    "G", "M", "K", "g", "m", "k"
            ]:
                raise ValueError('est_genome_size_units must be G, M or K')
            bfc_cmd.append('-s')
            bfc_cmd.append(
                str(params['est_genome_size']) +
                str(params['est_genome_size_units']))

        if params.get('kmer_size'):
            if params['kmer_size'] >= 64:
                raise ValueError('kmer_size must be <= 63')
            bfc_cmd.append('-k')
            bfc_cmd.append(str(params['kmer_size']))
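
        # At this point bfc_cmd holds the flags derived from the optional
        # params, e.g. [self.BFC, '-1', '-s', '50M', '-k', '33'] when all
        # three options are set (the values shown are illustrative only).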

        input_reads_upa = params['input_reads_upa']
        output_reads_name = params['output_reads_name']
        os.chdir(shared_dir)

        output_reads_file = output_reads_name + ".fq"
        bfc_output_file = "bfc_" + output_reads_name + ".fq"
        seqtk_output_file = "seqtk_bfc_" + output_reads_name + ".fq"
        workspace_name = params['workspace_name']

        # get the reads library as gzipped interleaved file
        reads_params = {
            'read_libraries': [input_reads_upa],
            'interleaved': 'true',
            'gzipped': 'true'
        }

        ru = _ReadsUtils(self.callbackURL)
        reads = ru.download_reads(reads_params)['files']
        log(reads)
        input_reads_file = os.path.basename(
            reads[input_reads_upa]['files']['fwd'])
        log('Input reads files:')
        log('     ' + input_reads_file)

        # hard-code the thread count
        bfc_cmd.append('-t')
        bfc_cmd.append(str(self.THREADS))

        bfc_cmd.append(input_reads_file)

        # run_command executes the joined string, so the shell-style
        # redirection below sends BFC's stdout to the output file
        bfc_cmd.append('>')
        bfc_cmd.append(bfc_output_file)

        log('Running BFC:')
        log('     ' + ' '.join(bfc_cmd))

        bfc_cmd_output = self.run_command(' '.join(bfc_cmd))

        # drop non-paired reads using seqtk

        seqtk_cmd = [
            self.SEQTK, "dropse", bfc_output_file, ">", seqtk_output_file
        ]
        self.run_command(' '.join(seqtk_cmd))

        # upload reads output
        shutil.copy(seqtk_output_file, output_reads_file)

        out_reads_upa = ru.upload_reads({
            'fwd_file': os.path.join(shared_dir, output_reads_file),
            'interleaved': 1,
            'wsname': workspace_name,
            'name': output_reads_name,
            'source_reads_ref': input_reads_upa
        })
        # create report
        ws = _Workspace(self.ws_url)

        input_meta = ws.get_objects2({
            'objects': [{'ref': input_reads_upa}],
            'no_data': 1
        })['data'][0]
        input_reads_name = input_meta['info'][1]
        input_reads_count = input_meta['info'][10]['read_count']

        output_meta = ws.get_objects2({
            'objects': [{'ref': out_reads_upa['obj_ref']}],
            'no_data': 1
        })['data'][0]
        output_reads_count = output_meta['info'][10]['read_count']

        # get total filtered reads
        filtered_reads = int(input_reads_count) - int(output_reads_count)

        # add commas for readability
        input_reads_count = "{:,}".format(int(input_reads_count))
        output_reads_count = "{:,}".format(int(output_reads_count))
        filtered_reads = "{:,}".format(filtered_reads)
        filtered_reads = str(filtered_reads)

        # kmer_size is optional, so use .get() to avoid a KeyError
        k_mer_size = str(params.get('kmer_size'))

        bfc_main = '\n'.join([
            line for line in bfc_cmd_output.split('\n')
            if line.startswith('[M::main')
        ])

        report = 'Successfully ran BFC on input reads: {}\n'.format(
            input_reads_name)
        report += 'with command: {}\n\n{}\n'.format(' '.join(bfc_cmd),
                                                    bfc_main)
        report += 'created object: {}({})\n\n'.format(output_reads_name,
                                                      out_reads_upa['obj_ref'])
        report += ' input reads: {}\n k-mer size: {}\n filtered reads: {}\n output reads: {}'.format(
            input_reads_count, k_mer_size, filtered_reads, output_reads_count)

        log('Saving report')
        kbr = _KBaseReport(self.callbackURL)
        report_info = kbr.create_extended_report({
            'message': report,
            'objects_created': [{
                'ref': out_reads_upa['obj_ref'],
                'description': 'Corrected reads'
            }],
            'workspace_name': workspace_name,
            'report_object_name': 'bfc_report_' + str(uuid.uuid4())
        })

        results = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END run_bfc

        # At some point might do deeper type checking...
        if not isinstance(results, dict):
            raise ValueError('Method run_bfc return value ' +
                             'results is not type dict as required.')
        # return the results
        return [results]
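
A minimal sketch of the params mapping run_bfc expects, with hypothetical values; only workspace_name, input_reads_upa, and output_reads_name are required, and the optional keys map to the BFC flags (-1, -s, -k) assembled above:

params = {
    'workspace_name': 'my_workspace',       # hypothetical workspace
    'input_reads_upa': '123/4/5',           # hypothetical reads object UPA
    'output_reads_name': 'reads_corrected',
    'drop_unique_kmer_reads': 1,            # adds -1
    'est_genome_size': 50,                  # with the units below, adds -s 50M
    'est_genome_size_units': 'M',           # one of G/M/K (case-insensitive)
    'kmer_size': 33,                        # adds -k 33; must be <= 63
}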
Example No. 2
def build_samples(
        config: Dict[str, str]) -> Tuple[Samples, KBaseUserLookup, List[str]]:
    '''
    Build the sample service instance from the SDK server provided parameters.

    :param config: The SDK generated configuration.
    :returns: A tuple of the samples instance, a user lookup instance, and
        the list of roles exempt from read administration restrictions.
    '''
    if not config:
        raise ValueError('config is empty, cannot start service')
    arango_url = _check_string_req(config.get('arango-url'),
                                   'config param arango-url')
    arango_db = _check_string_req(config.get('arango-db'),
                                  'config param arango-db')
    arango_user = _check_string_req(config.get('arango-user'),
                                    'config param arango-user')
    arango_pwd = _check_string_req(config.get('arango-pwd'),
                                   'config param arango-pwd')

    col_sample = _check_string_req(config.get('sample-collection'),
                                   'config param sample-collection')
    col_version = _check_string_req(config.get('version-collection'),
                                    'config param version-collection')
    col_ver_edge = _check_string_req(config.get('version-edge-collection'),
                                     'config param version-edge-collection')
    col_node = _check_string_req(config.get('node-collection'),
                                 'config param node-collection')
    col_node_edge = _check_string_req(config.get('node-edge-collection'),
                                      'config param node-edge-collection')
    col_data_link = _check_string_req(config.get('data-link-collection'),
                                      'config param data-link-collection')
    col_ws_obj_ver = _check_string_req(
        config.get('workspace-object-version-shadow-collection'),
        'config param workspace-object-version-shadow-collection')
    col_schema = _check_string_req(config.get('schema-collection'),
                                   'config param schema-collection')

    auth_root_url = _check_string_req(config.get('auth-root-url'),
                                      'config param auth-root-url')
    auth_token = _check_string_req(config.get('auth-token'),
                                   'config param auth-token')
    full_roles = split_value(config, 'auth-full-admin-roles')
    read_roles = split_value(config, 'auth-read-admin-roles')
    read_exempt_roles = split_value(config, 'auth-read-exempt-roles')

    ws_url = _check_string_req(config.get('workspace-url'),
                               'config param workspace-url')
    ws_token = _check_string_req(config.get('workspace-read-admin-token'),
                                 'config param workspace-read-admin-token')

    kafka_servers = _check_string(config.get('kafka-bootstrap-servers'),
                                  'config param kafka-bootstrap-servers',
                                  optional=True)
    kafka_topic = None
    if kafka_servers:  # have to start the server twice to test no kafka scenario
        kafka_topic = _check_string(config.get('kafka-topic'),
                                    'config param kafka-topic')

    metaval_url = _check_string(config.get('metadata-validator-config-url'),
                                'config param metadata-validator-config-url',
                                optional=True)

    # meta params may have info that shouldn't be logged so don't log any for now.
    # Add code to deal with this later if needed
    print(f'''
        Starting server with config:
            arango-url: {arango_url}
            arango-db: {arango_db}
            arango-user: {arango_user}
            arango-pwd: [REDACTED FOR YOUR SAFETY AND COMFORT]
            sample-collection: {col_sample}
            version-collection: {col_version}
            version-edge-collection: {col_ver_edge}
            node-collection: {col_node}
            node-edge-collection: {col_node_edge}
            data-link-collection: {col_data_link}
            workspace-object-version-shadow-collection: {col_ws_obj_ver}
            schema-collection: {col_schema}
            auth-root-url: {auth_root_url}
            auth-token: [REDACTED FOR YOUR CONVENIENCE AND ENJOYMENT]
            auth-full-admin-roles: {', '.join(full_roles)}
            auth-read-admin-roles: {', '.join(read_roles)}
            auth-read-exempt-roles: {', '.join(read_exempt_roles)}
            workspace-url: {ws_url}
            workspace-read-admin-token: [REDACTED FOR YOUR ULTIMATE PLEASURE]
            kafka-bootstrap-servers: {kafka_servers}
            kafka-topic: {kafka_topic}
            metadata-validator-config-url: {metaval_url}
    ''')

    # build the validators before trying to connect to arango
    metaval = (get_validators(metaval_url) if metaval_url
               else MetadataValidatorSet())

    arangoclient = _arango.ArangoClient(hosts=arango_url)
    # note: arango_db is rebound here from the database name (a str) to the
    # database client object used below
    arango_db = arangoclient.db(arango_db,
                                username=arango_user,
                                password=arango_pwd,
                                verify=True)
    storage = _ArangoSampleStorage(
        arango_db,
        col_sample,
        col_version,
        col_ver_edge,
        col_node,
        col_node_edge,
        col_ws_obj_ver,
        col_data_link,
        col_schema,
    )
    storage.start_consistency_checker()
    kafka = (_KafkaNotifer(kafka_servers, _cast(str, kafka_topic))
             if kafka_servers else None)
    user_lookup = KBaseUserLookup(auth_root_url, auth_token, full_roles,
                                  read_roles)
    ws = _WS(_Workspace(ws_url, token=ws_token))
    return Samples(storage, user_lookup, metaval, ws,
                   kafka), user_lookup, read_exempt_roles
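
A minimal sketch of the config mapping build_samples consumes, with hypothetical values throughout; the keys mirror the _check_string_req / _check_string lookups above, and the admin-role, kafka, and metadata-validator keys are optional:

config = {
    'arango-url': 'http://localhost:8529',      # hypothetical ArangoDB endpoint
    'arango-db': 'samples',
    'arango-user': 'sampleservice',
    'arango-pwd': 'hypothetical-password',
    'sample-collection': 'samples_sample',
    'version-collection': 'samples_version',
    'version-edge-collection': 'samples_ver_edge',
    'node-collection': 'samples_nodes',
    'node-edge-collection': 'samples_nodes_edge',
    'data-link-collection': 'samples_data_link',
    'workspace-object-version-shadow-collection': 'ws_object_version',
    'schema-collection': 'samples_schema',
    'auth-root-url': 'http://localhost:8080/auth',  # hypothetical auth service
    'auth-token': 'hypothetical-service-token',
    'workspace-url': 'http://localhost:7058',       # hypothetical workspace
    'workspace-read-admin-token': 'hypothetical-read-admin-token',
    # optional: 'auth-full-admin-roles', 'auth-read-admin-roles',
    # 'auth-read-exempt-roles', 'kafka-bootstrap-servers', 'kafka-topic',
    # 'metadata-validator-config-url'
}

samples, user_lookup, read_exempt_roles = build_samples(config)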
Example No. 3

    def __init__(self, jars_dir: _Path, mongo_controller: _MongoController,
                 mongo_db: str, mongo_type_db: str, auth_url: str,
                 root_temp_dir: _Path):
        '''
        Create and start a new Workspace service. An unused port will be selected for the server.

        :param jars_dir: The path to the lib/jars dir of the KBase Jars repo
            (https://github.com/kbase/jars), e.g /path_to_repo/lib/jars.
        :param mongo_controller: A MongoDB controller.
        :param mongo_db: The database in which to store Workspace data.
        :param mongo_type_db: The database in which to store Workspace type specifications.
        :param auth_url: The root url of an instance of the KBase auth service.
        :param root_temp_dir: A temporary directory in which to store Auth data and log files.
            The files will be stored inside a child directory that is unique per invocation.
        '''
        if not jars_dir or not _os.access(jars_dir, _os.X_OK):
            raise _TestException(
                'jars_dir {} does not exist or is not executable.'.format(
                    jars_dir))
        if not mongo_controller:
            raise _TestException('mongo_controller must be provided')
        if not mongo_db:
            raise _TestException('mongo_db must be provided')
        if not mongo_type_db:
            raise _TestException('mongo_type_db must be provided')
        if not auth_url:
            raise _TestException('auth_url must be provided')
        if not root_temp_dir:
            raise _TestException('root_temp_dir is None')

        self._mongo = mongo_controller
        self._db = mongo_db
        jars_dir = jars_dir.resolve()
        class_path = self._get_class_path(jars_dir)

        # make temp dirs
        root_temp_dir = root_temp_dir.absolute()
        _os.makedirs(root_temp_dir, exist_ok=True)
        self.temp_dir = _Path(
            _tempfile.mkdtemp(prefix='WorkspaceController-',
                              dir=str(root_temp_dir)))
        ws_temp_dir = self.temp_dir.joinpath('temp_files')
        _os.makedirs(ws_temp_dir)

        configfile = self._create_deploy_cfg(self.temp_dir, ws_temp_dir,
                                             f'localhost:{self._mongo.port}',
                                             mongo_db, mongo_type_db, auth_url)
        newenv = _os.environ.copy()
        newenv['KB_DEPLOYMENT_CONFIG'] = configfile
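        # KBase servers locate their deploy.cfg via the KB_DEPLOYMENT_CONFIG
        # environment variable, so the child process started below picks up
        # the generated config file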

        self.port = _test_utils.find_free_port()

        command = ['java', '-classpath', class_path, _WS_CLASS, str(self.port)]

        self._wslog = self.temp_dir / 'ws.log'
        self._outfile = open(self._wslog, 'w')

        self._proc = _subprocess.Popen(command,
                                       stdout=self._outfile,
                                       stderr=_subprocess.STDOUT,
                                       env=newenv)

        ws = _Workspace(f'http://localhost:{self.port}')
        for count in range(40):
            err = None
            _time.sleep(1)  # wait for server to start
            try:
                self.version = ws.ver()
                break
            except (_ServerError, _requests.exceptions.ConnectionError) as se:
                err = _TestException(se.args[0])
                err.__cause__ = se
        if err:
            print(
                'Error starting workspace service. Dumping logs and throwing error'
            )
            self._print_ws_logs()
            raise err
        self.startup_count = count + 1
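
A minimal usage sketch, assuming the enclosing class is named WorkspaceController (suggested by the temp-dir prefix above) and that a Mongo controller and auth service are already running; every path, URL, and name here is hypothetical:

from pathlib import Path

# all values hypothetical; mongo is an already-started _MongoController
ws_controller = WorkspaceController(
    jars_dir=Path('/path_to_repo/lib/jars'),
    mongo_controller=mongo,
    mongo_db='ws_test',
    mongo_type_db='ws_types_test',
    auth_url='http://localhost:9000/testmode',
    root_temp_dir=Path('/tmp/ws_test_temp'),
)
print(f'Workspace {ws_controller.version} listening on port '
      f'{ws_controller.port} after {ws_controller.startup_count} attempt(s)')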