示例#1
0
    def _stage_input_file(self, cutadapt_runner, ref, reads_type):
        """Download the reads object `ref` and stage it for a cutadapt run.

        Paired-end libraries are downloaded interleaved; single-end
        libraries are downloaded as-is. The forward-file path and the
        interleaved flag are pushed into `cutadapt_runner`.

        :param cutadapt_runner: runner that receives the staged file
        :param ref: workspace reference of the reads object
        :param reads_type: workspace type string of the reads object
        :returns: the per-ref file-info dict from ReadsUtils, with the
            extra key 'input_ref' set to `ref`
        :raises ValueError: if `reads_type` is not a supported library type
        """
        ru = ReadsUtils(self.callbackURL)
        # BUGFIX: the original used `x == 'A' or 'B'`, which is always
        # truthy, so every type (including unsupported ones) took the
        # paired-end branch. Use tuple membership instead.
        if reads_type in ('KBaseFile.PairedEndLibrary',
                          'KBaseAssembly.PairedEndLibrary'):
            input_file_info = ru.download_reads({
                'read_libraries': [ref],
                'interleaved': 'true'
            })['files'][ref]
        elif reads_type in ('KBaseFile.SingleEndLibrary',
                            'KBaseAssembly.SingleEndLibrary'):
            input_file_info = ru.download_reads({'read_libraries':
                                                 [ref]})['files'][ref]
        else:
            raise ValueError("Can't download_reads() for object type: '" +
                             str(reads_type) + "'")
        input_file_info['input_ref'] = ref
        file_location = input_file_info['files']['fwd']

        # Paired data downloaded with 'interleaved': 'true' comes back with
        # type 'interleaved'; tell the runner so it splits pairs correctly.
        interleaved = False
        if input_file_info['files']['type'] == 'interleaved':
            interleaved = True
        cutadapt_runner.set_interleaved(interleaved)
        cutadapt_runner.set_input_file(file_location)
        return input_file_info
示例#2
0
    def prepare_single_run(self, input_info, assembly_or_genome_ref,
                           bowtie2_index_info, ws_for_cache):
        ''' Given a reads ref and an assembly, setup the bowtie2 index '''
        # Reuse a provided index; otherwise build (or fetch a cached) one
        # for the assembly/genome reference.
        run_config = {'bowtie2_index_info': bowtie2_index_info}
        if not bowtie2_index_info:
            index_builder = Bowtie2IndexBuilder(self.scratch_dir, self.workspace_url,
                                                self.callback_url, self.srv_wiz_url,
                                                self.provenance)
            run_config['bowtie2_index_info'] = index_builder.get_index(
                {'ref': assembly_or_genome_ref,
                 'ws_for_cache': ws_for_cache})

        # Download the reads as separate (non-interleaved) files.
        lib_ref = input_info['ref']
        downloader = ReadsUtils(self.callback_url)
        downloaded = downloader.download_reads({'read_libraries': [lib_ref],
                                                'interleaved': 'false',
                                                'gzipped': None})['files']

        # Record the bare type name (e.g. 'PairedEndLibrary') plus the
        # downloaded file info for the aligner.
        lib_type = self.get_type_from_obj_info(input_info['info']).split('.')[1]
        run_config['reads_lib_type'] = lib_type
        run_config['reads_files'] = downloaded[lib_ref]
        run_config['reads_lib_ref'] = lib_ref

        return run_config
示例#3
0
    def get_ea_utils_result(self, refid, input_params):
        """Build the ea-utils report text for the reads object `refid`.

        Downloads the library via ReadsUtils and appends a report section
        for the forward file; a separate-pair library also contributes a
        section for the reverse file.
        """
        download = ReadsUtils(self.callbackURL).download_reads(
            {'read_libraries': [refid]})
        lib_files = download['files'][refid]['files']

        fwd_file = lib_files['fwd']
        otype = lib_files['otype']

        report = ''
        # 'interleaved' and 'single' libraries live entirely in the fwd
        # file; 'paired' adds a separate rev file.
        if otype in ('interleaved', 'single'):
            report += self.get_report_string(fwd_file)
        elif otype == 'paired':
            report += self.get_report_string(fwd_file)
            report += self.get_report_string(lib_files['rev'])
        return report
示例#4
0
    def get_reads_RU(self, ctx, refs, console):
        """Download the given reads refs (interleaved) via ReadsUtils.

        Translates the ReadsUtils unsupported-type ServerError into a
        ValueError with a friendlier message; everything else re-raises.
        """
        client = ReadsUtils(self.callbackURL,
                            token=ctx['token'],
                            service_ver='dev')

        # Marker text ReadsUtils embeds in its unsupported-type error.
        type_err_marker = ('Supported types: KBaseFile.SingleEndLibrary ' +
                           'KBaseFile.PairedEndLibrary ' +
                           'KBaseAssembly.SingleEndLibrary ' +
                           'KBaseAssembly.PairedEndLibrary')
        download_args = {
            'read_libraries': refs,
            'interleaved': 'true',
            'gzipped': None
        }
        try:
            reads = client.download_reads(download_args)['files']
        except ServerError as se:
            self.log(console, 'logging stacktrace from dynamic client error')
            self.log(console, se.data)
            if type_err_marker not in se.message:
                raise
            prefix = se.message.split('.')[0]
            raise ValueError(
                prefix + '. Only the types ' +
                'KBaseAssembly.PairedEndLibrary ' +
                'and KBaseFile.PairedEndLibrary are supported')

        self.log(console, 'Got reads data from converter:\n' + pformat(reads))
        return reads
示例#5
0
def fetch_reads_from_reference(ref, callback_url):
    """
    Fetch a FASTQ file (or 2 for paired-end) from a reads reference.
    Returns the following structure:
    {
        "style": "paired", "single", or "interleaved",
        "file_fwd": path_to_file,
        "file_rev": path_to_file, only if paired end,
        "object_ref": reads reference for downstream convenience.
    }
    """
    try:
        print("Fetching reads from object {}".format(ref))
        reads_client = ReadsUtils(callback_url)
        # 'interleaved': 'false' keeps paired libraries as two files,
        # so 'rev' is present only for separate-pair data.
        reads_dl = reads_client.download_reads({
            "read_libraries": [ref],
            "interleaved": "false"
        })
        pprint(reads_dl)
        reads_files = reads_dl['files'][ref]['files']
        ret_reads = {
            "object_ref": ref,
            "style": reads_files["type"],
            "file_fwd": reads_files["fwd"]
        }
        if reads_files.get("rev") is not None:
            ret_reads["file_rev"] = reads_files["rev"]
        return ret_reads
    # BUGFIX: was a bare `except:`, which also intercepts SystemExit and
    # KeyboardInterrupt. Narrow it; we only log context before re-raising.
    except Exception:
        print("Unable to fetch a file from expected reads object {}".format(ref))
        raise
示例#6
0
    def get_input_reads(self, params, token):
        """Resolve library names to refs, annotate each libfile arg with the
        refs used, and download everything (interleaved) via ReadsUtils."""
        print('in get input reads')

        ws_name = params[self.PARAM_IN_WS]
        libfile_args = params[self.PARAM_IN_LIBFILE_ARGS]

        def qualify(name):
            # A name containing '/' is already a full ref; otherwise
            # prefix the workspace name.
            return name if '/' in name else (ws_name + '/' + name)

        obj_ids = []
        for libarg in libfile_args:
            lib_ref = qualify(libarg[self.PARAM_IN_LIBRARY])
            obj_ids.append({'ref': lib_ref})
            libarg['ref_library'] = lib_ref

            unpaired_name = libarg.get(self.PARAM_IN_UNPAIRED)
            if unpaired_name is not None:
                unpaired_ref = qualify(unpaired_name)
                obj_ids.append({'ref': unpaired_ref})
                libarg['ref_unpaired'] = unpaired_ref

        ws = workspaceService(self.workspaceURL, token=token)
        ws_info = ws.get_object_info_new({'objects': obj_ids})

        reads_params = []
        reftoname = {}
        # Pair each object-info tuple with the ref that requested it;
        # info[7] is the workspace name, info[1] the object name.
        for info, oid in zip(ws_info, obj_ids):
            reads_params.append(oid['ref'])
            reftoname[oid['ref']] = info[7] + '/' + info[1]

        readcli = ReadsUtils(self.callbackURL, token=token,
                             service_ver='dev')

        # Marker text ReadsUtils embeds in its unsupported-type error.
        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')

        try:
            reads = readcli.download_reads({'read_libraries': reads_params,
                                            'interleaved': 'true',
                                            'gzipped': None
                                            })['files']
        except ServerError as se:
            self.log('logging stacktrace from dynamic client error')
            self.log(se.data)
            if typeerr not in se.message:
                raise
            prefix = se.message.split('.')[0]
            raise ValueError(
                prefix + '. Only the types ' +
                'KBaseAssembly.PairedEndLibrary ' +
                'and KBaseFile.PairedEndLibrary are supported')

        self.log('Got reads data from converter:\n' + pformat(reads))
        print("READS:")
        pprint(reads)
        return reads
示例#7
0
def download_interleaved_reads(callback_url, reads_upa):
    """Download one reads library as a single interleaved FASTQ.

    Returns the per-object file-info dict from ReadsUtils.
    """
    download_args = {
        'read_libraries': [reads_upa],
        'interleaved': 'true',
        'gzipped': None,
    }
    result = ReadsUtils(callback_url).download_reads(download_args)
    return result['files'][reads_upa]
示例#8
0
    def run_mash_sketch(self, ctx, params):
        """
        Generate a sketch file from a fasta/fastq file
        :param params: instance of type "MashSketchParams" (* * Pass in **one
           of** input_path, assembly_ref, or reads_ref *   input_path -
           string - local file path to an input fasta/fastq *   assembly_ref
           - string - workspace reference to an Assembly type *   reads_ref -
           string - workspace reference to a Reads type * Optionally, pass in
           a boolean indicating whether you are using paired-end reads. *
           paired_ends - boolean - whether you are passing in paired ends) ->
           structure: parameter "input_path" of String, parameter
           "assembly_ref" of String, parameter "reads_ref" of String,
           parameter "paired_ends" of type "boolean" (params:
           input_upa: workspace reference to an assembly object
           workspace_name: name of current workspace search_db: database to
           search n_max_results: number of results to return, integer between
           1 and 100)
        :returns: instance of type "MashSketchResults" (* * Returns the local
           scratch file path of the generated sketch file. * Will have the
           extension '.msh') -> structure: parameter "sketch_path" of String
        """
        # ctx is the context object
        # return variables are: results
        #BEGIN run_mash_sketch
        # Resolve the input to a local file path. When several keys are
        # present, precedence is reads_ref > assembly_ref > input_path.
        # NOTE(review): membership tests don't check for None values, so a
        # params dict carrying reads_ref=None would fail in download_reads.
        if 'reads_ref' in params:
            # Download interleaved so a paired library arrives as a single
            # FASTQ; that file is reported under the 'fwd' key.
            reads_utils = ReadsUtils(self.callbackURL)
            result = reads_utils.download_reads({
                'read_libraries': [params['reads_ref']],
                'interleaved':
                'true'
            })
            input_path = result['files'][params['reads_ref']]['files']['fwd']
        elif 'assembly_ref' in params:
            # Export the Assembly object to a local FASTA file.
            assembly_util = AssemblyUtil(self.callbackURL)
            result = assembly_util.get_assembly_as_fasta(
                {'ref': params['assembly_ref']})
            input_path = result['path']
        elif 'input_path' in params:
            # Caller supplied a local file path directly; use it as-is.
            input_path = params['input_path']
        else:
            raise ValueError(
                'Invalid params; must provide one of `reads_ref`, `assembly_ref`, or `input_path`.'
            )
        # Run `mash sketch` on the resolved file; returns the .msh path.
        mash_utils = MashUtils(self.config, self.auth_token)
        output_file_path = mash_utils.mash_sketch(
            input_path, paired_ends=params.get('paired_ends'))
        results = {'sketch_path': output_file_path}
        #END run_mash_sketch

        # At some point might do deeper type checking...
        if not isinstance(results, dict):
            raise ValueError('Method run_mash_sketch return value ' +
                             'results is not type dict as required.')
        # return the results
        return [results]
    def _upload_reads(self, refid, callbackURL, input_params):
        """Round-trip a reads object: download `refid` via ReadsUtils and
        re-upload its file(s) as a new object named input_params['output'].

        Returns the workspace reference of the uploaded object.
        """
        util = ReadsUtils(callbackURL)
        download = util.download_reads({'read_libraries': [refid]})
        lib_files = download['files'][refid]['files']

        fwd_file = lib_files['fwd']
        otype = lib_files['otype']

        # Parameters common to every library layout.
        base_params = {
            'fwd_file': fwd_file,
            'wsname': input_params['workspace_name'],
            'name': input_params['output'],
            'sequencing_tech': input_params['sequencing_tech'],
            'single_genome': input_params['single_genome'],
        }

        uploadReadParams = {}
        # Interleaved pairs: one file, flagged as interleaved.
        if otype == 'interleaved':
            uploadReadParams = dict(base_params, rev_file='', interleaved=1)
        # Separate pair: forward plus reverse file.
        if otype == 'paired':
            uploadReadParams = dict(base_params, rev_file=lib_files['rev'])
        # Single end: forward file only.
        if otype == 'single':
            uploadReadParams = dict(base_params, rev_file='')

        result = util.upload_reads(uploadReadParams)
        return result['obj_ref']
示例#10
0
def ru_reads_download(logger, ref, tdir, token):
    """Download the reads object `ref` via ReadsUtils and move its FASTQ
    file(s) into `tdir`.

    :param logger: logger used for progress/disk-space messages
    :param ref: workspace reference of the reads object
    :param tdir: target directory for the downloaded file(s)
    :param token: auth token passed through to ReadsUtils
    :returns: the ReadsUtils result dict augmented with:
        'fwd' - path of the forward file inside tdir
        'rev' - path of the reverse file inside tdir (paired libraries only)
    """
    check_disk_space(logger)
    logger.info("{0} will be downloaded and transferred to {1}".format(ref, tdir))
    ru = ReadsUtils(url=os.environ['SDK_CALLBACK_URL'], token=token)
    ds = ru.download_reads({"read_libraries": [ref], "interleaved": "false"})

    # BUGFIX: the original logged the identical "will be downloaded"
    # message three times, including after the transfer had finished.
    files = ds['files'][ref]['files']
    ds['fwd'] = os.path.join(tdir, os.path.basename(files['fwd']))
    os.rename(files['fwd'], ds['fwd'])
    if files['type'] == 'paired':
        if files['rev_name'] is None:
            # No original filename recorded; fall back to a generic name.
            ds['rev'] = os.path.join(tdir, 'rev.fastq')
        else:
            ds['rev'] = os.path.join(tdir, os.path.basename(files['rev']))
        os.rename(files['rev'], ds['rev'])
    logger.info("{0} downloaded and transferred to {1}".format(ref, tdir))
    return ds
示例#11
0
 def fetch_reads_files(self, reads_upas):
     """
     From a list of reads UPAs, uses ReadsUtils to fetch the reads as files.
     Returns them as a dictionary from reads_upa -> filename
     """
     # Validate before hitting the service: reject None and empty lists.
     if reads_upas is None:
         raise ValueError("reads_upas must be a list of UPAs")
     if not reads_upas:
         raise ValueError("reads_upas must contain at least one UPA")
     ru = ReadsUtils(self.callback_url)
     download_args = {
         'read_libraries': reads_upas,
         'interleaved': 'true',
         'gzipped': None,
     }
     reads_info = ru.download_reads(download_args)['files']
     # Interleaved downloads put the whole library in the 'fwd' file.
     return {upa: info['files']['fwd'] for upa, info in reads_info.items()}
示例#12
0
    def run_idba_ud(self, ctx, params):
        """
        Run IDBA on paired end libraries
        :param params: instance of type "idba_ud_Params" (Input parameters
           for running idba_ud. string workspace_name - the name of the
           workspace from which to take input and store output.
           list<paired_end_lib> read_libraries - Illumina PairedEndLibrary
           files to assemble. string output_contigset_name - the name of the
           output contigset min_contig_length - minimum length of contigs to
           output, default is 2000 @optional kval_args) -> structure:
           parameter "workspace_name" of String, parameter "read_libraries"
           of list of type "paired_end_lib" (The workspace object name of a
           PairedEndLibrary file, whether of the KBaseAssembly or KBaseFile
           type.), parameter "output_contigset_name" of String, parameter
           "min_contig_length" of Long, parameter "kval_args" of type
           "kval_args_type" (Additional parameters: k values for idba_ud.
           (Note: The UI elements for these values have been removed, based
           on feedback)) -> structure: parameter "mink_arg" of Long,
           parameter "maxk_arg" of Long, parameter "step_arg" of Long
        :returns: instance of type "idba_ud_Output" (Output parameters for
           IDBA run. string report_name - the name of the KBaseReport.Report
           workspace object. string report_ref  - the workspace reference of
           the report.) -> structure: parameter "report_name" of String,
           parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_idba_ud

        print("===================  IN run_idba_ud")

        print("PARAMS: ")
        pprint(params)
        print("============================   END OF PARAMS: ")

        # A whole lot of this is adapted or outright copied from
        # https://github.com/msneddon/MEGAHIT
        self.log('Running run_idba_ud with params:\n' + pformat(params))

        token = ctx['token']

        # the reads should really be specified as a list of absolute ws refs
        # but the narrative doesn't do that yet
        self.process_params(params)

        # get absolute refs from ws
        # Names without a '/' are qualified with the workspace name.
        wsname = params[self.PARAM_IN_WS]
        obj_ids = []
        for r in params[self.PARAM_IN_LIB]:
            obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})
        ws = workspaceService(self.workspaceURL, token=token)
        ws_info = ws.get_object_info_new({'objects': obj_ids})
        reads_params = []

        # Map each ref to "workspace/object_name" (info[7]/info[1]) for
        # readable messages in check_reads and error reporting below.
        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        readcli = ReadsUtils(self.callbackURL, token=ctx['token'])

        # Marker text ReadsUtils embeds in its unsupported-type error.
        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            # 'interleaved': 'false' keeps paired libraries as two files.
            reads = readcli.download_reads({
                'read_libraries': reads_params,
                'interleaved': 'false',
                'gzipped': None
            })['files']
        except ServerError as se:
            self.log('logging stacktrace from dynamic client error')
            self.log(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        self.log('Got reads data from converter:\n' + pformat(reads))

        self.check_reads(reads, reftoname)

        # Normalize the downloaded files into the fwd/rev/type records
        # that exec_idba_ud expects.
        reads_data = []
        for ref in reads:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
            print("REF:" + str(ref))
            print("READS REF:" + str(reads[ref]))
            seq_tech = reads[ref]["sequencing_tech"]
            if f['type'] == 'interleaved':
                reads_data.append({
                    'fwd_file': f['fwd'],
                    'type': 'paired',
                    'seq_tech': seq_tech
                })
            elif f['type'] == 'paired':
                reads_data.append({
                    'fwd_file': f['fwd'],
                    'rev_file': f['rev'],
                    'type': 'paired',
                    'seq_tech': seq_tech
                })
            elif f['type'] == 'single':
                reads_data.append({
                    'fwd_file': f['fwd'],
                    'type': 'single',
                    'seq_tech': seq_tech
                })
            else:
                raise ValueError('Something is very wrong with read lib' +
                                 reads_name)

        # set the output location
        # Millisecond timestamp keeps concurrent runs in distinct dirs.
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        outdir = os.path.join(self.scratch, 'IDBA_dir' + str(timestamp))

        idba_out = self.exec_idba_ud(reads_data, params, outdir)
        self.log('IDBA output dir: ' + idba_out)

        # parse the output and save back to KBase
        output_contigs = os.path.join(idba_out, 'contig.fa')

        self.log('Uploading FASTA file to Assembly')
        assemblyUtil = AssemblyUtil(self.callbackURL,
                                    token=ctx['token'],
                                    service_ver='dev')
        if params.get('min_contig_length', 0) > 0:
            # Filtering requested: save with the length cutoff applied.
            assemblyUtil.save_assembly_from_fasta({
                'file': {
                    'path': output_contigs
                },
                'workspace_name':
                wsname,
                'assembly_name':
                params[self.PARAM_IN_CS_NAME],
                'min_contig_length':
                params['min_contig_length']
            })
            # load report from scaffolds.fasta
            # NOTE(review): assumes save_assembly_from_fasta leaves a
            # '<contigs>.filtered.fa' file next to the input — confirm.
            report_name, report_ref = self.load_report(
                output_contigs + '.filtered.fa', params, wsname)
        else:
            assemblyUtil.save_assembly_from_fasta({
                'file': {
                    'path': output_contigs
                },
                'workspace_name':
                wsname,
                'assembly_name':
                params[self.PARAM_IN_CS_NAME]
            })
            # load report from scaffolds.fasta
            report_name, report_ref = self.load_report(output_contigs, params,
                                                       wsname)

        output = {'report_name': report_name, 'report_ref': report_ref}

        #END run_idba_ud

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_idba_ud return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
示例#13
0
    def exec_megahit(self, ctx, params):
        """
        :param params: instance of type "ExecMegaHitParams" (exec_megahit()
           Actual execution of MEGAHIT Accepts ReadsSet or a ReadsLibrary as
           Input Creates Assembly object(s) as output. Will eventually also
           create AssemblySet object if input is a ReadsSet and not running a
           combined assembly Other vars same as run_megahit()) -> structure:
           parameter "workspace_name" of String, parameter "input_reads_ref"
           of String, parameter "output_contigset_name" of String, parameter
           "combined_assembly_flag" of Long, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_len" of Long
        :returns: instance of type "ExecMegaHitOutput" -> structure:
           parameter "report_text" of String, parameter
           "output_contigset_ref" of list of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN exec_megahit
        console = []
        self.log(console, 'Running exec_megahit() with params=')
        self.log(console, "\n" + pformat(params))

        #SERVICE_VER = 'dev'  # DEBUG
        SERVICE_VER = 'release'

        ### STEP 0: init
        token = ctx['token']
        wsClient = workspaceService(self.workspaceURL, token=token)
        headers = {'Authorization': 'OAuth ' + token}
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        ### STEP 1: basic parameter checks + parsing
        required_params = [
            'workspace_name', 'input_reads_ref', 'output_contigset_name'
        ]
        for required_param in required_params:
            if required_param not in params or params[required_param] == None:
                raise ValueError("Must define required param: '" +
                                 required_param + "'")

        ### STEP 2: determine if input is a ReadsLibrary or ReadsSet
        input_reads_ref = params['input_reads_ref']
        input_reads_name = None
        try:
            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)  # object_info tuple

            input_reads_obj_info = wsClient.get_object_info_new(
                {'objects': [{
                    'ref': input_reads_ref
                }]})[0]
            input_reads_obj_type = re.sub(
                '-[0-9]+\.[0-9]+$', "",
                input_reads_obj_info[TYPE_I])  # remove trailing version
            input_reads_name = input_reads_obj_info[NAME_I]

        except Exception as e:
            raise ValueError('Unable to get reads object from workspace: (' +
                             input_reads_ref + ')' + str(e))

        accepted_input_types = [
            "KBaseSets.ReadsSet", "KBaseFile.PairedEndLibrary"
        ]
        if input_reads_obj_type not in accepted_input_types:
            raise ValueError("Input reads of type '" + input_reads_obj_type +
                             "' not accepted.  Must be one of " +
                             ", ".join(accepted_input_types))

        if input_reads_obj_type == "KBaseSets.ReadsSet":
            required_param = 'combined_assembly_flag'
            if required_param not in params or params[required_param] == None:
                raise ValueError("Must define required param: '" +
                                 required_param + "'")

        ### STEP 3: get the list of library references
        if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
            readsSet_ref_list = [input_reads_ref]
            readsSet_names_list = [input_reads_name]

        elif input_reads_obj_type == "KBaseSets.ReadsSet":
            readsSet_ref_list = []
            readsSet_names_list = []

            try:
                setAPI_Client = SetAPI(
                    url=self.serviceWizardURL,
                    token=ctx['token'])  # for dynamic service
                #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token'])  # SDK local method
            except Exception as e:
                raise ValueError(
                    "SetAPI FAILURE: Unable to get SetAPI Client from serviceWizard: '"
                    + self.serviceWizardURL + "' token: '" + ctx['token'] +
                    "'" + str(e))
                #raise ValueError("SetAPI FAILURE: Unable to get SetAPI Client as local method callbackURL: '"+self.callbackURL+"' token: '"+ctx['token']+"'" + str(e))

            try:
                input_readsSet_obj = setAPI_Client.get_reads_set_v1({
                    'ref':
                    input_reads_ref,
                    'include_item_info':
                    1
                })
            except Exception as e:
                raise ValueError(
                    'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                    + str(input_reads_ref) + ")\n" + str(e))

            for readsLibrary_obj in input_readsSet_obj['data']['items']:
                readsSet_ref_list.append(readsLibrary_obj['ref'])
                NAME_I = 1
                readsSet_names_list.append(readsLibrary_obj['info'][NAME_I])

        else:
            raise ValueError("Input reads of type '" + input_reads_obj_type +
                             "' not accepted.  Must be one of " +
                             ", ".join(accepted_input_types))

        ### STEP 4: If doing a combined assembly on a ReadsSet, download reads one at a time and combine
        if input_reads_obj_type == "KBaseSets.ReadsSet" and params[
                'combined_assembly_flag'] != 0:

            self.log(
                console,
                "MegaHit_Sets:run_megahit(): CREATING COMBINED INPUT FASTQ FILES"
            )

            # make dir
            timestamp = int(
                (datetime.utcnow() -
                 datetime.utcfromtimestamp(0)).total_seconds() * 1000)
            input_dir = os.path.join(self.scratch, 'input.' + str(timestamp))
            if self.mac_mode:  # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
                input_dir = os.path.join(self.host_scratch,
                                         'input.' + str(timestamp))
            if not os.path.exists(input_dir):
                os.makedirs(input_dir)

            # connect to ReadsUtils Client
            try:
                readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                               token=ctx['token'])  # SDK local
            except:
                raise ValueError("Unable to get readsUtils_Client\n" + str(e))

            # start combined file
            read_buf_size = 65536
            write_buf_size = 65536
            combined_input_fwd_path = os.path.join(input_dir,
                                                   'input_reads_fwd.fastq')
            combined_input_rev_path = os.path.join(input_dir,
                                                   'input_reads_rev.fastq')
            combined_input_fwd_handle = open(combined_input_fwd_path, 'w',
                                             write_buf_size)
            combined_input_rev_handle = open(combined_input_rev_path, 'w',
                                             write_buf_size)

            # add libraries, one at a time
            for this_input_reads_ref in readsSet_ref_list:
                self.log(
                    console,
                    "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: "
                    + str(this_input_reads_ref))
                try:
                    readsLibrary = readsUtils_Client.download_reads({
                        'read_libraries': [this_input_reads_ref],
                        'interleaved':
                        'false'
                    })
                except Exception as e:
                    raise ValueError(
                        'Unable to get reads object from workspace: (' +
                        this_input_reads_ref + ")\n" + str(e))

                this_input_fwd_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['fwd']
                this_input_rev_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['rev']

                # append fwd
                self.log(
                    console,
                    "MegaHit_Sets:run_megahit(): APPENDING FASTQ FILES FOR ReadsSet member: "
                    + str(this_input_reads_ref))
                this_input_path = this_input_fwd_path
                cat_file_handle = combined_input_fwd_handle
                with open(this_input_path, 'r',
                          read_buf_size) as this_input_handle:
                    while True:
                        read_data = this_input_handle.read(read_buf_size)
                        if read_data:
                            cat_file_handle.write(read_data)
                        else:
                            break
                os.remove(
                    this_input_path
                )  # create space since we no longer need the piece file

                # append rev
                this_input_path = this_input_rev_path
                cat_file_handle = combined_input_rev_handle
                with open(this_input_path, 'r',
                          read_buf_size) as this_input_handle:
                    while True:
                        read_data = this_input_handle.read(read_buf_size)
                        if read_data:
                            cat_file_handle.write(read_data)
                        else:
                            break
                os.remove(
                    this_input_path
                )  # create space since we no longer need the piece file

            combined_input_fwd_handle.close()
            combined_input_rev_handle.close()

        ### STEP 5: finally run MegaHit_Sets
        exec_megahit_single_library_params = params
        output_assemblyset_contigset_paths = []
        output_contigset_path = None

        # PairedEndLibrary
        if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
            self.log(
                console,
                "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsLibrary: "
                + str(input_reads_ref))
            try:
                readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                               token=ctx['token'])  # SDK local
                readsLibrary = readsUtils_Client.download_reads({
                    'read_libraries': [input_reads_ref],
                    'interleaved':
                    'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get reads object from workspace: (' +
                    input_reads_ref + ")\n" + str(e))

            input_fwd_path = readsLibrary['files'][input_reads_ref]['files'][
                'fwd']
            input_rev_path = readsLibrary['files'][input_reads_ref]['files'][
                'rev']
            exec_megahit_single_library_params[
                'input_fwd_path'] = input_fwd_path
            exec_megahit_single_library_params[
                'input_rev_path'] = input_rev_path

            # the key line
            output_contigset_path = self.exec_megahit_single_library(
                exec_megahit_single_library_params)
            output_assemblyset_contigset_paths.append(output_contigset_path)

            os.remove(input_fwd_path)  # files can be really big
            os.remove(input_rev_path)

        # ReadsSet combined (already downloaded and combined fastqs)
        elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[
                'combined_assembly_flag'] != 0:

            input_fwd_path = combined_input_fwd_path
            input_rev_path = combined_input_rev_path
            exec_megahit_single_library_params[
                'input_fwd_path'] = input_fwd_path
            exec_megahit_single_library_params[
                'input_rev_path'] = input_rev_path

            # the key line
            output_contigset_path = self.exec_megahit_single_library(
                exec_megahit_single_library_params)
            output_assemblyset_contigset_paths.append(output_contigset_path)

            os.remove(input_fwd_path)  # files can be really big
            os.remove(input_rev_path)

        # ReadsSet uncombined (still have to download)
        elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[
                'combined_assembly_flag'] == 0:
            # connect to ReadsUtils Client
            try:
                readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                               token=ctx['token'])  # SDK local
            except:
                raise ValueError("Unable to get readsUtils_Client\n" + str(e))

            # get libraries, one at a time, and run MegaHit_Sets
            output_assemblyset_contigset_paths = []
            for this_input_reads_ref in readsSet_ref_list:
                self.log(
                    console,
                    "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: "
                    + str(this_input_reads_ref))
                try:
                    readsLibrary = readsUtils_Client.download_reads({
                        'read_libraries': [this_input_reads_ref],
                        'interleaved':
                        'false'
                    })
                except Exception as e:
                    raise ValueError(
                        'Unable to get reads object from workspace: (' +
                        this_input_reads_ref + ")\n" + str(e))

                this_input_fwd_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['fwd']
                this_input_rev_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['rev']
                exec_megahit_single_library_params[
                    'input_fwd_path'] = this_input_fwd_path
                exec_megahit_single_library_params[
                    'input_rev_path'] = this_input_rev_path

                # the key line
                this_output_contigset_path = self.exec_megahit_single_library(
                    exec_megahit_single_library_params)
                output_assemblyset_contigset_paths.append(
                    this_output_contigset_path)

                os.remove(this_input_fwd_path)  # files can be really big
                os.remove(this_input_rev_path)

        # just in case we've confused ourselves
        else:
            raise ValueError("error in logic")

        ### STEP 6: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL,
                                    token=ctx['token'],
                                    service_ver=SERVICE_VER)
        output_contigset_refs = []
        output_contigset_names = []
        for i, this_output_contigset_path in enumerate(
                output_assemblyset_contigset_paths):
            if len(output_assemblyset_contigset_paths) == 1:
                assembly_name = params['output_contigset_name']
            else:
                assembly_name = readsSet_names_list[i] + '-' + params[
                    'output_contigset_name']

            this_output_data_ref = assemblyUtil.save_assembly_from_fasta({
                'file': {
                    'path': this_output_contigset_path
                },
                'workspace_name':
                params['workspace_name'],
                'assembly_name':
                assembly_name
            })

            output_contigset_refs.append(this_output_data_ref)
            output_contigset_names.append(assembly_name)

        ### STEP 7: generate the report text

        # compute a simple contig length distribution for the report
        report = ''
        for i, this_output_contigset_path in enumerate(
                output_assemblyset_contigset_paths):

            report += "MegaHit_Sets run for Read Library: " + readsSet_names_list[
                i] + "\n"
            report += "-------------------------------------------------------------\n"
            report += "\n"
            lengths = []
            for seq_record in SeqIO.parse(this_output_contigset_path, 'fasta'):
                lengths.append(len(seq_record.seq))

                report += 'ContigSet saved to: ' + params[
                    'workspace_name'] + '/' + output_contigset_names[i] + '\n'
                report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
                report += 'Avg Length: ' + str(
                    sum(lengths) / float(len(lengths))) + ' bp.\n'

                bins = 10
                counts, edges = np.histogram(lengths, bins)
                report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
                for c in range(bins):
                    report += '   ' + str(counts[c]) + '\t--\t' + str(
                        edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        ### STEP 8: contruct the output to send back
        output = {
            'report_text': report,
            'output_contigset_refs': output_contigset_refs
        }

        #END exec_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method exec_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
示例#14
0
    def run_megahit(self, ctx, params):
        """
        :param params: instance of type "MegaHitParams" (Run MEGAHIT.  Most
           parameters here are just passed forward to MEGAHIT workspace_name
           - the name of the workspace for input/output read_library_ref -
           the name of the PE read library (SE library support in the future)
           output_contig_set_name - the name of the output contigset
           megahit_parameter_preset - override a group of parameters;
           possible values: meta            '--min-count 2 --k-list
           21,41,61,81,99' (generic metagenomes, default) meta-sensitive 
           '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more
           sensitive but slower) meta-large      '--min-count 2 --k-list
           27,37,47,57,67,77,87' (large & complex metagenomes, like soil)
           bulk            '--min-count 3 --k-list 31,51,71,91,99 --no-mercy'
           (experimental, standard bulk sequencing with >= 30x depth)
           single-cell     '--min-count 3 --k-list 21,33,55,77,99,121
           --merge_level 20,0.96' (experimental, single cell data) min_count
           - minimum multiplicity for filtering (k_min+1)-mers, default 2
           min_k - minimum kmer size (<= 127), must be odd number, default 21
           max_k - maximum kmer size (<= 127), must be odd number, default 99
           k_step - increment of kmer size of each iteration (<= 28), must be
           even number, default 10 k_list - list of kmer size (all must be
           odd, in the range 15-127, increment <= 28); override `--k-min',
           `--k-max' and `--k-step' min_contig_length - minimum length of
           contigs to output, default is 2000 @optional
           megahit_parameter_preset @optional min_count @optional k_min
           @optional k_max @optional k_step @optional k_list @optional
           min_contig_length) -> structure: parameter "workspace_name" of
           String, parameter "read_library_ref" of String, parameter
           "output_contigset_name" of String, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_length" of Long
        :returns: instance of type "MegaHitOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_megahit
        print('Running run_megahit with params=')
        pprint(params)

        # STEP 1: basic parameter checks + parsing
        for required_param in ('workspace_name', 'read_library_ref',
                               'output_contigset_name'):
            if required_param not in params:
                raise ValueError(required_param + ' parameter is required')

        # STEP 2: get the read library as deinterleaved fastq files
        input_ref = params['read_library_ref']
        reads_params = {'read_libraries': [input_ref],
                        'interleaved': 'false',
                        'gzipped': None
                        }
        ru = ReadsUtils(self.callbackURL)
        reads = ru.download_reads(reads_params)['files']

        print('Input reads files:')
        fwd = reads[input_ref]['files']['fwd']
        rev = reads[input_ref]['files']['rev']
        pprint('forward: ' + fwd)
        pprint('reverse: ' + rev)

        # STEP 3: run megahit
        # construct the command; we only support PE reads, so -1/-2 are fixed
        megahit_cmd = [self.MEGAHIT, '-1', fwd, '-2', rev]

        # if a preset is defined, use that (it overrides a group of parameters)
        if params.get('megahit_parameter_preset'):
            megahit_cmd += ['--presets', params['megahit_parameter_preset']]

        # optional scalar parameters are forwarded only when set and truthy
        if params.get('min_count'):
            megahit_cmd += ['--min-count', str(params['min_count'])]
        if params.get('k_min'):
            megahit_cmd += ['--k-min', str(params['k_min'])]
        if params.get('k_max'):
            megahit_cmd += ['--k-max', str(params['k_max'])]
        if params.get('k_step'):
            megahit_cmd += ['--k-step', str(params['k_step'])]
        if params.get('k_list'):
            # k_list overrides --k-min / --k-max / --k-step in MEGAHIT
            megahit_cmd += ['--k-list',
                            ','.join(str(k_val) for k_val in params['k_list'])]

        min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
        if params.get('min_contig_length'):
            if str(params['min_contig_length']).isdigit():
                min_contig_length = params['min_contig_length']
            else:
                raise ValueError('min_contig_length parameter must be a non-negative integer')

        megahit_cmd += ['--min-contig-len', str(min_contig_length)]

        # set the output location (unique per run via a millisecond timestamp)
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        megahit_cmd += ['-o', output_dir]

        # run megahit
        print('running megahit:')
        print('    ' + ' '.join(megahit_cmd))
        p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
        retcode = p.wait()

        print('Return code: ' + str(retcode))
        if p.returncode != 0:
            raise ValueError('Error running MEGAHIT, return code: ' +
                             str(retcode) + '\n')

        output_contigs = os.path.join(output_dir, 'final.contigs.fa')

        # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
        if self.mac_mode:
            shutil.move(output_contigs, os.path.join(self.host_scratch, 'final.contigs.fa'))
            output_contigs = os.path.join(self.host_scratch, 'final.contigs.fa')

        # STEP 4: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL)
        output_data_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': output_contigs},
            'workspace_name': params['workspace_name'],
            'assembly_name': params['output_contigset_name']
        })

        # STEP 5: generate and save the report

        # compute a simple contig length distribution for the report
        lengths = [len(seq_record.seq)
                   for seq_record in SeqIO.parse(output_contigs, 'fasta')]

        report = ''
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        # BUGFIX: guard the statistics against an empty assembly — with zero
        # contigs the average would raise ZeroDivisionError
        if lengths:
            report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

            bins = 10
            counts, edges = np.histogram(lengths, bins)
            report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
            for c in range(bins):
                report += '   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                                 'label': params['output_contigset_name']}]})
        except QUASTError as qe:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from running QUAST')
            print(str(qe))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report(
                {'message': report,
                 'objects_created': [{'ref': output_data_ref, 'description': 'Assembled contigs'}],
                 'direct_html_link_index': 0,
                 'html_links': [{'shock_id': quastret['shock_id'],
                                 'name': 'report.html',
                                 'label': 'QUAST report'}
                                ],
                 'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
                 'workspace_name': params['workspace_name']
                 })
        except _RepError as rep_err:  # renamed from 're': don't shadow the re module
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from creating report object')
            print(str(rep_err))
            # TODO delete shock node
            raise

        # STEP 6: contruct the output to send back
        output = {'report_name': report_info['name'], 'report_ref': report_info['ref']}

        #END run_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
示例#15
0
    def runFastQC(self, ctx, input_params):
        """
        :param input_params: instance of type "FastQCParams" -> structure:
           parameter "input_ws" of String, parameter "input_file" of String,
           parameter "input_file_ref" of String
        :returns: instance of type "FastQCOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: reported_output
        #BEGIN runFastQC

        token = ctx['token']
        wsClient = workspaceService(self.workspaceURL, token=token)
        # unique scratch subdirectory for this run's downloaded reads + output
        uuid_string = str(uuid.uuid4())
        read_file_path = self.scratch + "/" + uuid_string
        os.mkdir(read_file_path)

        input_file_ref = self._get_input_file_ref_from_params(input_params)

        # fetch the object so we can dispatch on its workspace type string
        library = None
        try:
            library = wsClient.get_objects2(
                {'objects': [{
                    'ref': input_file_ref
                }]})['data'][0]
        except Exception as e:
            raise ValueError(
                'Unable to get read library object from workspace: (' +
                input_file_ref + ')' + str(e))

        # build the list of library refs to download: a single library, or
        # every sample in a SampleSet
        download_read_params = {'read_libraries': [], 'interleaved': "false"}
        if ("SingleEnd" in library['info'][2]
                or "PairedEnd" in library['info'][2]):
            # info[7] is the workspace name, info[1] the object name
            download_read_params['read_libraries'].append(library['info'][7] +
                                                          "/" +
                                                          library['info'][1])
        elif ("SampleSet" in library['info'][2]):
            for sample_id in library['data']['sample_ids']:
                download_read_params['read_libraries'].append(
                    library['info'][7] + "/" + sample_id)

        ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
        ret = ru.download_reads(download_read_params)

        # move each downloaded fastq into our run directory, stripping any
        # .gz from the filename (ReadsUtils decompresses on download)
        read_file_list = list()
        for lib_key in ret['files']:
            files = ret['files'][lib_key]['files']

            fwd_name = files['fwd'].split('/')[-1]
            fwd_name = fwd_name.replace('.gz', '')
            shutil.move(files['fwd'], os.path.join(read_file_path, fwd_name))
            read_file_list.append(os.path.join(read_file_path, fwd_name))

            # ROBUSTNESS: single-end downloads may omit the 'rev' key
            # entirely, so use .get() rather than indexing
            if files.get('rev') is not None:
                rev_name = files['rev'].split('/')[-1]
                rev_name = rev_name.replace('.gz', '')
                shutil.move(files['rev'], os.path.join(read_file_path,
                                                       rev_name))
                read_file_list.append(os.path.join(read_file_path, rev_name))

        # run fastqc on every staged file; reports land in read_file_path
        subprocess.check_output(["fastqc"] + read_file_list)
        # kept for the commented-out 'message' field in report_params below
        report = "Command run: " + " ".join(["fastqc"] + read_file_list)

        # assemble an index.html that pages through the per-file fastqc reports
        output_html_files = list()
        output_zip_files = list()
        first_file = ""
        html_string = ""
        html_count = 0
        with open('/kb/data/index_start.txt', 'r') as start_file:
            html_string = start_file.read()

        for fname in os.listdir(read_file_path):
            # label is everything after the first '.' in the filename
            label = ".".join(fname.split(".")[1:])
            if (fname.endswith(".zip")):
                output_zip_files.append({
                    'path':
                    read_file_path + "/" + fname,
                    'name':
                    fname,
                    'label':
                    label,
                    'description':
                    'Zip file generated by fastqc that contains original images seen in the report'
                })
            if (fname.endswith(".html")):
                if (first_file == ""):
                    first_file = fname
                output_html_files.append({
                    'path':
                    read_file_path + "/" + fname,
                    'name':
                    fname,
                    'label':
                    label,
                    'description':
                    'HTML file generated by fastqc that contains report on quality of reads'
                })
                html_string += "            <button data-button=\"page " + str(
                    html_count) + "\" data-page=\"" + fname + "\">Page " + str(
                        html_count + 1) + "</button>\n"
                html_count += 1

        html_string += "        </div>    </div>    <div id=\"body\">\n        <iframe id=\"content\" style=\"width: 100%; border: none; \" src=\"" + first_file + "\"></iframe>\n    </div>"

        with open('/kb/data/index_end.txt', 'r') as end_file:
            html_string += end_file.read()

        with open(read_file_path + "/index.html", 'w') as index_file:
            index_file.write(html_string)

        # index.html is appended last, at position html_count, which is the
        # same index used for direct_html_link_index below
        output_html_files.append({
            'path':
            read_file_path + "/index.html",
            'name':
            "index.html",
            'label':
            "index.html",
            'description':
            'HTML file generated by fastqc that contains report on quality of reads'
        })

        report_params = {
            'objects_created': [],
            #                          'message' : report,
            #                          'direct_html' : html_string,
            'direct_html_link_index': html_count,
            'file_links': output_zip_files,
            'html_links': output_html_files,
            'workspace_name': input_params['input_ws'],
            'report_object_name': 'kb_fastqc_report_' + uuid_string
        }
        kbase_report_client = KBaseReport(self.callback_url, token=token)
        output = kbase_report_client.create_extended_report(report_params)
        reported_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        #Remove temp reads directory
        shutil.rmtree(read_file_path, ignore_errors=True)

        #END runFastQC

        # At some point might do deeper type checking...
        if not isinstance(reported_output, dict):
            raise ValueError('Method runFastQC return value ' +
                             'reported_output is not type dict as required.')
        # return the results
        return [reported_output]
示例#16
0
class DataStagingUtils(object):
    def __init__(self, config, ctx):
        """Capture config, ensure the scratch dir exists, and build clients.

        Raises ValueError when either the ReadsUtils or the SetAPI client
        cannot be instantiated.
        """
        self.ctx = ctx
        self.scratch = os.path.abspath(config['scratch'])
        self.ws_url = config['workspace-url']
        self.serviceWizardURL = config['srv-wiz-url']
        self.callbackURL = config['SDK_CALLBACK_URL']
        if not os.path.exists(self.scratch):
            os.makedirs(self.scratch)

        # markers used elsewhere in this class to tag library kinds
        self.SE_flag = 'SE'
        self.PE_flag = 'PE'

        service_version = 'release'

        # ReadsUtils runs as an SDK-local service reachable via the callback URL
        try:
            self.readsUtils_Client = ReadsUtils(self.callbackURL,
                                                token=self.ctx['token'],
                                                service_ver=service_version)
        except Exception as err:
            raise ValueError(
                'Unable to instantiate readsUtils_Client with callbackURL: ' +
                self.callbackURL + ' ERROR: ' + str(err))

        # SetAPI must go through the service wizard; the SDK-local route
        # does not work for SetAPI
        try:
            self.setAPI_Client = SetAPI(url=self.serviceWizardURL,
                                        token=self.ctx['token'])
        except Exception as err:
            raise ValueError(
                'Unable to instantiate setAPI_Client with serviceWizardURL: ' +
                self.serviceWizardURL + ' ERROR: ' + str(err))

    def expand_input(self, input_refs):
        '''
        Expand input based on an input data reference for Kaiju

        input_refs can be a list of references to a PairedEndLibrary, a SingleEndLibrary, or a ReadsSet

        Returns a non-redundant list of dicts with 'ref', 'name' and 'type'
        ('PE' or 'SE'); ReadsSets are expanded into their member libraries.
        Raises ValueError on unsupported object types.
        '''
        # expand any sets and build a non-redundant list of reads input objs
        ws = Workspace(self.ws_url)
        expanded_input = []
        input_ref_seen = set()  # dedupe refs that appear more than once
        SE_types = [
            'KBaseFile.SingleEndLibrary', 'KBaseAssembly.SingleEndLibrary'
        ]
        PE_types = [
            'KBaseFile.PairedEndLibrary', 'KBaseAssembly.PairedEndLibrary'
        ]

        # indices into a workspace object_info tuple
        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)
        for input_ref in input_refs:
            input_info = ws.get_object_info3({'objects': [{
                'ref': input_ref
            }]})['infos'][0]
            obj_name = input_info[NAME_I]
            # type string is "Module.Type-X.Y"; drop the version suffix
            type_name = input_info[TYPE_I].split('-')[0]

            # ReadsSet: expand into its member libraries
            if type_name in ['KBaseSets.ReadsSet']:
                try:
                    input_readsSet_obj = self.setAPI_Client.get_reads_set_v1({
                        'ref':
                        input_ref,
                        'include_item_info':
                        1
                    })

                except Exception as e:
                    raise ValueError(
                        'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                        + str(input_ref) + ")\n" + str(e))

                for readsLibrary_obj in input_readsSet_obj['data']['items']:
                    this_reads_ref = readsLibrary_obj['ref']
                    if this_reads_ref in input_ref_seen:
                        continue
                    input_ref_seen.add(this_reads_ref)

                    this_reads_name = readsLibrary_obj['info'][NAME_I]
                    # BUGFIX: raw string — '\.' in a plain string is an
                    # invalid escape (SyntaxWarning on Python >= 3.12)
                    reads_item_type = re.sub(
                        r'-[0-9]+\.[0-9]+$', "",
                        readsLibrary_obj['info'][TYPE_I])  # remove trailing version
                    if reads_item_type in PE_types:
                        this_reads_type = self.PE_flag
                    elif reads_item_type in SE_types:
                        this_reads_type = self.SE_flag
                    else:
                        raise ValueError("Can't handle read item type '" +
                                         reads_item_type + "' obj_name: '" +
                                         this_reads_name + " in Set: '" +
                                         str(input_ref) + "'")
                    expanded_input.append({
                        'ref': this_reads_ref,
                        'name': this_reads_name,
                        'type': this_reads_type
                    })
            # SingleEnd or PairedEnd Library: identical handling except for
            # the type flag, so the two branches are merged
            elif type_name in SE_types or type_name in PE_types:
                if input_ref in input_ref_seen:
                    continue
                input_ref_seen.add(input_ref)
                expanded_input.append({
                    'ref': input_ref,
                    'name': obj_name,
                    'type': (self.SE_flag if type_name in SE_types
                             else self.PE_flag)
                })
            else:
                raise ValueError("Illegal type in input_refs: " +
                                 str(obj_name) + " (" + str(input_ref) +
                                 ") is of type: '" + str(type_name) + "'")

        return expanded_input

    def stage_input(self,
                    input_item=None,
                    subsample_percent=10,
                    subsample_replicates=1,
                    subsample_seed=1,
                    fasta_file_extension='fastq'):
        '''
        Stage input based on an input data reference for Kaiju

        input_item describes a single PairedEndLibrary or SingleEndLibrary.

        This method creates a directory in the scratch area holding the
        downloaded Fasta/Fastq files; file names get the fasta_file_extension
        parameter tacked on.

            ex:

            staged_input = stage_input({'ref':<ref>,'name':<name>,'type':<type>}, subsample_percent, subsample_replicates, subsample_seed, 'fastq')

            staged_input
            {"replicate_input": [...]}

        :param input_item: dict with 'ref', 'name', and 'type' keys; 'type'
            must be self.PE_flag or self.SE_flag.
        :param subsample_percent: percent of reads per replicate; 100 means
            no subsampling is performed.
        :param subsample_replicates: number of random replicates to produce.
        :param subsample_seed: seed for reproducible subsampling.
        :param fasta_file_extension: extension for the staged read files.
        :returns: dict with key 'replicate_input' -> list of input_item-style
            dicts carrying 'fwd_file' (and 'rev_file' for paired-end) paths.
        :raises ValueError: if the download fails, a staged file is missing
            or empty, or the input type is unrecognized.
        '''
        staged_input = dict()

        # generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'input_reads_' + suffix)
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)

        min_fasta_len = 1  # staged files must hold at least one base

        #
        # Download reads
        #

        # Paired End Lib: download fwd and rev as separate (deinterleaved) files
        if input_item['type'] == self.PE_flag:
            try:
                readsLibrary = self.readsUtils_Client.download_reads({
                    'read_libraries': [input_item['ref']],
                    'interleaved': 'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get read library object from workspace: (' +
                    str(input_item['ref']) + ")\n" + str(e))

            input_fwd_file_path = readsLibrary['files'][
                input_item['ref']]['files']['fwd']
            input_rev_file_path = readsLibrary['files'][
                input_item['ref']]['files']['rev']
            fwd_filename = os.path.join(
                input_dir, input_item['name'] + '.fwd.' + fasta_file_extension)
            rev_filename = os.path.join(
                input_dir, input_item['name'] + '.rev.' + fasta_file_extension)
            # move into the staging dir under predictable names
            if input_fwd_file_path != fwd_filename:
                shutil.move(input_fwd_file_path, fwd_filename)
            if input_rev_file_path != rev_filename:
                shutil.move(input_rev_file_path, rev_filename)
            input_item['fwd_file'] = fwd_filename
            input_item['rev_file'] = rev_filename

            # sanity checks: both files exist, then both are non-empty
            for filename in (fwd_filename, rev_filename):
                if not os.path.isfile(filename):
                    raise ValueError('Error generating reads file ' + filename)
            for filename in (fwd_filename, rev_filename):
                if not self._fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError('Reads Library is empty in filename: ' +
                                     str(filename))

        # Single End Lib: only a fwd file
        elif input_item['type'] == self.SE_flag:
            try:
                readsLibrary = self.readsUtils_Client.download_reads(
                    {'read_libraries': [input_item['ref']]})
            except Exception as e:
                raise ValueError(
                    'Unable to get read library object from workspace: (' +
                    str(input_item['ref']) + ")\n" + str(e))

            input_fwd_file_path = readsLibrary['files'][
                input_item['ref']]['files']['fwd']
            fwd_filename = os.path.join(
                input_dir, input_item['name'] + '.fwd.' + fasta_file_extension)
            if input_fwd_file_path != fwd_filename:
                shutil.move(input_fwd_file_path, fwd_filename)
            input_item['fwd_file'] = fwd_filename

            if not os.path.isfile(fwd_filename):
                raise ValueError('Error generating reads file ' + fwd_filename)
            if not self._fasta_seq_len_at_least(fwd_filename, min_fasta_len):
                raise ValueError('Reads Library is empty in filename: ' +
                                 str(fwd_filename))

        else:
            raise ValueError("No type set for input library " +
                             str(input_item['name']) + " (" +
                             str(input_item['ref']) + ")")

        #
        # Subsample
        #

        if subsample_percent == 100:
            # no subsampling: the staged library is the single "replicate"
            replicate_input = [input_item]
        else:
            replicate_input = self._randomly_subsample_reads(
                input_item,
                subsample_percent=subsample_percent,
                subsample_replicates=subsample_replicates,
                subsample_seed=subsample_seed)
            # free up disk: the full staged files are no longer needed
            os.remove(input_item['fwd_file'])
            if input_item['type'] == self.PE_flag:
                os.remove(input_item['rev_file'])

        # return input file info (input_dir/suffix intentionally not returned)
        staged_input['replicate_input'] = replicate_input
        return staged_input

    def _randomly_subsample_reads(self,
                                  input_item=None,
                                  subsample_percent=100,
                                  subsample_replicates=1,
                                  subsample_seed=1):
        '''
        Randomly split a staged reads library into subsample_replicates
        disjoint subsamples, each holding subsample_percent of the reads.

        Streams the FASTQ file(s) referenced by input_item ('fwd_file', plus
        'rev_file' for paired-end), writes one output FASTQ (pair) per
        replicate next to the inputs, and returns a list of input_item-style
        dicts whose 'fwd_file'/'rev_file' point at the replicate files.  The
        'ref' in each returned dict is the ref of the source library; the
        subsamples themselves are not saved as workspace objects.

        :param input_item: dict with 'ref', 'name', 'type', 'fwd_file'
            (and 'rev_file' for paired-end) keys.
        :param subsample_percent: percent of the total reads per replicate;
            replicates are drawn without replacement, so
            subsample_percent * subsample_replicates must not exceed 100.
        :param subsample_replicates: number of replicate files to produce.
        :param subsample_seed: random seed, for reproducible subsampling.
        :returns: list of replicate input_item-style dicts.
        :raises ValueError: for malformed FASTQ records, over-large subsample
            requests, or missing/empty output files.
        '''

        replicate_files = []
        split_num = subsample_replicates

        # for now can only do percentage instead of raw cnt of reads per subsample
        use_reads_num = False
        use_reads_perc = True
        reads_num = 0  # not used.  subsample_percent used instead

        # seed the randomizer so subsampling is reproducible
        random.seed(subsample_seed)

        # Paired End
        #
        if input_item['type'] == self.PE_flag:
            print("SUBSAMPLING PE library " + input_item['name'])  # DEBUG

            # output path bases: input paths with the .fastq extension removed
            input_fwd_path = re.sub("\.fastq$", "", input_item['fwd_file'])
            input_fwd_path = re.sub("\.FASTQ$", "", input_fwd_path)
            input_rev_path = re.sub("\.fastq$", "", input_item['rev_file'])
            input_rev_path = re.sub("\.FASTQ$", "", input_rev_path)
            output_fwd_paired_file_path_base = input_fwd_path + "_fwd_paired"
            output_rev_paired_file_path_base = input_rev_path + "_rev_paired"

            # set up for file io
            total_paired_reads = 0
            total_unpaired_fwd_reads = 0
            total_unpaired_rev_reads = 0
            total_paired_reads_by_set = []
            fwd_ids = dict()
            paired_ids = dict()
            paired_ids_list = []
            paired_lib_i = dict()  # read_id -> replicate index it was dealt to
            paired_buf_size = 100000
            recs_beep_n = 1000000  # progress-print interval

            # Pass 1a: scan the fwd file to collect read ids.  FASTQ records
            # are 4 lines; rec_line_i tracks the position within a record and
            # the header (line 0, must start with '@') carries the id.
            # NOTE(review): open(path, 'r', 0) asks for unbuffered *text*
            # mode, which is Python-2-only (ValueError on Python 3) — confirm
            # the target interpreter.
            print("GETTING IDS")  # DEBUG
            with open(input_item['fwd_file'], 'r',
                      0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        # normalize the id: drop the trailing description and
                        # any pair suffix such as "/1", ".2", "_R1", ":f"
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        fwd_ids[read_id] = True

            # Pass 1b: scan the rev file; ids present in both files are paired
            print("DETERMINING PAIRED IDS")  # DEBUG
            with open(input_item['rev_file'], 'r',
                      0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        # NOTE(review): direct indexing raises KeyError for a
                        # rev read with no fwd mate — presumably inputs are
                        # always mated; confirm upstream guarantees this.
                        if fwd_ids[read_id]:
                            paired_ids[read_id] = True
                            paired_ids_list.append(read_id)

            total_paired_reads = len(paired_ids_list)
            print("TOTAL PAIRED READS CNT: " +
                  str(total_paired_reads))  # DEBUG

            # Determine sublibrary sizes (reads_per_lib per replicate)
            if use_reads_num:
                reads_per_lib = reads_num
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_num <= total_paired_reads_cnt / split_num.  You have reads_num:"
                        + str(reads_num) + " > total_paired_reads_cnt:" +
                        str(total_paired_reads) + " / split_num:" +
                        str(split_num) + ".  Instead try reads_num <= " +
                        str(total_paired_reads // split_num))
            elif use_reads_perc:
                reads_per_lib = int(
                    (subsample_percent / 100.0) * total_paired_reads)
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_perc <= 1 / split_num.  You have reads_perc:"
                        + str(subsample_percent) + " > 1 / split_num:" +
                        str(split_num) + ".  Instead try reads_perc <= " +
                        str(int(100 * 1 / split_num)))
            else:
                raise ValueError(
                    "error in logic reads_num vs. reads_perc logic")

            # Draw reads_per_lib * split_num ids without replacement and deal
            # them round-robin into the split_num sublibraries
            print("GETTING RANDOM SUBSAMPLES")  # DEBUG
            for i, read_id in enumerate(
                    random.sample(paired_ids_list, reads_per_lib * split_num)):
                lib_i = i % split_num
                paired_lib_i[read_id] = lib_i

            # Pass 2a: stream the fwd file again, copying each sampled
            # record (all 4 lines) into its sublibrary's output file
            print("WRITING FWD SPLIT PAIRED")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_fwd_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))
                total_paired_reads_by_set.append(0)

            rec_buf = []  # accumulates the 4 lines of the current record
            last_read_id = None
            paired_cnt = 0
            capture_type_paired = False  # True when current record is sampled

            with open(input_item['fwd_file'], 'r',
                      0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        # a new header means the previous record is complete:
                        # flush it to its sublibrary, or drop it if unsampled
                        if last_read_id != None:
                            if capture_type_paired:
                                lib_i = paired_lib_i[last_read_id]
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                                total_paired_reads_by_set[lib_i] += 1
                                if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                    print("\t" + str(paired_cnt) +
                                          " recs processed")
                            else:
                                # unsampled/unpaired record: discarded
                                pass
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        last_read_id = read_id
                        # membership probe via exception (EAFP); 'found' unused
                        try:
                            found = paired_lib_i[read_id]
                            capture_type_paired = True
                        except:
                            total_unpaired_fwd_reads += 1
                            capture_type_paired = False
                    rec_buf.append(line)
                # flush the final record
                # NOTE(review): this flush path does not increment
                # total_paired_reads_by_set, so the summary undercounts the
                # last fwd record by one — confirm whether intended.
                if len(rec_buf) > 0:
                    if capture_type_paired:
                        lib_i = paired_lib_i[last_read_id]
                        paired_output_reads_file_handles[lib_i].writelines(
                            rec_buf)
                        paired_cnt += 1
                        if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                            print("\t" + str(paired_cnt) + " recs processed")
                    else:
                        # unsampled/unpaired record: discarded
                        pass
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()

            print("\t" + str(paired_cnt) + " FWD recs processed")

            # Pass 2b: same split for the rev file, reusing paired_lib_i so
            # mates land in the same sublibrary
            print("WRITING REV SPLIT PAIRED")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_rev_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))

            rec_buf = []
            last_read_id = None
            paired_cnt = 0
            capture_type_paired = False

            with open(input_item['rev_file'], 'r',
                      0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        if last_read_id != None:
                            if capture_type_paired:
                                lib_i = paired_lib_i[last_read_id]
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                                if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                    print("\t" + str(paired_cnt) +
                                          " recs processed")
                            else:
                                # unsampled/unpaired record: discarded
                                pass
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        last_read_id = read_id
                        try:
                            found = paired_lib_i[read_id]
                            capture_type_paired = True
                        except:
                            total_unpaired_rev_reads += 1
                            capture_type_paired = False
                    rec_buf.append(line)
                # flush the final record
                if len(rec_buf) > 0:
                    if capture_type_paired:
                        lib_i = paired_lib_i[last_read_id]
                        paired_output_reads_file_handles[lib_i].writelines(
                            rec_buf)
                        paired_cnt += 1
                        if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                            print("\t" + str(paired_cnt) + " recs processed")
                    else:
                        # unsampled/unpaired record: discarded
                        pass
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()

            print("\t" + str(paired_cnt) + " REV recs processed")

            # summary
            report = 'SUMMARY FOR SUBSAMPLE OF READ LIBRARY: ' + input_item[
                'name'] + "\n"
            report += "TOTAL PAIRED READS: " + str(total_paired_reads) + "\n"
            report += "TOTAL UNPAIRED FWD READS (discarded): " + str(
                total_unpaired_fwd_reads) + "\n"
            report += "TOTAL UNPAIRED REV READS (discarded): " + str(
                total_unpaired_rev_reads) + "\n"
            report += "\n"
            for lib_i in range(split_num):
                report += "PAIRED READS IN SET " + str(lib_i) + ": " + str(
                    total_paired_reads_by_set[lib_i]) + "\n"
            print(report)

            # build the replicate descriptors to return, validating that each
            # output pair exists and is non-empty
            print("MAKING REPLICATE OBJECT")  # DEBUG
            paired_obj_refs = []  # NOTE(review): never populated/used here
            for lib_i in range(split_num):
                output_fwd_paired_file_path = output_fwd_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                output_rev_paired_file_path = output_rev_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                if not os.path.isfile (output_fwd_paired_file_path) \
                     or os.path.getsize (output_fwd_paired_file_path) == 0 \
                   or not os.path.isfile (output_rev_paired_file_path) \
                     or os.path.getsize (output_rev_paired_file_path) == 0:

                    raise ValueError("failed to create paired output")
                else:
                    # zero-pad replicate numbers so names sort lexically
                    zero_pad = '0' * (len(str(split_num)) -
                                      len(str(lib_i + 1)))
                    replicate_files.append({
                        'fwd_file':
                        output_fwd_paired_file_path,
                        'rev_file':
                        output_rev_paired_file_path,
                        'ref':
                        input_item[
                            'ref'],  # note: this is for the src, not the subsample which is not saved
                        'type':
                        input_item['type'],
                        'name':
                        input_item['name'] + '-' + zero_pad + str(lib_i + 1)
                    })

        # SingleEndLibrary
        #
        elif input_item['type'] == self.SE_flag:
            print("SUBSAMPLING SE library " + input_item['name'])

            # output path base: input path with the .fastq extension removed
            input_fwd_path = re.sub("\.fastq$", "", input_item['fwd_file'])
            input_fwd_path = re.sub("\.FASTQ$", "", input_fwd_path)
            output_fwd_paired_file_path_base = input_fwd_path + "_fwd_paired"

            # Pass 1: collect all read ids ("paired" naming kept for symmetry
            # with the PE branch; SE reads have no mates)
            print("DETERMINING IDS")  # DEBUG
            paired_ids = dict()
            paired_ids_list = []
            paired_lib_i = dict()
            paired_buf_size = 100000
            recs_beep_n = 100000

            with open(input_item['fwd_file'], 'r',
                      0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        # SE ids must be unique after suffix stripping
                        if read_id in paired_ids:
                            raise ValueError("repeat read_id: " + read_id)
                        paired_ids[read_id] = True
                        paired_ids_list.append(read_id)

            total_paired_reads = len(paired_ids_list)
            print("TOTAL READS CNT: " + str(total_paired_reads))  # DEBUG

            # Determine sublibrary sizes (reads_per_lib per replicate)
            if use_reads_num:
                reads_per_lib = reads_num
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_num <= total_paired_reads_cnt / split_num.  You have reads_num:"
                        + str(reads_num) + " > total_paired_reads_cnt:" +
                        str(total_paired_reads) + " / split_num:" +
                        str(split_num) + ".  Instead try reads_num <= " +
                        str(total_paired_reads // split_num))
            elif use_reads_perc:
                reads_per_lib = int(
                    (subsample_percent / 100.0) * total_paired_reads)
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_perc <= 1 / split_num.  You have reads_perc:"
                        + str(subsample_percent) + " > 1 / split_num:" +
                        str(split_num) + ".  Instead try reads_perc <= " +
                        str(int(100 * 1 / split_num)))
            else:
                raise ValueError(
                    "error in logic reads_num vs. reads_perc logic")

            # Deal a random sample of ids round-robin into the sublibraries
            print("GETTING RANDOM SUBSAMPLES")  # DEBUG
            for i, read_id in enumerate(
                    random.sample(paired_ids_list, reads_per_lib * split_num)):
                lib_i = i % split_num
                paired_lib_i[read_id] = lib_i

            # set up for file io (total_paired_reads recounted in pass 2)
            total_paired_reads = 0
            total_paired_reads_by_set = []
            paired_buf_size = 1000000

            # Pass 2: stream the file again, copying each sampled record
            print("WRITING SPLIT SINGLE END READS")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_fwd_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))
                total_paired_reads_by_set.append(0)

            rec_buf = []  # accumulates the 4 lines of the current record
            last_read_id = None
            paired_cnt = 0
            recs_beep_n = 1000000
            with open(input_item['fwd_file'], 'r',
                      0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        total_paired_reads += 1
                        # new header: flush the previous record if it was
                        # sampled (KeyError on the dict probe means unsampled)
                        if last_read_id != None:
                            try:
                                lib_i = paired_lib_i[last_read_id]
                                total_paired_reads_by_set[lib_i] += 1
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                            except:
                                pass
                            if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                print("\t" + str(paired_cnt) +
                                      " recs processed")
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        last_read_id = read_id
                    rec_buf.append(line)
                # flush the final record
                if len(rec_buf) > 0:
                    if last_read_id != None:
                        try:
                            lib_i = paired_lib_i[last_read_id]
                            total_paired_reads_by_set[lib_i] += 1
                            paired_output_reads_file_handles[lib_i].writelines(
                                rec_buf)
                            paired_cnt += 1
                        except:
                            pass
                    if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                        print("\t" + str(paired_cnt) + " recs processed")
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()

            # summary
            report = 'SUMMARY FOR SUBSAMPLE OF READ LIBRARY: ' + input_item[
                'name'] + "\n"
            report += "TOTAL READS: " + str(total_paired_reads) + "\n"
            for lib_i in range(split_num):
                report += "SINGLE END READS IN SET " + str(lib_i) + ": " + str(
                    total_paired_reads_by_set[lib_i]) + "\n"
            print(report)

            # build the replicate descriptors to return, validating that each
            # output file exists and is non-empty
            print("MAKING REPLICATE OBJECTS")  # DEBUG
            paired_obj_refs = []  # NOTE(review): never populated/used here
            for lib_i in range(split_num):
                output_fwd_paired_file_path = output_fwd_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                if not os.path.isfile (output_fwd_paired_file_path) \
                     or os.path.getsize (output_fwd_paired_file_path) == 0:

                    raise ValueError("failed to create paired output")
                else:
                    # zero-pad replicate numbers so names sort lexically
                    zero_pad = '0' * (len(str(split_num)) -
                                      len(str(lib_i + 1)))
                    replicate_files.append({
                        'fwd_file':
                        output_fwd_paired_file_path,
                        'ref':
                        input_item[
                            'ref'],  # note: this is for the src, not the subsample which is not saved
                        'type':
                        input_item['type'],
                        'name':
                        input_item['name'] + '-' + zero_pad + str(lib_i + 1)
                    })

        else:
            raise ValueError("unknown ReadLibrary type:" +
                             str(input_item['type']) + " for readslibrary: " +
                             input_item['name'])

        return replicate_files

    def _fasta_seq_len_at_least(self, fasta_path, min_fasta_len=1):
        '''
        counts the number of non-header, non-whitespace characters in a FASTA file
        '''
        seq_len = 0
        with open(fasta_path, 'r', 0) as fasta_handle:
            for line in fasta_handle:
                line = line.strip()
                if line.startswith('>'):
                    continue
                line = line.replace(' ', '')
                seq_len += len(line)
                if seq_len >= min_fasta_len:
                    return True
        return False
示例#17
0
    def fastqutils_stats(self, ctx, params):
        """
        Run `fastqutils stats` over each file of a reads library and return
        the combined output as a KBaseReport.

        :param params: instance of type "FastqUtilsStatsParams" -> structure:
           parameter "workspace_name" of type "workspace_name" (A string
           representing a workspace name.), parameter "read_library_ref" of
           type "read_library_ref" (A string representing a ContigSet id.)
        :returns: instance of type "FastqUtilsStatsResult" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN fastqutils_stats

        print('Running fastqutils_stats with params=')
        print(pformat(params))

        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'read_library_ref' not in params:
            raise ValueError('read_library_ref parameter is required')

        # Get the read library as deinterleaved fastq files
        input_ref = params['read_library_ref']
        reads_params = {'read_libraries': [input_ref],
                        'interleaved': 'false',
                        'gzipped': None
                        }
        ru = ReadsUtils(self.callbackURL, token=ctx['token'])
        reads = ru.download_reads(reads_params)['files']
        files = [reads[input_ref]['files']['fwd']]
        if reads[input_ref]['files']['rev']:
            files.append(reads[input_ref]['files']['rev'])
        print('running on files:')
        for f in files:
            print(f)

        # base command; each file gets its own argv built from this
        stats_cmd = [self.FASTQUTILS, 'stats']

        report = ''
        for f in files:
            # BUGFIX: the original aliased stats_cmd (cmd = stats_cmd) and
            # appended to it, so the second file's run also carried the first
            # file as an extra argument.  Build a fresh list per file.
            cmd = stats_cmd + [f]

            report += '============== ' + f + ' ==============\n'
            print('running: ' + ' '.join(cmd))
            p = subprocess.Popen(cmd,
                                 cwd=self.scratch,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 shell=False)

            # stream tool output line by line into the report
            # NOTE(review): p.stdout yields bytes on Python 3, which would
            # break the += below — confirm target interpreter / add decoding.
            while True:
                line = p.stdout.readline()
                if not line:
                    break
                report += line
                print(line.replace('\n', ''))

            p.stdout.close()
            p.wait()
            report += "\n\n"
            print('return code: ' + str(p.returncode))
            if p.returncode != 0:
                raise ValueError('Error running ' + self.FASTQUTILS + ', return code: ' + str(p.returncode))

        reportObj = {
            'objects_created': [],
            'text_message': report
        }
        # use a distinct name for the report client so it doesn't shadow the
        # accumulated report text above
        report_client = KBaseReport(self.callbackURL)
        report_info = report_client.create({'report': reportObj, 'workspace_name': params['workspace_name']})
        returnVal = {'report_name': report_info['name'], 'report_ref': report_info['ref']}

        #END fastqutils_stats

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method fastqutils_stats return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
示例#18
0
    def run_SPAdes(self, ctx, params):
        """
        Run SPAdes on paired end libraries
        :param params: instance of type "SPAdesParams" (Input parameters for
           running SPAdes. workspace_name - the name of the workspace from
           which to take input and store output. output_contigset_name - the
           name of the output contigset list<paired_end_lib> read_libraries -
           Illumina PairedEndLibrary files to assemble. dna_source -
           (optional) the source of the DNA used for sequencing
           'single_cell': DNA amplified from a single cell via MDA anything
           else: Standard DNA sample from multiple cells. Default value is
           None. min_contig_length - (optional) integer to filter out contigs
           with length < min_contig_length from the SPAdes output. Default
           value is 0 implying no filter.) -> structure: parameter
           "workspace_name" of String, parameter "output_contigset_name" of
           String, parameter "read_libraries" of list of type
           "paired_end_lib" (The workspace object name of a PairedEndLibrary
           file, whether of the KBaseAssembly or KBaseFile type.), parameter
           "dna_source" of String, parameter "min_contig_length" of Long
        :returns: instance of type "SPAdesOutput" (Output parameters for
           SPAdes run. report_name - the name of the KBaseReport.Report
           workspace object. report_ref - the workspace reference of the
           report.) -> structure: parameter "report_name" of String,
           parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_SPAdes

        # Much of this flow is adapted from
        # https://github.com/msneddon/MEGAHIT
        self.log('Running run_SPAdes with params:\n' + pformat(params))

        token = ctx['token']

        # the reads should really be specified as a list of absolute ws refs
        # but the narrative doesn't do that yet
        self.process_params(params)

        # Resolve each library name to an absolute workspace ref and keep a
        # human-readable "workspace/name" label per ref for error messages.
        wsname = params[self.PARAM_IN_WS]
        obj_ids = [{'ref': lib if '/' in lib else (wsname + '/' + lib)}
                   for lib in params[self.PARAM_IN_LIB]]
        ws = workspaceService(self.workspaceURL, token=token)
        ws_info = ws.get_object_info_new({'objects': obj_ids})

        reads_params = []
        reftoname = {}
        for info, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            reftoname[ref] = info[7] + '/' + info[1]

        readcli = ReadsUtils(self.callbackURL, token=ctx['token'])

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = readcli.download_reads(
                {'read_libraries': reads_params,
                 'interleaved': 'false',
                 'gzipped': None})['files']
        except ServerError as se:
            self.log('logging stacktrace from dynamic client error')
            self.log(se.data)
            # Re-raise unknown server errors unchanged; translate the known
            # "unsupported type" error into a friendlier ValueError.
            if typeerr not in se.message:
                raise
            prefix = se.message.split('.')[0]
            raise ValueError(
                prefix + '. Only the types ' +
                'KBaseAssembly.PairedEndLibrary ' +
                'and KBaseFile.PairedEndLibrary are supported')

        self.log('Got reads data from converter:\n' + pformat(reads))

        phred_type = self.check_reads(params, reads, reftoname)

        # Translate the downloaded file descriptors into the structure that
        # exec_spades expects; an interleaved file counts as a paired lib.
        reads_data = []
        for ref in reads:
            reads_name = reftoname[ref]
            files = reads[ref]['files']
            seq_tech = reads[ref]['sequencing_tech']
            lib_type = files['type']
            if lib_type == 'interleaved':
                entry = {'fwd_file': files['fwd'],
                         'type': 'paired',
                         'seq_tech': seq_tech}
            elif lib_type == 'paired':
                entry = {'fwd_file': files['fwd'],
                         'rev_file': files['rev'],
                         'type': 'paired',
                         'seq_tech': seq_tech}
            elif lib_type == 'single':
                entry = {'fwd_file': files['fwd'],
                         'type': 'single',
                         'seq_tech': seq_tech}
            else:
                raise ValueError('Something is very wrong with read lib' +
                                 reads_name)
            reads_data.append(entry)

        spades_out = self.exec_spades(params[self.PARAM_IN_DNA_SOURCE],
                                      reads_data, phred_type)
        self.log('SPAdes output dir: ' + spades_out)

        # parse the output and save back to KBase
        output_contigs = os.path.join(spades_out, 'scaffolds.fasta')

        self.log('Uploading FASTA file to Assembly')

        assemblyUtil = AssemblyUtil(self.callbackURL,
                                    token=ctx['token'],
                                    service_ver='release')

        save_params = {'file': {'path': output_contigs},
                       'workspace_name': wsname,
                       'assembly_name': params[self.PARAM_IN_CS_NAME]}
        if params.get('min_contig_length', 0) > 0:
            # Filter short contigs during upload; the report is then built
            # from the filtered FASTA written next to scaffolds.fasta.
            save_params['min_contig_length'] = params['min_contig_length']
            assemblyUtil.save_assembly_from_fasta(save_params)
            report_name, report_ref = self.load_report(
                output_contigs + '.filtered.fa', params, wsname)
        else:
            assemblyUtil.save_assembly_from_fasta(save_params)
            # load report from scaffolds.fasta
            report_name, report_ref = self.load_report(output_contigs, params,
                                                       wsname)

        output = {'report_name': report_name, 'report_ref': report_ref}
        #END run_SPAdes

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_SPAdes return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
示例#19
0
    def execReadLibraryPRINSEQ(self, ctx, input_params):
        """
        :param input_params: instance of type "inputPRINSEQ" (execPRINSEQ and
           execReadLibraryPRINSEQ input input_reads_ref : may be
           KBaseFile.PairedEndLibrary or KBaseFile.SingleEndLibrary output_ws
           : workspace to write to output_reads_name : obj_name to create
           lc_method : Low complexity method - value must be "dust" or
           "entropy" lc_entropy_threshold : Low complexity threshold - Value
           must be an integer between 0 and 100. Note a higher
           lc_entropy_threshold in entropy is more stringent.
           lc_dust_threshold : Low complexity threshold - Value must be an
           integer between 0 and 100. Note a lower lc_entropy_threshold is
           less stringent with dust) -> structure: parameter
           "input_reads_ref" of type "data_obj_ref", parameter "output_ws" of
           type "workspace_name" (Common Types), parameter
           "output_reads_name" of type "data_obj_name", parameter "lc_method"
           of String, parameter "lc_entropy_threshold" of Long, parameter
           "lc_dust_threshold" of Long
        :returns: instance of type "outputReadLibraryExecPRINSEQ" ->
           structure: parameter "output_filtered_ref" of type "data_obj_ref",
           parameter "output_unpaired_fwd_ref" of type "data_obj_ref",
           parameter "output_unpaired_rev_ref" of type "data_obj_ref",
           parameter "report" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN execReadLibraryPRINSEQ
        # Overall flow (Python 2 code - note the print statements below):
        #   1. validate params and pick the threshold for the chosen lc_method
        #   2. look up the input object's type in the workspace (PE vs SE)
        #   3. download the FASTQ file(s) via ReadsUtils
        #   4. run prinseq-lite via subprocess and scan its output streams
        #      for the "Input and filter stats:" banner
        #   5. upload surviving reads objects and build a KBaseReport
        console = []
        #        self.log(console, 'Running execTrimmomatic with parameters: ')
        #        self.log(console, "\n"+pformat(input_params))
        report = ''
        returnVal = dict()
        #        retVal['output_filtered_ref'] = None
        #        retVal['output_unpaired_fwd_ref'] = None
        #        retVal['output_unpaired_rev_ref'] = None

        token = ctx['token']
        wsClient = workspaceService(self.ws_url, token=token)
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        # param checks
        required_params = ['input_reads_ref', 'output_ws', 'lc_method']
        # output reads_name is optional. If not set will use old_objects name
        for required_param in required_params:
            if required_param not in input_params or input_params[
                    required_param] is None:
                raise ValueError("Must define required param: '" +
                                 required_param + "'")

        # lc_method must be exactly 'dust' or 'entropy'
        if (input_params['lc_method'] != 'dust') and (input_params['lc_method']
                                                      != 'entropy'):
            raise ValueError(
                "lc_method (low complexity method) must be 'dust' or 'entropy', "
                + "it is currently set to : " + input_params['lc_method'])

        # Select the threshold matching lc_method; each branch requires the
        # corresponding *_threshold param to be present.
        if not ('lc_entropy_threshold' in input_params
                or 'lc_dust_threshold' in input_params):
            raise ValueError(
                ("A low complexity threshold needs to be " +
                 "entered for {}".format(input_params['lc_method'])))
        elif input_params['lc_method'] == 'dust':
            if 'lc_dust_threshold' not in input_params:
                raise ValueError(
                    ("A low complexity threshold needs to be " +
                     "entered for {}".format(input_params['lc_method'])))
            else:
                lc_threshold = input_params['lc_dust_threshold']
        else:
            if 'lc_entropy_threshold' not in input_params:
                raise ValueError(
                    ("A low complexity threshold needs to be " +
                     "entered for {}".format(input_params['lc_method'])))
            else:
                lc_threshold = input_params['lc_entropy_threshold']

        # threshold is documented as 0-100 for both dust and entropy
        if (lc_threshold < 0.0) or (lc_threshold > 100.0):
            raise ValueError((
                "The threshold for {} must be between 0 and 100, it is currently "
                + "set to : {}").format(input_params['lc_method'],
                                        lc_threshold))
        reportObj = {'objects_created': [], 'text_message': ''}

        # load provenance
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = [
            str(input_params['input_reads_ref'])
        ]

        # GET THE READS OBJECT
        # Determine whether read library or read set is input object
        #
        try:
            # object_info tuple
            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)

            input_reads_obj_info = wsClient.get_object_info_new(
                {'objects': [{
                    'ref': input_params['input_reads_ref']
                }]})[0]
            input_reads_obj_type = input_reads_obj_info[TYPE_I]
            # input_reads_obj_version = input_reads_obj_info[VERSION_I]
            # this is object version, not type version

        except Exception as e:
            raise ValueError(
                'Unable to get read library object from workspace: (' +
                str(input_params['input_reads_ref']) + ')' + str(e))

        # self.log (console, "B4 TYPE: '" +
        #           str(input_reads_obj_type) +
        #           "' VERSION: '" + str(input_reads_obj_version)+"'")
        # remove trailing version
        input_reads_obj_type = re.sub('-[0-9]+\.[0-9]+$', "",
                                      input_reads_obj_type)
        # self.log (console, "AF TYPE: '"+str(input_reads_obj_type)+"' VERSION: '" +
        # str(input_reads_obj_version)+"'")

        # maybe add below later "KBaseSets.ReadsSet",
        acceptable_types = [
            "KBaseFile.PairedEndLibrary", "KBaseAssembly.PairedEndLibrary",
            "KBaseAssembly.SingleEndLibrary", "KBaseFile.SingleEndLibrary"
        ]
        if input_reads_obj_type not in acceptable_types:
            raise ValueError("Input reads of type: '" + input_reads_obj_type +
                             "'.  Must be one of " +
                             ", ".join(acceptable_types))

        # PE vs SE drives which prinseq invocation and upload path runs below
        if input_reads_obj_type in [
                "KBaseFile.PairedEndLibrary", "KBaseAssembly.PairedEndLibrary"
        ]:
            read_type = 'PE'
        elif input_reads_obj_type in [
                "KBaseFile.SingleEndLibrary", "KBaseAssembly.SingleEndLibrary"
        ]:
            read_type = 'SE'

        # Instatiate ReadsUtils
        try:
            readsUtils_Client = ReadsUtils(url=self.callback_url,
                                           token=ctx['token'])  # SDK local
            self._log(None, 'Starting Read File(s) Download')
            readsLibrary = readsUtils_Client.download_reads({
                'read_libraries': [input_params['input_reads_ref']],
                'interleaved':
                'false'
            })
            self._log(None, 'Completed Read File(s) Downloading')
        except Exception as e:
            # NOTE(review): the format string has a single {} placeholder but
            # two arguments - str.format silently drops str(e), so the
            # underlying error detail never reaches the message.
            raise ValueError(
                ('Unable to get read library object from workspace: ({})\n'
                 ).format(str(input_params['input_reads_ref']), str(e)))

        # get WS metadata to get obj_name
        # NOTE(review): this second workspace client is built without a
        # token - presumably relies on anonymous read access; confirm.
        ws = workspaceService(self.ws_url)
        try:
            info = ws.get_object_info_new(
                {'objects': [{
                    'ref': input_params['input_reads_ref']
                }]})[0]
        except workspaceService as wse:
            # NOTE(review): workspaceService is a client class, not an
            # Exception subclass - this except clause will likely never
            # match; confirm the intended exception type.
            self._log(console, 'Logging workspace exception')
            self._log(str(wse))
            raise

        #determine new object base name
        new_object_name = info[1]
        if ('output_reads_name' in input_params
                and input_params['output_reads_name'] != ''
                and input_params['output_reads_name'] is not None):
            new_object_name = input_params['output_reads_name']

        # MAKE A DIRECTORY TO PUT THE READ FILE(S)
        # create the output directory and move the file there
        # PUT FILES INTO THE DIRECTORY
        # Sanitize the file names
        tempdir = tempfile.mkdtemp(dir=self.scratch)
        export_dir = os.path.join(tempdir, info[1])
        os.makedirs(export_dir)

        if read_type == 'PE':
            # IF PAIRED END, potentially 6 files created
            # one of each for the two directions(good(paired), good_singletons, bad)
            # Take the good paired and (re)upload new reads object.
            # We throwout the bad reads

            input_files_info = self._setup_pe_files(readsLibrary, export_dir,
                                                    input_params)

            # RUN PRINSEQ with user options (lc_method and lc_threshold)
            cmd = (
                "perl /opt/lib/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {} "
                "-fastq2 {} -out_format 3 -lc_method {} "
                "-lc_threshold {}").format(
                    input_files_info["fastq_file_path"],
                    input_files_info["fastq2_file_path"],
                    input_params['lc_method'], lc_threshold)
            print "Command to be run : " + cmd
            args = shlex.split(cmd)
            perl_script = subprocess.Popen(args,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
            output = perl_script.communicate()
            found_results = False
            file_names_dict = dict()
            # output is the (stdout, stderr) pair from communicate(); the
            # stats banner is searched for in both streams.
            for element in output:
                if "Input and filter stats:" in element:
                    found_results = True
                    element_parts = element.split("Input and filter stats:")
                    # PRINSEQ OUTPUT
                    report = "Input and filter stats:{}".format(
                        element_parts[1])
                    reportObj['text_message'] = report
                    read_files_list = os.listdir(export_dir)

                    # proc = subprocess.Popen(['ls', '-l', export_dir], stdout=subprocess.PIPE)
                    # proc_output = proc.stdout.read()
                    # print "PROC OUTPUT : " + proc_output

                    # Classify each prinseq output file by direction
                    # (fwd/rev) and by good-pair vs good-singleton, keyed
                    # into file_names_dict for the uploads below.
                    for read_filename in read_files_list:
                        file_direction = None
                        print "Read File : {}".format(read_filename)
                        # determine if forward(fastq) or reverse(fastq2) file
                        if input_files_info["fastq_filename"] in read_filename:
                            file_direction = "fwd"
                        elif input_files_info[
                                "fastq2_filename"] in read_filename:
                            file_direction = "rev"
                        if file_direction is not None:
                            # determine good singleton or good part of a pair.
                            print "TEST: {}_prinseq_good_".format(
                                input_files_info["fastq_filename"])
                            if ("{}_prinseq_good_singletons".format(
                                    input_files_info["fastq_filename"])
                                    in read_filename
                                    or "{}_prinseq_good_singletons".format(
                                        input_files_info["fastq2_filename"])
                                    in read_filename):
                                # Unpaired singletons that need to be
                                # saved as a new single end reads object
                                file_names_dict["{}_good_singletons".format(file_direction)] = \
                                    os.path.join(export_dir, read_filename)
                            elif ("{}_prinseq_good_".format(
                                    input_files_info["fastq_filename"])
                                  in read_filename
                                  or "{}_prinseq_good_".format(
                                      input_files_info["fastq2_filename"])
                                  in read_filename):
                                file_names_dict["{}_good_pair".format(file_direction)] = \
                                    os.path.join(export_dir, read_filename)
                    # only create a new paired-end object when BOTH
                    # directions have surviving paired reads
                    if (('fwd_good_pair' in file_names_dict)
                            and ('rev_good_pair' in file_names_dict)):
                        self._log(None, 'Saving new Paired End Reads')
                        returnVal['filtered_paired_end_ref'] = \
                            readsUtils_Client.upload_reads({'wsname':
                                                            str(input_params['output_ws']),
                                                            'name': new_object_name,
                                                            'source_reads_ref':
                                                            input_params['input_reads_ref'],
                                                            'fwd_file':
                                                                file_names_dict['fwd_good_pair'],
                                                            'rev_file':
                                                                file_names_dict['rev_good_pair']
                                                            }
                                                           )['obj_ref']
                        reportObj['objects_created'].append({
                            'ref':
                            returnVal['filtered_paired_end_ref'],
                            'description':
                            'Filtered Paired End Reads',
                            'object_name':
                            new_object_name
                        })
                        print "REFERENCE : " + str(
                            returnVal['filtered_paired_end_ref'])
                    else:
                        reportObj['text_message'] += \
                            "\n\nNo good matching pairs passed low complexity filtering.\n" + \
                            "Consider loosening the threshold value.\n"
                    # surviving unpaired forward reads -> new SE object
                    if 'fwd_good_singletons' in file_names_dict:
                        self._log(None, 'Saving new Forward Unpaired Reads')
                        fwd_object_name = "{}_fwd_singletons".format(
                            new_object_name)
                        returnVal['output_filtered_fwd_unpaired_end_ref'] = \
                            readsUtils_Client.upload_reads({'wsname':
                                                            str(input_params['output_ws']),
                                                            'name': fwd_object_name,
                                                            'source_reads_ref':
                                                            input_params['input_reads_ref'],
                                                            'fwd_file':
                                                            file_names_dict['fwd_good_singletons']}
                                                           )['obj_ref']
                        reportObj['objects_created'].append({
                            'ref':
                            returnVal['output_filtered_fwd_unpaired_end_ref'],
                            'description':
                            'Filtered Forward Unpaired End Reads',
                            'object_name':
                            fwd_object_name
                        })
                        print "REFERENCE : " + \
                            str(returnVal['output_filtered_fwd_unpaired_end_ref'])
                    # surviving unpaired reverse reads -> new SE object
                    if 'rev_good_singletons' in file_names_dict:
                        self._log(None, 'Saving new Reverse Unpaired Reads')
                        rev_object_name = "{}_rev_singletons".format(
                            new_object_name)
                        returnVal['output_filtered_rev_unpaired_end_ref'] = \
                            readsUtils_Client.upload_reads({'wsname':
                                                            str(input_params['output_ws']),
                                                            'name': rev_object_name,
                                                            'source_reads_ref':
                                                            input_params['input_reads_ref'],
                                                            'fwd_file':
                                                            file_names_dict['rev_good_singletons']}
                                                           )['obj_ref']
                        reportObj['objects_created'].append({
                            'ref':
                            returnVal['output_filtered_rev_unpaired_end_ref'],
                            'description':
                            'Filtered Reverse Unpaired End Reads',
                            'object_name':
                            rev_object_name
                        })
                        print "REFERENCE : " + \
                            str(returnVal['output_filtered_rev_unpaired_end_ref'])
                    if len(reportObj['objects_created']) > 0:
                        reportObj['text_message'] += "\nOBJECTS CREATED :\n"
                        for obj in reportObj['objects_created']:
                            reportObj['text_message'] += "{} : {}\n".format(
                                obj['object_name'], obj['description'])
                    else:
                        reportObj['text_message'] += \
                            "\nFiltering filtered out all reads. No objects made.\n"
            if not found_results:
                raise Exception('Unable to execute PRINSEQ, Error: {}'.format(
                    str(output)))
            print "FILES DICT : {}".format(str(file_names_dict))
            print "REPORT OBJECT :"
            print str(reportObj)

        elif read_type == 'SE':
            # Download reads Libs to FASTQ files
            # IF SINGLE END INPUT 2 files created (good and bad)
            # Take good and (re)upload new reads object
            input_fwd_file_path = \
                readsLibrary['files'][input_params['input_reads_ref']]['files']['fwd']
            fastq_filename = self._sanitize_file_name(
                os.path.basename(input_fwd_file_path))
            fastq_file_path = os.path.join(export_dir, fastq_filename)
            shutil.move(input_fwd_file_path, fastq_file_path)

            # RUN PRINSEQ with user options (lc_method and lc_threshold)
            cmd = (
                "perl /opt/lib/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {} "
                "-out_format 3 -lc_method {} "
                "-lc_threshold {}").format(fastq_file_path,
                                           input_params['lc_method'],
                                           lc_threshold)
            print "Command to be run : " + cmd
            args = shlex.split(cmd)
            print "ARGS:  " + str(args)
            perl_script = subprocess.Popen(args,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
            output = perl_script.communicate()
            print "OUTPUT: " + str(output)
            found_results = False
            found_se_filtered_file = False
            file_names_dict = dict()
            # as in the PE branch: scan both stdout and stderr for the banner
            for element in output:
                if "Input and filter stats:" in element:
                    found_results = True
                    element_parts = element.split("Input and filter stats:")
                    # PRINSEQ OUTPUT
                    report = "Input and filter stats:{}".format(
                        element_parts[1])
                    reportObj['text_message'] = report
                    read_files_list = os.listdir(export_dir)

                    for read_filename in read_files_list:
                        print "Early Read File : {}".format(read_filename)

                    # upload the first "*_prinseq_good_*" file found as the
                    # filtered single-end reads object
                    for read_filename in read_files_list:
                        print "Read File : {}".format(read_filename)
                        if ("{}_prinseq_good_".format(fastq_filename)
                                in read_filename):
                            #Found Good file. Save the Reads objects
                            self._log(None, 'Saving Filtered Single End Reads')
                            returnVal['output_filtered_single_end_ref'] = \
                                readsUtils_Client.upload_reads({'wsname':
                                                                str(input_params['output_ws']),
                                                                'name': new_object_name,
                                                                'source_reads_ref':
                                                                input_params['input_reads_ref'],
                                                                'fwd_file':
                                                                    os.path.join(export_dir,
                                                                                 read_filename)}
                                                               )['obj_ref']
                            reportObj['objects_created'].append({
                                'ref':
                                returnVal['output_filtered_single_end_ref'],
                                'description':
                                'Filtered Single End Reads'
                            })
                            print "REFERENCE : " + str(
                                returnVal['output_filtered_single_end_ref'])
                            found_se_filtered_file = True
                            break
            if not found_se_filtered_file:
                reportObj['text_message'] += \
                    "\n\nNone of the reads passed low complexity filtering.\n" + \
                    "Consider loosening the threshold value.\n"
            if not found_results:
                raise Exception('Unable to execute PRINSEQ, Error: {}'.format(
                    str(output)))
            print "FILES DICT : {}".format(str(file_names_dict))
            print "REPORT OBJECT :"
            print str(reportObj)

        # save report object
        #
        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report = KBaseReport(self.callback_url, token=ctx['token'], service_ver=SERVICE_VER)
        report_info = report.create({
            'report': reportObj,
            'workspace_name': input_params['output_ws']
        })

        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END execReadLibraryPRINSEQ

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method execReadLibraryPRINSEQ return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
示例#20
0
    def run_megahit(self, ctx, params):
        """
        :param params: instance of type "MegaHitParams" (Run MEGAHIT.  Most
           parameters here are just passed forward to MEGAHIT workspace_name
           - the name of the workspace for input/output read_library_ref -
           the name of the PE read library (SE library support in the future)
           output_contig_set_name - the name of the output contigset
           megahit_parameter_preset - override a group of parameters;
           possible values: meta            '--min-count 2 --k-list
           21,41,61,81,99' (generic metagenomes, default) meta-sensitive 
           '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more
           sensitive but slower) meta-large      '--min-count 2 --k-list
           27,37,47,57,67,77,87' (large & complex metagenomes, like soil)
           bulk            '--min-count 3 --k-list 31,51,71,91,99 --no-mercy'
           (experimental, standard bulk sequencing with >= 30x depth)
           single-cell     '--min-count 3 --k-list 21,33,55,77,99,121
           --merge_level 20,0.96' (experimental, single cell data) min_count
           - minimum multiplicity for filtering (k_min+1)-mers, default 2
           min_k - minimum kmer size (<= 127), must be odd number, default 21
           max_k - maximum kmer size (<= 127), must be odd number, default 99
           k_step - increment of kmer size of each iteration (<= 28), must be
           even number, default 10 k_list - list of kmer size (all must be
           odd, in the range 15-127, increment <= 28); override `--k-min',
           `--k-max' and `--k-step' min_contig_length - minimum length of
           contigs to output, default is 2000 @optional
           megahit_parameter_preset @optional min_count @optional k_min
           @optional k_max @optional k_step @optional k_list @optional
           min_contig_length) -> structure: parameter "workspace_name" of
           String, parameter "read_library_ref" of String, parameter
           "output_contigset_name" of String, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_length" of Long
        :returns: instance of type "MegaHitOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_megahit
        print('Running run_megahit with params=')
        pprint(params)

        # STEP 1: basic parameter checks + parsing
        for required in ('workspace_name', 'read_library_ref',
                         'output_contigset_name'):
            if required not in params:
                raise ValueError(required + ' parameter is required')

        # STEP 2: get the read library as deinterleaved fastq files
        input_ref = params['read_library_ref']
        reads_params = {
            'read_libraries': [input_ref],
            'interleaved': 'false',
            'gzipped': None
        }
        ru = ReadsUtils(self.callbackURL)
        reads = ru.download_reads(reads_params)['files']

        print('Input reads files:')
        fwd = reads[input_ref]['files']['fwd']
        rev = reads[input_ref]['files']['rev']
        pprint('forward: ' + fwd)
        pprint('reverse: ' + rev)

        # STEP 3: run megahit
        # construct the command; only PE reads are supported, so -1/-2 always
        megahit_cmd = [self.MEGAHIT, '-1', fwd, '-2', rev]

        # if a preset is defined, use that
        if params.get('megahit_parameter_preset'):
            megahit_cmd.extend(
                ['--presets', params['megahit_parameter_preset']])

        # optional scalar parameters map one-to-one onto megahit flags;
        # falsy values (None, 0, '') are treated as "not set", matching the
        # previous behavior of this method
        for key, flag in (('min_count', '--min-count'),
                          ('k_min', '--k-min'),
                          ('k_max', '--k-max'),
                          ('k_step', '--k-step')):
            if params.get(key):
                megahit_cmd.extend([flag, str(params[key])])
        if params.get('k_list'):
            megahit_cmd.extend(
                ['--k-list', ','.join(str(k) for k in params['k_list'])])

        min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
        if params.get('min_contig_length'):
            if str(params['min_contig_length']).isdigit():
                min_contig_length = params['min_contig_length']
            else:
                raise ValueError(
                    'min_contig_length parameter must be a non-negative integer'
                )
        megahit_cmd.extend(['--min-contig-len', str(min_contig_length)])

        # set the number of cpus, leaving one for the rest of the system
        megahit_cmd.extend(
            ['--num-cpu-threads', str(multiprocessing.cpu_count() - 1)])

        # set a unique output location per run (megahit requires that the
        # output directory does not already exist)
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        megahit_cmd.extend(['-o', output_dir])

        # run megahit
        print('running megahit:')
        print('    ' + ' '.join(megahit_cmd))
        p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
        retcode = p.wait()

        print('Return code: ' + str(retcode))
        if retcode != 0:
            raise ValueError('Error running MEGAHIT, return code: ' +
                             str(retcode) + '\n')

        output_contigs = os.path.join(output_dir, 'final.contigs.fa')

        # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
        if self.mac_mode:
            shutil.move(output_contigs,
                        os.path.join(self.host_scratch, 'final.contigs.fa'))
            output_contigs = os.path.join(self.host_scratch,
                                          'final.contigs.fa')

        # STEP 4: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL)
        output_data_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {
                'path': output_contigs
            },
            'workspace_name':
            params['workspace_name'],
            'assembly_name':
            params['output_contigset_name']
        })

        # STEP 5: generate and save the report

        # compute a simple contig length distribution for the report
        lengths = [len(seq_record.seq)
                   for seq_record in SeqIO.parse(output_contigs, 'fasta')]

        report = ''
        report += 'ContigSet saved to: ' + params[
            'workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        if lengths:
            report += 'Avg Length: ' + str(
                sum(lengths) / float(len(lengths))) + ' bp.\n'

            bins = 10
            counts, edges = np.histogram(lengths, bins)
            report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
            for c in range(bins):
                report += '   ' + str(counts[c]) + '\t--\t' + str(
                    edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'
        else:
            # guard: previously this divided by len(lengths) and raised
            # ZeroDivisionError when no contigs passed the length filter
            report += 'No contigs were assembled.\n'

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({
                'files': [{
                    'path': output_contigs,
                    'label': params['output_contigset_name']
                }]
            })
        except QUASTError as qe:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from running QUAST')
            print(str(qe))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report({
                'message':
                report,
                'objects_created': [{
                    'ref': output_data_ref,
                    'description': 'Assembled contigs'
                }],
                'direct_html_link_index':
                0,
                'html_links': [{
                    'shock_id': quastret['shock_id'],
                    'name': 'report.html',
                    'label': 'QUAST report'
                }],
                'report_object_name':
                'kb_megahit_report_' + str(uuid.uuid4()),
                'workspace_name':
                params['workspace_name']
            })
        except _RepError as re:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from creating report object')
            print(str(re))
            # TODO delete shock node
            raise

        # STEP 6: contruct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END run_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
# Example #21
    def runFastQC(self, ctx, input_params):
        """
        :param input_params: instance of type "FastQCParams" -> structure:
           parameter "input_ws" of String, parameter "input_file" of String,
           parameter "input_file_ref" of String
        :returns: instance of type "FastQCOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: reported_output
        #BEGIN runFastQC

        token = ctx['token']
        wsClient = workspaceService(self.workspaceURL, token=token)
        # stage downloaded reads in a unique scratch subdirectory
        uuid_string = str(uuid.uuid4())
        read_file_path = os.path.join(self.scratch, uuid_string)
        os.mkdir(read_file_path)

        input_file_ref = self._get_input_file_ref_from_params(input_params)

        try:
            library = wsClient.get_objects2(
                {'objects': [{'ref': input_file_ref}]})['data'][0]
        except Exception as e:
            raise ValueError('Unable to get read library object from workspace: (' + input_file_ref + ')' + str(e))

        # build the list of read library refs to download, depending on
        # whether we were handed a single library or a SampleSet
        obj_type = library['info'][2]
        download_read_params = {'read_libraries': [], 'interleaved': "false"}
        if "SingleEnd" in obj_type or "PairedEnd" in obj_type:
            download_read_params['read_libraries'].append(
                library['info'][7] + "/" + library['info'][1])
        elif "SampleSet" in obj_type:
            for sample_id in library['data']['sample_ids']:
                if "/" in sample_id:
                    # already a full ref
                    download_read_params['read_libraries'].append(sample_id)
                elif sample_id.isdigit():
                    # numeric sample ids are object ids; pair with the
                    # (integer) workspace id -- str() needed, the old code
                    # concatenated int + str and raised TypeError here
                    download_read_params['read_libraries'].append(
                        str(library['info'][6]) + "/" + sample_id)
                else:
                    download_read_params['read_libraries'].append(
                        library['info'][7] + "/" + sample_id)
        else:
            # fail fast instead of calling download_reads with an empty list
            raise ValueError(
                'Unsupported object type for FastQC: ' + str(obj_type))

        ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
        ret = ru.download_reads(download_read_params)

        # move each downloaded file into the staging dir; strip only a
        # trailing '.gz' (the old .replace('.gz','') also mangled names
        # containing '.gz' in the middle)
        read_file_list = []
        for lib_ref in ret['files']:
            files = ret['files'][lib_ref]['files']
            for key in ('fwd', 'rev'):
                if files.get(key) is None:
                    continue
                name = files[key].split('/')[-1]
                if name.endswith('.gz'):
                    name = name[:-len('.gz')]
                dest = os.path.join(read_file_path, name)
                shutil.move(files[key], dest)
                read_file_list.append(dest)

        subprocess.check_output(["fastqc"] + read_file_list)

        output = self.create_report(token, input_params['input_ws'],
                                    uuid_string, read_file_path)
        reported_output = {'report_name': output['name'],
                           'report_ref': output['ref']}

        # Remove temp reads directory
        shutil.rmtree(read_file_path, ignore_errors=True)

        #END runFastQC

        # At some point might do deeper type checking...
        if not isinstance(reported_output, dict):
            raise ValueError('Method runFastQC return value ' +
                             'reported_output is not type dict as required.')
        # return the results
        return [reported_output]
# Example #22
    def run_MiniASM(self, ctx, params):
        """
        Run MiniASM on paired end libraries
        :param params: instance of type "MiniASM_Params" -> structure:
           parameter "workspace_name" of String, parameter "read_libraries"
           of list of type "paired_end_lib" (The workspace object name of a
           PairedEndLibrary file, whether of the KBaseAssembly or KBaseFile
           type.), parameter "output_contigset_name" of String, parameter
           "min_contig" of Long, parameter "opt_args" of type "opt_args_type"
           (Input parameters for running MiniASM. string workspace_name - the
           name of the workspace from which to take input and store output.
           list<paired_end_lib> read_libraries - Illumina PairedEndLibrary
           files to assemble. string output_contigset_name - the name of the
           output contigset) -> structure: parameter "min_span" of Long,
           parameter "min_coverage" of Long, parameter "min_overlap" of Long,
           parameter "extra_params" of list of String
        :returns: instance of type "MiniASM_Output" (Output parameters for
           MiniASM run. string report_name - the name of the
           KBaseReport.Report workspace object. string report_ref - the
           workspace reference of the report.) -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_MiniASM

        print("===================  IN run_MiniASM")

        # Much of this flow is adapted from https://github.com/msneddon/MEGAHIT
        self.log('Running run_MiniASM with params:\n' + pformat(params))

        token = ctx['token']

        # reads ought to arrive as absolute ws refs, but the narrative still
        # sends plain object names, so resolve them against the workspace
        self.process_params(params)

        wsname = params[self.PARAM_IN_WS]
        print("Workspace name: " + wsname)
        lib_specs = [{'ref': lib if '/' in lib else wsname + '/' + lib}
                     for lib in params[self.PARAM_IN_LIB]]
        ws = workspaceService(self.workspaceURL, token=token)
        lib_infos = ws.get_object_info_new({'objects': lib_specs})

        # map each resolved ref to a human-readable "workspace/objname" label
        absolute_refs = []
        reftoname = {}
        for info, spec in zip(lib_infos, lib_specs):
            absolute_refs.append(spec['ref'])
            reftoname[spec['ref']] = info[7] + '/' + info[1]

        readcli = ReadsUtils(self.callbackURL)

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')

        try:
            reads = readcli.download_reads({'read_libraries': absolute_refs,
                                            'interleaved': 'false',
                                            'gzipped': None
                                            })['files']
        except ServerError as se:
            self.log('logging stacktrace from dynamic client error')
            self.log(se.data)
            # rewrap only the known "bad type" error into a friendlier message
            if typeerr not in se.message:
                raise
            prefix = se.message.split('.')[0]
            raise ValueError(
                prefix + '. Only the types ' +
                'KBaseAssembly.PairedEndLibrary ' +
                'and KBaseFile.PairedEndLibrary are supported')

        self.log('Got reads data from converter:\n' + pformat(reads))

        # normalize each downloaded library into the dict shape exec_MiniASM
        # expects; interleaved and split files both count as 'paired'
        reads_data = []
        for ref, lib in reads.items():
            print ("REF:" + str(ref))
            print ("READS REF:" + str(lib))
            f = lib['files']
            entry = {'fwd_file': f['fwd'],
                     'seq_tech': lib["sequencing_tech"]}
            if f['type'] == 'interleaved':
                entry['type'] = 'paired'
            elif f['type'] == 'paired':
                entry['type'] = 'paired'
                entry['rev_file'] = f['rev']
            elif f['type'] == 'single':
                entry['type'] = 'single'
            else:
                raise ValueError('Something is very wrong with read lib' + reftoname[ref])
            reads_data.append(entry)

        print("READS_DATA: ")
        pprint(reads_data)
        print("============================   END OF READS_DATA: ")

        # unique output location per run, keyed by epoch milliseconds
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        outdir = os.path.join(self.scratch, 'MiniASM_dir-' + str(timestamp))

        miniasm_outfile = self.exec_MiniASM(reads_data, params, outdir)
        self.log('MiniASM output dir: ' + miniasm_outfile)

        # parse the output and save back to KBase
        output_contigs = miniasm_outfile

        # honor a user-supplied positive minimum contig length, else 0
        min_contig_len = 0
        requested_min = params.get(self.PARAM_IN_MIN_CONTIG)
        if requested_min is not None and int(requested_min) > 0:
            min_contig_len = int(requested_min)

        self.log('Uploading FASTA file to Assembly')
        assemblyUtil = AssemblyUtil(self.callbackURL)

        assemblyUtil.save_assembly_from_fasta(
            {'file': {'path': output_contigs},
             'workspace_name': wsname,
             'assembly_name': params[self.PARAM_IN_CS_NAME],
             'min_contig_length': min_contig_len})

        report_name, report_ref = self.load_report(output_contigs, params, wsname)

        output = {'report_name': report_name,
                  'report_ref': report_ref}

        #END run_MiniASM

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_MiniASM return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]