class DownloadFastqUtils:
    def __init__(self):
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callbackURL)
        self.ru = ReadsUtils(self.callbackURL)
        pass

    def _stage_input_file(self, ref, reads_type):

        if reads_type in ('KBaseFile.PairedEndLibrary',
                          'KBaseAssembly.PairedEndLibrary'):
            input_file_info = self.ru.download_reads({
                'read_libraries': [ref],
                'interleaved': 'true'
            })['files'][ref]
        elif reads_type in ('KBaseFile.SingleEndLibrary',
                            'KBaseAssembly.SingleEndLibrary'):
            input_file_info = self.ru.download_reads({
                'read_libraries': [ref]
            })['files'][ref]
        else:
            raise ValueError("Can't download_reads() for object type: '" +
                             str(reads_type) + "'")
        input_file_info['input_ref'] = ref
        file_location = input_file_info['files']['fwd']

        interleaved = False
        if input_file_info['files']['type'] == 'interleaved':
            interleaved = True

        return input_file_info

    def download_genome(self, genomeref):
        file = self.au.get_assembly_as_fasta({'ref': genomeref})
        return file
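A minimal usage sketch for the class above, assuming SDK_CALLBACK_URL is set in the environment; the object references '1/2/3' and '4/5/6' are placeholders, not values from the original module:

# Hypothetical driver code for DownloadFastqUtils
utils = DownloadFastqUtils()
reads_info = utils._stage_input_file('1/2/3', 'KBaseFile.PairedEndLibrary')
print(reads_info['files']['fwd'])   # path to the downloaded (interleaved) FASTQ
fasta = utils.download_genome('4/5/6')
print(fasta['path'])                # path to the assembly FASTA file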
Example #2
    def prepare_single_run(self, input_info, assembly_or_genome_ref,
                           bwa_index_info, ws_for_cache):
        ''' Given a reads ref and an assembly, setup the bwa index '''
        # first setup the bwa index of the assembly
        input_configuration = {'bwa_index_info': bwa_index_info}
        if not bwa_index_info:
            bwaIndexBuilder = BwaIndexBuilder(self.scratch_dir,
                                              self.workspace_url,
                                              self.callback_url,
                                              self.srv_wiz_url,
                                              self.provenance)
            index_result = bwaIndexBuilder.get_index({
                'ref': assembly_or_genome_ref,
                'ws_for_cache': ws_for_cache
            })
            input_configuration['bwa_index_info'] = index_result
        # next download the reads
        read_lib_ref = input_info['ref']
        read_lib_info = input_info['info']
        reads_params = {
            'read_libraries': [read_lib_ref],
            'interleaved': 'false',
            'gzipped': None
        }
        ru = ReadsUtils(self.callback_url)
        reads = ru.download_reads(reads_params)['files']

        input_configuration['reads_lib_type'] = self.get_type_from_obj_info(
            read_lib_info).split('.')[1]
        input_configuration['reads_files'] = reads[read_lib_ref]
        input_configuration['reads_lib_ref'] = read_lib_ref

        return input_configuration
Example #3
    def get_reads_RU(self, refs, console):
        readcli = ReadsUtils(self.callbackURL, token=self.token)

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = readcli.download_reads({'read_libraries': refs,
                                            'interleaved': 'true',
                                            'gzipped': None
                                            })['files']
        except ServerError as se:
            self.log(console, 'logging stacktrace from dynamic client error')
            self.log(console, se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        self.log(console, 'Got reads data from converter:\n' + pformat(reads))
        return reads
Example #4
def fetch_reads_from_reference(ref, callback_url):
    """
    Fetch a FASTQ file (or 2 for paired-end) from a reads reference.
    Returns the following structure:
    {
        "style": "paired", "single", or "interleaved",
        "file_fwd": path_to_file,
        "file_rev": path_to_file, only if paired end,
        "object_ref": reads reference for downstream convenience.
    }
    """
    try:
        print("Fetching reads from object {}".format(ref))
        reads_client = ReadsUtils(callback_url)
        reads_dl = reads_client.download_reads({
            "read_libraries": [ref],
            "interleaved": "false"
        })
        pprint(reads_dl)
        reads_files = reads_dl['files'][ref]['files']
        ret_reads = {
            "object_ref": ref,
            "style": reads_files["type"],
            "file_fwd": reads_files["fwd"]
        }
        if reads_files.get("rev", None) is not None:
            ret_reads["file_rev"] = reads_files["rev"]
        return ret_reads
    except Exception:
        print(
            "Unable to fetch a file from expected reads object {}".format(ref))
        raise
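A hedged example of calling the function above; the reads reference '72/3/1' is a placeholder and the callback URL is read from the standard SDK environment variable:

# Hypothetical call to fetch_reads_from_reference
reads = fetch_reads_from_reference('72/3/1', os.environ['SDK_CALLBACK_URL'])
if reads['style'] == 'paired':
    print(reads['file_fwd'], reads['file_rev'])
else:
    # 'single' and 'interleaved' styles only carry a forward file
    print(reads['file_fwd'])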
Example #5
def download_interleaved_reads(callback_url, reads_upa):
    ru = ReadsUtils(callback_url)
    reads_info = ru.download_reads({
        'read_libraries': [reads_upa],
        'interleaved': 'true',
        'gzipped': None
    })['files'][reads_upa]
    return reads_info
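The dictionary returned above is the per-library entry from ReadsUtils download_reads(); a short sketch of pulling fields out of it (the UPA is a placeholder):

# Hypothetical usage of download_interleaved_reads
reads_info = download_interleaved_reads(callback_url, '72/3/1')
fastq_path = reads_info['files']['fwd']          # single interleaved FASTQ file
sequencing_tech = reads_info['sequencing_tech']  # e.g. 'Illumina'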
Example #6
    def download_reads(self, token, reads_ref):
        try:
            readsUtils_Client = ReadsUtils(url=self.callback_url, token=token)  # SDK local

            readsLibrary = readsUtils_Client.download_reads({
                'read_libraries': [reads_ref],
                'interleaved': 'true'
            })
            reads_file_path = readsLibrary['files'][reads_ref]['files']['fwd']
        except Exception as e:
            raise ValueError('Unable to get reads library object from workspace: (' +
                             reads_ref + ")\n" + str(e))

        return reads_file_path
Example #7
    def run_mash_sketch(self, ctx, params):
        """
        Generate a sketch file from a fasta/fastq file
        :param params: instance of type "MashSketchParams" (* * Pass in **one
           of** input_path, assembly_ref, or reads_ref *   input_path -
           string - local file path to an input fasta/fastq *   assembly_ref
           - string - workspace reference to an Assembly type *   reads_ref -
           string - workspace reference to a Reads type * Optionally, pass in
           a boolean indicating whether you are using paired-end reads. *
           paired_ends - boolean - whether you are passing in paired ends) ->
           structure: parameter "input_path" of String, parameter
           "assembly_ref" of String, parameter "reads_ref" of String,
           parameter "paired_ends" of type "boolean" (params:
           input_upa: workspace reference to an assembly object
           workspace_name: name of current workspace search_db: database to
           search n_max_results: number of results to return, integer between
           1 and 100)
        :returns: instance of type "MashSketchResults" (* * Returns the local
           scratch file path of the generated sketch file. * Will have the
           extension '.msh') -> structure: parameter "sketch_path" of String
        """
        # ctx is the context object
        # return variables are: results
        #BEGIN run_mash_sketch
        if 'reads_ref' in params:
            reads_utils = ReadsUtils(self.callbackURL)
            result = reads_utils.download_reads({
                'read_libraries': [params['reads_ref']],
                'interleaved': 'true'
            })
            input_path = result['files'][params['reads_ref']]['files']['fwd']
        elif 'assembly_ref' in params:
            assembly_util = AssemblyUtil(self.callbackURL)
            result = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})
            input_path = result['path']
        elif 'input_path' in params:
            input_path = params['input_path']
        else:
            raise ValueError(
                'Invalid params; must provide one of `reads_ref`, `assembly_ref`, or `input_path`.'
            )
        mash_utils = MashUtils(self.config, self.auth_token)
        output_file_path = mash_utils.mash_sketch(input_path, paired_ends=params.get('paired_ends'))
        results = {'sketch_path': output_file_path}
        #END run_mash_sketch

        # At some point might do deeper type checking...
        if not isinstance(results, dict):
            raise ValueError('Method run_mash_sketch return value ' +
                             'results is not type dict as required.')
        # return the results
        return [results]
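Illustrative calls to the method above; impl and ctx are hypothetical stand-ins for the SDK implementation object and call context, and '7/8/9' for a workspace reference:

[sketch] = impl.run_mash_sketch(ctx, {'reads_ref': '7/8/9'})
[sketch] = impl.run_mash_sketch(ctx, {'assembly_ref': '7/8/9'})
print(sketch['sketch_path'])   # local scratch path ending in '.msh'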
Example #8
    def fetch_reads_files(self, reads_upas):
        """
        From a list of reads UPAs, uses ReadsUtils to fetch the reads as files.
        Returns them as a dictionary from reads_upa -> filename
        """
        if reads_upas is None:
            raise ValueError("reads_upas must be a list of UPAs")
        if len(reads_upas) == 0:
            raise ValueError("reads_upas must contain at least one UPA")
        ru = ReadsUtils(self.callback_url)
        reads_info = ru.download_reads({
            'read_libraries': reads_upas,
            'interleaved': 'true',
            'gzipped': None
        })['files']
        file_set = dict()
        for reads in reads_info:
            file_set[reads] = reads_info[reads]['files']['fwd']
        return file_set
Example #9
class masurca_utils:
    """
    masurca_utils: defining a system of utils for running masurca
    """
    MaSuRCA_VERSION = 'MaSuRCA-3.2.9'
    MaSuRCA_BIN = '/kb/module/' + MaSuRCA_VERSION + '/bin/masurca'
    PARAM_IN_WS = 'workspace_name'
    PARAM_IN_THREADN = 'num_threads'
    PARAM_IN_READS_LIBS = 'reads_libraries'
    PARAM_IN_JUMP_LIBS = 'jump_libraries'
    PARAM_IN_JF_SIZE = 'jf_size'
    PARAM_IN_CS_NAME = 'output_contigset_name'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')

    def __init__(self, prj_dir, config):
        self.workspace_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        if 'shock-url' in config:
            self.shock_url = config['shock-url']
        if 'handle-service-url' in config:
            self.handle_url = config['handle-service-url']

        self.ws_client = Workspace(self.workspace_url, token=self.token)
        self.ru = ReadsUtils(self.callback_url, token=self.token)
        self.au = AssemblyUtil(self.callback_url, token=self.token)
        self.kbr = KBaseReport(self.callback_url)
        self.kbq = kb_quast(self.callback_url)
        self.proj_dir = prj_dir
        self.prog_runner = Program_Runner(self.MaSuRCA_BIN, self.proj_dir)

    def _has_long_reads(self, params):
        """
        _has_long_reads: check if a long reads input exists in the parameters
        """
        return (params.get('pacbio_reads', None)
                or params.get('nanopore_reads', None)
                or params.get('other_frg_file', None))

    def _get_data_portion(self,
                          pe_reads_data,
                          jp_reads_data=None,
                          pacbio_reads_file='',
                          nanopore_reads_file='',
                          other_frg_file=''):
        """
        _get_data_portion: build the 'DATA...END' portion for the config.txt file
        """
        data_str = ''
        if pe_reads_data:
            # log('PE reads data details:\n{}'.format(json.dumps(pe_reads_data, indent=1)))
            for pe in pe_reads_data:
                if data_str != '':
                    data_str += '\n'
                data_str += 'PE= ' + pe['pe_prefix'] + ' ' + str(pe['pe_mean']) + ' ' + \
                            str(pe['pe_stdev']) + ' ' + pe['fwd_file']
                if pe.get('rev_file', None):
                    data_str += ' ' + pe['rev_file']

        if jp_reads_data:
            # log('JUMP reads data details:\n{}'.format(json.dumps(jp_reads_data, indent=1)))
            for jp in jp_reads_data:
                if data_str != '':
                    data_str += '\n'
                data_str += 'JUMP= ' + jp['jp_prefix'] + ' ' + str(jp['jp_mean']) + ' ' + \
                            str(jp['jp_stdev']) + ' ' + jp['fwd_file']
                if jp.get('rev_file', None):
                    data_str += ' ' + jp['rev_file']

        # Adding the pacbio_reads
        # Note that PacBio reads must be in a single FASTA file!
        # For example:
        # data_str +='\nPACBIO= /pool/genomics/frandsenp/masurca/PacBio/pacbio_reads.fasta'
        # ***if you have both types of reads supply them both as NANOPORE type***
        if pacbio_reads_file != '':
            if data_str != '':
                data_str += '\n'
            if nanopore_reads_file != '':
                data_str += 'NANOPORE=' + pacbio_reads_file
            else:
                data_str += 'PACBIO=' + pacbio_reads_file

        # Adding the nanopore_reads and note that nanopore reads must be in a single fasta file!
        # For example:
        # data_str +='\nNANOPORE= /pool/genomics/frandsenp/masurca/NanoPore/nanopore_reads.fasta'
        if nanopore_reads_file != '':
            if data_str != '':
                data_str += '\n'
            data_str += 'NANOPORE= ' + nanopore_reads_file

        # Adding the other_frg_file inputs if any
        # any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first converted into
        # Celera Assembler compatible .frg file
        # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg
        if other_frg_file != '':
            if data_str != '':
                data_str += '\n'
            data_str += 'OTHER=' + other_frg_file

        return data_str

    def _get_parameters_portion(self, params):
        """
        build the 'PARAMETERS...END' portion for the config.txt file
        """
        # set the default parameters as suggested in the example configuration file
        param_str = (
            "EXTEND_JUMP_READS=0\nUSE_GRID=0\nGRID_QUEUE=all.q\nGRID_BATCH_SIZE"
            + "=300000000\nLHE_COVERAGE=25\nMEGA_READS_ONE_PASS=0")
        if (params.get('graph_kmer_size', None)
                and type(params['graph_kmer_size']) == int):
            if param_str != '':
                param_str += '\n'
            param_str += 'GRAPH_KMER_SIZE=' + str(params['graph_kmer_size'])
        else:
            if param_str != '':
                param_str += '\n'
            param_str += 'GRAPH_KMER_SIZE=auto'
        if params.get('use_linking_mates', None):
            if param_str != '':
                param_str += '\n'
            if params['use_linking_mates'] == 1 and not self._has_long_reads(
                    params):
                param_str += 'USE_LINKING_MATES=1'
            else:
                param_str += 'USE_LINKING_MATES=0'
        if params.get('limit_jump_coverage', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'LIMIT_JUMP_COVERAGE = ' + str(
                params['limit_jump_coverage'])
        if params.get('cgwErrorRate', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'CA_PARAMETERS = cgwErrorRate=' + str(
                params['cgwErrorRate'])
        if params.get(self.PARAM_IN_THREADN, None):
            if param_str != '':
                param_str += '\n'
            param_str += 'NUM_THREADS = ' + str(params[self.PARAM_IN_THREADN])
        if params.get('jf_size', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'JF_SIZE=' + str(params['jf_size'])
        if params.get('kmer_count_threshold', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'KMER_COUNT_THRESHOLD=' + str(
                params['kmer_count_threshold'])
        if params.get('do_homopolymer_trim', None):
            if param_str != '':
                param_str += '\n'
            if params['do_homopolymer_trim'] == 1:
                param_str += 'DO_HOMOPOLYMER_TRIM=1'
            else:
                param_str += 'DO_HOMOPOLYMER_TRIM=0'
        if params.get('close_gaps', None):
            if param_str != '':
                param_str += '\n'
            if params['close_gaps'] == 1:
                param_str += 'CLOSE_GAPS=1'
            else:
                param_str += 'CLOSE_GAPS=0'
        if params.get('soap_assembly', None):
            if param_str != '':
                param_str += '\n'
            if params['soap_assembly'] == 1:
                param_str += 'SOAP_ASSEMBLY=1'
            else:
                param_str += 'SOAP_ASSEMBLY=0'
        return param_str

    def _replaceSectionText(self, orig_txt, begin_patn, end_patn, repl_txt):
        """
        replace the section of orig_txt between begin_patn and end_patn with repl_txt
        examples of parameters:
            begin_patn1 = "DATA\n"
            begin_patn2 = "PARAMETERS\n"
            end_patn1 = "END\nPARAMETERS\n"
            end_patn2 = "END\n"
            repl_txt1 = ('PE= pe 500 50 /kb/module/work/testReads/small.forward.fq' +
                          ' /kb/module/work/testReads/small.reverse.fq\n')
            repl_txt2 = ('GRAPH_KMER_SIZE=auto\nUSE_LINKING_MATES=1\nLIMIT_JUMP_COVERAGE = 60\n' +
                          'CA_PARAMETERS = cgwErrorRate=0.15\nNUM_THREADS= 64\nJF_SIZE=100000000\n' +
                          'DO_HOMOPOLYMER_TRIM=0\n')
        """
        if repl_txt != '':
            # create regular expression pattern
            repl = re.compile(begin_patn + '.*?' + end_patn, re.DOTALL)
            repl_txt = begin_patn + repl_txt + '\n' + end_patn
            # replace the text between begin_patn and end_patn with repl_txt
            txt_replaced = repl.sub(repl_txt, orig_txt)
            # pprint(txt_replaced)
            return txt_replaced
        else:
            return orig_txt

    def _unique_prefix_check(self, pfix, refs):
        prefix_lookup = {}
        for ref in refs:
            pre = ref[pfix][0:2]
            if pre not in prefix_lookup:
                prefix_lookup[pre] = 1
            else:
                raise ValueError('The first two characters in \'' + ref[pfix] +
                                 '\' have already been used.')

    def _get_pereads_info(self, input_params):
        """
        _get_pereads_info--from a list of paired_readsParams structures fetches the
        corresponding reads info with the paired_readsParams[pe_id]
        returns a list of reads data in the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'pe_prefix': the two-letter prefix for the reads library,
                'pe_mean': the average reads length for the reads library,
                'pe_stdev': the standard deviation for the reads library,
                'type': reads_type, #('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file, #only if paired end
        }
        """
        rds_params = copy.deepcopy(input_params)
        wsname = rds_params[self.PARAM_IN_WS]
        rds_refs = []
        rds_data = []

        # reads_libraries grouped params
        if rds_params.get(self.PARAM_IN_READS_LIBS, None):
            pe_reads_libs = rds_params[self.PARAM_IN_READS_LIBS]

            for pe_lib in pe_reads_libs:
                if pe_lib.get('pe_id', None):
                    rds_refs.append(pe_lib['pe_id'])
            rds_data = self._get_kbreads_info(wsname, rds_refs)

            for pe_lib in pe_reads_libs:
                i = 0
                for rds in rds_data:
                    i += 1
                    if 'pe_id' in pe_lib and pe_lib['pe_id'] == rds[
                            'reads_ref']:
                        if pe_lib.get('pe_prefix', None):
                            rds['pe_prefix'] = pe_lib['pe_prefix'][0]
                        else:
                            rds['pe_prefix'] = 'p'
                        rds['pe_prefix'] += str(i)
                        pe_lib['pe_prefix'] = rds['pe_prefix']

                        if pe_lib.get('pe_mean', None) is None:
                            pe_lib['pe_mean'] = 500
                        rds['pe_mean'] = pe_lib['pe_mean']

                        if pe_lib.get('pe_stdev', None) is None:
                            pe_lib['pe_stdev'] = 50
                        rds['pe_stdev'] = pe_lib['pe_stdev']

            self._unique_prefix_check('pe_prefix', pe_reads_libs)
        else:
            raise ValueError("Parameter {} is required.".format(
                self.PARAM_IN_READS_LIBS))
        return rds_data

    def _get_jpreads_info(self, input_params):
        """
        _get_jpreads_info--from a list of jump_readsParams structures fetches the corresponding
        reads info with the jump_readsParams[jp_id]
        returns a list of reads data in the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'jp_prefix': the two-letter prefix for the reads library,
                'jp_mean': the average reads length for the reads library,
                'jp_stdev': the standard deviation for the reads library,
                'type': reads_type, #('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file, #only if paired end
        }
        """
        rds_params = copy.deepcopy(input_params)
        wsname = rds_params[self.PARAM_IN_WS]
        rds_refs = []
        rds_data = []

        # jump_libraries grouped params
        if rds_params.get(self.PARAM_IN_JUMP_LIBS, None):
            jp_reads_libs = rds_params[self.PARAM_IN_JUMP_LIBS]
            for jp_lib in jp_reads_libs:
                if jp_lib.get('jp_id', None):
                    rds_refs.append(jp_lib['jp_id'])
            rds_data = self._get_kbreads_info(wsname, rds_refs)

            for jp_lib in jp_reads_libs:
                i = 0
                for rds in rds_data:
                    i += 1
                    if 'jp_id' in jp_lib and jp_lib['jp_id'] == rds[
                            'reads_ref']:
                        if jp_lib.get('jp_prefix', None):
                            rds['jp_prefix'] = jp_lib['jp_prefix'][0]
                        else:
                            rds['jp_prefix'] = 's'
                        rds['jp_prefix'] += str(i)
                        jp_lib['jp_prefix'] = rds['jp_prefix']

                        if jp_lib.get('jp_mean', None) is None:
                            jp_lib['jp_mean'] = 3600
                        rds['jp_mean'] = jp_lib['jp_mean']

                        if jp_lib.get('jp_stdev', None) is None:
                            jp_lib['jp_stdev'] = 200
                        rds['jp_stdev'] = jp_lib['jp_stdev']

            self._unique_prefix_check('jp_prefix', jp_reads_libs)
        return rds_data

    def _get_kbreads_info(self, wsname, reads_refs):
        """
        _get_kbreads_info--from a set of given KBase reads refs, fetches the corresponding
        reads info as deinterleaved fastq files and returns a list of reads data in
        the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'type': reads_type, #('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file, #only if paired end
        }
        """
        obj_ids = []
        for r in reads_refs:
            if r:
                obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})

        if not obj_ids:
            return []

        ws_info = self.ws_client.get_object_info_new({'objects': obj_ids})
        reads_params = []

        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = self.ru.download_reads({
                'read_libraries': reads_params,
                'interleaved': 'false'
            })['files']
        except ServerError as se:
            log('logging stacktrace from dynamic client error')
            log(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.SingleEndLibrary ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'KBaseFile.SingleEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        # log('Downloaded reads data from KBase:\n' + pformat(reads))
        reads_data = []
        for ref in reads_refs:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
            seq_tech = reads[ref]['sequencing_tech']
            rds_info = {
                'fwd_file': f['fwd'],
                'reads_ref': ref,
                'type': f['type'],
                'seq_tech': seq_tech,
                'reads_name': reads_name
            }
            if f.get('rev', None) is not None:
                rds_info['rev_file'] = f['rev']
            reads_data.append(rds_info)

        return reads_data

    def _generate_output_file_list(self, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('start packing result files')

        output_files = list()

        output_directory = os.path.join(self.proj_dir, str(uuid.uuid4()))
        mkdir_p(output_directory)
        masurca_output = os.path.join(output_directory, 'masurca_output.zip')
        self._zip_folder(out_dir, masurca_output)

        output_files.append({
            'path': masurca_output,
            'name': os.path.basename(masurca_output),
            'label': os.path.basename(masurca_output),
            'description': 'Output file(s) generated by MaSuRCA'
        })

        return output_files

    def _zip_folder(self, folder_path, output_path):
        """
        _zip_folder: Zip the contents of an entire folder (with that folder included
        in the archive). Empty subfolders could be included in the archive as well
        if the commented portion is used.
        """
        with zipfile.ZipFile(output_path,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    # print "Adding {} to archive.".format(absolute_path)
                    ziph.write(absolute_path, relative_path)

        print("{} created successfully.".format(output_path))
        # with zipfile.ZipFile(output_path, "r") as f:
        #    print 'Checking the zipped file......\n'
        #    for info in f.infolist():
        #        print info.filename, info.date_time, info.file_size, info.compress_size

    def _load_stats(self, input_file_name):
        log('Starting conversion of FASTA to KBaseGenomeAnnotations.Assembly')
        log('Building Object.')
        if not os.path.isfile(input_file_name):
            raise Exception('The input file name {0} is not a file!'.format(
                input_file_name))
        with open(input_file_name, 'r') as input_file_handle:
            contig_id = None
            sequence_len = 0
            fasta_dict = dict()
            first_header_found = False
            # Pattern for replacing white space
            pattern = re.compile(r'\s+')
            for current_line in input_file_handle:
                if current_line[0] == '>':
                    # found a header line
                    # Wrap up previous fasta sequence
                    if not first_header_found:
                        first_header_found = True
                    else:
                        fasta_dict[contig_id] = sequence_len
                        sequence_len = 0
                    fasta_header = current_line.replace('>', '').strip()
                    try:
                        contig_id = fasta_header.strip().split(' ', 1)[0]
                    except (IndexError, KeyError, ValueError):
                        contig_id = fasta_header.strip()
                else:
                    sequence_len += len(re.sub(pattern, '', current_line))
        # wrap up last fasta sequence
        if not first_header_found:
            raise Exception("There are no contigs in this file")
        else:
            fasta_dict[contig_id] = sequence_len
        return fasta_dict

    def _check_reference(self, ref):
        """
        Tests the given ref string to make sure it conforms to the expected
        object reference format. Returns True if it passes, False otherwise.
        """
        obj_ref_regex = re.compile(
            r"^(?P<wsid>\d+)\/(?P<objid>\d+)(\/(?P<ver>\d+))?$")
        ref_path = ref.strip().split(";")
        for step in ref_path:
            if not obj_ref_regex.match(step):
                return False
        return True

    def _check_ref_type(self, ref, allowed_types):
        """
        Validates the object type of ref against the list of allowed types. If it passes, this
        returns True, otherwise False.
        Really, all this does is verify that at least one of the strings in allowed_types is
        a substring of the ref object type name.
        Ex1:
        ref = "KBaseGenomes.Genome-4.0"
        allowed_types = ["assembly", "KBaseFile.Assembly"]
        returns False
        Ex2:
        ref = "KBaseGenomes.Genome-4.0"
        allowed_types = ["assembly", "genome"]
        returns True
        """
        obj_type = self._get_object_type(ref).lower()
        for t in allowed_types:
            if t.lower() in obj_type:
                return True
        return False

    def _get_object_type(self, ref):
        """
        Fetches and returns the typed object name of ref from the given workspace url.
        If that object doesn't exist, or there's another Workspace error, this raises a
        RuntimeError exception.
        """
        info = self.ws_client.get_object_info3({'objects': [{'ref': ref}]})
        obj_info = info.get('infos', [[]])[0]
        if len(obj_info) == 0:
            raise RuntimeError(
                "An error occurred while fetching type info from the Workspace. "
                "No information returned for reference {}".format(ref))
        return obj_info[2]

    def _get_fasta_from_assembly(self, assembly_ref):
        """
        From an assembly or contigset reference, this uses AssemblyUtil to build a FASTA file
        and return the path to it.
        """
        allowed_types = [
            'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly',
            'KBaseGenomes.ContigSet'
        ]
        if not self._check_ref_type(assembly_ref, allowed_types):
            raise ValueError(
                "The reference {} cannot be used to fetch a FASTA file".format(
                    assembly_ref))
        au = AssemblyUtil(self.callback_url)
        return au.get_assembly_as_fasta({'ref': assembly_ref})

    def generate_report(self, contig_file_name, params, out_dir, wsname):
        """
        generate_report: reporting results
        """
        log('Generating and saving report')

        contig_file_with_path = os.path.join(out_dir, contig_file_name)
        fasta_stats = self._load_stats(contig_file_with_path)
        lengths = [fasta_stats[contig_id] for contig_id in fasta_stats]

        assembly_ref = params[self.PARAM_IN_WS] + '/' + params[
            self.PARAM_IN_CS_NAME]

        report_text = ''
        report_text += 'MaSuRCA results saved to: ' + wsname + '/' + out_dir + '\n'
        report_text += 'Assembly saved to: ' + assembly_ref + '\n'
        report_text += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report_text += 'Avg Length: ' + str(
            sum(lengths) / float(len(lengths))) + ' bp.\n'

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report_text += 'Contig Length Distribution (# of contigs -- min to max ' + 'basepairs):\n'
        for c in range(bins):
            report_text += ('   ' + str(counts[c]) + '\t--\t' + str(edges[c]) +
                            ' to ' + str(edges[c + 1]) + ' bp\n')
        print('Running QUAST')
        quastret = self.kbq.run_QUAST({
            'files': [{
                'path': contig_file_with_path,
                'label': params[self.PARAM_IN_CS_NAME]
            }]
        })

        output_files = self._generate_output_file_list(out_dir)

        print('Saving report')
        report_output = self.kbr.create_extended_report({
            'message': report_text,
            'objects_created': [{
                'ref': assembly_ref,
                'description': 'Assembled contigs'
            }],
            'direct_html_link_index': 0,
            'file_links': output_files,
            'html_links': [{
                'shock_id': quastret['shock_id'],
                'name': 'report.html',
                'label': 'QUAST report'
            }],
            'report_object_name': 'kb_masurca_report_' + str(uuid.uuid4()),
            'workspace_name': params[self.PARAM_IN_WS]
        })
        report_name = report_output['name']
        report_ref = report_output['ref']
        return report_name, report_ref

    def validate_params(self, params):
        """
        validate_params: checks params passed to run_masurca_app method and set default values
        """
        # log('Start validating run_masurca_app parameters:\n{}'.format(
        # json.dumps(params, indent=1)))

        # check for mandatory parameters
        if params.get(self.PARAM_IN_WS, None) is None:
            raise ValueError(self.PARAM_IN_WS + ' parameter is mandatory')
        if self.PARAM_IN_THREADN not in params:
            raise ValueError(self.PARAM_IN_THREADN + ' parameter is mandatory')

        if params.get(self.PARAM_IN_JF_SIZE, None) is None:
            raise ValueError(self.PARAM_IN_JF_SIZE + ' parameter is mandatory')
        if params.get(self.PARAM_IN_READS_LIBS, None) is None:
            raise ValueError(self.PARAM_IN_READS_LIBS +
                             ' parameter is mandatory')
        if type(params[self.PARAM_IN_READS_LIBS]) != list:
            raise ValueError(self.PARAM_IN_READS_LIBS + ' must be a list')

        if params.get(self.PARAM_IN_CS_NAME, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(
                self.PARAM_IN_CS_NAME))
        if self.INVALID_WS_OBJ_NAME_RE.search(params[self.PARAM_IN_CS_NAME]):
            raise ValueError('Invalid workspace object name: {}.'.format(
                params[self.PARAM_IN_CS_NAME]))

        if 'dna_source' in params:
            dna_src = params.get('dna_source')
            if dna_src == 'bacteria':
                params['limit_jump_coverage'] = 60
                params['cgwErrorRate'] = 0.25
            else:
                params['limit_jump_coverage'] = 300
                params['cgwErrorRate'] = 0.15

        if params.get('create_report', None) is None:
            params['create_report'] = 0

        return params

    def construct_masurca_assembler_cfg(self, params):
        # STEP 1: get the working folder housing the config.txt file and the masurca results
        wsname = params[self.PARAM_IN_WS]
        config_file_path = os.path.join(self.proj_dir, 'config.txt')

        # STEP 2.1: retrieve the reads data from input parameter
        pe_reads_data = self._get_pereads_info(params)
        jp_reads_data = []
        if params.get(self.PARAM_IN_JUMP_LIBS, None):
            jp_reads_data = self._get_jpreads_info(params)
            if 'jp_mean' not in params or type(params['jp_mean']) != int:
                params['jp_mean'] = 3600
            if 'jp_stdev' not in params or type(params['jp_stdev']) != int:
                params['jp_stdev'] = 200

        # STEP 2.2: PACBIO reads must be in a single FASTA file and supplied as PACBIO=reads.fa;
        assbl_types = [
            'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly',
            'KBaseGenomes.ContigSet'
        ]
        reads_types = [
            'KBaseAssembly.SingleEndLibrary', 'KBaseFile.SingleEndLibrary',
            'KBaseAssembly.PairedEndLibrary', 'KBaseFile.PairedEndLibrary'
        ]
        pb_reads_file = ''
        if params.get('pacbio_reads', None):
            pb_ref = params['pacbio_reads']
            if self._check_ref_type(pb_ref, assbl_types):
                pb_reads_file = (self._get_fasta_from_assembly(pb_ref)).get(
                    'path', '')
            else:
                if self._check_ref_type(pb_ref, reads_types):
                    pb_rd = self._get_kbreads_info(wsname, [pb_ref])
                    pb_reads_file = pb_rd[0]['fwd_file']
                    if pb_rd[0].get('rev_file', None):
                        pb_reads_file += ' ' + pb_rd[0]['rev_file']

        # STEP 2.3: NANOPORE reads must be in a single FASTA/FASTQ file and supplied
        # as NANOPORE=reads.fa
        np_reads_file = ''
        if params.get('nanopore_reads', None):
            np_ref = params['nanopore_reads']
            if self._check_ref_type(np_ref, assbl_types):
                np_reads_file = (self._get_fasta_from_assembly(np_ref)).get(
                    'path', '')
            else:
                if self._check_ref_type(np_ref, reads_types):
                    np_rd = self._get_kbreads_info(wsname, [np_ref])
                    np_reads_file = np_rd[0]['fwd_file']
                    if np_rd[0].get('rev_file', None):
                        np_reads_file += ' ' + np_rd[0]['rev_file']

        # STEP 2.4: any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first
        # converted into Celera Assembler compatible .frg files
        # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg
        other_frg = ''
        if params.get('other_frg_file', None):
            other_frg = params['other_frg_file']

        # STEP 3: construct and save the config.txt file for running masurca
        try:
            # STEP 3.1: replace the 'DATA...END' portion of the config_template.txt file
            data_str = self._get_data_portion(pe_reads_data, jp_reads_data,
                                              pb_reads_file, np_reads_file,
                                              other_frg)
            if data_str == '':  # no reads libraries are specified, no further actions
                return ''

            config_template = ''
            with codecs.open(os.path.join(os.path.dirname(__file__),
                                          'config_template.txt'),
                             mode='r',
                             encoding='utf-8') as config_template_file:
                config_template = config_template_file.read()

            begin_patn1 = "DATA\n"
            end_patn1 = "END\nPARAMETERS\n"
            config_with_data = self._replaceSectionText(
                config_template, begin_patn1, end_patn1, data_str)
            # log("\n***After DATA section replacement:\n{}\nSaved at {}".format(
            #             config_with_data.encode('utf-8').decode('utf-8'), config_file_path))

            with codecs.open(config_file_path, mode='w',
                             encoding='utf-8') as config_file:
                config_file.write(config_with_data)

            # STEP 3.2: replace the 'PARAMETERS...END' portion of the config_file file saved above
            param_str = self._get_parameters_portion(params)
            if param_str == '':  # no parameters are specified, no further actions
                return ''

            previous_config = ''
            with codecs.open(config_file_path, mode='r',
                             encoding='utf-8') as previous_config_file:
                previous_config = previous_config_file.read()

            begin_patn2 = "PARAMETERS\n"
            end_patn2 = "END\n"
            final_config = self._replaceSectionText(previous_config,
                                                    begin_patn2, end_patn2,
                                                    param_str)
            log("\n***Configuration file content:\n{}\nSaved at {}".format(
                final_config.encode('utf-8').decode('utf-8'),
                config_file_path))

            with codecs.open(config_file_path, mode='w',
                             encoding='utf-8') as config_file:
                config_file.write(final_config)
        except IOError as ioerr:
            log('Creation of the config.txt file raised error:\n')
            pprint(ioerr)
            return ''
        else:
            return config_file_path

    def generate_assemble_script(self, config_file):
        if os.path.isfile(config_file):
            f_dir, f_nm = os.path.split(config_file)
            m_cmd = [self.MaSuRCA_BIN]
            m_cmd.append(config_file)
            try:
                self.prog_runner.run(m_cmd, f_dir)
                assemble_file = os.path.join(f_dir, 'assemble.sh')
                log('Created the assemble.sh file at {}.\n'.format(
                    assemble_file))
                return assemble_file
            except ValueError as ve:
                log('Error generating assemble.sh file: \n{}'.format(ve))
                raise ValueError('Failed to generate assemble.sh file!')
        else:
            log("The config file {} is not found.\n".format(config_file))
            log('NO assemble.sh file created.\n')
        return ''

    def run_assemble(self, asmbl_file):
        exit_code = 1
        if os.path.isfile(asmbl_file):
            log("The assemble.sh file exists at {}\n".format(asmbl_file))
            f_dir, f_nm = os.path.split(asmbl_file)
            a_cmd = ['/bin/bash']
            a_cmd.append(asmbl_file)
            log("The working directory is {}\n".format(f_dir))
            log("The assembling command is {}\n".format(' '.join(a_cmd)))
            try:
                exit_code = self.prog_runner.run(a_cmd, f_dir)
            except ValueError as ve:
                log('Error running assemble: \n{}'.format(ve))
        else:
            log("The assemble.sh file {} is not found.".format(asmbl_file))
        return exit_code

    def save_assembly(self, contig_fa, wsname, a_name):
        if os.path.isfile(contig_fa):
            log('Uploading FASTA file to Assembly...')
            self.au.save_assembly_from_fasta({
                'file': {
                    'path': contig_fa
                },
                'workspace_name': wsname,
                'assembly_name': a_name
            })
        else:
            log("The contig file {} is not found.".format(contig_fa))
Example #10
    def download_long(self, console, warnings, token, wsname, lib,
                      min_long_read_length):
        try:
            # object info
            try:
                wsClient = Workspace(self.workspaceURL, token=token)
            except Exception as e:
                raise ValueError("unable to instantiate wsClient. " + str(e))

            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)  # object_info tuple

            obj_id = {'ref': lib if '/' in lib else (wsname + '/' + lib)}
            lib_obj_info = wsClient.get_object_info_new({'objects':
                                                         [obj_id]})[0]
            lib_obj_type = lib_obj_info[TYPE_I]
            lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                  lib_obj_type)  # remove trailing version
            lib_ref = str(lib_obj_info[WSID_I])+'/' + \
                str(lib_obj_info[OBJID_I])+'/'+str(lib_obj_info[VERSION_I])
            if lib_obj_type == 'KBaseGenomes.ContigSet' or lib_obj_type == 'KBaseGenomeAnnotations.Assembly':
                # download using assembly util / data file util
                self.log(console,
                         "Getting long reads (from contigs object).\n")
                auClient = AssemblyUtil(url=self.callbackURL, token=token)
                dfuClient = DataFileUtil(url=self.callbackURL, token=token)
                contig_file = auClient.get_assembly_as_fasta({
                    'ref': lib_ref
                }).get('path')
                long_reads_path = dfuClient.unpack_file(
                    {'file_path': contig_file})['file_path']
                self.log(
                    warnings,
                    "Warning:  Long reads are in FASTA format, so short read check was not performed."
                )

            else:
                ruClient = ReadsUtils(url=self.callbackURL, token=token)
                self.log(console,
                         "Getting long reads (from reads library object).\n")
                result = ruClient.download_reads({
                    'read_libraries': [lib_ref],
                    'interleaved': 'false'
                })
                long_reads_path = result['files'][lib_ref]['files']['fwd']
                [n_reads, n_reads_short
                 ] = self.filter_short_fastq(console, long_reads_path,
                                             min_long_read_length)
                if (n_reads_short > 0):
                    self.log(
                        warnings, "Warning:  Of " + str(n_reads) +
                        " long reads, " + str(n_reads_short) +
                        " are shorter than " + str(min_long_read_length) +
                        "; consider using the filtlong app to filter out shorter reads."
                    )

        except Exception as e:
            raise ValueError('Unable to download long reads\n' + str(e))
        return long_reads_path
Example #11
    def download_short_unpaired(self, console, token, wsname,
                                short_unpaired_libraries):
        try:
            self.log(console, "Getting short unpaired reads.\n")
            ruClient = ReadsUtils(url=self.callbackURL, token=token)

            # first, unpack any ReadsSets into the actual SingleEndLibrary references
            reads_refs = []
            # object info
            try:
                wsClient = Workspace(self.workspaceURL, token=token)
            except Exception as e:
                raise ValueError("unable to instantiate wsClient. " + str(e))

            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)  # object_info tuple
            for lib in short_unpaired_libraries:
                try:
                    obj_id = {
                        'ref': lib if '/' in lib else (wsname + '/' + lib)
                    }
                    lib_obj_info = wsClient.get_object_info_new(
                        {'objects': [obj_id]})[0]
                    lib_obj_type = lib_obj_info[TYPE_I]
                    # remove trailing version
                    lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "", lib_obj_type)
                    lib_ref = str(lib_obj_info[WSID_I])+'/' + \
                        str(lib_obj_info[OBJID_I])+'/'+str(lib_obj_info[VERSION_I])
                    if lib_obj_type == 'KBaseSets.ReadsSet':
                        # unpack it
                        try:
                            setAPIClient = SetAPI(url=self.serviceWizardURL,
                                                  token=token)
                            self.log(console, 'getting reads set ' + lib_ref)
                            readsSet = setAPIClient.get_reads_set_v1({
                                'ref': lib_ref,
                                'include_item_info': 1
                            })
                        except Exception as e:
                            raise ValueError(
                                'SetAPI FAILURE: Unable to get read library set object: ('
                                + lib_ref + ')\n' + str(e))
                        for readsLibrary in readsSet['data']['items']:
                            reads_refs.append(readsLibrary['ref'])
                    else:
                        # use other reads objects "as is"
                        reads_refs.append(lib_ref)
                except Exception as e:
                    raise ValueError('Unable to get read library object: (' +
                                     str(lib) + ')' + str(e))

            result = ruClient.download_reads({
                'read_libraries': reads_refs,
                'interleaved': 'false'
            })
            # combine outputs
            short_unpaired_path = os.path.join(
                self.scratch, "short_unpaired_" + str(uuid.uuid4()) + ".fastq")

            self.log(console, "Combining short unpaired reads.\n")

            for reads_ref in reads_refs:
                files = result['files'][reads_ref]['files']

                if 'fwd' in files:
                    path = files['fwd']
                    if path.endswith('.gz'):
                        cmd = 'gzip -dc ' + path + ' >> ' + short_unpaired_path
                    else:
                        cmd = 'cat ' + path + ' >> ' + short_unpaired_path
                    self.log(console, "command: " + cmd)
                    cmdProcess = subprocess.Popen(cmd,
                                                  stdout=subprocess.PIPE,
                                                  stderr=subprocess.STDOUT,
                                                  shell=True)
                    cmdProcess.wait()
                    if cmdProcess.returncode != 0:
                        raise ValueError('Error running ' + cmd)
                    os.remove(path)
                else:
                    raise ValueError('Reads object ' + reads_ref +
                                     ' is missing the forward reads file')

        except Exception as e:
            raise ValueError('Unable to download short unpaired reads\n' +
                             str(e))
        return short_unpaired_path
Example #12
class nmdc_mg_assembly:
    def __init__(self, callback_url, scratch, wdl='../../metaAssembly/'):
        self.callback_url = callback_url
        self.scratch = scratch
        self.special = special(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.report = KBaseReport(self.callback_url)
        self.wdl_base = wdl

    def validate_params(self, params):
        pass

    def fetch_reads_files(self, reads_upas):
        """
        From a list of reads UPAs, uses ReadsUtils to fetch the reads as files.
        Returns them as a dictionary from reads_upa -> filename
        """
        if reads_upas is None:
            raise ValueError("reads_upas must be a list of UPAs")
        if len(reads_upas) == 0:
            raise ValueError("reads_upas must contain at least one UPA")
        reads_info = self.ru.download_reads({
            'read_libraries': reads_upas,
            'interleaved': 'true',
            'gzipped': None
        })['files']
        file_set = dict()
        for reads in reads_info:
            file_set[reads] = reads_info[reads]['files']['fwd']
        return file_set

    def run_wdl(self, rf):
        print(os.getcwd())
        wdl_files = ['jgi_assembly.wdl']

        for f in wdl_files:
            src = self.wdl_base + f
            dst = './' + f
            shutil.copy(src, dst)
        ins = {
            "jgi_metaASM.input_file": [rf.replace(self.scratch, './')],
            "jgi_metaASM.rename_contig_prefix": "contig",
            "jgi_metaASM.outdir": "/out/"
        }
        input_file = os.path.join(self.scratch, 'inputs.json')
        with open(input_file, 'w') as f:
            f.write(json.dumps(ins))

        p = {'workflow': wdl_files[0], 'inputs': 'inputs.json'}

        res = self.special.wdl(p)
        print('wdl: ' + str(res))

    def _fix_path(self, orig):
        ind = orig.find('cromwell-executions')
        return os.path.join(self.scratch, orig[ind:])

    def upload_assembly(self, file_path_orig, workspace_name, assembly_name):
        """
        From a list of file paths, uploads them to KBase, generates Assembly objects,
        then returns the generated UPAs.
        """
        file_path = self._fix_path(file_path_orig)
        if not file_path:
            raise ValueError("file_path must be defined")
        if not os.path.exists(file_path):
            raise ValueError(
                "The given assembly file '{}' does not exist".format(
                    file_path))
        if not workspace_name:
            raise ValueError("workspace_name must be defined")
        if not assembly_name:
            raise ValueError("assembly_name must be defined")

        assembly_upa = self.au.save_assembly_from_fasta({
            "file": {
                "path": file_path
            },
            "workspace_name": workspace_name,
            "assembly_name": assembly_name
        })
        return assembly_upa

    def _upload_pipeline_result(self,
                                pipeline_result,
                                workspace_name,
                                assembly_name,
                                filtered_reads_name=None,
                                cleaned_reads_name=None,
                                skip_rqcfilter=False,
                                input_reads=None):
        """
        This is very tricky and uploads (optionally!) a few things under different cases.
        1. Uploads assembly
            - this always happens after a successful run.
        2. Cleaned reads - passed RQCFilter / BFC / SeqTK
            - optional, if cleaned_reads_name isn't None
        3. Filtered reads - passed RQCFilter
            - optional, if filtered_reads_name isn't None AND skip_rqcfilter is False
        returns a dict of UPAs with the following keys:
        - assembly_upa - the assembly (always)
        - filtered_reads_upa - the RQCFiltered reads (optionally)
        - cleaned_reads_upa - the RQCFiltered -> BFC -> SeqTK cleaned reads (optional)
        """

        # upload the assembly
        uploaded_assy_upa = self.file_util.upload_assembly(
            pipeline_result["spades"]["contigs_file"], workspace_name,
            assembly_name)
        upload_result = {"assembly_upa": uploaded_assy_upa}
        # upload filtered reads if we didn't skip RQCFilter (otherwise it's just a copy)
        if filtered_reads_name and not skip_rqcfilter:
            # unzip the cleaned reads because ReadsUtils won't do it for us.
            decompressed_reads = os.path.join(self.output_dir,
                                              "filtered_reads.fastq")
            pigz_command = "{} -d -c {} > {}".format(
                PIGZ, pipeline_result["rqcfilter"]["filtered_fastq_file"],
                decompressed_reads)
            p = subprocess.Popen(pigz_command,
                                 cwd=self.scratch_dir,
                                 shell=True)
            exit_code = p.wait()
            if exit_code != 0:
                raise RuntimeError(
                    "Unable to decompress filtered reads for validation! Can't upload them, either!"
                )
            filtered_reads_upa = self.file_util.upload_reads(
                decompressed_reads, workspace_name, filtered_reads_name,
                input_reads)
            upload_result["filtered_reads_upa"] = filtered_reads_upa
        # upload the cleaned reads
        if cleaned_reads_name:
            # unzip the cleaned reads because ReadsUtils won't do it for us.
            decompressed_reads = os.path.join(self.output_dir,
                                              "cleaned_reads.fastq")
            pigz_command = "{} -d -c {} > {}".format(
                PIGZ, pipeline_result["seqtk"]["cleaned_reads"],
                decompressed_reads)
            p = subprocess.Popen(pigz_command,
                                 cwd=self.scratch_dir,
                                 shell=True)
            exit_code = p.wait()
            if exit_code != 0:
                raise RuntimeError(
                    "Unable to decompress cleaned reads for validation! Can't upload them, either!"
                )
            cleaned_reads_upa = self.file_util.upload_reads(
                decompressed_reads, workspace_name, cleaned_reads_name,
                input_reads)
            upload_result["cleaned_reads_upa"] = cleaned_reads_upa
        return upload_result
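
    # Illustrative sketch (not part of the original code): one way the dict returned by
    # _upload_pipeline_result might be consumed, e.g. to build a report's objects_created
    # list. The descriptions are hypothetical.
    def _example_collect_created_objects(self, upload_result):
        objects_created = [{'ref': upload_result['assembly_upa'],
                            'description': 'Assembled contigs'}]
        if 'filtered_reads_upa' in upload_result:
            objects_created.append({'ref': upload_result['filtered_reads_upa'],
                                    'description': 'RQCFilter-filtered reads'})
        if 'cleaned_reads_upa' in upload_result:
            objects_created.append({'ref': upload_result['cleaned_reads_upa'],
                                    'description': 'Cleaned reads (RQCFilter -> BFC -> SeqTK)'})
        return objects_created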

    def assemble(self, params):
        self.validate_params(params)
        workspace_name = params['workspace_name']
        assembly_name = params['output_assembly_name']

        # Stage Data
        files = self.fetch_reads_files([params["reads_upa"]])
        reads_files = list(files.values())

        # Run WDL
        self.run_wdl(reads_files[0])

        # Check if things ran
        mfile = os.path.join(self.scratch, 'meta.json')
        print(mfile)
        if not os.path.exists(mfile):
            raise OSError("Failed to run workflow")

        with open(mfile) as f:
            pipeline_output = json.loads(f.read())
        out = pipeline_output["calls"]["jgi_metaASM.create_agp"][0]["outputs"]
        print(out)

        # Generate Output Objects
        contigs_fn = out['outcontigs']
        upa = self.upload_assembly(contigs_fn, workspace_name, assembly_name)


        print("upload complete")

        # Do report
        report_info = self.report.create({
            'report': {
                'objects_created': [],
                'text_message': "Assemble metagenomic reads"
            },
            'workspace_name': workspace_name
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
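
    # Illustrative sketch: a minimal params dict accepted by assemble(), assuming a
    # paired-end read library. The workspace name, UPA and output name are hypothetical.
    def _example_assemble_params(self):
        return {
            'workspace_name': 'my_workspace',       # hypothetical
            'reads_upa': '12345/6/7',               # hypothetical reads object reference
            'output_assembly_name': 'jgi_meta_assembly'
        }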
Example #13
    def run_megahit(self, ctx, params):
        """
        :param params: instance of type "MegaHitParams" (Run MEGAHIT. Most
           parameters here are just passed forward to MEGAHIT workspace_name
           - the name of the workspace for input/output read_library_ref -
           the name of the PE read library (SE library support in the future)
           output_contig_set_name - the name of the output contigset
           megahit_parameter_preset - override a group of parameters;
           possible values: meta            '--min-count 2 --k-list
           21,41,61,81,99' (generic metagenomes, default) meta-sensitive 
           '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more
           sensitive but slower) meta-large      '--min-count 2 --k-list
           27,37,47,57,67,77,87' (large & complex metagenomes, like soil)
           bulk            '--min-count 3 --k-list 31,51,71,91,99 --no-mercy'
           (experimental, standard bulk sequencing with >= 30x depth)
           single-cell     '--min-count 3 --k-list 21,33,55,77,99,121
           --merge_level 20,0.96' (experimental, single cell data) min_count
           - minimum multiplicity for filtering (k_min+1)-mers, default 2
           k_min - minimum kmer size (<= 255), must be odd number, defaults
           to 21 k_max - maximum kmer size (<= 255), must be odd number,
           defaults to 141 k_step - increment of kmer size of each iteration
           (<= 28), must be even number, defaults to 10 k_list - list of kmer
           sizes (all must be odd, in the range 15-255, increment <= 28);
           override using `--k-min', `--k-max' and `--k-step'
           min_contig_length - minimum length of contigs to output, default
           is 2000 max_mem_percent - maximum memory to make available to
           MEGAHIT, as a percentage of available system memory (optional,
           default = 0.9 or 90%) @optional megahit_parameter_preset @optional
           min_count @optional k_min @optional k_max @optional k_step
           @optional k_list @optional min_contig_length @optional
           max_mem_percent) -> structure: parameter "workspace_name" of
           String, parameter "read_library_ref" of String, parameter
           "output_contigset_name" of String, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_length" of Long, parameter "max_mem_percent"
           of Double
        :returns: instance of type "MegaHitOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_megahit
        print('Running run_megahit with params=')
        pprint(params)

        # STEP 1: basic parameter checks + parsing
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'read_library_ref' not in params:
            raise ValueError('read_library_ref parameter is required')
        if 'output_contigset_name' not in params:
            raise ValueError('output_contigset_name parameter is required')

        # STEP 2: get the read library as deinterleaved fastq files
        input_ref = params['read_library_ref']
        reads_params = {'read_libraries': [input_ref],
                        'interleaved': 'false',
                        'gzipped': None
                        }
        ru = ReadsUtils(self.callbackURL)
        reads = ru.download_reads(reads_params)['files']

        print('Input reads files:')
        fwd = reads[input_ref]['files']['fwd']
        rev = reads[input_ref]['files']['rev']
        pprint('forward: ' + fwd)
        pprint('reverse: ' + rev)

        # STEP 3: run megahit
        # construct the command
        megahit_cmd = [self.MEGAHIT]

        # we only support PE reads, so add that
        megahit_cmd.append('-1')
        megahit_cmd.append(fwd)
        megahit_cmd.append('-2')
        megahit_cmd.append(rev)

        # if a preset is defined, use that:
        if 'megahit_parameter_preset' in params:
            if params['megahit_parameter_preset']:
                megahit_cmd.append('--presets')
                megahit_cmd.append(params['megahit_parameter_preset'])

        if 'min_count' in params:
            if params['min_count']:
                megahit_cmd.append('--min-count')
                megahit_cmd.append(str(params['min_count']))
        if 'k_min' in params:
            if params['k_min']:
                megahit_cmd.append('--k-min')
                megahit_cmd.append(str(params['k_min']))
        if 'k_max' in params:
            if params['k_max']:
                megahit_cmd.append('--k-max')
                megahit_cmd.append(str(params['k_max']))
        if 'k_step' in params:
            if params['k_step']:
                megahit_cmd.append('--k-step')
                megahit_cmd.append(str(params['k_step']))
        if 'k_list' in params:
            if params['k_list']:
                k_list = []
                for k_val in params['k_list']:
                    k_list.append(str(k_val))
                megahit_cmd.append('--k-list')
                megahit_cmd.append(','.join(k_list))

        min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
        if 'min_contig_length' in params:
            if params['min_contig_length']:
                if str(params['min_contig_length']).isdigit():
                    min_contig_length = params['min_contig_length']
                else:
                    raise ValueError('min_contig_length parameter must be a non-negative integer')

        megahit_cmd.append('--min-contig-len')
        megahit_cmd.append(str(min_contig_length))

        # Set the number of CPU threads to the number of cores minus one (but at least one)
        megahit_cmd.append('--num-cpu-threads')
        megahit_cmd.append(str(max([(multiprocessing.cpu_count() - 1), 1])))

        # set mem usage
        # Note: defaults to 0.9, i.e. 90% of the memory available to the container,
        # unless max_mem_percent is supplied in params.
        max_mem_percent = params.get('max_mem_percent', 0.9)
        megahit_cmd.append('-m')
        megahit_cmd.append(str(max_mem_percent))

        # set the output location
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        megahit_cmd.append('-o')
        megahit_cmd.append(output_dir)

        # run megahit
        print('running megahit:')
        print('    ' + ' '.join(megahit_cmd))
        p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
        retcode = p.wait()

        print('Return code: ' + str(retcode))
        if p.returncode != 0:
            error_str = report_megahit_error(output_dir, retcode)
            raise RuntimeError(error_str)

        output_contigs = os.path.join(output_dir, 'final.contigs.fa')

        # on Macs, MEGAHIT cannot run in the shared host scratch space, so we move the output file there afterwards
        if self.mac_mode:
            shutil.move(output_contigs, os.path.join(self.host_scratch, 'final.contigs.fa'))
            output_contigs = os.path.join(self.host_scratch, 'final.contigs.fa')

        # STEP 4: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL)
        output_data_ref = assemblyUtil.save_assembly_from_fasta({
                                                                'file': {'path': output_contigs},
                                                                'workspace_name': params['workspace_name'],
                                                                'assembly_name': params['output_contigset_name']
                                                                })


        # STEP 5: generate and save the report

        # compute a simple contig length distribution for the report
        lengths = []
        for seq_record in SeqIO.parse(output_contigs, 'fasta'):
            lengths.append(len(seq_record.seq))

        report = ''
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                                 'label': params['output_contigset_name']}]})
        except ServerError as qe:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from running QUAST')
            print(str(qe))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report(
                {'message': report,
                 'objects_created': [{'ref': output_data_ref, 'description': 'Assembled contigs'}],
                 'direct_html_link_index': 0,
                 'html_links': [{'shock_id': quastret['shock_id'],
                                 'name': 'report.html',
                                 'label': 'QUAST report'}
                                ],
                 'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
                 'workspace_name': params['workspace_name']
                 })
        except ServerError as re:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from creating report object')
            print(str(re))
            # TODO delete shock node
            raise

        # STEP 6: construct the output to send back
        output = {'report_name': report_info['name'], 'report_ref': report_info['ref']}

        #END run_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
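
    # Illustrative sketch (not part of the original module): a minimal params dict for
    # run_megahit using the 'meta' preset described in the docstring above. The workspace
    # name and read library reference are hypothetical placeholders.
    def _example_run_megahit_params(self):
        return {
            'workspace_name': 'my_workspace',         # hypothetical
            'read_library_ref': '12345/6/7',          # hypothetical PE read library reference
            'output_contigset_name': 'megahit.contigs',
            'megahit_parameter_preset': 'meta',       # overrides min_count / k_list
            'min_contig_length': 2000
        }
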
    def run_FamaReadProfiling(self, ctx, params):
        """
        Run metagenome functional profiling module of Fama.
        :param params: instance of type "FamaReadProfilingParams" (Parameters
           for metagenome functional profiling. workspace_name - the name of
           the workspace for input/output read_library_refs - references to
           the name of the PE read library or SE read library ref_dataset -
           the name of Fama reference dataset is_paired_end - 1 for
           paired-end library, 0 for single-end library
           output_functional_profile_name - the name of the output functional
           profile output_read_library_ref - the name of the output filtered
           PE or SE read library) -> structure: parameter "workspace_name" of
           String, parameter "read_library_refs" of list of String, parameter
           "ref_dataset" of String, parameter "is_paired_end" of type "bool"
           (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "output_functional_profile_name" of String, parameter
           "output_read_library_name" of String
        :returns: instance of type "ReportResults" (Output report parameters
           report_name - the name of the report object report_ref - the
           reference to the report object) -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_FamaReadProfiling
        # Import Read Library and save as two paired-end FASTQ files
        input_refs = params['read_library_refs']
        fama_reference = params['ref_dataset']
        ws_client = Workspace(self.ws_url)
        ru = ReadsUtils(self.callback_url)
        ret = ws_client.get_object_info3(
            {'objects': [{
                'ref': ref
            } for ref in input_refs]})
        name2ref = {}
        input_reads = {}
        for input_ref in input_refs:
            ret = ws_client.get_object_info3({'objects': [{'ref': input_ref}]})
            obj_name = ret['infos'][0][1]
            name2ref[obj_name] = input_ref

            reads_params = {
                'read_libraries': [input_ref],
                'interleaved': 'false',
                'gzipped': None
            }

            reads = ru.download_reads(reads_params)['files']

            print('Input reads files downloaded:')
            print(reads)
            fwd_reads_file = reads[input_ref]['files']['fwd']
            rev_reads_file = reads[input_ref]['files']['rev']
            print('forward: ' + str(fwd_reads_file))
            print('reverse: ' + str(rev_reads_file))
            input_reads[obj_name] = {}
            input_reads[obj_name]['fwd'] = fwd_reads_file
            input_reads[obj_name]['rev'] = rev_reads_file

        fama_params = {
            'input_reads':
            input_reads,
            'work_dir':
            self.shared_folder,
            'reference':
            fama_reference,
            'is_paired_end':
            params['is_paired_end'],
            'name2ref':
            name2ref,
            'ws_name':
            params['workspace_name'],
            'ws_client':
            ws_client,
            'output_read_library_name':
            params['output_read_library_name'],
            'output_functional_profile_name':
            params['output_functional_profile_name'],
            'input_read_refs':
            params['read_library_refs']
        }

        # Run Fama
        fama_output = functional_profiling_pipeline(fama_params)

        # Write filtered reads to workspace
        reads_params = {
            'fwd_file': fama_output['fwd_reads'],
            'sequencing_tech': reads[input_ref]['sequencing_tech'],
            'single_genome': '0',
            'wsname': params['workspace_name'],
            'name': params['output_read_library_name']
        }
        if 'rev_reads' in fama_output:
            reads_params['rev_file'] = fama_output['rev_reads']
            reads_params['interleaved'] = '0'

        ru_ret = ru.upload_reads(reads_params)
        print('reads_params', reads_params)
        print('ru_ret', ru_ret)
        output_reads_ref = ru_ret['obj_ref']

        # Write HTML output to workspace
        message = 'Fama functional profiling finished successfully'
        dfu = DataFileUtil(self.callback_url)
        try:
            dfu_output = dfu.file_to_shock(
                {'file_path': fama_output['html_report']})
        except ServerError as dfue:
            # not really any way to test this block
            self.log('Logging exception loading results to shock')
            self.log(str(dfue))
            raise

        html_links = [{
            'shock_id': dfu_output['shock_id'],
            'description': 'HTML report for Fama App',
            'name': 'fama_report.html',
            'label': 'Fama_report'
        }]
        for krona_file in fama_output['krona_charts']:
            try:
                dfu_output = dfu.file_to_shock({'file_path': krona_file})
                html_links.append({
                    'shock_id':
                    dfu_output['shock_id'],
                    'description':
                    'Krona chart for function taxonomy profile',
                    'name':
                    fama_output['krona_charts'][krona_file][0],
                    'label':
                    fama_output['krona_charts'][krona_file][1]
                })
            except ServerError as dfue:
                # not really any way to test this block
                self.log('Logging exception loading results to shock')
                self.log(str(dfue))
                raise
        self.log('Krona chart saved: ' + str(dfu_output))

        # Save report
        report_params = {
            'message':
            message,
            'objects_created': [{
                'ref': output_reads_ref,
                'description': 'Filtered Read Library'
            }, {
                'ref': fama_output['trait_matrix_ref'],
                'description': 'Raw counts matrix'
            }, {
                'ref': fama_output['functional_profile_ref'],
                'description': 'Functional profile'
            }],
            'direct_html_link_index':
            0,
            'html_links':
            html_links,
            'file_links':
            fama_output['report_files'],
            'report_object_name':
            'fama_profiling_report_' + str(uuid.uuid4()),
            'workspace_name':
            params['workspace_name'],
            'html_window_height':
            460
        }
        try:
            report = KBaseReport(self.callback_url)
            report_info = report.create_extended_report(report_params)
        except ServerError as kre:
            # not really any way to test this block
            self.log('Logging exception saving report')
            self.log(str(kre))
            raise

        report_info['report_params'] = report_params
        self.log(str(report_info))
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END run_FamaReadProfiling

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_FamaReadProfiling return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
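
    # Illustrative sketch: a minimal params dict for run_FamaReadProfiling, assuming a single
    # paired-end library. The references, dataset name and output names are hypothetical.
    def _example_run_fama_params(self):
        return {
            'workspace_name': 'my_workspace',
            'read_library_refs': ['12345/6/7'],            # hypothetical PE read library reference
            'ref_dataset': 'example_dataset',              # hypothetical Fama reference dataset name
            'is_paired_end': 1,
            'output_functional_profile_name': 'fama_profile',
            'output_read_library_name': 'fama_filtered_reads'
        }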
Example #15
class CocacolaUtil:
    CONCOCT_BASE_PATH = '/kb/deployment/bin/CONCOCT'
    COCACOLA_BASE_PATH = '/kb/module/lib/kb_cocacola/bin/COCACOLA-python'
    BINNER_RESULT_DIRECTORY = 'cocacola_output_dir'
    BINNER_BIN_RESULT_DIR = 'final_bins'
    MAPPING_THREADS = 16
    BBMAP_MEM = '30g'

    def __init__(self, config):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shock_url = config['shock-url']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.mgu = MetagenomeUtils(self.callback_url)

    def _validate_run_cocacola_params(self, task_params):
        """
        _validate_run_cocacola_params:
                validates params passed to run_cocacola method
        """
        log('Start validating run_cocacola params')

        # check for required parameters
        for p in ['assembly_ref', 'binned_contig_name', 'workspace_name', 'reads_list', 'read_mapping_tool']:
            if p not in task_params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """
        os.chdir(self.scratch)
        log('Start executing command:\n{}'.format(command))
        log('Command is running from:\n{}'.format(self.scratch))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, stderr = pipe.communicate()
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\n'.format(exitCode))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nStderr:\n{}'.format(exitCode, output, stderr)
            raise ValueError(error_msg)
        return (output, stderr)

    # this function has been customized to return read_type variable (interleaved vs single-end library)
    def stage_reads_list_file(self, reads_list):
        """
        stage_reads_list_file: download fastq file associated to reads to scratch area
                          and return result_file_path
        """

        log('Processing reads object list: {}'.format(reads_list))

        result_file_path = []
        read_type = []

        # getting from workspace and writing to scratch. The 'reads' dictionary now has file paths to scratch.
        reads = self.ru.download_reads({'read_libraries': reads_list, 'interleaved': None})['files']

        # reads_list is the list of workspace object references (e.g. 12804/1/1).
        # "reads" is a dict of dicts: the primary key is the reference (read_obj below), the
        # secondary key is "files", and the "files" dict holds "fwd", "rev", and other keys.
        for read_obj in reads_list:
            files = reads[read_obj]['files']    # 'files' is dictionary where 'fwd' is key of file path on scratch.
            result_file_path.append(files['fwd'])
            read_type.append(files['type'])
            if 'rev' in files and files['rev'] is not None:
                result_file_path.append(files['rev'])

        return result_file_path, read_type
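
    # Illustrative sketch: the shape of the values returned by stage_reads_list_file for a
    # single interleaved library. The scratch path is a hypothetical placeholder.
    def _example_stage_reads_list_file_result(self):
        result_file_path = ['/kb/module/work/tmp/12345_6_7.inter.fastq']  # hypothetical
        read_type = ['interleaved']
        return result_file_path, read_type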

    def _get_contig_file(self, assembly_ref):
        """
        _get_contig_file: get contig file from GenomeAssembly object
        """
        contig_file = self.au.get_assembly_as_fasta({'ref': assembly_ref}).get('path')

        sys.stdout.flush()
        contig_file = self.dfu.unpack_file({'file_path': contig_file})['file_path']

        return contig_file

    def retrieve_and_clean_assembly(self, task_params):
        if os.path.exists(task_params['contig_file_path']):
            assembly = task_params['contig_file_path']
            print("FOUND ASSEMBLY ON LOCAL SCRATCH")
        else:
            # we are on njsw so lets copy it over to scratch
            assembly = self._get_contig_file(task_params['assembly_ref'])

        # remove spaces from fasta headers because that breaks bedtools
        assembly_clean = os.path.abspath(assembly).split('.fa')[0] + "_clean.fa"

        command = '/bin/bash reformat.sh in={} out={} addunderscore overwrite=true'.format(assembly, assembly_clean)

        log('running reformat command: {}'.format(command))
        out, err = self._run_command(command)

        return assembly_clean

    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        """ generates SeqRecords iterator for writing from a legacy contigset object """
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        """ removes all contigs less than the min_contig_length provided """
        filtered_fasta_file_path = os.path.abspath(fasta_file_path).split('.fa')[0] + "_filtered.fa"

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path

    def generate_stats_for_genome_bins(self, task_params, genome_bin_fna_file, bbstats_output_file):
        """
        generate_command: bbtools stats.sh command
        """
        log("running generate_stats_for_genome_bins on {}".format(genome_bin_fna_file))
        genome_bin_fna_file = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY, genome_bin_fna_file)
        command = '/bin/bash stats.sh in={} format=3 > {}'.format(genome_bin_fna_file, bbstats_output_file)
        self._run_command(command)
        with open(bbstats_output_file, 'r') as f:
            bbstats_output = f.readlines()[1]
        fields = bbstats_output.split('\t')
        n_scaffolds = fields[0]
        n_contigs = fields[1]
        scaf_bp = fields[2]
        contig_bp = fields[3]
        gap_pct = fields[4]
        scaf_N50 = fields[5]
        scaf_L50 = fields[6]
        ctg_N50 = fields[7]
        ctg_L50 = fields[8]
        scaf_N90 = fields[9]
        scaf_L90 = fields[10]
        ctg_N90 = fields[11]
        ctg_L90 = fields[12]
        scaf_max = fields[13]
        ctg_max = fields[14]
        scaf_n_gt50K = fields[15]
        scaf_pct_gt50K = fields[16]
        gc_avg = float(fields[17]) * 100  # need to figure out if correct
        gc_std = float(fields[18]) * 100  # need to figure out if correct

        log('Generated generate_stats_for_genome_bins command: {}'.format(command))

        return {'n_scaffolds': n_scaffolds,
                'n_contigs': n_contigs,
                'scaf_bp': scaf_bp,
                'contig_bp': contig_bp,
                'gap_pct': gap_pct,
                'scaf_N50': scaf_N50,
                'scaf_L50': scaf_L50,
                'ctg_N50': ctg_N50,
                'ctg_L50': ctg_L50,
                'scaf_N90': scaf_N90,
                'scaf_L90': scaf_L90,
                'ctg_N90': ctg_N90,
                'ctg_L90': ctg_L90,
                'scaf_max': scaf_max,
                'ctg_max': ctg_max,
                'scaf_n_gt50K': scaf_n_gt50K,
                'scaf_pct_gt50K': scaf_pct_gt50K,
                'gc_avg': gc_avg,
                'gc_std': gc_std
                }

    def deinterlace_raw_reads(self, fastq):
        fastq_forward = fastq.split('.fastq')[0] + "_forward.fastq"
        fastq_reverse = fastq.split('.fastq')[0] + "_reverse.fastq"
        command = 'reformat.sh in={} out1={} out2={} overwrite=true'.format(fastq, fastq_forward, fastq_reverse)
        self._run_command(command)
        return (fastq_forward, fastq_reverse)

    def run_read_mapping_interleaved_pairs_mode(self, task_params, assembly_clean, fastq, sam):
        read_mapping_tool = task_params['read_mapping_tool']
        log("running {} mapping in interleaved mode.".format(read_mapping_tool))
        if task_params['read_mapping_tool'] == 'bbmap':
            command = 'bbmap.sh -Xmx{} '.format(self.BBMAP_MEM)
            command += 'threads={} '.format(self.MAPPING_THREADS)
            command += 'ref={} '.format(assembly_clean)
            command += 'in={} '.format(fastq)
            command += 'out={} '.format(sam)
            command += 'fast interleaved=true mappedonly nodisk overwrite'
        elif task_params['read_mapping_tool'] == 'bwa':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            command = 'bwa index {} && '.format(assembly_clean)
            command += 'bwa mem -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} '.format(fastq_forward)
            command += '{} > '.format(fastq_reverse)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_default':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 -x {} '.format(bt2index)
            command += '-1 {} '.format(fastq_forward)
            command += '-2 {} '.format(fastq_reverse)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_very_sensitive':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 --very-sensitive -x {} '.format(bt2index)
            command += '-1 {} '.format(fastq_forward)
            command += '-2 {} '.format(fastq_reverse)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'minimap2':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            command = 'minimap2 -ax sr -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} '.format(fastq_forward)
            command += '{} > '.format(fastq_reverse)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'hisat2':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            ht2index = os.path.basename(assembly_clean) + '.ht2'
            command = 'hisat2-build {} '.format(assembly_clean)
            command += '{} && '.format(ht2index)
            command += 'hisat2 -x {} '.format(ht2index)
            command += '-1 {} '.format(fastq_forward)
            command += '-2 {} '.format(fastq_reverse)
            command += '-S {} '.format(sam)
            command += '--threads {}'.format(self.MAPPING_THREADS)
        log('running alignment command: {}'.format(command))
        out, err = self._run_command(command)

    def run_read_mapping_unpaired_mode(self, task_params, assembly_clean, fastq, sam):
        read_mapping_tool = task_params['read_mapping_tool']
        log("running {} mapping in single-end (unpaired) mode.".format(read_mapping_tool))
        if task_params['read_mapping_tool'] == 'bbmap':
            command = 'bbmap.sh -Xmx{} '.format(self.BBMAP_MEM)
            command += 'threads={} '.format(self.MAPPING_THREADS)
            command += 'ref={} '.format(assembly_clean)
            command += 'in={} '.format(fastq)
            command += 'out={} '.format(sam)
            command += 'fast interleaved=false mappedonly nodisk overwrite'
            # BBMap is deterministic without the deterministic flag if using single-ended reads
        elif task_params['read_mapping_tool'] == 'bwa':
            command = 'bwa index {} && '.format(assembly_clean)
            command += 'bwa mem -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} > '.format(fastq)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_default':
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 -x {} '.format(bt2index)
            command += '-U {} '.format(fastq)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_very_sensitive':
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 --very-sensitive -x {} '.format(bt2index)
            command += '-U {} '.format(fastq)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'minimap2':
            command = 'minimap2 -ax sr -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} > '.format(fastq)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'hisat2':
            ht2index = os.path.basename(assembly_clean) + '.ht2'
            command = 'hisat2-build {} '.format(assembly_clean)
            command += '{} && '.format(ht2index)
            command += 'hisat2 -x {} '.format(ht2index)
            command += '-U {} '.format(fastq)
            command += '-S {} '.format(sam)
            command += '--threads {}'.format(self.MAPPING_THREADS)
        log('running alignment command: {}'.format(command))
        out, err = self._run_command(command)

    def convert_sam_to_sorted_and_indexed_bam(self, sam):
        # create bam files from sam files
        sorted_bam = os.path.abspath(sam).split('.sam')[0] + "_sorted.bam"

        command = 'samtools view -F 0x04 -uS {} | '.format(sam)
        command += 'samtools sort - -o {}'.format(sorted_bam)

        log('running samtools command to generate sorted bam: {}'.format(command))
        self._run_command(command)

        # verify we got bams
        if not os.path.exists(sorted_bam):
            log('Failed to find bam file\n{}'.format(sorted_bam))
            sys.exit(1)
        elif(os.stat(sorted_bam).st_size == 0):
            log('Bam file is empty\n{}'.format(sorted_bam))
            sys.exit(1)

        # index the bam file
        command = 'samtools index {}'.format(sorted_bam)

        log('running samtools command to index sorted bam: {}'.format(command))
        self._run_command(command)

        return sorted_bam

    def generate_alignment_bams(self, task_params, assembly_clean):
        """
            This function runs the selected read mapper and creates the
            sorted and indexed bam files from sam files using samtools.
        """

        reads_list = task_params['reads_list']

        (read_scratch_path, read_type) = self.stage_reads_list_file(reads_list)

        sorted_bam_file_list = []

        # list of reads files, can be 1 or more, assuming each file is either unpaired or interleaved;
        # will not handle unpaired forward and reverse reads supplied as separate (non-interleaved) files

        for i in range(len(read_scratch_path)):
            fastq = read_scratch_path[i]
            fastq_type = read_type[i]

            sam = os.path.basename(fastq).split('.fastq')[0] + ".sam"
            sam = os.path.join(self.BINNER_RESULT_DIRECTORY, sam)

            if fastq_type == 'interleaved':  # make sure working - needs tests
                log("Running interleaved read mapping mode")
                self.run_read_mapping_interleaved_pairs_mode(task_params, assembly_clean, fastq, sam)
            else:  # running read mapping in single-end mode
                log("Running unpaired read mapping mode")
                self.run_read_mapping_unpaired_mode(task_params, assembly_clean, fastq, sam)

            sorted_bam = self.convert_sam_to_sorted_and_indexed_bam(sam)

            sorted_bam_file_list.append(sorted_bam)

        return sorted_bam_file_list

    def generate_make_coverage_table_command(self, task_params, sorted_bam_file_list):
        # create the depth file for this bam
        #
        min_contig_length = task_params['min_contig_length']
        sorted_bam = task_params['sorted_bam']

        depth_file_path = os.path.join(self.scratch, str('cocacola_depth.txt'))
        command = '/kb/module/lib/kb_cocacola/bin/jgi_summarize_bam_contig_depths '
        command += '--outputDepth {} '.format(depth_file_path)
        command += '--minContigLength {} '.format(min_contig_length)
        command += '--minContigDepth 1 {}'.format(sorted_bam)

        log('running summarize_bam_contig_depths command: {}'.format(command))
        self._run_command(command)

        return depth_file_path

    def generate_cocacola_cut_up_fasta_command(self, task_params):
        """
        generate_command: cocacola cut_up_fasta
        """
        contig_file_path = task_params['contig_file_path']
        contig_split_size = task_params['contig_split_size']
        contig_split_overlap = task_params['contig_split_overlap']

        log("\n\nRunning generate_cocacola_cut_up_fasta_command")

        command = 'python {}/scripts/cut_up_fasta.py '.format(self.CONCOCT_BASE_PATH)
        command += '{} '.format(contig_file_path)
        command += '-c {} '.format(contig_split_size)
        command += '-o {} '.format(contig_split_overlap)
        command += '--merge_last -b temp.bed > {}/split_contigs.fa'.format(self.BINNER_RESULT_DIRECTORY)
        log('Generated cocacola_cut_up_fasta command: {}'.format(command))

        self._run_command(command)

    def generate_cocacola_input_table_from_bam(self, task_params):
        """
        generate_command: cocacola generate input table
        """
        log("\n\nRunning generate_cocacola_input_table_from_bam")
        command = 'python {}/scripts/gen_input_table.py '.format(self.CONCOCT_BASE_PATH)

        command += '{}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY)
        command += '{}/*_sorted.bam > '.format(self.BINNER_RESULT_DIRECTORY)
        command += '{}/coverage_table.tsv'.format(self.BINNER_RESULT_DIRECTORY)
        log('Generated cocacola generate input table from bam command: {}'.format(command))
        calc_contigs = 0
        for line in open('{}/split_contigs.fa'.format(self.BINNER_RESULT_DIRECTORY)):
            if line.startswith(">"):
                calc_contigs += 1
        task_params['calc_contigs'] = calc_contigs
        self._run_command(command)

    def generate_cocacola_kmer_composition_table(self, task_params):
        """
        generate_command: cocacola generate kmer composition table
        """
        log("\n\nRunning generate_cocacola_kmer_composition_table")
        calc_contigs = task_params['calc_contigs']
        kmer_size = task_params['kmer_size']
        command = 'python {}/scripts/fasta_to_features.py '.format(self.CONCOCT_BASE_PATH)
        command += '{}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY)
        command += '{} '.format(calc_contigs)
        command += '{} '.format(kmer_size)
        command += '{}/split_contigs_kmer_{}.csv'.format(self.BINNER_RESULT_DIRECTORY, kmer_size)
        log('Generated cocacola generate input table from bam command: {}'.format(command))

        self._run_command(command)

    def generate_cocacola_command(self, task_params):
        """
        generate_command: cocacola
        """

        min_contig_length = task_params['min_contig_length']
        kmer_size = task_params['kmer_size']

        log("\n\nRunning generate_cocacola_command")
        command = 'python {}/cocacola.py '.format(self.COCACOLA_BASE_PATH)
        command += '--contig_file {}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY)
        command += '--abundance_profiles {}/coverage_table.tsv '.format(self.BINNER_RESULT_DIRECTORY)
        command += '--composition_profiles {}/split_contigs_kmer_{}.csv '.format(self.BINNER_RESULT_DIRECTORY,
                                                                                 kmer_size)
        command += '--output {}/cocacola_output_clusters_min{}.csv'.format(self.BINNER_RESULT_DIRECTORY,
                                                                           min_contig_length)

        log('Generated cocacola command: {}'.format(command))

        self._run_command(command)

    def add_header_to_post_clustering_file(self, task_params):
        min_contig_length = task_params['min_contig_length']
        header = "contig_id,cluster_id"
        with open('{}/cocacola_output_clusters_min{}_headers.csv'.format(self.BINNER_RESULT_DIRECTORY,
                                                                         min_contig_length), 'w') as outfile:
            outfile.write(header)
            with open('{}/cocacola_output_clusters_min{}.csv'.format(self.BINNER_RESULT_DIRECTORY,
                                                                     min_contig_length), 'r') as datafile:
                for line in datafile:
                    outfile.write(line)

    def generate_cocacola_post_clustering_merging_command(self, task_params):
        """
        generate_command: cocacola post cluster merging
        """
        min_contig_length = task_params['min_contig_length']
        log("\n\nRunning generate_cocacola_post_clustering_merging_command")

        command = 'python {}/scripts/merge_cutup_clustering.py '.format(self.CONCOCT_BASE_PATH)
        command += '{}/cocacola_output_clusters_min{}_headers.csv > '.format(self.BINNER_RESULT_DIRECTORY,
                                                                             min_contig_length)
        command += '{}/clustering_merged_min{}.csv'.format(self.BINNER_RESULT_DIRECTORY, min_contig_length)
        log('Generated generate_cocacola_post_clustering_merging command: {}'.format(command))

        self._run_command(command)

    def generate_cocacola_extract_fasta_bins_command(self, task_params):
        """
        generate_command: cocacola extract_fasta_bins
        """
        log("\n\nRunning generate_cocacola_extract_fasta_bins_command")

        contig_file_path = task_params['contig_file_path']
        min_contig_length = task_params['min_contig_length']

        bin_result_directory = self.BINNER_RESULT_DIRECTORY + '/' + self.BINNER_BIN_RESULT_DIR
        self._mkdir_p(bin_result_directory)
        command = 'python {}/scripts/extract_fasta_bins.py '.format(self.CONCOCT_BASE_PATH)
        command += '{} '.format(contig_file_path)
        command += '{}/clustering_merged_min{}.csv '.format(self.BINNER_RESULT_DIRECTORY, min_contig_length)
        command += '--output_path {}/{}'.format(self.BINNER_RESULT_DIRECTORY, self.BINNER_BIN_RESULT_DIR)
        log('Generated generate_cocacola_extract_fasta_bins_command command: {}'.format(command))

        self._run_command(command)

    def rename_and_standardize_bin_names(self, task_params):
        """
        generate_command: generate renamed bins
        """
        log("\n\nRunning rename_and_standardize_bin_names")
        path_to_cocacola_result_bins = os.path.abspath(self.BINNER_RESULT_DIRECTORY) + \
            '/' + self.BINNER_BIN_RESULT_DIR + '/'
        for dirname, subdirs, files in os.walk(path_to_cocacola_result_bins):
            for file in files:
                if file.endswith('.fa'):
                    os.rename(os.path.abspath(path_to_cocacola_result_bins) + '/' +
                              file, os.path.abspath(path_to_cocacola_result_bins) + '/bin.' +
                              file.split('.fa')[0].zfill(3) + '.fasta')  # need to change to 4 digits

    def make_binned_contig_summary_file_for_binning_apps(self, task_params):
        """
        generate_command: generate binned contig summary command
        """
        log("\n\nRunning make_binned_contig_summary_file_for_binning_apps")
        path_to_cocacola_result = os.path.abspath(self.BINNER_RESULT_DIRECTORY)
        path_to_cocacola_result_bins = '{}/{}/'.format(path_to_cocacola_result, self.BINNER_BIN_RESULT_DIR)
        path_to_summary_file = path_to_cocacola_result_bins + 'binned_contig.summary'
        with open(path_to_summary_file, 'w+') as f:
            f.write("Bin name\tCompleteness\tGenome size\tGC content\n")
            for dirname, subdirs, files in os.walk(path_to_cocacola_result_bins):
                for file in files:
                    if file.endswith('.fasta'):
                        genome_bin_fna_file = os.path.join(self.BINNER_BIN_RESULT_DIR, file)
                        bbstats_output_file = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY,
                                                           genome_bin_fna_file).split('.fasta')[0] + ".bbstatsout"
                        bbstats_output = self.generate_stats_for_genome_bins(task_params,
                                                                             genome_bin_fna_file,
                                                                             bbstats_output_file)
                        f.write('{}\t0\t{}\t{}\n'.format(genome_bin_fna_file.split("/")[-1],
                                                         bbstats_output['contig_bp'],
                                                         bbstats_output['gc_avg']))
        log('Finished make_binned_contig_summary_file_for_binning_apps function')

    def generate_output_file_list(self, result_directory):
        """
        generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'cocacola_result.zip')

        with zipfile.ZipFile(result_file, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:

            for dirname, subdirs, files in os.walk(result_directory):
                for file in files:
                    if file.endswith(('.sam', '.bam', '.bai', '.summary')):
                        continue
                    if dirname.endswith(self.BINNER_BIN_RESULT_DIR):
                        continue
                    zip_file.write(os.path.join(dirname, file), file)
                if dirname.endswith(self.BINNER_BIN_RESULT_DIR):
                    baseDir = os.path.basename(dirname)
                    for file in files:
                        full = os.path.join(dirname, file)
                        zip_file.write(full, os.path.join(baseDir, file))

        output_files.append({'path': result_file,
                             'name': os.path.basename(result_file),
                             'label': os.path.basename(result_file),
                             'description': 'Files generated by COCACOLA App'})

        return output_files

    def generate_html_report(self, result_directory, assembly_ref, binned_contig_obj_ref):
        """
        generate_html_report: generate html summary report
        """

        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        # get summary data from existing assembly object and bins_objects
        Summary_Table_Content = ''
        Overview_Content = ''
        (binned_contig_count, input_contig_count, total_bins_count) = \
            self.generate_overview_info(assembly_ref, binned_contig_obj_ref, result_directory)

        Overview_Content += '<p>Binned contigs: {}</p>'.format(binned_contig_count)
        Overview_Content += '<p>Input contigs: {}</p>'.format(input_contig_count)
        Overview_Content += '<p>Number of bins: {}</p>'.format(total_bins_count)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          Overview_Content)
                report_template = report_template.replace('Summary_Table_Content',
                                                          Summary_Table_Content)
                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for kb_cocacola App'})
        return html_report

    def generate_overview_info(self, assembly_ref, binned_contig_obj_ref, result_directory):
        """
        _generate_overview_info: generate overview information from assembly and binnedcontig
        """

        # get assembly and binned_contig objects that already have some data populated in them
        assembly = self.dfu.get_objects({'object_refs': [assembly_ref]})['data'][0]
        binned_contig = self.dfu.get_objects({'object_refs': [binned_contig_obj_ref]})['data'][0]

        input_contig_count = assembly.get('data').get('num_contigs')
        binned_contig_count = 0
        total_bins_count = 0
        total_bins = binned_contig.get('data').get('bins')
        total_bins_count = len(total_bins)
        for bin in total_bins:
            binned_contig_count += len(bin.get('contigs'))

        return (binned_contig_count, input_contig_count, total_bins_count)

    def generate_report(self, binned_contig_obj_ref, task_params):
        """
        generate_report: generate summary report
        """
        log('Generating report')

        result_directory = os.path.join(self.scratch, "cocacola_output_dir")

        task_params['result_directory'] = result_directory

        output_files = self.generate_output_file_list(task_params['result_directory'])

        output_html_files = self.generate_html_report(task_params['result_directory'],
                                                      task_params['assembly_ref'],
                                                      binned_contig_obj_ref)

        report_params = {
            'message': '',
            'workspace_name': task_params['workspace_name'],
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 266,
            'report_object_name': 'kb_cocacola_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def create_dict_from_depth_file(self, depth_file_path):
        # keep contig order (required by metabat2)
        depth_file_dict = {}
        with open(depth_file_path, 'r') as f:
            header = f.readline().rstrip().split("\t")
            # print('HEADER1 {}'.format(header))
            # map(str.strip, header)
            for line in f:
                # deal with cases where the fastq name has spaces. Assume the first
                # non-whitespace word is unique and use it as the ID.
                vals = line.rstrip().split("\t")
                if ' ' in vals[0]:
                    ID = vals[0].split()[0]
                else:
                    ID = vals[0]
                depth_file_dict[ID] = vals[1:]
            depth_file_dict['header'] = header
        return depth_file_dict
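
    # Illustrative sketch: the kind of dict create_dict_from_depth_file builds from a
    # jgi_summarize_bam_contig_depths output file. Contig names, column names and values
    # here are hypothetical placeholders.
    def _example_depth_file_dict(self):
        return {
            'header': ['contigName', 'contigLen', 'totalAvgDepth',
                       'reads_sorted.bam', 'reads_sorted.bam-var'],
            'contig_1': ['1500', '12.3', '12.3', '4.1'],  # values are everything after the contig ID
            'contig_2': ['2300', '8.7', '8.7', '2.9']
        }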

    def run_cocacola(self, task_params):
        """
        run_cocacola: cocacola app

        required params:
            assembly_ref: Metagenome assembly object reference
            binned_contig_name: BinnedContig object name and output file header
            workspace_name: the name of the workspace it gets saved to.
            reads_list: list of reads objects (PairedEndLibrary/SingleEndLibrary)
            upon which COCACOLA will be run

        optional params:
            min_contig_length: minimum contig length; default 1000

            ref: https://github.com/BinPro/CONCOCT/blob/develop/README.md
        """
        log('--->\nrunning CocacolaUtil.run_cocacola\n' +
            'task_params:\n{}'.format(json.dumps(task_params, indent=1)))

        self._validate_run_cocacola_params(task_params)

        # get assembly
        contig_file = self._get_contig_file(task_params['assembly_ref'])
        task_params['contig_file_path'] = contig_file

        # clean the assembly file so that there are no spaces in the fasta headers
        assembly_clean = self.retrieve_and_clean_assembly(task_params)

        assembly_clean_temp = self.filter_contigs_by_length(assembly_clean, task_params['min_contig_length'])

        task_params['contig_file_path'] = assembly_clean_temp
        assembly_clean = assembly_clean_temp  # need to clean this up, ugly redundant variable usage

        # get reads
        (reads_list_file, read_type) = self.stage_reads_list_file(task_params['reads_list'])
        task_params['read_type'] = read_type
        task_params['reads_list_file'] = reads_list_file

        # prep result directory
        result_directory = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY)
        self._mkdir_p(result_directory)

        cwd = os.getcwd()
        log('changing working dir to {}'.format(result_directory))
        os.chdir(result_directory)

        # run alignments, and update input contigs to use the clean file
        # this function has an internal loop to generate a sorted bam file for each input read file
        self.generate_alignment_bams(task_params, assembly_clean)

        # not used right now
        # depth_file_path = self.generate_make_coverage_table_command(task_params, sorted_bam_file_list)
        # depth_dict = self.create_dict_from_depth_file(depth_file_path)

        # run cocacola prep, cut up fasta input
        self.generate_cocacola_cut_up_fasta_command(task_params)

        # run cocacola prep, generate coverage tables from bam
        self.generate_cocacola_input_table_from_bam(task_params)

        # run cocacola prep, generate kmer table
        self.generate_cocacola_kmer_composition_table(task_params)

        # run cocacola prep and cocacola
        self.generate_cocacola_command(task_params)

        # run command to add header to output file
        self.add_header_to_post_clustering_file(task_params)

        # run cocacola post cluster merging command
        self.generate_cocacola_post_clustering_merging_command(task_params)

        # run extract bins command
        self.generate_cocacola_extract_fasta_bins_command(task_params)

        # run fasta renaming
        self.rename_and_standardize_bin_names(task_params)

        # make binned contig summary file
        self.make_binned_contig_summary_file_for_binning_apps(task_params)

        # file handling and management
        os.chdir(cwd)
        log('changing working dir to {}'.format(cwd))

        log('Saved result files to: {}'.format(result_directory))
        log('Generated files:\n{}'.format('\n'.join(os.listdir(result_directory))))

        # make new BinnedContig object and upload to KBase
        generate_binned_contig_param = {
            'file_directory': os.path.join(result_directory, self.BINNER_BIN_RESULT_DIR),
            'assembly_ref': task_params['assembly_ref'],
            'binned_contig_name': task_params['binned_contig_name'],
            'workspace_name': task_params['workspace_name']
        }

        binned_contig_obj_ref = \
            self.mgu.file_to_binned_contigs(generate_binned_contig_param).get('binned_contig_obj_ref')

        # generate report
        reportVal = self.generate_report(binned_contig_obj_ref, task_params)
        returnVal = {
            'result_directory': result_directory,
            'binned_contig_obj_ref': binned_contig_obj_ref
        }
        returnVal.update(reportVal)

        return returnVal
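
    # Hedged usage sketch (added for illustration; the reference values and the
    # exact shape of the 'reads_list' entries are assumptions, not taken from
    # this module).  A minimal task_params for run_cocacola() might look like:
    _example_run_cocacola_task_params = {
        'assembly_ref': '12345/6/1',          # metagenome Assembly object reference (assumed)
        'binned_contig_name': 'cocacola.bins',
        'workspace_name': 'my_workspace',
        'reads_list': ['12345/7/1'],          # PairedEnd/SingleEndLibrary refs (assumed shape)
        'min_contig_length': 1000,            # optional; default 1000
    }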
Example #16
    def run_centrifuge(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_centrifuge
        # Step 2 - Download the input reads data as FASTQ files.
        # We can use the ReadsUtils module to download FASTQ files from our reads library objects.
        # The returned object gives us the paths to the files that were created.
        logging.info('Downloading reads data as a Fastq file.')
        readsUtil = ReadsUtils(self.callback_url)
        download_reads_output = readsUtil.download_reads({'read_libraries': params['input_refs']})
        #print(f"Input parameters {params['input_refs']}, {params['db_type']} download_reads_output {download_reads_output}")
        fastq_files = []
        fastq_files_name = []
        for key,val in download_reads_output['files'].items():
            if 'fwd' in val['files'] and val['files']['fwd']:
                fastq_files.append(val['files']['fwd'])
                fastq_files_name.append(val['files']['fwd_name'])
            if 'rev' in val['files'] and val['files']['rev']:
                fastq_files.append(val['files']['rev'])
                fastq_files_name.append(val['files']['rev_name'])
        #logging.info(f"fastq files {fastq_files}")
        fastq_files_string = ','.join(fastq_files)
        output_dir = os.path.join(self.scratch, 'centrifuge_out')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        outprefix = "centrifuge"
        # Check that the reference database directory is present
        cmd0 = ["ls", "-al", '/data/centrifuge/']
        #logging.info(f'cmd {cmd0}')
        pls = subprocess.Popen(cmd0, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        logging.info(f'subprocess {pls.communicate()}')

        cmd = ['/kb/module/lib/centrifuge/Utils/uge-centrifuge.sh', '-i', fastq_files_string, '-o', output_dir, '-p',
               'centrifuge', '-d', '/data/centrifuge/' + params['db_type']]
        logging.info(f'cmd {cmd}')
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        logging.info(f'subprocess {p.communicate()}')
        summary_file = os.path.join(output_dir, outprefix + '.report.txt')

        # generate report directory and html file
        report_dir = os.path.join(output_dir, 'html_report')
        if not os.path.exists(report_dir):
            os.makedirs(report_dir)
        summary_file_dt = os.path.join(report_dir, 'centrifuge.datatable.html')

        self._generate_DataTable(summary_file,summary_file_dt)
        shutil.copy2('/kb/module/lib/centrifuge/Utils/index.html',os.path.join(report_dir,'index.html'))
        shutil.copy2(os.path.join(output_dir,outprefix+'.krona.html'),os.path.join(report_dir,'centrifuge.krona.html'))
        shutil.move(os.path.join(output_dir,outprefix+'.tree.svg'),os.path.join(report_dir,'centrifuge.tree.svg'))
        html_zipped = self.package_folder(report_dir, 'index.html', 'index.html')


        # Step 5 - Build a Report and return
        objects_created = []
        output_files = os.listdir(output_dir)
        output_files_list = []
        for output in output_files:
            output_files_list.append({'path': os.path.join(output_dir, output),
                                      'name': output
                                      })

        # not used
        output_html_files = [{'path': os.path.join(report_dir, 'index.html'),
                             'name': 'index.html'},
                             {'path': os.path.join(report_dir, 'centrifuge.krona.html'),
                             'name': 'centrifuge.krona.html'},
                             {'path': os.path.join(report_dir, 'centrifuge.datatable.html'),
                             'name': 'centrifuge.datatable.html'},
                             {'path': os.path.join(report_dir, 'centrifuge.tree.svg'),
                             'name': 'centrifuge.tree.svg'}
                            ]
        message = 'Centrifuge run finished on %s against %s.' % (','.join(fastq_files_name), params['db_type'])
        report_params = {'message': message,
                         'workspace_name': params.get('workspace_name'),
                         'objects_created': objects_created,
                         'file_links': output_files_list,
                         'html_links': [html_zipped],
                         'direct_html_link_index': 0,
                         'html_window_height': 480}

        # STEP 6: construct the output to send back
        kbase_report_client = KBaseReport(self.callback_url)
        report_info = kbase_report_client.create_extended_report(report_params)
        report_info['report_params'] = report_params        
        logging.info(report_info)

        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        #END run_centrifuge

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_centrifuge return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
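
    # Hedged usage sketch (added for illustration; the ref and database values
    # are assumptions, not taken from this module):
    _example_run_centrifuge_params = {
        'input_refs': ['12345/7/1'],     # reads library refs handed to ReadsUtils
        'db_type': 'p_compressed',       # subdirectory name under /data/centrifuge/ (assumed)
        'workspace_name': 'my_workspace',
    }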
Example #17
    def run_velvet(self, ctx, params):
        """
        Definition of run_velvet
        :param params: instance of type "VelvetParams" (Arguments for
           run_velvet string workspace_name - the name of the workspace from
           which to take input and store output. int hash_length - an odd
           integer (if even, it will be decremented) <= 31 string
           output_contigset_name - the name of the output contigset
           list<paired_end_lib> read_libraries - Illumina PairedEndLibrary
           files to assemble min_contig_length - integer to filter out
           contigs with length < min_contig_length from the Velvet output.
           Default value is 500 (where 0 implies no filter). @optional
           min_contig_length @optional cov_cutoff @optional ins_length
           @optional read_trkg @optional amos_file @optional exp_cov
           @optional long_cov_cutoff) -> structure: parameter
           "workspace_name" of String, parameter "hash_length" of Long,
           parameter "read_libraries" of list of type "read_lib" (The
           workspace object name of a SingleEndLibrary or PairedEndLibrary
           file, whether of the KBaseAssembly or KBaseFile type.), parameter
           "output_contigset_name" of String, parameter "min_contig_length"
           of Long, parameter "cov_cutoff" of Double, parameter "ins_length"
           of Long, parameter "read_trkg" of type "bool" (A boolean - 0 for
           false, 1 for true. @range (0, 1)), parameter "amos_file" of type
           "bool" (A boolean - 0 for false, 1 for true. @range (0, 1)),
           parameter "exp_cov" of Double, parameter "long_cov_cutoff" of
           Double
        :returns: instance of type "VelvetResults" (Output parameter items
           for run_velvet report_name - the name of the KBaseReport.Report
           workspace object. report_ref - the workspace reference of the
           report.) -> structure: parameter "report_name" of String,
           parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_velvet
        self.log('Running run_velvet with params:\n' + pformat(params))

        token = ctx['token']
        wsname = params[self.PARAM_IN_WS]
        self.process_params(params)
        input_reads_refs = params[self.PARAM_IN_LIB]

        # STEP 0: preprocess the reads in KBase way
        obj_ids = []
        for r in input_reads_refs:
            obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})
        ws = workspaceService(self.workspaceURL, token=token)
        ws_info = ws.get_object_info_new({'objects': obj_ids})
        reads_params = []

        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        readcli = ReadsUtils(self.callbackURL, token=token)

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = readcli.download_reads({'read_libraries':
                                            reads_params})['files']
        except ServerError as se:
            self.log('logging stacktrace from dynamic client error')
            self.log(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.SingleEndLibrary ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'KBaseFile.SingleEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        self.log('Got reads data from converter:\n' + pformat(reads))

        reads_data = []
        reads_name = ''
        for ref in input_reads_refs:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
            seq_tech = reads[ref]["sequencing_tech"]
            if f['type'] == 'interleaved':
                reads_data.append({
                    'fwd_file': f['fwd'],
                    'type': 'interleaved',
                    'seq_tech': seq_tech
                })
            elif f['type'] == 'paired':
                reads_data.append({
                    'fwd_file': f['fwd'],
                    'rev_file': f['rev'],
                    'type': 'paired',
                    'seq_tech': seq_tech
                })
            elif f['type'] == 'single':
                reads_data.append({
                    'fwd_file': f['fwd'],
                    'type': 'single',
                    'seq_tech': seq_tech
                })
            else:
                raise ValueError('Something is very wrong with read lib' +
                                 reads_name)

        # STEP 1: run velveth and velvetg sequentially
        velvet_out = self.exec_velvet(params, reads_data)
        #self.log('Velvet final return: ' + str(velvet_out))

        # STEP 2: parse the output and save back to KBase, creating the report at the same time
        if isinstance(velvet_out, str) and velvet_out != '':
            output_contigs = os.path.join(velvet_out, 'contigs.fa')
            min_contig_len = params.get(self.PARAM_IN_MIN_CONTIG_LENGTH, 0)
            if (os.path.isfile(output_contigs)
                    and os.path.getsize(output_contigs) == 0):
                self.log(
                    'Given the minimum contig length of {} bp, Velvet could not find any '
                    'contigs in the input reads library.'.format(
                        str(min_contig_len)))
                output = {'report_name': 'Empty contigs', 'report_ref': None}
            elif (os.path.isfile(output_contigs)
                  and os.path.getsize(output_contigs) > 0):
                self.log('Uploading FASTA file to Assembly')

                assemblyUtil = AssemblyUtil(self.callbackURL,
                                            token=ctx['token'],
                                            service_ver='release')

                if min_contig_len > 0:
                    assemblyUtil.save_assembly_from_fasta({
                        'file': {
                            'path': output_contigs
                        },
                        'workspace_name':
                        wsname,
                        'assembly_name':
                        params[self.PARAM_IN_CS_NAME],
                        'min_contig_length':
                        min_contig_len
                    })
                else:
                    assemblyUtil.save_assembly_from_fasta({
                        'file': {
                            'path': output_contigs
                        },
                        'workspace_name':
                        wsname,
                        'assembly_name':
                        params[self.PARAM_IN_CS_NAME]
                    })
                # generate report from contigs.fa
                report_name, report_ref = self.generate_report(
                    output_contigs, params, velvet_out, wsname)

                # STEP 3: construct the output to send back
                output = {'report_name': report_name, 'report_ref': report_ref}
            else:
                output = {
                    'report_name': 'Velvet found empty contig file',
                    'report_ref': None
                }
        else:
            output = {'report_name': 'Velvet aborted', 'report_ref': None}

        #END run_velvet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_velvet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
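
    # Hedged usage sketch (added for illustration; values are assumptions):
    _example_run_velvet_params = {
        'workspace_name': 'my_workspace',
        'hash_length': 29,                     # odd integer <= 31
        'read_libraries': ['my_pe_library'],   # PairedEndLibrary object name(s) or ref(s)
        'output_contigset_name': 'velvet.contigs',
        'min_contig_length': 500,              # default 500; 0 disables the filter
    }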
Example #18
class SPAdesUtils:
    """
    Define the SPAdesUtils functions
    """
    SPADES_VERSION = '3.13.0'
    SPADES_BIN = '/opt/SPAdes-' + SPADES_VERSION + '-Linux/bin'

    DISABLE_SPADES_OUTPUT = False  # should be False in production

    # Basic options
    PARAM_IN_SINGLE_CELL = 'single_cell'  # --sc
    PARAM_IN_METAGENOME = 'metagenomic'  # --meta
    PARAM_IN_PLASMID = 'plasmid'  # --plasmid
    PARAM_IN_RNA = 'rna'  # --rna
    PARAM_IN_IONTORRENT = 'iontorrent'  # --iontorrent

    # Pipeline options
    PARAM_IN_ONLY_ERROR_CORR = 'only-error-correction'  # --only-error-correction
    PARAM_IN_ONLY_ASSEMBLER = 'only-assembler'  # --only-assembler
    PARAM_IN_CAREFUL = 'careful'  # --careful
    PARAM_IN_CONTINUE = 'continue'  # --continue
    PARAM_IN_DISABLE_GZIP = 'disable-gzip-output'  # --disable-gzip-output

    # Input parameters
    PARAM_IN_WS = 'workspace_name'
    PARAM_IN_CS_NAME = 'output_contigset_name'
    PARAM_IN_READS = 'reads_libraries'
    PARAM_IN_LONG_READS = 'long_reads_libraries'
    PARAM_IN_KMER_SIZES = 'kmer_sizes'
    PARAM_IN_SKIP_ERR_CORRECT = 'skip_error_correction'
    PARAM_IN_MIN_CONTIG_LENGTH = 'min_contig_length'
    PARAM_IN_DNA_SOURCE = 'dna_source'
    PARAM_IN_PIPELINE_OPTION = 'pipeline_options'
    ASSEMBLE_RESULTS_DIR = 'assemble_results'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    THREADS_PER_CORE = 3
    MAX_THREADS = 64  # per email thread with Anton Korobeynikov
    MAX_THREADS_META = 128  # Increase threads for metagenomic assemblies
    MEMORY_OFFSET_GB = 1  # 1GB
    MIN_MEMORY_GB = 5
    MAX_MEMORY_GB_SPADES = 500
    MAX_MEMORY_GB_META_SPADES = 1000
    GB = 1000000000

    # private method definition
    def __init__(self, prj_dir, config):
        self.workspace_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        if 'shock-url' in config:
            self.shock_url = config['shock-url']
        if 'handle-service-url' in config:
            self.handle_url = config['handle-service-url']

        self.ws_client = Workspace(self.workspace_url, token=self.token)
        self.ru = ReadsUtils(self.callback_url, token=self.token, service_ver='release')
        self.au = AssemblyUtil(self.callback_url, token=self.token, service_ver='release')
        self.kbr = KBaseReport(self.callback_url)
        self.kbq = kb_quast(self.callback_url)
        self.proj_dir = prj_dir

        self.spades_version = 'SPAdes-' + os.environ['SPADES_VERSION']

    def _get_kbreads_info(self, wsname, reads_refs):
        """
        _get_kbreads_info: from a set of given KBase reads refs, fetches the corresponding
        reads info and FASTQ files, and returns a list of reads data dicts with the
        following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'type': reads_type,  # ('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file,  # only if paired end
        }
        """
        obj_ids = []
        for r in reads_refs:
            if r:
                obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})

        if not obj_ids:
            return []

        ws_info = self.ws_client.get_object_info_new({'objects': obj_ids})
        reads_params = []

        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = self.ru.download_reads({
                        'read_libraries': reads_params,
                        'interleaved': 'false'
                        })['files']
        except ServerError as se:
            log('logging stacktrace from dynamic client error')
            log(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.SingleEndLibrary ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'KBaseFile.SingleEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        # log('Downloaded reads data from KBase:\n' + pformat(reads))
        reads_data = []
        for ref in reads_refs:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
            seq_tech = reads[ref]['sequencing_tech']
            rds_info = {
                'fwd_file': f['fwd'],
                'reads_ref': ref,
                'type': f['type'],
                'seq_tech': seq_tech,
                'reads_name': reads_name
            }
            if f.get('rev', None):
                rds_info['rev_file'] = f['rev']
            reads_data.append(rds_info)

        return reads_data

    def _generate_output_file_list(self, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('start packing result files')

        output_files = list()

        output_directory = os.path.join(self.proj_dir, str(uuid.uuid4()))
        _mkdir_p(output_directory)
        spades_output = os.path.join(output_directory, 'spades_output.zip')
        self._zip_folder(out_dir, spades_output)

        output_files.append({'path': spades_output,
                             'name': os.path.basename(spades_output),
                             'label': os.path.basename(spades_output),
                             'description': 'Output file(s) generated by {}'.format(
                                 self.spades_version)})

        return output_files

    def _zip_folder(self, folder_path, output_path):
        """
        _zip_folder: Zip the contents of an entire folder (with that folder included
        in the archive).
        """
        with zipfile.ZipFile(output_path, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    # print "Adding {} to archive.".format(absolute_path)
                    ziph.write(absolute_path, relative_path)

        print("{} created successfully.".format(output_path))
        # with zipfile.ZipFile(output_path, "r") as f:
        #    print 'Checking the zipped file......\n'
        #    for info in f.infolist():
        #        print info.filename, info.date_time, info.file_size, info.compress_size

    def _load_stats(self, input_file_name):
        log('Starting conversion of FASTA to KBaseGenomeAnnotations.Assembly')
        log('Building Object.')
        if not os.path.isfile(input_file_name):
            raise Exception('The input file name {0} is not a file!'.format(input_file_name))
        with open(input_file_name, 'r') as input_file_handle:
            contig_id = None
            sequence_len = 0
            fasta_dict = dict()
            first_header_found = False
            # Pattern for replacing white space
            pattern = re.compile(r'\s+')
            for current_line in input_file_handle:
                if (current_line[0] == '>'):
                    # found a header line
                    # Wrap up previous fasta sequence
                    if not first_header_found:
                        first_header_found = True
                    else:
                        fasta_dict[contig_id] = sequence_len
                        sequence_len = 0
                    fasta_header = current_line.replace('>', '').strip()
                    try:
                        contig_id = fasta_header.strip().split(' ', 1)[0]
                    except (IndexError, ValueError, KeyError):
                        contig_id = fasta_header.strip()
                else:
                    sequence_len += len(re.sub(pattern, '', current_line))
        # wrap up last fasta sequence
        if not first_header_found:
            raise Exception("There are no contigs in this file")
        else:
            fasta_dict[contig_id] = sequence_len
        return fasta_dict
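
    # Illustration (added; a minimal sketch of what _load_stats() returns):
    # for a FASTA file containing
    #
    #   >contig_1 length=11
    #   ACGTACGTACG
    #   >contig_2
    #   ACGT
    #   ACGT
    #
    # it returns {'contig_1': 11, 'contig_2': 8} -- keys are the first word of
    # each header line and values are sequence lengths with whitespace removed.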

    def _parse_single_reads(self, reads_type, reads_list):
        """
        _parse_single_reads: given the reads_type and a list of reads, return an object
        defining the type and a list of fastq files.
        """
        single_reads_fqs = []
        ret_obj = {}
        if reads_list and isinstance(reads_list, list):
            for rds in reads_list:
                single_reads_fqs.append(rds['fwd_file'])
        if single_reads_fqs:
            ret_obj = {
                "type": reads_type,
                "single reads": single_reads_fqs
            }

        return ret_obj

    def _parse_pair_reads(self, reads_type, reads_list):
        """
        _parse_pair_reads: given the reads_type and a list of reads, return an object
        defining the type and a list of fastq files.
        """
        right_reads_fqs = []
        left_reads_fqs = []
        ret_obj = {}
        if reads_list and isinstance(reads_list, list):
            for rds in reads_list:
                right_reads_fqs.append(rds['fwd_file'])
                if rds.get('rev_file', None):
                    left_reads_fqs.append(rds['rev_file'])
            orent = reads_list[0]['orientation']

        if right_reads_fqs:
            ret_obj["right reads"] = right_reads_fqs
            ret_obj["orientation"] = orent
            ret_obj["type"] = reads_type
        if left_reads_fqs:
            ret_obj["left reads"] = left_reads_fqs

        return ret_obj
    # end of private methods

    # public method definitions

    def check_spades_params(self, params):
        """
        check_spades_params: checks params passed to run_HybridSPAdes method and set default values
        """
        # log('Start validating run_HybridSPAdes parameters:\n{}'.format(
        # json.dumps(params, indent=1)))

        # check for mandatory parameters
        if params.get(self.PARAM_IN_WS, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_WS))
        if self.INVALID_WS_NAME_RE.search(params[self.PARAM_IN_WS]):
            raise ValueError('Invalid workspace name: {}.'.format(params[self.PARAM_IN_WS]))

        if params.get(self.PARAM_IN_CS_NAME, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_CS_NAME))
        if self.INVALID_WS_OBJ_NAME_RE.search(params[self.PARAM_IN_CS_NAME]):
            raise ValueError('Invalid workspace object name: {}.'.format(
                params[self.PARAM_IN_CS_NAME]))

        if params.get(self.PARAM_IN_READS, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_READS))
        if not isinstance(params[self.PARAM_IN_READS], list):
            raise ValueError('Input reads {} must be a list.'.format(self.PARAM_IN_READS))
        if len(params[self.PARAM_IN_READS]) == 0:
            raise ValueError('Input parameter {} must contain at least one reads library.'.format(
                             self.PARAM_IN_READS))

        if self.PARAM_IN_MIN_CONTIG_LENGTH in params:
            if not isinstance(params[self.PARAM_IN_MIN_CONTIG_LENGTH], int):
                raise ValueError('{} must be of type int.'.format(self.PARAM_IN_MIN_CONTIG_LENGTH))

        if not params.get(self.PARAM_IN_KMER_SIZES, None):
            params[self.PARAM_IN_KMER_SIZES] = [21, 33, 55]
        kmer_sstr = ",".join(str(num) for num in params[self.PARAM_IN_KMER_SIZES])
        params[self.PARAM_IN_KMER_SIZES] = kmer_sstr
        print("KMER_SIZES: " + kmer_sstr)

        if params.get(self.PARAM_IN_SKIP_ERR_CORRECT, None):
            print("SKIP ERR CORRECTION: " + str(params[self.PARAM_IN_SKIP_ERR_CORRECT]))

        # check for basic option parameters
        if params.get(self.PARAM_IN_DNA_SOURCE, None):
            dna_src = params[self.PARAM_IN_DNA_SOURCE]
            if dna_src not in [self.PARAM_IN_SINGLE_CELL,
                               self.PARAM_IN_METAGENOME,
                               self.PARAM_IN_PLASMID,
                               self.PARAM_IN_RNA,
                               self.PARAM_IN_IONTORRENT]:
                params[self.PARAM_IN_DNA_SOURCE] = None
        else:
            params[self.PARAM_IN_DNA_SOURCE] = None

        # a list of basic options
        params['basic_options'] = ['-o', self.ASSEMBLE_RESULTS_DIR]
        dna_src = params.get(self.PARAM_IN_DNA_SOURCE)
        if dna_src == self.PARAM_IN_SINGLE_CELL:
            params['basic_options'].append('--sc')
        elif dna_src == self.PARAM_IN_METAGENOME:
            params['basic_options'].append('--meta')
        elif dna_src == self.PARAM_IN_PLASMID:
            params['basic_options'].append('--plasmid')
        elif dna_src == self.PARAM_IN_RNA:
            params['basic_options'].append('--rna')
        elif dna_src == self.PARAM_IN_IONTORRENT:
            params['basic_options'].append('--iontorrent')

        # processing pipeline option parameters
        if params.get(self.PARAM_IN_PIPELINE_OPTION, None):
            pipe_opts = params[self.PARAM_IN_PIPELINE_OPTION]
            opts = [self.PARAM_IN_ONLY_ERROR_CORR,
                    self.PARAM_IN_ONLY_ASSEMBLER,
                    self.PARAM_IN_CONTINUE,
                    self.PARAM_IN_DISABLE_GZIP,
                    self.PARAM_IN_CAREFUL]
            if any(elem in opts for elem in pipe_opts):
                pass
            else:
                params[self.PARAM_IN_PIPELINE_OPTION] = [self.PARAM_IN_CAREFUL]
        else:
            params[self.PARAM_IN_PIPELINE_OPTION] = [self.PARAM_IN_CAREFUL]

        if '--meta' in params['basic_options']:
            # you cannot specify --careful, --mismatch-correction
            # or --cov-cutoff in metagenomic mode, so drop them if present
            # (remove each independently so a missing one does not skip the rest)
            for opt in (self.PARAM_IN_CAREFUL, 'mismatch-correction', 'cov-cutoff'):
                if opt in params[self.PARAM_IN_PIPELINE_OPTION]:
                    params[self.PARAM_IN_PIPELINE_OPTION].remove(opt)

        if params.get('create_report', None) is None:
            params['create_report'] = 0

        return params
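
    # Hedged illustration (added; values are assumptions, not taken from this
    # module).  Given only the mandatory fields, check_spades_params() fills in
    # the defaults noted in the comments below:
    _example_hybrid_spades_params = {
        'workspace_name': 'my_workspace',
        'output_contigset_name': 'hybrid_spades.contigs',
        'reads_libraries': [{'lib_ref': '12345/7/1', 'lib_type': 'paired-end'}],
        # after validation the method also sets, e.g.:
        #   'kmer_sizes': '21,33,55'                  (list joined into a string)
        #   'basic_options': ['-o', 'assemble_results']
        #   'pipeline_options': ['careful']
        #   'create_report': 0
    }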

    def generate_report(self, fa_file_name, params, out_dir, wsname):
        """
        Generating and saving report
        """
        log('Generating and saving report')

        fa_file_with_path = os.path.join(out_dir, fa_file_name)
        fasta_stats = self._load_stats(fa_file_with_path)
        lengths = [fasta_stats[contig_id] for contig_id in fasta_stats]

        assembly_ref = wsname + '/' + params[self.PARAM_IN_CS_NAME]

        report_text = ''
        report_text += 'SPAdes results saved to: ' + wsname + '/' + out_dir + '\n'
        report_text += 'Assembly saved to: ' + assembly_ref + '\n'
        report_text += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report_text += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report_text += 'Contig Length Distribution (# of contigs -- min to max ' + 'basepairs):\n'
        for c in range(bins):
            report_text += ('   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' +
                            str(edges[c + 1]) + ' bp\n')
        print('Running QUAST')
        quastret = self.kbq.run_QUAST(
            {'files': [{'path': fa_file_with_path, 'label': params[self.PARAM_IN_CS_NAME]}]})

        output_files = self._generate_output_file_list(out_dir)

        print('Saving report')
        report_output = self.kbr.create_extended_report(
            {'message': report_text,
             'objects_created': [{'ref': assembly_ref, 'description': 'Assembled contigs'}],
             'direct_html_link_index': 0,
             'file_links': output_files,
             'html_links': [{'shock_id': quastret['shock_id'],
                             'name': 'report.html',
                             'label': 'QUAST report'}
                            ],
             'report_object_name': 'kb_spades_report_' + str(uuid.uuid4()),
             'workspace_name': params[self.PARAM_IN_WS]})

        return report_output['name'], report_output['ref']

    def get_hybrid_reads_info(self, input_params):
        """
        get_hybrid_reads_info: from a list of ReadsParams structures, fetches the corresponding
        reads info using each ReadsParams['lib_ref'].
        Returns an empty tuple or a tuple of nine lists of reads data, where each entry has
        the following structure:
        {
                'fwd_file': path_to_fastq_file,
                'orientation': (default value is "fr" (forward-reverse) for paired-end libraries
                                "rf" (reverse-forward) for mate-pair libraries), None for others
                'lib_type': ("paired-end", "mate-pairs", "hq-mate-pairs", "single", "pacbio",
                              "nanopore", "sanger", "trusted-contigs", "untrusted-contigs"),
                'type': reads_type, # 'interleaved', 'paired', or 'single'
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file  # only if paired end
        }
        OR:
        {
                'fwd_file': path_to_fastq_file,
                'long_reads_type': ("pacbio-ccs", "pacbio-clr", "nanopore", "sanger",
                                    "trusted-contigs", "untrusted-contigs"),
                'type': reads_type, # 'interleaved', 'paired', or 'single'
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience
        }
        """
        rds_params = copy.deepcopy(input_params)
        if rds_params.get(self.PARAM_IN_READS, None) is None:
            return ()  # an empty tuple

        wsname = rds_params[self.PARAM_IN_WS]

        sgl_rds_data = []  # single
        pe_rds_data = []   # paired-end
        mp_rds_data = []   # mate-pairs
        pb_ccs_data = []   # pacbio-ccs
        pb_clr_data = []   # pacbio-clr
        np_rds_data = []   # nanopore
        sgr_rds_data = []  # sanger
        tr_ctg_data = []   # trusted-contigs
        ut_ctg_data = []   # untrusted-contigs

        # a list of Illumina or IonTorrent paired-end/high-quality mate-pairs/unpaired reads
        rds_refs = []

        rds_libs = rds_params[self.PARAM_IN_READS]
        for rds_lib in rds_libs:
            if rds_lib.get('lib_ref', None):
                rds_refs.append(rds_lib['lib_ref'])
        kb_rds_data = self._get_kbreads_info(wsname, rds_refs)

        for rds_lib in rds_libs:
            for kb_d in kb_rds_data:
                if 'lib_ref' in rds_lib and rds_lib['lib_ref'] == kb_d['reads_ref']:
                    if rds_lib['lib_type'] == 'single':  # single end reads grouped params
                        kb_d['orientation'] = None
                        kb_d['lib_type'] = 'single'
                        sgl_rds_data.append(kb_d)
                    elif rds_lib['lib_type'] == 'paired-end':  # pairedEnd reads grouped params
                        kb_d['orientation'] = ('fr' if rds_lib.get('orientation', None) is None
                                               else rds_lib['orientation'])
                        kb_d['lib_type'] = 'paired-end'
                        pe_rds_data.append(kb_d)
                    elif rds_lib['lib_type'] == 'mate-pairs':
                        # mate-pairs reads grouped params
                        kb_d['orientation'] = ('rf' if rds_lib.get('orientation', None) is None
                                               else rds_lib['orientation'])
                        kb_d['lib_type'] = 'mate-pairs'
                        mp_rds_data.append(kb_d)

        # a list of PacBio (CCS or CLR), Oxford Nanopore, Sanger reads
        # and/or additional contigs
        long_rds_refs = []
        if rds_params.get(self.PARAM_IN_LONG_READS, None):
            long_rds_libs = rds_params[self.PARAM_IN_LONG_READS]
            for lrds_lib in long_rds_libs:
                if lrds_lib.get('long_reads_ref', None):
                    long_rds_refs.append(lrds_lib['long_reads_ref'])
            kb_lrds_data = self._get_kbreads_info(wsname, long_rds_refs)

            for lrds_lib in long_rds_libs:
                for kb_ld in kb_lrds_data:
                    if ('long_reads_ref' in lrds_lib and
                            lrds_lib['long_reads_ref'] == kb_ld['reads_ref']):
                        if lrds_lib['long_reads_type'] == 'pacbio-ccs':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            pb_ccs_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'pacbio-clr':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            pb_clr_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'nanopore':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            np_rds_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'sanger':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            sgr_rds_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'trusted-contigs':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            tr_ctg_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'untrusted-contigs':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            ut_ctg_data.append(kb_ld)

        return (sgl_rds_data, pe_rds_data, mp_rds_data, pb_ccs_data, pb_clr_data, np_rds_data,
                sgr_rds_data, tr_ctg_data, ut_ctg_data)

    def construct_yaml_dataset_file(self, sgl_libs=None, pe_libs=None, mp_libs=None,
                                    pb_ccs=None, pb_clr=None, np_libs=None,
                                    sgr_libs=None, tr_ctgs=None, ut_ctgs=None):
        """
        construct_yaml_dataset_file: Specifying input data with YAML data set file (advanced)
        An alternative way to specify an input data set for SPAdes is to create a YAML
        data set file.
        By using a YAML file you can provide an unlimited number of paired-end, mate-pair
        and unpaired libraries. Basically, a YAML data set file is a text file in which input
        libraries are provided as a comma-separated list in square brackets. Each library is
        provided in braces as a comma-separated list of attributes.

        The following attributes are available:

            - orientation ("fr", "rf", "ff")
            - type ("paired-end", "mate-pairs", "hq-mate-pairs", "single", "pacbio", "nanopore",
                "sanger", "trusted-contigs", "untrusted-contigs")
            - interlaced reads (comma-separated list of files with interlaced reads)
            - left reads (comma-separated list of files with left reads)
            - right reads (comma-separated list of files with right reads)
            - single reads (comma-separated list of files with single reads or unpaired reads from
                paired library)
            - merged reads (comma-separated list of files with merged reads)

        To properly specify a library you should provide its type and at least one file with reads.
        For ONT, PacBio, Sanger and contig libraries you can provide only single reads. Orientation
        is an optional attribute. Its default value is "fr" (forward-reverse) for paired-end
        libraries and "rf" (reverse-forward) for mate-pair libraries.

        The value for each attribute is given after a colon. Comma-separated lists of files should
        be given in square brackets.
        For each file you should provide its full path in double quotes. Make sure that files with
        right reads are given in the same order as corresponding files with left reads.

        For example, if you have one paired-end library split into two pairs of files:
            lib_pe1_left_1.fastq
            lib_pe1_right_1.fastq
            lib_pe1_left_2.fastq
            lib_pe1_right_2.fastq

        one mate-pair library:
            lib_mp1_left.fastq
            lib_mp1_right.fastq

        and PacBio CCS and CLR reads:
            pacbio_ccs.fastq
            pacbio_clr.fastq

        YAML file should look like this:
        ------------------------------------------------
        [
            {
                orientation: "fr",
                type: "paired-end",
                right reads: [
                "/FULL_PATH_TO_DATASET/lib_pe1_right_1.fastq",
                "/FULL_PATH_TO_DATASET/lib_pe1_right_2.fastq"
                ],
                left reads: [
                "/FULL_PATH_TO_DATASET/lib_pe1_left_1.fastq",
                "/FULL_PATH_TO_DATASET/lib_pe1_left_2.fastq"
                ]
            },
            {
                orientation: "rf",
                type: "mate-pairs",
                right reads: [
                "/FULL_PATH_TO_DATASET/lib_mp1_right.fastq"
                ],
                left reads: [
                "/FULL_PATH_TO_DATASET/lib_mp1_left.fastq"
                ]
            },
            {
                type: "single",
                single reads: [
                "/FULL_PATH_TO_DATASET/pacbio_ccs.fastq"
                ]
            },
            {
                type: "pacbio",
                single reads: [
                "/FULL_PATH_TO_DATASET/pacbio_clr.fastq"
                ]
            }
        ]
        ------------------------------------------------

        Once you have created a YAML file save it with .yaml extension (e.g. as my_data_set.yaml)
        and run SPAdes using the --dataset option:
        e.g., <SPAdes_bin_dir>/spades.py --dataset <your YAML file> -o spades_output

        """
        # STEP 1: get the working folder housing the .yaml file and the SPAdes results
        if not os.path.exists(self.proj_dir):
            os.makedirs(self.proj_dir)
        yaml_file_path = os.path.join(self.proj_dir, 'input_data_set.yaml')

        # STEP 2: construct and save the 'input_data_set.yaml' file
        # generate the object array
        input_data_set = []

        if pe_libs:
            pair_libs = self._parse_pair_reads('paired-end', pe_libs)
            if pair_libs:
                input_data_set.append(pair_libs)

        if mp_libs:
            pair_libs = self._parse_pair_reads('mate-pairs', mp_libs)
            if pair_libs:
                input_data_set.append(pair_libs)

        # for reads_type = 'single'
        if sgl_libs:
            single_libs = self._parse_single_reads("single", sgl_libs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'pacbio-ccs', treated as type of 'single'
        if pb_ccs:
            single_libs = self._parse_single_reads("single", pb_ccs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'pacbio-clr'
        if pb_clr:
            single_libs = self._parse_single_reads("pacbio", pb_clr)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'nanopore'
        if np_libs:
            single_libs = self._parse_single_reads("nanopore", np_libs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'sanger'
        if sgr_libs:
            single_libs = self._parse_single_reads("sanger", sgr_libs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'trusted-contigs'
        if tr_ctgs:
            single_libs = self._parse_single_reads("trusted-contigs", tr_ctgs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'untrusted-contigs'
        if ut_ctgs:
            single_libs = self._parse_single_reads("untrusted-contigs", ut_ctgs)
            if single_libs:
                input_data_set.append(single_libs)

        if input_data_set == []:
            print('Empty input data set!!')
            return ''

        pprint(input_data_set)
        try:
            with open(yaml_file_path, 'w') as yaml_file:
                json.dump(input_data_set, yaml_file)
        except IOError as ioerr:
            log('Creation of the {} file raised error:\n'.format(yaml_file_path))
            pprint(ioerr)
            return ''
        else:
            return yaml_file_path
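
    # Illustration (added; a hedged sketch): json.dump() is used above because
    # JSON is itself valid YAML, so SPAdes can still read the file via the
    # --dataset option.  For a single paired-end library the written
    # input_data_set.yaml would look roughly like
    #
    #   [{"right reads": ["/path/to/lib_pe1_fwd.fastq"],
    #     "orientation": "fr",
    #     "type": "paired-end",
    #     "left reads": ["/path/to/lib_pe1_rev.fastq"]}]
    #
    # (note that _parse_pair_reads() above places fwd_file under "right reads"
    # and rev_file under "left reads").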

    def run_assemble(self, yaml_file, kmer_sizes, dna_source=None,
                     basic_opts=None, pipeline_opts=['careful']):
        """
        run_assemble: run the SPAdes assemble with given input parameters/options
        """
        exit_code = 1
        if not os.path.isfile(yaml_file):
            log("The input data set yaml file DOES NOT exist at {}\n".format(yaml_file))
            return exit_code

        log("The input data set yaml file exists at {}\n".format(yaml_file))
        yf_dir, yf_nm = os.path.split(yaml_file)

        # SPAdes expects an integer number of gigabytes for --memory
        mem = int(psutil.virtual_memory().available / self.GB - self.MEMORY_OFFSET_GB)
        if mem < self.MIN_MEMORY_GB:
            raise ValueError(
                'Only ' + str(psutil.virtual_memory().available) +
                ' bytes of memory are available. The SPAdes wrapper will' +
                ' not run without at least ' +
                str(self.MIN_MEMORY_GB + self.MEMORY_OFFSET_GB) +
                ' gigabytes available')

        if dna_source and dna_source == self.PARAM_IN_METAGENOME:
            max_mem = self.MAX_MEMORY_GB_META_SPADES
            max_threads = self.MAX_THREADS_META
        else:
            max_mem = self.MAX_MEMORY_GB_SPADES
            max_threads = self.MAX_THREADS

        threads = min(max_threads, psutil.cpu_count() * self.THREADS_PER_CORE)

        if mem > max_mem:
            mem = max_mem

        tmpdir = os.path.join(self.proj_dir, 'spades_tmp_dir')
        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        a_cmd = [os.path.join(self.SPADES_BIN, 'spades.py')]
        a_cmd += ['--threads', str(threads), '--memory', str(mem)]
        a_cmd += ['--tmp-dir', tmpdir]
        a_cmd += ['--dataset', yaml_file]

        if kmer_sizes is not None:
            a_cmd += ['-k', kmer_sizes]

        if basic_opts is None:
            basic_opts = ['-o', self.ASSEMBLE_RESULTS_DIR]
        if isinstance(basic_opts, list):
            a_cmd += basic_opts

        if pipeline_opts and isinstance(pipeline_opts, list):
            for p_opt in pipeline_opts:
                if p_opt == self.PARAM_IN_CAREFUL:
                    a_cmd += ['--careful']
                if p_opt == self.PARAM_IN_ONLY_ERROR_CORR:
                    a_cmd += ['--only-error-correction']
                if p_opt == self.PARAM_IN_ONLY_ASSEMBLER:
                    a_cmd += ['--only-assembler']
                if p_opt == self.PARAM_IN_CONTINUE:
                    a_cmd += ['--continue']
                if p_opt == self.PARAM_IN_DISABLE_GZIP:
                    a_cmd += ['--disable-gzip-output']

        # Last check of command options before the call
        if '--meta' in a_cmd:
            # you cannot specify --careful, --mismatch-correction
            # or --cov-cutoff in metagenomic mode, so drop the actual command
            # flags if they were added (the list holds '--careful', not 'careful')
            for opt in ('--careful', '--mismatch-correction', '--cov-cutoff'):
                if opt in a_cmd:
                    a_cmd.remove(opt)

        log("**************The HybridSPAdes assembling command is:\n{}".format(' '.join(a_cmd)))
        assemble_out_dir = os.path.join(self.proj_dir, self.ASSEMBLE_RESULTS_DIR)
        if not os.path.exists(assemble_out_dir):
            os.makedirs(assemble_out_dir)

        p = subprocess.Popen(a_cmd, cwd=yf_dir, shell=False)
        exit_code = p.wait()
        log('Return code: ' + str(exit_code))

        if p.returncode != 0:
            raise ValueError('Error running spades.py, return code: ' + str(p.returncode) + '\n')
        else:
            exit_code = p.returncode
        return exit_code
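
    # Illustration (added; a hedged sketch of the command run_assemble() builds):
    # on a 16-core node with ~64 GB of RAM available and default options, the
    # logged command would look roughly like
    #
    #   /opt/SPAdes-3.13.0-Linux/bin/spades.py --threads 48 --memory 63 \
    #       --tmp-dir <proj_dir>/spades_tmp_dir \
    #       --dataset <proj_dir>/input_data_set.yaml \
    #       -k 21,33,55 -o assemble_results --careful
    #
    # since threads = min(MAX_THREADS, cpu_count * THREADS_PER_CORE) = min(64, 48)
    # and memory = available GB - MEMORY_OFFSET_GB, capped at MAX_MEMORY_GB_SPADES.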

    def save_assembly(self, fa_file_path, wsname, a_name, min_ctg_length=0):
        """
        save_assembly: save the assembly to KBase workspace
        """
        if os.path.isfile(fa_file_path):
            log('Uploading FASTA file to Assembly...')
            if min_ctg_length > 0:
                self.au.save_assembly_from_fasta(
                            {'file': {'path': fa_file_path},
                             'workspace_name': wsname,
                             'assembly_name': a_name,
                             'min_contig_length': min_ctg_length})
            else:
                self.au.save_assembly_from_fasta(
                            {'file': {'path': fa_file_path},
                             'workspace_name': wsname,
                             'assembly_name': a_name})
        else:
            log("The resulting sequence file {} is not found.".format(fa_file_path))
Example #19
    def run_metaphlan2(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_metaphlan2

        # Check parameters
        logging.info(f'params {params}')
        # Check for presence of input file types in params
        input_genomes = 'input_genomes' in params and len(
            params['input_genomes']
        ) > 0 and None not in params['input_genomes']
        input_refs = 'input_ref' in params and len(
            params['input_ref']) > 0 and None not in params['input_ref']

        # for name in ['workspace_name', 'db_type']:
        #     if name not in params:
        #         raise ValueError(
        #             'Parameter "' + name + '" is required but missing')
        if not input_genomes and not input_refs:
            raise ValueError(
                'You must enter either an input genome or input reads')

        if input_refs and input_genomes:
            raise ValueError(
                'You must enter either an input genome or input reads, '
                'but not both')

        if input_genomes and (not isinstance(params['input_genomes'][0], str)):
            raise ValueError('Pass in a valid input genome string')

        if input_refs and (not isinstance(params['input_ref'], list)
                           or not len(params['input_ref'])):
            raise ValueError('Pass in a list of input references')

        # Start with the base cmd and add parameters based on user input
        cmd = [
            'metaphlan2.py', '--bowtie2db', '/data/metaphlan2/mpa_v20_m200',
            '--mpa_pkl', '/data/metaphlan2/mpa_v20_m200.pkl'
        ]

        if input_genomes:
            assembly_util = AssemblyUtil(self.callback_url)
            fasta_file_obj = assembly_util.get_assembly_as_fasta(
                {'ref': params['input_genomes'][0]})
            logging.info(fasta_file_obj)
            fasta_file = fasta_file_obj['path']

            cmd.extend(['--input_type', 'fasta', fasta_file])

        if input_refs:
            logging.info('Downloading Reads data as a Fastq file.')
            logging.info(f"Input parameters {params.items()}")
            readsUtil = ReadsUtils(self.callback_url)
            download_reads_output = readsUtil.download_reads(
                {'read_libraries': params['input_ref']})
            print(
                f"Input refs {params['input_ref']} download_reads_output {download_reads_output}"
            )
            fastq_files = []
            fastq_files_name = []
            for key, val in download_reads_output['files'].items():
                if 'fwd' in val['files'] and val['files']['fwd']:
                    fastq_files.append(val['files']['fwd'])
                    fastq_files_name.append(val['files']['fwd_name'])
                if 'rev' in val['files'] and val['files']['rev']:
                    fastq_files.append(val['files']['rev'])
                    fastq_files_name.append(val['files']['rev_name'])
            logging.info(f"fastq files {fastq_files}")
            fastq_files_string = ' '.join(fastq_files)
            cmd.extend(['--input_type', 'fastq', fastq_files_string])

        output_dir = os.path.join(self.scratch, 'metaphlan2_output')

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # insert options into the second-to-last position, before the input file(s)
        if params['min_alignment_len'] > 0:
            cmd.insert(-1, '--min_alignment_len')
            cmd.insert(-1, str(params['min_alignment_len']))
        if params['ignore_viruses'] == 1:
            cmd.insert(-1, '--ignore_viruses')
        if params['ignore_bacteria'] == 1:
            cmd.insert(-1, '--ignore_bacteria')
        if params['ignore_eukaryotes'] == 1:
            cmd.insert(-1, '--ignore_eukaryotes')
        if params['ignore_archaea'] == 1:
            cmd.insert(-1, '--ignore_archaea')
        cmd.insert(-1, '--stat_q')
        cmd.insert(-1, str(params['stat_q']))
        cmd.insert(-1, '--min_cu_len')
        cmd.insert(-1, str(params['min_cu_len']))

        # append output file
        cmd.extend(['--bowtie2out', os.path.join(output_dir, 'report.txt')])
        cmd00 = ["ls", '-la', '/data/metaphlan2/']
        logging.info(f'cmd00 {cmd00}')
        pls = subprocess.Popen(cmd00,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
        logging.info(f'subprocess {pls.communicate()}')

        # run pipeline
        logging.info(f'cmd {" ".join(cmd)}')
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        logging.info(f'subprocess {p.communicate()}')

        cmd = [
            '/kb/module/lib/metaphlan2/src/accessories.sh',
            os.path.join(output_dir, 'report.txt'), output_dir, 'metaphlan2'
        ]
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        logging.info(f'subprocess {p.communicate()}')

        # get output file and convert to format for report
        # logging.info(f"params['input_ref'] {params['input_ref']}")
        report_df = pd.read_csv(os.path.join(output_dir, 'report.txt'),
                                sep='\t')
        taxa_list = [
            'kingdom', 'phylum', 'class', 'order', 'family', 'genus',
            'species', 'strain', 'unclassified'
        ]
        abbrev_list = ['k', 'p', 'c', 'o', 'f', 'g', 's', 't', 'unclassified']

        for taxa in taxa_list:
            report_df[taxa] = None
        tax_dict = dict(zip(abbrev_list, taxa_list))

        # split the clade string on '|' and '__' to get each taxonomic level and its name
        report_df['taxonomy'] = report_df['#SampleID'].apply(
            lambda x: x.split('|')).apply(lambda x: [y.split('__') for y in x])
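        # Illustration (assumed example values, not actual output): a clade string such as
        # 'k__Bacteria|p__Firmicutes|g__Bacillus' becomes
        # [['k', 'Bacteria'], ['p', 'Firmicutes'], ['g', 'Bacillus']],
        # while an 'unclassified' entry yields [['unclassified']], which the
        # loop below handles via the IndexError branch.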

        for idx, row in report_df.iterrows():
            for col in row['taxonomy']:
                try:
                    report_df.loc[idx, tax_dict[col[0]]] = col[1]
                except IndexError:
                    report_df.loc[idx, tax_dict[col[0]]] = col[0]

        report_df.drop(['taxonomy', '#SampleID'], axis=1, inplace=True)

        report_html_file = os.path.join(output_dir, 'report.html')
        self._generate_report_table(report_df, report_html_file, output_dir)
        # report_df.to_html(report_html_file, classes='Metaphlan2_report',
        #                   index=False)
        html_zipped = self.package_folder(output_dir, 'report.html', 'report')

        # Step 5 - Build a Report and return
        objects_created = []
        output_files = os.listdir(output_dir)
        output_files_list = []
        for output in output_files:
            if not os.path.isdir(os.path.join(output_dir, output)):
                output_files_list.append({
                    'path': os.path.join(output_dir, output),
                    'name': output
                })
        message = "MetaPhlAn2 run finished."
        report_params = {
            'message': message,
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'file_links': output_files_list,
            'html_links': [html_zipped],
            'direct_html_link_index': 0,
            'html_window_height': 460
        }
        kbase_report_client = KBaseReport(self.callback_url)
        report_output = kbase_report_client.create_extended_report(
            report_params)
        report_output['report_params'] = report_params
        logging.info(report_output)
        # Return references which will allow inline display of
        # the report in the Narrative
        output = {
            'report_name': report_output['name'],
            'report_ref': report_output['ref'],
            'report_params': report_output['report_params']
        }
        #END run_metaphlan2

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_metaphlan2 return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #20
    def run_SPAdes(self, ctx, params):
        """
        Run SPAdes on paired end libraries
        :param params: instance of type "SPAdesParams" (Input parameters for
           running SPAdes. workspace_name - the name of the workspace from
           which to take input and store output. output_contigset_name - the
           name of the output contigset read_libraries - a list of Illumina
           PairedEndLibrary files in FASTQ or BAM format. dna_source -
           (optional) the source of the DNA used for sequencing
           'single_cell': DNA amplified from a single cell via MDA anything
           else: Standard DNA sample from multiple cells. Default value is
           None. min_contig_length - (optional) integer to filter out contigs
           with length < min_contig_length from the SPAdes output. Default
           value is 0 implying no filter. kmer_sizes - (optional) K-mer
           sizes, Default values: 33, 55, 77, 99, 127 (all values must be
           odd, less than 128 and listed in ascending order) In the absence
           of these values, K values are automatically selected.
           skip_error_correction - (optional) Assembly only (No error
           correction). By default this is disabled.) -> structure: parameter
           "workspace_name" of String, parameter "output_contigset_name" of
           String, parameter "read_libraries" of list of type
           "paired_end_lib" (The workspace object name of a PairedEndLibrary
           file, whether of the KBaseAssembly or KBaseFile type.), parameter
           "dna_source" of String, parameter "min_contig_length" of Long,
           parameter "kmer_sizes" of list of Long, parameter
           "skip_error_correction" of type "bool" (A boolean. 0 = false,
           anything else = true.)
        :returns: instance of type "SPAdesOutput" (Output parameters for
           SPAdes run. report_name - the name of the KBaseReport.Report
           workspace object. report_ref - the workspace reference of the
           report.) -> structure: parameter "report_name" of String,
           parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_SPAdes

        # A whole lot of this is adapted or outright copied from
        # https://github.com/msneddon/MEGAHIT
        self.log('Running run_SPAdes with params:\n' + pformat(params))

        token = ctx['token']

        # the reads should really be specified as a list of absolute ws refs
        # but the narrative doesn't do that yet
        self.process_params(params)

        # get absolute refs from ws
        wsname = params[self.PARAM_IN_WS]
        obj_ids = []
        for r in params[self.PARAM_IN_LIB]:
            obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})
        ws = Workspace(self.workspaceURL, token=token)
        ws_info = ws.get_object_info_new({'objects': obj_ids})
        reads_params = []

        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        readcli = ReadsUtils(self.callbackURL, token=ctx['token'])

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = readcli.download_reads({'read_libraries': reads_params,
                                            'interleaved': 'false',
                                            'gzipped': None
                                            })['files']
        except ServerError as se:
            self.log('logging stacktrace from dynamic client error')
            self.log(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        self.log('Got reads data from converter:\n' + pformat(reads))

        phred_type = self.check_reads(params, reads, reftoname)

        reads_data = []
        for ref in reads:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
#            print ("REF:" + str(ref))
#            print ("READS REF:" + str(reads[ref]))
            seq_tech = reads[ref]["sequencing_tech"]
            if f['type'] == 'interleaved':
                reads_data.append({'fwd_file': f['fwd'], 'type': 'paired',
                                   'seq_tech': seq_tech})
            elif f['type'] == 'paired':
                reads_data.append({'fwd_file': f['fwd'], 'rev_file': f['rev'],
                                   'type': 'paired', 'seq_tech': seq_tech})
            elif f['type'] == 'single':
                reads_data.append({'fwd_file': f['fwd'], 'type': 'single',
                                   'seq_tech': seq_tech})
            else:
                raise ValueError('Something is very wrong with read lib ' + reads_name)

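        # Illustration (assumed paths and values): a paired library contributes an entry like
        # {'fwd_file': '/scratch/lib1.fwd.fq', 'rev_file': '/scratch/lib1.rev.fq',
        #  'type': 'paired', 'seq_tech': 'Illumina'}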
        kmer_sizes = None
        if self.PARAM_IN_KMER_SIZES in params and params[self.PARAM_IN_KMER_SIZES] is not None:
            if (len(params[self.PARAM_IN_KMER_SIZES])) > 0:
                kmer_sizes = ",".join(str(num) for num in params[self.PARAM_IN_KMER_SIZES])

        skip_error_correction = 0
        if self.PARAM_IN_SKIP_ERR_CORRECT in params and params[self.PARAM_IN_SKIP_ERR_CORRECT] is not None:
            if params[self.PARAM_IN_SKIP_ERR_CORRECT] == 1:
                skip_error_correction = 1

        spades_out = self.exec_spades(params[self.PARAM_IN_DNA_SOURCE],
                                      reads_data,
                                      phred_type,
                                      kmer_sizes,
                                      skip_error_correction)

        self.log('SPAdes output dir: ' + spades_out)

        # parse the output and save back to KBase
        output_contigs = os.path.join(spades_out, 'scaffolds.fasta')

        self.log('Uploading FASTA file to Assembly')

        assemblyUtil = AssemblyUtil(self.callbackURL, token=ctx['token'], service_ver='release')

        if params.get('min_contig_length', 0) > 0:
            assemblyUtil.save_assembly_from_fasta(
                {'file': {'path': output_contigs},
                 'workspace_name': wsname,
                 'assembly_name': params[self.PARAM_IN_CS_NAME],
                 'min_contig_length': params['min_contig_length']
                 })
            # load report from scaffolds.fasta.filtered.fa
            report_name, report_ref = self.load_report(
                output_contigs + '.filtered.fa', params, wsname)
        else:
            assemblyUtil.save_assembly_from_fasta(
                {'file': {'path': output_contigs},
                 'workspace_name': wsname,
                 'assembly_name': params[self.PARAM_IN_CS_NAME]
                 })
            # load report from scaffolds.fasta
            report_name, report_ref = self.load_report(
                output_contigs, params, wsname)

        output = {'report_name': report_name,
                  'report_ref': report_ref
                  }
        #END run_SPAdes

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_SPAdes return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #21
class MetabolicUtil():
    '''
    Utilities for running METABOLIC
    '''
    def __init__(self, config, callback_url, workspace_id, cpus):
        self.shared_folder = config['scratch']
        self.callback_url = callback_url
        self.cpus = cpus
        self.ru = ReadsUtils(self.callback_url)

        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)

    def stage_reads_list_file(self, reads_list):
        """
        stage_reads_list_file: download fastq file associated to reads to scratch area
                          and return result_file_path
        """

        logging.info('Processing reads object list: {}'.format(reads_list))

        result_file_path = []
        read_type = []

        # getting from workspace and writing to scratch. The 'reads' dictionary now has file paths to scratch.
        reads = self.ru.download_reads({
            'read_libraries': reads_list,
            'interleaved': None
        })['files']

        # reads_list is the list of workspace references (e.g. 12804/1/1).
        # "reads" is a dict of dicts keyed by that reference (read_obj below);
        # each value has a "files" dict whose keys include "fwd", "rev" and "type".
        for read_obj in reads_list:
            files = reads[read_obj][
                'files']  # 'files' is dictionary where 'fwd' is key of file path on scratch.
            result_file_path.append(files['fwd'])
            read_type.append(files['type'])
            if 'rev' in files and files['rev'] is not None:
                result_file_path.append(files['rev'])

        return result_file_path, read_type

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """
        os.chdir(self.shared_folder)
        logging.info('Start executing command:\n{}'.format(command))
        logging.info('Command is running from:\n{}'.format(self.shared_folder))
        pipe = subprocess.Popen(command,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                shell=True)
        output, stderr = pipe.communicate()
        exitCode = pipe.returncode

        if exitCode == 0:
            logging.info('Executed command:\n{}\n'.format(command) +
                         'Exit Code: {}\n'.format(exitCode))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nStderr:\n{}'.format(
                exitCode, output, stderr)
            raise ValueError(error_msg)
        return (output, stderr)

    def deinterlace_raw_reads(self, fastq):
        fastq_forward = fastq.split('.fastq')[0] + "_forward.fastq"
        fastq_reverse = fastq.split('.fastq')[0] + "_reverse.fastq"
        command = 'reformat.sh in={} out1={} out2={} overwrite=true'.format(
            fastq, fastq_forward, fastq_reverse)
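        # Illustration (assumed path): for '/scratch/lib1.inter.fastq' this builds the
        # single command line (wrapped here for readability)
        #   reformat.sh in=/scratch/lib1.inter.fastq out1=/scratch/lib1.inter_forward.fastq
        #       out2=/scratch/lib1.inter_reverse.fastq overwrite=true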
        self._run_command(command)
        return (fastq_forward, fastq_reverse)

    def make_metabolic_reads_file_input(self, params):
        """
            This function runs the selected read mapper and creates the
            sorted and indexed bam files from sam files using samtools.
        """

        reads_list = params['reads_list']

        (read_scratch_path, read_type) = self.stage_reads_list_file(reads_list)

        omic_reads_parameter_file = os.path.abspath(
            self.shared_folder) + '/omic_reads_parameters.txt'
        with open(omic_reads_parameter_file, 'w+') as f:
            f.write("#Reads pair name with complete pathway: \n")

            for i in range(len(read_scratch_path)):
                fastq = read_scratch_path[i]
                fastq_type = read_type[i]

                if fastq_type == 'interleaved':  # TODO: needs tests for interleaved input
                    logging.info("Running interleaved read mapping mode")
                    (fastq_forward,
                     fastq_reverse) = self.deinterlace_raw_reads(fastq)
                    # one library per line so multiple entries do not run together
                    f.write(fastq_forward + ',' + fastq_reverse + '\n')
                else:  # running read mapping in single-end mode
                    logging.info("Running unpaired read mapping mode")
                    f.write(fastq + '\n')
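        # Illustration (assumed paths) of the resulting parameter file contents:
        #   #Reads pair name with complete pathway:
        #   /scratch/lib1_forward.fastq,/scratch/lib1_reverse.fastq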
        return omic_reads_parameter_file

    def run_metabolic_without_reads(self, params):
        '''
        Run the METABOLIC-G workflow (not using raw reads)
        '''
        out_dir = os.path.join(self.shared_folder, "output")
        metabolic_cmd = " ".join([
            "perl", "/kb/module/bin/METABOLIC/METABOLIC-G.pl", "-in-gn",
            self.shared_folder, "-t",
            str(self.cpus), "-m-cutoff",
            str(params['kegg_module_cutoff']), "-p", params['prodigal_method'],
            "-o", out_dir, "-m", "/data/METABOLIC"
        ])
        logging.info("Starting Command:\n" + metabolic_cmd)
        output = subprocess.check_output(metabolic_cmd,
                                         shell=True).decode('utf-8')
        logging.info(output)

        # self._process_output_files(out_dir)
        return output

    def run_metabolic_with_reads(self, params):
        '''
        Run the METABOLIC-C workflow (using raw reads)
        '''

        out_dir = os.path.join(self.shared_folder, "output")
        omic_reads_parameter_file = self.make_metabolic_reads_file_input(
            params)
        metabolic_cmd = " ".join([
            "perl", "/kb/module/bin/METABOLIC/METABOLIC-C.pl", "-in-gn",
            self.shared_folder, "-t",
            str(self.cpus), "-m-cutoff", params['kegg_module_cutoff'], "-p",
            params['prodigal_method'], "-o", out_dir, "-m", "/data/METABOLIC",
            "-r", omic_reads_parameter_file
        ])
        logging.info("Starting Command:\n" + metabolic_cmd)
        output = subprocess.check_output(metabolic_cmd,
                                         shell=True).decode('utf-8')
        logging.info(output)

        # self._process_output_files(out_dir)
        return output
Example #22
    def run_kraken2(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kraken2

        # Download input data as FASTA or FASTQ
        logging.info('Calling run_kraken2')
        logging.info(f'params {params}')
        # Check for presence of input file types in params
        input_genomes = 'input_genomes' in params and len(
            params['input_genomes']
        ) > 0 and None not in params['input_genomes']
        input_refs = 'input_refs' in params and len(
            params['input_refs']) > 0 and None not in params['input_refs']
        input_paired_refs = 'input_paired_refs' in params and len(
            params['input_paired_refs']
        ) > 0 and None not in params['input_paired_refs']
        for name in ['workspace_name', 'db_type']:
            if name not in params:
                raise ValueError('Parameter "' + name +
                                 '" is required but missing')
        if not input_genomes and not input_refs and not input_paired_refs:
            raise ValueError(
                'You must enter either an input genome or input reads')

        if input_refs and input_paired_refs:
            raise ValueError(
                'You must enter either single-end or paired-end reads, '
                'but not both')

        if input_genomes and (input_refs or input_paired_refs):
            raise ValueError(
                'You must enter either an input genome or input reads, '
                'but not both')

        if input_genomes and (not isinstance(params['input_genomes'][0], str)):
            raise ValueError('Pass in a valid input genome string')

        if input_refs and (not isinstance(params['input_refs'], list)):
            raise ValueError('Pass in a list of input references')

        if input_paired_refs and (not isinstance(params['input_paired_refs'],
                                                 list)):
            raise ValueError('Pass in a list of input references')

        logging.info(params['db_type'])
        logging.info(
            f'input_genomes {input_genomes} input_refs {input_refs} input_paired_refs {input_paired_refs}'
        )
        input_string = []
        if input_genomes:
            assembly_util = AssemblyUtil(self.callback_url)
            fasta_file_obj = assembly_util.get_assembly_as_fasta(
                {'ref': params['input_genomes'][0]})
            logging.info(fasta_file_obj)
            fasta_file = fasta_file_obj['path']
            input_string.append(fasta_file)

        if input_refs:
            logging.info('Downloading Reads data as a Fastq file.')
            logging.info(f"input_refs {params['input_refs']}")
            readsUtil = ReadsUtils(self.callback_url)
            download_reads_output = readsUtil.download_reads(
                {'read_libraries': params['input_refs']})
            print(
                f"Input parameters {params['input_refs']}, {params['db_type']}"
                f"download_reads_output {download_reads_output}")
            fastq_files = []
            fastq_files_name = []
            for key, val in download_reads_output['files'].items():
                if 'fwd' in val['files'] and val['files']['fwd']:
                    fastq_files.append(val['files']['fwd'])
                    fastq_files_name.append(val['files']['fwd_name'])
                if 'rev' in val['files'] and val['files']['rev']:
                    fastq_files.append(val['files']['rev'])
                    fastq_files_name.append(val['files']['rev_name'])
            logging.info(f"fastq files {fastq_files}")
            input_string.append(' '.join(fastq_files))

        if input_paired_refs:
            logging.info('Downloading Reads data as a Fastq file.')
            logging.info(f"input_refs {params['input_paired_refs']}")
            readsUtil = ReadsUtils(self.callback_url)
            download_reads_output = readsUtil.download_reads(
                {'read_libraries': params['input_paired_refs']})
            print(
                f"Input parameters {params['input_paired_refs']}, {params['db_type']}"
                f"download_reads_output {download_reads_output}")
            fastq_files = []
            fastq_files_name = []
            # input_string.append('--paired')
            for key, val in download_reads_output['files'].items():
                if 'fwd' in val['files'] and val['files']['fwd']:
                    fastq_files.append(val['files']['fwd'])
                    fastq_files_name.append(val['files']['fwd_name'])
                if 'rev' in val['files'] and val['files']['rev']:
                    fastq_files.append(val['files']['rev'])
                    fastq_files_name.append(val['files']['rev_name'])
            # if len(fastq_files) % 2 != 0:
            #     raise ValueError('There must be an even number of Paired-end reads files')
            logging.info(f"fastq files {fastq_files}")
            input_string.extend(fastq_files)

        logging.info(f'input_string {input_string}')

        output_dir = os.path.join(self.shared_folder, 'kraken2_output')
        report_file_name = 'report.txt'
        report_file = os.path.join(output_dir, report_file_name)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        outprefix = "kraken2"

        cmd = [
            '/kb/module/lib/kraken2/src/kraken2.sh', '-d',
            '/data/kraken2/' + params['db_type'], '-o', output_dir, '-p',
            outprefix, '-t', '1', '-i'
        ]
        cmd.extend(input_string)
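        # Illustration (assumed db_type and file names): with db_type 'standard' and one
        # paired-end library, the resulting command is roughly
        #   /kb/module/lib/kraken2/src/kraken2.sh -d /data/kraken2/standard \
        #       -o <output_dir> -p kraken2 -t 1 -i lib1.fwd.fastq lib1.rev.fastq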

        # cmd = ['kraken2', '--db', '/data/kraken2/' + params['db_type'],
        #        '--output', output_dir, '--report', report_file,
        #        '--threads', '1']
        # cmd.extend(['--confidence', str(params['confidence'])]) if 'confidence' in params else cmd

        logging.info(f'cmd {cmd}')
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        logging.info(f'subprocess {p.communicate()}')

        summary_file = os.path.join(output_dir, outprefix + '.report.csv')
        report_dir = os.path.join(output_dir, 'html_report')
        if not os.path.exists(report_dir):
            os.makedirs(report_dir)
        summary_file_dt = os.path.join(report_dir, 'kraken2.datatable.html')
        self._generate_DataTable(summary_file, summary_file_dt)
        shutil.copy2('/kb/module/lib/kraken2/src/index.html',
                     os.path.join(report_dir, 'index.html'))
        shutil.copy2(os.path.join(output_dir, outprefix + '.krona.html'),
                     os.path.join(report_dir, 'kraken2.krona.html'))
        shutil.move(os.path.join(output_dir, outprefix + '.tree.svg'),
                    os.path.join(report_dir, 'kraken2.tree.svg'))
        html_zipped = self.package_folder(report_dir, 'index.html',
                                          'index.html')

        # columns = [
        #     'Percentage of fragments covered by the clade rooted at this taxon',
        #     'Number of fragments covered by the clade rooted at this taxon',
        #     'Number of fragments assigned directly to this taxon', 'rank code',
        #     'taxid', 'name']
        # report_df = pd.read_csv(report_file, sep='\t',
        #                         header=None, names=columns)
        # code_dict = {'U': 'Unclassified', 'R': 'Root', 'D': 'Domain',
        #              'K': 'Kingdom', 'P': 'Phylum', 'C': 'Class', 'O': 'Order',
        #              'F': 'Family', 'G': 'Genus', 'S': 'Species'}
        # report_df['rank code'] = report_df['rank code'].apply(
        #     lambda x: code_dict[x[0]] + x[1] if len(x) > 1 else code_dict[x])

        # self._generate_report_table(report_df, report_html_file, output_dir)
        # report_df.to_html(report_html_file, classes='Kraken2_report', index=False)
        # html_zipped = self.package_folder(output_dir, 'report.html',
        #                                   'report')
        # Step 5 - Build a Report and return
        objects_created = []
        output_files = os.listdir(output_dir)
        output_files_list = []
        for output in output_files:
            if not os.path.isdir(os.path.join(output_dir, output)):
                output_files_list.append({
                    'path': os.path.join(output_dir, output),
                    'name': output
                })
        message = f"Kraken2 run finished on {input_string} against {params['db_type']}."
        report_params = {
            'message': message,
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'file_links': output_files_list,
            'html_links': [html_zipped],
            'direct_html_link_index': 0,
            'html_window_height': 460
        }

        # STEP 6: construct the output to send back
        kbase_report_client = KBaseReport(self.callback_url)
        report_output = kbase_report_client.create_extended_report(
            report_params)
        report_output['report_params'] = report_params
        logging.info(report_output)
        # Return references which will allow inline display of
        # the report in the Narrative
        output = {
            'report_name': report_output['name'],
            'report_ref': report_output['ref'],
            'report_params': report_output['report_params']
        }
        #END run_kraken2

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kraken2 return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #23
    def runFastQC(self, ctx, input_params):
        """
        :param input_params: instance of type "FastQCParams" -> structure:
           parameter "input_ws" of String, parameter "input_file" of String,
           parameter "input_file_ref" of String
        :returns: instance of type "FastQCOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: reported_output
        #BEGIN runFastQC

        token = ctx['token']
        wsClient = workspaceService(self.workspaceURL, token=token)
        uuid_string = str(uuid.uuid4())
        read_file_path = self.scratch + "/" + uuid_string
        os.mkdir(read_file_path)

        input_file_ref = self._get_input_file_ref_from_params(input_params)

        library = None
        try:
            library = wsClient.get_objects2(
                {'objects': [{
                    'ref': input_file_ref
                }]})['data'][0]
        except Exception as e:
            raise ValueError(
                'Unable to get read library object from workspace: (' +
                input_file_ref + ')' + str(e))

        download_read_params = {'read_libraries': [], 'interleaved': "false"}
        if ("SingleEnd" in library['info'][2]
                or "PairedEnd" in library['info'][2]):
            download_read_params['read_libraries'].append(library['info'][7] +
                                                          "/" +
                                                          library['info'][1])
        elif ("SampleSet" in library['info'][2]):
            for sample_id in library['data']['sample_ids']:
                if ("/" in sample_id):
                    download_read_params['read_libraries'].append(sample_id)
                else:
                    if (sample_id.isdigit()):
                        download_read_params['read_libraries'].append(
                            library['info'][6] + "/" + sample_id)
                    else:
                        download_read_params['read_libraries'].append(
                            library['info'][7] + "/" + sample_id)

        ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
        ret = ru.download_reads(download_read_params)

        read_file_list = list()
        for file in ret['files']:

            obj_info = self.dfu.get_objects({'object_refs':
                                             [file]})['data'][0]['info']
            obj_name = obj_info[1]
            obj_ref_suffix = '_' + str(obj_info[6]) + '_' + str(
                obj_info[0]) + '_' + str(obj_info[4])

            files = ret['files'][file]['files']

            fwd_name = files['fwd'].split('/')[-1]
            fwd_name = fwd_name.replace('.gz', '')
            # using object_name + ref_suffix + suffix as file name
            fwd_name = obj_name + obj_ref_suffix + '.' + fwd_name.split(
                '.', 1)[-1]
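            # Illustration (assumed values): 'reads.fastq.gz' from object 'my_lib'
            # (workspace id 123, object id 4, version 1) becomes 'my_lib_123_4_1.fastq'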
            shutil.move(files['fwd'], os.path.join(read_file_path, fwd_name))
            read_file_list.append(os.path.join(read_file_path, fwd_name))

            if (files['rev'] is not None):
                rev_name = files['rev'].split('/')[-1]
                rev_name = rev_name.replace('.gz', '')
                rev_name = obj_name + obj_ref_suffix + '.' + rev_name.split(
                    '.', 1)[-1]
                shutil.move(files['rev'], os.path.join(read_file_path,
                                                       rev_name))
                read_file_list.append(os.path.join(read_file_path, rev_name))

        subprocess.check_output(["fastqc"] + read_file_list)
        # report = "Command run: "+" ".join(["fastqc"]+read_file_list)

        output = self.create_report(token, input_params['input_ws'],
                                    uuid_string, read_file_path)
        reported_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        # Remove temp reads directory
        shutil.rmtree(read_file_path, ignore_errors=True)

        #END runFastQC

        # At some point might do deeper type checking...
        if not isinstance(reported_output, dict):
            raise ValueError('Method runFastQC return value ' +
                             'reported_output is not type dict as required.')
        # return the results
        return [reported_output]
Example #24
    def test_velveth(self):
        # get the test data
        out_folder = os.path.join(self.scratch, 'velvet_output_dir')
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)

        rc1 = {
            'read_type': 'long',
            'file_format': 'fastq.gz',
            'file_layout': 'interleaved',
            'read_file_info': {
                'read_file_name': 'ecoli_ref-5m-trim.fastq.gz'
            }
        }
        rc2 = {
            'read_type': 'longPaired',
            'file_format': 'fasta.gz',
            'file_layout': 'interleaved',
            'read_file_info': {
                'read_file_name': 'ecoli-reads-5m-dn-paired.fa.gz'
            }
        }
        rc3 = {
            'read_type': 'shortPaired',
            'file_format': 'fastq',
            'file_layout': 'separate',
            'read_file_info': {
                'read_file_name': 'small.reverse.fq',
                'left_file': 'small.forward.fq',
                'right_file': 'small.reverse.fq',
            }
        }

        pe_lib_info = self.getPairedEndLibInfo()
        print(pe_lib_info)

        obj_ids = [{'ref': pe_lib_info[7] + '/' + pe_lib_info[1]}]

        ws_info = self.wsClient.get_object_info_new({'objects': obj_ids})
        reads_params = []

        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        readcli = ReadsUtils(self.callback_url, token=self.token)

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = readcli.download_reads({
                'read_libraries': reads_params,
                'interleaved': 'false',
                'gzipped': None
            })['files']
        except ServerError as se:
            print('logging stacktrace from dynamic client error')
            print(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        print('Got reads data from converter:\n' + pformat(reads))

        reads_data = []
        for ref in reads:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
            seq_tech = reads[ref]["sequencing_tech"]
            if f['type'] == 'interleaved':
                reads_data.append({
                    'fwd_file': f['fwd'],
                    'type': 'interleaved',
                    'seq_tech': seq_tech
                })
            elif f['type'] == 'paired':
                reads_data.append({
                    'fwd_file': f['fwd'],
                    'rev_file': f['rev'],
                    'type': 'separated',
                    'seq_tech': seq_tech
                })
            elif f['type'] == 'single':
                reads_data.append({
                    'fwd_file': f['fwd'],
                    'type': 'single',
                    'seq_tech': seq_tech
                })
            else:
                raise ValueError('Something is very wrong with read lib ' +
                                 reads_name)

        params = {
            'workspace_name': pe_lib_info[7],
            'out_folder': out_folder,
            'hash_length': 21,
            'reads_channels': [rc1, rc2, rc3]  #tests passed
            #'reads_files': reads_data
        }

        result = self.getImpl().exec_velveth(params)
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.scratch,
                             params['out_folder'] + '/Roadmaps')))
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.scratch,
                             params['out_folder'] + '/Sequences')))
        print('RESULT from velveth is saved in:\n' +
              os.path.join(self.scratch, params['out_folder']))
        pprint('Value returned by velveth: ' + str(result))
        return result
Example #25
    def run_gottcha2(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_gottcha2

        # Step 2 - Download the input data as a FASTQ and
        # We can use the ReadsUtils module to download a FASTQ file from our Reads data object.
        # The return object gives us the path to the file that was created.
        logging.info('Downloading Reads data as a Fastq file.')
        readsUtil = ReadsUtils(self.callback_url)
        download_reads_output = readsUtil.download_reads(
            {'read_libraries': params['input_refs']})
        print(
            f"Input parameters {params['input_refs']}, {params['db_type']} download_reads_output {download_reads_output}"
        )
        fastq_files = []
        fastq_files_name = []
        for key, val in download_reads_output['files'].items():
            if 'fwd' in val['files'] and val['files']['fwd']:
                fastq_files.append(val['files']['fwd'])
                fastq_files_name.append(val['files']['fwd_name'])
            if 'rev' in val['files'] and val['files']['rev']:
                fastq_files.append(val['files']['rev'])
                fastq_files_name.append(val['files']['rev_name'])
        logging.info(f"fastq files {fastq_files}")
        fastq_files_string = ' '.join(fastq_files)
        output_dir = os.path.join(self.scratch, 'gottcha2_output')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        ## default options
        params.setdefault('min_coverage', 0.005)
        params.setdefault('min_reads', 3)
        params.setdefault('min_length', 60)
        params.setdefault('min_mean_linear_read_length', 1)
        outprefix = "gottcha2"
        cmd0 = ["ls", "-al", '/data/gottcha2/RefSeq90/']
        logging.info(f'cmd {cmd0}')
        pls = subprocess.Popen(cmd0,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
        logging.info(f'subprocess {pls.communicate()}')

        cmd = [
            '/kb/module/lib/gottcha2/src/uge-gottcha2.sh', '-i',
            fastq_files_string, '-t', '4', '-o', output_dir, '-p', outprefix,
            '-d', '/data/gottcha2/RefSeq90/' + params['db_type'], '-c',
            str(params['min_coverage']), '-r',
            str(params['min_reads']), '-s',
            str(params['min_length']), '-m',
            str(params['min_mean_linear_read_length'])
        ]
        logging.info(f'cmd {cmd}')
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        logging.info(f'subprocess {p.communicate()}')
        summary_file = os.path.join(output_dir, outprefix + '.summary.tsv')

        # generate report directory and html file
        report_dir = os.path.join(output_dir, 'html_report')
        if not os.path.exists(report_dir):
            os.makedirs(report_dir)
        summary_file_dt = os.path.join(report_dir, 'gottcha2.datatable.html')

        self._generate_DataTable(summary_file, summary_file_dt)
        shutil.copy2('/kb/module/lib/gottcha2/src/index.html',
                     os.path.join(report_dir, 'index.html'))
        shutil.copy2(os.path.join(output_dir, outprefix + '.krona.html'),
                     os.path.join(report_dir, 'gottcha2.krona.html'))
        shutil.move(os.path.join(output_dir, outprefix + '.tree.svg'),
                    os.path.join(report_dir, 'gottcha2.tree.svg'))
        html_zipped = self.package_folder(report_dir, 'index.html',
                                          'index.html')

        # Step 5 - Build a Report and return
        objects_created = []
        output_files = os.listdir(output_dir)
        output_files_list = []
        for output in output_files:
            if not os.path.isdir(os.path.join(output_dir, output)):
                output_files_list.append({
                    'path': os.path.join(output_dir, output),
                    'name': output
                })
        # not used
        output_html_files = [{
            'path': os.path.join(report_dir, 'index.html'),
            'name': 'index.html'
        }, {
            'path':
            os.path.join(report_dir, 'gottcha2.krona.html'),
            'name':
            'gottcha2.krona.html'
        }, {
            'path':
            os.path.join(report_dir, 'gottcha2.datatable.html'),
            'name':
            'gottcha2.datatable.html'
        }, {
            'path':
            os.path.join(report_dir, 'gottcha2.tree.svg'),
            'name':
            'gottcha2.tree.svg'
        }]
        message = 'GOTTCHA2 run finished on %s against %s.' % (
            ','.join(fastq_files_name), params['db_type'])
        report_params = {
            'message': message,
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'file_links': output_files_list,
            'html_links': [html_zipped],
            'direct_html_link_index': 0,
            'html_window_height': 460
        }

        # STEP 6: construct the output to send back
        kbase_report_client = KBaseReport(self.callback_url)
        report_output = kbase_report_client.create_extended_report(
            report_params)
        report_output['report_params'] = report_params
        logging.info(report_output)
        # Return references which will allow inline display of
        # the report in the Narrative
        output = {
            'report_name': report_output['name'],
            'report_ref': report_output['ref']
        }
        #END run_gottcha2

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_gottcha2 return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]