Example #1
    def get_assembly(self, target_dir, assembly_upa):
        auc = AssemblyUtil(self.callbackURL)
        filename = os.path.join(target_dir, assembly_upa.replace('/', '_'))
        try:
            auc.get_assembly_as_fasta({
                'ref': assembly_upa,
                'filename': filename
            })
        except AssemblyUtilError as assembly_error:
            print(str(assembly_error))
            raise
        return filename
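Call sketch (hypothetical UPA and scratch directory; `downloader` stands in for whatever object carries this method). Because the UPA's slashes become underscores in the filename, the returned path is easy to predict:

    # hypothetical call -- '1234/5/6' is a made-up object reference
    fasta_path = downloader.get_assembly('/kb/module/work/tmp', '1234/5/6')
    # fasta_path == '/kb/module/work/tmp/1234_5_6'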
Example #2
    def get_fasta_file(self, genome_ref):
        ws = Workspace(self.ws_url)
        # test if genome references an assembly type
        # do get_objects2 without data. get list of refs
        genome_obj_info = ws.get_objects2({
            'objects': [{
                'ref': genome_ref
            }],
            'no_data': 1
        })
        # get the list of genome refs from the returned info.
        # if there are no refs (or something funky with the return), this will be an empty list.
        # this WILL fail if data is an empty list. But it shouldn't be, and we know because
        # we have a real genome reference, or get_objects2 would fail.
        genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

        # see which of those are of an appropriate type (ContigSet or Assembly), if any.
        assembly_ref = list()
        ref_params = [{'ref': x} for x in genome_obj_refs]
        ref_info = ws.get_object_info3({'objects': ref_params})
        for idx, info in enumerate(ref_info.get('infos')):
            # info[2] is the object type string, e.g. "KBaseGenomeAnnotations.Assembly-4.0"
            if "KBaseGenomeAnnotations.Assembly" in info[2] or "KBaseGenomes.ContigSet" in info[2]:
                # keep the full reference path, joined as "1/2/3;4/5/6", for later access
                assembly_ref.append(";".join(ref_info.get('paths')[idx]))
        # now just get the file.
        au = AssemblyUtil(self.callback_url)
        fasta_file = au.get_assembly_as_fasta({'ref': assembly_ref[0]})
        return fasta_file["path"]
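Call sketch (hypothetical genome UPA): the method resolves the genome's referenced Assembly/ContigSet and hands back a local FASTA path.

    fasta_path = self.get_fasta_file('1234/5/6')  # hypothetical reference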
Example #3
    def _build_index(self, assembly_info, validated_params):
        # get the assembly as a fasta file using AssemblyUtil
        au = AssemblyUtil(self.callback_url)
        fasta_info = au.get_assembly_as_fasta({'ref': assembly_info['ref']})

        # make the target destination folder (check again it wasn't created yet)
        if os.path.exists(validated_params['output_dir']):
            raise ValueError('Output directory name specified (' + validated_params['output_dir'] +
                             ') already exists. Will not overwrite, so aborting.')
        os.makedirs(validated_params['output_dir'])

        # configure the command line args and run it
        cli_params = self._build_cli_params(fasta_info['path'], fasta_info['assembly_name'], validated_params)
        self.bowtie2.run('bowtie2-build', cli_params)
        index_info = {'output_dir': validated_params['output_dir'],
                      'index_files_basename': fasta_info['assembly_name']}

        # cache the result, mark if it worked or not
        cache_success = self._put_cached_index(assembly_info,
                                               fasta_info['assembly_name'],
                                               validated_params['output_dir'],
                                               validated_params['ws_for_cache'])
        if cache_success:
            index_info['pushed_to_cache'] = 1
        else:
            index_info['pushed_to_cache'] = 0

        return index_info
Example #4
    def _stage_assembly_files(self, object_list):
        """
        _stage_assembly_files: download the fasta files to the scratch area
        return list of file names
        """
        log('Processing assembly object list: {}'.format(object_list))

        # Sourmash uses the sequence filename (including the complete file path) as the
        # default label for the signatures. Keeping the sequence file name as close to
        # the desired label as possible is the reason not to place each file under a
        # 'fasta' directory or include the '.fa' file extension.

        auc = AssemblyUtil(self.callbackURL)
        staged_file_list = []

        for assembly_upa in object_list:
            try:
                file_ = auc.get_assembly_as_fasta({'ref': assembly_upa})['path']
            except AssemblyUtilError as assembly_error:
                print(str(assembly_error))
                raise
            filename = os.path.basename(file_).replace('.fa', '')
            to_upper_command = "awk '{ if ($0 !~ />/) {print toupper($0)} else {print $0} }' " \
                               + file_ + ' > tmp.fa ' + '&& mv tmp.fa ' + filename
            self._run_command(to_upper_command)
            staged_file_list.append(filename)

        log('Created file list: {}'.format(staged_file_list))
        return staged_file_list
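The awk one-liner above uppercases sequence lines while leaving FASTA headers untouched. A pure-Python sketch of the same transformation (hypothetical helper, not part of the original module):

    def uppercase_sequences(in_path, out_path):
        # uppercase every line except FASTA headers (lines starting with '>')
        with open(in_path) as src, open(out_path, 'w') as dst:
            for line in src:
                dst.write(line if line.startswith('>') else line.upper())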
Example #5
    def run_mash_sketch(self, ctx, params):
        """
        Generate a sketch file from a fasta/fastq file
        :param params: instance of type "MashSketchParams" (* * Pass in **one
           of** input_path, assembly_ref, or reads_ref *   input_path -
           string - local file path to an input fasta/fastq *   assembly_ref
           - string - workspace reference to an Assembly type *   reads_ref -
           string - workspace reference to a Reads type * Optionally, pass in
           a boolean indicating whether you are using paired-end reads. *
           paired_ends - boolean - whether you are passing in paired ends) ->
           structure: parameter "input_path" of String, parameter
           "assembly_ref" of String, parameter "reads_ref" of String,
           parameter "paired_ends" of type "boolean" (params:
           input_upa: workspace reference to an assembly object
           workspace_name: name of current workspace search_db: database to
           search n_max_results: number of results to return, integer between
           1 and 100)
        :returns: instance of type "MashSketchResults" (* * Returns the local
           scratch file path of the generated sketch file. * Will have the
           extension '.msh') -> structure: parameter "sketch_path" of String
        """
        # ctx is the context object
        # return variables are: results
        #BEGIN run_mash_sketch
        if 'reads_ref' in params:
            reads_utils = ReadsUtils(self.callbackURL)
            result = reads_utils.download_reads({
                'read_libraries': [params['reads_ref']],
                'interleaved': 'true'
            })
            input_path = result['files'][params['reads_ref']]['files']['fwd']
        elif 'assembly_ref' in params:
            assembly_util = AssemblyUtil(self.callbackURL)
            result = assembly_util.get_assembly_as_fasta(
                {'ref': params['assembly_ref']})
            input_path = result['path']
        elif 'input_path' in params:
            input_path = params['input_path']
        else:
            raise ValueError(
                'Invalid params; must provide one of `reads_ref`, `assembly_ref`, or `input_path`.'
            )
        mash_utils = MashUtils(self.config, self.auth_token)
        output_file_path = mash_utils.mash_sketch(
            input_path, paired_ends=params.get('paired_ends'))
        results = {'sketch_path': output_file_path}
        #END run_mash_sketch

        # At some point might do deeper type checking...
        if not isinstance(results, dict):
            raise ValueError('Method run_mash_sketch return value ' +
                             'results is not type dict as required.')
        # return the results
        return [results]
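For reference, the three accepted input shapes (hypothetical references and paths):

    # exactly one of the following keys should be set in params:
    #   {'reads_ref': '1234/5/6', 'paired_ends': True}   -> reads downloaded interleaved
    #   {'assembly_ref': '1234/7/1'}                     -> assembly exported as FASTA
    #   {'input_path': '/kb/module/work/tmp/seqs.fa'}    -> local file used as-is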
Example #6
def fetch_fasta_from_assembly(assembly_ref, ws_url, callback_url):
    """
    From an assembly or contigset, this uses a data file util to build a FASTA file and return the
    path to it.
    """
    allowed_types = ['KBaseFile.Assembly',
                     'KBaseGenomeAnnotations.Assembly',
                     'KBaseGenomes.ContigSet']
    if not check_ref_type(assembly_ref, allowed_types, ws_url):
        raise ValueError("The reference {} cannot be used to fetch a FASTA file".format(
            assembly_ref))
    au = AssemblyUtil(callback_url)
    return au.get_assembly_as_fasta({'ref': assembly_ref})
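Note that this returns the full dict from get_assembly_as_fasta rather than just a path; a caller sketch (hypothetical reference):

    fasta = fetch_fasta_from_assembly('1234/5/6', ws_url, callback_url)
    fasta_path = fasta['path']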
Example #7
def get_fasta_from_genome(logger, ws_client, urls, genome_id):

    ref = ws_client.get_object_subset(
        [{'ref': genome_id, 'included': ['contigset_ref']}])
    contig_id = ref[0]['data']['contigset_ref']
    logger.info("Generating FASTA from Genome")
    try:
        # get the FASTA
        assembly = AssemblyUtil(urls['callback_url'])
        ret = assembly.get_assembly_as_fasta({'ref': contig_id})
        output_file = ret['path']
        fasta_file = os.path.basename(output_file)
        return fasta_file
    except Exception as e:
        raise Exception(
            "Unable to create FASTA file from Genome {0}: {1}".format(genome_id, e))
Example #8
    def test_genbank_to_genome(self, download_staging_file,
                               update_staging_service):

        fasta_file = 'small_fasta.fna'
        ws_obj_name = 'MyAssembly'

        params = {
            'staging_file_subdir_path': fasta_file,
            'workspace_name': self.getWsName(),
            'assembly_name': ws_obj_name
        }

        ref = self.getImpl().import_fasta_as_assembly_from_staging(
            self.getContext(), params)
        self.assertTrue('obj_ref' in ref[0])
        self.assertTrue('report_ref' in ref[0])
        self.assertTrue('report_name' in ref[0])

        fasta_file_path = os.path.join('/kb/module/work/tmp', fasta_file)
        assemblyUtil = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        fasta_assembly = assemblyUtil.get_assembly_as_fasta(
            {'ref': self.getWsName() + "/{}".format(ws_obj_name)})

        expected_data = None
        with open(fasta_file_path, 'r') as f:
            expected_data = f.read()
        actual_data = None
        with open(fasta_assembly['path'], 'r') as f:
            actual_data = f.read()
        self.assertEqual(actual_data, expected_data)

        get_objects_params = {
            'object_refs': [ref[0].get('obj_ref')],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)
        base_count = object_data.get('data')[0].get('data').get('base_counts')
        dna_size = object_data.get('data')[0].get('data').get('dna_size')

        self.assertEqual(dna_size, 2520)

        expected_base_count = {'A': 700, 'C': 558, 'T': 671, 'G': 591}
        # asserting the subset relation in both directions amounts to dict equality
        self.assertEqual(base_count, expected_base_count)
Example #9
    def ScanGenomeForMotifs(self, ctx, params):
        """
        :param params: instance of type "ScanGenomeIn" (This example function
           accepts any number of parameters and returns results in a
           KBaseReport funcdef
           run_MotifScan(mapping<string,UnspecifiedObject> params) returns
           (ReportResults output) authentication required;) -> structure:
           parameter "genome_ref" of String, parameter "ws_name" of String,
           parameter "motifset_ref" of String
        :returns: instance of type "ScanGenomeOut" -> structure:
        """
        # ctx is the context object
        # return variables are: out
        #BEGIN ScanGenomeForMotifs
        ws = Workspace('https://appdev.kbase.us/services/ws')
        ws_name = params['workspace_name']
        subset = ws.get_object_subset([{
            'included': ['/assembly_ref'],
            'ref': params['genome_ref']
        }])
        aref = subset[0]['data']['assembly_ref']
        assembly_ref = {'ref': aref}
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)
        scanFastaParams = {
            'fasta_path': fasta_file['path'],
            'motifset_ref': params['motifset_ref'],
            'ws_name': params['ws_name']
        }

        #build mast command with this -> fasta_file['path']
        #no way we can use this fasta to build report, too big

        # the scan itself does not assemble report output yet; return an empty
        # structure so the type check below passes instead of raising NameError
        out = {}
        #END ScanGenomeForMotifs

        # At some point might do deeper type checking...
        if not isinstance(out, dict):
            raise ValueError('Method ScanGenomeForMotifs return value ' +
                             'out is not type dict as required.')
        # return the results
        return [out]
Example #10
    def stage_assembly_files(self, object_list):
        """
        stage_assembly_files: download the fasta files to the scratch area
        return list of file names
        """
        log('Processing assembly object list: {}'.format(object_list))

        auc = AssemblyUtil(self.callbackURL)
        staged_file_list = []

        for assembly_upa in object_list:
            try:
                filename = auc.get_assembly_as_fasta({'ref': assembly_upa})['path']
            except AssemblyUtilError as assembly_error:
                print(str(assembly_error))
                raise

            staged_file_list.append(filename)

        log('Created file list: {}'.format(staged_file_list))
        return staged_file_list
Example #11
def download_fasta(refs, cb_url):
    """
    Args:
      ref - workspace reference in the form 'workspace_id/object_id/obj_version'
      cb_url - callback server URL
    Returns the path of the downloaded fasta file
    """
    dfu = DataFileUtil(cb_url)
    assembly_util = AssemblyUtil(cb_url)
    ws_objects = dfu.get_objects({'object_refs': refs})
    paths = []
    for (obj, ref) in zip(ws_objects['data'], refs):
        ws_type = obj['info'][2]
        if 'KBaseGenomes.Genome' in ws_type:
            assembly_ref = get_assembly_ref_from_genome(ref, obj)
        elif 'KBaseGenomeAnnotations.Assembly' in ws_type:
            assembly_ref = ref
        else:
            raise TypeError('Invalid type ' + ws_type + '. Must be an Assembly or Genome.')
        path = assembly_util.get_assembly_as_fasta({'ref': assembly_ref})['path']
        paths.append(path)
    return paths
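Usage sketch (hypothetical UPAs); one path comes back per input reference, in order:

    paths = download_fasta(['1234/5/6', '1234/8/2'], cb_url)
    # paths[0] is the FASTA for '1234/5/6', paths[1] for '1234/8/2'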
Example #12
    def get_genome_data_files(self, genome_ref):
        genome_files = {"assembly": None, "gff": None}
        print('Fetching assembly or contig information from genome...')
        assembly_ref = self._get_assembly_ref(genome_ref)
        if len(assembly_ref) > 1:
            raise ValueError(
                'This genome, {}, appears to reference {} Assemblies or ContigSets, with these object references: {}'
                .format(genome_ref, len(assembly_ref), assembly_ref))
        elif len(assembly_ref) == 0:
            raise ValueError(
                'There was no Assembly or ContigSet found as a reference to this genome. Unable to build browser data.'
            )
        print('Done! Found valid assembly data.')

        print('Converting sequence data to FASTA file...')
        au = AssemblyUtil(self.callback_url)
        fasta_file = au.get_assembly_as_fasta({'ref': assembly_ref[0]})
        print('Done! FASTA file created: {}'.format(fasta_file))

        if "path" not in fasta_file:
            raise IOError(
                'FASTA file was not apparently generated from the given genome fasta_file. fasta_file object missing key "path": {}'
                .format(fasta_file))
        genome_files["assembly"] = fasta_file.get('path', None)

        print('Converting genome annotation data to gff file...')
        gfu = GenomeFileUtil(self.callback_url)
        gff_file = gfu.genome_to_gff({'genome_ref': genome_ref})
        print('Done! GFF file created: {}'.format(gff_file))
        if "file_path" not in gff_file:
            raise IOError(
                'GFF file was not apparently generated from the given genome. gff_file object missing key "file_path": {}'
                .format(gff_file))
        genome_files["gff"] = gff_file.get('file_path', None)

        return genome_files
    def annotate_contigs(self, ctx, params):
        """
        :param params: instance of type "AnnotateContigsParams" (Required
           parameters: assembly_ref - reference to Assembly object,
           output_workspace - output workspace name, output_genome_name -
           output object name, Optional parameters (correspond to PROKKA
           command line arguments): --scientific_name Genome scientific name
           (default 'Unknown') --kingdom [X]     Annotation mode:
           Archaea|Bacteria|Mitochondria|Viruses (default 'Bacteria') --genus
           [X]       Genus name (triggers to use --usegenus) --gcode [N]     
           Genetic code / Translation table (set if --kingdom is set)
           (default '11') --metagenome      Improve gene predictions for
           highly fragmented genomes (default OFF) --rawproduct      Do not
           clean up /product annotation (default OFF) --fast            Fast
           mode - skip CDS /product searching (default OFF) --mincontiglen
           [N] Minimum contig size [NCBI needs 200] (default '1') --evalue
           [n.n]    Similarity e-value cut-off (default '1e-06') --rfam      
           Enable searching for ncRNAs with Infernal+Rfam (SLOW!) (default
           OFF) --norrna          Don't run rRNA search (default OFF)
           --notrna          Don't run tRNA search (default OFF)) ->
           structure: parameter "assembly_ref" of type "assembly_ref"
           (Reference to an Assembly object in the workspace @id ws
           KBaseGenomeAnnotations.Assembly), parameter "output_workspace" of
           String, parameter "output_genome_name" of String, parameter
           "scientific_name" of String, parameter "kingdom" of String,
           parameter "genus" of String, parameter "gcode" of Long, parameter
           "metagenome" of type "boolean" (A boolean. 0 = false, anything
           else = true.), parameter "rawproduct" of type "boolean" (A
           boolean. 0 = false, anything else = true.), parameter "fast" of
           type "boolean" (A boolean. 0 = false, anything else = true.),
           parameter "mincontiglen" of Long, parameter "evalue" of String,
           parameter "rfam" of type "boolean" (A boolean. 0 = false, anything
           else = true.), parameter "norrna" of type "boolean" (A boolean. 0
           = false, anything else = true.), parameter "notrna" of type
           "boolean" (A boolean. 0 = false, anything else = true.)
        :returns: instance of type "AnnotateContigsOutput" -> structure:
           parameter "output_genome_ref" of type "genome_ref" (Reference to
           an Genome object in the workspace @id ws KBaseGenomes.Genome),
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN annotate_contigs
        print("Input parameters: " + pformat(params))
        assembly_ref = self._get_input_value(params, 'assembly_ref')
        output_genome_name = self._get_input_value(params,
                                                   'output_genome_name')
        output_workspace = self._get_input_value(params, 'output_workspace')
        ws_client = workspaceService(self.ws_url, token=ctx['token'])

        sso_ret = ws_client.get_objects([{
            'ref': "KBaseOntology/seed_subsystem_ontology"
        }])[0]
        sso = sso_ret['data']
        ec_to_sso = {}
        for sso_id in sso['term_hash']:
            sso_name = sso['term_hash'][sso_id]['name']
            if "(EC " in sso_name and sso_name.endswith(")"):
                ec = sso_name[sso_name.index("(EC ") + 4:-1].strip()
                sso_list = ec_to_sso.get(ec, None)
                if not sso_list:
                    sso_list = []
                    ec_to_sso[ec] = sso_list
                sso_list.append(sso['term_hash'][sso_id])
        print("EC found in SSO: " + str(len(ec_to_sso)))
        sso_info = sso_ret['info']
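        # object info tuple layout: [6] = workspace id, [0] = object id,
        # [4] = version -- joined below into a full "wsid/objid/ver" reference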
        sso_ref = str(sso_info[6]) + '/' + str(sso_info[0]) + '/' + str(
            sso_info[4])
        with open('/kb/module/work/seed_so.json', 'w') as outfile:
            json.dump(sso, outfile, sort_keys=True, indent=4)

        assembly_info = ws_client.get_object_info_new({
            'objects': [{'ref': assembly_ref}],
            'includeMetadata': 1
        })[0]
        assembly_meta = assembly_info[10]
        gc_content = float(assembly_meta.get("GC content"))
        dna_size = int(assembly_meta.get("Size"))
        n_contigs = 0
        if 'N Contigs' in assembly_meta:
            n_contigs = int(assembly_meta.get("N Contigs"))
        else:
            contig = ws_client.get_objects([{'ref': assembly_ref}])[0]
            n_contigs = len(contig['data']['contigs'])
        if n_contigs >= 30000:
            print("Hmmm.  There are over 30,000 contigs in this Assembly. ")
            print("It looks like you are trying to run Prokka on a metagenome "
                  "or non-prokaryotic data set.")
            print("If this is a metagenome data set we recommend using an App "
                  "like MaxBin to first bin the contigs into genome-like bins.")
            print("These bins can then be individually annotated as a single "
                  "genome using Prokka.")
            print("If this data comes from a Eukaryotic sample, KBase does not "
                  "currently have an annotation app designed for Eukaryotes.")
            print("Alternatively, you can try reducing the number of contigs "
                  "using a filter app.")
            raise ValueError(
                'Too many contigs for Prokka. See logs for details and suggestions.')

        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'], token=ctx['token'])
        orig_fasta_file = au.get_assembly_as_fasta({'ref': assembly_ref})['path']
        renamed_fasta_file = orig_fasta_file + "_renamed.fna"
        records = []
        new_ids_to_old = {}
        contig_counter = 0
        for record in SeqIO.parse(orig_fasta_file, "fasta"):
            contig_counter += 1
            old_id = record.id
            new_id = "contig_" + str(contig_counter)
            sequence = record.seq  # it has type 'Seq'
            record = SeqRecord(sequence,
                               id=new_id,
                               description="(" + old_id + ")")
            records.append(record)
            new_ids_to_old[new_id] = old_id
        SeqIO.write(records, renamed_fasta_file, "fasta")
        output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4())
        scientific_name = params.get('scientific_name', 'Unknown')
        # --kingdom [X]     Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default 'Bacteria')
        kingdom = str(params.get('kingdom', "Bacteria"))
        prokka_cmd_list = [
            "perl", "/kb/prokka/bin/prokka", "--outdir", output_dir,
            "--prefix", "mygenome", "--kingdom", kingdom, "--cpus", '1'
        ]
        # --genus [X]       Genus name (triggers to use --usegenus)
        if 'genus' in params and params['genus']:
            prokka_cmd_list.extend(
                ['--genus', str(params['genus']), '--usegenus'])
        # --gcode [N]       Genetic code / Translation table (set if --kingdom is set) (default '0')
        gcode = 0
        if 'gcode' in params and params['gcode']:
            gcode = params['gcode']
        prokka_cmd_list.extend(['--gcode', str(gcode)])
        # --gram [X]        Gram: -/neg +/pos (default '')
        if 'gram' in params and params['gram']:
            raise ValueError(
                "gram parameter is not supported in current Prokka installation"
            )
        # --metagenome      Improve gene predictions for highly fragmented genomes (default OFF)
        if 'metagenome' in params and params['metagenome'] == 1:
            prokka_cmd_list.append("--metagenome")
        # --rawproduct      Do not clean up /product annotation (default OFF)
        if 'rawproduct' in params and params['rawproduct'] == 1:
            prokka_cmd_list.append("--rawproduct")
        # --fast            Fast mode - skip CDS /product searching (default OFF)
        if 'fast' in params and params['fast'] == 1:
            prokka_cmd_list.append("--fast")
        # --mincontiglen [N] Minimum contig size [NCBI needs 200] (default '1')
        if 'mincontiglen' in params and params['mincontiglen']:
            prokka_cmd_list.extend(
                ['--mincontiglen',
                 str(params['mincontiglen'])])
        # --evalue [n.n]    Similarity e-value cut-off (default '1e-06')
        if 'evalue' in params and params['evalue']:
            prokka_cmd_list.extend(['--evalue', str(params['evalue'])])
        # --rfam            Enable searching for ncRNAs with Infernal+Rfam (SLOW!) (default '0')
        if 'rfam' in params and params['rfam'] == 1:
            prokka_cmd_list.append("--rfam")
        # --norrna          Don't run rRNA search (default OFF)
        if 'norrna' in params and params['norrna'] == 1:
            prokka_cmd_list.append("--norrna")
        # --notrna          Don't run tRNA search (default OFF)
        if 'notrna' in params and params['notrna'] == 1:
            prokka_cmd_list.append("--notrna")
        prokka_cmd_list.append(renamed_fasta_file)
        print("Prokka command line: " + str(prokka_cmd_list))
        subprocess.Popen(prokka_cmd_list, cwd=self.scratch).wait()
        faa_file = output_dir + "/mygenome.faa"
        cds_to_prot = {}
        for record in SeqIO.parse(faa_file, "fasta"):
            cds_to_prot[record.id] = str(record.seq)
        ffn_file = output_dir + "/mygenome.ffn"
        cds_to_dna = {}
        for record in SeqIO.parse(ffn_file, "fasta"):
            cds_to_dna[record.id] = str(record.seq)
        gff_file = output_dir + "/mygenome.gff"
        if not os.path.isfile(gff_file):
            raise ValueError("PROKKA output GFF file is not found")
        cdss = []
        mrnas = []
        features = []
        non_hypothetical = 0
        genes_with_ec = 0
        genes_with_sso = 0
        prot_lengths = []
        with open(gff_file, "r") as f1:
            for rec in GFF.parse(f1):
                contig_id = new_ids_to_old[str(rec.id)]
                for ft in rec.features:
                    loc = ft.location
                    min_pos = int(loc.start) + 1
                    max_pos = int(loc.end)
                    strand = '+' if loc.strand == 1 else '-'
                    flen = max_pos - min_pos + 1
                    start = min_pos if strand == '+' else max_pos
                    location = [[contig_id, start, strand, flen]]
                    qualifiers = ft.qualifiers
                    generated_id = self._get_qualifier_value(
                        qualifiers.get('ID'))
                    if not generated_id:
                        # Skipping feature with no ID (mostly repeat regions)
                        continue
                    dna = cds_to_dna.get(generated_id)
                    if not dna:
                        # Skipping feature with no DNA (mostly repeat regions)
                        continue
                    name = self._get_qualifier_value(qualifiers.get('Name'))
                    ec = self._get_qualifier_value(qualifiers.get('eC_number'))
                    gene = self._get_qualifier_value(qualifiers.get('gene'))
                    product = self._get_qualifier_value(
                        qualifiers.get('product'))
                    fid = name if name else generated_id
                    aliases = []
                    if gene:
                        aliases.append(gene)
                    if ec:
                        aliases.append(ec)
                        genes_with_ec += 1
                    md5 = hashlib.md5(dna.encode()).hexdigest()  # encode: md5 needs bytes in Python 3
                    feature = {
                        'id': fid,
                        'location': location,
                        'type': 'gene',
                        'aliases': aliases,
                        'md5': md5,
                        'dna_sequence': dna,
                        'dna_sequence_length': len(dna),
                    }
                    if product:
                        feature['function'] = product
                        if product != "hypothetical protein":
                            non_hypothetical += 1
                    if ec and ec in ec_to_sso:
                        sso_list = ec_to_sso[ec]
                        sso_terms = {}
                        for sso_item in sso_list:
                            sso_terms[sso_item['id']] = {
                                'id': sso_item['id'],
                                'evidence': [],
                                'term_name': sso_item['name'],
                                'ontology_ref': sso_ref,
                                'term_lineage': []
                            }
                        feature['ontology_terms'] = {'SSO': sso_terms}
                        genes_with_sso += 1
                    cds = None
                    mrna = None
                    prot = cds_to_prot.get(generated_id)
                    if prot:
                        cds_id = fid + "_CDS"
                        mrna_id = fid + "_mRNA"
                        prot_len = len(prot)
                        prot_lengths.append(prot_len)
                        feature['protein_translation'] = prot
                        feature['protein_translation_length'] = prot_len
                        feature['cdss'] = [cds_id]
                        feature['mrnas'] = [mrna_id]
                        cds = {
                            'id': cds_id,
                            'location': location,
                            'md5': md5,
                            'parent_gene': fid,
                            'parent_mrna': mrna_id,
                            'function': (product if product else ''),
                            'ontology_terms': {},
                            'protein_translation': prot,
                            'protein_translation_length': prot_len,
                            'aliases': aliases
                        }
                        mrna = {
                            'id': mrna_id,
                            'location': location,
                            'md5': md5,
                            'parent_gene': fid,
                            'cds': cds_id
                        }
                    features.append(feature)
                    if cds:
                        cdss.append(cds)
                    if mrna:
                        mrnas.append(mrna)
        genome = {
            'id': 'Unknown',
            'features': features,
            'scientific_name': scientific_name,
            'domain': kingdom,
            'genetic_code': gcode,
            'assembly_ref': assembly_ref,
            'cdss': cdss,
            'mrnas': mrnas,
            'source': 'PROKKA annotation pipeline',
            'gc_content': gc_content,
            'dna_size': dna_size,
            'reference_annotation': 0
        }
        prov = ctx.provenance()
        ga = GenomeAnnotationAPI(self.sw_url, token=ctx['token'])
        info = ga.save_one_genome_v1({
            'workspace': output_workspace,
            'name': output_genome_name,
            'data': genome,
            'provenance': prov
        })['info']
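        # reference of the saved genome, assembled as "wsid/objid/ver" from the info tuple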
        genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])

        # Prepare report
        report = ''
        report += 'Genome saved to: ' + output_workspace + '/' + output_genome_name + '\n'
        report += 'Number of genes predicted: ' + str(len(features)) + '\n'
        report += 'Number of protein coding genes: ' + str(
            len(prot_lengths)) + '\n'
        report += 'Number of genes with non-hypothetical function: ' + str(
            non_hypothetical) + '\n'
        report += 'Number of genes with EC-number: ' + str(
            genes_with_ec) + '\n'
        report += 'Number of genes with Seed Subsystem Ontology: ' + str(
            genes_with_sso) + '\n'
        report += 'Average protein length: ' + str(
            int(sum(prot_lengths) / float(len(prot_lengths)))) + ' aa.\n'
        kbr = KBaseReport(os.environ['SDK_CALLBACK_URL'], token=ctx['token'])
        report_info = kbr.create_extended_report({
            'message': report,
            'objects_created': [{
                'ref': genome_ref,
                'description': 'Annotated genome'
            }],
            'report_object_name': 'kb_prokka_report_' + str(uuid.uuid4()),
            'workspace_name': output_workspace
        })

        returnVal = {
            'output_genome_ref': genome_ref,
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        #END annotate_contigs

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method annotate_contigs return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
Example #14
    def do_assembly(self, assemblyRef, wsName):
        #try:
        #    assembly = wsClient.get_objects2({'objects': [{'ref': assembly_ref}]})
        #except:
        #    exc_type, exc_value, exc_traceback = sys.exc_info()
        #    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        #    orig_error = ''.join('    ' + line for line in lines)
        #    raise ValueError('Error from workspace:\n' + orig_error)

        #print assembly#[200:]
        #print assembly['data']
        #print assembly['data'][0]
        #assembly['data'][0]['data']

        #fasta_handle_ref = assembly['data'][0]['data']['fasta_handle_ref']
        #print "fasta_handle_ref "+fasta_handle_ref
        #print type(fasta_handle_ref)

        #TODO create file here /kb/module/work
        #TODO set output file name
        print "SDK_CALLBACK_URL "+os.environ['SDK_CALLBACK_URL']
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        #assembly_input_ref = "16589/2/1"
        #filename = "test.fasta"
        #obj_name = "EcoliMG1655.f"
        #wsname = "example_assembly"

        param = {'ref': assemblyRef}

        input_fasta_file = au.get_assembly_as_fasta(param)

        #just_input_fasta_file = os.path.basename(input_fasta_file['path'])
        #print "input_fasta_file "+ str(input_fasta_file['path'])

        newtmp = "/kb/module/work/tmp/tmp_"+self.create_random_string()
        os.mkdir(newtmp)
        os.mkdir(newtmp+"/input")

        newfasta = newtmp +"/input/"+os.path.basename(input_fasta_file['path'])
        print "newfasta "+newfasta

        os.rename(input_fasta_file['path'], newfasta)

        args = ["wrapper_phage_contigs_sorter_iPlant.pl ", "--db 2 ","--fna ", newfasta," --wdir ",newtmp]

        print(str(args))

        cmdstring = "".join(args)

        print "Executing"
        cmdProcess = subprocess.Popen(cmdstring, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        print "Done "+str(cmdProcess)
        stdout, stderr = cmdProcess.communicate()
        print " stdout: " + stdout
        print " stderr: " + stderr

        #return [report]

        # Step 5 - Build a Report and return
        reportObj = {
            'objects_created': [],
            'text_message': stdout
        }
        # 'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],

        #report_info = report.create({'report': reportObj, 'workspace_name': wsName})

        #reportObj = {
        #    'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
        #    'text_message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total)
        #}
        #report = KBaseReport(self.callback_url)
        #report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})


        # construct the output to send back
        #output = {'report_name': report_info['name'],
        #          'report_ref': report_info['ref']
        #          }
        #print('returning:' + pformat(output))

        print('Saving report')
        kbr = KBaseReport(self.callback_url, service_ver='dev')
        report = ''
        report += "cmdstring: " + str(cmdstring) + " stdout: " + str(stdout) + " stderr: " + str(stderr)

        virout = newtmp+"/"+"VIRSorter_global-phage-signal.csv"
        with open(virout, 'r') as myfile:
            data = myfile.read().replace('\n', '')

        print "wsName "+str(wsName)

        data = data.replace(",", "\t")
        data = data.replace("##", "\n##")
        report = report +"\n\n***** VirSorter output *****\n"+data
        report_data = {'message': report,
                       'objects_created': None,
                       'direct_html_link_index': None,
                       'html_links': None,
                       'report_object_name': 'kb_virsorter_' + str(uuid.uuid4()),
                       'workspace_name': wsName
                       }

        print("report_data")
        print(str(report_data))
        report_info = kbr.create_extended_report(report_data)

        # 'objects_created': [{'ref': assembly_ref, 'description': 'Assembled contigs'}],
        # 'html_links': [{'shock_id': quastret['shock_id'],
        #                     'name': 'report.html',
        #                     'label': 'QUAST report'}
        #                    ],

        reportName = report_info['name']
        reportRef = report_info['ref']
        return reportName, reportRef
Example #15
    def filter_contigs(self, ctx, params):
        """
        :param workspace_name: instance of String
        :param params: instance of type "ContigFilterParams" (Input
           parameters) -> structure: parameter "assembly_ref" of String,
           parameter "min_length" of Long
        :returns: instance of type "ContigFilterResults" (Output results) ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String, parameter "filtered_assembly_ref" of
           String, parameter "n_total" of Long, parameter "n_remaining" of
           Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN filter_contigs
        for name in ['min_length', 'assembly_ref', 'workspace_name']:
            if name not in params:
                raise ValueError('Parameter "' + name + '" is required but missing')
        if not isinstance(params['min_length'], int) or (params['min_length'] < 0):
            raise ValueError('Min length must be a non-negative integer')
        if not isinstance(params['assembly_ref'], str) or not len(params['assembly_ref']):
            raise ValueError('Pass in a valid assembly reference string')
        ws_name = params['workspace_name']
        assembly_util = AssemblyUtil(self.callback_url)
        file = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})
        # Parse the downloaded file in FASTA format
        parsed_assembly = SeqIO.parse(file['path'], 'fasta')
        min_length = params['min_length']
        # Keep a list of contigs greater than min_length
        good_contigs = []
        # total contigs regardless of length
        n_total = 0
        # total contigs over the min_length
        n_remaining = 0
        for record in parsed_assembly:
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1
        # Create a file to hold the filtered data
        filtered_path = os.path.join(self.scratch, 'filtered.fasta')
        SeqIO.write(good_contigs, filtered_path, 'fasta')
        # Upload the filtered data to the workspace
        new_ref = assembly_util.save_assembly_from_fasta({
            'file': {'path': filtered_path},
            'workspace_name': ws_name,
            'assembly_name': file['assembly_name']
        })
        # Create an output summary message for the report
        text_message = "".join([
            'Filtered assembly to ',
            str(n_remaining),
            ' contigs out of ',
            str(n_total)
        ])
        # Data for creating the report, referencing the assembly we uploaded
        html_dir = os.path.join(self.scratch, 'html')
        html_index_path = os.path.join(html_dir, 'index.html')
        file_path = os.path.join(self.scratch, 'myfile.txt')
        with open(file_path, 'w') as f:
            f.write('hello world')
        os.mkdir(html_dir)
        with open(html_index_path, 'w') as f:
            f.write('<p><b>hello world</b></p>')
        print('xyz1', os.listdir(html_dir))
        print('xyz2', os.listdir(self.scratch))
        html_links = [{
            'path': os.path.join(html_dir, 'index.html'),
            'name': 'main.html',
            'description': 'Sample description'
        }]
        file_links = [{
            'path': file_path,
            'name': 'file.txt',
            'description': 'Sample file description'
        }] + html_links
        # Extended report
        report_data = {
            'objects_created': [{'ref': new_ref, 'description': 'Filtered contigs'}],
            'html_links': html_links,
            'file_links': file_links,
            'warnings': ['warning 1', 'warning 2'],
            'report_object_name': 'my_report',
            'direct_html': '<p>Hello</p>',
            'message': text_message,
            'workspace_name': ws_name,
            'direct_html_link_index': 0,
            'html_window_height': 800,
            'summary_window_height': 800
        }
        # # Simple report
        # report_data = {
        #     'report': {
        #         'text_message': 'My simple report text message',
        #         'warnings': ['warning 1', 'warning 2'],
        #         'objects_created': [{'ref': new_ref, 'description': 'filtered contigs'}]
        #     },
        #     'workspace_name': ws_name
        # }
        # Initialize the report
        kbase_report = KBaseReport(self.callback_url)
        report = kbase_report.create_extended_report(report_data)
        # Return the report reference and name in our results
        returnVal = {
            'report_ref': report['ref'],
            'report_name': report['name'],
            'n_total': n_total,
            'n_remaining': n_remaining,
            'filtered_assembly_ref': new_ref
        }
        #END filter_contigs

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method filter_contigs return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
    def get_promoter_for_gene(self, ctx, params):
        """
        :param params: instance of type "get_promoter_for_gene_input" (Genome
           is a KBase genome Featureset is a KBase featureset Promoter_length
           is the length of promoter requested for all genes) -> structure:
           parameter "genome_ref" of String, parameter "featureSet_ref" of
           String, parameter "promoter_length" of Long
        :returns: instance of type "get_promoter_for_gene_output_params" ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN get_promoter_for_gene
        #code goes here
        dfu = DataFileUtil(self.callback_url)
        objectRefs = {'object_refs': [params['Genome'], params['featureSet']]}
        objects = dfu.get_objects(objectRefs)
        genome = objects['data'][0]['data']
        featureSet = objects['data'][1]['data']
        assembly_ref = {'ref': genome['assembly_ref']}
        with open('/kb/module/work/genome.json', 'w') as f:
            json.dump(genome, f)
        with open('/kb/module/work/featureSet.json', 'w') as f:
            json.dump(featureSet, f)
        #with open('/kb/module/work/asssembly.json','w') as f:
        #    json.dump(assembly,f)
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)

        #pprint(fasta_file)
        #loop over featureSet
        #find matching feature in genome
        #get record, start, orientation, length
        #TODO: add some error checking logic to the bounds of the promoter
        prom = ""
        for feature in featureSet['elements']:
            #print(feature)
            #print(featureSet['elements'][feature])
            for f in genome['features']:
                if f['id'] == feature:
                    attributes = f['location'][0]
                    #print(f['location'])
                    break
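            # attributes holds a KBase location tuple:
            # [contig_id, start_coordinate, strand, feature_length]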
            for record in SeqIO.parse(fasta_file['path'], 'fasta'):
                #print(record.id)
                #print(attributes[0])
                if record.id == attributes[0]:
                    #print(attributes[0])
                    if attributes[2] == '+':
                        #might need to offset by 1?
                        end = attributes[1]
                        start = end - params['promoter_length']
                        # clamp the start (not the end): a negative start would wrap
                        # around the sequence via Python's negative slice indexing
                        if start < 0:
                            start = 0
                        promoter = record.seq[start:end].upper()
                        prom += ">" + feature + "\n"
                        prom += promoter + "\n"

                    elif attributes[2] == '-':
                        start = attributes[1]
                        end = start + params['promoter_length']
                        if end > len(record.seq) - 1:
                            end = len(record.seq) - 1
                        promoter = record.seq[start:end].upper()
                        complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
                        promoter = ''.join(
                            [complement[base] for base in promoter[::-1]])
                        prom += ">" + feature + "\n"
                        prom += promoter + "\n"

                    else:
                        print('Error on orientation')

        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        html_output_dir = os.path.join(self.shared_folder,
                                       'output_html.' + str(timestamp))
        if not os.path.exists(html_output_dir):
            os.makedirs(html_output_dir)
        html_file = 'promoter.html'
        output_html_file_path = os.path.join(html_output_dir, html_file)

        html_report_lines = "<html><body>"
        html_report_lines += "<pre>" + prom + "</pre>"
        html_report_lines += "</body></html>"

        with open(output_html_file_path, 'w') as html_handle:
            html_handle.write(html_report_lines)

        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': html_output_dir,
                #html_upload_ret = dfu.file_to_shock({'file_path': output_html_file_path,
                #'make_handle': 0})
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception:
            raise ValueError('error uploading HTML file to shock')

        reportName = 'identify_promoter_report_' + str(uuid.uuid4())

        reportObj = {
            'objects_created': [],
            'message': '',
            'direct_html': None,
            'direct_html_index': 0,
            'file_links': [],
            'html_links': [],
            'html_window_height': 220,
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }

        # attach to report obj
        #reportObj['direct_html'] = None
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            'name': html_file,
            'label': 'View'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']})
        report_info = report.create_extended_report(reportObj)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #iterate over records in fasta
        #for record in SeqIO.parse(fasta_file['path'], 'fasta'):

        #objects list of Genome and featureSet

        #pprint(objects)
        #END get_promoter_for_gene

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method get_promoter_for_gene return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #17
    def filter_contigs(self, ctx, params):
        """
        The actual function is declared using 'funcdef' to specify the name
        and input/return arguments to the function.  For all typical KBase
        Apps that run in the Narrative, your function should have the
        'authentication required' modifier.
        :param params: instance of type "FilterContigsParams" (A 'typedef'
           can also be used to define compound or container objects, like
           lists, maps, and structures.  The standard KBase convention is to
           use structures, as shown here, to define the input and output of
           your function.  Here the input is a reference to the Assembly data
           object, a workspace to save output, and a length threshold for
           filtering. To define lists and maps, use a syntax similar to C++
           templates to indicate the type contained in the list or map.  For
           example: list <string> list_of_strings; mapping <string, int>
           map_of_ints;) -> structure: parameter "assembly_input_ref" of type
           "assembly_ref" (A 'typedef' allows you to provide a more specific
           name for a type.  Built-in primitive types include 'string',
           'int', 'float'.  Here we define a type named assembly_ref to
           indicate a string that should be set to a KBase ID reference to an
           Assembly data object.), parameter "workspace_name" of String,
           parameter "min_length" of Long
        :returns: instance of type "FilterContigsResults" (Here is the
           definition of the output of the function.  The output can be used
           by other SDK modules which call your code, or the output
           visualizations in the Narrative.  'report_name' and 'report_ref'
           are special output fields- if defined, the Narrative can
           automatically render your Report.) -> structure: parameter
           "report_name" of String, parameter "report_ref" of String,
           parameter "assembly_output" of type "assembly_ref" (A 'typedef'
           allows you to provide a more specific name for a type.  Built-in
           primitive types include 'string', 'int', 'float'.  Here we define
           a type named assembly_ref to indicate a string that should be set
           to a KBase ID reference to an Assembly data object.), parameter
           "n_initial_contigs" of Long, parameter "n_contigs_removed" of
           Long, parameter "n_contigs_remaining" of Long
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN filter_contigs

        # Print statements to stdout/stderr are captured and available as the App log
        print('Starting Filter Contigs function. Params=')
        pprint(params)

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users.  Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        print('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError(
                'Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'assembly_input_ref' not in params:
            raise ValueError(
                'Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError(
                'Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        min_length = None
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError(
                'Cannot parse integer from min_length parameter (' +
                str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' +
                             str(min_length) + ')')

        # Step 2 - Download the input data as a Fasta and
        # We can use the AssemblyUtils module to download a FASTA file from our Assembly data object.
        # The return object gives us the path to the file that was created.
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(
            {'ref': assembly_input_ref})

        # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
        # We can use BioPython to parse the Fasta file and build and save the output to a file.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        print('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' +
              str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder,
                                           'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

        # Step 4 - Save the new Assembly back to the system
        print('Uploading filtered Assembly data.')
        new_assembly = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': filtered_fasta_file},
            'workspace_name': workspace_name,
            'assembly_name': fasta_file['assembly_name']
        })

        # Step 5 - Build a Report and return
        reportObj = {
            'objects_created': [{
                'ref': new_assembly,
                'description': 'Filtered contigs'
            }],
            'text_message': 'Filtered Assembly to ' + str(n_remaining) +
                            ' contigs out of ' + str(n_total)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': reportObj,
            'workspace_name': params['workspace_name']
        })

        # STEP 6: construct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'assembly_output': new_assembly,
            'n_initial_contigs': n_total,
            'n_contigs_removed': n_total - n_remaining,
            'n_contigs_remaining': n_remaining
        }
        print('returning:' + pformat(output))

        #END filter_contigs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method filter_contigs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #18
    def stage_input(self, input_ref, fasta_file_extension):
        '''
        Stage input based on an input data reference for CheckM

        input_ref can be a reference to an Assembly, BinnedContigs, or (not yet implemented) a Genome

        This method creates a directory in the scratch area with the set of Fasta files, names
        will have the fasta_file_extension parameter tacked on.

            ex:

            staged_input = stage_input('124/15/1', 'fna')

            staged_input
            {"input_dir": '...'}
        '''
        # generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'bins_' + suffix)
        all_seq_fasta = os.path.join(self.scratch, 'all_sequences_' + suffix + '.' + fasta_file_extension)


        # based on the object type, download the input files
        ws = Workspace(self.ws_url)
        input_info = ws.get_object_info3({'objects': [{'ref': input_ref}]})['infos'][0]
        # 0 obj_id objid - the numerical id of the object.
        # 1 obj_name name - the name of the object.
        # 2 type_string type - the type of the object.
        # 3 timestamp save_date - the save date of the object.
        # 4 obj_ver ver - the version of the object.
        # 5 username saved_by - the user that saved or copied the object.
        # 6 ws_id wsid - the workspace containing the object.
        # 7 ws_name workspace - the workspace containing the object.
        # 8 string chsum - the md5 checksum of the object.
        # 9 int size - the size of the object in bytes.
        # 10 usermeta meta - arbitrary user-supplied metadata about
        #     the object.
        obj_name = input_info[1]
        type_name = input_info[2].split('-')[0]
        if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']:
            au = AssemblyUtil(self.callback_url)
            os.makedirs(input_dir)
            filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension)
            au.get_assembly_as_fasta({'ref': input_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
        elif type_name == 'KBaseMetagenomes.BinnedContigs':
            # download the bins as fasta and set the input folder name
            mgu = MetagenomeUtils(self.callback_url)
            bin_file_dir = mgu.binned_contigs_to_file({'input_ref': input_ref, 'save_to_shock': 0})['bin_file_directory']
            os.rename(bin_file_dir, input_dir)
            self.set_fasta_file_extensions(input_dir, fasta_file_extension)
        elif type_name == 'KBaseGenomes.Genome':
            raise ValueError('Cannot yet stage fasta file input directory from KBaseGenomes.Genome')
        else:
            raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)


        # create summary fasta file with all bins
        self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

        return {'input_dir': input_dir, 'folder_suffix': suffix, 'all_seq_fasta': all_seq_fasta}
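
# The set_fasta_file_extensions and cat_fasta_files helpers used above are
# defined elsewhere in the module; below is a minimal sketch of the assumed
# concatenation behavior (name and signature assumed, not the original):
import glob
import os
import shutil

def cat_fasta_files(folder, extension, output_fasta):
    """Concatenate every fasta file in folder into a single output file."""
    with open(output_fasta, 'wb') as out:
        for fasta in sorted(glob.glob(os.path.join(folder, '*.' + extension))):
            with open(fasta, 'rb') as handle:
                shutil.copyfileobj(handle, out)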
Exemplo n.º 19
0
class CufflinksUtils:
    CUFFLINKS_TOOLKIT_PATH = '/opt/cufflinks/'
    GFFREAD_TOOLKIT_PATH = '/opt/cufflinks/'

    def __init__(self, config):
        """
        :param config: SDK module configuration dictionary
        """
        # BEGIN_CONSTRUCTOR
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.srv_wiz_url = config['srv-wiz-url']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

        self.scratch = os.path.join(config['scratch'], str(uuid.uuid4()))
        self._mkdir_p(self.scratch)

        self.tool_used = "Cufflinks"
        self.tool_version = os.environ['VERSION']
        # END_CONSTRUCTOR

    def parse_FPKMtracking_calc_TPM(self, filename):
        """
        Parse a Cufflinks *.fpkm_tracking file and compute log2(FPKM + 1) and
        log2(TPM + 1) values per gene.
        :return: (fpkm_dict, tpm_dict)
        """
        fpkm_dict = {}
        tpm_dict = {}
        gene_col = 0
        fpkm_col = 9
        sum_fpkm = 0.0
        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                gene_id = larr[gene_col]
                if gene_id != "":
                    fpkm = float(larr[fpkm_col])
                    sum_fpkm = sum_fpkm + fpkm
                    fpkm_dict[gene_id] = math.log(fpkm + 1, 2)
                    tpm_dict[gene_id] = fpkm

        if sum_fpkm == 0.0:
            log("Warning: Unable to calculate TPM values as sum of FPKM values is 0")
        else:
            for g in tpm_dict:
                tpm_dict[g] = math.log((tpm_dict[g] / sum_fpkm) * 1e6 + 1, 2)

        return fpkm_dict, tpm_dict
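
    # Worked example of the conversion above, using hypothetical FPKM values:
    # TPM_g = FPKM_g / sum(FPKM) * 1e6, stored as log2(x + 1).
    #     fpkm = {'geneA': 5.0, 'geneB': 15.0}    # sum(FPKM) = 20.0
    #     tpm_geneA = 5.0 / 20.0 * 1e6            # = 250000.0
    #     stored = math.log(250000.0 + 1, 2)      # ~= 17.93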

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
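
    # Note: on Python 3.2+ the same EEXIST-tolerant behavior is available in a
    # single call (a drop-in alternative, not what this module uses):
    #     os.makedirs(path, exist_ok=True)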

    def _validate_run_cufflinks_params(self, params):
        """
        _validate_run_cufflinks_params:
                Raises an exception if params are invalid
        """

        log('Start validating run_cufflinks params')

        # check for required parameters
        for p in ['alignment_object_ref', 'workspace_name', 'genome_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)

            raise ValueError(error_msg)
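
    # A sketch of an equivalent helper built on subprocess.run (Python 3.5+);
    # assumed behavior only, not the module's own code:
    #     def _run_command(self, command):
    #         result = subprocess.run(command, shell=True,
    #                                 stdout=subprocess.PIPE,
    #                                 stderr=subprocess.STDOUT)
    #         if result.returncode != 0:
    #             raise ValueError('Error running command:\n{}\nExit Code: {}\n'
    #                              'Output:\n{}'.format(command, result.returncode,
    #                                                   result.stdout))
    #         return result.stdout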

    def _run_gffread(self, gff_path, gtf_path):
        """
        _run_gffread: run gffread script

        ref: http://cole-trapnell-lab.github.io/cufflinks/file_formats/#the-gffread-utility
        """
        log('converting gff to gtf')
        command = self.GFFREAD_TOOLKIT_PATH + '/gffread '
        command += "-E {0} -T -o {1}".format(gff_path, gtf_path)

        self._run_command(command)

    def _create_gtf_annotation_from_genome(self, genome_ref):
        """
        Create reference annotation file from genome
        """
        contig_id = None
        ref = self.ws.get_object_subset([{
            'ref': genome_ref,
            'included': ['contigset_ref', 'assembly_ref']
        }])
        if 'contigset_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['contigset_ref']
        elif 'assembly_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['assembly_ref']
        if contig_id is None:
            raise ValueError(
                "Genome at {0} does not have reference to the assembly object".format(genome_ref))
        print(contig_id)
        log("Generating GFF file from Genome")
        try:
            ret = self.au.get_assembly_as_fasta({'ref': contig_id})
            output_file = ret['path']
            mapping_filename = c_mapping.create_sanitized_contig_ids(
                output_file)
            os.remove(output_file)
            # get the GFF
            ret = self.gfu.genome_to_gff({'genome_ref': genome_ref})
            genome_gff_file = ret['file_path']
            c_mapping.replace_gff_contig_ids(genome_gff_file,
                                             mapping_filename,
                                             to_modified=True)
            gtf_ext = ".gtf"

            if not genome_gff_file.endswith(gtf_ext):
                gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf'
                self._run_gffread(genome_gff_file, gtf_path)
            else:
                gtf_path = genome_gff_file

            log("gtf file : " + gtf_path)
        except Exception:
            raise ValueError(
                "Generating GTF file from Genome Annotation object failed: {}".format(
                    traceback.format_exc()))
        return gtf_path

    def _get_gtf_file(self, alignment_ref):
        """
        _get_gtf_file: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch
        alignment_data = self.ws.get_objects2(
            {'objects': [{'ref': alignment_ref}]})['data'][0]['data']

        genome_ref = alignment_data.get('genome_id')

        genome_data = self.ws.get_objects2(
            {'objects': [{'ref': genome_ref}]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_gtf_file_from_genome_ref(self, genome_ref):
        """
        _get_gtf_file_from_genome_ref: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch

        genome_data = self.ws.get_objects2(
            {'objects': [{'ref': genome_ref}]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_input_file(self, alignment_ref):
        """
        _get_input_file: get input BAM file from Alignment object
        """

        bam_file_dir = self.rau.download_alignment(
            {'source_ref': alignment_ref})['destination_dir']

        files = os.listdir(bam_file_dir)
        bam_file_list = [
            file for file in files if re.match(r'.*\_sorted\.bam', file)
        ]
        if not bam_file_list:
            bam_file_list = [
                file for file in files if re.match(r'.*(?<!sorted)\.bam', file)
            ]

        if not bam_file_list:
            raise ValueError('Cannot find .bam file from alignment {}'.format(
                alignment_ref))

        bam_file_name = bam_file_list[0]

        bam_file = os.path.join(bam_file_dir, bam_file_name)

        return bam_file
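
    # How the two BAM-name patterns above behave on illustrative filenames:
    #     re.match(r'.*\_sorted\.bam', 'sample_sorted.bam')    -> match
    #     re.match(r'.*(?<!sorted)\.bam', 'sample_sorted.bam') -> no match
    #     re.match(r'.*(?<!sorted)\.bam', 'sample.bam')        -> match
    # i.e. a sorted BAM is preferred; otherwise any BAM whose name does not
    # end in 'sorted.bam' is accepted.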

    def _generate_command(self, params):
        """
        _generate_command: generate cufflinks command
        """
        cufflinks_command = '/opt/cufflinks/cufflinks'
        cufflinks_command += (' -q --no-update-check -p ' +
                              str(params.get('num_threads', 1)))
        if params.get('max_intron_length') is not None:
            cufflinks_command += (' --max-intron-length ' +
                                  str(params['max_intron_length']))
        if params.get('min_intron_length') is not None:
            cufflinks_command += (' --min-intron-length ' +
                                  str(params['min_intron_length']))
        if params.get('overhang_tolerance') is not None:
            cufflinks_command += (' --overhang-tolerance ' +
                                  str(params['overhang_tolerance']))

        cufflinks_command += " -o {0} -G {1} {2}".format(
            params['result_directory'], params['gtf_file'],
            params['input_file'])

        log('Generated cufflinks command: {}'.format(cufflinks_command))

        return cufflinks_command
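
    # With hypothetical inputs (num_threads=4, max_intron_length=300000), the
    # assembled command line looks like:
    #     /opt/cufflinks/cufflinks -q --no-update-check -p 4 \
    #         --max-intron-length 300000 -o /tmp/out -G /tmp/genome.gtf \
    #         /tmp/sample_sorted.bam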

    def _process_rnaseq_alignment_object(self, params):
        """
        _process_rnaseq_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object
        """
        log('start processing RNASeqAlignment object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        if '/' not in params['genome_ref']:
            params['genome_ref'] = params['workspace_name'] + '/' + params['genome_ref']

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_rnaseq_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params['gtf_file'],
            params['expression_suffix'])

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info(
            [{"ref": expression_obj_ref}], includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _process_kbasesets_alignment_object(self, params):
        """
        _process_kbasesets_alignment_object: process a single alignment from a KBaseSets alignment set
        """
        log('start processing KBaseSets object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_kbasesets_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params.get('gtf_file'),
            params.get('expression_suffix'))

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info(
            [{"ref": expression_obj_ref}], includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_html_report(self, result_directory, obj_ref):
        """
        _generate_html_report: generate html summary report
        """
        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]

        expression_object_type = expression_object.get('info')[2]

        Overview_Content = ''
        if re.match(r'KBaseRNASeq.RNASeqExpression-\d.\d',
                    expression_object_type):
            Overview_Content += '<p>Generated Expression Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
        elif re.match(r'KBaseRNASeq.RNASeqExpressionSet-\d.\d',
                      expression_object_type):
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data']['sample_expression_ids']:
                expression_name = self.ws.get_object_info(
                    [{"ref": expression_ref}], includeMetadata=None)[0][1]
                Overview_Content += '<p>{}</p>'.format(expression_name)
        elif re.match(r'KBaseSets.ExpressionSet-\d.\d', expression_object_type):
            pprint(expression_object)
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data']['items']:
                expression_name = self.ws.get_object_info(
                    [{"ref": expression_ref['ref']}], includeMetadata=None)[0][1]
                condition = expression_ref['label']
                Overview_Content += '<p>condition:{0}; expression_name: {1}</p>'.format(
                    condition, expression_name)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                result_file.write(report_template)

        html_report.append({
            'path': result_file_path,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Cufflinks App'
        })
        return html_report

    def _save_rnaseq_expression(self, result_directory, alignment_ref,
                                workspace_name, genome_ref, gtf_file,
                                expression_suffix):
        """
        _save_rnaseq_expression: save Expression object to workspace
        """
        log('start saving Expression object')
        alignment_object_name = self.ws.get_object_info(
            [{"ref": alignment_ref}], includeMetadata=None)[0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref': workspace_name + '/' + expression_name,
            'source_dir': result_directory,
            'alignment_ref': alignment_ref,
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_kbasesets_expression(self, result_directory, alignment_ref,
                                   workspace_name, genome_ref, gtf_file,
                                   expression_suffix):
        """
        _save_kbasesets_expression: save Expression object to workspace using ExpressionUtils
        and SetAPI
        """
        log('start saving Expression object')

        alignment_info = self.ws.get_object_info3(
            {'objects': [{"ref": alignment_ref}]})
        alignment_object_name = alignment_info['infos'][0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref': workspace_name + '/' + expression_name,
            'source_dir': result_directory,
            'alignment_ref': alignment_ref,
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_rnaseq_expression_set(self, alignment_expression_map,
                                    alignment_set_ref, workspace_name,
                                    expression_set_name):
        """
        _save_rnaseq_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref

    def _save_kbasesets_expression_set(self, alignment_expression_map,
                                       alignment_set_ref, workspace_name,
                                       expression_set_name):
        """
        _save_kbasesets_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref

    def _generate_report(self,
                         obj_ref,
                         workspace_name,
                         result_directory,
                         exprMatrix_FPKM_ref=None,
                         exprMatrix_TPM_ref=None):
        """
        _generate_report: generate summary report
        """

        log('creating report')

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_html_report(result_directory,
                                                       obj_ref)

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]
        expression_info = expression_object['info']
        expression_data = expression_object['data']

        expression_object_type = expression_info[2]
        objects_created = []
        if re.match(r'KBaseRNASeq.RNASeqExpression-\d+.\d+',
                    expression_object_type):
            objects_created = [{
                'ref': obj_ref,
                'description': 'Expression generated by Cufflinks'
            }]
        elif re.match(r'KBaseRNASeq.RNASeqExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref': obj_ref,
                'description': 'Expression generated by Cufflinks'
            }]
        elif re.match(r'KBaseSets.ExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref': obj_ref,
                'description': 'ExpressionSet generated by Cufflinks'
            }]
            items = expression_data['items']
            for item in items:
                objects_created.append({
                    'ref': item['ref'],
                    'description': 'Expression generated by Cufflinks'
                })
            objects_created.append({
                'ref': exprMatrix_FPKM_ref,
                'description': 'FPKM ExpressionMatrix generated by Cufflinks'
            })
            objects_created.append({
                'ref': exprMatrix_TPM_ref,
                'description': 'TPM ExpressionMatrix generated by Cufflinks'
            })

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name': 'kb_cufflinks_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _parse_FPKMtracking(self, filename, metric):
        result = {}
        pos1 = 0
        if metric == 'FPKM':
            pos2 = 7
        elif metric == 'TPM':
            pos2 = 8
        else:
            raise ValueError('Unsupported metric: {}'.format(metric))

        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                if larr[pos1] != "":
                    try:
                        result[larr[pos1]] = math.log(float(larr[pos2]) + 1, 2)
                    except ValueError:
                        result[larr[pos1]] = math.log(1, 2)

        return result

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'cufflinks_result.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not file.endswith('.DS_Store'):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by Cufflinks App'
        })

        return output_files

    def _generate_expression_data(self, result_directory, alignment_ref,
                                  gtf_file, workspace_name, expression_suffix):
        """
        _generate_expression_data: generate Expression object with cufflinks output files
        """
        alignment_data_object = self.ws.get_objects2(
            {'objects': [{'ref': alignment_ref}]})['data'][0]

        # set expression name
        alignment_object_name = alignment_data_object['info'][1]
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_data = {
            'id': expression_name,
            'type': 'RNA-Seq',
            'numerical_interpretation': 'FPKM',
            'processing_comments': 'log2 Normalized',
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        }
        alignment_data = alignment_data_object['data']

        condition = alignment_data.get('condition')
        expression_data.update({'condition': condition})

        genome_id = alignment_data.get('genome_id')
        expression_data.update({'genome_id': genome_id})

        read_sample_id = alignment_data.get('read_sample_id')
        expression_data.update(
            {'mapped_rnaseq_alignment': {read_sample_id: alignment_ref}})

        exp_dict, tpm_exp_dict = self.parse_FPKMtracking_calc_TPM(
            os.path.join(result_directory, 'genes.fpkm_tracking'))

        expression_data.update({'expression_levels': exp_dict})

        expression_data.update({'tpm_expression_levels': tpm_exp_dict})

        handle = self.dfu.file_to_shock({
            'file_path': result_directory,
            'pack': 'zip',
            'make_handle': True
        })['handle']
        expression_data.update({'file': handle})

        return expression_data

    def _generate_expression_set_data(self, alignment_expression_map,
                                      alignment_set_ref, expression_set_name):
        """
        _generate_expression_set_data: generate ExpressionSet object with cufflinks output files
        """
        alignment_set_data_object = self.ws.get_objects2(
            {'objects': [{'ref': alignment_set_ref}]})['data'][0]

        alignment_set_data = alignment_set_data_object['data']

        expression_set_data = {
            'tool_used': self.tool_used,
            'tool_version': self.tool_version,
            'id': expression_set_name,
            'alignmentSet_id': alignment_set_ref,
            'genome_id': alignment_set_data.get('genome_id'),
            'sampleset_id': alignment_set_data.get('sampleset_id')
        }

        sample_expression_ids = []
        mapped_expression_objects = []
        mapped_expression_ids = []

        for alignment_expression in alignment_expression_map:
            alignment_ref = alignment_expression.get('alignment_ref')
            expression_ref = alignment_expression.get('expression_obj_ref')
            sample_expression_ids.append(expression_ref)
            mapped_expression_ids.append({alignment_ref: expression_ref})
            alignment_name = self.ws.get_object_info(
                [{"ref": alignment_ref}], includeMetadata=None)[0][1]
            expression_name = self.ws.get_object_info(
                [{"ref": expression_ref}], includeMetadata=None)[0][1]

        expression_set_data['sample_expression_ids'] = sample_expression_ids
        expression_set_data['mapped_expression_objects'] = mapped_expression_objects
        expression_set_data['mapped_expression_ids'] = mapped_expression_ids

        return expression_set_data

    def _process_alignment_set_object(self, params, alignment_object_type):
        """
        _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object
                                        and KBaseSets.ReadsAlignmentSet type object
        """
        log('start processing KBaseRNASeq.RNASeqAlignmentSet object or KBaseSets.ReadsAlignmentSet object'
            '\nparams:\n{}'.format(json.dumps(params, indent=1)))

        alignment_set_ref = params.get('alignment_set_ref')

        if re.match(r'^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            params['gtf_file'] = self._get_gtf_file(alignment_set_ref)
        else:
            if '/' not in params['genome_ref']:
                params['genome_ref'] = params['workspace_name'] + '/' + params['genome_ref']

            params['gtf_file'] = self._get_gtf_file_from_genome_ref(
                params['genome_ref'])

        alignment_set = self.set_api.get_reads_alignment_set_v1({
            'ref': alignment_set_ref,
            'include_item_info': 0,
            'include_set_item_ref_paths': 1
        })
        mul_processor_params = []
        for alignment in alignment_set["data"]["items"]:
            alignment_ref = alignment['ref_path']
            alignment_upload_params = params.copy()
            alignment_upload_params['alignment_ref'] = alignment_ref
            mul_processor_params.append(alignment_upload_params)
            # use the following when you want to run the cmd sequentially
            # self._process_kbasesets_alignment_object(mul_processor_params[0])

        cpus = min(params.get('num_threads', 1), multiprocessing.cpu_count())
        pool = Pool(ncpus=cpus)
        log('running _process_alignment_object with {} cpus'.format(cpus))
        alignment_expression_map = pool.map(
            self._process_kbasesets_alignment_object, mul_processor_params)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        expression_items = list()
        for proc_alignment_return in alignment_expression_map:
            expression_obj_ref = proc_alignment_return.get(
                'expression_obj_ref')
            alignment_ref = proc_alignment_return.get('alignment_ref')
            alignment_info = self.ws.get_object_info3({
                'objects': [{"ref": alignment_ref}],
                'includeMetadata': 1
            })
            condition = alignment_info['infos'][0][10]['condition']
            expression_items.append({
                "ref": expression_obj_ref,
                "label": condition,
            })
            expression_name = self.ws.get_object_info(
                [{"ref": expression_obj_ref}], includeMetadata=None)[0][1]
            self._run_command('cp -R {} {}'.format(
                proc_alignment_return.get('result_directory'),
                os.path.join(result_directory, expression_name)))

        expression_set = {
            "description": "generated by kb_cufflinks",
            "items": expression_items
        }

        expression_set_info = self.set_api.save_expression_set_v1({
            "workspace": params['workspace_name'],
            "output_object_name": params['expression_set_name'],
            "data": expression_set
        })

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_set_info['set_ref']
        }

        widget_params = {
            "output": params.get('expression_set_name'),
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_output_object_name(self, params, alignment_object_type,
                                     alignment_object_name):
        """
        Generates the output object name based on input object type and name and stores it in
        params with key equal to 'expression_name' or 'expression_set_name' based on whether
        the input object is an alignment or alignment set.

        :param params: module input params
        :param alignment_object_type: input alignment object type
        :param alignment_object_name: input alignment object name
        """
        expression_set_suffix = params['expression_set_suffix']
        expression_suffix = params['expression_suffix']

        if re.match(r'^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment$', alignment_object_name):
                params['expression_name'] = re.sub('_[Aa]lignment$',
                                                   expression_suffix,
                                                   alignment_object_name)
            else:  # assume user specified suffix
                params['expression_name'] = alignment_object_name + expression_suffix
        if re.match(r'^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):
                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params['expression_set_name'] = alignment_object_name + expression_set_suffix
        if re.match(r'^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):
                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params['expression_set_name'] = alignment_object_name + expression_set_suffix
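
    # How the suffix renaming above behaves (object names are hypothetical):
    #     re.sub('_[Aa]lignment$', '_expression', 'my_sample_alignment')
    #         -> 'my_sample_expression'
    #     re.sub('_[Aa]lignment_[Ss]et$', '_expression_set', 'my_runs_alignment_set')
    #         -> 'my_runs_expression_set'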

    def _save_expression_matrix(self, expressionset_ref, workspace_name):
        """
        _save_expression_matrix: save FPKM and TPM ExpressionMatrix
        """

        log('start saving ExpressionMatrix object')

        expression_set_name = self.ws.get_object_info(
            [{"ref": expressionset_ref}], includeMetadata=None)[0][1]

        output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '',
                                        expression_set_name)

        upload_expression_matrix_params = {
            'expressionset_ref': expressionset_ref,
            'output_obj_name': output_obj_name_prefix,
            'workspace_name': workspace_name
        }

        expression_matrix_refs = self.eu.get_expressionMatrix(
            upload_expression_matrix_params)

        return expression_matrix_refs

    def run_cufflinks_app(self, params):
        log('--->\nrunning CufflinksUtil.run_cufflinks_app\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_cufflinks_params(params)

        alignment_object_ref = params.get('alignment_object_ref')
        alignment_object_info = self.ws.get_object_info3(
            {"objects": [{"ref": alignment_object_ref}]})['infos'][0]

        alignment_object_type = alignment_object_info[2]
        alignment_object_name = alignment_object_info[1]

        # get output object name
        self._generate_output_object_name(params, alignment_object_type,
                                          alignment_object_name)

        log('--->\nalignment object type: \n' +
            '{}'.format(alignment_object_type))

        if re.match(r'^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            params.update({'alignment_ref': alignment_object_ref})
            returnVal = self._process_rnaseq_alignment_object(params)
            report_output = self._generate_report(
                returnVal.get('expression_obj_ref'),
                params.get('workspace_name'),
                returnVal.get('result_directory'))
            returnVal.update(report_output)
        elif re.match(r'^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type) or \
             re.match(r'^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            params.update({'alignment_set_ref': alignment_object_ref})
            returnVal = self._process_alignment_set_object(
                params, alignment_object_type)
            expression_matrix_refs = self._save_expression_matrix(
                returnVal['expression_obj_ref'], params.get('workspace_name'))
            returnVal.update(expression_matrix_refs)

            report_output = self._generate_report(
                returnVal['expression_obj_ref'], params.get('workspace_name'),
                returnVal['result_directory'],
                expression_matrix_refs['exprMatrix_FPKM_ref'],
                expression_matrix_refs['exprMatrix_TPM_ref'])
            returnVal.update(report_output)
        else:
            raise ValueError(
                'Unsupported object type (expected an alignment or alignment set).\n'
                'Object info:\n{}'.format(alignment_object_info))

        return returnVal
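
# A minimal invocation sketch for run_cufflinks_app above (all values are
# hypothetical; `cufflinks_utils` stands in for a CufflinksUtils instance):
params = {
    'alignment_object_ref': '123/4/5',   # alignment or alignment set UPA
    'workspace_name': 'my_workspace',
    'genome_ref': '123/6/1',
    'expression_suffix': '_expression',
    'expression_set_suffix': '_expression_set',
    'num_threads': 2,
}
result = cufflinks_utils.run_cufflinks_app(params)
print(result['expression_obj_ref'], result['report_name'])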
Exemplo n.º 20
0
    def filter_contigs(self, ctx, params):
        """
        Main method
        :param params: instance of type "ContigFilterParams" (Input parameter
           types) -> structure: parameter "workspace_name" of String,
           parameter "assembly_ref" of String, parameter "min_length" of Long
        :returns: instance of type "ContigFilterResults" (Output result
           types) -> structure: parameter "report_name" of String, parameter
           "report_ref" of String, parameter "filtered_assembly_ref" of
           String, parameter "n_total" of Long, parameter "n_remaining" of
           Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN filter_contigs
        for name in ['min_length', 'assembly_ref', 'workspace_name']:
            if name not in params:
                raise ValueError('Parameter "' + name + '" is required but missing')
        if not isinstance(params['min_length'], int) or (params['min_length'] < 0):
            raise ValueError('Min length must be a non-negative integer')
        if not isinstance(params['assembly_ref'], str) or not len(params['assembly_ref']):
            raise ValueError('Pass in a valid assembly reference string')

        print("params['min_length']=%s, params['assembly_ref']=%s" % (params['min_length'], params['assembly_ref']))
        print("params['workspace_name']=%s" % (params['workspace_name']))
        print("self.callback_url=%s" % self.callback_url)
        print("self.scratch=%s" % self.scratch)
        print("config = ")
        pprint.pprint(self.config)

        ###############
        # Download ref
        ##############
        assembly_util = AssemblyUtil(self.callback_url)
        file = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})
        print "assembly fasta file = "
        pprint.pprint(file)

        ###################################
        # Real business - filter the contig
        ###################################
        parsed_assembly = SeqIO.parse(file['path'], 'fasta')
        min_length = params['min_length']
        # Keep a list of contigs greater than min_length
        good_contigs = []
        # total contigs regardless of length
        n_total = 0
        # total contigs over the min_length
        n_remaining = 0
        for record in parsed_assembly:
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        ##################
        # Output
        ##################
        workspace_name = params['workspace_name']
        filtered_path = os.path.join(self.scratch, 'filtered.fasta')
        SeqIO.write(good_contigs, filtered_path, 'fasta')
        # Upload the filtered data to the workspace
        new_ref = assembly_util.save_assembly_from_fasta({
            'file': {
                'path': filtered_path
            },
            'workspace_name': workspace_name,
            'assembly_name': file['assembly_name']
        })

        ################
        # Reporting
        ################
        text_message = "".join([
            'Filtered assembly to ', str(n_remaining),
            's contigs out of ', str(n_total)
        ])
        # Data for creating the report, referencing the assembly we uploaded
        report_data = {
            'objects_created': [
                {'ref': new_ref, 'description': 'Filtered contigs'}
            ],
            'text_message': text_message
        }
        # Initialize the report
        kbase_report = KBaseReport(self.callback_url)
        report = kbase_report.create({
            'report': report_data,
            'workspace_name': workspace_name
        })
        # Return the report reference and name in our results
        returnVal = {
            'report_ref': report['ref'],
            'report_name': report['name'],
            'n_total': n_total,
            'n_remaining': n_remaining,
            'filtered_assembly_ref': new_ref
        }

        ###############
        # BBtools test
        ###############
        # bbtools = BBTools(self.callback_url)
        bbtools = BBTools(self.callback_url, service_ver='beta')

        # set up input files
        print "file['path'] = "
        print file['path']
        # print new_ref['filtered_assembly_ref']
        rqc_filter_input = {
            "reads_file": file['path'] # /kb/module/work/tmp/Shewanella_oneidensis_MR-1_assembly.fa
        }
        # or, if you want to use a KBase Workspace UPA for your reads object:
        # rqc_filter_input = {
        #     "reads_library_ref": new_ref['filtered_assembly_ref']
        # }

        # set up parameters (example below, there are many more options, see BBTools.spec)
        rqc_filter_params = {
            "qtrim": "rl",
            "maxns": 3,
            "minlength": 40,
            "maxmem": 5
        }

        # run the local RQCFilter function
        result = bbtools.run_RQCFilter_local(rqc_filter_input, rqc_filter_params)
        print "result = "
        pprint.pprint(result)
        #END filter_contigs

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method filter_contigs return value returnVal is not type dict as required.')

        # return the results
        return [returnVal]
Exemplo n.º 21
0
    def predict_amr_phenotype(self, ctx, params):
        """
        The AMR prediction function specification
        :param params: instance of type "AMRPredictionParams" (Structure of
           input data for AMR prediction) -> structure: parameter
           "assembly_input_ref" of type "assembly_ref", parameter "species"
           of String, parameter "workspace_name" of String
        :returns: instance of type "AMRPredictionResults" (Structure of
           output of AMR prediction) -> structure: parameter "report_name" of
           String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN predict_amr_phenotype

        # Input validation
        for name in ['assembly_input_ref', 'species', 'workspace_name']:
            if name not in params:
                raise ValueError('Parameter "' + name + '" is required but missing')
        if not isinstance(params['assembly_input_ref'], (str, list)) or not len(params['assembly_input_ref']):
            raise ValueError('Pass in a valid assembly reference string(s)')

        # Extract params
        if not isinstance(params["assembly_input_ref"], list):
            assemblies = [params["assembly_input_ref"]]
        else:
            assemblies = params["assembly_input_ref"]
        species = params["species"]

        # Get models for species
        scm_models = self.get_models_by_algorithm_and_species("scm", species)
        cart_models = self.get_models_by_algorithm_and_species("cart", species)

        # Process assemblies
        predictions = {}
        assembly_util = AssemblyUtil(self.callback_url)
        
        for assembly_ref in assemblies:
            
            assembly_predictions = {}

            # Get the fasta file path and other info
            assembly = assembly_util.get_assembly_as_fasta({'ref': assembly_ref})

            # Extract the k-mers
            kmers = self.extract_kmers(assembly["path"], k=31)
            print "Kmers --", assembly["assembly_name"], ":", len(kmers)

            # Make predictions (SCM)
            print "SCM models"
            assembly_predictions["scm"] = {}
            for antibiotic, model in scm_models.iteritems():
                p = model.predict(kmers)
                assembly_predictions["scm"][antibiotic] = {}
                assembly_predictions["scm"][antibiotic]["label"] = p[0]
                assembly_predictions["scm"][antibiotic]["why"] = p[1]

            # Make predictions (CART)
            print("CART models")
            assembly_predictions["cart"] = {}
            for antibiotic, model in cart_models.items():
                p = model.predict(kmers)
                assembly_predictions["cart"][antibiotic] = {
                    "label": p[0],
                    "why": p[1],
                }

            predictions[assembly["assembly_name"]] = assembly_predictions
            del assembly_predictions

        # Generate report
        text_message = "This is a test report for kover amr (text)"

        # Data for creating the report (no new objects are created here)
        report_data = {
            'objects_created': [],
            'text_message': text_message,
            'direct_html': generate_html_prediction_report(predictions, species)
        }

        # Initialize the report
        kbase_report = KBaseReport(self.callback_url)
        report = kbase_report.create({
            'report': report_data,
            'workspace_name': params['workspace_name'],
            'file_links': generate_csv_prediction_report(predictions, species, self.scratch)
        })

        output = {
            'report_ref': report['ref'],
            'report_name': report['name']
        }

        #END predict_amr_phenotype

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method predict_amr_phenotype return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
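
# Shape of the predictions structure assembled above (antibiotic names,
# labels, and k-mer lists are hypothetical):
predictions = {
    'my_assembly': {
        'scm': {
            'amoxicillin': {'label': 'resistant', 'why': ['kmer_1', 'kmer_2']},
        },
        'cart': {
            'amoxicillin': {'label': 'sensitive', 'why': ['kmer_3']},
        },
    },
}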
Exemplo n.º 22
0
    def get_assembly_info(self, ref):
        assembly_util = AssemblyUtil(self.callback_url)
        file = assembly_util.get_assembly_as_fasta({'ref': ref})
        return dict(name=file["assembly_name"], contigs=list(SeqIO.parse(file['path'], 'fasta')))
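
# Usage sketch for get_assembly_info above (`obj` and the reference are
# hypothetical):
info = obj.get_assembly_info('123/4/5')
print(info['name'], '-', len(info['contigs']), 'contigs')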
Exemplo n.º 23
0
class ProkkaUtils:

    def __init__(self, config):
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        self.callback_url = config["SDK_CALLBACK_URL"]

        self.ws_client = workspaceService(config["workspace-url"])
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)

        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = {}
        self.output_workspace = None

    @staticmethod
    def _get_input_value(params, key):
        """Get value of key after checking for its existence

        :param params: Params dictionary haystack
        :param key: Key to search in Params
        :return: Parameter Value
        :raises ValueError: raises an exception if the key doesn't exist
        """
        if key not in params:
            raise ValueError("Parameter " + key + " should be set in input parameters")
        return params[key]

    @staticmethod
    def _get_qualifier_value(qualifier):
        """Get first qualifier from the list of qualifiers

        :param qualifier: list contents of the qualifier from BCBio GFF Tools
        :return: first element in the list
        """
        return qualifier[0] if qualifier else None

    def download_seed_data(self):
        """Download Seed Data Ontology, and set the gene_ontology reference (sso_ref) and
        the create a table from ec numbers to sso (ec_to_sso)

        :return: None
        """
        # Download Seed Reference Data
        sso_ret = self.ws_client.get_objects([{"ref": "KBaseOntology/seed_subsystem_ontology"}])[0]
        sso = sso_ret["data"]
        for sso_id in sso["term_hash"]:
            sso_name = sso["term_hash"][sso_id]["name"]
            if "(EC " in sso_name and sso_name.endswith(")"):
                ec = sso_name[sso_name.index("(EC ") + 4: -1].strip()
                sso_list = self.ec_to_sso.get(ec, None)
                if not sso_list:
                    sso_list = []
                    self.ec_to_sso[ec] = sso_list
                sso_list.append(sso["term_hash"][sso_id])
        print("EC found in SSO: " + str(len(self.ec_to_sso)))
        sso_info = sso_ret["info"]
        sso_ref = str(sso_info[6]) + "/" + str(sso_info[0]) + "/" + str(sso_info[4])
        with open("/kb/module/work/seed_so.json", "w") as outfile:
            json.dump(sso, outfile, sort_keys=True, indent=4)
        self.sso_ref = sso_ref

    def inspect_assembly(self, assembly_meta, assembly_ref):
        """Check to see if assembly has too many contigs and might not be a metagenome or
        non prokaryotic dataset

        :param assembly_meta: information about the assembly reference
        :param assembly_ref: the assembly reference number
        :return: a tuple containing gc_content and dna_size
        """
        gc_content = float(assembly_meta.get("GC content"))
        dna_size = int(assembly_meta.get("Size"))
        n_contigs = 0
        if "N Contigs" in assembly_meta:
            n_contigs = int(assembly_meta.get("N Contigs"))
        else:
            contig = self.ws_client.get_objects([{"ref": assembly_ref}])[0]
            n_contigs = len(contig["data"]["contigs"])
        if n_contigs >= 30000:
            message = """
             Hmmm.  There are over 30,000 contigs in this Assembly.
             It looks like you are trying to run Prokka on a metagenome or non-prokaryotic data set.
             If this is a metagenome data set we recommend using an App like MaxBin to first bin the contigs into genome-like bins.
             These bins can then be individually annotated as a single genome using Prokka.
             If this data comes from a Eukaryotic sample, KBase does not currently have an annotation app designed for Eukaryotes.
             Alternatively, you can try reducing the number of contigs using a filter app.
             """
            print(message)
            raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions")

        assembly_info = namedtuple("assembly_info", "gc_content dna_size")
        return assembly_info(gc_content, dna_size)
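
    # Hypothetical usage sketch for inspect_assembly: the metadata dict comes
    # from a workspace object_info tuple (index 10), so a call might look like
    # the following (all values are made up for illustration):
    #
    #   meta = {"GC content": "0.41", "Size": "4600000", "N Contigs": "97"}
    #   stats = prokka_utils.inspect_assembly(meta, "1/2/3")
    #   print(stats.gc_content, stats.dna_size)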

    @staticmethod
    def create_renamed_assembly(assembly_fasta_filepath):
        """Rename records to be in the format of contig_N and output a new fasta file

        :param assembly_fasta_filepath:
        :return: The path to the fasta file with renamed contigs, the number of contigs,
        the mapping from new ids to old ids, and the contigs as SeqRecords
        """
        records = []
        new_ids_to_old = {}
        contig_counter = 0
        for record in SeqIO.parse(assembly_fasta_filepath, "fasta"):
            contig_counter += 1
            old_id = record.id
            new_id = "contig_" + str(contig_counter)
            sequence = record.seq  # it has type "Seq"
            record = SeqRecord(sequence, id=new_id, description="(" + old_id + ")")
            records.append(record)
            new_ids_to_old[new_id] = old_id

        renamed_assembly_fasta_filepath = assembly_fasta_filepath + "_renamed.fna"
        SeqIO.write(records, renamed_assembly_fasta_filepath, "fasta")

        renamed_assembly = namedtuple("renamed_assembly",
                                      "filepath contig_counter new_ids_to_old records")
        return renamed_assembly(renamed_assembly_fasta_filepath, contig_counter, new_ids_to_old,
                                records)
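
    # Illustrative companion to create_renamed_assembly (not in the original
    # class): restoring the original contig ids on the renamed records, using
    # the new_ids_to_old mapping the method returns.
    @staticmethod
    def _restore_contig_ids_sketch(records, new_ids_to_old):
        """Map contig_N ids back to their original ids."""
        restored = []
        for record in records:
            original_id = new_ids_to_old.get(record.id, record.id)
            restored.append(SeqRecord(record.seq, id=original_id, description=""))
        return restored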

    def run_prokka(self, params, subject_fasta_filepath):
        """Run Prokka

        :param params: Prokka parameters
        :param subject_fasta_filepath: The contigs or genes to run prokka against
        :return: The directory with all of the Prokka output files
        """
        output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4())

        # --kingdom [X]  Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default "Bacteria")
        kingdom = "Bacteria"
        if "kingdom" in params and params["kingdom"]:
            kingdom = params["kingdom"]

        prokka_cmd_list = ["perl", "/kb/prokka/bin/prokka", "--outdir", output_dir, "--prefix",
                           "mygenome", "--kingdom", kingdom]

        # --genus [X]       Genus name (triggers to use --usegenus)
        if "genus" in params and params["genus"]:
            prokka_cmd_list.extend(["--genus", str(params["genus"]), "--usegenus"])
        # --gcode [N]       Genetic code / Translation table (set if --kingdom is set) (default "0")
        if "gcode" in params and params["gcode"]:
            prokka_cmd_list.extend(["--gcode", str(params["gcode"])])
        else:
            prokka_cmd_list.extend(["--gcode", "0"])
        # --gram [X]        Gram: -/neg +/pos (default "")
        if "gram" in params and params["gram"]:
            raise ValueError("gram parameter is not supported in current Prokka installation")
        # --metagenome      Improve gene predictions for highly fragmented genomes (default OFF)
        if "metagenome" in params and params["metagenome"] == 1:
            prokka_cmd_list.append("--metagenome")
        # --rawproduct      Do not clean up /product annotation (default OFF)
        if "rawproduct" in params and params["rawproduct"] == 1:
            prokka_cmd_list.append("--rawproduct")
        # --fast            Fast mode - skip CDS /product searching (default OFF)
        if "fast" in params and params["fast"] == 1:
            prokka_cmd_list.append("--fast")
        # --mincontiglen [N] Minimum contig size [NCBI needs 200] (default "1")
        if "mincontiglen" in params and params["mincontiglen"]:
            prokka_cmd_list.extend(["--mincontiglen", str(params["mincontiglen"])])
        # --evalue [n.n]    Similarity e-value cut-off (default "1e-06")
        if "evalue" in params and params["evalue"]:
            prokka_cmd_list.extend(["--evalue", str(params["evalue"])])
        # --rfam            Enable searching for ncRNAs with Infernal+Rfam (SLOW!) (default "0")
        if "rfam" in params and params["rfam"] == 1:
            prokka_cmd_list.append("--rfam")
        # --norrna          Don"t run rRNA search (default OFF)
        if "norrna" in params and params["norrna"] == 1:
            prokka_cmd_list.append("--norrna")
        # --notrna          Don"t run tRNA search (default OFF)
        if "notrna" in params and params["notrna"] == 1:
            prokka_cmd_list.append("--notrna")
        prokka_cmd_list.append(subject_fasta_filepath)
        print("Prokka command line: " + str(prokka_cmd_list))

        try:
            check_output(prokka_cmd_list, cwd=self.scratch)
        except CalledProcessError as e:
            pprint(e)
            raise
        return output_dir
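
    # For reference, the command list built above for a hypothetical params
    # dict {"kingdom": "Bacteria", "gcode": 11, "metagenome": 1} would be:
    #
    #   ["perl", "/kb/prokka/bin/prokka", "--outdir", <output_dir>,
    #    "--prefix", "mygenome", "--kingdom", "Bacteria",
    #    "--gcode", "11", "--metagenome", <subject_fasta_filepath>]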

    @staticmethod
    def retrieve_prokka_results(output_dir):
        """ Gather up the relevant prokka results, load the records from the results files

        :param output_dir:
        :return: Sequences from the .faa .ffn files and the gff_filepath
        """
        faa_file = output_dir + "/mygenome.faa"
        cds_to_prot = {}
        for record in SeqIO.parse(faa_file, "fasta"):
            cds_to_prot[record.id] = str(record.seq)
        ffn_file = output_dir + "/mygenome.ffn"
        cds_to_dna = {}
        for record in SeqIO.parse(ffn_file, "fasta"):
            cds_to_dna[record.id] = str(record.seq)
        gff_file = output_dir + "/mygenome.gff"
        if not os.path.isfile(gff_file):
            raise ValueError("PROKKA output GFF file is not found")

        prokka_results = namedtuple("prokka_results", "cds_to_prot cds_to_dna gff_filepath")
        return prokka_results(cds_to_prot, cds_to_dna, gff_file)
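
    # Note on the file layout assumed above: Prokka writes everything under
    # <output_dir>/mygenome.* because run_prokka sets "--prefix mygenome".
    # The .faa and .ffn records share locus ids with the .gff features, which
    # is what lets parse_prokka_results below join proteins and DNA to genes.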

    def parse_prokka_results(self, **prokka_parse_parameters):
        """ Go through the prokka results from the input contigs and then
        create the features, mrnas and cdss components of the KbaseGenome.Genome object

        :param prokka_parse_parameters: gff_filepath, mappings
        :return: Genome:features Genome:cdss  Genome:mrnas report_message of genes discovered
        """
        gff_filepath = prokka_parse_parameters["gff_filepath"]
        cds_to_dna = prokka_parse_parameters["cds_to_dna"]
        cds_to_prot = prokka_parse_parameters["cds_to_prot"]
        new_ids_to_old = prokka_parse_parameters["new_ids_to_old"]

        evidence = self.make_annotation_evidence()

        cdss = []
        mrnas = []
        features = []
        non_hypothetical = 0
        genes_with_ec = 0
        genes_with_sso = 0
        prot_lengths = []
        with open(gff_filepath, "r") as f1:
            for rec in GFF.parse(f1):
                contig_id = new_ids_to_old[str(rec.id)]
                for ft in rec.features:
                    loc = ft.location
                    min_pos = int(loc.start) + 1
                    max_pos = int(loc.end)
                    strand = "+" if loc.strand == 1 else "-"
                    flen = max_pos - min_pos + 1
                    start = min_pos if strand == "+" else max_pos
                    location = [[contig_id, start, strand, flen]]
                    qualifiers = ft.qualifiers
                    generated_id = self._get_qualifier_value(qualifiers.get("ID"))
                    if not generated_id:
                        # Skipping feature with no ID (mostly repeat regions)
                        continue
                    dna = cds_to_dna.get(generated_id)
                    if not dna:
                        # Skipping feature with no DNA (mostly repeat regions)
                        continue
                    name = self._get_qualifier_value(qualifiers.get("Name"))
                    ec = self._get_qualifier_value(qualifiers.get("eC_number"))
                    gene = self._get_qualifier_value(qualifiers.get("gene"))
                    product = self._get_qualifier_value(qualifiers.get("product"))
                    fid = generated_id
                    aliases = []
                    if name:
                        aliases.append(name)
                    if gene:
                        aliases.append(gene)
                    if ec:
                        aliases.append(ec)
                        genes_with_ec += 1
                    md5 = hashlib.md5(dna.encode()).hexdigest()
                    feature = {"id": fid, "location": location, "type": "gene",
                               "aliases": aliases, "md5": md5, "dna_sequence": dna,
                               "dna_sequence_length": len(dna),
                               }
                    if product:
                        feature["function"] = product
                        if product != "hypothetical protein":
                            non_hypothetical += 1
                    if ec and ec in self.ec_to_sso:
                        sso_list = self.ec_to_sso[ec]
                        sso_terms = {}
                        for sso_item in sso_list:
                            sso_terms[sso_item["id"]] = {"id": sso_item["id"],
                                                         "evidence": [evidence],
                                                         "term_name": sso_item["name"],
                                                         "ontology_ref": self.sso_ref,
                                                         "term_lineage": []}
                        feature["ontology_terms"] = {"SSO": sso_terms}
                        genes_with_sso += 1
                    cds = None
                    mrna = None
                    prot = cds_to_prot.get(generated_id)
                    if prot:
                        cds_id = fid + "_CDS"
                        mrna_id = fid + "_mRNA"
                        prot_len = len(prot)
                        prot_lengths.append(prot_len)
                        feature["protein_translation"] = prot
                        feature["protein_translation_length"] = prot_len
                        feature["cdss"] = [cds_id]
                        feature["mrnas"] = [mrna_id]
                        cds = {"id": cds_id, "location": location, "md5": md5, "parent_gene": fid,
                               "parent_mrna": mrna_id, "function": (product if product else ""),
                               "ontology_terms": {}, "protein_translation": prot,
                               "protein_translation_length": prot_len, "aliases": aliases}
                        mrna = {"id": mrna_id, "location": location, "md5": md5,
                                "parent_gene": fid, "cds": cds_id}
                    features.append(feature)
                    if cds:
                        cdss.append(cds)
                    if mrna:
                        mrnas.append(mrna)

        # Prepare report
        report = ""
        report += "Number of genes predicted: " + str(len(features)) + "\n"
        report += "Number of protein coding genes: " + str(len(prot_lengths)) + "\n"
        report += "Number of genes with non-hypothetical function: " + str(non_hypothetical) + "\n"
        report += "Number of genes with EC-number: " + str(genes_with_ec) + "\n"
        report += "Number of genes with Seed Subsystem Ontology: " + str(genes_with_sso) + "\n"
        report += "Average protein length: " + str(int(sum(prot_lengths) /
                                                       float(len(prot_lengths)))) + " aa.\n"

        annotated_assembly = namedtuple("annotated_assembly", "features cdss mrnas report_message")
        return annotated_assembly(features, cdss, mrnas, report)
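
    # Minimal sketch (an assumption, not part of the original class) of the
    # coordinate convention used in parse_prokka_results: BCBio GFF locations
    # are 0-based half-open, while KBase locations are
    # [contig, 1-based start on the coding strand, strand, length].
    @staticmethod
    def _gff_to_kbase_location_sketch(contig_id, start, end, strand_int):
        """Convert a BCBio GFF feature location to a KBase location tuple."""
        min_pos = int(start) + 1
        max_pos = int(end)
        strand = "+" if strand_int == 1 else "-"
        flen = max_pos - min_pos + 1
        begin = min_pos if strand == "+" else max_pos
        return [contig_id, begin, strand, flen]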

    def get_new_annotations(self, gff_filepath):
        """

        :param gff_filepath: A dictionary of ids with products and ec numbers
        :return:
        """
        evidence = self.make_annotation_evidence()
        genome = {}
        with open(gff_filepath, "r") as f:
            for rec in GFF.parse(f):
                gid = rec.id
                gene_features = {"id": id}

                for feature in rec.features:
                    qualifiers = feature.qualifiers
                    if "product" in qualifiers:
                        gene_features["function"] = " ".join(qualifiers["product"])

                    if "eC_number" in qualifiers:
                        ec_numbers = qualifiers["eC_number"]
                        sso_terms = dict()
                        for ec in ec_numbers:
                            sso_list = self.ec_to_sso.get(ec, [])
                            for sso_item in sso_list:
                                sso_terms[sso_item["id"]] = {"id": sso_item["id"],
                                                             "evidence": [evidence],
                                                             "term_name": sso_item["name"],
                                                             "ontology_ref": self.sso_ref,
                                                             "term_lineage": []}

                        gene_features["ontology_terms"] = sso_terms
                genome[gid] = gene_features

        return genome

    def write_genome_to_fasta(self, genome_data):
        """

        :param genome_data:
        :return:
        """
        fasta_for_prokka_filepath = os.path.join(self.scratch,
                                                 "features_" + str(uuid.uuid4()) + ".fasta")
        count = 0
        with open(fasta_for_prokka_filepath, "w") as f:
            for item in genome_data["data"]["features"]:
                if "id" not in item or "dna_sequence" not in item:
                    print("This feature does not have a valid dna sequence.")
                else:
                    f.write(">" + item["id"] + "\n" + item["dna_sequence"] + "\n")
                    count += 1

        print("Finished printing to" + fasta_for_prokka_filepath)
        if os.stat(fasta_for_prokka_filepath).st_size == 0:
            raise Exception(
                "This genome does not contain features with DNA_SEQUENCES. Fasta file is empty.")

        return fasta_for_prokka_filepath

    def make_sso_ontology_event(self):
        """

        :param sso_ref: Reference to the annotation library set
        :return: Ontology_event to be appended to the list of genome ontology events
        """
        time_string = str(
            datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation",
            "method_version": version,
            "timestamp": time_string,
            "id": "SSO",
            "ontology_ref": self.sso_ref
        }

    def make_annotation_evidence(self):
        """

        :param sso_ref: Reference to the annotation library set
        :return: Ontology_event to be appended to the list of genome ontology events
        """
        time_string = str(
            datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation (Evidence)",
            "method_version": version,
            "timestamp": time_string,
        }

    def create_genome_ontology_fields(self, genome_data):
        # Make sure ontologies_events exist
        sso_event = self.make_sso_ontology_event()
        ontology_event_index = 0

        if 'ontology_events' in genome_data['data']:
            genome_data['data']['ontology_events'].append(sso_event)
            ontology_event_index = len(genome_data['data']['ontology_events']) - 1
        else:
            genome_data['data']['ontology_events'] = [sso_event]

        genome_obj_modified = namedtuple('genome_obj_modified', 'genome_data ontology_event_index')
        return genome_obj_modified(genome_data, ontology_event_index)

    @staticmethod
    def old_genome_ontologies(feature, new_ontology):
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}
        for key in new_ontology.keys():
            feature["ontology_terms"]["SSO"][key] = new_ontology[key]
        return feature

    @staticmethod
    def new_genome_ontologies(feature, new_ontology, ontology_event_index):
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}

        for key in new_ontology.keys():
            term_id = new_ontology[key]["id"]
            if term_id in feature["ontology_terms"]["SSO"]:
                feature["ontology_terms"]["SSO"][term_id].append(ontology_event_index)
            else:
                feature["ontology_terms"]["SSO"][term_id] = [ontology_event_index]
        return feature
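
    # The two helpers above reflect two generations of the Genome type: "old"
    # genomes store the full term dict per SSO id, while "new" genomes store a
    # list of ontology_event indices per SSO id. Illustrative shapes only:
    #
    #   old: feature["ontology_terms"]["SSO"]["SSO:000009304"] = {"id": ..., "evidence": [...]}
    #   new: feature["ontology_terms"]["SSO"]["SSO:000009304"] = [0, 2]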

    def annotate_genome_with_new_annotations(self, **annotation_args):
        """

        :param annotation_args: genome_data, new_annotations from prokka, and the output_genome_name
        :type
        :return:
        """
        genome_data = annotation_args["genome_data"]
        new_annotations = annotation_args["new_annotations"]

        new_genome = False
        if 'feature_counts' in genome_data['data']:
            new_genome = True
            genome_obj_modified = self.create_genome_ontology_fields(genome_data)
            genome_data = genome_obj_modified.genome_data
            ontology_event_index = genome_obj_modified.ontology_event_index

        stats = {"current_functions": len(genome_data["data"]["features"]), "new_functions": 0,
                 "found_functions": 0, "new_ontologies": 0}

        function_report_filepath = os.path.join(self.scratch, "function_report")
        ontology_report_filepath = os.path.join(self.scratch, "ontology_report")
        func_r = open(function_report_filepath, "w")
        onto_r = open(ontology_report_filepath, "w")
        func_r.write("function_id current_function new_function\n")
        onto_r.write("function_id current_ontology new_ontology\n")

        for i, feature in enumerate(genome_data["data"]["features"]):
            fid = feature["id"]
            current_function = feature.get("function", "")
            current_functions = feature.get("functions", [])
            current_ontology = feature.get("ontology_terms", None)
            new_function = ""
            new_ontology = dict()

            if fid in new_annotations:
                # Set Function
                new_function = new_annotations[fid].get("function", "")
                if new_function and "hypothetical protein" not in new_function:
                    if (new_function != current_function and new_function not in current_functions):
                        stats['new_functions'] += 1
                    genome_data["data"]["features"][i]["function"] = new_function
                    genome_data["data"]["features"][i]["functions"] = [new_function]
                    stats['found_functions'] += 1

                # Set Ontologies
                new_ontology = new_annotations[fid].get("ontology_terms", None)
                if new_ontology:
                    stats['new_ontologies'] += 1
                    if new_genome:
                        genome_data["data"]["features"][i] = self.new_genome_ontologies(
                            feature, new_ontology, ontology_event_index)
                    else:
                        genome_data["data"]["features"][i] = self.old_genome_ontologies(
                            feature, new_ontology)
            if current_function:
                func_r.write(json.dumps([fid, [current_function], [new_function]]) + "\n")
            else:
                func_r.write(json.dumps([fid, current_functions, [new_function]]) + "\n")

            onto_r.write(json.dumps([fid, current_ontology, new_ontology]) + "\n")

        func_r.close()
        onto_r.close()

        info = self.gfu.save_one_genome({"workspace": self.output_workspace,
                                         "name": annotation_args["output_genome_name"],
                                         "data": genome_data["data"],
                                         "provenance": self.ctx.provenance()})["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        annotated_genome = namedtuple("annotated_genome",
                                      "genome_ref function_report_filepath ontology_report_filepath stats")

        return annotated_genome(genome_ref, function_report_filepath, ontology_report_filepath,
                                stats)

    def upload_file(self, filepath, message="Annotation report generated by kb_prokka"):
        """
        Upload a file to shock
        :param filepath: File to upload
        :param message: Optional Upload Message
        :return:
        """
        output_file_shock_id = self.dfu.file_to_shock({"file_path": filepath})["shock_id"]
        print("Uploaded filepath" + filepath + "to shock and got id" + output_file_shock_id)
        return {"shock_id": output_file_shock_id,
                "name": os.path.basename(filepath),
                "label": os.path.basename(filepath),
                "description": message}

    def report_annotated_genome(self, genome):
        """ Create report output with newly reannotated genome, and some stats

        :param genome: Reannotated Genome Reference, Report Files and Stats
        :return: Reference to Report Object
        """
        genome_ref = genome.genome_ref
        stats = genome.stats

        file_links = [self.upload_file(genome.ontology_report_filepath),
                      self.upload_file(genome.function_report_filepath)]

        report_message = ("Genome Ref:{0}\n"
                          "Number of features sent into prokka:{1}\n"
                          "New functions found:{2}\n"
                          "Ontology terms found:{3}\n"
                          ).format(genome_ref, stats["current_functions"], stats["new_functions"],
                                   stats["new_ontologies"])

        report_info = self.kbr.create_extended_report(
            {"message": report_message,
             "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}],
             "file_links": file_links,
             "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
             "workspace_name": self.output_workspace
             })

        return {"output_genome_ref": genome_ref, "report_name": report_info["name"],
                "report_ref": report_info["ref"]}

    def annotate_genome(self, params):
        """ User input an existing genome to re-annotate.

        :param params: Reference to the genome, Output File Name, UI Parameters
        :return: Report with Reannotated Genome and Stats about it
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        genome_ref = self._get_input_value(params, "object_ref")
        output_name = self._get_input_value(params, "output_genome_name")
        # genome_data = self.dfu.get_objects({"object_refs": [genome_ref]})["data"][0]

        genome_data = \
            self.genome_api.get_genome_v1({"genomes": [{"ref": genome_ref}], 'downgrade': 0})[
                "genomes"][0]

        fasta_for_prokka_filepath = self.write_genome_to_fasta(genome_data)
        output_dir = self.run_prokka(params, fasta_for_prokka_filepath)
        prokka_results = self.retrieve_prokka_results(output_dir)
        new_annotations = self.get_new_annotations(prokka_results.gff_filepath)
        annotated_genome = self.annotate_genome_with_new_annotations(genome_data=genome_data,
                                                                     new_annotations=new_annotations,
                                                                     output_genome_name=output_name)
        return self.report_annotated_genome(annotated_genome)

    def annotate_assembly(self, params, assembly_info):
        """
        Annotate an assembly with Prokka. The steps include to download the assembly as a fasta file,
        rename the contigs, run prokka against the contigs, parse the results, and finally,
        create and upload a genome object.

        :param params: object reference, output_genome_name and output_workspace
        :param assembly_info: Information used to determine if the assembly is too big
        :return: Report with newly annotated assembly as a genome, and stats about it
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        assembly_ref = self._get_input_value(params, "object_ref")
        output_genome_name = self._get_input_value(params, "output_genome_name")
        output_workspace = self._get_input_value(params, "output_workspace")
        assembly_info = self.inspect_assembly(assembly_info[10], assembly_ref)
        orig_fasta_file = self.au.get_assembly_as_fasta({"ref": assembly_ref})["path"]

        # Rename Assembly and Keep Track of Old Contigs
        renamed_assembly = self.create_renamed_assembly(orig_fasta_file)
        # Run Prokka with the modified, renamed fasta file
        output_dir = self.run_prokka(params, renamed_assembly.filepath)
        # Retrieve Prokka results
        prokka_results = self.retrieve_prokka_results(output_dir)
        # Parse results into Genome features, CDSs, and mRNAs
        annotated_assembly = self.parse_prokka_results(gff_filepath=prokka_results.gff_filepath,
                                                       cds_to_dna=prokka_results.cds_to_dna,
                                                       cds_to_prot=prokka_results.cds_to_prot,
                                                       new_ids_to_old=renamed_assembly.new_ids_to_old)

        # Force defaults for optional parameters that may be set to None
        scientific_name = 'Unknown'
        if 'scientific_name' in params and params['scientific_name']:
            scientific_name = params['scientific_name']
        domain = "Bacteria"
        if 'kingdom' in params and params['kingdom']:
            domain = params['kingdom']
        gcode = 0
        if 'gcode' in params and params['gcode']:
            gcode = params['gcode']

        genome = {"id": "Unknown",
                  "features": annotated_assembly.features,
                  "scientific_name": scientific_name,
                  "domain": domain,
                  "genetic_code": gcode,
                  "assembly_ref": assembly_ref,
                  "cdss": annotated_assembly.cdss,
                  "mrnas": annotated_assembly.mrnas,
                  "source": "PROKKA annotation pipeline",
                  "gc_content": assembly_info.gc_content,
                  "dna_size": assembly_info.dna_size,
                  "reference_annotation": 0}

        info = self.gfu.save_one_genome({"workspace": output_workspace,
                                         "name": output_genome_name,
                                         "data": genome,
                                         "provenance": self.ctx.provenance()})["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        report_message = "Genome saved to: " + output_workspace + "/" + \
                         output_genome_name + "\n" + annotated_assembly.report_message

        report_info = self.kbr.create_extended_report(
            {"message": report_message,
             "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}],
             "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
             "workspace_name": output_workspace
             })

        return {"output_genome_ref": genome_ref, "report_name": report_info["name"],
                "report_ref": report_info["ref"]}
Exemplo n.º 24
0
    def __init__(self, genome=None, callbackURL=''):
        self._contents = StringIO.StringIO()
        self._ga = genome
        print('downloading assembly')
        au = AssemblyUtil(callbackURL)
        assembly_info = self._ga.get_assembly().get_info()
        assembly_ref = str(assembly_info['workspace_id']) + '/' + str(
            assembly_info['object_id']) + '/' + str(assembly_info['version'])
        print('Assembly reference = ' + assembly_ref)
        assembly_file_path = au.get_assembly_as_fasta({'ref':
                                                       assembly_ref})['path']

        print('extracting taxonomy information')
        self._taxa = self._ga.get_taxon()
        self._tax_lineage = self._taxa.get_scientific_lineage()

        print('assembling feature and protein data')
        self._genome_name = str(self._ga.get_id())
        self._proteins = self._ga.get_proteins()
        self._features = self._ga.get_features()

        print('writing file')
        # read in fasta file to build the contig index
        contigs = {}
        for record in SeqIO.parse(assembly_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])
            contigs[record.id] = {
                'length': len(record.seq),
                'sequence': str(record.seq)
            }
        self._contigs = contigs
        contig_length_dict = dict()
        for contig_id in contigs:
            contig_length_dict[contig_id] = contigs[contig_id]["length"]
        del contigs
        contigs_tuples = sorted(contig_length_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)
        # print("Contig tuples : " + str(contigs_tuples))

        # organize features by location
        feature_ids_by_region = self._ga.get_feature_ids(
            group_by="region")["by_region"]
        # print('FEATURE IDS BY REGION :: ' + str(feature_ids_by_region))

        # flatten the last level of the results to get a contiguous list per contig/strand
        feature_ids_by_contig = {}
        for contig_tuple in contigs_tuples:
            cid = contig_tuple[0]
            feature_ids_by_contig[cid] = {}
            if cid in feature_ids_by_region:
                if "+" in feature_ids_by_region[cid]:
                    sorted_regions = sorted(
                        feature_ids_by_region[cid]["+"].keys(),
                        key=lambda x: int(x.split("-")[0]))

                    sorted_ids = []
                    for region in sorted_regions:
                        for fid in self._sort_feature_ids(
                                feature_ids_by_region[cid]["+"][region]):
                            sorted_ids.append(fid)

                    feature_ids_by_contig[cid]["+"] = sorted_ids
                else:
                    feature_ids_by_contig[cid]["+"] = []

                if "-" in feature_ids_by_region[cid]:
                    sorted_regions = sorted(
                        feature_ids_by_region[cid]["-"].keys(),
                        key=lambda x: int(x.split("-")[0]))

                    sorted_ids = []
                    for region in sorted_regions:
                        for fid in self._sort_feature_ids(
                                feature_ids_by_region[cid]["-"][region]):
                            sorted_ids.append(fid)

                    feature_ids_by_contig[cid]["-"] = sorted_ids
                else:
                    feature_ids_by_contig[cid]["-"] = []

        for cid in feature_ids_by_contig:
            # add a header for the contig
            self._add_contig_header(cid)

            # add positive strand features
            if "+" in feature_ids_by_contig[cid]:
                for fid in feature_ids_by_contig[cid]["+"]:
                    self._add_feature(fid)

            # add minus strand features
            if "-" in feature_ids_by_contig[cid]:
                for fid in feature_ids_by_contig[cid]["-"]:
                    self._add_feature(fid)

            self._add_contig_sequence(cid)
Exemplo n.º 25
0
    def KButil_Build_InSilico_Metagenomes_with_Grinder(self, ctx, params):
        """
        :param params: instance of type
           "KButil_Build_InSilico_Metagenomes_with_Grinder_Params"
           (KButil_Build_InSilico_Metagenomes_with_Grinder() ** **  Use
           Grinder to generate in silico shotgun metagenomes) -> structure:
           parameter "workspace_name" of type "workspace_name" (** The
           workspace object refs are of form: ** **    objects =
           ws.get_objects([{'ref':
           params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means
           the entire name combining the workspace id and the object name **
           "id" is a numerical identifier of the workspace or object, and
           should just be used for workspace ** "name" is a string identifier
           of a workspace or object.  This is received from Narrative.),
           parameter "input_refs" of type "data_obj_ref", parameter
           "output_name" of type "data_obj_name", parameter "desc" of String,
           parameter "num_reads_per_lib" of Long, parameter
           "population_percs" of String, parameter "read_len_mean" of Long,
           parameter "read_len_stddev" of Double, parameter "pairs_flag" of
           Long, parameter "mate_orientation" of String, parameter
           "insert_len_mean" of Long, parameter "insert_len_stddev" of
           Double, parameter "mutation_dist" of String, parameter
           "mutation_ratio" of String, parameter "qual_good" of Long,
           parameter "qual_bad" of Long, parameter "len_bias_flag" of Long,
           parameter "random_seed" of Long
        :returns: instance of type
           "KButil_Build_InSilico_Metagenomes_with_Grinder_Output" ->
           structure: parameter "report_name" of type "data_obj_name",
           parameter "report_ref" of type "data_obj_ref"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN KButil_Build_InSilico_Metagenomes_with_Grinder

        #### STEP 0: basic init
        ##
        console = []
        invalid_msgs = []
        report_text = ''
        self.log(console,
                 'Running KButil_Build_InSilico_Metagenomes_with_Grinder(): ')
        self.log(console, "\n" + pformat(params))

        # Auth
        token = ctx['token']
        headers = {'Authorization': 'OAuth ' + token}
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        # API Clients
        #SERVICE_VER = 'dev'  # DEBUG
        SERVICE_VER = 'release'
        wsClient = workspaceService(self.workspaceURL, token=token)
        readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                       token=ctx['token'])  # SDK local
        #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token'])  # for SDK local.  local doesn't work for SetAPI
        setAPI_Client = SetAPI(url=self.serviceWizardURL,
                               token=ctx['token'])  # for dynamic service
        auClient = AssemblyUtil(self.callbackURL,
                                token=ctx['token'],
                                service_ver=SERVICE_VER)
        dfu = DFUClient(self.callbackURL)

        # param checks
        required_params = [
            'workspace_name', 'input_refs', 'output_name', 'num_reads_per_lib',
            'population_percs', 'read_len_mean', 'read_len_stddev',
            'pairs_flag', 'mate_orientation', 'insert_len_mean',
            'insert_len_stddev', 'mutation_dist', 'mutation_ratio',
            'qual_good', 'qual_bad', 'len_bias_flag', 'random_seed'
        ]
        for arg in required_params:
            if arg not in params or params[arg] is None or params[arg] == '':
                raise ValueError("Must define required param: '" + arg + "'")

        # cast to str unpredictable numerical params (mostly used in string context)
        numerical_params = [
            'num_reads_per_lib', 'read_len_mean', 'read_len_stddev',
            'pairs_flag', 'insert_len_mean', 'insert_len_stddev', 'qual_good',
            'qual_bad', 'len_bias_flag', 'random_seed'
        ]
        for arg in numerical_params:
            if arg not in params or params[arg] is None or params[arg] == '':
                continue
            params[arg] = str(params[arg])

        # load provenance
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        provenance[0]['input_ws_objects'] = []
        for input_ref in params['input_refs']:
            provenance[0]['input_ws_objects'].append(input_ref)

        # set the output paths
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        html_output_dir = os.path.join(output_dir, 'html')
        if not os.path.exists(html_output_dir):
            os.makedirs(html_output_dir)

        #### STEP 1: Parse population_percs and write to file
        ##
        abundance_str = params['population_percs'].strip()
        abundance_file_path = os.path.join(output_dir, 'my_abundances.txt')
        abundance_config_num_libs = 0
        abundance_config_num_libs_set = False
        grinder_genome_ids = []
        header = []
        out_buf = []

        for row in abundance_str.split("\n"):
            cols = re.split(r'\s+', row)
            if cols[0].upper() == "GENOME":
                for col in cols:
                    if col == '':
                        continue
                    header.append(col)
                continue
            grinder_genome_ids.append(cols[0])
            self.log(console, "GRINDER GENOME ID: '" + cols[0] + "'")  # DEBUG
            out_row = []
            for col in cols:
                if col == '':
                    continue
                elif col == '%':
                    continue
                elif col.endswith('%'):
                    col = col.rstrip('%')
                out_row.append(col)
            out_buf.append("\t".join(out_row))
            num_samples = len(out_row) - 1  # first col is genome id
            if not abundance_config_num_libs_set:
                abundance_config_num_libs_set = True
                abundance_config_num_libs = num_samples
            elif num_samples != abundance_config_num_libs:
                invalid_msgs.append(
                    "inconsistent number of samples in population_percs input field"
                )
        # data validation
        if abundance_config_num_libs == 0:
            invalid_msgs.append(
                "unable to find sample percentages in population_percs input field"
            )
        sample_sums = []
        for row_i, abund_row_str in enumerate(out_buf):
            abund_row = abund_row_str.split()
            for sample_i, abund in enumerate(abund_row[1:]):
                if row_i == 0:
                    sample_sums.append(0)
                #self.log (console, "row_i: "+str(row_i)+" sample_i: "+str(sample_i))  # DEBUG
                sample_sums[sample_i] += float(abund)
        for sample_i, sample_sum in enumerate(sample_sums):
            if sample_sum < 99.5 or sample_sum > 100.5:
                self.log(invalid_msgs, "Sample: " + str(sample_i + 1) + " " +
                         header[sample_i + 1] +
                         " proportions do not sum to 100.0. Sum: " +
                         str(sample_sum))

        if len(invalid_msgs) == 0:
            with open(abundance_file_path, 'w') as abundance_fh:
                for out_line in out_buf:
                    abundance_fh.write(out_line + "\n")
            # DEBUG
            with open(abundance_file_path, 'r') as abundance_fh:
                for out_line in abundance_fh.readlines():
                    out_line = out_line.rstrip()
                    self.log(console, "ABUNDANCE_CONFIG: '" + out_line + "'")

        #### STEP 2: get genome scaffold sequences
        ##
        if len(invalid_msgs) == 0:
            genomes_src_db_file_path = os.path.join(output_dir, 'genomes.fna')
            read_buf_size = 65536
            write_buf_size = 65536
            accepted_input_types = ["KBaseGenomes.Genome"]
            genome_refs = params['input_refs']
            genome_obj_names = []
            genome_sci_names = []
            assembly_refs = []

            for i, input_ref in enumerate(genome_refs):
                # genome obj info
                try:
                    [
                        OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I,
                        SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I,
                        META_I
                    ] = range(11)  # object_info tuple
                    input_obj_info = wsClient.get_object_info_new(
                        {'objects': [{
                            'ref': input_ref
                        }]})[0]
                    input_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                            input_obj_info[TYPE_I])  # remove trailing version
                    genome_obj_names.append(input_obj_info[NAME_I])

                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' +
                                     input_ref + ')' + str(e))
                if input_obj_type not in accepted_input_types:
                    raise ValueError("Input object of type '" +
                                     input_obj_type +
                                     "' not accepted.  Must be one of " +
                                     ", ".join(accepted_input_types))

                # genome obj data
                try:
                    genome_obj = wsClient.get_objects([{
                        'ref': input_ref
                    }])[0]['data']
                    genome_sci_names.append(genome_obj['scientific_name'])
                except Exception:
                    raise ValueError("unable to fetch genome: " + input_ref)

                # Get assembly_refs
                if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] is None) \
                   and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] is None):
                    msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " \
                          + genome_sci_names[i] \
                          + " MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                    self.log(console, msg)
                    self.log(invalid_msgs, msg)
                    continue
                elif 'assembly_ref' in genome_obj and genome_obj['assembly_ref'] is not None:
                    msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " \
                          + genome_sci_names[i] \
                          + " USING assembly_ref: " + str(genome_obj['assembly_ref'])
                    self.log(console, msg)
                    assembly_refs.append(genome_obj['assembly_ref'])
                elif 'contigset_ref' in genome_obj and genome_obj['contigset_ref'] is not None:
                    msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " \
                          + genome_sci_names[i] \
                          + " USING contigset_ref: " + str(genome_obj['contigset_ref'])
                    self.log(console, msg)
                    assembly_refs.append(genome_obj['contigset_ref'])

        # get fastas for scaffolds
        if len(invalid_msgs) == 0:
            contig_file_paths = []

            for genome_i, input_ref in enumerate(genome_refs):
                contig_file = auClient.get_assembly_as_fasta({
                    'ref': assembly_refs[genome_i]
                }).get('path')
                sys.stdout.flush()
                contig_file_path = dfu.unpack_file({'file_path': contig_file})['file_path']
                contig_file_paths.append(contig_file_path)

            # reformat FASTA IDs for Grinder
            with open(genomes_src_db_file_path, 'w',
                      write_buf_size) as genomes_src_db_fh:
                for genome_i, contig_file_path in enumerate(contig_file_paths):
                    #self.log(console,str(genome_i)+" CONTIG_FILE: "+contig_file_path)  # DEBUG
                    #contig_ids = []
                    with open(contig_file_path, 'r',
                              read_buf_size) as contig_fh:
                        genome_seq = ''
                        contig_seq = ''
                        contig_seqs = []
                        for contig_line in contig_fh.readlines():
                            contig_line = contig_line.rstrip()
                            if contig_line.startswith('>'):
                                #contig_id = contig_line.strip()[1:].split(' ')[0]
                                #contig_ids.append(contig_id)
                                #genomes_src_db_fh.write(">"+grinder_genome_ids[genome_i]+"\n")
                                if contig_seq != '':
                                    contig_seqs.append(contig_seq)
                                    contig_seq = ''
                            else:
                                #genomes_src_db_fh.write(contig_line)
                                contig_seq += contig_line
                        if contig_seq != '':
                            contig_seqs.append(contig_seq)
                            contig_seq = ''

                    # write joined contigs to file
                    genome_seq = "NNNNNNNNNN".join(
                        contig_seqs
                    )  # NOTE: Using "-exclude_chars" grinder opt on N to avoid contig joins
                    genome_seq = genome_seq.upper(
                    )  # grinder might require upper case?
                    genomes_src_db_fh.write(">" +
                                            grinder_genome_ids[genome_i] +
                                            "\n")
                    genomes_src_db_fh.write(genome_seq + "\n")
                    genome_seq = ''
                    contig_seqs = []

                    # DEBUG
                    #for contig_id in contig_ids:
                    #    self.log(console, "\tCONTIG_ID: "+contig_id)  # DEBUG
            # DEBUG
            toggle = 0
            with open(genomes_src_db_file_path, 'r',
                      write_buf_size) as genomes_src_db_fh:
                for contig_line in genomes_src_db_fh.readlines():
                    contig_line = contig_line.rstrip()
                    if contig_line.startswith('>'):
                        self.log(console, 'GENOMES_SRC_DB: ' + contig_line)
                        genome_id = contig_line[1:]
                        toggle = 0
                    elif toggle == 0:
                        #elif genome_id == 'G3':
                        self.log(
                            console,
                            'GENOMES_SRC_DB: ' + contig_line[0:50] + '...')
                        toggle += 1
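
            # The join above turns each genome's contig list into a single
            # Grinder reference record, e.g. (illustrative):
            #
            #   ["ACGT...", "TTGA..."]  ->  ">G1\nACGT...NNNNNNNNNNTTGA...\n"
            #
            # The N spacer is then excluded from read sampling via the
            # "-exclude_chars NX" option passed to Grinder in STEP 3.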

        #### STEP 3: Run Grinder
        ##
        if len(invalid_msgs) == 0:
            cmd = []
            cmd.append(self.GRINDER)
            # output
            cmd.append('-base_name')
            cmd.append(params['output_name'])
            cmd.append('-output_dir')
            cmd.append(output_dir)
            # contigs input
            cmd.append('-reference_file')
            cmd.append(genomes_src_db_file_path)
            # abundances
            cmd.append('-abundance_file')
            cmd.append(abundance_file_path)
            # library size
            cmd.append('-total_reads')
            cmd.append(str(params['num_reads_per_lib']))
            # num libraries (overridden by abundance file?)
            cmd.append('-num_libraries')
            cmd.append(str(abundance_config_num_libs))
            # read and insert lens
            cmd.append('-read_dist')
            cmd.append(str(params['read_len_mean']))
            cmd.append('normal')
            cmd.append(str(params['read_len_stddev']))
            if str(params['pairs_flag']) == '1':
                cmd.append('-insert_dist')
                cmd.append(str(params['insert_len_mean']))
                cmd.append('normal')
                cmd.append(str(params['insert_len_stddev']))
                # mate orientation
                cmd.append('-mate_orientation')
                cmd.append(params['mate_orientation'])
            # genome len bias
            cmd.append('-length_bias')
            cmd.append(str(params['len_bias_flag']))
            # mutation model
            cmd.append('-mutation_dist')
            cmd.append(str(params['mutation_dist']))
            cmd.append('-mutation_ratio')
            cmd.append(str(params['mutation_ratio']))
            # qual scores
            cmd.append('-fastq_output')
            cmd.append('1')
            cmd.append('-qual_levels')
            cmd.append(str(params['qual_good']))
            cmd.append(str(params['qual_bad']))
            # skip contig joins
            cmd.append('-exclude_chars')
            cmd.append('NX')
            # explicitly request bidirectional
            cmd.append('-unidirectional')
            cmd.append('0')
            # random seed
            if 'random_seed' in params and params['random_seed'] is not None \
                    and params['random_seed'] != '':
                cmd.append('-random_seed')
                cmd.append(str(params['random_seed']))
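
            # Representative final command line (illustrative values only):
            #
            #   grinder -base_name mymetagenome -output_dir <output_dir>
            #           -reference_file genomes.fna -abundance_file my_abundances.txt
            #           -total_reads 100000 -num_libraries 3
            #           -read_dist 150 normal 10 -length_bias 0
            #           -mutation_dist uniform 0.1 -mutation_ratio 80 20
            #           -fastq_output 1 -qual_levels 30 10
            #           -exclude_chars NX -unidirectional 0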

            # RUN
            cmd_str = " ".join(cmd)
            self.log(console, "===========================================")
            self.log(console, "RUNNING: " + cmd_str)
            self.log(console, "===========================================")

            cmdProcess = subprocess.Popen(cmd_str,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.STDOUT,
                                          shell=True)
            outputlines = []
            while True:
                line = cmdProcess.stdout.readline()
                if not line:
                    break
                outputlines.append(line)
                self.log(console, line.replace('\n', ''))

            cmdProcess.stdout.close()
            cmdProcess.wait()
            self.log(console,
                     'return code: ' + str(cmdProcess.returncode) + '\n')
            if cmdProcess.returncode != 0:
                raise ValueError('Error running kb_grinder, return code: ' +
                                 str(cmdProcess.returncode) + '\n')

            #report_text += "\n".join(outputlines)
            #report_text += "cmdstring: " + cmdstring + " stdout: " + stdout + " stderr " + stderr

            # capture output for report and paths to out files
            report_text_buf = []
            struct_file_paths = []
            struct_file_names = []
            fastq_file_paths = []
            for out_line in outputlines:
                out_line = out_line.rstrip()
                if 'Community structure' in out_line:
                    clean_line = out_line.lstrip()
                    struct_file_path = re.split(r'\s+', clean_line)[3]
                    struct_file_paths.append(struct_file_path)
                    struct_file_names.append(struct_file_path.split('/')[-1])
                    self.log(console, "STRUCT_FILE_NAME: '" +
                             struct_file_path.split('/')[-1])  # DEBUG
                elif 'FASTQ file' in out_line:
                    clean_line = out_line.lstrip()
                    fastq_file_paths.append(re.split(r'\s+', clean_line)[3])
                else:
                    report_text_buf.append(out_line)
            report_text += "\n".join(report_text_buf)

        #### STEP 4: Upload Read Libs and create reads set
        ##
        if len(invalid_msgs) == 0:
            lib_obj_refs = []
            lib_obj_names = []
            readsSet_items = []

            for sample_i, fastq_file_path in enumerate(fastq_file_paths):

                if not os.path.isfile(fastq_file_path) \
                   or os.path.getsize(fastq_file_path) == 0:
                    raise ValueError("empty read lib generated: " + fastq_file_path)
                else:

                    # lib obj name (no sample suffix when only one library)
                    if len(fastq_file_paths) == 1:
                        output_obj_name = params['output_name']
                    else:
                        if str(params['pairs_flag']) == '1':
                            output_obj_name = params['output_name'] + \
                                '-sample' + str(sample_i + 1) + ".PairedEndLib"
                        else:
                            output_obj_name = params['output_name'] + \
                                '-sample' + str(sample_i + 1) + ".SingleEndLib"
                    lib_obj_names.append(output_obj_name)

                    # upload lib and get obj ref
                    self.log(console, 'Uploading reads library: ' + output_obj_name)
                    sequencing_tech = 'artificial reads'
                    if str(params['pairs_flag']) == '1':
                        interleaved = 1
                    else:
                        interleaved = 0
                    lib_obj_ref = readsUtils_Client.upload_reads({
                        'wsname': str(params['workspace_name']),
                        'name': output_obj_name,
                        'fwd_file': fastq_file_path,
                        'interleaved': interleaved,
                        'sequencing_tech': sequencing_tech
                    })['obj_ref']
                    lib_obj_refs.append(lib_obj_ref)
                    os.remove(fastq_file_path)  # free up disk

                    # add to readsSet
                    readsSet_items.append({
                        'ref': lib_obj_ref,
                        'label': output_obj_name
                    })
            # create readsset
            readsSet_obj_ref = None
            if len(lib_obj_refs) > 1:
                readsSet_obj = {
                    'description': "Grinder Metagenome from " + " ".join(genome_obj_names),
                    'items': readsSet_items
                }
                readsSet_obj_name = params['output_name']
                readsSet_obj_ref = setAPI_Client.save_reads_set_v1({
                    'workspace_name': params['workspace_name'],
                    'output_object_name': readsSet_obj_name,
                    'data': readsSet_obj
                })['set_ref']

        #### STEP 5: Build report
        ##
        reportName = 'kb_grinder_report_' + str(uuid.uuid4())
        reportObj = {
            'objects_created': [],
            'message': '',  # filled in below
            'direct_html': '',
            'file_links': [],
            'html_links': [],
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }

        # message
        if len(invalid_msgs) > 0:
            report_text = "\n".join(invalid_msgs)
        reportObj['message'] = report_text

        if len(invalid_msgs) == 0:
            # objects created
            if readsSet_obj_ref is not None:
                reportObj['objects_created'].append({
                    'ref': readsSet_obj_ref,
                    'desc': params['output_name'] + " ReadsSet"
                })
            for lib_obj_i, lib_obj_ref in enumerate(lib_obj_refs):
                reportObj['objects_created'].append({
                    'ref': lib_obj_ref,
                    'desc': lib_obj_names[lib_obj_i]
                })
            # downloadable data
            for data_i, data_path in enumerate(struct_file_paths):
                try:
                    upload_ret = dfu.file_to_shock({
                        'file_path': data_path,
                        'make_handle': 0
                    })
                except Exception as e:
                    raise ValueError('error uploading ' + data_path +
                                     ' file to shock: ' + str(e))
                reportObj['file_links'].append({
                    'shock_id': upload_ret['shock_id'],
                    'name': struct_file_names[data_i],
                    'label': struct_file_names[data_i]
                })

            # html report
            """
            try:
                html_upload_ret = dfu.file_to_shock({'file_path': html_output_dir,
                                                     'make_handle': 0,
                                                     'pack': 'zip'})
            except:
                raise ValueError ('error uploading html report to shock')
            reportObj['direct_html_link_index'] = 0
            reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                        'name': html_file,
                                        'label': params['output_name']+' HTML'
                                    }
                                   ]
            """

        # save report object
        #
        SERVICE_VER = 'release'
        reportClient = KBaseReport(self.callbackURL,
                                   token=ctx['token'],
                                   service_ver=SERVICE_VER)
        report_info = reportClient.create_extended_report(reportObj)

        returnVal = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        #END KButil_Build_InSilico_Metagenomes_with_Grinder

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                'Method KButil_Build_InSilico_Metagenomes_with_Grinder return value '
                + 'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
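
A minimal sketch of the per-sample naming rule used in STEP 4 above, factored into a standalone helper (the helper name is ours, purely illustrative):

def make_lib_obj_name(output_name, n_samples, sample_i, paired):
    # a single sample keeps the base object name unchanged;
    # multiple samples get a numbered suffix plus the lib type
    if n_samples == 1:
        return output_name
    suffix = ".PairedEndLib" if paired else ".SingleEndLib"
    return output_name + '-sample' + str(sample_i + 1) + suffix

# e.g. make_lib_obj_name('my_reads', 3, 0, True) -> 'my_reads-sample1.PairedEndLib'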
Exemplo n.º 26
0
    def stage_input(self, input_ref, fasta_file_extension):
        '''
        Stage input based on an input data reference for CheckM

        input_ref can be a reference to an Assembly, BinnedContigs, or (not yet implemented) a Genome

        This method creates a directory in the scratch area with the set of Fasta files, names
        will have the fasta_file_extension parameter tacked on.

            ex:

            staged_input = stage_input('124/15/1', 'fna')

            staged_input
            {"input_dir": '...'}
        '''
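        # Overview of the steps below:
        #   - create scratch dirs to hold the staged fasta files
        #   - look up the object type with get_object_info3
        #   - instantiate the clients each type needs (AssemblyUtil,
        #     SetAPI, MetagenomeUtils)
        #   - download per type: Assembly/ContigSet, AssemblySet,
        #     BinnedContigs, Genome/GenomeSet
        #   - concatenate everything into one summary fasta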
        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'

        # generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'bins_' + suffix)
        all_seq_fasta = os.path.join(
            self.scratch,
            'all_sequences_' + suffix + '.' + fasta_file_extension)
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)

        # based on the object type, download the files
        ws = Workspace(self.ws_url)
        input_info = ws.get_object_info3(
            {'objects': [{'ref': input_ref}]})['infos'][0]
        # 0 obj_id objid - the numerical id of the object.
        # 1 obj_name name - the name of the object.
        # 2 type_string type - the type of the object.
        # 3 timestamp save_date - the save date of the object.
        # 4 obj_ver ver - the version of the object.
        # 5 username saved_by - the user that saved or copied the object.
        # 6 ws_id wsid - the workspace containing the object.
        # 7 ws_name workspace - the workspace containing the object.
        # 8 string chsum - the md5 checksum of the object.
        # 9 int size - the size of the object in bytes.
        # 10 usermeta meta - arbitrary user-supplied metadata about
        #     the object.
        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)  # object_info tuple
        obj_name = input_info[NAME_I]
        type_name = input_info[TYPE_I].split('-')[0]

        # auClient
        try:
            auClient = AssemblyUtil(self.callbackURL,
                                    token=self.ctx['token'],
                                    service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError(
                'Unable to instantiate auClient with callbackURL: ' +
                self.callbackURL + ' ERROR: ' + str(e))

        # setAPI_Client
        try:
            #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
            setAPI_Client = SetAPI(
                url=self.serviceWizardURL,
                token=self.ctx['token'])  # for dynamic service
        except Exception as e:
            raise ValueError(
                'Unable to instantiate setAPI_Client with serviceWizardURL: ' +
                self.serviceWizardURL + ' ERROR: ' + str(e))

        # mguClient
        try:
            mguClient = MetagenomeUtils(self.callbackURL,
                                        token=self.ctx['token'],
                                        service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError(
                'Unable to instantiate mguClient with callbackURL: ' +
                self.callbackURL + ' ERROR: ' + str(e))

        # Standard Single Assembly
        #
        if type_name in [
                'KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet'
        ]:
            # create file data
            filename = os.path.join(input_dir,
                                    obj_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({
                'ref': input_ref,
                'filename': filename
            })
            if not os.path.isfile(filename):
                raise ValueError(
                    'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil'
                )
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError(
                    'Assembly or ContigSet is empty in filename: ' +
                    str(filename))

        # AssemblySet
        #
        elif type_name == 'KBaseSets.AssemblySet':

            # read assemblySet
            try:
                assemblySet_obj = setAPI_Client.get_assembly_set_v1({
                    'ref': input_ref,
                    'include_item_info': 1
                })
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' +
                                 input_ref + '): ' + str(e))
            assembly_refs = []
            assembly_names = []
            for assembly_item in assemblySet_obj['data']['items']:
                this_assembly_ref = assembly_item['ref']
                # assembly obj info
                try:
                    this_assembly_info = ws.get_object_info_new(
                        {'objects': [{'ref': this_assembly_ref}]})[0]
                    this_assembly_name = this_assembly_info[NAME_I]
                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' +
                                     this_assembly_ref + '): ' + str(e))
                assembly_refs.append(this_assembly_ref)
                assembly_names.append(this_assembly_name)

            # create file data (name for file is what's reported in results)
            for ass_i, assembly_ref in enumerate(assembly_refs):
                this_name = assembly_names[ass_i]
                filename = os.path.join(input_dir,
                                        this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({
                    'ref': assembly_ref,
                    'filename': filename
                })
                if not os.path.isfile(filename):
                    raise ValueError(
                        'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil'
                    )
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError(
                        'Assembly or ContigSet is empty in filename: ' +
                        str(filename))

        # Binned Contigs
        #
        elif type_name == 'KBaseMetagenomes.BinnedContigs':

            # download the bins as fasta and set the input folder name
            bin_file_dir = mguClient.binned_contigs_to_file({
                'input_ref': input_ref,
                'save_to_shock': 0
            })['bin_file_directory']
            os.rename(bin_file_dir, input_dir)
            # normalize the file extensions, then make sure no bin is empty
            self.set_fasta_file_extensions(input_dir, fasta_file_extension)
            for (dirpath, dirnames, filenames) in os.walk(input_dir):
                for fasta_file in filenames:
                    fasta_path = os.path.join(input_dir, fasta_file)
                    min_fasta_len = 1
                    if not self.fasta_seq_len_at_least(fasta_path,
                                                       min_fasta_len):
                        raise ValueError(
                            'Binned Assembly is empty for fasta_path: ' +
                            str(fasta_path))
                break  # only check the top level of the directory

        # Genome and GenomeSet
        #
        elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
            genome_obj_names = []
            genome_sci_names = []
            genome_assembly_refs = []

            if type_name == 'KBaseGenomes.Genome':
                genomeSet_refs = [input_ref]
            else:  # get genomeSet_refs from GenomeSet object
                genomeSet_refs = []
                try:
                    genomeSet_object = ws.get_objects2(
                        {'objects': [{
                            'ref': input_ref
                        }]})['data'][0]['data']
                except Exception as e:
                    raise ValueError('Unable to fetch ' + str(input_ref) +
                                     ' object from workspace: ' + str(e))
                    #to get the full stack trace: traceback.format_exc()

                # iterate through genomeSet members
                for genome_id in genomeSet_object['elements'].keys():
                    genome_ref = genomeSet_object['elements'][genome_id].get('ref')
                    if genome_ref is None or genome_ref == '':
                        raise ValueError(
                            'genome_ref not found for genome_id: ' +
                            str(genome_id) + ' in genomeSet: ' +
                            str(input_ref))
                    genomeSet_refs.append(genome_ref)

            # genome obj data
            for i, this_input_ref in enumerate(genomeSet_refs):
                try:
                    objects = ws.get_objects2(
                        {'objects': [{'ref': this_input_ref}]})['data']
                    genome_obj = objects[0]['data']
                    genome_obj_info = objects[0]['info']
                    genome_obj_names.append(genome_obj_info[NAME_I])
                    genome_sci_names.append(genome_obj['scientific_name'])
                except Exception as e:
                    raise ValueError("unable to fetch genome: " +
                                     this_input_ref + ": " + str(e))

                # Get genome_assembly_ref
                if genome_obj.get('contigset_ref') is None and \
                   genome_obj.get('assembly_ref') is None:
                    raise ValueError(
                        "Genome " + genome_obj_names[i] + " (ref:" +
                        input_ref + ") " + genome_sci_names[i] +
                        " MISSING BOTH contigset_ref AND assembly_ref." +
                        "  Cannot process.  Exiting.")
                elif genome_obj.get('assembly_ref') is not None:
                    print("Genome " + genome_obj_names[i] + " (ref:" +
                          input_ref + ") " + genome_sci_names[i] +
                          " USING assembly_ref: " +
                          str(genome_obj['assembly_ref']))
                    genome_assembly_refs.append(genome_obj['assembly_ref'])
                else:
                    print("Genome " + genome_obj_names[i] + " (ref:" +
                          input_ref + ") " + genome_sci_names[i] +
                          " USING contigset_ref: " +
                          str(genome_obj['contigset_ref']))
                    genome_assembly_refs.append(genome_obj['contigset_ref'])

            # create file data (name for file is what's reported in results)
            for ass_i, assembly_ref in enumerate(genome_assembly_refs):
                this_name = genome_obj_names[ass_i]
                filename = os.path.join(input_dir,
                                        this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({
                    'ref': assembly_ref,
                    'filename': filename
                })
                if not os.path.isfile(filename):
                    raise ValueError(
                        'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil'
                    )
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError(
                        'Assembly or ContigSet is empty in filename: ' +
                        str(filename))

        # Unknown type slipped through
        #
        else:
            raise ValueError(
                'Cannot stage fasta file input directory from type: ' +
                type_name)

        # create summary fasta file with all bins
        self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

        return {
            'input_dir': input_dir,
            'folder_suffix': suffix,
            'all_seq_fasta': all_seq_fasta
        }
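
A minimal usage sketch for stage_input (checkm_util stands in for an instance of the class above; the '124/15/1' ref comes from the docstring and is illustrative):

import os

staged = checkm_util.stage_input('124/15/1', 'fna')
print('staged into: ' + staged['input_dir'])
# each staged fasta file gets the requested extension
for name in sorted(os.listdir(staged['input_dir'])):
    print('  ' + name)
print('combined fasta: ' + staged['all_seq_fasta'])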
Exemplo n.º 27
0
    def get_promoter_for_gene(self, ctx, params):
        """
        :param params: instance of type "get_promoter_for_gene_input" (Genome
           is a KBase genome Featureset is a KBase featureset Promoter_length
           is the length of promoter requested for all genes) -> structure:
           parameter "workspace_name" of String, parameter "genome_ref" of
           String, parameter "featureSet_ref" of String, parameter
           "promoter_length" of Long
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN get_promoter_for_gene
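        # Overview of the steps below: pull feature locations and the
        # assembly_ref for the genome, fetch the FeatureSet, download the
        # assembly fasta, extract a strand-aware promoter window for each
        # feature, write the windows to a fasta file, and wrap the result
        # in a small HTML report.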
        dfu = DataFileUtil(self.callback_url)
        objectRefs = {'object_refs': [params['featureSet_ref']]}
        # note: the Workspace URL is hard-coded to appdev; normally it would
        # come from the deploy config
        ws = Workspace('https://appdev.kbase.us/services/ws')
        ws_name = params['workspace_name']
        subset = ws.get_object_subset([{
            'included': ['/features/[*]/location', '/features/[*]/id',
                         '/assembly_ref'],
            'ref': params['genome_ref']
        }])
        features = subset[0]['data']['features']
        aref = subset[0]['data']['assembly_ref']
        objects = dfu.get_objects(objectRefs)
        featureSet = objects['data'][0]['data']
        assembly_ref = {'ref': aref}
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)

        # loop over the featureSet, find the matching feature in the genome,
        # and pull its record, start, orientation, and length
        # TODO: add some error checking logic to the bounds of the promoter
        prom = ""
        featureFound = False
        for feature in featureSet['elements']:
            featureFound = False
            for f in features:
                if f['id'] == feature:
                    attributes = f['location'][0]
                    featureFound = True
                    break
            if featureFound:
                for record in SeqIO.parse(fasta_file['path'], 'fasta'):
                    if record.id == attributes[0]:
                        if attributes[2] == '+':
                            # window is promoter_length bases upstream of the
                            # feature start (might need to offset by 1?)
                            end = attributes[1]
                            start = end - params['promoter_length']
                            if start < 0:
                                start = 0
                            promoter = record.seq[start:end].upper()
                            # HERE: resolve ambiguous characters
                            prom += ">" + feature + "\n"
                            prom += str(promoter) + "\n"

                        elif attributes[2] == '-':
                            start = attributes[1]
                            end = start + params['promoter_length']
                            if end > len(record.seq):
                                end = len(record.seq)
                            promoter = record.seq[start:end].upper()
                            # manual reverse complement (N maps to N)
                            complement = {
                                'A': 'T',
                                'C': 'G',
                                'G': 'C',
                                'T': 'A',
                                'N': 'N'
                            }
                            promoter = ''.join(
                                [complement[base] for base in promoter[::-1]])
                            # HERE: resolve ambiguous characters
                            prom += ">" + feature + "\n"
                            prom += promoter + "\n"

                        else:
                            print('Error on orientation')
            else:
                print('Could not find feature ' + feature + ' in genome')
        promOutputPath = '/kb/module/work/tmp/promFile.fa'
        with open(promOutputPath, 'w') as promFile:
            promFile.write(str(prom))

        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        html_output_dir = os.path.join(self.shared_folder,
                                       'output_html.' + str(timestamp))
        if not os.path.exists(html_output_dir):
            os.makedirs(html_output_dir)
        html_file = 'promoter.html'
        output_html_file_path = os.path.join(html_output_dir, html_file)

        html_report_lines = '<html><body>'
        html_report_lines += '<pre>' + prom + '</pre>'
        html_report_lines += '</body></html>'

        with open(output_html_file_path, 'w') as html_handle:
            html_handle.write(str(html_report_lines))

        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': html_output_dir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception as e:
            raise ValueError('error uploading HTML file to shock: ' + str(e))

        reportName = 'identify_promoter_report_' + str(uuid.uuid4())

        reportObj = {
            'objects_created': [],
            'message': '',
            'direct_html': '',
            'direct_html_link_index': 0,
            'file_links': [],
            'html_links': [{
                'shock_id': html_upload_ret['shock_id'],
                'name': html_file,
                'label': 'View'
            }],
            'html_window_height': 220,
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }

        report = KBaseReport(self.callback_url, token=ctx['token'])
        report_info = report.create_extended_report(reportObj)

        # the declared return type is a String, so return the path to the
        # promoter fasta rather than the report name/ref
        # TODO: get rid of this html maybe and move into find_motifs
        output = promOutputPath

        #END get_promoter_for_gene

        # At some point might do deeper type checking...
        if not isinstance(output, basestring):
            raise ValueError('Method get_promoter_for_gene return value ' +
                             'output is not type basestring as required.')
        # return the results
        return [output]
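
The strand-aware slicing above is easy to get wrong (note the window must be clamped to the contig bounds). A standalone sketch of the same logic, leaning on Biopython's Seq.reverse_complement (which also handles ambiguity codes such as N); the function name is ours, purely illustrative:

from Bio.Seq import Seq

def promoter_window(seq, start, strand, length):
    # seq: full contig sequence (Bio.Seq.Seq); start: 0-based feature start;
    # strand: '+' or '-'; length: promoter window size
    if strand == '+':
        lo = max(0, start - length)
        return seq[lo:start].upper()
    if strand == '-':
        hi = min(len(seq), start + length)
        return seq[start:hi].upper().reverse_complement()
    raise ValueError("strand must be '+' or '-'")

# e.g. promoter_window(Seq('ACGTACGTAA'), 6, '+', 4) -> Seq('GTAC')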
Exemplo n.º 28
0
    def filter_contigs(self, ctx, params):
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN filter_contigs

        # Print statements to stdout/stderr are captured and available as the App log
        print('Starting Filter Contigs function. Params=')
        pprint(params)

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users.  Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        print('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError('Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'assembly_input_ref' not in params:
            raise ValueError('Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError('Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        min_length = None
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError('Cannot parse integer from min_length parameter (' + str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' + str(min_length) + ')')


        # Step 2 - Download the input data as a Fasta file
        # We can use the AssemblyUtil module to download a FASTA file from our Assembly data object.
        # The return object gives us the path to the file that was created.
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta({'ref': assembly_input_ref})


        # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
        # We can use BioPython to parse the Fasta file and build and save the output to a file.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        print('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')


        # Step 4 - Save the new Assembly back to the system
        print('Uploading filtered Assembly data.')
        new_assembly = assemblyUtil.save_assembly_from_fasta({'file': {'path': filtered_fasta_file},
                                                              'workspace_name': workspace_name,
                                                              'assembly_name': fasta_file['assembly_name']
                                                              })


        # Step 5 - Build a Report and return
        reportObj = {
            'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
            'text_message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})


        # Step 6 - Construct the output to send back
        output = {'report_name': report_info['name'],
                  'report_ref': report_info['ref'],
                  'assembly_output': new_assembly,
                  'n_initial_contigs': n_total,
                  'n_contigs_removed': n_total - n_remaining,
                  'n_contigs_remaining': n_remaining
                  }
        print('returning:' + pformat(output))
                
        #END filter_contigs
        

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method filter_contigs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
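
The parameter checks in Step 1 follow a pattern that recurs throughout these apps; a minimal standalone version of that pattern (the helper name is ours, purely illustrative):

def require_int_param(params, key, minimum=0):
    # presence check with a user-facing error message
    if key not in params:
        raise ValueError('Parameter ' + key + ' is not set in input arguments')
    try:
        value = int(params[key])
    except (TypeError, ValueError):
        raise ValueError('Cannot parse integer from ' + key + ' parameter (' +
                         str(params[key]) + ')')
    if value < minimum:
        raise ValueError(key + ' parameter cannot be less than ' + str(minimum))
    return value

# e.g. min_length = require_int_param(params, 'min_length')
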
class JNixonHelloContigsFilter:
    '''
    Module Name:
    JNixonHelloContigsFilter

    Module Description:
    A KBase module: JNixonHelloContigsFilter
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/nixonpjoshua/JNixonHelloContigsFilter.git"
    GIT_COMMIT_HASH = "16a66ab4d699d973e210ff92163fbe763009e6d3"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        # AssemblyUtil client, used to fetch and save assemblies
        self.assembly_util = AssemblyUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def filter_contigs(self, ctx, workspace_name, contigset, minimum):
        """
        :param workspace_name: instance of String
        :param contigset: instance of String
        :param minimum: instance of Long
        :returns: instance of type "FilterContigResults" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "assembly_ref" of String, parameter
           "contig_count" of Long, parameter "filtered_contig_count" of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN filter_contigs
        print(workspace_name)
        print(contigset)
        print(minimum)

        def perform_filter(min_length, contigs):
            # collect passing contigs in a list to keep input order
            # (and avoid depending on SeqRecord hashability)
            result_type = namedtuple(
                'filter_result',
                ['total_count', 'filtered_count', 'filtered_set'])
            total_count = 0
            filtered_count = 0
            filtered_set = []
            for contig in contigs:
                if len(contig) > min_length:
                    filtered_count += 1
                    filtered_set.append(contig)
                total_count += 1
            return result_type(total_count, filtered_count, filtered_set)

        print('about to get fasta')
        fasta_file = self.assembly_util.get_assembly_as_fasta({'ref': contigset})
        print('got fasta')
        contigs = SeqIO.parse(fasta_file['path'], 'fasta')
        filtered_file = os.path.join(self.scratch, 'filtered.fasta')
        filtered = perform_filter(minimum, contigs)
        SeqIO.write(filtered.filtered_set, filtered_file, 'fasta')

        new_assembly = self.assembly_util.save_assembly_from_fasta(
            {'file': {'path': filtered_file},
             'workspace_name': workspace_name,
             'assembly_name': fasta_file['assembly_name']})

        reportObj = {
            'objects_created': [{
                'ref': new_assembly,
                'description': 'Filtered contigs'
            }],
            'text_message':
            'Filtered Assembly to ' + str(filtered.filtered_count) +
            ' contigs out of ' + str(filtered.total_count)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': reportObj,
            'workspace_name': workspace_name
        })

        returnVal = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'assembly_ref': new_assembly,  # declared in FilterContigResults
            'contig_count': filtered.total_count,
            'filtered_contig_count': filtered.filtered_count
        }
        #END filter_contigs

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method filter_contigs return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]

    def tandem_repeats(self, ctx, params):
        """
        The actual function is declared using 'funcdef' to specify the name
        and input/return arguments to the function.  For all typical KBase
        Apps that run in the Narrative, your function should have the 
        'authentication required' modifier.
        :param params: instance of type "TandemRepeatsParams" (A 'typedef'
           can also be used to define compound or container objects, like
           lists, maps, and structures.  The standard KBase convention is to
           use structures, as shown here, to define the input and output of
           your function.  Here the input is a reference to the Assembly data
           object, a workspace to save output, and a length threshold for
           filtering. To define lists and maps, use a syntax similar to C++
           templates to indicate the type contained in the list or map.  For
           example: list <string> list_of_strings; mapping <string, int>
           map_of_ints;) -> structure: parameter "assembly_input_ref" of type
           "assembly_ref" (A 'typedef' allows you to provide a more specific
           name for a type.  Built-in primitive types include 'string',
           'int', 'float'.  Here we define a type named assembly_ref to
           indicate a string that should be set to a KBase ID reference to an
           Assembly data object.), parameter "workspace_name" of String,
           parameter "match_weight" of Long, parameter "mismatch_weight" of
           Long, parameter "delta_weight" of Long, parameter "prob_pm" of
           Long, parameter "prob_pi" of Long, parameter "min_score" of Long,
           parameter "max_period_size" of Long
        :returns: instance of type "TandemRepeatsResults" (Here is the
           definition of the output of the function.  The output can be used
           by other SDK modules which call your code, or the output
           visualizations in the Narrative.  'report_name' and 'report_ref'
           are special output fields- if defined, the Narrative can
           automatically render your Report.) -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN tandem_repeats

        # init
        #
        console = []
        invalid_msgs = []
        self.log(console, 'Running run_TandemRepeats with params=')
        self.log(console, "\n" + pformat(params))
        report = ''

        #### do some basic checks
        #
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'assembly_input_ref' not in params:
            raise ValueError('assembly_input_ref parameter is required')
        assembly_input_ref = params['assembly_input_ref']

        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        input_fasta_file = assemblyUtil.get_assembly_as_fasta(
            {'ref': assembly_input_ref})['path']

        print('PATH: ' + str(input_fasta_file))

        # check for necessary files
        if not os.path.isfile(self.TRF_bin):
            raise ValueError("no such file '" + self.TRF_bin + "'")
        if not os.path.isfile(input_fasta_file):
            raise ValueError("no such file '" + input_fasta_file + "'")
        if not os.path.getsize(input_fasta_file) > 0:
            raise ValueError("empty file '" + input_fasta_file + "'")

        # Construct the command
        #   e.g. trf yoursequence.txt 2 7 7 80 10 50 500 -f -d -m

        trf_cmd = [self.TRF_bin]
        trf_options = [str(input_fasta_file)]

        trf_match = 2
        trf_options.append(str(trf_match))
        trf_mismatch = 7
        trf_options.append(str(trf_mismatch))
        trf_delta = 7
        trf_options.append(str(trf_delta))
        trf_pm = 80
        trf_options.append(str(trf_pm))
        trf_pi = 10
        trf_options.append(str(trf_pi))
        trf_minscore = 50
        trf_options.append(str(trf_minscore))
        trf_maxperiod = 500
        trf_options.append(str(trf_maxperiod))
        trf_masked = "-m"
        trf_options.append(trf_masked)
        trf_flank = "-f"
        trf_options.append(trf_flank)
        trf_data = "-d"
        trf_options.append(trf_data)
        trf_noredun = "-r"   # optional flag, defined but not added to trf_options
        trf_maxTR = "-l 2"   # optional flag, defined but not added to trf_options

        trf_cmd = trf_cmd + trf_options

        # Run Tandem Repeats Finder, capture output as it happens
        #
        self.log(console, 'RUNNING TandemRepeatsFinder:')
        self.log(console, '    ' + ' '.join(trf_cmd))

        # run TandemRepeatsFinder and wait for it to finish
        p = subprocess.Popen(trf_cmd,
                             cwd=self.scratch,
                             shell=False)
        p.wait()

        # note: TRF reports the number of sequences processed via its exit
        # status, so a nonzero code is not an error here
        self.log(console, 'TRF exit status: ' + str(p.returncode))
        if p.returncode != 0:
            print('Number of contigs analyzed: ' + str(p.returncode))

        # Check that TandemRepeatsFinder produced output.  TRF writes its
        # output files into the working directory, named after the input
        # file basename plus the numeric options.
        option_string = '.'.join([os.path.basename(input_fasta_file)] +
                                 trf_options[1:8])
        print(os.listdir(self.scratch))
        print("INPUT file = " + input_fasta_file)

        html_file = os.path.join(self.scratch, option_string + ".summary.html")
        mask_file = os.path.join(self.scratch, option_string + ".mask")
        data_file = os.path.join(self.scratch, option_string + ".dat")
        if not os.path.isfile(html_file):
            # single-sequence inputs get a ".1.html" instead of a summary
            html_file = os.path.join(self.scratch, option_string + ".1.html")
        if not os.path.isfile(html_file):
            raise ValueError("failed to create TandemRepeats output: " +
                             html_file)
        elif not os.path.getsize(html_file) > 0:
            raise ValueError("created empty file for TandemRepeats output: " +
                             html_file)

        # Upload results
        #
        if len(invalid_msgs) == 0:
            self.log(console, "UPLOADING RESULTS")  # DEBUG

            with open(html_file, 'r') as html_file_handle:
                html_buf = html_file_handle.read()
            self.log(console, html_buf + "\n")

        self.log(console, "BUILDING REPORT")  # DEBUG

        # If input data is invalid
        #
        if len(invalid_msgs) != 0:
            reportName = 'trf_report_' + str(uuid.uuid4())
            report += "FAILURE\n\n" + "\n".join(invalid_msgs) + "\n"
            reportObj = {'objects_created': [], 'text_message': report}
            # assumes self.ws_url was set from the deploy config in the
            # constructor
            ws = Workspace(self.ws_url, token=ctx['token'])
            report_obj_info = ws.save_objects({
                'workspace': params['workspace_name'],
                'objects': [{
                    'type': 'KBaseReport.Report',
                    'data': reportObj,
                    'name': reportName,
                    'meta': {},
                    'hidden': 1
                }]
            })[0]
            returnVal = {
                'report_name': reportName,
                'report_ref': str(report_obj_info[6]) + '/' +
                              str(report_obj_info[0]) + '/' +
                              str(report_obj_info[4])
            }
            return [returnVal]

        # If input data is VALID
        # Create report obj
        #
        file_links = list()
        html_links = list()
        file_links.append({
            'path': html_file,
            'name': os.path.basename(html_file),
            'desc': 'HTML file output'
        })
        file_links.append({
            'path': mask_file,
            'name': os.path.basename(mask_file),
            'desc': 'Masked file output'
        })
        file_links.append({
            'path': data_file,
            'name': os.path.basename(data_file),
            'desc': 'Data file output'
        })
        html_links.append({
            'path': html_file,
            'name': os.path.basename(html_file),
            'desc': 'HTML file output'
        })

        reportName = 'trf_report_' + str(uuid.uuid4())
        reportObj = {
            'objects_created': [],
            'message': '',
            'direct_html': '',
            'direct_html_link_index': 0,
            'file_links': file_links,
            'html_links': html_links,
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }

        SERVICE_VER = 'release'
        reportClient = KBaseReport(self.callback_url,
                                   token=ctx['token'],
                                   service_ver=SERVICE_VER)
        report_info = reportClient.create_extended_report(reportObj)

        # Done
        #
        self.log(console, "BUILDING RETURN OBJECT")
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        self.log(console, "run_TandemRepeats DONE")

        #END tandem_repeats

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method tandem_repeats return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
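
Note that the typespec above declares match_weight, mismatch_weight, delta_weight, prob_pm, prob_pi, min_score, and max_period_size as inputs, while the body hard-codes the TRF defaults (2 7 7 80 10 50 500). A sketch that honors the declared parameters, falling back to those defaults (the helper name is ours, illustrative only):

TRF_DEFAULTS = [('match_weight', 2), ('mismatch_weight', 7),
                ('delta_weight', 7), ('prob_pm', 80), ('prob_pi', 10),
                ('min_score', 50), ('max_period_size', 500)]

def build_trf_options(params, input_fasta_file):
    # positional numeric options, in the order TRF expects them
    opts = [str(input_fasta_file)]
    for key, default in TRF_DEFAULTS:
        opts.append(str(params.get(key, default)))
    opts.extend(['-m', '-f', '-d'])  # masked fasta, flanking seq, data file
    return opts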
Exemplo n.º 31
0
    def do_assembly(self, assemblyRef, wsName):
        #TODO create file here /kb/module/work
        #TODO set output file name
        print "SDK_CALLBACK_URL " + os.environ['SDK_CALLBACK_URL']
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        param = {'ref': assemblyRef}
        input_fasta_file = au.get_assembly_as_fasta(param)

        newtmp = "/kb/module/work/tmp/tmp_" + self.create_random_string()
        os.mkdir(newtmp)
        os.mkdir(newtmp + "/input")

        newfasta = newtmp + "/input/" + os.path.basename(
            input_fasta_file['path'])
        print "newfasta " + newfasta

        os.rename(input_fasta_file['path'], newfasta)

        args = [
            "wrapper_phage_contigs_sorter_iPlant.pl", "--db", "2",
            "--fna", newfasta, "--wdir", newtmp
        ]
        cmdstring = " ".join(args)
        print("Executing: " + cmdstring)

        # pass the argument list directly (no shell) and request text-mode
        # output so stdout/stderr come back as strings
        cmdProcess = subprocess.Popen(args,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE,
                                      universal_newlines=True)
        stdout, stderr = cmdProcess.communicate()
        print("Done, return code " + str(cmdProcess.returncode))
        print(" stdout: " + stdout)
        print(" stderr: " + stderr)


        print('Saving report')
        kbr = KBaseReport(self.callback_url, service_ver='dev')
        report = ''
        report += "cmdstring: " + str(cmdstring) + " stdout: " + str(
            stdout) + " stderr: " + str(stderr)

        virout = newtmp + "/" + "VIRSorter_global-phage-signal.csv"
        with open(virout, 'r') as myfile:
            data = myfile.read().replace('\n', '')

        print "wsName " + str(wsName)

        data = data.replace(",", "\t")
        data = data.replace("##", "\n##")
        report = report + "\n\n***** VirSorter output *****\n" + data
        report_data = {
            'message': report,
            'objects_created': None,
            'direct_html_link_index': None,
            'html_links': None,
            'report_object_name': 'kb_virsorter_' + str(uuid.uuid4()),
            'workspace_name': wsName
        }

        print "report_data"
        print str(report_data)
        report_info = kbr.create_extended_report(report_data)

        reportName = report_info['name']
        reportRef = report_info['ref']
        return reportName, reportRef
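
A somewhat safer version of the subprocess step above, as a standalone sketch (the helper name is ours; it assumes an argument list rather than a shell string, and text-mode output):

import subprocess

def run_cmd(argv, workdir):
    # run a command in workdir and return decoded stdout/stderr;
    # raise if the tool exits nonzero
    proc = subprocess.Popen(argv, cwd=workdir,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            universal_newlines=True)
    out, err = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError('command failed (' + str(proc.returncode) + '): ' +
                           ' '.join(argv) + '\n' + err)
    return out, err

# e.g. out, err = run_cmd(args, newtmp)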