def export_genome_annotation_as_genbank(self, ctx, params):
        """
        A method designed especially for download, this calls 'genome_annotation_to_genbank' to do
        the work, but then packages the output with WS provenance and object info into
        a zip file and saves to shock.
        :param params: instance of type "ExportParams" -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_annotation_as_genbank

        # validate parameters
        if 'input_ref' not in params:
            raise ValueError('Cannot export GenomeAnnotation - no "input_ref" field defined.')

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.workspaceURL)
        info = ws.get_object_info_new({'objects':[{'ref': params['input_ref'] }],'includeMetadata':0, 'ignoreErrors':0})[0]

        # export to a file
        file = self.genome_annotation_to_genbank(ctx, { 
                            'genome_ref': params['input_ref'], 
                            'new_genbank_file_name': info[1]+'.gbk' })[0]

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.sharedFolder, info[1])
        os.makedirs(export_package_dir)
        shutil.move(file['path'], os.path.join(export_package_dir, os.path.basename(file['path'])))

        # package it up and be done
        dfUtil = DataFileUtil(self.callback_url)
        package_details = dfUtil.package_for_download({
                                    'file_path': export_package_dir,
                                    'ws_refs': [ params['input_ref'] ]
                                })

        output = { 'shock_id': package_details['shock_id'] }

        #END export_genome_annotation_as_genbank

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_genome_annotation_as_genbank return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
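
A minimal caller sketch for the export method above, assuming a running KBase SDK callback environment; the reference '12345/6/7', `impl`, and `ctx` are illustrative placeholders, not values from the source:

# hypothetical usage of export_genome_annotation_as_genbank
params = {'input_ref': '12345/6/7'}  # workspace reference of the GenomeAnnotation
output = impl.export_genome_annotation_as_genbank(ctx, params)[0]
print(output['shock_id'])  # Shock node id of the zipped GenBank package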
Example #2
class Utils:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.gen_api = GenericsAPI(self.callback_url)
        self.DEFAULT_ONTOLOGY_REF = "KbaseOntologies/Custom"
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def get_conditions(self, params):
        data = self.dfu.get_objects(
            {'object_refs': [params['condition_set_ref']]})['data'][0]['data']
        conditions = {}
        keep_keys = params.get('conditions', data['conditions'].keys())
        for key in keep_keys:
            conditions[key] = defaultdict(list)
            for factor, val in zip(data['factors'], data['conditions'][key]):
                ont_abriv = factor['factor_ont_id'].split(":")[0]
                factor['value'] = val
                conditions[key][ont_abriv].append(copy.copy(factor))
        return {"conditions": conditions}

    def file_to_condition_set(self, params):
        """Convert a user supplied file to a compound set"""
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        else:
            raise ValueError(
                "Must supply either a input_shock_id or input_file_path")
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep="\t", dtype='str')
        comp_set = self._df_to_cs_obj(df)
        info = self.dfu.save_objects({
            "id":
            params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.ConditionSet",
                "data": comp_set,
                "name": params['output_obj_name']
            }]
        })[0]
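        # workspace object info tuple: info[0] = object id, info[4] = version, info[6] = workspace id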
        return {"condition_set_ref": "%s/%s/%s" % (info[6], info[0], info[4])}

    def _conditionset_data_to_df(self, data):
        """
        Converts a compound set object data to a dataframe
        """

        factors = pd.DataFrame(data['factors'])
        factors = factors.rename(columns=lambda x: x.replace("ont", "ontology")
                                 .capitalize().replace("_", " "))
        conditions = pd.DataFrame(data['conditions'])
        cs_df = factors.join(conditions)

        return cs_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe
        """

        original_matrix_ref = data.get('original_data')
        data_matrix = self.gen_api.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [
            cluster.get('id_to_data_position').keys() for cluster in clusters
        ]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(
                id_names):  # cluster is based on condition
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size

        cluster_id = 0
        for cluster in clusters:
            item_ids = cluster.get('id_to_data_position').keys()
            item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids]

            for idx in item_idx:
                cluster_names[idx] = cluster_id

            cluster_id += 1

        data_df['cluster'] = cluster_names

        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a dataframe"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]

        obj_type = res['info'][2]

        if "KBaseExperiments.ConditionSet" in obj_type:
            cs_df = self._conditionset_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.ConditionSet or KBaseExperiments.ClusterSet'
            raise ValueError(err_msg)

        return name, cs_df, obj_type

    def _df_to_cs_obj(self, cs_df):
        """Converts a dataframe from a user file to a compound set object"""
        condition_set = {'ontology_mapping_method': "User Curation"}
        cs_df.fillna('', inplace=True)
        if not len(cs_df):
            raise ValueError("No factors in supplied files")
        factor_df = cs_df.filter(regex="[Uu]nit|[Ff]actor")
        condition_df = cs_df.drop(factor_df.columns, axis=1)
        if not len(condition_df.columns):
            raise ValueError(
                "Unable to find any condition columns in supplied file")

        factor_df.rename(
            columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
            inplace=True)
        if "factor" not in factor_df.columns:
            raise ValueError(
                "Unable to find a 'Factor' column in supplied file")
        factor_fields = ('factor', 'unit', 'factor_ont_id', 'unit_ont_id')
        factors = factor_df.filter(items=factor_fields).to_dict('records')

        condition_set['factors'] = [
            self._add_ontology_info(f) for f in factors
        ]
        condition_set['conditions'] = condition_df.to_dict('list')
        return condition_set

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: Term to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id)
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {
                        "value": term
                    }
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {
                "count": 1
            },
            "post_processing": {
                "skip_data": 1
            }
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }

    def _add_ontology_info(self, factor):
        """Searches KBASE ontologies for terms matching the user supplied factors and units.
        Add the references if found"""
        optionals = {
            "unit",
            "unit_ont_id",
            "unit_ont_ref",
        }
        factor = {
            k: v
            for k, v in factor.items() if k not in optionals or v != ""
        }
        ont_info = self._search_ontologies(
            factor.get('factor_ont_id', "").replace("_", ":"))
        if ont_info:
            factor['factor_ont_ref'] = ont_info['ontology_ref']
            factor['factor_ont_id'] = ont_info['id']
        else:
            factor['factor_ont_ref'] = self.DEFAULT_ONTOLOGY_REF
            factor['factor_ont_id'] = self.DEFAULT_ONTOLOGY_ID

        if factor.get('unit'):
            ont_info = self._search_ontologies(
                factor.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                factor['unit_ont_ref'] = ont_info['ontology_ref']
                factor['unit_ont_id'] = ont_info['id']
            else:
                factor['unit_ont_ref'] = self.DEFAULT_ONTOLOGY_REF
                factor['unit_ont_id'] = self.DEFAULT_UNIT_ID
        return factor

    def to_tsv(self, params):
        """Convert an compound set to TSV file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)

        return _id, files

    def to_excel(self, params):
        """Convert an compound set to Excel file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xlsx")

        writer = pd.ExcelWriter(files['file_path'])

        if "KBaseExperiments.ConditionSet" in obj_type:
            df.to_excel(writer, "Conditions", index=False)
        elif "KBaseExperiments.ClusterSet" in obj_type:
            df.to_excel(writer, "ClusterSet", index=True)
        # else is checked in `_ws_obj_to_df`

        writer.save()

        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
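
The validate_params helper in this class only checks key membership; a short standalone sketch of its behavior (logging configured here purely for the demonstration):

import logging
logging.basicConfig(level=logging.WARNING)

Utils.validate_params({'condition_set_ref': '1/2/3', 'extra': 1},
                      expected={'condition_set_ref'})
# logs: "Unexpected parameter extra supplied"
Utils.validate_params({}, expected={'condition_set_ref'})
# raises: ValueError("Required keys condition_set_ref not in supplied parameters")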
Example #3
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-'+str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']

        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        with open(files['file_path'], 'w') as outfile:
            writer = csv.DictWriter(outfile, header, delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)
                                   })

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(feat['aliases'].keys()),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']
                                 })
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
    def export_genome_as_genbank(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_as_genbank
        print('export_genome_as_genbank -- parameters = ')
        pprint(params)

        # validate parameters
        if 'input_ref' not in params:
            raise ValueError(
                'Cannot run export_genome_as_genbank- no "input_ref" field defined.'
            )

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.cfg.workspaceURL)
        info = ws.get_object_info_new({
            'objects': [{
                'ref': params['input_ref']
            }],
            'includeMetadata': 0,
            'ignoreErrors': 0
        })[0]

        genome_to_genbank_params = {'genome_ref': params['input_ref']}

        # export to file
        result = self.genome_to_genbank(
            ctx, genome_to_genbank_params)[0]['genbank_file']

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.cfg.sharedFolder, info[1])
        os.makedirs(export_package_dir)
        shutil.move(
            result['file_path'],
            os.path.join(export_package_dir,
                         os.path.basename(result['file_path'])))

        # package it up and be done
        dfUtil = DataFileUtil(self.cfg.callbackURL)
        package_details = dfUtil.package_for_download({
            'file_path':
            export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {'shock_id': package_details['shock_id']}

        print('export complete -- result = ')
        pprint(output)
        #END export_genome_as_genbank

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_genome_as_genbank return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
class PangenomeDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.pga = PanGenomeAPI(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "pangenome_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        files = {}
        working_dir = os.path.join(self.scratch,
                                   'pangenome-download-'+str(uuid.uuid4()))
        os.makedirs(working_dir)

        pg_id, id_name_map, genome_df = self.make_genomes_df(
            params['pangenome_ref'])
        files['genomes_path'] = os.path.join(working_dir, pg_id + "_Genomes.tsv")
        genome_df.to_csv(files['genomes_path'], sep="\t")

        ortho_df = self.make_ortholog_df(params['pangenome_ref'], id_name_map)
        files['orthologs_path'] = os.path.join(working_dir,
                                             pg_id + "_Orthologs.tsv")
        ortho_df.to_csv(files['orthologs_path'], sep="\t")

        return pg_id, files

    def to_excel(self, params):
        files = {}
        working_dir = os.path.join(self.scratch,
                                   'pangenome-download-' + str(uuid.uuid4()))
        os.makedirs(working_dir)

        pg_id, id_name_map, genome_df = self.make_genomes_df(
            params['pangenome_ref'])
        files['path'] = os.path.join(working_dir, pg_id + ".xlsx")
        writer = pandas.ExcelWriter(files['path'])
        genome_df.to_excel(writer, "Genomes")

        ortho_df = self.make_ortholog_df(params['pangenome_ref'], id_name_map)
        ortho_df.to_excel(writer, "Orthologs")
        writer.save()

        return pg_id, files

    def make_genomes_df(self, pg_ref):
        summary = self.pga.compute_summary_from_pangenome({
            "pangenome_ref": pg_ref})
        return summary['pangenome_id'], summary['genome_ref_name_map'], \
               pandas.DataFrame(summary['shared_family_map'])

    def make_ortholog_df(self, pg_ref, id_name_map):
        pangen = self.dfu.get_objects({'object_refs': [pg_ref]}
                                      )['data'][0]['data']
        ortho = {}
        for cluster in pangen['orthologs']:
            ortho[cluster['id']] = {
                "representative function": cluster.get('function', ""),
                "type": cluster.get("type", ""),
                "protein sequence": cluster.get("protein_translation", ""),
            }
            for gid, name in id_name_map.items():
                ortho[cluster['id']][name] = ";".join(
                    [x[0] for x in cluster['orthologs'] if x[2] == gid])

        return pandas.DataFrame.from_dict(ortho, 'index')[
            ["representative function", "type", "protein sequence"] +
            sorted([x for x in id_name_map.values()])]
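
    # Assumed layout of cluster['orthologs'] (an inference from the filter
    # above, not confirmed by the source): each entry looks like
    # [feature_id, order_or_score, genome_ref], so the ";".join collects the
    # feature ids that belong to each genome column, e.g.
    #   [['geneA', 1, '1/2/3'], ['geneB', 1, '4/5/6']] -> the '1/2/3' column holds "geneA".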

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['pangenome_ref']]
        })

        return {'shock_id': package_details['shock_id']}
    def export_genome_as_genbank(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_as_genbank
        print('export_genome_as_genbank -- parameters = ')
        pprint(params)

        # validate parameters
        if 'input_ref' not in params:
            raise ValueError('Cannot run export_genome_as_genbank- no "input_ref" field defined.')

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.cfg.workspaceURL)
        info = ws.get_object_info_new({'objects':[{'ref': params['input_ref'] }],'includeMetadata':0, 'ignoreErrors':0})[0]

        genome_to_genbank_params = {
            'genome_ref': params['input_ref']
        }

        # export to file (building from KBase Genome Object)
        result = self.genome_to_genbank(ctx, genome_to_genbank_params)[0]['genbank_file']

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.cfg.sharedFolder, info[1])
        os.makedirs(export_package_dir)
        shutil.move(
          result['file_path'],
          os.path.join(export_package_dir, os.path.basename(result['file_path'])))

        # export original uploaded GenBank file if it existed.
        exporter = GenomeToGenbank(self.cfg)
        original_result_full = exporter.export_original_genbank(ctx, genome_to_genbank_params)
        if original_result_full is not None:
            original_result = original_result_full['genbank_file']
            shutil.move(
              original_result['file_path'],
              os.path.join(export_package_dir, os.path.basename(original_result['file_path'])))

        # Make warning file about genes only.
        warning_filename = "warning.txt"
        with open(os.path.join(export_package_dir, warning_filename), 'w') as temp_file:
            temp_file.write('Please note: the KBase-derived GenBank file for annotated genome ' +
                            'objects currently only shows "gene" features. CDS and mRNA ' +
                            'feature types are not currently included in the GenBank download, ' +
                            'but are in the KBase Genome object. ' +
                            'We hope to address this issue in the future.\n\n' +
                            'This directory includes the KBase-derived GenBank file and also ' +
                            '(if you originally uploaded the genome from an annotated ' +
                            'GenBank file) the original GenBank input.')

        # package it up and be done
        dfUtil = DataFileUtil(self.cfg.callbackURL)
        package_details = dfUtil.package_for_download({
                                    'file_path': export_package_dir,
                                    'ws_refs': [ params['input_ref'] ]
                                })

        output = { 'shock_id': package_details['shock_id'] }

        print('export complete -- result = ')
        pprint(output)
        #END export_genome_as_genbank

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_genome_as_genbank return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
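
A hedged wiring sketch for the PangenomeDownload class shown earlier in this example, assuming a running KBase SDK callback environment; `ref`, the scratch path, and the workspace values are placeholders, and export is assumed to receive an iterable of file paths (to_tsv returns a dict of them):

downloader = PangenomeDownload({'scratch': '/kb/module/work/tmp'})
params = {'pangenome_ref': ref, 'workspace_name': 'my_ws', 'pangenome_name': 'my_pg'}
PangenomeDownload.validate_params(params)
pg_id, files = downloader.to_tsv(params)
result = downloader.export(files.values(), pg_id, params)  # {'shock_id': ...}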
Example #7
class AssemblyToFasta:
    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    def export_as_fasta(self, ctx, params):
        ''' Used almost exclusively for download '''
        # validate parameters
        if 'input_ref' not in params:
            raise ValueError(
                'Cannot export Assembly - no "input_ref" field defined.')

        # export to a file
        file = self.assembly_as_fasta(ctx, {'ref': params['input_ref']})

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.scratch, file['assembly_name'])
        os.makedirs(export_package_dir)
        shutil.move(
            file['path'],
            os.path.join(export_package_dir, os.path.basename(file['path'])))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path':
            export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        return {'shock_id': package_details['shock_id']}

    def assembly_as_fasta(self, ctx, params):
        ''' main function that accepts a ref to an object and writes a file '''

        self.validate_params(params)

        print('downloading ws object data (' + params['ref'] + ')')
        assembly_object = self.dfu.get_objects(
            {'object_refs': [params['ref']]})['data'][0]
        ws_type = assembly_object['info'][2]
        obj_name = assembly_object['info'][1]

        if 'filename' in params:
            output_filename = params['filename']
        else:
            output_filename = obj_name + '.fa'

        output_fasta_file_path = os.path.join(self.scratch, output_filename)

        if 'KBaseGenomes.ContigSet' in ws_type:
            self.process_legacy_contigset(output_fasta_file_path,
                                          assembly_object['data'])
        elif 'KBaseGenomeAnnotations.Assembly' in ws_type:
            self.process_assembly(output_fasta_file_path,
                                  assembly_object['data'])

        else:
            raise ValueError(
                'Cannot write data to fasta; invalid WS type (' + ws_type +
                ').  Supported types are KBaseGenomes.ContigSet and ' +
                'KBaseGenomeAnnotations.Assembly')

        return {'path': output_fasta_file_path, 'assembly_name': obj_name}

    def fasta_rows_generator_from_contigset(self, contig_list):
        ''' generates SeqRecords iterator for writing from a legacy contigset object '''
        for contig in contig_list:
            description = ''
            if 'description' in contig and contig['description']:
                description = contig['description']
            yield SeqRecord(Seq(contig['sequence'], SingleLetterAlphabet()),
                            id=contig['id'],
                            description=description)

    def process_legacy_contigset(self, output_fasta_path, data):
        ''' writes the contigs of a legacy ContigSet object to a fasta file '''
        SeqIO.write(self.fasta_rows_generator_from_contigset(data['contigs']),
                    output_fasta_path, "fasta")

    def process_assembly(self, output_fasta_path, data):
        ''' fetches the fasta file referenced by an Assembly object from Shock and uncompresses it '''
        self.dfu.shock_to_file({
            'handle_id': data['fasta_handle_ref'],
            'file_path': output_fasta_path,
            'unpack': 'uncompress'
        })

    def validate_params(self, params):
        for key in ['ref']:
            if key not in params:
                raise ValueError('required "' + key +
                                 '" field was not defined')
Example #8
class ReadsAlignmentUtils:
    '''
    Module Name:
    ReadsAlignmentUtils

    Module Description:
    A KBase module: ReadsAlignmentUtils

This module is intended for use by Aligners and Assemblers to upload and download alignment files.
The alignment may be uploaded as a sam or bam file. If a sam file is given, it is converted to
the sorted bam format and saved. Upon downloading, optional parameters may be provided to get files
in sam and bai formats from the downloaded bam file. This utility also generates stats from the
stored alignment.
    '''
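
    # Rough call sequence implied by the module description above; the object
    # refs, file path, and the `rau` instance are illustrative placeholders only:
    #   rau = ReadsAlignmentUtils(config)
    #   rau.upload_alignment(ctx, {'destination_ref': 'my_ws/my_alignment',
    #                              'file_path': '/path/to/aligned.sam',
    #                              'condition': 'wild_type',
    #                              'read_library_ref': '1/2/3',
    #                              'assembly_or_genome_ref': '4/5/6'})
    #   rau.download_alignment(ctx, {'source_ref': 'my_ws/my_alignment',
    #                                'downloadSAM': 1, 'downloadBAI': 1})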

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git"
    GIT_COMMIT_HASH = "a807d122b097a4c6713a81d5a82eef335835f77a"

    #BEGIN_CLASS_HEADER

    PARAM_IN_FILE = 'file_path'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_CONDITION = 'condition'
    PARAM_IN_READ_LIB_REF = 'read_library_ref'
    PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref'

    PARAM_IN_ALIGNED_USING = 'aligned_using'
    PARAM_IN_ALIGNER_VER = 'aligner_version'
    PARAM_IN_ALIGNER_OPTS = 'aligner_opts'
    PARAM_IN_REPLICATE_ID = 'replicate_id'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index'
    PARAM_IN_SAMPLESET_REF = 'sampleset_ref'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'

    PARAM_IN_DOWNLOAD_SAM = 'downloadSAM'
    PARAM_IN_DOWNLOAD_BAI = 'downloadBAI'
    PARAM_IN_VALIDATE = 'validate'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    def _get_file_path_info(self, file_path):
        """
        Given a file path, returns the directory, file name, file base and file extension
        """
        dir, file_name = os.path.split(file_path)
        file_base, file_ext = os.path.splitext(file_name)

        return dir, file_name, file_base, file_ext

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _check_required_param(self, in_params, param_list):
        """
        Checks if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Checks the validity of workspace and object params and returns them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _proc_upload_alignment_params(self, ctx, params):
        """
        Checks the presence and validity of upload alignment params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION,
            self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        file_path = params.get(self.PARAM_IN_FILE)

        if not (os.path.isfile(file_path)):
            raise ValueError('File does not exist: ' + file_path)

        lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2]
        if lib_type.startswith('KBaseFile.SingleEndLibrary') or \
           lib_type.startswith('KBaseFile.PairedEndLibrary') or \
           lib_type.startswith('KBaseAssembly.SingleEndLibrary') or \
           lib_type.startswith('KBaseAssembly.PairedEndLibrary'):
            pass
        else:
            raise ValueError(self.PARAM_IN_READ_LIB_REF +
                             ' parameter should be of type' +
                             ' KBaseFile.SingleEndLibrary or' +
                             ' KBaseFile.PairedEndLibrary or' +
                             ' KBaseAssembly.SingleEndLibrary or' +
                             ' KBaseAssembly.PairedEndLibrary')

        obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2]
        if obj_type.startswith('KBaseGenomes.Genome') or \
           obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \
           obj_type.startswith('KBaseGenomes.ContigSet'):
            pass
        else:
            raise ValueError(self.PARAM_IN_ASM_GEN_REF +
                             ' parameter should be of type' +
                             ' KBaseGenomes.Genome or' +
                             ' KBaseGenomeAnnotations.Assembly or' +
                             ' KBaseGenomes.ContigSet')
        return ws_name_id, obj_name_id, file_path, lib_type

    def _get_aligner_stats(self, bam_file):
        """
        Gets the aligner stats from BAM file
        """
        path, file = os.path.split(bam_file)
        return self.samtools.get_stats(file, path)

    def _validate(self, params):
        samt = SamTools(self.config, self.__LOGGER)
        if 'ignore' in params:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file,
                                 ipath=path,
                                 ignore=params['ignore'])
        else:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file, ipath=path)

        return rval

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.__LOGGER = logging.getLogger('KBaseRNASeq')
        if 'log_level' in config:
            self.__LOGGER.setLevel(config['log_level'])
        else:
            self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - "
            "%(levelname)s - %(message)s")
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        script_utils.check_sys_stat(self.__LOGGER)

        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.samtools = SamTools(config)
        #END_CONSTRUCTOR
        pass

    def validate_alignment(self, ctx, params):
        """
        :param params: instance of type "ValidateAlignmentParams" (* Input
           parameters for validating a reads alignment. For validation errors
           to ignore, see
           http://broadinstitute.github.io/picard/command-line-overview.html#V
           alidateSamFile) -> structure: parameter "file_path" of String,
           parameter "ignore" of list of String
        :returns: instance of type "ValidateAlignmentOutput" (* Results from
           validate alignment *) -> structure: parameter "validated" of type
           "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN validate_alignment

        rval = self._validate(params)

        if rval == 0:
            returnVal = {'validated': True}
        else:
            returnVal = {'validated': False}

        #END validate_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method validate_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def upload_alignment(self, ctx, params):
        """
        Validates and uploads the reads alignment  *
        :param params: instance of type "UploadAlignmentParams" (* Required
           input parameters for uploading a reads alignment string
           destination_ref -  object reference of alignment destination. The
           object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
           is the workspace name or id and obj_name_or_id is the object name
           or id file_path              -  File with the path of the sam or
           bam file to be uploaded. If a sam file is provided, it will be
           converted to the sorted bam format before being saved
           read_library_ref       -  workspace object ref of the read sample
           used to make the alignment file condition              -
           assembly_or_genome_ref -  workspace object ref of genome assembly
           or genome object that was used to build the alignment *) ->
           structure: parameter "destination_ref" of String, parameter
           "file_path" of String, parameter "read_library_ref" of String,
           parameter "condition" of String, parameter
           "assembly_or_genome_ref" of String, parameter "aligned_using" of
           String, parameter "aligner_version" of String, parameter
           "aligner_opts" of mapping from String to String, parameter
           "replicate_id" of String, parameter "platform" of String,
           parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter
           "sampleset_ref" of type "ws_Sampleset_ref", parameter
           "mapped_sample_id" of mapping from String to mapping from String
           to String, parameter "validate" of type "boolean" (A boolean - 0
           for false, 1 for true. @range (0, 1)), parameter "ignore" of list
           of String
        :returns: instance of type "UploadAlignmentOutput" (*  Output from
           uploading a reads alignment  *) -> structure: parameter "obj_ref"
           of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_alignment

        self.__LOGGER.info(
            'Starting upload Reads Alignment, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, file_path, lib_type = self._proc_upload_alignment_params(
            ctx, params)

        dir, file_name, file_base, file_ext = self._get_file_path_info(
            file_path)

        if self.PARAM_IN_VALIDATE in params and params[
                self.PARAM_IN_VALIDATE] is True:
            if self._validate(params) == 1:
                raise Exception('{0} failed validation'.format(file_path))

        bam_file = file_path
        if file_ext.lower() == '.sam':
            bam_file = os.path.join(dir, file_base + '.bam')
            self.samtools.convert_sam_to_sorted_bam(ifile=file_name,
                                                    ipath=dir,
                                                    ofile=bam_file)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': bam_file,
            'make_handle': 1
        })
        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        aligner_stats = self._get_aligner_stats(file_path)
        aligner_data = {
            'file': file_handle,
            'size': file_size,
            'condition': params.get(self.PARAM_IN_CONDITION),
            'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF),
            'library_type': lib_type,
            'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF),
            'alignment_stats': aligner_stats
        }
        optional_params = [
            self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER,
            self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID,
            self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX,
            self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID
        ]
        for opt_param in optional_params:
            if opt_param in params and params[opt_param] is not None:
                aligner_data[opt_param] = params[opt_param]

        self.__LOGGER.info('=========  Adding extra_provenance_refs')
        self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF))
        self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF))
        self.__LOGGER.info('=======================================')

        res = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type":
                "KBaseRNASeq.RNASeqAlignment",
                "data":
                aligner_data,
                "name":
                obj_name_id,
                "extra_provenance_input_refs": [
                    params.get(self.PARAM_IN_READ_LIB_REF),
                    params.get(self.PARAM_IN_ASM_GEN_REF)
                ]
            }]
        })[0]
        self.__LOGGER.info('save complete')

        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        self.__LOGGER.info(returnVal)

        #END upload_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_alignment(self, ctx, params):
        """
        Downloads alignment files in .bam, .sam and .bai formats. Also downloads alignment stats *
        :param params: instance of type "DownloadAlignmentParams" (* Required
           input parameters for downloading a reads alignment string
           source_ref -  object reference of alignment source. The object ref
           is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "DownloadAlignmentOutput" (*  The output
           of the download method.  *) -> structure: parameter
           "destination_dir" of String, parameter "stats" of type
           "AlignmentStats" -> structure: parameter "properly_paired" of
           Long, parameter "multiple_alignments" of Long, parameter
           "singletons" of Long, parameter "alignment_rate" of Double,
           parameter "unmapped_reads" of Long, parameter "mapped_reads" of
           Long, parameter "total_reads" of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_alignment

        self.__LOGGER.info('Running download_alignment with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        try:
            alignment = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        uuid_str = str(uuid.uuid4())
        output_dir = os.path.join(self.scratch, 'download_' + uuid_str)
        self._mkdir_p(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id':
            alignment[0]['data']['file']['id'],
            'file_path':
            output_dir
        })
        if zipfile.is_zipfile(file_ret.get('file_path')):
            with zipfile.ZipFile(file_ret.get('file_path')) as z:
                z.extractall(output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        bam_files = glob.glob(output_dir + '/*.bam')

        uuid_prefix = uuid_str[:8]
        if len(bam_files) == 0:
            raise ValueError("Alignment object does not contain a bam file")

        for bam_file_path in bam_files:
            dir, file_name, file_base, file_ext = self._get_file_path_info(
                bam_file_path)
            if params.get(self.PARAM_IN_VALIDATE, False):
                validate_params = {'file_path': bam_file_path}
                if self._validate(validate_params) == 1:
                    raise Exception(
                        '{0} failed validation'.format(bam_file_path))

            if params.get('downloadBAI', False):
                bai_file = uuid_prefix + '_' + file_base + '.bai'
                bai_file_path = os.path.join(output_dir, bai_file)
                self.samtools.create_bai_from_bam(ifile=file_name,
                                                  ipath=output_dir,
                                                  ofile=bai_file)
                if not os.path.isfile(bai_file_path):
                    raise ValueError('Error creating {}'.format(bai_file_path))

            if params.get('downloadSAM', False):
                sam_file = uuid_prefix + '_' + file_base + '.sam'
                sam_file_path = os.path.join(output_dir, sam_file)
                self.samtools.convert_bam_to_sam(ifile=file_name,
                                                 ipath=output_dir,
                                                 ofile=sam_file)
                if not os.path.isfile(sam_file_path):
                    raise ValueError('Error creating {}'.format(sam_file_path))

        returnVal = {
            'destination_dir': output_dir,
            'stats': alignment[0]['data']['alignment_stats']
        }

        #END download_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_alignment(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download alignments from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting a reads alignment string source_ref - 
           object reference of alignment source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_alignment

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        if params.get(self.PARAM_IN_VALIDATE, False) or \
           params.get('exportBAI', False) or \
           params.get('exportSAM', False):
            """
            Need to validate or convert files. Use download_alignment
            """
            download_params = {}
            for key, val in params.items():
                download_params[key.replace('export', 'download')] = val

            download_retVal = self.download_alignment(ctx, download_params)[0]

            export_dir = download_retVal['destination_dir']

            # package and load to shock
            ret = self.dfu.package_for_download({
                'file_path': export_dir,
                'ws_refs': [inref]
            })
            output = {'shock_id': ret['shock_id']}
        else:
            """
            return shock id from the object
            """
            try:
                alignment = self.dfu.get_objects({'object_refs':
                                                  [inref]})['data']
            except DFUError as e:
                self.__LOGGER.error(
                    'Logging stacktrace from workspace exception:\n' + e.data)
                raise
            output = {'shock_id': alignment[0]['data']['file']['id']}

        #END export_alignment

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_alignment return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
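
A small illustration of the parameter handling in export_alignment above: export-style flags are mapped onto the download method's parameter names before delegating. The values are placeholders:

export_params = {'source_ref': 'my_ws/my_alignment', 'exportSAM': 1, 'validate': 1}
download_params = {k.replace('export', 'download'): v for k, v in export_params.items()}
# download_params == {'source_ref': 'my_ws/my_alignment', 'downloadSAM': 1, 'validate': 1}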
Example #9
class Utils:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    @staticmethod
    def validate_newick(newick):
        """Validates a Newick string by attempting to make a tree with ete3"""
        try:
            ete3.Tree(newick)
        except NewickError:
            return False
        return True
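
    # e.g. Utils.validate_newick("(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);") -> True,
    # while a malformed string such as "(A,B" -> False (NewickError is caught).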

    def to_newick(self, params):
        """Convert an Tree to a Newick File"""
        files = {}

        res = self.dfu.get_objects({'object_refs':
                                    [params['input_ref']]})['data'][0]
        name = res['info'][1]
        if "KBaseTrees.Tree" not in res['info'][2]:
            raise ValueError("Supplied reference is not a Tree")

        files['file_path'] = os.path.join(params['destination_dir'],
                                          name + ".newick")
        with open(files['file_path'], 'w') as out_file:
            out_file.write(res['data']['tree'])

        return name, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
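
A hedged end-to-end sketch for the tree Utils above, assuming a running KBase SDK callback environment; the reference '1/2/3', the scratch path, and the `utils` name are placeholders:

utils = Utils({'scratch': '/kb/module/work/tmp'})
name, files = utils.to_newick({'input_ref': '1/2/3',
                               'destination_dir': utils.scratch})
result = utils.export(files['file_path'], name, '1/2/3')  # {'shock_id': ...}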