Example #1
 def get_data_obj_type_by_name(self, input_ref, remove_module=False):
     # 0 obj_id objid - the numerical id of the object.
     # 1 obj_name name - the name of the object.
     # 2 type_string type - the type of the object.
     # 3 timestamp save_date - the save date of the object.
     # 4 obj_ver ver - the version of the object.
     # 5 username saved_by - the user that saved or copied the object.
     # 6 ws_id wsid - the workspace containing the object.
     # 7 ws_name workspace - the workspace containing the object.
     # 8 string chsum - the md5 checksum of the object.
     # 9 int size - the size of the object in bytes.
     # 10 usermeta meta - arbitrary user-supplied metadata about
     #     the object.
     [
         OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
         WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
     ] = range(11)  # object_info tuple
     ws = Workspace(self.ws_url)
     input_info = ws.get_object_info3({'objects': [{
         'ref': input_ref
     }]})['infos'][0]
     obj_name = input_info[NAME_I]
     type_name = input_info[TYPE_I].split('-')[0]
     if remove_module:
         type_name = type_name.split('.')[1]
     return {obj_name: type_name}
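The parsing above relies on workspace type strings having the form "Module.TypeName-Major.Minor". A minimal offline sketch of that parsing, using a hypothetical type string rather than a live workspace call:

type_string = "KBaseGenomes.Genome-17.0"  # hypothetical type string, not fetched
type_name = type_string.split('-')[0]     # -> "KBaseGenomes.Genome" (version dropped)
short_name = type_name.split('.')[1]      # -> "Genome" (module prefix dropped)
print(type_name, short_name)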
Example #2
 def get_data_obj_name(self, input_ref):
     [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple
     ws = Workspace(self.ws_url)
     input_info = ws.get_object_info3({'objects': [{'ref': input_ref}]})['infos'][0]
     obj_name = input_info[NAME_I]
     #type_name = input_info[TYPE_I].split('-')[0]
     return obj_name
Example #3
 def get_data_obj_type(self, input_ref, remove_module=False):
     [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple
     ws = Workspace(self.ws_url)
     input_info = ws.get_object_info3({'objects': [{'ref': input_ref}]})['infos'][0]
     #obj_name = input_info[NAME_I]
     type_name = input_info[TYPE_I].split('-')[0]
     if remove_module:
         type_name = type_name.split('.')[1]
     return type_name
Example #4
def get_static_info(ws_url: str, token: str, ws_id: int) -> Dict:
    """
    Looks up the static narrative info for the given Workspace id.
    That info is stashed in the Workspace metadata, so that gets fetched, munged into a structure,
    and returned.
    If there's no static narrative, this returns an empty structure, as there's no info.
    If ws_id is not present, or not numeric, raises a ValueError.
    If there's a problem when contacting the Workspace (anything that raises a ServerError),
    this raises a WorkspaceError.
    :param ws_url: the URL for the workspace endpoint
    :param token: the user auth token
    :param ws_id: the workspace id of the narrative to fetch info for.
    :returns: a dictionary with the following keys if a static narrative is present:
        ws_id - int - the workspace id
        narrative_id - int - the id of the narrative object
        version - int - the version of the narrative object made static
        url - str - the url of the static narrative
        narr_saved - int - the timestamp of when the narrative that the static version is
            based on was saved (ms since epoch)
        static_saved - int - the timestamp of when the static narrative was saved (ms
            since epoch)

    """
    if not ws_id or not str(ws_id).isdigit():
        raise ValueError(f"The parameter ws_id must be an integer, not {ws_id}")

    ws_client = Workspace(url=ws_url, token=token)
    try:
        ws_info = ws_client.get_workspace_info({"id": ws_id})
    except ServerError as err:
        raise WorkspaceError(err, ws_id)

    info = {}
    meta = ws_info[8]
    if "static_narrative_ver" in meta:
        info = {
            "ws_id": ws_id,
            "version": int(meta["static_narrative_ver"]),
            "narrative_id": int(meta["narrative"]),
            "url": meta["static_narrative"],
            "static_saved": int(meta["static_narrative_saved"])
        }
        try:
            obj_info = ws_client.get_object_info3({
                "objects": [{
                    "ref": f"{ws_id}/{info['narrative_id']}/{info['version']}"
                }]
            })
        except ServerError as err:
            raise WorkspaceError(err, ws_id)
        ts = date_parser.isoparse(obj_info["infos"][0][3]).timestamp()
        info["narr_saved"] = int(ts*1000)
    return info
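A hedged usage sketch for get_static_info; the workspace URL, token, and workspace id below are placeholders, and a reachable Workspace service is assumed:

static = get_static_info("https://kbase.us/services/ws", "MY_AUTH_TOKEN", 12345)
if static:
    print(static["url"], static["version"], static["narr_saved"])
else:
    print("No static narrative recorded for this workspace")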
Example #5
def get_object_type(ref, ws_url):
    """
    Fetches and returns the type string of the object referenced by ref from the given workspace url.
    If that object doesn't exist, or there's another Workspace error, this raises a
    RuntimeError exception.
    """
    ws = Workspace(ws_url)
    info = ws.get_object_info3({"objects": [{"ref": ref}]})
    obj_info = info.get("infos", [[]])[0]
    if len(obj_info) == 0:
        raise RuntimeError("An error occurred while fetching type info from the Workspace. "
                           "No information returned for reference {}".format(ref))
    return obj_info[2]
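A usage sketch under the same assumptions (placeholder reference and URL, reachable Workspace service); the printed type string is illustrative only:

obj_type = get_object_type("12345/6/7", "https://kbase.us/services/ws")
print(obj_type)  # e.g. "KBaseGenomeAnnotations.Assembly-6.0"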
Example #6
def get_object_names(ref_list, ws_url):
    """
    From a list of workspace references, returns a mapping from ref -> name of the object.
    """
    ws = Workspace(ws_url)
    obj_ids = list()
    for ref in ref_list:
        obj_ids.append({"ref": ref})
    info = ws.get_object_info3({"objects": obj_ids})
    name_map = dict()
    # might be in a data palette, so we can't just use the ref.
    # we already have the refs as passed previously, so use those for mapping, as they're in
    # the same order as what's returned.
    for i in range(len(info["infos"])):
        name_map[ref_list[i]] = info["infos"][i][1]
    return name_map
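A usage sketch; the references, URL, and resulting names are placeholders:

names = get_object_names(["12345/2/3", "12345/5/1"], "https://kbase.us/services/ws")
# e.g. {"12345/2/3": "my_assembly", "12345/5/1": "my_genome"}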
Example #7
 def extract_dna_sequences(self, token, params):
     """Takes an assembly/contig set ref and one or more locations and returns the DNA sequence
     from the assembly at that location while caching the assembly for efficiency"""
     if not params.get('ref'):
         raise ValueError("'ref', a reference to an assembly must be provided")
     ref = params['ref']
     locs = params.get('locations', [])
     ws = Workspace(self.ws_url, token=token)
     # This is also a cheap way to ensure that the object exists and that the user has access
     obj_type = ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][0][2]
     if obj_type.split('-')[0] not in self.valid_types:
         raise ValueError(f'{obj_type} is not a valid input type for this function')
     assembly_dir = os.path.join(self.cache_dir, ref.replace('/', ':'))
     if not os.path.exists(assembly_dir):
         self._cache_assembly(ws, token, ref, assembly_dir)
     return [_extract_sequence(assembly_dir, l) for l in locs]
Example #8
    def check_assembly_cache(self, ref, token):
        ws = Workspace(self.ws_url, token=token)
        info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
        inner_chsum = info[8]
        index_file = os.path.join(self.assembly_index_dir, inner_chsum + self.ASSEMBLY_SUFFIX + ".tsv.gz")
        if not os.path.isfile(index_file):
            if self.debug:
                print("    Loading WS object...")
                t1 = time.time()

            if 'KBaseGenomeAnnotations.Assembly' in info[2]:
                included = ["/contigs"]
                assembly_data = ws.get_objects2(
                    {'objects': [{'ref': ref, 'included': included}]})['data'][0]['data']
                contigs = list(assembly_data['contigs'].values())
                self.save_assembly_tsv(contigs, inner_chsum)

            elif 'KBaseGenomes.ContigSet' in info[2]:
                included = ["/contigs/[*]/id",
                            "/contigs/[*]/length",
                            "/contigs/[*]/md5",
                            "/contigs/[*]/description"]
                cs_data = ws.get_objects2(
                    {'objects': [{'ref': ref, 'included': included}]})['data'][0]['data']
                contigs = []
                for c in cs_data['contigs']:
                    this_contig_data = {'contig_id': ''}
                    if 'id' in c:
                        this_contig_data['contig_id'] = c['id']
                    if 'md5' in c:
                        this_contig_data['md5'] = c['md5']
                    if 'length' in c:
                        this_contig_data['length'] = c['length']
                    if 'description' in c:
                        this_contig_data['description'] = c['description']
                    contigs.append(this_contig_data)

                self.save_assembly_tsv(contigs, inner_chsum)
            else:
                raise ValueError('The "ref" is not an Assembly or ContigSet data object. '
                                 'It was a ' + info[2])

            if self.debug:
                print(f"    (time={time.time() - t1})")
        return inner_chsum
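The cache above is keyed by the object's md5 checksum (info[8]), so an unchanged object is converted to a TSV index only once. A tiny offline sketch of the key construction, with a placeholder directory, suffix, and checksum:

import os

inner_chsum = "d41d8cd98f00b204e9800998ecf8427e"  # placeholder checksum
index_file = os.path.join("/kb/module/work/assembly_index",  # hypothetical index dir
                          inner_chsum + "_assembly" + ".tsv.gz")
print(index_file)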
Example #9
    def check_object_cache(self, ref, search_object, info_included,
                           index_dir, object_suffix, debug):
        ws = Workspace(self.ws_url, token=self.token)
        info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
        inner_chsum = info[8]
        index_file = os.path.join(index_dir,
                                  inner_chsum + object_suffix + ".tsv.gz")
        if not os.path.isfile(index_file):
            if debug:
                print("    Loading WS object...")
                t1 = time.time()

            included = self.build_info_included(search_object, info_included)
            object = ws.get_objects2({'objects': [{'ref': ref,
                                                   'included': included}]})['data'][0]['data']
            self.save_object_tsv(object[search_object], inner_chsum, info_included,
                                 index_dir, object_suffix)
            if debug:
                print("    (time=" + str(time.time() - t1) + ")")
        return inner_chsum
Example #10
def get_upa_name(ws_url, cb_url, upa, is_test):
    '''
    Return the name of the workspace object referenced by upa, checking the
    Workspace service first and falling back to DataFileUtil.
    '''
    if is_test:
        return "test_object"

    ws = Workspace(ws_url)
    objs = ws.get_object_info3({'objects': [{'ref': upa}]})
    upa_names = [info[1] for info in objs['infos']]
    if len(upa_names) > 0:
        return upa_names[0]

    dfu = DataFileUtil(cb_url)
    objs = dfu.get_objects({'object_refs': [upa]})['data']
    upa_names = [obj['info'][1] for obj in objs]
    if len(upa_names) > 0:
        return upa_names[0]
    else:
        raise ValueError("Could not find name of workspace object with id %s" %
                         upa)
Example #11
 def _make_upa_dict(self, value, param_spec: dict):
     upas = list()
     if param_spec["field_type"] == "text":
         valid_ws_types = param_spec.get("text_options",
                                         {}).get("valid_ws_types", [])
         if len(valid_ws_types) > 0 and value:
             if isinstance(value, list):
                 for v in value:
                     if self._is_upa(v):
                         upas.append(v)
             else:
                 if self._is_upa(value):
                     upas.append(value)
     upa_map = dict()
     if len(upas):
         ws = Workspace(url=self.ws_url, token=self.token)
         obj_infos = ws.get_object_info3(
             {"objects": [{
                 "ref": upa
             } for upa in upas]})["infos"]
         upa_map = {u: obj_infos[i] for i, u in enumerate(upas)}
     return upa_map
Example #12
def fetch_pangenome_summary(
        pangenome_ref: str,
        workspace_url: str,
        token: str) -> dict:
    """
    Construct a summary data object for a single pangenome, used in the
    "simple_summary" method.
    Args:
        pangenome_ref: Workspace reference to the pangenome object
        workspace_url: URL of the Workspace being used in the current env
        token: authorization token for fetching the data
    Returns:
        A python object adhering to the SimpleSummaryResult type in
        PanGenomeAPI.spec
    """
    ws_client = Workspace(workspace_url, token=token)
    # Download the full pangenome workspace dataset
    resp = ws_client.get_objects2({
        'objects': [{'ref': pangenome_ref}]
    })
    data = resp['data'][0]['data']
    # Fetch the object infos for each genome
    genome_refs = [{"ref": ref} for ref in data["genome_refs"]]
    genome_infos = ws_client.get_object_info3({
        "objects": genome_refs,
        "includeMetadata": 1
    })["infos"]
    name_mapping = _genome_name_mapping(genome_infos)
    ret = {
        "pangenome_id": data["id"],
        "genomes_count": len(data["genome_refs"]),
        "genes": _count_genes(data),
        "families": _count_families(data),
        "genomes": _genome_counts(data, genome_infos, name_mapping),
        "shared_family_map": _shared_family_map(data, name_mapping),
        "genome_ref_name_map": name_mapping,
    }
    return ret
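A hedged call example for fetch_pangenome_summary; the pangenome reference, URL, and token are placeholders, and the keys follow the dictionary assembled above:

summary = fetch_pangenome_summary("55555/3/1", "https://kbase.us/services/ws", "MY_AUTH_TOKEN")
print(summary["genomes_count"], summary["families"])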
Example #13
def fetch_fasta_from_genome(genome_ref, ws_url, callback_url):
    """
    Returns an assembly or contigset as FASTA.
    """
    if not check_ref_type(genome_ref, ['KBaseGenomes.Genome'], ws_url):
        raise ValueError(
            "The given genome_ref {} is not a KBaseGenomes.Genome type!".format(genome_ref))
    # test if genome references an assembly type
    # do get_objects2 without data. get list of refs
    ws = Workspace(ws_url)
    genome_obj_info = ws.get_objects2({
        'objects': [{
            'ref': genome_ref
        }],
        'no_data': 1
    })
    # get the list of genome refs from the returned info.
    # if there are no refs (or something funky with the return), this will be an empty list.
    # this WILL fail if data is an empty list. But it shouldn't be, and we know because
    # we have a real genome reference, or get_objects2 would fail.
    genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

    # see which of those are of an appropriate type (ContigSet or Assembly), if any.
    assembly_ref = list()
    ref_params = [{'ref': genome_ref + ";" + x} for x in genome_obj_refs]
    ref_info = ws.get_object_info3({'objects': ref_params})
    for idx, info in enumerate(ref_info.get('infos')):
        if "KBaseGenomeAnnotations.Assembly" in info[
                2] or "KBaseGenomes.ContigSet" in info[2]:
            assembly_ref.append(";".join(ref_info.get('paths')[idx]))

    if len(assembly_ref) == 1:
        return fetch_fasta_from_assembly(assembly_ref[0], ws_url, callback_url)
    else:
        raise ValueError(
            "Multiple assemblies found associated with the given genome ref {}! "
            "Unable to continue.".format(genome_ref))
Example #14
class VariationUtil:
    '''
    Module Name:
    VariationUtil

    Module Description:
    A KBase module: VariationUtil
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.4"
    GIT_URL = ""
    GIT_COMMIT_HASH = "2a4c2dbc058b702811c967997e7100c834e755d4"

    #BEGIN_CLASS_HEADER

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        # TODO: Make sure we need to define config just once
        # TODO: Change the code to match this style
        self.config = config
        self.config['SDK_CALLBACK_URL'] = os.environ['SDK_CALLBACK_URL']
        self.config['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.config['ws_url'] = config['workspace-url']

        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shared_folder = config['scratch']
        self.hr = htmlreportutils()
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.shock_url = config['shock-url']
        self.sw_url = config['srv-wiz-url']
        pass
        #END_CONSTRUCTOR
        pass

    def save_variation_from_vcf(self, ctx, params):
        """
        Save a variation (and trait?) object to KBase given a reference genome, object output name,
        Variant Call Format (VCF) file, and sample attribute file.
        :param params: instance of type "save_variation_input" (## funcdef
           save_variation_from_vcf ## required input params:
           genome_or_assembly_ref: KBaseGenomes.Genome or
           KBaseGenomeAnnotations.Assembly object reference *** variation
           input data *** vcf_staging_file_path: path to location data
           associated with samples variation_object_name: output name for
           KBase variation object *** sample input data ***
           sample_attribute_ref: x/y/z reference to kbase sample attribute
           optional params: NA output report: report_name report_ref HTML
           visualization: Manhattan plot *** Visualization *** plot_maf:
           generate histogram of minor allele frequencies plot_hwe: generate
           histogram of Hardy-Weinberg Equilibrium p-values) -> structure:
           parameter "workspace_name" of String, parameter
           "genome_or_assembly_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "vcf_staging_file_path" of type "filepath"
           (KBase file path to staging files), parameter
           "variation_object_name" of String, parameter
           "sample_attribute_ref" of type "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "save_variation_output" -> structure:
           parameter "variation_ref" of String, parameter "report_name" of
           String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: report
        #BEGIN save_variation_from_vcf

        # Get workspace id
        ws_id = self.dfu.ws_name_to_id(params['workspace_name'])

        genome_ref = None
        assembly_ref = None

        # 1) Find whether the input is a genome or assembly
        #    and get genome_ref and assembly_ref

        genome_or_assembly_ref = params['genome_or_assembly_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{
                'ref': genome_or_assembly_ref
            }]})['infos'][0][2]
        if ('KBaseGenomes.Genome' in obj_type):
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type +
                             ' is not the right input for this method. ' +
                             'Valid input include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly ')

        # 2)  Validate VCF, compress, and build VCF index
        logging.info("Validating VCF, Compressing VCF and Indexing VCF")
        VCFUtilsConfig = {"scratch": self.scratch}
        VCFUtilsParams = {
            'vcf_staging_file_path': params['vcf_staging_file_path']
        }
        VCU = VCFUtils(VCFUtilsConfig)
        vcf_compressed, vcf_index, vcf_strain_ids = VCU.validate_compress_and_index_vcf(
            VCFUtilsParams)

        if vcf_index is not None:
            logging.info("vcf compressed :" + str(vcf_compressed))
            logging.info("vcf index :" + str(vcf_index))
            logging.info("vcf strain ids :" + str(vcf_strain_ids))
        else:
            raise ValueError(
                "No result obtained after compression and indexing step")

        # Get strain info
        # TODO: Remove hard coded stuff
        StrainInfoConfig = self.config
        StrainInfoParams = {
            "ws_id": ws_id,
            "vcf_strain_ids": vcf_strain_ids,
            "sample_set_ref": params["sample_set_ref"],
            "sample_attribute_name": params["sample_attribute_name"]
        }
        si = StrainInfo(StrainInfoConfig)
        sample_attribute_ref, strains = si.sample_strain_info(StrainInfoParams)
        print(sample_attribute_ref)
        print(strains)

        # 3) Create json for variation object. In a following step genomic_indexes will be
        # added to this json before it is saved as Variation object

        VCFToVariationConfig = {"ws_url": self.ws_url, "scratch": self.scratch}
        VCFToVariationParams = {
            "vcf_compressed": vcf_compressed,
            "vcf_index": vcf_index,
            "assembly_ref": assembly_ref
        }
        if genome_ref is not None:
            VCFToVariationParams['genome_ref'] = genome_ref

        vtv = VCFToVariation(VCFToVariationConfig)
        variation_object_data = vtv.generate_variation_object_data(
            VCFToVariationParams)
        # Append sample information
        if sample_attribute_ref:
            variation_object_data['sample_attribute_ref'] = sample_attribute_ref
        else:
            raise ValueError('sample attribute ref not found')
        if strains:
            variation_object_data['strains'] = strains
        else:
            raise ValueError('strains not found')
        if 'sample_set_ref' in params:
            variation_object_data['sample_set_ref'] = params['sample_set_ref']
        else:
            raise ValueError('sample_set_ref not found in params')

        # 4)
        JbrowseConfig = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
            "sw_url": self.sw_url,
            "shock_url": self.shock_url
        }
        JbrowseParams = {
            "vcf_path": vcf_compressed,
            "assembly_ref": assembly_ref,
            "binsize": 10000,
            "vcf_shock_id": variation_object_data['vcf_handle']['id'],
            "vcf_index_shock_id":
            variation_object_data['vcf_index_handle']['id']
        }
        if genome_ref is not None:
            JbrowseParams["genome_ref"] = genome_ref

        jb = JbrowseUtil(JbrowseConfig)
        jbrowse_report = jb.prepare_jbrowse_report(JbrowseParams)

        # 5) Now we have the genomic indices and we have all the information needed to save
        # the variation object
        # TODO: Take out the genomic_indexes field from the object spec
        #  TODO: Take out the vcf_handle stuff not needed

        variation_object_data['genomic_indexes'] = jbrowse_report[
            'genomic_indexes']

        var_obj = self.dfu.save_objects({
            'id':
            self.dfu.ws_name_to_id(params['workspace_name']),
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_object_data,
                'name': params['variation_object_name']
            }]
        })[0]

        var_obj_ref = str(var_obj[6]) + "/" + str(var_obj[0]) + "/" + str(
            var_obj[4])
        print(var_obj_ref)

        # 6) Build Variation report
        # This is a simple report
        #
        workspace = params['workspace_name']
        created_objects = []
        created_objects.append({
            "ref": var_obj_ref,
            "description": "Variation Object"
        })
        ReportConfig = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
        }
        ReportParams = {"variation_ref": var_obj_ref}
        vr = VariationReport(ReportConfig)
        htmlreport_dir = vr.create_variation_report(ReportParams)

        report = self.hr.create_html_report(htmlreport_dir, workspace,
                                            created_objects)
        report['variation_ref'] = var_obj_ref
        print(report)
        #END save_variation_from_vcf

        # At some point might do deeper type checking...
        if not isinstance(report, dict):
            raise ValueError('Method save_variation_from_vcf return value ' +
                             'report is not type dict as required.')
        # return the results
        return [report]

    def export_variation_as_vcf(self, ctx, params):
        """
        Export KBase variation object as Variant Call Format (VCF) file
        :param params: instance of type "export_variation_input" (## funcdef
           export_variation_as_vcf ## required input params: Variation object
           reference optional params: NA output report: Shock id pointing to
           exported vcf file) -> structure: parameter "input_var_ref" of type
           "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "export_variation_output" -> structure:
           parameter "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_variation_as_vcf

        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        output = vtv.export_as_vcf(params)

        #END export_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_variation_as_vcf return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def get_variation_as_vcf(self, ctx, params):
        """
        Given a reference to a variation object, and output name: return a Variant Call Format (VCF)
        file path and name.
        :param params: instance of type "get_variation_input" (## funcdef
           get_variation_as_vcf ## required input params: Variation object
           reference output file name optional params: NA output report: path
           to returned vcf name of variation object) -> structure: parameter
           "variation_ref" of type "obj_ref" (An X/Y/Z style reference),
           parameter "filename" of String
        :returns: instance of type "get_variation_output" -> structure:
           parameter "path" of type "filepath" (KBase file path to staging
           files), parameter "variation_name" of String
        """
        # ctx is the context object
        # return variables are: file
        #BEGIN get_variation_as_vcf
        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        file = vtv.variation_to_vcf(params)

        #END get_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(file, dict):
            raise ValueError('Method get_variation_as_vcf return value ' +
                             'file is not type dict as required.')
        # return the results
        return [file]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Example #15
    def run_FamaReadProfiling(self, ctx, params):
        """
        Run metagenome functional profiling module of Fama.
        :param params: instance of type "FamaReadProfilingParams" (Parameters
           for metagenome functional profiling. workspace_name - the name of
           the workspace for input/output read_library_refs - references to
           the name of the PE read library or SE read library ref_dataset -
           the name of Fama reference dataset is_paired_end - 1 for
           paired-end library, 0 for single-end library
           output_functional_profile_name - the name of the output functional
           profile output_read_library_ref - the name of the output filtered
           PE or SE read library) -> structure: parameter "workspace_name" of
           String, parameter "read_library_refs" of list of String, parameter
           "ref_dataset" of String, parameter "is_paired_end" of type "bool"
           (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "output_functional_profile_name" of String, parameter
           "output_read_library_name" of String
        :returns: instance of type "ReportResults" (Output report parameters
           report_name - the name of the report object report_ref - the
           reference to the report object) -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_FamaReadProfiling
        # Import Read Library and save as two paired-end FASTQ files
        input_refs = params['read_library_refs']
        fama_reference = params['ref_dataset']
        ws_client = Workspace(self.ws_url)
        ru = ReadsUtils(self.callback_url)
        ret = ws_client.get_object_info3(
            {'objects': [{
                'ref': ref
            } for ref in input_refs]})
        name2ref = {}
        input_reads = {}
        for input_ref in input_refs:
            ret = ws_client.get_object_info3({'objects': [{'ref': input_ref}]})
            obj_name = ret['infos'][0][1]
            name2ref[obj_name] = input_ref

            reads_params = {
                'read_libraries': [input_ref],
                'interleaved': 'false',
                'gzipped': None
            }

            reads = ru.download_reads(reads_params)['files']

            print('Input reads files downloaded:')
            print(reads)
            fwd_reads_file = reads[input_ref]['files']['fwd']
            rev_reads_file = reads[input_ref]['files']['rev']
            print('forward: ' + str(fwd_reads_file))
            print('reverse: ' + str(rev_reads_file))
            input_reads[obj_name] = {}
            input_reads[obj_name]['fwd'] = fwd_reads_file
            input_reads[obj_name]['rev'] = rev_reads_file

        fama_params = {
            'input_reads':
            input_reads,
            'work_dir':
            self.shared_folder,
            'reference':
            fama_reference,
            'is_paired_end':
            params['is_paired_end'],
            'name2ref':
            name2ref,
            'ws_name':
            params['workspace_name'],
            'ws_client':
            ws_client,
            'output_read_library_name':
            params['output_read_library_name'],
            'output_functional_profile_name':
            params['output_functional_profile_name'],
            'input_read_refs':
            params['read_library_refs']
        }

        # Run Fama
        fama_output = functional_profiling_pipeline(fama_params)

        # Write filtered reads to workspace
        reads_params = {
            'fwd_file': fama_output['fwd_reads'],
            'sequencing_tech': reads[input_ref]['sequencing_tech'],
            'single_genome': '0',
            'wsname': params['workspace_name'],
            'name': params['output_read_library_name']
        }
        if 'rev_reads' in fama_output:
            reads_params['rev_file'] = fama_output['rev_reads']
            reads_params['interleaved'] = '0'

        ru_ret = ru.upload_reads(reads_params)
        print('reads_params', reads_params)
        print('ru_ret', ru_ret)
        output_reads_ref = ru_ret['obj_ref']

        # Write HTML output to workspace
        message = 'Fama functional profiling finished successfully'
        dfu = DataFileUtil(self.callback_url)
        try:
            dfu_output = dfu.file_to_shock(
                {'file_path': fama_output['html_report']})
        except ServerError as dfue:
            # not really any way to test this block
            self.log('Logging exception loading results to shock')
            self.log(str(dfue))
            raise

        html_links = [{
            'shock_id': dfu_output['shock_id'],
            'description': 'HTML report for Fama App',
            'name': 'fama_report.html',
            'label': 'Fama_report'
        }]
        for krona_file in fama_output['krona_charts']:
            try:
                dfu_output = dfu.file_to_shock({'file_path': krona_file})
                html_links.append({
                    'shock_id':
                    dfu_output['shock_id'],
                    'description':
                    'Krona chart for function taxonomy profile',
                    'name':
                    fama_output['krona_charts'][krona_file][0],
                    'label':
                    fama_output['krona_charts'][krona_file][1]
                })
            except ServerError as dfue:
                # not really any way to test this block
                self.log('Logging exception loading results to shock')
                self.log(str(dfue))
                raise
        self.log('Krona chart saved: ' + str(dfu_output))

        # Save report
        report_params = {
            'message':
            message,
            'objects_created': [{
                'ref': output_reads_ref,
                'description': 'Filtered Read Library'
            }, {
                'ref': fama_output['trait_matrix_ref'],
                'description': 'Raw counts matrix'
            }, {
                'ref': fama_output['functional_profile_ref'],
                'description': 'Functional profile'
            }],
            'direct_html_link_index':
            0,
            'html_links':
            html_links,
            'file_links':
            fama_output['report_files'],
            'report_object_name':
            'fama_profiling_report_' + str(uuid.uuid4()),
            'workspace_name':
            params['workspace_name'],
            'html_window_height':
            460
        }
        try:
            report = KBaseReport(self.callback_url)
            report_info = report.create_extended_report(report_params)
        except ServerError as kre:
            # not really any way to test this block
            self.log('Logging exception saving report')
            self.log(str(kre))
            raise

        report_info['report_params'] = report_params
        self.log(str(report_info))
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END run_FamaReadProfiling

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_FamaReadProfiling return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #16
class FeatureSetBuilder:
    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method
        """

        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in [
                'diff_expression_ref', 'workspace_name', 'p_cutoff',
                'q_cutoff', 'fold_change_cutoff'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError(
                '"fold_scale_type" parameter must be set to "logarithm", if used'
            )

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def _generate_report(self, up_feature_set_ref_list,
                         down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report
        """

        log('start creating report')

        output_html_files = self._generate_html_report(
            up_feature_set_ref_list, down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{
                'ref': up_feature_set_ref,
                'description': 'Upper FeatureSet Object'
            }]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{
                'ref': down_feature_set_ref,
                'description': 'Lower FeatureSet Object'
            }]

        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{
                'ref':
                filtered_expression_matrix_ref,
                'description':
                'Filtered ExpressionMatrix Object'
            }]

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name':
            'kb_FeatureSetUtils_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _generate_html_report(self, up_feature_set_ref_list,
                              down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        upper_feature_content = ''
        for up_feature_set_ref in up_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2(
                {'objects': [{
                    'ref': up_feature_set_ref
                }]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            upper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(
                feature_set_name, len(feature_ids))

        lower_feature_content = ''
        for down_feature_set_ref in down_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2(
                {'objects': [{
                    'ref': down_feature_set_ref
                }]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(
                feature_set_name, len(feature_ids))

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<tr><td>Upper_FeatureSet</td></tr>',
                    upper_feature_content)

                report_template = report_template.replace(
                    '<tr><td>Lower_FeatureSet</td></tr>',
                    lower_feature_content)

                result_file.write(report_template)

        html_report.append({
            'path': result_file_path,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report'
        })
        return html_report

    def _process_diff_expression(self, diff_expression_set_ref,
                                 result_directory, condition_label_pair):
        """
        _process_diff_expression: process differential expression object info
        """

        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2(
            {'objects': [{
                'ref': diff_expression_set_ref
            }]})['data'][0]['data']

        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory,
                                             diff_expr_matrix_file_name)

        with open(diff_expr_matrix_file, 'w') as csvfile:
            fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

        for set_item in set_items:
            diff_expression_ref = set_item['ref']

            diff_expression_data = self.ws.get_objects2(
                {'objects': [{
                    'ref': diff_expression_ref
                }]})['data'][0]['data']

            label_string = set_item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_1 = label_list[0]
            condition_2 = label_list[1]

            if condition_1 in condition_label_pair and condition_2 in condition_label_pair:
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']
                selected_diff_expression_ref = diff_expression_ref

                with open(diff_expr_matrix_file, 'a') as csvfile:
                    row_ids = matrix_data.get('row_ids')
                    row_values = matrix_data.get('values')
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                    for pos, row_id in enumerate(row_ids):
                        row_value = row_values[pos]
                        writer.writerow({
                            'gene_id': row_id,
                            'log2_fold_change': row_value[0],
                            'p_value': row_value[1],
                            'q_value': row_value[2]
                        })

        return diff_expr_matrix_file, genome_id, selected_diff_expression_ref

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name,
                              feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;
        """

        log('start saving KBaseCollections.FeatureSet object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {
            'description': 'Generated FeatureSet from DifferentialExpression',
            'element_ordering': feature_ids,
            'elements': elements
        }

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': feature_set_data,
                'name': feature_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0],
                                                dfu_oi[4])

        return feature_set_obj_ref

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value,
                             comp_q_value, comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs
        """

        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)

            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change_cutoff = row['log2_fold_change']

                null_value = {'NA', 'null', ''}
                col_value = {row_p_value, row_q_value, row_fold_change_cutoff}

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition
                                            and q_value_condition
                                            and (float(row_fold_change_cutoff)
                                                 >= comp_fold_change_cutoff))

                    down_matches_condition = (p_value_condition
                                              and q_value_condition and
                                              (float(row_fold_change_cutoff) <=
                                               -comp_fold_change_cutoff))

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))
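    # Worked example of the cutoff logic above (hypothetical numbers): with
    # comp_p_value=0.05, comp_q_value=0.05 and comp_fold_change_cutoff=1.0,
    # a row with p_value=0.01, q_value=0.04 and log2_fold_change=2.5 lands in
    # up_feature_ids, the same row with log2_fold_change=-2.5 lands in
    # down_feature_ids, and any row containing 'NA', 'null' or '' in those
    # three columns is skipped entirely.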

    def _filter_expression_matrix(self,
                                  expression_matrix_ref,
                                  feature_ids,
                                  workspace_name,
                                  filtered_expression_matrix_suffix="",
                                  diff_expression_matrix_ref=None,
                                  filtered_expression_matrix_name=None):
        """
        _filter_expression_matrix: generated filtered expression matrix
        """

        log('start saving ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects(
            {'object_refs': [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if not filtered_expression_matrix_name:
            if re.match('.*_*[Ee]xpression_*[Mm]atrix',
                        expression_matrix_name):
                filtered_expression_matrix_name = re.sub(
                    '_*[Ee]xpression_*[Mm]atrix',
                    filtered_expression_matrix_suffix, expression_matrix_name)
            else:
                filtered_expression_matrix_name = expression_matrix_name + \
                                                  filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values
        filtered_expression_matrix_data['data'] = filtered_data

        expression_obj = {
            'type': expression_matrix_info[2],
            'data': filtered_expression_matrix_data,
            'name': filtered_expression_matrix_name
        }
        # we now save the filtering DEM in a EM field added for this purpose
        if diff_expression_matrix_ref:
            expression_obj['data'][
                'diff_expr_matrix_ref'] = diff_expression_matrix_ref
            expression_obj['extra_provenance_input_refs'] = [
                diff_expression_matrix_ref
            ]

        save_object_params = {'id': workspace_id, 'objects': [expression_obj]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = "{}/{}/{}".format(
            dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return filtered_expression_matrix_ref

    def _xor(self, a, b):
        return bool(a) != bool(b)

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:

            try:
                label_string = condition_pair['label_string'][0].strip()
                label_list = [x.strip() for x in label_string.split(',')]
                first_label = label_list[0]
                second_label = label_list[1]
            except IndexError:
                raise IndexError('No selected values for Partial Condition')

            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(
                    first_label)
                error_msg += 'Available conditions: {}'.format(
                    available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(
                    second_label)
                error_msg += 'Available conditions: {}'.format(
                    available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2(
            {'objects': [{
                'ref': diff_expression_set_ref
            }]})['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels |= set(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels

    def _get_feature_ids(self, genome_ref, ids):
        """
        _get_feature_ids: get feature ids from genome
        """

        genome_features = self.gsu.search({
            'ref': genome_ref,
            'limit': len(ids),
            'structured_query': {
                "$or": [{
                    "feature_id": x
                } for x in ids]
            },
            'sort_by': [['feature_id', True]]
        })['features']

        features_ids = set(
            (feature.get('feature_id') for feature in genome_features))

        return features_ids

    def _build_fs_obj(self, params):
        new_feature_set = {
            'description': '',
            'element_ordering': [],
            'elements': {}
        }
        genome_ref = params['genome']
        if params.get('base_feature_sets',
                      []) and None not in params['base_feature_sets']:
            base_feature_sets = self.dfu.get_objects(
                {'object_refs': params['base_feature_sets']})['data']
            for ret in base_feature_sets:
                base_set = ret['data']
                base_set_name = ret['info'][1]

                new_feature_set['element_ordering'] += [
                    x for x in base_set['element_ordering']
                    if x not in new_feature_set['elements']
                ]
                for element, genome_refs in base_set['elements'].items():
                    if element in new_feature_set['elements']:
                        new_feature_set['elements'][element] += [
                            x for x in genome_refs
                            if x not in new_feature_set['elements'][element]
                        ]
                    else:
                        new_feature_set['elements'][element] = genome_refs
                new_feature_set[
                    'description'] += 'From FeatureSet {}: {}\n'.format(
                        base_set_name, base_set.get('description'))
        new_feature_ids = []
        if params.get('feature_ids'):
            if isinstance(params['feature_ids'], str):
                new_feature_ids += params['feature_ids'].split(',')
            else:
                new_feature_ids += params['feature_ids']
        if params.get('feature_ids_custom'):
            new_feature_ids += params['feature_ids_custom'].split(',')
        if new_feature_ids:
            genome_feature_ids = self._get_feature_ids(genome_ref,
                                                       new_feature_ids)
        for new_feature in new_feature_ids:
            if new_feature not in genome_feature_ids:
                raise ValueError(
                    'Feature ID {} does not exist in the supplied genome {}'.
                    format(new_feature, genome_ref))
            if new_feature in new_feature_set['elements']:
                if genome_ref not in new_feature_set['elements'][new_feature]:
                    new_feature_set['elements'][new_feature].append(genome_ref)
            else:
                new_feature_set['elements'][new_feature] = [genome_ref]
                new_feature_set['element_ordering'].append(new_feature)

        if params.get('description'):
            new_feature_set['description'] = params['description']

        return new_feature_set

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferetialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: one of ["linear", "log2+1", "log10+1"]
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated up-regulated FeatureSet object references
        down_feature_set_ref_list: list of generated down-regulated FeatureSet object references
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object refs
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3(
            {"objects": [{
                "ref": diff_expression_set_ref
            }]})['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs, available_condition_labels
         ) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs,
                                        available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [
                        x.strip() for x in label_string.split(',')
                    ]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '-'.join(reversed(condition_label_pair))
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression(
                diff_expression_set_ref, result_directory,
                condition_label_pair)
            up_feature_ids, down_feature_ids = self._process_matrix_file(
                diff_expr_matrix_file, params.get('p_cutoff'),
                params.get('q_cutoff'), params.get('fold_change_cutoff'))
            filtered_em_name = _sanitize_name(condition_string) + params.get(
                'filtered_expression_matrix_suffix')
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                    params.get('expression_matrix_ref'),
                    up_feature_ids + down_feature_ids,
                    params.get('workspace_name'), "", diff_expr_matrix_ref,
                    filtered_em_name)
                filtered_expression_matrix_ref_list.append(
                    filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix', "")
            up_feature_set_name = "{}_{}_up{}".format(
                diff_expression_set_name, _sanitize_name(condition_string),
                feature_set_suffix)
            up_feature_set_ref = self._generate_feature_set(
                up_feature_ids, genome_id, params.get('workspace_name'),
                up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = "{}_{}_down{}".format(
                diff_expression_set_name, _sanitize_name(condition_string),
                feature_set_suffix)
            down_feature_set_ref = self._generate_feature_set(
                down_feature_ids, genome_id, params.get('workspace_name'),
                down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {
            'result_directory': result_directory,
            'up_feature_set_ref_list': up_feature_set_ref_list,
            'down_feature_set_ref_list': down_feature_set_ref_list,
            'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list
        }

        report_output = self._generate_report(
            up_feature_set_ref_list, down_feature_set_ref_list,
            filtered_expression_matrix_ref_list, params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal

    def filter_matrix_with_fs(self, params):
        self.validate_params(
            params,
            ('feature_set_ref', 'workspace_name', 'expression_matrix_ref',
             'filtered_expression_matrix_suffix'))
        ret = self.dfu.get_objects(
            {'object_refs': [params['feature_set_ref']]})['data'][0]
        feature_set = ret['data']
        feature_set_name = ret['info'][1]
        feature_ids = set(feature_set['elements'].keys())
        filtered_matrix_ref = self._filter_expression_matrix(
            params['expression_matrix_ref'], feature_ids,
            params['workspace_name'],
            params['filtered_expression_matrix_suffix'])

        objects_created = [{
            'ref': filtered_matrix_ref,
            'description': 'Filtered ExpressionMatrix Object'
        }]
        message = "Filtered Expression Matrix based of the {} feature ids present in {}" \
            .format(len(feature_ids), feature_set_name)

        report_params = {
            'message': message,
            'workspace_name': params['workspace_name'],
            'objects_created': objects_created,
            'report_object_name':
            'kb_FeatureSetUtils_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'filtered_expression_matrix_ref': filtered_matrix_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def build_feature_set(self, params):
        self.validate_params(params, {
            'output_feature_set',
            'workspace_name',
        }, {
            'genome', 'feature_ids', 'feature_ids_custom', 'base_feature_sets',
            'description'
        })
        feature_sources = ('feature_ids', 'feature_ids_custom',
                           'base_feature_sets')
        if not any([params.get(x) for x in feature_sources]):
            raise ValueError(
                "You must supply at least one feature source: {}".format(
                    ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id': workspace_id,
            'objects': [{
                'type': 'KBaseCollections.FeatureSet',
                'data': new_feature_set,
                'name': params['output_feature_set']
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0],
                                                dfu_oi[4])

        objects_created = [{
            'ref': feature_set_obj_ref,
            'description': 'Feature Set'
        }]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {
            'message': message,
            'workspace_name': params['workspace_name'],
            'objects_created': objects_created,
            'report_object_name':
            'kb_FeatureSetUtils_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'feature_set_ref': feature_set_obj_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }
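A hedged usage sketch (not part of the original module). The enclosing class name is not
visible in this excerpt, so FeatureSetBuilder below is a placeholder, and every config
value, workspace name, and reference is hypothetical; the keys mirror those read by
__init__ and build_feature_set above.

config = {
    'workspace-url': 'https://kbase.us/services/ws',
    'SDK_CALLBACK_URL': 'http://localhost:9999',
    'KB_AUTH_TOKEN': 'my-auth-token',
    'shock-url': 'https://kbase.us/services/shock-api',
    'scratch': '/kb/module/work/tmp'
}
fsu = FeatureSetBuilder(config)
result = fsu.build_feature_set({
    'workspace_name': 'my_workspace',
    'output_feature_set': 'my_feature_set',
    'genome': '12345/2/1',
    'feature_ids': 'gene_1,gene_2,gene_3',
    'description': 'genes of interest'
})
print(result['feature_set_ref'], result['report_ref'])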
Exemplo n.º 17
0
class kb_GATK:
    '''
    Module Name:
    kb_GATK

    Module Description:
    A KBase module: kb_GATK
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/kb_GATK.git"
    GIT_COMMIT_HASH = "5e6e4bdca9a7749bba0abab081736c56007212ed"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.gu = GATKUtils()
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.vu = VariationUtil(self.callback_url)
        self.du = DownloadAlignmentUtils(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def run_kb_GATK(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_GATK
        source_ref = params['alignment_ref']
        alignment_out = self.du.downloadreadalignment(source_ref, params,
                                                      self.callback_url)
        sam_file = os.path.join(alignment_out['destination_dir'],
                                "reads_alignment.sam")
        '''
        # TODO: read the sample set and sample strain information
        '''
        '''
        command.extend(["-filter-name", "\"QD_filter\"", "-filter", "\"QD", "<", params['snp_filter']['snp_qd_filter'] + "\""])
        command.extend(["-filter-name", "\"FS_filter\"", "-filter", "\"FS", "<", params['snp_filter']['snp_fs_filter'] + "\""])
        command.extend(["-filter-name", "\"MQ_filter\"", "-filter", "\"MQ", "<", params['snp_filter']['snp_mq_filter'] + "\""])
        command.extend(["-filter-name", "\"SOR_filter\"", "-filter", "\"SOR", "<", params['snp_filter']['snp_sor_filter'] + "\""])
        command.extend(["-filter-name", "\"MQRankSum_filter\"", "-filter", "\"MQRankSum", "<", params['snp_filter']['snp_mqrankSum_filter'] + "\""])
        command.extend(["-filter-name", "\"ReadPosRankSum_filter\"", "-filter", "\"ReadPosRankSum", "<", params['snp_filter']['snp_readposranksum_filter'] + "\""])
        '''
        print(params)
        strain_info = params['strain_info']
        output_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(output_dir)

        genome_or_assembly_ref = params['assembly_or_genome_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{
                'ref': genome_or_assembly_ref
            }]})['infos'][0][2]
        if ('KBaseGenomes.Genome' in obj_type):
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type +
                             ' is not the right input for this method. ' +
                             'Valid inputs include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly.')

        assembly_file = self.du.download_genome(assembly_ref,
                                                output_dir)['path']

        #output_dir = output_dir + "/"

        # TODO: check the time needed to build the index file, or download it from the cache.
        # TODO: discuss which cache_id should be used.
        # TODO: for a copied genome, find the original genome (ref id) to get the original cache id.

        self.gu.build_genome(assembly_file)
        self.gu.index_assembly(assembly_file)
        self.gu.generate_sequence_dictionary(assembly_file)
        self.gu.duplicate_marking(output_dir, sam_file)
        #self.gu.sort_bam_index(output_dir)
        self.gu.collect_alignment_and_insert_size_metrics(
            assembly_file, output_dir)
        #self.gu.analyze_covariates(output_dir)

        # TODO: avoid writing intermediate files to save space and I/O time.
        self.gu.variant_calling(assembly_file, output_dir)
        self.gu.extract_variants(assembly_file, output_dir)
        self.gu.filter_SNPs(assembly_file, "filtered_snps.vcf", output_dir,
                            params)
        self.gu.filter_Indels(assembly_file, "filtered_indels.vcf", output_dir,
                              params)
        self.gu.exclude_filtered_variants(output_dir)
        self.gu.base_quality_score_recalibration(assembly_file,
                                                 "recal_data.table",
                                                 output_dir)
        self.gu.apply_BQSR(assembly_file, "recal_data.table", output_dir)
        self.gu.base_quality_score_recalibration(assembly_file,
                                                 "post_recal_data.table",
                                                 output_dir)
        self.gu.apply_BQSR(assembly_file, "post_recal_data.table", output_dir)
        self.gu.filter_SNPs(assembly_file, "filtered_snps_final.vcf",
                            output_dir, params)

        # TODO: also save indels using VariationUtils, or merge them with the SNPs, sort by chromosome and position, and save with VariationUtils.
        # TODO: get an example of saving structural variants (especially CNVs) and compare with standard VCF output.

        self.gu.filter_Indels(assembly_file, "filtered_indels_final.vcf",
                              output_dir, params)
        '''
        os.system("grep   '##fileformat' " + output_dir + "/filtered_snps_final.vcf > " + output_dir + "/sample.vcf")
        cmd = "grep -v  '##' " + output_dir + "/filtered_snps_final.vcf >> " + output_dir + "/sample.vcf"
        os.system(cmd)            # TODO : need to remove system command after fixing variationUtils.
        '''

        vcf_filepath = self.gu.index_vcf_file(output_dir +
                                              "/filtered_snps_final.vcf")
        reheader_vcf_file = self.gu.reheader(vcf_filepath, strain_info)
        # TODO: check the existence of the final filtered SNPs file.
        # TODO: change assembly_or_genome_ref to genome_or_assembly_ref

        # TODO: derive sample_attribute_name from the sample set ref by prefixing/suffixing. The attribute mapping should have one sample.

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['assembly_or_genome_ref'],
            'sample_set_ref': params['input_sample_set'],
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': reheader_vcf_file,
            'variation_object_name': params['variation_object_name']
        }

        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_GATK

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_GATK return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
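A hedged invocation sketch (not part of the original repo): the parameter keys mirror
those read inside run_kb_GATK above, while every reference, name, and value is
hypothetical, and the snp_filter cutoff key follows the commented filter commands shown
in the method body.

config = {'scratch': '/kb/module/work/tmp',
          'workspace-url': 'https://kbase.us/services/ws'}
impl = kb_GATK(config)
ctx = {}  # context object normally supplied by the KBase SDK server
params = {
    'workspace_name': 'my_workspace',
    'alignment_ref': '12345/3/1',
    'assembly_or_genome_ref': '12345/2/1',
    'input_sample_set': '12345/4/1',
    'strain_info': 'strain_A',
    'variation_object_name': 'my_variation',
    'snp_filter': {'snp_qd_filter': '2.0'}  # plus the other cutoffs listed above
}
output = impl.run_kb_GATK(ctx, params)[0]
print(output['report_name'], output['report_ref'])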
Exemplo n.º 18
0
class masurca_utils:
    """
    masurca_utils: defining a system of utils for running masurca
    """
    MaSuRCA_VERSION = 'MaSuRCA-3.2.9'
    MaSuRCA_BIN = '/kb/module/' + MaSuRCA_VERSION + '/bin/masurca'
    PARAM_IN_WS = 'workspace_name'
    PARAM_IN_THREADN = 'num_threads'
    PARAM_IN_READS_LIBS = 'reads_libraries'
    PARAM_IN_JUMP_LIBS = 'jump_libraries'
    PARAM_IN_JF_SIZE = 'jf_size'
    PARAM_IN_CS_NAME = 'output_contigset_name'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')

    def __init__(self, prj_dir, config):
        self.workspace_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        if 'shock-url' in config:
            self.shock_url = config['shock-url']
        if 'handle-service-url' in config:
            self.handle_url = config['handle-service-url']

        self.ws_client = Workspace(self.workspace_url, token=self.token)
        self.ru = ReadsUtils(self.callback_url, token=self.token)
        self.au = AssemblyUtil(self.callback_url, token=self.token)
        self.kbr = KBaseReport(self.callback_url)
        self.kbq = kb_quast(self.callback_url)
        self.proj_dir = prj_dir
        self.prog_runner = Program_Runner(self.MaSuRCA_BIN, self.proj_dir)

    def _has_long_reads(self, params):
        """
        _has_long_reads: check if a long reads input exists in the parameters
        """
        return (params.get('pacbio_reads', None)
                or params.get('nanopore_reads', None)
                or params.get('other_frg_file', None))

    def _get_data_portion(self,
                          pe_reads_data,
                          jp_reads_data=None,
                          pacbio_reads_file='',
                          nanopore_reads_file='',
                          other_frg_file=''):
        """
        _get_data_portion: build the 'DATA...END' portion for the config.txt file
        """
        data_str = ''
        if pe_reads_data:
            # log('PE reads data details:\n{}'.format(json.dumps(pe_reads_data, indent=1)))
            for pe in pe_reads_data:
                if data_str != '':
                    data_str += '\n'
                data_str += 'PE= ' + pe['pe_prefix'] + ' ' + str(pe['pe_mean']) + ' ' + \
                            str(pe['pe_stdev']) + ' ' + pe['fwd_file']
                if pe.get('rev_file', None):
                    data_str += ' ' + pe['rev_file']

        if jp_reads_data:
            # log('JUMP reads data details:\n{}'.format(json.dumps(jp_reads_data, indent=1)))
            for jp in jp_reads_data:
                if data_str != '':
                    data_str += '\n'
                data_str += 'JUMP= ' + jp['jp_prefix'] + ' ' + str(jp['jp_mean']) + ' ' + \
                            str(jp['jp_stdev']) + ' ' + jp['fwd_file']
                if jp.get('rev_file', None):
                    data_str += ' ' + jp['rev_file']

        # Adding the pacbio_reads
        # Note that PacBio reads must be in a single fasta file!
        # For example:
        # data_str +='\nPACBIO= /pool/genomics/frandsenp/masurca/PacBio/pacbio_reads.fasta'
        # ***if you have both types of reads supply them both as NANOPORE type***
        if pacbio_reads_file != '':
            if data_str != '':
                data_str += '\n'
            if nanopore_reads_file != '':
                data_str += 'NANOPORE=' + pacbio_reads_file
            else:
                data_str += 'PACBIO=' + pacbio_reads_file

        # Adding the nanopore_reads and note that nanopore reads must be in a single fasta file!
        # For example:
        # data_str +='\nNANOPORE= /pool/genomics/frandsenp/masurca/NanoPore/nanopore_reads.fasta'
        if nanopore_reads_file != '':
            if data_str != '':
                data_str += '\n'
            data_str += 'NANOPORE= ' + nanopore_reads_file

        # Adding the other_frg_file inputs if any
        # any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first converted into
        # Celera Assembler compatible .frg file
        # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg
        if other_frg_file != '':
            if data_str != '':
                data_str += '\n'
            data_str += 'OTHER=' + other_frg_file

        return data_str
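    # For illustration only (hypothetical paths and prefixes): with one paired-end
    # library and a nanopore file, the returned DATA portion would look like
    #   PE= pe 500 50 /path/to/fwd.fastq /path/to/rev.fastq
    #   NANOPORE= /path/to/nanopore_reads.fasta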

    def _get_parameters_portion(self, params):
        """
        build the 'PARAMETERS...END' portion for the config.txt file
        """
        # set the default parameters as suggested in the example configuration file
        param_str = (
            "EXTEND_JUMP_READS=0\nUSE_GRID=0\nGRID_QUEUE=all.q\nGRID_BATCH_SIZE"
            + "=300000000\nLHE_COVERAGE=25\nMEGA_READS_ONE_PASS=0")
        if (params.get('graph_kmer_size', None)
                and type(params['graph_kmer_size']) == int):
            if param_str != '':
                param_str += '\n'
            param_str += 'GRAPH_KMER_SIZE=' + str(params['graph_kmer_size'])
        else:
            if param_str != '':
                param_str += '\n'
            param_str += 'GRAPH_KMER_SIZE=auto'
        if params.get('use_linking_mates', None):
            if param_str != '':
                param_str += '\n'
            if params['use_linking_mates'] == 1 and not self._has_long_reads(
                    params):
                param_str += 'USE_LINKING_MATES=1'
            else:
                param_str += 'USE_LINKING_MATES=0'
        if params.get('limit_jump_coverage', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'LIMIT_JUMP_COVERAGE = ' + str(
                params['limit_jump_coverage'])
        if params.get('cgwErrorRate', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'CA_PARAMETERS = cgwErrorRate=' + str(
                params['cgwErrorRate'])
        if params.get(self.PARAM_IN_THREADN, None):
            if param_str != '':
                param_str += '\n'
            param_str += 'NUM_THREADS = ' + str(params[self.PARAM_IN_THREADN])
        if params.get('jf_size', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'JF_SIZE=' + str(params['jf_size'])
        if params.get('kmer_count_threshold', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'KMER_COUNT_THRESHOLD=' + str(
                params['kmer_count_threshold'])
        if params.get('do_homopolymer_trim', None):
            if param_str != '':
                param_str += '\n'
            if params['do_homopolymer_trim'] == 1:
                param_str += 'DO_HOMOPOLYMER_TRIM=1'
            else:
                param_str += 'DO_HOMOPOLYMER_TRIM=0'
        if params.get('close_gaps', None):
            if param_str != '':
                param_str += '\n'
            if params['close_gaps'] == 1:
                param_str += 'CLOSE_GAPS=1'
            else:
                param_str += 'CLOSE_GAPS=0'
        if params.get('soap_assembly', None):
            if param_str != '':
                param_str += '\n'
            if params['soap_assembly'] == 1:
                param_str += 'SOAP_ASSEMBLY=1'
            else:
                param_str += 'SOAP_ASSEMBLY=0'
        return param_str
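    # For illustration only: with graph_kmer_size=31 and num_threads=16 in params,
    # the returned PARAMETERS portion would contain, in addition to the defaults
    # set at the top of this method, lines such as
    #   GRAPH_KMER_SIZE=31
    #   NUM_THREADS = 16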

    def _replaceSectionText(self, orig_txt, begin_patn, end_patn, repl_txt):
        """
        replace a section of text of orig_txt between lines begin-patn and end-patn with repl_text
        examples of parameters:
            begin_patn1 = "DATA\n"
            begin_patn2 = "PARAMETERS\n"
            end_patn1 = "END\nPARAMETERS\n"
            end_patn2 = "END\n"
            repl_txt1 = ('PE= pe 500 50 /kb/module/work/testReads/small.forward.fq' +
                          ' /kb/module/work/testReads/small.reverse.fq\n')
            repl_txt2 = ('GRAPH_KMER_SIZE=auto\nUSE_LINKING_MATES=1\nLIMIT_JUMP_COVERAGE = 60\n' +
                          'CA_PARAMETERS = cgwErrorRate=0.15\nNUM_THREADS= 64\nJF_SIZE=100000000\n' +
                          'DO_HOMOPOLYMER_TRIM=0\n')
        """
        if repl_txt != '':
            # create regular expression pattern
            repl = re.compile(begin_patn + '.*?' + end_patn, re.DOTALL)
            repl_txt = begin_patn + repl_txt + '\n' + end_patn
            # replace the text between begin_patn and end_patn with repl_txt
            txt_replaced = repl.sub(repl_txt, orig_txt)
            # pprint(txt_replaced)
            return txt_replaced
        else:
            return orig_txt

    def _unique_prefix_check(self, pfix, refs):
        prefix_lookup = {}
        for ref in refs:
            pre = ref[pfix][0:2]
            if pre not in prefix_lookup:
                prefix_lookup[pre] = 1
            else:
                raise ValueError('The first two characters in \'' + ref[pfix] +
                                 '\' have already been used.')

    def _get_pereads_info(self, input_params):
        """
        _get_pereads_info: from a list of paired_readsParams structures, fetches the
        corresponding reads info using paired_readsParams[pe_id] and
        returns a list of reads data in the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'pe_prefix': the two-letter prefix for the reads library,
                'pe_mean': the average reads length for the reads library,
                'pe_stdev': the standard deviation for the reads library,
                'type': reads_type, #('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file, #only if paired end
        }
        """
        rds_params = copy.deepcopy(input_params)
        wsname = rds_params[self.PARAM_IN_WS]
        rds_refs = []
        rds_data = []

        # reads_libraries grouped params
        if rds_params.get(self.PARAM_IN_READS_LIBS, None):
            pe_reads_libs = rds_params[self.PARAM_IN_READS_LIBS]

            for pe_lib in pe_reads_libs:
                if pe_lib.get('pe_id', None):
                    rds_refs.append(pe_lib['pe_id'])
            rds_data = self._get_kbreads_info(wsname, rds_refs)

            for pe_lib in pe_reads_libs:
                i = 0
                for rds in rds_data:
                    i += 1
                    if 'pe_id' in pe_lib and pe_lib['pe_id'] == rds[
                            'reads_ref']:
                        if pe_lib.get('pe_prefix', None):
                            rds['pe_prefix'] = pe_lib['pe_prefix'][0]
                        else:
                            rds['pe_prefix'] = 'p'
                        rds['pe_prefix'] += str(i)
                        pe_lib['pe_prefix'] = rds['pe_prefix']

                        if pe_lib.get('pe_mean', None) is None:
                            pe_lib['pe_mean'] = 500
                        rds['pe_mean'] = pe_lib['pe_mean']

                        if pe_lib.get('pe_stdev', None) is None:
                            pe_lib['pe_stdev'] = 50
                        rds['pe_stdev'] = pe_lib['pe_stdev']

            self._unique_prefix_check('pe_prefix', pe_reads_libs)
        else:
            raise ValueError("Parameter {} is required.".format(
                self.PARAM_IN_READS_LIBS))
        return rds_data

    def _get_jpreads_info(self, input_params):
        """
        _get_jpreads_info: from a list of jump_readsParams structures, fetches the
        corresponding reads info using jump_readsParams[jp_id] and
        returns a list of reads data in the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'jp_prefix': the two-letter prefix for the reads library,
                'jp_mean': the average reads length for the reads library,
                'jp_stdev': the standard deviation for the reads library,
                'type': reads_type, #('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file, #only if paired end
        }
        """
        rds_params = copy.deepcopy(input_params)
        wsname = rds_params[self.PARAM_IN_WS]
        rds_refs = []
        rds_data = []

        # jump_libraries grouped params
        if rds_params.get(self.PARAM_IN_JUMP_LIBS, None):
            jp_reads_libs = rds_params[self.PARAM_IN_JUMP_LIBS]
            for jp_lib in jp_reads_libs:
                if jp_lib.get('jp_id', None):
                    rds_refs.append(jp_lib['jp_id'])
            rds_data = self._get_kbreads_info(wsname, rds_refs)

            for jp_lib in jp_reads_libs:
                i = 0
                for rds in rds_data:
                    i += 1
                    if 'jp_id' in jp_lib and jp_lib['jp_id'] == rds[
                            'reads_ref']:
                        if jp_lib.get('jp_prefix', None):
                            rds['jp_prefix'] = jp_lib['jp_prefix'][0]
                        else:
                            rds['jp_prefix'] = 's'
                        rds['jp_prefix'] += str(i)
                        jp_lib['jp_prefix'] = rds['jp_prefix']

                        if jp_lib.get('jp_mean', None) is None:
                            jp_lib['jp_mean'] = 3600
                        rds['jp_mean'] = jp_lib['jp_mean']

                        if jp_lib.get('jp_stdev', None) is None:
                            jp_lib['jp_stdev'] = 200
                        rds['jp_stdev'] = jp_lib['jp_stdev']

            self._unique_prefix_check('jp_prefix', jp_reads_libs)
        return rds_data

    def _get_kbreads_info(self, wsname, reads_refs):
        """
        _get_kbreads_info: from a set of given KBase reads refs, fetches the corresponding
        reads info as deinterleaved FASTQ files and returns a list of reads data in
        the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'type': reads_type, #('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file, #only if paired end
        }
        """
        obj_ids = []
        for r in reads_refs:
            if r:
                obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})

        if not obj_ids:
            return []

        ws_info = self.ws_client.get_object_info_new({'objects': obj_ids})
        reads_params = []

        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = self.ru.download_reads({
                'read_libraries': reads_params,
                'interleaved': 'false'
            })['files']
        except ServerError as se:
            log('logging stacktrace from dynamic client error')
            log(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.SingleEndLibrary ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'KBaseFile.SingleEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        # log('Downloaded reads data from KBase:\n' + pformat(reads))
        reads_data = []
        for ref in reads_refs:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
            seq_tech = reads[ref]['sequencing_tech']
            rds_info = {
                'fwd_file': f['fwd'],
                'reads_ref': ref,
                'type': f['type'],
                'seq_tech': seq_tech,
                'reads_name': reads_name
            }
            if f.get('rev', None) is not None:
                rds_info['rev_file'] = f['rev']
            reads_data.append(rds_info)

        return reads_data

    def _generate_output_file_list(self, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('start packing result files')

        output_files = list()

        output_directory = os.path.join(self.proj_dir, str(uuid.uuid4()))
        mkdir_p(output_directory)
        masurca_output = os.path.join(output_directory, 'masurca_output.zip')
        self._zip_folder(out_dir, masurca_output)

        output_files.append({
            'path': masurca_output,
            'name': os.path.basename(masurca_output),
            'label': os.path.basename(masurca_output),
            'description': 'Output file(s) generated by MaSuRCA'
        })

        return output_files

    def _zip_folder(self, folder_path, output_path):
        """
        _zip_folder: Zip the contents of an entire folder (with that folder included
        in the archive). Empty subfolders are not added by this implementation.
        """
        with zipfile.ZipFile(output_path,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    # print "Adding {} to archive.".format(absolute_path)
                    ziph.write(absolute_path, relative_path)

        print("{} created successfully.".format(output_path))
        # with zipfile.ZipFile(output_path, "r") as f:
        #    print 'Checking the zipped file......\n'
        #    for info in f.infolist():
        #        print info.filename, info.date_time, info.file_size, info.compress_size

    def _load_stats(self, input_file_name):
        log('Starting conversion of FASTA to KBaseGenomeAnnotations.Assembly')
        log('Building Object.')
        if not os.path.isfile(input_file_name):
            raise Exception('The input file name {0} is not a file!'.format(
                input_file_name))
        with open(input_file_name, 'r') as input_file_handle:
            contig_id = None
            sequence_len = 0
            fasta_dict = dict()
            first_header_found = False
            # Pattern for replacing white space
            pattern = re.compile(r'\s+')
            for current_line in input_file_handle:
                if current_line[0] == '>':
                    # found a header line
                    # Wrap up previous fasta sequence
                    if not first_header_found:
                        first_header_found = True
                    else:
                        fasta_dict[contig_id] = sequence_len
                        sequence_len = 0
                    fasta_header = current_line.replace('>', '').strip()
                    try:
                        contig_id = fasta_header.strip().split(' ', 1)[0]
                    except (IndexError, KeyError, ValueError):
                        contig_id = fasta_header.strip()
                else:
                    sequence_len += len(re.sub(pattern, '', current_line))
        # wrap up last fasta sequence
        if not first_header_found:
            raise Exception("There are no contigs in this file")
        else:
            fasta_dict[contig_id] = sequence_len
        return fasta_dict

    def _check_reference(self, ref):
        """
        Tests the given ref string to make sure it conforms to the expected
        object reference format. Returns True if it passes, False otherwise.
        """
        obj_ref_regex = re.compile(
            r"^(?P<wsid>\d+)\/(?P<objid>\d+)(\/(?P<ver>\d+))?$")
        ref_path = ref.strip().split(";")
        for step in ref_path:
            if not obj_ref_regex.match(step):
                return False
        return True
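    # For illustration only: '12345/6' and '12345/6/7;89/1/2' satisfy the pattern
    # above, while a name-based reference such as 'my_workspace/my_object' does not,
    # because only numeric wsid/objid(/ver) reference paths are accepted.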

    def _check_ref_type(self, ref, allowed_types):
        """
        Validates the object type of ref against the list of allowed types. If it passes, this
        returns True, otherwise False.
        Really, all this does is verify that at least one of the strings in allowed_types is
        a substring of the ref object type name.
        Ex1:
        ref = "KBaseGenomes.Genome-4.0"
        allowed_types = ["assembly", "KBaseFile.Assembly"]
        returns False
        Ex2:
        ref = "KBaseGenomes.Genome-4.0"
        allowed_types = ["assembly", "genome"]
        returns True
        """
        obj_type = self._get_object_type(ref).lower()
        for t in allowed_types:
            if t.lower() in obj_type:
                return True
        return False

    def _get_object_type(self, ref):
        """
        Fetches and returns the typed object name of ref from the given workspace url.
        If that object doesn't exist, or there's another Workspace error, this raises a
        RuntimeError exception.
        """
        info = self.ws_client.get_object_info3({'objects': [{'ref': ref}]})
        obj_info = info.get('infos', [[]])[0]
        if len(obj_info) == 0:
            raise RuntimeError(
                "An error occurred while fetching type info from the Workspace. "
                "No information returned for reference {}".format(ref))
        return obj_info[2]

    def _get_fasta_from_assembly(self, assembly_ref):
        """
        From an assembly or contigset, this uses a data file to build a FASTA file
        and return the path to it.
        """
        allowed_types = [
            'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly',
            'KBaseGenomes.ContigSet'
        ]
        if not self._check_ref_type(assembly_ref, allowed_types):
            raise ValueError(
                "The reference {} cannot be used to fetch a FASTA file".format(
                    assembly_ref))
        au = AssemblyUtil(self.callback_url)
        return au.get_assembly_as_fasta({'ref': assembly_ref})

    def generate_report(self, contig_file_name, params, out_dir, wsname):
        """
        generate_report: reporting results
        """
        log('Generating and saving report')

        contig_file_with_path = os.path.join(out_dir, contig_file_name)
        fasta_stats = self._load_stats(contig_file_with_path)
        lengths = [fasta_stats[contig_id] for contig_id in fasta_stats]

        assembly_ref = params[self.PARAM_IN_WS] + '/' + params[
            self.PARAM_IN_CS_NAME]

        report_text = ''
        report_text += 'MaSuRCA results saved to: ' + wsname + '/' + out_dir + '\n'
        report_text += 'Assembly saved to: ' + assembly_ref + '\n'
        report_text += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report_text += 'Avg Length: ' + str(
            sum(lengths) / float(len(lengths))) + ' bp.\n'

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report_text += 'Contig Length Distribution (# of contigs -- min to max ' + 'basepairs):\n'
        for c in range(bins):
            report_text += ('   ' + str(counts[c]) + '\t--\t' + str(edges[c]) +
                            ' to ' + str(edges[c + 1]) + ' bp\n')
        print('Running QUAST')
        quastret = self.kbq.run_QUAST({
            'files': [{
                'path': contig_file_with_path,
                'label': params[self.PARAM_IN_CS_NAME]
            }]
        })

        output_files = self._generate_output_file_list(out_dir)

        print('Saving report')
        report_output = self.kbr.create_extended_report({
            'message': report_text,
            'objects_created': [{
                'ref': assembly_ref,
                'description': 'Assembled contigs'
            }],
            'direct_html_link_index': 0,
            'file_links': output_files,
            'html_links': [{
                'shock_id': quastret['shock_id'],
                'name': 'report.html',
                'label': 'QUAST report'
            }],
            'report_object_name': 'kb_masurca_report_' + str(uuid.uuid4()),
            'workspace_name': params[self.PARAM_IN_WS]
        })
        report_name = report_output['name']
        report_ref = report_output['ref']
        return report_name, report_ref

    def validate_params(self, params):
        """
        validate_params: checks params passed to run_masurca_app method and set default values
        """
        # log('Start validating run_masurca_app parameters:\n{}'.format(
        # json.dumps(params, indent=1)))

        # check for mandatory parameters
        if params.get(self.PARAM_IN_WS, None) is None:
            raise ValueError(self.PARAM_IN_WS + ' parameter is mandatory')
        if self.PARAM_IN_THREADN not in params:
            raise ValueError(self.PARAM_IN_THREADN + ' parameter is mandatory')

        if params.get(self.PARAM_IN_JF_SIZE, None) is None:
            raise ValueError(self.PARAM_IN_JF_SIZE + ' parameter is mandatory')
        if params.get(self.PARAM_IN_READS_LIBS, None) is None:
            raise ValueError(self.PARAM_IN_READS_LIBS +
                             ' parameter is mandatory')
        if type(params[self.PARAM_IN_READS_LIBS]) != list:
            raise ValueError(self.PARAM_IN_READS_LIBS + ' must be a list')

        if params.get(self.PARAM_IN_CS_NAME, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(
                self.PARAM_IN_CS_NAME))
        if self.INVALID_WS_OBJ_NAME_RE.search(params[self.PARAM_IN_CS_NAME]):
            raise ValueError('Invalid workspace object name: {}.'.format(
                params[self.PARAM_IN_CS_NAME]))

        if 'dna_source' in params:
            dna_src = params.get('dna_source')
            if dna_src == 'bacteria':
                params['limit_jump_coverage'] = 60
                params['cgwErrorRate'] = 0.25
            else:
                params['limit_jump_coverage'] = 300
                params['cgwErrorRate'] = 0.15

        if params.get('create_report', None) is None:
            params['create_report'] = 0

        return params

    def construct_masurca_assembler_cfg(self, params):
        # STEP 1: get the working folder housing the config.txt file and the masurca results
        wsname = params[self.PARAM_IN_WS]
        config_file_path = os.path.join(self.proj_dir, 'config.txt')

        # STEP 2.1: retrieve the reads data from input parameter
        pe_reads_data = self._get_pereads_info(params)
        jp_reads_data = []
        if params.get(self.PARAM_IN_JUMP_LIBS, None):
            jp_reads_data = self._get_jpreads_info(params)
            if 'jp_mean' not in params or type(params['jp_mean']) != int:
                params['jp_mean'] = 3600
            if 'jp_stdev' not in params or type(params['jp_stdev']) != int:
                params['jp_stdev'] = 200

        # STEP 2.2: PACBIO reads must be in a single FASTA file and supplied as PACBIO=reads.fa;
        assbl_types = [
            'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly',
            'KBaseGenomes.ContigSet'
        ]
        reads_types = [
            'KBaseAssembly.SingleEndLibrary', 'KBaseFile.SingleEndLibrary',
            'KBaseAssembly.PairedEndLibrary', 'KBaseFile.PairedEndLibrary'
        ]
        pb_reads_file = ''
        if params.get('pacbio_reads', None):
            pb_ref = params['pacbio_reads']
            if self._check_ref_type(pb_ref, assbl_types):
                pb_reads_file = (self._get_fasta_from_assembly(pb_ref)).get(
                    'path', '')
            else:
                if self._check_ref_type(pb_ref, reads_types):
                    pb_rd = self._get_kbreads_info(wsname, [pb_ref])
                    pb_reads_file = pb_rd[0]['fwd_file']
                    if pb_rd[0].get('rev_file', None):
                        pb_reads_file += ' ' + pb_rd[0]['rev_file']

        # STEP 2.3: NANOPORE reads must be in a single FASTA/FASTQ file and supplied
        # as NANOPORE=reads.fa
        np_reads_file = ''
        if params.get('nanopore_reads', None):
            np_ref = params['nanopore_reads']
            if self._check_ref_type(np_ref, assbl_types):
                np_reads_file = (self._get_fasta_from_assembly(np_ref)).get(
                    'path', '')
            else:
                if self._check_ref_type(np_ref, reads_types):
                    np_rd = self._get_kbreads_info(wsname, [np_ref])
                    np_reads_file = np_rd[0]['fwd_file']
                    if np_rd[0].get('rev_file', None):
                        np_reads_file += ' ' + np_rd[0]['rev_file']

        # STEP 2.4: any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first
        # converted into Celera Assembler compatible .frg files
        # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg
        other_frg = ''
        if params.get('other_frg_file', None):
            other_frg = params['other_frg_file']

        # STEP 3: construct and save the config.txt file for running masurca
        try:
            # STEP 3.1: replace the 'DATA...END' portion of the config_template.txt file
            data_str = self._get_data_portion(pe_reads_data, jp_reads_data,
                                              pb_reads_file, np_reads_file,
                                              other_frg)
            if data_str == '':  # no reads libraries are specified, no further actions
                return ''

            config_template = ''
            with codecs.open(os.path.join(os.path.dirname(__file__),
                                          'config_template.txt'),
                             mode='r',
                             encoding='utf-8') as config_template_file:
                config_template = config_template_file.read()

            begin_patn1 = "DATA\n"
            end_patn1 = "END\nPARAMETERS\n"
            config_with_data = self._replaceSectionText(
                config_template, begin_patn1, end_patn1, data_str)
            # log("\n***After DATA section replacement:\n{}\nSaved at {}".format(
            #             config_with_data.encode('utf-8').decode('utf-8'), config_file_path))

            with codecs.open(config_file_path, mode='w',
                             encoding='utf-8') as config_file:
                config_file.write(config_with_data)

            # STEP 3.2: replace the 'PARAMETERS...END' portion of the config_file file saved above
            param_str = self._get_parameters_portion(params)
            if param_str == '':  # no parameters are specified, no further actions
                return ''

            previous_config = ''
            with codecs.open(config_file_path, mode='r',
                             encoding='utf-8') as previous_config_file:
                previous_config = previous_config_file.read()

            begin_patn2 = "PARAMETERS\n"
            end_patn2 = "END\n"
            final_config = self._replaceSectionText(previous_config,
                                                    begin_patn2, end_patn2,
                                                    param_str)
            log("\n***Configuration file content:\n{}\nSaved at {}".format(
                final_config.encode('utf-8').decode('utf-8'),
                config_file_path))

            with codecs.open(config_file_path, mode='w',
                             encoding='utf-8') as config_file:
                config_file.write(final_config)
        except IOError as ioerr:
            log('Creation of the config.txt file raised error:\n')
            pprint(ioerr)
            return ''
        else:
            return config_file_path

    def generate_assemble_script(self, config_file):
        if os.path.isfile(config_file):
            f_dir, f_nm = os.path.split(config_file)
            m_cmd = [self.MaSuRCA_BIN]
            m_cmd.append(config_file)
            try:
                self.prog_runner.run(m_cmd, f_dir)
                assemble_file = os.path.join(f_dir, 'assemble.sh')
                log('Created the assemble.sh file at {}.\n'.format(
                    assemble_file))
                return assemble_file
            except ValueError as ve:
                log('Error generating assemble.sh file: \n{}'.format(ve))
                raise ValueError('Failed to generate assemble.sh file!')
        else:
            log("The config file {} is not found.\n".format(config_file))
            log('NO assemble.sh file created.\n')
        return ''

    def run_assemble(self, asmbl_file):
        exit_code = 1
        if os.path.isfile(asmbl_file):
            log("The assemble.sh file exists at {}\n".format(asmbl_file))
            f_dir, f_nm = os.path.split(asmbl_file)
            a_cmd = ['/bin/bash']
            a_cmd.append(asmbl_file)
            log("The working directory is {}\n".format(f_dir))
            log("The assembling command is {}\n".format(' '.join(a_cmd)))
            try:
                exit_code = self.prog_runner.run(a_cmd, f_dir)
            except ValueError as ve:
                log('Error running assemble: \n{}'.format(ve))
        else:
            log("The assemble.sh file {} is not found.".format(asmbl_file))
        return exit_code

    def save_assembly(self, contig_fa, wsname, a_name):
        if os.path.isfile(contig_fa):
            log('Uploading FASTA file to Assembly...')
            self.au.save_assembly_from_fasta({
                'file': {
                    'path': contig_fa
                },
                'workspace_name': wsname,
                'assembly_name': a_name
            })
        else:
            log("The contig file {} is not found.".format(contig_fa))
Exemplo n.º 19
0
class AMAUtils():
    def __init__(self, ws_url, cb_url, token, scratch):
        self.ws = Workspace(ws_url, token=token)
        self.cb_url = cb_url
        self.token = token
        self.scratch = scratch

    def _confirm_ws_type(self, ref):
        """confirm whether 'ref' is of type 'KBaseMetagenomes.AnnotatedMetagenomeAssembly
        if not, throw error. """
        if ref is None:
            raise ValueError(" 'ref' argument must be specified.")
        obj_info = self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
        # check object type is 'KBaseMetagenomes.AnnotatedMetagenomeAssembly'
        obj_type = obj_info[2]
        if 'KBaseMetagenomes.AnnotatedMetagenomeAssembly' not in obj_type:
            raise ValueError(
                f"input ref '{ref}' is of type {obj_type}. function "
                "'get_annotated_metagenome_assembly' requires objects"
                " of type KBaseMetagenomes.AnnotatedMetagenomeAssembly")

    def get_annotated_metagenome_assembly(self, params):
        """
        params:
            ref - workspace reference
            included_fields - list of fields to include, defaults to list below if not specified.
        output
            genomes - contains the returned data fields from the workspace request.

        """
        ref = params.get('ref', None)
        included_fields = params.get('included_fields', None)
        self._confirm_ws_type(ref)

        get_obj_params = {'ref': ref}
        if included_fields is not None:
            get_obj_params['included'] = included_fields

        data = self.ws.get_objects2({'objects': [get_obj_params]})['data']

        return {'genomes': data}

    def get_annotated_metagenome_assembly_features(self, params):
        """
        params: 
            ref - workspace reference for KBaseMetagenomes.AnnotatedMetagenomeAssembly object
        output:
            features - list of features, each representing a dict.
        """
        ref = params['ref']
        self._confirm_ws_type(ref)
        ret = self.ws.get_objects2(
            {"objects": [{
                "ref": ref,
                "included": ["features_handle_ref"]
            }]})['data']
        features_handle_ref = ret[0]['data']['features_handle_ref']
        dfu = DataFileUtil(self.cb_url, token=self.token)
        file_name = 'features.json.gz'
        file_path = os.path.join(self.scratch, file_name)
        shock_ret = dfu.shock_to_file({
            'handle_id': features_handle_ref,
            'file_path': file_path,
            'unpack': "uncompress"
        })
        file_path = shock_ret['file_path']

        with open(file_path) as fd:
            json_features = json.load(fd)

        if params.get('feature_type'):
            accepted_feature_types = [
                "cds", "gene", "mrna", "trna", "rrna", "repeat_region"
            ]
            feat_type = params['feature_type']
            if feat_type.lower() not in accepted_feature_types:
                raise ValueError(
                    f"{feat_type} is not an accepted feature type; accepted feature"
                    f" types (in lower case) are {accepted_feature_types}")
            json_features = [
                feature for feature in json_features
                if feature['type'].lower() == feat_type.lower()
            ]

        if params.get('only_ids'):
            json_features = [{
                'id': feature['id']
            } for feature in json_features]

        return {'features': json_features}
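A hedged usage sketch (not part of the original class): the constructor arguments mirror
those consumed by __init__ above, and the URLs, token, and reference are hypothetical.

ama = AMAUtils('https://kbase.us/services/ws', 'http://localhost:9999',
               'my-auth-token', '/kb/module/work/tmp')
features = ama.get_annotated_metagenome_assembly_features({
    'ref': '12345/8/1',
    'feature_type': 'cds',
    'only_ids': 1
})
print(len(features['features']))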
Exemplo n.º 20
0
    def save_variation_from_vcf(self, ctx, params):
        """
        Save a variation (and trait?) object to KBase given a reference genome, object output name,
        Variant Call Format (VCF) file, and sample attribute file.
        :param params: instance of type "save_variation_input" (## funcdef
           save_variation_from_vcf ## required input params:
           genome_or_assembly_ref: KBaseGenomes.Genome or
           KBaseGenomeAnnotations.Assembly object reference *** variation
           input data *** vcf_staging_file_path: path to location data
           associated with samples variation_object_name: output name for
           KBase variation object *** sample input data ***
           sample_attribute_ref: x/y/z reference to kbase sample attribute
           optional params: NA output report: report_name report_ref HTML
           visualization: Manhattan plot *** Visualization *** plot_maf:
           generate histogram of minor allele frequencies plot_hwe: generate
           histogram of Hardy-Weinberg Equilibrium p-values) -> structure:
           parameter "workspace_name" of String, parameter
           "genome_or_assembly_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "vcf_staging_file_path" of type "filepath"
           (KBase file path to staging files), parameter
           "variation_object_name" of String, parameter
           "sample_attribute_ref" of type "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "save_variation_output" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: report
        #BEGIN save_variation_from_vcf
        genome_or_assembly_ref = params['genome_or_assembly_ref']
        ws_url = self.config['workspace-url']
        wsc = Workspace(ws_url)
        obj_type = wsc.get_object_info3(
            {'objects': [{
                'ref': genome_or_assembly_ref
            }]})['infos'][0][2]
        if ('KBaseGenomes.Genome' in obj_type):
            params['genome_ref'] = genome_or_assembly_ref
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            params['assembly_ref'] = genome_or_assembly_ref
        else:
            raise ValueError(
                obj_type +
                ' is not a valid input for this method. Valid inputs are KBaseGenomes.Genome and KBaseGenomeAnnotations.Assembly.'
            )

        vtv = VCFToVariation(self.config, self.shared_folder,
                             self.callback_url)

        var_obj = vtv.import_vcf(params)
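        # object_info tuple indices: [6] = wsid, [0] = objid, [4] = version; build the ref as "wsid/objid/version"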
        var_obj_ref = str(var_obj[0][6]) + "/" + str(
            var_obj[0][0]) + "/" + str(var_obj[0][4])

        upload_message = "Variation object created."
        upload_message += "\nObject #" + str(var_obj[0][0])
        upload_message += "\nObject name: " + str(var_obj[0][1])
        upload_message += "\nGenotypes in variation: " + str(
            var_obj[1]['numgenotypes'])
        upload_message += "\nVariants in VCF file: " + str(
            var_obj[1]['numvariants'])

        report_obj = {
            'objects_created': [{
                'ref':
                var_obj_ref,
                'description':
                'Variation object from VCF file.'
            }],
            'text_message':
            upload_message
        }

        report_client = KBaseReport(self.callback_url)
        report_create = report_client.create({
            'report':
            report_obj,
            'workspace_name':
            params['workspace_name']
        })

        report = {
            "report_name": report_create['name'],
            "report_ref": report_create['ref'],
            "workspace_name": params["workspace_name"]
        }

        #END save_variation_from_vcf

        # At some point might do deeper type checking...
        if not isinstance(report, dict):
            raise ValueError('Method save_variation_from_vcf return value ' +
                             'report is not type dict as required.')
        # return the results
        return [report]
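
# Illustrative, self-contained sketch (not part of the module above) of the
# type-dispatch step used in save_variation_from_vcf: route a reference to the
# right params key based on the object's type string. All names are hypothetical.
def _route_genome_or_assembly(obj_type, ref):
    if 'KBaseGenomes.Genome' in obj_type:
        return {'genome_ref': ref}
    if 'KBaseGenomeAnnotations.Assembly' in obj_type:
        return {'assembly_ref': ref}
    raise ValueError(obj_type + ' is not a valid input; expected '
                     'KBaseGenomes.Genome or KBaseGenomeAnnotations.Assembly')

# e.g. _route_genome_or_assembly('KBaseGenomes.Genome-17.0', '123/4/5')
#      -> {'genome_ref': '123/4/5'}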
Exemplo n.º 21
0
class StringTieUtil:
    STRINGTIE_TOOLKIT_PATH = "/kb/deployment/bin/StringTie"
    GFFREAD_TOOLKIT_PATH = "/kb/deployment/bin/gffread"
    GFFCOMPARE_TOOLKIT_PATH = "/kb/deployment/bin/gffcompare"

    OPTIONS_MAP = {
        "output_transcripts": "-o",
        "gene_abundances_file": "-A",
        "num_threads": "-p",
        "fr_firststrand": "--rf",
        "fr_secondstrand": "--fr",
        "cov_refs_file": "-C",
        "junction_base": "-a",
        "junction_coverage": "-j",
        "disable_trimming": "-t",
        "min_locus_gap_sep_value": "-g",
        "ballgown_mode": "-B",
        "skip_reads_with_no_ref": "-e",
        "maximum_fraction": "-M",
        "label": "-l",
        "gtf_file": "-G",
        "min_length": "-m",
        "min_read_coverage": "-c",
        "min_isoform_abundance": "-f",
    }

    BOOLEAN_OPTIONS = [
        "disable_trimming", "ballgown_mode", "skip_reads_with_no_ref"
    ]

    def _validate_run_stringtie_params(self, params):
        """
        _validate_run_stringtie_params:
                validates params passed to run_stringtie method
        """

        log("start validating run_stringtie params")

        # check for required parameters
        for p in [
                "alignment_object_ref",
                "workspace_name",
                "expression_suffix",
                "expression_set_suffix",
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _generate_command(self, params):
        """
        _generate_command: generate stringtie command
        """

        command = self.STRINGTIE_TOOLKIT_PATH + "/stringtie "

        for key, option in self.OPTIONS_MAP.items():
            option_value = params.get(key)
            if key in self.BOOLEAN_OPTIONS and option_value:
                option_value = " "
            if option_value:
                command += "{} {} ".format(option, option_value)

        command += "{} ".format(params.get("input_file"))

        log("generated stringtie command: {}".format(command))

        return command
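
    # For illustration (hypothetical values): params of
    #     {"num_threads": 4, "ballgown_mode": 1, "gtf_file": "ref.gtf", "input_file": "aln.bam"}
    # would yield roughly:
    #     /kb/deployment/bin/StringTie/stringtie -p 4 -B   -G ref.gtf aln.bam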

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log("start executing command:\n{}".format(command))

        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if exitCode == 0:
            log("Executed command:\n{}\n".format(command) +
                "Exit Code: {}\nOutput:\n{}".format(exitCode, output))
        else:
            error_msg = "Error running command:\n{}\n".format(command)
            error_msg += "Exit Code: {}\nOutput:\n{}".format(exitCode, output)
            raise ValueError(error_msg)

    def _run_gffread(self, gff_path, gtf_path):
        """
        _run_gffread: run gffread script

        ref: http://ccb.jhu.edu/software/stringtie/gff.shtml
        """

        log("converting gff to gtf")

        command = self.GFFREAD_TOOLKIT_PATH + "/gffread "
        command += "-E {0} -T -o {1}".format(gff_path, gtf_path)

        self._run_command(command)

    def _run_gffcompare(self, gff_path, gtf_path):
        """
        _run_gffcompare: run gffcompare script

        ref: http://ccb.jhu.edu/software/stringtie/gff.shtml
        """

        log("converting gff to gtf")
        output = os.path.dirname(gtf_path) + "/gffcmp"

        command = self.GFFCOMPARE_TOOLKIT_PATH + "/gffcompare "
        command += "-r {} -G -o {} {}".format(gff_path, output, gtf_path)

        self._run_command(command)

    def _get_input_file(self, alignment_ref):
        """
        _get_input_file: get the input SAM/BAM file from the Alignment object
        """

        log("getting bam file from alignment")

        bam_file_dir = self.rau.download_alignment(
            {"source_ref": alignment_ref})["destination_dir"]

        files = os.listdir(bam_file_dir)
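        # prefer a *_sorted.bam if present; otherwise fall back to any .bam whose name does not end in "sorted.bam"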
        bam_file_list = [
            file for file in files if re.match(r".*\_sorted\.bam", file)
        ]
        if not bam_file_list:
            bam_file_list = [
                file for file in files if re.match(r".*(?<!sorted)\.bam", file)
            ]

        if not bam_file_list:
            raise ValueError("Cannot find .bam file from alignment {}".format(
                alignment_ref))

        bam_file_name = bam_file_list[0]

        bam_file = os.path.join(bam_file_dir, bam_file_name)

        return bam_file

    def _get_gtf_file(self, alignment_ref, result_directory):
        """
        _get_gtf_file: get the reference annotation file (in GTF format)
        """

        alignment_data = self.ws.get_objects2(
            {"objects": [{
                "ref": alignment_ref
            }]})["data"][0]["data"]

        genome_ref = alignment_data.get("genome_id")

        # annotation_file = self._create_gtf_file(genome_ref, result_directory)
        annotation_file = self._create_gtf_annotation_from_genome(
            genome_ref, result_directory)

        gene_name_annotation_file = (annotation_file.split(".gtf")[0] +
                                     "_append_name.gtf")

        with open(gene_name_annotation_file, "w") as output_file:
            with open(annotation_file, "r") as input_file:
                for line in input_file:
                    if ('gene_id "' in line) and ('gene_name "' not in line):
                        line = line.replace("\n", "")
                        gene_id = line.split('gene_id "')[1].split('"')[0]
                        line += ' gene_name "{}";\n'.format(gene_id)
                        output_file.write(line)
                    else:
                        output_file.write(line)

        return gene_name_annotation_file

    def _create_gtf_annotation_from_genome(self, genome_ref, result_directory):
        """
        Create reference annotation file from genome
        """
        ref = self.ws.get_object_subset([{
            "ref":
            genome_ref,
            "included": ["contigset_ref", "assembly_ref"]
        }])
        contig_id = None
        if "contigset_ref" in ref[0]["data"]:
            contig_id = ref[0]["data"]["contigset_ref"]
        elif "assembly_ref" in ref[0]["data"]:
            contig_id = ref[0]["data"]["assembly_ref"]
        if contig_id is None:
            raise ValueError(
                "Genome at {0} does not have reference to the assembly object".
                format(genome_ref))
        log("using assembly/contigset reference: {}".format(contig_id))
        log("Generating GFF file from Genome")
        try:
            ret = self.au.get_assembly_as_fasta(
                {"ref": genome_ref + ";" + contig_id})
            fa_output_file = ret["path"]

            if os.path.dirname(fa_output_file) != result_directory:
                shutil.copy(fa_output_file, result_directory)

            # get the GFF
            ret = self.gfu.genome_to_gff({
                "genome_ref": genome_ref,
                "target_dir": result_directory
            })
            genome_gff_file = ret["file_path"]
            gtf_ext = ".gtf"

            if not genome_gff_file.endswith(gtf_ext):
                gtf_path = os.path.splitext(genome_gff_file)[0] + ".gtf"
                self._run_gffread(genome_gff_file, gtf_path)
            else:
                gtf_path = genome_gff_file

            log("gtf file : " + gtf_path)
        except Exception:
            raise ValueError(
                "Generating GTF file from Genome Annotation object Failed :  {}"
                .format("".join(traceback.format_exc())))

        return gtf_path

    def _create_gtf_file(self, genome_ref, result_directory):
        """
        _create_gtf_file: create reference annotation file from genome
        """

        log("start generating reference annotation file")

        genome_gtf_file = self.gfu.genome_to_gff({
            "genome_ref": genome_ref,
            "target_dir": result_directory,
            "is_gtf": True
        })["file_path"]

        return genome_gtf_file

    def _save_expression(
        self,
        result_directory,
        alignment_ref,
        workspace_name,
        expression_suffix,
        genome_ref="",
        transcripts=0,
    ):
        """
        _save_expression: save Expression object to workspace
        """

        log("start saving Expression object")

        alignment_data_object = self.ws.get_objects2(
            {"objects": [{
                "ref": alignment_ref
            }]})["data"][0]

        alignment_name = alignment_data_object["info"][1]
        if re.match(".*_*[Aa]lignment", alignment_name):
            expression_obj_name = re.sub("_*[Aa]lignment", expression_suffix,
                                         alignment_name)
        else:
            expression_obj_name = alignment_name + expression_suffix

        destination_ref = workspace_name + "/" + expression_obj_name
        upload_expression_params = {
            "destination_ref": destination_ref,
            "source_dir": result_directory,
            "alignment_ref": alignment_ref,
            "tool_used": "StringTie",
            "tool_version": "1.3.3",
            "genome_ref": genome_ref,
            "transcripts": transcripts,
        }

        expression_ref = self.eu.upload_expression(
            upload_expression_params)["obj_ref"]

        return expression_ref

    def _save_expression_set(
        self,
        alignment_expression_map,
        alignment_set_ref,
        workspace_name,
        expression_set_suffix,
        genome_ref=None,
    ):
        """
        _save_expression_set: save ExpressionSet object to workspace
        """

        log("start saving ExpressionSet object")

        items = []
        for alignment_expression in alignment_expression_map:
            items.append({
                "ref": alignment_expression.get("expression_obj_ref"),
                "label": alignment_expression.get("alignment_label"),
            })

        expression_set_data = {
            "description": "ExpressionSet using StringTie",
            "items": items,
        }

        alignment_set_data_object = self.ws.get_objects2(
            {"objects": [{
                "ref": alignment_set_ref
            }]})["data"][0]

        alignment_set_name = alignment_set_data_object["info"][1]
        if re.match(".*_*[Aa]lignment_*[Ss]et", alignment_set_name):
            expression_set_name = re.sub("_*[Aa]lignment_*[Ss]et",
                                         expression_set_suffix,
                                         alignment_set_name)
        else:
            expression_set_name = alignment_set_name + expression_set_suffix

        expression_set_save_params = {
            "data": expression_set_data,
            "workspace": workspace_name,
            "genome_ref": genome_ref,
            "output_object_name": expression_set_name,
        }

        save_result = self.set_client.save_expression_set_v1(
            expression_set_save_params)
        expression_set_ref = save_result["set_ref"]

        return expression_set_ref

    def _save_expression_matrix(self, expressionset_ref, workspace_name):
        """
        _save_expression_matrix: save FPKM and TPM ExpressionMatrix
        """

        log("start saving ExpressionMatrix object")

        expression_set_name = self.ws.get_object_info(
            [{
                "ref": expressionset_ref
            }], includeMetadata=None)[0][1]

        output_obj_name_prefix = re.sub("_*[Ee]xpression_*[Ss]et", "",
                                        expression_set_name)

        upload_expression_matrix_params = {
            "expressionset_ref": expressionset_ref,
            "output_obj_name": output_obj_name_prefix,
            "workspace_name": workspace_name,
        }

        expression_matrix_refs = self.eu.get_expressionMatrix(
            upload_expression_matrix_params)

        return expression_matrix_refs

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """

        log("start packing result files")

        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, "stringtie_result.zip")

        with zipfile.ZipFile(result_file,
                             "w",
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not file.endswith(".DS_Store"):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file),
                        )

        output_files.append({
            "path":
            result_file,
            "name":
            os.path.basename(result_file),
            "label":
            os.path.basename(result_file),
            "description":
            "File(s) generated by StringTie App",
        })

        result_dirs = os.listdir(result_directory)
        if "merge_result" in result_dirs:
            merge_file = os.path.join(result_directory, "merge_result",
                                      "stringtie_merge.gtf")
            output_files.append({
                "path":
                merge_file,
                "name":
                os.path.basename(merge_file),
                "label":
                os.path.basename(merge_file),
                "description":
                "merge file generated by StringTie App",
            })

        return output_files

    def _generate_merge_html_report(self, result_directory):
        """
        _generate_html_report: generate html summary report
        """

        log("start generating merge html report")
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, "report.html")

        result_dirs = os.listdir(result_directory)

        Overview_Content = ""
        Overview_Content += "<br/><table><tr><th>Generated Files</th>"
        Overview_Content += "<th></th></tr>"
        Overview_Content += "<tr><th>Directory</th><th>File Name</th></tr>"
        for result_dir in result_dirs:
            result_files = os.listdir(
                os.path.join(result_directory, result_dir))
            result_files.sort()
            first_file = True
            for file_name in result_files:
                if first_file:
                    Overview_Content += "<tr><td>{}</td>".format(result_dir)
                    Overview_Content += "<td>{}</td></tr>".format(file_name)
                    first_file = False
                else:
                    Overview_Content += "<tr><td>{}</td>".format("")
                    Overview_Content += "<td>{}</td></tr>".format(file_name)
        Overview_Content += "</table>"

        with open(result_file_path, "w") as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 "report_template.html"),
                    "r") as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    "<p>Overview_Content</p>", Overview_Content)
                result_file.write(report_template)

        html_report.append({
            "path":
            result_file_path,
            "name":
            os.path.basename(result_file_path),
            "label":
            os.path.basename(result_file_path),
            "description":
            "HTML summary report for StringTie App",
        })
        return html_report

    def _generate_html_report(self, result_directory, obj_ref):
        """
        _generate_html_report: generate html summary report
        """

        log("start generating html report")
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, "report.html")

        expression_object = self.ws.get_objects2(
            {"objects": [{
                "ref": obj_ref
            }]})["data"][0]
        expression_info = expression_object["info"]
        expression_data = expression_object["data"]

        expression_object_type = expression_info[2]
        Overview_Content = ""
        if re.match("KBaseRNASeq.RNASeqExpression-\d.\d",
                    expression_object_type):
            Overview_Content += "<br/><table><tr><th>Generated Expression Object</th>"
            Overview_Content += "<th></th></tr>"
            Overview_Content += "<tr><th>Expression Name</th><th>Condition</th></tr>"
            Overview_Content += "<tr><td>{} ({})</td>".format(
                expression_info[1], obj_ref)
            Overview_Content += "<td>{}</td></tr>".format(
                expression_data["condition"])
            Overview_Content += "</table>"
        elif re.match("KBaseSets.ExpressionSet-\d.\d", expression_object_type):
            Overview_Content += (
                "<br/><table><tr><th>Generated ExpressionSet Object</th></tr>")
            Overview_Content += "<tr><td>{} ({})".format(
                expression_info[1], obj_ref)
            Overview_Content += "</td></tr></table>"
            Overview_Content += "<p><br/></p>"
            Overview_Content += "<table><tr><th>Generated Expression Objects</th>"
            Overview_Content += "<th></th></tr>"
            Overview_Content += "<tr><th>Expression Name</th><th>Condition</th></tr>"
            for item in expression_data["items"]:
                item_expression_object = self.ws.get_objects2(
                    {"objects": [{
                        "ref": item["ref"]
                    }]})["data"][0]
                item_expression_info = item_expression_object["info"]
                item_expression_data = item_expression_object["data"]
                expression_name = item_expression_info[1]
                Overview_Content += "<tr><td>{} ({})</td>".format(
                    expression_name, item["ref"])
                Overview_Content += "<td>{}</td>".format(
                    item_expression_data["condition"])
                Overview_Content += "</tr>"
            Overview_Content += "</table>"
        with open(result_file_path, "w") as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 "report_template.html"),
                    "r") as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    "<p>Overview_Content</p>", Overview_Content)
                result_file.write(report_template)

        html_report.append({
            "path":
            result_file_path,
            "name":
            os.path.basename(result_file_path),
            "label":
            os.path.basename(result_file_path),
            "description":
            "HTML summary report for StringTie App",
        })
        return html_report

    def _generate_merge_report(self, workspace_name, result_directory):
        """
        _generate_merge_report: generate summary report
        """

        log("creating merge report")

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_merge_html_report(result_directory)

        report_params = {
            "message": "",
            "workspace_name": workspace_name,
            "file_links": output_files,
            "html_links": output_html_files,
            "direct_html_link_index": 0,
            "html_window_height": 366,
            "report_object_name": "kb_stringtie_report_" + str(uuid.uuid4()),
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            "report_name": output["name"],
            "report_ref": output["ref"]
        }

        return report_output

    def _generate_report(
        self,
        obj_ref,
        workspace_name,
        result_directory,
        exprMatrix_FPKM_ref=None,
        exprMatrix_TPM_ref=None,
        genome_ref=None,
    ):
        """
        _generate_report: generate summary report
        """

        log("creating report")

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_html_report(result_directory,
                                                       obj_ref)

        expression_object = self.ws.get_objects2(
            {"objects": [{
                "ref": obj_ref
            }]})["data"][0]
        expression_info = expression_object["info"]
        expression_data = expression_object["data"]
        objects_created = []

        expression_object_type = expression_info[2]
        if re.match("KBaseRNASeq.RNASeqExpression-\d+.\d+",
                    expression_object_type):
            objects_created.append({
                "ref":
                obj_ref,
                "description":
                "Expression generated by StringTie"
            })
        elif re.match("KBaseSets.ExpressionSet-\d+.\d+",
                      expression_object_type):
            objects_created.append({
                "ref":
                obj_ref,
                "description":
                "ExpressionSet generated by StringTie"
            })
            items = expression_data["items"]
            for item in items:
                objects_created.append({
                    "ref":
                    item["ref"],
                    "description":
                    "Expression generated by StringTie",
                })
            objects_created.append({
                "ref":
                exprMatrix_FPKM_ref,
                "description":
                "FPKM ExpressionMatrix generated by StringTie",
            })
            objects_created.append({
                "ref":
                exprMatrix_TPM_ref,
                "description":
                "TPM ExpressionMatrix generated by StringTie",
            })
        if genome_ref:
            objects_created.append({
                "ref":
                genome_ref,
                "description":
                "Genome containing novel transcripts generated "
                "by StringTie",
            })

        report_params = {
            "message": "",
            "workspace_name": workspace_name,
            "file_links": output_files,
            "objects_created": objects_created,
            "html_links": output_html_files,
            "direct_html_link_index": 0,
            "html_window_height": 366,
            "report_object_name": "kb_stringtie_report_" + str(uuid.uuid4()),
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            "report_name": output["name"],
            "report_ref": output["ref"]
        }

        return report_output

    def _process_alignment_object(self, params):
        """
        _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object
        """

        log("start processing RNASeqAlignment object\n")
        log("params:\n{}".format(json.dumps(params, indent=1)))
        alignment_ref = params.get("alignment_ref")

        alignment_object = self.ws.get_objects2(
            {"objects": [{
                "ref": alignment_ref
            }]})["data"][0]

        alignment_info = alignment_object["info"]
        alignment_data = alignment_object["data"]

        alignment_name = alignment_info[1]
        alignment_label = alignment_data["condition"]

        result_directory = os.path.join(
            self.scratch, alignment_name + "_" + str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        # input files
        if not params.get("gtf_file"):
            params["gtf_file"] = self._get_gtf_file(alignment_ref,
                                                    result_directory)
            if params.get("label"):
                if params["label"] in open(params["gtf_file"]).read():
                    raise ValueError(
                        "Provided prefix for transcripts matches an existing "
                        "feature ID. Please select a different label for "
                        "transcripts.")
        else:
            shutil.copy(params.get("gtf_file"), result_directory)
        params["input_file"] = self._get_input_file(alignment_ref)
        log("using {} as reference annotation file.".format(
            params.get("gtf_file")))

        # output files
        self.output_transcripts = "transcripts.gtf"
        params["output_transcripts"] = os.path.join(result_directory,
                                                    self.output_transcripts)

        self.gene_abundances_file = "genes.fpkm_tracking"
        params["gene_abundances_file"] = os.path.join(
            result_directory, self.gene_abundances_file)

        command = self._generate_command(params)
        self._run_command(command)

        if params.get("exchange_gene_ids"):
            exchange_gene_ids(result_directory)

        if "generate_ws_object" in params and not params.get(
                "generate_ws_object"):
            log("skip generating expression object")
            expression_obj_ref = ""
        else:
            expression_obj_ref = self._save_expression(
                result_directory,
                alignment_ref,
                params.get("workspace_name"),
                params["expression_suffix"],
                params.get("genome_ref"),
                params.get("novel_isoforms", 0),
            )

        returnVal = {
            "result_directory": result_directory,
            "expression_obj_ref": expression_obj_ref,
            "alignment_ref": alignment_ref,
            "annotation_file": params["gtf_file"],
            "alignment_label": alignment_label,
        }

        return returnVal

    def _process_alignment_set_object(self, params):
        """
        _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object
        """

        log("start processing AlignmentSet object\nparams:\n{}".format(
            json.dumps(params, indent=1)))
        alignment_set_ref = params.get("alignment_set_ref")

        alignment_set = self.set_client.get_reads_alignment_set_v1({
            "ref":
            alignment_set_ref,
            "include_item_info":
            0,
            "include_set_item_ref_paths":
            1,
        })
        # pull down the genome once so as to avoid duplicate effort
        if not params.get("gtf_file"):
            alignment_ref = alignment_set["data"]["items"][0]["ref_path"]
            params["gtf_file"] = self._get_gtf_file(alignment_ref,
                                                    self.scratch)
            if params.get("label"):
                if params["label"] in open(params["gtf_file"]).read():
                    raise ValueError(
                        "Provided prefix for transcripts matches an existing "
                        "feature ID. Please select a different label for "
                        "transcripts.")

        def wrapped_process_alignment_object(params):
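            # runs in a worker process: trap any exception and return it as data so a
            # single failed alignment does not kill the whole multiprocessing pool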
            try:
                returnVal = self._process_alignment_object(params)
            except Exception:
                log("caught exception in worker")
                exctype, value = sys.exc_info()[:2]
                returnVal = {"exception": "{}: {}".format(exctype, value)}

            return returnVal

        mul_processor_params = []
        for alignment in alignment_set["data"]["items"]:
            alignment_ref = alignment["ref_path"]
            alignment_upload_params = params.copy()
            alignment_upload_params["alignment_ref"] = alignment_ref
            mul_processor_params.append(alignment_upload_params)

        num_threads = params.get("num_threads") or multiprocessing.cpu_count()
        cpus = min(num_threads, multiprocessing.cpu_count())
        pool = Pool(ncpus=cpus)
        log("running _process_alignment_object with {} cpus".format(cpus))
        alignment_expression_map = pool.map(wrapped_process_alignment_object,
                                            mul_processor_params)

        for proc_alignment_return in alignment_expression_map:
            if "exception" in proc_alignment_return:
                error_msg = "Caught exception in worker\n"
                error_msg += "Exception: {}".format(
                    proc_alignment_return["exception"])
                raise ValueError(error_msg)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        for proc_alignment_return in alignment_expression_map:
            alignment_ref = proc_alignment_return.get("alignment_ref")
            alignment_info = self.ws.get_object_info3(
                {"objects": [{
                    "ref": alignment_ref
                }]})
            alignment_name = alignment_info["infos"][0][1]
            self._run_command("cp -R {} {}".format(
                proc_alignment_return.get("result_directory"),
                os.path.join(result_directory, alignment_name),
            ))
        if "generate_ws_object" in params and not params.get(
                "generate_ws_object"):
            log("skip generating expression set object")
            expression_obj_ref = ""
            expression_matrix_refs = {}
        else:
            expression_obj_ref = self._save_expression_set(
                alignment_expression_map,
                alignment_set_ref,
                params.get("workspace_name"),
                params["expression_set_suffix"],
                params.get("genome_ref"),
            )
            expression_matrix_refs = self._save_expression_matrix(
                expression_obj_ref, params.get("workspace_name"))

        annotation_file_name = os.path.basename(
            alignment_expression_map[0]["annotation_file"])
        annotation_file_path = os.path.join(result_directory,
                                            os.listdir(result_directory)[0],
                                            annotation_file_name)

        returnVal = {
            "result_directory":
            result_directory,
            "expression_obj_ref":
            expression_obj_ref,
            "annotation_file":
            annotation_file_path,
            "exprMatrix_FPKM_ref":
            expression_matrix_refs.get("exprMatrix_FPKM_ref"),
            "exprMatrix_TPM_ref":
            expression_matrix_refs.get("exprMatrix_TPM_ref"),
        }

        return returnVal

    def _run_merge_option(self, result_directory, params, annotation_file):

        log("start running stringtie merge")

        result_dirs = os.listdir(result_directory)

        merge_directory = os.path.join(result_directory, "merge_result")
        self._mkdir_p(merge_directory)

        option_params = params.copy()

        option_params.pop("num_threads", None)
        option_params.pop("ballgown_mode", None)
        option_params.pop("skip_reads_with_no_ref", None)
        option_params.pop("junction_coverage", None)
        option_params.pop("junction_base", None)
        option_params.pop("min_read_coverage", None)
        option_params.pop("min_locus_gap_sep_value", None)

        output_merge = "stringtie_merge.gtf"
        option_params["output_transcripts"] = os.path.join(
            merge_directory, output_merge)

        command = self.STRINGTIE_TOOLKIT_PATH + "/stringtie "
        command += "--merge "
        command += "-G {} ".format(annotation_file)

        for key, option in self.OPTIONS_MAP.items():
            option_value = option_params.get(key)
            if key in self.BOOLEAN_OPTIONS and option_value:
                option_value = " "
            if option_value:
                command += "{} {} ".format(option, option_value)

        for result_dir in result_dirs:
            gtf_file = os.path.join(result_directory, result_dir,
                                    "transcripts.gtf")
            command += "{} ".format(gtf_file)

        self._run_command(command)

    def _get_genome_ref(self, alignment_set_ref):
        """Get a genome ref from an alignment set"""
        alignment_set_data = self.dfu.get_objects(
            {"object_refs": [alignment_set_ref]})["data"][0]["data"]

        for alignment in alignment_set_data["items"]:
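            # alignments in a set are assumed to share one genome; return the first item's genome_id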
            alignment_data = self.dfu.get_objects(
                {"object_refs": [alignment["ref"]]})["data"][0]["data"]
            return alignment_data["genome_id"]

    def _save_genome_with_novel_isoforms(self,
                                         workspace,
                                         genome_ref,
                                         gff_file,
                                         new_genome_name=None):
        """"""
        log("Saving genome with novel isoforms")
        genome_data = self.dfu.get_objects({"object_refs":
                                            [genome_ref]})["data"][0]["data"]
        if "assembly_ref" in genome_data:
            assembly_ref = genome_data["assembly_ref"]
        elif "contigset_ref" in genome_data:
            assembly_ref = genome_data["contigset_ref"]
        else:
            raise ValueError("Genome missing assembly")
        fasta_file = self.au.get_assembly_as_fasta({"ref":
                                                    assembly_ref})["path"]
        if not new_genome_name:
            new_genome_name = genome_data["id"] + "_stringtie"
        ret = self.gfu.fasta_gff_to_genome({
            "workspace_name": workspace,
            "genome_name": new_genome_name,
            "fasta_file": {
                "path": fasta_file
            },
            "gff_file": {
                "path": gff_file
            },
            "source": "StringTie",
        })
        return ret["genome_ref"]

    def _novel_isoform_mode(self, alignment_object_ref, params):
        """This is a three step process: First, run StringTie on all the alignments individually
        which will produce novel transcripts. Next, merge the resulting transcripts together.
        Finally, rerun StringTie with the merged GTF file as the reference genome.
        """
        log("running Stringtie the 1st time")
        params.update({
            "ballgown_mode": 0,
            "skip_reads_with_no_ref": 0,
            "generate_ws_object": False,
            "exchange_gene_ids": 1,
        })
        returnVal = self._process_alignment_set_object(params)
        first_run_result_dir = returnVal.get("result_directory")
        annotation_file = returnVal["annotation_file"]

        log("running StringTie merge")
        self._run_merge_option(first_run_result_dir, params, annotation_file)
        merge_file = os.path.join(first_run_result_dir, "merge_result",
                                  "stringtie_merge.gtf")

        old_genome_ref = self._get_genome_ref(alignment_object_ref)
        ret = self.gfu.genome_to_gff({
            "genome_ref": old_genome_ref,
            "target_dir": first_run_result_dir
        })
        self._run_gffcompare(ret["file_path"], merge_file)
        comp_file = os.path.join(first_run_result_dir, "merge_result",
                                 "gffcmp.annotated.gtf")
        upload_file = _make_gff(comp_file, ret["file_path"],
                                params.get("label", "MSTRG."))
        params["genome_ref"] = self._save_genome_with_novel_isoforms(
            params["workspace_name"],
            old_genome_ref,
            upload_file,
            params.get("novel_isoforms", {}).get("stringtie_genome_name"),
        )
        _update_merge_file(merge_file)

        log("running StringTie the 3rd time with merged gtf")
        params.update({
            "gtf_file": merge_file,
            "generate_ws_object": True,
            "exchange_gene_ids": 0,
            "ballgown_mode": 1,
            "skip_reads_with_no_ref": 1,
        })
        returnVal = self._process_alignment_set_object(params)

        shutil.move(
            os.path.join(first_run_result_dir, "merge_result"),
            returnVal.get("result_directory"),
        )

        report_output = self._generate_report(
            returnVal.get("expression_obj_ref"),
            params.get("workspace_name"),
            returnVal.get("result_directory"),
            returnVal.get("exprMatrix_FPKM_ref"),
            returnVal.get("exprMatrix_TPM_ref"),
            params["genome_ref"],
        )
        return report_output, returnVal

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config["SDK_CALLBACK_URL"]
        self.token = config["KB_AUTH_TOKEN"]
        self.shock_url = config["shock-url"]
        self.srv_wiz_url = config["srv-wiz-url"]
        self.scratch = config["scratch"]
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)
        self.set_client = SetAPI(self.srv_wiz_url, service_ver="dev")

    def run_stringtie_app(self, params):
        """
        run_stringtie_app: run StringTie app
        (http://ccb.jhu.edu/software/stringtie/index.shtml?t=manual)

        required params:
        alignment_object_ref: Alignment or AlignmentSet object reference
        workspace_name: the name of the workspace it gets saved to
        expression_set_suffix: suffix appended to the expression set object name
        expression_suffix: suffix appended to the expression object name
        mode: one of ['normal', 'merge', 'novel_isoform']

        optional params:
        num_threads: number of processing threads
        junction_base: minimum anchor length for spliced reads at junctions
        junction_coverage: minimum junction coverage
        disable_trimming: disables trimming at the ends of the assembled transcripts
        min_locus_gap_sep_value: minimum locus gap separation value
        ballgown_mode: enables the output of Ballgown input table files
        skip_reads_with_no_ref: reads with no reference will be skipped
        maximum_fraction: maximum fraction of multiple-location-mapped reads
        label: prefix for the name of the output transcripts
        min_length: minimum length allowed for the predicted transcripts
        min_read_coverage: minimum input transcript coverage
        min_isoform_abundance: minimum isoform abundance

        return:
        result_directory: folder path that holds all files generated by run_stringtie_app
        expression_obj_ref: generated Expression/ExpressionSet object reference
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """
        log("--->\nrunning StringTieUtil.run_stringtie\n" +
            "params:\n{}".format(json.dumps(params, indent=1)))

        self._validate_run_stringtie_params(params)
        if (isinstance(params.get("novel_isoforms"), dict)
                and "transcript_label" in params["novel_isoforms"]):
            params["label"] = params["novel_isoforms"]["transcript_label"]

        alignment_object_ref = params.get("alignment_object_ref")
        alignment_object_info = self.ws.get_object_info3(
            {"objects": [{
                "ref": alignment_object_ref
            }]})["infos"][0]
        alignment_object_type = alignment_object_info[2]

        if re.match("KBaseRNASeq.RNASeqAlignment-\d.\d",
                    alignment_object_type):
            params.update({"alignment_ref": alignment_object_ref})
            returnVal = self._process_alignment_object(params)
            report_output = self._generate_report(
                returnVal.get("expression_obj_ref"),
                params.get("workspace_name"),
                returnVal.get("result_directory"),
            )
            returnVal.update(report_output)
        elif re.match("KBaseRNASeq.RNASeqAlignmentSet-\d.\d",
                      alignment_object_type) or re.match(
                          "KBaseSets.ReadsAlignmentSet-\d.\d",
                          alignment_object_type):
            params.update({"alignment_set_ref": alignment_object_ref})
            if params.get("novel_isoforms"):
                report_output, returnVal = self._novel_isoform_mode(
                    alignment_object_ref, params)
            else:
                params.update({
                    "ballgown_mode": 1,
                    "skip_reads_with_no_ref": 1,
                    "exchange_gene_ids": 0,
                })

                returnVal = self._process_alignment_set_object(params)

                report_output = self._generate_report(
                    returnVal.get("expression_obj_ref"),
                    params.get("workspace_name"),
                    returnVal.get("result_directory"),
                    returnVal.get("exprMatrix_FPKM_ref"),
                    returnVal.get("exprMatrix_TPM_ref"),
                )
            returnVal.update(report_output)
        else:
            error_msg = "Invalid input object type\nObject info:\n{}".format(
                alignment_object_info)
            raise ValueError(error_msg)

        return returnVal
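
# Self-contained sketch (illustrative only, not part of kb_stringtie) of the
# flag-mapping pattern used by StringTieUtil._generate_command above: emit
# "<flag> <value>" pairs for valued options and bare flags for boolean options.
def build_command(tool_path, options_map, boolean_options, params):
    parts = [tool_path]
    for key, flag in options_map.items():
        value = params.get(key)
        if key in boolean_options and value:
            parts.append(flag)                # boolean flags take no value
        elif value:
            parts.extend([flag, str(value)])
    parts.append(str(params.get("input_file")))
    return " ".join(parts)

# build_command("stringtie",
#               {"num_threads": "-p", "ballgown_mode": "-B", "gtf_file": "-G"},
#               ["ballgown_mode"],
#               {"num_threads": 4, "ballgown_mode": 1,
#                "gtf_file": "ref.gtf", "input_file": "aln.bam"})
# -> "stringtie -p 4 -B -G ref.gtf aln.bam"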
Exemplo n.º 22
0
class QualiMapRunner:

    QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap'
    JAVA_MEM_DEFAULT_SIZE = '16G'
    LARGE_BAM_FILE_SIZE = 20 * 1024 * 1024 * 1024  # 20 GB
    TIMEOUT = 72 * 60 * 60  # 72 hours

    def _get_file_size(self, file_path):
        file_size = os.path.getsize(file_path)
        print('File size: {} -- {}'.format(file_size, file_path))
        return file_size

    def _large_file(self, file_path):
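        # Estimate how many 20 GB "units" the input spans; callers use this
        # multiplier to scale QualiMap's window count (-nw) for large BAM files.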

        filename, file_extension = os.path.splitext(file_path)
        multiplier = 0

        if file_extension == '.txt':
            total_file_size = 0
            with open(file_path, 'r') as f:
                for line in f:
                    bam_file_path = line.split('\t')[1].strip()
                    total_file_size += self._get_file_size(bam_file_path)
            print('Total file size: {}'.format(total_file_size))
            multiplier = int(total_file_size) // int(self.LARGE_BAM_FILE_SIZE)
        else:
            multiplier = int(self._get_file_size(file_path)) // int(
                self.LARGE_BAM_FILE_SIZE)

        print('setting number of windows multiplier to: {}'.format(multiplier))

        return multiplier

    def _timeout_handler(self, signum, frame):
        print('Signal handler called with signal', signum)
        raise ValueError('QualiMap takes too long')

    def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
        self.scratch_dir = scratch_dir
        self.rau = ReadsAlignmentUtils(callback_url)
        self.kbr = KBaseReport(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        self.set_api = SetAPI(srv_wiz_url)
        self.ws = Workspace(workspace_url)
        self.valid_commands = ['bamqc', 'multi-bamqc']

    def run_app(self, params):
        self.validate_params(params)
        print('Validated Params = ')
        pprint(params)
        run_info = self.get_run_info(params)

        if run_info.get('mode') not in ['single', 'multi']:
            raise ValueError(
                'Error in fetching the type to determine run settings.')

        run_error = False
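        # guard the QualiMap run with a SIGALRM-based timeout; on any failure, fall
        # back to writing an empty HTML report so the job still returns a result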
        try:
            signal.signal(signal.SIGALRM, self._timeout_handler)
            signal.alarm(self.TIMEOUT)
            if run_info['mode'] == 'single':
                result = self.run_bamqc(params['input_ref'],
                                        run_info['input_info'])
            elif run_info['mode'] == 'multi':
                result = self.run_multi_sample_qc(params['input_ref'],
                                                  run_info['input_info'])
            signal.alarm(0)
        except Exception:
            run_error = True

            workdir = os.path.join(self.scratch_dir,
                                   'qualimap_' + str(int(time.time() * 10000)))
            os.makedirs(workdir)

            with open(os.path.join(workdir, 'qualimapReport.html'),
                      'w') as report:
                report.write('<html><body><p></p></body></html>')

            package_info = self.package_output_folder(
                workdir, 'QualiMap_report',
                'EMPTY HTML report directory for QualiMap BAM QC',
                'qualimapReport.html')

            result = {
                'qc_result_folder_path': workdir,
                'qc_result_zip_info': package_info,
                'shock_id': None
            }
            error_msg = 'Running QualiMap returned an error:\n{}\n'.format(
                traceback.format_exc())
            error_msg += 'Generating simple report instead\n'
            print(error_msg)

        if params['create_report']:
            result = self.create_report(result, params['output_workspace'],
                                        run_error, params['input_ref'])

        return result

    def create_report(self,
                      result,
                      output_workspace,
                      run_error=None,
                      input_ref=None):

        if run_error:
            objects_created = []
            info = self.get_obj_info(input_ref)
            obj_type = self.get_type_from_obj_info(info)
            if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'Alignment'
                })

            if obj_type in [
                    'KBaseRNASeq.RNASeqAlignmentSet',
                    'KBaseSets.ReadsAlignmentSet'
            ]:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'AlignmentSet'
                })
                reads_alignment_info = self.get_alignments_from_set(input_ref)
                for alignment in reads_alignment_info:
                    alignment_ref = alignment.get('ref')
                    objects_created.append({
                        'ref': alignment_ref,
                        'description': 'Alignment'
                    })

            report_info = self.kbr.create_extended_report({
                'message':
                ' ',
                'objects_created':
                objects_created,
                'report_object_name':
                'qualimap_report' + str(uuid.uuid4()),
                'workspace_name':
                output_workspace
            })
            result['report_name'] = report_info['name']
            result['report_ref'] = report_info['ref']
            return result

        qc_result_zip_info = result['qc_result_zip_info']
        report_info = self.kbr.create_extended_report({
            'message':
            '',
            'objects_created': [],
            'direct_html_link_index':
            0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'report_object_name':
            'qualimap_report' + str(uuid.uuid4()),
            'workspace_name':
            output_workspace
        })
        result['report_name'] = report_info['name']
        result['report_ref'] = report_info['ref']
        return result

    def get_gtf_file(self, input_ref, set_op=False):

        print('Start fetching GTF file from genome')

        if set_op:
            set_data = self.set_api.get_reads_alignment_set_v1({
                'ref':
                input_ref,
                'include_item_info':
                1
            })
            input_ref = set_data['data']['items'][0]['ref']

        obj_data = self.dfu.get_objects({"object_refs":
                                         [input_ref]})['data'][0]['data']

        genome_ref = obj_data.get('genome_id')

        if not genome_ref:
            raise ValueError(
                'Alignment is not associated with a Genome object')

        result_directory = os.path.join(self.scratch_dir, str(uuid.uuid4()))
        os.makedirs(result_directory)

        genome_gtf_file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'is_gtf': True,
            'target_dir': result_directory
        })['file_path']

        return genome_gtf_file

    def run_bamqc(self, input_ref, input_info):
        # download the input and setup a working dir
        alignment_info = self.rau.download_alignment({'source_ref': input_ref})
        bam_file_path = self.find_my_bam_file(
            alignment_info['destination_dir'])
        try:
            gtf_file = self.get_gtf_file(input_ref)
        except Exception:
            gtf_file = ''

        workdir = os.path.join(self.scratch_dir,
                               'qualimap_' + str(int(time.time() * 10000)))

        options = [
            '-bam', bam_file_path, '-c', '-outdir', workdir, '-outformat',
            'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        options.append('--java-mem-size={}'.format(
            self.JAVA_MEM_DEFAULT_SIZE))  # always use large mem
        multiplier = self._large_file(bam_file_path)
        if multiplier:
            window_size = multiplier * 400
            print(f'using larger window size: {window_size} and Java memory: '
                  f'{self.JAVA_MEM_DEFAULT_SIZE}')
            options.extend(['-nw', str(window_size)])  # increase size of windows

        self.run_cli_command('bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap BAM QC', 'qualimapReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def run_multi_sample_qc(self, input_ref, input_info):
        # download the input and setup a working dir
        reads_alignment_info = self.get_alignments_from_set(input_ref)
        try:
            gtf_file = self.get_gtf_file(input_ref, set_op=True)
        except Exception:
            gtf_file = ''
        suffix = 'qualimap_' + str(int(time.time() * 10000))
        workdir = os.path.join(self.scratch_dir, suffix)
        os.makedirs(workdir)

        input_file_path = self.create_multi_qualimap_cfg(
            reads_alignment_info, workdir)

        options = [
            '-d', input_file_path, '-r', '-c', '-outdir', workdir,
            '-outformat', 'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        multiplier = self._large_file(input_file_path)
        if multiplier:
            window_size = multiplier * 400
            print(f'using larger window size: {window_size} and Java memory: '
                  f'{self.JAVA_MEM_DEFAULT_SIZE}')
            options.extend(['-nw', str(window_size)])  # increase size of windows
            options.append(f'--java-mem-size={self.JAVA_MEM_DEFAULT_SIZE}')

        self.run_cli_command('multi-bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap Multi-sample BAM QC',
            'multisampleBamQcReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def get_alignments_from_set(self, alignment_set_ref):
        set_data = self.set_api.get_reads_alignment_set_v1({
            'ref':
            alignment_set_ref,
            'include_item_info':
            1
        })
        items = set_data['data']['items']

        reads_alignment_data = []
        for alignment in items:
            alignment_info = self.rau.download_alignment(
                {'source_ref': alignment['ref']})
            bam_file_path = self.find_my_bam_file(
                alignment_info['destination_dir'])
            label = None
            if 'label' in alignment:
                label = alignment['label']
            reads_alignment_data.append({
                'bam_file_path': bam_file_path,
                'ref': alignment['ref'],
                'label': label,
                'info': alignment['info']
            })
        return reads_alignment_data

    def create_multi_qualimap_cfg(self, reads_alignment_info, workdir):
        # Group by labels if there is at least one defined
        use_labels = False
        for alignment in reads_alignment_info:
            if alignment['label']:
                use_labels = True
                break

        # write the file
        input_file_path = os.path.join(workdir, 'multi_input.txt')
        name_lookup = {}
        with open(input_file_path, 'w') as input_file:
            for alignment in reads_alignment_info:
                name = alignment['info'][1]
                if name in name_lookup:
                    name_lookup[name] += 1
                    name = name + '_' + str(name_lookup[name])
                else:
                    name_lookup[name] = 1

                input_file.write(name + '\t' + alignment['bam_file_path'])
                if use_labels:
                    if alignment['label']:
                        input_file.write('\t' + alignment['label'])
                    else:
                        input_file.write('\tunlabeled')
                input_file.write('\n')
        return input_file_path

    def get_run_info(self, params):
        info = self.get_obj_info(params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            return {'mode': 'single', 'input_info': info}
        if obj_type in [
                'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet'
        ]:
            return {'mode': 'multi', 'input_info': info}
        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def validate_params(self, params):
        if 'input_ref' not in params:
            raise ValueError(
                'required parameter field "input_ref" was not set')

        create_report = False
        if 'create_report' in params:
            if int(params['create_report']) == 1:
                if not params.get('output_workspace'):
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                create_report = True
        params['create_report'] = create_report

    def run_cli_command(self, command, options, cwd=None):
        if command not in self.valid_commands:
            raise ValueError('Invalid QualiMap command: ' + str(command))
        command = [self.QUALIMAP_PATH, command] + options
        print('Running: ' + ' '.join(command))

        if not cwd:
            cwd = self.scratch_dir

        p = subprocess.Popen(command, cwd=cwd, shell=False)
        exitCode = p.wait()

        if exitCode == 0:
            print('Success, exit code was: ' + str(exitCode))
        else:
            raise ValueError('Error running command: ' + ' '.join(command) +
                             '\n' + 'Exit Code: ' + str(exitCode))

    def find_my_bam_file(self, dirpath):
        bam_path = None
        for f in os.listdir(dirpath):
            fullpath = os.path.join(dirpath, f)
            if os.path.isfile(fullpath) and f.lower().endswith('.bam'):
                if bam_path is not None:
                    raise ValueError(
                        'Error! Too many BAM files were downloaded for this alignment!'
                    )
                bam_path = fullpath
        if bam_path is None:
            raise ValueError(
                'Error! No BAM files were downloaded for this alignment!')
        return bam_path

    def package_output_folder(self, folder_path, zip_file_name,
                              zip_file_description, index_html_file):
        """ Simple utility for packaging a folder and saving to shock """
        output = self.dfu.file_to_shock({
            'file_path': folder_path,
            'make_handle': 0,
            'pack': 'zip'
        })
        return {
            'shock_id': output['shock_id'],
            'name': zip_file_name,
            'description': zip_file_description,
            'index_html_file_name': index_html_file
        }

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
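
A rough illustration of how these helpers fit together follows. This is a minimal sketch, not part of the original module: the instance name `qualimap` and the single-sample entry point `run_bamqc` are assumptions here (only run_multi_sample_qc, validate_params and get_run_info appear by name above).

# Hedged dispatch sketch; `qualimap` and `run_bamqc` are assumed names.
def run_qualimap_app(qualimap, params):
    qualimap.validate_params(params)          # raises ValueError on missing/invalid fields
    run_info = qualimap.get_run_info(params)  # picks 'single' or 'multi' from the input object type
    if run_info['mode'] == 'single':
        return qualimap.run_bamqc(params['input_ref'], run_info['input_info'])
    return qualimap.run_multi_sample_qc(params['input_ref'], run_info['input_info'])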
Exemplo n.º 23
def build_report_view_data(host: str, ws_client: Workspace,
                           result: list) -> dict:
    """
    Returns a structure like this:
    {
        html: {
            height: max height string for iframes (default = 500px, unless present in report),
            set_height: boolean - if True, then apply height to the height style value as well.
            direct: string (optional) - direct html to plop in the page,
            iframe_style: string (optional) - styling for direct html iframe,
            links: [{
                url: string,
                name: string,
                description: string,
                handle: ?
                label: ?
            }],
            paths: [ path1, path2, path3, ... ] for all urls in links (just a convenience),
            link_idx: index of paths to use
                (this is a little funky, might get cleared up in a later iteration.)
                (I suspect this'll be here 3 years later. Today's 2/13/2020. Let's see!)
            file_links: [{
                'URL': 'https://ci.kbase.us/services/shock-api/node/a2625b71-48d5-4ba6-8603-355485508da8',
                'description': 'JGI Metagenome Assembly Report',
                'handle': 'KBH_253154',
                'label': 'assembly_report',
                'name': 'assembly_report.zip'
            }]
        }
        objects: [{
            'upa': '...',
            'name': 'foo',
            'type': '...',
            'description': '...'
        }]
        summary: '',
        summary_height: height string for summary panel (default = 500px unless specified in report),
        report: ''
    }
    """
    if not result:
        return {}
    if not isinstance(result, list):
        result = [result]
    if (not result[0] or not isinstance(result[0], dict)
            or not result[0].get('report_name')
            or not result[0].get('report_ref')):
        return {}
    report_ref = result[0]['report_ref']
    report = ws_client.get_objects2({'objects': [{
        'ref': report_ref
    }]})['data'][0]['data']
    """{'direct_html': None,
     'direct_html_link_index': None,
     'file_links': [],
     'html_links': [],
     'html_window_height': None,
     'objects_created': [{'description': 'Annotated genome', 'ref': '43666/6/1'}],
     'summary_window_height': None,
     'text_message': 'Genome saved to: wjriehl:narrative_1564507007662/some_genome\nNumber of genes predicted: 3895\nNumber of protein coding genes: 3895\nNumber of genes with non-hypothetical function: 2411\nNumber of genes with EC-number: 1413\nNumber of genes with Seed Subsystem Ontology: 1081\nAverage protein length: 864 aa.\n',
     'warnings': []}
    """
    created_objs = []
    if report.get('objects_created'):
        report_objs_created = report['objects_created']
        # make list to look up obj types with get_object_info3
        info_lookup = [{"ref": o["ref"]} for o in report_objs_created]
        infos = ws_client.get_object_info3({'objects': info_lookup})['infos']
        for idx, info in enumerate(infos):
            created_objs.append({
                'upa': report_objs_created[idx]['ref'],
                'description': report_objs_created[idx].get('description', ''),
                'name': info[1],
                'type': info[2].split('-')[0].split('.')[-1],
                'link': host + '/#dataview/' + report_objs_created[idx]['ref']
            })
    html_height = report.get("html_window_height")
    if html_height is None:
        html_height = 500
    html = {"height": f"{html_height}px", "set_height": True}
    if report.get("direct_html"):
        if not report.get("direct_html").startswith("<html"):
            html["set_height"] = False
        html["direct"] = "data:text/html;charset=utf-8," + quote(
            report.get("direct_html"))

    if report.get("html_links"):
        idx = report.get("direct_html_link_index", 0)
        if idx is None or idx < 0 or idx >= len(report["html_links"]):
            idx = 0
        html["links"] = report["html_links"]
        html["paths"] = list()
        for i, link in enumerate(html["links"]):
            html["paths"].append(f'/api/v1/{report_ref}/$/{i}/{link["name"]}')
        html["link_idx"] = idx

    if report.get("file_links"):
        html["file_links"] = report["file_links"]

    summary_height = report.get("summary_window_height")
    if summary_height is None:
        summary_height = 500

    html["iframe_style"] = f"max-height: {html['height']}"
    if html["set_height"]:
        html["iframe_style"] += f"; height: {html['height']}"
    else:
        html["iframe_style"] += "; height: auto"
    return {
        "objects": created_objs,
        "summary": report.get("text_message", ""),
        "summary_height": f"{summary_height}px",
        "html": html
    }
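
A minimal usage sketch for build_report_view_data is shown below. The workspace URL, narrative host, token and report reference are placeholders, and the Workspace client import path follows the usual KBase SDK layout (an assumption here, not taken from the original snippet).

# Hedged usage sketch; URLs, the token and the report ref are placeholders.
from installed_clients.WorkspaceClient import Workspace  # import path is an assumption

ws_client = Workspace('https://kbase.us/services/ws', token='<auth token>')
job_result = [{'report_name': 'example_report', 'report_ref': '43666/7/1'}]  # shape expected by the function
view = build_report_view_data('https://narrative.kbase.us', ws_client, job_result)
if view:  # an empty dict means the job produced no usable report
    print(view['summary_height'], view['html'].get('link_idx'))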
Exemplo n.º 24
class FastaToAssembly:

    def __init__(self, callback_url, scratch, ws_url):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)
        self.ws = Workspace(ws_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"

    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print(f'filtering FASTA file by contig length (min len={min_contig_length} bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print(f'parsing FASTA file: {fasta_file_path}')
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(f' - parsed {assembly_data["num_contigs"]} contigs, {assembly_data["dna_size"]} bp')
        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)
        json.dump(assembly_object_to_save, open(self.scratch+"/example.json", 'w'))

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info

    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        """ construct the WS object data to save based on the parsed info and params """
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            info = self.ws.get_object_info3({'objects':[{'ref': params['taxon_ref']}]})['infos'][0]
            assembly_data['taxon_ref'] = f'{info[6]}/{info[0]}/{info[4]}'

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = params['external_source_origination_date']

        return sort_dict(assembly_data)

    def parse_fasta(self, fasta_file_path, params):
        """ Do the actual work of inspecting each contig """

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This FASTA file may have amino acids in it instead '
                                         'of the required nucleotides.')
                    raise ValueError(f"This FASTA file has non nucleic acid characters: "
                                     f"{character}")

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
                contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence.encode()).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The FASTA header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')

            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list)).encode()).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data

    @staticmethod
    def fasta_filter_contigs_generator(fasta_record_iter, min_contig_length):
        """ generates SeqRecords iterator for writing from a legacy contigset object """
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(f' - filtered out {rows - rows_added} of {rows} contigs that were shorter '
              f'than {min_contig_length} bp.')

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        """ removes all contigs less than the min_contig_length provided """
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path

    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        if len(obj_data["contigs"]) == 0:
            raise ValueError('There are no contigs to save, thus there is no valid assembly.')
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info

    def save_fasta_file_to_shock(self, fasta_file_path):
        """ Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;

        """
        print(f'Uploading FASTA file ({fasta_file_path}) to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})

    def stage_input(self, params):
        """ Setup the input_directory by fetching the files and returning the path to the file"""
        file_path = None
        if 'file' in params:
            if not os.path.isfile(params['file']['path']):
                raise ValueError('KBase Assembly Utils tried to save an assembly, but the calling '
                                 'application specified a file (' + params['file']['path'] +
                                 ') that is missing. Please check the application logs for details.')
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print(f'Downloading file from SHOCK node: {params["shock_id"]}')
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print(f'Downloading file from: {params["ftp_url"]}')
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid FASTA could be extracted based on the input parameters')


    @staticmethod
    def validate_params(params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of either 'file', 'shock_id', or ftp_url is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a FASTA file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required FASTA file as input, set as either "file", "shock_id", or "ftp_url"')
        if input_count > 1:
            raise ValueError('required exactly one FASTA file as input source, you set more than one of ' +
                             'these fields: "file", "shock_id", or "ftp_url"')
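
For context, a hedged sketch of driving FastaToAssembly end to end follows. The callback URL, scratch path, workspace name and FASTA path are placeholders, and ctx is passed as None because the method body shown above never dereferences it.

# Hedged usage sketch; all paths, URLs and names are placeholders.
fta = FastaToAssembly(callback_url='http://localhost:5000',
                      scratch='/kb/module/work/tmp',
                      ws_url='https://kbase.us/services/ws')
params = {
    'workspace_name': 'my_workspace',
    'assembly_name': 'my_assembly',
    'file': {'path': '/kb/module/work/tmp/contigs.fasta'},
    'min_contig_length': 500,  # optional: drop short contigs before parsing
}
assembly_info = fta.import_fasta(ctx=None, params=params)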
Exemplo n.º 25
class VirSorterUtils:
    def __init__(self, config):
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.mgu = MetagenomeUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.ws = Workspace(config['workspace-url'], token=config['token'])

    def VirSorter_help(self):
        command = 'wrapper_phage_contigs_sorter_iPlant.pl --help'
        self._run_command(command)

    def get_fasta(self, ref):
        # check type of object, i.e KBaseGenomeAnnotations.Assembly-3.0
        obj_type = self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0][2]
        if 'assembly' in obj_type.lower():
            genome_ref = ref
        elif 'kbasegenomes' in obj_type.lower():
            data = self.ws.get_objects2({
                'objects': [{
                    'ref': ref,
                    'included': ['assembly_ref'],
                    'strict_maps': 1
                }]
            })['data'][0]['data']
            genome_ref = data['assembly_ref']
        else:
            raise ValueError(
                f"Input reference {ref} is of type {obj_type}. Type KBaseGenomes.Genome or "
                f"KBaseGenomeAnnotations.Assembly required.")
        return self.au.get_assembly_as_fasta({'ref': genome_ref})['path']

    def run_VirSorter(self, params):

        params['SDK_CALLBACK_URL'] = self.callback_url
        params['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']

        # Get contigs from 'assembly'
        genome_fp = self.get_fasta(params['genomes'])

        command = 'wrapper_phage_contigs_sorter_iPlant.pl --data-dir /data/virsorter-data'

        # Add in first args
        command += f' -f {genome_fp} --db {params["database"]}'

        # Check if additional genomes were submitted
        if params.get('add_genomes'):
            add_genomes_fp = self.get_fasta(params['add_genomes'])
            print(f'Added genomes DETECTED: {add_genomes_fp}')
            command += f' --cp {add_genomes_fp}'

        bool_args = ['virome', 'diamond', 'keep_db',
                     'no_c']  # keep_db = keep-db

        for bool_arg in bool_args:
            if params[bool_arg] == 1:  # the switch is added only when the param value is 1
                if bool_arg == 'keep_db':
                    bool_arg = 'keep-db'  # the CLI switch uses a hyphen

                command += f' --{bool_arg}'

        self._run_command(command)

        report = self._generate_report(
            params)  # Basically, do everything that's after the tool runs

        return report

    def _run_command(self, command):
        """

        :param command:
        :return:
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, err = pipe.communicate()
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nError: {}'.format(
                exitCode, output, err)
            raise RuntimeError(error_msg)

    def _parse_summary(self, virsorter_global_fp, affi_contigs_shock_id):
        columns = [
            'Contig_id',
            'Nb genes contigs',
            'Fragment',
            'Nb genes',
            'Category',
            'Nb phage hallmark genes',
            'Phage gene enrichment sig',
            'Non-Caudovirales phage gene enrichment sig',
            'Pfam depletion sig',
            'Uncharacterized enrichment sig',
            'Strand switch depletion sig',
            'Short genes enrichment sig',
        ]

        try:
            with open(virsorter_global_fp, 'r') as vir_fh:
                data = {}
                category = ''
                for line in vir_fh:
                    if line.startswith('## Contig_id'):
                        continue
                    elif line.startswith(
                            '## '
                    ):  # If 'header' lines are consumed by 1st if, then remaining should be good
                        category = line.split('## ')[-1].split(' -')[0]
                    else:
                        values = line.strip().split(',')
                        data[values[0]] = dict(zip(columns[1:], values[1:]))
        except Exception:
            vir_path = os.path.join(os.getcwd(), 'virsorter-out')
            files = os.listdir(vir_path)
            raise RuntimeError(
                f"{virsorter_global_fp} could not be read. Existing files: {files}."
            )

        df = pd.DataFrame().from_dict(data, orient='index')
        df.index.name = columns[0]
        df.reset_index(inplace=True)

        html = df.to_html(index=False,
                          classes='my_class table-striped" id = "my_id')

        # Need to file write below
        direct_html = html_template.substitute(
            html_table=html, affi_contigs_shock_id=affi_contigs_shock_id)

        # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer
        start_header = Literal("<thead>")
        end_header = Literal("</thead>")

        text = start_header + SkipTo(end_header)

        new_text = ''
        for data, start_pos, end_pos in text.scanString(direct_html):
            new_text = ''.join(data).replace(
                ' style="text-align: right;"', '').replace(
                    'thead>', 'tfoot>\n  ') + '\n</tfoot>'

        # Get start and end positions to insert new text
        end_tbody = Literal("</tbody>")
        end_table = Literal("</table>")

        insertion_pos = end_tbody + SkipTo(end_table)

        final_html = ''
        for data, start_pos, end_pos in insertion_pos.scanString(direct_html):
            final_html = (direct_html[:start_pos + 8] + '\n' + new_text +
                          direct_html[start_pos + 8:])

        return final_html

    def get_assembly_contig_ids(self, assembly_ref):
        """get contig ids from assembly_ref"""
        contigs = self.ws.get_objects2(
            {'objects': [{
                'ref': assembly_ref,
                'included': ['contigs']
            }]})['data'][0]['data']['contigs']
        return contigs.keys()

    def _generate_report(self, params):
        """

        :param params:
        :return:
        """

        # Get URL
        self.dfu = dfu(params['SDK_CALLBACK_URL'])

        # Output directory should be $PWD/virsorter-out - ASSUMES that's the output location
        virsorter_outdir = os.path.join(os.getcwd(), 'virsorter-out')

        print(
            f'VIRSorter output directory contents: {os.listdir(virsorter_outdir)}'
        )

        # Replacing individual download files with BinnedContigs

        # kb_deseq adds output files, then builds report files and sends all of them to the workspace
        output_files = []  # Appended list of dicts containing attributes

        # Collect all the files needed to report to end-user
        # Get all predicted viral sequences
        pred_fnas = glob.glob(
            os.path.join(virsorter_outdir,
                         'Predicted_viral_sequences/VIRSorter_*.fasta'))
        pred_gbs = glob.glob(
            os.path.join(virsorter_outdir,
                         'Predicted_viral_sequences/VIRSorter_*.gb'))
        # Summary 'table'
        glob_signal = os.path.join(virsorter_outdir,
                                   'VIRSorter_global-phage-signal.csv')

        print('Identified the following predicted viral sequences:\n{}'.format(
            '\n\t'.join(pred_fnas)))

        if len(pred_fnas) == 0:
            print(
                f"Unable to find predicted viral sequences, here are the directory's content:\n"
                f"{os.listdir(os.path.join(virsorter_outdir, 'Predicted_viral_sequences'))}"
            )

        if os.path.exists(glob_signal):

            print(f'Identified the global phage signal: {glob_signal}')

            lines = -1  # Don't count header
            with open(glob_signal) as fh:
                for ln in fh:
                    lines += 1

            if lines == 0:
                print('But it is EMPTY!')

        else:
            print(
                'Unable to find the global phage signal file. Was there an error during the run?'
            )

        # Append error and out files from VIRSorter
        err_fp = os.path.join(virsorter_outdir, 'logs/err')
        # if os.path.exists(err_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/err'),
        #         'name': 'VIRSorter_err',
        #         'label': 'VIRSorter_err',
        #         'description': 'VIRSorter error log file, generated from the tool itself.'
        #     })
        out_fp = os.path.join(virsorter_outdir, 'logs/out')
        # if os.path.exists(out_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/out'),
        #         'name': 'VIRSorter_out',
        #         'label': 'VIRSorter_out',
        #         'description': 'VIRSorter output log file, generated from the tool itself.'
        #     })

        if not (os.path.exists(err_fp) or os.path.exists(out_fp)):
            print(
                'Unable to find err and/or out files in LOG directory, contents:'
            )
            print(os.listdir(os.path.join(virsorter_outdir, 'logs')))

        # Make output directory
        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_dir)

        # Deal with nucleotide and protein fasta
        pred_fna_tgz_fp = os.path.join(output_dir,
                                       'VIRSorter_predicted_viral_fna.tar.gz')
        with tarfile.open(
                pred_fna_tgz_fp,
                'w:gz') as pred_fna_tgz_fh:  # Compress to minimize disk usage
            for pred_fna in pred_fnas:
                pred_fna_tgz_fh.add(pred_fna,
                                    arcname=os.path.basename(pred_fna))
        output_files.append({
            'path': pred_fna_tgz_fp,
            'name': os.path.basename(pred_fna_tgz_fp),
            'label': os.path.basename(pred_fna_tgz_fp),
            'description': 'FASTA-formatted nucleotide sequences of VIRSorter predicted viruses'
        })

        if os.path.exists(pred_fna_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in FASTA format: '
                f'{pred_fna_tgz_fp}')

        pred_gb_tgz_fp = os.path.join(output_dir,
                                      'VIRSorter_predicted_viral_gb.tar.gz')
        with tarfile.open(pred_gb_tgz_fp, 'w:gz') as pred_gb_tgz_fh:
            for pred_gb in pred_gbs:
                pred_gb_tgz_fh.add(pred_gb, arcname=os.path.basename(pred_gb))
        output_files.append({
            'path': pred_gb_tgz_fp,
            'name': os.path.basename(pred_gb_tgz_fp),
            'label': os.path.basename(pred_gb_tgz_fp),
            'description': 'Genbank-formatted sequences of VIRSorter predicted viruses'
        })

        if os.path.exists(pred_gb_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in Genbank format: '
                f'{pred_gb_tgz_fp}')

        # To create BinnedContig, need to create another directory with each of the "bins" as separate files?
        binned_contig_output_dir = os.path.join(self.scratch,
                                                str(uuid.uuid4()))
        self._mkdir_p(binned_contig_output_dir)

        # Before creating final HTML output, need to create BinnedContig object so other tools/users can take advantage
        # of its features, but also to feed more easily into other tools (e.g. vConTACT)
        created_objects = []  # Will store the objects that go to the workspace

        # load contig ids from the assembly input
        # assembly_contig_ids = self.get_assembly_contig_ids(self.assembly_ref)
        assembly_contig_ids = self.get_assembly_contig_ids(
            params['genomes'])  # Will fail for Genome

        summary_fp = os.path.join(
            binned_contig_output_dir,
            'VIRSorter.summary')  # Anything that ends in .summary
        with open(summary_fp, 'w') as summary_fh:

            summary_writer = csv.writer(summary_fh,
                                        delimiter='\t',
                                        quoting=csv.QUOTE_MINIMAL)
            summary_writer.writerow(
                ['Bin name', 'Completeness', 'Genome size', 'GC content'])

            for category_fp in pred_fnas:
                # _get_bin_ids from MetaGenomeUtils requires files to follow the header.0xx.fasta convention
                category = os.path.basename(category_fp).split(
                    'cat-')[-1].split('.')[0]
                dest_fn = 'VirSorter.{}.fasta'.format(category.zfill(3))
                dest_fp = os.path.join(output_dir, dest_fn)
                binned_contig_fp = os.path.join(binned_contig_output_dir,
                                                dest_fn)

                genome_size = 0
                gc_content = []

                # Need stats for summary file
                # Also need to adjust sequence name so binnedContig object can retrieve sequences
                adjusted_sequences = []
                with open(category_fp) as category_fh:  # 'rU' mode is removed in newer Python; text mode is the default
                    for record in SeqIO.parse(category_fh, 'fasta'):
                        seq = record.seq
                        gc_content.append(SeqUtils.GC(seq))
                        genome_size += len(seq)

                        # This is very dirty, but need to change name to match original contigs
                        record.id = record.id.replace('VIRSorter_',
                                                      '').replace(
                                                          '-circular',
                                                          '').split('-cat_')[0]
                        if 'gene' in record.id:  # Prophage
                            record.id = record.id.split('_gene')[0]
                        record.id = record.id.rsplit('_', 1)[0]

                        # here we make sure that the id's line up with contig ids in the input assembly object
                        if record.id not in assembly_contig_ids:
                            for assembly_contig_id in assembly_contig_ids:
                                # first check if record.id is substring of current contig id,
                                # then check if current contig id is substring of record.id
                                # NOTE: this is not a perfect way of checking and will likely
                                #       fail in some circumstances.
                                #       A more complete check would be to make sure there is a 1:1
                                #       mapping of contig id's in the assembly object as compared to
                                #       the binned contig object (the fasta files defined here).
                                if (record.id in assembly_contig_id) or (
                                        assembly_contig_id in record.id):
                                    record.id = assembly_contig_id
                                    break

                        record.description = ''
                        record.name = ''
                        adjusted_sequences.append(record)

                if genome_size != 0:  # skip categories whose FASTA file was empty

                    summary_writer.writerow([
                        dest_fn, '100%', genome_size,
                        (sum(gc_content) / len(gc_content))
                    ])

                    print('Copying {} to results directory'.format(
                        os.path.basename(category_fp)))
                    # Yes, need both. One is to get file_links in report. Second is for binnedContigs object
                    shutil.copyfile(category_fp, dest_fp)

                    # Write renamed sequences
                    with open(binned_contig_fp, 'w') as binned_contig_fh:
                        SeqIO.write(adjusted_sequences, binned_contig_fh,
                                    'fasta')

                    result = self.au.save_assembly_from_fasta({
                        'file': {'path': dest_fp},
                        'workspace_name': params['workspace_name'],
                        'assembly_name': 'VirSorter-Category-{}'.format(category)
                    })

                    created_objects.append({
                        "ref": result,
                        "description": "KBase Assembly object from VIRSorter"
                    })

        # Create BinnedContigs object, but 1st, a little metadata
        generate_binned_contig_param = {
            'file_directory': binned_contig_output_dir,
            'assembly_ref': params['genomes'],  # params.get('genomes'), self.assembly_ref
            'binned_contig_name': params['binned_contig_name'],
            'workspace_name': params['workspace_name']
        }
        binned_contig_object_ref = self.mgu.file_to_binned_contigs(
            generate_binned_contig_param).get('binned_contig_obj_ref')

        # Add binned contigs reference here, as it was already created above
        created_objects.append({
            "ref": binned_contig_object_ref,
            "description": "BinnedContigs from VIRSorter"
        })

        # Save VIRSorter_affi-contigs.tab for DRAM-v
        affi_contigs_fp = os.path.join(virsorter_outdir, 'Metric_files',
                                       'VIRSorter_affi-contigs.tab')
        affi_contigs_shock_id = self.dfu.file_to_shock(
            {'file_path': affi_contigs_fp})['shock_id']

        # Use global signal (i.e. summary) file and create HTML-formatted version
        raw_html = self._parse_summary(glob_signal, affi_contigs_shock_id)

        html_fp = os.path.join(output_dir, 'index.html')

        with open(html_fp, 'w') as html_fh:
            html_fh.write(raw_html)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_report = [{
            'shock_id': report_shock_id,
            'name': os.path.basename(html_fp),
            'label': os.path.basename(html_fp),
            'description': 'HTML summary report for VIRSorter-predicted viral genomes.'
        }]

        report_params = {
            'message': 'Here are the results from your VIRSorter run. Above, you\'ll find a report '
                       'with all the identified (putative) viral genomes, and below, links to the '
                       'report as well as files generated.',
            'workspace_name': params['workspace_name'],
            'html_links': html_report,
            'direct_html_link_index': 0,
            'report_object_name': 'VIRSorter_report_{}'.format(str(uuid.uuid4())),
            'file_links': output_files,
            'objects_created': created_objects,
        }

        kbase_report_client = KBaseReport(params['SDK_CALLBACK_URL'],
                                          token=params['KB_AUTH_TOKEN'])
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref'],
            'result_directory': binned_contig_output_dir,
            'binned_contig_obj_ref': binned_contig_object_ref
        }

        return report_output

    def _mkdir_p(self, path):
        """
        :param path:
        :return:
        """

        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
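
The parameter dictionary expected by run_VirSorter can be inferred from the method body above; the sketch below spells it out with placeholder values. The refs, workspace name and database choice are assumptions, and SDK_CALLBACK_URL / KB_AUTH_TOKEN must already be set in the environment for the constructor and run to succeed.

# Hedged usage sketch; refs, names and the token in the config are placeholders.
config = {'scratch': '/kb/module/work/tmp',
          'workspace-url': 'https://kbase.us/services/ws',
          'token': '<auth token>'}
vs = VirSorterUtils(config)
params = {
    'genomes': '12345/6/1',            # Assembly (or Genome) ref to classify
    'database': 2,                     # value handed to VirSorter's --db option
    'add_genomes': None,               # optional extra reference genomes
    'virome': 0, 'diamond': 0, 'keep_db': 0, 'no_c': 0,  # boolean flags; 1 enables the switch
    'workspace_name': 'my_workspace',
    'binned_contig_name': 'VirSorter_bins',
}
report = vs.run_VirSorter(params)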
Exemplo n.º 26
def process_kbase_objects(host_ref, virus_ref, shared_folder, callback,
                          workspace, token):
    """
    Convert KBase object(s) into usable files for VirMatcher
    :param host_ref: Putative host / microbial genomes with KBase '#/#/#' used to describe each object
    :param virus_ref: Viral genomes with KBase '#/#/#' used to describe each object
    :param shared_folder: KBase job node's "working" directory, where actual files exist
    :param callback:
    :param workspace: Workspace service URL (used to build the Workspace client)
    :param token: Job token
    :return:
    """

    dfu = DataFileUtil(callback, token=token)

    ws = Workspace(workspace, token=token)

    mgu = MetagenomeUtils(callback, token=token)

    au = AssemblyUtil(callback, token=token)

    # Need to determine KBase type in order to know how to properly proceed
    host_type = ws.get_object_info3({'objects': [{
        'ref': host_ref
    }]})['infos'][0][2].split('-')[0]
    virus_type = ws.get_object_info3({'objects': [{
        'ref': virus_ref
    }]})['infos'][0][2].split('-')[0]

    logging.info(f'Potential hosts identified as: {host_type}')
    logging.info(f'Viruses identified as: {virus_type}')

    # Create new directory to house virus and host files
    host_dir = Path(shared_folder) / 'host_files'
    if not host_dir.exists():
        os.mkdir(host_dir)

    host_count = 0

    if host_type == 'KBaseGenomeAnnotations.Assembly':  # No info about individual genomes, so treat each as organism
        host_fps = au.get_assembly_as_fasta({'ref': host_ref})['path']  # returns dict with path + assembly_name

        logging.info(
            f'Identified {host_type}. Each sequence will be treated as a separate organism.'
        )

        records = SeqIO.parse(host_fps, 'fasta')

        for record in records:
            host_count += 1
            tmp_fp = host_dir / f'{record.id}.fasta'  # TODO Illegal filenames?
            SeqIO.write([record], tmp_fp, 'fasta')

    elif host_type == 'KBaseGenomes.Genomes':  # TODO Genomes?!
        genome_data = ws.get_objects2({'objects': [{
            'ref': host_ref
        }]})['data'][0]['data']
        # TODO: the assembly behind the genome is located but not yet converted into host FASTA files
        assembly_ref = genome_data.get('contigset_ref') or genome_data.get('assembly_ref')

    # elif host_type == 'KBaseSets.GenomeSet'

    elif host_type == 'KBaseSets.AssemblySet':
        obj_data = dfu.get_objects({'object_refs': [host_ref]})['data'][0]

        for subobj in obj_data['data']['items']:
            host_fp = au.get_assembly_as_fasta({'ref': subobj['ref']})['path']

            if os.path.splitext(host_fp)[-1] != '.fasta':
                # Ensure the extension is always .fasta (splitext keeps the leading dot)
                target_fn = os.path.splitext(
                    os.path.basename(host_fp))[0].strip('_') + '.fasta'
            else:
                target_fn = os.path.basename(host_fp).strip('_')

            shutil.copyfile(host_fp, host_dir / target_fn)
            host_count += 1

    elif host_type == 'KBaseMetagenomes.BinnedContigs':  # This is what we want!
        host_kbase_dir = mgu.binned_contigs_to_file({
            'input_ref': host_ref,
            'save_to_shock': 0
        })['bin_file_directory']  # Dict of bin_file_dir and shock_id

        for (dirpath, dirnames, fns) in os.walk(
                host_kbase_dir):  # Dirnames = all folders under dirpath
            for fn in fns:
                if os.path.splitext(fn)[-1] != '.fasta':  # splitext keeps the leading dot
                    fn = os.path.splitext(fn)[0] + '.fasta'
                fp = Path(dirpath) / fn
                shutil.copy(fp, host_dir)
                host_count += 1

    else:
        raise ValueError(f'{host_type} is not supported.')

    logging.info(f'{host_count} potential host genomes were identified.')

    virus_count = 0

    if virus_type == 'KBaseGenomeAnnotations.Assembly':
        virus_fps = au.get_assembly_as_fasta({'ref': virus_ref})['path']

        records = SeqIO.parse(virus_fps, 'fasta')
        virus_count = len(list(records))

        # for record in records:
        #     virus_count += 1
        # tmp_fp = virus_dir / f'{record.id}.fasta'
        # SeqIO.write([record], tmp_fp, 'fasta')

    else:
        raise ValueError(f'{virus_type} is not supported.')

    logging.info(f'{virus_count} potential viral genomes were identified.')

    # TODO Do we even need any of this data? We don't care about what the sequences are called

    # host_data = dfu.get_objects({'object_refs': [host_ref]})['data'][0]
    # virus_data = dfu.get_objects({'object_refs': [virus_ref]})['data'][0]

    return host_dir, virus_fps
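
A hedged invocation sketch follows; the object refs and paths are placeholders. Note that the workspace argument is the Workspace service URL, since it is passed straight to the Workspace client above.

# Hedged usage sketch; refs, paths and URLs are placeholders.
import os

host_dir, virus_fps = process_kbase_objects(
    host_ref='11111/2/1',        # e.g. a KBaseMetagenomes.BinnedContigs ref
    virus_ref='11111/3/1',       # a KBaseGenomeAnnotations.Assembly ref
    shared_folder='/kb/module/work/tmp',
    callback=os.environ.get('SDK_CALLBACK_URL'),
    workspace='https://kbase.us/services/ws',
    token=os.environ.get('KB_AUTH_TOKEN'))
print(f'host FASTAs written to {host_dir}; viral FASTA at {virus_fps}')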
Exemplo n.º 27
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params,
                        expected={"workspace_name", "featureset_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-' + str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']

        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        writer = csv.DictWriter(open(files['file_path'], 'w'),
                                header,
                                delimiter='\t',
                                lineterminator='\n')
        writer.writeheader()
        for feat in fs_dicts:
            writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3(
                {'objects': [{
                    'ref': genome
                }]})['infos'][0][1]
            res = self.gsu.search({
                'ref': genome,
                'structured_query': {
                    'feature_id': fids
                },
                'sort_by': [['contig_id', 1]],
                'start': 0,
                'limit': len(fids)
            })

            for feat in res['features']:
                features.append({
                    'Feature Id': feat['feature_id'],
                    'Aliases': ", ".join(sorted(feat['aliases'].keys())),
                    'Genome': "{} ({})".format(genome_name, genome),
                    'Type': feat['feature_type'],
                    'Function': feat['function']
                })
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(
                file, os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
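
A hedged end-to-end sketch of the download flow is below. The config values and feature set ref are placeholders, SDK_CALLBACK_URL must be set in the environment for the client constructors, and validate_params is skipped because to_tsv reads featureset_ref directly.

# Hedged usage sketch; config values and the ref are placeholders.
cfg = {'scratch': '/kb/module/work/tmp', 'workspace-url': 'https://kbase.us/services/ws'}
fsd = FeatureSetDownload(cfg)
params = {'featureset_ref': '44444/5/1', 'workspace_name': 'my_workspace'}
fs_name, files = fsd.to_tsv(params)                       # writes <scratch>/featureset-download-*/<fs_name>.tsv
shock_info = fsd.export([files['file_path']], fs_name, params)
print(shock_info['shock_id'])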
Exemplo n.º 28
class kb_ReadSim:
    '''
    Module Name:
    kb_ReadSim

    Module Description:
    A KBase module: kb_ReadSim
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/kb_ReadSim.git"
    GIT_COMMIT_HASH = "c9c0185e34d25be57cc6e1c901d8801fbc0f4784"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.du = DownloadUtils(self.callback_url)
        self.su = SimUtils()
        self.ru = ReadsUtils(self.callback_url)
        self.vu = VariationUtil(self.callback_url)
        self.eu = VcfEvalUtils()
        self.hu = htmlreportutils()
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def run_kb_ReadSim(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "Inparams" -> structure: parameter
           "workspace_name" of String, parameter "input_sample_set" of
           String, parameter "strain_info" of String, parameter
           "assembly_or_genome_ref" of String, parameter "base_error_rate" of
           String, parameter "outer_distance" of String, parameter
           "standard_deviation" of String, parameter "num_read_pairs" of
           String, parameter "len_first_read" of String, parameter
           "len_second_read" of String, parameter "mutation_rate" of String,
           parameter "frac_indels" of String, parameter
           "variation_object_name" of String, parameter "output_read_object"
           of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_ReadSim
        output_dir = self.shared_folder
        print(params)
        self.su.validate_simreads_params(params)

        genome_or_assembly_ref = params['assembly_or_genome_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{
                'ref': genome_or_assembly_ref
            }]})['infos'][0][2]
        if ('KBaseGenomes.Genome' in obj_type):
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type +
                             ' is not the right input for this method. ' +
                             'Valid input include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly ')

        self.du.download_genome(assembly_ref, output_dir)

        ref_genome = os.path.join(self.shared_folder, "ref_genome.fa")
        output_fwd_paired_file_path = os.path.join(self.shared_folder,
                                                   "read1.fq")
        output_rev_paired_file_path = os.path.join(self.shared_folder,
                                                   "read2.fq")

        self.eu.check_path_exists(ref_genome)

        self.su.simreads(ref_genome, output_fwd_paired_file_path,
                         output_rev_paired_file_path, params)
        self.eu.check_path_exists(output_fwd_paired_file_path)
        self.eu.check_path_exists(output_rev_paired_file_path)

        retVal = self.ru.upload_reads({
            'wsname': params['workspace_name'],
            'name': params['output_read_object'],
            'sequencing_tech': 'illumina',
            'fwd_file': output_fwd_paired_file_path,
            'rev_file': output_rev_paired_file_path
        })

        logfile = os.path.join(self.shared_folder, "variant.txt")
        self.eu.check_path_exists(logfile)

        vcf_file = self.su.format_vcf(logfile)
        self.eu.check_path_exists(vcf_file)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['assembly_or_genome_ref'],
            'sample_set_ref': params['input_sample_set'],
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': vcf_file,
            'variation_object_name': params['variation_object_name']
        }
        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_ReadSim

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_ReadSim return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def run_eval_variantcalling(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "Evalparams" -> structure: parameter
           "workspace_name" of String, parameter "sim_varobject_name" of
           String, parameter "calling_varobject_name" of String, parameter
           "output_var_object" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_eval_variantcalling

        print(params)
        self.eu.validate_eval_params(params)

        report_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(report_dir)

        self.ws = Workspace(url=self.ws_url, token=ctx['token'])

        var_object_ref1 = params['varobject_ref1']
        sampleset_ref1 = self.ws.get_objects2({
            'objects': [{
                "ref": var_object_ref1,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']['sample_set_ref']

        var_object_ref2 = params['varobject_ref2']
        sampleset_ref2 = self.ws.get_objects2({
            'objects': [{
                "ref": var_object_ref2,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']['sample_set_ref']

        if (sampleset_ref1 != sampleset_ref2):
            raise Exception(
                "Variation objects are from different sample sets\n")

        assembly_ref_set = set()
        genomeset_ref_set = set()

        variation_obj1 = self.ws.get_objects2(
            {'objects': [{
                'ref': var_object_ref1
            }]})['data'][0]

        if 'assembly_ref' in variation_obj1['data']:
            assembly_ref1 = variation_obj1['data']['assembly_ref']
            assembly_ref_set.add(assembly_ref1)
        elif 'genome_ref' in variation_obj1['data']:
            genome_ref1 = variation_obj1['data']['genome_ref']
            genomeset_ref_set.add(genome_ref1)

        variation_obj2 = self.ws.get_objects2(
            {'objects': [{
                'ref': var_object_ref2
            }]})['data'][0]
        if 'assembly_ref' in variation_obj2['data']:
            assembly_ref2 = variation_obj2['data']['assembly_ref']
            assembly_ref_set.add(assembly_ref2)
        elif 'genome_ref' in variation_obj2['data']:
            genome_ref2 = variation_obj2['data']['genome_ref']
            genomeset_ref_set.add(genome_ref2)

        assembly_or_genome_ref = None

        if (not genomeset_ref_set and len(assembly_ref_set) != 1):
            raise Exception(
                "variation objects are from different assembly refs")
        elif (not assembly_ref_set and len(genomeset_ref_set) != 1):
            raise Exception("variation objects are from different genome refs")

        simvarfile = os.path.join(report_dir, "simvariant.vcf.gz")
        simvarpath = self.du.download_variations(var_object_ref1, simvarfile)

        os.rename(simvarpath, simvarfile)
        self.eu.index_vcf(simvarfile)

        callingvarfile = os.path.join(report_dir, "callingvariant.vcf.gz")
        callingvarpath = self.du.download_variations(var_object_ref2,
                                                     callingvarfile)

        os.rename(callingvarpath, callingvarfile)
        self.eu.index_vcf(callingvarfile)

        eval_results = self.eu.variant_evalation(simvarfile, callingvarfile,
                                                 report_dir)

        unique_vcf1 = eval_results['unique1']
        self.eu.check_path_exists(unique_vcf1)

        unique_vcf2 = eval_results['unique2']
        self.eu.check_path_exists(unique_vcf2)

        common_vcf = eval_results['common']
        self.eu.check_path_exists(common_vcf)

        image_path = self.eu.plot_venn_diagram(report_dir, unique_vcf1,
                                               unique_vcf2, common_vcf)
        self.eu.check_path_exists(image_path)
        '''
        if(len(assembly_ref_set) != 0):
            assembly_or_genome_ref = assembly_ref_set.pop()
        elif(len(genomeset_ref_set) != 0):
            assembly_or_genome_ref = genomeset_ref_set.pop()

        logging.info("Saving Unique1 vcf\n")
        save_unique_variation_params1 = {'workspace_name': params['workspace_name'],
                                        'genome_or_assembly_ref': assembly_or_genome_ref,
                                        'sample_set_ref': sampleset_ref1,
                                        'sample_attribute_name': 'sample_unique_attr1',
                                        'vcf_staging_file_path': unique_vcf1,
                                        'variation_object_name': params['output_variant_object'] + "_sample1_unique"
        }
        self.vu.save_variation_from_vcf(save_unique_variation_params1)
        logging.info("Saving done\n")

        logging.info("Saving Unique2 vcf\n")
        save_unique_variation_params2 = {'workspace_name': params['workspace_name'],
                                        'genome_or_assembly_ref': assembly_or_genome_ref,
                                        'sample_set_ref': sampleset_ref1,
                                        'sample_attribute_name': 'sample_unique_attr2',
                                        'vcf_staging_file_path': unique_vcf2,
                                        'variation_object_name': params['output_variant_object'] + "_sample2_unique"
        }
        self.vu.save_variation_from_vcf(save_unique_variation_params2)
        logging.info("Saving done\n")

        logging.info("Saving Common vcf\n")
        save_common_variation_params = {'workspace_name': params['workspace_name'],
                                 'genome_or_assembly_ref': assembly_or_genome_ref,
                                 'sample_set_ref': sampleset_ref1,
                                 'sample_attribute_name': 'sample_common_attr',
                                 'vcf_staging_file_path': common_vcf,
                                 'variation_object_name': params['output_variant_object'] + "_sample1_sample2_common"
        }
        self.vu.save_variation_from_vcf(save_common_variation_params)
        logging.info("Saving done\n")
        '''

        workspace = params['workspace_name']
        output = self.hu.create_html_report(self.callback_url, report_dir,
                                            workspace)
        #END run_eval_variantcalling

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_eval_variantcalling return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
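
# The run_kb_ReadSim method above branches on the object type reported by
# get_object_info3 (index 2 of the object_info tuple): a Genome is dereferenced
# to its underlying Assembly, while an Assembly is used directly. A minimal
# standalone sketch of that branching; the import path, ws_url and input_ref are
# assumptions, not part of the original code.
from installed_clients.WorkspaceClient import Workspace


def resolve_assembly_ref(ws_url, input_ref):
    # look up the object type from the object_info tuple (index 2)
    ws = Workspace(ws_url)
    obj_type = ws.get_object_info3({'objects': [{'ref': input_ref}]})['infos'][0][2]
    if 'KBaseGenomes.Genome' in obj_type:
        # follow the genome's assembly_ref to reach the underlying Assembly
        subset = ws.get_object_subset([{'included': ['/assembly_ref'],
                                        'ref': input_ref}])
        return subset[0]['data']['assembly_ref']
    if 'KBaseGenomeAnnotations.Assembly' in obj_type:
        return input_ref
    raise ValueError(obj_type + ' is not a Genome or Assembly')
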
Exemplo n.º 29
0
class BwaIndexBuilder:
    def __init__(self, scratch_dir, ws_url, callback_url, service_wizard_url, provenance):
        self.scratch_dir = scratch_dir
        self.ws_url = ws_url
        self.ws = Workspace(self.ws_url)
        self.callback_url = callback_url
        self.service_wizard_url = service_wizard_url
        self.bwa = BwaRunner(self.scratch_dir)
        self.provenance = provenance

    def get_index(self, params):
        ''' The key function of this module: get a bwa index for the specified input '''

        # validate the parameters and fetch assembly_info
        validated_params = self._validate_params(params)
        assembly_info = self._get_assembly_info(validated_params['ref'])

        # check the cache (keyed off of assembly_info)
        index_info = self._get_cached_index(assembly_info, validated_params)
        if index_info:
            index_info['from_cache'] = 1
            index_info['pushed_to_cache'] = 0
        else:
            # on a cache miss, build the index
            index_info = self._build_index(assembly_info, validated_params)
            index_info['from_cache'] = 0
            # pushed_to_cache will be set in return from _build_index

        index_info['assembly_ref'] = assembly_info['ref']
        index_info['genome_ref'] = assembly_info['genome_ref']

        return index_info

    def _validate_params(self, params):
        ''' validate parameters; can do some processing here to produce validated params '''
        # params['ref'] = params['assembly_or_genome_ref']
        validated_params = {'ref': None}
        if 'ref' in params and params['ref']:
            validated_params['ref'] = params['ref']
        else:
            raise ValueError('"ref" field indicating either an assembly or genome is required.')

        if 'output_dir' in params:
            validated_params['output_dir'] = params['output_dir']
        else:
            validated_params['output_dir'] = os.path.join(self.scratch_dir,
                                                          'bwa_index_' + str(int(time.time() * 100)))

        if os.path.exists(validated_params['output_dir']):
            raise ValueError('Output directory name specified (' + validated_params['output_dir'] +
                             ') already exists. Will not overwrite, so aborting.')

        if 'ws_for_cache' in params and params['ws_for_cache']:
            validated_params['ws_for_cache'] = params['ws_for_cache']
        else:
            print('WARNING: if created, the bwa index will not be cached because the "ws_for_cache" field is not set')
            validated_params['ws_for_cache'] = None

        return validated_params

    def _get_assembly_info(self, ref):
        ''' given a ref to an assembly or genome, figure out the assembly and return its info '''
        info = self.ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][0]
        obj_type = info[2]
        if obj_type.startswith('KBaseGenomeAnnotations.Assembly') or obj_type.startswith('KBaseGenomes.ContigSet'):
            return {'info': info, 'ref': ref, 'genome_ref': None}

        if obj_type.startswith('KBaseGenomes.Genome'):
            # we need to get the assembly for this genome
            ga = GenomeAnnotationAPI(self.service_wizard_url)
            assembly_ref = ga.get_assembly({'ref': ref})
            # using the path ensures we can access the assembly even if we don't have direct access
            ref_path = ref + ';' + assembly_ref
            info = self.ws.get_object_info3({'objects': [{'ref': ref_path}]})['infos'][0]
            return {'info': info, 'ref': ref_path, 'genome_ref': ref}

        raise ValueError('Input object was not of type: Assembly, ContigSet or Genome.  Cannot get bwa Index.')

    def _get_cached_index(self, assembly_info, validated_params):

        try:
            # note: list_referencing_objects does not yet support reference paths, so we need to call
            # it with the direct reference.  That means we won't get a cache hit if you don't have
            # direct access to the assembly object right now (although you can still always build the index).
            # Once this call supports paths, this should be changed to set ref = assembly_info['ref']
            info = assembly_info['info']
            ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
            objs = self.ws.list_referencing_objects([{'ref': ref}])[0]

            # iterate through each of the objects that reference the assembly
            bwa_indexes = []
            for o in objs:
                if o[2].startswith('KBaseRNASeq.Bowtie2IndexV2'):
                    bwa_indexes.append(o)

            # Nothing refs this assembly, so cache miss
            if len(bwa_indexes) == 0:
                return False

            # if there is more than one hit, get the most recent one
            # (obj_info[3] is the save_date timestamp (eg 2017-05-30T22:56:49+0000), so we can sort on that)
            bwa_indexes.sort(key=lambda x: x[3])
            bwa_index_info = bwa_indexes[-1]
            index_ref = str(bwa_index_info[6]) + '/' + str(bwa_index_info[0]) + '/' + str(bwa_index_info[4])

            # get the object data
            index_obj_data = self.ws.get_objects2({'objects': [{'ref': index_ref}]})['data'][0]['data']

            # download the handle object
            os.makedirs(validated_params['output_dir'])

            dfu = DataFileUtil(self.callback_url)
            dfu.shock_to_file({'file_path': os.path.join(validated_params['output_dir'], 'bt2_index.tar.gz'),
                               'handle_id': index_obj_data['handle']['hid'],
                               'unpack': 'unpack'})
            print('Cache hit: ')
            pprint(index_obj_data)
            return {'output_dir': validated_params['output_dir'],
                    'index_files_basename': index_obj_data['index_files_basename']}

        except Exception:
            # if the cache lookup fails, don't worry; just fall through and build the index
            print('WARNING: exception encountered when trying to lookup in cache:')
            print(traceback.format_exc())
            print('END WARNING: exception encountered when trying to lookup in cache.')

        return None

    def _put_cached_index(self, assembly_info, index_files_basename, output_dir, ws_for_cache):

        if not ws_for_cache:
            print('WARNING: bwa index cannot be cached because "ws_for_cache" field not set')
            return False

        try:
            dfu = DataFileUtil(self.callback_url)
            result = dfu.file_to_shock({'file_path': output_dir,
                                        'make_handle': 1,
                                        'pack': 'targz'})

            bwa_index = {'handle': result['handle'], 'size': result['size'],
                         'assembly_ref': assembly_info['ref'],
                         'index_files_basename': index_files_basename}

            ws = Workspace(self.ws_url)
            save_params = {'objects': [{'hidden': 1,
                                        'provenance': self.provenance,
                                        'name': os.path.basename(output_dir),
                                        'data': bwa_index,
                                        'type': 'KBaseRNASeq.Bowtie2IndexV2'
                                        }]
                           }
            if ws_for_cache.strip().isdigit():
                save_params['id'] = int(ws_for_cache)
            else:
                save_params['workspace'] = ws_for_cache.strip()
            save_result = ws.save_objects(save_params)
            print('Bowtie2IndexV2 cached to: ')
            pprint(save_result[0])
            return True

        except Exception:
            # if we fail in saving the cached object, don't worry
            print('WARNING: exception encountered when trying to cache the index files:')
            print(traceback.format_exc())
            print('END WARNING: exception encountered when trying to cache the index files')

        return False

    def _build_index(self, assembly_info, validated_params):
        # get the assembly as a fasta file using AssemblyUtil
        au = AssemblyUtil(self.callback_url)
        fasta_info = au.get_assembly_as_fasta({'ref': assembly_info['ref']})

        # make the target destination folder (check again it wasn't created yet)
        if os.path.exists(validated_params['output_dir']):
            raise ValueError('Output directory name specified (' + validated_params['output_dir'] +
                             ') already exists. Will not overwrite, so aborting.')
        os.makedirs(validated_params['output_dir'])

        # configure the command line args and run it
        cli_params = self._build_cli_params(fasta_info['path'], fasta_info['assembly_name'], validated_params)
        self.bwa.run('index', cli_params)
        for file in glob.glob(r'/kb/module/work/tmp/' + fasta_info['assembly_name'] + '.*'):
            print(file)
            shutil.copy(file, validated_params['output_dir'])

        index_info = {'output_dir': validated_params['output_dir'],
                      'index_files_basename': fasta_info['assembly_name']}

        # cache the result, mark if it worked or not
        cache_success = self._put_cached_index(assembly_info,
                                               fasta_info['assembly_name'],
                                               validated_params['output_dir'],
                                               validated_params['ws_for_cache'])
        if cache_success:
            index_info['pushed_to_cache'] = 1
        else:
            index_info['pushed_to_cache'] = 0

        return index_info

    def _build_cli_params(self, fasta_file_path, index_files_basename, validated_params):
        cli_params = []

        # positional and option args: first the fasta path, then '-p' with the
        # base name to use for the generated index files
        cli_params.append(fasta_file_path)
        cli_params.append("-p")
        cli_params.append(index_files_basename)

        return cli_params
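
    # A usage sketch for this builder (the constructor arguments and the object
    # ref below are hypothetical): get_index() returns the directory holding the
    # index files plus cache bookkeeping flags.
    #
    #     builder = BwaIndexBuilder(scratch_dir, ws_url, callback_url,
    #                               service_wizard_url, provenance)
    #     index_info = builder.get_index({'ref': '123/4/5',
    #                                     'ws_for_cache': 'my_workspace'})
    #     print(index_info['output_dir'], index_info['from_cache'])
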
    def run_generate_metadata_report(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_generate_metadata_report
        object_type = params['object_type']
        workspace_name = params['workspace_name']

        ws = Workspace(self.ws_url)
        print(params)

        objects_in_workspace = ws.list_objects({
            'workspaces': [workspace_name],
            'type': object_type
        })
        object_names = sorted([j[1] for j in objects_in_workspace])

        d = dict()

        if (object_type == 'KBaseRNASeq.RNASeqAlignment'):
            for object_name in object_names:
                alignment_stats = ws.get_objects2({
                    'objects': [{
                        'workspace': workspace_name,
                        'name': object_name
                    }]
                })['data'][0]['data']['alignment_stats']
                metadata_keys = alignment_stats.keys()
                object_pd = pd.Series(alignment_stats, index=metadata_keys)
                d[object_name] = object_pd

        else:
            for object_name in object_names:
                obj_meta_data = ws.get_object_info3({
                    'objects': [{
                        'workspace': workspace_name,
                        'name': object_name
                    }],
                    'includeMetadata': 1
                })
                metadata = obj_meta_data.get('infos')[0][10]
                metadata_keys = metadata.keys()
                object_pd = pd.Series(metadata, index=metadata_keys)
                d[object_name] = object_pd

        df = pd.DataFrame(d)

        htmlDir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        self._mkdir_p(htmlDir)
        report_file_path = os.path.join(htmlDir, "index.html")
        #df.to_html(report_file_path)
        self.write_pd_html(df.T, report_file_path)

        try:
            html_upload_ret = self.dfu.file_to_shock({
                'file_path': htmlDir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception:
            raise ValueError('Error uploading HTML file: ' + str(htmlDir) +
                             ' to shock')

        reportname = 'generate_metadata_report_' + str(uuid.uuid4())

        reportobj = {
            'message': '',
            'direct_html': None,
            'direct_html_link_index': 0,
            'file_links': [],
            'html_links': [],
            'html_window_height': 500,
            'workspace_name': params['workspace_name'],
            'report_object_name': reportname
        }

        # attach to report obj
        reportobj['direct_html'] = ''
        reportobj['direct_html_link_index'] = 0
        reportobj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            'name': 'index.html',
            'label': 'index.html'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        report_info = report.create_extended_report(reportobj)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        print(output)

        #END run_generate_metadata_report

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method run_generate_metadata_report return value ' +
                'output is not type dict as required.')
        # return the results
        return [output]
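
# run_generate_metadata_report above collects per-object metadata into pandas
# Series keyed by object name, gathers them into a DataFrame, and renders the
# transposed table as HTML. A minimal standalone sketch with hypothetical
# object names and metadata values:
import pandas as pd

metadata_by_object = {
    'assembly_1': pd.Series({'N Contigs': '10', 'Size': '123456', 'GC content': '0.41'}),
    'assembly_2': pd.Series({'N Contigs': '7', 'Size': '98765', 'GC content': '0.39'}),
}
df = pd.DataFrame(metadata_by_object)
# transpose so objects become rows and metadata keys become columns, as in write_pd_html(df.T, ...)
html_table = df.T.to_html()
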
Exemplo n.º 31
0
class FeatureSetBuilder:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method
        """

        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in ['diff_expression_ref', 'workspace_name',
                  'p_cutoff', 'q_cutoff', 'fold_change_cutoff']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError('"fold_scale_type" parameter must be set to "logarithm", if used')

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))

    def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report
        """

        log('start creating report')

        output_html_files = self._generate_html_report(up_feature_set_ref_list,
                                                       down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{'ref': up_feature_set_ref,
                                 'description': 'Upper FeatureSet Object'}]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{'ref': down_feature_set_ref,
                                 'description': 'Lower FeatureSet Object'}]

        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{'ref': filtered_expression_matrix_ref,
                                 'description': 'Filtered ExpressionMatrix Object'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        upper_feature_content = ''
        for up_feature_set_ref in up_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref':
                                                     up_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            upper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        lower_feature_content = ''
        for down_feature_set_ref in down_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref':
                                                     down_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<tr><td>Upper_FeatureSet</td></tr>',
                                                          upper_feature_content)

                report_template = report_template.replace('<tr><td>Lower_FeatureSet</td></tr>',
                                                          lower_feature_content)

                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report'})
        return html_report
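
    # Example of the template substitution above (hypothetical FeatureSet): an up
    # FeatureSet named 'cond1-cond2_up' with 12 features contributes the row
    # '<tr><td>cond1-cond2_up</td><td>12</td></tr>', and the accumulated rows are
    # substituted for the '<tr><td>Upper_FeatureSet</td></tr>' placeholder in
    # report_template.html (and likewise for the Lower_FeatureSet placeholder).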

    def _process_diff_expression(self, diff_expression_set_ref, result_directory,
                                 condition_label_pair):
        """
        _process_diff_expression: process differential expression object info
        """

        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2({'objects':
                                                  [{'ref':
                                                   diff_expression_set_ref}]})['data'][0]['data']

        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory, diff_expr_matrix_file_name)

        with open(diff_expr_matrix_file, 'w') as csvfile:
            fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

        for set_item in set_items:
            diff_expression_ref = set_item['ref']

            diff_expression_data = self.ws.get_objects2({'objects':
                                                        [{'ref':
                                                         diff_expression_ref}]})['data'][0]['data']

            label_string = set_item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_1 = label_list[0]
            condition_2 = label_list[1]

            if condition_1 in condition_label_pair and condition_2 in condition_label_pair:
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']
                selected_diff_expression_ref = diff_expression_ref

                with open(diff_expr_matrix_file, 'a') as csvfile:
                    row_ids = matrix_data.get('row_ids')
                    row_values = matrix_data.get('values')
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                    for pos, row_id in enumerate(row_ids):
                        row_value = row_values[pos]
                        writer.writerow({'gene_id': row_id,
                                         'log2_fold_change': row_value[0],
                                         'p_value': row_value[1],
                                         'q_value': row_value[2]})

        return diff_expr_matrix_file, genome_id, selected_diff_expression_ref

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;
        """

        log('start saving KBaseCollections.FeatureSet object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression',
                            'element_ordering': feature_ids,
                            'elements': elements}

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': feature_set_data,
                         'name': feature_set_name}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return feature_set_obj_ref

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value,
                             comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs
        """

        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)

            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change_cutoff = row['log2_fold_change']

                null_value = {'NA', 'null', ''}
                col_value = {row_p_value, row_q_value, row_fold_change_cutoff}

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition and q_value_condition and
                                            float(row_fold_change_cutoff) >= comp_fold_change_cutoff)

                    down_matches_condition = (p_value_condition and q_value_condition and
                                              float(row_fold_change_cutoff) <= -comp_fold_change_cutoff)

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))
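
    # Worked example of the cutoffs above (hypothetical row): with comp_p_value=0.05,
    # comp_q_value=0.05 and comp_fold_change_cutoff=1.0, a row with p_value=0.01,
    # q_value=0.02 and log2_fold_change=1.5 lands in up_feature_ids (1.5 >= 1.0),
    # while the same row with log2_fold_change=-1.5 lands in down_feature_ids
    # (-1.5 <= -1.0); any row containing 'NA', 'null' or '' in those columns is skipped.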

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids,
                                  workspace_name, filtered_expression_matrix_suffix="",
                                  diff_expression_matrix_ref=None,
                                  filtered_expression_matrix_name=None):
        """
        _filter_expression_matrix: generated filtered expression matrix
        """

        log('start saving ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects({'object_refs':
                                                     [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if not filtered_expression_matrix_name:
            if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
                filtered_expression_matrix_name = re.sub('_*[Ee]xpression_*[Mm]atrix',
                                                         filtered_expression_matrix_suffix,
                                                         expression_matrix_name)
            else:
                filtered_expression_matrix_name = expression_matrix_name + \
                    filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values
        filtered_expression_matrix_data['data'] = filtered_data

        expression_obj = {'type': expression_matrix_info[2], 'data': filtered_expression_matrix_data,
                          'name': filtered_expression_matrix_name}
        # we now save the filtering DEM ref in an EM field added for this purpose
        if diff_expression_matrix_ref:
            expression_obj['data']['diff_expr_matrix_ref'] = diff_expression_matrix_ref
            expression_obj['extra_provenance_input_refs'] = [diff_expression_matrix_ref]

        save_object_params = {
            'id': workspace_id,
            'objects': [expression_obj]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return filtered_expression_matrix_ref

    def _xor(self, a, b):
        return bool(a) != bool(b)

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:

            label_string = condition_pair['label_string'][0].strip()
            label_list = [x.strip() for x in label_string.split(',')]
            first_label = label_list[0]
            second_label = label_list[1]

            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(first_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(second_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2({'objects':
                                                       [{'ref': diff_expression_set_ref}]
                                                        })['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels |= set(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels
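
    # Example of the label parsing above (hypothetical item labels): an item with
    # label 'wild_type,mutant_A' contributes the pair ['wild_type', 'mutant_A'] and
    # adds both names to available_condition_labels, which _check_input_labels later
    # uses to validate user-supplied condition pairs.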

    def _get_feature_ids(self, genome_ref, ids):
        """
        _get_feature_ids: get feature ids from genome
        """

        genome_features = self.gsu.search({'ref': genome_ref,
                                           'limit': len(ids),
                                           'structured_query': {"$or": [{"feature_id": x}
                                                                        for x in ids]},
                                           'sort_by': [['feature_id', True]]})['features']

        features_ids = set((feature.get('feature_id') for feature in genome_features))

        return features_ids

    def _build_fs_obj(self, params):
        new_feature_set = {
            'description': '',
            'element_ordering': [],
            'elements': {}
        }
        genome_ref = params['genome']
        if params.get('base_feature_sets', []) and None not in params['base_feature_sets']:
            base_feature_sets = self.dfu.get_objects(
                {'object_refs': params['base_feature_sets']}
            )['data']
            for ret in base_feature_sets:
                base_set = ret['data']
                base_set_name = ret['info'][1]

                new_feature_set['element_ordering'] += [x for x in base_set['element_ordering']
                                                        if x not in new_feature_set['elements']]
                for element, genome_refs in base_set['elements'].items():
                    if element in new_feature_set['elements']:
                        new_feature_set['elements'][element] += [x for x in genome_refs if x not in
                                                                 new_feature_set['elements'][
                                                                     element]]
                    else:
                        new_feature_set['elements'][element] = genome_refs
                new_feature_set['description'] += 'From FeatureSet {}: {}\n'.format(
                    base_set_name, base_set.get('description'))
        new_feature_ids = []
        if params.get('feature_ids'):
            if isinstance(params['feature_ids'], str):
                new_feature_ids += params['feature_ids'].split(',')
            else:
                new_feature_ids += params['feature_ids']
        if params.get('feature_ids_custom'):
            new_feature_ids += params['feature_ids_custom'].split(',')
        if new_feature_ids:
            genome_feature_ids = self._get_feature_ids(genome_ref, new_feature_ids)
        for new_feature in new_feature_ids:
            if new_feature not in genome_feature_ids:
                raise ValueError('Feature ID {} does not exist in the supplied genome {}'.format(
                    new_feature, genome_ref))
            if new_feature in new_feature_set['elements']:
                if genome_ref not in new_feature_set['elements'][new_feature]:
                    new_feature_set['elements'][new_feature].append(genome_ref)
            else:
                new_feature_set['elements'][new_feature] = [genome_ref]
                new_feature_set['element_ordering'].append(new_feature)

        if params.get('description'):
            new_feature_set['description'] = params['description']

        return new_feature_set

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferentialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: one of ["linear", "log2+1", "log10+1"]
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated up FeatureSet object references
        down_feature_set_ref_list: list of generated down FeatureSet object references
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object refs
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3({"objects":
                                                            [{"ref": diff_expression_set_ref}]}
                                                            )['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs,
         available_condition_labels) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs, available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [x.strip() for x in label_string.split(',')]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '-'.join(reversed(condition_label_pair))
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression(
                                                                diff_expression_set_ref,
                                                                result_directory,
                                                                condition_label_pair)
            up_feature_ids, down_feature_ids = self._process_matrix_file(
                                                                diff_expr_matrix_file,
                                                                params.get('p_cutoff'),
                                                                params.get('q_cutoff'),
                                                                params.get('fold_change_cutoff'))
            filtered_em_name = _sanitize_name(condition_string) + params.get('filtered_expression_matrix_suffix')
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                                                params.get('expression_matrix_ref'),
                                                up_feature_ids + down_feature_ids,
                                                params.get('workspace_name'), "",
                                                diff_expr_matrix_ref, filtered_em_name)
                filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix', "")
            up_feature_set_name = "{}_{}_up{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            up_feature_set_ref = self._generate_feature_set(up_feature_ids,
                                                            genome_id,
                                                            params.get('workspace_name'),
                                                            up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = "{}_{}_down{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            down_feature_set_ref = self._generate_feature_set(down_feature_ids,
                                                              genome_id,
                                                              params.get('workspace_name'),
                                                              down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {'result_directory': result_directory,
                     'up_feature_set_ref_list': up_feature_set_ref_list,
                     'down_feature_set_ref_list': down_feature_set_ref_list,
                     'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list}

        report_output = self._generate_report(up_feature_set_ref_list, down_feature_set_ref_list,
                                              filtered_expression_matrix_ref_list,
                                              params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal

    def filter_matrix_with_fs(self, params):
        self.validate_params(params, ('feature_set_ref', 'workspace_name',
                                      'expression_matrix_ref', 'filtered_expression_matrix_suffix'))
        ret = self.dfu.get_objects(
            {'object_refs': [params['feature_set_ref']]}
        )['data'][0]
        feature_set = ret['data']
        feature_set_name = ret['info'][1]
        feature_ids = set(feature_set['elements'].keys())
        filtered_matrix_ref = self._filter_expression_matrix(
            params['expression_matrix_ref'], feature_ids, params['workspace_name'],
            params['filtered_expression_matrix_suffix'])

        objects_created = [{'ref': filtered_matrix_ref,
                            'description': 'Filtered ExpressionMatrix Object'}]
        message = "Filtered Expression Matrix based of the {} feature ids present in {}"\
            .format(len(feature_ids), feature_set_name)

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'filtered_expression_matrix_ref': filtered_matrix_ref,
                'report_name': output['name'], 'report_ref': output['ref']}

    def build_feature_set(self, params):
        self.validate_params(params, {'output_feature_set', 'workspace_name', },
                             {'genome', 'feature_ids', 'feature_ids_custom', 'base_feature_sets',
                              'description'})
        feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets')
        if not any([params.get(x) for x in feature_sources]):
            raise ValueError("You must supply at least one feature source: {}".format(
                ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': new_feature_set,
                         'name': params['output_feature_set']}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        objects_created = [{'ref': feature_set_obj_ref,
                            'description': 'Feature Set'}]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'feature_set_ref': feature_set_obj_ref,
                'report_name': output['name'], 'report_ref': output['ref']}
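
# Both _generate_feature_set and _build_fs_obj above assemble the same
# KBaseCollections.FeatureSet structure (description, element_ordering, and an
# elements mapping from feature id to a list of genome refs) before saving it
# through DataFileUtil.save_objects. A minimal standalone sketch; the feature ids
# and the '123/4/5' genome ref are hypothetical:
def build_feature_set_data(feature_ids, genome_ref, description=''):
    # elements maps each feature id to the list of genomes it belongs to
    return {
        'description': description,
        'element_ordering': list(feature_ids),
        'elements': {feature_id: [genome_ref] for feature_id in feature_ids},
    }


example_fs = build_feature_set_data(['gene_1', 'gene_2'], '123/4/5',
                                    'Generated FeatureSet from DifferentialExpression')
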
Exemplo n.º 32
0
class BwaAligner:
    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url,
                 provenance):
        self.scratch_dir = scratch_dir
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.provenance = provenance

        # from the provenance, extract out the version to run by exact hash if possible
        self.my_version = 'release'
        if len(provenance) > 0:
            if 'subactions' in provenance[0]:
                self.my_version = self.get_version_from_subactions(
                    'kb_Bwa', provenance[0]['subactions'])
        print('Running kb_Bwa version = ' + self.my_version)

        self.ws = Workspace(self.workspace_url)
        self.bwa = BwaRunner(self.scratch_dir)
        self.parallel_runner = KBParallel(self.callback_url)
        self.qualimap = kb_QualiMap(self.callback_url)

    def get_version_from_subactions(self, module_name, subactions):
        # go through each subaction looking for this module's commit hash
        if not subactions:
            return 'release'  # default to release if we can't find anything
        for sa in subactions:
            if 'name' in sa:
                if sa['name'] == module_name:
                    # local-docker-image implies that we are running in kb-test, so return 'dev'
                    if sa['commit'] == 'local-docker-image':
                        return 'dev'
                    # to check that it is a valid hash, make sure it is the right
                    # length and made up of valid hash characters
                    if re.match('[a-fA-F0-9]{40}$', sa['commit']):
                        return sa['commit']
        # again, default to setting this to release
        return 'release'
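
    # Example of the resolution above (hypothetical provenance subaction): an entry
    # {'name': 'kb_Bwa', 'commit': '0123456789abcdef0123456789abcdef01234567'} returns
    # that 40-character hash, a commit of 'local-docker-image' returns 'dev', and
    # anything else falls back to 'release'.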

    def align(self, params):
        validated_params = self.validate_params(params)
        input_info = self.determine_input_info(validated_params)
        # input info provides information on the input and tells us if we should
        # run as a single_library or as a set:
        #     input_info = {'run_mode': '', 'info': [..], 'ref': '55/1/2'}

        assembly_or_genome_ref = validated_params['assembly_or_genome_ref']

        if input_info['run_mode'] == 'single_library':
            if 'output_alignment_name' not in validated_params:
                suffix = '_alignment'
                if 'output_alignment_suffix' in validated_params:
                    suffix = validated_params['output_alignment_suffix']
                validated_params[
                    'output_alignment_name'] = input_info['info'][1] + suffix
            single_lib_result = self.single_reads_lib_run(
                input_info,
                assembly_or_genome_ref,
                validated_params,
                create_report=validated_params['create_report'])

            return single_lib_result

        if input_info['run_mode'] == 'sample_set':
            reads = self.fetch_reads_refs_from_sampleset(
                input_info['ref'], input_info['info'], validated_params)
            self.build_bwa_index(assembly_or_genome_ref,
                                 validated_params['output_workspace'])

            print('Running on set of reads=')
            pprint(reads)

            tasks = []
            for r in reads:
                tasks.append(
                    self.build_single_execution_task(
                        r['ref'], params, r['alignment_output_name'],
                        r['condition']))

            batch_run_params = {
                'tasks': tasks,
                'runner': 'parallel',
                'max_retries': 2
            }
            if validated_params['concurrent_local_tasks'] is not None:
                batch_run_params['concurrent_local_tasks'] = validated_params[
                    'concurrent_local_tasks']
            if validated_params['concurrent_njsw_tasks'] is not None:
                batch_run_params['concurrent_njsw_tasks'] = validated_params[
                    'concurrent_njsw_tasks']
            results = self.parallel_runner.run_batch(batch_run_params)
            print('Batch run results=')
            pprint(results)
            batch_result = self.process_batch_result(results, validated_params,
                                                     reads, input_info['info'])
            return batch_result

        raise ValueError('Improper run mode: ' + str(input_info['run_mode']))

    def build_single_execution_task(self, reads_lib_ref, params, output_name,
                                    condition):
        task_params = copy.deepcopy(params)

        task_params['input_ref'] = reads_lib_ref
        task_params['output_alignment_name'] = output_name
        task_params['create_report'] = 0
        task_params['condition_label'] = condition

        return {
            'module_name': 'kb_Bwa',
            'function_name': 'align_reads_to_assembly_app',
            'version': self.my_version,
            'parameters': task_params
        }

    def single_reads_lib_run(self,
                             read_lib_info,
                             assembly_or_genome_ref,
                             validated_params,
                             create_report=False,
                             bwa_index_info=None):
        ''' Run the aligner on a single reads library '''

        # download reads and prepare any bwa index files
        input_configuration = self.prepare_single_run(
            read_lib_info, assembly_or_genome_ref, bwa_index_info,
            validated_params['output_workspace'])

        # run the actual program
        run_output_info = self.run_bwa_align_cli(input_configuration,
                                                 validated_params)

        # process the result and save the output
        upload_results = self.save_read_alignment_output(
            run_output_info, input_configuration, validated_params)
        run_output_info['upload_results'] = upload_results

        report_info = None
        if create_report:
            report_info = self.create_report_for_single_run(
                run_output_info, input_configuration, validated_params)

        self.clean(run_output_info)

        return {'output_info': run_output_info, 'report_info': report_info}

    def build_bwa_index(self, assembly_or_genome_ref, ws_for_cache):
        bwaIndexBuilder = BwaIndexBuilder(self.scratch_dir, self.workspace_url,
                                          self.callback_url, self.srv_wiz_url,
                                          self.provenance)

        return bwaIndexBuilder.get_index({
            'ref': assembly_or_genome_ref,
            'ws_for_cache': ws_for_cache
        })

    def prepare_single_run(self, input_info, assembly_or_genome_ref,
                           bwa_index_info, ws_for_cache):
        ''' Given a reads ref and an assembly, setup the bwa index '''
        # first setup the bwa index of the assembly
        input_configuration = {'bwa_index_info': bwa_index_info}
        if not bwa_index_info:
            bwaIndexBuilder = BwaIndexBuilder(self.scratch_dir,
                                              self.workspace_url,
                                              self.callback_url,
                                              self.srv_wiz_url,
                                              self.provenance)

            index_result = bwaIndexBuilder.get_index({
                'ref': assembly_or_genome_ref,
                'ws_for_cache': ws_for_cache
            })
            input_configuration['bwa_index_info'] = index_result

        # next download the reads
        read_lib_ref = input_info['ref']
        read_lib_info = input_info['info']
        reads_params = {
            'read_libraries': [read_lib_ref],
            'interleaved': 'false',
            'gzipped': None
        }
        ru = ReadsUtils(self.callback_url)
        reads = ru.download_reads(reads_params)['files']

        input_configuration['reads_lib_type'] = self.get_type_from_obj_info(
            read_lib_info).split('.')[1]
        input_configuration['reads_files'] = reads[read_lib_ref]
        input_configuration['reads_lib_ref'] = read_lib_ref
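
        # Sketch of the input_configuration returned below (values illustrative,
        # keys taken from how this dict is consumed in run_bwa_align_cli):
        #   {'bwa_index_info': {'output_dir': ..., 'index_files_basename': ...},
        #    'reads_lib_type': 'PairedEndLibrary',  # or 'SingleEndLibrary'
        #    'reads_files': {'files': {'fwd': '/path/fwd.fastq', 'rev': '/path/rev.fastq'}},
        #    'reads_lib_ref': '55/1/2'}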

        return input_configuration

    def run_bwa_align_cli(self, input_configuration, validated_params):
        # pprint('======== input_configuration =====')
        # pprint(input_configuration)
        options = []
        run_output_info = {}

        # set the bwa index location
        bt2_index_dir = input_configuration['bwa_index_info']['output_dir']

        bt2_index_basename = input_configuration['bwa_index_info'][
            'index_files_basename']
        #options.extend(['-x', bt2_index_basename])

        reference = os.path.join(bt2_index_dir, bt2_index_basename)

        options_r = []
        options_l = []

        options.append(reference)
        options_r.append(reference)
        options_l.append(reference)

        output_dir = os.path.join(
            self.scratch_dir,
            'bwa_alignment_output_' + str(int(time.time() * 10000)))
        output_sam_file = os.path.join(output_dir, 'reads_alignment.sam')
        os.makedirs(output_dir)

        # set the input reads
        sam_parameter = ''
        if input_configuration['reads_lib_type'] == 'SingleEndLibrary':
            options.extend(
                ['-0', input_configuration['reads_files']['files']['fwd']])
            run_output_info['library_type'] = 'single_end'
            output_sai_file = os.path.join(output_dir,
                                           bt2_index_basename) + ".sai"
            options.extend(["-f", output_sai_file])
            self.bwa.run('aln', options, cwd=bt2_index_dir)
            sam_parameter = 'samse'
            options2 = []
            options2.append(reference)
            options2.append(output_sai_file)
            options2.append(input_configuration['reads_files']['files']['fwd'])
            options2.extend(["-f", output_sam_file])
            self.bwa.run(sam_parameter, options2, cwd=bt2_index_dir)
        elif input_configuration['reads_lib_type'] == 'PairedEndLibrary':
            options_l.extend(
                ['-1', input_configuration['reads_files']['files']['fwd']])
            output_l_sai_file = os.path.join(output_dir,
                                             bt2_index_basename) + "_l.sai"
            options_l.extend(["-f", output_l_sai_file])
            self.bwa.run('aln', options_l, cwd=bt2_index_dir)
            options_r.extend(
                ['-2', input_configuration['reads_files']['files']['rev']])
            output_r_sai_file = os.path.join(output_dir,
                                             bt2_index_basename) + "_r.sai"
            options_r.extend(["-f", output_r_sai_file])
            self.bwa.run('aln', options_r, cwd=bt2_index_dir)
            sam_parameter = 'sampe'
            options2 = []
            options2.append(reference)
            options2.append(output_r_sai_file)
            options2.append(output_l_sai_file)
            options2.append(input_configuration['reads_files']['files']['rev'])
            options2.append(input_configuration['reads_files']['files']['fwd'])
            options2.extend(["-f", output_sam_file])
            self.bwa.run(sam_parameter, options2, cwd=bt2_index_dir)
            run_output_info['library_type'] = 'paired_end'
        # For reference, the equivalent shell workflow for the single-end case:
        #   bwa aln -I -t 8 reference.fa reads.txt > out.sai
        #   bwa samse reference.fa out.sai reads.txt > out.sam
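        # A hedged sketch of the corresponding paired-end workflow (file names are
        # illustrative; the argument order mirrors the sampe call built above):
        #   bwa aln reference.fa reads_1.fq > out_1.sai
        #   bwa aln reference.fa reads_2.fq > out_2.sai
        #   bwa sampe reference.fa out_1.sai out_2.sai reads_1.fq reads_2.fq > out.sam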

        # record the output file locations
        run_output_info['output_sam_file'] = output_sam_file
        run_output_info['output_dir'] = output_dir

        return run_output_info

    def save_read_alignment_output(self, run_output_info, input_configuration,
                                   validated_params):
        rau = ReadsAlignmentUtils(self.callback_url)
        destination_ref = validated_params['output_workspace'] + '/' + \
            validated_params['output_alignment_name']
        condition = 'unknown'
        if 'condition_label' in validated_params:
            condition = validated_params['condition_label']
        upload_params = {
            'file_path': run_output_info['output_sam_file'],
            'destination_ref': destination_ref,
            'read_library_ref': input_configuration['reads_lib_ref'],
            'assembly_or_genome_ref':
            validated_params['assembly_or_genome_ref'],
            'condition': condition
        }
        upload_results = rau.upload_alignment(upload_params)
        return upload_results

    def clean(self, run_output_info):
        ''' Not really necessary on a single run, but if we are running multiple local subjobs, we
        should clean up files that have already been saved back up to kbase '''
        pass

    def create_report_for_single_run(self, run_output_info,
                                     input_configuration, validated_params):
        # first run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': run_output_info['upload_results']['obj_ref']})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create report
        report_text = 'Ran on a single reads library.\n\n'
        alignment_info = self.get_obj_info(
            run_output_info['upload_results']['obj_ref'])
        report_text += 'Created ReadsAlignment: ' + str(alignment_info[1]) + '\n'
        report_text += '                        ' + \
            run_output_info['upload_results']['obj_ref'] + '\n'
        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({
            'message': report_text,
            'objects_created': [{
                'ref': run_output_info['upload_results']['obj_ref'],
                'description': 'ReadsAlignment'
            }],
            'report_object_name': 'kb_Bwa_' + str(uuid.uuid4()),
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'workspace_name': validated_params['output_workspace']
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

    def process_batch_result(self, batch_result, validated_params, reads,
                             input_set_info):

        n_jobs = len(batch_result['results'])
        n_success = 0
        n_error = 0
        ran_locally = 0
        ran_njsw = 0

        # reads alignment set items
        items = []
        objects_created = []

        for k, job in enumerate(batch_result['results']):
            result_package = job['result_package']
            if job['is_error']:
                n_error += 1
            else:
                n_success += 1
                print(result_package['result'])
                print(result_package['result'][0])
                print(result_package['result'][0]['output_info'])
                output_info = result_package['result'][0]['output_info']
                ra_ref = output_info['upload_results']['obj_ref']
                # Note: could add a label to the alignment here?
                items.append({'ref': ra_ref, 'label': reads[k]['condition']})
                objects_created.append({'ref': ra_ref})

            if result_package['run_context']['location'] == 'local':
                ran_locally += 1
            if result_package['run_context']['location'] == 'njsw':
                ran_njsw += 1

        # Save the alignment set
        alignment_set_data = {'description': '', 'items': items}
        alignment_set_save_params = {
            'data': alignment_set_data,
            'workspace': validated_params['output_workspace'],
            'output_object_name': str(input_set_info[1]) +
                                  validated_params['output_obj_name_suffix']
        }

        set_api = SetAPI(self.srv_wiz_url)
        save_result = set_api.save_reads_alignment_set_v1(
            alignment_set_save_params)
        print('Saved ReadsAlignment=')
        pprint(save_result)
        objects_created.append({
            'ref': save_result['set_ref'],
            'description': 'Set of all reads alignments generated'
        })
        set_name = save_result['set_info'][1]

        # run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': save_result['set_ref']})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create the report
        report_text = 'Ran on SampleSet or ReadsSet.\n\n'
        report_text += 'Created ReadsAlignmentSet: ' + str(set_name) + '\n\n'
        report_text += 'Total ReadsLibraries = ' + str(n_jobs) + '\n'
        report_text += '        Successful runs = ' + str(n_success) + '\n'
        report_text += '            Failed runs = ' + str(n_error) + '\n'
        report_text += '       Ran on main node = ' + str(ran_locally) + '\n'
        report_text += '   Ran on remote worker = ' + str(ran_njsw) + '\n\n'

        print('Report text=')
        print(report_text)

        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({
            'message': report_text,
            'objects_created': objects_created,
            'report_object_name': 'kb_Bwa_' + str(uuid.uuid4()),
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'workspace_name': validated_params['output_workspace']
        })

        result = {
            'report_info': {
                'report_name': report_info['name'],
                'report_ref': report_info['ref']
            }
        }
        result['batch_output_info'] = batch_result

        return result

    def validate_params(self, params):
        validated_params = {}

        required_string_fields = [
            'input_ref', 'assembly_or_genome_ref', 'output_obj_name_suffix',
            'output_workspace'
        ]
        for field in required_string_fields:
            if field in params and params[field]:
                validated_params[field] = params[field]
            else:
                raise ValueError('"' + field +
                                 '" field required to run bwa aligner app')

        optional_fields = [
            'quality_score', 'alignment_type', 'preset_options', 'trim5',
            'trim3', 'condition_label', 'np', 'minins', 'maxins',
            'output_alignment_suffix', 'output_alignment_name'
        ]
        for field in optional_fields:
            if field in params:
                if params[field] is not None:
                    validated_params[field] = params[field]

        validated_params['create_report'] = True
        if 'create_report' in params and params['create_report'] is not None:
            if int(params['create_report']) == 1:
                validated_params['create_report'] = True
            elif int(params['create_report']) == 0:
                validated_params['create_report'] = False
            else:
                raise ValueError(
                    '"create_report" field, if present, should be set to a boolean value: 0 or 1'
                )

        validated_params['concurrent_local_tasks'] = None
        validated_params['concurrent_njsw_tasks'] = None

        if params.get('concurrent_local_tasks') is not None:
            validated_params['concurrent_local_tasks'] = \
                int(params['concurrent_local_tasks'])
        if params.get('concurrent_njsw_tasks') is not None:
            validated_params['concurrent_njsw_tasks'] = \
                int(params['concurrent_njsw_tasks'])

        return validated_params
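
    # Illustrative only: a minimal params dict that would pass validation
    # (refs and names are hypothetical):
    #   {'input_ref': '55/1/2',
    #    'assembly_or_genome_ref': '55/3/1',
    #    'output_obj_name_suffix': '_alignment_set',
    #    'output_workspace': 'my_workspace',
    #    'create_report': 1}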

    def fetch_reads_refs_from_sampleset(self, ref, info, validated_params):
        """
        Note: adapted from kbaseapps/kb_hisat2 - file_util.py
        From the given object ref, return a list of all reads objects that are a part of that
        object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or SingleEndLibrary
        refs that are a member of that ReadsSet. This is returned as a list of dictionaries as follows:
        {
            "ref": reads object reference,
            "condition": condition string associated with that reads object
        }
        The only required key is "ref"; the other keys may or may not be present, depending
        on the object type behind the initial ref. E.g. an RNASeqSampleSet may carry
        condition info for each reads object, while other set types may not.
        Note: unlike the kb_hisat2 original, this version raises a ValueError if ref is not
        a ReadsSet or RNASeqSampleSet (a bare reads library is not simply wrapped in a list).
        """
        obj_type = self.get_type_from_obj_info(info)
        refs = list()
        refs_for_ws_info = list()
        if "KBaseSets.ReadsSet" in obj_type or "KBaseRNASeq.RNASeqSampleSet" in obj_type:
            print("Looking up reads references in ReadsSet object")
            set_api = SetAPI(self.srv_wiz_url)
            reads_set = set_api.get_reads_set_v1({
                'ref': ref,
                'include_item_info': 0,
                'include_set_item_ref_paths': 1
            })

            for reads in reads_set["data"]["items"]:
                refs.append({
                    'ref': reads['ref_path'],
                    'condition': reads['label']
                })
                refs_for_ws_info.append({'ref': reads['ref_path']})
        else:
            raise ValueError("Unable to fetch reads reference from object {} "
                             "which is a {}".format(ref, obj_type))

        # get object info so we can name things properly
        infos = self.ws.get_object_info3({'objects':
                                          refs_for_ws_info})['infos']

        name_ext = '_alignment'
        if 'output_alignment_suffix' in validated_params \
                and validated_params['output_alignment_suffix'] is not None:
            ext = validated_params['output_alignment_suffix'].replace(' ', '')
            if ext:
                name_ext = ext

        unique_name_lookup = {}
        for k in range(0, len(refs)):
            refs[k]['info'] = infos[k]
            name = infos[k][1]
            if name not in unique_name_lookup:
                unique_name_lookup[name] = 1
            else:
                unique_name_lookup[name] += 1
                name = name + '_' + str(unique_name_lookup[name])
            name = name + name_ext
            refs[k]['alignment_output_name'] = name

        return refs
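
    # Sketch of one element of the returned list (values hypothetical; 'info' is the
    # object_info tuple fetched above):
    #   {'ref': '55/7/1;55/9/1', 'condition': 'control',
    #    'info': [...], 'alignment_output_name': 'my_reads_lib_alignment'}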

    def determine_input_info(self, validated_params):
        ''' get info on the input_ref object and determine if we run once or run on a set '''
        info = self.get_obj_info(validated_params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in [
                'KBaseAssembly.PairedEndLibrary',
                'KBaseAssembly.SingleEndLibrary', 'KBaseFile.PairedEndLibrary',
                'KBaseFile.SingleEndLibrary'
        ]:
            return {
                'run_mode': 'single_library',
                'info': info,
                'ref': validated_params['input_ref']
            }
        if obj_type in ['KBaseRNASeq.RNASeqSampleSet', 'KBaseSets.ReadsSet']:
            return {
                'run_mode': 'sample_set',
                'info': info,
                'ref': validated_params['input_ref']
            }

        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
Exemplo n.º 33
0
class GFFUtils2:
    def __init__(self, config):
        self.callback_url = config['callback_url']
        self.shared_folder = config['scratch']
        #self.shared_folder = "/kb/module/work"
        self.ws_url = config['workspace-url']

        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)

    def _prep_gff(self, gff_file):
        outfile = os.path.join(self.genome_dir, 'out.gff')
        sortcmd = f'(grep ^"#"  {gff_file}; grep -v ^"#" {gff_file} | sort -k1,1 -k4,4n)'

        with open(outfile, 'w') as o:
            p = subprocess.Popen(sortcmd, shell=True, stdout=o)
            p.communicate()

        bgzip = subprocess.Popen(['bgzip', 'out.gff'], cwd=self.genome_dir)
        bgzip.communicate()

        outfile += '.gz'

        return outfile
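
    # Roughly equivalent shell steps (paths illustrative; comments first, then
    # features sorted by contig and start so tabix can index the result):
    #   (grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) > out.gff
    #   bgzip out.gff   # yields out.gff.gz, which this method returns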

    def _construct_gff_from_json(self, json, gff_file_path, contig_base_lengths):
        with open(gff_file_path, 'w') as f:
            for feature in json:
                if feature['feature_type'].strip().upper() == 'GENE':
                    end = int(feature['location'][0]['start'])+int(feature['location'][0]['length'])

                    metainfo = "ID="+feature['feature_id']

                    if feature['function']:
                        metainfo += ';FUNCTION='+feature['function']

                    contig_id = str(feature['location'][0]['contig_id'])
                    start = int(feature['location'][0]['start'])

                    # TODO: Fix Plink reassignment of Chr prefixes
                    try:
                        global_pos = int(contig_base_lengths[contig_id]) + start
                    except KeyError:
                        try:
                            global_pos = int(contig_base_lengths[contig_id.capitalize()]) + start
                        except KeyError:
                            try:
                                global_pos = int(contig_base_lengths['Chr'+str(contig_id)]) + start
                            except KeyError:
                                try:
                                    global_pos = int(contig_base_lengths['Chr0'+str(contig_id)]) + start
                                except KeyError:
                                    pp(contig_base_lengths)
                                    pp(contig_id)
                                    raise KeyError('contig id {} not found in '
                                                   'contig_base_lengths'.format(contig_id))

                    """
                    Remove ontology for now
                    if feature['ontology_terms']:
                        metainfo += ';ONTOLOGY('

                        for k, v in feature['ontology_terms'].items():
                            metainfo += str(k) + ',' + str(v) + ':'

                        metainfo = metainfo[:-1]  # remove trailing ;
                        metainfo += ')'
                    """

                    constructed_gff_line = str(feature['location'][0]['contig_id']) + '\t' + \
                                           'KBase\tgene\t' + \
                                           str(feature['location'][0]['start']) + '\t' + \
                                           str(end) + '\t.\t' + \
                                           str(feature['location'][0]['strand']) + '\t' + \
                                           str(global_pos) + '\t' + \
                                           str(metainfo) + '\n'
                    f.write(constructed_gff_line)
        if os.path.exists(gff_file_path):
            return gff_file_path
        else:
            raise FileNotFoundError('Unable to create GFF file from genome JSON.')

    def _process_tabix_results(self, queryresult):
        queryinfo = queryresult[8].split(';')
        if len(queryinfo) >= 2:
            # strip the leading 'ID=' and 'FUNCTION=' tags from the attribute fields
            extension = [clean_tsv_data(queryinfo[0][3:]), "NA", clean_tsv_data(queryinfo[1][9:])]
        elif len(queryinfo) == 1:
            extension = [clean_tsv_data(queryinfo[0][3:]), "NA", "NA"]
        else:
            extension = ['NA', 'NA', 'NA']
        return extension
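
    # Illustrative only: for a GFF attribute column such as
    #   "ID=gene_1;FUNCTION=hypothetical protein"
    # the slices above strip the 'ID=' and 'FUNCTION=' prefixes, yielding
    #   ['gene_1', 'NA', 'hypothetical protein']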

    def find_gene_info(self, row):
        tb = tabix_query(self.sorted_gff, row["CHR"], int(row["POS"]), int(row["POS"]))
        tbresult = next(tb, None)
        if tbresult is None:
            tb2 = tabix_query(self.sorted_gff, 'chr' + row["CHR"], int(row["POS"]), int(row["POS"]))
            tbresult2 = next(tb2, None)
            if tbresult2 is None:
                tb3 = tabix_query(self.sorted_gff, 'chr0' + row["CHR"], int(row["POS"]), int(row["POS"]))
                tbresult3 = next(tb3, None)
                if tbresult3 is None:
                    if int(row["POS"]) < 500:
                        nstart = 0
                    else:
                        nstart = int(row["POS"]) - 500

                    neigh_tb = tabix_query(self.sorted_gff, row["CHR"], nstart, int(row["POS"]) + 500)
                    neigh_result = next(neigh_tb, None)

                    if neigh_result is None:
                        return pd.Series(['NA', 'NA', 'NA'], index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
                    else:
                        nq = self._process_tabix_results(neigh_result)
                        return pd.Series([nq[1], nq[0], nq[2]], index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
                else:
                    q3 = self._process_tabix_results(tbresult3)
                    return pd.Series(q3, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
            else:
                q2 = self._process_tabix_results(tbresult2)
                return pd.Series(q2, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
        else:
            q = self._process_tabix_results(tbresult)
            return pd.Series(q, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])

    def get_gwas_result_file(self, association_ref, association_name, p_value):
        #association_obj = self.dfu.get_objects({'object_refs': [association_ref]})['data'][0]['data']['data']
        association_obj = self.dfu.get_objects({'object_refs': [association_ref]})['data'][0]
        association_results = association_obj['data']["association_details"][0]["association_results"]
        result = "CHR\tSNP\tPOS\tP\tBP\n"
        for variation in association_results:
            if float(variation[3]) > float(p_value):
                continue
            result += str(variation[0]) + "\t"
            result += str(variation[1]) + "\t"
            result += str(variation[2]) + "\t"
            result += str(variation[3]) + "\t"
            result += str(variation[2]) + "\n"
        filepath = os.path.join(self.genome_dir, association_name)
        with open(filepath, "w") as file1:
            file1.write(result)
        return filepath
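
    # Sketch of the TSV written above (values hypothetical; note that BP simply
    # repeats the POS column, mirroring the code):
    #   CHR  SNP      POS     P        BP
    #   1    snp_001  104523  0.00012  104523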

    def build_featureset(self, filepath, genome_ref, description, workspace_name, association_name, prefix):
        gene_ids = dict()
        element_ordering = list()
        elements = dict()
        skip_words = ["GENEID", "NEIGHBORGENE", "NA"]
        with open(filepath, 'r') as reader:
            for line in reader:
                fields = line.split("\t")
                condition1 = fields[5] not in skip_words
                condition2 = fields[5] not in elements
                condition3 = fields[6] not in skip_words
                condition4 = fields[6] not in elements
                if condition1 and condition2:
                    element_ordering.append(fields[5])
                    elements[fields[5]] = [genome_ref]
                if condition3 and condition4:
                    element_ordering.append(fields[6])
                    elements[fields[6]] = [genome_ref]
        featureset = dict()
        featureset['description'] = description
        featureset['element_ordering'] = element_ordering
        featureset['elements'] = elements
        ws_id = self.dfu.ws_name_to_id(workspace_name)
        featureset_obj_name = prefix + str(association_name)

        save_info = self.dfu.save_objects({
            'id': ws_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': featureset,
                         'name': featureset_obj_name}]})[0]
        obj_ref = "{0}/{1}/{2}".format(save_info[6], save_info[0], save_info[4])
        return obj_ref
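
    # Illustrative only: the KBaseCollections.FeatureSet data saved above has roughly
    # this shape (gene ids hypothetical):
    #   {'description': description,
    #    'element_ordering': ['gene_1', 'gene_2'],
    #    'elements': {'gene_1': [genome_ref], 'gene_2': [genome_ref]}}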


   
    def annotate_GWAS_results(self, genome_ref, association_ref, workspace_name, prefix, p_value):
        # TODO: pass the outfile to the gff prep function instead of hard-coding the path
        # TODO: remove the hard-coded paths and create a new directory for each test function
        self.genome_dir_name = "_".join(genome_ref.split("/"))
        self.genome_dir = os.path.join(self.shared_folder, self.genome_dir_name)
        if not os.path.isdir(self.genome_dir):
            os.mkdir(self.genome_dir)
        sorted_gff_path = os.path.join(self.genome_dir, 'out.gff.gz')
        self.sorted_gff = sorted_gff_path

        if not os.path.exists(sorted_gff_path):
            feature_num = self.gsu.search({'ref': genome_ref})['num_found']
            # get genome features for gff construction
            genome_features = self.gsu.search({
                'ref': genome_ref,
                'limit': feature_num,
                #'sort_by': [['feature_id', True]]
            })['features']

            assembly_ref = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])[0]['data']['assembly_ref']

            # get assembly contigs for base length calculations
            assembly_contigs = self.wsc.get_object_subset([{
                'included': ['/contigs'],
                'ref': assembly_ref
            }])[0]['data']['contigs']

            contig_ids = list(assembly_contigs.keys())
            contig_ids.sort()

            contig_base_lengths = {}
            prev_length = 0

            for contig in contig_ids:
                contig_base_lengths[contig] = prev_length
                prev_length += assembly_contigs[contig]['length']

            gff_file = os.path.join(self.genome_dir, 'constructed.gff')
            constructed_gff = self._construct_gff_from_json(genome_features, gff_file, contig_base_lengths)
            self.sorted_gff = self._prep_gff(constructed_gff)
            tabix_index(self.sorted_gff)

        obj_info = self.wsc.get_object_info3({"objects": [{"ref": association_ref}]})
        association_name = obj_info["infos"][0][1]

        gwas_results_file = self.get_gwas_result_file(association_ref, association_name, p_value)

        gwas_results = pd.read_csv(gwas_results_file, sep='\t')

        gwas_results[['GENEID', 'NEIGHBORGENE', 'FUNCTION']] = \
           gwas_results.apply(self.find_gene_info, axis=1)

        new_results_path = os.path.abspath(os.path.join(gwas_results_file, '..'))
        fname = 'final_' + association_name
        new_results_path = os.path.join(new_results_path, fname)
        gwas_results.to_csv(path_or_buf=new_results_path, sep='\t', index=False)
        description = "Genelist for GWAS results of trait " + association_name
         
        featureset_obj = self.build_featureset( new_results_path, genome_ref, description, workspace_name, association_name, prefix)
        
        return featureset_obj
Exemplo n.º 34
0
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-'+str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']

        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        with open(files['file_path'], 'w') as outfile:
            writer = csv.DictWriter(outfile, header, delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files
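
    # Sketch of one row of the exported TSV (values hypothetical):
    #   Feature Id  Aliases   Genome              Type  Function
    #   gene_1      alias_a   my_genome (55/3/1)  gene  hypothetical protein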

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)
                                   })

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(sorted(feat['aliases'].keys())),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']
                                 })
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}