def fetch_narrative_data(endpt: str, token: str, ws_id: int,
                         outdir: str) -> int:
    ws = Workspace(url=endpt + "ws", token=token)
    ws_info = ws.get_workspace_info({"id": ws_id})
    ws_meta = ws_info[8]

    # Narrative object
    narr_id = ws_meta["narrative"]
    narr_obj = ws.get_objects2({"objects": [{
        "ref": f"{ws_id}/{narr_id}"
    }]})["data"][0]
    narr_ver = narr_obj["info"][4]
    narr_outpath = os.path.join(
        outdir, f"narrative-{ws_id}.{narr_id}.{narr_ver}.json")
    with open(narr_outpath, "w") as fout:
        json.dump(narr_obj, fout, indent=4)

    # Report objects
    for cell in narr_obj["data"]["cells"]:
        if "kbase" in cell["metadata"]:
            meta = cell["metadata"]["kbase"]
            if "appCell" in meta:
                job_state = meta["appCell"].get("exec", {}).get("jobState")
                result = list()
                if "result" in job_state:
                    result = job_state["result"]
                elif "job_output" in job_state and "result" in job_state[
                        "job_output"]:
                    result = job_state["job_output"]["result"]
                if len(result) > 0 and "report_ref" in result[0]:
                    report_data = ws.get_objects2(
                        {"objects": [{
                            "ref": result[0]["report_ref"]
                        }]})["data"][0]
                    report_info = report_data["info"]
                    ref_dots = f"{report_info[6]}.{report_info[0]}.{report_info[4]}"
                    report_path = os.path.join(outdir,
                                               f"report-{ref_dots}.json")
                    with open(report_path, "w") as fout:
                        json.dump(report_data, fout, indent=4)

    # List the workspace's data objects via NarrativeService
    service = NarrativeService(url=endpt + "service_wizard", token=token)
    # service = ServiceClient(url=endpt + "service_wizard", use_url_lookup=True, token=token)
    ws_data = service.list_objects_with_sets({
        "ws_id": ws_id,
        "includeMetadata": 1
    })
    # ws_data = service.sync_call(
    #     "NarrativeService.list_objects_with_sets",
    #     [{"ws_id": ws_id, "includeMetadata": 1}]
    # )[0]
    data_outpath = os.path.join(outdir, f"objects-{ws_id}.json")
    with open(data_outpath, "w") as fout:
        json.dump(ws_data, fout, indent=4)

    return 0
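
A minimal usage sketch for the function above, assuming the usual KBase base-endpoint layout; the endpoint, token, and workspace id below are placeholders.

# Hypothetical invocation -- all values are illustrative only.
out_dir = "/tmp/narrative_dump"
os.makedirs(out_dir, exist_ok=True)
rc = fetch_narrative_data(
    endpt="https://kbase.us/services/",  # base URL; the function appends "ws"/"service_wizard"
    token="REDACTED_AUTH_TOKEN",         # placeholder auth token
    ws_id=12345,                         # placeholder workspace id
    outdir=out_dir,
)
assert rc == 0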
Example #2
def read_narrative(ref: NarrativeRef, ws_client: Workspace) -> Dict:
    """
    Fetches a Narrative and its object info from the Workspace
    If content is False, this only returns the Narrative's info
    and metadata, otherwise, it returns the whole workspace object.

    This is mainly a wrapper around Workspace.get_objects2(), except that
    it always returns a dict. If content is False, it returns a dict
    containing a single key: 'info', with the object info and, optionally,
    metadata.

    Can the following errors:
        ValueError (if ref isn't a Narrative object),
        WorkspaceError if there's a Workspace issue (ref isn't valid, or token isn't valid)

    :param ref: a NarrativeRef
    :param content: if True, returns the narrative document, otherwise just the metadata
    :param include_metadata: if True, includes the object metadata when returning
    """
    try:
        narr_data = ws_client.get_objects2({'objects': [{'ref': str(ref)}]})
        nar = narr_data['data'][0]
        _validate_narr_type(nar['info'][2], ref)
        # nar['data'] = update_narrative(nar['data'])
        return nar['data']
    except ServerError as err:
        raise WorkspaceError(err, ref.wsid)
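
A sketch of calling read_narrative, assuming a NarrativeRef whose str() form is a Workspace ref like "12345/1"; the URL, token, and ref values are placeholders.

# Hypothetical usage -- the NarrativeRef constructor shape is an assumption.
ws_client = Workspace("https://kbase.us/services/ws", token="REDACTED_AUTH_TOKEN")
ref = NarrativeRef(wsid=12345, objid=1, ver=None)
narrative = read_narrative(ref, ws_client)
print(len(narrative.get("cells", [])), "cells in narrative")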
Example #3
def load_fastas(config, scratch, upa):
    '''
    Returns a list of (fasta_path, upa) tuples for the given object reference.
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs":[upa]})['data'][0]
    obj_type  = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        file_output = os.path.join(scratch, "input_fasta.fa")
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]

    fasta_paths = []
    for genome_upa in upas:
        if upa != genome_upa:
            genome_upa = upa + ';' + genome_upa
        genome_data = ws.get_objects2({'objects': [{"ref": genome_upa}]})['data'][0]['data']
        target_upa = genome_data.get('contigset_ref') or genome_data.get('assembly_ref')
        assembly_upa = genome_upa + ';' + target_upa
        faf = au.get_assembly_as_fasta({"ref": assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))

    return fasta_paths
Example #4
    def build_bin_summary_file_from_binnedcontigs_obj(self, input_ref, bin_dir,
                                                      bin_basename,
                                                      fasta_extension):

        # read bin info from obj
        ws = Workspace(self.ws_url)
        try:
            binned_contig_obj = ws.get_objects2(
                {'objects': [{
                    'ref': input_ref
                }]})['data'][0]['data']
        except Exception as e:
            raise ValueError('Unable to fetch ' + str(input_ref) +
                             ' object from workspace: ' + str(e))
            #to get the full stack trace: traceback.format_exc()
        bin_summary_info = dict()

        # 'bid' in the object is the full contig fasta file name; we want just the number
        for bin_item in binned_contig_obj['bins']:
            #print ("BIN_ITEM[bid]: "+bin_item['bid'])  # DEBUG
            bin_ID = re.sub(r'^[^.]+\.', '',
                            bin_item['bid'].replace('.' + fasta_extension, ''))

            #print ("BIN_ID: "+bin_ID)  # DEBUG
            bin_summary_info[bin_ID] = {
                'n_contigs': bin_item['n_contigs'],
                'gc': round(100.0 * float(bin_item['gc']), 1),
                'sum_contig_len': bin_item['sum_contig_len'],
                'cov': round(100.0 * float(bin_item['cov']), 1)
            }
        # write summary file for just those bins present in bin_dir
        header_line = ['Bin name', 'Completeness', 'Genome size', 'GC content']
        bin_fasta_files_by_bin_ID = self.get_bin_fasta_files(
            bin_dir, fasta_extension)
        bin_IDs = []
        for bin_ID in sorted(bin_fasta_files_by_bin_ID.keys()):
            bin_ID = re.sub(r'^[^.]+\.', '',
                            bin_ID.replace('.' + fasta_extension, ''))
            bin_IDs.append(bin_ID)
        summary_file_path = os.path.join(bin_dir,
                                         bin_basename + '.' + 'summary')

        print("writing filtered binned contigs summary file " +
              summary_file_path)
        with open(summary_file_path, 'w') as summary_file_handle:
            print("\t".join(header_line))
            summary_file_handle.write("\t".join(header_line) + "\n")
            for bin_ID in bin_IDs:
                #print ("EXAMINING BIN SUMMARY INFO FOR BIN_ID: "+bin_ID)  # DEBUG
                bin_summary_info_line = [
                    bin_basename + '.' + str(bin_ID) + '.' + fasta_extension,
                    str(bin_summary_info[bin_ID]['cov']) + '%',
                    str(bin_summary_info[bin_ID]['sum_contig_len']),
                    str(bin_summary_info[bin_ID]['gc'])
                ]
                print("\t".join(bin_summary_info_line))
                summary_file_handle.write("\t".join(bin_summary_info_line) +
                                          "\n")

        return summary_file_path
Example #5
File: FileUtil.py  Project: n1mus/CoverM
def load_fastas(config, scratch: str, upa: str):
    '''
    Returns list of (fasta_path, upa)
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    mgu = MetagenomeUtils(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    obj_type = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        # file_output = os.path.join(scratch, "input_fasta.fa")
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]
    elif "KBaseSets.AssemblySet" in obj_type:
        fasta_paths = []
        for item_upa in obj_data['data']['items']:
            faf = au.get_assembly_as_fasta({"ref": item_upa['ref']})
            fasta_paths.append((faf['path'], item_upa['ref']))
        return fasta_paths
    elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
        fasta_paths = []
        bin_file_dir = mgu.binned_contigs_to_file({
            'input_ref': upa,
            'save_to_shock': 0
        })['bin_file_directory']
        for (dirpath, dirnames, filenames) in os.walk(bin_file_dir):
            for fasta_file in filenames:
                fasta_path = os.path.join(scratch, fasta_file)
                fasta_path = os.path.splitext(fasta_path)[0] + ".fa"
                copyfile(os.path.join(bin_file_dir, fasta_file), fasta_path)
                # Should I verify that the bins have contigs?
                # is it possible to have empty bins?
                fasta_paths.append((fasta_path, upa))
            break
        return fasta_paths
    else:
        # `Error` was undefined in the original; ValueError matches the sibling function
        raise ValueError(f'Input genome/metagenome reference has unhandled type: {obj_type}')

    fasta_paths = []
    for genome_upa in upas:
        genome_data = ws.get_objects2({'objects': [{
            "ref": genome_upa
        }]})['data'][0]['data']
        assembly_upa = genome_upa + ';' + str(
            genome_data.get('contigset_ref')
            or genome_data.get('assembly_ref'))
        faf = au.get_assembly_as_fasta({'ref': assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))

    return fasta_paths
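
Usage sketch for load_fastas, using the config keys the function itself reads ('callback_url', 'workspace-url'); the scratch path and UPA are placeholders.

# Hypothetical call -- values are illustrative only.
config = {
    "callback_url": os.environ.get("SDK_CALLBACK_URL", ""),
    "workspace-url": "https://kbase.us/services/ws",
}
for fasta_path, source_upa in load_fastas(config, scratch="/kb/module/work/tmp", upa="12345/6/7"):
    print(source_upa, "->", fasta_path)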
Example #6
    def check_assembly_cache(self, ref, token):
        ws = Workspace(self.ws_url, token=token)
        info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
        inner_chsum = info[8]
        index_file = os.path.join(self.assembly_index_dir, inner_chsum + self.ASSEMBLY_SUFFIX + ".tsv.gz")
        if not os.path.isfile(index_file):
            if self.debug:
                print("    Loading WS object...")
                t1 = time.time()

            if 'KBaseGenomeAnnotations.Assembly' in info[2]:
                included = ["/contigs"]
                assembly_data = ws.get_objects2(
                    {'objects': [{'ref': ref, 'included': included}]})['data'][0]['data']
                contigs = list(assembly_data['contigs'].values())
                self.save_assembly_tsv(contigs, inner_chsum)

            elif 'KBaseGenomes.ContigSet' in info[2]:
                included = ["/contigs/[*]/id",
                            "/contigs/[*]/length",
                            "/contigs/[*]/md5",
                            "/contigs/[*]/description"]
                cs_data = ws.get_objects2(
                    {'objects': [{'ref': ref, 'included': included}]})['data'][0]['data']
                contigs = []
                for c in cs_data['contigs']:
                    this_contig_data = {'contig_id': ''}
                    if 'id' in c:
                        this_contig_data['contig_id'] = c['id']
                    if 'md5' in c:
                        this_contig_data['md5'] = c['md5']
                    if 'length' in c:
                        this_contig_data['length'] = c['length']
                    if 'description' in c:
                        this_contig_data['description'] = c['description']
                    contigs.append(this_contig_data)

                self.save_assembly_tsv(contigs, inner_chsum)
            else:
                raise ValueError('The "ref" is not an Assembly or ContigSet data object. '
                                 'It was a ' + info[2])

            if self.debug:
                print(f"    (time={time.time() - t1})")
        return inner_chsum
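
The 'included' lists above use the Workspace subdata mechanism: get_objects2 returns only the listed paths inside the object, which keeps large Assembly/ContigSet downloads small. A standalone sketch against a ContigSet-style object (URL, token, and ref are placeholders):

# Fetch only contig ids and lengths -- placeholder ref and credentials.
ws = Workspace("https://kbase.us/services/ws", token="REDACTED_AUTH_TOKEN")
subdata = ws.get_objects2({
    "objects": [{"ref": "12345/6/7",
                 "included": ["/contigs/[*]/id", "/contigs/[*]/length"]}]
})["data"][0]["data"]
print(len(subdata.get("contigs", [])), "contigs summarized")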
Example #7
    def read_assembly_ref_from_binnedcontigs(self, input_ref):
        ws = Workspace(self.ws_url)
        try:
            binned_contig_obj = ws.get_objects2({'objects': [{'ref': input_ref}]})['data'][0]['data']
        except Exception as e:
            raise ValueError('Unable to fetch ' + str(input_ref) + ' object from workspace: ' + str(e))
            #to get the full stack trace: traceback.format_exc()

        return binned_contig_obj['assembly_ref']
Example #8
    def search_orthologs_from_pangenome(self, token, ref, query, sort_by,
                                        start, limit, num_found):

        search_object = 'orthologs'
        info_included = [
            'id', 'type', 'function', 'md5', 'protein_translation', 'orthologs'
        ]
        table_indexer = TableIndexer(token, self.ws_url)

        ret = table_indexer.run_search(ref, self.pangenome_index_dir,
                                       self.ORTHOLOGS_SUFFIX, search_object,
                                       info_included, query, sort_by, start,
                                       limit, num_found, self.debug)

        for orthologs in ret['orthologs']:
            orthologs_string = orthologs['orthologs']
            if orthologs_string:
                # ast.literal_eval safely parses the serialized list-of-lists
                # where the original used eval() (requires "import ast")
                orthologs['orthologs'] = list(ast.literal_eval(orthologs_string))
                if not isinstance(orthologs['orthologs'][0], list):
                    orthologs['orthologs'] = [orthologs['orthologs']]

        ws = Workspace(self.ws_url, token=token)
        genome_feature_function_map = {}
        for orthologs in ret['orthologs']:
            for orthologs_obj in orthologs['orthologs']:
                gene_id = orthologs_obj[0]

                if gene_id in genome_feature_function_map:
                    orthologs_obj.append(
                        genome_feature_function_map.get(gene_id))
                else:
                    included = ["/features/[*]/function", "/features/[*]/id"]
                    object_info = ws.get_objects2({
                        'objects': [{
                            'ref': orthologs_obj[2],
                            'included': included
                        }]
                    })['data'][0]['data']

                    for feature in object_info['features']:
                        genome_feature_function_map.update(
                            {feature.get('id'): feature.get('function')})

                    orthologs_obj.append(
                        genome_feature_function_map.get(gene_id))

        return ret
Example #9
    def check_object_cache(self, ref, search_object, info_included,
                           index_dir, object_suffix, debug):
        ws = Workspace(self.ws_url, token=self.token)
        info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
        inner_chsum = info[8]
        index_file = os.path.join(index_dir,
                                  inner_chsum + object_suffix + ".tsv.gz")
        if not os.path.isfile(index_file):
            if debug:
                print("    Loading WS object...")
                t1 = time.time()

            included = self.build_info_included(search_object, info_included)
            obj = ws.get_objects2({'objects': [{'ref': ref,
                                                'included': included}]})['data'][0]['data']
            self.save_object_tsv(obj[search_object], inner_chsum, info_included,
                                 index_dir, object_suffix)
            if debug:
                print("    (time=" + str(time.time() - t1) + ")")
        return inner_chsum
Example #10
def fetch_pangenome_summary(
        pangenome_ref: str,
        workspace_url: str,
        token: str) -> dict:
    """
    Construct a summary data object for a single pangenome, used in the
    "simple_summary" method.
    Args:
        pangenome_ref: Workspace reference to the pangenome object
        workspace_url: URL of the Workspace being used in the current env
        token: authorization token for fetching the data
    Returns:
        A python object adhering to the SimpleSummaryResult type in
        PanGenomeAPI.spec
    """
    ws_client = Workspace(workspace_url, token=token)
    # Download the full pangenome workspace dataset
    resp = ws_client.get_objects2({
        'objects': [{'ref': pangenome_ref}]
    })
    data = resp['data'][0]['data']
    # Fetch the object infos for each genome
    genome_refs = [{"ref": ref} for ref in data["genome_refs"]]
    genome_infos = ws_client.get_object_info3({
        "objects": genome_refs,
        "includeMetadata": 1
    })["infos"]
    name_mapping = _genome_name_mapping(genome_infos)
    ret = {
        "pangenome_id": data["id"],
        "genomes_count": len(data["genome_refs"]),
        "genes": _count_genes(data),
        "families": _count_families(data),
        "genomes": _genome_counts(data, genome_infos, name_mapping),
        "shared_family_map": _shared_family_map(data, name_mapping),
        "genome_ref_name_map": name_mapping,
    }
    return ret
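
Sketch of calling fetch_pangenome_summary; the ref, URL, and token are placeholders.

summary = fetch_pangenome_summary(
    pangenome_ref="12345/8/1",                     # placeholder Pangenome UPA
    workspace_url="https://kbase.us/services/ws",  # placeholder environment URL
    token="REDACTED_AUTH_TOKEN",
)
print(summary["pangenome_id"], summary["genomes_count"])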
Example #11
def fetch_fasta_from_genome(genome_ref, ws_url, callback_url):
    """
    Returns an assembly or contigset as FASTA.
    """
    if not check_ref_type(genome_ref, ['KBaseGenomes.Genome'], ws_url):
        raise ValueError(
            f"The given genome_ref {genome_ref} is not a KBaseGenomes.Genome type!")
    # test if genome references an assembly type
    # do get_objects2 without data. get list of refs
    ws = Workspace(ws_url)
    genome_obj_info = ws.get_objects2({
        'objects': [{
            'ref': genome_ref
        }],
        'no_data': 1
    })
    # get the list of genome refs from the returned info.
    # if there are no refs (or something funky with the return), this will be an empty list.
    # this WILL fail if data is an empty list. But it shouldn't be, and we know because
    # we have a real genome reference, or get_objects2 would fail.
    genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

    # see which of those are of an appropriate type (ContigSet or Assembly), if any.
    assembly_ref = list()
    ref_params = [{'ref': genome_ref + ";" + x} for x in genome_obj_refs]
    ref_info = ws.get_object_info3({'objects': ref_params})
    for idx, info in enumerate(ref_info.get('infos')):
        if "KBaseGenomeAnnotations.Assembly" in info[
                2] or "KBaseGenomes.ContigSet" in info[2]:
            assembly_ref.append(";".join(ref_info.get('paths')[idx]))

    if len(assembly_ref) == 1:
        return fetch_fasta_from_assembly(assembly_ref[0], ws_url, callback_url)
    else:
        raise ValueError(
            f"Expected exactly one assembly associated with the given genome ref "
            f"{genome_ref}, found {len(assembly_ref)}. Unable to continue.")
Example #12

def _attributemapping_index(ws_url, upa, parent_upa):
    """Build a search index document for an AttributeMapping object."""
    ws = Workspace(ws_url)
    obj = ws.get_objects2({'objects': [{
        'ref': parent_upa + ";" + upa
    }]})['data'][0]
    data = obj['data']
    doc = {
        "attributes": [],
        "attribute_ontology_ids": [],
        "attribute_units": [],
        "attribute_unit_ontology_ids": [],
        "attribute_values": [],
        "attribute_value_ontology_ids": [],
        "instances": data['instances'],
        "num_attributes": len(data['attributes']),
        "num_instances": len(data['instances']),
    }
    for attr in data['attributes']:
        doc['attributes'].append(attr['attribute'])
        if 'attribute_ont_id' in attr:
            doc['attribute_ontology_ids'].append(attr['attribute_ont_id'])
        if 'unit' in attr:
            doc['attribute_units'].append(attr['unit'])
        if 'unit_ont_id' in attr:  # the original re-checked 'attribute_ont_id' here, an apparent copy-paste slip
            doc['attribute_unit_ontology_ids'].append(attr['unit_ont_id'])
        if 'categories' in attr:
            doc['attribute_values'].extend(attr['categories'].keys())
            # 'categories' maps value -> category record, so iterate the values
            doc['attribute_value_ontology_ids'].extend(
                x['attribute_ont_id'] for x in attr['categories'].values()
                if 'attribute_ont_id' in x)

    return {
        'doc': doc,
        'sub_id': upa_delimeter.join(upa.split('/')),
        'sub_type': "atrrmapping"
    }
Example #13
class WorkspaceAdminUtils:
    def __init__(self, config):
        wsurl = config.get('workspace-url')
        self.atoken = config.get('workspace-admin-token')
        self.noadmin = False
        if self.atoken is None or self.atoken == '':
            self.noadmin = True
            self.atoken = os.environ.get('KB_AUTH_TOKEN', None)
        self.ws = Workspace(wsurl, token=self.atoken)

    def list_objects(self, params):
        """
        Provide something that acts like a standard listObjects
        """
        if self.noadmin:
            return self.ws.list_objects(params)
        return self.ws.administer({'command': 'listObjects', 'params': params})

    def get_objects2(self, params):
        """
        Provide something that acts like a standard getObjects
        """
        if self.noadmin:
            return self.ws.get_objects2(params)
        return self.ws.administer({'command': 'getObjects', 'params': params})

    def get_workspace_info(self, params):
        """
        Provide something that acts like a standard getObjects
        """
        if self.noadmin:
            return self.ws.get_workspace_info(params)
        return self.ws.administer({
            'command': 'getWorkspaceInfo',
            'params': params
        })
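
A short sketch of the wrapper's fallback behavior: without an admin token every call goes through the plain Workspace API, otherwise it is routed through administer(). Config values are placeholders.

wsadmin = WorkspaceAdminUtils({"workspace-url": "https://kbase.us/services/ws",
                               "workspace-admin-token": None})  # no admin token -> user-mode calls
objs = wsadmin.list_objects({"ids": [12345]})  # same params as Workspace.list_objects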
Example #14

    def run_generate_metadata_report(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_generate_metadata_report
        object_type = params['object_type']
        workspace_name = params['workspace_name']

        ws = Workspace(self.ws_url)
        print(params)

        objects_in_workspace = ws.list_objects({
            'workspaces': [workspace_name],
            'type': object_type
        })
        object_names = sorted([j[1] for j in objects_in_workspace])

        d = dict()

        if object_type == 'KBaseRNASeq.RNASeqAlignment':
            for object_name in object_names:
                alignment_stats = ws.get_objects2({
                    'objects': [{
                        'workspace': workspace_name,
                        'name': object_name
                    }]
                })['data'][0]['data']['alignment_stats']
                metadata_keys = alignment_stats.keys()
                object_pd = pd.Series(alignment_stats, index=metadata_keys)
                d[object_name] = object_pd

        else:
            for object_name in object_names:
                obj_meta_data = ws.get_object_info3({
                    'objects': [{
                        'workspace': workspace_name,
                        'name': object_name
                    }],
                    'includeMetadata': 1
                })
                metadata = obj_meta_data.get('infos')[0][10]
                metadata_keys = metadata.keys()
                object_pd = pd.Series(metadata, index=metadata_keys)
                d[object_name] = object_pd

        df = pd.DataFrame(d)

        htmlDir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        self._mkdir_p(htmlDir)
        report_file_path = os.path.join(htmlDir, "index.html")
        #df.to_html(report_file_path)
        self.write_pd_html(df.T, report_file_path)

        try:
            html_upload_ret = self.dfu.file_to_shock({
                'file_path': htmlDir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception:
            raise ValueError('Error uploading HTML file: ' + str(htmlDir) +
                             ' to shock')

        reportname = 'generate_metadata_report_' + str(uuid.uuid4())

        reportobj = {
            'message': '',
            'direct_html': None,
            'direct_html_link_index': 0,
            'file_links': [],
            'html_links': [],
            'html_window_height': 500,
            'workspace_name': params['workspace_name'],
            'report_object_name': reportname
        }

        # attach to report obj
        reportobj['direct_html'] = ''
        reportobj['direct_html_link_index'] = 0
        reportobj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            'name': 'index.html',
            'label': 'index.html'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        report_info = report.create_extended_report(reportobj)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        print(output)

        #END run_generate_metadata_report

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method run_generate_metadata_report return value ' +
                'output is not type dict as required.')
        # return the results
        return [output]
Example #15
class dN_dS_ratio:
    '''
    Module Name:
    dN_dS_ratio

    Module Description:
    A KBase module: dN_dS_ratio
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = ""
    GIT_COMMIT_HASH = ""

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.du = DownloadUtils(self.callback_url)
        self.pu = DnDs_Utils()
        self.dpu = Data_Process_Utils()
        self.hu = htmlreportutils()
        self.config = config
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def run_dN_dS_ratio(self, ctx, params):
        '''
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        '''
        # ctx is the context object
        # return variables are: output
        #BEGIN run_dN_dS_ratio
        print(params)
        self.dpu.validate_params(params)

        workspace = params['workspace_name']
        output_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(output_dir)
        self.ws_url = self.config['workspace-url']
        self.ws = Workspace(url=self.ws_url, token=ctx['token'])

        variation_ref = params['variation_ref']
        variation = self.du.get_variation(variation_ref)
        #self.du.tabix_index(variation)
        variation_obj = self.ws.get_objects2(
            {'objects': [{
                'ref': variation_ref
            }]})['data'][0]

        data = self.ws.get_objects2({
            'objects': [{
                "ref": variation_ref,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']
        sample_set_ref = data['sample_set_ref']

        assembly_ref = variation_obj['data']['assembly_ref']
        assembly_path = self.du.get_assembly(assembly_ref, output_dir)
        gff_ref = params['genome_ref']
        gff_path = self.du.get_gff(gff_ref)
        gene_id = params['gene_id']
        gff_subsample_path = os.path.join(output_dir, "sub_sample.gff")
        self.dpu.filter_gff(gene_id, gff_path, gff_subsample_path)

        with open(gff_subsample_path, 'r') as f:
            line = f.readline()
            rec = line.split("\t")
            chrom = rec[0]
            start = rec[3]
            end = rec[4]

        sub_sample_vcf = os.path.join(output_dir, "sub_sample.vcf")
        self.dpu.index_vcf_file(variation)
        self.dpu.tabix_query(variation, chrom, start, end, sub_sample_vcf)

        assembly_path = output_dir + '/ref_genome.fa'
        variation = output_dir + '/sub_sample.vcf'
        gff_path = output_dir + '/sub_sample.gff'

        sequence = self.pu.read_refseq(assembly_path)
        print(sequence)
        var_list = self.pu.read_vcf(variation, sequence)
        print(var_list)
        var_file = os.path.join(output_dir, "variant_info.tsv")

        with open(var_file, 'w') as variant_tmp_file:
            var_temp = csv.writer(variant_tmp_file, delimiter='\t')
            var_temp.writerow([
                "#chr", "ref", "alt", "pos", "codon number", "pos in codon",
                "codon start", "codon", "mutation type", "coverage"
            ])
            for var_gene_list in var_list:
                var_temp.writerow(var_gene_list)

        gff_data = self.pu.read_gff_file(gff_path)
        codon_list = self.pu.get_triplets(sequence, gff_data)
        codon_result_file = os.path.join(output_dir, "codon_results_temp.tsv")
        corrected_codon_result_file = os.path.join(
            output_dir, "corrected_variant_info.tsv")

        with open(codon_result_file, 'w') as cdr_tmp_file:
            cdr_temp = csv.writer(cdr_tmp_file, delimiter='\t')
            cdr_temp.writerow([
                "#chr", "gene", "codon", "codon start", "codon end",
                "codon positions", "codon number", "N", "S"
            ])
            for gene_codon_list in codon_list:
                for codon in gene_codon_list:
                    cdr_temp.writerow(codon)

        merged_list = self.pu.merge_files(corrected_codon_result_file,
                                          codon_result_file, var_file)
        all_possible_codon = self.pu.get_all_possible_codon(
            merged_list)  # generating all possible codon
        self.pu.generate_statistics(corrected_codon_result_file,
                                    codon_result_file, all_possible_codon,
                                    output_dir)

        ############# html reporting #############
        workspace = params['workspace_name']
        output = self.hu.create_html_report(self.callback_url, output_dir,
                                            workspace)

        #END run_dN_dS_ratio

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_dN_dS_ratio return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Example #16
class BwaIndexBuilder:
    def __init__(self, scratch_dir, ws_url, callback_url, service_wizard_url, provenance):
        self.scratch_dir = scratch_dir
        self.ws_url = ws_url
        self.ws = Workspace(self.ws_url)
        self.callback_url = callback_url
        self.service_wizard_url = service_wizard_url
        self.bwa = BwaRunner(self.scratch_dir)
        self.provenance = provenance

    def get_index(self, params):
        ''' The key function of this module- get a bwa index for the specified input '''

        # validate the parameters and fetch assembly_info
        validated_params = self._validate_params(params)
        assembly_info = self._get_assembly_info(validated_params['ref'])

        # check the cache (keyed off of assembly_info)
        index_info = self._get_cached_index(assembly_info, validated_params)
        if index_info:
            index_info['from_cache'] = 1
            index_info['pushed_to_cache'] = 0
        else:
            # on a cache miss, build the index
            index_info = self._build_index(assembly_info, validated_params)
            index_info['from_cache'] = 0
            # pushed_to_cache will be set in return from _build_index

        index_info['assembly_ref'] = assembly_info['ref']
        index_info['genome_ref'] = assembly_info['genome_ref']

        return index_info

    def _validate_params(self, params):
        ''' validate parameters; can do some processing here to produce validated params '''
        # params['ref'] = params['assembly_or_genome_ref']
        validated_params = {'ref': None}
        if 'ref' in params and params['ref']:
            validated_params['ref'] = params['ref']
        else:
            raise ValueError('"ref" field indicating either an assembly or genome is required.')

        if 'output_dir' in params:
            validated_params['output_dir'] = params['output_dir']
        else:
            validated_params['output_dir'] = os.path.join(self.scratch_dir,
                                                          'bwa_index_' + str(int(time.time() * 100)))

        if os.path.exists(validated_params['output_dir']):
            raise ValueError('Output directory name specified (' + validated_params['output_dir'] +
                             ') already exists. Will not overwrite, so aborting.')

        if 'ws_for_cache' in params and params['ws_for_cache']:
            validated_params['ws_for_cache'] = params['ws_for_cache']
        else:
            print('WARNING: bwa index if created will not be cached because "ws_for_cache" field not set')
            validated_params['ws_for_cache'] = None

        return validated_params

    def _get_assembly_info(self, ref):
        ''' given a ref to an assembly or genome, figure out the assembly and return its info '''
        info = self.ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][0]
        obj_type = info[2]
        if obj_type.startswith('KBaseGenomeAnnotations.Assembly') or obj_type.startswith('KBaseGenomes.ContigSet'):
            return {'info': info, 'ref': ref, 'genome_ref': None}

        if obj_type.startswith('KBaseGenomes.Genome'):
            # we need to get the assembly for this genome
            ga = GenomeAnnotationAPI(self.service_wizard_url)
            assembly_ref = ga.get_assembly({'ref': ref})
            # using the path ensures we can access the assembly even if we don't have direct access
            ref_path = ref + ';' + assembly_ref
            info = self.ws.get_object_info3({'objects': [{'ref': ref_path}]})['infos'][0]
            return {'info': info, 'ref': ref_path, 'genome_ref': ref}

        raise ValueError('Input object was not of type: Assembly, ContigSet or Genome.  Cannot get bwa Index.')

    def _get_cached_index(self, assembly_info, validated_params):

        try:
            # note: list_reference_objects does not yet support reference paths, so we need to call
            # with the direct reference.  So we won't get a cache hit if you don't have direct access
            # to the assembly object right now (although you can still always build the assembly object)
            # Once this call supports paths, this should be changed to set ref = assembly_info['ref']
            info = assembly_info['info']
            ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
            objs = self.ws.list_referencing_objects([{'ref': ref}])[0]

            # iterate through each of the objects that reference the assembly
            bwa_indexes = []
            for o in objs:
                if o[2].startswith('KBaseRNASeq.Bowtie2IndexV2'):
                    bwa_indexes.append(o)

            # Nothing refs this assembly, so cache miss
            if len(bwa_indexes) == 0:
                return False

            # if there is more than one hit, get the most recent one
            # (obj_info[3] is the save_date timestamp (eg 2017-05-30T22:56:49+0000), so we can sort on that)
            bwa_indexes.sort(key=lambda x: x[3])
            bwa_index_info = bwa_indexes[-1]
            index_ref = str(bwa_index_info[6]) + '/' + str(bwa_index_info[0]) + '/' + str(bwa_index_info[4])

            # get the object data
            index_obj_data = self.ws.get_objects2({'objects': [{'ref': index_ref}]})['data'][0]['data']

            # download the handle object
            os.makedirs(validated_params['output_dir'])

            dfu = DataFileUtil(self.callback_url)
            dfu.shock_to_file({'file_path': os.path.join(validated_params['output_dir'], 'bt2_index.tar.gz'),
                               'handle_id': index_obj_data['handle']['hid'],
                               'unpack': 'unpack'})
            print('Cache hit: ')
            pprint(index_obj_data)
            return {'output_dir': validated_params['output_dir'],
                    'index_files_basename': index_obj_data['index_files_basename']}

        except Exception:
            # if we fail in saving the cached object, don't worry
            print('WARNING: exception encountered when trying to lookup in cache:')
            print(traceback.format_exc())
            print('END WARNING: exception encountered when trying to lookup in cache.')

        return None

    def _put_cached_index(self, assembly_info, index_files_basename, output_dir, ws_for_cache):

        if not ws_for_cache:
            print('WARNING: bwa index cannot be cached because "ws_for_cache" field not set')
            return False

        try:
            dfu = DataFileUtil(self.callback_url)
            result = dfu.file_to_shock({'file_path': output_dir,
                                        'make_handle': 1,
                                        'pack': 'targz'})

            bwa_index = {'handle': result['handle'], 'size': result['size'],
                         'assembly_ref': assembly_info['ref'],
                         'index_files_basename': index_files_basename}

            ws = Workspace(self.ws_url)
            save_params = {'objects': [{'hidden': 1,
                                        'provenance': self.provenance,
                                        'name': os.path.basename(output_dir),
                                        'data': bwa_index,
                                        'type': 'KBaseRNASeq.Bowtie2IndexV2'
                                        }]
                           }
            if ws_for_cache.strip().isdigit():
                save_params['id'] = int(ws_for_cache)
            else:
                save_params['workspace'] = ws_for_cache.strip()
            save_result = ws.save_objects(save_params)
            print('Bowtie2IndexV2 cached to: ')
            pprint(save_result[0])
            return True

        except Exception:
            # if we fail in saving the cached object, don't worry
            print('WARNING: exception encountered when trying to cache the index files:')
            print(traceback.format_exc())
            print('END WARNING: exception encountered when trying to cache the index files')

        return False

    def _build_index(self, assembly_info, validated_params):
        # get the assembly as a fasta file using AssemblyUtil
        au = AssemblyUtil(self.callback_url)
        fasta_info = au.get_assembly_as_fasta({'ref': assembly_info['ref']})

        # make the target destination folder (check again it wasn't created yet)
        if os.path.exists(validated_params['output_dir']):
            raise ValueError('Output directory name specified (' + validated_params['output_dir'] +
                             ') already exists. Will not overwrite, so aborting.')
        os.makedirs(validated_params['output_dir'])

        # configure the command line args and run it
        cli_params = self._build_cli_params(fasta_info['path'], fasta_info['assembly_name'], validated_params)
        self.bwa.run('index', cli_params)
        for file in glob.glob(r'/kb/module/work/tmp/' + fasta_info['assembly_name'] + '.*'):
            print(file)
            shutil.copy(file, validated_params['output_dir'])

        index_info = {'output_dir': validated_params['output_dir'],
                      'index_files_basename': fasta_info['assembly_name']}

        # cache the result, mark if it worked or not
        cache_success = self._put_cached_index(assembly_info,
                                               fasta_info['assembly_name'],
                                               validated_params['output_dir'],
                                               validated_params['ws_for_cache'])
        if cache_success:
            index_info['pushed_to_cache'] = 1
        else:
            index_info['pushed_to_cache'] = 0

        return index_info

    def _build_cli_params(self, fasta_file_path, index_files_basename, validated_params):
        cli_params = []

        # positional arg: the fasta path; "-p" sets the base name for the index files
        cli_params.append(fasta_file_path)
        cli_params.append("-p")
        cli_params.append(index_files_basename)

        return cli_params
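
Sketch of driving BwaIndexBuilder end to end; every constructor argument below stands in for a value an SDK app would normally pull from its context.

builder = BwaIndexBuilder(
    scratch_dir="/kb/module/work/tmp",
    ws_url="https://kbase.us/services/ws",
    callback_url=os.environ.get("SDK_CALLBACK_URL", ""),
    service_wizard_url="https://kbase.us/services/service_wizard",
    provenance=[],  # provenance list normally taken from the method context
)
index_info = builder.get_index({"ref": "12345/6/7", "ws_for_cache": "12345"})
print(index_info["from_cache"], index_info["output_dir"])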
Example #17
class VirSorterUtils:
    def __init__(self, config):
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.mgu = MetagenomeUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.ws = Workspace(config['workspace-url'], token=config['token'])

    def VirSorter_help(self):
        command = 'wrapper_phage_contigs_sorter_iPlant.pl --help'
        self._run_command(command)

    def get_fasta(self, ref):
        # check type of object, i.e KBaseGenomeAnnotations.Assembly-3.0
        obj_type = self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0][2]
        if 'assembly' in obj_type.lower():
            genome_ref = ref
        elif 'kbasegenomes' in obj_type.lower():
            data = self.ws.get_objects2({
                'objects': [{
                    'ref': ref,
                    'included': ['assembly_ref'],
                    'strict_maps': 1
                }]
            })['data'][0]['data']
            genome_ref = data['assembly_ref']
        else:
            raise ValueError(
                f"Input reference {ref} is of type {obj_type}. Type KBaseGenomes.Genome or "
                f"KBaseGenomeAnnotations.Assembly required.")
        return self.au.get_assembly_as_fasta({'ref': genome_ref})['path']

    def run_VirSorter(self, params):

        params['SDK_CALLBACK_URL'] = self.callback_url
        params['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']

        # Get contigs from 'assembly'
        genome_fp = self.get_fasta(params['genomes'])

        command = 'wrapper_phage_contigs_sorter_iPlant.pl --data-dir /data/virsorter-data'

        # Add in first args
        command += f' -f {genome_fp} --db {params["database"]}'

        # Check if additional genomes were submitted
        if params.get('add_genomes'):
            add_genomes_fp = self.get_fasta(params['add_genomes'])
            print(f'Added genomes DETECTED: {add_genomes_fp}')
            command += f' --cp {add_genomes_fp}'

        bool_args = ['virome', 'diamond', 'keep_db', 'no_c']  # keep_db maps to --keep-db

        for bool_arg in bool_args:
            # these flags arrive serialized as integers; 1 means the flag is enabled
            if params[bool_arg] == 1:
                if bool_arg == 'keep_db':
                    bool_arg = 'keep-db'

                command += f' --{bool_arg}'

        self._run_command(command)

        report = self._generate_report(
            params)  # Basically, do everything that's after the tool runs

        return report

    def _run_command(self, command):
        """
        Run a shell command, logging its output and raising RuntimeError on a
        non-zero exit code.

        :param command: the shell command string to execute
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, err = pipe.communicate()
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nError: {}'.format(
                exitCode, output, err)
            raise RuntimeError(error_msg)

    def _parse_summary(self, virsorter_global_fp, affi_contigs_shock_id):
        columns = [
            'Contig_id',
            'Nb genes contigs',
            'Fragment',
            'Nb genes',
            'Category',
            'Nb phage hallmark genes',
            'Phage gene enrichment sig',
            'Non-Caudovirales phage gene enrichment sig',
            'Pfam depletion sig',
            'Uncharacterized enrichment sig',
            'Strand switch depletion sig',
            'Short genes enrichment sig',
        ]

        try:
            with open(virsorter_global_fp, 'r') as vir_fh:
                data = {}
                category = ''
                for line in vir_fh:
                    if line.startswith('## Contig_id'):
                        continue
                    elif line.startswith('## '):
                        # header lines were consumed by the first branch, so the
                        # remaining '## ' lines are category markers
                        category = line.split('## ')[-1].split(' -')[0]
                    else:
                        values = line.strip().split(',')
                        data[values[0]] = dict(zip(columns[1:], values[1:]))
        except Exception:
            vir_path = os.path.join(os.getcwd(), 'virsorter-out')
            files = os.listdir(vir_path)
            raise RuntimeError(
                f"{virsorter_global_fp} is not a file. existing files {files}."
            )

        df = pd.DataFrame().from_dict(data, orient='index')
        df.index.name = columns[0]
        df.reset_index(inplace=True)

        html = df.to_html(index=False,
                          classes='my_class table-striped" id = "my_id')

        # Need to file write below
        direct_html = html_template.substitute(
            html_table=html, affi_contigs_shock_id=affi_contigs_shock_id)

        # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer
        start_header = Literal("<thead>")
        end_header = Literal("</thead>")

        text = start_header + SkipTo(end_header)

        new_text = ''
        for data, start_pos, end_pos in text.scanString(direct_html):
            new_text = ''.join(data).replace(
                ' style="text-align: right;"', '').replace(
                    'thead>', 'tfoot>\n  ') + '\n</tfoot>'

        # Get start and end positions to insert new text
        end_tbody = Literal("</tbody>")
        end_table = Literal("</table>")

        insertion_pos = end_tbody + SkipTo(end_table)

        final_html = ''
        for data, start_pos, end_pos in insertion_pos.scanString(direct_html):
            final_html = (direct_html[:start_pos + 8] + '\n' + new_text +
                          direct_html[start_pos + 8:])

        return final_html

    def get_assembly_contig_ids(self, assembly_ref):
        """get contig ids from assembly_ref"""
        contigs = self.ws.get_objects2(
            {'objects': [{
                'ref': assembly_ref,
                'included': ['contigs']
            }]})['data'][0]['data']['contigs']
        return contigs.keys()

    def _generate_report(self, params):
        """
        Collect VirSorter outputs and build the KBase report artifacts.

        :param params: the original run parameters (workspace_name, genomes, ...)
        """

        # DataFileUtil client (imported here under the alias `dfu`)
        self.dfu = dfu(params['SDK_CALLBACK_URL'])

        # Output directory should be $PWD/virsorter-out - ASSUMES that's the output location
        virsorter_outdir = os.path.join(os.getcwd(), 'virsorter-out')

        print(
            f'VIRSorter output directory contents: {os.listdir(virsorter_outdir)}'
        )

        # Replacing individual download files with BinnedContigs

        # kb_deseq adds output files, then builds report files and sends all of them to the workspace
        output_files = []  # Appended list of dicts containing attributes

        # Collect all the files needed to report to end-user
        # Get all predicted viral sequences
        pred_fnas = glob.glob(
            os.path.join(virsorter_outdir,
                         'Predicted_viral_sequences/VIRSorter_*.fasta'))
        pred_gbs = glob.glob(
            os.path.join(virsorter_outdir,
                         'Predicted_viral_sequences/VIRSorter_*.gb'))
        # Summary 'table'
        glob_signal = os.path.join(virsorter_outdir,
                                   'VIRSorter_global-phage-signal.csv')

        print('Identified the following predicted viral sequences:\n{}'.format(
            '\n\t'.join(pred_fnas)))

        if len(pred_fnas) == 0:
            print(
                f"Unable to find predicted viral sequences, here are the directory's content:\n"
                f"{os.listdir(os.path.join(virsorter_outdir, 'Predicted_viral_sequences'))}"
            )

        if os.path.exists(glob_signal):

            print(f'Identified the global phage signal: {glob_signal}')

            lines = -1  # Don't count header
            with open(glob_signal) as fh:
                for ln in fh:
                    lines += 1

            if lines == 0:
                print('But it is EMPTY!')

        else:
            print(
                'Unable to find the global phage signal file. Was there an error during the run?'
            )

        # Append error and out files from VIRSorter
        err_fp = os.path.join(virsorter_outdir, 'logs/err')
        # if os.path.exists(err_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/err'),
        #         'name': 'VIRSorter_err',
        #         'label': 'VIRSorter_err',
        #         'description': 'VIRSorter error log file, generated from the tool itself.'
        #     })
        out_fp = os.path.join(virsorter_outdir, 'logs/out')
        # if os.path.exists(out_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/out'),
        #         'name': 'VIRSorter_out',
        #         'label': 'VIRSorter_out',
        #         'description': 'VIRSorter output log file, generated from the tool itself.'
        #     })

        if not (os.path.exists(err_fp) or os.path.exists(out_fp)):
            print(
                'Unable to find err and/or out files in LOG directory, contents:'
            )
            print(os.listdir(os.path.join(virsorter_outdir, 'logs')))

        # Make output directory
        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_dir)

        # Deal with nucleotide and protein fasta
        pred_fna_tgz_fp = os.path.join(output_dir,
                                       'VIRSorter_predicted_viral_fna.tar.gz')
        # Compress to minimize disk usage
        with tarfile.open(pred_fna_tgz_fp, 'w:gz') as pred_fna_tgz_fh:
            for pred_fna in pred_fnas:
                pred_fna_tgz_fh.add(pred_fna,
                                    arcname=os.path.basename(pred_fna))
        output_files.append({
            'path': pred_fna_tgz_fp,
            'name': os.path.basename(pred_fna_tgz_fp),
            'label': os.path.basename(pred_fna_tgz_fp),
            'description': 'FASTA-formatted nucleotide sequences of VIRSorter predicted viruses'
        })

        if os.path.exists(pred_fna_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in FASTA format: '
                f'{pred_fna_tgz_fp}')

        pred_gb_tgz_fp = os.path.join(output_dir,
                                      'VIRSorter_predicted_viral_gb.tar.gz')
        with tarfile.open(pred_gb_tgz_fp, 'w:gz') as pred_gb_tgz_fh:
            for pred_gb in pred_gbs:
                pred_gb_tgz_fh.add(pred_gb, arcname=os.path.basename(pred_gb))
        output_files.append({
            'path': pred_gb_tgz_fp,
            'name': os.path.basename(pred_gb_tgz_fp),
            'label': os.path.basename(pred_gb_tgz_fp),
            'description': 'Genbank-formatted sequences of VIRSorter predicted viruses'
        })

        if os.path.exists(pred_gb_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in Genbank format: '
                f'{pred_gb_tgz_fp}')

        # To create BinnedContig, need to create another directory with each of the "bins" as separate files?
        binned_contig_output_dir = os.path.join(self.scratch,
                                                str(uuid.uuid4()))
        self._mkdir_p(binned_contig_output_dir)

        # Before creating final HTML output, need to create BinnedContig object so other tools/users can take advantage
        # of its features, but also to feed more easily into other tools (e.g. vConTACT)
        created_objects = []  # Will store the objects that go to the workspace

        # load contig ids from the assembly input
        # assembly_contig_ids = self.get_assembly_contig_ids(self.assembly_ref)
        assembly_contig_ids = self.get_assembly_contig_ids(
            params['genomes'])  # Will fail for Genome

        summary_fp = os.path.join(
            binned_contig_output_dir,
            'VIRSorter.summary')  # Anything that ends in .summary
        with open(summary_fp, 'w') as summary_fh:

            summary_writer = csv.writer(summary_fh,
                                        delimiter='\t',
                                        quoting=csv.QUOTE_MINIMAL)
            summary_writer.writerow(
                ['Bin name', 'Completeness', 'Genome size', 'GC content'])

            for category_fp in pred_fnas:
                # _get_bin_ids from MetaGenomeUtils requires files to follow the header.0xx.fasta convention
                category = os.path.basename(category_fp).split(
                    'cat-')[-1].split('.')[0]
                dest_fn = 'VirSorter.{}.fasta'.format(category.zfill(3))
                dest_fp = os.path.join(output_dir, dest_fn)
                binned_contig_fp = os.path.join(binned_contig_output_dir,
                                                dest_fn)

                genome_size = 0
                gc_content = []

                # Need stats for summary file
                # Also need to adjust sequence name so binnedContig object can retrieve sequences
                adjusted_sequences = []
                with open(category_fp, 'r') as category_fh:
                    for record in SeqIO.parse(category_fh, 'fasta'):
                        seq = record.seq
                        gc_content.append(SeqUtils.GC(seq))
                        genome_size += len(seq)

                        # This is very dirty, but we need to rename back to match
                        # the original contig ids
                        record.id = (record.id.replace('VIRSorter_', '')
                                     .replace('-circular', '')
                                     .split('-cat_')[0])
                        if 'gene' in record.id:  # Prophage
                            record.id = record.id.split('_gene')[0]
                        record.id = record.id.rsplit('_', 1)[0]

                        # here we make sure that the id's line up with contig ids in the input assembly object
                        if record.id not in assembly_contig_ids:
                            for assembly_contig_id in assembly_contig_ids:
                                # first check if record.id is substring of current contig id,
                                # then check if current contig id is substring of record.id
                                # NOTE: this is not a perfect way of checking and will likely
                                #       fail in some circumstances.
                                #       A more complete check would be to make sure there is a 1:1
                                #       mapping of contig id's in the assembly object as compared to
                                #       the binned contig object (the fasta files defined here).
                                if (record.id in assembly_contig_id) or (
                                        assembly_contig_id in record.id):
                                    record.id = assembly_contig_id
                                    break

                        record.description = ''
                        record.name = ''
                        adjusted_sequences.append(record)

                if genome_size != 0:  # Skip empty files

                    summary_writer.writerow([
                        dest_fn, '100%', genome_size,
                        (sum(gc_content) / len(gc_content))
                    ])

                    print('Copying {} to results directory'.format(
                        os.path.basename(category_fp)))
                    # Yes, need both. One is to get file_links in report. Second is for binnedContigs object
                    shutil.copyfile(category_fp, dest_fp)

                    # Write renamed sequences
                    with open(binned_contig_fp, 'w') as binned_contig_fh:
                        SeqIO.write(adjusted_sequences, binned_contig_fh,
                                    'fasta')

                    result = self.au.save_assembly_from_fasta({
                        'file': {
                            'path': dest_fp
                        },
                        'workspace_name':
                        params['workspace_name'],
                        'assembly_name':
                        'VirSorter-Category-{}'.format(category)
                    })

                    created_objects.append({
                        "ref":
                        result,
                        "description":
                        "KBase Assembly object from VIRSorter"
                    })

        # Create BinnedContigs object, but first assemble its parameters
        generate_binned_contig_param = {
            'file_directory': binned_contig_output_dir,
            'assembly_ref':
            params['genomes'],  # params.get('genomes'), self.assembly_ref
            'binned_contig_name': params['binned_contig_name'],
            'workspace_name': params['workspace_name']
        }
        binned_contig_object_ref = self.mgu.file_to_binned_contigs(
            generate_binned_contig_param).get('binned_contig_obj_ref')

        # Add binned contigs reference here, as it was already created above
        created_objects.append({
            "ref": binned_contig_object_ref,
            "description": "BinnedContigs from VIRSorter"
        })

        # Save VIRSorter_affi-contigs.tab for DRAM-v
        affi_contigs_fp = os.path.join(virsorter_outdir, 'Metric_files',
                                       'VIRSorter_affi-contigs.tab')
        affi_contigs_shock_id = self.dfu.file_to_shock(
            {'file_path': affi_contigs_fp})['shock_id']

        # Use global signal (i.e. summary) file and create HTML-formatted version
        raw_html = self._parse_summary(glob_signal, affi_contigs_shock_id)

        html_fp = os.path.join(output_dir, 'index.html')

        with open(html_fp, 'w') as html_fh:
            html_fh.write(raw_html)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_report = [{
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(html_fp),
            'label':
            os.path.basename(html_fp),
            'description':
            'HTML summary report for VIRSorter-predicted viral genomes.'
        }]

        report_params = {
            'message':
            'Here are the results from your VIRSorter run. Above, you\'ll find a report with '
            'all the identified (putative) viral genomes, and below, links to the report as '
            'well as files generated.',
            'workspace_name':
            params['workspace_name'],
            'html_links':
            html_report,
            'direct_html_link_index':
            0,
            'report_object_name':
            'VIRSorter_report_{}'.format(str(uuid.uuid4())),
            'file_links':
            output_files,
            'objects_created':
            created_objects,
        }

        kbase_report_client = KBaseReport(params['SDK_CALLBACK_URL'],
                                          token=params['KB_AUTH_TOKEN'])
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref'],
            'result_directory': binned_contig_output_dir,
            'binned_contig_obj_ref': binned_contig_object_ref
        }

        return report_output

    def _mkdir_p(self, path):
        """
        Create the directory at path (including parents), tolerating the
        case where it already exists.

        :param path: directory path to create; no-op if falsy
        :return: None
        """

        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
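
A note on `_mkdir_p`: it reimplements `mkdir -p` with an errno check, the pre-Python-3.2 idiom. A minimal sketch of the equivalent using only the standard library's `exist_ok` flag (hypothetical helper name):

import os

def mkdir_p(path: str) -> None:
    """Create a directory and any missing parents; no-op if it already exists."""
    if not path:
        return
    # exist_ok=True replaces the errno.EEXIST / isdir check above
    os.makedirs(path, exist_ok=True)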
Example #18
class VariationMerge:
    '''
    Module Name:
    VariationMerge

    Module Description:
    A KBase module: VariationMerge
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/VariationMerge.git"
    GIT_COMMIT_HASH = "918495236305bcae5e2ded0be6ed18d71defd678"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.ws_url = config['workspace-url']

        self.vu = VariationUtil(self.callback_url)
        self.mu = MergeVcfUtils()
        #END_CONSTRUCTOR
        pass

    def run_VariationMerge(self, ctx, params):
        """
        :param params: instance of type "inparams" (This example function
           accepts any number of parameters and returns results in a
           KBaseReport) -> structure: parameter "obj_name" of String,
           parameter "workspace_name" of String, parameter "vcflist" of list
           of String
        :returns: instance of type "OutResults" -> structure: parameter
           "output_obj_ref" of String, parameter "report_name" of String,
           parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_VariationMerge

        self.ws = Workspace(url=self.ws_url, token=ctx['token'])

        print(params)

        vcf_flist = []
        assembly_ref_set = set()
        sampleset_ref_set = set()
        genome_set_ref_set = set()
        for i in range(len(params['vcflist'])):
            variation_ref = params['vcflist'][i]

            variation_obj = self.ws.get_objects2(
                {'objects': [{
                    'ref': variation_ref
                }]})['data'][0]
            if 'assembly_ref' in variation_obj['data']:
                assembly_ref = variation_obj['data']['assembly_ref']
                print(assembly_ref)
                assembly_ref_set.add(assembly_ref)
            elif 'genome_ref' in variation_obj['data']:
                genome_ref = variation_obj['data']['genome_ref']
                print(genome_ref)
                genome_set_ref_set.add(genome_ref)

            print(params['vcflist'][i])
            vcf_filename = "/kb/module/work/tmp/variation" + str(i) + ".vcf.gz"
            vcf_flist.append(vcf_filename)

            inparams = {}
            inparams['variation_ref'] = variation_ref
            inparams['filename'] = vcf_filename

            self.vu.get_variation_as_vcf(inparams)
            os.rename("/kb/module/work/tmp/variation.vcf.gz", vcf_filename)
            self.mu.index_vcf(vcf_filename)
            var_object_ref = params['vcflist'][i]
            data = self.ws.get_objects2({
                'objects': [{
                    "ref": var_object_ref,
                    'included': ['/sample_set_ref']
                }]
            })['data'][0]['data']
            sampleset_ref_set.add(data['sample_set_ref'])

        # Validate that all inputs share one assembly/genome ref and one sample set ref

        if (len(genome_set_ref_set) == 0 and len(assembly_ref_set) != 1):
            raise Exception(
                "variation objects are from different assembly refs")
        elif (len(sampleset_ref_set) != 1):
            raise Exception(
                "variation objects are from different sample set refs")
        elif (len(assembly_ref_set) == 0 and len(genome_set_ref_set) != 1):
            raise Exception(
                "variation objects are from different genome set refs")

        merged_file = os.path.join(self.shared_folder,
                                   "merged_gatk_variation_jmc2_test.vcf")
        self.mu.merge_vcf(vcf_flist, merged_file)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            # use whichever reference type the input variation objects carried
            'genome_or_assembly_ref':
            assembly_ref_set.pop() if assembly_ref_set else genome_set_ref_set.pop(),
            'sample_set_ref': sampleset_ref_set.pop(),
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': merged_file,
            'variation_object_name': params['variation_object_name']
        }
        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_VariationMerge

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_VariationMerge return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
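
The validation in `run_VariationMerge` above collects refs into sets and requires each to be a singleton. A self-contained sketch of that consistency check, with a hypothetical helper name and the same field names as the Workspace objects fetched above:

from typing import Dict, List

def check_ref_consistency(variation_objs: List[Dict]) -> None:
    """Require all variation objects to share one assembly/genome and one sample set.

    Each dict is assumed to carry 'sample_set_ref' plus either 'assembly_ref'
    or 'genome_ref', mirroring the Workspace objects fetched above.
    """
    assembly_refs = {o['assembly_ref'] for o in variation_objs if 'assembly_ref' in o}
    genome_refs = {o['genome_ref'] for o in variation_objs if 'genome_ref' in o}
    sampleset_refs = {o['sample_set_ref'] for o in variation_objs}

    if assembly_refs and len(assembly_refs) != 1:
        raise ValueError('variation objects are from different assembly refs')
    if genome_refs and len(genome_refs) != 1:
        raise ValueError('variation objects are from different genome refs')
    if len(sampleset_refs) != 1:
        raise ValueError('variation objects are from different sample set refs')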
Example #19
    def run_FamaGenomeProfiling(self, ctx, params):
        """
        Run genome functional profiling module of Fama.
        :param params: instance of type "FamaGenomeProfilingParams"
           (Parameters for genome functional profiling. workspace_name - the
           name of the workspace for input/output genome_refs - references to
           a genome object ref_dataset - the name of Fama reference dataset
           output_result_name - the name of the output DomainAnnotation) ->
           structure: parameter "workspace_name" of String, parameter
           "genome_ref" of list of String, parameter "ref_dataset" of String,
           parameter "output_feature_set_name" of String, parameter
           "output_annotation_name" of String
        :returns: instance of type "ReportResults" (Output report parameters
           report_name - the name of the report object report_ref - the
           reference to the report object) -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_FamaGenomeProfiling
        # Import protein sequences from input genome_ref
        ws_client = Workspace(self.ws_url)
        input_genome_refs = params['genome_ref']
        fama_reference = params['ref_dataset']
        input_proteins = {}
        name2ref = {}
        for input_genome_ref in input_genome_refs:
            ret = ws_client.get_objects2(
                {'objects': [{
                    'ref': input_genome_ref
                }]})['data'][0]
            obj_data = ret['data']
            obj_name = ret['info'][1]
            obj_type = ret['info'][2].split('.')[1].split('-')[0]
            if obj_type == 'GenomeSet':
                print('GenomeSet data', obj_data)
                genome_refs = []
                if 'elements' in obj_data:
                    genome_refs = [
                        item['ref'] for item in obj_data['elements'].values()
                    ]
                elif 'items' in obj_data:
                    genome_refs = [item['ref'] for item in obj_data['items']]
                for sub_obj_ref in genome_refs:
                    ret = ws_client.get_objects2(
                        {'objects': [{
                            'ref': sub_obj_ref
                        }]})['data'][0]
                    genome_data = ret['data']
                    genome_name = ret['info'][1]
                    if genome_name in name2ref:
                        raise ServerError(
                            'All input genome names must be unique. Check ' +
                            genome_name)
                    name2ref[genome_name] = sub_obj_ref
                    proteins = genome_proteins_to_fasta(
                        genome_data, self.shared_folder)
                    input_proteins[genome_name] = {}
                    input_proteins[genome_name]['fwd'] = proteins
            elif obj_type == 'Genome':
                if obj_name in name2ref:
                    raise ServerError('All input genome names must be unique')
                name2ref[obj_name] = input_genome_ref
                proteins = genome_proteins_to_fasta(obj_data,
                                                    self.shared_folder)
                input_proteins[obj_name] = {}
                input_proteins[obj_name]['fwd'] = proteins
            else:
                raise ServerError('Incompatible object: ' + input_genome_ref +
                                  ' (' + obj_name + ')')

        self.log('Input sequence files:', str(input_proteins))
        self.log('reference: ', fama_reference)
        # Run Fama
        fama_params = {
            'input_proteins': input_proteins,
            'work_dir': self.shared_folder,
            'reference': fama_reference,
            'ws_name': params['workspace_name'],
            'ws_client': ws_client,
            'featureset_name': params['output_feature_set_name'],
            'annotation_prefix': params['output_annotation_name'],
            'name2ref': name2ref
        }
        fama_output = protein_functional_profiling_pipeline(fama_params)
        objects_created = fama_output['objects_created']

        dfu = DataFileUtil(self.callback_url)
        workspace_id = dfu.ws_name_to_id(params['workspace_name'])

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': fama_output['feature_set_data'],
                'name': params['output_feature_set_name']
            }]
        }

        try:
            dfu_oi = dfu.save_objects(save_object_params)[0]
        except ServerError as dfue:
            # not really any way to test this block
            self.log('Logging exception saving feature set')
            self.log(str(dfue))
            raise
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0],
                                                dfu_oi[4])
        objects_created.append({
            'ref': feature_set_obj_ref,
            'description': 'Filtered genome features'
        })

        self.log('FeatureSet saved to ' + feature_set_obj_ref)

        # Write HTML output to workspace
        message = 'Fama protein functional profiling finished successfully'

        try:
            dfu_output = dfu.file_to_shock(
                {'file_path': fama_output['html_report']})
        except ServerError as dfue:
            # not really any way to test this block
            self.log('Logging exception loading results to shock')
            self.log(str(dfue))
            raise
        self.log('HTML report saved: ' + str(dfu_output))

        html_links = [{
            'shock_id': dfu_output['shock_id'],
            'description': 'HTML report for Fama App',
            'name': 'fama_report.html',
            'label': 'Fama_report'
        }]
        for krona_file in fama_output['krona_charts']:
            try:
                dfu_output = dfu.file_to_shock({'file_path': krona_file})
                html_links.append({
                    'shock_id':
                    dfu_output['shock_id'],
                    'description':
                    'Krona chart for function taxonomy profile',
                    'name':
                    fama_output['krona_charts'][krona_file][0],
                    'label':
                    fama_output['krona_charts'][krona_file][1]
                })
            except ServerError as dfue:
                # not really any way to test this block
                self.log('Logging exception loading results to shock')
                self.log(str(dfue))
                raise

        self.log('Krona chart saved: ' + str(dfu_output))

        # Save report
        report_params = {
            'message': message,
            'objects_created': objects_created,
            'direct_html_link_index': 0,
            'html_links': html_links,
            'file_links': fama_output['report_files'],
            'report_object_name': 'fama_profiling_report_' + str(uuid.uuid4()),
            'workspace_name': params['workspace_name'],
            'html_window_height': 460
        }
        try:
            self.log('Call KBaseReport at ' + str(self.callback_url))
            report = KBaseReport(self.callback_url)
            self.log('Ready to save KBase report: ' + str(report_params))
            report_info = report.create_extended_report(report_params)
        except ServerError as kre:
            # not really any way to test this block
            self.log('Logging exception saving report')
            self.log(str(kre))
            raise

        report_info['report_params'] = report_params
        self.log('KBase report saved: ' + str(report_info))
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END run_FamaGenomeProfiling

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_FamaGenomeProfiling return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
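
`run_FamaGenomeProfiling` builds the FeatureSet reference positionally from the object_info tuple returned by `dfu.save_objects` ([6] = workspace id, [0] = object id, [4] = version). A small sketch of that step as a hypothetical helper:

def object_info_to_ref(info: list) -> str:
    """Build a 'wsid/objid/ver' reference from a KBase object_info tuple.

    Assumes the standard object_info layout used above: [0] = object id,
    [4] = version, [6] = workspace id.
    """
    return '{}/{}/{}'.format(info[6], info[0], info[4])

# Example (values are made up):
# object_info_to_ref([12, 'my_fs', 'KBaseCollections.FeatureSet-4.0',
#                     '2020-02-13', 3, 'user', 43666, 'ws', 'md5', 0, {}])
# -> '43666/12/3'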
Example #20
class kb_ReadSim:
    '''
    Module Name:
    kb_ReadSim

    Module Description:
    A KBase module: kb_ReadSim
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/kb_ReadSim.git"
    GIT_COMMIT_HASH = "c9c0185e34d25be57cc6e1c901d8801fbc0f4784"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.du = DownloadUtils(self.callback_url)
        self.su = SimUtils()
        self.ru = ReadsUtils(self.callback_url)
        self.vu = VariationUtil(self.callback_url)
        self.eu = VcfEvalUtils()
        self.hu = htmlreportutils()
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def run_kb_ReadSim(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "Inparams" -> structure: parameter
           "workspace_name" of String, parameter "input_sample_set" of
           String, parameter "strain_info" of String, parameter
           "assembly_or_genome_ref" of String, parameter "base_error_rate" of
           String, parameter "outer_distance" of String, parameter
           "standard_deviation" of String, parameter "num_read_pairs" of
           String, parameter "len_first_read" of String, parameter
           "len_second_read" of String, parameter "mutation_rate" of String,
           parameter "frac_indels" of String, parameter
           "variation_object_name" of String, parameter "output_read_object"
           of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_ReadSim
        output_dir = self.shared_folder
        print(params)
        self.su.validate_simreads_params(params)

        genome_or_assembly_ref = params['assembly_or_genome_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{
                'ref': genome_or_assembly_ref
            }]})['infos'][0][2]
        if ('KBaseGenomes.Genome' in obj_type):
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type +
                             ' is not the right input for this method. ' +
                             'Valid inputs include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly')

        self.du.download_genome(assembly_ref, output_dir)

        ref_genome = os.path.join(self.shared_folder, "ref_genome.fa")
        output_fwd_paired_file_path = os.path.join(self.shared_folder,
                                                   "read1.fq")
        output_rev_paired_file_path = os.path.join(self.shared_folder,
                                                   "read2.fq")

        self.eu.check_path_exists(ref_genome)

        self.su.simreads(ref_genome, output_fwd_paired_file_path,
                         output_rev_paired_file_path, params)
        self.eu.check_path_exists(output_fwd_paired_file_path)
        self.eu.check_path_exists(output_rev_paired_file_path)

        retVal = self.ru.upload_reads({
            'wsname': params['workspace_name'],
            'name': params['output_read_object'],
            'sequencing_tech': 'illumina',
            'fwd_file': output_fwd_paired_file_path,
            'rev_file': output_rev_paired_file_path
        })

        logfile = os.path.join(self.shared_folder, "variant.txt")
        self.eu.check_path_exists(logfile)

        vcf_file = self.su.format_vcf(logfile)
        self.eu.check_path_exists(vcf_file)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['assembly_or_genome_ref'],
            'sample_set_ref': params['input_sample_set'],
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': vcf_file,
            'variation_object_name': params['variation_object_name']
        }
        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_ReadSim

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_ReadSim return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def run_eval_variantcalling(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "Evalparams" -> structure: parameter
           "workspace_name" of String, parameter "sim_varobject_name" of
           String, parameter "calling_varobject_name" of String, parameter
           "output_var_object" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_eval_variantcalling

        print(params)
        self.eu.validate_eval_params(params)

        report_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(report_dir)

        self.ws = Workspace(url=self.ws_url, token=ctx['token'])

        var_object_ref1 = params['varobject_ref1']
        sampleset_ref1 = self.ws.get_objects2({
            'objects': [{
                "ref": var_object_ref1,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']['sample_set_ref']

        var_object_ref2 = params['varobject_ref2']
        sampleset_ref2 = self.ws.get_objects2({
            'objects': [{
                "ref": var_object_ref2,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']['sample_set_ref']

        if (sampleset_ref1 != sampleset_ref2):
            raise Exception(
                "Variation objects are from different sample sets\n")

        assembly_ref_set = set()
        genomeset_ref_set = set()

        variation_obj1 = self.ws.get_objects2(
            {'objects': [{
                'ref': var_object_ref1
            }]})['data'][0]

        if 'assembly_ref' in variation_obj1['data']:
            assembly_ref1 = variation_obj1['data']['assembly_ref']
            assembly_ref_set.add(assembly_ref1)
        elif 'genome_ref' in variation_obj1['data']:
            genome_ref1 = variation_obj1['data']['genome_ref']
            genomeset_ref_set.add(genome_ref1)

        variation_obj2 = self.ws.get_objects2(
            {'objects': [{
                'ref': var_object_ref2
            }]})['data'][0]
        if 'assembly_ref' in variation_obj2['data']:
            assembly_ref2 = variation_obj2['data']['assembly_ref']
            assembly_ref_set.add(assembly_ref2)
        elif 'genome_ref' in variation_obj2['data']:
            genome_ref2 = variation_obj2['data']['genome_ref']
            genomeset_ref_set.add(genome_ref2)

        assembly_or_genome_ref = None

        if (not genomeset_ref_set and len(assembly_ref_set) != 1):
            raise Exception(
                "variation objects are from different assembly refs")
        elif (not assembly_ref_set and len(genomeset_ref_set) != 1):
            raise Exception("variation objects are from different genome refs")

        simvarfile = os.path.join(report_dir, "simvariant.vcf.gz")
        simvarpath = self.du.download_variations(var_object_ref1, simvarfile)

        os.rename(simvarpath, simvarfile)
        self.eu.index_vcf(simvarfile)

        callingvarfile = os.path.join(report_dir, "callingvariant.vcf.gz")
        callingvarpath = self.du.download_variations(var_object_ref2,
                                                     callingvarfile)

        os.rename(callingvarpath, callingvarfile)
        self.eu.index_vcf(callingvarfile)

        eval_results = self.eu.variant_evalation(simvarfile, callingvarfile,
                                                 report_dir)

        unique_vcf1 = eval_results['unique1']
        self.eu.check_path_exists(unique_vcf1)

        unique_vcf2 = eval_results['unique2']
        self.eu.check_path_exists(unique_vcf2)

        common_vcf = eval_results['common']
        self.eu.check_path_exists(common_vcf)

        image_path = self.eu.plot_venn_diagram(report_dir, unique_vcf1,
                                               unique_vcf2, common_vcf)
        self.eu.check_path_exists(image_path)
        '''
        if(len(assembly_ref_set) != 0):
            assembly_or_genome_ref = assembly_ref_set.pop()
        elif(len(genomeset_ref_set) != 0):
            assembly_or_genome_ref = genomeset_ref_set.pop()

        logging.info("Saving Unique1 vcf\n")
        save_unique_variation_params1 = {'workspace_name': params['workspace_name'],
                                        'genome_or_assembly_ref': assembly_or_genome_ref,
                                        'sample_set_ref': sampleset_ref1,
                                        'sample_attribute_name': 'sample_unique_attr1',
                                        'vcf_staging_file_path': unique_vcf1,
                                        'variation_object_name': params['output_variant_object'] + "_sample1_unique"
        }
        self.vu.save_variation_from_vcf(save_unique_variation_params1)
        logging.info("Saving done\n")

        logging.info("Saving Unique2 vcf\n")
        save_unique_variation_params2 = {'workspace_name': params['workspace_name'],
                                        'genome_or_assembly_ref': assembly_or_genome_ref,
                                        'sample_set_ref': sampleset_ref1,
                                        'sample_attribute_name': 'sample_unique_attr2',
                                        'vcf_staging_file_path': unique_vcf2,
                                        'variation_object_name': params['output_variant_object'] + "_sample2_unique"
        }
        self.vu.save_variation_from_vcf(save_unique_variation_params2)
        logging.info("Saving done\n")

        logging.info("Saving Common vcf\n")
        save_common_variation_params = {'workspace_name': params['workspace_name'],
                                 'genome_or_assembly_ref': assembly_or_genome_ref,
                                 'sample_set_ref': sampleset_ref1,
                                 'sample_attribute_name': 'sample_common_attr',
                                 'vcf_staging_file_path': common_vcf,
                                 'variation_object_name': params['output_variant_object'] + "_sample1_sample2_common"
        }
        self.vu.save_variation_from_vcf(save_common_variation_params)
        logging.info("Saving done\n")
        '''

        workspace = params['workspace_name']
        output = self.hu.create_html_report(self.callback_url, report_dir,
                                            workspace)
        #END run_eval_variantcalling

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_eval_variantcalling return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
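
Both methods above fetch only the `/sample_set_ref` field by passing an `included` path list to `get_objects2`, which avoids downloading the full (potentially large) Variation object. A minimal sketch, assuming a Workspace client like `self.ws` above (hypothetical helper name):

def get_sample_set_ref(ws, variation_ref: str) -> str:
    """Fetch only the /sample_set_ref field of a Variation object.

    'ws' is assumed to be a Workspace client instance like self.ws above;
    'included' restricts the returned object data to the listed subpaths.
    """
    data = ws.get_objects2({
        'objects': [{'ref': variation_ref, 'included': ['/sample_set_ref']}]
    })['data'][0]['data']
    return data['sample_set_ref']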
Example #21
def build_report_view_data(host: str, ws_client: Workspace,
                           result: list) -> dict:
    """
    Returns a structure like this:
    {
        html: {
            height: max height string for iframes (default = 500px, unless present in report),
            set_height: boolean - if True, then apply height to the height style value as well.
            direct: string (optional) - direct html to plop in the page,
            iframe_style: string (optional) - styling for direct html iframe,
            links: [{
                url: string,
                name: string,
                description: string,
                handle: ?
                label: ?
            }],
            paths: [ path1, path2, path3, ... ] for all urls in links (just a convenience),
            link_idx: index of paths to use
                (this is a little funky, might get cleared up in a later iteration.)
                (I suspect this'll be here 3 years later. Today's 2/13/2020. Let's see!)
            file_links: [{
                'URL': 'https://ci.kbase.us/services/shock-api/node/a2625b71-48d5-4ba6-8603-355485508da8',
                'description': 'JGI Metagenome Assembly Report',
                'handle': 'KBH_253154',
                'label': 'assembly_report',
                'name': 'assembly_report.zip'
            }]
        }
        objects: [{
            'upa': '...',
            'name': 'foo',
            'type': '...',
            'description': '...'
        }]
        summary: '',
        summary_height: height string for summary panel (default = 500px unless specified in report),
        report: ''
    }
    """
    if not result:
        return {}
    if not isinstance(result, list):
        result = [result]
    if (not result[0] or not isinstance(result[0], dict)
            or not result[0].get('report_name')
            or not result[0].get('report_ref')):
        return {}
    report_ref = result[0]['report_ref']
    report = ws_client.get_objects2({'objects': [{
        'ref': report_ref
    }]})['data'][0]['data']
    """{'direct_html': None,
     'direct_html_link_index': None,
     'file_links': [],
     'html_links': [],
     'html_window_height': None,
     'objects_created': [{'description': 'Annotated genome', 'ref': '43666/6/1'}],
     'summary_window_height': None,
     'text_message': 'Genome saved to: wjriehl:narrative_1564507007662/some_genome\nNumber of genes predicted: 3895\nNumber of protein coding genes: 3895\nNumber of genes with non-hypothetical function: 2411\nNumber of genes with EC-number: 1413\nNumber of genes with Seed Subsystem Ontology: 1081\nAverage protein length: 864 aa.\n',
     'warnings': []}
    """
    created_objs = []
    if report.get('objects_created'):
        report_objs_created = report['objects_created']
        # make list to look up obj types with get_object_info3
        info_lookup = [{"ref": o["ref"]} for o in report_objs_created]
        infos = ws_client.get_object_info3({'objects': info_lookup})['infos']
        for idx, info in enumerate(infos):
            created_objs.append({
                'upa':
                report_objs_created[idx]['ref'],
                'description':
                report_objs_created[idx].get('description', ''),
                'name':
                info[1],
                'type':
                info[2].split('-')[0].split('.')[-1],
                'link':
                host + '/#dataview/' + report_objs_created[idx]['ref']
            })
    html_height = report.get("html_window_height")
    if html_height is None:
        html_height = 500
    html = {"height": f"{html_height}px", "set_height": True}
    if report.get("direct_html"):
        if not report.get("direct_html").startswith("<html"):
            html["set_height"] = False
        html["direct"] = "data:text/html;charset=utf-8," + quote(
            report.get("direct_html"))

    if report.get("html_links"):
        idx = report.get("direct_html_link_index", 0)
        if idx is None or idx < 0 or idx >= len(report["html_links"]):
            idx = 0
        html["links"] = report["html_links"]
        html["paths"] = list()
        for i, link in enumerate(html["links"]):
            html["paths"].append(f'/api/v1/{report_ref}/$/{i}/{link["name"]}')
        html["link_idx"] = idx

    if report.get("file_links"):
        html["file_links"] = report["file_links"]

    summary_height = report.get("summary_window_height")
    if summary_height is None:
        summary_height = 500

    html["iframe_style"] = f"max-height: {html['height']}"
    if html["set_height"]:
        html["iframe_style"] += f"; height: {html['height']}"
    else:
        html["iframe_style"] += "; height: auto"
    return {
        "objects": created_objs,
        "summary": report.get("text_message", ""),
        "summary_height": f"{summary_height}px",
        "html": html
    }
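
The `direct` field above inlines report HTML as a percent-encoded `data:` URI. A minimal sketch of that encoding, using only the standard library (hypothetical helper name):

from urllib.parse import quote

def html_to_data_uri(html: str) -> str:
    """Percent-encode an HTML fragment as a data: URI, as done for 'direct' above."""
    return 'data:text/html;charset=utf-8,' + quote(html)

# html_to_data_uri('<b>done</b>') -> 'data:text/html;charset=utf-8,%3Cb%3Edone%3C/b%3E'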
Example #22
    def stage_input(self, input_ref, fasta_file_extension):
        '''
        Stage input based on an input data reference for CheckM

        input_ref can be a reference to an Assembly, BinnedContigs, or (not yet implemented) a Genome

        This method creates a directory in the scratch area with the set of Fasta files, names
        will have the fasta_file_extension parameter tacked on.

            ex:

            staged_input = stage_input('124/15/1', 'fna')

            staged_input
            {"input_dir": '...'}
        '''
        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'
        [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple
        ws = Workspace(self.ws_url)

        # 1) generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'bins_' + suffix)
        all_seq_fasta = os.path.join(self.scratch, 'all_sequences_' + suffix + '.' + fasta_file_extension)
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)


        # 2) based on type, download the files
        obj_name = self.get_data_obj_name (input_ref)
        type_name = self.get_data_obj_type (input_ref)

        # auClient
        try:
            auClient = AssemblyUtil(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate auClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e))

        # setAPI_Client
        try:
            #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
            setAPI_Client = SetAPI (url=self.serviceWizardURL, token=self.ctx['token'])  # for dynamic service
        except Exception as e:
            raise ValueError('Unable to instantiate setAPI_Client with serviceWizardURL: '+ self.serviceWizardURL +' ERROR: ' + str(e))

        # mguClient
        try:
            mguClient = MetagenomeUtils(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate mguClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e))


        # Standard Single Assembly
        #
        if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']:
            # create file data
            filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': input_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # AssemblySet
        #
        elif type_name == 'KBaseSets.AssemblySet':

            # read assemblySet
            try:
                assemblySet_obj = setAPI_Client.get_assembly_set_v1 ({'ref':input_ref, 'include_item_info':1})
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' + input_ref +')' + str(e))
            assembly_refs = []
            assembly_names = []
            for assembly_item in assemblySet_obj['data']['items']:
                this_assembly_ref = assembly_item['ref']
                # assembly obj info
                try:
                    this_assembly_info = ws.get_object_info_new ({'objects':[{'ref':this_assembly_ref}]})[0]
                    this_assembly_name = this_assembly_info[NAME_I]
                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' + this_assembly_ref +'): ' + str(e))
                assembly_refs.append(this_assembly_ref)
                assembly_names.append(this_assembly_name)

            # create file data (name for file is what's reported in results)
            for ass_i,assembly_ref in enumerate(assembly_refs):
                this_name = assembly_names[ass_i]
                filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
                if not os.path.isfile(filename):
                    raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # Binned Contigs
        #
        elif type_name == 'KBaseMetagenomes.BinnedContigs':

            # download the bins as fasta and set the input folder name
            bin_file_dir = mguClient.binned_contigs_to_file({'input_ref': input_ref, 'save_to_shock': 0})['bin_file_directory']
            os.rename(bin_file_dir, input_dir)
            # make sure fasta file isn't empty
            self.set_fasta_file_extensions(input_dir, fasta_file_extension)
            for (dirpath, dirnames, filenames) in os.walk(input_dir):
                for fasta_file in filenames:
                    fasta_path = os.path.join (input_dir,fasta_file)
                    min_fasta_len = 1
                    if not self.fasta_seq_len_at_least(fasta_path, min_fasta_len):
                        raise ValueError('Binned Assembly is empty for fasta_path: '+str(fasta_path))
                break

        # Genome and GenomeSet
        #
        elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
            genome_obj_names = []
            genome_sci_names = []
            genome_assembly_refs = []

            if type_name == 'KBaseGenomes.Genome':
                genomeSet_refs = [input_ref]
            else:  # get genomeSet_refs from GenomeSet object
                genomeSet_refs = []
                try:
                    genomeSet_object = ws.get_objects2({'objects':[{'ref':input_ref}]})['data'][0]['data']
                except Exception as e:
                    raise ValueError('Unable to fetch '+str(input_ref)+' object from workspace: ' + str(e))
                    #to get the full stack trace: traceback.format_exc()

                # iterate through genomeSet members
                for genome_id in genomeSet_object['elements'].keys():
                    if 'ref' not in genomeSet_object['elements'][genome_id] or \
                       genomeSet_object['elements'][genome_id]['ref'] == None or \
                       genomeSet_object['elements'][genome_id]['ref'] == '':
                        raise ValueError('genome_ref not found for genome_id: '+str(genome_id)+' in genomeSet: '+str(input_ref))
                    else:
                        genomeSet_refs.append(genomeSet_object['elements'][genome_id]['ref'])

            # genome obj data
            for i,this_input_ref in enumerate(genomeSet_refs):
                try:
                    objects = ws.get_objects2({'objects':[{'ref':this_input_ref}]})['data']
                    genome_obj = objects[0]['data']
                    genome_obj_info = objects[0]['info']
                    genome_obj_names.append(genome_obj_info[NAME_I])
                    genome_sci_names.append(genome_obj['scientific_name'])
                except:
                    raise ValueError ("unable to fetch genome: "+this_input_ref)

                # Get genome_assembly_ref
                if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] is None) \
                   and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] is None):
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                    raise ValueError (msg)
                elif 'assembly_ref' in genome_obj and genome_obj['assembly_ref'] is not None:
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING assembly_ref: "+str(genome_obj['assembly_ref'])
                    print (msg)
                    genome_assembly_refs.append(genome_obj['assembly_ref'])
                elif 'contigset_ref' in genome_obj and genome_obj['contigset_ref'] is not None:
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING contigset_ref: "+str(genome_obj['contigset_ref'])
                    print (msg)
                    genome_assembly_refs.append(genome_obj['contigset_ref'])

            # create file data (name for file is what's reported in results)
            for ass_i,assembly_ref in enumerate(genome_assembly_refs):
                this_name = genome_obj_names[ass_i]
                filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
                if not os.path.isfile(filename):
                    raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # Unknown type slipped through
        #
        else:
            raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)


        # create summary fasta file with all bins
        self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

        return {'input_dir': input_dir, 'folder_suffix': suffix, 'all_seq_fasta': all_seq_fasta}
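
`stage_input` leans on `self.fasta_seq_len_at_least` (defined elsewhere in the module) to reject empty FASTA files. A plausible minimal sketch of such a check, assuming plain uncompressed FASTA; this is an illustration, not the module's actual implementation:

def fasta_seq_len_at_least(fasta_path: str, min_len: int = 1) -> bool:
    """Return True once the FASTA file contains at least min_len residues."""
    total = 0
    with open(fasta_path) as fh:
        for line in fh:
            if line.startswith('>'):
                continue  # skip record headers
            total += len(line.strip())
            if total >= min_len:
                return True  # stop early; no need to read the whole file
    return total >= min_len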
Example #23
class AveExpressionMatrixBuilder:

    def _validate_calculate_average_expression_matrix_params(self, params):
        """
        _validate_calculate_average_expression_matrix_params:
                validates params passed to calculate_average_expression_matrix method
        """

        log('start validating calculate_average_expression_matrix params')

        # check for required parameters
        for p in ['expression_matrix_ref', 'output_suffix', 'workspace_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def _generate_report(self, expression_matrix_ref, workspace_name):
        """
        _generate_report: generate report
        """

        objects_created = [{'ref': expression_matrix_ref,
                            'description': 'Average ExpressionMatrix'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         # 'html_links': output_html_files,
                         # 'direct_html_link_index': 0,
                         'html_window_height': 366,
                         'report_object_name': 'kb_ave_expr_matrix_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _save_expression_matrix(self, em_data, em_obj_name, workspace_name):
        """
        _save_expression_matrix: saving ExpressionMatrix
        """

        try:
            log('saving ExpressionMatrix [{}]'.format(em_obj_name))
        
            data_type = 'KBaseFeatureValues.ExpressionMatrix'
            obj_info = self.dfu.save_objects({'id': self.dfu.ws_name_to_id(workspace_name),
                                              'objects': [{'type': data_type,
                                                           'data': em_data,
                                                           'name': em_obj_name}]})[0]
        except Exception as e:
            log(e)
            raise Exception('Failed Saving ExpressionMatrix to Workspace') from e

        expression_matrix_ref = str(obj_info[6]) + '/' + str(obj_info[0]) + '/' + str(obj_info[4])

        return expression_matrix_ref

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.scratch = config['scratch']

    def calculate_average_expression_matrix(self, params):
        """
        calculate_average_expression_matrix: create an average ExpressionMatrix object 
                                             from a ExpressionMatrix object

        required params:
        expression_matrix_ref: ExpressionMatrix object reference
        output_suffix: output average ExpressionMatrix name suffix
        workspace_name: the name of the workspace it gets saved to
        
        return:
        average_expression_matrix_ref: generated average ExpressionMatrix object reference
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        log('--->\nrunning AveExpressionMatrixBuilder.calculate_average_expression_matrix\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_calculate_average_expression_matrix_params(params)

        expression_matrix_ref = params.get('expression_matrix_ref')
        expression_matrix = self.ws.get_objects2({'objects':
                                                  [{'ref': 
                                                    expression_matrix_ref}]})['data'][0]

        expression_matrix_data = expression_matrix['data']
        expression_matrix_info = expression_matrix['info']

        condition_map = expression_matrix_data['condition_mapping']

        ori_data = expression_matrix_data['data']
        ori_col_ids = ori_data['col_ids']
        ori_row_ids = ori_data['row_ids']
        ori_values = ori_data['values']

        labels = list(condition_map.keys())

        if set(labels) != set(ori_col_ids):
            error_msg = 'condition_mapping labels do not match matrix columns\n'
            error_msg += 'matrix column ids: {}\n'.format(ori_col_ids)
            error_msg += 'labels in condition_mapping: {}'.format(labels)
            raise ValueError(error_msg)

        condition_pos = {}

        for label, condition in condition_map.items():
            if condition not in condition_pos:
                condition_pos.update({condition: [ori_col_ids.index(label)]})
            else:
                condition_list = condition_pos[condition]
                condition_list.append(ori_col_ids.index(label))
                condition_pos.update({condition: condition_list})

        conditions = list(condition_pos.keys())

        ave_values = []
        for ori_value in ori_values:
            ave_value = [None] * len(conditions)
            for condition, poss in condition_pos.items():
                ave_pos = conditions.index(condition)
                sum_value = 0.0
                for pos in poss:
                    sum_value += round(float(ori_value[pos]), 3) 
                average = sum_value / len(poss)
                ave_value[ave_pos] = round(average, 2)

            ave_values.append(ave_value)

        average_data = {}
        average_data.update({'row_ids': ori_row_ids})
        average_data.update({'col_ids': conditions})
        average_data.update({'values': ave_values})

        em_data = {}
        genome_ref = expression_matrix_data.get('genome_ref')
        if genome_ref:
            em_data.update({'genome_ref': genome_ref})
        em_data.update({'scale': expression_matrix_data.get('scale')})
        em_data.update({'type': expression_matrix_data.get('type')})
        em_data.update({'feature_mapping': expression_matrix_data.get('feature_mapping')})
        em_data.update({'condition_mapping': expression_matrix_data.get('condition_mapping')})
        em_data.update({'data': average_data})

        expression_matrix_name = expression_matrix_info[1]
        ave_expression_matrix_name = expression_matrix_name + params.get('output_suffix')

        workspace_name = params.get('workspace_name')

        ave_expression_matrix_ref = self._save_expression_matrix(em_data, 
                                                                 ave_expression_matrix_name, 
                                                                 workspace_name)

        returnVal = {'average_expression_matrix_ref': ave_expression_matrix_ref}

        report_output = self._generate_report(ave_expression_matrix_ref,
                                              workspace_name)
        returnVal.update(report_output)

        return returnVal
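
The averaging loop in `calculate_average_expression_matrix` walks each row and averages the columns mapped to the same condition. A compact standalone sketch of the same computation (hypothetical helper name; condition order here is sorted rather than insertion order):

from typing import Dict, List

def average_by_condition(col_ids: List[str], values: List[List[float]],
                         condition_map: Dict[str, str]):
    """Average matrix columns that share a condition label.

    condition_map maps each column id to its condition, mirroring the
    'condition_mapping' field used above.
    """
    conditions = sorted(set(condition_map.values()))
    # positions of the columns belonging to each condition
    col_pos = {c: [i for i, cid in enumerate(col_ids) if condition_map[cid] == c]
               for c in conditions}
    averaged = [[sum(row[i] for i in col_pos[c]) / len(col_pos[c])
                 for c in conditions] for row in values]
    return conditions, averaged

# average_by_condition(['s1', 's2'], [[2.0, 4.0]], {'s1': 'A', 's2': 'A'})
# -> (['A'], [[3.0]])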
Example #24
class DESeqUtil:

    PREPDE_TOOLKIT_PATH = '/kb/module/lib/kb_deseq/Utils'

    def _validate_run_deseq2_app_params(self, params):
        """
        _validate_run_deseq2_app_params:
                validates params passed to run_deseq2_app method
        """

        logging.info('start validating run_deseq2_app params')

        # check for required parameters
        for p in ['expressionset_ref', 'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _validate_run_deseq2_app_with_condition_set_params(self, params):
        """
        _validate_run_deseq2_app_params:
                validates params passed to run_deseq2_app method
        """

        logging.info(
            'start validating run_deseq2_app_with_condition_set params')

        # check for required parameters
        for p in [
                'expressionset_ref', 'workspace_name',
                'diff_expression_obj_name', 'conditionset_ref', 'group_factor'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _xor(self, a, b):
        return bool(a) != bool(b)

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """
        logging.info('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exit_code = pipe.returncode

        if exit_code == 0:
            logging.info(f'Executed command:\n{command}\n'
                         f'Exit Code: {exit_code}\nOutput:\n{output}')
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exit_code, output)
            raise ValueError(error_msg)
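
    # For reference, the same run-and-check pattern written with
    # subprocess.run, the interface recommended since Python 3.5
    # (capture_output requires 3.7+). This is an illustrative sketch,
    # not part of the original module.
    @staticmethod
    def _run_command_sketch(command):
        result = subprocess.run(command, shell=True,
                                capture_output=True, text=True)
        if result.returncode != 0:
            raise ValueError(f'Error running command:\n{command}\n'
                             f'Exit Code: {result.returncode}\n'
                             f'Output:\n{result.stdout}')
        return result.stdout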

    def _generate_html_report(self, result_directory, diff_expression_obj_ref,
                              params):
        """
        _generate_html_report: generate html summary report
        """

        logging.info('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        os.makedirs(output_directory, exist_ok=True)
        result_file_path = os.path.join(output_directory, 'report.html')

        visualization_content = ''
        dispersion_plots_name = 'deseq2_dispersion_plot.png'
        dispersion_plots_display_name = 'Dispersion plot'

        shutil.copy2(os.path.join(result_directory, dispersion_plots_name),
                     os.path.join(output_directory, dispersion_plots_name))
        visualization_content += '<div class="gallery">'
        visualization_content += '<a target="_blank" href="{}">'.format(
            dispersion_plots_name)
        visualization_content += '<img src="{}" '.format(dispersion_plots_name)
        visualization_content += 'alt="{}" width="600" height="400">'.format(
            dispersion_plots_display_name)
        visualization_content += '</a><div class="desc">{}</div></div>'.format(
            dispersion_plots_display_name)

        pca_plots_name = 'deseq2_PCA_plot.png'
        pca_plots_display_name = 'PCA plot'

        shutil.copy2(os.path.join(result_directory, pca_plots_name),
                     os.path.join(output_directory, pca_plots_name))
        visualization_content += '<div class="gallery">'
        visualization_content += '<a target="_blank" href="{}">'.format(
            pca_plots_name)
        visualization_content += '<img src="{}" '.format(pca_plots_name)
        visualization_content += 'alt="{}" width="600" height="400">'.format(
            pca_plots_display_name)
        visualization_content += '</a><div class="desc">{}</div></div>'.format(
            pca_plots_display_name)

        diff_expr_set_data = self.ws.get_objects2(
            {'objects': [{
                'ref': diff_expression_obj_ref
            }]})['data'][0]['data']

        overview_content = ''
        overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrixSet'
        overview_content += ' Object</th></tr>'
        overview_content += '<tr><td>{} ({})'.format(
            params.get('diff_expression_obj_name'), diff_expression_obj_ref)
        overview_content += '</td></tr></table>'

        overview_content += '<p><br/></p>'

        overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrix'
        overview_content += ' Object</th><th></th><th></th><th></th></tr>'
        overview_content += '<tr><th>Differential Expression Matrix Name</th>'
        overview_content += '<th>Feature Count</th>'
        overview_content += '</tr>'

        for item in diff_expr_set_data['items']:
            diff_expr_ref = item['ref']
            diff_expr_object = self.ws.get_objects2(
                {'objects': [{
                    'ref': diff_expr_ref
                }]})['data'][0]

            diff_expr_data = diff_expr_object['data']
            diff_expr_info = diff_expr_object['info']
            diff_expr_name = diff_expr_info[1]
            number_features = len(diff_expr_data['data']['row_ids'])

            overview_content += '<tr><td>{} ({})</td>'.format(
                diff_expr_name, diff_expr_ref)
            overview_content += '<td>{}</td></tr>'.format(number_features)
        overview_content += '</table>'

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', overview_content)
                report_template = report_template.replace(
                    '<p>Visualization_Content</p>', visualization_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for DESeq2 App'
        })
        return html_report

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """

        logging.info('start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        os.makedirs(output_directory, exist_ok=True)
        result_file = os.path.join(output_directory, 'DESeq2_result.zip')
        plot_file = os.path.join(output_directory, 'DESeq2_plot.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.zip') or file.endswith('.png')
                            or file.endswith('.DS_Store')):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by DESeq2 App'
        })

        with zipfile.ZipFile(plot_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if file.endswith('.png'):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': plot_file,
            'name': os.path.basename(plot_file),
            'label': os.path.basename(plot_file),
            'description': 'Visualization plots by DESeq2 App'
        })

        return output_files

    def _generate_report(self, diff_expression_obj_ref, params,
                         result_directory):
        """
        _generate_report: generate summary report
        """

        logging.info('creating report')

        output_files = self._generate_output_file_list(result_directory)

        output_html_files = self._generate_html_report(
            result_directory, diff_expression_obj_ref, params)

        diff_expr_set_data = self.ws.get_objects2(
            {'objects': [{
                'ref': diff_expression_obj_ref
            }]})['data'][0]['data']

        items = diff_expr_set_data['items']

        description_set = 'DifferentialExpressionMatrixSet generated by DESeq2'
        description_object = 'DifferentialExpressionMatrix generated by DESeq2'
        objects_created = []
        objects_created.append({
            'ref': diff_expression_obj_ref,
            'description': description_set
        })

        for item in items:
            diff_expr_ref = item['ref']
            objects_created.append({
                'ref': diff_expr_ref,
                'description': description_object
            })

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name': 'kb_deseq2_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _save_count_matrix_file(self, result_directory):
        """
        _save_count_matrix_file: download the GTF file for each expression,
                                 run prepDE.py on them, and save the resulting
                                 count matrix file
        """

        logging.info('generating count matrix file')

        conditions = []
        genome_ref = None
        items = self.expression_set_data['items']

        gtf_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        os.makedirs(gtf_directory, exist_ok=True)

        mapping_file = os.path.join(gtf_directory, "mapping.txt")
        with open(mapping_file, 'w') as input_mapping:
            for item in items:
                expression_ref = item['ref']
                expression_object = self.ws.get_objects2(
                    {'objects': [{
                        'ref': expression_ref
                    }]})['data'][0]
                expression_data = expression_object['data']
                expression_info = expression_object['info']
                handle_id = expression_data.get('file').get('hid')
                expression_name = expression_info[1]
                conditions.append(expression_data['condition'])
                genome_ref = expression_data['genome_id']

                tmp_gtf_directory = os.path.join(gtf_directory,
                                                 expression_name)
                os.makedirs(tmp_gtf_directory, exist_ok=True)

                self.dfu.shock_to_file({
                    'handle_id': handle_id,
                    'file_path': tmp_gtf_directory,
                    'unpack': 'unpack'
                })

                input_mapping.write("{}\t{}/transcripts.gtf\n".format(
                    expression_name, tmp_gtf_directory))

        self._run_prepDE(result_directory, mapping_file)
        return ",".join(conditions), genome_ref

    def _run_prepDE(self, result_directory, input_file):
        """
        _run_prepDE: run prepDE.py script

        ref: http://ccb.jhu.edu/software/stringtie/index.shtml?t=manual#deseq
        """

        logging.info('generating matrix of read counts')

        command = self.PREPDE_TOOLKIT_PATH + '/prepDE.py '
        command += '-i {} '.format(input_file)
        command += '-g {} '.format(
            os.path.join(result_directory, 'raw_gene_count_matrix.csv'))
        command += '-t {} '.format(
            os.path.join(result_directory, 'transcript_count_matrix.csv'))

        self._run_command(command)

        # remove novel genes from results (ideally should compare against expression set)
        with open(os.path.join(result_directory,
                               'raw_gene_count_matrix.csv')) as infile, open(
                                   os.path.join(result_directory,
                                                'gene_count_matrix.csv'),
                                   'w') as outfile:
            outfile.writelines([l for l in infile if "STRG." not in l])

    def _generate_diff_expression_csv(self, result_directory, condition_string,
                                      params):
        """
        _generate_diff_expression_csv: get different expression matrix with DESeq2
        """

        result_files = os.listdir(result_directory)
        if 'gene_count_matrix.csv' not in result_files:
            error_msg = 'Missing gene_count_matrix.csv, available files: {}'.format(
                result_files)
            raise ValueError(error_msg)
        pair_string = ",".join(
            ["_vs_".join(x) for x in params['condition_labels']])
        rcmd_list = [
            'Rscript',
            os.path.join(os.path.dirname(__file__), 'run_DESeq.R')
        ]
        rcmd_list.extend(
            ['--result_directory', '"{}"'.format(result_directory)])
        rcmd_list.extend(
            ['--condition_string', '"{}"'.format(condition_string)])
        rcmd_list.extend(['--contrast_pairs', '"{}"'.format(pair_string)])
        if params.get('input_type') == 'transcripts':
            rcmd_list.extend(['--transcripts'])

        rcmd_str = " ".join(str(x) for x in rcmd_list)

        self._run_command(rcmd_str)

    def _save_diff_expression(self, result_directory, params):
        """
        _save_diff_expression: save DifferentialExpression object to workspace
        """

        logging.info(
            'start saving KBaseFeatureValues.DifferentialExpressionMatrix object'
        )

        workspace_name = params.get('workspace_name')
        diff_expression_obj_name = params.get('diff_expression_obj_name')

        destination_ref = workspace_name + '/' + diff_expression_obj_name

        diff_expr_files = list()

        for res_file in os.listdir(result_directory):
            if 'deseq_results.csv' not in res_file:
                continue
            condition_labels = res_file.replace('_deseq_results.csv',
                                                '').split('_vs_', 2)[:2]

            genes_results_filepath = os.path.join(result_directory, res_file)

            with open(genes_results_filepath, "r") as f:
                reader = csv.reader(f)
                columns = next(reader)[1:]

            columns[columns.index('log2FoldChange')] = 'log2_fold_change'
            columns[columns.index('pvalue')] = 'p_value'
            columns[columns.index('padj')] = 'q_value'
            for line in fileinput.input(genes_results_filepath, inplace=True):
                if fileinput.isfirstline():
                    print('gene_id,' + ','.join(columns))
                else:
                    # the line already ends with a newline; avoid doubling it
                    print(line, end='')

            diffexpr_filepath = genes_results_filepath.replace(
                'deseq_results.csv', 'differential_expression_result.csv')

            with open(genes_results_filepath) as results_file, \
                    open(diffexpr_filepath, 'w') as csvfile:
                reader = csv.DictReader(results_file)
                fieldnames = [
                    'gene_id', 'log2_fold_change', 'p_value', 'q_value'
                ]
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()

                for row in reader:
                    writer.writerow({
                        'gene_id': row.get('gene_id'),
                        'log2_fold_change': row.get('log2_fold_change'),
                        'p_value': row.get('p_value'),
                        'q_value': row.get('q_value')
                    })

            diff_expr_files.append({
                'condition_mapping': {
                    condition_labels[0]: condition_labels[1]
                },
                'diffexpr_filepath': diffexpr_filepath
            })

        upload_diff_expr_params = {
            'destination_ref': destination_ref,
            'diffexpr_data': diff_expr_files,
            'tool_used': 'deseq',
            'tool_version': '1.16.1',
            'genome_ref': params['genome_ref']
        }

        deu_upload_return = self.deu.save_differential_expression_matrix_set(
            upload_diff_expr_params)

        diff_expression_obj_ref = deu_upload_return['diffExprMatrixSet_ref']

        return diff_expression_obj_ref

    def _get_condition_labels(self):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        logging.info('getting all possible condition pairs')

        items = self.expression_set_data.get('items')
        condition_replicate_name_mapping = collections.OrderedDict()
        for item in items:
            expression_ref = item['ref']
            expr_object = self.ws.get_objects2(
                {'objects': [{
                    'ref': expression_ref
                }]})['data'][0]
            expr_data = expr_object['data']
            expr_info = expr_object['info']
            expr_name = expr_info[1]
            expr_condition = expr_data['condition']
            condition_replicate_name_mapping.setdefault(
                expr_condition, []).append(expr_name)

        condition_labels = list(condition_replicate_name_mapping.keys())

        condition_label_pairs = [
            list(pair) for pair in itertools.combinations(condition_labels, 2)
        ]

        logging.info(
            'all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, condition_labels

    @staticmethod
    def _check_input_labels(condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:

            first_label = condition_pair['condition_label_1'][0].strip()
            second_label = condition_pair['condition_label_2'][0].strip()
            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(
                    first_label)
                error_msg += 'Available conditions: {}'.format(
                    available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(
                    second_label)
                error_msg += 'Available conditions: {}'.format(
                    available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked
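
    # Expected shape of condition_pairs, implied by the [0] indexing above;
    # the labels here are hypothetical:
    #   condition_pairs = [{'condition_label_1': ['wild_type'],
    #                       'condition_label_2': ['mutant']}]
    #   DESeqUtil._check_input_labels(
    #       condition_pairs, ['wild_type', 'mutant'])  # -> True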

    def _generate_condition_string(self, expression_set_data, conditionset_ref,
                                   group_factor):
        """
        _generate_condition_string: generate condition string based on conditionset factors
        """
        condition_strings = []

        condition_set_obj = self.dfu.get_objects(
            {'object_refs': [conditionset_ref]})['data'][0]
        condition_set_data = condition_set_obj['data']
        conditions = condition_set_data.get('conditions')

        factors = [
            factor.get('factor')
            for factor in condition_set_data.get('factors')
        ]
        try:
            position = factors.index(group_factor)
        except ValueError:
            error_msg = 'Group Factor {} is not available\n'.format(
                group_factor)
            error_msg += 'Available factors {}'.format(factors)
            raise ValueError(error_msg)

        for expr in expression_set_data.get('items'):
            condition_id = expr.get('label')
            try:
                condition = conditions[condition_id]
            except KeyError:
                error_msg = 'Condition ID [{}] '.format(condition_id)
                error_msg += 'is not available in ConditionSet object'
                raise ValueError(error_msg)

            condition_strings.append(condition[position])

        return ",".join(condition_strings)

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.deu = DifferentialExpressionUtils(self.callback_url,
                                               service_ver='dev')
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)
        self.scratch = config['scratch']

    def run_deseq2_app_with_condition_set(self, params):
        """
        run_deseq2_app_with_condition_set: run DESeq2 app with ConditionSet
        (https://www.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html)

        required params:
            expressionset_ref: ExpressionSet object reference
            diff_expression_obj_name: DifferentialExpressionMatrixSet object name
            workspace_name: the name of the workspace it gets saved to
            conditionset_ref: ConditionSet object reference
            group_factor: factor in conditionset used for grouping expressions

        optional params:
            run_all_combinations: run all paired condition combinations
            condition_labels: conditions for expression set object
            alpha_cutoff: q value cutoff
            fold_change_cutoff: fold change cutoff
            fold_scale_type: one of ["linear", "log2+1", "log10+1"]

        return:
            result_directory: folder path that holds all files generated by run_deseq2_app
            diff_expression_obj_ref: generated RNASeqDifferentialExpression object reference
            report_name: report name generated by KBaseReport
            report_ref: report reference generated by KBaseReport
        """
        logging.info(
            '--->\nrunning DESeqUtil.run_deseq2_app_with_condition_set\n'
            f'params:\n{json.dumps(params, indent=1)}')

        self._validate_run_deseq2_app_with_condition_set_params(params)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        os.makedirs(result_directory, exist_ok=True)

        expressionset_ref = params.get('expressionset_ref')
        expression_set_obj = self.dfu.get_objects(
            {'object_refs': [expressionset_ref]})['data'][0]
        self.expression_set_data = expression_set_obj['data']

        # run prepDE.py and save count matrix file
        condition_ids, params['genome_ref'] = self._save_count_matrix_file(
            result_directory)

        conditionset_ref = params.get('conditionset_ref')
        group_factor = params.get('group_factor')

        # overwrite condition_string with conditionset factors
        condition_string = self._generate_condition_string(
            self.expression_set_data, conditionset_ref, group_factor)

        condition_labels = list(set(condition_string.split(',')))
        condition_label_pairs = [
            list(pair) for pair in itertools.combinations(condition_labels, 2)
        ]
        if condition_label_pairs:
            params['condition_labels'] = condition_label_pairs
        else:
            error_msg = 'Only 1 condition was fetched from ConditionSet for factor {}'.format(
                group_factor)
            raise ValueError(error_msg)

        self._generate_diff_expression_csv(result_directory, condition_string,
                                           params)

        diff_expression_obj_ref = self._save_diff_expression(
            result_directory, params)

        returnVal = {
            'result_directory': result_directory,
            'diff_expression_obj_ref': diff_expression_obj_ref
        }

        report_output = self._generate_report(diff_expression_obj_ref, params,
                                              result_directory)
        returnVal.update(report_output)

        return returnVal
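
    # Hedged call sketch; every reference and name below is a placeholder:
    #   result = deseq_util.run_deseq2_app_with_condition_set({
    #       'expressionset_ref': '12345/6/1',
    #       'diff_expression_obj_name': 'my_deseq_output',
    #       'workspace_name': 'my_workspace',
    #       'conditionset_ref': '12345/7/1',
    #       'group_factor': 'treatment'})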

    def run_deseq2_app(self, params):
        """
        run_deseq2_app: run DESeq2 app
        (https://www.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html)

        required params:
            expressionset_ref: ExpressionSet object reference
            diff_expression_obj_name: DifferentialExpressionMatrixSet object name
            workspace_name: the name of the workspace it gets saved to

        optional params:
            run_all_combinations: run all paired condition combinations
            condition_labels: conditions for expression set object
            alpha_cutoff: q value cutoff
            fold_change_cutoff: fold change cutoff
            fold_scale_type: one of ["linear", "log2+1", "log10+1"]

        return:
            result_directory: folder path that holds all files generated by run_deseq2_app
            diff_expression_obj_ref: generated RNASeqDifferentialExpression object reference
            report_name: report name generated by KBaseReport
            report_ref: report reference generated by KBaseReport
        """

        if params.get('conditionset_ref'):
            return self.run_deseq2_app_with_condition_set(params)

        logging.info('--->\nrunning DESeqUtil.run_deseq2_app\n' +
                     f'params:\n{json.dumps(params, indent=1)}')

        self._validate_run_deseq2_app_params(params)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        os.makedirs(result_directory, exist_ok=True)

        expressionset_ref = params.get('expressionset_ref')
        expression_set_obj = self.ws.get_objects2(
            {'objects': [{
                'ref': expressionset_ref
            }]})['data'][0]
        self.expression_set_data = expression_set_obj['data']

        available_condition_label_pairs, available_condition_labels = \
            self._get_condition_labels()

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            self._check_input_labels(condition_pairs,
                                     available_condition_labels)
            condition_label_pairs = []
            for condition_pair in condition_pairs:
                condition_labels = [
                    condition_pair.get('condition_label_1')[0].strip(),
                    condition_pair.get('condition_label_2')[0].strip()
                ]
                condition_label_pairs.append(condition_labels)

        params['condition_labels'] = condition_label_pairs

        # run prepDE.py and save count matrix file
        condition_string, params['genome_ref'] = self._save_count_matrix_file(
            result_directory)

        self._generate_diff_expression_csv(result_directory, condition_string,
                                           params)

        diff_expression_obj_ref = self._save_diff_expression(
            result_directory, params)

        returnVal = {
            'result_directory': result_directory,
            'diff_expression_obj_ref': diff_expression_obj_ref
        }

        report_output = self._generate_report(diff_expression_obj_ref, params,
                                              result_directory)
        returnVal.update(report_output)

        return returnVal
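
# Hedged usage sketch for DESeqUtil.run_deseq2_app. The config values and
# object references below are placeholders, not real KBase endpoints/objects.
config = {
    'workspace-url': 'https://.../ws',
    'SDK_CALLBACK_URL': 'https://.../callback',
    'KB_AUTH_TOKEN': 'TOKEN',
    'shock-url': 'https://.../shock',
    'scratch': '/kb/module/work/tmp',
}
deseq_util = DESeqUtil(config)
result = deseq_util.run_deseq2_app({
    'expressionset_ref': '12345/6/1',   # hypothetical ExpressionSet ref
    'diff_expression_obj_name': 'my_deseq_output',
    'workspace_name': 'my_workspace',
    'run_all_combinations': 1,          # or supply 'condition_pairs'
})
print(result['diff_expression_obj_ref'], result['report_name'])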
Example #25
class AMAUtils():
    def __init__(self, ws_url, cb_url, token, scratch):
        self.ws = Workspace(ws_url, token=token)
        self.cb_url = cb_url
        self.token = token
        self.scratch = scratch

    def _confirm_ws_type(self, ref):
        """confirm whether 'ref' is of type 'KBaseMetagenomes.AnnotatedMetagenomeAssembly
        if not, throw error. """
        if ref is None:
            raise ValueError(" 'ref' argument must be specified.")
        obj_info = self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
        # check object type is 'KBaseMetagenomes.AnnotatedMetagenomeAssembly'
        obj_type = obj_info[2]
        if 'KBaseMetagenomes.AnnotatedMetagenomeAssembly' not in obj_type:
            raise ValueError(
                f"input ref '{ref}' is of type {obj_type}. function "
                "'get_annotated_metagenome_assembly' requires objects"
                " of type KBaseMetagenomes.AnnotatedMetagenomeAssembly")

    def get_annotated_metagenome_assembly(self, params):
        """
        params:
            ref - workspace reference
            included_fields - optional list of fields to include; if omitted,
                              the whole object is returned
        output:
            genomes - the returned data fields from the workspace request

        """
        ref = params.get('ref', None)
        included_fields = params.get('included_fields', None)
        self._confirm_ws_type(ref)

        get_obj_params = {'ref': ref}
        if included_fields is not None:
            get_obj_params['included'] = included_fields

        data = self.ws.get_objects2({'objects': [get_obj_params]})['data']

        return {'genomes': data}

    def get_annotated_metagenome_assembly_features(self, params):
        """
        params:
            ref - workspace reference for a KBaseMetagenomes.AnnotatedMetagenomeAssembly object
            feature_type - optional; keep only features of this type
            only_ids - optional; reduce each feature to its 'id' field
        output:
            features - list of feature dicts
        """
        ref = params['ref']
        self._confirm_ws_type(ref)
        ret = self.ws.get_objects2(
            {"objects": [{
                "ref": ref,
                "included": ["features_handle_ref"]
            }]})['data']
        features_handle_ref = ret[0]['data']['features_handle_ref']
        dfu = DataFileUtil(self.cb_url, token=self.token)
        file_name = 'features.json.gz'
        file_path = os.path.join(self.scratch, file_name)
        shock_ret = dfu.shock_to_file({
            'handle_id': features_handle_ref,
            'file_path': file_path,
            'unpack': "uncompress"
        })
        file_path = shock_ret['file_path']

        with open(file_path) as fd:
            json_features = json.load(fd)

        if params.get('feature_type'):
            accepted_feature_types = [
                "cds", "gene", "mrna", "trna", "rrna", "repeat_region"
            ]
            feat_type = params['feature_type']
            if feat_type.lower() not in accepted_feature_types:
                raise ValueError(
                    f"{feat_type} is not an accepted feature type; accepted feature"
                    f" types (in lower case) are {accepted_feature_types}")
            json_features = [
                feature for feature in json_features
                if feature['type'].lower() == feat_type.lower()
            ]

        if params.get('only_ids'):
            json_features = [{
                'id': feature['id']
            } for feature in json_features]

        return {'features': json_features}
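
# Hedged usage sketch for AMAUtils; the URLs, token, and object reference
# are placeholders.
ama = AMAUtils(ws_url='https://.../ws', cb_url='https://.../callback',
               token='TOKEN', scratch='/kb/module/work/tmp')
features = ama.get_annotated_metagenome_assembly_features({
    'ref': '12345/8/1',      # hypothetical AnnotatedMetagenomeAssembly ref
    'feature_type': 'gene',  # optional: keep only one feature type
    'only_ids': 1,           # optional: reduce each feature to its id
})['features']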
Example #26
    def export_genome_as_gff(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_as_gff
        if 'input_ref' not in params:
            raise ValueError('Cannot run export_genome_as_gff - no '
                             '"input_ref" field defined.')

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.cfg.workspaceURL)
        info = ws.get_objects2({
            'objects': [{
                'ref':
                params['input_ref'],
                'included':
                ['/assembly_ref', '/contigset_ref', '/id', '/gff_handle_ref']
            }]
        })['data'][0]['data']

        # export to file (building from KBase Genome Object)
        result = self.genome_to_gff(ctx,
                                    {'genome_ref': params['input_ref']})[0]

        # get assembly
        if 'assembly_ref' in info:
            assembly_ref = info['assembly_ref']
        else:
            assembly_ref = info['contigset_ref']
        print('Assembly reference = ' + assembly_ref)
        print('Downloading assembly')
        au = AssemblyUtil(self.cfg.callbackURL)
        assembly_file_path = au.get_assembly_as_fasta(
            {'ref': params['input_ref'] + ";" + assembly_ref})['path']

        # create the output directory and move the files there
        export_package_dir = os.path.join(self.cfg.sharedFolder, info['id'])
        os.makedirs(export_package_dir)
        shutil.move(
            result['file_path'],
            os.path.join(
                export_package_dir,
                'KBase_derived_' + os.path.basename(result['file_path'])))
        shutil.move(
            assembly_file_path,
            os.path.join(export_package_dir,
                         os.path.basename(assembly_file_path)))

        # add cached genome if appropriate
        exporter = GenomeToGFF(self.cfg)
        cached = exporter.get_gff_handle(info, export_package_dir)

        # package it up
        dfUtil = DataFileUtil(self.cfg.callbackURL)
        package_details = dfUtil.package_for_download({
            'file_path':
            export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {'shock_id': package_details['shock_id']}
        #END export_genome_as_gff

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_genome_as_gff return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #27
class AMPLICON:
    '''
    Module Name:
    AMPLICON

    Module Description:
    A KBase module: AMPLICON
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = ""
    GIT_COMMIT_HASH = ""

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']

        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)

        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def run_AMPLICON(self, ctx, params):

        # ctx is the context object
        # return variables are: output
        #BEGIN run_AMPLICON
        print('Starting AMPLICON function and validating parameters.')
        if not params.get('workspace_name'):
            print('Parameters provided were', str(params))
            raise TypeError('Must pass a non-empty `workspace_name` arg.')
        if not params.get('ref'):
            print('Parameters provided were', str(params))
            raise TypeError('Must pass a non-empty `ref` arg.')

        ws_name = params['workspace_name']
        # get the amplicon data
        obj = self.ws_client.get_objects2(
            {'objects': [{
                'ref': params['ref']
            }]})['data'][0]['data']

        # define file names
        parse_out_file = os.path.join('work/tmp', 'parse_out.tsv')

        input_file = parse_out_file
        output_file = os.path.join('work/tmp', 'output.tsv')

        # 1. convert data into tsv format
        parse_input_data(obj, parse_out_file)

        # 2. run subprocess FAPROTAX
        run_program(input_file, output_file)

        # 3. create html tables using output_file
        output = create_report(self.callback_url, self.shared_folder, ws_name,
                               output_file)

        #END run_AMPLICON

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_AMPLICON return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Example #28
def load_fastas(config, scratch, upa):
    '''
    Fetch FASTA file(s) for the object referenced by `upa`, handling genomes,
    genome sets, assemblies, assembly sets, and binned contigs.
    Returns a dict mapping a file-safe UPA to its assembly file info.
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    mgu = MetagenomeUtils(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    upa = str(obj_data['info'][6]) + '/' + str(
        obj_data['info'][0]) + '/' + str(obj_data['info'][4])
    obj_type = obj_data['info'][2]

    id_to_assy_info = {}
    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        # file_output = os.path.join(scratch, "input_fasta.fa")
        faf = au.get_assembly_as_fasta({
            "ref": upa,
            'filename': upa_to_path(scratch, upa)
        })
        return {file_safe_upa(upa): faf}
    elif "KBaseSets.AssemblySet" in obj_type:
        for item_upa in obj_data['data']['items']:
            faf = au.get_assembly_as_fasta({
                "ref":
                upa + ';' + item_upa['ref'],
                'filename':
                upa_to_path(scratch, item_upa['ref'])
            })
            id_to_assy_info[file_safe_upa(item_upa['ref'])] = faf
        return id_to_assy_info
    elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
        return handle_binned_contigs(upa, mgu, scratch)

    for genome_upa in upas:
        # this could be sped up by batching the get_objects call
        # does assy file util not take bulk calls?
        # maybe doesn't matter since Shock doesn't handle bulk calls
        if upa != genome_upa:  # for single genomes, upa and genome_upa will be the same
            genome_upa = upa + ';' + genome_upa
        genome_data = ws.get_objects2({'objects': [{
            "ref": genome_upa
        }]})['data'][0]['data']
        target_upa = genome_data.get('contigset_ref') or genome_data.get(
            'assembly_ref')
        assembly_upa = genome_upa + ';' + target_upa
        faf = au.get_assembly_as_fasta({
            'ref':
            assembly_upa,
            'filename':
            upa_to_path(scratch, target_upa)
        })
        id_to_assy_info[file_safe_upa(target_upa)] = faf

    return id_to_assy_info
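
# Hedged call sketch for load_fastas; the config keys match those read above,
# and the URLs and UPA are placeholders.
config = {
    'callback_url': 'https://.../callback',
    'workspace-url': 'https://.../ws',
}
id_to_assy_info = load_fastas(config, scratch='/kb/module/work/tmp',
                              upa='12345/9/1')
for safe_upa, faf in id_to_assy_info.items():
    print(safe_upa, faf['path'])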
Example #29
class VariationAnnotation:
    '''
    Module Name:
    VariationAnnotation

    Module Description:
    A KBase module: VariationAnnotation
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/man4ish/VariationAnnotation.git"
    GIT_COMMIT_HASH = "233ab11cd942b99c960f7b83aaee2b3800685bb4"

    #BEGIN_CLASS_HEADER
    def build_genome_index(self, genome_ref):
        # Downloads the GFF and FASTA, puts them in the right directory,
        # and returns the genome index name that snpEff.jar can use.
        # TODO: Read the genome taxonomy from genome_ref, or get the
        # taxonomy/classification from the user, so there is no confusion.
        pass
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.VU = VariationUtil(self.callback_url)
        self.SU = SnpEffUtils()
        self.DU = DownloadUtils()
        self.HU = htmlreportutils()
        self.config = config
        #self.snpeff=<path_to_snpeff>
        #END_CONSTRUCTOR
        pass

    def annotate_variants(self, ctx, params):
        """
        This method extracts a VCF from a variation object,
        runs the SnpEff workflow (http://snpeff.sourceforge.net/SnpEff_manual.html),
        and annotates and predicts the effects of genetic variants
        (such as amino acid changes)
        :param params: instance of type "input_params" (variation_ref:
           Reference to Variation object out_variation_name: Name by which
           the output object will be saved) -> structure: parameter
           "variation_ref" of String, parameter "out_variation_name" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_variants
        # Validate the parameters
        # Extract vcf from variation using VariationUtil
        #    output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        #    os.mkdir(output_dir)
        #    #filename = os.path.join(output_dir, "variation.vcf.gz")

        #    print(filename)
        #    vcf_path = self.VU.get_variation_as_vcf({
        #        'variation_ref': params['variation_ref'],
        #        'filename':filename
        #    })
        # TODO: The current VCF path is hard-coded for testing and needs to be removed.

        self.SU.validate_params(params)
        vcf_path = "/kb/module/work/variation.vcf.gz"
        print(vcf_path)

        # TODO: Need to think through how to get this from the user,
        # because variation_ref may or may not have a genome_ref field;
        # our spec.json may require some work. There is a chance the user
        # provides the wrong genome as input if we don't handle this properly.
        # params['genome_ref']
        # Download gff and assembly based on genome_ref
        #gff_path = .....
        #assembly_path ...

        workspace = params['workspace_name']
        self.ws_url = self.config['workspace-url']
        self.ws = Workspace(self.ws_url, token=ctx['token'])

        # TODO: The current file name is hard-coded and needs to be changed later.
        filename = "/kb/module/work/variation.vcf"
        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(output_dir)
        
        shutil.copytree("/kb/module/deps/snp_eff", output_dir + "/snp_eff")

        variation_ref = params['variation_ref']
        variation_obj = self.ws.get_objects2(
            {'objects': [{'ref': variation_ref}]})['data'][0]

        data = self.ws.get_objects2(
            {'objects': [{'ref': variation_ref,
                          'included': ['/sample_set_ref']}]})['data'][0]['data']
        sample_set_ref = data['sample_set_ref']

        assembly_ref = variation_obj['data']['assembly_ref']
        assembly_path = self.DU.get_assembly(assembly_ref, output_dir)

        gff_ref = params['genome_ref']
        gff_path = self.DU.get_gff(gff_ref, output_dir)
       
        # TODO: Temporary fix; need a principled way to remove exons based on coordinates.
        fix_cmd = 'grep -v "exon" ' + gff_path + ' > /kb/module/work/tmp/output.gff'
        print(fix_cmd)
        os.system(fix_cmd)
        #os.system("cp /kb/module/work/tmp/output.gff " + os.path.join(output_dir, "/snp_eff/data/kbase_v1/genes.gff")) 
        #shutil.copyfile("/kb/module/work/tmp/output.gff", output_dir + "/snp_eff/data/kbase_v1/genes.gff")
        
        vcf_path = self.VU.get_variation_as_vcf({
                'variation_ref': params['variation_ref'],
                'filename': filename
            })

        new_gff_path = "/kb/module/work/tmp/output.gff"

        genome_index_name = self.SU.build_genome(new_gff_path, assembly_path, output_dir)
        annotated_vcf_path = self.SU.annotate_variants(genome_index_name, vcf_path['path'], params, output_dir)
        '''
        params['vcf_staging_file_path'] = annotated_vcf_path
        params['variation_object_name'] = params['output_object_name']
        params['genome_or_assembly_ref'] = params['genome_ref']
        '''
        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['genome_ref'],
            'sample_set_ref': sample_set_ref,
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': annotated_vcf_path,
            'variation_object_name': params['output_object_name']
        }
       
        annotated_variation_ref = self.VU.save_variation_from_vcf(
            save_variation_params)['variation_ref']

        created_objects = []
        created_objects.append({
            # reference the newly saved object, not the input variation_ref
            "ref": annotated_variation_ref,
            "description": "Variation Object"
        })

        #self.VU.   #upload file to shock
        # TODO: Add parameters for snpeff in parameters
        # Parse the snpeff parameters from params and build snpeff command
        # TODO: We are hardcoding this for now
        
        print("\n\n\n")
        print("$$$$$$$$" + output_dir + "$$$$$$$$$")
        arr = os.listdir(output_dir + "/snp_eff")
        for files in arr:
            print("########" + files + "###########")
        print("\n\n\n")
        
        #os.rename(os.path.join(output_dir, "snp_eff/snpEff_summary.html"), os.path.join(output_dir, "snp_eff/index.html"))
        snp_eff_resultdir = os.path.join(output_dir, "snp_eff_results")
        os.mkdir(snp_eff_resultdir)
        #shutil.copyfile(os.path.join(output_dir, "snp_eff/index.html"), os.path.join(snp_eff_resultdir, "index.html"))
        shutil.copyfile(os.path.join(output_dir, "snp_eff/snpEff_genes.txt"), os.path.join(snp_eff_resultdir, "snpEff_genes.txt"))

        #report_dirpath = os.path.join(output_dir, "snp_eff")

        logging.info("creating html report ...")
        output = self.HU.create_html_report(self.callback_url, snp_eff_resultdir, workspace)
        # output = self.HU.create_html_report(self.callback_url, snp_eff_resultdir, workspace, created_objects)

        '''
        report = KBaseReport(self.callback_url)
        output = {
            "x":vcf_path
        }
        '''
        #END annotate_variants

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_variants return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
Example #30
def process_kbase_objects(host_ref, virus_ref, shared_folder, callback,
                          workspace, token):
    """
    Convert KBase object(s) into usable files for VirMatcher
    :param host_ref: Putative host / microbial genomes, as a KBase '#/#/#' object reference
    :param virus_ref: Viral genomes, as a KBase '#/#/#' object reference
    :param shared_folder: KBase job node's "working" directory, where actual files exist
    :param callback:
    :param workspace: Workspace service URL (passed directly to the Workspace client)
    :param token: Job token
    :return:
    """

    dfu = DataFileUtil(callback, token=token)

    ws = Workspace(workspace, token=token)

    mgu = MetagenomeUtils(callback, token=token)

    au = AssemblyUtil(callback, token=token)

    # Need to determine KBase type in order to know how to properly proceed
    host_type = ws.get_object_info3({'objects': [{
        'ref': host_ref
    }]})['infos'][0][2].split('-')[0]
    virus_type = ws.get_object_info3({'objects': [{
        'ref': virus_ref
    }]})['infos'][0][2].split('-')[0]

    logging.info(f'Potential hosts identified as: {host_type}')
    logging.info(f'Viruses identified as: {virus_type}')

    # Create new directory to house virus and host files
    host_dir = Path(shared_folder) / 'host_files'
    if not host_dir.exists():
        os.mkdir(host_dir)

    host_count = 0

    if host_type == 'KBaseGenomeAnnotations.Assembly':  # No info about individual genomes, so treat each as organism
        # get_assembly_as_fasta returns a dict with 'path' and 'assembly_name'
        host_fps = au.get_assembly_as_fasta({'ref': host_ref})['path']

        logging.info(
            f'Identified {host_type}. Each sequence will be treated as a separate organism.'
        )

        records = SeqIO.parse(host_fps, 'fasta')

        for record in records:
            host_count += 1
            tmp_fp = host_dir / f'{record.id}.fasta'  # TODO Illegal filenames?
            SeqIO.write([record], tmp_fp, 'fasta')

    elif host_type == 'KBaseGenomes.Genomes':  # TODO Genomes?!
        genome_data = ws.get_objects2({'objects': [{
            'ref': host_ref
        }]})['data'][0]['data']
        genome_data.get('contigset_ref') or genome_data.get('assembly_ref')

    # elif host_type == 'KBaseSets.GenomeSet'

    elif host_type == 'KBaseSets.AssemblySet':
        obj_data = dfu.get_objects({'object_refs': [host_ref]})['data'][0]

        for subobj in obj_data['data']['items']:
            host_fp = au.get_assembly_as_fasta({'ref': subobj['ref']})['path']

            if os.path.splitext(host_fp)[-1] != '.fasta':
                # ensure the extension is always .fasta
                # (splitext keeps the leading dot, so compare against '.fasta')
                target_fn = os.path.splitext(
                    os.path.basename(host_fp))[0].strip('_') + '.fasta'
            else:
                target_fn = os.path.basename(host_fp).strip('_')

            shutil.copyfile(host_fp, host_dir / target_fn)
            host_count += 1

    elif host_type == 'KBaseMetagenomes.BinnedContigs':  # This is what we want!
        host_kbase_dir = mgu.binned_contigs_to_file({
            'input_ref': host_ref,
            'save_to_shock': 0
        })['bin_file_directory']  # Dict of bin_file_dir and shock_id

        for (dirpath, dirnames, fns) in os.walk(
                host_kbase_dir):  # Dirnames = all folders under dirpath
            for fn in fns:
                if os.path.splitext(fn)[-1] != '.fasta':
                    fn = os.path.splitext(fn)[0] + '.fasta'
                fp = Path(dirpath) / fn
                shutil.copy(fp, host_dir)
                host_count += 1

    else:
        raise ValueError(f'{host_type} is not supported.')

    logging.info(f'{host_count} potential host genomes were identified.')

    virus_count = 0

    if virus_type == 'KBaseGenomeAnnotations.Assembly':
        virus_fps = au.get_assembly_as_fasta({'ref': virus_ref})['path']

        records = SeqIO.parse(virus_fps, 'fasta')
        virus_count = len(list(records))

        # for record in records:
        #     virus_count += 1
        # tmp_fp = virus_dir / f'{record.id}.fasta'
        # SeqIO.write([record], tmp_fp, 'fasta')

    else:
        raise ValueError(f'{virus_type} is not supported.')

    logging.info(f'{virus_count} potential viral genomes were identified.')

    # TODO Do we even need any of this data? We don't care about what the sequences are called

    # host_data = dfu.get_objects({'object_refs': [host_ref]})['data'][0]
    # virus_data = dfu.get_objects({'object_refs': [virus_ref]})['data'][0]

    return host_dir, virus_fps
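
# Hedged call sketch for process_kbase_objects; every argument is a
# placeholder for values normally supplied by the KBase job runner.
host_dir, virus_fps = process_kbase_objects(
    host_ref='12345/10/1',               # hypothetical BinnedContigs ref
    virus_ref='12345/11/1',              # hypothetical Assembly ref
    shared_folder='/kb/module/work/tmp',
    callback='https://.../callback',
    workspace='https://.../ws',          # passed straight to Workspace()
    token='TOKEN',
)
print(host_dir)    # directory of per-genome host FASTA files
print(virus_fps)   # path to the viral FASTA file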