def create_variation_report(self, params):
        '''
        Create a simple HTML table report showing the number of
        genotypes and the number of variants in a Variation object.
        :param params: dict with key 'variation_ref' (Variation object reference)
        :return: path to the directory containing the generated index.html
        '''
        ws = Workspace(self.ws_url)

        subset = ws.get_object_subset([{
            'included': ['/numgenotypes', '/numvariants'],
            'ref': params['variation_ref']
        }])

        numgenotypes = subset[0]['data']['numgenotypes']
        numvariants = subset[0]['data']['numvariants']

        variation_table = """
        <table>
           <thead>
               <tr>
                   <td>Number of strains/genotypes</td>
                   <td> ##numgenotypes##</td>
               </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Number of variants</td>
                    <td>##numvariants##</td>
                </tr>
            </tbody>
        </table>
        """
        variation_table = variation_table.replace("##numgenotypes##",
                                                  str(numgenotypes))
        variation_table = variation_table.replace("##numvariants##",
                                                  str(numvariants))

        session = str(uuid.uuid4())
        htmlreport_dir = os.path.join(self.scratch, session)
        os.mkdir(htmlreport_dir)
        index_html_path = os.path.join(htmlreport_dir, "index.html")
        with open(index_html_path, "w") as f:
            f.write(variation_table)
        return htmlreport_dir
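
# A minimal usage sketch for the report helper above, mirroring how it is
# invoked from VariationUtil.save_variation_from_vcf below (the workspace URL,
# scratch path, and object ref are hypothetical):
#
#   vr = VariationReport({"ws_url": "https://kbase.us/services/ws",
#                         "scratch": "/tmp/scratch"})
#   html_dir = vr.create_variation_report({"variation_ref": "1/2/3"})
#   # html_dir now holds index.html with the populated two-row table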
# Example 2
class GFFUtils2:
    def __init__(self, config):
        self.callback_url = config['callback_url']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']

        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)

    def _prep_gff(self, gff_file):
        # Sort the GFF (header lines first, then by contig and start
        # coordinate) so it can be bgzip-compressed and tabix-indexed.
        outfile = os.path.join(self.genome_dir, 'out.gff')
        sortcmd = f'(grep ^"#" {gff_file}; grep -v ^"#" {gff_file} | sort -k1,1 -k4,4n)'

        with open(outfile, 'w') as o:
            p = subprocess.Popen(sortcmd, shell=True, stdout=o)
            p.communicate()

        bgzip = subprocess.Popen(['bgzip', 'out.gff'], cwd=self.genome_dir)
        bgzip.communicate()

        outfile += '.gz'

        return outfile
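
    # The pipeline above is equivalent to these shell commands (bgzip is from
    # htslib; file names illustrative):
    #   (grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) > out.gff
    #   bgzip out.gff   # yields out.gff.gz, ready for tabix indexing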

    def _construct_gff_from_json(self, json, gff_file_path, contig_base_lengths):
        with open(gff_file_path, 'w') as f:
            for feature in json:
                if feature['feature_type'].strip().upper() == 'GENE':
                    end = int(feature['location'][0]['start'])+int(feature['location'][0]['length'])

                    metainfo = "ID="+feature['feature_id']

                    if feature['function']:
                        metainfo += ';FUNCTION='+feature['function']

                    contig_id = str(feature['location'][0]['contig_id'])
                    start = int(feature['location'][0]['start'])

                    # TODO: Fix Plink reassignment of Chr prefixes
                    try:
                        global_pos = int(contig_base_lengths[contig_id]) + start
                    except KeyError:
                        try:
                            global_pos = int(contig_base_lengths[contig_id.capitalize()]) + start
                        except KeyError:
                            try:
                                global_pos = int(contig_base_lengths['Chr'+str(contig_id)]) + start
                            except KeyError:
                                try:
                                    global_pos = int(contig_base_lengths['Chr0'+str(contig_id)]) + start
                                except KeyError as e:
                                    pp(contig_base_lengths)
                                    pp(contig_id)
                                    raise KeyError(e)

                    """
                    Remove ontology for now
                    if feature['ontology_terms']:
                        metainfo += ';ONTOLOGY('

                        for k, v in feature['ontology_terms'].items():
                            metainfo += str(k) + ',' + str(v) + ':'

                        metainfo = metainfo[:-1]  # remove trailing ;
                        metainfo += ')'
                    """

                    constructed_gff_line = str(feature['location'][0]['contig_id']) + '\t' + \
                                           'KBase\tgene\t' + \
                                           str(feature['location'][0]['start']) + '\t' + \
                                           str(end) + '\t.\t' + \
                                           str(feature['location'][0]['strand']) + '\t' + \
                                           str(global_pos) + '\t' + \
                                           str(metainfo) + '\n'
                    f.write(constructed_gff_line)
        if os.path.exists(gff_file_path):
            return gff_file_path
        else:
            raise FileNotFoundError('Unable to create GFF file from genome JSON.')

    def _process_tabix_results(self, queryresult):
        # Column 9 of a GFF line holds attributes such as
        # "ID=<gene>;FUNCTION=<description>"; strip the key prefixes.
        queryinfo = queryresult[8].split(';')
        if len(queryinfo) >= 2:
            extension = [clean_tsv_data(queryinfo[0][3:]), "NA", clean_tsv_data(queryinfo[1][9:])]
        elif len(queryinfo) == 1:
            extension = [clean_tsv_data(queryinfo[0][3:]), "NA", "NA"]
        else:
            extension = ['NA', 'NA', 'NA']
        return extension
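
    # Illustrative parse (hypothetical GFF line): for a column-9 value of
    # "ID=AT1G01010;FUNCTION=NAC domain containing protein 1", queryinfo[0][3:]
    # yields "AT1G01010" and queryinfo[1][9:] yields the function text, so
    # extension == ["AT1G01010", "NA", "NAC domain containing protein 1"].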

    def find_gene_info(self, row):
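        # Query the tabix-indexed GFF at the SNP position; fall back to
        # "chr"/"chr0"-prefixed contig names, and finally scan a +/-500 bp
        # neighborhood when no overlapping gene is found.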
        tb = tabix_query(self.sorted_gff, row["CHR"], int(row["POS"]), int(row["POS"]))
        tbresult = next(tb, None)
        if tbresult is None:
            tb2 = tabix_query(self.sorted_gff, 'chr' + row["CHR"], int(row["POS"]), int(row["POS"]))
            tbresult2 = next(tb2, None)
            if tbresult2 is None:
                tb3 = tabix_query(self.sorted_gff, 'chr0' + row["CHR"], int(row["POS"]), int(row["POS"]))
                tbresult3 = next(tb3, None)
                if tbresult3 is None:
                    if int(row["POS"]) < 500:
                        nstart = 0
                    else:
                        nstart = int(row["POS"]) - 500

                    neigh_tb = tabix_query(self.sorted_gff, row["CHR"], nstart, int(row["POS"]) + 500)
                    neigh_result = next(neigh_tb, None)

                    if neigh_result is None:
                        return pd.Series(['NA', 'NA', 'NA'], index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
                    else:
                        nq = self._process_tabix_results(neigh_result)
                        return pd.Series([nq[1], nq[0], nq[2]], index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
                else:
                    q3 = self._process_tabix_results(tbresult3)
                    return pd.Series(q3, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
            else:
                q2 = self._process_tabix_results(tbresult2)
                return pd.Series(q2, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
        else:
            q = self._process_tabix_results(tbresult)
            return pd.Series(q, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])

    def get_gwas_result_file(self, association_ref, association_name, p_value):
        association_obj = self.dfu.get_objects({'object_refs': [association_ref]})['data'][0]
        association_results = association_obj['data']["association_details"][0]["association_results"]
        result = "CHR\tSNP\tPOS\tP\tBP\n"
        for variation in association_results:
            if (float(variation[3]) > float(p_value)):
                continue
            result += str(variation[0]) + "\t" 
            result +=  str(variation[1]) + "\t" 
            result +=  str(variation[2]) + "\t" 
            result +=   str(variation[3]) + "\t"
            result +=   str(variation[2]) + "\n"
        filepath = os.path.join(self.genome_dir, association_name)
        with open(filepath, "w") as file1: 
            file1.write(result) 
        return (filepath)
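
    # Illustrative content of the file written above (tab-separated; values
    # hypothetical). BP duplicates POS, as expected downstream:
    #   CHR   SNP         POS     P         BP
    #   1     rs1234567   123456  1.2e-08   123456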

    def build_featureset(self, filepath, genome_ref, description, workspace_name, association_name, prefix):
        element_ordering = list()
        elements = dict()
        skip_words = ["GENEID", "NEIGHBORGENE", "NA"]
        with open(filepath, 'r') as reader:
            for line in reader:
                fields = line.split("\t")
                condition1 = fields[5] not in skip_words
                condition2 = fields[5] not in elements
                condition3 = fields[6] not in skip_words
                condition4 = fields[6] not in elements
                if condition1 and condition2:
                    element_ordering.append(fields[5])
                    elements[fields[5]] = [genome_ref]
                if condition3 and condition4:
                    element_ordering.append(fields[6])
                    elements[fields[6]] = [genome_ref]
        featureset = dict()
        featureset['description'] = description
        featureset['element_ordering'] = element_ordering
        featureset['elements'] = elements
        ws_id = self.dfu.ws_name_to_id(workspace_name)
        featureset_obj_name = prefix + str(association_name)

        save_info = self.dfu.save_objects({'id': ws_id,
                                           'objects': [{'type': 'KBaseCollections.FeatureSet',
                                                        'data': featureset,
                                                        'name': featureset_obj_name}]})[0]
        obj_ref = "{0}/{1}/{2}".format(save_info[6], save_info[0], save_info[4])
        return obj_ref
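
    # Illustrative FeatureSet payload saved above (refs and ids hypothetical):
    #   {"description": "Genelist for GWAS results of trait X",
    #    "element_ordering": ["AT1G01010", "AT1G01020"],
    #    "elements": {"AT1G01010": ["1/2/3"], "AT1G01020": ["1/2/3"]}}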


   
    def annotate_GWAS_results(self, genome_ref, association_ref, workspace_name, prefix, p_value):

        # TODO: Send outfile to the _prep_gff function instead of hardcoding
        # TODO: Remove hardcoded paths and create a new directory for each test function
        self.genome_dir_name = "_".join(genome_ref.split("/"))
        self.genome_dir = os.path.join(self.shared_folder, self.genome_dir_name)
        if not os.path.isdir(self.genome_dir):
            os.mkdir(self.genome_dir)
        sorted_gff_path = os.path.join(self.genome_dir, 'out.gff.gz')
        self.sorted_gff = sorted_gff_path

        if not os.path.exists(sorted_gff_path):
            feature_num = self.gsu.search({'ref': genome_ref})['num_found']
            # get genome features for gff construction
            genome_features = self.gsu.search({
                'ref': genome_ref,
                'limit': feature_num,
                #'sort_by': [['feature_id', True]]
            })['features']

            assembly_ref = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])[0]['data']['assembly_ref']

            # get assembly contigs for base length calculations
            assembly_contigs = self.wsc.get_object_subset([{
                'included': ['/contigs'],
                'ref': assembly_ref
            }])[0]['data']['contigs']

            contig_ids = list(assembly_contigs.keys())
            contig_ids.sort()

            contig_base_lengths = {}
            prev_length = 0

            for contig in contig_ids:
                contig_base_lengths[contig] = prev_length
                prev_length += assembly_contigs[contig]['length']

            gff_file = os.path.join(self.genome_dir, 'constructed.gff')
            constructed_gff = self._construct_gff_from_json(genome_features, gff_file, contig_base_lengths)
            self.sorted_gff = self._prep_gff(constructed_gff)
            tabix_index(self.sorted_gff)

        obj_info = self.wsc.get_object_info3({"objects": [{"ref": association_ref}]})
        association_name = obj_info["infos"][0][1]

        gwas_results_file = self.get_gwas_result_file(association_ref, association_name, p_value)

        gwas_results = pd.read_csv(gwas_results_file, sep='\t')

        gwas_results[['GENEID', 'NEIGHBORGENE', 'FUNCTION']] = \
           gwas_results.apply(self.find_gene_info, axis=1)

        new_results_path = os.path.abspath(os.path.join(gwas_results_file, '..'))
        fname = 'final_' + association_name
        new_results_path = os.path.join(new_results_path, fname)
        gwas_results.to_csv(path_or_buf=new_results_path, sep='\t', index=False)
        description = "Genelist for GWAS results of trait " + association_name

        featureset_obj = self.build_featureset(new_results_path, genome_ref, description,
                                               workspace_name, association_name, prefix)

        return featureset_obj
# Example 3
class JbrowseUtil:
    def __init__(self, config):
        callback_url = os.environ['SDK_CALLBACK_URL']
        ws_url = config['ws_url']
        self.wsc = Workspace(ws_url)
        self.dfu = DataFileUtil(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        # service-wizard url
        self.sw_url = config['sw_url']
        self.shock_url = config['shock_url']
        scratch = config['scratch']
        session = str(uuid.uuid4())
        self.session_dir = os.path.join(scratch, session)
        os.mkdir(self.session_dir)

    def get_variation_service_url(self, sw_url):
        '''
        get the most recent VariationFileServ url from the service wizard.
        sw_url: service wizard url
        '''
        # TODO Fix the following dev thing to beta or release or future
        json_obj = {
            "method": "ServiceWizard.get_service_status",
            "id": "",
            "params": [{"module_name": "VariationFileServ", "version": "dev"}]
        }
        sw_resp = requests.post(url=sw_url, data=json.dumps(json_obj))
        vfs_resp = sw_resp.json()
        # the file service expects a bare host, so strip the scheme
        shock_host = self.shock_url.replace("https://", "")
        vfs_url = vfs_resp['result'][0]['url'] + "/jbrowse_query/" + shock_host + "/node"
        return vfs_url
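
    # Illustrative result (hypothetical hosts): if the service wizard returns
    # "https://kbase.us/dynserv/xyz.VariationFileServ" and shock_url is
    # "https://kbase.us/services/shock-api", tracks are served from:
    #   https://kbase.us/dynserv/xyz.VariationFileServ/jbrowse_query/kbase.us/services/shock-api/node/<shock_id>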

    def _run_cmd(self, cmd):
        try:
            process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
            stdout, stderr = process.communicate()
            if stdout:
                logging.info("ret> %s", process.returncode)
                logging.info("OK> output %s", stdout)
            if stderr:
                logging.info("ret> %s", process.returncode)
                logging.info("Error> error %s", stderr.strip())

        except OSError as e:
            logging.info("OSError > %s", e.errno)
            logging.info("OSError > %s", e.strerror)
            logging.info("OSError > %s", e.filename)

    def create_refseqs_data_from_assembly(self, assembly_ref):
        '''
        Build the JBrowse refSeqs entries (name, length, chunk size)
        from an assembly's contig information.
        :param assembly_ref: KBase assembly object reference
        :return: list of refseq dicts for refSeqs.json
        '''
        refseqs_data = []
        # 1) Download assembly contig info and parse contig length information
        data = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': assembly_ref
        }])[0]['data']
        for key in data['contigs']:
            refseqs_data.append(
                {"end": data['contigs'][key]["length"],
                 "length": data['contigs'][key]["length"],
                 "name": data['contigs'][key]["contig_id"],
                 "seqChunkSize": 20000,
                 "start": 0
                 }
            )
        return refseqs_data
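
    # Illustrative refSeqs entry for a 1.2 Mb contig (values hypothetical):
    #   {"name": "Chr01", "start": 0, "end": 1200000,
    #    "length": 1200000, "seqChunkSize": 20000}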


    def prepare_genome_features_track(self, genome_ref, vfs_url):
        """
        Builds track for genome features

        :param genome_ref:
        :return:
        """
        shock_handles = list()
        gff_track = ""

        # 1) Download gff using genomefileutil
        gff_file_info = self.gfu.genome_to_gff({'genome_ref': genome_ref})
        gff_file = gff_file_info["file_path"]

        # 2) sort gff
        outfile = gff_file + "_sorted"
        sorted_gff_cmd = " ".join(["sort -k1,1 -k4,4n",
                                  gff_file, ">", outfile])
        self._run_cmd(sorted_gff_cmd)

        # 3) compress gff
        zip_cmd = "bgzip " + outfile
        self._run_cmd(zip_cmd)

        # 4) index gff
        index_gff_cmd = "tabix -p gff " + gff_file + "_sorted.gz"
        self._run_cmd(index_gff_cmd)

        gff_gz_file_path = gff_file + "_sorted.gz"
        gff_index_file_path = gff_file + "_sorted.gz.tbi"

        # 5) Upload gff and gff index to shock
        if os.path.exists(gff_gz_file_path):
            gff_shock_ref = self.dfu.file_to_shock(
                {'file_path': gff_gz_file_path, 'make_handle': 1}
            )
        else:
            raise FileNotFoundError(gff_gz_file_path + " not found")
        if os.path.exists(gff_index_file_path):
            gff_index_shock_ref = self.dfu.file_to_shock(
                {'file_path': gff_index_file_path, 'make_handle': 1}
            )
        else:
            raise FileNotFoundError(gff_index_file_path + " not found")

        # 6) Create gff track text that will be used for genome features track
        gff_track = '''
        {
            "label": "Genome Features",
            "key": "GenomeFeatures",
            "storeClass": "JBrowse/Store/SeqFeature/GFF3Tabix",
            "urlTemplate":"<vfs_url>/<gff_shock_ref>",
            "tbiUrlTemplate": "<vfs_url>/<gff_index_shock_ref>",
            "type": "JBrowse/View/Track/CanvasFeatures"
        }
        '''
        gff_track = gff_track.replace("<gff_shock_ref>",
                                      gff_shock_ref['handle']['id'])
        gff_track = gff_track.replace("<gff_index_shock_ref>",
                                      gff_index_shock_ref['handle']['id'])
        gff_track = gff_track.replace("<vfs_url>", vfs_url)
        gff_track_dict = json.loads(gff_track)

        # 7) Capture shock handles
        shock_handles.append(gff_shock_ref['handle'])
        shock_handles.append(gff_index_shock_ref['handle'])

        # 8) return shock handles and gff track info
        return {"shock_handle_list": shock_handles, "track_item": gff_track_dict}



    def prepare_snp_frequency_track(self, vcf_filepath, assembly_ref, binsize, vfs_url):
        """

        :param vcf_filepath:
        :param assembly_ref:
        :param binsize:
        :return:
        """
        BEDGRAPHTOBIGWIG = "/kb/deployment/bin/bedGraphToBigWig"
        shock_handles = list()

        chr_length_dict = {}
        chr_length_data = ""
        chr_length_path = None
        counts = Counter()

        # 1) Download assembly contig info and parse contig length information
        data = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': assembly_ref
        }])[0]['data']

        contigs = data["contigs"]
        for contig in contigs:
            contig_data = data["contigs"][contig]
            chr_length_data += str(contig_data['contig_id']) + '\t' + str(contig_data['length']) + '\n'
            c_id = str(contig_data['contig_id'])
            c_length = str(contig_data['length'])
            chr_length_dict[c_id] = c_length

        # 2) Write contig lengths to a file (needed later by bedGraphToBigWig)
        if chr_length_data:
            chr_length_path = os.path.join(self.session_dir, "chr_length.txt")
            with open(chr_length_path, "w") as f:
                f.write(chr_length_data)

        # 3) Read and parse vcf file (must be bgzip compressed)
        #    Calculate the number of SNPs in each bin and write in bedgraph format
        logging.info("Generating bedgraph file\n")
        with gzip.open(vcf_filepath, "rt") as reader:
            for record in reader:
                if record[0] == "#":
                    continue
                rs = record.split("\t")
                CHR, POS = rs[0], rs[1]
                bin_pos = int(int(POS) / binsize)
                bin_id = str(CHR) + "\t" + str(bin_pos)
                counts[bin_id] += 1
        bedgraph_file = os.path.join(self.session_dir, "vcf_bedgraph.txt")
        try:
            with open(bedgraph_file, "w") as fout:
                for j, k in counts.items():
                    chromosome, bin_num = j.split("\t")
                    bin_start = int(bin_num) * binsize
                    bin_end = bin_start + binsize
                    chr_length = chr_length_dict[chromosome]
                    if bin_end <= int(chr_length):
                        fout.write(chromosome + "\t" + str(bin_start) + "\t" + str(bin_end) + "\t" + str(k) + "\n")
                    else:
                        fout.write(chromosome + "\t" + str(bin_start) + "\t" + str(chr_length) + "\t" + str(k) + "\n")
        except IOError:
            logging.info("Unable to write %s file on disk.", bedgraph_file)

        # 4) Sort bedgraph file by chromosome id and co-ordinates
        sorted_bedgraph_file = bedgraph_file + "_sorted"
        sort_cmd = "sort -k1,1 -k2,2n " + bedgraph_file + "> " + sorted_bedgraph_file
        self._run_cmd(sort_cmd)

        # 5) Convert sorted bedgraph to bigwig format using the bedGraphToBigWig utility
        output_bigwig_file = bedgraph_file + "_bigwig.bw"
        cmd = BEDGRAPHTOBIGWIG + " " + sorted_bedgraph_file + " " + chr_length_path + " " + output_bigwig_file
        logging.info("Generating bigwig ..\n" + cmd + "\n")
        self._run_cmd(cmd)

        # 6) upload bigwig file to shock
        logging.info("Uploading Bigwig file to shock")
        if os.path.exists(output_bigwig_file):
            bigwig_shock_ref = self.dfu.file_to_shock(
                {'file_path': output_bigwig_file, 'make_handle': 1}
            )
        else:
            raise FileNotFoundError(output_bigwig_file + " was not created")
        # 7) Append shock handle to genomic_indexes
        shock_handles.append(bigwig_shock_ref['handle'])

        # 8) Build snp frequency track
        output_bigwig_shock = bigwig_shock_ref['handle']['id']
        snp_frequency_track = '''
        {
            "label": "Variation Density",
            "key": "Variation_density",
            "storeClass": "JBrowse/Store/SeqFeature/BigWig",
            "urlTemplate": "<vfs_url>/<bigwig_shock_id>",
            "type": "JBrowse/View/Track/Wiggle/XYPlot"
        }
        '''
        snp_frequency_track = snp_frequency_track.replace("<bigwig_shock_id>", output_bigwig_shock)
        snp_frequency_track = snp_frequency_track.replace("<vfs_url>", vfs_url)
        snp_frequency_track_dict = json.loads(snp_frequency_track)
        # 9) Return shock handles and track info
        return {"shock_handle_list": shock_handles, "track_item": snp_frequency_track_dict}


    def prepare_snp_track(self, vcf_shock_id, vcf_index_shock_id, vfs_url):
        """

        :param vcf_shock_id:
        :param vcf_index_shock_id:
        :return:
        """
        shock_handles = list()

        snp_track = '''
            {
                "label": "Variation",
                "key": "Variation",
                "storeClass": "JBrowse/Store/SeqFeature/VCFTabix",
                "urlTemplate": "<vfs_url>/<vcf_shock_id>",
                "tbiUrlTemplate": "<vfs_url>/<vcf_index_shock_id>",
                "type": "JBrowse/View/Track/HTMLVariants"
            }
        '''
        snp_track = snp_track.replace("<vcf_shock_id>", vcf_shock_id)
        snp_track = snp_track.replace("<vcf_index_shock_id>", vcf_index_shock_id)
        snp_track = snp_track.replace("<vfs_url>", vfs_url)
        snp_track_dict = json.loads(snp_track)
        # shock handles should be empty list in return when built from shock ids
        return {"shock_handle_list": shock_handles, "track_item": snp_track_dict}

    def build_jbrowse_data_folder(self, jbrowse_path):
        shock_handles = list()
        data_folder_shock_ref = self.dfu.file_to_shock({'file_path': jbrowse_path,
                                            'pack': 'zip', 'make_handle': 1})
        shock_handles.append(data_folder_shock_ref['handle'])
        return {"shock_handle_list": shock_handles}

    def build_jbrowse(self, jbrowse_src, jbrowse_path, refseqs_data, genomic_indexes, tracklist_items):
        """

        :param jbrowse_src:
        :param jbrowse_path:
        :param genomic_indexes:
        :param tracklist_items:
        :return:
        """
        jbrowse_report = {}

        # 1) Copy the jbrowse source code to build report
        shutil.copytree(jbrowse_src, jbrowse_path)

        # 2) Put tracklist.json in jbrowse data path
        tracklist_path = os.path.join(jbrowse_path, "data", "trackList.json")
        trackdata = {
            'formatVersion': 1,
            'tracks': tracklist_items
        }
        with open(tracklist_path, "w") as f:
            f.write(json.dumps(trackdata))

        # 3) Put refseq.json in jbrowse seq path
        refseqs_json_path = os.path.join(jbrowse_path, "data", "seq", "refSeqs.json")
        with open(refseqs_json_path, "w") as f:
            f.write(json.dumps(refseqs_data))

        # Build jbrowse data folder to support jbrowse widget in narrative
        res = self.build_jbrowse_data_folder(jbrowse_path)
        data_folder_index = res['shock_handle_list']
        genomic_indexes = genomic_indexes + data_folder_index

        # Build jbrowse report dict
        jbrowse_report["jbrowse_data_path"] = jbrowse_path
        jbrowse_report["genomic_indexes"] = genomic_indexes

        return jbrowse_report
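
    # Illustrative trackList.json written above (entries abridged, ids
    # hypothetical):
    #   {"formatVersion": 1,
    #    "tracks": [{"label": "Genome Features", ...},
    #               {"label": "Variation Density", ...},
    #               {"label": "Variation", ...}]}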

    def prepare_jbrowse_report(self, jbrowse_params):
        """
        Build genomic indexes, prepare jbrowse report
        :param input_params:
        :return:
        """
        # Service wizard
        sw_url = self.sw_url
        # Variation file service url for serving jbrowse track files
        vfs_url = self.get_variation_service_url(sw_url)

        print(vfs_url)

        genomic_indexes = list()
        tracklist_items = list()
        refseqs_data = None

        # 1) Build refseqs_data
        #    This is used to build refseqs.json file for jbrowse
        #    Jbrowse report can not be built if assembly ref doesn't exist
        if 'assembly_ref' in jbrowse_params:
            assembly_ref = jbrowse_params['assembly_ref']
            refseqs_data = self.create_refseqs_data_from_assembly(assembly_ref)
        else:
            raise ValueError("assembly ref not found")

        # 2) Build genome features track
        if 'genome_ref' in jbrowse_params:
            genome_ref = jbrowse_params['genome_ref']
            output = self.prepare_genome_features_track(genome_ref, vfs_url)
            shock_handles, track_item = output["shock_handle_list"], output["track_item"]
            genomic_indexes = genomic_indexes + shock_handles
            tracklist_items.append(track_item)
        else:
            print("Skipping genome features track")


        # 3) Build SNP frequency track
        cond1 = 'vcf_path' in jbrowse_params
        cond2 = 'assembly_ref' in jbrowse_params
        cond3 = 'binsize' in jbrowse_params
        if cond1 and cond2 and cond3:
            vcf_path = jbrowse_params['vcf_path']
            assembly_ref = jbrowse_params['assembly_ref']
            binsize = jbrowse_params["binsize"]
            output = self.prepare_snp_frequency_track(vcf_path, assembly_ref, binsize, vfs_url)
            shock_handles, track_item = output["shock_handle_list"], output["track_item"]
            if shock_handles:
                genomic_indexes = genomic_indexes + shock_handles
            tracklist_items.append(track_item)
        else:
            print("Skipping SNP frequency track")

        # 4) Build SNP track
        cond1 = 'vcf_shock_id' in jbrowse_params
        cond2 = 'vcf_index_shock_id' in jbrowse_params
        if cond1 and cond2:
            vcf_shock_id = jbrowse_params['vcf_shock_id']
            vcf_index_shock_id = jbrowse_params['vcf_index_shock_id']
            output = self.prepare_snp_track(vcf_shock_id, vcf_index_shock_id, vfs_url)
            shock_handles, track_item = output["shock_handle_list"], output["track_item"]
            genomic_indexes = genomic_indexes + shock_handles
            tracklist_items.append(track_item)
        else:
            print("Skipping SNP track")
        # 5) Build jbrowse directory with index.html
        # jbrowse directory later on gets uploaded as html report
        jbrowse_src = "/kb/module/deps/jbrowse"
        jbrowse_path = os.path.join(self.session_dir, "jbrowse")

        if tracklist_items:
            jbrowse_report = self.build_jbrowse(jbrowse_src,
                                                jbrowse_path,
                                                refseqs_data,
                                                genomic_indexes,
                                                tracklist_items)
        else:
            raise ValueError("No tracks found")

        return jbrowse_report
# Example 4
class GFFUtils:
    def __init__(self, config):
        self.callback_url = config['callback_url']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']

        self.GFF_dir = os.path.join(self.shared_folder, 'GFF')

        if not os.path.isdir(self.GFF_dir):
            os.mkdir(self.GFF_dir)

        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)

    def _prep_gff(self, gff_file):
        # Sort the GFF (header lines first, then by contig and start
        # coordinate) so it can be bgzip-compressed and tabix-indexed.
        outfile = os.path.join(self.shared_folder, 'GFF', 'out.gff')
        sortcmd = f'(grep ^"#" {gff_file}; grep -v ^"#" {gff_file} | sort -k1,1 -k4,4n)'

        with open(outfile, 'w') as o:
            p = subprocess.Popen(sortcmd, shell=True, stdout=o)
            p.communicate()

        bgzip = subprocess.Popen(['bgzip', 'out.gff'],
                                 cwd=os.path.join(self.shared_folder, 'GFF'))
        bgzip.communicate()

        outfile += '.gz'

        return outfile

    def _construct_gff_from_json(self, json, gff_file_path,
                                 contig_base_lengths):
        with open(gff_file_path, 'w') as f:
            for feature in json:
                if feature['feature_type'].strip().upper() == 'GENE':
                    end = int(feature['location'][0]['start']) + int(
                        feature['location'][0]['length'])

                    metainfo = "ID=" + feature['feature_id']

                    if feature['function']:
                        metainfo += ';FUNCTION=' + feature['function']

                    contig_id = str(feature['location'][0]['contig_id'])
                    start = int(feature['location'][0]['start'])

                    # TODO: Fix Plink reassignment of Chr prefixes
                    try:
                        global_pos = int(
                            contig_base_lengths[contig_id]) + start
                    except KeyError:
                        try:
                            global_pos = int(contig_base_lengths[
                                contig_id.capitalize()]) + start
                        except KeyError:
                            try:
                                global_pos = int(contig_base_lengths[
                                    'Chr' + str(contig_id)]) + start
                            except KeyError:
                                try:
                                    global_pos = int(contig_base_lengths[
                                        'Chr0' + str(contig_id)]) + start
                                except KeyError as e:
                                    pp(contig_base_lengths)
                                    pp(contig_id)
                                    raise KeyError(e)
                    """
                    Remove ontology for now
                    if feature['ontology_terms']:
                        metainfo += ';ONTOLOGY('

                        for k, v in feature['ontology_terms'].items():
                            metainfo += str(k) + ',' + str(v) + ':'

                        metainfo = metainfo[:-1]  # remove trailing ;
                        metainfo += ')'
                    """

                    constructed_gff_line = str(feature['location'][0]['contig_id']) + '\t' + \
                                           'KBase\tgene\t' + \
                                           str(feature['location'][0]['start']) + '\t' + \
                                           str(end) + '\t.\t' + \
                                           str(feature['location'][0]['strand']) + '\t' + \
                                           str(global_pos) + '\t' + \
                                           str(metainfo) + '\n'
                    f.write(constructed_gff_line)
        if os.path.exists(gff_file_path):
            return gff_file_path
        else:
            raise FileNotFoundError(
                'Unable to create GFF file from genome JSON.')

    def _process_tabix_results(self, queryresult):
        # Column 9 of a GFF line holds attributes such as
        # "ID=<gene>;FUNCTION=<description>"; strip the key prefixes.
        queryinfo = queryresult[8].split(';')
        if len(queryinfo) >= 2:
            extension = [
                clean_tsv_data(queryinfo[0][3:]), "NA",
                clean_tsv_data(queryinfo[1][9:])
            ]
        elif len(queryinfo) == 1:
            extension = [clean_tsv_data(queryinfo[0][3:]), "NA", "NA"]
        else:
            extension = ['NA', 'NA', 'NA']
        return extension

    def find_gene_info(self, row):
        tb = tabix_query(self.sorted_gff, row["CHR"], int(row["POS"]),
                         int(row["POS"]))
        tbresult = next(tb, None)
        if tbresult is None:
            tb2 = tabix_query(self.sorted_gff, 'chr' + row["CHR"],
                              int(row["POS"]), int(row["POS"]))
            tbresult2 = next(tb2, None)
            if tbresult2 is None:
                tb3 = tabix_query(self.sorted_gff, 'chr0' + row["CHR"],
                                  int(row["POS"]), int(row["POS"]))
                tbresult3 = next(tb3, None)
                if tbresult3 is None:
                    if int(row["POS"]) < 500:
                        nstart = 0
                    else:
                        nstart = int(row["POS"]) - 500

                    neigh_tb = tabix_query(self.sorted_gff, row["CHR"], nstart,
                                           int(row["POS"]) + 500)
                    neigh_result = next(neigh_tb, None)

                    if neigh_result is None:
                        return pd.Series(
                            ['NA', 'NA', 'NA'],
                            index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
                    else:
                        nq = self._process_tabix_results(neigh_result)
                        return pd.Series(
                            [nq[1], nq[0], nq[2]],
                            index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
                else:
                    q3 = self._process_tabix_results(tbresult3)
                    return pd.Series(
                        q3, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
            else:
                q2 = self._process_tabix_results(tbresult2)
                return pd.Series(q2,
                                 index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
        else:
            q = self._process_tabix_results(tbresult)
            return pd.Series(q, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])

    def annotate_GWAS_results(self, genome_ref, gwas_results_file):
        feature_num = self.gsu.search({'ref': genome_ref})['num_found']

        # get genome features for gff construction
        genome_features = self.gsu.search({
            'ref': genome_ref,
            'limit': feature_num,
            #'sort_by': [['feature_id', True]]
        })['features']

        assembly_ref = self.wsc.get_object_subset([{
            'included': ['/assembly_ref'],
            'ref':
            genome_ref
        }])[0]['data']['assembly_ref']

        # get assembly contigs for base length calculations
        assembly_contigs = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': assembly_ref
        }])[0]['data']['contigs']

        contig_ids = list(assembly_contigs.keys())
        contig_ids.sort()

        contig_base_lengths = {}
        prev_length = 0

        for contig in contig_ids:
            contig_base_lengths[contig] = prev_length
            prev_length += assembly_contigs[contig]['length']

        gff_file = os.path.join(self.GFF_dir, 'constructed.gff')
        constructed_gff = self._construct_gff_from_json(
            genome_features, gff_file, contig_base_lengths)
        self.sorted_gff = self._prep_gff(constructed_gff)
        tabix_index(self.sorted_gff)

        gwas_results = pd.read_csv(gwas_results_file, sep='\t')

        gwas_results[['GENEID', 'NEIGHBORGENE', 'FUNCTION']] = \
            gwas_results.apply(self.find_gene_info, axis=1)

        new_results_path = os.path.abspath(
            os.path.join(gwas_results_file, '..'))
        new_results_path = os.path.join(new_results_path, 'final_results.txt')

        gwas_results.to_csv(path_or_buf=new_results_path,
                            sep='\t',
                            index=False)

        return new_results_path
class VariationUtil:
    '''
    Module Name:
    VariationUtil

    Module Description:
    A KBase module: VariationUtil
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.4"
    GIT_URL = ""
    GIT_COMMIT_HASH = "2a4c2dbc058b702811c967997e7100c834e755d4"

    #BEGIN_CLASS_HEADER

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        # TODO: Make sure we need to define config just once
        # TODO: Change the code to match this style
        self.config = config
        self.config['SDK_CALLBACK_URL'] = os.environ['SDK_CALLBACK_URL']
        self.config['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.config['ws_url'] = config['workspace-url']

        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.hr = htmlreportutils()
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.shock_url = config['shock-url']
        self.sw_url = config['srv-wiz-url']
        #END_CONSTRUCTOR
        pass

    def save_variation_from_vcf(self, ctx, params):
        """
        Save a variation (and trait?) object to KBase given a reference genome, object output name,
        Variant Call Format (VCF) file, and sample attribute file.
        :param params: instance of type "save_variation_input" (## funcdef
           save_variation_from_vcf ## required input params:
           genome_or_assembly_ref: KBaseGenomes.Genome or
           KBaseGenomeAnnotations.Assembly object reference *** variation
           input data *** vcf_staging_file_path: path to location data
           associated with samples variation_object_name: output name for
           KBase variation object *** sample input data ***
           sample_attribute_ref: x/y/z reference to kbase sample attribute
           optional params: NA output report: report_name report_ref HTML
           visualization: Manhattan plot *** Visualization *** plot_maf:
           generate histogram of minor allele frequencies plot_hwe: generate
           histogram of Hardy-Weinberg Equilibrium p-values) -> structure:
           parameter "workspace_name" of String, parameter
           "genome_or_assembly_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "vcf_staging_file_path" of type "filepath"
           (KBase file path to staging files), parameter
           "variation_object_name" of String, parameter
           "sample_attribute_ref" of type "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "save_variation_output" -> structure:
           parameter "variation_ref" of String, parameter "report_name" of
           String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: report
        #BEGIN save_variation_from_vcf

        # Get workspace id
        ws_id = self.dfu.ws_name_to_id(params['workspace_name'])

        genome_ref = None
        assembly_ref = None

        # 1) Find whether the input is a genome or assembly
        #    and get genome_ref and assembly_ref

        genome_or_assembly_ref = params['genome_or_assembly_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{
                'ref': genome_or_assembly_ref
            }]})['infos'][0][2]
        if 'KBaseGenomes.Genome' in obj_type:
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif 'KBaseGenomeAnnotations.Assembly' in obj_type:
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type +
                             ' is not the right input for this method. ' +
                             'Valid inputs include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly')

        # 2)  Validate VCF, compress, and build VCF index
        logging.info("Validating VCF, Compressing VCF and Indexing VCF")
        VCFUtilsConfig = {"scratch": self.scratch}
        VCFUtilsParams = {
            'vcf_staging_file_path': params['vcf_staging_file_path']
        }
        VCU = VCFUtils(VCFUtilsConfig)
        vcf_compressed, vcf_index, vcf_strain_ids = VCU.validate_compress_and_index_vcf(
            VCFUtilsParams)

        if vcf_index is not None:
            logging.info("vcf compressed :" + str(vcf_compressed))
            logging.info("vcf index :" + str(vcf_index))
            logging.info("vcf strain ids :" + str(vcf_strain_ids))
        else:
            raise ValueError(
                "No result obtained after compression and indexing step")

        # Get strain info
        # TODO: Remove hard coded stuff
        StrainInfoConfig = self.config
        StrainInfoParams = {
            "ws_id": ws_id,
            "vcf_strain_ids": vcf_strain_ids,
            "sample_set_ref": params["sample_set_ref"],
            "sample_attribute_name": params["sample_attribute_name"]
        }
        si = StrainInfo(StrainInfoConfig)
        sample_attribute_ref, strains = si.sample_strain_info(StrainInfoParams)
        print(sample_attribute_ref)
        print(strains)

        # 3) Create json for variation object. In a following step genomic_indexes will be
        # added to this json before it is saved as Variation object

        VCFToVariationConfig = {"ws_url": self.ws_url, "scratch": self.scratch}
        VCFToVariationParams = {
            "vcf_compressed": vcf_compressed,
            "vcf_index": vcf_index,
            "assembly_ref": assembly_ref
        }
        if genome_ref is not None:
            VCFToVariationParams['genome_ref'] = genome_ref

        vtv = VCFToVariation(VCFToVariationConfig)
        variation_object_data = vtv.generate_variation_object_data(
            VCFToVariationParams)
        # Append sample information
        if sample_attribute_ref:
            variation_object_data['sample_attribute_ref'] = sample_attribute_ref
        else:
            raise ValueError('sample attribute ref not found')
        if strains:
            variation_object_data['strains'] = strains
        else:
            raise ValueError('strains not found')
        if 'sample_set_ref' in params:
            variation_object_data['sample_set_ref'] = params['sample_set_ref']
        else:
            raise ValueError('sample_set_ref not found in params')

        # 4) Build the JBrowse report (tracks and genomic indexes)
        JbrowseConfig = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
            "sw_url": self.sw_url,
            "shock_url": self.shock_url
        }
        JbrowseParams = {
            "vcf_path": vcf_compressed,
            "assembly_ref": assembly_ref,
            "binsize": 10000,
            "vcf_shock_id": variation_object_data['vcf_handle']['id'],
            "vcf_index_shock_id":
            variation_object_data['vcf_index_handle']['id']
        }
        if genome_ref is not None:
            JbrowseParams["genome_ref"] = genome_ref

        jb = JbrowseUtil(JbrowseConfig)
        jbrowse_report = jb.prepare_jbrowse_report(JbrowseParams)

        # 5) Now we have the genomic indices and we have all the information needed to save
        # the variation object
        # TODO: Take out the genomic_indexes field from the object spec
        #  TODO: Take out the vcf_handle stuff not needed

        variation_object_data['genomic_indexes'] = jbrowse_report[
            'genomic_indexes']

        var_obj = self.dfu.save_objects({
            'id':
            self.dfu.ws_name_to_id(params['workspace_name']),
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_object_data,
                'name': params['variation_object_name']
            }]
        })[0]

        var_obj_ref = str(var_obj[6]) + "/" + str(var_obj[0]) + "/" + str(
            var_obj[4])
        print(var_obj_ref)

        # 6) Build Variation report
        # This is a simple report
        #
        workspace = params['workspace_name']
        created_objects = []
        created_objects.append({
            "ref": var_obj_ref,
            "description": "Variation Object"
        })
        ReportConfig = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
        }
        ReportParams = {"variation_ref": var_obj_ref}
        vr = VariationReport(ReportConfig)
        htmlreport_dir = vr.create_variation_report(ReportParams)

        report = self.hr.create_html_report(htmlreport_dir, workspace,
                                            created_objects)
        report['variation_ref'] = var_obj_ref
        print(report)
        #END save_variation_from_vcf

        # At some point might do deeper type checking...
        if not isinstance(report, dict):
            raise ValueError('Method save_variation_from_vcf return value ' +
                             'report is not type dict as required.')
        # return the results
        return [report]

    def export_variation_as_vcf(self, ctx, params):
        """
        Export KBase variation object as Variant Call Format (VCF) file
        :param params: instance of type "export_variation_input" (## funcdef
           export_variation_as_vcf ## required input params: Variation object
           reference optional params: NA output report: Shock id pointing to
           exported vcf file) -> structure: parameter "input_var_ref" of type
           "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "export_variation_output" -> structure:
           parameter "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_variation_as_vcf

        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        output = vtv.export_as_vcf(params)

        #END export_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_variation_as_vcf return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def get_variation_as_vcf(self, ctx, params):
        """
        Given a reference to a variation object, and output name: return a Variant Call Format (VCF)
        file path and name.
        :param params: instance of type "get_variation_input" (## funcdef
           get_variation_as_vcf ## required input params: Variation object
           reference output file name optional params: NA output report: path
           to returned vcf name of variation object) -> structure: parameter
           "variation_ref" of type "obj_ref" (An X/Y/Z style reference),
           parameter "filename" of String
        :returns: instance of type "get_variation_output" -> structure:
           parameter "path" of type "filepath" (KBase file path to staging
           files), parameter "variation_name" of String
        """
        # ctx is the context object
        # return variables are: file
        #BEGIN get_variation_as_vcf
        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        file = vtv.variation_to_vcf(params)

        #END get_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(file, dict):
            raise ValueError('Method get_variation_as_vcf return value ' +
                             'file is not type dict as required.')
        # return the results
        return [file]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class snp2gene:
    '''
    Module Name:
    snp2gene

    Module Description:
    A KBase module: snp2gene
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "[email protected]:kbasecollaborations/snp2gene.git"
    GIT_COMMIT_HASH = "8dd593e96c4b37fcf91a719181389e1b04c0bb4a"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.config['callback_url'] = os.environ['SDK_CALLBACK_URL']
        callback_url = self.config['callback_url']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.kbr = KBaseReport(callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def annotate_gwas_results(self, ctx, params):
        """
        annotate_gwas_results:
        inputs:
            file path to gwas results
            genome object - with reference to GFF file
        outputs:
            TSV file represented by shock/handle ids and
        :param params: instance of type "annotate_gwas_input" -> structure:
           parameter "gwas_result_file" of type "file_path" (A valid file
           path), parameter "genome_obj" of type "genome_ref" (KBase style
           object reference X/Y/Z @id ws KBaseGenomes.Genome)
        :returns: instance of type "annotate_gwas_output" -> structure:
           parameter "snp_to_gene_list" of type "file_path" (A valid file
           path)
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_gwas_results

        gene_list = GFFUtils(self.config).annotate_GWAS_results(
            params['genome_obj'], params['gwas_result_file'])

        output = {'snp_to_gene_list': gene_list}

        #END annotate_gwas_results

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_gwas_results return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def annotate_gwas_results_app(self, ctx, params):
        """
        :param params: instance of type "annotate_gwas_app_input" ->
           structure: parameter "associations" of list of type
           "association_ref" (KBase style object reference X/Y/Z @id ws
           KBaseGwasData.Associations), parameter "p_value" of String,
           parameter "prefix" of String
        :returns: instance of type "annotate_gwas_app_output" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "featureset_obj" of type "featureset_ref" (KBase
           style object reference X/Y/Z @id ws KBaseCollections.FeatureSet)
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_gwas_results_app
        # return the results
        print(params)
        # TODO: Handle cases where there are no significant SNPs
        objects_created = []
        for association_ref in params['associations']:

            variation_ref = self.wsc.get_object_subset([{
                'included': ['/variation_id'],
                'ref':
                association_ref
            }])[0]['data']['variation_id']

            genome_ref = self.wsc.get_object_subset([{
                'included': ['/genome_ref'],
                'ref':
                variation_ref
            }])[0]['data']['genome_ref']

            featureset_obj = GFFUtils2(self.config).annotate_GWAS_results(
                genome_ref, association_ref, params['workspace_name'],
                params['prefix'], params['p_value'])
            objects_created.append({
                'ref': featureset_obj,
                'description': 'FeatureSet'
            })
        # Build the new gff before doing anything

        # Download the workspace object for association one at a time
        # Filter SNPs for p-value, if no snps shows up, append this to warnings
        # Build the table structure needed for snp2gene
        # Run snp2gene algorithm and get final list.txt
        # Save as featureset. Find how to save featureset from genelist

        report_info = self.kbr.create_extended_report({
            'message':
            ' ',
            'objects_created':
            objects_created,
            'report_object_name':
            'annotate_gwas_results_app_' + str(uuid.uuid4()),
            'workspace_name':
            params['workspace_name']
        })
        output = dict()
        output['report_name'] = report_info['name']
        output['report_ref'] = report_info['ref']
        print(output)

        #END annotate_gwas_results_app

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_gwas_results_app return value ' +
                             'output is not type dict as required.')
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
# Example 7
class VCFToVariation:
    def __init__(self, config, scratch, callback_url):
        self.ws_url = config['workspace-url']
        self.scratch = scratch
        self.callback_url = callback_url
        self.dfu = DataFileUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)
        self.au = AssemblyUtil(self.callback_url)
        self.gapi = GenericsAPI(self.callback_url)


    def _parse_vcf_data(self, params):
        vcf_filepath = self._stage_input(params)

        # file is validated by this point, can assume vcf_filepath is valid
        reader = vcf.Reader(open(vcf_filepath, 'r'))

        # "VCFv4.2" -> 4.2 (strip the leading "VCFv")
        version = float(reader.metadata['fileformat'][4:])
        genotypes = reader.samples
        chromosomes = []
        contigs = {}
        totalvars = 0

        for record in reader:
            totalvars += 1
            if record.CHROM not in chromosomes:
                chromosomes.append(record.CHROM)

            if record.CHROM not in contigs.keys():
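                # PyVCF gives a falsy FILTER ([] for PASS, None for '.'),
                # so passvar flags records that pass or carry no filter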
                passvar = 1 if not record.FILTER else 0

                contigs[record.CHROM] = {
                    'contig_id': record.CHROM,
                    'totalvariants': 1,
                    'passvariants': passvar,
                    'length': int(record.affected_end-record.affected_start),
                }
            else:
                contigs[record.CHROM]['totalvariants'] += 1
                if not record.FILTER:
                    contigs[record.CHROM]['passvariants'] += 1
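
        # Illustrative shape of the vcf_info dict built below (hypothetical values):
        #   {'version': 4.2, 'contigs': {'Chr1': {...}}, 'total_variants': 1523,
        #    'genotype_ids': ['S1', 'S2'], 'chromosome_ids': ['Chr1'],
        #    'file_ref': '/kb/module/work/tmp/input.vcf'}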

        vcf_info = {
            'version': version,
            'contigs': contigs,
            'total_variants': totalvars,
            'genotype_ids': genotypes,
            'chromosome_ids': chromosomes,
            'file_ref': vcf_filepath
        }

        return vcf_info


    def _validate_vcf_to_sample(self, vcf_genotypes, sample_ids):
        genos_not_found = []

        vgenotypes = [x.upper().strip() for x in vcf_genotypes]
        sids = [x.upper().strip() for x in sample_ids]

        for geno in vgenotypes:
            if geno not in sids:
                genos_not_found.append(geno)

        if not genos_not_found:
            return True
        else:
            return genos_not_found

    def _chk_if_vcf_ids_in_assembly(self, vcf_chromosomes, assembly_chromosomes):
        chromos_not_in_assembly = []

        pp(assembly_chromosomes)

        for chromo in vcf_chromosomes:
            if chromo not in assembly_chromosomes:
                chromos_not_in_assembly.append(chromo)

        if not chromos_not_in_assembly:
            return True
        else:
            return chromos_not_in_assembly

    def _get_vcf_version(self, vcf_filepath):
        # avoid shadowing the imported vcf module with the file handle
        with (gzip.open if is_gz_file(vcf_filepath) else open)(vcf_filepath, 'rt') as vcf_handle:
            line = vcf_handle.readline()
            tokens = line.split('=')

            if not (tokens[0].startswith('##fileformat')):
                log("Invalid VCF.  ##fileformat line in meta is improperly formatted.")
                raise ValueError("Invalid VCF.  ##fileformat line in meta is improperly formatted. "
                                 "Check VCF file specifications: https://samtools.github.io/hts-specs/")

            vcf_version = float(tokens[1][-4:].rstrip())
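
            # Worked example (hypothetical header): '##fileformat=VCFv4.2' splits on '='
            # into ['##fileformat', 'VCFv4.2\n']; tokens[1][-4:].rstrip() == '4.2' -> 4.2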

            return vcf_version

    def validate_vcf(self, params):
        if 'genome_or_assembly_ref' not in params:
            raise ValueError('Genome or Assembly reference not in input parameters: \n\n' + str(params))
        if 'vcf_staging_file_path' not in params:
            raise ValueError('VCF staging file path not in input parameters: \n\n' + str(params))


        vcf_filepath = self._stage_input(params)

        vcf_version = self._get_vcf_version(vcf_filepath)

        # set up directories for validation output
        validation_output_dir = os.path.join(self.scratch, 'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        # vcftools (vcf-validator) supports VCF v4.0-4.2
        # https://github.com/vcftools/vcftools

        # EBIvariation/vcf-validator (vcf_validator_linux) supports VCF v4.1-4.3
        # https://github.com/EBIvariation/vcf-validator

        # vcftools is used only to validate VCF v4.0

        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux"]
            validator_cmd.append("-i")
            validator_cmd.append(vcf_filepath)
            validator_cmd.append("-l")
            validator_cmd.append('error')
            print("VCF version "+str(vcf_version)+".")
        elif vcf_version >= 4.0:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator"]
            validator_cmd.append(vcf_filepath)
            print("VCF version 4.0.")
        else:
            raise ValueError('VCF version not found in file, fileformat line malformed, or version < 4.0. '
                             'The fileformat line must be the first line of the VCF file and use proper syntax. '
                             'Check VCF file specifications: https://samtools.github.io/hts-specs/')

        print("Validator command: {}".format(validator_cmd))

        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        validator_output = []
        while True:
            line = p.stdout.readline()
            if not line:
                break
            if line.decode("utf-8").strip().startswith('[info]'):
                validator_output.append(line.decode("utf-8"))

        out, err = p.communicate()

        validation_output_filename = os.path.join(validation_output_dir, 'vcf_validation.txt')
        file_output_chk = []

        try:
            if validator_output[0][:6] == '[info]':
                # validation by vcf_validator_linux
                validation_output_filename = validator_output[1].split(' ')[6].strip('\n')
                vo = validator_output[2].split(' ')
                file_output_chk = ''.join(vo[9:]).strip('\n')

                if not os.path.exists(validation_output_filename):
                    raise ValueError(validation_output_filename+' does not exist!')

                if not file_output_chk == 'isvalid':
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))

                #TODO: more detailed validation parsing for vcf_validator_linux
            else:
                if validator_output:
                    with open(validation_output_filename, 'w') as f:
                        for line in validator_output:
                            f.write(str(line))
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))
                else:
                    with open(validation_output_filename, 'w') as f:
                        f.write("vcftools used to validate vcf file:\n" + vcf_filepath +
                                "\nFile is valid per VCF spec v4.0")

                # TODO: more detailed validation parsing for vcftools
        except IndexError:
            # a valid VCF below v4.1 produces no '[info]' lines, so indexing validator_output raises IndexError
            if validator_output:
                with open(validation_output_filename, 'w') as f:
                    for line in validator_output:
                        f.write(str(line))
                print('\n'.join(validator_output))
                raise ValueError('\n'.join(validator_output))
            else:
                with open(validation_output_filename, 'w') as f:
                    f.write("vcftools used to validate vcf file:\n" + vcf_filepath +
                            "\nFile is valid per VCF spec v4.0")

        if not os.path.exists(validation_output_filename):
            print('Validator did not generate log file!')
            raise SystemError("Validator did not generate a log file.")

        log("Validator output filepath: {}".format(validation_output_filename))

        log("Return code from validator {}".format(p.returncode))

        return validation_output_filename

    def _stage_input(self, params):
        # extract file location from input ui parameters
        if params['vcf_staging_file_path'].startswith('/kb/module/test/'):
            # variation utils unit test
            vcf_local_file_path = params['vcf_staging_file_path']

            if vcf_local_file_path.endswith('.gz'):
                with gzip.open(vcf_local_file_path, 'rb') as f_in:
                    with open(vcf_local_file_path[:-3], 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)

                vcf_local_file_path = vcf_local_file_path[:-3]
        else:
            staging_dir = '/staging'
            vcf_local_file_path = os.path.join(staging_dir, params['vcf_staging_file_path'])

        if not os.path.exists(vcf_local_file_path):
            raise OSError('VCF input path does not exist, or is not readable')

        orig_file_path = os.path.join(self.scratch, 'original_' + os.path.basename(vcf_local_file_path))
        print(f'VCF: {vcf_local_file_path} Orig: {orig_file_path}')
        self.original_file = shutil.copy(vcf_local_file_path, orig_file_path)

        # TODO: use data file utils here, upload vcf to shock, use dfu.
        if is_gz_file(vcf_local_file_path):
            # /staging is read only, therefore have to copy before uncompressing
            if not vcf_local_file_path == os.path.join(self.scratch, params['vcf_staging_file_path']):
                copy = shutil.copy(vcf_local_file_path, os.path.join(self.scratch,params['vcf_staging_file_path']))
                unpack = self.dfu.unpack_file({'file_path': copy})
            else:
                unpack = {}
                unpack['file_path'] = os.path.join(self.scratch,params['vcf_staging_file_path'])
            params['vcf_local_file_path'] = unpack['file_path']
            return unpack['file_path']
        else:
            params['vcf_local_file_path'] = vcf_local_file_path 
            return vcf_local_file_path

    def _create_sample_attribute_file(self, vcf_file, sample_attribute_mapping_file):
        """
        Create a sample attribute mapping file from the VCF #CHROM header line.
        """
        try:
            with open(vcf_file, 'r') as vcf_handle:
                lines = vcf_handle.readlines()

                for line in lines:
                    if line.startswith("#CHROM"):
                        # strip the trailing newline so row breaks are written explicitly
                        header = line.rstrip().split("\t")

                        try:
                            with open(sample_attribute_mapping_file, 'w') as attribute_mapping_handle:
                                attribute_mapping_handle.write("Attribute\tAttribute ontology ID\tUnit\tUnit ontology ID")
                                for i in range(9, len(header)):
                                    attribute_mapping_handle.write("\t" + header[i])
                                attribute_mapping_handle.write("\n")

                                attribute_mapping_handle.write("label\t\t\t")
                                for j in range(9, len(header)):
                                    attribute_mapping_handle.write("\t" + header[j])
                                attribute_mapping_handle.write("\n")
                        except IOError:
                            print("Could not write to file:", sample_attribute_mapping_file)
        except IOError:
            print("Could not read file:", vcf_file)

    def _validate_assembly_ids(self, params):
        # All chromosome ids from the vcf should be in assembly
        # but not all assembly chromosome ids should be in vcf


        if ('genome_ref' in params):
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': params['genome_or_assembly_ref']
            }])

            self.vcf_info['assembly_ref'] = subset[0]['data']['assembly_ref']

        if ('assembly_ref' in params):
            self.vcf_info['assembly_ref'] = params['assembly_ref']

        assembly_chromosome_ids_call = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])

        assembly_chromosomes = assembly_chromosome_ids_call[0]['data']['contigs'].keys()
        vcf_chromosomes = self.vcf_info['chromosome_ids']

        chk_assembly_ids =  self._chk_if_vcf_ids_in_assembly(vcf_chromosomes, assembly_chromosomes)

        if isinstance(chk_assembly_ids, list):
            failed_ids = ' '.join(chk_assembly_ids)
            print(f'VCF contig ids: {failed_ids} are not present in assembly.')
            raise ValueError(f'VCF contig ids: {failed_ids} are not present in assembly.')


        return assembly_chromosomes

    def _validate_sample_ids(self, params):
        # All samples within the VCF file need to be in sample attribute list


        vcf_genotypes = self.vcf_info['genotype_ids']

        sample_ids_subset = self.wsc.get_object_subset([{
            'included': ['/instances'],
            'ref': params['sample_attribute_ref']
        }])

        sample_ids = sample_ids_subset[0]['data']['instances'].keys()

        validate_genotypes = self._validate_vcf_to_sample(vcf_genotypes, sample_ids)

        if isinstance(validate_genotypes, list):
            failed_genos = ' '.join(validate_genotypes)
            print(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')
            raise ValueError(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')

        return sample_ids

    def _construct_contig_info(self, params):
        """
            KBaseGwasData.Variations type spec

            /*
               Contig variation data
                 contig_id - contig identifier
                 totalvariants - total number of variants in each contig
                 passvariants - total number of variants that pass quality variation filter in contig
                 length - length of contig from assembly data
             */

             typedef structure {
               string contig_id;
               int totalvariants;
               int passvariants;
               int length; // from assembly
             } contig_info;
        """

        assembly_chromosome_dict = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])[0]['data']['contigs']


        contigs = []

        contig_infos = self.vcf_info['contigs']


        for contig_id in contig_infos:
            length_contig = assembly_chromosome_dict[contig_id].get("length")
            contig_infos[contig_id]["length"] = length_contig
            contigs.append(contig_infos[contig_id])
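
        # Each appended entry is shaped like (illustrative values):
        #   {'contig_id': 'Chr1', 'totalvariants': 1523, 'passvariants': 1498, 'length': 30427671}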

        return contigs
   

    def _bgzip_vcf(self, vcf_filepath):

        if not os.path.exists(vcf_filepath):
            print(vcf_filepath + " does not exist")

        zip_cmd = ["bgzip", vcf_filepath]
        
        p = subprocess.Popen(zip_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()        
        
        bgzip_file_path = vcf_filepath + ".gz"
        print(bgzip_file_path)
          
        return bgzip_file_path
  
 
    def _index_vcf(self, bgzip_file):
 
        output_dir = self.scratch

        bgzip_filepath = os.path.join(self.scratch, bgzip_file)
        if not os.path.exists(bgzip_filepath):
            print(bgzip_filepath + " does not exist")

        index_cmd = ["tabix", "-p", "vcf", bgzip_filepath]       
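        # tabix -p vcf writes the index next to the input as <file>.gz.tbi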
        p = subprocess.Popen(index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()
         
        index_file_path = bgzip_filepath + ".tbi"
     
        return index_file_path

    def _index_assembly(self, assembly_file):
        if not os.path.exists(assembly_file):
            print(assembly_file + " does not exist")

        logging.info("indexing assembly file")

        assembly_index_cmd = ["samtools", "faidx", assembly_file]
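        # samtools faidx writes the FASTA index next to the input as <file>.fai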
        print(assembly_index_cmd)
        p = subprocess.Popen(assembly_index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()

        logging.info("indexing of assembly file done!")

        return assembly_file + ".fai"

    def _download_assembly(self, assembly_ref):
        file = self.au.get_assembly_as_fasta({
          'ref': assembly_ref
        })
        return file
 
    def _construct_variation(self, params, contigs_info):
        
        """
            KBaseGwasData.Variations type spec
             /*
               Variation object data structure
                 num_genotypes - number of total genotypes within variant file
                 num_variants - number of total variants within variant file
                 contigs - list of contig ids and variant information
                 attribute_ref - KBase reference to attribute mapping workspace object
                 genome_ref - KBase reference to genome workspace object
                 assembly_ref - KBase reference to assembly workspace object
                 vcf_handle_ref - VCF handle reference to VCF file

                 @optional genome_ref
             */
             typedef structure {
               int numgenotypes;
               int numvariants;
               list<contig_info> contigs;
               attribute_ref population; // KBaseExperiments.AttributeMapping
               genome_ref genome_ref; // KBaseGenomes.Genome
               assembly_ref assemby_ref; // KBaseGenomeAnnotations.Assembly
               vcf_handle_ref vcf_handle_ref;
             } Variations;

            :param params: KBase ui input parameters
            :param population: previously constructed sample population data
            :return: constructed variation object (dictionary)
        """

        if not self.vcf_info['file_ref'].startswith(self.scratch):
            new_vcf_file = os.path.join(self.scratch, os.path.basename(self.vcf_info['file_ref']))
            self.vcf_info['file_ref'] = shutil.copy(self.vcf_info['file_ref'], new_vcf_file)
      

        vcf_staged_file = self.original_file

        bgzip_file_path = self._bgzip_vcf(vcf_staged_file)
        vcf_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': bgzip_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(bgzip_file_path, vcf_shock_file_ref)


        index_file_path = self._index_vcf(bgzip_file_path)
        vcf_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(index_file_path, vcf_index_shock_file_ref)


        assembly_file_path = self._download_assembly(self.vcf_info['assembly_ref'])['path']

        assembly_index_file_path = self._index_assembly(assembly_file_path)
        assembly_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': assembly_index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(assembly_index_file_path, assembly_index_shock_file_ref)
        
        variation_obj = {
            'numgenotypes': int(len(self.vcf_info['genotype_ids'])),
            'numvariants': int(self.vcf_info['total_variants']),
            'contigs': contigs_info,
            'population': params['sample_attribute_ref'],

            # TYPE SPEC CHANGE: need to change type spec to assembly_ref instead of assemby_ref
            'assemby_ref': self.vcf_info['assembly_ref'],
            'vcf_handle_ref': vcf_shock_file_ref['handle']['hid'],
            'vcf_handle' : vcf_shock_file_ref['handle'],
            'vcf_index_handle_ref': vcf_index_shock_file_ref['handle']['hid'],
            'vcf_index_handle': vcf_index_shock_file_ref['handle'],
            'assembly_index_handle_ref': assembly_index_shock_file_ref['handle']['hid'],
            'assembly_index_handle': assembly_index_shock_file_ref['handle']
        }
        if 'genome_ref' in params:
            variation_obj['genome_ref'] =  params['genome_ref']

        return variation_obj

    def _save_var_obj(self, params, var):
        """
        :param params:
        :param var:
        :return:
            DataFileUtils object_info:
                objid - the numerical id of the object.
                name - the name of the object.
                type - the type of the object.
                save_date - the save date of the object.
                ver - the version of the object.
                saved_by - the user that saved or copied the object.
                wsid - the id of the workspace containing the object.
                workspace - the name of the workspace containing the object.
                chsum - the md5 checksum of the object.
                size - the size of the object in bytes.
                meta - arbitrary user-supplied metadata about the object.
        """

        print('Saving Variation to workspace...\n')

        if var:
            if 'variation_object_name' not in params:
                var_obj_name = 'variation_'+str(uuid.uuid4())
            else:
                var_obj_name = params['variation_object_name']

            var_obj_info = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['workspace_name']),
                'objects': [{
                    'type': 'KBaseGwasData.Variations',
                    'data': var,
                    'name': var_obj_name
                }]
            })[0]

            return var_obj_info
        else:
            raise ValueError('Variation object is blank; cannot save to workspace!')

    def _validate_sample_attribute_ref(self, params):

        #params["sample_attribute_ref"] = ''  #just for testing
        if not params['sample_attribute_ref']:
            sample_attribute_mapping_file = os.path.join(self.scratch, "sample_attribute.tsv")  # hardcoded for testing
            self._create_sample_attribute_file(params['vcf_local_file_path'], sample_attribute_mapping_file)

            logging.info("Uploading sample attribute file to ref")
            vcf_sample_attribute_shock_file_ref = self.dfu.file_to_shock(
                {'file_path': sample_attribute_mapping_file, 'make_handle': 1}
            )
            shock_id = vcf_sample_attribute_shock_file_ref['shock_id']
            ws_id = self.dfu.ws_name_to_id(params['workspace_name'])
            import_params = {
                'input_shock_id': shock_id,
                'output_ws_id': ws_id,
                'output_obj_name': 'Sample_attribute'}

            ret = self.gapi.file_to_attribute_mapping(import_params)
            params['sample_attribute_ref'] = ret['attribute_mapping_ref']

    def import_vcf(self, params):
        # VCF file validation
        file_valid_result = self.validate_vcf(params)
        self._validate_sample_attribute_ref(params)
        # VCF file parsing
        self.vcf_info = self._parse_vcf_data(params)
        # Validate vcf chromosome ids against assembly chromosome ids
        self._validate_assembly_ids(params)
        # Validate vcf genotypes against sample meta data ids
        self._validate_sample_ids(params)

        # Variation object construction
        # construct contigs_info
        contigs_info = self._construct_contig_info(params)
        # construct variation
        var = self._construct_variation(params, contigs_info)

        # Save variation object to workspace
        var_wksp_obj = self._save_var_obj(params, var)

        return [var_wksp_obj, var]
Example #8
class kb_ReadSim:
    '''
    Module Name:
    kb_ReadSim

    Module Description:
    A KBase module: kb_ReadSim
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/kb_ReadSim.git"
    GIT_COMMIT_HASH = "c9c0185e34d25be57cc6e1c901d8801fbc0f4784"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.du = DownloadUtils(self.callback_url)
        self.su = SimUtils()
        self.ru = ReadsUtils(self.callback_url)
        self.vu = VariationUtil(self.callback_url)
        self.eu = VcfEvalUtils()
        self.hu = htmlreportutils()
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def run_kb_ReadSim(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "Inparams" -> structure: parameter
           "workspace_name" of String, parameter "input_sample_set" of
           String, parameter "strain_info" of String, parameter
           "assembly_or_genome_ref" of String, parameter "base_error_rate" of
           String, parameter "outer_distance" of String, parameter
           "standard_deviation" of String, parameter "num_read_pairs" of
           String, parameter "len_first_read" of String, parameter
           "len_second_read" of String, parameter "mutation_rate" of String,
           parameter "frac_indels" of String, parameter
           "variation_object_name" of String, parameter "output_read_object"
           of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_ReadSim
        output_dir = self.shared_folder
        print(params)
        self.su.validate_simreads_params(params)

        genome_or_assembly_ref = params['assembly_or_genome_ref']
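        # Resolve the input to an assembly ref: a KBaseGenomes.Genome object
        # stores an assembly_ref field, while an Assembly ref is used directly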
        obj_type = self.wsc.get_object_info3(
            {'objects': [{
                'ref': genome_or_assembly_ref
            }]})['infos'][0][2]
        if ('KBaseGenomes.Genome' in obj_type):
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type +
                             ' is not a valid input for this method. ' +
                             'Valid inputs include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly')

        self.du.download_genome(assembly_ref, output_dir)

        ref_genome = os.path.join(self.shared_folder, "ref_genome.fa")
        output_fwd_paired_file_path = os.path.join(self.shared_folder,
                                                   "read1.fq")
        output_rev_paired_file_path = os.path.join(self.shared_folder,
                                                   "read2.fq")

        self.eu.check_path_exists(ref_genome)

        self.su.simreads(ref_genome, output_fwd_paired_file_path,
                         output_rev_paired_file_path, params)
        self.eu.check_path_exists(output_fwd_paired_file_path)
        self.eu.check_path_exists(output_rev_paired_file_path)

        retVal = self.ru.upload_reads({
            'wsname': params['workspace_name'],
            'name': params['output_read_object'],
            'sequencing_tech': 'illumina',
            'fwd_file': output_fwd_paired_file_path,
            'rev_file': output_rev_paired_file_path
        })

        logfile = os.path.join(self.shared_folder, "variant.txt")
        self.eu.check_path_exists(logfile)

        vcf_file = self.su.format_vcf(logfile)
        self.eu.check_path_exists(vcf_file)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['assembly_or_genome_ref'],
            'sample_set_ref': params['input_sample_set'],
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': vcf_file,
            'variation_object_name': params['variation_object_name']
        }
        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_ReadSim

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_ReadSim return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def run_eval_variantcalling(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "Evalparams" -> structure: parameter
           "workspace_name" of String, parameter "sim_varobject_name" of
           String, parameter "calling_varobject_name" of String, parameter
           "output_var_object" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_eval_variantcalling

        print(params)
        self.eu.validate_eval_params(params)

        report_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(report_dir)

        self.ws = Workspace(url=self.ws_url, token=ctx['token'])

        var_object_ref1 = params['varobject_ref1']
        sampleset_ref1 = self.ws.get_objects2({
            'objects': [{
                "ref": var_object_ref1,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']['sample_set_ref']

        var_object_ref2 = params['varobject_ref2']
        sampleset_ref2 = self.ws.get_objects2({
            'objects': [{
                "ref": var_object_ref2,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']['sample_set_ref']

        if sampleset_ref1 != sampleset_ref2:
            raise Exception(
                "Variation objects are from different sample sets\n")

        assembly_ref_set = set()
        genomeset_ref_set = set()

        variation_obj1 = self.ws.get_objects2(
            {'objects': [{
                'ref': var_object_ref1
            }]})['data'][0]

        if 'assembly_ref' in variation_obj1['data']:
            assembly_ref1 = variation_obj1['data']['assembly_ref']
            assembly_ref_set.add(assembly_ref1)
        elif 'genome_ref' in variation_obj1['data']:
            genome_ref1 = variation_obj1['data']['genome_ref']
            genomeset_ref_set.add(genome_ref1)

        variation_obj2 = self.ws.get_objects2(
            {'objects': [{
                'ref': var_object_ref2
            }]})['data'][0]
        if 'assembly_ref' in variation_obj2['data']:
            assembly_ref2 = variation_obj2['data']['assembly_ref']
            assembly_ref_set.add(assembly_ref2)
        elif 'genome_ref' in variation_obj2['data']:
            genome_ref2 = variation_obj2['data']['genome_ref']
            genomeset_ref_set.add(genome_ref2)

        assembly_or_genome_ref = None

        if not genomeset_ref_set and len(assembly_ref_set) != 1:
            raise Exception(
                "Variation objects are from different assembly refs")
        elif not assembly_ref_set and len(genomeset_ref_set) != 1:
            raise Exception("Variation objects are from different genome refs")

        simvarfile = os.path.join(report_dir, "simvariant.vcf.gz")
        simvarpath = self.du.download_variations(var_object_ref1, simvarfile)

        os.rename(simvarpath, simvarfile)
        self.eu.index_vcf(simvarfile)

        callingvarfile = os.path.join(report_dir, "callingvariant.vcf.gz")
        callingvarpath = self.du.download_variations(var_object_ref2,
                                                     callingvarfile)

        os.rename(callingvarpath, callingvarfile)
        self.eu.index_vcf(callingvarfile)

        eval_results = self.eu.variant_evalation(simvarfile, callingvarfile,
                                                 report_dir)

        unique_vcf1 = eval_results['unique1']
        self.eu.check_path_exists(unique_vcf1)

        unique_vcf2 = eval_results['unique2']
        self.eu.check_path_exists(unique_vcf2)

        common_vcf = eval_results['common']
        self.eu.check_path_exists(common_vcf)

        image_path = self.eu.plot_venn_diagram(report_dir, unique_vcf1,
                                               unique_vcf2, common_vcf)
        self.eu.check_path_exists(image_path)
        '''
        if(len(assembly_ref_set) != 0):
            assembly_or_genome_ref = assembly_ref_set.pop()
        elif(len(genomeset_ref_set) != 0):
            assembly_or_genome_ref = genomeset_ref_set.pop()

        logging.info("Saving Unique1 vcf\n")
        save_unique_variation_params1 = {'workspace_name': params['workspace_name'],
                                        'genome_or_assembly_ref': assembly_or_genome_ref,
                                        'sample_set_ref': sampleset_ref1,
                                        'sample_attribute_name': 'sample_unique_attr1',
                                        'vcf_staging_file_path': unique_vcf1,
                                        'variation_object_name': params['output_variant_object'] + "_sample1_unique"
        }
        self.vu.save_variation_from_vcf(save_unique_variation_params1)
        logging.info("Saving done\n")

        logging.info("Saving Unique2 vcf\n")
        save_unique_variation_params2 = {'workspace_name': params['workspace_name'],
                                        'genome_or_assembly_ref': assembly_or_genome_ref,
                                        'sample_set_ref': sampleset_ref1,
                                        'sample_attribute_name': 'sample_unique_attr2',
                                        'vcf_staging_file_path': unique_vcf2,
                                        'variation_object_name': params['output_variant_object'] + "_sample2_unique"
        }
        self.vu.save_variation_from_vcf(save_unique_variation_params2)
        logging.info("Saving done\n")

        logging.info("Saving Common vcf\n")
        save_common_variation_params = {'workspace_name': params['workspace_name'],
                                 'genome_or_assembly_ref': assembly_or_genome_ref,
                                 'sample_set_ref': sampleset_ref1,
                                 'sample_attribute_name': 'sample_common_attr',
                                 'vcf_staging_file_path': common_vcf,
                                 'variation_object_name': params['output_variant_object'] + "_sample1_sample2_common"
        }
        self.vu.save_variation_from_vcf(save_common_variation_params)
        logging.info("Saving done\n")
        '''

        workspace = params['workspace_name']
        output = self.hu.create_html_report(self.callback_url, report_dir,
                                            workspace)
        #END run_eval_variantcalling

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_eval_variantcalling return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Example #9
class VCFToVariation:
    def __init__(self, config):
        self.scratch = config['scratch']
        ws_url = config['ws_url']
        callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(callback_url)
        self.wsc = Workspace(ws_url)
        self.au = AssemblyUtil(callback_url)
        self.vcf_info = dict()

    def _parse_header(self, record, category):
        """
        Parses VCF header lines such as the following and extracts details
        for IDs like DP and q10; this information is useful for filtering.
        ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
        ##FILTER=<ID=q10,Description="Quality below 10">
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
        Each header line is parsed into a dict; the caller collects these
        dicts into a list of header entries.
        """
        returninfo = {"Category": category}

        # remove all comma within quotes
        record = re.sub(r'(?!(([^"]*"){2})*[^"]*$),', '', record)
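        # e.g. (hypothetical) 'Description="Total Depth, combined"' becomes
        # 'Description="Total Depth combined"', so the later split(",") is safe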
        record = record.rstrip()
        # Remove last > character
        record = record[:-1]
        info = re.sub(".*=<", "", record)
        infolist = info.replace('"', '').rstrip().split(",")
        for fields in infolist:
            data = fields.split("=")
            key = data.pop(0)
            val = "=".join(data)
            val = val.replace("\"", "")
            returninfo[key] = val
        return returninfo
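
    # A minimal sketch of what _parse_header returns (hypothetical record):
    #   _parse_header('##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">', 'INFO')
    #   -> {'Category': 'INFO', 'ID': 'DP', 'Number': '1',
    #       'Type': 'Integer', 'Description': 'Total Depth'}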

    def parse_vcf_data(self, vcf_filepath):
        """
        parses vcf file including headers and prepares
        information that will be uploaded to KBase workspace
        :param vcf_filepath:
        :return:
        """
        reader = gzip.open(vcf_filepath, "rt")
        version = ""
        genotypes = ""
        counter = 0
        chromosomes = list()
        contigs = {}
        header = list()
        totalvars = 0

        for record in reader:

            # Handle header lines and parse information
            if record[0] == "#":
                if record.startswith("##fileformat"):
                    version = record.replace("##fileformat=", "").rstrip()
                if record.startswith("##INFO=<"):
                    info = self._parse_header(record, "INFO")
                    header.append(info)
                if record.startswith("##FORMAT=<"):
                    info = self._parse_header(record, "FORMAT")
                    header.append(info)
                if record.startswith("##FILTER=<"):
                    info = self._parse_header(record, "FILTER")
                    header.append(info)
                if (record.startswith("#CHROM")):
                    # This is the chrome line
                    record = record.rstrip()
                    values = record.split("\t")
                    genotypes = values[9:]
                continue

            # Handle the actual VCF content and parse information
            counter = counter + 1

            CHROM, *_ = record.split("\t")
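            # star-unpacking keeps only the first tab-delimited column (the contig id)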

            totalvars += 1
            if CHROM not in chromosomes:
                chromosomes.append(CHROM)
                contigs[CHROM] = {'contig_id': CHROM, 'totalvariants': 1}
            else:
                contigs[CHROM]['totalvariants'] += 1

        reader.close()
        variation_details = VCFReaderStream(vcf_filepath)
        vcf_info = {
            'version': version,
            'contigs': contigs,
            'total_variants': totalvars,
            'genotype_ids': genotypes,
            'chromosome_ids': chromosomes,
            'variation_details': variation_details,
            'file_ref': vcf_filepath,
            'header': header
        }
        return vcf_info

    def _validate_vcf_to_sample(self, vcf_genotypes, sample_ids):
        genos_not_found = []
        vgenotypes = [x.upper().strip() for x in vcf_genotypes]
        sids = [x.upper().strip() for x in sample_ids]

        for geno in vgenotypes:
            if geno not in sids:
                genos_not_found.append(geno)

        if not genos_not_found:
            return True
        else:
            return genos_not_found

    def _chk_if_vcf_ids_in_assembly(self, vcf_chromosomes,
                                    assembly_chromosomes):
        """
        Check if all chromosome ids in vcf are also present in assembly
        :param vcf_chromosomes:
        :param assembly_chromosomes:
        :return: returns list of chromosome ids,
                present in vcf, but absent from assembly
        """
        chromos_not_in_assembly = []

        for chromo in vcf_chromosomes:
            if chromo not in assembly_chromosomes:
                chromos_not_in_assembly.append(chromo)

        if not chromos_not_in_assembly:
            return True
        else:
            return chromos_not_in_assembly

    def _validate_assembly_ids(self, vcf_info):
        """
        All chromosome ids from the vcf should be in assembly
        but not all assembly chromosome ids need to be in vcf
        :param params:
        :return: list of all assembly chromosome ids
        """
        assembly_chromosome_ids_call = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref':
            vcf_info['assembly_ref']
        }])
        assembly_chromosomes = assembly_chromosome_ids_call[0]['data'][
            'contigs'].keys()
        vcf_chromosomes = vcf_info['chromosome_ids']
        chk_assembly_ids = self._chk_if_vcf_ids_in_assembly(
            vcf_chromosomes, assembly_chromosomes)

        if isinstance(chk_assembly_ids, list):
            failed_ids = ' '.join(chk_assembly_ids)
            print(f'VCF contig ids: {failed_ids} are not present in assembly.')
            raise ValueError(
                f'VCF contig ids: {failed_ids} are not present in assembly.')

        return assembly_chromosomes

    def _construct_contig_info(self, vcf_info):
        """
           From KBaseGwasData.Variations type spec
            /*
               Contig variation data
                 contig_id - contig identifier
                 totalvariants - total number of variants in each contig
                 length - length of contig from assembly data
             */
             typedef structure {
               string contig_id;
               int totalvariants;
               int length; // from assembly
             } contig_info;

        """
        assembly_chromosome_dict = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref':
            vcf_info['assembly_ref']
        }])[0]['data']['contigs']

        contigs = []

        contig_infos = vcf_info['contigs']

        for contig_id in contig_infos:
            length_contig = assembly_chromosome_dict[contig_id].get("length")
            contig_infos[contig_id]["length"] = length_contig
            contigs.append(contig_infos[contig_id])

        return contigs

    def _construct_variation_object_json(self, vcf_info):
        """
            KBaseGwasData.Variations type spec
             /*
               Variation object data structure
                 num_genotypes - number of total genotypes within variant file
                 num_variants - number of total variants within variant file
                 contigs - list of contig ids and variant information
                 attribute_ref - KBase reference to attribute mapping workspace object
                 genome_ref - KBase reference to genome workspace object
                 assembly_ref - KBase reference to assembly workspace object
                 vcf_handle_ref - VCF handle reference to VCF file

                 @optional genome_ref
             */
             typedef structure {
               int numgenotypes;
               int numvariants;
               list<contig_info> contigs;
               attribute_ref samples; // KBaseExperiments.AttributeMapping
               genome_ref genome_ref; // KBaseGenomes.Genome
               assembly_ref assembly_ref; // KBaseGenomeAnnotations.Assembly
               vcf_handle_ref vcf_handle_ref;
             } Variations;

            :param params: KBase ui input parameters
            :return: constructed variation object (dictionary)
        """
        logging.info("Uploading VCF file to shock")
        vcf_shock_file_ref = None
        vcf_index_shock_file_ref = None
        if os.path.exists(vcf_info['vcf_compressed']):
            vcf_shock_file_ref = self.dfu.file_to_shock({
                'file_path':
                vcf_info['vcf_compressed'],
                'make_handle':
                1
            })
        # compare_md5_local_with_shock(bgzip_file_path, vcf_shock_file_ref)

        logging.info("Uploading VCF index file to shock")
        if os.path.exists(vcf_info['vcf_index']):
            vcf_index_shock_file_ref = self.dfu.file_to_shock({
                'file_path':
                vcf_info['vcf_index'],
                'make_handle':
                1
            })
        # compare_md5_local_with_shock(index_file_path, vcf_index_shock_file_ref)

        # TODO: remove any reference to samples in this file
        variation_obj_data = {
            'numgenotypes': int(len(vcf_info['genotype_ids'])),
            'numvariants': int(vcf_info['total_variants']),
            'contigs': vcf_info['contigs_info'],
            "header": vcf_info['header'],
            "variation_details": vcf_info['variation_details'],
            'assembly_ref': vcf_info['assembly_ref'],
            'vcf_handle_ref': vcf_shock_file_ref['handle']['hid'],
            'vcf_handle': vcf_shock_file_ref['handle'],
            'vcf_index_handle_ref': vcf_index_shock_file_ref['handle']['hid'],
            'vcf_index_handle': vcf_index_shock_file_ref['handle'],
        }
        if 'genome_ref' in vcf_info:
            variation_obj_data['genome_ref'] = vcf_info['genome_ref']
        if 'sample_attribute_ref' in vcf_info:
            variation_obj_data['sample_attribute_ref'] = vcf_info[
                'sample_attribute_ref']
        return variation_obj_data

    def generate_variation_object_data(self, params):
        # VCF validation
        # VCF file parsing

        # Copy vcf_compressed, vcf_index,

        vcf_info = self.parse_vcf_data(params['vcf_compressed'])
        vcf_info['vcf_compressed'] = params['vcf_compressed']
        vcf_info['vcf_index'] = params['vcf_index']

        assembly_ref = params['assembly_ref']
        if 'genome_ref' in params:
            genome_ref = params['genome_ref']

        logging.info("Parsing vcf started")
        vcf_info['assembly_ref'] = assembly_ref
        if 'genome_ref' in params:
            vcf_info['genome_ref'] = genome_ref

        logging.info("Comparing assembly ids")
        # Validate vcf chromosome ids against assembly chromosome ids
        result = self._validate_assembly_ids(vcf_info)
        # Variation object construction
        # construct contigs_info
        if result:
            logging.info("Creating contig info")
            vcf_info['contigs_info'] = self._construct_contig_info(vcf_info)

        # construct variation
        variation_object_json = self._construct_variation_object_json(vcf_info)

        return variation_object_json
Example #10
class kb_GATK:
    '''
    Module Name:
    kb_GATK

    Module Description:
    A KBase module: kb_GATK
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/kb_GATK.git"
    GIT_COMMIT_HASH = "5e6e4bdca9a7749bba0abab081736c56007212ed"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.gu = GATKUtils()
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.vu = VariationUtil(self.callback_url)
        self.du = DownloadAlignmentUtils(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def run_kb_GATK(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_GATK
        source_ref = params['alignment_ref']
        alignment_out = self.du.downloadreadalignment(source_ref, params,
                                                      self.callback_url)
        sam_file = os.path.join(alignment_out['destination_dir'],
                                "reads_alignment.sam")
        '''
        #Todo Reading sample set and sample strains information
        '''
        '''
        command.extend(["-filter-name", "\"QD_filter\"", "-filter", "\"QD", "<", params['snp_filter']['snp_qd_filter'] + "\""])
        command.extend(["-filter-name", "\"FS_filter\"", "-filter", "\"FS", "<", params['snp_filter']['snp_fs_filter'] + "\""])
        command.extend(["-filter-name", "\"MQ_filter\"", "-filter", "\"MQ", "<", params['snp_filter']['snp_mq_filter'] + "\""])
        command.extend(["-filter-name", "\"SOR_filter\"", "-filter", "\"SOR", "<", params['snp_filter']['snp_sor_filter'] + "\""])
        command.extend(["-filter-name", "\"MQRankSum_filter\"", "-filter", "\"MQRankSum", "<", params['snp_filter']['snp_mqrankSum_filter'] + "\""])
        command.extend(["-filter-name", "\"ReadPosRankSum_filter\"", "-filter", "\"ReadPosRankSum", "<", params['snp_filter']['snp_readposranksum_filter'] + "\""])
        '''
        print(params)
        strain_info = params['strain_info']
        output_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(output_dir)

        genome_or_assembly_ref = params['assembly_or_genome_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{
                'ref': genome_or_assembly_ref
            }]})['infos'][0][2]
        if ('KBaseGenomes.Genome' in obj_type):
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type +
                             ' is not a valid input for this method. ' +
                             'Valid inputs include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly')

        assembly_file = self.du.download_genome(assembly_ref,
                                                output_dir)['path']

        #output_dir = output_dir + "/"

        #Todo: check time for building the index file, or download it from cache.
        #Todo: To discuss which cache_id should be used.
        #Todo: In case of a copied genome, find a way to locate the original genome (ref id) to get the original cache id.

        self.gu.build_genome(assembly_file)
        self.gu.index_assembly(assembly_file)
        self.gu.generate_sequence_dictionary(assembly_file)
        self.gu.duplicate_marking(output_dir, sam_file)
        #self.gu.sort_bam_index(output_dir)
        self.gu.collect_alignment_and_insert_size_metrics(
            assembly_file, output_dir)
        #self.gu.analyze_covariates(output_dir)

        #Todo: avoid writing intermediate files to save space and I/O time.
        self.gu.variant_calling(assembly_file, output_dir)
        self.gu.extract_variants(assembly_file, output_dir)
        self.gu.filter_SNPs(assembly_file, "filtered_snps.vcf", output_dir,
                            params)
        self.gu.filter_Indels(assembly_file, "filtered_indels.vcf", output_dir,
                              params)
        self.gu.exclude_filtered_variants(output_dir)
        self.gu.base_quality_score_recalibration(assembly_file,
                                                 "recal_data.table",
                                                 output_dir)
        self.gu.apply_BQSR(assembly_file, "recal_data.table", output_dir)
        self.gu.base_quality_score_recalibration(assembly_file,
                                                 "post_recal_data.table",
                                                 output_dir)
        self.gu.apply_BQSR(assembly_file, "post_recal_data.table", output_dir)
        self.gu.filter_SNPs(assembly_file, "filtered_snps_final.vcf",
                            output_dir, params)

        #Todo: Also save indels using VariationUtils, or merge them with SNPs, sort by chr & pos, and save via VariationUtils.
        #Todo: Get an example of saving structural variants (especially CNVs) and compare with standard VCF output.

        self.gu.filter_Indels(assembly_file, "filtered_indels_final.vcf",
                              output_dir, params)
        '''
        os.system("grep   '##fileformat' " + output_dir + "/filtered_snps_final.vcf > " + output_dir + "/sample.vcf")
        cmd = "grep -v  '##' " + output_dir + "/filtered_snps_final.vcf >> " + output_dir + "/sample.vcf"
        os.system(cmd)            # TODO : need to remove system command after fixing variationUtils.
        '''

        vcf_filepath = self.gu.index_vcf_file(output_dir +
                                              "/filtered_snps_final.vcf")
        reheader_vcf_file = self.gu.reheader(vcf_filepath, strain_info)
        #Todo: check existence of the final filtered SNPs file.
        #Todo: change assembly_or_genome_ref to genome_or_assembly_ref

        #Todo: derive sample_attribute_name from the sample set ref by prefixing/suffixing. The attribute mapping should have one sample.

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['assembly_or_genome_ref'],
            'sample_set_ref': params['input_sample_set'],
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': reheader_vcf_file,
            'variation_object_name': params['variation_object_name']
        }

        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_GATK

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_GATK return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]