def create_variation_report(self, params):
    '''
    Write a small HTML summary table for a variation object.

    The table has two rows: the number of strains/genotypes and the number
    of variants, both read from the workspace object referenced by
    ``params['variation_ref']``. The page is written as ``index.html``
    inside a freshly created, uniquely named folder under ``self.scratch``.

    :param params: dict with key ``variation_ref`` (workspace object reference)
    :returns: path of the directory that contains ``index.html``
    '''
    workspace_client = Workspace(self.ws_url)
    query = [{
        'included': ['/numgenotypes', 'numvariants'],
        'ref': params['variation_ref']
    }]
    summary = workspace_client.get_object_subset(query)[0]['data']
    genotype_count = summary['numgenotypes']
    variant_count = summary['numvariants']

    # Adjacent literals concatenate to exactly the original template text.
    page = (
        " <table> <thead> <tr> "
        "<td>Number of strains/genotypes</td> "
        "<td> ##numgenotypes##</td> "
        "</tr> </thead> <tbody> <tr> "
        "<td>Number of variants</td> "
        "<td>##numvariants##</td> "
        "</tr> </tbody> </table> "
    )
    for placeholder, value in (("##numgenotypes##", genotype_count),
                               ("##numvariants##", variant_count)):
        page = page.replace(placeholder, str(value))

    # One directory per call so concurrent reports never collide.
    report_dir = os.path.join(self.scratch, str(uuid.uuid4()))
    os.mkdir(report_dir)

    with open(os.path.join(report_dir, "index.html"), "w") as handle:
        handle.write(page)

    return report_dir
class GFFUtils2:
    """
    Annotate GWAS association results with genes from a KBase genome.

    A tabix-indexed GFF of the genome's gene features is built once per
    genome (cached in a per-genome folder under the scratch directory),
    each significant variant is looked up in it, and the resulting gene
    list is saved as a ``KBaseCollections.FeatureSet``.
    """

    # Column names appended to the GWAS table by find_gene_info.
    _GENE_COLUMNS = ['GENEID', 'NEIGHBORGENE', 'FUNCTION']

    def __init__(self, config):
        """
        :param config: dict with keys ``callback_url``, ``scratch`` and
            ``workspace-url``
        """
        self.callback_url = config['callback_url']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']

        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)

    def _prep_gff(self, gff_file):
        """
        Sort a GFF file (header lines first) and bgzip-compress it.

        Relies on ``self.genome_dir`` having been set by
        ``annotate_GWAS_results``.

        :param gff_file: path of the unsorted GFF file
        :returns: path of the compressed file (``<genome_dir>/out.gff.gz``)
        """
        outfile = os.path.join(self.genome_dir, 'out.gff')
        sortcmd = f'(grep ^"#" {gff_file}; grep -v ^"#" {gff_file} | sort -k1,1 -k4,4n)'

        with open(outfile, 'w') as o:
            subprocess.Popen(sortcmd, shell=True, stdout=o).communicate()

        # -f: overwrite a stale out.gff.gz instead of silently keeping it
        # (without a terminal bgzip refuses to overwrite).
        subprocess.Popen(['bgzip', '-f', 'out.gff'],
                         cwd=self.genome_dir).communicate()

        return outfile + '.gz'

    @staticmethod
    def _contig_offset(contig_base_lengths, contig_id):
        """
        Return the cumulative base offset of a contig.

        Several spellings of the contig name are tried because the contig
        ids of the features and of the assembly may differ by a ``Chr`` /
        ``Chr0`` prefix or by capitalisation.

        :raises KeyError: if no spelling of ``contig_id`` is known
        """
        candidates = (contig_id, contig_id.capitalize(),
                      'Chr' + contig_id, 'Chr0' + contig_id)
        for name in candidates:
            if name in contig_base_lengths:
                return int(contig_base_lengths[name])
        raise KeyError(
            'contig {!r} not found in assembly contigs (tried {})'.format(
                contig_id, ', '.join(candidates)))

    def _construct_gff_from_json(self, json, gff_file_path, contig_base_lengths):
        """
        Write the gene features of a genome as tab-separated GFF-like lines.

        Only features whose ``feature_type`` is ``gene`` (case-insensitive)
        are written. The eighth column carries the genome-wide position
        (contig offset + start) and the ninth column ``ID=...`` plus an
        optional ``;FUNCTION=...``.

        :param json: iterable of feature dicts (as returned by the genome
            search service)
        :param gff_file_path: output path
        :param contig_base_lengths: dict contig id -> cumulative offset
        :returns: ``gff_file_path``
        :raises KeyError: if a feature's contig is not in
            ``contig_base_lengths``
        :raises FileNotFoundError: if the file does not exist after writing
        """
        with open(gff_file_path, 'w') as f:
            for feature in json:
                if feature['feature_type'].strip().upper() != 'GENE':
                    continue

                location = feature['location'][0]
                contig_id = str(location['contig_id'])
                start = int(location['start'])
                end = start + int(location['length'])

                metainfo = "ID=" + feature['feature_id']
                if feature.get('function'):
                    metainfo += ';FUNCTION=' + feature['function']

                # TODO: Fix Plink reassignment of Chr prefixes
                global_pos = self._contig_offset(contig_base_lengths,
                                                 contig_id) + start

                columns = [contig_id, 'KBase', 'gene',
                           str(location['start']), str(end), '.',
                           str(location['strand']), str(global_pos),
                           str(metainfo)]
                f.write('\t'.join(columns) + '\n')

        if os.path.exists(gff_file_path):
            return gff_file_path
        raise FileNotFoundError('Unable to create GFF file form genome JSON.')

    def _process_tabix_results(self, queryresult):
        """
        Extract ``[gene id, "NA", function]`` from one tabix result row.

        Column 9 of the row is expected to look like
        ``ID=<id>[;FUNCTION=<text>]``; the ``ID=`` and ``FUNCTION=``
        prefixes are stripped by position.
        """
        queryinfo = queryresult[8].split(';')
        gene_id = clean_tsv_data(queryinfo[0][3:])
        # str.split always yields at least one element, so only the
        # function part can be missing.
        function = clean_tsv_data(queryinfo[1][9:]) if len(queryinfo) >= 2 else "NA"
        return [gene_id, "NA", function]

    def find_gene_info(self, row):
        """
        Look up the gene overlapping (or near) one GWAS result row.

        The exact position is queried under the contig name as given and
        with ``chr`` / ``chr0`` prefixes. If nothing overlaps, the first
        gene within 500 bp is reported as NEIGHBORGENE instead.

        :param row: mapping with ``CHR`` and ``POS`` entries
        :returns: ``pandas.Series`` indexed GENEID, NEIGHBORGENE, FUNCTION
        """
        chrom = row["CHR"]
        pos = int(row["POS"])

        # str(): pandas parses purely numeric contig names as integers.
        for name in (chrom, 'chr' + str(chrom), 'chr0' + str(chrom)):
            hit = next(tabix_query(self.sorted_gff, name, pos, pos), None)
            if hit is not None:
                return pd.Series(self._process_tabix_results(hit),
                                 index=self._GENE_COLUMNS)

        nstart = max(pos - 500, 0)
        neighbour = next(
            tabix_query(self.sorted_gff, chrom, nstart, pos + 500), None)
        if neighbour is None:
            return pd.Series(['NA', 'NA', 'NA'], index=self._GENE_COLUMNS)

        nq = self._process_tabix_results(neighbour)
        # The hit is a neighbour, so its id goes into NEIGHBORGENE.
        return pd.Series([nq[1], nq[0], nq[2]], index=self._GENE_COLUMNS)

    def get_gwas_result_file(self, association_ref, association_name, p_value):
        """
        Dump the association results at or below ``p_value`` to a TSV file.

        :returns: path of the written file
            (``<genome_dir>/<association_name>``), with header
            ``CHR SNP POS P BP``
        """
        association_obj = self.dfu.get_objects(
            {'object_refs': [association_ref]})['data'][0]
        association_results = \
            association_obj['data']["association_details"][0]["association_results"]

        threshold = float(p_value)
        lines = ["CHR\tSNP\tPOS\tP\tBP\n"]
        for variation in association_results:
            if float(variation[3]) > threshold:
                continue
            # BP repeats the position column.
            lines.append(
                "\t".join(str(variation[i]) for i in (0, 1, 2, 3, 2)) + "\n")

        filepath = os.path.join(self.genome_dir, association_name)
        with open(filepath, "w") as file1:
            file1.writelines(lines)

        return filepath

    def build_featureset(self, filepath, genome_ref, description,
                         workspace_name, association_name, prefix):
        """
        Save the genes named in an annotated GWAS table as a FeatureSet.

        Columns 6 and 7 (GENEID, NEIGHBORGENE) of every line are collected
        in first-seen order; header names and ``NA`` are skipped.

        :returns: reference ``ws/obj/version`` of the saved FeatureSet
        """
        element_ordering = list()
        elements = dict()
        skip_words = ["GENEID", "NEIGHBORGENE", "NA"]

        with open(filepath, 'r') as reader:
            for line in reader:
                fields = line.split("\t")
                for gene in (fields[5], fields[6]):
                    # Membership is checked per id so one row cannot add
                    # the same gene twice.
                    if gene not in skip_words and gene not in elements:
                        element_ordering.append(gene)
                        elements[gene] = [genome_ref]

        featureset = {
            'description': description,
            'element_ordering': element_ordering,
            'elements': elements,
        }

        ws_id = self.dfu.ws_name_to_id(workspace_name)
        featureset_obj_name = prefix + str(association_name)

        save_info = self.dfu.save_objects({
            'id': ws_id,
            'objects': [{
                'type': 'KBaseCollections.FeatureSet',
                'data': featureset,
                'name': featureset_obj_name
            }]
        })[0]

        return "{0}/{1}/{2}".format(save_info[6], save_info[0], save_info[4])

    def _build_sorted_gff(self, genome_ref):
        """Build, sort, compress and index the gene GFF for ``genome_ref``."""
        feature_num = self.gsu.search({'ref': genome_ref})['num_found']

        # get genome features for gff construction
        genome_features = self.gsu.search({
            'ref': genome_ref,
            'limit': feature_num,
        })['features']

        assembly_ref = self.wsc.get_object_subset([{
            'included': ['/assembly_ref'],
            'ref': genome_ref
        }])[0]['data']['assembly_ref']

        # get assembly contigs for base length calculations
        assembly_contigs = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': assembly_ref
        }])[0]['data']['contigs']

        # Offset of a contig = total length of all contigs sorted before it.
        contig_base_lengths = {}
        prev_length = 0
        for contig in sorted(assembly_contigs):
            contig_base_lengths[contig] = prev_length
            prev_length += assembly_contigs[contig]['length']

        gff_file = os.path.join(self.genome_dir, 'constructed.gff')
        constructed_gff = self._construct_gff_from_json(
            genome_features, gff_file, contig_base_lengths)
        self.sorted_gff = self._prep_gff(constructed_gff)
        tabix_index(self.sorted_gff)

    def annotate_GWAS_results(self, genome_ref, association_ref,
                              workspace_name, prefix, p_value):
        """
        Annotate an association object's results and save the gene list.

        :param genome_ref: reference of the genome used for annotation
        :param association_ref: reference of the association object
        :param workspace_name: workspace that receives the FeatureSet
        :param prefix: prefix for the FeatureSet object name
        :param p_value: only variants with P <= p_value are annotated
        :returns: reference of the saved FeatureSet
        """
        # TODO: Send outfile to prep gff function instead of hard coding it
        # TODO: Remove hard coded stuff and create new directory for each
        #       test function
        self.genome_dir_name = "_".join(genome_ref.split("/"))
        self.genome_dir = os.path.join(self.shared_folder, self.genome_dir_name)
        os.makedirs(self.genome_dir, exist_ok=True)

        sorted_gff_path = os.path.join(self.genome_dir, 'out.gff.gz')
        self.sorted_gff = sorted_gff_path

        # The compressed GFF is reused across calls for the same genome.
        if not os.path.exists(sorted_gff_path):
            self._build_sorted_gff(genome_ref)

        obj_info = self.wsc.get_object_info3(
            {"objects": [{"ref": association_ref}]})
        association_name = obj_info["infos"][0][1]

        gwas_results_file = self.get_gwas_result_file(
            association_ref, association_name, p_value)

        gwas_results = pd.read_csv(gwas_results_file, sep='\t')
        gwas_results[self._GENE_COLUMNS] = \
            gwas_results.apply(self.find_gene_info, axis=1)

        # The annotated table is written next to the raw results file.
        new_results_path = os.path.join(
            os.path.dirname(os.path.abspath(gwas_results_file)),
            'final_' + association_name)
        gwas_results.to_csv(path_or_buf=new_results_path, sep='\t', index=False)

        description = "Genelist for GWAS results of trait " + association_name

        return self.build_featureset(
            new_results_path, genome_ref, description, workspace_name,
            association_name, prefix)
class JbrowseUtil:
    """
    Build a JBrowse instance (tracks, reference sequences, data folder)
    for a variation object and upload the supporting files to shock.
    """

    def __init__(self, Config):
        """
        :param Config: dict with keys ``ws_url``, ``sw_url`` (service
            wizard), ``shock_url`` and ``scratch``. The callback URL is
            read from the ``SDK_CALLBACK_URL`` environment variable.
        """
        callback_url = os.environ['SDK_CALLBACK_URL']
        ws_url = Config['ws_url']
        self.wsc = Workspace(ws_url)
        self.dfu = DataFileUtil(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        # service-wizard url
        self.sw_url = Config['sw_url']
        self.shock_url = Config['shock_url']
        scratch = Config['scratch']
        # Every instance works in its own freshly created folder.
        session = str(uuid.uuid4())
        self.session_dir = os.path.join(scratch, session)
        os.mkdir(self.session_dir)

    def get_variation_service_url(self, sw_url):
        '''
        Get the most recent VariationFileServ url from the service wizard.

        Note: ``self.shock_url`` is rewritten without its ``https://``
        prefix as a side effect.

        :param sw_url: service wizard url
        :returns: base url under which shock nodes are served for JBrowse
        '''
        # TODO Fix the following dev thing to beta or release or future
        json_obj = {
            "method": "ServiceWizard.get_service_status",
            "id": "",
            "params": [{"module_name": "VariationFileServ", "version": "dev"}]
        }
        sw_resp = requests.post(url=sw_url, data=json.dumps(json_obj))
        vfs_resp = sw_resp.json()
        self.shock_url = self.shock_url.replace("https://", "")
        return (vfs_resp['result'][0]['url'] + "/jbrowse_query/" +
                self.shock_url + "/node")

    def _run_cmd(self, cmd):
        """
        Run a shell command and log its standard output.

        Failures are logged, never raised. The child's stderr is not
        captured, so it stays visible on the process's own stderr.

        :param cmd: command line, executed through the shell
        """
        try:
            process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
            stdout, _ = process.communicate()
            if stdout:
                logging.info("ret> %s", process.returncode)
                logging.info("OK> output %s", stdout)
            if process.returncode != 0:
                logging.warning("Command exited with status %s: %s",
                                process.returncode, cmd)
        except OSError as e:
            logging.info("OSError > %s", e.errno)
            logging.info("OSError > %s", e.strerror)
            logging.info("OSError > %s", e.filename)

    def create_refseqs_data_from_assembly(self, assembly_ref):
        '''
        Build the content of JBrowse's ``refSeqs.json`` from an assembly.

        :param assembly_ref: workspace reference of the assembly
        :return: list with one dict per contig
            (``end``, ``length``, ``name``, ``seqChunkSize``, ``start``)
        '''
        # Download assembly contig info and parse contig length information
        data = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': assembly_ref
        }])[0]['data']

        return [
            {
                "end": contig["length"],
                "length": contig["length"],
                "name": contig["contig_id"],
                "seqChunkSize": 20000,
                "start": 0
            }
            for contig in data['contigs'].values()
        ]

    def prepare_genome_features_track(self, genome_ref, vfs_url):
        """
        Build the genome features (GFF3 + tabix) track.

        :param genome_ref: workspace reference of the genome
        :param vfs_url: base url serving shock nodes
            (see get_variation_service_url)
        :return: dict with ``shock_handle_list`` (gff and index handles)
            and ``track_item`` (JBrowse track configuration)
        :raises FileNotFoundError: if sorting, compression or indexing did
            not produce the expected files
        """
        # 1) Download gff using genomefileutil
        gff_file_info = self.gfu.genome_to_gff({'genome_ref': genome_ref})
        gff_file = gff_file_info["file_path"]

        # 2) sort gff
        outfile = gff_file + "_sorted"
        self._run_cmd(" ".join(["sort -k1,1 -k4,4n", gff_file, ">", outfile]))

        # 3) compress gff
        self._run_cmd("bgzip " + outfile)

        # 4) index gff
        gff_gz_file_path = gff_file + "_sorted.gz"
        gff_index_file_path = gff_file + "_sorted.gz.tbi"
        self._run_cmd("tabix -p gff " + gff_gz_file_path)

        # 5) Upload gff and gff index to shock
        # _run_cmd only logs failures, so check the results here instead of
        # failing later on an unbound variable.
        for required in (gff_gz_file_path, gff_index_file_path):
            if not os.path.exists(required):
                raise FileNotFoundError(
                    "Expected file was not created: " + required)
        gff_shock_ref = self.dfu.file_to_shock(
            {'file_path': gff_gz_file_path, 'make_handle': 1})
        gff_index_shock_ref = self.dfu.file_to_shock(
            {'file_path': gff_index_file_path, 'make_handle': 1})

        # 6) Create the genome features track description
        gff_track_dict = {
            "label": "Genome Features",
            "key": "GenomeFeatures",
            "storeClass": "JBrowse/Store/SeqFeature/GFF3Tabix",
            "urlTemplate": vfs_url + "/" + gff_shock_ref['handle']['id'],
            "tbiUrlTemplate": vfs_url + "/" + gff_index_shock_ref['handle']['id'],
            "type": "JBrowse/View/Track/CanvasFeatures"
        }

        # 7) Capture shock handles
        shock_handles = [gff_shock_ref['handle'], gff_index_shock_ref['handle']]

        # 8) return shock handles and gff track info
        return {"shock_handle_list": shock_handles, "track_item": gff_track_dict}

    def prepare_snp_frequency_track(self, vcf_filepath, assembly_ref, binsize, vfs_url):
        """
        Build the variant density (BigWig) track.

        Variants are counted per ``binsize``-wide bin, written as bedgraph,
        converted with bedGraphToBigWig and uploaded to shock.

        :param vcf_filepath: path of the gzip/bgzip compressed VCF
        :param assembly_ref: workspace reference of the assembly
        :param binsize: bin width in bases
        :param vfs_url: base url serving shock nodes
        :return: dict with ``shock_handle_list`` and ``track_item``
        :raises FileNotFoundError: if the BigWig file was not produced
        """
        BEDGRAPHTOBIGWIG = "/kb/deployment/bin/bedGraphToBigWig"

        # 1) Download assembly contig info and parse contig lengths
        data = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': assembly_ref
        }])[0]['data']

        chr_length_dict = {}
        chr_length_lines = []
        for contig_data in data["contigs"].values():
            c_id = str(contig_data['contig_id'])
            c_length = str(contig_data['length'])
            chr_length_lines.append(c_id + '\t' + c_length + '\n')
            chr_length_dict[c_id] = c_length

        # 2) Write contig lengths to a file (needed by bedGraphToBigWig)
        chr_length_path = os.path.join(self.session_dir, "chr_length.txt")
        with open(chr_length_path, "w") as f:
            f.writelines(chr_length_lines)

        # 3) Read the vcf file and count the SNPs in each bin
        logging.info("Generating bedgraph file\n")
        counts = Counter()
        with gzip.open(vcf_filepath, "rt") as reader:
            for record in reader:
                if record.startswith("#"):
                    continue
                rs = record.split("\t")
                counts[(str(rs[0]), int(int(rs[1]) // binsize))] += 1

        bedgraph_file = os.path.join(self.session_dir, "vcf_bedgraph.txt")
        try:
            with open(bedgraph_file, "w") as fout:
                for (chromosome, bin_num), k in counts.items():
                    bin_start = bin_num * binsize
                    # The last bin of a contig is clipped to the contig end.
                    bin_end = min(bin_start + binsize,
                                  int(chr_length_dict[chromosome]))
                    fout.write(chromosome + "\t" + str(bin_start) + "\t" +
                               str(bin_end) + "\t" + str(k) + "\n")
        except IOError:
            logging.error("Unable to write %s file on disk.", bedgraph_file)

        # 4) Sort bedgraph file by chromosome id and co-ordinates
        sorted_bedgraph_file = bedgraph_file + "_sorted"
        sort_cmd = "sort -k1,1 -k2,2n " + bedgraph_file + "> " + sorted_bedgraph_file
        self._run_cmd(sort_cmd)

        # 5) Convert sorted bedgraph to bigwig format
        output_bigwig_file = bedgraph_file + "_bigwig.bw"
        cmd = (BEDGRAPHTOBIGWIG + " " + sorted_bedgraph_file + " " +
               chr_length_path + " " + output_bigwig_file)
        logging.info("Generating bigwig ..\n" + cmd + "\n")
        self._run_cmd(cmd)

        # 6) upload bigwig file to shock
        logging.info("Uploading Bigwig file to shock")
        if not os.path.exists(output_bigwig_file):
            raise FileNotFoundError(
                "Expected file was not created: " + output_bigwig_file)
        bigwig_shock_ref = self.dfu.file_to_shock(
            {'file_path': output_bigwig_file, 'make_handle': 1})

        # 7) Collect the shock handle for genomic_indexes
        shock_handles = [bigwig_shock_ref['handle']]

        # 8) Build snp frequency track
        snp_frequency_track_dict = {
            "label": "Variation Densityy",
            "key": "Variation_density",
            "storeClass": "JBrowse/Store/SeqFeature/BigWig",
            "urlTemplate": vfs_url + "/" + bigwig_shock_ref['handle']['id'],
            "type": "JBrowse/View/Track/Wiggle/XYPlot"
        }

        # 9) Return shock handles and track info
        return {"shock_handle_list": shock_handles,
                "track_item": snp_frequency_track_dict}

    def prepare_snp_track(self, vcf_shock_id, vcf_index_shock_id, vfs_url):
        """
        Build the variant (VCF + tabix) track from existing shock nodes.

        :param vcf_shock_id: shock id of the compressed VCF
        :param vcf_index_shock_id: shock id of the VCF index
        :param vfs_url: base url serving shock nodes
        :return: dict with an empty ``shock_handle_list`` (nothing is
            uploaded here) and ``track_item``
        """
        snp_track_dict = {
            "label": "Variation",
            "key": "Variation",
            "storeClass": "JBrowse/Store/SeqFeature/VCFTabix",
            "urlTemplate": vfs_url + "/" + vcf_shock_id,
            "tbiUrlTemplate": vfs_url + "/" + vcf_index_shock_id,
            "type": "JBrowse/View/Track/HTMLVariants"
        }
        # shock handles should be empty list in return when built from shock ids
        return {"shock_handle_list": [], "track_item": snp_track_dict}

    def build_jbrowse_data_folder(self, jbrowse_path):
        """
        Zip the JBrowse folder and upload it to shock.

        :return: dict with ``shock_handle_list`` holding the upload's handle
        """
        data_folder_shock_ref = self.dfu.file_to_shock(
            {'file_path': jbrowse_path, 'pack': 'zip', 'make_handle': 1})
        return {"shock_handle_list": [data_folder_shock_ref['handle']]}

    def build_jbrowse(self, jbrowse_src, jbrowse_path, refseqs_data,
                      genomic_indexes, tracklist_items):
        """
        Assemble the JBrowse folder and upload it.

        :param jbrowse_src: folder with the JBrowse distribution
        :param jbrowse_path: destination folder (must not exist yet)
        :param refseqs_data: content for ``data/seq/refSeqs.json``
        :param genomic_indexes: shock handles collected so far
        :param tracklist_items: track configurations for ``trackList.json``
        :return: dict with ``jbrowse_data_path`` and ``genomic_indexes``
            (input handles plus the handle of the uploaded folder)
        """
        # 1) Copy the jbrowse source code to build report
        shutil.copytree(jbrowse_src, jbrowse_path)

        # 2) Put tracklist.json in jbrowse data path
        tracklist_path = os.path.join(jbrowse_path, "data", "trackList.json")
        trackdata = {
            'formatVersion': 1,
            'tracks': tracklist_items
        }
        with open(tracklist_path, "w") as f:
            f.write(json.dumps(trackdata))

        # 3) Put refseq.json in jbrowse seq path
        refseqs_json_path = os.path.join(jbrowse_path, "data", "seq", "refSeqs.json")
        with open(refseqs_json_path, "w") as f:
            f.write(json.dumps(refseqs_data))

        # Build jbrowse data folder to support jbrowse widget in narrative
        res = self.build_jbrowse_data_folder(jbrowse_path)

        return {
            "jbrowse_data_path": jbrowse_path,
            "genomic_indexes": genomic_indexes + res['shock_handle_list'],
        }

    def prepare_jbrowse_report(self, jbrowse_params):
        """
        Build genomic indexes and the JBrowse report folder.

        :param jbrowse_params: dict; ``assembly_ref`` is required. Optional
            keys enable tracks: ``genome_ref`` (genome features),
            ``vcf_path`` + ``binsize`` (variant density),
            ``vcf_shock_id`` + ``vcf_index_shock_id`` (variants).
        :return: dict with ``jbrowse_data_path`` and ``genomic_indexes``
        :raises ValueError: if ``assembly_ref`` is missing or no track
            could be built
        """
        # Variation file service url for serving jbrowse track files
        vfs_url = self.get_variation_service_url(self.sw_url)
        print(vfs_url)

        genomic_indexes = list()
        tracklist_items = list()

        # 1) Build refseqs_data, used for the refseqs.json file of jbrowse.
        # Jbrowse report can not be built if assembly ref doesn't exist
        if 'assembly_ref' not in jbrowse_params:
            raise ValueError("assembly ref not found")
        assembly_ref = jbrowse_params['assembly_ref']
        refseqs_data = self.create_refseqs_data_from_assembly(assembly_ref)

        # 2) Build genome features track
        if 'genome_ref' in jbrowse_params:
            output = self.prepare_genome_features_track(
                jbrowse_params['genome_ref'], vfs_url)
            genomic_indexes = genomic_indexes + output["shock_handle_list"]
            tracklist_items.append(output["track_item"])
        else:
            print("Skipping genome features track")

        # 3) Build SNP frequency track
        if 'vcf_path' in jbrowse_params and 'binsize' in jbrowse_params:
            output = self.prepare_snp_frequency_track(
                jbrowse_params['vcf_path'], assembly_ref,
                jbrowse_params["binsize"], vfs_url)
            genomic_indexes = genomic_indexes + output["shock_handle_list"]
            tracklist_items.append(output["track_item"])
        else:
            print("Skipping SNP frequency track")

        # 4) Build SNP track
        if 'vcf_shock_id' in jbrowse_params and 'vcf_index_shock_id' in jbrowse_params:
            output = self.prepare_snp_track(
                jbrowse_params['vcf_shock_id'],
                jbrowse_params['vcf_index_shock_id'], vfs_url)
            genomic_indexes = genomic_indexes + output["shock_handle_list"]
            tracklist_items.append(output["track_item"])
        else:
            print("Skipping SNP track")

        # 5) Build jbrowse directory with index.html
        # jbrowse directory later on gets uploaded as html report
        if not tracklist_items:
            raise ValueError("No tracks found")

        jbrowse_src = "/kb/module/deps/jbrowse"
        jbrowse_path = os.path.join(self.session_dir, "jbrowse")
        return self.build_jbrowse(jbrowse_src, jbrowse_path, refseqs_data,
                                  genomic_indexes, tracklist_items)
class GFFUtils:
    """
    Annotate a GWAS results table with genes from a KBase genome.

    A tabix-indexed GFF of the genome's gene features is built in the
    ``GFF`` folder of the scratch directory and every row of the results
    table is looked up in it.
    """

    # Column names appended to the GWAS table by find_gene_info.
    _GENE_COLUMNS = ['GENEID', 'NEIGHBORGENE', 'FUNCTION']

    def __init__(self, config):
        """
        :param config: dict with keys ``callback_url``, ``scratch`` and
            ``workspace-url``
        """
        self.callback_url = config['callback_url']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']
        self.GFF_dir = os.path.join(self.shared_folder, 'GFF')
        os.makedirs(self.GFF_dir, exist_ok=True)

        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)

    def _prep_gff(self, gff_file):
        """
        Sort a GFF file (header lines first) and bgzip-compress it.

        :param gff_file: path of the unsorted GFF file
        :returns: path of the compressed file (``<scratch>/GFF/out.gff.gz``)
        """
        gff_dir = os.path.join(self.shared_folder, 'GFF')
        outfile = os.path.join(gff_dir, 'out.gff')
        sortcmd = f'(grep ^"#" {gff_file}; grep -v ^"#" {gff_file} | sort -k1,1 -k4,4n)'

        with open(outfile, 'w') as o:
            subprocess.Popen(sortcmd, shell=True, stdout=o).communicate()

        # -f: the file is rebuilt on every annotation, so an out.gff.gz
        # left by an earlier call must be overwritten (without a terminal
        # bgzip refuses to overwrite and the stale file would be used).
        subprocess.Popen(['bgzip', '-f', 'out.gff'], cwd=gff_dir).communicate()

        return outfile + '.gz'

    @staticmethod
    def _contig_offset(contig_base_lengths, contig_id):
        """
        Return the cumulative base offset of a contig.

        Several spellings of the contig name are tried because the contig
        ids of the features and of the assembly may differ by a ``Chr`` /
        ``Chr0`` prefix or by capitalisation.

        :raises KeyError: if no spelling of ``contig_id`` is known
        """
        candidates = (contig_id, contig_id.capitalize(),
                      'Chr' + contig_id, 'Chr0' + contig_id)
        for name in candidates:
            if name in contig_base_lengths:
                return int(contig_base_lengths[name])
        raise KeyError(
            'contig {!r} not found in assembly contigs (tried {})'.format(
                contig_id, ', '.join(candidates)))

    def _construct_gff_from_json(self, json, gff_file_path, contig_base_lengths):
        """
        Write the gene features of a genome as tab-separated GFF-like lines.

        Only features whose ``feature_type`` is ``gene`` (case-insensitive)
        are written. The eighth column carries the genome-wide position
        (contig offset + start) and the ninth column ``ID=...`` plus an
        optional ``;FUNCTION=...``.

        :param json: iterable of feature dicts (as returned by the genome
            search service)
        :param gff_file_path: output path
        :param contig_base_lengths: dict contig id -> cumulative offset
        :returns: ``gff_file_path``
        :raises KeyError: if a feature's contig is not in
            ``contig_base_lengths``
        :raises FileNotFoundError: if the file does not exist after writing
        """
        with open(gff_file_path, 'w') as f:
            for feature in json:
                if feature['feature_type'].strip().upper() != 'GENE':
                    continue

                location = feature['location'][0]
                contig_id = str(location['contig_id'])
                start = int(location['start'])
                end = start + int(location['length'])

                metainfo = "ID=" + feature['feature_id']
                if feature.get('function'):
                    metainfo += ';FUNCTION=' + feature['function']

                # TODO: Fix Plink reassignment of Chr prefixes
                global_pos = self._contig_offset(contig_base_lengths,
                                                 contig_id) + start

                columns = [contig_id, 'KBase', 'gene',
                           str(location['start']), str(end), '.',
                           str(location['strand']), str(global_pos),
                           str(metainfo)]
                f.write('\t'.join(columns) + '\n')

        if os.path.exists(gff_file_path):
            return gff_file_path
        raise FileNotFoundError(
            'Unable to create GFF file form genome JSON.')

    def _process_tabix_results(self, queryresult):
        """
        Extract ``[gene id, "NA", function]`` from one tabix result row.

        Column 9 of the row is expected to look like
        ``ID=<id>[;FUNCTION=<text>]``; the ``ID=`` and ``FUNCTION=``
        prefixes are stripped by position.
        """
        queryinfo = queryresult[8].split(';')
        gene_id = clean_tsv_data(queryinfo[0][3:])
        # str.split always yields at least one element, so only the
        # function part can be missing.
        function = clean_tsv_data(queryinfo[1][9:]) if len(queryinfo) >= 2 else "NA"
        return [gene_id, "NA", function]

    def find_gene_info(self, row):
        """
        Look up the gene overlapping (or near) one GWAS result row.

        The exact position is queried under the contig name as given and
        with ``chr`` / ``chr0`` prefixes. If nothing overlaps, the first
        gene within 500 bp is reported as NEIGHBORGENE instead.

        :param row: mapping with ``CHR`` and ``POS`` entries
        :returns: ``pandas.Series`` indexed GENEID, NEIGHBORGENE, FUNCTION
        """
        chrom = row["CHR"]
        pos = int(row["POS"])

        # str(): pandas parses purely numeric contig names as integers.
        for name in (chrom, 'chr' + str(chrom), 'chr0' + str(chrom)):
            hit = next(tabix_query(self.sorted_gff, name, pos, pos), None)
            if hit is not None:
                return pd.Series(self._process_tabix_results(hit),
                                 index=self._GENE_COLUMNS)

        nstart = max(pos - 500, 0)
        neighbour = next(
            tabix_query(self.sorted_gff, chrom, nstart, pos + 500), None)
        if neighbour is None:
            return pd.Series(['NA', 'NA', 'NA'], index=self._GENE_COLUMNS)

        nq = self._process_tabix_results(neighbour)
        # The hit is a neighbour, so its id goes into NEIGHBORGENE.
        return pd.Series([nq[1], nq[0], nq[2]], index=self._GENE_COLUMNS)

    def annotate_GWAS_results(self, genome_ref, gwas_results_file):
        """
        Add GENEID, NEIGHBORGENE and FUNCTION columns to a GWAS table.

        :param genome_ref: reference of the genome used for annotation
        :param gwas_results_file: tab-separated file with at least ``CHR``
            and ``POS`` columns
        :returns: path of the annotated table (``final_results.txt`` in the
            folder of ``gwas_results_file``)
        """
        feature_num = self.gsu.search({'ref': genome_ref})['num_found']

        # get genome features for gff construction
        genome_features = self.gsu.search({
            'ref': genome_ref,
            'limit': feature_num,
        })['features']

        assembly_ref = self.wsc.get_object_subset([{
            'included': ['/assembly_ref'],
            'ref': genome_ref
        }])[0]['data']['assembly_ref']

        # get assembly contigs for base length calculations
        assembly_contigs = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': assembly_ref
        }])[0]['data']['contigs']

        # Offset of a contig = total length of all contigs sorted before it.
        contig_base_lengths = {}
        prev_length = 0
        for contig in sorted(assembly_contigs):
            contig_base_lengths[contig] = prev_length
            prev_length += assembly_contigs[contig]['length']

        gff_file = os.path.join(self.GFF_dir, 'constructed.gff')
        constructed_gff = self._construct_gff_from_json(
            genome_features, gff_file, contig_base_lengths)
        self.sorted_gff = self._prep_gff(constructed_gff)
        tabix_index(self.sorted_gff)

        gwas_results = pd.read_csv(gwas_results_file, sep='\t')
        gwas_results[self._GENE_COLUMNS] = \
            gwas_results.apply(self.find_gene_info, axis=1)

        # The annotated table is written next to the input file.
        new_results_path = os.path.join(
            os.path.dirname(os.path.abspath(gwas_results_file)),
            'final_results.txt')
        gwas_results.to_csv(path_or_buf=new_results_path, sep='\t', index=False)

        return new_results_path
class VariationUtil:
    '''
    Module Name:
    VariationUtil

    Module Description:
    A KBase module: VariationUtil
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.4"
    GIT_URL = ""
    GIT_COMMIT_HASH = "2a4c2dbc058b702811c967997e7100c834e755d4"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        # TODO: Make sure we need to define config just once
        # TODO: Change the code to match this style
        # The config dict is extended in place and later handed to helper
        # classes, so it also carries the callback url and the auth token.
        self.config = config
        self.config['SDK_CALLBACK_URL'] = os.environ['SDK_CALLBACK_URL']
        self.config['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']
        self.config['ws_url'] = config['workspace-url']

        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shared_folder = config['scratch']
        self.hr = htmlreportutils()
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.shock_url = config['shock-url']
        self.sw_url = config['srv-wiz-url']
        #END_CONSTRUCTOR
        pass

    def save_variation_from_vcf(self, ctx, params):
        """
        Save a variation (and trait?) object to Kbase given a reference
        genome, object output name, Variant Call Format (VCF) file, and
        sample attribute file.

        Note: the implementation additionally reads ``sample_set_ref``
        (validated up front, ValueError if missing) and
        ``sample_attribute_name`` from ``params``.

        :param params: instance of type "save_variation_input" (## funcdef
           save_variation_from_vcf ## required input params:
           genome_or_assembly_ref: KBaseGenomes.Genome or
           KBaseGenomeAnnotations.Assembly object reference *** variation
           input data *** vcf_staging_file_path: path to location data
           associated with samples variation_object_name: output name for
           KBase variation object *** sample input data ***
           sample_attribute_ref: x/y/z reference to kbase sample attribute
           optional params: NA output report: report_name report_ref HTML
           visualization: Manhattan plot *** Visualization *** plot_maf:
           generate histogram of minor allele frequencies plot_hwe: generate
           histogram of Hardy-Weinberg Equilibrium p-values) -> structure:
           parameter "workspace_name" of String, parameter
           "genome_or_assembly_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "vcf_staging_file_path" of type "filepath"
           (KBase file path to staging files), parameter
           "variation_object_name" of String, parameter
           "sample_attribute_ref" of type "obj_ref" (An X/Y/Z style
           reference)
        :returns: instance of type "save_variation_output" -> structure:
           parameter "variation_ref" of String, parameter "report_name" of
           String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: report
        #BEGIN save_variation_from_vcf

        # Fail fast, before any service call or VCF processing: the sample
        # set is needed both for the strain lookup and for the saved object.
        if 'sample_set_ref' not in params:
            raise ValueError('sample_set_ref not found in params')

        # Get workspace id
        ws_id = self.dfu.ws_name_to_id(params['workspace_name'])

        genome_ref = None
        assembly_ref = None

        # 1) Find whether the input is a genome or assembly
        #    and get genome_ref and assembly_ref
        genome_or_assembly_ref = params['genome_or_assembly_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{'ref': genome_or_assembly_ref}]})['infos'][0][2]
        if 'KBaseGenomes.Genome' in obj_type:
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif 'KBaseGenomeAnnotations.Assembly' in obj_type:
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type + ' is not the right input for this method. '
                             + 'Valid input include KBaseGenomes.Genome or '
                             + 'KBaseGenomeAnnotations.Assembly ')

        # 2) Validate VCF, compress, and build VCF index
        logging.info("Validating VCF, Compressing VCF and Indexing VCF")
        VCFUtilsConfig = {"scratch": self.scratch}
        VCFUtilsParams = {
            'vcf_staging_file_path': params['vcf_staging_file_path']
        }
        VCU = VCFUtils(VCFUtilsConfig)
        vcf_compressed, vcf_index, vcf_strain_ids = \
            VCU.validate_compress_and_index_vcf(VCFUtilsParams)

        if vcf_index is None:
            raise ValueError(
                "No result obtained after compression and indexing step")
        logging.info("vcf compressed :" + str(vcf_compressed))
        logging.info("vcf index :" + str(vcf_index))
        logging.info("vcf strain ids :" + str(vcf_strain_ids))

        # Get strain info
        # TODO: Remove hard coded stuff
        StrainInfoConfig = self.config
        StrainInfoParams = {
            "ws_id": ws_id,
            "vcf_strain_ids": vcf_strain_ids,
            "sample_set_ref": params["sample_set_ref"],
            "sample_attribute_name": params["sample_attribute_name"]
        }
        si = StrainInfo(StrainInfoConfig)
        sample_attribute_ref, strains = si.sample_strain_info(StrainInfoParams)
        print(sample_attribute_ref)
        print(strains)

        # 3) Create json for variation object. In a following step
        #    genomic_indexes will be added to this json before it is saved
        #    as Variation object
        VCFToVariationConfig = {"ws_url": self.ws_url, "scratch": self.scratch}
        VCFToVariationParams = {
            "vcf_compressed": vcf_compressed,
            "vcf_index": vcf_index,
            "assembly_ref": assembly_ref
        }
        if genome_ref is not None:
            VCFToVariationParams['genome_ref'] = genome_ref

        vtv = VCFToVariation(VCFToVariationConfig)
        variation_object_data = vtv.generate_variation_object_data(
            VCFToVariationParams)

        # Append sample information
        if not sample_attribute_ref:
            raise ValueError('sample attribute ref not found')
        variation_object_data['sample_attribute_ref'] = sample_attribute_ref

        if not strains:
            raise ValueError('strains not found')
        variation_object_data['strains'] = strains

        variation_object_data['sample_set_ref'] = params['sample_set_ref']

        # 4) Build the JBrowse report and the genomic indexes
        JbrowseConfig = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
            "sw_url": self.sw_url,
            "shock_url": self.shock_url
        }
        JbrowseParams = {
            "vcf_path": vcf_compressed,
            "assembly_ref": assembly_ref,
            "binsize": 10000,
            "vcf_shock_id": variation_object_data['vcf_handle']['id'],
            "vcf_index_shock_id": variation_object_data['vcf_index_handle']['id']
        }
        if genome_ref is not None:
            JbrowseParams["genome_ref"] = genome_ref

        jb = JbrowseUtil(JbrowseConfig)
        jbrowse_report = jb.prepare_jbrowse_report(JbrowseParams)

        # 5) Now we have the genomic indices and we have all the information
        #    needed to save the variation object
        # TODO: Take out the genomic_indexes field from the object spec
        # TODO: Take out the vcf_handle stuff not needed
        variation_object_data['genomic_indexes'] = \
            jbrowse_report['genomic_indexes']

        var_obj = self.dfu.save_objects({
            # ws_id was resolved above; no need to ask the service again.
            'id': ws_id,
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_object_data,
                'name': params['variation_object_name']
            }]
        })[0]

        var_obj_ref = str(var_obj[6]) + "/" + str(var_obj[0]) + "/" + str(var_obj[4])
        print(var_obj_ref)

        # 6) Build Variation report
        # This is a simple report
        workspace = params['workspace_name']
        created_objects = [{
            "ref": var_obj_ref,
            "description": "Variation Object"
        }]

        ReportConfig = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
        }
        ReportParams = {"variation_ref": var_obj_ref}
        vr = VariationReport(ReportConfig)
        htmlreport_dir = vr.create_variation_report(ReportParams)

        report = self.hr.create_html_report(htmlreport_dir, workspace,
                                            created_objects)
        report['variation_ref'] = var_obj_ref
        print(report)
        #END save_variation_from_vcf

        # At some point might do deeper type checking...
        if not isinstance(report, dict):
            raise ValueError('Method save_variation_from_vcf return value ' +
                             'report is not type dict as required.')
        # return the results
        return [report]

    def export_variation_as_vcf(self, ctx, params):
        """
        Export KBase variation object as Variant Call Format (VCF) file
        :param params: instance of type "export_variation_input" (## funcdef
           export_variation_as_vcf ## required input params: Variation object
           reference optional params: NA output report: Shock id pointing to
           exported vcf file) -> structure: parameter "input_var_ref" of type
           "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "export_variation_output" -> structure:
           parameter "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_variation_as_vcf
        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        output = vtv.export_as_vcf(params)
        #END export_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_variation_as_vcf return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def get_variation_as_vcf(self, ctx, params):
        """
        Given a reference to a variation object, and output name: return a
        Variant Call Format (VCF) file path and name.
        :param params: instance of type "get_variation_input" (## funcdef
           get_variation_as_vcf ## required input params: Variation object
           reference output file name optional params: NA output report: path
           to returned vcf name of variation object) -> structure: parameter
           "variation_ref" of type "obj_ref" (An X/Y/Z style reference),
           parameter "filename" of String
        :returns: instance of type "get_variation_output" -> structure:
           parameter "path" of type "filepath" (KBase file path to staging
           files), parameter "variation_name" of String
        """
        # ctx is the context object
        # return variables are: file
        #BEGIN get_variation_as_vcf
        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        file = vtv.variation_to_vcf(params)
        #END get_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(file, dict):
            raise ValueError('Method get_variation_as_vcf return value ' +
                             'file is not type dict as required.')
        # return the results
        return [file]

    def status(self, ctx):
        """Return the module's state, version and git provenance."""
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class snp2gene:
    '''
    Module Name:
    snp2gene

    Module Description:
    A KBase module: snp2gene
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    # SSH remote of the module repository. The previous value had its
    # "user@host" part replaced by an e-mail obfuscation placeholder, which
    # made the URL reported by status() unusable.
    GIT_URL = "git@github.com:kbasecollaborations/snp2gene.git"
    GIT_COMMIT_HASH = "8dd593e96c4b37fcf91a719181389e1b04c0bb4a"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        # The callback URL comes from the environment and is written back
        # into the shared config dict, which is later handed to GFFUtils and
        # GFFUtils2.
        self.config = config
        self.config['callback_url'] = os.environ['SDK_CALLBACK_URL']
        callback_url = self.config['callback_url']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.kbr = KBaseReport(callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def annotate_gwas_results(self, ctx, params):
        """
        annotate_gwas_results:
        inputs:
        file path to gwas results
        genome object - with reference to GFF file
        outputs:
        TSV file represented by shock/handle ids and
        :param params: instance of type "annotate_gwas_input" -> structure:
           parameter "gwas_result_file" of type "file_path" (A valid file
           path), parameter "genome_obj" of type "genome_ref" (KBase style
           object reference X/Y/Z @id ws KBaseGenomes.Genome)
        :returns: instance of type "annotate_gwas_output" -> structure:
           parameter "snp_to_gene_list" of type "file_path" (A valid file
           path)
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_gwas_results
        gene_list = GFFUtils(self.config).annotate_GWAS_results(
            params['genome_obj'], params['gwas_result_file'])
        output = {'snp_to_gene_list': gene_list}
        #END annotate_gwas_results

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_gwas_results return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def annotate_gwas_results_app(self, ctx, params):
        """
        :param params: instance of type "annotate_gwas_app_input" ->
           structure: parameter "associations" of list of type
           "association_ref" (KBase style object reference X/Y/Z @id ws
           KBaseGwasData.Associations), parameter "p_value" of String,
           parameter "prefix" of String
        :returns: instance of type "annotate_gwas_app_output" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "featureset_obj" of type "featureset_ref" (KBase
           style object reference X/Y/Z @id ws KBaseCollections.FeatureSet)
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_gwas_results_app
        # NOTE: params['workspace_name'] is read below although the generated
        # docstring does not list it, and the returned dict carries only
        # report_name / report_ref (no featureset_obj key).
        print(params)
        #TODO: Handle cases where there are no significant SNPs
        #genome_ref = "47506/4/1"
        objects_created = []
        for association_ref in params['associations']:
            # Association -> Variation -> Genome: follow the two references
            # to find the genome the SNPs have to be annotated against.
            variation_ref = self.wsc.get_object_subset([{
                'included': ['/variation_id'],
                'ref': association_ref
            }])[0]['data']['variation_id']

            genome_ref = self.wsc.get_object_subset([{
                'included': ['/genome_ref'],
                'ref': variation_ref
            }])[0]['data']['genome_ref']

            featureset_obj = GFFUtils2(self.config).annotate_GWAS_results(
                genome_ref, association_ref, params['workspace_name'],
                params['prefix'], params['p_value'])
            objects_created.append({
                'ref': featureset_obj,
                'description': 'FeatureSet'
            })

        # Build the new gff before doing anything
        # Download the workspace object for association one at a time
        # Filter SNPs for p-value, if no snps shows up, append this to warnings
        # Build the table structure needed for snp2gene
        # Run snp2gene algorithm and get final list.txt
        # Save as featureset. Find how to save featureset from genelist
        report_info = self.kbr.create_extended_report({
            'message': ' ',
            'objects_created': objects_created,
            'report_object_name': 'annotate_gwas_results_app_' + str(uuid.uuid4()),
            'workspace_name': params['workspace_name']
        })
        output = dict()
        output['report_name'] = report_info['name']
        output['report_ref'] = report_info['ref']
        print(output)
        #END annotate_gwas_results_app

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_gwas_results_app return value ' +
                             'output is not type dict as required.')
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        # Static health/version record built from the class constants.
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class VCFToVariation:
    """Import a VCF file as a KBaseGwasData.Variations workspace object.

    ``import_vcf`` is the entry point; the other methods are the individual
    validation, parsing, upload and save steps that it chains together.
    """

    def __init__(self, config, scratch, callback_url):
        """Create the service clients used during the import.

        :param config: dict providing at least 'scratch' and 'workspace-url'
        :param scratch: scratch directory; replaces the value read from config
        :param callback_url: callback URL used for the clients created after
            the reassignment below
        """
        self.scratch = config['scratch']
        self.ws_url = config['workspace-url']
        # NOTE: statement order is significant. ``dfu`` is built from the
        # SDK_CALLBACK_URL environment variable, whereas ``au`` and ``gapi``
        # are built from the explicit ``callback_url`` argument.
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)
        self.scratch = scratch
        self.callback_url = callback_url
        self.au = AssemblyUtil(self.callback_url)
        self.gapi = GenericsAPI(self.callback_url)

    def _parse_vcf_data(self, params):
        """Stage the VCF and collect per-contig variant statistics.

        :param params: input parameters, forwarded to ``_stage_input``
        :return: dict with keys 'version', 'contigs', 'total_variants',
            'genotype_ids', 'chromosome_ids' and 'file_ref'
        """
        vcf_filepath = self._stage_input(params)

        contigs = {}
        totalvars = 0
        # file is validated by this point, can assume vcf_filepath is valid
        # The handle is opened in a ``with`` block so it is closed once the
        # records have been consumed.
        with open(vcf_filepath, 'r') as vcf_handle:
            reader = vcf.Reader(vcf_handle)
            # 'VCFv4.1' -> 4.1. The slice used to be [4:6], which dropped the
            # minor version and always produced x.0.
            version = float(reader.metadata['fileformat'][4:])
            genotypes = reader.samples

            for record in reader:
                totalvars += 1
                passed = not record.FILTER
                contig = contigs.get(record.CHROM)
                if contig is None:
                    contigs[record.CHROM] = {
                        'contig_id': record.CHROM,
                        'totalvariants': 1,
                        'passvariants': 1 if passed else 0,
                        # placeholder; replaced with the assembly length in
                        # _construct_contig_info
                        'length': int(record.affected_end - record.affected_start),
                    }
                else:
                    contig['totalvariants'] += 1
                    if passed:
                        contig['passvariants'] += 1

        vcf_info = {
            'version': version,
            'contigs': contigs,
            'total_variants': totalvars,
            'genotype_ids': genotypes,
            # contigs preserves first-seen order, so its keys are the
            # chromosome id list
            'chromosome_ids': list(contigs),
            'file_ref': vcf_filepath
        }

        return vcf_info

    def _validate_vcf_to_sample(self, vcf_genotypes, sample_ids):
        """Check that every VCF genotype id is a known sample id.

        Comparison is case-insensitive and ignores surrounding whitespace.

        :return: True when all genotypes are found, otherwise the list of
            normalised (upper-cased, stripped) genotype ids that are missing
        """
        known_samples = {x.upper().strip() for x in sample_ids}
        genos_not_found = [
            geno
            for geno in (x.upper().strip() for x in vcf_genotypes)
            if geno not in known_samples
        ]

        if not genos_not_found:
            return True
        return genos_not_found

    def _chk_if_vcf_ids_in_assembly(self, vcf_chromosomes, assembly_chromosomes):
        """Check that every VCF chromosome id exists in the assembly.

        :return: True when all ids are present, otherwise the list of VCF
            chromosome ids missing from ``assembly_chromosomes``
        """
        pp(assembly_chromosomes)

        chromos_not_in_assembly = [
            chromo for chromo in vcf_chromosomes
            if chromo not in assembly_chromosomes
        ]

        if not chromos_not_in_assembly:
            return True
        return chromos_not_in_assembly

    def _get_vcf_version(self, vcf_filepath):
        """Read the VCF version from the first line of a (gzipped) VCF file.

        :raises ValueError: when the first line is not a well-formed
            ``##fileformat=`` line
        :return: version as float, e.g. 4.1
        """
        opener = gzip.open if is_gz_file(vcf_filepath) else open
        with opener(vcf_filepath, 'rt') as vcf_handle:
            line = vcf_handle.readline()

        tokens = line.split('=')
        # A line without '=' used to escape as IndexError further down.
        if len(tokens) < 2 or not tokens[0].startswith('##fileformat'):
            log("Invalid VCF.  ##fileformat line in meta is improperly formatted.")
            raise ValueError("Invalid VCF.  ##fileformat line in meta is improperly formatted. "
                             "Check VCF file specifications: https://samtools.github.io/hts-specs/")

        # Strip first so trailing whitespace or a missing final newline does
        # not shift the three-character version ('4.1') out of the slice.
        vcf_version = float(tokens[1].strip()[-3:])

        return vcf_version

    def _record_validation_result(self, log_path, validator_output, vcf_filepath):
        """Write the validation log; raise if the validator reported anything.

        :raises ValueError: when ``validator_output`` is non-empty; the
            message is the joined validator output
        """
        if validator_output:
            with open(log_path, 'w') as f:
                for line in validator_output:
                    f.write(str(line))
            print('\n'.join(validator_output))
            raise ValueError('\n'.join(validator_output))

        with open(log_path, 'w') as f:
            # The literal used to contain the invalid escape "\F".
            f.write("vcftools used to validate vcf file:\n" + vcf_filepath +
                    "\n\nFile is validate as of vcf spec v4.0")

    def validate_vcf(self, params):
        """Run an external validator on the staged VCF file.

        :param params: must contain 'genome_or_assembly_ref' and
            'vcf_staging_file_path'
        :raises ValueError: on missing parameters, unsupported VCF version or
            validator-reported problems
        :raises SystemError: when no validation log file exists afterwards
        :return: path of the validation log file
        """
        # str(params): concatenating the dict itself raised TypeError and hid
        # the intended ValueError.
        if 'genome_or_assembly_ref' not in params:
            raise ValueError('Genome or Assembly reference not in input parameters: \n\n' + str(params))
        if 'vcf_staging_file_path' not in params:
            raise ValueError('VCF staging file path not in input parameters: \n\n' + str(params))

        vcf_filepath = self._stage_input(params)

        vcf_version = self._get_vcf_version(vcf_filepath)

        # setup directorys for validation output
        validation_output_dir = os.path.join(self.scratch, 'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        # vcftools (vcf-validator) supports VCF v4.0-4.2
        # https://github.com/vcftools/vcftools

        # EBIvariation/vcf-validator (vcf_validator_linux) supports VCF v4.1-4.3
        # https://github.com/EBIvariation/vcf-validator

        # vcftools is only to validate VCF v4.0
        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux", "-i", vcf_filepath, "-l", 'error']
            print("VCF version "+str(vcf_version)+".")
        elif vcf_version >= 4.0:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator", vcf_filepath]
            print("VCF version 4.0.")
        else:
            raise ValueError('VCF Version not in file, or fileformat line malformatted, or not version >=4.0. file format line must be the '
                             'first line of vcf file and in appropriate syntax. Check VCF file specifications: '
                             'https://samtools.github.io/hts-specs/')

        print("Validator command: {}".format(validator_cmd))

        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        # Only the '[info]' lines of the validator output are kept.
        validator_output = []
        for raw_line in p.stdout:
            text = raw_line.decode("utf-8")
            if text.strip().startswith('[info]'):
                validator_output.append(text)

        out, err = p.communicate()

        validation_output_filename = os.path.join(validation_output_dir, 'vcf_validation.txt')

        try:
            if validator_output[0][:6] == '[info]':
                # validation by vcf_validator_linux
                validation_output_filename = validator_output[1].split(' ')[6].strip('\n')
                vo = validator_output[2].split(' ')
                file_output_chk = ''.join(vo[9:]).strip('\n')

                if not os.path.exists(validation_output_filename):
                    raise ValueError(validation_output_filename+' does not exist!')

                if not file_output_chk == 'isvalid':
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))

                #TODO: more detailed validation parsing for vcf_validator_linux
            else:
                self._record_validation_result(validation_output_filename,
                                               validator_output, vcf_filepath)
                # TODO: more detailed validation parsing for vcftools
        except IndexError:
            # No (or too few) '[info]' lines were captured, e.g. vcftools on
            # a valid v4.0 file: fall back to writing our own log.
            self._record_validation_result(validation_output_filename,
                                           validator_output, vcf_filepath)

        if not os.path.exists(validation_output_filename):
            print('Validator did not generate log file!')
            raise SystemError("Validator did not generate a log file.")

        log("Validator output filepath: {}".format(validation_output_filename))

        log("Return code from validator {}".format(p.returncode))

        return validation_output_filename

    def _stage_input(self, params):
        """Locate the input VCF, keep a copy in scratch and decompress it.

        Side effects: sets ``self.original_file`` to the scratch copy and
        stores the returned path in ``params['vcf_local_file_path']``.

        :raises OSError: when the resolved input path does not exist
        :return: local path of the (uncompressed) VCF file
        """
        # extract file location from input ui parameters
        if params['vcf_staging_file_path'].startswith('/kb/module/test/'):
            # variation utils unit test
            vcf_local_file_path = params['vcf_staging_file_path']
            if vcf_local_file_path.endswith('.gz'):
                with gzip.open(vcf_local_file_path, 'rb') as f_in:
                    with open(vcf_local_file_path[:-3], 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)

                vcf_local_file_path = vcf_local_file_path[:-3]
        else:
            staging_dir = '/staging'
            vcf_local_file_path = os.path.join(staging_dir, params['vcf_staging_file_path'])

        if not os.path.exists(vcf_local_file_path):
            raise OSError('VCF input path does not exist, or is not readable')

        orig_file_path = os.path.join(self.scratch, 'original_' + os.path.basename(vcf_local_file_path))
        print(f'VCF: {vcf_local_file_path} Orig: {orig_file_path}')
        self.original_file = shutil.copy(vcf_local_file_path, orig_file_path)

        # TODO: use data file utils here, upload vcf to shock, use dfu.
        if is_gz_file(vcf_local_file_path):
            # /staging is read only, therefore have to copy before uncompressing
            scratch_target = os.path.join(self.scratch, params['vcf_staging_file_path'])
            if not vcf_local_file_path == scratch_target:
                copy = shutil.copy(vcf_local_file_path, scratch_target)
                unpack = self.dfu.unpack_file({'file_path': copy})
            else:
                unpack = {'file_path': scratch_target}
            params['vcf_local_file_path'] = unpack['file_path']
            return unpack['file_path']
        else:
            params['vcf_local_file_path'] = vcf_local_file_path
            return vcf_local_file_path

    def _create_sample_attribute_file(self, vcf_file, sample_attribute_mapping_file):
        """
        function for creating sample attribute mapping file.

        Writes two tab-separated rows: a column header row and a 'label' row,
        each followed by the sample names taken from column 10 onwards of the
        VCF '#CHROM' header line. Read/write IOErrors are reported with
        print and not re-raised.

        :raises ValueError: when the VCF contains no '#CHROM' header line
        """
        header = None
        try:
            with open(vcf_file, 'r') as vcf_handle:
                # Stream the file and stop at the header instead of loading
                # every record into memory.
                for line in vcf_handle:
                    if line.startswith("#CHROM"):
                        header = line.lstrip().rstrip('\r\n').split("\t")
                        break
        except IOError:
            print("Could not read file:", vcf_file)
            return

        if header is None:
            # Used to surface as UnboundLocalError.
            raise ValueError("No #CHROM header line found in VCF file: " + str(vcf_file))

        sample_columns = "".join("\t" + sample for sample in header[9:])
        try:
            with open(sample_attribute_mapping_file, 'w') as attribute_mapping_handle:
                # Rows end with an explicit newline; this used to rely on the
                # newline still attached to the last sample name.
                attribute_mapping_handle.write(
                    "Attribute\tAttribute ontology ID\tUnit\tUnit ontology ID"
                    + sample_columns + "\n")
                attribute_mapping_handle.write("label\t\t\t" + sample_columns + "\n")
        except IOError:
            print("Could not write to file:", sample_attribute_mapping_file)

    def _validate_assembly_ids(self, params):
        """Ensure all VCF chromosome ids exist in the referenced assembly.

        Side effect: stores the resolved reference in
        ``self.vcf_info['assembly_ref']``.

        :raises ValueError: when VCF contig ids are missing from the assembly
        :return: the assembly's contig ids (dict keys view)
        """
        # All chromosome ids from the vcf should be in assembly
        # but not all assembly chromosome ids should be in vcf
        if 'genome_ref' in params:
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': params['genome_or_assembly_ref']
            }])

            self.vcf_info['assembly_ref'] = subset[0]['data']['assembly_ref']

        # An explicit assembly_ref takes precedence over the genome's.
        if 'assembly_ref' in params:
            self.vcf_info['assembly_ref'] = params['assembly_ref']

        assembly_chromosome_ids_call = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])

        assembly_chromosomes = assembly_chromosome_ids_call[0]['data']['contigs'].keys()
        vcf_chromosomes = self.vcf_info['chromosome_ids']

        chk_assembly_ids = self._chk_if_vcf_ids_in_assembly(vcf_chromosomes, assembly_chromosomes)

        if isinstance(chk_assembly_ids, list):
            failed_ids = ' '.join(chk_assembly_ids)
            print(f'VCF contig ids: {failed_ids} are not present in assembly.')
            raise ValueError(f'VCF contig ids: {failed_ids} are not present in assembly.')

        return assembly_chromosomes

    def _validate_sample_ids(self, params):
        """Ensure every VCF genotype appears in the sample attribute mapping.

        :raises ValueError: when genotypes are missing from the mapping
        :return: the sample ids of the attribute mapping (dict keys view)
        """
        # All samples within the VCF file need to be in sample attribute list
        vcf_genotypes = self.vcf_info['genotype_ids']

        sample_ids_subset = self.wsc.get_object_subset([{
            'included': ['/instances'],
            'ref': params['sample_attribute_ref']
        }])

        sample_ids = sample_ids_subset[0]['data']['instances'].keys()

        validate_genotypes = self._validate_vcf_to_sample(vcf_genotypes, sample_ids)

        if isinstance(validate_genotypes, list):
            failed_genos = ' '.join(validate_genotypes)
            print(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')
            raise ValueError(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')

        return sample_ids

    def _construct_contig_info(self, params):
        """
            KBaseGwasData.Variations type spec

            /*
               Contig variation data
                 contig_id - contig identifier
                 totalvariants - total number of variants in each contig
                 passvariants - total number of variants that pass quality variation filter in contig
                 length - length of contig from assembly data
             */

             typdef structure {
               string contig_id;
               int totalvariants;
               int passvariants;
               int length; // from assembly
             } contig_info;

            :return: list of the per-contig dicts from ``self.vcf_info``, each
                with 'length' replaced by the assembly's contig length
        """
        assembly_chromosome_dict = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])[0]['data']['contigs']

        contigs = []
        contig_infos = self.vcf_info['contigs']

        for contig_id, contig_info in contig_infos.items():
            contig_info["length"] = assembly_chromosome_dict[contig_id].get("length")
            contigs.append(contig_info)

        return contigs

    def _bgzip_vcf(self, vcf_filepath):
        """Compress ``vcf_filepath`` with bgzip and return the '.gz' path.

        A missing input file is only reported with print; bgzip's exit status
        is not checked.
        """
        if not os.path.exists(vcf_filepath):
            print (vcf_filepath + " does not exist")

        zip_cmd = ["bgzip", vcf_filepath]

        p = subprocess.Popen(zip_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        out, err = p.communicate()

        bgzip_file_path = vcf_filepath + ".gz"
        print (bgzip_file_path)

        return bgzip_file_path

    def _index_vcf(self, bgzip_file):
        """Build a tabix index for a bgzipped VCF and return the '.tbi' path.

        A missing input file is only reported with print; tabix's exit status
        is not checked.
        """
        bgzip_filepath = os.path.join(self.scratch, bgzip_file)
        if not os.path.exists(bgzip_filepath):
            print (bgzip_filepath + " does not exist")

        index_cmd = ["tabix", "-p", "vcf", bgzip_filepath]
        p = subprocess.Popen(index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        out, err = p.communicate()

        index_file_path = bgzip_filepath + ".tbi"

        return index_file_path

    def _index_assembly(self, assembly_file):
        """Run ``samtools faidx`` on a FASTA file and return the '.fai' path.

        A missing input file is only reported with print; the exit status of
        samtools is not checked.
        """
        if not os.path.exists(assembly_file):
            print (assembly_file + " does not exist")

        logging.info("indexing assembly file")

        assembly_index_cmd = ["samtools", "faidx", assembly_file]
        print(assembly_index_cmd)
        p = subprocess.Popen(assembly_index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        out, err = p.communicate()

        logging.info("indexing of assembly file done!")

        return assembly_file + ".fai"

    def _download_assembly(self, assembly_ref):
        """Fetch the assembly as FASTA; returns AssemblyUtil's result as is."""
        fasta_info = self.au.get_assembly_as_fasta({
            'ref': assembly_ref
        })
        return fasta_info

    def _construct_variation(self, params, contigs_info):
        """
            KBaseGwasData.Variations type spec
            /*
               Variation object data structure
                 num_genotypes - number of total genotypes within variant file
                 num_variants - number of total variants within variant file
                 contigs - list of contig ids and variant information
                 attribute_ref - KBase reference to attribute mapping workspace object
                 genome_ref - KBase reference to genome workspace object
                 assembly_ref - KBase reference to assemebly workspace object
                 vcf_handle_ref - VCF handle reference to VCF file

                 @optional genome_ref
             */
             typedef structure {
               int numgenotypes;
               int numvariants;
               list<contig_info> contigs;
               attribute_ref population; // KBaseExperiments.AttributeMapping
               genome_ref genome_ref; // KBaseGenomes.Genome
               assembly_ref assemby_ref; // KBaseGenomeAnnotations.Assembly
               vcf_handle_ref vcf_handle_ref;
             } Variations;

            :param params: KBase ui input parameters
            :param contigs_info: per-contig data from _construct_contig_info
            :return: constructed variation object (dictionary)
        """
        if not self.vcf_info['file_ref'].startswith(self.scratch):
            new_vcf_file = os.path.join(self.scratch, os.path.basename(self.vcf_info['file_ref']))
            self.vcf_info['file_ref'] = shutil.copy(self.vcf_info['file_ref'], new_vcf_file)

        # The scratch copy made by _stage_input is what gets compressed and
        # uploaded.
        vcf_staged_file = self.original_file

        bgzip_file_path = self._bgzip_vcf(vcf_staged_file)
        vcf_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': bgzip_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(bgzip_file_path, vcf_shock_file_ref)

        index_file_path = self._index_vcf(bgzip_file_path)
        vcf_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(index_file_path, vcf_index_shock_file_ref)

        assembly_file_path = self._download_assembly(self.vcf_info['assembly_ref'])['path']

        assembly_index_file_path = self._index_assembly(assembly_file_path)
        assembly_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': assembly_index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(assembly_index_file_path, assembly_index_shock_file_ref)

        variation_obj = {
            'numgenotypes': int(len(self.vcf_info['genotype_ids'])),
            'numvariants': int(self.vcf_info['total_variants']),
            'contigs': contigs_info,
            'population': params['sample_attribute_ref'],

            # TYPE SPEC CHANGE: need to change type spec to assembly_ref instead of assemby_ref
            'assemby_ref': self.vcf_info['assembly_ref'],
            'vcf_handle_ref': vcf_shock_file_ref['handle']['hid'],
            'vcf_handle': vcf_shock_file_ref['handle'],
            'vcf_index_handle_ref': vcf_index_shock_file_ref['handle']['hid'],
            'vcf_index_handle': vcf_index_shock_file_ref['handle'],
            'assembly_index_handle_ref': assembly_index_shock_file_ref['handle']['hid'],
            'assembly_index_handle': assembly_index_shock_file_ref['handle']
        }
        if 'genome_ref' in params:
            variation_obj['genome_ref'] = params['genome_ref']

        return variation_obj

    def _save_var_obj(self, params, var):
        """
        :param params: input parameters; 'workspace_name' is required,
            'variation_object_name' is optional (a uuid-based name is used
            when it is absent)
        :param var: variation object data to save
        :raises ValueError: when ``var`` is empty
        :return:
            DataFileUtils object_info:
                objid - the numerical id of the object.
                name - the name of the object.
                type - the type of the object.
                save_date - the save date of the object.
                ver - the version of the object.
                saved_by - the user that saved or copied the object.
                wsid - the id of the workspace containing the object.
                workspace - the name of the workspace containing the object.
                chsum - the md5 checksum of the object.
                size - the size of the object in bytes.
                meta - arbitrary user-supplied metadata about the object.
        """
        print('Saving Variation to workspace...\n')

        if not var:
            raise ValueError('Variation object blank, cannot not save to workspace!')

        if 'variation_object_name' not in params:
            var_obj_name = 'variation_'+str(uuid.uuid4())
        else:
            var_obj_name = params['variation_object_name']

        var_obj_info = self.dfu.save_objects({
            'id': self.dfu.ws_name_to_id(params['workspace_name']),
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': var,
                'name': var_obj_name
            }]
        })[0]

        return var_obj_info

    def _validate_sample_attribute_ref(self, params):
        """Create a sample attribute mapping when the caller supplied none.

        When 'sample_attribute_ref' is missing or empty, a mapping file is
        generated from the VCF header, uploaded, imported through GenericsAPI
        and the resulting reference is stored back into ``params``.
        """
        #params["sample_attribute_ref"] = ''  #just for testing
        # .get(): an absent key is treated like an empty value instead of
        # raising KeyError.
        if not params.get('sample_attribute_ref'):
            sample_attribute_mapping_file = os.path.join(self.scratch, "sample_attribute.tsv")   #hardcoded for testing
            self._create_sample_attribute_file(params['vcf_local_file_path'], sample_attribute_mapping_file)

            logging.info("Uploading sample attribute file to ref")
            vcf_sample_attribute_shock_file_ref = self.dfu.file_to_shock(
                {'file_path': sample_attribute_mapping_file, 'make_handle': 1}
            )
            shock_id = vcf_sample_attribute_shock_file_ref['shock_id']
            ws_id = self.dfu.ws_name_to_id(params['workspace_name'])
            import_params = {
                'input_shock_id': shock_id,
                'output_ws_id': ws_id,
                'output_obj_name': 'Sample_attribute'}

            ret = self.gapi.file_to_attribute_mapping(import_params)
            params['sample_attribute_ref'] = ret['attribute_mapping_ref']

    def import_vcf(self, params):
        """Validate, parse and save a VCF file as a Variations object.

        :param params: KBase ui input parameters
        :return: ``[object_info, variation_object_data]``
        """
        # VCF validation
        # VCF file validation
        self.validate_vcf(params)
        self._validate_sample_attribute_ref(params)
        # VCF file parsing
        self.vcf_info = self._parse_vcf_data(params)
        # Validate vcf chromosome ids against assembly chromosome ids
        self._validate_assembly_ids(params)
        # Validate vcf genotypes against sample meta data ids
        self._validate_sample_ids(params)

        # Variation object construction
        # construct contigs_info
        contigs_info = self._construct_contig_info(params)
        # construct variation
        var = self._construct_variation(params, contigs_info)

        # Save variation object to workspace
        var_wksp_obj = self._save_var_obj(params, var)

        return [var_wksp_obj, var]
class kb_ReadSim: ''' Module Name: kb_ReadSim Module Description: A KBase module: kb_ReadSim ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.0.1" GIT_URL = "https://github.com/kbasecollaborations/kb_ReadSim.git" GIT_COMMIT_HASH = "c9c0185e34d25be57cc6e1c901d8801fbc0f4784" #BEGIN_CLASS_HEADER #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.callback_url = os.environ['SDK_CALLBACK_URL'] self.shared_folder = config['scratch'] self.du = DownloadUtils(self.callback_url) self.su = SimUtils() self.ru = ReadsUtils(self.callback_url) self.vu = VariationUtil(self.callback_url) self.eu = VcfEvalUtils() self.hu = htmlreportutils() self.ws_url = config['workspace-url'] self.wsc = Workspace(self.ws_url) logging.basicConfig(format='%(created)s %(levelname)s: %(message)s', level=logging.INFO) #END_CONSTRUCTOR pass def run_kb_ReadSim(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of type "Inparams" -> structure: parameter "workspace_name" of String, parameter "input_sample_set" of String, parameter "strain_info" of String, parameter "assembly_or_genome_ref" of String, parameter "base_error_rate" of String, parameter "outer_distance" of String, parameter "standard_deviation" of String, parameter "num_read_pairs" of String, parameter "len_first_read" of String, parameter "len_second_read" of String, parameter "mutation_rate" of String, parameter "frac_indels" of String, parameter "variation_object_name" of String, parameter "output_read_object" of String :returns: instance of type 
"ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_kb_ReadSim output_dir = self.shared_folder print(params) self.su.validate_simreads_params(params) genome_or_assembly_ref = params['assembly_or_genome_ref'] obj_type = self.wsc.get_object_info3( {'objects': [{ 'ref': genome_or_assembly_ref }]})['infos'][0][2] if ('KBaseGenomes.Genome' in obj_type): genome_ref = genome_or_assembly_ref subset = self.wsc.get_object_subset([{ 'included': ['/assembly_ref'], 'ref': genome_ref }]) assembly_ref = subset[0]['data']['assembly_ref'] elif ('KBaseGenomeAnnotations.Assembly' in obj_type): assembly_ref = genome_or_assembly_ref else: raise ValueError(obj_type + ' is not the right input for this method. ' + 'Valid input include KBaseGenomes.Genome or ' + 'KBaseGenomeAnnotations.Assembly ') self.du.download_genome(assembly_ref, output_dir) ref_genome = os.path.join(self.shared_folder, "ref_genome.fa") output_fwd_paired_file_path = os.path.join(self.shared_folder, "raed1.fq") output_rev_paired_file_path = os.path.join(self.shared_folder, "raed2.fq") self.eu.check_path_exists(ref_genome) self.su.simreads(ref_genome, output_fwd_paired_file_path, output_rev_paired_file_path, params) self.eu.check_path_exists(output_fwd_paired_file_path) self.eu.check_path_exists(output_rev_paired_file_path) retVal = self.ru.upload_reads({ 'wsname': params['workspace_name'], 'name': params['output_read_object'], 'sequencing_tech': 'illumina', 'fwd_file': output_fwd_paired_file_path, 'rev_file': output_rev_paired_file_path }) logfile = os.path.join(self.shared_folder, "variant.txt") self.eu.check_path_exists(logfile) vcf_file = self.su.format_vcf(logfile) self.eu.check_path_exists(vcf_file) save_variation_params = { 'workspace_name': params['workspace_name'], 'genome_or_assembly_ref': params['assembly_or_genome_ref'], 'sample_set_ref': params['input_sample_set'], 'sample_attribute_name': 
'sample_attr',
            'vcf_staging_file_path': vcf_file,
            'variation_object_name': params['variation_object_name']
        }
        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_ReadSim

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_ReadSim return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def run_eval_variantcalling(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "Evalparams" -> structure: parameter
           "workspace_name" of String, parameter "sim_varobject_name" of
           String, parameter "calling_varobject_name" of String, parameter
           "output_var_object" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_eval_variantcalling
        # NOTE(review): the body below reads params['varobject_ref1'],
        # params['varobject_ref2'] and params['workspace_name'] (and
        # ctx['token']); these keys differ from the ones listed in the
        # docstring above - confirm against the module spec.
        print(params)
        self.eu.validate_eval_params(params)

        # All outputs of this run go into a fresh, uuid-named directory under
        # the shared folder; it is later handed to the HTML report builder.
        report_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(report_dir)

        # Workspace client authenticated with the caller's token.
        self.ws = Workspace(url=self.ws_url, token=ctx['token'])

        # Fetch only the '/sample_set_ref' field of each variation object.
        var_object_ref1 = params['varobject_ref1']
        sampleset_ref1 = self.ws.get_objects2({
            'objects': [{
                "ref": var_object_ref1,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']['sample_set_ref']

        var_object_ref2 = params['varobject_ref2']
        sampleset_ref2 = self.ws.get_objects2({
            'objects': [{
                "ref": var_object_ref2,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']['sample_set_ref']

        # Both variation objects must reference the same sample set.
        if (sampleset_ref1 != sampleset_ref2):
            raise Exception(
                "Variation objects are from different sample set\n")

        # Collect the reference each variation object was built against.
        # 'assembly_ref' takes precedence; 'genome_ref' is only looked at
        # when 'assembly_ref' is absent from the object data.
        assembly_ref_set = set()
        genomeset_ref_set = set()

        variation_obj1 = self.ws.get_objects2(
            {'objects': [{
                'ref': var_object_ref1
            }]})['data'][0]

        if 'assembly_ref' in variation_obj1['data']:
            assembly_ref1 = variation_obj1['data']['assembly_ref']
            assembly_ref_set.add(assembly_ref1)
        elif 'genome_ref' in variation_obj1['data']:
            genome_ref1 = variation_obj1['data']['genome_ref']
            genomeset_ref_set.add(genome_ref1)

        variation_obj2 = self.ws.get_objects2(
            {'objects': [{
                'ref': var_object_ref2
            }]})['data'][0]
        if 'assembly_ref' in variation_obj2['data']:
            assembly_ref2 = variation_obj2['data']['assembly_ref']
            assembly_ref_set.add(assembly_ref2)
        elif 'genome_ref' in variation_obj2['data']:
            genome_ref2 = variation_obj2['data']['genome_ref']
            genomeset_ref_set.add(genome_ref2)

        # Only referenced by the disabled block further down.
        assembly_or_genome_ref = None

        # Each check only fires when the *other* set is empty.
        # NOTE(review): if one object contributes an assembly ref and the
        # other a genome ref, both sets are non-empty and neither exception
        # is raised; if neither object has a ref, the first message
        # ("different assembly refs") is the one reported - confirm intent.
        if (not genomeset_ref_set and len(assembly_ref_set) != 1):
            raise Exception(
                "variation objects are from different assembly refs")
        elif (not assembly_ref_set and len(genomeset_ref_set) != 1):
            raise Exception("variation objects are from different genome refs")

        # Download each variation into report_dir; whatever path
        # download_variations returns is renamed onto the expected file name
        # before the VCF is indexed.
        simvarfile = os.path.join(report_dir, "simvarinat.vcf.gz")
        simvarpath = self.du.download_variations(var_object_ref1, simvarfile)

        os.rename(simvarpath, simvarfile)
        self.eu.index_vcf(simvarfile)

        callingvarfile = os.path.join(report_dir, "callingvarinat.vcf.gz")
        callingvarpath = self.du.download_variations(var_object_ref2,
                                                     callingvarfile)

        os.rename(callingvarpath, callingvarfile)
        self.eu.index_vcf(callingvarfile)

        # The evaluation result is a mapping with the keys 'unique1',
        # 'unique2' and 'common'; each value is passed to check_path_exists
        # and then to the Venn diagram plot.
        eval_results = self.eu.variant_evalation(simvarfile, callingvarfile,
                                                 report_dir)

        unique_vcf1 = eval_results['unique1']
        self.eu.check_path_exists(unique_vcf1)

        unique_vcf2 = eval_results['unique2']
        self.eu.check_path_exists(unique_vcf2)

        common_vcf = eval_results['common']
        self.eu.check_path_exists(common_vcf)

        image_path = self.eu.plot_venn_diagram(report_dir, unique_vcf1,
                                               unique_vcf2, common_vcf)
        self.eu.check_path_exists(image_path)

        # The string literal below is disabled code (a no-op expression
        # statement): it would save the unique/common VCFs as new variation
        # objects.
        '''
        if(len(assembly_ref_set) != 0):
            assembly_or_genome_ref = assembly_ref_set.pop()
        elif(len(genomeset_ref_set) != 0):
            assembly_or_genome_ref = genomeset_ref_set.pop()

        logging.info("Saving Unique1 vcf\n")
        save_unique_variation_params1 = {'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': assembly_or_genome_ref,
            'sample_set_ref': sampleset_ref1,
            'sample_attribute_name': 'sample_unique_attr1',
            'vcf_staging_file_path': unique_vcf1,
            'variation_object_name': params['output_variant_object'] + "_sample1_unique"
        }
        self.vu.save_variation_from_vcf(save_unique_variation_params1)
        logging.info("Saving done\n")

        logging.info("Saving Unique2 vcf\n")
        save_unique_variation_params2 = {'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': assembly_or_genome_ref,
            'sample_set_ref': sampleset_ref1,
            'sample_attribute_name': 'sample_unique_attr2',
            'vcf_staging_file_path': unique_vcf2,
            'variation_object_name': params['output_variant_object'] + "_sample2_unique"
        }
        self.vu.save_variation_from_vcf(save_unique_variation_params2)
        logging.info("Saving done\n")

        logging.info("Saving Common vcf\n")
        save_common_variation_params = {'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': assembly_or_genome_ref,
            'sample_set_ref': sampleset_ref1,
            'sample_attribute_name': 'sample_common_attr',
            'vcf_staging_file_path': common_vcf,
            'variation_object_name': params['output_variant_object'] + "_sample1_sample2_common"
        }
        self.vu.save_variation_from_vcf(save_common_variation_params)
        logging.info("Saving done\n")
        '''

        # Build the HTML report from everything written to report_dir.
        workspace = params['workspace_name']
        output = self.hu.create_html_report(self.callback_url, report_dir,
                                            workspace)
        #END run_eval_variantcalling

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_eval_variantcalling return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        # Returns a one-element list holding the module state together with
        # the class-level VERSION / GIT_URL / GIT_COMMIT_HASH values.
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class VCFToVariation:
    """Build the data of a KBase Variations object from a gzip-compressed VCF.

    The class parses the VCF header and records, checks that every contig id
    used in the VCF exists in the referenced assembly, uploads the VCF and its
    index to shock and assembles the resulting object dictionary.
    """

    # Header categories (``##<CATEGORY>=<...>`` lines) collected while parsing.
    _HEADER_CATEGORIES = ("INFO", "FORMAT", "FILTER")

    def __init__(self, Config):
        """Create the service clients.

        :param Config: mapping providing the keys ``scratch`` and ``ws_url``.
            The callback URL is read from the ``SDK_CALLBACK_URL``
            environment variable.
        """
        self.scratch = Config['scratch']
        ws_url = Config['ws_url']
        callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(callback_url)
        self.wsc = Workspace(ws_url)
        self.au = AssemblyUtil(callback_url)
        self.vcf_info = dict()

    def _parse_header(self, record, category):
        """Parse one structured VCF header line into a flat dictionary.

        Header lines look like::

            ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
            ##FILTER=<ID=q10,Description="Quality below 10">
            ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">

        :param record: the raw header line
        :param category: label stored under the ``"Category"`` key
            (``"INFO"``, ``"FORMAT"`` or ``"FILTER"`` for the callers here)
        :return: dict with ``"Category"`` plus one string entry per
            ``key=value`` field of the line; double quotes are stripped and
            commas that appear inside a quoted value are dropped
        """
        returninfo = {"Category": category}
        # Drop every comma that sits inside double quotes (a comma followed
        # by an odd number of quotes) so the later split on "," only sees
        # field separators.
        record = re.sub(r'(?!(([^"]*"){2})*[^"]*$),', '', record)
        # Strip the line ending, then the closing ">" character.
        record = record.rstrip()[:-1]
        # Remove the leading "##<CATEGORY>=<" prefix.
        info = re.sub(".*=<", "", record)
        for field in info.replace('"', '').rstrip().split(","):
            # Split on the first "=" only: values may contain "=" themselves.
            key, _, val = field.partition("=")
            returninfo[key] = val
        return returninfo

    def parse_vcf_data(self, vcf_filepath):
        """Parse a gzip-compressed VCF file, header lines included.

        :param vcf_filepath: path of the gzip-compressed VCF file
        :return: dict with the keys ``version`` (value of the
            ``##fileformat`` line), ``contigs`` (contig id ->
            ``{'contig_id', 'totalvariants'}``), ``total_variants``,
            ``genotype_ids`` (columns 10+ of the ``#CHROM`` line),
            ``chromosome_ids`` (contig ids in order of first appearance),
            ``variation_details`` (a ``VCFReaderStream`` over the file),
            ``file_ref`` (the input path) and ``header`` (list of parsed
            INFO/FORMAT/FILTER header lines)
        """
        version = ""
        genotypes = []
        chromosomes = []
        contigs = {}
        header = []
        totalvars = 0

        # The context manager guarantees the file handle is released even
        # when parsing fails half way through.
        with gzip.open(vcf_filepath, "rt") as reader:
            for record in reader:
                # Header lines: collect the metadata and move on.
                if record.startswith("#"):
                    if record.startswith("##fileformat"):
                        version = record.replace("##fileformat=",
                                                 "").rstrip()
                    for category in self._HEADER_CATEGORIES:
                        if record.startswith(f"##{category}=<"):
                            header.append(
                                self._parse_header(record, category))
                            break
                    if record.startswith("#CHROM"):
                        # Column header line: sample/genotype ids start at
                        # the tenth tab-separated column.
                        genotypes = record.rstrip().split("\t")[9:]
                    continue

                # A blank line is not a variant record; counting it would
                # register a bogus contig id.
                if not record.strip():
                    continue

                # Variant record: only the first column (CHROM) is needed.
                chrom = record.split("\t", 1)[0]
                totalvars += 1
                if chrom in contigs:
                    contigs[chrom]['totalvariants'] += 1
                else:
                    chromosomes.append(chrom)
                    contigs[chrom] = {'contig_id': chrom, 'totalvariants': 1}

        variation_details = VCFReaderStream(vcf_filepath)

        return {
            'version': version,
            'contigs': contigs,
            'total_variants': totalvars,
            'genotype_ids': genotypes,
            'chromosome_ids': chromosomes,
            'variation_details': variation_details,
            'file_ref': vcf_filepath,
            'header': header
        }

    def _validate_vcf_to_sample(self, vcf_genotypes, sample_ids):
        """Check that every VCF genotype id is among the sample ids.

        Ids are compared after upper-casing and stripping whitespace.

        :return: ``True`` when every genotype was found, otherwise the list
            of normalised genotype ids that are missing from ``sample_ids``
        """
        known_ids = {x.upper().strip() for x in sample_ids}
        genos_not_found = [
            geno for geno in (x.upper().strip() for x in vcf_genotypes)
            if geno not in known_ids
        ]
        if genos_not_found:
            return genos_not_found
        return True

    def _chk_if_vcf_ids_in_assembly(self, vcf_chromosomes,
                                    assembly_chromosomes):
        """Check if all chromosome ids in the VCF are present in the assembly.

        :param vcf_chromosomes: chromosome ids found in the VCF
        :param assembly_chromosomes: chromosome ids of the assembly
        :return: ``True`` when nothing is missing, otherwise the list of
            chromosome ids present in the VCF but absent from the assembly
        """
        chromos_not_in_assembly = [
            chromo for chromo in vcf_chromosomes
            if chromo not in assembly_chromosomes
        ]
        if chromos_not_in_assembly:
            return chromos_not_in_assembly
        return True

    def _validate_assembly_ids(self, vcf_info):
        """Validate the VCF chromosome ids against the assembly.

        All chromosome ids from the VCF must be in the assembly, but not all
        assembly chromosome ids need to be in the VCF.

        :param vcf_info: dict providing ``assembly_ref`` and
            ``chromosome_ids``
        :return: the assembly chromosome ids (keys of the assembly's
            ``contigs`` mapping)
        :raises ValueError: when a VCF contig id is not in the assembly
        """
        assembly_subset = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': vcf_info['assembly_ref']
        }])
        assembly_chromosomes = assembly_subset[0]['data']['contigs'].keys()

        chk_assembly_ids = self._chk_if_vcf_ids_in_assembly(
            vcf_info['chromosome_ids'], assembly_chromosomes)

        # The helper returns True on success and a list of offenders on
        # failure.
        if isinstance(chk_assembly_ids, list):
            failed_ids = ' '.join(chk_assembly_ids)
            message = (
                f'VCF contig ids: {failed_ids} are not present in assembly.')
            print(message)
            raise ValueError(message)

        return assembly_chromosomes

    def _construct_contig_info(self, vcf_info):
        """Attach the assembly contig length to each VCF contig entry.

        From the KBaseGwasData.Variations type spec::

            /*
               Contig variation data
                 contig_id - contig identifier
                 totalvariants - total number of variants in each contig
                 length - length of contig from assembly data
            */
            typdef structure {
                string contig_id;
                int totalvariants;
                int length; // from assembly
            } contig_info;

        The entries of ``vcf_info['contigs']`` are updated in place.

        :param vcf_info: dict providing ``assembly_ref`` and ``contigs``
        :return: list of the (updated) contig entries
        """
        assembly_chromosome_dict = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': vcf_info['assembly_ref']
        }])[0]['data']['contigs']

        contigs = []
        for contig_id, contig_info in vcf_info['contigs'].items():
            contig_info["length"] = (
                assembly_chromosome_dict[contig_id].get("length"))
            contigs.append(contig_info)
        return contigs

    def _construct_variation_object_json(self, vcf_info):
        """Upload the VCF files and assemble the variation object data.

        KBaseGwasData.Variations type spec::

            /*
               Variation object data structure
                 num_genotypes - number of total genotypes within variant file
                 num_variants - number of total variants within variant file
                 contigs - list of contig ids and variant information
                 attribute_ref - KBase reference to attribute mapping workspace object
                 genome_ref - KBase reference to genome workspace object
                 assembly_ref - KBase reference to assemebly workspace object
                 vcf_handle_ref - VCF handle reference to VCF file
                 @optional genome_ref
            */
            typedef structure {
                int numgenotypes;
                int numvariants;
                list<contig_info> contigs;
                attribute_ref samples; // KBaseExperiments.AttributeMapping
                genome_ref genome_ref; // KBaseGenomes.Genome
                assembly_ref assembly_ref; // KBaseGenomeAnnotations.Assembly
                vcf_handle_ref vcf_handle_ref;
            } Variations;

        :param vcf_info: parsed VCF information extended with
            ``vcf_compressed``, ``vcf_index``, ``contigs_info`` and
            ``assembly_ref`` (``genome_ref`` and ``sample_attribute_ref`` are
            copied over when present)
        :return: constructed variation object (dictionary)
        :raises FileNotFoundError: when the compressed VCF or its index does
            not exist on disk
        """
        # Both handles are required by the object built below, so fail early
        # with a clear message (and before anything is uploaded) instead of
        # crashing later on a missing upload result.
        for key, label in (('vcf_compressed', 'Compressed VCF file'),
                           ('vcf_index', 'VCF index file')):
            if not os.path.exists(vcf_info[key]):
                raise FileNotFoundError(
                    f'{label} not found: {vcf_info[key]}')

        logging.info("Uploading VCF file to shock")
        vcf_shock_file_ref = self.dfu.file_to_shock({
            'file_path': vcf_info['vcf_compressed'],
            'make_handle': 1
        })

        logging.info("Uploading VCF index file to shock")
        vcf_index_shock_file_ref = self.dfu.file_to_shock({
            'file_path': vcf_info['vcf_index'],
            'make_handle': 1
        })

        # TODO: remove any reference to samples in this file
        variation_obj_data = {
            'numgenotypes': int(len(vcf_info['genotype_ids'])),
            'numvariants': int(vcf_info['total_variants']),
            'contigs': vcf_info['contigs_info'],
            "header": vcf_info['header'],
            "variation_details": vcf_info['variation_details'],
            'assembly_ref': vcf_info['assembly_ref'],
            'vcf_handle_ref': vcf_shock_file_ref['handle']['hid'],
            'vcf_handle': vcf_shock_file_ref['handle'],
            'vcf_index_handle_ref':
                vcf_index_shock_file_ref['handle']['hid'],
            'vcf_index_handle': vcf_index_shock_file_ref['handle'],
        }
        # Optional references are only copied when the caller supplied them.
        for optional_key in ('genome_ref', 'sample_attribute_ref'):
            if optional_key in vcf_info:
                variation_obj_data[optional_key] = vcf_info[optional_key]

        return variation_obj_data

    def generate_variation_object_data(self, params):
        """Parse, validate and package a VCF as variation object data.

        :param params: dict with the keys ``vcf_compressed`` (path of the
            gzip-compressed VCF), ``vcf_index`` (path of its index),
            ``assembly_ref`` and optionally ``genome_ref``
        :return: the variation object dictionary built by
            ``_construct_variation_object_json``
        :raises ValueError: when a VCF contig id is missing from the assembly
        """
        logging.info("Parsing vcf started")
        vcf_info = self.parse_vcf_data(params['vcf_compressed'])

        vcf_info['vcf_compressed'] = params['vcf_compressed']
        vcf_info['vcf_index'] = params['vcf_index']
        vcf_info['assembly_ref'] = params['assembly_ref']
        if 'genome_ref' in params:
            vcf_info['genome_ref'] = params['genome_ref']

        # Validate vcf chromosome ids against assembly chromosome ids;
        # a mismatch raises ValueError.
        logging.info("Comparing assembly ids")
        self._validate_assembly_ids(vcf_info)

        # Always build the contig info: the object construction below needs
        # 'contigs_info' even when the assembly lists no contigs.
        logging.info("Creating contig info")
        vcf_info['contigs_info'] = self._construct_contig_info(vcf_info)

        return self._construct_variation_object_json(vcf_info)
class kb_GATK:
    '''
    Module Name:
    kb_GATK

    Module Description:
    A KBase module: kb_GATK
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/kb_GATK.git"
    GIT_COMMIT_HASH = "5e6e4bdca9a7749bba0abab081736c56007212ed"

    #BEGIN_CLASS_HEADER
    def _resolve_assembly_ref(self, genome_or_assembly_ref):
        """Return the assembly reference behind a genome or assembly object.

        The workspace type of the object decides the outcome: for a
        KBaseGenomes.Genome the '/assembly_ref' field of the object is
        returned, for a KBaseGenomeAnnotations.Assembly the reference itself.

        :raises ValueError: for any other object type
        """
        object_infos = self.wsc.get_object_info3(
            {'objects': [{'ref': genome_or_assembly_ref}]})
        obj_type = object_infos['infos'][0][2]

        if 'KBaseGenomes.Genome' in obj_type:
            genome_subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_or_assembly_ref
            }])
            return genome_subset[0]['data']['assembly_ref']

        if 'KBaseGenomeAnnotations.Assembly' in obj_type:
            return genome_or_assembly_ref

        raise ValueError(obj_type +
                         ' is not the right input for this method. ' +
                         'Valid input include KBaseGenomes.Genome or ' +
                         'KBaseGenomeAnnotations.Assembly ')
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        # Locations and endpoints come from the environment and the config.
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']

        # Clients and helpers, created in the same order as before.
        self.wsc = Workspace(self.ws_url)
        self.gu = GATKUtils()
        logging.basicConfig(
            format='%(created)s %(levelname)s: %(message)s',
            level=logging.INFO)
        self.vu = VariationUtil(self.callback_url)
        self.du = DownloadAlignmentUtils(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def run_kb_GATK(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_GATK
        # Download the read alignment; the SAM file is expected inside the
        # directory reported by the download helper.
        alignment_ref = params['alignment_ref']
        alignment_info = self.du.downloadreadalignment(
            alignment_ref, params, self.callback_url)
        sam_file = os.path.join(alignment_info['destination_dir'],
                                "reads_alignment.sam")

        # TODO: read sample set and sample strains information.
        #
        # Disabled legacy snippet: it appended `-filter-name "<X>_filter"
        # -filter "<X> < <threshold>"` arguments to a command list for
        # X in QD, FS, MQ, SOR, MQRankSum and ReadPosRankSum, taking the
        # thresholds from params['snp_filter'] (keys snp_qd_filter,
        # snp_fs_filter, snp_mq_filter, snp_sor_filter,
        # snp_mqrankSum_filter, snp_readposranksum_filter).

        print(params)
        strain_info = params['strain_info']

        # Every intermediate file of this run lives in its own directory.
        output_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(output_dir)

        assembly_ref = self._resolve_assembly_ref(
            params['assembly_or_genome_ref'])
        downloaded_genome = self.du.download_genome(assembly_ref, output_dir)
        assembly_file = downloaded_genome['path']

        # TODO: check time for building index file or download from cache.
        # TODO: discuss which cache_id to use.
        # TODO: when a genome was copied, find a way to get the original
        #       genome (ref id) so the original cache id can be used.
        self.gu.build_genome(assembly_file)
        self.gu.index_assembly(assembly_file)
        self.gu.generate_sequence_dictionary(assembly_file)
        self.gu.duplicate_marking(output_dir, sam_file)
        # self.gu.sort_bam_index(output_dir)
        self.gu.collect_alignment_and_insert_size_metrics(
            assembly_file, output_dir)
        # self.gu.analyze_covariates(output_dir)

        # TODO: avoid writing intermediate files to save space and I/O time.
        self.gu.variant_calling(assembly_file, output_dir)
        self.gu.extract_variants(assembly_file, output_dir)
        self.gu.filter_SNPs(assembly_file, "filtered_snps.vcf",
                            output_dir, params)
        self.gu.filter_Indels(assembly_file, "filtered_indels.vcf",
                              output_dir, params)
        self.gu.exclude_filtered_variants(output_dir)

        # Two recalibration rounds: compute the table, then apply it.
        for recal_table in ("recal_data.table", "post_recal_data.table"):
            self.gu.base_quality_score_recalibration(
                assembly_file, recal_table, output_dir)
            self.gu.apply_BQSR(assembly_file, recal_table, output_dir)

        self.gu.filter_SNPs(assembly_file, "filtered_snps_final.vcf",
                            output_dir, params)
        # TODO: also save indels using VariationUtils, or merge them with
        #       the SNPs, sort by chr & pos and save using VariationUtils.
        # TODO: get an example for saving structural variants (especially
        #       CNV) and compare with standard vcf output.
        self.gu.filter_Indels(assembly_file, "filtered_indels_final.vcf",
                              output_dir, params)

        # Disabled workaround (to be removed after fixing VariationUtils):
        # it rebuilt <output_dir>/sample.vcf from filtered_snps_final.vcf
        # with grep, keeping the '##fileformat' line plus every line that
        # does not contain '##'.

        final_snps_vcf = f"{output_dir}/filtered_snps_final.vcf"
        indexed_vcf = self.gu.index_vcf_file(final_snps_vcf)
        reheader_vcf_file = self.gu.reheader(indexed_vcf, strain_info)

        # TODO: check existence of the final filtered SNPs file.
        # TODO: change assembly_or_genome_ref to genome_or_assembly_ref.
        # TODO: derive sample_attribute_name from the sample set ref by
        #       prefixing/suffixing. Attribute mapping should have one sample.
        self.vu.save_variation_from_vcf({
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['assembly_or_genome_ref'],
            'sample_set_ref': params['input_sample_set'],
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': reheader_vcf_file,
            'variation_object_name': params['variation_object_name']
        })

        # Minimal text-only report in the caller's workspace.
        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create({
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_GATK

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_GATK return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        """Return the module state and version information as a 1-item list."""
        #BEGIN_STATUS
        module_status = dict(
            state="OK",
            message="",
            version=self.VERSION,
            git_url=self.GIT_URL,
            git_commit_hash=self.GIT_COMMIT_HASH,
        )
        #END_STATUS
        return [module_status]