class DownloadUtils:
    """Thin wrapper around KBase download services (assembly, variation, genome)."""

    def __init__(self, callbackURL=None):
        # Honor a caller-supplied callback URL and fall back to the SDK
        # environment variable. The original accepted callbackURL but
        # discarded it, always reading the env var.
        self.callbackURL = callbackURL or os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callbackURL)
        self.vu = VariationUtil(self.callbackURL)
        self.gfu = GenomeFileUtil(self.callbackURL)

    def download_genome(self, genomeref, output_dir):
        '''
        Download the genome's assembly as a FASTA file.

        :param genomeref: workspace reference of the genome
        :param output_dir: directory in which "ref_genome.fa" is written
        :return: file-info dict returned by AssemblyUtil.get_assembly_as_fasta
        '''
        file = self.au.get_assembly_as_fasta({
            'ref': genomeref,
            'filename': os.path.join(output_dir, "ref_genome.fa")
        })
        return file

    def get_variation(self, variation_ref):
        '''
        Download a variation object as a VCF file.

        :param variation_ref: workspace reference of the variation object
        :return: path of the downloaded VCF file
        '''
        filepath = self.vu.get_variation_as_vcf(
            {'variation_ref': variation_ref})['path']
        return filepath

    def get_gff(self, genome_ref):
        '''
        Download a genome as a GFF file.

        :param genome_ref: workspace reference of the genome
        :return: gff file path
        '''
        file = self.gfu.genome_to_gff({'genome_ref': genome_ref})
        return file['file_path']

    def get_assembly(self, assembly_ref, output_dir):
        '''
        Download an assembly as a FASTA file.

        :param assembly_ref: workspace reference of the assembly
        :param output_dir: directory in which "ref_genome.fa" is written
        :return: assembly file path
        '''
        file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref,
            'filename': os.path.join(output_dir, "ref_genome.fa")
        })
        return file['path']
class downloaddatautils:
    """Download helpers for genome GFF and variation VCF exports."""

    def __init__(self):
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        self.gfu = GenomeFileUtil(self.callbackURL)
        self.vfu = VariationUtil(self.callbackURL)

    def download_genome(self, params):
        """Download the genome referenced by params['gff_ref'] as GFF.

        :param params: dict with key 'gff_ref' (genome workspace reference)
        :return: GenomeFileUtil.genome_to_gff result (file-info dict)
        """
        file = self.gfu.genome_to_gff({'genome_ref': params['gff_ref']})
        return file

    def download_vcf(self, params):
        """Export the variation referenced by params['vcf_ref'] as VCF.

        :param params: dict with key 'vcf_ref' (variation workspace reference);
            mutated in place to add 'input_var_ref' for the service call.
        """
        params['input_var_ref'] = params['vcf_ref']
        # BUG FIX: __init__ stores the client as self.vfu; the original
        # referenced self.vu here, raising AttributeError on every call.
        self.vfu.export_variation_as_vcf(params)
# NOTE(review): this is a second class named DownloadUtils; if both
# definitions live in the same module, this one shadows the earlier one —
# confirm they are meant to be in separate modules.
class DownloadUtils:
    """Downloads GFF and assembly files into the snpEff data directory layout."""

    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)

    def get_gff(self, genome_ref, output_dir):
        '''
        Download the GFF file for a genome.

        :param genome_ref: workspace reference of the genome
        :param output_dir: base output directory (snpEff data dir is below it)
        :return: path of the downloaded GFF file
        '''
        # Build the path with os.path.join segments instead of concatenating
        # a slash-separated suffix onto output_dir.
        gff_filename = os.path.join(output_dir, "snp_eff", "data",
                                    "kbase_v1", "gene.gff")
        file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'filename': gff_filename
        })
        return file['file_path']

    def get_assembly(self, assembly_ref, output_dir):
        '''
        Download the assembly FASTA file.

        :param assembly_ref: workspace reference of the assembly
        :param output_dir: base output directory (snpEff data dir is below it)
        :return: path of the downloaded FASTA file
        '''
        assembly_filename = os.path.join(output_dir, "snp_eff", "data",
                                         "kbase_v1", "sequences.fa")
        file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref,
            'filename': assembly_filename
        })
        return file['path']
class QualiMapRunner:
    """Runs QualiMap bamqc / multi-bamqc on KBase alignments and packages reports."""

    QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap'
    JAVA_MEM_DEFAULT_SIZE = '16G'
    LARGE_BAM_FILE_SIZE = 20 * 1024 * 1024 * 1024  # 20 GB
    TIMEOUT = 72 * 60 * 60  # 72 hours

    def _get_file_size(self, file_path):
        """Return the size of file_path in bytes (also printed for the job log)."""
        file_size = os.path.getsize(file_path)
        print('File size: {} -- {}'.format(file_size, file_path))
        return file_size

    def _large_file(self, file_path):
        """Return how many multiples of LARGE_BAM_FILE_SIZE the input spans.

        For a .txt multi-sample config, sums the BAM sizes listed in its second
        tab-separated column; otherwise sizes the file itself. A result of 0
        means "not large".
        """
        filename, file_extension = os.path.splitext(file_path)
        multiplier = 0
        if file_extension == '.txt':
            total_file_size = 0
            with open(file_path, 'r') as f:
                for line in f:
                    bam_file_path = line.split('\t')[1]
                    total_file_size += self._get_file_size(bam_file_path)
            print('Total file size: {}'.format(total_file_size))
            multiplier = int(total_file_size) // int(self.LARGE_BAM_FILE_SIZE)
        else:
            multiplier = int(self._get_file_size(file_path)) // int(
                self.LARGE_BAM_FILE_SIZE)
        print('setting number of windows multiplier to: {}'.format(multiplier))
        return multiplier

    def _timeout_handler(self, signum, frame):
        """SIGALRM handler: abort a QualiMap run that exceeded TIMEOUT."""
        print('Signal handler called with signal', signum)
        raise ValueError('QualiMap takes too long')

    def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
        self.scratch_dir = scratch_dir
        self.rau = ReadsAlignmentUtils(callback_url)
        self.kbr = KBaseReport(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        self.set_api = SetAPI(srv_wiz_url)
        self.ws = Workspace(workspace_url)
        self.valid_commands = ['bamqc', 'multi-bamqc']

    def run_app(self, params):
        """Entry point: validate params, run single or multi QC under a timeout.

        On any failure an empty placeholder HTML report is packaged instead, so
        the app still produces a (marked-as-failed) report object.

        :param params: dict with 'input_ref', optional 'create_report' and
            'output_workspace'
        :return: result dict with qc folder/zip info and, when requested,
            report name/ref
        """
        self.validate_params(params)
        print('Validated Params = ')
        pprint(params)
        run_info = self.get_run_info(params)
        if run_info.get('mode') not in ['single', 'multi']:
            raise ValueError(
                'Error in fetching the type to determine run settings.')
        run_error = False
        try:
            # Abort runaway QualiMap runs via SIGALRM after TIMEOUT seconds.
            signal.signal(signal.SIGALRM, self._timeout_handler)
            signal.alarm(self.TIMEOUT)
            if run_info['mode'] == 'single':
                result = self.run_bamqc(params['input_ref'],
                                        run_info['input_info'])
            elif run_info['mode'] == 'multi':
                result = self.run_multi_sample_qc(params['input_ref'],
                                                  run_info['input_info'])
            signal.alarm(0)
        except Exception:
            # Best-effort fallback: emit an empty HTML report so downstream
            # report creation still succeeds; the error is logged below.
            run_error = True
            workdir = os.path.join(self.scratch_dir,
                                   'qualimap_' + str(int(time.time() * 10000)))
            os.makedirs(workdir)
            with open(os.path.join(workdir, 'qualimapReport.html'),
                      'w') as report:
                report.write('<html><body><p></p></body></html>')
            package_info = self.package_output_folder(
                workdir, 'QualiMap_report',
                'EMPTY HTML report directory for QualiMap BAM QC',
                'qualimapReport.html')
            result = {
                'qc_result_folder_path': workdir,
                'qc_result_zip_info': package_info,
                'shock_id': None
            }
            error_msg = 'Running QualiMap returned an error:\n{}\n'.format(
                traceback.format_exc())
            error_msg += 'Generating simple report instead\n'
            print(error_msg)
        if params['create_report']:
            result = self.create_report(result, params['output_workspace'],
                                        run_error, params['input_ref'])
        return result

    def create_report(self, result, output_workspace, run_error=None,
                      input_ref=None):
        """Create a KBaseReport for the QC result.

        When run_error is truthy, an almost-empty report is created that only
        lists the input objects; otherwise the packaged HTML report is linked.

        :param result: result dict from run_bamqc / run_multi_sample_qc
        :param output_workspace: workspace name for the report object
        :param run_error: whether the QC run failed
        :param input_ref: original input reference (needed for error reports)
        :return: result dict augmented with 'report_name' and 'report_ref'
        """
        if run_error:
            objects_created = []
            info = self.get_obj_info(input_ref)
            obj_type = self.get_type_from_obj_info(info)
            if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'Alignment'
                })
            if obj_type in [
                    'KBaseRNASeq.RNASeqAlignmentSet',
                    'KBaseSets.ReadsAlignmentSet'
            ]:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'AlignmentSet'
                })
                reads_alignment_info = self.get_alignments_from_set(input_ref)
                for alignment in reads_alignment_info:
                    alignment_ref = alignment.get('ref')
                    objects_created.append({
                        'ref': alignment_ref,
                        'description': 'Alignment'
                    })
            report_info = self.kbr.create_extended_report({
                'message': ' ',
                'objects_created': objects_created,
                'report_object_name': 'qualimap_report' + str(uuid.uuid4()),
                'workspace_name': output_workspace
            })
            result['report_name'] = report_info['name']
            result['report_ref'] = report_info['ref']
            return result
        qc_result_zip_info = result['qc_result_zip_info']
        report_info = self.kbr.create_extended_report({
            'message': '',
            'objects_created': [],
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'report_object_name': 'qualimap_report' + str(uuid.uuid4()),
            'workspace_name': output_workspace
        })
        result['report_name'] = report_info['name']
        result['report_ref'] = report_info['ref']
        return result

    def get_gtf_file(self, input_ref, set_op=False):
        """Fetch the GTF annotation for the genome behind an alignment (set).

        :param input_ref: alignment ref, or alignment-set ref when set_op=True
            (the set's first item is used to locate the genome)
        :param set_op: whether input_ref is an alignment set
        :return: path of the generated GTF file
        :raises ValueError: if the alignment has no associated genome
        """
        print('Start fetching GFF file from genome')
        if set_op:
            set_data = self.set_api.get_reads_alignment_set_v1({
                'ref': input_ref,
                'include_item_info': 1
            })
            input_ref = set_data['data']['items'][0]['ref']
        obj_data = self.dfu.get_objects({"object_refs":
                                         [input_ref]})['data'][0]['data']
        genome_ref = obj_data.get('genome_id')
        if not genome_ref:
            raise ValueError(
                'Alignment is not associated with a Genome object')
        result_directory = os.path.join(self.scratch_dir, str(uuid.uuid4()))
        os.makedirs(result_directory)
        genome_gtf_file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'is_gtf': True,
            'target_dir': result_directory
        })['file_path']
        return genome_gtf_file

    def run_bamqc(self, input_ref, input_info):
        """Run QualiMap bamqc on a single alignment and package the HTML report."""
        # download the input and setup a working dir
        alignment_info = self.rau.download_alignment({'source_ref': input_ref})
        bam_file_path = self.find_my_bam_file(
            alignment_info['destination_dir'])
        try:
            gtf_file = self.get_gtf_file(input_ref)
        except Exception:
            # Annotation is optional; run without -gff if it can't be fetched.
            gtf_file = ''
        workdir = os.path.join(self.scratch_dir,
                               'qualimap_' + str(int(time.time() * 10000)))
        options = [
            '-bam', bam_file_path, '-c', '-outdir', workdir, '-outformat',
            'html'
        ]
        if gtf_file:
            options += ['-gff', gtf_file]
        options.append('--java-mem-size={}'.format(
            self.JAVA_MEM_DEFAULT_SIZE))  # always use large mem
        multiplier = self._large_file(bam_file_path)
        if multiplier:
            window_size = multiplier * 400
            print(f'using larger window size: {window_size} and Java memory: '
                  f'{self.JAVA_MEM_DEFAULT_SIZE}')
            # BUG FIX: flag and value must be separate argv elements. The
            # original appended one '-nw <n>' string, which Popen(shell=False)
            # passes unsplit, so QualiMap never parsed the option.
            options.extend(['-nw', str(window_size)])
        self.run_cli_command('bamqc', options)
        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap BAM QC', 'qualimapReport.html')
        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def run_multi_sample_qc(self, input_ref, input_info):
        """Run QualiMap multi-bamqc on an alignment set and package the report."""
        # download the input and setup a working dir
        reads_alignment_info = self.get_alignments_from_set(input_ref)
        try:
            gtf_file = self.get_gtf_file(input_ref, set_op=True)
        except Exception:
            # Annotation is optional; run without -gff if it can't be fetched.
            gtf_file = ''
        suffix = 'qualimap_' + str(int(time.time() * 10000))
        workdir = os.path.join(self.scratch_dir, suffix)
        os.makedirs(workdir)
        input_file_path = self.create_multi_qualimap_cfg(
            reads_alignment_info, workdir)
        options = [
            '-d', input_file_path, '-r', '-c', '-outdir', workdir,
            '-outformat', 'html'
        ]
        if gtf_file:
            options += ['-gff', gtf_file]
        multiplier = self._large_file(input_file_path)
        if multiplier:
            window_size = multiplier * 400
            print(f'using larger window size: {window_size} and Java memory: '
                  f'{self.JAVA_MEM_DEFAULT_SIZE}')
            # BUG FIX: separate argv elements (see run_bamqc).
            options.extend(['-nw', str(window_size)])
        options.append(f'--java-mem-size={self.JAVA_MEM_DEFAULT_SIZE}')
        self.run_cli_command('multi-bamqc', options)
        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap Multi-sample BAM QC',
            'multisampleBamQcReport.html')
        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def get_alignments_from_set(self, alignment_set_ref):
        """Download every alignment in a set.

        :param alignment_set_ref: reads-alignment-set workspace reference
        :return: list of dicts with bam_file_path, ref, label and info
        """
        set_data = self.set_api.get_reads_alignment_set_v1({
            'ref': alignment_set_ref,
            'include_item_info': 1
        })
        items = set_data['data']['items']
        reads_alignment_data = []
        for alignment in items:
            alignment_info = self.rau.download_alignment(
                {'source_ref': alignment['ref']})
            bam_file_path = self.find_my_bam_file(
                alignment_info['destination_dir'])
            label = None
            if 'label' in alignment:
                label = alignment['label']
            reads_alignment_data.append({
                'bam_file_path': bam_file_path,
                'ref': alignment['ref'],
                'label': label,
                'info': alignment['info']
            })
        return reads_alignment_data

    def create_multi_qualimap_cfg(self, reads_alignment_info, workdir):
        """Write the tab-separated multi-bamqc config file.

        Columns: sample name (deduplicated with a numeric suffix), BAM path,
        and — only when at least one alignment carries a label — the label (or
        'unlabeled').

        :return: path of the written config file
        """
        # Group by labels if there is at least one defined
        use_labels = False
        for alignment in reads_alignment_info:
            if alignment['label']:
                use_labels = True
                break
        # write the file
        input_file_path = os.path.join(workdir, 'multi_input.txt')
        input_file = open(input_file_path, 'w')
        name_lookup = {}
        for alignment in reads_alignment_info:
            name = alignment['info'][1]
            if name in name_lookup:
                name_lookup[name] += 1
                name = name + '_' + str(name_lookup[name])
            else:
                name_lookup[name] = 1
            input_file.write(name + '\t' + alignment['bam_file_path'])
            if use_labels:
                if alignment['label']:
                    input_file.write('\t' + alignment['label'])
                else:
                    input_file.write('\tunlabeled')
            input_file.write('\n')
        input_file.close()
        return input_file_path

    def get_run_info(self, params):
        """Decide single vs multi mode from the input object's type.

        :return: dict with 'mode' ('single'|'multi') and 'input_info'
        :raises ValueError: for unsupported object types
        """
        info = self.get_obj_info(params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            return {'mode': 'single', 'input_info': info}
        if obj_type in [
                'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet'
        ]:
            return {'mode': 'multi', 'input_info': info}
        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def validate_params(self, params):
        """Validate params in place; normalizes params['create_report'] to bool.

        :raises ValueError: if 'input_ref' is missing, or 'create_report' is
            set without a usable 'output_workspace'
        """
        if 'input_ref' not in params:
            raise ValueError(
                'required parameter field "input_ref" was not set')
        create_report = False
        if 'create_report' in params:
            if int(params['create_report']) == 1:
                if 'output_workspace' not in params:
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                if not params['output_workspace']:
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                create_report = True
        params['create_report'] = create_report

    def run_cli_command(self, command, options, cwd=None):
        """Run a whitelisted QualiMap subcommand via subprocess.

        :raises ValueError: for unknown commands or nonzero exit codes
        """
        if command not in self.valid_commands:
            raise ValueError('Invalid QualiMap command: ' + str(command))
        command = [self.QUALIMAP_PATH, command] + options
        print('Running: ' + ' '.join(command))
        if not cwd:
            cwd = self.scratch_dir
        p = subprocess.Popen(command, cwd=cwd, shell=False)
        exitCode = p.wait()
        if exitCode == 0:
            print('Success, exit code was: ' + str(exitCode))
        else:
            raise ValueError('Error running command: ' + ' '.join(command) +
                             '\n' + 'Exit Code: ' + str(exitCode))

    def find_my_bam_file(self, dirpath):
        """Return the path of the single .bam file in dirpath.

        :raises ValueError: if zero or more than one BAM file is present
        """
        bam_path = None
        for f in os.listdir(dirpath):
            fullpath = os.path.join(dirpath, f)
            if os.path.isfile(fullpath) and f.lower().endswith('.bam'):
                if bam_path is not None:
                    raise ValueError(
                        'Error! Too many BAM files were downloaded for this alignment!'
                    )
                bam_path = fullpath
        if bam_path is None:
            raise ValueError(
                'Error! No BAM files were downloaded for this alignment!')
        return bam_path

    def package_output_folder(self, folder_path, zip_file_name,
                              zip_file_description, index_html_file):
        """ Simple utility for packaging a folder and saving to shock """
        output = self.dfu.file_to_shock({
            'file_path': folder_path,
            'make_handle': 0,
            'pack': 'zip'
        })
        return {
            'shock_id': output['shock_id'],
            'name': zip_file_name,
            'description': zip_file_description,
            'index_html_file_name': index_html_file
        }

    def get_type_from_obj_info(self, info):
        """Strip the version suffix from an object-info type string."""
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        """Fetch the workspace object-info tuple for a reference."""
        return self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
class JbrowseUtil:
    """Builds JBrowse track configs and data folders for variation reports."""

    def __init__(self, Config):
        callback_url = os.environ['SDK_CALLBACK_URL']
        ws_url = Config['ws_url']
        self.wsc = Workspace(ws_url)
        self.dfu = DataFileUtil(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        # service-wizard url
        self.sw_url = Config['sw_url']
        self.shock_url = Config['shock_url']
        scratch = Config['scratch']
        session = str(uuid.uuid4())
        # per-run working directory under scratch
        self.session_dir = os.path.join(scratch, session)
        os.mkdir(self.session_dir)

    def get_variation_service_url(self, sw_url):
        '''
        get the most recent VariationFileServ url from the service wizard.
        sw_url: service wizard url
        '''
        # TODO Fix the following dev thing to beta or release or future
        json_obj = {
            "method": "ServiceWizard.get_service_status",
            "id": "",
            "params": [{"module_name": "VariationFileServ", "version": "dev"}]
        }
        sw_resp = requests.post(url=sw_url, data=json.dumps(json_obj))
        vfs_resp = sw_resp.json()
        self.shock_url = self.shock_url.replace("https://", "")
        vfs_url = vfs_resp['result'][0]['url'] + "/jbrowse_query/" \
            + self.shock_url + "/node"
        return vfs_url

    def _run_cmd(self, cmd):
        """Run a shell command, logging output; failures are logged, not raised.

        NOTE(review): cmd strings are built from file paths and run with
        shell=True; paths containing shell metacharacters would break or be
        unsafe — confirm inputs are always service-generated paths.
        """
        try:
            process = subprocess.Popen(cmd, shell=True,
                                       stdout=subprocess.PIPE)
            stdout, stderr = process.communicate()
            # BUG FIX: the original passed print-style comma arguments to
            # logging.info (e.g. logging.info("ret> ", rc)), which breaks
            # logging's %-formatting; use lazy %s arguments instead.
            if stdout:
                logging.info("ret> %s", process.returncode)
                logging.info("OK> output %s", stdout)
            if stderr:
                logging.info("ret> %s", process.returncode)
                logging.info("Error> error %s", stderr.strip())
        except OSError as e:
            logging.info("OSError > %s", e.errno)
            logging.info("OSError > %s", e.strerror)
            logging.info("OSError > %s", e.filename)

    def create_refseqs_data_from_assembly(self, assembly_ref):
        '''
        Build the JBrowse refSeqs entries from an assembly's contig table.

        :param assembly_ref: workspace reference of the assembly
        :return: list of refseq dicts (name, start, end, length, seqChunkSize)
        '''
        refseqs_data = []
        # 1) Download assembly contig info and parse contig length information
        data = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': assembly_ref
        }])[0]['data']
        for key in data['contigs']:
            refseqs_data.append(
                {"end": data['contigs'][key]["length"],
                 "length": data['contigs'][key]["length"],
                 "name": data['contigs'][key]["contig_id"],
                 "seqChunkSize": 20000,
                 "start": 0}
            )
        return refseqs_data

    def prepare_genome_features_track(self, genome_ref, vfs_url):
        """
        Builds track for genome features.

        :param genome_ref: workspace reference of the genome
        :param vfs_url: VariationFileServ base url for track URL templates
        :return: dict with "shock_handle_list" and "track_item"
        """
        shock_handles = list()
        # 1) Download gff using genomefileutil
        gff_file_info = self.gfu.genome_to_gff({'genome_ref': genome_ref})
        gff_file = gff_file_info["file_path"]
        # 2) sort gff by contig then start coordinate (required by tabix)
        outfile = gff_file + "_sorted"
        sorted_gff_cmd = " ".join(["sort -k1,1 -k4,4n", gff_file, ">",
                                   outfile])
        self._run_cmd(sorted_gff_cmd)
        # 3) compress gff
        zip_cmd = "bgzip " + outfile
        self._run_cmd(zip_cmd)
        # 4) index gff
        index_gff_cmd = "tabix -p gff " + gff_file + "_sorted.gz"
        self._run_cmd(index_gff_cmd)
        gff_gz_file_path = gff_file + "_sorted.gz"
        gff_index_file_path = gff_file + "_sorted.gz.tbi"
        # 5) Upload gff and gff index to shock
        # NOTE(review): if bgzip/tabix failed these files won't exist, and the
        # template substitution below raises NameError on gff_shock_ref —
        # confirm whether failing loudly here is intended.
        if os.path.exists(gff_gz_file_path):
            gff_shock_ref = self.dfu.file_to_shock(
                {'file_path': gff_gz_file_path, 'make_handle': 1}
            )
        if os.path.exists(gff_index_file_path):
            gff_index_shock_ref = self.dfu.file_to_shock(
                {'file_path': gff_index_file_path, 'make_handle': 1}
            )
        # 6) Create gff track text that will be used for genome features track
        gff_track = '''
        {
            "label": "Genome Features",
            "key": "GenomeFeatures",
            "storeClass": "JBrowse/Store/SeqFeature/GFF3Tabix",
            "urlTemplate":"<vfs_url>/<gff_shock_ref>",
            "tbiUrlTemplate": "<vfs_url>/<gff_index_shock_ref>",
            "type": "JBrowse/View/Track/CanvasFeatures"
        }
        '''
        gff_track = gff_track.replace("<gff_shock_ref>",
                                      gff_shock_ref['handle']['id'])
        gff_track = gff_track.replace("<gff_index_shock_ref>",
                                      gff_index_shock_ref['handle']['id'])
        gff_track = gff_track.replace("<vfs_url>", vfs_url)
        gff_track_dict = json.loads(gff_track)
        # 7) Capture shock handles
        shock_handles.append(gff_shock_ref['handle'])
        shock_handles.append(gff_index_shock_ref['handle'])
        # 8) return shock handles and gff track info
        return {"shock_handle_list": shock_handles,
                "track_item": gff_track_dict}

    def prepare_snp_frequency_track(self, vcf_filepath, assembly_ref, binsize,
                                    vfs_url):
        """
        Build a binned SNP-density BigWig track from a bgzipped VCF.

        :param vcf_filepath: path to a bgzip-compressed VCF file
        :param assembly_ref: workspace reference of the assembly
        :param binsize: genomic bin size in bp for the density counts
        :param vfs_url: VariationFileServ base url for track URL templates
        :return: dict with "shock_handle_list" and "track_item"
        """
        BEDGRAPHTOBIGWIG = "/kb/deployment/bin/bedGraphToBigWig"
        shock_handles = list()
        chr_length_dict = {}
        chr_length_data = ""
        chr_length_path = None
        counts = Counter()
        # 1) Download assembly contig info and parse contig length information
        data = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': assembly_ref
        }])[0]['data']
        contigs = data["contigs"]
        for contig in contigs:
            contig_data = data["contigs"][contig]
            chr_length_data += str(contig_data['contig_id']) + '\t' \
                + str(contig_data['length']) + '\n'
            c_id = str(contig_data['contig_id'])
            c_length = str(contig_data['length'])
            chr_length_dict[c_id] = c_length
        # 2) Write contig lengths to a file (needed later)
        if chr_length_data is not None:
            chr_length_path = os.path.join(self.session_dir, "chr_length.txt")
            with open(chr_length_path, "w") as f:
                f.write(chr_length_data)
        # 3) Read and parse vcf file (must be bgzip compressed)
        # Calculate number of SNPs in each bin and write in bedgraph format
        logging.info("Generating bedgraph file\n")
        with gzip.open(vcf_filepath, "rt") as reader:
            for record in reader:
                if record[0] == "#":
                    continue
                rs = record.split("\t")
                CHR, POS = rs[0], rs[1]
                bin_pos = int(int(POS) / binsize)
                bin_id = str(CHR) + "\t" + str(bin_pos)
                counts[bin_id] += 1
        bedgraph_file = os.path.join(self.session_dir, "vcf_bedgraph.txt")
        try:
            with open(bedgraph_file, "w") as fout:
                for j, k in counts.items():
                    chromosome, bin_num = j.split("\t")
                    bin_start = int(bin_num) * binsize
                    bin_end = bin_start + binsize
                    chr_length = chr_length_dict[chromosome]
                    # clip the last bin at the contig end
                    if bin_end <= int(chr_length):
                        fout.write(chromosome + "\t" + str(bin_start) + "\t"
                                   + str(bin_end) + "\t" + str(k) + "\n")
                    else:
                        fout.write(chromosome + "\t" + str(bin_start) + "\t"
                                   + str(chr_length) + "\t" + str(k) + "\n")
        except IOError:
            # BUG FIX: original was `logging.info("..." + bedgraph_file, + "...")`
            # — the stray comma + unary plus raised TypeError in this handler.
            logging.info("Unable to write %s file on disk.", bedgraph_file)
        # 4) Sort bedgraph file by chromosome id and co-ordinates
        sorted_bedgraph_file = bedgraph_file + "_sorted"
        sort_cmd = "sort -k1,1 -k2,2n " + bedgraph_file + "> " \
            + sorted_bedgraph_file
        self._run_cmd(sort_cmd)
        # 5) Convert sorted bedgraph to bigwig using the bedGraphToBigWig tool
        output_bigwig_file = bedgraph_file + "_bigwig.bw"
        cmd = BEDGRAPHTOBIGWIG + " " + sorted_bedgraph_file + " " \
            + chr_length_path + " " + output_bigwig_file
        logging.info("Generating bigwig ..\n" + cmd + "\n")
        self._run_cmd(cmd)
        # 6) upload bigwig file to shock
        logging.info("Uploading Bigwig file to shock")
        if os.path.exists(output_bigwig_file):
            bigwig_shock_ref = self.dfu.file_to_shock(
                {'file_path': output_bigwig_file, 'make_handle': 1}
            )
        # 7) Append shock handle to genomic_indexes
        shock_handles.append(bigwig_shock_ref['handle'])
        # 8) Build snp frequency track (typo "Densityy" fixed in the label)
        output_bigwig_shock = bigwig_shock_ref['handle']['id']
        snp_frequency_track = '''
        {
            "label": "Variation Density",
            "key": "Variation_density",
            "storeClass": "JBrowse/Store/SeqFeature/BigWig",
            "urlTemplate": "<vfs_url>/<bigwig_shock_id>",
            "type": "JBrowse/View/Track/Wiggle/XYPlot"
        }
        '''
        snp_frequency_track = snp_frequency_track.replace(
            "<bigwig_shock_id>", output_bigwig_shock)
        snp_frequency_track = snp_frequency_track.replace("<vfs_url>", vfs_url)
        snp_frequency_track_dict = json.loads(snp_frequency_track)
        # 9) Return shock handles and track info
        return {"shock_handle_list": shock_handles,
                "track_item": snp_frequency_track_dict}

    def prepare_snp_track(self, vcf_shock_id, vcf_index_shock_id, vfs_url):
        """
        Build the VCFTabix variation track from existing shock ids.

        :param vcf_shock_id: shock id of the bgzipped VCF
        :param vcf_index_shock_id: shock id of the tabix index
        :param vfs_url: VariationFileServ base url for track URL templates
        :return: dict with empty "shock_handle_list" and the "track_item"
        """
        shock_handles = list()
        snp_track = '''
        {
            "label": "Variation",
            "key": "Variation",
            "storeClass": "JBrowse/Store/SeqFeature/VCFTabix",
            "urlTemplate": "<vfs_url>/<vcf_shock_id>",
            "tbiUrlTemplate": "<vfs_url>/<vcf_index_shock_id>",
            "type": "JBrowse/View/Track/HTMLVariants"
        }
        '''
        snp_track = snp_track.replace("<vcf_shock_id>", vcf_shock_id)
        snp_track = snp_track.replace("<vcf_index_shock_id>",
                                      vcf_index_shock_id)
        snp_track = snp_track.replace("<vfs_url>", vfs_url)
        snp_track_dict = json.loads(snp_track)
        # shock handles should be empty list in return when built from shock ids
        return {"shock_handle_list": shock_handles,
                "track_item": snp_track_dict}

    def build_jbrowse_data_folder(self, jbrowse_path):
        """Zip the jbrowse folder into shock; returns its handle list."""
        shock_handles = list()
        data_folder_shock_ref = self.dfu.file_to_shock({
            'file_path': jbrowse_path,
            'pack': 'zip',
            'make_handle': 1
        })
        shock_handles.append(data_folder_shock_ref['handle'])
        return {"shock_handle_list": shock_handles}

    def build_jbrowse(self, jbrowse_src, jbrowse_path, refseqs_data,
                      genomic_indexes, tracklist_items):
        """
        Assemble the JBrowse report directory from source + track configs.

        :param jbrowse_src: path of the JBrowse source tree to copy
        :param jbrowse_path: destination path for the report
        :param refseqs_data: refSeqs entries (see create_refseqs_data_from_assembly)
        :param genomic_indexes: shock handles collected from the track builders
        :param tracklist_items: track config dicts
        :return: dict with "jbrowse_data_path" and "genomic_indexes"
        """
        jbrowse_report = {}
        # 1) Copy the jbrowse source code to build report
        shutil.copytree(jbrowse_src, jbrowse_path)
        # 2) Put trackList.json in jbrowse data path
        tracklist_path = os.path.join(jbrowse_path, "data", "trackList.json")
        trackdata = {
            'formatVersion': 1,
            'tracks': tracklist_items
        }
        with open(tracklist_path, "w") as f:
            f.write(json.dumps(trackdata))
        # 3) Put refSeqs.json in jbrowse seq path
        refseqs_json_path = os.path.join(jbrowse_path, "data", "seq",
                                         "refSeqs.json")
        with open(refseqs_json_path, "w") as f:
            f.write(json.dumps(refseqs_data))
        # Build jbrowse data folder to support jbrowse widget in narrative
        res = self.build_jbrowse_data_folder(jbrowse_path)
        data_folder_index = res['shock_handle_list']
        genomic_indexes = genomic_indexes + data_folder_index
        # Build jbrowse report dict
        jbrowse_report["jbrowse_data_path"] = jbrowse_path
        jbrowse_report["genomic_indexes"] = genomic_indexes
        return jbrowse_report

    def prepare_jbrowse_report(self, jbrowse_params):
        """
        Build genomic indexes and the JBrowse report directory.

        :param jbrowse_params: dict that may contain assembly_ref (required),
            genome_ref, vcf_path + binsize, vcf_shock_id + vcf_index_shock_id
        :return: dict with "jbrowse_data_path" and "genomic_indexes"
        :raises ValueError: if assembly_ref is missing or no track can be built
        """
        # Service wizard
        sw_url = self.sw_url
        # Variation file service url for serving jbrowse track files
        vfs_url = self.get_variation_service_url(sw_url)
        print(vfs_url)
        genomic_indexes = list()
        tracklist_items = list()
        refseqs_data = None
        # 1) Build refseqs_data (used to build refSeqs.json for jbrowse).
        # The report cannot be built if the assembly ref doesn't exist.
        if 'assembly_ref' in jbrowse_params:
            assembly_ref = jbrowse_params['assembly_ref']
            refseqs_data = self.create_refseqs_data_from_assembly(assembly_ref)
        else:
            # (the unreachable `return` after this raise was removed)
            raise ValueError("assembly ref not found")
        # 2) Build genome features track
        if 'genome_ref' in jbrowse_params:
            genome_ref = jbrowse_params['genome_ref']
            output = self.prepare_genome_features_track(genome_ref, vfs_url)
            shock_handles, track_item = output["shock_handle_list"], \
                output["track_item"]
            genomic_indexes = genomic_indexes + shock_handles
            tracklist_items.append(track_item)
        else:
            print("Skipping genome features track")
        # 3) Build SNP frequency track
        cond1 = 'vcf_path' in jbrowse_params
        cond2 = 'assembly_ref' in jbrowse_params
        cond3 = 'binsize' in jbrowse_params
        if cond1 and cond2 and cond3:
            vcf_path = jbrowse_params['vcf_path']
            assembly_ref = jbrowse_params['assembly_ref']
            binsize = jbrowse_params["binsize"]
            output = self.prepare_snp_frequency_track(vcf_path, assembly_ref,
                                                      binsize, vfs_url)
            shock_handles, track_item = output["shock_handle_list"], \
                output["track_item"]
            if shock_handles:
                genomic_indexes = genomic_indexes + shock_handles
                tracklist_items.append(track_item)
        else:
            print("Skipping SNP frequency track")
        # 4) Build SNP track
        cond1 = 'vcf_shock_id' in jbrowse_params
        cond2 = 'vcf_index_shock_id' in jbrowse_params
        if cond1 and cond2:
            vcf_shock_id = jbrowse_params['vcf_shock_id']
            vcf_index_shock_id = jbrowse_params['vcf_index_shock_id']
            output = self.prepare_snp_track(vcf_shock_id, vcf_index_shock_id,
                                            vfs_url)
            shock_handles, track_item = output["shock_handle_list"], \
                output["track_item"]
            genomic_indexes = genomic_indexes + shock_handles
            tracklist_items.append(track_item)
        else:
            print("Skipping SNP track")
        # 5) Build jbrowse directory with index.html
        # (jbrowse directory later on gets uploaded as html report)
        jbrowse_src = "/kb/module/deps/jbrowse"
        jbrowse_path = os.path.join(self.session_dir, "jbrowse")
        if tracklist_items:
            jbrowse_report = self.build_jbrowse(jbrowse_src, jbrowse_path,
                                                refseqs_data, genomic_indexes,
                                                tracklist_items)
        else:
            raise ValueError("No tracks found")
        return jbrowse_report
def download_gffs(cb_url, scratch, genome_set_ref):
    """
    Download every genome in a GenomeSet as a combined GFF+FASTA file.

    Args:
        cb_url - callback server URL
        scratch - scratch work folder
        genome_set_ref - reference to genome_set object in workspace
    Returns the path to the folder containing .gff files, and a dict mapping
    each written gff path to (genome ref, id->position dict, genome ids).
    Handles both "KBaseSearch.GenomeSet" and "KBaseSets.GenomeSet" objects.
    """
    # Get our utilities
    dfu = DataFileUtil(cb_url)
    au = AssemblyUtil(cb_url)
    gfu = GenomeFileUtil(cb_url)
    obj_data = dfu.get_objects({'object_refs': [genome_set_ref]})['data'][0]
    gs_obj = obj_data['data']
    obj_type = obj_data['info'][2]
    if 'KBaseSets.GenomeSet' in obj_type:
        refs = [gsi['ref'] for gsi in gs_obj['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        refs = [gse['ref'] for gse in gs_obj['elements'].values()]
    else:
        # BUG FIX: original read `str * (obj_type)` — multiplying the str type
        # by a string raises TypeError and masked the intended message.
        raise TypeError(
            'provided input must of type KBaseSets.GenomeSet or '
            'KBaseSearch.GenomeSet not ' + str(obj_type))
    if len(refs) < 2:
        raise ValueError("Must provide GenomeSet with at least 2 Genomes.")
    # name the output directories
    temp_dir = scratch + '/temp'
    final_dir = scratch + '/gff'
    os.mkdir(final_dir)
    os.mkdir(temp_dir)
    # write file that will help us cat the gff and fasta files
    cat_path = scratch + '/fast_cat.txt'
    with open(cat_path, 'w') as cat_file:
        cat_file.write("##FASTA\n")
    path_to_ref_and_ID_pos_dict = {}
    all_ids = set([])
    for ref in refs:
        gen_obj = dfu.get_objects({'object_refs': [ref]})['data'][0]['data']
        # NO Eukaryotes, NO Fungi — yes bacteria, yes archaea, yes(?) virus
        if gen_obj['domain'] not in ['Bacteria', 'Archaea']:
            raise TypeError(
                'Provided Genomes are not labeled as Bacteria or Archaea. Roary is only equipped to handle Archaea or Bacteria'
            )
        fasta_path = temp_dir + "/" + gen_obj['id'] + ".fa"
        gff_file = gfu.genome_to_gff({
            'genome_ref': ref,
            'target_dir': temp_dir
        })
        if 'assembly_ref' not in gen_obj.keys():
            raise TypeError("All genomes must contain an 'assembly_ref'")
        else:
            fasta_file = au.get_assembly_as_fasta({
                'ref': gen_obj['assembly_ref'],
                'filename': fasta_path
            })
            # check that the fasta file actually exists on disk
            if not os.path.isfile(fasta_file['path']):
                raise ValueError(
                    'An input Genome does not have an associated FASTA file.')
        # GenomeFileUtil versions differ in how they report the GFF path;
        # probe the known keys in order.
        if 'path' in gff_file:
            gff_file_path = gff_file['path']
        elif 'file_path' in gff_file:
            gff_file_path = gff_file['file_path']
        elif 'gff_file' in gff_file:
            gff_file_path = gff_file['gff_file']['path']
        else:
            raise ValueError("No GFF File Path found.")
        assert (
            os.path.isfile(gff_file_path)
        ), "Could not find input GFF file for object with workspace reference: %s" % ref
        # Make sure the IDs in the genome object match up with IDs in the gff
        # file. This is important because the pangenome object uses the genome
        # objects (in the pangenome viewer).
        gen_id_to_pos, contains_fasta, all_ids, gen_ids = filter_gff(
            gff_file_path, gen_obj, all_ids=all_ids)
        new_file_path = final_dir + "/" + gen_obj['id'] + '.gff'
        if contains_fasta:
            args = ['mv', gff_file_path, new_file_path]
            subprocess.call(args)
        else:
            # NOTE: We have to pipe output of cat call to the new_file_path.
            # Make a new 'gff' file that contains both gff and fasta sections.
            args = ['cat', gff_file_path, cat_path, fasta_file['path']]
            catted_files = subprocess.check_output(args)
            with open(new_file_path, 'w') as f:
                f.write(catted_files.decode('utf-8'))
        path_to_ref_and_ID_pos_dict[new_file_path] = (ref, gen_id_to_pos,
                                                      gen_ids)
    return final_dir, path_to_ref_and_ID_pos_dict