class ImportSRAUtil:
    """Import SRA files (from the staging area or the web) as KBase reads objects."""

    SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump'

    def _run_command(self, command):
        """
        _run_command: run a shell command, log its output, and raise on failure.

        :param command: shell command line to execute
        :raises ValueError: if the command exits with a non-zero status
        """
        log('Start executing command:\n{}'.format(command))
        # NOTE(review): shell=True with interpolated paths; inputs come from
        # controlled scratch locations, but shlex.quote-ing would be safer.
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exit_code = pipe.returncode

        if exit_code == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exit_code, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exit_code, output)
            raise ValueError(error_msg)

    def _check_fastq_dump_result(self, tmp_dir, sra_name):
        """
        _check_fastq_dump_result: check whether the fastq-dump result is
        paired-end (PE) or single-end (SE).

        fastq-dump -T writes paired reads into '<sra_name>/1' and
        '<sra_name>/2' sub-directories, so the presence of '1' indicates PE.
        """
        return os.path.exists(os.path.join(tmp_dir, sra_name, '1'))

    @staticmethod
    def _rename_fastq(file_path):
        """Give a fastq-dump output file a '.fastq' extension; return the new path."""
        new_path = file_path + '.fastq'
        os.rename(file_path, new_path)
        return new_path

    def _sra_to_fastq(self, scratch_sra_file_path, params):
        """
        _sra_to_fastq: convert an SRA file to FASTQ file(s).

        :param scratch_sra_file_path: local path of the downloaded SRA file
        :param params: upload params, validated against the detected read type
        :returns: dict with 'fwd_file' and 'rev_file' ('rev_file' is None for SE)
        """
        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)

        command = '{} --split-3 -T -O {} {}'.format(
            self.SRA_TOOLKIT_PATH, tmp_dir, scratch_sra_file_path)
        self._run_command(command)

        sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0]
        if self._check_fastq_dump_result(tmp_dir, sra_name):
            # paired-end output: PE-specific advanced params are allowed
            self._validate_paired_end_advanced_params(params)
            fwd_file = self._rename_fastq(os.path.join(tmp_dir, sra_name, '1', 'fastq'))
            rev_file = self._rename_fastq(os.path.join(tmp_dir, sra_name, '2', 'fastq'))
        else:
            # single-end output: PE-only advanced params must be absent
            self._validate_single_end_advanced_params(params)
            fwd_file = self._rename_fastq(os.path.join(tmp_dir, sra_name, 'fastq'))
            rev_file = None

        return {'fwd_file': fwd_file, 'rev_file': rev_file}

    def _validate_single_end_advanced_params(self, params):
        """
        _validate_single_end_advanced_params: validate advanced params for
        single end reads.

        :raises ValueError: if any paired-end-only advanced param is set
        """
        if (params.get('insert_size_mean')
                or params.get('insert_size_std_dev')
                or params.get('read_orientation_outward')):
            error_msg = 'Advanced params "Mean Insert Size", "St. Dev. of Insert Size" or '
            # fixed typo in user-facing message: 'Paried' -> 'Paired'
            error_msg += '"Reads Orientation Outward" is Paired End Reads specific'
            raise ValueError(error_msg)

        # 'interleaved' is meaningless for single-end reads; drop it silently
        if 'interleaved' in params:
            del params['interleaved']

    def _validate_paired_end_advanced_params(self, params):
        """
        _validate_paired_end_advanced_params: validate advanced params for
        paired end reads.

        :raises ValueError: if a single-end-only sequencing technology is given
        """
        sequencing_tech = params.get('sequencing_tech')
        if sequencing_tech in ('PacBio CCS', 'PacBio CLR'):
            error_msg = 'Sequencing Technology: "PacBio CCS" or "PacBio CLR" '
            error_msg += 'is Single End Reads specific'
            raise ValueError(error_msg)

    def _validate_upload_staging_file_availability(self, staging_file_subdir_path):
        """
        _validate_upload_staging_file_availability: validate file availability
        in the user's staging area. Currently a no-op (see TODO below).
        """
        pass
        # TODO ftp_server needs to be fixed for subdir
        # list = ftp_service(self.callback_url).list_files()
        # if staging_file_subdir_path not in list:
        #     error_msg = 'Target file: {} is NOT available.\n'.format(
        #         staging_file_subdir_path.rpartition('/')[-1])
        #     error_msg += 'Available files:\n {}'.format("\n".join(list))
        #     raise ValueError(error_msg)

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        # per-run scratch directory so concurrent imports do not collide
        self.scratch = os.path.join(config['scratch'],
                                    'import_SRA_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    @staticmethod
    def _set_workspace_param(params, workspace_name_or_id):
        """Store a workspace identifier on params as 'wsid' (numeric) or 'wsname'."""
        if str(workspace_name_or_id).isdigit():
            params['wsid'] = int(workspace_name_or_id)
        else:
            params['wsname'] = str(workspace_name_or_id)

    def import_sra_from_staging(self, params):
        '''
        import_sra_from_staging: import an SRA file from the user's staging
        area and save it as a reads object via ReadsUtils.upload_reads.

        required params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        sequencing_tech: sequencing technology
        name: output reads file name
        workspace_name: workspace name/ID of the object

        Optional Params:
        single_genome: whether the reads are from a single genome or a metagenome.
        insert_size_mean: mean (average) insert length
        insert_size_std_dev: standard deviation of insert lengths
        read_orientation_outward: whether reads in a pair point outward

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_sra_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        log('Downloaded staging file to: {}'.format(scratch_sra_file_path))

        fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, params)

        # NOTE: params is mutated in place and reused as the upload params
        import_sra_reads_params = params
        import_sra_reads_params.update(fastq_file_path)
        self._set_workspace_param(import_sra_reads_params,
                                  params.get('workspace_name'))

        log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
            json.dumps(import_sra_reads_params, indent=1)))
        returnVal = self.ru.upload_reads(import_sra_reads_params)

        # Update the workspace-object-related metadata for the staged file
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'), returnVal['obj_ref'])
        return returnVal

    def import_sra_from_web(self, params):
        '''
        import_sra_from_web: download SRA file(s) from the web and save each
        as a reads object via ReadsUtils.upload_reads.

        required params:
        download_type: download type for web source fastq file
                       ('Direct Download', 'FTP', 'DropBox', 'Google Drive')
        workspace_name: workspace name/ID of the object
        sra_urls_to_add: list of SRA file URL entries, each requiring:
            file_url: SRA file URL
            sequencing_tech: sequencing technology
            name: output reads file name

        Optional Params:
        single_genome: whether the reads are from a single genome or a metagenome.
        insert_size_mean: mean (average) insert length
        insert_size_std_dev: standard deviation of insert lengths
        read_orientation_outward: whether reads in a pair point outward

        return:
        obj_refs: return object references
        '''
        log('--->\nrunning ImportSRAUtil.import_sra_from_web\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_web_params(params)

        download_type = params.get('download_type')
        workspace_name = params.get('workspace_name')

        obj_refs = []
        uploaded_files = []

        for sra_url_to_add in params.get('sra_urls_to_add'):
            download_web_file_params = {
                'download_type': download_type,
                'file_url': sra_url_to_add.get('file_url')
            }
            scratch_sra_file_path = self.dfu.download_web_file(
                download_web_file_params).get('copy_file_path')
            log('Downloaded web file to: {}'.format(scratch_sra_file_path))

            fastq_file_path = self._sra_to_fastq(scratch_sra_file_path,
                                                 sra_url_to_add)

            # NOTE: each URL entry dict is mutated and reused as upload params
            import_sra_reads_params = sra_url_to_add
            import_sra_reads_params.update(fastq_file_path)
            self._set_workspace_param(import_sra_reads_params, workspace_name)

            log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
                json.dumps(import_sra_reads_params, indent=1)))
            obj_ref = self.ru.upload_reads(import_sra_reads_params).get('obj_ref')
            obj_refs.append(obj_ref)
            uploaded_files.append(sra_url_to_add.get('file_url'))

        return {'obj_refs': obj_refs, 'uploaded_files': uploaded_files}

    def validate_import_sra_from_staging_params(self, params):
        """
        validate_import_sra_from_staging_params: validate params passed to
        the import_sra_from_staging method.

        :raises ValueError: if a required parameter is missing
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'sequencing_tech', 'name',
                  'workspace_name']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

        self._validate_upload_staging_file_availability(
            params.get('staging_file_subdir_path'))

    def validate_import_sra_from_web_params(self, params):
        """
        validate_import_sra_from_web_params: validate params passed to the
        import_sra_from_web method.

        :raises ValueError: if a required parameter is missing or mistyped
        """
        # check for required parameters
        for p in ['download_type', 'workspace_name', 'sra_urls_to_add']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        if not isinstance(params.get('sra_urls_to_add'), list):
            raise ValueError('sra_urls_to_add is not type list as required')

        for sra_url_to_add in params.get('sra_urls_to_add'):
            for p in ['file_url', 'sequencing_tech', 'name']:
                if p not in sra_url_to_add:
                    raise ValueError('"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_refs_list, params):
        """
        generate_report: generate a summary report for imported reads.

        obj_refs_list: generated workspace object references
                       (return of import_sra_from_staging/web)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to
        """
        uuid_string = str(uuid.uuid4())

        objects_created = list()
        objects_data = list()
        for obj_ref in obj_refs_list:
            get_objects_params = {
                'object_refs': [obj_ref],
                'ignore_errors': False
            }
            objects_data.append(self.dfu.get_objects(get_objects_params))
            objects_created.append({'ref': obj_ref,
                                    'description': 'Imported Reads'})

        output_html_files = self.generate_html_report(objects_data, params,
                                                      uuid_string)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 460,
            'report_object_name': 'kb_sra_upload_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'],
                         'report_ref': output['ref']}
        return report_output

    def generate_html_report(self, reads_objs, params, uuid_string):
        """
        generate_html_report: generate an HTML summary report for the
        imported reads objects.

        :param reads_objs: DataFileUtil.get_objects results, one per reads object
        :param params: import params; 'uploaded_files' must parallel reads_objs
        :param uuid_string: unique id used for the report working directory
        :returns: single-element list of html-link descriptors for KBaseReport
        """
        log('Start generating html report')
        pprint(params)

        tmp_dir = os.path.join(self.scratch, uuid_string)
        handler_utils._mkdir_p(tmp_dir)
        result_file_path = os.path.join(tmp_dir, 'report.html')
        template_dir = os.path.dirname(__file__)
        html_report = list()
        objects_content = ''
        for index, reads_obj in enumerate(reads_objs):
            idx = str(index)
            reads_data = reads_obj.get('data')[0].get('data')
            reads_info = reads_obj.get('data')[0].get('info')
            reads_ref = (str(reads_info[6]) + '/' + str(reads_info[0]) +
                         '/' + str(reads_info[4]))
            reads_obj_name = str(reads_info[1])

            with open(os.path.join(template_dir,
                                   'report_template_sra/table_panel.html'),
                      'r') as object_content_file:
                report_template = object_content_file.read()
            report_template = report_template.replace('_NUM', str(idx))
            report_template = report_template.replace('OBJECT_NAME', reads_obj_name)
            if index == 0:
                # expand the first panel by default
                report_template = report_template.replace(
                    'panel-collapse collapse', 'panel-collapse collapse in')
            objects_content += report_template

            base_percentages = ''
            for key, val in reads_data.get('base_percentages').items():
                base_percentages += '{}({}%) '.format(key, val)

            reads_overview_data = collections.OrderedDict()
            reads_overview_data['Name'] = '{} ({})'.format(reads_obj_name, reads_ref)
            reads_overview_data['Uploaded File'] = params.get('uploaded_files')[index]
            reads_overview_data['Date Uploaded'] = time.strftime("%c")
            reads_overview_data['Number of Reads'] = '{:,}'.format(
                reads_data.get('read_count'))

            reads_type = reads_info[2].lower()
            if 'single' in reads_type:
                reads_overview_data['Type'] = 'Single End'
            elif 'paired' in reads_type:
                reads_overview_data['Type'] = 'Paired End'
            else:
                reads_overview_data['Type'] = 'Unknown'
            reads_overview_data['Platform'] = reads_data.get('sequencing_tech',
                                                             'Unknown')

            reads_single_genome = str(reads_data.get('single_genome', 'Unknown'))
            if '0' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'No'
            elif '1' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'Yes'
            else:
                reads_overview_data['Single Genome'] = 'Unknown'

            # .get's default only applies when the key is absent; an explicit
            # None value still needs the fallback branch below
            insert_size_mean = params.get('insert_size_mean', 'Not Specified')
            if insert_size_mean is not None:
                reads_overview_data['Insert Size Mean'] = str(insert_size_mean)
            else:
                reads_overview_data['Insert Size Mean'] = 'Not Specified'

            insert_size_std_dev = params.get('insert_size_std_dev', 'Not Specified')
            if insert_size_std_dev is not None:
                reads_overview_data['Insert Size Std Dev'] = str(insert_size_std_dev)
            else:
                reads_overview_data['Insert Size Std Dev'] = 'Not Specified'

            reads_outward_orientation = str(
                reads_data.get('read_orientation_outward', 'Unknown'))
            if '0' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'No'
            elif '1' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'Yes'
            else:
                reads_overview_data['Outward Read Orientation'] = 'Unknown'

            reads_stats_data = collections.OrderedDict()
            read_count = reads_data.get('read_count')
            dup_count = reads_data.get('number_of_duplicates')
            reads_stats_data['Number of Reads'] = '{:,}'.format(read_count)
            reads_stats_data['Total Number of Bases'] = '{:,}'.format(
                reads_data.get('total_bases'))
            reads_stats_data['Mean Read Length'] = str(
                reads_data.get('read_length_mean'))
            reads_stats_data['Read Length Std Dev'] = str(
                reads_data.get('read_length_stdev'))
            # guard against a zero read count (previously a ZeroDivisionError)
            dup_reads_percent = ('{:.2f}'.format(float(dup_count * 100) / read_count)
                                 if read_count else '0.00')
            reads_stats_data['Number of Duplicate Reads(%)'] = '{} ({}%)'.format(
                str(dup_count), dup_reads_percent)
            reads_stats_data['Phred Type'] = str(reads_data.get('phred_type'))
            reads_stats_data['Quality Score Mean'] = '{0:.2f}'.format(
                reads_data.get('qual_mean'))
            reads_stats_data['Quality Score (Min/Max)'] = '{}/{}'.format(
                str(reads_data.get('qual_min')), str(reads_data.get('qual_max')))
            reads_stats_data['GC Percentage'] = str(
                round(reads_data.get('gc_content') * 100, 2)) + '%'
            reads_stats_data['Base Percentages'] = base_percentages

            overview_content = ''
            for key, val in reads_overview_data.items():
                overview_content += '<tr><td><b>{}</b></td>'.format(key)
                overview_content += '<td>{}</td>'.format(val)
                overview_content += '</tr>'

            stats_content = ''
            for key, val in reads_stats_data.items():
                stats_content += '<tr><td><b>{}</b></td>'.format(key)
                stats_content += '<td>{}</td>'.format(val)
                stats_content += '</tr>'

            # fill this object's placeholders; earlier panels were already filled
            objects_content = objects_content.replace('###OVERVIEW_CONTENT###',
                                                      overview_content)
            objects_content = objects_content.replace('###STATS_CONTENT###',
                                                      stats_content)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(template_dir,
                                   'report_template_sra/report_head.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
            report_template = report_template.replace(
                '###TABLE_PANELS_CONTENT###', objects_content)
            result_file.write(report_template)

        shutil.copytree(os.path.join(template_dir,
                                     'report_template_sra/bootstrap-3.3.7'),
                        os.path.join(tmp_dir, 'bootstrap-3.3.7'))
        shutil.copy(os.path.join(template_dir,
                                 'report_template_sra/jquery-3.2.1.min.js'),
                    os.path.join(tmp_dir, 'jquery-3.2.1.min.js'))

        # drop any gzipped files before zipping the report directory
        # NOTE(review): unclear how .gz files end up under tmp_dir — confirm
        matched_files = []
        for root, dirnames, filenames in os.walk(tmp_dir):
            for filename in fnmatch.filter(filenames, '*.gz'):
                matched_files.append(os.path.join(root, filename))
        for gz_file in matched_files:
            print('Removing ' + gz_file)
            os.remove(gz_file)

        report_shock_id = self.dfu.file_to_shock({'file_path': tmp_dir,
                                                  'pack': 'zip'})['shock_id']
        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Imported Assembly'})
        return html_report
class UnpackFileUtil:
    """Download/unpack archive files and push the unpacked files to the staging area."""

    def _staging_service_host(self):
        """Build the staging-service URL from the deployment config endpoint."""
        deployment_path = os.environ["KB_DEPLOYMENT_CONFIG"]
        parser = SafeConfigParser()
        parser.read(deployment_path)
        endpoint = parser.get('kb_uploadmethods', 'kbase-endpoint')
        staging_service_host = endpoint + '/staging_service'
        return staging_service_host

    def _file_to_staging(self, file_path_list, subdir_folder=None):
        """
        _file_to_staging: upload file(s) to the staging area.

        :param file_path_list: local paths of files to upload
        :param subdir_folder: optional staging sub-directory destination
        :raises ValueError: if the staging service rejects an upload
        """
        subdir_folder_str = '/' if not subdir_folder else '/{}'.format(
            subdir_folder)
        end_point = self._staging_service_host() + '/upload'
        headers = {'Authorization': self.token}

        # BUG FIX: the previous implementation overwrote the 'uploads' entry of
        # a single dict on every loop iteration and POSTed once after the loop,
        # so only the LAST file was ever uploaded — and every opened file handle
        # leaked. Upload one file per request with the handle managed by a
        # context manager instead.
        for file_path in file_path_list:
            with open(file_path, 'rb') as file_handle:
                files = {
                    'destPath': subdir_folder_str,
                    'uploads': (os.path.basename(file_path), file_handle)
                }
                resp = _requests.post(end_point, headers=headers, files=files)

            if resp.status_code != 200:
                raise ValueError(
                    'Upload file {} failed.\nError Code: {}\n{}\n'.format(
                        file_path, resp.status_code, resp.text))
            else:
                log("return message from server:\n{}\n".format(resp.text))

    def _remove_irrelevant_files(self, file_path):
        """
        _remove_irrelevant_files: remove all files in file_path's directory
        tree other than the target file itself.
        """
        target_name = os.path.basename(file_path)
        file_dir = os.path.dirname(file_path)
        for dirpath, dirnames, filenames in os.walk(file_dir):
            for filename in filenames:
                if filename != target_name:
                    irrelevant_file_path = os.sep.join([dirpath, filename])
                    os.remove(irrelevant_file_path)
                    log('removing irrelevant file: {}'.format(
                        irrelevant_file_path))

    def _r_unpack(self, file_path, count):
        """
        _r_unpack: recursively unpack file_path.

        :param file_path: file or directory to process
        :param count: recursion depth (0 on first call; triggers cleanup)
        :returns: file_path for a plain (non-archive) file, else None
        """
        if count == 0:
            self._remove_irrelevant_files(file_path)
        count += 1

        if os.path.isfile(file_path):
            log('processing: {}{}'.format('-' * count, file_path))
            t = magic.from_file(file_path, mime=True)

            if os.path.basename(file_path).endswith('.DS_Store'):
                # macOS metadata file; never useful
                os.remove(file_path)
                log('removing file: {}{}'.format('-' * count, file_path))
            elif t in ['application/' + x for x in
                       ('x-gzip', 'gzip', 'x-bzip', 'x-bzip2', 'bzip', 'bzip2',
                        'x-tar', 'tar', 'x-gtar', 'zip', 'x-zip-compressed')]:
                # archive: unpack it, recurse into whatever appeared, then
                # delete the archive itself
                file_dir = os.path.dirname(file_path)
                files_before_unpack = os.listdir(file_dir)
                self.dfu.unpack_file({'file_path': file_path}).get('file_path')
                files_after_unpack = os.listdir(file_dir)
                new_files = [item for item in files_after_unpack
                             if item not in files_before_unpack]
                for new_file in new_files:
                    self._r_unpack(os.sep.join([file_dir, new_file]), count)
                os.remove(file_path)
                log('removing file: {}{}'.format('-' * count, file_path))
            else:
                return file_path
        else:
            if os.path.basename(file_path).startswith('_'):
                # e.g. '__MACOSX' folders produced by macOS zip
                shutil.rmtree(file_path, ignore_errors=True)
                log('removing folder: {}{}'.format('-' * count, file_path))
            else:
                for dirpath, dirnames, filenames in os.walk(file_path):
                    for filename in filenames:
                        self._r_unpack(os.sep.join([dirpath, filename]), count)

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.user_id = config['USER_ID']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)

    def _unpack_and_collect(self, scratch_file_path):
        """Recursively unpack scratch_file_path and return the resulting file paths."""
        self._r_unpack(scratch_file_path, 0)
        unpacked_file_path_list = []
        for dirpath, dirnames, filenames in os.walk(
                os.path.dirname(scratch_file_path)):
            for filename in filenames:
                unpacked_file_path_list.append(os.sep.join([dirpath, filename]))
        log("Unpacked files:\n {}".format(
            '\n '.join(unpacked_file_path_list)))
        return unpacked_file_path_list

    def unpack_staging_file(self, params):
        """
        Unpack a staging area file.

        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name

        result:
        unpacked_file_path: unpacked file path(s) in staging area
        """
        log('--->\nrunning UnpackFileUtil.unpack_staging_file\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        scratch_file_path = self.dfu.download_staging_file(params).get(
            'copy_file_path')
        unpacked_file_path_list = self._unpack_and_collect(scratch_file_path)

        self._file_to_staging(
            unpacked_file_path_list,
            os.path.dirname(params.get('staging_file_subdir_path')))
        unpacked_file_path = ','.join(unpacked_file_path_list)
        returnVal = {'unpacked_file_path': unpacked_file_path}
        return returnVal

    def unpack_web_file(self, params):
        """
        Download and unpack a web file to the staging area.

        params:
        file_url: file URL
        download_type: one of ['Direct Download', 'FTP', 'DropBox',
                               'Google Drive']

        result:
        unpacked_file_path: unpacked file path(s) in staging area
        """
        log('--->\nrunning UnpackFileUtil.unpack_web_file\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        scratch_file_path = self.dfu.download_web_file(params).get(
            'copy_file_path')
        unpacked_file_path_list = self._unpack_and_collect(scratch_file_path)

        self._file_to_staging(unpacked_file_path_list)
        unpacked_file_path = ','.join(unpacked_file_path_list)
        returnVal = {'unpacked_file_path': unpacked_file_path}
        return returnVal

    def generate_report(self, unpacked_file_path, params):
        """
        generate_report: generate a summary report.

        unpacked_file_path: generated unpacked file path(s) in staging area
                            (return of unpack_staging_file or unpack_web_file)
        """
        log("generating report")
        uuid_string = str(uuid.uuid4())
        unpacked_file_path_list = unpacked_file_path.split(',')

        subdir = os.path.dirname(
            params.get('staging_file_subdir_path')) + '/' if params.get(
                'staging_file_subdir_path') else '/'

        upload_message = 'Uploaded Files: {}\n'.format(
            len(unpacked_file_path_list))
        for file_path in unpacked_file_path_list:
            upload_message += subdir + os.path.basename(file_path) + '\n'

        # NOTE(review): 'mothods' is a typo but the object name is kept
        # unchanged in case downstream tooling matches on it — confirm
        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_mothods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }
        return report_output
class GenbankToGenome: def __init__(self, config): self.cfg = config self.gi = GenomeInterface(config) self.dfu = DataFileUtil(config.callbackURL) self.aUtil = AssemblyUtil(config.callbackURL) self.ws = Workspace(config.workspaceURL) self._messages = [] self.time_string = str( datetime.datetime.fromtimestamp( time.time()).strftime('%Y_%m_%d_%H_%M_%S')) yml_text = open('/kb/module/kbase.yml').read() self.version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1) self.generate_parents = False self.generate_ids = False self.genes = OrderedDict() self.mrnas = OrderedDict() self.cdss = OrderedDict() self.noncoding = [] self.ontologies_present = defaultdict(dict) self.ontology_events = list() self.skiped_features = Counter() self.feature_counts = Counter() self.orphan_types = Counter() self.contig_seq = {} self.circ_contigs = set() self.features_spaning_zero = set() self.genome_warnings = [] self.genome_suspect = False self.defects = Counter() self.spoofed_genes = 0 self.excluded_features = ('source', 'exon', 'fasta_record') self.ont_mappings = load_ontology_mappings('/kb/module/data') self.code_table = 11 self.re_api_url = config.re_api_url # dict with feature 'id's that have been used more than once. 
self.used_twice_identifiers = {} self.default_params = { 'source': 'Genbank', 'taxon_wsname': self.cfg.raw['taxon-workspace-name'], 'taxon_lookup_obj_name': self.cfg.raw['taxon-lookup-object-name'], 'ontology_wsname': self.cfg.raw['ontology-workspace-name'], 'ontology_GO_obj_name': self.cfg.raw['ontology-gene-ontology-obj-name'], 'ontology_PO_obj_name': self.cfg.raw['ontology-plant-ontology-obj-name'], 'release': None, 'genetic_code': 11, 'generate_ids_if_needed': 0, 'metadata': {} } @property def messages(self): return "\n".join(self._messages) def refactored_import(self, ctx, params): # 1) validate parameters and extract defaults self.validate_params(params) # 2) construct the input directory staging area input_directory = self.stage_input(params) # 3) update default params self.default_params.update(params) params = self.default_params self.generate_parents = params.get('generate_missing_genes') self.generate_ids = params.get('generate_ids_if_needed') if params.get('genetic_code'): self.code_table = params['genetic_code'] # 4) Do the upload files = self._find_input_files(input_directory) consolidated_file = self._join_files_skip_empty_lines(files) genome = self.parse_genbank(consolidated_file, params) if params.get('genetic_code'): genome["genetic_code"] = params['genetic_code'] result = self.gi.save_one_genome({ 'workspace': params['workspace_name'], 'name': params['genome_name'], 'data': genome, "meta": params['metadata'], }) ref = f"{result['info'][6]}/{result['info'][0]}/{result['info'][4]}" logging.info(f"Genome saved to {ref}") # 5) clear the temp directory shutil.rmtree(input_directory) # 6) return the result info = result['info'] details = {'genome_ref': ref, 'genome_info': info} return details @staticmethod def validate_params(params): if 'workspace_name' not in params: raise ValueError('required "workspace_name" field was not defined') if 'genome_name' not in params: raise ValueError('required "genome_name" field was not defined') if 'file' not in 
params: raise ValueError('required "file" field was not defined') # one and only one of 'path', 'shock_id', or 'ftp_url' is required file = params['file'] if not isinstance(file, dict): raise ValueError('required "file" field must be a map/dict') sources = ('path', 'shock_id', 'ftp_url') n_valid_fields = sum(1 for f in sources if file.get(f)) if n_valid_fields < 1: raise ValueError(f'required "file" field must include one source: ' f'{", ".join(sources)}') if n_valid_fields > 1: raise ValueError( f'required "file" field has too many sources specified: ' f'{", ".join(file.keys())}') if params.get('genetic_code'): if not (isinstance(params['genetic_code'], int) and 0 < params['genetic_code'] < 32): raise ValueError(f"Invalid genetic code specified: {params}") def stage_input(self, params): """ Setup the input_directory by fetching the files and uncompressing if needed. """ # construct the input directory where we stage files input_directory = os.path.join( self.cfg.sharedFolder, f'genome-upload-staging-{uuid.uuid4()}') os.makedirs(input_directory) # at this point, the 'file' input is validated, so we don't have to catch any special cases # we expect one and only one of path, shock_id, or ftp_url # determine how to get the file: if it is from shock, download it. If it # is just sitting there, then use it. 
Move the file to the staging input directory file = params['file'] genbank_file_path = None if file.get('path') is not None: # copy the local file to the input staging directory # (NOTE: could just move it, but then this method would have the side effect of moving your # file which another SDK module might have an open handle on) local_file_path = file['path'] genbank_file_path = os.path.join(input_directory, os.path.basename(local_file_path)) shutil.copy2(local_file_path, genbank_file_path) if 'shock_id' in file and file['shock_id'] is not None: # handle shock file logging.info( f'Downloading file from SHOCK node: {self.cfg.shockURL} - {file["shock_id"]}' ) sys.stdout.flush() file_name = self.dfu.shock_to_file({ 'file_path': input_directory, 'shock_id': file['shock_id'] })['node_file_name'] genbank_file_path = os.path.join(input_directory, file_name) if 'ftp_url' in file and file['ftp_url'] is not None: logging.info('Downloading file from: ' + str(file['ftp_url'])) local_file_path = self.dfu.download_web_file({ 'file_url': file['ftp_url'], 'download_type': 'FTP' })['copy_file_path'] genbank_file_path = os.path.join(input_directory, os.path.basename(local_file_path)) shutil.copy2(local_file_path, genbank_file_path) # extract the file if it is compressed if genbank_file_path is not None: logging.info("staged input file =" + genbank_file_path) self.dfu.unpack_file({'file_path': genbank_file_path}) else: raise ValueError( 'No valid files could be extracted based on the input') return input_directory def parse_genbank(self, file_path, params): logging.info("Saving original file to shock") shock_res = self.dfu.file_to_shock({ 'file_path': file_path, 'make_handle': 1, 'pack': 'gzip', }) # Write and save assembly file assembly_ref = self._save_assembly(file_path, params) assembly_data = self.dfu.get_objects({ 'object_refs': [assembly_ref], 'ignore_errors': 0 })['data'][0]['data'] genome = { "id": params['genome_name'], "original_source_file_name": 
os.path.basename(file_path),
            "assembly_ref": assembly_ref,
            "gc_content": assembly_data['gc_content'],
            "dna_size": assembly_data['dna_size'],
            "md5": assembly_data['md5'],
            "genbank_handle_ref": shock_res['handle']['hid'],
            "publications": set(),
            "contig_ids": [],
            "contig_lengths": [],
        }
        # NOTE(review): this continues a genome-building method whose opening
        # (signature and the start of the `genome` dict) is above this chunk.
        genome['source'], genome['genome_tiers'] = self.gi.determine_tier(
            params['source'])
        if params.get('genome_type'):
            genome['genome_type'] = params['genome_type']

        # Set taxonomy-related fields in the genome
        # Also validates the given taxon ID
        if params.get('taxon_id'):
            set_taxon_data(int(params['taxon_id']), self.re_api_url, genome)
        else:
            set_default_taxon_data(genome)

        dates = []
        # Parse data from genbank file; one record per contig.
        contigs = Bio.SeqIO.parse(file_path, "genbank")
        for record in contigs:
            r_annot = record.annotations
            logging.info("parsing contig: " + record.id)
            # Collect per-contig dates; silently skip missing/unparseable ones.
            try:
                dates.append(time.strptime(r_annot.get('date'), "%d-%b-%Y"))
            except (TypeError, ValueError):
                pass
            genome['contig_ids'].append(record.id)
            genome['contig_lengths'].append(len(record))
            genome["publications"] |= self._get_pubs(r_annot)

            # only do the following once(on the first contig)
            if "source_id" not in genome:
                genome["source_id"] = record.id.split('.')[0]
                organism = r_annot.get('organism', 'Unknown Organism')
                if params.get('scientific_name'):
                    genome['scientific_name'] = params['scientific_name']
                else:
                    genome['scientific_name'] = organism
                self.code_table = genome['genetic_code']
                genome["molecule_type"] = r_annot.get('molecule_type', 'DNA')
                genome['notes'] = r_annot.get('comment', "").replace('\\n', '\n')

            self._parse_features(record, genome['source'])

        genome.update(self.get_feature_lists())

        genome['num_contigs'] = len(genome['contig_ids'])
        # add dates: earliest date, plus " _ " latest date when they differ
        dates.sort()
        if dates:
            genome['external_source_origination_date'] = time.strftime(
                "%d-%b-%Y", dates[0])
            if dates[0] != dates[-1]:
                genome['external_source_origination_date'] += " _ " + \
                    time.strftime("%d-%b-%Y", dates[-1])

        if self.ontologies_present:
            genome['ontologies_present'] = dict(self.ontologies_present)
            genome["ontology_events"] = self.ontology_events
        genome['feature_counts'] = dict(self.feature_counts)
        # can't serialize a set
        genome['publications'] = list(genome['publications'])

        # Flag the genome as suspect when >2% of CDSs fail translation checks.
        if len(genome['cdss']) and (self.defects['cds_seq_not_matching'] /
                                    float(len(genome['cdss'])) > 0.02):
            self.genome_warnings.append(
                warnings["genome_inc_translation"].format(
                    self.defects['cds_seq_not_matching'],
                    len(genome['cdss'])))
            self.genome_suspect = 1

        if self.defects['bad_parent_loc']:
            self.genome_warnings.append(
                f"There were {self.defects['bad_parent_loc']} parent/child "
                "relationships that were not able to be determined. Some of "
                "these may have splice variants that may be valid relationships."
            )

        if self.defects['spoofed_genes']:
            self.genome_warnings.append(warnings['spoofed_genome'].format(
                self.defects['spoofed_genes']))
            genome['suspect'] = 1

        if self.defects['not_trans_spliced']:
            self.genome_warnings.append(
                warnings['genome_not_trans_spliced'].format(
                    self.defects['not_trans_spliced']))
            genome['suspect'] = 1

        if self.genome_warnings:
            genome['warnings'] = self.genome_warnings
        if self.genome_suspect:
            genome['suspect'] = 1
        logging.info(f"Feature Counts: {genome['feature_counts']}")
        return genome

    def _save_assembly(self, genbank_file, params):
        """Convert a GenBank file to FASTA and save it as an Assembly object.

        If params['use_existing_assembly'] is set, validate that reference
        (type and contig ids/md5s against the sequences parsed here) and
        return it instead of saving a new assembly.
        Side effects: populates self.contig_seq and self.circ_contigs.
        """
        contigs = Bio.SeqIO.parse(genbank_file, "genbank")
        assembly_id = f"{params['genome_name']}_assembly"
        fasta_file = f"{self.cfg.sharedFolder}/{params['genome_name']}_assembly.fasta"

        out_contigs = []
        extra_info = defaultdict(dict)
        for in_contig in contigs:
            # Record circular/linear topology per contig id.
            if in_contig.annotations.get('topology', "") == 'circular':
                extra_info[in_contig.id]['is_circ'] = 1
                self.circ_contigs.add(in_contig.id)
            elif in_contig.annotations.get('topology', "") == 'linear':
                extra_info[in_contig.id]['is_circ'] = 0
            out_contigs.append(in_contig)
            self.contig_seq[in_contig.id] = in_contig.seq.upper()

        assembly_ref = params.get("use_existing_assembly")
        if assembly_ref:
            if not re.match("\d+\/\d+\/\d+", assembly_ref):
                raise ValueError(
                    f"Assembly ref: {assembly_ref} is not a valid format. Must"
                    f" be in numerical <ws>/<object>/<version> format.")

            ret = self.dfu.get_objects(
                {'object_refs': [assembly_ref]})['data'][0]
            if "KBaseGenomeAnnotations.Assembly" not in ret['info'][2]:
                raise ValueError(
                    f"{assembly_ref} is not a reference to an assembly")

            # Every contig parsed from the GenBank file must exist in the
            # referenced assembly with an identical md5 of its sequence.
            unmatched_ids = list()
            unmatched_ids_md5s = list()
            for current_contig in self.contig_seq.keys():
                current_contig_md5 = hashlib.md5(
                    str(self.contig_seq[current_contig]).encode(
                        'utf8')).hexdigest()
                if current_contig in ret['data']['contigs']:
                    if current_contig_md5 != ret['data']['contigs'][
                            current_contig]['md5']:
                        unmatched_ids_md5s.append(current_contig)
                else:
                    unmatched_ids.append(current_contig)
            if len(unmatched_ids) > 0:
                raise ValueError(warnings['assembly_ref_extra_contigs'].format(
                    ", ".join(unmatched_ids)))
            if len(unmatched_ids_md5s) > 0:
                raise ValueError(warnings["assembly_ref_diff_seq"].format(
                    ", ".join(unmatched_ids_md5s)))

            logging.info(f"Using supplied assembly: {assembly_ref}")
            return assembly_ref

        logging.info("Saving sequence as Assembly object")
        Bio.SeqIO.write(out_contigs, fasta_file, "fasta")
        assembly_ref = self.aUtil.save_assembly_from_fasta({
            'file': {
                'path': fasta_file
            },
            'workspace_name': params['workspace_name'],
            'assembly_name': assembly_id,
            'type': params.get('genome_type', 'isolate'),
            'contig_info': extra_info
        })
        logging.info(f"Assembly saved to {assembly_ref}")
        return assembly_ref

    def _find_input_files(self, input_directory):
        """Return the paths of all GenBank-suffixed files in input_directory.

        Raises Exception when no file has a recognized extension.
        """
        logging.info("Scanning for Genbank Format files.")
        valid_extensions = [".gbff", ".gbk", ".gb", ".genbank", ".dat", ".gbf"]

        files = os.listdir(os.path.abspath(input_directory))
        logging.info("Genbank Files : " + ", ".join(files))
        genbank_files = [
            x for x in files
            if os.path.splitext(x)[-1].lower() in valid_extensions
        ]

        if len(genbank_files) == 0:
            raise Exception(
                f"The input directory does not have any files with one of the "
                f"following extensions {','.join(valid_extensions)}.")

        logging.info(f"Found {len(genbank_files)} genbank files")

        input_files = []
        for genbank_file in genbank_files:
            input_files.append(os.path.join(input_directory, genbank_file))

        return input_files

    def _join_files_skip_empty_lines(self, input_files):
        """ Applies strip to each line of each input file.
            Args:
                input_files: Paths to input files in Genbank format.
            Returns:
                Path to resulting file (currently it's the same file as input).
            """
        if len(input_files) == 0:
            raise ValueError("NO GENBANK FILE")
        temp_dir = os.path.join(os.path.dirname(input_files[0]), "combined")
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
        ret_file = os.path.join(temp_dir, os.path.basename(input_files[0]))

        # take in Genbank file and remove all empty lines from it.
        with open(ret_file, 'w', buffering=2 ** 20) as f_out:
            for input_file in input_files:
                with open(input_file, 'r') as f_in:
                    for line in f_in:
                        line = line.rstrip('\r\n')
                        if line.strip():
                            f_out.write(line + '\n')
        return ret_file

    def _get_pubs(self, r_annotations):
        """Get a contig's publications.

        Builds KBase publication tuples from the record's 'references'
        annotation; returns them as a set for cross-contig de-duplication.
        """
        pub_list = []
        for in_pub in r_annotations.get('references', []):
            # don't add blank pubs
            if not in_pub.authors:
                continue
            out_pub = [
                0,  # pmid
                "",  # source
                in_pub.title,
                "",  # web address
                "",  # date
                in_pub.authors,
                in_pub.journal,
            ]
            # Publication year, when present, is embedded in the journal
            # string as "(YYYY)".
            date_match = re.match("\((\d{4})\)", in_pub.journal)
            if date_match:
                out_pub[4] = date_match.group(1)
            if in_pub.pubmed_id:
                out_pub[0:4] = [
                    int(in_pub.pubmed_id), "PubMed", in_pub.title,
                    f"http://www.ncbi.nlm.nih.gov/pubmed/{in_pub.pubmed_id}"
                ]
            pub_list.append(tuple(out_pub))
        logging.info(f"Parsed {len(pub_list)} publication records")
        return set(pub_list)

    def _get_id(self, feat, tags=None):
        """Assign a id to a feature based on the first tag that exists"""
        _id = ""
        if not tags:
            tags = ['locus_tag', 'kbase_id']
        for t in tags:
            _id = feat.qualifiers.get(t, [""])[0]
            if _id:
                break
        if not _id:
            if feat.type == 'gene':
                if not self.generate_ids:
                    raise ValueError(
                        f"Unable to find a valid id for gene "
                        f"among these tags: {', '.join(tags)}. Correct the "
                        f"file or rerun with generate_ids\n {feat}")
                self.orphan_types['gene'] += 1
                _id = f"gene_{self.orphan_types['gene']}"
            # NOTE(review): for untagged RNA/CDS/UTR features this reuses the
            # current orphan-gene counter without incrementing it — presumably
            # to associate them with the most recent generated gene; confirm.
            if 'rna' in feat.type.lower() or feat.type in {
                    'CDS', 'sig_peptide', 'five_prime_UTR', 'three_prime_UTR'
            }:
                _id = f"gene_{self.orphan_types['gene']}"

        return _id

    def _parse_features(self, record, source):
        """Parse every feature of one contig record and dispatch each to the
        appropriate type-specific processor (CDS/gene/mRNA/non-coding)."""

        def _location(feat):
            """Convert to KBase style location objects"""
            strand_trans = ("", "+", "-")
            loc = []
            for part in feat.location.parts:
                # trans-spliced parts may reference another contig
                contig_id = part.ref if part.ref else record.id
                if part.strand >= 0:
                    begin = int(part.start) + 1
                else:
                    begin = int(part.end)
                loc.append(
                    (contig_id, begin, strand_trans[part.strand], len(part)))
            return loc

        def _warn(message):
            # Append a warning to the current feature, de-duplicated.
            if message not in out_feat.get('warnings', []):
                out_feat['warnings'] = out_feat.get('warnings', []) + [message]

        def _check_suspect_location(parent=None):
            # Out-of-order location parts are OK for trans-spliced features
            # and for features spanning the origin of a circular contig;
            # anything else is recorded as a defect.
            if 'trans_splicing' in out_feat.get('flags', []):
                return
            if out_feat['location'] == sorted(
                    out_feat['location'],
                    reverse=(in_feature.location.strand == -1)):
                return
            if record.id in self.circ_contigs and \
                    in_feature.location.start == 0 \
                    and in_feature.location.end == len(record):
                self.features_spaning_zero.add(out_feat['id'])
                return
            if parent and parent['id'] in self.features_spaning_zero:
                return
            _warn(warnings['not_trans_spliced'])
            self.defects['not_trans_spliced'] += 1

        for in_feature in record.features:
            if in_feature.type in self.excluded_features:
                self.skiped_features[in_feature.type] += 1
                continue
            feat_seq = self._get_seq(in_feature, record.id)
            if source == "Ensembl":
                _id = self._get_id(in_feature, ['gene', 'locus_tag'])
            else:
                _id = self._get_id(in_feature)

            # The following is common to all the feature types
            out_feat = {
                "id": "_".join([_id, in_feature.type]),
                "location": _location(in_feature),
                "dna_sequence": str(feat_seq),
                "dna_sequence_length": len(feat_seq),
                "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(),
            }
            if not _id:
                out_feat['id'] = in_feature.type

            # validate input feature
            # note that end is the larger number regardless of strand
            if int(in_feature.location.end) > len(record):
                self.genome_warnings.append(
                    warnings["coordinates_off_end"].format(out_feat['id']))
                self.genome_suspect = 1
                continue

            for piece in in_feature.location.parts:
                if not isinstance(piece.start, ExactPosition) \
                        or not isinstance(piece.end, ExactPosition):
                    _warn(warnings["non_exact_coordinates"])

            self.feature_counts[in_feature.type] += 1

            # add optional fields
            if 'note' in in_feature.qualifiers:
                out_feat['note'] = in_feature.qualifiers["note"][0]

            out_feat.update(self._get_aliases_flags_functions(in_feature))

            ont, db_xrefs = self._get_ontology_db_xrefs(in_feature)
            if ont:
                out_feat['ontology_terms'] = ont
            if db_xrefs:
                out_feat['db_xrefs'] = db_xrefs

            if 'inference' in in_feature.qualifiers:
                out_feat['inference_data'] = parse_inferences(
                    in_feature.qualifiers['inference'])

            _check_suspect_location(self.genes.get(_id))

            # add type specific features
            if in_feature.type == 'CDS':
                self.process_cds(_id, feat_seq, in_feature, out_feat)

            elif in_feature.type == 'gene':
                self.process_gene(_id, out_feat)

            elif in_feature.type == 'mRNA':
                self.process_mrna(_id, out_feat)

            else:
                self.noncoding.append(
                    self.process_noncoding(_id, in_feature.type, out_feat))

    def get_feature_lists(self):
        """sort genes into their final arrays"""
        coding = []
        for g in self.genes.values():
            if len(g['cdss']):
                if g['mrnas'] and len(g['mrnas']) != len(g['cdss']):
                    msg = "The length of the mrna and cdss arrays are not equal"
                    g['warnings'] = g.get('warnings', []) + [msg]

                # remove duplicates that may arise from CDS info propagation
                for key in ('functions', 'aliases', 'db_xrefs'):
                    if key in g:
                        g[key] = list(set(g[key]))
                if not g['mrnas']:
                    del g['mrnas']
                del g['type']
                coding.append(g)
                self.feature_counts["protein_encoding_gene"] += 1
            else:
                # gene with no CDS children is treated as non-coding
                del g['mrnas'], g['cdss']
                self.noncoding.append(g)
                self.feature_counts["non_coding_genes"] += 1

        self.feature_counts["non_coding_features"] = len(self.noncoding)
        return {
            'features': coding,
            'non_coding_features': self.noncoding,
            'cdss': list(self.cdss.values()),
            'mrnas': list(self.mrnas.values())
        }

    def _get_seq(self, feat, contig):
        """Extract the DNA sequence for a feature"""
        seq = []
        for part in feat.location.parts:
            strand = part.strand
            # handle trans-splicing across contigs
            if part.ref:
                part_contig = part.ref
            else:
                part_contig = contig

            if strand >= 0:
                seq.append(
                    str(self.contig_seq[part_contig][part.start:part.end]))
            else:
                seq.append(
                    str(self.contig_seq[part_contig]
                        [part.start:part.end].reverse_complement()))
        return "".join(seq)

    def _create_ontology_event(self, ontology_type):
        """Creates the ontology_event if necessary
        Returns the index of the ontology event back."""
        if ontology_type not in self.ont_mappings:
            raise ValueError(f"{ontology_type} is not a supported ontology")

        if "event_index" not in self.ont_mappings[ontology_type]:
            # First use of this ontology: register a new event record.
            self.ont_mappings[ontology_type]['event_index'] = len(
                self.ontology_events)
            if ontology_type == "GO":
                ontology_ref = "KBaseOntology/gene_ontology"
            elif ontology_type == "PO":
                ontology_ref = "KBaseOntology/plant_ontology"
            else:
                ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology"
            self.ontology_events.append({
                "method": "GenomeFileUtils Genbank uploader from annotations",
                "method_version": self.version,
                "timestamp": self.time_string,
                "id": ontology_type,
                "ontology_ref": ontology_ref
            })

        return self.ont_mappings[ontology_type]['event_index']

    def _get_ontology_db_xrefs(self, feature):
        """Splits the ontology info from the other db_xrefs"""
        ontology = defaultdict(dict)
        db_xrefs = []
        for key in ("GO_process", "GO_function", "GO_component"):
            ontology_event_index = self._create_ontology_event("GO")
            for term in feature.qualifiers.get(key, []):
                sp = term.split(" - ")
                ontology['GO'][sp[0]] = [ontology_event_index]
                self.ontologies_present['GO'][
                    sp[0]] = self.ont_mappings['GO'].get(sp[0], '')

        # db_xref values with a recognized ontology prefix become ontology
        # terms; everything else is kept as a plain (source, id) xref.
        for ref in feature.qualifiers.get('db_xref', []):
            if ref.startswith('GO:'):
                ontology['GO'][ref] = [self._create_ontology_event("GO")]
                self.ontologies_present['GO'][ref] = self.ont_mappings[
                    'GO'].get(ref, '')
            elif ref.startswith('PO:'):
                ontology['PO'][ref] = [self._create_ontology_event("PO")]
                self.ontologies_present['PO'][ref] = self.ont_mappings[
                    'PO'].get(ref, '')
            elif ref.startswith('KO:'):
                ontology['KO'][ref] = [self._create_ontology_event("KO")]
                self.ontologies_present['KO'][ref] = self.ont_mappings[
                    'KO'].get(ref, '')
            elif ref.startswith('COG'):
                ontology['COG'][ref] = [self._create_ontology_event("COG")]
                self.ontologies_present['COG'][ref] = self.ont_mappings[
                    'COG'].get(ref, '')
            elif ref.startswith('PF'):
                ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")]
                self.ontologies_present['PFAM'][ref] = self.ont_mappings[
                    'PFAM'].get(ref, '')
            elif ref.startswith('TIGR'):
                ontology['TIGRFAM'][ref] = [
                    self._create_ontology_event("TIGRFAM")
                ]
                self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[
                    'TIGRFAM'].get(ref, '')
            elif ":" not in ref:
                db_xrefs.append(tuple(["Unknown_Source", ref]))
            else:
                db_xrefs.append(tuple(ref.split(":", 1)))
        return dict(ontology), sorted(db_xrefs)

    @staticmethod
    def _get_aliases_flags_functions(feat):
        """Get the values for aliases flags and features from qualifiers"""
        alias_keys = {
            'locus_tag', 'old_locus_tag', 'protein_id', 'transcript_id',
            'gene', 'EC_number', 'gene_synonym'
        }
        result = defaultdict(list)
        for key, val_list in feat.qualifiers.items():
            if key in alias_keys:
                result['aliases'].extend([(key, val) for val in val_list])
            # flags have no other information associated with them
            if val_list == ['']:
                result['flags'].append(key)
            if key == 'function':
                result['functional_descriptions'].extend(
                    val_list[0].split('; '))
            if key == 'product':
                result['functions'] = val_list

        return result

    def _find_parent_gene(self, potential_id, feature):
        """Unfortunately, Genbank files don't have a parent ID and the
        features can be out of order at times. To account for this, the this
        function works backwards from the end of list of IDs and stops when
        if finds a parent with valid coordinates or it hits the maximum
        number of tries"""
        if potential_id in self.genes:
            lookup_attempts = 0
            while lookup_attempts < MAX_PARENT_LOOKUPS:
                if is_parent(self.genes[potential_id], feature):
                    return potential_id

                lookup_attempts += 1
                try:
                    potential_id = list(
                        self.genes.keys())[-(lookup_attempts + 1)]
                except IndexError:
                    break  # no more genes that could match exist

            self.defects['bad_parent_loc'] += 1
        return None

    def assign_new_id(self, _id):
        """given a feature id that has already been used, add a unique
        modifier to it"""
        _id_modifier = self.used_twice_identifiers.get(_id, 1)
        self.used_twice_identifiers[_id] = _id_modifier + 1
        return _id + "." + str(_id_modifier)

    def process_gene(self, _id, out_feat):
        """Register a gene feature, disambiguating duplicate ids."""
        out_feat.update({
            "id": _id,
            "type": 'gene',
            "mrnas": [],
            'cdss': [],
        })
        if _id in self.genes:
            _id = self.assign_new_id(_id)
            out_feat.update({"id": _id})
            # raise ValueError(f"Duplicate gene ID: {_id}")
        self.genes[_id] = out_feat

    def process_noncoding(self, gene_id, feat_type, out_feat):
        """Attach a non-coding feature to its parent gene (or mark orphan)."""
        out_feat["type"] = feat_type

        # this prevents big misc_features from blowing up the genome size
        if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE:
            del out_feat['dna_sequence']

        gene_id = self._find_parent_gene(gene_id, out_feat)
        if gene_id:
            if 'children' not in self.genes[gene_id]:
                self.genes[gene_id]['children'] = []
            out_feat['id'] += "_" + str(
                len(self.genes[gene_id]['children']) + 1)
            self.genes[gene_id]['children'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types[feat_type] += 1
            out_feat['id'] += "_" + str(self.orphan_types[feat_type])

        return out_feat

    def process_mrna(self, gene_id, out_feat):
        """Attach an mRNA to its parent gene, spoofing the gene if allowed."""
        if gene_id not in self.genes and self.generate_parents:
            self.process_gene(gene_id, copy.copy(out_feat))

        gene_id = self._find_parent_gene(gene_id, out_feat)
        if gene_id:
            out_feat['id'] = "_".join(
                (gene_id, "mRNA",
                 str(len(self.genes[gene_id]['mrnas']) + 1)))
            self.genes[gene_id]['mrnas'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types['mrna'] += 1
            out_feat['id'] = f"mRNA_{self.orphan_types['mrna']}"
            out_feat['warnings'] = out_feat.get('warnings', []) + [
                'Unable to find parent gene for ' + str(out_feat['id'])
            ]
        self.mrnas[out_feat['id']] = out_feat

    def process_cds(self, gene_id, feat_seq, in_feature, out_feat):
        """Attach a CDS to its parent gene/mRNA and validate its translation.

        Spoofs a parent gene when allowed, cross-checks the supplied protein
        translation against the DNA sequence, and propagates CDS properties
        up to the parent gene.
        """
        # Associate CDS with parents
        cds_warnings = out_feat.get('warnings', [])
        validated_gene_id = self._find_parent_gene(gene_id, out_feat)
        if validated_gene_id:
            out_feat['id'] = "_".join(
                (validated_gene_id, "CDS",
                 str(len(self.genes[validated_gene_id]['cdss']) + 1)))
            self.genes[validated_gene_id]['cdss'].append(out_feat['id'])
            out_feat['parent_gene'] = validated_gene_id
        elif self.generate_parents and gene_id not in self.genes:
            # create a spoofed gene to act as the parent
            new_feat = copy.copy(out_feat)
            new_feat['id'] = gene_id
            new_feat['warnings'] = [warnings['spoofed_gene']]
            self.orphan_types['gene'] += 1
            self.defects['spoofed_genes'] += 1
            self.process_gene(new_feat['id'], new_feat)

            out_feat['id'] = "_".join(
                (gene_id, "CDS", str(len(self.genes[gene_id]['cdss']) + 1)))
            self.genes[gene_id]['cdss'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types['cds'] += 1
            out_feat['id'] = f"CDS_{self.orphan_types['cds']}"
            cds_warnings.append(
                f"Unable to find parent gene for {out_feat['id']}")

        # there is a 1 to 1 relationship of mRNA to CDS so XXX_mRNA_1 will match XXX_CDS_1
        mrna_id = out_feat["id"].replace('CDS', 'mRNA')
        if mrna_id in self.mrnas:
            if not is_parent(self.mrnas[mrna_id], out_feat):
                cds_warnings.append(warnings['cds_mrna_cds'].format(mrna_id))
                self.mrnas[mrna_id]['warnings'] = self.mrnas[mrna_id].get(
                    'warnings', []) + [warnings['cds_mrna_mrna']]
                self.defects['bad_parent_loc'] += 1
            else:
                out_feat['parent_mrna'] = mrna_id
                self.mrnas[mrna_id]['cds'] = out_feat['id']

        # process protein
        prot_seq = in_feature.qualifiers.get("translation", [""])[0]

        # allow a little slack to account for frameshift and stop codon
        if prot_seq and abs(len(prot_seq) * 3 - len(feat_seq)) > 4:
            cds_warnings.append(warnings["inconsistent_CDS_length"].format(
                len(feat_seq), len(prot_seq)))
            self.genome_warnings.append(
                warnings['genome_inc_CDS_length'].format(
                    out_feat['id'], len(feat_seq), len(prot_seq)))
            self.genome_suspect = 1

        try:
            if prot_seq and prot_seq != Seq.translate(
                    feat_seq, self.code_table, cds=True).strip("*"):
                cds_warnings.append(warnings["inconsistent_translation"])
                self.defects['cds_seq_not_matching'] += 1

        except TranslationError as e:
            cds_warnings.append("Unable to verify protein sequence:" + str(e))

        if not prot_seq:
            # no translation supplied in the file: derive one ourselves
            try:
                prot_seq = Seq.translate(
                    feat_seq, self.code_table, cds=True).strip("*")
                cds_warnings.append(warnings["no_translation_supplied"])

            except TranslationError as e:
                cds_warnings.append(warnings["no_translation_supplied"] +
                                    str(e))

        out_feat.update({
            "protein_translation": prot_seq,
            "protein_md5": hashlib.md5(prot_seq.encode('utf8')).hexdigest(),
            "protein_translation_length": len(prot_seq),
        })

        if out_feat.get('parent_gene'):
            propagate_cds_props_to_gene(out_feat,
                                        self.genes[out_feat['parent_gene']])

        if cds_warnings:
            out_feat['warnings'] = cds_warnings
        self.cdss[out_feat['id']] = out_feat
class FastaToAssembly:
    """Validate, stage, parse and save a FASTA file as a KBase Assembly."""

    def __init__(self, callback_url, scratch, ws_url):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)
        self.ws = Workspace(ws_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"

    def import_fasta(self, ctx, params):
        """Full import pipeline: validate params, stage the input file,
        optionally filter short contigs, parse, and save to the workspace.

        Returns the saved object info from the workspace.
        """
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print(f'filtering FASTA file by contig length (min len={min_contig_length} bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print(f'parsing FASTA file: {fasta_file_path}')
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(f' - parsed {assembly_data["num_contigs"]} contigs,{assembly_data["dna_size"]} bp')

        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data, fasta_file_handle_info, params)
        # NOTE(review): debug dump; the file handle from open() is never
        # closed here — consider a `with` block.
        json.dump(assembly_object_to_save, open(self.scratch + "/example.json", 'w'))

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id, params['assembly_name'], assembly_object_to_save)

        return assembly_info

    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        """ construct the WS object data to save based on the parsed info and params """
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        # NOTE(review): self-assignment below is a no-op; kept as-is.
        fasta_file_handle_info['handle'] = fasta_file_handle_info['handle']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            # resolve the object reference to a stable ws/obj/ver string
            info = self.ws.get_object_info3({'objects': [{'ref': params['taxon_ref']}]})['infos'][0]
            assembly_data['taxon_ref'] = f'{info[6]}/{info[0]}/{info[4]}'

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = params['external_source_origination_date']

        return sort_dict(assembly_data)

    def parse_fasta(self, fasta_file_path, params):
        """ Do the actual work of inspecting each contig """

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()
            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This FASTA file may have amino acids in it instead '
                                         'of the required nucleotides.')
                    raise ValueError(f"This FASTA file has non nucleic acid characters: "
                                     f"{character}")

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
            contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence.encode()).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                # NOTE(review): error message is missing a space before
                # 'appears'; left unchanged (runtime string).
                raise ValueError('The FASTA header key ' + contig_info['contig_id'] +
                                 'appears more than once in the file')

            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list)).encode()).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data

    @staticmethod
    def fasta_filter_contigs_generator(fasta_record_iter, min_contig_length):
        """ generates SeqRecords iterator for writing from a legacy contigset object """
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(f' - filtered out {rows - rows_added} of {rows} contigs that were shorter '
              f'than {(min_contig_length)} bp.')

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        """ removes all contigs less than the min_contig_length provided """
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path

    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        """Save the assembly object dict to the workspace; returns obj info."""
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        if len(obj_data["contigs"]) == 0:
            raise ValueError('There are no contigs to save, thus there is no valid assembly.')
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info

    def save_fasta_file_to_shock(self, fasta_file_path):
        """ Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;
        """
        print(f'Uploading FASTA file ({fasta_file_path}) to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})

    def stage_input(self, params):
        """ Setup the input_directory by fetching the files and returning the path to the file"""
        file_path = None
        if 'file' in params:
            if not os.path.isfile(params['file']['path']):
                raise ValueError('KBase Assembly Utils tried to save an assembly, but the calling application specified a file ('+params['file']['path']+') that is missing. Please check the application logs for details.')
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print(f'Downloading file from SHOCK node: {params["shock_id"]}')
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print(f'Downloading file from: {params["ftp_url"]}')
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid FASTA could be extracted based on the input parameters')

    @staticmethod
    def validate_params(params):
        """Check required keys and that exactly one input source is given."""
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of either 'file', 'shock_id', or ftp_url is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a FASTA file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required FASTA file as input, set as either "file", "shock_id", or "ftp_url"')

        if input_count > 1:
            raise ValueError('required exactly one FASTA file as input source, you set more than one of ' +
                             'these fields: "file", "shock_id", or "ftp_url"')