def _post_process(output_spp_name, output_pdf_name, output_dir, temp_dir): ''' A static method for post processing ppqt analysis :param output_spp_name: Output spp filename :param output_pdf_name: Output pdf filename :param output_dir: Destination output dir :param temp_dir: Source temp dir :returns: spp output path and pdf output path ''' try: tmp_spp_file = os.path.join(temp_dir, output_spp_name) dest_spp_file = os.path.join(output_dir, output_spp_name) tmp_pdf_file = os.path.join(temp_dir, output_pdf_name) dest_pdf_file = os.path.join(output_dir, output_pdf_name) check_file_path(tmp_spp_file) check_file_path(tmp_pdf_file) copy_local_file(\ source_path=tmp_spp_file, destinationa_path=dest_spp_file, force=True) copy_local_file(\ source_path=tmp_pdf_file, destinationa_path=dest_pdf_file, force=True) return dest_spp_file, dest_pdf_file except: raise
def _parse_spp_output(spp_file):
  '''
  An internal static method for parsing the PPQT spp output file

  :param spp_file: A spp.out filepath
  :returns: A list of dictionaries
  '''
  try:
    check_file_path(spp_file)
    column_names = [
      "PPQT_Filename", "PPQT_numReads", "PPQT_estFragLen",
      "PPQT_corr_estFragLen", "PPQT_PhantomPeak", "PPQT_corr_phantomPeak",
      "PPQT_argmin_corr", "PPQT_min_corr", "PPQT_Normalized_SCC_NSC",
      "PPQT_Relative_SCC_RSC", "PPQT_QualityTag"]
    data = \
      pd.read_csv(
        spp_file,
        sep='\t',
        dtype=object,
        names=column_names)
    return data.to_dict(orient='records')
  except Exception as e:
    raise ValueError(
      'Failed to parse file {0}, got error {1}'.format(spp_file, e))
def _generate_ipynb_from_template(self, param_map): ''' An internal method to generate notebook from template :param param_map: A dictionary for parameter substitution in output notebook :returns: A output notebook path ''' try: check_file_path(self.template_ipynb_path) check_file_path(self.temp_dir) if not isinstance(param_map, dict): raise TypeError( "Expecting a dictionary for notebook param substitution, got {0}".\ format(type(param_map))) notebook_output = \ os.path.join( self.temp_dir, os.path.basename(self.template_ipynb_path)) template_env = \ Environment( loader=\ FileSystemLoader( searchpath=os.path.dirname(self.template_ipynb_path)), autoescape=select_autoescape(['html', 'xml'])) notebook = \ template_env.\ get_template( os.path.basename(self.template_ipynb_path)) notebook.\ stream(**param_map).\ dump(notebook_output) return notebook_output except Exception as e: raise ValueError("Failed to generate notebook for template: {0}, error, {1}".\ format(self.template_ipynb_path,e))
def _pre_process(self, input_bam, output_spp_name, output_pdf_name, output_dir, temp_dir): ''' An internal method for preprocessing before the exe run :param input_bam: Input bam file :param output_spp_name: Output spp filename :param output_pdf_name: Output pdf filename :param output_dir: Destination output dir :param temp_dir: Source temp dir ''' try: check_file_path(self.rscript_path) check_file_path(self.ppqt_exe) if not os.path.exists(output_dir): os.makedirs(output_dir, mode=0o770) output_pdf = os.path.join(temp_dir, output_pdf_name) output_spp = os.path.join(temp_dir, output_spp_name) run_cmd = \ [quote(self.rscript_path), quote(self.ppqt_exe), quote('-c={0}'.format(input_bam)), quote('-rf'), quote('-p={0}'.format(str(self.threads))), quote('-savp={0}'.format(output_pdf)), quote('-out={0}'.format(output_spp)), quote('-tmpdir={0}'.format(temp_dir)), quote('-odir={0}'.format(output_dir))] return run_cmd except: raise
def _copy_container_output_and_update_map(self, temp_notebook_path): ''' An internal method to copy output files from container output dir and update the output map dictionary :returns: A new dictionary with updated filepath in the values ''' try: new_output_map = dict() if self.output_file_map is not None and \ isinstance(self.output_file_map,dict): for key, container_path in self.output_file_map.items(): mount_path = \ os.path.join( self.temp_dir, os.path.basename(container_path)) # get output path in container mounted dir check_file_path(mount_path) # check if its present final_path = \ os.path.join( self.output_dir, os.path.basename(mount_path)) # get target path if os.path.isfile(mount_path): copy_local_file(mount_path, final_path) # copy file with filename elif os.path.isdir(mount_path): copy_local_file( mount_path, self.output_dir) # copy dir to target dir new_output_map.\ update({key:final_path}) # update output map if self.output_format == 'html': temp_notebook_path = \ temp_notebook_path.replace('.ipynb','.html') elif self.output_format == 'markdown': temp_notebook_path = \ temp_notebook_path.replace('.ipynb','.md') elif self.output_format == 'notebook': temp_notebook_path = temp_notebook_path elif self.output_format == 'pdf': temp_notebook_path = \ temp_notebook_path.replace('.ipynb','.pdf') elif self.output_format == 'python': temp_notebook_path = \ temp_notebook_path.replace('.ipynb','.py') elif self.output_format == 'slide': temp_notebook_path = \ temp_notebook_path.replace('.ipynb','.html') check_file_path(temp_notebook_path) output_notebook_path = \ os.path.join( self.output_dir, os.path.basename(temp_notebook_path)) copy_local_file(temp_notebook_path, output_notebook_path) # copy notbook file new_output_map.\ update({self.notebook_tag:output_notebook_path}) return new_output_map except Exception as e: raise ValueError( "Failed to copy files from container mount dir, error: {0}".\ format(e))
def generate_ipynb_from_template(template_ipynb_path, output_dir, param_dictionary,
                                 date_tag='date_tag', use_ephemeral_space=False):
  '''
  A function for generating a notebook (ipynb) file from a template file with param substitution

  :param template_ipynb_path: A template IPYNB file path
  :param output_dir: Output path
  :param param_dictionary: A dictionary containing the params for the final notebook
  :param date_tag: A text for date tag name, default date_tag
  :param use_ephemeral_space: Toggle for using ephemeral space for temp dir, default False
  :returns: None
  '''
  try:
    check_file_path(template_ipynb_path)
    check_file_path(output_dir)
    if not isinstance(param_dictionary, dict):
      raise TypeError(
        "Expecting a dictionary, got {0}".format(type(param_dictionary)))
    date_tag_value = \
      datetime.strftime(
        datetime.now(), '%Y-%b-%d %H:%M')  # date tag value
    param_dictionary.update(dict(date_tag=date_tag_value))  # adding date tag value to params
    temp_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)
    temp_output = \
      os.path.join(
        temp_dir, os.path.basename(template_ipynb_path))
    final_output = \
      os.path.join(
        output_dir, os.path.basename(template_ipynb_path))
    template_env = \
      Environment(
        loader=FileSystemLoader(
          searchpath=os.path.dirname(template_ipynb_path)),
        autoescape=select_autoescape(['html', 'xml']))
    notebook = \
      template_env.get_template(
        os.path.basename(template_ipynb_path))
    notebook.stream(**param_dictionary).dump(temp_output)  # write temp ipynb file with param substitution
    copy_local_file(temp_output, final_output)
    remove_dir(temp_dir)
  except Exception as e:
    raise ValueError(
      "Failed to generate ipynb file from template {1}, error: {0}".format(e, template_ipynb_path))
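# A minimal usage sketch for generate_ipynb_from_template(); the template path,
# output dir and parameter names below are placeholders, not values from this codebase.
def _example_generate_ipynb_from_template():
  params = {
    'PROJECT_IGF_ID': 'IGFQ000001',
    'INPUT_MATRIX': '/path/to/matrix.csv'}  # hypothetical template params
  generate_ipynb_from_template(
    template_ipynb_path='/path/to/template.ipynb',
    output_dir='/path/to/report_dir',
    param_dictionary=params,
    use_ephemeral_space=False)  # writes the template basename under output_dir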
def extract_cellranger_count_metrics_summary(
      cellranger_tar, collection_name=None, collection_type=None,
      attribute_name='attribute_name', attribute_value='attribute_value',
      attribute_prefix=None, target_filename='metrics_summary.csv'):
  '''
  A function for extracting the metrics summary file from a cellranger output tar and parsing it.
  Optionally it can add the collection name and type info to the output dictionary.

  :param cellranger_tar: A cellranger output tar file
  :param target_filename: A filename for the metrics summary file lookup, default metrics_summary.csv
  :param collection_name: Optional collection name, default None
  :param collection_type: Optional collection type, default None
  :param attribute_name: Column name for attribute names in the output, default attribute_name
  :param attribute_value: Column name for attribute values in the output, default attribute_value
  :param attribute_prefix: An optional string to add as prefix of the attribute names, default None
  :returns: A list of dictionaries containing the metrics values
  '''
  try:
    check_file_path(cellranger_tar)
    temp_work_dir = get_temp_dir(use_ephemeral_space=False)
    metrics_file = None
    with tarfile.open(cellranger_tar, mode='r') as tar:
      for file_name in tar.getnames():
        if os.path.basename(file_name) == target_filename:
          tar.extract(file_name, path=temp_work_dir)
          metrics_file = os.path.join(temp_work_dir, file_name)
    if metrics_file is None:
      raise IOError(
        'Required file {0} not found in tar {1}'.format(target_filename, cellranger_tar))
    attribute_data = pd.read_csv(metrics_file).T.reset_index()
    attribute_data.columns = [attribute_name, attribute_value]
    if attribute_prefix is None:
      attribute_data[attribute_name] = \
        attribute_data[attribute_name].\
          map(lambda x: x.replace(' ', '_'))
    else:
      attribute_data[attribute_name] = \
        attribute_data[attribute_name].\
          map(lambda x: '{0}_{1}'.format(attribute_prefix, x.replace(' ', '_')))
    if collection_name is not None:
      attribute_data['name'] = collection_name
    if collection_type is not None:
      attribute_data['type'] = collection_type
    attribute_data = attribute_data.to_dict(orient='records')
    remove_dir(temp_work_dir)
    return attribute_data
  except:
    raise
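# A usage sketch for extract_cellranger_count_metrics_summary(); the tar path and
# collection labels are placeholders. The call returns a list of dictionaries with
# attribute_name / attribute_value pairs parsed from metrics_summary.csv.
def _example_extract_cellranger_metrics():
  metrics = \
    extract_cellranger_count_metrics_summary(
      cellranger_tar='/path/to/cellranger_count_output.tar',
      collection_name='IGF_experiment_001',
      collection_type='CELLRANGER_RESULTS',
      attribute_prefix='cellranger_count')
  for row in metrics:
    print(row)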
def run_HaplotypeCaller(self, input_bam, output_vcf_path, dbsnp_vcf, emit_gvcf=True, force=False, dry_run=False, gatk_param_list=None): ''' A method for running GATK HaplotypeCaller :param input_bam: A input bam file :param output_vcf_path: A output vcf filepath :param dbsnp_vcf: A dbsnp vcf file :param emit_gvcf: A toggle for GVCF generation, default True :param force: Overwrite output file, if force is True :param dry_run: Return GATK command, if its true, default False :param gatk_param_list: List of additional params for BQSR, default None :returns: GATK commandline ''' try: self._run_gatk_checks() # run initial checks check_file_path(input_bam) check_file_path(dbsnp_vcf) temp_dir = \ get_temp_dir(use_ephemeral_space=self.use_ephemeral_space) # get temp dir temp_output = \ os.path.join( temp_dir, os.path.basename(output_vcf_path)) gatk_cmd = [ quote(self.gatk_exe), "HaplotypeCaller", "-I", quote(input_bam), "-O", quote(temp_output), "--reference", quote(self.ref_fasta), "--dbsnp", quote(dbsnp_vcf), "--java-options", quote(self.java_param) ] if emit_gvcf: gatk_cmd.extend(["--emit-ref-confidence", "GVCF"]) if gatk_param_list is not None and \ isinstance(gatk_param_list,list) and \ len(gatk_param_list) > 0: gatk_cmd.extend(gatk_param_list) # additional params gatk_cmd = ' '.join(gatk_cmd) if dry_run: return gatk_cmd subprocess.check_call(gatk_cmd, shell=True) copy_local_file(source_path=temp_output, destinationa_path=output_vcf_path, force=force) remove_dir(temp_dir) return gatk_cmd except Exception as e: raise ValueError( "Failed to run GATK HaplotypeCaller, error: {0}".\ format(e))
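# A dry-run sketch for run_HaplotypeCaller(); it assumes `gatk` is an instance of the
# surrounding GATK wrapper class (constructed elsewhere with gatk_exe, ref_fasta,
# java_param and use_ephemeral_space set) and the file paths are placeholders.
def _example_run_haplotypecaller(gatk):
  gatk_cmd = \
    gatk.run_HaplotypeCaller(
      input_bam='/path/to/sample.analysisReady.bam',
      output_vcf_path='/path/to/sample.g.vcf.gz',
      dbsnp_vcf='/path/to/dbsnp.vcf.gz',
      emit_gvcf=True,
      dry_run=True)  # returns the command line without running GATK
  print(gatk_cmd)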
def run_bam_idxstat(samtools_exe, bam_file, output_dir, output_prefix=None,
                    force=False, dry_run=False):
  '''
  A function for running samtools index stats generation

  :param samtools_exe: samtools executable path
  :param bam_file: A bam filepath with / without index. Index file will be created if it's missing
  :param output_dir: Bam idxstats output directory path
  :param output_prefix: Output file prefix, default None
  :param force: Output idxstats file will be overwritten if force is True, default False
  :param dry_run: A toggle for returning the samtools command without actually running it, default False
  :returns: Output file path and a list containing the samtools command
  '''
  try:
    check_file_path(samtools_exe)
    _check_bam_file(bam_file=bam_file)  # check bam file
    if not dry_run:
      _check_bam_index(
        samtools_exe=samtools_exe,
        bam_file=bam_file)  # generate bam index
    if output_prefix is None:
      output_prefix = os.path.basename(bam_file)
    output_path = \
      '{0}.{1}.{2}'.format(output_prefix, 'idxstats', 'txt')  # get output filename
    output_path = \
      os.path.join(
        output_dir, output_path)  # get complete output path
    if not os.path.exists(output_dir):
      raise IOError('Output path {0} not found'.format(output_dir))
    if os.path.exists(output_path) and not force:
      raise ValueError(
        'Output file {0} already present, use force to overwrite'.format(output_path))
    idxstat_cmd = [
      quote(samtools_exe),
      'idxstats',
      quote(bam_file)]
    if dry_run:
      return idxstat_cmd
    with open(output_path, 'w') as fp:
      with subprocess.Popen(idxstat_cmd, stdout=subprocess.PIPE) as proc:
        fp.write(proc.stdout.read().decode('utf-8'))  # write bam idxstats output
    return output_path, idxstat_cmd
  except:
    raise
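# A usage sketch for run_bam_idxstat(); the samtools path, bam path and output dir
# are placeholders. The function writes <prefix>.idxstats.txt under output_dir and
# returns the output path together with the samtools command list.
def _example_run_bam_idxstat():
  output_path, idxstat_cmd = \
    run_bam_idxstat(
      samtools_exe='/usr/bin/samtools',
      bam_file='/path/to/sample.bam',
      output_dir='/path/to/qc_dir',
      output_prefix='sample',
      force=True)
  print(output_path, idxstat_cmd)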
def run(self): ''' A method for running the cellranger count metrics extraction :param project_igf_id: A project igf id :param experiment_igf_id: An experiment igf id :param sample_igf_id: A sample igf id :param igf_session_class: A database session class :param analysis_output_list: Cellranger analysis tar output path :param collection_type: Cellranger results collection type :param metrics_filename: Name of the metrics file, default metrics_summary.csv :returns: None ''' try: project_igf_id = self.param_required('project_igf_id') experiment_igf_id = self.param_required('experiment_igf_id') sample_igf_id = self.param_required('sample_igf_id') igf_session_class = self.param_required('igf_session_class') analysis_output_list = self.param_required('analysis_output_list') collection_type = self.param('collection_type') metrics_filename = self.param('metrics_filename') attribute_prefix = self.param('attribute_prefix') for infile in analysis_output_list: check_file_path(infile) # check input file path cellranger_tar = analysis_output_list[0] cellranger_metrics = extract_cellranger_count_metrics_summary(\ cellranger_tar=cellranger_tar, target_filename=metrics_filename, collection_name=experiment_igf_id, collection_type=collection_type, attribute_prefix=attribute_prefix ) # extract cellranger metrics stats as dictionary ca = CollectionAdaptor(**{'session_class':igf_session_class}) ca.start_session() try: ca.create_or_update_collection_attributes(\ data=cellranger_metrics, autosave=False) # load cellranger metrics to collection attribute table ca.commit_session() ca.close_session() except: ca.rollback_session() ca.close_session() raise self.param('dataflow_params',{'cellranger_attribute':'done'}) except Exception as e: message='project: {2}, sample:{3}, Error in {0}: {1}'.\ format(self.__class__.__name__, e, project_igf_id, sample_igf_id) self.warning(message) self.post_message_to_slack(message,reaction='fail') # post msg to slack for failed jobs raise
def singularity_run(image_path, path_bind, args_list, container_dir='/tmp',
                    return_results=True, use_ephemeral_space=False, dry_run=False):
  '''
  A wrapper module for running singularity based containers

  :param image_path: Singularity image path
  :param path_bind: Host path to bind to the singularity container dir
  :param args_list: List of args for the singularity run
  :param container_dir: Container mount path for path_bind, default /tmp
  :param return_results: Return singularity run results, default True
  :param use_ephemeral_space: Toggle for using ephemeral space for temp dir, default False
  :param dry_run: Return the singularity command without run, default False
  :returns: A response from the container run and a string containing the singularity command line
  '''
  try:
    check_file_path(image_path)
    check_file_path(path_bind)
    temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)
    res = None
    temp_image_path = \
      os.path.join(
        temp_dir, os.path.basename(image_path))
    copy_local_file(image_path, temp_image_path)  # copy image to tmp dir
    if not isinstance(args_list, list) or \
       len(args_list) == 0:
      raise ValueError(
        'No args provided for singularity run')  # safemode
    args = ' '.join(args_list)  # flatten args
    singularity_run_cmd = \
      'singularity run {0} --bind {1}:{2} {3}'.\
        format(
          temp_image_path,
          path_bind,
          container_dir,
          args)
    if dry_run:
      return res, singularity_run_cmd
    else:
      res = \
        Client.run(
          image=temp_image_path,
          bind='{0}:{1}'.format(path_bind, container_dir),
          args=args,
          return_result=return_results)
    remove_dir(temp_dir)  # remove copied image after run
    return res, singularity_run_cmd
  except Exception as e:
    raise ValueError(
      'Failed to run image {0}, error: {1}'.format(image_path, e))
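# A usage sketch for singularity_run(); the image path, bind path and args are
# placeholders. With dry_run=True the function only returns the assembled
# singularity command line, which is useful for logging before a real run.
def _example_singularity_run():
  res, run_cmd = \
    singularity_run(
      image_path='/path/to/tool_image.sif',
      path_bind='/path/to/work_dir',
      args_list=['--input', '/tmp/input.txt', '--output', '/tmp/output.txt'],
      dry_run=True)
  print(run_cmd)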
def _run_gatk_checks(self): ''' An internal method for running checks before GATK run ''' try: check_file_path(self.gatk_exe) check_file_path(self.ref_fasta) except Exception as e: raise ValueError( "Failed to run GATK checks, error: {0}".\ format(e))
def run_AnalyzeCovariates(self, before_report_file, after_report_file, output_pdf_path, force=False, dry_run=False, gatk_param_list=None): ''' A method for running GATK AnalyzeCovariates tool :param before_report_file: A file containing bqsr output before recalibration :param after_report_file: A file containing bqsr output after recalibration :param output_pdf_path: An output pdf filepath :param force: Overwrite output file, if force is True :param dry_run: Return GATK command, if its true, default False :param gatk_param_list: List of additional params for BQSR, default None :returns: GATK commandline ''' try: self._run_gatk_checks() # run initial checks check_file_path(before_report_file) check_file_path(after_report_file) temp_dir = \ get_temp_dir(use_ephemeral_space=self.use_ephemeral_space) # get temp dir temp_output = \ os.path.join( temp_dir, os.path.basename(output_pdf_path)) gatk_cmd = [ quote(self.gatk_exe), "AnalyzeCovariates", "--before-report-file", quote(before_report_file), "--after-report-file", quote(after_report_file), "--plots-report-file", quote(temp_output), "--java-options", quote(self.java_param) ] if gatk_param_list is not None and \ isinstance(gatk_param_list,list) and \ len(gatk_param_list) > 0: gatk_cmd.extend(gatk_param_list) # additional params gatk_cmd = ' '.join(gatk_cmd) if dry_run: return gatk_cmd subprocess.check_call(gatk_cmd, shell=True) copy_local_file(source_path=temp_output, destinationa_path=output_pdf_path, force=force) remove_dir(temp_dir) return gatk_cmd except Exception as e: raise ValueError( "Failed to run GATK AnalyzeCovariates, error: {0}".\ format(e))
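# A dry-run sketch for run_AnalyzeCovariates(); as above, `gatk` is assumed to be an
# instance of the surrounding GATK wrapper class and the report paths are placeholders.
def _example_run_analyzecovariates(gatk):
  gatk_cmd = \
    gatk.run_AnalyzeCovariates(
      before_report_file='/path/to/sample.bqsr_before.table',
      after_report_file='/path/to/sample.bqsr_after.table',
      output_pdf_path='/path/to/sample.bqsr_plots.pdf',
      dry_run=True)
  print(gatk_cmd)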
def _check_bam_file(bam_file):
  '''
  An internal method for checking bam file

  :param bam_file: A bam file path
  :raises ValueError: It raises ValueError if the bam file doesn't have a '.bam' extension
  :raises IOError: It raises IOError if the bam_file doesn't exist
  '''
  try:
    check_file_path(bam_file)
    if not fnmatch.fnmatch(bam_file, '*.bam'):
      raise ValueError(
        'Bam file extension is not correct: {0}'.format(bam_file))
  except:
    raise
def _check_cram_file(cram_path):
  '''
  An internal method for checking cram file

  :param cram_path: A cram file path
  :raises ValueError: It raises ValueError if the cram file doesn't have a '.cram' extension
  :raises IOError: It raises IOError if the cram_path doesn't exist
  '''
  try:
    check_file_path(cram_path)
    if not fnmatch.fnmatch(cram_path, '*.cram'):
      raise ValueError(
        'Cram file extension is not correct: {0}'.format(cram_path))
  except:
    raise
def compare_fastq_files_read_counts(r1_file, r2_file): ''' A method for comparing read counts for fastq pairs :param r1_file: Fastq pair R1 file path :param r2_file: Fastq pair R2 file path :raises: ValueError if counts are not same ''' try: check_file_path(r1_file) check_file_path(r2_file) r1_count = count_fastq_lines(r1_file) r2_count = count_fastq_lines(r2_file) if r1_count != r2_count: raise ValueError('Fastq pair does not have same number of reads: {0} {1}'.\ format(r1_file,r2_file)) except: raise
def count_fastq_lines(fastq_file):
  '''
  A method for counting fastq lines

  :param fastq_file: A gzipped or unzipped fastq file
  :returns: Fastq read count (line count divided by 4)
  '''
  try:
    gzipped_pattern = re.compile(r'\S+\.(fastq|fq)\.gz$')
    unzipped_pattern = re.compile(r'\S+\.(fastq|fq)$')
    lines = 0
    check_file_path(fastq_file)
    if re.match(gzipped_pattern, fastq_file):  # read gzipped file
      with gzip.open(fastq_file, 'rb') as f:
        buf_size = 1024 * 1024
        read_f = f.read
        buf = read_f(buf_size)
        while buf:
          lines += buf.count(b'\n')
          buf = read_f(buf_size)
    elif re.match(unzipped_pattern, fastq_file):  # read unzipped file
      with open(fastq_file, 'rb') as f:
        buf_size = 1024 * 1024
        read_f = f.raw.read
        buf = read_f(buf_size)
        while buf:
          lines += buf.count(b'\n')
          buf = read_f(buf_size)
    else:
      raise ValueError(
        'Failed to detect read mode for fastq file {0}'.format(fastq_file))
    if lines >= 4:
      if lines % 4 != 0:
        raise ValueError(
          'Fastq file {0} is missing a complete block of 4 lines'.format(fastq_file))
      lines = int(lines / 4)  # each fastq record is a block of 4 lines
    return lines
  except:
    raise
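# A usage sketch for count_fastq_lines() and compare_fastq_files_read_counts();
# the fastq paths are placeholders. count_fastq_lines() returns the number of reads
# for gzipped or plain fastq files.
def _example_fastq_read_counts():
  r1_reads = count_fastq_lines('/path/to/sample_R1.fastq.gz')
  r2_reads = count_fastq_lines('/path/to/sample_R2.fastq.gz')
  print(r1_reads, r2_reads)
  compare_fastq_files_read_counts(
    r1_file='/path/to/sample_R1.fastq.gz',
    r2_file='/path/to/sample_R2.fastq.gz')  # raises ValueError if counts differ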
def _check_bam_index(samtools_exe, bam_file, dry_run=False): ''' An internal method for checking bam index files. It will generate a new index if its not found. :param samtools_exe: samtools executable path :param bam_file: A bam file path :param dry_run: A toggle for returning the samtools command without actually running it, default False ''' try: check_file_path(samtools_exe) bam_index = '{0}.bai'.format(bam_file) if not os.path.exists(bam_index): index_bam_or_cram(\ samtools_exe=samtools_exe, input_path=bam_file, dry_run=dry_run ) except: raise
def _fetch_project_info_from_db(self): ''' An internal method for fetching data from db :returns: A dataframe containing following columns project_igf_id, sample_igf_id, expected_read, total_read ''' try: check_file_path(self.dbconfig_file) dbconf = read_dbconf_json(self.dbconfig_file) sa = SampleAdaptor(**dbconf) sa.start_session() query = sa.session.\ query(Project.project_igf_id, Sample.sample_igf_id, func.max(Sample_attribute.attribute_value).label(self.expected_read_tag), func.sum(Run_attribute.attribute_value).label(self.total_read_tag) ).\ outerjoin(Sample,Project.project_id==Sample.project_id).\ outerjoin(Sample_attribute, Sample.sample_id==Sample_attribute.sample_id).\ outerjoin(Experiment, Sample.sample_id==Experiment.sample_id).\ outerjoin(Run,Experiment.experiment_id==Run.experiment_id).\ outerjoin(Run_attribute,Run.run_id==Run_attribute.run_id).\ filter((Experiment.platform_name.in_(self.platform_list))|(Experiment.platform_name.is_(None))).\ filter(Sample_attribute.attribute_name==self.expected_read_tag).\ filter((Run_attribute.attribute_name==self.r1_read_tag)|(Run_attribute.attribute_name.is_(None))).\ group_by(Sample.sample_igf_id) records = sa.fetch_records(query=query, output_mode='dataframe') sa.close_session() records[self.total_read_tag] = records[self.total_read_tag].fillna(0).astype(int) return records except: raise
def run_sync(self): ''' A method for running the sequencing run sync ''' try: check_file_path(self.output_dir) all_seqrun_dir = \ list_remote_file_or_dirs(\ remote_server=self.seqrun_server, remote_path=self.seqrun_path, only_dirs=True) all_seqrun_dir = \ list(map(os.path.basename,all_seqrun_dir)) # convert paths to dirname new_seqrun_dirs = \ check_seqrun_dir_in_db(\ all_seqrun_dir=all_seqrun_dir, dbconfig=self.database_config_file) # filter existing seqruns for seqrun in new_seqrun_dirs: try: new_seqruns = \ check_seqrun_dir_in_db(\ all_seqrun_dir=[seqrun], dbconfig=self.database_config_file) # filter existing seqrun again if len(new_seqruns)>0: copy_remote_file(\ source_path=os.path.join(self.seqrun_path,seqrun), destinationa_path=self.output_dir, source_address=self.seqrun_server) # sync dirs if its still new except Exception as e: raise ValueError('Failed to sync seqrun {0}, got error {1}'.\ format(seqrun,e)) except Exception as e: raise ValueError('Stopped syncing seqrun data, got error: {0}'.\ format(e))
def _copy_to_container_temp(mount_dir, container_path_prefix, filepath): ''' An internal static method for copying files to container temp dir :param mount_dir: A dir path to mount in container :param container_path_prefix: Temp dir path in container :param filepath: File or dir path to copy :returns: A path in mounted temp dir and a path in the container temp dir ''' try: check_file_path(filepath) container_path = \ os.path.join( container_path_prefix, os.path.basename(filepath)) mount_dir_path = \ os.path.join( mount_dir, os.path.basename(filepath)) copy_local_file(filepath, mount_dir_path, force=True) return mount_dir_path, container_path except Exception as e: raise ValueError("Failed to copy path {0} to temp dir: {1}, error: {2}".\ format(filepath,mount_dir,e))
def _run_checks(self): ''' An internal method for running initial checks before bwa run ''' try: check_file_path(self.bwa_exe) if self.bam_output: check_file_path(self.samtools_exe) for file in self.input_fastq_list: check_file_path(file) check_file_path(self.output_dir) if len(self.input_fastq_list) > 2: raise ValueError('Expecting max 2 fastq files, got {0}'.\ format(len(self.input_fastq_list))) except: raise
def _run_checks(self): ''' An internal method for running initial checks before star run ''' try: check_file_path(self.star_exe) # checking star exe if not isinstance(self.input_files, list) or \ len(self.input_files)==0: raise ValueError('No input file list found for star') for file in self.input_files: check_file_path(file_path=file) # checking input file paths check_file_path(file_path=self.reference_gtf) # checking input gtf filepath except: raise
def convert_scanpy_h5ad_to_cellbrowser_dir(cbImportScanpy_path, h5ad_path, project_name, cellbrowser_htmldir, use_ephemeral_space=0): ''' A wrapper function for Scanpy h5ad file to UCSC cellbrowser html dir conversion :param cbImportScanpy_path: Path for cbImportScanpy executable :param h5ad_path: Path of input Scanpy h5ad file :param project_name: Project name for cellbrowser :param cellbrowser_htmldir: Output cellbrowser htmldir path :param use_ephemeral_space: A toggle for temp dir setting, default 0 ''' try: if os.path.exists(cellbrowser_htmldir): raise IOError('Cellbrowser output path already present') check_file_path(os.path.dirname(cellbrowser_htmldir)) check_file_path(cbImportScanpy_path) check_file_path(h5ad_path) temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space) temp_cellbrowser_html = \ os.path.join(\ temp_dir, os.path.basename(cellbrowser_htmldir)) temp_cellbrowser_dir = \ os.path.join(\ temp_dir, 'out') cbImportScanpy_cmd = \ [quote(cbImportScanpy_path), '-n',quote(project_name), '-i',quote(h5ad_path), '-o',temp_cellbrowser_dir, '--htmlDir',temp_cellbrowser_html ] subprocess.check_call(' '.join(cbImportScanpy_cmd), shell=True) copytree(\ temp_cellbrowser_html, cellbrowser_htmldir) except: raise
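# A usage sketch for convert_scanpy_h5ad_to_cellbrowser_dir(); the cbImportScanpy
# path, h5ad file and output html dir are placeholders. The output dir must not
# already exist, since the function raises IOError for existing paths.
def _example_convert_h5ad_to_cellbrowser():
  convert_scanpy_h5ad_to_cellbrowser_dir(
    cbImportScanpy_path='/path/to/cbImportScanpy',
    h5ad_path='/path/to/scanpy_results.h5ad',
    project_name='IGF_project_001',
    cellbrowser_htmldir='/path/to/cellbrowser_html')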
def _run_checks(self): ''' An internal method for running initial checks before fastp run ''' try: check_file_path(self.fastp_exe) if not isinstance(self.input_fastq_list, list): raise ValueError('No input fastq list found: {0}'.format(self.input_fastq_list)) if isinstance(self.run_thread, int): self.run_thread = str(self.run_thread) # convert run thread param to str for file in self.input_fastq_list: check_file_path(file) check_file_path(self.output_dir) if len(self.input_fastq_list) > 2: raise ValueError('Expecting max 2 fastq files, got {0}'.\ format(len(self.input_fastq_list))) except: raise
def run(self): ''' A method for running the cellranger count for a given sample using ehive pipeline :param project_igf_id: A project igf id :param experiment_igf_id: An experiment igf id :param sample_igf_id: A sample igf id :param biomaterial_type: Biomaterial type for samples, required for nuclei samples :param nuclei_biomaterial_type: Required keywords for nuclei samples, default 'SINGLE_NUCLEI' :param igf_session_class: A database session class :param cellranger_exe: Cellranger executable path :param cellranger_options: Cellranger parameters List of default parameters --jobmode=pbspro --localcores=1 --localmem=4 --mempercore=4 --maxjobs=20 :param base_work_dir: Base work directory path :param fastq_collection_type: Collection type name for input fastq files, default demultiplexed_fastq :param species_name: Reference genome collection name :param reference_type: Reference genome collection type, default TRANSCRIPTOME_TENX :param nuclei_reference_type: Reference genome collection type for pre-mRNA samples, default TRANSCRIPTOME_TENX_NUCLEI :param job_timeout: Timeout for cellranger job, default 24hrs :returns: Adding cellranger_output to the dataflow_params ''' try: project_igf_id = self.param_required('project_igf_id') experiment_igf_id = self.param_required('experiment_igf_id') sample_igf_id = self.param_required('sample_igf_id') igf_session_class = self.param_required('igf_session_class') cellranger_exe = self.param_required('cellranger_exe') cellranger_options = self.param_required('cellranger_options') base_work_dir = self.param_required('base_work_dir') fastq_collection_type = self.param_required( 'fastq_collection_type') biomaterial_type = self.param_required('biomaterial_type') job_timeout = self.param_required('job_timeout') nuclei_biomaterial_type = self.param('nuclei_biomaterial_type') species_name = self.param('species_name') reference_type = self.param('reference_type') nuclei_reference_type = self.param('nuclei_reference_type') # setup work dir for run work_dir = False work_dir_prefix = \ os.path.join(\ base_work_dir, project_igf_id, sample_igf_id, experiment_igf_id) work_dir = self.get_job_work_dir( work_dir=work_dir_prefix ) # replace this with temp dir while running in queue # setup env for run os.chdir(work_dir) # move to work dir os.environ['PATH'] += '{0}{1}'.format( os.pathsep, os.path.dirname( cellranger_exe)) # add cellranger location to env PATH # collect reference genome for run if biomaterial_type == nuclei_biomaterial_type: ref_genome = \ Reference_genome_utils(\ genome_tag=species_name, dbsession_class=igf_session_class, tenx_ref_type=nuclei_reference_type) # fetch ref genome for pre-mRNA samples else: ref_genome = \ Reference_genome_utils(\ genome_tag=species_name, dbsession_class=igf_session_class, tenx_ref_type=reference_type) # collect fastq input for run cellranger_ref_transcriptome = ref_genome.get_transcriptome_tenx( ) # fetch tenx ref transcriptome from db input_fastq_dirs = \ get_cellranger_count_input_list(\ db_session_class=igf_session_class, experiment_igf_id=experiment_igf_id, fastq_collection_type=fastq_collection_type) # fetch fastq dir paths as list for run # configure cellranger count command for run cellranger_options = \ self.format_tool_options(\ cellranger_options, separator='=') cellranger_cmd = \ [cellranger_exe, 'count', '{0}={1}'.format('--fastqs', quote(','.join(input_fastq_dirs))), '{0}={1}'.format('--id', quote(experiment_igf_id)), '{0}={1}'.format('--transcriptome', quote(cellranger_ref_transcriptome)), ] # set initial parameters 
cellranger_cmd.extend( cellranger_options) # add optional parameters # log before job submission message = \ 'started cellranger count for {0}, {1} {2}'.\ format(\ project_igf_id, sample_igf_id, experiment_igf_id) self.post_message_to_slack(message, reaction='pass') # send log to slack self.comment_asana_task(task_name=project_igf_id, comment=message) # send comment to Asana message = ' '.join(cellranger_cmd) self.comment_asana_task( task_name=project_igf_id, comment=message) # send cellranger command to Asana # start job execution cellranger_cmd = ' '.join( cellranger_cmd) # create shell command string subprocess.\ check_call(\ cellranger_cmd, shell=True, timeout=job_timeout) # run cellranger count using shell # prepare output after cellranger run cellranger_output = \ os.path.join(\ work_dir, experiment_igf_id, 'outs') # get cellranger output path message = \ 'finished cellranger count for {0}, {1} {2} : {3}'.\ format(\ project_igf_id, sample_igf_id, experiment_igf_id, cellranger_output) self.post_message_to_slack(message, reaction='pass') # send log to slack self.comment_asana_task(task_name=project_igf_id, comment=message) # send comment to Asana # validate output files after cellranger run check_cellranger_count_output( output_path=cellranger_output) # check output file cellranger_report = \ os.path.join(\ cellranger_output, 'web_summary.html') check_file_path(cellranger_report) self.param('dataflow_params',\ {'cellranger_output':cellranger_output, 'cellranger_report':cellranger_report}) # pass on cellranger output path except Exception as e: message = \ 'project: {2}, sample:{3}, Error in {0}: {1}'.\ format(\ self.__class__.__name__, e, project_igf_id, sample_igf_id) self.warning(message) self.post_message_to_slack( message, reaction='fail') # post msg to slack for failed jobs if work_dir: remove_dir(work_dir) raise
def run_samtools_view(samtools_exe, input_file, output_file, reference_file=None, force=True, cram_out=False, threads=1, samtools_params=None, index_output=True, dry_run=False, use_ephemeral_space=0): ''' A function for running samtools view command :param samtools_exe: samtools executable path :param input_file: An input bam filepath with / without index. Index file will be created if its missing :param output_file: An output file path :param reference_file: Reference genome fasta filepath, default None :param force: Output file will be overwritten if force is True, default True :param threads: Number of threads to use for conversion, default 1 :param samtools_params: List of samtools param, default None :param index_output: Index output file, default True :param dry_run: A toggle for returning the samtools command without actually running it, default False :param use_ephemeral_space: A toggle for temp dir settings, default 0 :returns: Samtools command as list ''' try: check_file_path(samtools_exe) _check_bam_file(bam_file=input_file) # check bam file if not dry_run: _check_bam_index(\ samtools_exe=samtools_exe, bam_file=input_file) # check bam index temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space) temp_file = \ os.path.join(\ temp_dir, os.path.basename(output_file)) # get temp output file path view_cmd = \ [quote(samtools_exe), 'view', '-o',quote(temp_file) ] # convert bam to cram using samtools if reference_file is not None: check_file_path(reference_file) view_cmd.extend(['-T', quote(reference_file)]) if threads is not None: view_cmd.append('-@{0}'.format(quote(str(threads)))) if cram_out: view_cmd.append('-C') if reference_file is None: raise ValueError('Reference file is required for cram output') else: view_cmd.append('-b') if samtools_params is not None and \ isinstance(samtools_params, list) and \ len(samtools_params) > 0: view_cmd.extend(\ [quote(i) for i in samtools_params]) # add additional params view_cmd.append(quote(input_file)) if dry_run: return view_cmd subprocess.check_call(\ ' '.join(view_cmd), shell=True) if cram_out: _check_cram_file(cram_path=temp_file) # check cram output copy_local_file(\ source_path=temp_file, destinationa_path=output_file, force=force) # move cram file to original path remove_dir(temp_dir) # remove temp directory if index_output: index_bam_or_cram(\ samtools_exe=samtools_exe, input_path=output_file, threads=threads) return view_cmd except: raise
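# A usage sketch for run_samtools_view() showing bam to cram conversion; the samtools,
# bam, cram and reference fasta paths are placeholders. cram_out=True requires a
# reference fasta, and the output cram is indexed when index_output is True.
def _example_bam_to_cram():
  view_cmd = \
    run_samtools_view(
      samtools_exe='/usr/bin/samtools',
      input_file='/path/to/sample.bam',
      output_file='/path/to/sample.cram',
      reference_file='/path/to/genome.fa',
      cram_out=True,
      threads=4,
      force=True)
  print(view_cmd)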
def merge_multiple_bam(samtools_exe, input_bam_list, output_bam_path, sorted_by_name=False, use_ephemeral_space=0, threads=1, force=False, dry_run=False, index_output=True): ''' A function for merging multiple input bams to a single output bam :param samtools_exe: samtools executable path :param input_bam_list: A file containing list of bam filepath :param output_bam_path: A bam output filepath :param sorted_by_name: Sort bam file by read_name, default False (for coordinate sorted bams) :param threads: Number of threads to use for merging, default 1 :param force: Output bam file will be overwritten if force is True, default False :param index_output: Index output bam, default True :param use_ephemeral_space: A toggle for temp dir settings, default 0 :param dry_run: A toggle for returning the samtools command without actually running it, default False :return: samtools command ''' try: check_file_path(samtools_exe) check_file_path(input_bam_list) with open(input_bam_list, 'r') as fp: for bam in fp: check_file_path(bam.strip()) temp_dir = \ get_temp_dir(use_ephemeral_space=use_ephemeral_space) temp_bam = \ os.path.join(\ temp_dir, os.path.basename(output_bam_path)) merge_cmd = \ [quote(samtools_exe), 'merge', '--output-fmt','BAM', '--threads',quote(str(threads)), '-b',quote(input_bam_list) ] if sorted_by_name: merge_cmd.append('-n') # Input files are sorted by read name merge_cmd.append(temp_bam) if dry_run: return merge_cmd subprocess.check_call(merge_cmd) # run samtools merge copy_local_file(\ source_path=temp_bam, destinationa_path=output_bam_path, force=force) # copy bamfile remove_dir(temp_dir) # remove temp dir _check_bam_file(output_bam_path) if index_output and \ not sorted_by_name: index_bam_or_cram(\ samtools_exe=samtools_exe, input_path=output_bam_path, threads=threads) return merge_cmd except: raise
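# A usage sketch for merge_multiple_bam(); the list file is a plain text file with one
# bam path per line, and all paths below are placeholders.
def _example_merge_bams():
  merge_cmd = \
    merge_multiple_bam(
      samtools_exe='/usr/bin/samtools',
      input_bam_list='/path/to/bam_list.txt',
      output_bam_path='/path/to/merged.bam',
      threads=4,
      force=True)
  print(merge_cmd)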
def run_sort_bam(samtools_exe, input_bam_path, output_bam_path, sort_by_name=False,
                 use_ephemeral_space=0, threads=1, force=False, dry_run=False,
                 cram_out=False, index_output=True):
  '''
  A function for sorting an input bam file and generating a sorted output bam

  :param samtools_exe: samtools executable path
  :param input_bam_path: A bam filepath
  :param output_bam_path: A bam output filepath
  :param sort_by_name: Sort bam file by read_name, default False (for coordinate sorting)
  :param threads: Number of threads to use for sorting, default 1
  :param force: Output bam file will be overwritten if force is True, default False
  :param cram_out: Output cram file, default False
  :param index_output: Index output bam, default True
  :param use_ephemeral_space: A toggle for temp dir settings, default 0
  :param dry_run: A toggle for returning the samtools command without actually running it, default False
  :returns: None (the samtools command list is returned when dry_run is True)
  '''
  try:
    check_file_path(samtools_exe)
    _check_bam_file(bam_file=input_bam_path)
    sort_cmd = [
      quote(samtools_exe),
      'sort',
      '-@{0}'.format(quote(str(threads)))]
    if sort_by_name:
      sort_cmd.append('-n')  # sorting by read name
    if cram_out:
      sort_cmd.append('--output-fmt CRAM')
    else:
      sort_cmd.append('--output-fmt BAM')
    temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)
    temp_bam = \
      os.path.join(
        temp_dir, os.path.basename(output_bam_path))
    sort_cmd.extend(['-o', quote(temp_bam)])
    sort_cmd.append(quote(input_bam_path))
    if dry_run:
      return sort_cmd
    subprocess.check_call(
      ' '.join(sort_cmd), shell=True)  # run samtools sort before copying output
    copy_local_file(
      source_path=temp_bam,
      destinationa_path=output_bam_path,
      force=force)  # copy output bam
    remove_dir(temp_dir)  # remove temp dir
    if cram_out:
      _check_cram_file(output_bam_path)
    else:
      _check_bam_file(output_bam_path)
    if index_output:
      index_bam_or_cram(
        samtools_exe=samtools_exe,
        input_path=output_bam_path,
        threads=threads)
  except:
    raise
def run_bam_stats(samtools_exe, bam_file, output_dir, threads=1, force=False,
                  output_prefix=None, dry_run=False):
  '''
  A method for generating samtools stats output

  :param samtools_exe: samtools executable path
  :param bam_file: A bam filepath with / without index. Index file will be created if it's missing
  :param output_dir: Bam stats output directory path
  :param output_prefix: Output file prefix, default None
  :param threads: Number of threads to use, default 1
  :param force: Output stats file will be overwritten if force is True, default False
  :param dry_run: A toggle for returning the samtools command without actually running it, default False
  :returns: Output file path, a list containing the samtools command and a list containing the SN metrics of the report
  '''
  try:
    check_file_path(samtools_exe)
    _check_bam_file(bam_file=bam_file)
    if not dry_run:
      _check_bam_index(
        samtools_exe=samtools_exe,
        bam_file=bam_file)
    if output_prefix is None:
      output_prefix = os.path.basename(bam_file)
    output_path = \
      '{0}.{1}.{2}'.format(output_prefix, 'stats', 'txt')
    output_path = \
      os.path.join(
        output_dir, output_path)
    if not os.path.exists(output_dir):
      raise IOError('Output path {0} not found'.format(output_dir))
    if os.path.exists(output_path) and not force:
      raise ValueError(
        'Output file {0} already present, use force to overwrite'.format(output_path))
    stats_cmd = [
      quote(samtools_exe),
      'stats',
      '-@{0}'.format(quote(str(threads))),
      quote(bam_file)]
    if dry_run:
      return stats_cmd
    with open(output_path, 'w') as fp:
      with subprocess.Popen(stats_cmd, stdout=subprocess.PIPE) as proc:
        fp.write(proc.stdout.read().decode('utf-8'))  # write bam stats output
    stats_data_list = \
      _parse_samtools_stats_output(stats_file=output_path)  # parse stats output file
    return output_path, stats_cmd, stats_data_list
  except:
    raise
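# A usage sketch for run_bam_stats(); samtools, bam and output dir paths are
# placeholders. The function returns the stats file path, the samtools command and
# the parsed SN metrics list.
def _example_run_bam_stats():
  output_path, stats_cmd, stats_data = \
    run_bam_stats(
      samtools_exe='/usr/bin/samtools',
      bam_file='/path/to/sample.bam',
      output_dir='/path/to/qc_dir',
      threads=4,
      output_prefix='sample',
      force=True)
  print(output_path, stats_data)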