def test_load_file_to_disk_and_db4(self):
  au = \
    Analysis_collection_utils(
      dbsession_class=self.session_class,
      analysis_name='AnalysisA',
      tag_name='TagA',
      collection_name='ProjectA',
      collection_type='AnalysisA_Files',
      collection_table='project',
      rename_file=False)
  input_file_list = [
    os.path.join(self.temp_work_dir, file_name)
      for file_name in self.input_list]
  output_list = \
    au.load_file_to_disk_and_db(
      input_file_list=input_file_list,
      withdraw_exisitng_collection=False)                       # loading all files to same collection, without rename
  base = BaseAdaptor(**{'session_class': self.session_class})
  base.start_session()
  ca = CollectionAdaptor(**{'session': base.session})
  ca_files = \
    ca.get_collection_files(
      collection_name='ProjectA',
      collection_type='AnalysisA_Files',
      output_mode='dataframe')
  file_list = list(ca_files['file_path'].to_dict().values())
  self.assertTrue(input_file_list[0] in file_list)
  self.assertTrue(input_file_list[0] in output_list)
  base.close_session()
def test_load_file_to_disk_and_db7(self):
  au = \
    Analysis_collection_utils(
      dbsession_class=self.session_class,
      analysis_name='AnalysisA',
      tag_name='TagA',
      collection_name='RunA',
      collection_type='AnalysisA_Files',
      collection_table='run',
      base_path=self.temp_base_dir)
  input_file_list = [
    os.path.join(self.temp_work_dir, file_name)
      for file_name in self.input_list]
  output_list = \
    au.load_file_to_disk_and_db(
      input_file_list=input_file_list,
      withdraw_exisitng_collection=False)                       # loading all files to same collection
  base = BaseAdaptor(**{'session_class': self.session_class})
  base.start_session()
  ca = CollectionAdaptor(**{'session': base.session})
  ca_files = \
    ca.get_collection_files(
      collection_name='RunA',
      collection_type='AnalysisA_Files',
      output_mode='dataframe')
  file_list = list(ca_files['file_path'].to_dict().values())
  datestamp = get_datestamp_label()
  test_file = \
    os.path.join(
      self.temp_base_dir,
      'ProjectA',
      'SampleA',
      'ExperimentA',
      'RunA',
      'AnalysisA',
      '{0}_{1}_{2}_{3}.{4}'.format(
        'RunA', 'AnalysisA', 'TagA', datestamp, 'cram'))
  test_file = preprocess_path_name(input_path=test_file)
  self.assertTrue(test_file in file_list)
  self.assertTrue(test_file in output_list)
  base.close_session()
def test_load_file_to_disk_and_db1(self):
  au = \
    Analysis_collection_utils(
      dbsession_class=self.session_class,
      analysis_name='AnalysisA',
      tag_name='TagA',
      collection_name='ProjectA',
      collection_type='AnalysisA_Files',
      collection_table='project')
  input_file_list = [
    os.path.join(self.temp_work_dir, file_name)
      for file_name in self.input_list]
  output_list = \
    au.load_file_to_disk_and_db(
      input_file_list=input_file_list,
      withdraw_exisitng_collection=False)                       # loading all files to same collection
  base = BaseAdaptor(**{'session_class': self.session_class})
  base.start_session()
  ca = CollectionAdaptor(**{'session': base.session})
  ca_files = \
    ca.get_collection_files(
      collection_name='ProjectA',
      collection_type='AnalysisA_Files',
      output_mode='dataframe')
  self.assertEqual(len(ca_files.index), len(self.input_list))   # compare with input list
  self.assertEqual(len(output_list), len(self.input_list))      # compare with output list
  base.close_session()
def test_load_file_to_disk_and_db2(self):
  au = \
    Analysis_collection_utils(
      dbsession_class=self.session_class,
      analysis_name='AnalysisA',
      tag_name='TagA',
      collection_name='ProjectA',
      collection_type='AnalysisA_Files',
      collection_table='project')
  input_file_list = [
    os.path.join(self.temp_work_dir, file_name)
      for file_name in self.input_list]
  output_list = \
    au.load_file_to_disk_and_db(
      input_file_list=input_file_list,
      withdraw_exisitng_collection=True)                        # withdrawing existing collection group before loading new
  base = BaseAdaptor(**{'session_class': self.session_class})
  base.start_session()
  ca = CollectionAdaptor(**{'session': base.session})
  ca_files = \
    ca.get_collection_files(
      collection_name='ProjectA',
      collection_type='AnalysisA_Files',
      output_mode='dataframe')
  self.assertEqual(len(ca_files.index), 1)                      # check for unique collection group
  fa = FileAdaptor(**{'session': base.session})
  query = fa.session.query(File)
  fa_records = fa.fetch_records(query=query, output_mode='dataframe')
  self.assertEqual(len(fa_records['file_path'].to_dict()), 3)   # check if all files are present although only one collection group exists
  self.assertEqual(len(output_list), 3)
  base.close_session()
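# --- Illustration (not part of the original test suite) ---
# A minimal, hedged sketch of the load_file_to_disk_and_db() semantics
# exercised by the tests above. The igf_data import path, the session_class
# argument and the input path are assumptions/placeholders, not guaranteed
# by this module.
def _example_load_to_collection(session_class, input_file):
  from igf_data.utils.analysis_collection_utils import Analysis_collection_utils  # assumed module path
  au = \
    Analysis_collection_utils(
      dbsession_class=session_class,                            # SQLAlchemy session class (placeholder)
      analysis_name='AnalysisA',
      tag_name='TagA',
      collection_name='ProjectA',
      collection_type='AnalysisA_Files',
      collection_table='project')
  # withdraw_exisitng_collection=False appends files to an existing
  # collection group; True withdraws the old group before loading new files
  return au.load_file_to_disk_and_db(
    input_file_list=[input_file],
    withdraw_exisitng_collection=False)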
def run(self):
  '''
  A runnable method for loading analysis files to disk and database
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    tag_name = self.param('tag_name')
    input_files = self.param_required('input_files')
    base_result_dir = self.param_required('base_results_dir')
    analysis_name = self.param('analysis_name')
    collection_name = self.param_required('collection_name')
    collection_type = self.param('collection_type')
    collection_table = self.param('collection_table')
    withdraw_exisitng_collection = \
      self.param('withdraw_exisitng_collection')
    remove_existing_file = self.param('remove_existing_file')
    file_suffix = self.param('file_suffix')
    for file in input_files:
      if not os.path.exists(file):
        raise IOError('File {0} not found'.format(file))        # check analysis files before loading
    au = \
      Analysis_collection_utils(
        dbsession_class=igf_session_class,
        analysis_name=analysis_name,
        base_path=base_result_dir,
        tag_name=tag_name,
        collection_name=collection_name,
        collection_type=collection_type,
        collection_table=collection_table)                      # initiate analysis file loading
    output_file_list = \
      au.load_file_to_disk_and_db(
        input_file_list=input_files,
        remove_file=remove_existing_file,
        file_suffix=file_suffix,
        withdraw_exisitng_collection=withdraw_exisitng_collection) # load file to db and disk
    self.param(
      'dataflow_params',
      {'analysis_output_list': output_file_list})               # pass on analysis files to data flow
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
    raise
def test_load_file_to_disk_and_db8(self):
  au = \
    Analysis_collection_utils(
      dbsession_class=self.session_class,
      analysis_name='AnalysisA',
      tag_name='TagA',
      collection_name='RunA',
      collection_type='AnalysisA_Files',
      collection_table='run')
  input_file = os.path.join(self.temp_work_dir, 'a.cram')
  input_file = preprocess_path_name(input_path=input_file)
  new_file_name = au.get_new_file_name(input_file=input_file)
  datestamp = get_datestamp_label()
  test_file_name = \
    '{0}_{1}_{2}_{3}.{4}'.format(
      'RunA', 'AnalysisA', 'TagA', datestamp, 'cram')
  self.assertEqual(new_file_name, test_file_name)
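# --- Illustration (not part of the original test suite) ---
# A hedged, standalone sketch of the renamed-file convention checked above:
# '<collection_name>_<analysis_name>_<tag_name>_<datestamp>.<ext>'. This
# hypothetical helper only mirrors the pattern; the datestamp format is an
# assumption, and the real implementation lives in get_new_file_name().
import os
from datetime import date

def _example_new_file_name(input_file, collection_name, analysis_name, tag_name):
  ext = os.path.basename(input_file).split('.', 1)[-1]          # keep the original file extension
  datestamp = date.today().strftime('%Y%m%d')                   # assumed datestamp format
  return \
    '{0}_{1}_{2}_{3}.{4}'.format(
      collection_name, analysis_name, tag_name, datestamp, ext)

# e.g. _example_new_file_name('/tmp/a.cram', 'RunA', 'AnalysisA', 'TagA')
# could return 'RunA_AnalysisA_TagA_20240101.cram'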
def test_create_or_update_analysis_collection_rename(self):
  au = \
    Analysis_collection_utils(
      dbsession_class=self.session_class,
      analysis_name='AnalysisA',
      tag_name='TagA',
      collection_name='ProjectA',
      collection_type='AnalysisA_Files',
      collection_table='project')
  base = BaseAdaptor(**{'session_class': self.session_class})
  base.start_session()
  au.create_or_update_analysis_collection(
    file_path=os.path.join(self.temp_work_dir, 'a.cram'),
    dbsession=base.session,
    autosave_db=True)
  base.close_session()
  base.start_session()
  ca = CollectionAdaptor(**{'session': base.session})
  ca_files = \
    ca.get_collection_files(
      collection_name='ProjectA',
      collection_type='AnalysisA_Files',
      output_mode='dataframe')
  self.assertEqual(len(ca_files.index), 1)
  au.create_or_update_analysis_collection(
    file_path=os.path.join(self.temp_work_dir, 'a.cram'),
    dbsession=base.session,
    autosave_db=True,
    force=True)                                                 # overwriting file collection
  base.close_session()
  base.start_session()
  ca = CollectionAdaptor(**{'session': base.session})
  ca_files = \
    ca.get_collection_files(
      collection_name='ProjectA',
      collection_type='AnalysisA_Files',
      output_mode='dataframe')
  self.assertEqual(len(ca_files.index), 1)
  with self.assertRaises(sqlalchemy.exc.IntegrityError):        # file collection without force
    au.create_or_update_analysis_collection(
      file_path=os.path.join(self.temp_work_dir, 'a.cram'),
      dbsession=base.session,
      autosave_db=True,
      force=False)
  base.close_session()
def run(self):
  '''
  A runnable method for running deeptools analysis
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    input_files = self.param_required('input_files')
    threads = self.param('threads')
    base_work_dir = self.param_required('base_work_dir')
    base_results_dir = self.param_required('base_results_dir')
    deeptools_command = self.param_required('deeptools_command')
    analysis_files = self.param_required('analysis_files')
    output_prefix = self.param_required('output_prefix')
    load_signal_bigwig = self.param('load_signal_bigwig')
    signal_collection_type = self.param('signal_collection_type')
    blacklist_reference_type = self.param('blacklist_reference_type')
    species_name = self.param('species_name')
    deeptools_params = self.param('deeptools_params')
    deeptools_bamCov_params = self.param('deeptools_bamCov_params')
    collection_table = self.param('collection_table')
    remove_existing_file = self.param('remove_existing_file')
    withdraw_exisitng_collection = \
      self.param('withdraw_exisitng_collection')
    analysis_name = self.param('analysis_name')
    use_ephemeral_space = self.param('use_ephemeral_space')
    seed_date_stamp = self.param_required('date_stamp')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if output_prefix is not None:
      output_prefix = \
        '{0}_{1}'.format(
          output_prefix,
          seed_date_stamp)                                      # adding datestamp to the output file prefix
    if not isinstance(input_files, list) or \
       len(input_files) == 0:
      raise ValueError('No input file found')
    signal_files = list()
    work_dir_prefix = \
      os.path.join(
        base_work_dir,
        project_igf_id,
        sample_igf_id,
        experiment_igf_id)
    work_dir = \
      self.get_job_work_dir(work_dir=work_dir_prefix)           # get a run work dir
    ref_genome = \
      Reference_genome_utils(
        genome_tag=species_name,
        dbsession_class=igf_session_class,
        blacklist_interval_type=blacklist_reference_type)       # setup ref genome utils
    blacklist_bed = ref_genome.get_blacklist_region_bed()       # get blacklist bed
    if deeptools_command == 'plotCoverage':
      output_raw_counts = \
        '{0}_{1}.raw.txt'.format(output_prefix, 'plotCoverage')
      output_raw_counts = \
        os.path.join(work_dir, output_raw_counts)
      plotcov_stdout = \
        '{0}_{1}.stdout.txt'.format(output_prefix, 'plotCoverage')
      plotcov_stdout = \
        os.path.join(work_dir, plotcov_stdout)
      output_plot = \
        '{0}_{1}.pdf'.format(output_prefix, 'plotCoverage')
      output_plot = \
        os.path.join(work_dir, output_plot)
      deeptools_args = \
        run_plotCoverage(
          bam_files=input_files,
          output_raw_counts=output_raw_counts,
          plotcov_stdout=plotcov_stdout,
          output_plot=output_plot,
          blacklist_file=blacklist_bed,
          thread=threads,
          use_ephemeral_space=use_ephemeral_space,
          params_list=deeptools_params)
      analysis_files.extend(
        [output_raw_counts, plotcov_stdout, output_plot])
    elif deeptools_command == 'bamCoverage':
      output_file = \
        '{0}_{1}.bw'.format(output_prefix, 'bamCoverage')
      output_file = \
        os.path.join(work_dir, output_file)
      if deeptools_params is None:
        deeptools_params = deeptools_bamCov_params
      deeptools_args = \
        run_bamCoverage(
          bam_files=input_files,
          output_file=output_file,
          blacklist_file=blacklist_bed,
          thread=threads,
          use_ephemeral_space=use_ephemeral_space,
          params_list=deeptools_params)
      if load_signal_bigwig:
        au = \
          Analysis_collection_utils(
            dbsession_class=igf_session_class,
            analysis_name=analysis_name,
            base_path=base_results_dir,
            tag_name=species_name,
            collection_name=experiment_igf_id,
            collection_type=signal_collection_type,
            collection_table=collection_table)                  # initiate analysis file loading
        output_file_list = \
          au.load_file_to_disk_and_db(
            input_file_list=[output_file],
            remove_file=remove_existing_file,
            file_suffix='bw',
            withdraw_exisitng_collection=withdraw_exisitng_collection) # load file to db and disk
        analysis_files.extend(output_file_list)
        signal_files.extend(output_file_list)
      else:
        analysis_files.append(output_file)
    elif deeptools_command == 'plotFingerprint':
      output_raw_counts = \
        '{0}_{1}.raw.txt'.format(output_prefix, 'plotFingerprint')
      output_raw_counts = \
        os.path.join(work_dir, output_raw_counts)
      output_matrics = \
        '{0}_{1}.metrics.txt'.format(output_prefix, 'plotFingerprint')
      output_matrics = \
        os.path.join(work_dir, output_matrics)
      output_plot = \
        '{0}_{1}.pdf'.format(output_prefix, 'plotFingerprint')
      output_plot = \
        os.path.join(work_dir, output_plot)
      deeptools_args = \
        run_plotFingerprint(
          bam_files=input_files,
          output_raw_counts=output_raw_counts,
          output_matrics=output_matrics,
          output_plot=output_plot,
          blacklist_file=blacklist_bed,
          thread=threads,
          use_ephemeral_space=use_ephemeral_space,
          params_list=deeptools_params)
      analysis_files.extend(
        [output_raw_counts, output_matrics, output_plot])
    else:
      raise ValueError('Deeptools command {0} is not implemented yet'.\
                       format(deeptools_command))
    self.param(
      'dataflow_params',
      {'analysis_files': analysis_files,
       'signal_files': signal_files,
       'seed_date_stamp': seed_date_stamp})                     # pass on deeptools output list
    message = \
      'finished deeptools {0} for {1} {2}'.format(
        deeptools_command,
        project_igf_id,
        sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')        # send log to slack
    message = \
      'Deeptools {0} command: {1}'.format(
        deeptools_command,
        deeptools_args)
    #self.comment_asana_task(task_name=project_igf_id, comment=message) # send commandline to Asana
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
    raise
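# --- Illustration (not part of the runnable above) ---
# A hedged sketch of the kind of deepTools call run_bamCoverage() is expected
# to assemble. The flags below are standard deepTools bamCoverage options,
# but the wrapper's exact command line may differ, and this helper is
# hypothetical.
import subprocess

def _example_bam_coverage(bam_file, output_bw, blacklist_bed, threads=4):
  cmd = [
    'bamCoverage',
    '--bam', bam_file,                                          # input bam
    '--outFileName', output_bw,                                 # bigWig signal track
    '--blackListFileName', blacklist_bed,                       # skip blacklisted regions
    '--numberOfProcessors', str(threads)]
  subprocess.check_call(cmd)                                    # requires deepTools on PATH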
def run(self):
  '''
  A runnable method for generating a scanpy report from cellranger count output

  :param project_igf_id: A project igf id
  :param sample_igf_id: A sample igf id
  :param experiment_igf_id: A experiment igf id
  :param igf_session_class: A database session class
  :param species_name: Species name for ensembl name lookup
  :param base_result_dir: Base results directory
  :param report_template_file: A template file for writing scanpy report
  :param analysis_name: Analysis name, default scanpy
  :param species_name_lookup: A dictionary for ensembl species name lookup
  :param cellranger_collection_type: Cellranger analysis collection type, default CELLRANGER_RESULTS
  :param scanpy_collection_type: Scanpy report collection type, default SCANPY_RESULTS
  :param collection_table: Collection table name for loading scanpy report, default experiment
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    species_name = self.param_required('species_name')
    report_template_file = self.param_required('report_template_file')
    analysis_name = self.param_required('analysis_name')
    base_result_dir = self.param_required('base_result_dir')
    base_work_dir = self.param_required('base_work_dir')
    species_name_lookup = self.param('species_name_lookup')
    cellranger_collection_type = \
      self.param('cellranger_collection_type')
    scanpy_collection_type = self.param('scanpy_collection_type')
    collection_table = self.param('collection_table')
    cellbrowser_dir_prefix = self.param('cellbrowser_dir_prefix')
    use_ephemeral_space = self.param('use_ephemeral_space')
    cellranger_tarfile = ''
    output_report = ''
    cellbrowser_h5ad = ''
    work_dir_prefix = \
      os.path.join(
        base_work_dir,
        project_igf_id,
        sample_igf_id,
        experiment_igf_id)
    work_dir = \
      self.get_job_work_dir(work_dir=work_dir_prefix)           # get a run work dir
    if species_name in species_name_lookup.keys():              # check for human or mouse
      ensembl_species_name = \
        species_name_lookup[species_name]                       # get ensembl species name
      # fetch cellranger tar path from db
      if cellranger_tarfile == '':
        ca = CollectionAdaptor(**{'session_class': igf_session_class})
        ca.start_session()                                      # connect to database
        cellranger_tarfiles = \
          ca.get_collection_files(
            collection_name=experiment_igf_id,
            collection_type=cellranger_collection_type,
            output_mode='dataframe')                            # fetch collection files
        ca.close_session()
        if len(cellranger_tarfiles.index) == 0:
          raise ValueError('No cellranger analysis output found for exp {0}'.\
                           format(experiment_igf_id))
        cellranger_tarfile = \
          cellranger_tarfiles['file_path'].values[0]            # select first file as analysis file
      # extract filtered metrics files from tar
      output_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)   # get a temp dir
      datestamp = get_datestamp_label()
      cellbrowser_dir = \
        os.path.join(
          work_dir,
          '{0}_{1}'.format(
            cellbrowser_dir_prefix,
            datestamp))
      cellbrowser_h5ad = \
        os.path.join(cellbrowser_dir, 'scanpy.h5ad')
      output_report = \
        os.path.join(output_dir, 'report.html')                 # get temp report path
      matrix_file, gene_file, barcode_file = \
        self._extract_cellranger_filtered_metrics(
          tar_file=cellranger_tarfile,
          output_dir=output_dir)                                # get cellranger output files
      sp = \
        Scanpy_tool(
          project_name=project_igf_id,
          sample_name=sample_igf_id,
          matrix_file=matrix_file,
          features_tsv=gene_file,
          barcode_tsv=barcode_file,
          html_template_file=report_template_file,
          species_name=ensembl_species_name,
          output_file=output_report,
          use_ephemeral_space=use_ephemeral_space,
          cellbrowser_h5ad=cellbrowser_h5ad)
      sp.generate_report()                                      # generate scanpy report
      # load files to db and disk
      au = \
        Analysis_collection_utils(
          dbsession_class=igf_session_class,
          analysis_name=analysis_name,
          tag_name=species_name,
          collection_name=experiment_igf_id,
          collection_type=scanpy_collection_type,
          collection_table=collection_table,
          base_path=base_result_dir)                            # initiate loading of report file
      output_file_list = \
        au.load_file_to_disk_and_db(
          input_file_list=[output_report],
          withdraw_exisitng_collection=True)                    # load file to db and disk
      output_report = output_file_list[0]
    self.param(
      'dataflow_params',
      {'output_report': output_report,
       'scanpy_h5ad_path': cellbrowser_h5ad})                   # pass on output report filepath
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
    raise
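# --- Illustration (not the library implementation) ---
# A hedged sketch of what _extract_cellranger_filtered_metrics() is expected
# to do: pull the filtered matrix triplet out of the cellranger results
# archive. The member names follow the standard cellranger v3+ layout
# ('filtered_feature_bc_matrix'); the real helper may behave differently.
import os
import tarfile

def _example_extract_filtered_matrix(tar_file, output_dir):
  targets = ('matrix.mtx.gz', 'features.tsv.gz', 'barcodes.tsv.gz')
  extracted = dict()
  with tarfile.open(tar_file, 'r:gz') as tar:
    for member in tar.getmembers():
      name = os.path.basename(member.name)
      if 'filtered_feature_bc_matrix' in member.name and \
         name in targets:
        tar.extract(member, path=output_dir)                    # keep the archive layout
        extracted[name] = os.path.join(output_dir, member.name)
  return (
    extracted.get('matrix.mtx.gz'),
    extracted.get('features.tsv.gz'),
    extracted.get('barcodes.tsv.gz'))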
def run(self):
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    analysis_files = self.param_required('analysis_files')
    multiqc_exe = self.param('multiqc_exe')
    multiqc_options = self.param('multiqc_options')
    multiqc_dir_label = self.param('multiqc_dir_label')
    force_overwrite = self.param('force_overwrite')
    base_results_dir = self.param_required('base_results_dir')
    tag = self.param_required('tag_name')
    analysis_name = self.param_required('analysis_name')
    collection_name = self.param_required('collection_name')
    collection_type = self.param_required('collection_type')
    collection_table = self.param_required('collection_table')
    igf_session_class = self.param_required('igf_session_class')
    multiqc_template_file = self.param_required('multiqc_template_file')
    platform_name = self.param('platform_name')
    tool_order_list = self.param('tool_order_list')
    use_ephemeral_space = self.param('use_ephemeral_space')
    if not isinstance(analysis_files, list) or \
       len(analysis_files) == 0:
      raise ValueError('Failed to run MultiQC for zero analysis list') # check analysis files
    temp_work_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)     # get temp work dir
    multiqc_input_file = \
      os.path.join(temp_work_dir, 'multiqc.txt')                # get temp multiqc list
    with open(multiqc_input_file, 'w') as fp:
      for file in analysis_files:
        if not os.path.exists(file):
          raise IOError('File {0} not found for multiQC run'.\
                        format(file))                           # check filepath
        fp.write('{}\n'.format(file))                           # write file to temp file
    date_stamp = datetime.now().strftime('%d-%b-%Y %H:%M:%S')
    check_file_path(multiqc_template_file)
    multiqc_conf_file = \
      os.path.join(
        temp_work_dir,
        os.path.basename(multiqc_template_file))
    template_env = \
      Environment(
        loader=FileSystemLoader(
          searchpath=os.path.dirname(multiqc_template_file)),
        autoescape=select_autoescape(['html', 'xml']))
    multiqc_conf = \
      template_env.get_template(
        os.path.basename(multiqc_template_file))
    multiqc_conf.\
      stream(
        project_igf_id=project_igf_id,
        sample_igf_id=sample_igf_id,
        platform_name=platform_name,
        tag_name=tag,
        date_stamp=date_stamp,
        tool_order_list=tool_order_list).\
      dump(multiqc_conf_file)
    multiqc_report_title = \
      'Project:{0}'.format(project_igf_id)                      # base multiqc label
    if sample_igf_id is not None:
      multiqc_report_title = \
        '{0},Sample:{1}'.format(
          multiqc_report_title,
          sample_igf_id)                                        # add sample, if present
    multiqc_report_title = \
      '{0};tag:{1};date:{2}'.format(
        multiqc_report_title,
        tag,
        get_datestamp_label())                                  # add tag and date stamp
    multiqc_param = \
      self.format_tool_options(multiqc_options)                 # format multiqc params
    multiqc_cmd = [
      multiqc_exe,
      '--file-list', quote(multiqc_input_file),
      '--outdir', quote(temp_work_dir),
      '--title', quote(multiqc_report_title),
      '-c', quote(multiqc_conf_file)]                           # multiqc base parameters
    multiqc_param = \
      [quote(param) for param in multiqc_param]                 # wrap params in quotes
    multiqc_cmd.extend(multiqc_param)                           # add additional parameters
    subprocess.check_call(' '.join(multiqc_cmd), shell=True)    # run multiqc
    multiqc_html = None
    output_list = list()
    for root, _, files in os.walk(top=temp_work_dir):
      for file in files:
        if fnmatch.fnmatch(file, '*.html'):
          multiqc_html = os.path.join(root, file)               # get multiqc html path
          au = \
            Analysis_collection_utils(
              dbsession_class=igf_session_class,
              analysis_name=analysis_name,
              tag_name=tag,
              collection_name=collection_name,
              collection_type=collection_type,
              collection_table=collection_table,
              base_path=base_results_dir)
          output_list = \
            au.load_file_to_disk_and_db(
              input_file_list=[multiqc_html],
              withdraw_exisitng_collection=force_overwrite,
              force=True,
              remove_file=True)                                 # load file to db and disk
    self.param(
      'dataflow_params',
      {'multiqc_html': output_list[0]})                         # add output files to dataflow
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
    raise
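# --- Illustration (not part of the runnable above) ---
# A hedged sketch of the MultiQC call assembled above, in argument-list form.
# Passing a list to subprocess avoids the manual quote()/shell=True handling;
# the flags shown are standard MultiQC options, and this helper is
# hypothetical.
import subprocess

def _example_run_multiqc(file_list, out_dir, title, conf_file):
  subprocess.check_call([
    'multiqc',
    '--file-list', file_list,                                   # text file with one analysis path per line
    '--outdir', out_dir,
    '--title', title,
    '-c', conf_file])                                           # config rendered from the template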
def run(self):
  '''
  A runnable method for running PPQT analysis
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    input_files = self.param_required('input_files')
    rscript_path = self.param_required('rscript_path')
    ppqt_exe = self.param_required('ppqt_exe')
    base_work_dir = self.param_required('base_work_dir')
    base_result_dir = self.param_required('base_result_dir')
    library_strategy = self.param_required('library_strategy')
    analysis_files = self.param_required('analysis_files')
    output_prefix = self.param_required('output_prefix')
    species_name = self.param_required('species_name')
    analysis_name = self.param('analysis_name')
    seed_date_stamp = self.param_required('date_stamp')
    load_metrics_to_cram = self.param('load_metrics_to_cram')
    ppqt_collection_type = self.param('ppqt_collection_type')
    cram_collection_type = self.param('cram_collection_type')
    collection_table = self.param('collection_table')
    force_overwrite = self.param('force_overwrite')
    use_ephemeral_space = self.param('use_ephemeral_space')
    threads = self.param('threads')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if output_prefix is not None:
      output_prefix = \
        '{0}_{1}'.format(
          output_prefix,
          seed_date_stamp)                                      # adding datestamp to the output file prefix
    if not isinstance(input_files, list) or \
       len(input_files) == 0:
      raise ValueError('No input file found')
    if len(input_files) > 1:
      raise ValueError('More than one input file found: {0}'.\
                       format(input_files))
    if analysis_name is None:
      analysis_name = library_strategy                          # use library_strategy as default analysis_name
    input_file = input_files[0]
    work_dir_prefix = \
      os.path.join(
        base_work_dir,
        project_igf_id,
        sample_igf_id,
        experiment_igf_id)
    work_dir = \
      self.get_job_work_dir(work_dir=work_dir_prefix)           # get a run work dir
    ppqt_obj = \
      Ppqt_tools(
        rscript_path=rscript_path,
        ppqt_exe=ppqt_exe,
        use_ephemeral_space=use_ephemeral_space,
        threads=threads)
    ppqt_cmd, spp_output, pdf_output, spp_data = \
      ppqt_obj.run_ppqt(
        input_bam=input_file,
        output_dir=work_dir,
        output_spp_name='{0}_{1}.spp.out'.format(output_prefix, 'PPQT'),
        output_pdf_name='{0}_{1}.spp.pdf'.format(output_prefix, 'PPQT'))
    analysis_files.append(spp_output)
    au = \
      Analysis_collection_utils(
        dbsession_class=igf_session_class,
        analysis_name=analysis_name,
        tag_name=species_name,
        collection_name=experiment_igf_id,
        collection_type=ppqt_collection_type,
        collection_table=collection_table,
        base_path=base_result_dir)
    output_ppqt_list = \
      au.load_file_to_disk_and_db(
        input_file_list=[pdf_output],
        file_suffix='pdf',
        withdraw_exisitng_collection=force_overwrite)           # load file to db and disk
    if load_metrics_to_cram and \
       len(spp_data) > 0:
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      attribute_data = \
        ca.prepare_data_for_collection_attribute(
          collection_name=experiment_igf_id,
          collection_type=cram_collection_type,
          data_list=spp_data)
      ca.start_session()
      try:
        ca.create_or_update_collection_attributes(
          data=attribute_data,
          autosave=False)
        ca.commit_session()
        ca.close_session()
      except Exception as e:
        ca.rollback_session()
        ca.close_session()
        raise ValueError('Failed to load data to db: {0}'.\
                         format(e))
    self.param(
      'dataflow_params',
      {'analysis_files': analysis_files,
       'output_ppqt_list': output_ppqt_list})                   # pass on ppqt output list
    message = \
      'finished PPQT for {0} {1}'.\
        format(project_igf_id, sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')        # send log to slack
    message = \
      'finished PPQT for {0} {1}: {2}'.\
        format(project_igf_id, sample_igf_id, ppqt_cmd)
    self.comment_asana_task(task_name=project_igf_id, comment=message) # send comment to Asana
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
    raise
def run(self):
  '''
  An ehive runnable method for cellranger count output processing for a given sample

  :param project_igf_id: A project igf id
  :param experiment_igf_id: An experiment igf id
  :param sample_igf_id: A sample igf id
  :param igf_session_class: A database session class
  :param cellranger_output: Cellranger output path
  :param base_work_dir: Base work directory path
  :param fastq_collection_type: Collection type name for input fastq files, default demultiplexed_fastq
  :param species_name: Reference genome collection name
  :param reference_type: Reference genome collection type, default TRANSCRIPTOME_TENX
  :param use_ephemeral_space: A toggle for temp dir settings, default 0
  :returns: Adding cellranger_output to the dataflow_params
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    cellranger_output = self.param_required('cellranger_output')
    base_result_dir = self.param_required('base_results_dir')
    species_name = self.param('species_name')
    manifest_filename = self.param('manifest_filename')
    analysis_name = self.param('analysis_name')
    collection_type = self.param('collection_type')
    collection_table = self.param('collection_table')
    use_ephemeral_space = self.param('use_ephemeral_space')
    # prepare manifest file for the results dir
    manifest_file = \
      os.path.join(
        cellranger_output,
        manifest_filename)                                      # get name of the manifest file
    create_file_manifest_for_dir(
      results_dirpath=cellranger_output,
      output_file=manifest_file,
      md5_label='md5',
      exclude_list=['*.bam', '*.bai', '*.cram'])                # create manifest for output dir
    # create archive for the results dir
    temp_archive_name = \
      os.path.join(
        get_temp_dir(use_ephemeral_space=use_ephemeral_space),
        '{0}.tar.gz'.format(experiment_igf_id))                 # get the name of temp archive file
    prepare_file_archive(
      results_dirpath=cellranger_output,
      output_file=temp_archive_name,
      exclude_list=['*.bam', '*.bai', '*.cram'])                # archive cellranger output
    # load archive file to db collection and results dir
    au = \
      Analysis_collection_utils(
        dbsession_class=igf_session_class,
        analysis_name=analysis_name,
        tag_name=species_name,
        collection_name=experiment_igf_id,
        collection_type=collection_type,
        collection_table=collection_table,
        base_path=base_result_dir)                              # initiate loading of archive file
    output_file_list = \
      au.load_file_to_disk_and_db(
        input_file_list=[temp_archive_name],
        withdraw_exisitng_collection=True)                      # load file to db and disk
    # find bam path for the data flow
    bam_list = list()                                           # define empty bamfile list
    for file in os.listdir(cellranger_output):
      if fnmatch(file, '*.bam'):
        bam_list.append(
          os.path.join(
            cellranger_output,
            file))                                              # add all bams to bam_list
    if len(bam_list) > 1:
      raise ValueError(
              'More than one bam found for cellranger count run:{0}'.\
              format(cellranger_output))                        # check number of bams, presence of one bam is already validated by check method
    bam_file = bam_list[0]
    au = \
      Analysis_collection_utils(
        dbsession_class=igf_session_class,
        analysis_name=analysis_name,
        tag_name=species_name,
        collection_name=experiment_igf_id,
        collection_type=collection_type,
        collection_table=collection_table)                      # initiate bam file rename
    new_bam_name = \
      au.get_new_file_name(input_file=bam_file)
    if os.path.basename(bam_file) != new_bam_name:
      new_bam_name = \
        os.path.join(
          os.path.dirname(bam_file),
          new_bam_name)                                         # get new bam path
      move_file(
        source_path=bam_file,
        destinationa_path=new_bam_name,
        force=True)                                             # move bam file
      bam_file = new_bam_name                                   # update bam file path
    self.param(
      'dataflow_params',
      {'cellranger_output': cellranger_output,
       'bam_file': bam_file,
       'analysis_output_list': output_file_list})               # pass on cellranger output path
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
    raise
def run(self):
  try:
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    input_files = self.param_required('input_files')
    igf_session_class = self.param_required('igf_session_class')
    template_report_file = self.param_required('template_report_file')
    rscript_path = self.param_required('rscript_path')
    batch_effect_rscript_path = \
      self.param_required('batch_effect_rscript_path')
    base_result_dir = self.param_required('base_result_dir')
    strand_info = self.param('strand_info')
    read_threshold = self.param('read_threshold')
    collection_type = self.param('collection_type')
    collection_table = self.param('collection_table')
    analysis_name = self.param('analysis_name')
    tag_name = self.param('tag_name')
    use_ephemeral_space = self.param('use_ephemeral_space')
    output_file_list = None
    if len(input_files) == 0:
      raise ValueError('No input files found for batch effect checking')
    elif len(input_files) < 3:
      output_file_list = ''                                     # can't run batch effect checking on less than 3 lanes
    else:
      for file in input_files:
        check_file_path(file)                                   # check input filepath
      file_data = list()
      ra = RunAdaptor(**{'session_class': igf_session_class})
      ra.start_session()
      for file in input_files:
        run_igf_id = \
          os.path.basename(file).\
            replace('ReadsPerGene.out.tab', '')                 # using simple string match to fetch run igf ids
        flowcell_id, lane_id = \
          ra.fetch_flowcell_and_lane_for_run(run_igf_id=run_igf_id) # fetch flowcell id and lane info
        file_data.append({
          'file': file,
          'flowcell': flowcell_id,
          'lane': lane_id})
      ra.close_session()
      temp_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)
      temp_json_file = \
        os.path.join(temp_dir, 'star_gene_counts.json')         # temp json file path
      temp_output_file = \
        os.path.join(
          temp_dir,
          os.path.basename(template_report_file))               # temp report file path
      with open(temp_json_file, 'w') as jp:
        json.dump(file_data, jp, indent=2)                      # dumping json output
      br = \
        Batch_effect_report(
          input_json_file=temp_json_file,
          template_file=template_report_file,
          rscript_path=rscript_path,
          batch_effect_rscript_path=batch_effect_rscript_path,
          strand_info=strand_info,
          read_threshold=read_threshold)                        # set up batch effect run
      br.check_lane_effect_and_log_report(
        project_name=project_igf_id,
        sample_name=sample_igf_id,
        output_file=temp_output_file)                           # generate report file
      au = \
        Analysis_collection_utils(
          dbsession_class=igf_session_class,
          analysis_name=analysis_name,
          base_path=base_result_dir,
          tag_name=tag_name,
          collection_name=experiment_igf_id,
          collection_type=collection_type,
          collection_table=collection_table)                    # prepare to load file
      output_file_list = \
        au.load_file_to_disk_and_db(
          input_file_list=[temp_output_file])                   # load file to db and disk
    self.param(
      'dataflow_params',
      {'batch_effect_reports': output_file_list})               # populating data flow only if report is present
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
    raise
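# --- Illustration (not part of the runnable above) ---
# A hedged sketch of the JSON document handed to Batch_effect_report above:
# a list of per-run records built from the db lookups. The values are
# illustrative placeholders only.
import json

_example_file_data = [
  {'file': '/path/run1ReadsPerGene.out.tab', 'flowcell': 'HXXXXXXXX', 'lane': 1},
  {'file': '/path/run2ReadsPerGene.out.tab', 'flowcell': 'HXXXXXXXX', 'lane': 2},
  {'file': '/path/run3ReadsPerGene.out.tab', 'flowcell': 'HYYYYYYYY', 'lane': 1}]
# json.dump(_example_file_data, jp, indent=2) would reproduce the temp json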
def run(self):
  '''
  A method for running samtools commands

  :param project_igf_id: A project igf id
  :param sample_igf_id: A sample igf id
  :param experiment_igf_id: An experiment igf id
  :param igf_session_class: A database session class
  :param reference_type: Reference genome collection type, default GENOME_FASTA
  :param threads: Number of threads to use for Bam to Cram conversion, default 4
  :param base_work_dir: Base work directory
  :param samtools_command: Samtools command
  :param samFlagInclude: Sam flags to include in filtered bam, default None
  :param samFlagExclude: Sam flags to exclude from the filtered bam, default None
  :param mapq_threshold: Skip alignments with MAPQ smaller than this value, default None
  :param use_encode_filter: For samtools filter, use Encode epigenome filter, i.e. samFlagExclude 1804 (PE) / 1796 (SE), default False
  :param encodePeExcludeFlag: For samtools filter, Encode exclude flag for PE reads, default 1804
  :param encodeSeExcludeFlag: For samtools filter, Encode exclude flag for SE reads, default 1796
  :param use_ephemeral_space: A toggle for temp dir settings, default 0
  :param copy_input: A toggle for copying input file to temp dir, 1 for True and 0 for False, default 0
  '''
  try:
    temp_output_dir = False
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    input_files = self.param_required('input_files')
    samtools_exe = self.param_required('samtools_exe')
    reference_type = self.param('reference_type')
    threads = self.param('threads')
    base_work_dir = self.param_required('base_work_dir')
    samtools_command = self.param_required('samtools_command')
    analysis_files = self.param_required('analysis_files')
    output_prefix = self.param_required('output_prefix')
    load_metrics_to_cram = self.param('load_metrics_to_cram')
    cram_collection_type = self.param('cram_collection_type')
    collection_table = self.param('collection_table')
    base_result_dir = self.param('base_result_dir')
    analysis_name = self.param('analysis_name')
    force_overwrite = self.param('force_overwrite')
    samFlagInclude = self.param('samFlagInclude')
    samFlagExclude = self.param('samFlagExclude')
    mapq_threshold = self.param('mapq_threshold')
    library_layout = self.param_required('library_layout')
    use_encode_filter = self.param('use_encode_filter')
    species_name = self.param_required('species_name')
    seed_date_stamp = self.param_required('date_stamp')
    use_ephemeral_space = self.param('use_ephemeral_space')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if output_prefix is not None:
      output_prefix = \
        '{0}_{1}'.format(
          output_prefix,
          seed_date_stamp)                                      # adding datestamp to the output file prefix
    if use_encode_filter:
      samFlagInclude = None
      if library_layout == 'PAIRED':
        samFlagExclude = 1804
      else:
        samFlagExclude = 1796
    if not isinstance(input_files, list) or \
       len(input_files) == 0:
      raise ValueError('No input file found')
    if len(input_files) > 1:
      raise ValueError('More than one input file found: {0}'.\
                       format(input_files))
    output_bam_cram_list = list()
    input_file = input_files[0]
    temp_output_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)     # get temp work dir
    work_dir_prefix = \
      os.path.join(
        base_work_dir,
        project_igf_id,
        sample_igf_id,
        experiment_igf_id)
    work_dir = \
      self.get_job_work_dir(work_dir=work_dir_prefix)           # get a run work dir
    samtools_cmdline = ''
    temp_output = None
    if samtools_command == 'idxstats':
      temp_output, samtools_cmdline = \
        run_bam_idxstat(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          output_dir=temp_output_dir,
          output_prefix=output_prefix,
          force=True)                                           # run samtools idxstats
    elif samtools_command == 'flagstat':
      temp_output, samtools_cmdline = \
        run_bam_flagstat(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          output_dir=temp_output_dir,
          output_prefix=output_prefix,
          threads=threads,
          force=True)                                           # run samtools flagstat
    elif samtools_command == 'stats':
      temp_output, samtools_cmdline, stats_metrics = \
        run_bam_stats(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          output_dir=temp_output_dir,
          output_prefix=output_prefix,
          threads=threads,
          force=True)                                           # run samtools stats
      if load_metrics_to_cram and \
         len(stats_metrics) > 0:
        ca = CollectionAdaptor(**{'session_class': igf_session_class})
        attribute_data = \
          ca.prepare_data_for_collection_attribute(
            collection_name=experiment_igf_id,
            collection_type=cram_collection_type,
            data_list=stats_metrics)
        ca.start_session()
        try:
          ca.create_or_update_collection_attributes(
            data=attribute_data,
            autosave=False)
          ca.commit_session()
          ca.close_session()
        except Exception as e:
          ca.rollback_session()
          ca.close_session()
          raise ValueError('Failed to load data to db: {0}'.\
                           format(e))
    elif samtools_command == 'merge':
      if output_prefix is None:
        raise ValueError('Missing output filename prefix for merged bam')
      sorted_by_name = self.param('sorted_by_name')
      temp_output = \
        os.path.join(
          work_dir,
          '{0}_merged.bam'.format(output_prefix))
      samtools_cmdline = \
        merge_multiple_bam(
          samtools_exe=samtools_exe,
          input_bam_list=[input_file],                          # merge helper expects a list of bam paths
          output_bam_path=temp_output,
          sorted_by_name=sorted_by_name,
          threads=threads,
          use_ephemeral_space=use_ephemeral_space,
          force=True)
    elif samtools_command == 'view_bamToCram':
      if base_result_dir is None:
        raise ValueError('base_result_dir is required for CRAM file loading')
      if analysis_name is None:
        raise ValueError('analysis_name is required for CRAM file loading')
      ref_genome = \
        Reference_genome_utils(
          genome_tag=species_name,
          dbsession_class=igf_session_class,
          genome_fasta_type=reference_type)
      genome_fasta = ref_genome.get_genome_fasta()              # get genome fasta
      cram_file = \
        os.path.basename(input_file).\
          replace('.bam', '.cram')                              # get base cram file name
      cram_file = os.path.join(temp_output_dir, cram_file)      # get cram file path in work dir
      samtools_cmdline = \
        convert_bam_to_cram(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          reference_file=genome_fasta,
          cram_path=cram_file,
          use_ephemeral_space=use_ephemeral_space,
          threads=threads,
          force=True,
          dry_run=False)
      au = \
        Analysis_collection_utils(
          dbsession_class=igf_session_class,
          analysis_name=analysis_name,
          tag_name=species_name,
          collection_name=experiment_igf_id,
          collection_type=cram_collection_type,
          collection_table=collection_table,
          base_path=base_result_dir)
      temp_output_bam_cram_list = \
        au.load_file_to_disk_and_db(
          input_file_list=[cram_file],
          file_suffix='cram',
          withdraw_exisitng_collection=force_overwrite)         # load file to db and disk
      for cram in temp_output_bam_cram_list:
        index_bam_or_cram(
          samtools_exe=samtools_exe,
          input_path=cram,
          threads=threads,
          dry_run=False)
        index_path = '{0}.crai'.format(cram)
        output_bam_cram_list.append(cram)
        output_bam_cram_list.append(index_path)
      if len(output_bam_cram_list) == 0:
        raise ValueError('No output cram file found')
    elif samtools_command == 'view_filterBam':
      temp_output_bam = \
        os.path.join(
          temp_output_dir,
          os.path.basename(input_file).replace('.bam', '.filtered.bam'))
      samtools_cmdline = \
        filter_bam_file(
          samtools_exe=samtools_exe,
          input_bam=input_file,
          output_bam=temp_output_bam,
          samFlagInclude=samFlagInclude,
          samFlagExclude=samFlagExclude,
          threads=threads,
          mapq_threshold=mapq_threshold,
          index_output=False,
          dry_run=False)
      dest_path = \
        os.path.join(
          work_dir,
          os.path.basename(temp_output_bam))
      move_file(
        source_path=temp_output_bam,
        destinationa_path=dest_path,
        force=True)
      index_bam_or_cram(
        samtools_exe=samtools_exe,
        input_path=dest_path,
        threads=threads,
        dry_run=False)
      index_path = '{0}.bai'.format(dest_path)
      output_bam_cram_list.append(dest_path)
      output_bam_cram_list.append(index_path)
    else:
      raise ValueError('Samtools command {0} not supported'.\
                       format(samtools_command))
    if temp_output is not None:
      dest_path = \
        os.path.join(
          work_dir,
          os.path.basename(temp_output))
      if dest_path != temp_output:
        move_file(
          source_path=temp_output,
          destinationa_path=dest_path,
          force=True)
      analysis_files.append(dest_path)
    self.param(
      'dataflow_params',
      {'analysis_files': analysis_files,
       'output_bam_cram_list': output_bam_cram_list})           # pass on samtools output list
    message = \
      'finished samtools {0} for {1} {2}'.\
        format(
          samtools_command,
          project_igf_id,
          sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')        # send log to slack
    message = \
      'finished samtools {0} for {1} {2}: {3}'.\
        format(
          samtools_command,
          project_igf_id,
          sample_igf_id,
          samtools_cmdline)
    #self.comment_asana_task(task_name=project_igf_id, comment=message) # send comment to Asana
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
    raise
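# --- Illustration (not part of the runnable above) ---
# A hedged note on the Encode filter flags used above: samFlagExclude 1804
# for paired-end data is the sum of the SAM bits
#   4 (unmapped) + 8 (mate unmapped) + 256 (secondary alignment)
#   + 512 (QC fail) + 1024 (duplicate) = 1804
# and 1796 drops the mate-unmapped bit (8) for single-end data. A standalone
# samtools call with the same effect could look like this hypothetical
# helper; -b/-F/-q/-@/-o are standard samtools view options.
import subprocess

def _example_encode_filter(samtools_exe, input_bam, output_bam,
                           flag_exclude=1804, mapq=None, threads=1):
  cmd = [
    samtools_exe, 'view', '-b',
    '-F', str(flag_exclude),                                    # drop reads matching any excluded bit
    '-@', str(threads),
    '-o', output_bam]
  if mapq is not None:
    cmd.extend(['-q', str(mapq)])                               # skip alignments below the MAPQ threshold
  cmd.append(input_bam)
  subprocess.check_call(cmd)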