def run(self):
  '''
  A method for running the cellranger count metrics extraction

  :param project_igf_id: A project igf id
  :param experiment_igf_id: An experiment igf id
  :param sample_igf_id: A sample igf id
  :param igf_session_class: A database session class
  :param analysis_output_list: A list of cellranger analysis tar output paths
  :param collection_type: Cellranger results collection type
  :param metrics_filename: Name of the metrics file, default metrics_summary.csv
  :returns: None
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    analysis_output_list = self.param_required('analysis_output_list')
    collection_type = self.param('collection_type')
    metrics_filename = self.param('metrics_filename')
    attribute_prefix = self.param('attribute_prefix')
    for infile in analysis_output_list:
      check_file_path(infile)                                   # check input file path
    cellranger_tar = analysis_output_list[0]                    # use the first tar from the list
    cellranger_metrics = \
      extract_cellranger_count_metrics_summary(
        cellranger_tar=cellranger_tar,
        target_filename=metrics_filename,
        collection_name=experiment_igf_id,
        collection_type=collection_type,
        attribute_prefix=attribute_prefix)                      # extract cellranger metrics stats as dictionary
    ca = CollectionAdaptor(**{'session_class': igf_session_class})
    ca.start_session()
    try:
      ca.create_or_update_collection_attributes(
        data=cellranger_metrics,
        autosave=False)                                         # load cellranger metrics to collection attribute table
      ca.commit_session()
      ca.close_session()
    except Exception:
      ca.rollback_session()
      ca.close_session()
      raise
    self.param('dataflow_params', {'cellranger_attribute': 'done'})
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__,
        e,
        project_igf_id,
        sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
    raise
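# --- Illustrative sketch (not part of the pipeline) -------------------------
# The commit-or-rollback pattern used in run() above, factored into a reusable
# helper for clarity. It assumes only the adaptor methods already used in this
# module (start_session, create_or_update_collection_attributes, commit_session,
# rollback_session, close_session); the helper name itself is hypothetical.
def load_collection_attributes_safely(adaptor, attribute_data):
  '''
  Load attribute records inside one session, committing on success and
  rolling back on any failure so the table is never left half-written.

  :param adaptor: An adaptor exposing the session methods listed above
  :param attribute_data: Records for the collection attribute table
  '''
  adaptor.start_session()
  try:
    adaptor.create_or_update_collection_attributes(
      data=attribute_data,
      autosave=False)                                           # defer the write
    adaptor.commit_session()                                    # single commit point
  except Exception:
    adaptor.rollback_session()                                  # undo partial writes
    raise
  finally:
    adaptor.close_session()                                     # release the session on both paths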
def run(self):
  '''
  A method for running picard commands

  :param project_igf_id: A project igf id
  :param sample_igf_id: A sample igf id
  :param experiment_igf_id: An experiment igf id
  :param igf_session_class: A database session class
  :param reference_type: Reference genome collection type, default GENOME_FASTA
  :param reference_refFlat: Reference refFlat annotation collection type, default GENE_REFFLAT
  :param ribosomal_interval_type: Collection type for ribosomal interval list, default RIBOSOMAL_INTERVAL
  :param species_name: Species name
  :param java_exe: Java path
  :param java_param: Java run parameters
  :param picard_jar: Picard jar path
  :param picard_command: Picard command
  :param base_work_dir: Base work directory
  :param copy_input: A toggle for copying input file to temp, 1 for True, default 0 for False
  :param use_ephemeral_space: A toggle for temp dir settings, default 0
  :param patterned_flowcell_list: A list of patterned flowcells, default ['HISEQ4000','NEXTSEQ']
  '''
  try:
    temp_output_dir = False
    project_igf_id = self.param_required('project_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    java_exe = self.param_required('java_exe')
    java_param = self.param_required('java_param')
    picard_jar = self.param_required('picard_jar')
    input_files = self.param_required('input_files')
    picard_command = self.param_required('picard_command')
    igf_session_class = self.param_required('igf_session_class')
    species_name = self.param('species_name')
    reference_type = self.param('reference_type')
    reference_refFlat = self.param('reference_refFlat')
    ribosomal_interval_type = self.param('ribosomal_interval_type')
    base_work_dir = self.param_required('base_work_dir')
    analysis_files = self.param_required('analysis_files')
    picard_option = self.param('picard_option')
    patterned_flowcell_list = self.param('patterned_flowcell_list')
    platform_name = self.param_required('platform_name')
    output_prefix = self.param('output_prefix')
    load_metrics_to_cram = self.param('load_metrics_to_cram')
    cram_collection_type = self.param('cram_collection_type')
    seed_date_stamp = self.param_required('date_stamp')
    use_ephemeral_space = self.param('use_ephemeral_space')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if output_prefix is not None:
      output_prefix = \
        '{0}_{1}'.format(output_prefix, seed_date_stamp)        # add seed datestamp to output prefix
    work_dir_prefix = \
      os.path.join(
        base_work_dir,
        project_igf_id,
        sample_igf_id,
        experiment_igf_id)
    work_dir = \
      self.get_job_work_dir(work_dir=work_dir_prefix)           # get a run work dir
    temp_output_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)     # get temp work dir
    ref_genome = \
      Reference_genome_utils(
        genome_tag=species_name,
        dbsession_class=igf_session_class,
        genome_fasta_type=reference_type,
        gene_reflat_type=reference_refFlat,
        ribosomal_interval_type=ribosomal_interval_type)        # set up ref genome utils
    genome_fasta = ref_genome.get_genome_fasta()                # get genome fasta
    ref_flat_file = ref_genome.get_gene_reflat()                # get refFlat file
    ribosomal_interval_file = ref_genome.get_ribosomal_interval() # get ribosomal interval file
    patterned_flowcell = False
    if platform_name in patterned_flowcell_list:                # check for patterned flowcell
      patterned_flowcell = True
    if load_metrics_to_cram and \
       not cram_collection_type:
      raise ValueError(
        'Cram file collection type is required for loading picard metrics to db')
    picard = \
      Picard_tools(
        java_exe=java_exe,
        java_param=java_param,
        picard_jar=picard_jar,
        input_files=input_files,
        output_dir=temp_output_dir,
        ref_fasta=genome_fasta,
        patterned_flowcell=patterned_flowcell,
        ref_flat_file=ref_flat_file,
        picard_option=picard_option,
        output_prefix=output_prefix,
        use_ephemeral_space=use_ephemeral_space,
        ribisomal_interval=ribosomal_interval_file)             # set up picard tool
    temp_output_files, picard_command_line, picard_metrics = \
      picard.run_picard_command(command_name=picard_command)    # run picard command
    output_file_list = list()
    for source_path in temp_output_files:
      dest_path = \
        os.path.join(
          work_dir,
          os.path.basename(source_path))                        # get destination filepath
      move_file(
        source_path=source_path,
        destinationa_path=dest_path,
        force=True)                                             # move files to work dir
      output_file_list.append(dest_path)
    remove_dir(temp_output_dir)
    analysis_files.extend(output_file_list)
    bam_files = list()
    for file in output_file_list:
      if file.endswith('.bam'):
        bam_files.append(file)
    if load_metrics_to_cram and \
       len(picard_metrics) > 0:
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      attribute_data = \
        ca.prepare_data_for_collection_attribute(
          collection_name=experiment_igf_id,
          collection_type=cram_collection_type,
          data_list=picard_metrics)                             # format data for collection attribute table
      ca.start_session()
      try:
        ca.create_or_update_collection_attributes(
          data=attribute_data,
          autosave=False)                                       # load data to collection attribute table
        ca.commit_session()
        ca.close_session()
      except Exception:
        ca.rollback_session()
        ca.close_session()
        raise
    self.param(
      'dataflow_params',
      {'analysis_files': analysis_files,
       'bam_files': bam_files,
       'seed_date_stamp': seed_date_stamp})                     # pass on picard output list
    message = \
      'finished picard {0} for {1} {2}'.format(
        picard_command,
        project_igf_id,
        sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')        # send log to slack
    message = \
      'Picard {0} command: {1}'.format(
        picard_command,
        picard_command_line)
    #self.comment_asana_task(task_name=project_igf_id, comment=message) # send commandline to Asana
  except Exception as e:
    if temp_output_dir and \
       os.path.exists(temp_output_dir):
      remove_dir(temp_output_dir)
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__,
        e,
        project_igf_id,
        sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
    raise
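# --- Illustrative sketch (not part of the pipeline) -------------------------
# The patterned_flowcell flag computed in run() typically drives Picard's
# optical duplicate detection: MarkDuplicates defaults to an
# OPTICAL_DUPLICATE_PIXEL_DISTANCE of 100, while Picard's documentation
# recommends 2500 for patterned flowcells such as HiSeq 4000. How Picard_tools
# consumes the flag internally is an assumption here; this helper only shows
# the mapping the flag is usually meant to drive.
def optical_duplicate_pixel_distance(patterned_flowcell):
  '''Return the recommended MarkDuplicates pixel distance for a flowcell type.'''
  return 2500 if patterned_flowcell else 100                    # Picard documented values

# A possible (hypothetical) use with the picard_option dict seen above:
#   picard_option.setdefault(
#     'OPTICAL_DUPLICATE_PIXEL_DISTANCE',
#     optical_duplicate_pixel_distance(patterned_flowcell))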
def run(self):
  '''
  A method for running samtools commands

  :param project_igf_id: A project igf id
  :param sample_igf_id: A sample igf id
  :param experiment_igf_id: An experiment igf id
  :param igf_session_class: A database session class
  :param reference_type: Reference genome collection type, default GENOME_FASTA
  :param threads: Number of threads to use for Bam to Cram conversion, default 4
  :param base_work_dir: Base work directory
  :param samtools_command: Samtools command
  :param samFlagInclude: Sam flags to include in filtered bam, default None
  :param samFlagExclude: Sam flags to exclude from the filtered bam, default None
  :param mapq_threshold: Skip alignments with MAPQ smaller than this value, default None
  :param use_encode_filter: For samtools filter, use Encode epigenome filter, i.e. samFlagExclude 1804 (PE) / 1796 (SE), default False
  :param encodePeExcludeFlag: For samtools filter, Encode exclude flag for PE reads, default 1804
  :param encodeSeExcludeFlag: For samtools filter, Encode exclude flag for SE reads, default 1796
  :param use_ephemeral_space: A toggle for temp dir settings, default 0
  :param copy_input: A toggle for copying input file to temp, 1 for True, default 0 for False
  '''
  try:
    temp_output_dir = False
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    input_files = self.param_required('input_files')
    samtools_exe = self.param_required('samtools_exe')
    reference_type = self.param('reference_type')
    threads = self.param('threads')
    base_work_dir = self.param_required('base_work_dir')
    samtools_command = self.param_required('samtools_command')
    analysis_files = self.param_required('analysis_files')
    output_prefix = self.param_required('output_prefix')
    load_metrics_to_cram = self.param('load_metrics_to_cram')
    cram_collection_type = self.param('cram_collection_type')
    collection_table = self.param('collection_table')
    base_result_dir = self.param('base_result_dir')
    analysis_name = self.param('analysis_name')
    force_overwrite = self.param('force_overwrite')
    samFlagInclude = self.param('samFlagInclude')
    samFlagExclude = self.param('samFlagExclude')
    mapq_threshold = self.param('mapq_threshold')
    library_layout = self.param_required('library_layout')
    use_encode_filter = self.param('use_encode_filter')
    species_name = self.param_required('species_name')
    seed_date_stamp = self.param_required('date_stamp')
    use_ephemeral_space = self.param('use_ephemeral_space')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if output_prefix is not None:
      output_prefix = \
        '{0}_{1}'.format(output_prefix, seed_date_stamp)        # add datestamp to the output file prefix
    if use_encode_filter:
      samFlagInclude = None
      if library_layout == 'PAIRED':
        samFlagExclude = 1804
      else:
        samFlagExclude = 1796
    if not isinstance(input_files, list) or \
       len(input_files) == 0:
      raise ValueError('No input file found')
    if len(input_files) > 1:
      raise ValueError('More than one input file found: {0}'.format(input_files))
    output_bam_cram_list = list()
    input_file = input_files[0]
    temp_output_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)     # get temp work dir
    work_dir_prefix = \
      os.path.join(
        base_work_dir,
        project_igf_id,
        sample_igf_id,
        experiment_igf_id)
    work_dir = \
      self.get_job_work_dir(work_dir=work_dir_prefix)           # get a run work dir
    samtools_cmdline = ''
    temp_output = None
    if samtools_command == 'idxstats':
      temp_output, samtools_cmdline = \
        run_bam_idxstat(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          output_dir=temp_output_dir,
          output_prefix=output_prefix,
          force=True)                                           # run samtools idxstats
    elif samtools_command == 'flagstat':
      temp_output, samtools_cmdline = \
        run_bam_flagstat(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          output_dir=temp_output_dir,
          output_prefix=output_prefix,
          threads=threads,
          force=True)                                           # run samtools flagstat
    elif samtools_command == 'stats':
      temp_output, samtools_cmdline, stats_metrics = \
        run_bam_stats(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          output_dir=temp_output_dir,
          output_prefix=output_prefix,
          threads=threads,
          force=True)                                           # run samtools stats
      if load_metrics_to_cram and \
         len(stats_metrics) > 0:
        ca = CollectionAdaptor(**{'session_class': igf_session_class})
        attribute_data = \
          ca.prepare_data_for_collection_attribute(
            collection_name=experiment_igf_id,
            collection_type=cram_collection_type,
            data_list=stats_metrics)
        ca.start_session()
        try:
          ca.create_or_update_collection_attributes(
            data=attribute_data,
            autosave=False)
          ca.commit_session()
          ca.close_session()
        except Exception as e:
          ca.rollback_session()
          ca.close_session()
          raise ValueError('Failed to load data to db: {0}'.format(e))
    elif samtools_command == 'merge':
      if output_prefix is None:
        raise ValueError('Missing output filename prefix for merged bam')
      sorted_by_name = self.param('sorted_by_name')
      temp_output = \
        os.path.join(
          work_dir,
          '{0}_merged.bam'.format(output_prefix))
      samtools_cmdline = \
        merge_multiple_bam(
          samtools_exe=samtools_exe,
          input_bam_list=input_files,                           # merge expects a list of bam paths
          output_bam_path=temp_output,
          sorted_by_name=sorted_by_name,
          threads=threads,
          use_ephemeral_space=use_ephemeral_space,
          force=True)
    elif samtools_command == 'view_bamToCram':
      if base_result_dir is None:
        raise ValueError('base_result_dir is required for CRAM file loading')
      if analysis_name is None:
        raise ValueError('analysis_name is required for CRAM file loading')
      ref_genome = \
        Reference_genome_utils(
          genome_tag=species_name,
          dbsession_class=igf_session_class,
          genome_fasta_type=reference_type)
      genome_fasta = ref_genome.get_genome_fasta()              # get genome fasta
      cram_file = \
        os.path.basename(input_file).replace('.bam', '.cram')   # get base cram file name
      cram_file = \
        os.path.join(temp_output_dir, cram_file)                # get cram file path in temp dir
      samtools_cmdline = \
        convert_bam_to_cram(
          samtools_exe=samtools_exe,
          bam_file=input_file,
          reference_file=genome_fasta,
          cram_path=cram_file,
          use_ephemeral_space=use_ephemeral_space,
          threads=threads,
          force=True,
          dry_run=False)
      au = \
        Analysis_collection_utils(
          dbsession_class=igf_session_class,
          analysis_name=analysis_name,
          tag_name=species_name,
          collection_name=experiment_igf_id,
          collection_type=cram_collection_type,
          collection_table=collection_table,
          base_path=base_result_dir)
      temp_output_bam_cram_list = \
        au.load_file_to_disk_and_db(
          input_file_list=[cram_file],
          file_suffix='cram',
          withdraw_exisitng_collection=force_overwrite)         # load file to db and disk
      for cram in temp_output_bam_cram_list:
        index_bam_or_cram(
          samtools_exe=samtools_exe,
          input_path=cram,
          threads=threads,
          dry_run=False)
        index_path = '{0}.crai'.format(cram)
        output_bam_cram_list.append(cram)
        output_bam_cram_list.append(index_path)
      if len(output_bam_cram_list) == 0:
        raise ValueError('No output cram file found')
    elif samtools_command == 'view_filterBam':
      temp_output_bam = \
        os.path.join(
          temp_output_dir,
          os.path.basename(input_file).replace('.bam', '.filtered.bam'))
      samtools_cmdline = \
        filter_bam_file(
          samtools_exe=samtools_exe,
          input_bam=input_file,
          output_bam=temp_output_bam,
          samFlagInclude=samFlagInclude,
          samFlagExclude=samFlagExclude,
          threads=threads,
          mapq_threshold=mapq_threshold,
          index_output=False,
          dry_run=False)
      dest_path = \
        os.path.join(
          work_dir,
          os.path.basename(temp_output_bam))
      move_file(
        source_path=temp_output_bam,
        destinationa_path=dest_path,
        force=True)
      index_bam_or_cram(
        samtools_exe=samtools_exe,
        input_path=dest_path,
        threads=threads,
        dry_run=False)
      index_path = '{0}.bai'.format(dest_path)
      output_bam_cram_list.append(dest_path)
      output_bam_cram_list.append(index_path)
    else:
      raise ValueError('Samtools command {0} not supported'.format(samtools_command))
    if temp_output is not None:
      dest_path = \
        os.path.join(
          work_dir,
          os.path.basename(temp_output))
      if dest_path != temp_output:
        move_file(
          source_path=temp_output,
          destinationa_path=dest_path,
          force=True)
      analysis_files.append(dest_path)
    self.param(
      'dataflow_params',
      {'analysis_files': analysis_files,
       'output_bam_cram_list': output_bam_cram_list})           # pass on samtools output list
    message = \
      'finished samtools {0} for {1} {2}'.format(
        samtools_command,
        project_igf_id,
        sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')        # send log to slack
    message = \
      'finished samtools {0} for {1} {2}: {3}'.format(
        samtools_command,
        project_igf_id,
        sample_igf_id,
        samtools_cmdline)
    #self.comment_asana_task(task_name=project_igf_id, comment=message) # send comment to Asana
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__,
        e,
        project_igf_id,
        sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
    raise
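# --- Illustrative sketch (not part of the pipeline) -------------------------
# The ENCODE exclude flags hard-coded in run() above (1804 for paired-end,
# 1796 for single-end) are combinations of standard SAM flag bits. This helper
# rebuilds them from named constants to make the intent explicit; the constant
# and function names are local to this sketch.
SAM_UNMAPPED = 0x4        # read unmapped
SAM_MATE_UNMAPPED = 0x8   # mate unmapped (paired-end only)
SAM_SECONDARY = 0x100     # secondary alignment
SAM_QC_FAIL = 0x200       # platform/vendor QC failure
SAM_DUPLICATE = 0x400     # PCR or optical duplicate

def encode_exclude_flag(library_layout):
  '''Return the ENCODE samFlagExclude value for 'PAIRED' or single-end data.'''
  flag = SAM_UNMAPPED | SAM_SECONDARY | SAM_QC_FAIL | SAM_DUPLICATE
  if library_layout == 'PAIRED':
    flag |= SAM_MATE_UNMAPPED                                   # also drop mate-unmapped reads
  return flag                                                   # 1804 (PE) or 1796 (SE)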
def run(self):
  '''
  A runnable method for running PPQT analysis
  '''
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    experiment_igf_id = self.param_required('experiment_igf_id')
    igf_session_class = self.param_required('igf_session_class')
    input_files = self.param_required('input_files')
    rscript_path = self.param_required('rscript_path')
    ppqt_exe = self.param_required('ppqt_exe')
    base_work_dir = self.param_required('base_work_dir')
    base_result_dir = self.param_required('base_result_dir')
    library_strategy = self.param_required('library_strategy')
    analysis_files = self.param_required('analysis_files')
    output_prefix = self.param_required('output_prefix')
    species_name = self.param_required('species_name')
    analysis_name = self.param('analysis_name')
    seed_date_stamp = self.param_required('date_stamp')
    load_metrics_to_cram = self.param('load_metrics_to_cram')
    ppqt_collection_type = self.param('ppqt_collection_type')
    cram_collection_type = self.param('cram_collection_type')
    collection_table = self.param('collection_table')
    force_overwrite = self.param('force_overwrite')
    use_ephemeral_space = self.param('use_ephemeral_space')
    threads = self.param('threads')
    seed_date_stamp = get_datestamp_label(seed_date_stamp)
    if output_prefix is not None:
      output_prefix = \
        '{0}_{1}'.format(output_prefix, seed_date_stamp)        # add datestamp to the output file prefix
    if not isinstance(input_files, list) or \
       len(input_files) == 0:
      raise ValueError('No input file found')
    if len(input_files) > 1:
      raise ValueError('More than one input file found: {0}'.format(input_files))
    if analysis_name is None:
      analysis_name = library_strategy                          # use library_strategy as default analysis_name
    input_file = input_files[0]
    work_dir_prefix = \
      os.path.join(
        base_work_dir,
        project_igf_id,
        sample_igf_id,
        experiment_igf_id)
    work_dir = \
      self.get_job_work_dir(work_dir=work_dir_prefix)           # get a run work dir
    ppqt_obj = \
      Ppqt_tools(
        rscript_path=rscript_path,
        ppqt_exe=ppqt_exe,
        use_ephemeral_space=use_ephemeral_space,
        threads=threads)
    ppqt_cmd, spp_output, pdf_output, spp_data = \
      ppqt_obj.run_ppqt(
        input_bam=input_file,
        output_dir=work_dir,
        output_spp_name='{0}_{1}.spp.out'.format(output_prefix, 'PPQT'),
        output_pdf_name='{0}_{1}.spp.pdf'.format(output_prefix, 'PPQT'))
    analysis_files.append(spp_output)
    au = \
      Analysis_collection_utils(
        dbsession_class=igf_session_class,
        analysis_name=analysis_name,
        tag_name=species_name,
        collection_name=experiment_igf_id,
        collection_type=ppqt_collection_type,
        collection_table=collection_table,
        base_path=base_result_dir)
    output_ppqt_list = \
      au.load_file_to_disk_and_db(
        input_file_list=[pdf_output],
        file_suffix='pdf',
        withdraw_exisitng_collection=force_overwrite)           # load file to db and disk
    if load_metrics_to_cram and \
       len(spp_data) > 0:
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      attribute_data = \
        ca.prepare_data_for_collection_attribute(
          collection_name=experiment_igf_id,
          collection_type=cram_collection_type,
          data_list=spp_data)
      ca.start_session()
      try:
        ca.create_or_update_collection_attributes(
          data=attribute_data,
          autosave=False)
        ca.commit_session()
        ca.close_session()
      except Exception as e:
        ca.rollback_session()
        ca.close_session()
        raise ValueError('Failed to load data to db: {0}'.format(e))
    self.param(
      'dataflow_params',
      {'analysis_files': analysis_files,
       'output_ppqt_list': output_ppqt_list})                   # pass on PPQT output list
    message = \
      'finished PPQT for {0} {1}'.format(
        project_igf_id,
        sample_igf_id)
    self.post_message_to_slack(message, reaction='pass')        # send log to slack
    message = \
      'finished PPQT for {0} {1}: {2}'.format(
        project_igf_id,
        sample_igf_id,
        ppqt_cmd)
    self.comment_asana_task(task_name=project_igf_id, comment=message) # send comment to Asana
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__,
        e,
        project_igf_id,
        sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
    raise
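# --- Illustrative sketch (not part of the pipeline) -------------------------
# phantompeakqualtools writes one tab-delimited record per bam to the .spp.out
# file produced above. The column layout assumed below (NSC at column 9, RSC at
# column 10, quality tag at column 11, 1-based) follows the tool's documented
# output; how Ppqt_tools parses it into spp_data internally is an assumption,
# and the helper and key names here are hypothetical.
def parse_spp_out_line(line):
  '''Parse one record of a PPQT .spp.out file into a metrics dict.'''
  fields = line.rstrip('\n').split('\t')
  return {
    'PPQT_NSC': float(fields[8]),                               # normalized strand cross-correlation
    'PPQT_RSC': float(fields[9]),                               # relative strand cross-correlation
    'PPQT_QUALITY_TAG': int(fields[10])}                        # quality code, -2 (very low) to 2 (very high)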
def run(self):
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    file_list = self.param_required('file_list')
    remote_user = self.param_required('remote_user')
    remote_host = self.param_required('remote_host')
    remote_project_path = self.param_required('remote_project_path')
    dir_labels = self.param_required('dir_labels')
    igf_session_class = self.param_required('igf_session_class')
    force_overwrite = self.param('force_overwrite')
    collect_remote_file = self.param('collect_remote_file')
    collection_name = self.param('collection_name')
    collection_type = self.param('collection_type')
    collection_table = self.param('collection_table')
    file_location = self.param('file_location')
    use_ephemeral_space = self.param('use_ephemeral_space')
    destination_output_path = \
      os.path.join(
        remote_project_path,
        project_igf_id)                                         # get base destination path
    if isinstance(dir_labels, list) and \
       len(dir_labels) > 0:
      destination_output_path = \
        os.path.join(destination_output_path, *dir_labels)
    if collect_remote_file:
      if collection_name is None or \
         collection_type is None:
        raise ValueError('Name and type are required for db collection')
    output_file_list = list()
    temp_work_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)     # get temp dir
    for file in file_list:
      if not os.path.exists(file):
        raise IOError('file {0} not found'.format(file))
      if os.path.isfile(file):
        copy2(
          file,
          os.path.join(
            temp_work_dir,
            os.path.basename(file)))                            # copy file to a temp dir
        dest_file_path = \
          os.path.join(
            destination_output_path,
            os.path.basename(file))                             # get destination file path
        os.chmod(
          os.path.join(
            temp_work_dir,
            os.path.basename(file)),
          mode=0o764)                                           # set file permission
      elif os.path.isdir(file):
        copytree(
          file,
          os.path.join(
            temp_work_dir,
            os.path.basename(file)))                            # copy dir to a temp dir
        dest_file_path = destination_output_path
        for root, dirs, files in os.walk(temp_work_dir):
          for dir_name in dirs:
            os.chmod(
              os.path.join(root, dir_name),
              mode=0o775)
          for file_name in files:
            os.chmod(
              os.path.join(root, file_name),
              mode=0o764)                                       # change file and dir permissions for remote files
      else:
        raise ValueError('Unknown source file type: {0}'.format(file))
      copy_remote_file(
        source_path=os.path.join(temp_work_dir, os.path.basename(file)),
        destinationa_path=dest_file_path,
        destination_address='{0}@{1}'.format(remote_user, remote_host),
        force_update=force_overwrite)                           # copy file to remote
      if os.path.isdir(file):
        dest_file_path = \
          os.path.join(
            dest_file_path,
            os.path.basename(file))                             # fix for dir input
      output_file_list.append(dest_file_path)
    remove_dir(dir_path=temp_work_dir)                          # remove temp dir
    self.param(
      'dataflow_params',
      {'status': 'done',
       'output_list': output_file_list})                        # add dataflow params
    if collect_remote_file:
      data = list()
      remove_data_list = [
        {'name': collection_name,
         'type': collection_type}]
      for file in output_file_list:
        data.append(
          {'name': collection_name,
           'type': collection_type,
           'table': collection_table,
           'file_path': file,
           'location': file_location})
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      ca.start_session()
      try:
        ca.remove_collection_group_info(
          data=remove_data_list,
          autosave=False)                                       # remove existing data before loading new collection
        ca.load_file_and_create_collection(
          data=data,
          autosave=False,
          calculate_file_size_and_md5=False)                    # load remote files to db
        ca.commit_session()                                     # commit changes
        ca.close_session()
      except Exception:
        ca.rollback_session()                                   # rollback changes
        ca.close_session()
        raise
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__,
        e,
        project_igf_id,
        sample_igf_id)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
    raise
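# --- Illustrative sketch (not part of the pipeline) -------------------------
# The permission scheme applied in run() above, collected into one reusable
# helper: directories get 0o775 (group-writable, world-traversable) and files
# get 0o764 (group read/write, world read), mirroring the os.walk loop. The
# helper name and defaults are local to this sketch.
import os

def apply_remote_permissions(top_dir, dir_mode=0o775, file_mode=0o764):
  '''Recursively set directory and file modes under top_dir.'''
  for root, dirs, files in os.walk(top_dir):
    for dir_name in dirs:
      os.chmod(os.path.join(root, dir_name), mode=dir_mode)     # dirs stay traversable
    for file_name in files:
      os.chmod(os.path.join(root, file_name), mode=file_mode)   # files stay world-readable

# A possible use at the staging step above:
#   apply_remote_permissions(temp_work_dir)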