Example #1
    def run(self):
        '''
    A method for running picard commands

    :param project_igf_id: A project igf id
    :param sample_igf_id: A sample igf id
    :param experiment_igf_id: An experiment igf id
    :param igf_session_class: A database session class
    :param input_files: A list of input files for the picard run
    :param analysis_files: A list of analysis file paths, extended with the picard output
    :param platform_name: Sequencing platform name, checked against patterned_flowcell_list
    :param output_prefix: Output file prefix, extended with the seed date stamp
    :param date_stamp: A seed date stamp for labeling the output
    :param load_metrics_to_cram: A toggle for loading picard metrics to the cram collection
    :param cram_collection_type: Collection type for cram files, required for metrics loading
    :param reference_type: Reference genome collection type, default GENOME_FASTA
    :param reference_refFlat: Reference refFlat annotation collection type, default GENE_REFFLAT
    :param ribosomal_interval_type: Collection type for ribosomal interval list, default RIBOSOMAL_INTERVAL
    :param species_name: Reference genome collection name
    :param java_exe: Java path
    :param java_param: Java run parameters
    :param picard_jar: Picard jar path
    :param picard_command: Picard command
    :param picard_option: Additional picard run options
    :param base_work_dir: Base work directory
    :param copy_input: A toggle for copying input file to temp, 1 for True default 0 for False
    :param use_ephemeral_space: A toggle for temp dir setting, default 0
    :param patterned_flowcell_list: A list of patterned flowcells, default ['HISEQ4000','NEXTSEQ']

    A sketch of the assembled picard command line follows this method.
    '''
        try:
            temp_output_dir = False
            project_igf_id = self.param_required('project_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            java_exe = self.param_required('java_exe')
            java_param = self.param_required('java_param')
            picard_jar = self.param_required('picard_jar')
            input_files = self.param_required('input_files')
            picard_command = self.param_required('picard_command')
            igf_session_class = self.param_required('igf_session_class')
            species_name = self.param('species_name')
            reference_type = self.param('reference_type')
            reference_refFlat = self.param('reference_refFlat')
            ribosomal_interval_type = self.param('ribosomal_interval_type')
            base_work_dir = self.param_required('base_work_dir')
            analysis_files = self.param_required('analysis_files')
            picard_option = self.param('picard_option')
            patterned_flowcell_list = self.param('patterned_flowcell_list')
            platform_name = self.param_required('platform_name')
            output_prefix = self.param('output_prefix')
            load_metrics_to_cram = self.param('load_metrics_to_cram')
            cram_collection_type = self.param('cram_collection_type')
            seed_date_stamp = self.param_required('date_stamp')
            use_ephemeral_space = self.param('use_ephemeral_space')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
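            # Illustrative assumption: a date_stamp such as '2020-01-31 10:10:12'
            # is converted by get_datestamp_label into a label like '20200131'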
            if output_prefix is not None:
                output_prefix = \
                  '{0}_{1}'.\
                    format(
                      output_prefix,
                      seed_date_stamp)                                                  # adding seed datestamp to output prefix

            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)                         # get a run work dir
            temp_output_dir = \
              get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get temp work dir
            ref_genome = \
              Reference_genome_utils(
                genome_tag=species_name,
                dbsession_class=igf_session_class,
                genome_fasta_type=reference_type,
                gene_reflat_type=reference_refFlat,
                ribosomal_interval_type=ribosomal_interval_type)                      # setup ref genome utils
            genome_fasta = ref_genome.get_genome_fasta()  # get genome fasta
            ref_flat_file = ref_genome.get_gene_reflat()  # get refFlat file
            ribosomal_interval_file = ref_genome.get_ribosomal_interval(
            )  # get ribosomal interval file
            patterned_flowcell = False
            if platform_name in patterned_flowcell_list:  # check for patterned flowcell
                patterned_flowcell = True

            if load_metrics_to_cram and \
               not cram_collection_type:
                raise ValueError(
                    'Cram file collection type is required for loading picard metrics to db'
                )

            picard=\
              Picard_tools(\
                java_exe=java_exe,
                java_param=java_param,
                picard_jar=picard_jar,
                input_files=input_files,
                output_dir=temp_output_dir,
                ref_fasta=genome_fasta,
                patterned_flowcell=patterned_flowcell,
                ref_flat_file=ref_flat_file,
                picard_option=picard_option,
                output_prefix=output_prefix,
                use_ephemeral_space=use_ephemeral_space,
                ribisomal_interval=ribosomal_interval_file)                           # setup picard tool
            temp_output_files,picard_command_line,picard_metrics = \
              picard.run_picard_command(command_name=picard_command)                  # run picard command
            output_file_list = list()
            for source_path in temp_output_files:
                dest_path=\
                  os.path.join(
                    work_dir,
                    os.path.basename(source_path))                                      # get destination filepath
                move_file(source_path=source_path,
                          destinationa_path=dest_path,
                          force=True)  # move files to work dir
                output_file_list.append(dest_path)
            remove_dir(temp_output_dir)
            analysis_files.extend(output_file_list)
            bam_files = list()
            for file in output_file_list:
                if file.endswith('.bam'):
                    bam_files.append(file)

            if load_metrics_to_cram and \
               len(picard_metrics)>0:
                ca = CollectionAdaptor(**{'session_class': igf_session_class})
                attribute_data = \
                  ca.prepare_data_for_collection_attribute(
                    collection_name=experiment_igf_id,
                    collection_type=cram_collection_type,
                    data_list=picard_metrics)                                           # format data for collection attribute table
                ca.start_session()
                try:
                    ca.create_or_update_collection_attributes(\
                      data=attribute_data,
                      autosave=False
                    )                                                                     # load data to collection attribute table
                    ca.commit_session()
                    ca.close_session()
                except:
                    ca.rollback_session()
                    ca.close_session()
                    raise

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'bam_files': bam_files,
                    'seed_date_stamp': seed_date_stamp
                })  # pass on picard output list
            message = \
              'finished picard {0} for {1} {2}'.\
                format(
                  picard_command,
                  project_igf_id,
                  sample_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            message = \
              'Picard {0} command: {1}'.\
                format(
                  picard_command,
                  picard_command_line)
            #self.comment_asana_task(task_name=project_igf_id, comment=message)        # send commandline to Asana
        except Exception as e:
            if temp_output_dir and \
               os.path.exists(temp_output_dir):
                remove_dir(temp_output_dir)

            message = \
              'project: {2}, sample: {3}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  project_igf_id,
                  sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
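
A minimal sketch of the kind of command line the Picard_tools wrapper above is assumed to assemble; the helper name and the output naming scheme are illustrative, not the wrapper's actual API:

import os

def sketch_picard_cmdline(java_exe, java_param, picard_jar, picard_command,
                          input_files, output_prefix, output_dir):
    # Hypothetical sketch: Picard's classic syntax takes KEY=VALUE arguments,
    # e.g. java -Xmx4g -jar picard.jar CollectAlignmentSummaryMetrics I=... O=...
    cmd = [java_exe, java_param, '-jar', picard_jar, picard_command]
    cmd.extend('I={0}'.format(path) for path in input_files)
    output_file = \
      os.path.join(
        output_dir,
        '{0}.{1}.txt'.format(output_prefix, picard_command))                    # illustrative output name
    cmd.append('O={0}'.format(output_file))
    return cmd
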
    def load_file_to_disk_and_db(self,
                                 input_file_list,
                                 withdraw_exisitng_collection=True,
                                 autosave_db=True,
                                 file_suffix=None,
                                 force=True,
                                 remove_file=False):
        '''
    A method for loading analysis results to disk and database. File will be moved to a new path if base_path is present.
    Directory structure of the final path is based on the collection_table information.
    
    The following will be the final directory structure if base_path is present:

    project - base_path/project_igf_id/analysis_name
    sample - base_path/project_igf_id/sample_igf_id/analysis_name
    experiment - base_path/project_igf_id/sample_igf_id/experiment_igf_id/analysis_name
    run - base_path/project_igf_id/sample_igf_id/experiment_igf_id/run_igf_id/analysis_name

    :param input_file_list: A list of input files to load, all using the same collection info
    :param withdraw_exisitng_collection: Remove existing collection group, DO NOT use this while loading a list of files
    :param autosave_db: Save changes to database, default True
    :param file_suffix: Use a specific file suffix, or None to keep the suffix of the original file,
                        e.g. input.vcf.gz to output.vcf.gz
    :param force: Toggle for overwriting an existing destination file, default True
    :param remove_file: A toggle for removing existing file from disk, default False
    :returns: A list of final filepaths

    A condensed sketch of the final-path construction follows this method.
    '''
        try:
            project_igf_id = None
            sample_igf_id = None
            experiment_igf_id = None
            run_igf_id = None
            output_path_list = list()  # define empty output list
            dbconnected = False
            if self.collection_name is None or \
               self.collection_type is None or \
               self.collection_table is None:
                raise ValueError('File collection information is incomplete'
                                 )  # check for collection information

            base = BaseAdaptor(**{'session_class': self.dbsession_class})
            base.start_session()  # connect to db
            dbconnected = True
            if self.base_path is not None:
                if self.collection_table == 'sample':
                    sa = SampleAdaptor(**{'session': base.session})
                    sample_igf_id = self.collection_name
                    sample_exists = sa.check_sample_records_igf_id(
                        sample_igf_id=sample_igf_id)
                    if not sample_exists:
                        raise ValueError('Sample {0} not found in db'.\
                                         format(sample_igf_id))

                    project_igf_id = \
                      sa.fetch_sample_project(sample_igf_id=sample_igf_id)                # fetch project id for sample
                elif self.collection_table == 'experiment':
                    ea = ExperimentAdaptor(**{'session': base.session})
                    experiment_igf_id = self.collection_name
                    experiment_exists = \
                      ea.check_experiment_records_id(
                        experiment_igf_id=experiment_igf_id)
                    if not experiment_exists:
                        raise ValueError('Experiment {0} not present in database'.\
                                         format(experiment_igf_id))

                    (project_igf_id,sample_igf_id) = \
                        ea.fetch_project_and_sample_for_experiment(
                          experiment_igf_id=experiment_igf_id)                            # fetch project and sample id for experiment
                elif self.collection_table == 'run':
                    ra = RunAdaptor(**{'session': base.session})
                    run_igf_id = self.collection_name
                    run_exists = ra.check_run_records_igf_id(
                        run_igf_id=run_igf_id)
                    if not run_exists:
                        raise ValueError('Run {0} not found in database'.\
                                         format(run_igf_id))

                    (project_igf_id,sample_igf_id,experiment_igf_id) = \
                      ra.fetch_project_sample_and_experiment_for_run(
                        run_igf_id=run_igf_id)                                            # fetch project, sample and experiment id for run
                elif self.collection_table == 'project':
                    pa = ProjectAdaptor(**{'session': base.session})
                    project_igf_id = self.collection_name
                    project_exists = \
                      pa.check_project_records_igf_id(
                        project_igf_id=project_igf_id)
                    if not project_exists:
                        raise ValueError('Project {0} not found in database'.\
                                         format(project_igf_id))

            if self.rename_file and self.analysis_name is None:
                raise ValueError('Analysis name is required for renaming file'
                                 )  # check analysis name

            for input_file in input_file_list:
                final_path = ''
                if self.base_path is None:  # do not move file if base_path is absent
                    final_path = os.path.dirname(input_file)
                else:  # move file path
                    if self.collection_table == 'project':
                        if project_igf_id is None:
                            raise ValueError('Missing project id for collection {0}'.\
                                             format(self.collection_name))

                        final_path = \
                          os.path.join(
                            self.base_path,
                            project_igf_id,
                            self.analysis_name)                                             # final path for project
                    elif self.collection_table == 'sample':
                        if project_igf_id is None or \
                           sample_igf_id is None:
                            raise ValueError('Missing project and sample id for collection {0}'.\
                                             format(self.collection_name))

                        final_path = \
                          os.path.join(
                            self.base_path,
                            project_igf_id,
                            sample_igf_id,
                            self.analysis_name)                                             # final path for sample
                    elif self.collection_table == 'experiment':
                        if project_igf_id is None or \
                           sample_igf_id is None or \
                           experiment_igf_id is None:
                            raise ValueError('Missing project, sample and experiment id for collection {0}'.\
                                             format(self.collection_name))

                        final_path = \
                          os.path.join(
                            self.base_path,
                            project_igf_id,
                            sample_igf_id,
                            experiment_igf_id,
                            self.analysis_name)                                             # final path for experiment
                    elif self.collection_table == 'run':
                        if project_igf_id is None or \
                           sample_igf_id is None or \
                           experiment_igf_id is None or \
                           run_igf_id is None:
                            raise ValueError('Missing project, sample, experiment and run id for collection {0}'.\
                                             format(self.collection_name))

                        final_path = \
                          os.path.join(\
                            self.base_path,
                            project_igf_id,
                            sample_igf_id,
                            experiment_igf_id,
                            run_igf_id,
                            self.analysis_name)                                             # final path for run

                if self.rename_file:
                    new_filename = \
                      self.get_new_file_name(
                        input_file=input_file,
                        file_suffix=file_suffix)
                    final_path = \
                      os.path.join(
                        final_path,
                        new_filename)                                                     # get new filepath
                else:
                    final_path = \
                      os.path.join(
                        final_path,
                        os.path.basename(input_file))

                if final_path != input_file:  # move file only if the path has changed
                    final_path = preprocess_path_name(
                        input_path=final_path
                    )  # remove unexpected characters from file path
                    move_file(source_path=input_file,
                              destinationa_path=final_path,
                              force=force
                              )  # move or overwrite file to destination dir

                output_path_list.append(
                    final_path)  # add final path to the output list
                self.create_or_update_analysis_collection(
                    file_path=final_path,
                    dbsession=base.session,
                    withdraw_exisitng_collection=withdraw_exisitng_collection,
                    remove_file=remove_file,
                    autosave_db=autosave_db)  # load new file collection in db
                if autosave_db:
                    base.commit_session()  # save changes to db for each file

            base.commit_session()  # save changes to db
            base.close_session()  # close db connection
            return output_path_list
        except:
            if dbconnected:
                base.rollback_session()
                base.close_session()
            raise
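
The four collection_table branches above differ only in how deep the destination directory goes. A condensed sketch of that mapping, assuming every required id has already been validated (the helper name is illustrative):

import os

def sketch_final_path(base_path, collection_table, analysis_name,
                      project_igf_id, sample_igf_id=None,
                      experiment_igf_id=None, run_igf_id=None):
    # Directory depth grows with the granularity of the collection table,
    # mirroring the project/sample/experiment/run branches above
    id_map = {
        'project': [project_igf_id],
        'sample': [project_igf_id, sample_igf_id],
        'experiment': [project_igf_id, sample_igf_id, experiment_igf_id],
        'run': [project_igf_id, sample_igf_id, experiment_igf_id, run_igf_id]}
    ids = id_map[collection_table]
    if any(entry is None for entry in ids):
        raise ValueError('Missing id for collection {0}'.format(collection_table))
    return os.path.join(base_path, *ids, analysis_name)

# e.g. sketch_final_path('/results', 'experiment', 'picard', 'IGFP1', 'IGFS1', 'IGFE1')
# returns '/results/IGFP1/IGFS1/IGFE1/picard', matching the experiment layout above
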
Example #3
 def run(self):
     '''
 A method for resetting md5 values in the samplesheet json files for all seqrun ids
 (a sketch of the md5 recalculation helper follows this method)
 '''
     try:
         db_connected = False
         seqrun_list = self._read_seqrun_list(
             self.seqrun_igf_list
         )  # fetch list of seqrun ids from input file
         if len(seqrun_list) > 0:
             base = self.base_adaptor
             base.start_session()  # connect to database
             db_connected = True
             ca = CollectionAdaptor(**{'session': base.session
                                       })  # connect to collection table
             fa = FileAdaptor(**{'session':
                                 base.session})  # connect to file table
             for seqrun_id in seqrun_list:
                 try:
                     files_data = ca.get_collection_files(
                         collection_name=seqrun_id,
                         collection_type=self.json_collection_type,
                         output_mode='one_or_none'
                     )  # check for existing md5 json file in db
                     # TO DO: skip seqrun_id if pipeline is still running
                     if files_data is not None:
                         json_file_path = [
                             element.file_path for element in files_data
                             if isinstance(element, File)
                         ][0]  # get md5 json file path from sqlalchemy collection results
                         samplesheet_md5 = self._get_samplesheet_md5(
                             seqrun_id
                         )  # get md5 value for new samplesheet file
                         new_json_path = self._get_updated_json_file(
                             json_file_path, samplesheet_md5,
                             self.samplesheet_name
                         )  # get updated md5 json file if samplesheet has been changed
                         if new_json_path is not None:
                             new_json_file_md5 = calculate_file_checksum(
                                 filepath=new_json_path, hasher='md5')
                             fa.update_file_table_for_file_path(
                                 file_path=json_file_path,
                                 tag='md5',
                                 value=new_json_file_md5,
                                 autosave=False
                             )  # update json file md5 in db, don't commit yet
                             move_file(source_path=new_json_path,
                                       destinationa_path=json_file_path,
                                       force=True)  # overwrite json file
                             base.commit_session()  # save changes in db
                             message='Setting new Samplesheet info for run {0}'.\
                                     format(seqrun_id)
                             if self.log_slack:
                                 self.igf_slack.post_message_to_channel(
                                     message,
                                     reaction='pass')  # send log to slack
                             if self.log_asana:
                                 self.igf_asana.comment_asana_task(
                                     task_name=seqrun_id,
                                     comment=message)  # send log to asana
                         else:
                             message = 'no change in samplesheet for seqrun {0}'.format(
                                 seqrun_id)
                             warnings.warn(message)
                             if self.log_slack:
                                 self.igf_slack.post_message_to_channel(
                                     message, reaction='pass')
                     else:
                         message='No md5 json file found for seqrun_igf_id: {0}'.\
                                 format(seqrun_id)
                         warnings.warn(
                             message
                         )  # not raising any exception if seqrun id is not found
                         if self.log_slack:
                             self.igf_slack.post_message_to_channel(
                                 message, reaction='fail')
                 except Exception as e:
                     base.rollback_session()
                     message='Failed to update json file for seqrun id {0}, error: {1}'.\
                             format(seqrun_id,e)
                     warnings.warn(message)
                     if self.log_slack:
                         self.igf_slack.post_message_to_channel(
                             message, reaction='fail')
             base.close_session()  # close db connection
             if self.clean_up:
                 self._clear_seqrun_list(
                     self.seqrun_igf_list)  # clear input file
         else:
             message = 'No new seqrun id found for changing samplesheet md5'
             warnings.warn(message)
             if self.log_slack:
                 self.igf_slack.post_message_to_channel(
                     message, reaction='sleep')
     except:
         if db_connected:
             base.rollback_session()
             base.close_session()
         raise
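
calculate_file_checksum above recomputes the md5 of the rewritten json file before the file table is updated. A minimal sketch of such a helper, assuming it streams the file and returns a hex digest (the real implementation may differ):

import hashlib

def sketch_file_checksum(filepath, hasher='md5'):
    # Stream the file in fixed-size blocks so large files need not
    # fit in memory, then return the hex digest
    digest = hashlib.new(hasher)
    with open(filepath, 'rb') as fp:
        for block in iter(lambda: fp.read(8192), b''):
            digest.update(block)
    return digest.hexdigest()
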
Example #4
    def run(self):
        '''
    An ehive runnable method for cellranger count output processing for a given sample

    :param project_igf_id: A project igf id
    :param experiment_igf_id: An experiment igf id
    :param sample_igf_id: A sample igf id
    :param igf_session_class: A database session class
    :param cellranger_output: Cellranger output path
    :param base_results_dir: Base results directory path
    :param manifest_filename: Name of the manifest file for the results dir
    :param analysis_name: Analysis name for the output archive collection
    :param collection_type: Collection type name for the output archive
    :param collection_table: Collection table name for the output archive
    :param fastq_collection_type: Collection type name for input fastq files, default demultiplexed_fastq
    :param species_name: Reference genome collection name
    :param reference_type: Reference genome collection type, default TRANSCRIPTOME_TENX
    :param use_ephemeral_space: A toggle for temp dir settings, default 0
    :returns: Adding cellranger_output to the dataflow_params

    A sketch of the manifest creation step follows this method.
    '''
        try:
            project_igf_id = self.param_required('project_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            cellranger_output = self.param_required('cellranger_output')
            base_result_dir = self.param_required('base_results_dir')
            species_name = self.param('species_name')
            manifest_filename = self.param('manifest_filename')
            analysis_name = self.param('analysis_name')
            collection_type = self.param('collection_type')
            collection_table = self.param('collection_table')
            use_ephemeral_space = self.param('use_ephemeral_space')

            # prepare manifest file for the results dir
            manifest_file = \
              os.path.join(
                cellranger_output,
                manifest_filename)                                                    # get name of the manifest file
            create_file_manifest_for_dir(
                results_dirpath=cellranger_output,
                output_file=manifest_file,
                md5_label='md5',
                exclude_list=['*.bam', '*.bai',
                              '*.cram'])  # create manifest for output dir
            # create archive for the results dir
            temp_archive_name = \
              os.path.join(
                get_temp_dir(use_ephemeral_space=use_ephemeral_space),
                '{0}.tar.gz'.format(experiment_igf_id))                               # get the name of temp archive file
            prepare_file_archive(results_dirpath=cellranger_output,
                                 output_file=temp_archive_name,
                                 exclude_list=['*.bam', '*.bai', '*.cram'
                                               ])  # archive cellranger output
            # load archive file to db collection and results dir
            au = \
              Analysis_collection_utils(
                dbsession_class=igf_session_class,
                analysis_name=analysis_name,
                tag_name=species_name,
                collection_name=experiment_igf_id,
                collection_type=collection_type,
                collection_table=collection_table,
                base_path=base_result_dir)                                            # initiate loading of archive file
            output_file_list = \
              au.load_file_to_disk_and_db(
                input_file_list=[temp_archive_name],
                withdraw_exisitng_collection=True)                                    # load file to db and disk
            # find bam path for the data flow
            bam_list = list()  # define empty bamfile list
            for file in os.listdir(cellranger_output):
                if fnmatch(file, '*.bam'):
                    bam_list.\
                      append(
                        os.path.join(
                          cellranger_output,
                          file))                                                          # add all bams to bam_list

            if len(bam_list) > 1:
                raise ValueError(
                  'More than one bam found for cellranger count run:{0}'.\
                  format(cellranger_output))                                            # check number of bams, presence of one bam is already validated by check method

            bam_file = bam_list[0]
            au = \
              Analysis_collection_utils(
                dbsession_class=igf_session_class,
                analysis_name=analysis_name,
                tag_name=species_name,
                collection_name=experiment_igf_id,
                collection_type=collection_type,
                collection_table=collection_table)                                    # initiate bam file rename
            new_bam_name = \
              au.get_new_file_name(input_file=bam_file)
            if os.path.basename(bam_file) != new_bam_name:
                new_bam_name = \
                  os.path.join(
                    os.path.dirname(
                      bam_file),
                    new_bam_name)                                                       # get new bam path
                move_file(source_path=bam_file,
                          destinationa_path=new_bam_name,
                          force=True)  # move bam file
                bam_file = new_bam_name  # update bam file path

            self.param(
                'dataflow_params', {
                    'cellranger_output': cellranger_output,
                    'bam_file': bam_file,
                    'analysis_output_list': output_file_list
                })  # pass on cellranger output path
        except Exception as e:
            message = \
              'project: {2}, sample: {3}, Error in {0}: {1}'.\
              format(
                self.__class__.__name__,
                e,
                project_igf_id,
                sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
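
create_file_manifest_for_dir above is assumed to record one checksum line per file while honouring the exclude patterns; a hypothetical sketch of that behaviour (the function name and the tab-separated output format are assumptions):

import fnmatch
import hashlib
import os

def sketch_dir_manifest(results_dirpath, output_file, exclude_list):
    # Walk the results dir, skip files matching any exclude pattern
    # (e.g. '*.bam') and write one "md5<TAB>relative path" line per file
    with open(output_file, 'w') as out:
        for root, _dirs, files in os.walk(results_dirpath):
            for name in files:
                if any(fnmatch.fnmatch(name, pattern) for pattern in exclude_list):
                    continue
                file_path = os.path.join(root, name)
                with open(file_path, 'rb') as fp:
                    md5 = hashlib.md5(fp.read()).hexdigest()                    # whole-file read, fine for a sketch
                out.write('{0}\t{1}\n'.format(
                    md5, os.path.relpath(file_path, results_dirpath)))
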
    def run(self):
        '''
    A method for running samtools commands

    :param project_igf_id: A project igf id
    :param sample_igf_id: A sample igf id
    :param experiment_igf_id: An experiment igf id
    :param igf_session_class: A database session class
    :param samtools_exe: Samtools exe path
    :param input_files: A list with a single input bam file
    :param analysis_files: A list of analysis file paths, extended with the samtools output
    :param output_prefix: Output file prefix, extended with the seed date stamp
    :param library_layout: Library layout, e.g. PAIRED
    :param species_name: Reference genome collection name
    :param date_stamp: A seed date stamp for labeling the output
    :param reference_type: Reference genome collection type, default GENOME_FASTA
    :param threads: Number of threads to use for Bam to Cram conversion, default 4
    :param base_work_dir: Base work directory
    :param samtools_command: Samtools command
    :param samFlagInclude: Sam flags to include in filtered bam, default None
    :param samFlagExclude: Sam flags to exclude from the filtered bam, default None
    :param mapq_threshold: Skip alignments with MAPQ smaller than this value, default None
    :param use_encode_filter: For samtools filter, use Encode epigenome filter, i.e. samFlagExclude 1804(PE) / 1796(SE), default False
    :param encodePeExcludeFlag: For samtools filter, Encode exclude flag for PE reads, default 1804
    :param encodeSeExcludeFlag: For samtools filter, Encode exclude flag for SE reads, default 1796
    :param use_ephemeral_space: A toggle for temp dir settings, default 0
    :param copy_input: A toggle for copying input file to temp, 1 for True default 0 for False

    A sketch of the derived Encode filter arguments follows this method.
    '''
        try:
            temp_output_dir = False
            project_igf_id = self.param_required('project_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            input_files = self.param_required('input_files')
            samtools_exe = self.param_required('samtools_exe')
            reference_type = self.param('reference_type')
            threads = self.param('threads')
            base_work_dir = self.param_required('base_work_dir')
            samtools_command = self.param_required('samtools_command')
            analysis_files = self.param_required('analysis_files')
            output_prefix = self.param_required('output_prefix')
            load_metrics_to_cram = self.param('load_metrics_to_cram')
            cram_collection_type = self.param('cram_collection_type')
            collection_table = self.param('collection_table')
            base_result_dir = self.param('base_result_dir')
            analysis_name = self.param('analysis_name')
            force_overwrite = self.param('force_overwrite')
            samFlagInclude = self.param('samFlagInclude')
            samFlagExclude = self.param('samFlagExclude')
            mapq_threshold = self.param('mapq_threshold')
            library_layout = self.param_required('library_layout')
            use_encode_filter = self.param('use_encode_filter')
            species_name = self.param_required('species_name')
            seed_date_stamp = self.param_required('date_stamp')
            use_ephemeral_space = self.param('use_ephemeral_space')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if output_prefix is not None:
                output_prefix = \
                  '{0}_{1}'.\
                    format(
                      output_prefix,
                      seed_date_stamp)                                               # adding datestamp to the output file prefix

            if use_encode_filter:
                samFlagInclude = None
                if library_layout == 'PAIRED':
                    samFlagExclude = 1804
                else:
                    samFlagExclude = 1796

            if not isinstance(input_files, list) or \
               len(input_files) == 0:
                raise ValueError('No input file found')

            if len(input_files) > 1:
                raise ValueError('More than one input file found: {0}'.\
                                 format(input_files))

            output_bam_cram_list = list()
            input_file = input_files[0]
            temp_output_dir = \
              get_temp_dir(
                use_ephemeral_space=use_ephemeral_space)                              # get temp work dir
            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)                         # get a run work dir
            samtools_cmdline = ''
            temp_output = None
            if samtools_command == 'idxstats':
                temp_output,samtools_cmdline = \
                  run_bam_idxstat(
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    force=True)                                                         # run samtools idxstats
            elif samtools_command == 'flagstat':
                temp_output,samtools_cmdline = \
                  run_bam_flagstat(\
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    threads=threads,
                    force=True)                                                         # run samtools flagstat
            elif samtools_command == 'stats':
                temp_output,samtools_cmdline,stats_metrics = \
                  run_bam_stats(\
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    threads=threads,
                    force=True)                                                         # run samtools stats
                if load_metrics_to_cram and \
                   len(stats_metrics) > 0:
                    ca = CollectionAdaptor(
                        **{'session_class': igf_session_class})
                    attribute_data = \
                      ca.prepare_data_for_collection_attribute(\
                      collection_name=experiment_igf_id,
                      collection_type=cram_collection_type,
                      data_list=stats_metrics)
                    ca.start_session()
                    try:
                        ca.create_or_update_collection_attributes(\
                          data=attribute_data,
                          autosave=False)
                        ca.commit_session()
                        ca.close_session()
                    except Exception as e:
                        ca.rollback_session()
                        ca.close_session()
                        raise ValueError('Failed to load data to db: {0}'.\
                                         format(e))

            elif samtools_command == 'merge':
                if output_prefix is None:
                    raise ValueError(
                        'Missing output filename prefix for merged bam')

                sorted_by_name = self.param('sorted_by_name')
                temp_output = \
                  os.path.join(\
                    work_dir,
                    '{0}_merged.bam'.format(output_prefix))
                samtools_cmdline = \
                  merge_multiple_bam(\
                    samtools_exe=samtools_exe,
                    input_bam_list=input_file,
                    output_bam_path=temp_output,
                    sorted_by_name=sorted_by_name,
                    threads=threads,
                    use_ephemeral_space=use_ephemeral_space,
                    force=True)
            elif samtools_command == 'view_bamToCram':
                if base_result_dir is None:
                    raise ValueError(
                        'base_result_dir is required for CRAM file loading')

                if analysis_name is None:
                    raise ValueError(
                        'analysis_name is required for CRAM file loading')

                ref_genome = \
                  Reference_genome_utils(\
                    genome_tag=species_name,
                    dbsession_class=igf_session_class,
                    genome_fasta_type=reference_type)
                genome_fasta = ref_genome.get_genome_fasta(
                )  # get genome fasta
                cram_file = \
                  os.path.basename(input_file).\
                    replace('.bam','.cram')                                             # get base cram file name
                cram_file = os.path.join(
                    temp_output_dir,
                    cram_file)  # get cram file path in work dir
                samtools_cmdline = \
                  convert_bam_to_cram(\
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    reference_file=genome_fasta,
                    cram_path=cram_file,
                    use_ephemeral_space=use_ephemeral_space,
                    threads=threads,
                    force=True,
                    dry_run=False)
                au = \
                  Analysis_collection_utils(\
                    dbsession_class=igf_session_class,
                    analysis_name=analysis_name,
                    tag_name=species_name,
                    collection_name=experiment_igf_id,
                    collection_type=cram_collection_type,
                    collection_table=collection_table,
                    base_path=base_result_dir)
                temp_output_bam_cram_list = \
                  au.load_file_to_disk_and_db(\
                    input_file_list=[cram_file],
                    file_suffix='cram',
                    withdraw_exisitng_collection=force_overwrite)                       # load file to db and disk
                for cram in temp_output_bam_cram_list:
                    index_bam_or_cram(\
                      samtools_exe=samtools_exe,
                      input_path=cram,
                      threads=threads,
                      dry_run=False)
                    index_path = '{0}.crai'.format(cram)
                    output_bam_cram_list.append(cram)
                    output_bam_cram_list.append(index_path)

                if len(output_bam_cram_list) == 0:
                    raise ValueError('No output cram file found')

            elif samtools_command == 'view_filterBam':
                temp_output_bam = \
                  os.path.join(\
                    temp_output_dir,
                    os.path.basename(input_file).replace('.bam','.filtered.bam'))
                samtools_cmdline = \
                  filter_bam_file(
                    samtools_exe=samtools_exe,
                    input_bam=input_file,
                    output_bam=temp_output_bam,
                    samFlagInclude=samFlagInclude,
                    samFlagExclude=samFlagExclude,
                    threads=threads,
                    mapq_threshold=mapq_threshold,
                    index_output=False,
                    dry_run=False)
                dest_path = \
                  os.path.join(\
                    work_dir,
                    os.path.basename(temp_output_bam))
                move_file(\
                  source_path=temp_output_bam,
                  destinationa_path=dest_path,
                  force=True)
                index_bam_or_cram(\
                  samtools_exe=samtools_exe,
                  input_path=dest_path,
                  threads=threads,
                  dry_run=False)
                index_path = '{0}.bai'.format(dest_path)
                output_bam_cram_list.append(dest_path)
                output_bam_cram_list.append(index_path)
            else:
                raise ValueError('Samtools command {0} not supported'.\
                                 format(samtools_command))

            if temp_output is not None:
                dest_path = \
                  os.path.join(\
                    work_dir,
                    os.path.basename(temp_output))
                if dest_path != temp_output:
                    move_file(\
                      source_path=temp_output,
                      destinationa_path=dest_path,
                      force=True)
                analysis_files.append(dest_path)

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'output_bam_cram_list': output_bam_cram_list
                })  # pass on samtools output list
            message = \
              'finished samtools {0} for {1} {2}'.\
                format(
                  samtools_command,
                  project_igf_id,
                  sample_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            message = \
              'finished samtools {0} for {1} {2}: {3}'.\
                format(
                  samtools_command,
                  project_igf_id,
                  sample_igf_id,
                  samtools_cmdline)
            #self.comment_asana_task(task_name=project_igf_id, comment=message)        # send comment to Asana
        except Exception as e:
            message = \
              'project: {2}, sample: {3}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  project_igf_id,
                  sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
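
The use_encode_filter branch above swaps in the Encode exclude flags before filter_bam_file runs. A small sketch of how those values could translate into samtools view arguments (-F and -q are standard samtools view options; the helper itself is illustrative):

def sketch_encode_filter_args(library_layout, mapq_threshold=None):
    # Encode epigenome filtering: exclude flag 1804 for paired-end
    # libraries and 1796 for single-end, as in the branch above
    sam_flag_exclude = 1804 if library_layout == 'PAIRED' else 1796
    args = ['view', '-F', str(sam_flag_exclude)]
    if mapq_threshold is not None:
        args += ['-q', str(mapq_threshold)]                                     # skip alignments below the MAPQ cut-off
    return args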