Пример #1
0
    def run(self):
        '''
        Run PPQT on a single input bam and register its outputs.

        Required params (read with ``param_required``): project_igf_id,
        sample_igf_id, experiment_igf_id, igf_session_class, input_files,
        rscript_path, ppqt_exe, base_work_dir, base_result_dir,
        library_strategy, analysis_files, output_prefix, species_name,
        date_stamp.

        Optional params (read with ``param``): analysis_name (falls back to
        library_strategy when None), load_metrics_to_cram,
        ppqt_collection_type, cram_collection_type, collection_table,
        force_overwrite, use_ephemeral_space, threads.

        On success 'dataflow_params' is set to a dict with the keys
        'analysis_files' and 'output_ppqt_list', and messages are sent to
        Slack and Asana.

        :raises ValueError: if input_files is not a list holding exactly one
                            entry, or if loading the metrics to the db fails
        '''
        # Bound before the try block so the failure handler below can always
        # format its message, even when the very first param lookup raises
        project_igf_id = None
        sample_igf_id = None
        try:
            project_igf_id = self.param_required('project_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            input_files = self.param_required('input_files')
            rscript_path = self.param_required('rscript_path')
            ppqt_exe = self.param_required('ppqt_exe')
            base_work_dir = self.param_required('base_work_dir')
            base_result_dir = self.param_required('base_result_dir')
            library_strategy = self.param_required('library_strategy')
            analysis_files = self.param_required('analysis_files')
            output_prefix = self.param_required('output_prefix')
            species_name = self.param_required('species_name')
            analysis_name = self.param('analysis_name')
            seed_date_stamp = self.param_required('date_stamp')
            load_metrics_to_cram = self.param('load_metrics_to_cram')
            ppqt_collection_type = self.param('ppqt_collection_type')
            cram_collection_type = self.param('cram_collection_type')
            collection_table = self.param('collection_table')
            force_overwrite = self.param('force_overwrite')
            use_ephemeral_space = self.param('use_ephemeral_space')
            threads = self.param('threads')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if output_prefix is not None:
                # add the datestamp label to the output file prefix
                output_prefix = \
                  '{0}_{1}'.format(output_prefix, seed_date_stamp)

            if not isinstance(input_files, list) or \
               len(input_files) == 0:
                raise ValueError('No input file found')

            if len(input_files) > 1:
                raise ValueError(
                    'More than one input file found: {0}'.format(input_files))

            if analysis_name is None:
                analysis_name = library_strategy  # default analysis_name

            input_file = input_files[0]
            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)  # run work dir
            ppqt_obj = \
              Ppqt_tools(
                rscript_path=rscript_path,
                ppqt_exe=ppqt_exe,
                use_ephemeral_space=use_ephemeral_space,
                threads=threads)
            ppqt_cmd, spp_output, pdf_output, spp_data = \
              ppqt_obj.run_ppqt(
                input_bam=input_file,
                output_dir=work_dir,
                output_spp_name='{0}_{1}.spp.out'.format(output_prefix, 'PPQT'),
                output_pdf_name='{0}_{1}.spp.pdf'.format(output_prefix, 'PPQT'))
            analysis_files.append(spp_output)
            au = \
              Analysis_collection_utils(
                dbsession_class=igf_session_class,
                analysis_name=analysis_name,
                tag_name=species_name,
                collection_name=experiment_igf_id,
                collection_type=ppqt_collection_type,
                collection_table=collection_table,
                base_path=base_result_dir)
            output_ppqt_list = \
              au.load_file_to_disk_and_db(
                input_file_list=[pdf_output],
                file_suffix='pdf',
                withdraw_exisitng_collection=force_overwrite)  # load pdf to db and disk
            if load_metrics_to_cram and \
               len(spp_data) > 0:
                ca = CollectionAdaptor(**{'session_class': igf_session_class})
                attribute_data = \
                  ca.prepare_data_for_collection_attribute(
                    collection_name=experiment_igf_id,
                    collection_type=cram_collection_type,
                    data_list=spp_data)
                ca.start_session()
                try:
                    ca.create_or_update_collection_attributes(
                      data=attribute_data,
                      autosave=False)
                    ca.commit_session()
                except Exception as e:
                    ca.rollback_session()
                    raise ValueError(
                        'Failed to load data to db: {0}'.format(e))
                finally:
                    # single exit point: the session is closed on success,
                    # on failure and also when the rollback itself raises
                    ca.close_session()

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'output_ppqt_list': output_ppqt_list
                })  # pass on PPQT output list
            message = \
              'finished PPQT for {0} {1}'.format(
                project_igf_id,
                sample_igf_id)
            self.post_message_to_slack(
                message, reaction='pass')  # send log to slack
            message = \
              'finished PPQT for {0} {1}: {2}'.format(
                project_igf_id,
                sample_igf_id,
                ppqt_cmd)
            self.comment_asana_task(
                task_name=project_igf_id,
                comment=message)  # send comment to Asana
        except Exception as e:
            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.format(
                self.__class__.__name__,
                e,
                project_igf_id,
                sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
Пример #2
0
    def run(self):
        '''
        Run a picard command and move its outputs to the job work dir.

        Required params (read with ``param_required``): project_igf_id,
        experiment_igf_id, sample_igf_id, java_exe, java_param, picard_jar,
        input_files, picard_command, igf_session_class, base_work_dir,
        analysis_files, platform_name, date_stamp.

        Optional params (read with ``param``; any defaults are defined
        outside this method): species_name, reference_type,
        reference_refFlat, ribosomal_interval_type, picard_option,
        patterned_flowcell_list, output_prefix, load_metrics_to_cram,
        cram_collection_type, use_ephemeral_space.

        :param project_igf_id: A project igf id
        :param sample_igf_id: A sample igf id
        :param experiment_igf_id: A experiment igf id
        :param igf_session_class: A database session class
        :param reference_type: Reference genome fasta collection type
        :param reference_refFlat: Reference refFlat collection type
        :param ribosomal_interval_type: Collection type for ribosomal interval list
        :param species_name: Species name, used as the genome tag
        :param java_exe: Java path
        :param java_param: Java run parameters
        :param picard_jar: Picard jar path
        :param picard_command: Picard command
        :param base_work_dir: Base work directory
        :param use_ephemeral_space: A toggle for temp dir setting
        :param patterned_flowcell_list: A list of patterned flowcells;
                                        platform_name is checked against it

        On success 'dataflow_params' is set to a dict with the keys
        'analysis_files', 'bam_files' and 'seed_date_stamp'.

        :raises ValueError: if load_metrics_to_cram is set without a
                            cram_collection_type
        '''
        # Bound before the try block so the failure handler below can always
        # run, even when the very first param lookup raises
        temp_output_dir = False
        project_igf_id = None
        sample_igf_id = None
        try:
            project_igf_id = self.param_required('project_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            java_exe = self.param_required('java_exe')
            java_param = self.param_required('java_param')
            picard_jar = self.param_required('picard_jar')
            input_files = self.param_required('input_files')
            picard_command = self.param_required('picard_command')
            igf_session_class = self.param_required('igf_session_class')
            species_name = self.param('species_name')
            reference_type = self.param('reference_type')
            reference_refFlat = self.param('reference_refFlat')
            ribosomal_interval_type = self.param('ribosomal_interval_type')
            base_work_dir = self.param_required('base_work_dir')
            analysis_files = self.param_required('analysis_files')
            picard_option = self.param('picard_option')
            patterned_flowcell_list = self.param('patterned_flowcell_list')
            platform_name = self.param_required('platform_name')
            output_prefix = self.param('output_prefix')
            load_metrics_to_cram = self.param('load_metrics_to_cram')
            cram_collection_type = self.param('cram_collection_type')
            seed_date_stamp = self.param_required('date_stamp')
            use_ephemeral_space = self.param('use_ephemeral_space')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if output_prefix is not None:
                # add the seed datestamp label to the output prefix
                output_prefix = \
                  '{0}_{1}'.format(output_prefix, seed_date_stamp)

            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)  # run work dir
            temp_output_dir = \
              get_temp_dir(use_ephemeral_space=use_ephemeral_space)  # temp work dir
            ref_genome = \
              Reference_genome_utils(
                genome_tag=species_name,
                dbsession_class=igf_session_class,
                genome_fasta_type=reference_type,
                gene_reflat_type=reference_refFlat,
                ribosomal_interval_type=ribosomal_interval_type)  # setup ref genome utils
            genome_fasta = ref_genome.get_genome_fasta()  # genome fasta
            ref_flat_file = ref_genome.get_gene_reflat()  # refFlat file
            ribosomal_interval_file = \
              ref_genome.get_ribosomal_interval()  # ribosomal interval file
            # flag platforms that are listed as patterned flowcells
            patterned_flowcell = platform_name in patterned_flowcell_list

            if load_metrics_to_cram and \
               not cram_collection_type:
                raise ValueError(
                    'Cram file collection type is required for loading picard metrics to db'
                )

            picard = \
              Picard_tools(
                java_exe=java_exe,
                java_param=java_param,
                picard_jar=picard_jar,
                input_files=input_files,
                output_dir=temp_output_dir,
                ref_fasta=genome_fasta,
                patterned_flowcell=patterned_flowcell,
                ref_flat_file=ref_flat_file,
                picard_option=picard_option,
                output_prefix=output_prefix,
                use_ephemeral_space=use_ephemeral_space,
                ribisomal_interval=ribosomal_interval_file)  # setup picard tool
            # picard_command_line is currently unused here: the Asana comment
            # that used to report it is disabled
            temp_output_files, picard_command_line, picard_metrics = \
              picard.run_picard_command(command_name=picard_command)  # run picard command
            output_file_list = list()
            for source_path in temp_output_files:
                dest_path = \
                  os.path.join(
                    work_dir,
                    os.path.basename(source_path))  # destination filepath
                move_file(
                    source_path=source_path,
                    destinationa_path=dest_path,
                    force=True)  # move files to work dir
                output_file_list.append(dest_path)
            remove_dir(temp_output_dir)
            analysis_files.extend(output_file_list)
            bam_files = [
                output_path
                for output_path in output_file_list
                if output_path.endswith('.bam')]

            if load_metrics_to_cram and \
               len(picard_metrics) > 0:
                ca = CollectionAdaptor(**{'session_class': igf_session_class})
                attribute_data = \
                  ca.prepare_data_for_collection_attribute(
                    collection_name=experiment_igf_id,
                    collection_type=cram_collection_type,
                    data_list=picard_metrics)  # format data for collection attribute table
                ca.start_session()
                try:
                    ca.create_or_update_collection_attributes(
                      data=attribute_data,
                      autosave=False)  # load data to collection attribute table
                    ca.commit_session()
                except BaseException:
                    # same scope as the former bare except: roll back on any
                    # interruption, then let the original error propagate
                    ca.rollback_session()
                    raise
                finally:
                    # single exit point: the session is closed on success,
                    # on failure and also when the rollback itself raises
                    ca.close_session()

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'bam_files': bam_files,
                    'seed_date_stamp': seed_date_stamp
                })  # pass on picard output list
            message = \
              'finished picard {0} for {1} {2}'.format(
                picard_command,
                project_igf_id,
                sample_igf_id)
            self.post_message_to_slack(
                message, reaction='pass')  # send log to slack
        except Exception as e:
            # drop the temp dir if it was created before the failure
            if temp_output_dir and \
               os.path.exists(temp_output_dir):
                remove_dir(temp_output_dir)

            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.format(
                self.__class__.__name__,
                e,
                project_igf_id,
                sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
    def run(self):
        '''
        Run one samtools command on a single input file.

        Supported samtools_command values: 'idxstats', 'flagstat', 'stats',
        'merge', 'view_bamToCram' and 'view_filterBam'; any other value
        raises ValueError.

        Required params (read with ``param_required``): project_igf_id,
        sample_igf_id, experiment_igf_id, igf_session_class, input_files,
        samtools_exe, base_work_dir, samtools_command, analysis_files,
        output_prefix, library_layout, species_name, date_stamp.

        Optional params (read with ``param``; any defaults are defined
        outside this method): reference_type, threads, load_metrics_to_cram,
        cram_collection_type, collection_table, base_result_dir,
        analysis_name, force_overwrite, samFlagInclude, samFlagExclude,
        mapq_threshold, use_encode_filter, use_ephemeral_space and, for
        'merge' only, sorted_by_name.

        :param project_igf_id: A project igf id
        :param sample_igf_id: A sample igf id
        :param experiment_igf_id: A experiment igf id
        :param igf_session_class: A database session class
        :param reference_type: Reference genome fasta collection type
        :param threads: Number of threads passed to the samtools helpers
        :param base_work_dir: Base work directory
        :param samtools_command: Samtools command
        :param samFlagInclude: Sam flags to include in filtered bam
        :param samFlagExclude: Sam flags to exclude from the filtered bam
        :param mapq_threshold: MAPQ threshold passed to the bam filter
        :param use_encode_filter: When truthy, samFlagInclude is reset to None
                                  and samFlagExclude is set to 1804 if
                                  library_layout is 'PAIRED', else to 1796.
                                  Both flag values are hard-coded here.
        :param use_ephemeral_space: A toggle for temp dir settings

        On success 'dataflow_params' is set to a dict with the keys
        'analysis_files' and 'output_bam_cram_list'.

        :raises ValueError: if input_files is not a list holding exactly one
                            entry, if a branch-specific param is missing, if
                            loading metrics to the db fails, if no cram file
                            is produced, or if the command is not supported
        '''
        # Bound before the try block so the failure handler below can always
        # format its message, even when the very first param lookup raises
        project_igf_id = None
        sample_igf_id = None
        try:
            # NOTE(review): temp_output_dir is created further down but never
            # removed in this method - confirm that cleanup happens elsewhere
            temp_output_dir = False
            project_igf_id = self.param_required('project_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            input_files = self.param_required('input_files')
            samtools_exe = self.param_required('samtools_exe')
            reference_type = self.param('reference_type')
            threads = self.param('threads')
            base_work_dir = self.param_required('base_work_dir')
            samtools_command = self.param_required('samtools_command')
            analysis_files = self.param_required('analysis_files')
            output_prefix = self.param_required('output_prefix')
            load_metrics_to_cram = self.param('load_metrics_to_cram')
            cram_collection_type = self.param('cram_collection_type')
            collection_table = self.param('collection_table')
            base_result_dir = self.param('base_result_dir')
            analysis_name = self.param('analysis_name')
            force_overwrite = self.param('force_overwrite')
            samFlagInclude = self.param('samFlagInclude')
            samFlagExclude = self.param('samFlagExclude')
            mapq_threshold = self.param('mapq_threshold')
            library_layout = self.param_required('library_layout')
            use_encode_filter = self.param('use_encode_filter')
            species_name = self.param_required('species_name')
            seed_date_stamp = self.param_required('date_stamp')
            use_ephemeral_space = self.param('use_ephemeral_space')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if output_prefix is not None:
                # add the datestamp label to the output file prefix
                output_prefix = \
                  '{0}_{1}'.format(output_prefix, seed_date_stamp)

            if use_encode_filter:
                # the encode filter overrides any user supplied sam flags
                samFlagInclude = None
                if library_layout == 'PAIRED':
                    samFlagExclude = 1804
                else:
                    samFlagExclude = 1796

            if not isinstance(input_files, list) or \
               len(input_files) == 0:
                raise ValueError('No input file found')

            if len(input_files) > 1:
                raise ValueError(
                    'More than one input file found: {0}'.format(input_files))

            output_bam_cram_list = list()
            input_file = input_files[0]
            temp_output_dir = \
              get_temp_dir(
                use_ephemeral_space=use_ephemeral_space)  # temp work dir
            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)  # run work dir
            # samtools_cmdline is captured from the helpers but currently
            # unused: the Asana comment that used to report it is disabled
            samtools_cmdline = ''
            # set by branches whose single output file still has to be moved
            # to work_dir and appended to analysis_files
            temp_output = None
            if samtools_command == 'idxstats':
                temp_output, samtools_cmdline = \
                  run_bam_idxstat(
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    force=True)  # run samtools idxstats
            elif samtools_command == 'flagstat':
                temp_output, samtools_cmdline = \
                  run_bam_flagstat(
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    threads=threads,
                    force=True)  # run samtools flagstat
            elif samtools_command == 'stats':
                temp_output, samtools_cmdline, stats_metrics = \
                  run_bam_stats(
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    threads=threads,
                    force=True)  # run samtools stats
                if load_metrics_to_cram and \
                   len(stats_metrics) > 0:
                    ca = CollectionAdaptor(
                        **{'session_class': igf_session_class})
                    attribute_data = \
                      ca.prepare_data_for_collection_attribute(
                        collection_name=experiment_igf_id,
                        collection_type=cram_collection_type,
                        data_list=stats_metrics)
                    ca.start_session()
                    try:
                        ca.create_or_update_collection_attributes(
                          data=attribute_data,
                          autosave=False)
                        ca.commit_session()
                    except Exception as e:
                        ca.rollback_session()
                        raise ValueError(
                            'Failed to load data to db: {0}'.format(e))
                    finally:
                        # single exit point: the session is closed on success,
                        # on failure and also when the rollback itself raises
                        ca.close_session()

            elif samtools_command == 'merge':
                if output_prefix is None:
                    raise ValueError(
                        'Missing output filename prefix for merged bam')

                sorted_by_name = self.param('sorted_by_name')
                # the merged bam is written straight into work_dir
                temp_output = \
                  os.path.join(
                    work_dir,
                    '{0}_merged.bam'.format(output_prefix))
                # NOTE(review): a single path is passed as input_bam_list;
                # presumably a file listing the bams to merge - confirm
                samtools_cmdline = \
                  merge_multiple_bam(
                    samtools_exe=samtools_exe,
                    input_bam_list=input_file,
                    output_bam_path=temp_output,
                    sorted_by_name=sorted_by_name,
                    threads=threads,
                    use_ephemeral_space=use_ephemeral_space,
                    force=True)
            elif samtools_command == 'view_bamToCram':
                if base_result_dir is None:
                    raise ValueError(
                        'base_result_dir is required for CRAM file loading')

                if analysis_name is None:
                    raise ValueError(
                        'analysis_name is required for CRAM file loading')

                ref_genome = \
                  Reference_genome_utils(
                    genome_tag=species_name,
                    dbsession_class=igf_session_class,
                    genome_fasta_type=reference_type)
                genome_fasta = ref_genome.get_genome_fasta()  # genome fasta
                cram_file = \
                  os.path.basename(input_file).\
                    replace('.bam', '.cram')  # base cram file name
                cram_file = \
                  os.path.join(
                    temp_output_dir,
                    cram_file)  # cram file path in temp dir
                samtools_cmdline = \
                  convert_bam_to_cram(
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    reference_file=genome_fasta,
                    cram_path=cram_file,
                    use_ephemeral_space=use_ephemeral_space,
                    threads=threads,
                    force=True,
                    dry_run=False)
                au = \
                  Analysis_collection_utils(
                    dbsession_class=igf_session_class,
                    analysis_name=analysis_name,
                    tag_name=species_name,
                    collection_name=experiment_igf_id,
                    collection_type=cram_collection_type,
                    collection_table=collection_table,
                    base_path=base_result_dir)
                temp_output_bam_cram_list = \
                  au.load_file_to_disk_and_db(
                    input_file_list=[cram_file],
                    file_suffix='cram',
                    withdraw_exisitng_collection=force_overwrite)  # load file to db and disk
                for cram in temp_output_bam_cram_list:
                    index_bam_or_cram(
                      samtools_exe=samtools_exe,
                      input_path=cram,
                      threads=threads,
                      dry_run=False)
                    index_path = '{0}.crai'.format(cram)
                    output_bam_cram_list.append(cram)
                    output_bam_cram_list.append(index_path)

                if len(output_bam_cram_list) == 0:
                    raise ValueError('No output cram file found')

            elif samtools_command == 'view_filterBam':
                temp_output_bam = \
                  os.path.join(
                    temp_output_dir,
                    os.path.basename(input_file).replace('.bam', '.filtered.bam'))
                samtools_cmdline = \
                  filter_bam_file(
                    samtools_exe=samtools_exe,
                    input_bam=input_file,
                    output_bam=temp_output_bam,
                    samFlagInclude=samFlagInclude,
                    samFlagExclude=samFlagExclude,
                    threads=threads,
                    mapq_threshold=mapq_threshold,
                    index_output=False,
                    dry_run=False)
                dest_path = \
                  os.path.join(
                    work_dir,
                    os.path.basename(temp_output_bam))
                move_file(
                  source_path=temp_output_bam,
                  destinationa_path=dest_path,
                  force=True)
                # the index is built only after the bam is in work_dir
                index_bam_or_cram(
                  samtools_exe=samtools_exe,
                  input_path=dest_path,
                  threads=threads,
                  dry_run=False)
                index_path = '{0}.bai'.format(dest_path)
                output_bam_cram_list.append(dest_path)
                output_bam_cram_list.append(index_path)
            else:
                raise ValueError(
                    'Samtools command {0} not supported'.format(samtools_command))

            if temp_output is not None:
                dest_path = \
                  os.path.join(
                    work_dir,
                    os.path.basename(temp_output))
                # the merge branch already writes into work_dir, so the
                # move is skipped when source and destination are the same
                if dest_path != temp_output:
                    move_file(
                      source_path=temp_output,
                      destinationa_path=dest_path,
                      force=True)
                analysis_files.append(dest_path)

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'output_bam_cram_list': output_bam_cram_list
                })  # pass on samtools output list
            message = \
              'finished samtools {0} for {1} {2}'.format(
                samtools_command,
                project_igf_id,
                sample_igf_id)
            self.post_message_to_slack(
                message, reaction='pass')  # send log to slack
        except Exception as e:
            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.format(
                self.__class__.__name__,
                e,
                project_igf_id,
                sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise