def test_load_file_to_disk_and_db7(self):
     au = Analysis_collection_utils(dbsession_class=self.session_class,
                                    analysis_name='AnalysisA',
                                    tag_name='TagA',
                                    collection_name='RunA',
                                    collection_type='AnalysisA_Files',
                                    collection_table='run',
                                    base_path=self.temp_base_dir)
     input_file_list = [
         os.path.join(self.temp_work_dir, file_name)
         for file_name in self.input_list
     ]
     output_list = au.load_file_to_disk_and_db(
         input_file_list=input_file_list,
         withdraw_exisitng_collection=False
     )  # loading all files to same collection
     base = BaseAdaptor(**{'session_class': self.session_class})
     base.start_session()
     ca = CollectionAdaptor(**{'session': base.session})
     ca_files = ca.get_collection_files(collection_name='RunA',
                                        collection_type='AnalysisA_Files',
                                        output_mode='dataframe')
     file_list = list(ca_files['file_path'].to_dict().values())
     datestamp = get_datestamp_label()
     test_file = os.path.join(
         self.temp_base_dir, 'ProjectA', 'SampleA', 'ExperimentA', 'RunA',
         'AnalysisA', '{0}_{1}_{2}_{3}.{4}'.format('RunA', 'AnalysisA',
                                                   'TagA', datestamp,
                                                   'cram'))
     test_file = preprocess_path_name(input_path=test_file)
     self.assertTrue(test_file in file_list)
     self.assertTrue(test_file in output_list)
     base.close_session()
Example #2
 def get_datestamp(self):
     '''
     A method for fetching the current datestamp

     :returns: A padded string in YYYYMMDD format
     '''
     try:
         datestamp = get_datestamp_label()
         return datestamp
     except:
         raise
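
Every snippet on this page relies on get_datestamp_label() from the igf utility module. For orientation, here is a minimal sketch of the behaviour these snippets assume, based only on the docstring above (a padded YYYYMMDD string) and on the optional seed argument seen in the pipeline runnables below; the accepted seed format is an assumption, not the library's documented API.

# Hedged re-implementation sketch of get_datestamp_label, not the
# library code: returns a zero-padded YYYYMMDD label for today, or
# for an optional seed date. The '%Y-%m-%d' seed format is assumed.
from datetime import datetime

def get_datestamp_label(datestamp=None):
    if datestamp is None:
        datestamp = datetime.now()  # default to today
    elif isinstance(datestamp, str):
        datestamp = datetime.strptime(datestamp, '%Y-%m-%d')  # assumed seed format
    return datestamp.strftime('%Y%m%d')  # e.g. '20240101'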
Example #3
    def run(self):
        '''
        A method for fetching run-level analysis files from the accu table
        and passing them on to dataflow, either as a list or as a chunk file
        '''
        try:
            project_igf_id = self.param_required('project_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            run_igf_id = self.param_required('run_igf_id')
            accu_data = self.param_required('accu_data')
            output_mode = self.param_required('output_mode')
            base_work_dir = self.param_required('base_work_dir')
            seed_date_stamp = self.param_required('date_stamp')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            run_analysis_files = accu_data.get(run_igf_id).get(seed_date_stamp)
            if run_analysis_files is None:
                raise ValueError('No data found in accu table for run {0} and date_stamp {1}'.\
                                 format(run_igf_id,seed_date_stamp))                    # incorrect data structure

            if isinstance(run_analysis_files,list) and \
               len(run_analysis_files)==0:
                raise ValueError('No run level file found in accu data for run {0} and date_stamp {1}'.\
                                 format(run_igf_id,seed_date_stamp))                    # zero input file

            if output_mode == 'list':
                self.param('dataflow_params',
                           {'run_chunk_list': run_analysis_files})
            elif output_mode == 'file':
                work_dir_prefix = os.path.join(base_work_dir, project_igf_id,
                                               sample_igf_id,
                                               experiment_igf_id, run_igf_id)
                work_dir = self.get_job_work_dir(
                    work_dir=work_dir_prefix)  # get a run work dir
                output_file = os.path.join(work_dir, 'run_level_chunk.txt')
                with open(output_file, 'w') as fp:
                    fp.write('\n'.join(run_analysis_files))

                self.param('dataflow_params',
                           {'run_chunk_list_file': output_file})
            else:
                raise ValueError(
                    'Output mode {0} not supported'.format(output_mode))

        except Exception as e:
            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.format(
                self.__class__.__name__,
                e,
                project_igf_id,
                sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
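For reference, the accu_data.get(run_igf_id).get(seed_date_stamp) chain in the snippet above implies a nested mapping keyed first by run id and then by datestamp label. A hypothetical example of that assumed shape (ids and paths are illustrative only):

# Assumed structure of accu_data, inferred from the lookup above;
# each leaf is a list of run-level analysis chunk files.
accu_data = {
    'run1': {
        '20240101': ['/path/run1_chunk_0.bam',
                     '/path/run1_chunk_1.bam'],
    },
}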
 def test_load_file_to_disk_and_db8(self):
     au = Analysis_collection_utils(dbsession_class=self.session_class,
                                    analysis_name='AnalysisA',
                                    tag_name='TagA',
                                    collection_name='RunA',
                                    collection_type='AnalysisA_Files',
                                    collection_table='run')
     input_file = os.path.join(self.temp_work_dir, 'a.cram')
     input_file = preprocess_path_name(input_path=input_file)
     new_file_name = au.get_new_file_name(input_file=input_file)
     datestamp = get_datestamp_label()
     test_file_name = '{0}_{1}_{2}_{3}.{4}'.format('RunA', 'AnalysisA',
                                                   'TagA', datestamp,
                                                   'cram')
     self.assertEqual(new_file_name, test_file_name)
 def run(self):
     try:
         project_igf_id = self.param_required('project_igf_id')
         sample_igf_id = self.param_required('sample_igf_id')
         experiment_igf_id = self.param_required('experiment_igf_id')
         base_work_dir = self.param_required('base_work_dir')
         cbImportScanpy_path = self.param_required('cbImportScanpy_path')
         scanpy_h5ad_path = self.param_required('scanpy_h5ad_path')
         cellbrowser_dir_prefix = self.param('cellbrowser_dir_prefix')
         use_ephemeral_space = self.param('use_ephemeral_space')
         work_dir_prefix = \
           os.path.join(\
             base_work_dir,
             project_igf_id,
             sample_igf_id,
             experiment_igf_id)
         work_dir = \
           self.get_job_work_dir(\
             work_dir=work_dir_prefix)
         datestamp = get_datestamp_label()
         cellbrowser_dir = \
             os.path.join( \
               work_dir,
               '{0}_{1}'.\
                 format( \
                   cellbrowser_dir_prefix,
                   datestamp))
         convert_scanpy_h5ad_to_cellbrowser_dir(\
           cbImportScanpy_path=cbImportScanpy_path,
           h5ad_path=scanpy_h5ad_path,
           project_name=experiment_igf_id,
           use_ephemeral_space=use_ephemeral_space,
           cellbrowser_htmldir=cellbrowser_dir)
         self.param('dataflow_params', {'cellbrowser_dir': cellbrowser_dir})
     except Exception as e:
         message = \
           'project: {2}, sample:{3}, Error in {0}: {1}'.\
             format(
               self.__class__.__name__,
               e,
               project_igf_id,
               sample_igf_id)
         self.warning(message)
         self.post_message_to_slack(
             message, reaction='fail')  # post msg to slack for failed jobs
         raise
    def get_new_file_name(self, input_file, file_suffix=None):
        '''
        A method for building a new file name from the collection name,
        analysis name, tag name and datestamp

        :param input_file: An input filepath
        :param file_suffix: A file suffix, default None for deriving it from the input file
        :returns: A new file name string
        '''
        try:
            new_filename = self.collection_name  # use collection name to rename file
            if new_filename == '':
                raise ValueError('New filename not found for input file {0}'.\
                                 format(input_file))

            new_filename = \
              '{0}_{1}'.format(
                new_filename,
                self.analysis_name)
            if self.tag_name is not None:
                new_filename = \
                  '{0}_{1}'.format(
                    new_filename,
                    self.tag_name)                                                     # add tag name to filename

            if self.add_datestamp:
                datestamp = get_datestamp_label()  # collect datestamp
                new_filename = \
                  '{0}_{1}'.format(
                    new_filename,
                    datestamp)                                                          # add datestamp to filename

            if file_suffix is None:
                file_suffix = get_file_extension(
                    input_file=input_file)  # collect file suffix

            if file_suffix == '':
                raise ValueError('Missing file extension for new file name of {0}'.\
                                 format(input_file))                                    # raise error if no file suffix found

            new_filename = \
              '{0}.{1}'.format(\
                new_filename,
                file_suffix)                                                          # add file suffix to the new name
            return new_filename
        except:
            raise
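For illustration, a stand-alone sketch of the naming scheme implemented by get_new_file_name above; the helper name is hypothetical and the real method also validates the collection name and file extension. With the values used in the tests on this page it produces names like RunA_AnalysisA_TagA_<YYYYMMDD>.cram:

# Hypothetical helper mirroring get_new_file_name's naming logic.
from datetime import datetime

def build_analysis_file_name(collection_name, analysis_name,
                             tag_name=None, add_datestamp=True,
                             file_suffix='cram'):
    parts = [collection_name, analysis_name]
    if tag_name is not None:
        parts.append(tag_name)  # add tag name
    if add_datestamp:
        parts.append(datetime.now().strftime('%Y%m%d'))  # add datestamp
    return '{0}.{1}'.format('_'.join(parts), file_suffix)

# build_analysis_file_name('RunA', 'AnalysisA', 'TagA')
# returns 'RunA_AnalysisA_TagA_<YYYYMMDD>.cram'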
    def run(self):
        '''
        A method for generating a Scanpy report from Cellranger analysis output

        :param project_igf_id: A project igf id
        :param sample_igf_id: A sample igf id
        :param experiment_igf_id: An experiment igf id
        :param igf_session_class: A database session class
        :param species_name: species_name
        :param base_result_dir: Base results directory
        :param report_template_file: A template file for writing scanpy report
        :param analysis_name: Analysis name, default scanpy
        :param species_name_lookup: A dictionary for ensembl species name lookup
        :param cellranger_collection_type: Cellranger analysis collection type, default CELLRANGER_RESULTS
        :param scanpy_collection_type: Scanpy report collection type, default SCANPY_RESULTS
        :param collection_table: Collection table name for loading scanpy report, default experiment
        '''
        try:
            project_igf_id = self.param_required('project_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            species_name = self.param_required('species_name')
            report_template_file = self.param_required('report_template_file')
            analysis_name = self.param_required('analysis_name')
            base_result_dir = self.param_required('base_result_dir')
            base_work_dir = self.param_required('base_work_dir')
            species_name_lookup = self.param('species_name_lookup')
            cellranger_collection_type = self.param(
                'cellranger_collection_type')
            scanpy_collection_type = self.param('scanpy_collection_type')
            collection_table = self.param('collection_table')
            cellbrowser_dir_prefix = self.param('cellbrowser_dir_prefix')
            use_ephemeral_space = self.param('use_ephemeral_space')
            cellranger_tarfile = ''
            output_report = ''
            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = self.get_job_work_dir(
                work_dir=work_dir_prefix)  # get a run work dir
            if species_name in species_name_lookup.keys():                            # check for human or mice
                ensembl_species_name = species_name_lookup[species_name]              # get ensembl species name
                # fetch cellranger tar path from db
                if cellranger_tarfile == '':
                    ca = CollectionAdaptor(
                        **{'session_class': igf_session_class})
                    ca.start_session()  # connect to database
                    cellranger_tarfiles = \
                      ca.get_collection_files(\
                        collection_name=experiment_igf_id,
                        collection_type=cellranger_collection_type,
                        output_mode='dataframe')                                          # fetch collection files
                    ca.close_session()
                    if len(cellranger_tarfiles.index) == 0:
                        raise ValueError('No cellranger analysis output found for exp {0}'.\
                                         format(experiment_igf_id))

                    cellranger_tarfile = \
                      cellranger_tarfiles['file_path'].values[0]                        # select first file as analysis file

                # extract filtered metrics files from tar
                output_dir = \
                  get_temp_dir(use_ephemeral_space=use_ephemeral_space)                 # get a temp dir
                datestamp = get_datestamp_label()
                cellbrowser_dir = \
                  os.path.join( \
                    work_dir,
                    '{0}_{1}'.\
                      format( \
                        cellbrowser_dir_prefix,
                        datestamp))
                cellbrowser_h5ad = \
                  os.path.join(\
                    cellbrowser_dir,
                    'scanpy.h5ad')
                output_report = \
                  os.path.join(\
                    output_dir,
                    'report.html')                                                      # get temp report path
                matrix_file,gene_file,barcode_file = \
                  self._extract_cellranger_filtered_metrics(\
                    tar_file=cellranger_tarfile,
                    output_dir=output_dir)                                              # get cellranger output files
                sp = \
                  Scanpy_tool(\
                    project_name=project_igf_id,
                    sample_name=sample_igf_id,
                    matrix_file=matrix_file,
                    features_tsv=gene_file,
                    barcode_tsv=barcode_file,
                    html_template_file=report_template_file,
                    species_name=ensembl_species_name,
                    output_file=output_report,
                    use_ephemeral_space=use_ephemeral_space,
                    cellbrowser_h5ad=cellbrowser_h5ad)
                sp.generate_report()  # generate scanpy report
                # load files to db and disk
                au = \
                  Analysis_collection_utils(\
                    dbsession_class=igf_session_class,
                    analysis_name=analysis_name,
                    tag_name=species_name,
                    collection_name=experiment_igf_id,
                    collection_type=scanpy_collection_type,
                    collection_table=collection_table,
                    base_path=base_result_dir)                                          # initiate loading of report file
                output_file_list = \
                  au.load_file_to_disk_and_db(\
                    input_file_list=[output_report],
                    withdraw_exisitng_collection=True)                                  # load file to db and disk
                output_report = output_file_list[0]

            self.param(
                'dataflow_params', {
                    'output_report': output_report,
                    'scanpy_h5ad_path': cellbrowser_h5ad
                })  # pass on output report filepath
        except Exception as e:
            message = 'project: {2}, sample:{3}, Error in {0}: {1}'.\
                      format(self.__class__.__name__,
                             e,
                             project_igf_id,
                             sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
Example #8
    def run(self):
        '''
        A method for running picard commands

        :param project_igf_id: A project igf id
        :param sample_igf_id: A sample igf id
        :param experiment_igf_id: An experiment igf id
        :param igf_session_class: A database session class
        :param reference_type: Reference genome collection type, default GENOME_FASTA
        :param reference_refFlat: Reference refFlat annotation collection type, default GENE_REFFLAT
        :param ribosomal_interval_type: Collection type for ribosomal interval list, default RIBOSOMAL_INTERVAL
        :param species_name: species_name
        :param java_exe: Java path
        :param java_param: Java run parameters
        :param picard_jar: Picard jar path
        :param picard_command: Picard command
        :param base_work_dir: Base work directory
        :param copy_input: A toggle for copying input file to temp, 1 for True default 0 for False
        :param use_ephemeral_space: A toggle for temp dir setting, default 0
        :param patterned_flowcell_list: A list of patterned flowcells, default ['HISEQ4000','NEXTSEQ']
        '''
        try:
            temp_output_dir = False
            project_igf_id = self.param_required('project_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            java_exe = self.param_required('java_exe')
            java_param = self.param_required('java_param')
            picard_jar = self.param_required('picard_jar')
            input_files = self.param_required('input_files')
            picard_command = self.param_required('picard_command')
            igf_session_class = self.param_required('igf_session_class')
            species_name = self.param('species_name')
            reference_type = self.param('reference_type')
            reference_refFlat = self.param('reference_refFlat')
            ribosomal_interval_type = self.param('ribosomal_interval_type')
            base_work_dir = self.param_required('base_work_dir')
            analysis_files = self.param_required('analysis_files')
            picard_option = self.param('picard_option')
            patterned_flowcell_list = self.param('patterned_flowcell_list')
            platform_name = self.param_required('platform_name')
            output_prefix = self.param('output_prefix')
            load_metrics_to_cram = self.param('load_metrics_to_cram')
            cram_collection_type = self.param('cram_collection_type')
            seed_date_stamp = self.param_required('date_stamp')
            use_ephemeral_space = self.param('use_ephemeral_space')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if output_prefix is not None:
                output_prefix = \
                  '{0}_{1}'.\
                    format(
                      output_prefix,
                      seed_date_stamp)                                                  # adding seed datestamp to output prefix

            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)                         # get a run work dir
            temp_output_dir = \
              get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get temp work dir
            ref_genome = \
              Reference_genome_utils(
                genome_tag=species_name,
                dbsession_class=igf_session_class,
                genome_fasta_type=reference_type,
                gene_reflat_type=reference_refFlat,
                ribosomal_interval_type=ribosomal_interval_type)                      # setup ref genome utils
            genome_fasta = ref_genome.get_genome_fasta()  # get genome fasta
            ref_flat_file = ref_genome.get_gene_reflat()  # get refFlat file
            ribosomal_interval_file = \
              ref_genome.get_ribosomal_interval()                                     # get ribosomal interval file
            patterned_flowcell = False
            if platform_name in patterned_flowcell_list:  # check for patterned flowcell
                patterned_flowcell = True

            if load_metrics_to_cram and \
               not cram_collection_type:
                raise ValueError(
                    'Cram file collection type is required for loading picard metrics to db'
                )

            picard=\
              Picard_tools(\
                java_exe=java_exe,
                java_param=java_param,
                picard_jar=picard_jar,
                input_files=input_files,
                output_dir=temp_output_dir,
                ref_fasta=genome_fasta,
                patterned_flowcell=patterned_flowcell,
                ref_flat_file=ref_flat_file,
                picard_option=picard_option,
                output_prefix=output_prefix,
                use_ephemeral_space=use_ephemeral_space,
                ribisomal_interval=ribosomal_interval_file)                           # setup picard tool
            temp_output_files,picard_command_line,picard_metrics = \
              picard.run_picard_command(command_name=picard_command)                  # run picard command
            output_file_list = list()
            for source_path in temp_output_files:
                dest_path=\
                  os.path.join(
                    work_dir,
                    os.path.basename(source_path))                                      # get destination filepath
                move_file(source_path=source_path,
                          destinationa_path=dest_path,
                          force=True)  # move files to work dir
                output_file_list.append(dest_path)
            remove_dir(temp_output_dir)
            analysis_files.extend(output_file_list)
            bam_files = list()
            for file in output_file_list:
                if file.endswith('.bam'):
                    bam_files.append(file)

            if load_metrics_to_cram and \
               len(picard_metrics)>0:
                ca = CollectionAdaptor(**{'session_class': igf_session_class})
                attribute_data = \
                  ca.prepare_data_for_collection_attribute(
                    collection_name=experiment_igf_id,
                    collection_type=cram_collection_type,
                    data_list=picard_metrics)                                           # format data for collection attribute table
                ca.start_session()
                try:
                    ca.create_or_update_collection_attributes(\
                      data=attribute_data,
                      autosave=False
                    )                                                                     # load data to collection attribute table
                    ca.commit_session()
                    ca.close_session()
                except:
                    ca.rollback_session()
                    ca.close_session()
                    raise

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'bam_files': bam_files,
                    'seed_date_stamp': seed_date_stamp
                })  # pass on picard output list
            message = \
              'finished picard {0} for {1} {2}'.\
                format(
                  picard_command,
                  project_igf_id,
                  sample_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            message = \
              'Picard {0} command: {1}'.\
                format(
                  picard_command,
                  picard_command_line)
            #self.comment_asana_task(task_name=project_igf_id, comment=message)        # send commandline to Asana
        except Exception as e:
            if temp_output_dir and \
               os.path.exists(temp_output_dir):
                remove_dir(temp_output_dir)

            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  project_igf_id,
                  sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
    def run(self):
        try:
            project_igf_id = self.param_required('project_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            analysis_files = self.param_required('analysis_files')
            multiqc_exe = self.param('multiqc_exe')
            multiqc_options = self.param('multiqc_options')
            multiqc_dir_label = self.param('multiqc_dir_label')
            force_overwrite = self.param('force_overwrite')
            base_results_dir = self.param_required('base_results_dir')
            tag = self.param_required('tag_name')
            analysis_name = self.param_required('analysis_name')
            collection_name = self.param_required('collection_name')
            collection_type = self.param_required('collection_type')
            collection_table = self.param_required('collection_table')
            igf_session_class = self.param_required('igf_session_class')
            multiqc_template_file = self.param_required(
                'multiqc_template_file')
            platform_name = self.param('platform_name')
            tool_order_list = self.param('tool_order_list')
            use_ephemeral_space = self.param('use_ephemeral_space')
            if not isinstance(analysis_files, list) or \
               len(analysis_files) == 0:
                raise ValueError('Failed to run MultiQC for zero analysis list')      # check analysis files

            temp_work_dir = \
              get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get temp work dir
            multiqc_input_file = \
              os.path.join(
                temp_work_dir,
                'multiqc.txt')                                                        # get temp multiqc list
            with open(multiqc_input_file, 'w') as fp:
                for file in analysis_files:
                    if not os.path.exists(file):
                        raise IOError('File {0} not found for multiQC run'.\
                                      format(file))                                         # check filepath

                    fp.write('{}\n'.format(file))  # write file to temp file

            date_stamp = datetime.now().strftime('%d-%b-%Y %H:%M:%S')
            check_file_path(multiqc_template_file)
            multiqc_conf_file = \
              os.path.join(
                temp_work_dir,
                os.path.basename(multiqc_template_file))
            template_env = \
              Environment(
                loader=\
                  FileSystemLoader(
                    searchpath=os.path.dirname(multiqc_template_file)),
                autoescape=select_autoescape(['html', 'xml']))
            multiqc_conf = \
              template_env.\
                get_template(
                  os.path.basename(multiqc_template_file))
            multiqc_conf.\
              stream(
                project_igf_id=project_igf_id,
                sample_igf_id=sample_igf_id,
                platform_name=platform_name,
                tag_name=tag,
                date_stamp=date_stamp,
                tool_order_list=tool_order_list).\
              dump(multiqc_conf_file)
            multiqc_report_title = \
              'Project:{0}'.format(project_igf_id)                                    # base multiqc label
            if sample_igf_id is not None:
                multiqc_report_title = \
                  '{0},Sample:{1}'.\
                    format(
                      multiqc_report_title,
                      sample_igf_id)                                                    # add sample, if it's present

            multiqc_report_title = \
              '{0};tag:{1};date:{2}'.\
                format(
                  multiqc_report_title,
                  tag,
                  get_datestamp_label())                                              # add tag and date stamp
            multiqc_param = self.format_tool_options(
                multiqc_options)  # format multiqc params
            multiqc_cmd = [
                multiqc_exe, '--file-list',
                quote(multiqc_input_file), '--outdir',
                quote(temp_work_dir), '--title',
                quote(multiqc_report_title), '-c',
                quote(multiqc_conf_file)
            ]  # multiqc base parameters
            multiqc_param = \
              [quote(param) for param in multiqc_param]                               # wrap params in quotes
            multiqc_cmd.\
              extend(multiqc_param)                                                   # add additional parameters
            subprocess.\
              check_call(' '.join(multiqc_cmd),shell=True)                            # run multiqc
            multiqc_html = None
            output_list = list()
            for root, _, files in os.walk(top=temp_work_dir):
                for file in files:
                    if fnmatch.fnmatch(file, '*.html'):
                        multiqc_html = os.path.join(
                            root, file)  # get multiqc html path
                        au = \
                          Analysis_collection_utils(
                            dbsession_class=igf_session_class,
                            analysis_name=analysis_name,
                            tag_name=tag,
                            collection_name=collection_name,
                            collection_type=collection_type,
                            collection_table=collection_table,
                            base_path=base_results_dir)
                        output_list = \
                          au.load_file_to_disk_and_db(
                            input_file_list=[multiqc_html],
                            withdraw_exisitng_collection=force_overwrite,
                            force=True,remove_file=True)                                    # load file to db and disk

            self.param('dataflow_params',
                       {'multiqc_html': output_list[0]})                              # add output file to dataflow
        except Exception as e:
            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  project_igf_id,
                  sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
    def upload_analysis_results_and_create_collection(self,
                                                      file_list,
                                                      irods_user,
                                                      project_name,
                                                      analysis_name='default',
                                                      dir_path_list=None,
                                                      file_tag=None):
        '''
        A method for uploading analysis files to irods server

        :param file_list: A list of file paths to upload to irods
        :param irods_user: Irods user name
        :param project_name: Name of the project
        :param analysis_name: A string for analysis name, default is 'default'
        :param dir_path_list: A list of directory structure for irods server, default None for using datestamp
        :param file_tag: A text string for adding tag to collection, default None for only project_name
        '''
        try:
            irods_exe_dir = self.irods_exe_dir
            irods_base_dir = \
              os.path.join(
                self.zone,
                'home',
                irods_user,
                project_name)
            if dir_path_list is not None and \
               isinstance(dir_path_list, list) and \
               len(dir_path_list) >0 :
                irods_base_dir = os.path.join(
                    irods_base_dir,
                    os.path.sep.join(dir_path_list))  # use path from dir list
            else:
                datestamp = get_datestamp_label()
                irods_base_dir = os.path.join(irods_base_dir,
                                              datestamp)  # use datestamp

            if not isinstance(dir_path_list, list) or \
               analysis_name not in dir_path_list:
                irods_base_dir = os.path.join(
                    irods_base_dir,
                    analysis_name)  # add analysis name to the irods dir

            chk_cmd = [os.path.join(irods_exe_dir, 'ils'), irods_base_dir]
            response = subprocess.call(
                chk_cmd)  # check for existing dir in irods
            if response != 0:  # create dir if response is not 0
                make_dir_cmd = [
                    os.path.join(irods_exe_dir, 'imkdir'), '-p',
                    quote(irods_base_dir)
                ]
                subprocess.check_call(make_dir_cmd)  # create destination dir
                chmod_cmd = [
                    os.path.join(irods_exe_dir, 'ichmod'), '-M', 'own',
                    quote(self.igf_user),
                    quote(irods_base_dir)
                ]
                subprocess.check_call(chmod_cmd)  # change directory ownership
                inherit_cmd = [
                    os.path.join(irods_exe_dir, 'ichmod'), '-r', 'inherit',
                    quote(irods_base_dir)
                ]
                subprocess.check_call(inherit_cmd)  # set inherit on the new directory

            for filepath in file_list:
                if not os.path.exists(filepath) or os.path.isdir(filepath):
                    raise IOError('filepath {0} not found or it is not a file'.\
                                  format(filepath))                                       # check filepath before upload

                irods_filepath = os.path.join(irods_base_dir,
                                              os.path.basename(filepath))
                file_chk_cmd = [
                    os.path.join(irods_exe_dir, 'ils'), irods_filepath
                ]
                file_response = subprocess.call(
                    file_chk_cmd)  # check for existing file in irods
                if file_response == 0:
                    file_rm_cmd = [
                        os.path.join(irods_exe_dir, 'irm'), '-rf',
                        quote(irods_filepath)
                    ]
                    subprocess.check_call(
                        file_rm_cmd
                    )  # remove existing file to prevent any clash

                iput_cmd = [
                    os.path.join(irods_exe_dir, 'iput'), '-k', '-f', '-N', '1',
                    '-R',
                    quote(self.irods_resource),
                    quote(filepath),
                    quote(irods_base_dir)
                ]
                subprocess.check_call(
                    iput_cmd
                )  # upload file to irods dir, calculate md5sum and overwrite
                if file_tag is None:
                    file_meta_info = project_name
                else:
                    file_meta_info = '{0} - {1}'.format(project_name, file_tag)

                meta_project_user = [
                    os.path.join(irods_exe_dir, 'imeta'), 'add', '-d',
                    quote(irods_filepath),
                    quote(file_meta_info),
                    quote(irods_user),
                    quote('iRODSUserTagging:Star')
                ]
                subprocess.check_call(
                    meta_project_user)  # add more metadata to file
                meta_30d = [
                    os.path.join(irods_exe_dir, 'isysmeta'), 'mod',
                    quote(irods_filepath),
                    quote('+30d')
                ]
                subprocess.call(meta_30d)  # add metadata for file
                meta_file_retention = [
                    os.path.join(irods_exe_dir, 'imeta'), 'add', '-d',
                    quote(irods_filepath),
                    quote('retention'),
                    quote('30'),
                    quote('days')
                ]
                subprocess.call(
                    meta_file_retention)  # add file retention info
        except:
            raise
Example #11
    def run(self):
        '''
        A method for running STAR alignment
        '''
        try:
            project_igf_id = self.param_required('project_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            run_igf_id = self.param('run_igf_id')
            star_exe = self.param_required('star_exe')
            run_mode = self.param_required('run_mode')
            output_prefix = self.param_required('output_prefix')
            run_thread = self.param('run_thread')
            igf_session_class = self.param_required('igf_session_class')
            species_name = self.param('species_name')
            reference_type = self.param('reference_type')
            reference_gtf_type = self.param('reference_gtf_type')
            fasta_fai_reference_type = self.param('fasta_fai_reference_type')
            star_patameters = self.param('star_patameters')
            two_pass_mode = self.param('two_pass_mode')
            seed_date_stamp = self.param_required('date_stamp')
            use_ephemeral_space = self.param('use_ephemeral_space')
            base_work_dir = self.param_required('base_work_dir')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            if run_igf_id is not None:
                work_dir_prefix = \
                  os.path.join(
                    work_dir_prefix,
                    run_igf_id)

            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)                         # get a run work dir
            ref_genome = \
              Reference_genome_utils(
                genome_tag=species_name,
                dbsession_class=igf_session_class,
                gene_gtf_type=reference_gtf_type,
                fasta_fai_type=fasta_fai_reference_type,
                star_ref_type=reference_type)                                         # setup ref genome utils
            star_ref = ref_genome.get_transcriptome_star()  # get star ref
            gene_gtf = ref_genome.get_gene_gtf()  # get gtf file
            genome_fai = \
              ref_genome.get_genome_fasta_fai()                                       # fetch genomic fasta fai index
            if run_mode == 'generate_aligned_bams':
                if run_igf_id is None:
                    raise ValueError('No Run igf id found')

                r1_read_file = self.param_required('r1_read_file')
                r2_read_file = self.param('r2_read_file')
                input_fastq_list = list()
                input_fastq_list.append(r1_read_file[0])  # get the first R1 input
                if r2_read_file is not None and \
                   len(r2_read_file) > 0:
                    input_fastq_list.append(
                        r2_read_file[0])  # get the first R2 input

                star_obj = \
                  Star_utils(
                    star_exe=star_exe,
                    input_files=input_fastq_list,
                    genome_dir=star_ref,
                    reference_gtf=gene_gtf,
                    output_dir=work_dir,
                    output_prefix=output_prefix,
                    use_ephemeral_space=use_ephemeral_space,
                    threads=run_thread)                                                 # set up star for run
                if two_pass_mode is None:
                    two_pass_mode = True
                elif two_pass_mode == 0:
                    two_pass_mode = False  # reset star two-pass mode

                if isinstance(star_patameters, str):
                    star_patameters = json.loads(star_patameters)                       # convert string param to dict

                genomic_bam,transcriptomic_bam,star_log_file,\
                star_gene_count_file,star_cmd = \
                    star_obj.\
                      generate_aligned_bams(
                        two_pass_mode=two_pass_mode,
                        star_patameters=star_patameters)                                # run star cmd
                self.param(
                    'dataflow_params', {
                        'star_genomic_bam': genomic_bam,
                        'star_transcriptomic_bam': transcriptomic_bam,
                        'star_log_file': star_log_file,
                        'star_gene_count_file': star_gene_count_file,
                        'seed_date_stamp': seed_date_stamp
                    })
            elif run_mode == 'generate_rna_bigwig':
                input_bam = self.param_required('input_bam')
                bedGraphToBigWig_path = self.param_required(
                    'bedGraphToBigWig_path')
                chrom_length_file = genome_fai
                stranded = self.param('stranded')
                star_obj = \
                  Star_utils(
                    star_exe=star_exe,
                    input_files=[input_bam],
                    genome_dir=star_ref,
                    reference_gtf=gene_gtf,
                    output_dir=work_dir,
                    output_prefix=output_prefix,
                    use_ephemeral_space=use_ephemeral_space,
                    threads=run_thread)                                                 # set up star for run
                output_paths,star_cmd = \
                  star_obj.\
                    generate_rna_bigwig(
                      bedGraphToBigWig_path=bedGraphToBigWig_path,
                      chrom_length_file=chrom_length_file,
                      stranded=stranded,)                                               # generate bigwig signal tracks
                self.param('dataflow_params',
                           {'star_bigwigs': output_paths
                            })  # passing bigwig paths to dataflow

            message = \
              'finished star for {0} {1}'.format(
                project_igf_id,
                run_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            message = \
              'STAR {0} {1} command: {2}'.format(
                run_igf_id,
                output_prefix,
                star_cmd)
            self.comment_asana_task(
                task_name=project_igf_id,
                comment=message)  # send commandline to Asana
        except Exception as e:
            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  project_igf_id,
                  sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
    def run(self):
        '''
        A method for running samtools commands

        :param project_igf_id: A project igf id
        :param sample_igf_id: A sample igf id
        :param experiment_igf_id: An experiment igf id
        :param igf_session_class: A database session class
        :param reference_type: Reference genome collection type, default GENOME_FASTA
        :param threads: Number of threads to use for Bam to Cram conversion, default 4
        :param base_work_dir: Base work directory
        :param samtools_command: Samtools command
        :param samFlagInclude: Sam flags to include in filtered bam, default None
        :param samFlagExclude: Sam flags to exclude from the filtered bam, default None
        :param mapq_threshold: Skip alignments with MAPQ smaller than this value, default None
        :param use_encode_filter: For samtools filter, use Encode epigenome filter, i.e. samFlagExclude 1804(PE) / 1796(SE), default False
        :param encodePeExcludeFlag: For samtools filter, Encode exclude flag for PE reads, default 1804
        :param encodeSeExcludeFlag: For samtools filter, Encode exclude flag for SE reads, default 1796
        :param use_ephemeral_space: A toggle for temp dir settings, default 0
        :param copy_input: A toggle for copying input file to temp, 1 for True default 0 for False
        '''
        try:
            temp_output_dir = False
            project_igf_id = self.param_required('project_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            input_files = self.param_required('input_files')
            samtools_exe = self.param_required('samtools_exe')
            reference_type = self.param('reference_type')
            threads = self.param('threads')
            base_work_dir = self.param_required('base_work_dir')
            samtools_command = self.param_required('samtools_command')
            analysis_files = self.param_required('analysis_files')
            output_prefix = self.param_required('output_prefix')
            load_metrics_to_cram = self.param('load_metrics_to_cram')
            cram_collection_type = self.param('cram_collection_type')
            collection_table = self.param('collection_table')
            base_result_dir = self.param('base_result_dir')
            analysis_name = self.param('analysis_name')
            force_overwrite = self.param('force_overwrite')
            samFlagInclude = self.param('samFlagInclude')
            samFlagExclude = self.param('samFlagExclude')
            mapq_threshold = self.param('mapq_threshold')
            library_layout = self.param_required('library_layout')
            use_encode_filter = self.param('use_encode_filter')
            species_name = self.param_required('species_name')
            seed_date_stamp = self.param_required('date_stamp')
            use_ephemeral_space = self.param('use_ephemeral_space')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if output_prefix is not None:
                output_prefix = \
                  '{0}_{1}'.\
                    format(
                      output_prefix,
                      seed_date_stamp)                                               # adding datestamp to the output file prefix

            if use_encode_filter:
                samFlagInclude = None
                if library_layout == 'PAIRED':
                    samFlagExclude = 1804
                else:
                    samFlagExclude = 1796

            if not isinstance(input_files, list) or \
               len(input_files) == 0:
                raise ValueError('No input file found')

            if len(input_files) > 1:
                raise ValueError('More than one input file found: {0}'.\
                                 format(input_files))

            output_bam_cram_list = list()
            input_file = input_files[0]
            temp_output_dir = \
              get_temp_dir(
                use_ephemeral_space=use_ephemeral_space)                              # get temp work dir
            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)                         # get a run work dir
            samtools_cmdline = ''
            temp_output = None
            if samtools_command == 'idxstats':
                temp_output,samtools_cmdline = \
                  run_bam_idxstat(
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    force=True)                                                         # run samtools idxstats
            elif samtools_command == 'flagstat':
                temp_output,samtools_cmdline = \
                  run_bam_flagstat(\
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    threads=threads,
                    force=True)                                                         # run samtools flagstat
            elif samtools_command == 'stats':
                temp_output,samtools_cmdline,stats_metrics = \
                  run_bam_stats(\
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    threads=threads,
                    force=True)                                                         # run samtools stats
                if load_metrics_to_cram and \
                   len(stats_metrics) > 0:
                    ca = CollectionAdaptor(
                        **{'session_class': igf_session_class})
                    attribute_data = \
                      ca.prepare_data_for_collection_attribute(\
                        collection_name=experiment_igf_id,
                        collection_type=cram_collection_type,
                        data_list=stats_metrics)
                    ca.start_session()
                    try:
                        ca.create_or_update_collection_attributes(\
                          data=attribute_data,
                          autosave=False)
                        ca.commit_session()
                        ca.close_session()
                    except Exception as e:
                        ca.rollback_session()
                        ca.close_session()
                        raise ValueError('Failed to load data to db: {0}'.\
                                         format(e))

            elif samtools_command == 'merge':
                if output_prefix is None:
                    raise ValueError(
                        'Missing output filename prefix for merged bam')

                sorted_by_name = self.param('sorted_by_name')
                temp_output = \
                  os.path.join(\
                    work_dir,
                    '{0}_merged.bam'.format(output_prefix))
                samtools_cmdline = \
                  merge_multiple_bam(\
                    samtools_exe=samtools_exe,
                    input_bam_list=input_file,
                    output_bam_path=temp_output,
                    sorted_by_name=sorted_by_name,
                    threads=threads,
                    use_ephemeral_space=use_ephemeral_space,
                    force=True)
            elif samtools_command == 'view_bamToCram':
                if base_result_dir is None:
                    raise ValueError(
                        'base_result_dir is required for CRAM file loading')

                if analysis_name is None:
                    raise ValueError(
                        'analysis_name is required for CRAM file loading')

                ref_genome = \
                  Reference_genome_utils(\
                    genome_tag=species_name,
                    dbsession_class=igf_session_class,
                    genome_fasta_type=reference_type)
                genome_fasta = ref_genome.get_genome_fasta()                            # get genome fasta
                cram_file = \
                  os.path.basename(input_file).\
                    replace('.bam','.cram')                                             # get base cram file name
                cram_file = os.path.join(
                    temp_output_dir,
                    cram_file)  # get cram file path in temp dir
                samtools_cmdline = \
                  convert_bam_to_cram(\
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    reference_file=genome_fasta,
                    cram_path=cram_file,
                    use_ephemeral_space=use_ephemeral_space,
                    threads=threads,
                    force=True,
                    dry_run=False)
                au = \
                  Analysis_collection_utils(\
                    dbsession_class=igf_session_class,
                    analysis_name=analysis_name,
                    tag_name=species_name,
                    collection_name=experiment_igf_id,
                    collection_type=cram_collection_type,
                    collection_table=collection_table,
                    base_path=base_result_dir)
                temp_output_bam_cram_list = \
                  au.load_file_to_disk_and_db(\
                    input_file_list=[cram_file],
                    file_suffix='cram',
                    withdraw_exisitng_collection=force_overwrite)                       # load file to db and disk
                for cram in temp_output_bam_cram_list:
                    index_bam_or_cram(\
                      samtools_exe=samtools_exe,
                      input_path=cram,
                      threads=threads,
                      dry_run=False)
                    index_path = '{0}.crai'.format(cram)
                    output_bam_cram_list.append(cram)
                    output_bam_cram_list.append(index_path)

                if len(output_bam_cram_list) == 0:
                    raise ValueError('No output cram file found')

            elif samtools_command == 'view_filterBam':
                temp_output_bam = \
                  os.path.join(\
                    temp_output_dir,
                    os.path.basename(input_file).replace('.bam','.filtered.bam'))
                samtools_cmdline = \
                  filter_bam_file(
                    samtools_exe=samtools_exe,
                    input_bam=input_file,
                    output_bam=temp_output_bam,
                    samFlagInclude=samFlagInclude,
                    samFlagExclude=samFlagExclude,
                    threads=threads,
                    mapq_threshold=mapq_threshold,
                    index_output=False,
                    dry_run=False)
                dest_path = \
                  os.path.join(\
                    work_dir,
                    os.path.basename(temp_output_bam))
                move_file(\
                  source_path=temp_output_bam,
                  destinationa_path=dest_path,
                  force=True)
                index_bam_or_cram(\
                  samtools_exe=samtools_exe,
                  input_path=dest_path,
                  threads=threads,
                  dry_run=False)
                index_path = '{0}.bai'.format(dest_path)
                output_bam_cram_list.append(dest_path)
                output_bam_cram_list.append(index_path)
            else:
                raise ValueError('Samtools command {0} not supported'.\
                                 format(samtools_command))

            if temp_output is not None:
                dest_path = \
                  os.path.join(\
                    work_dir,
                    os.path.basename(temp_output))
                if dest_path != temp_output:
                    move_file(\
                      source_path=temp_output,
                      destinationa_path=dest_path,
                      force=True)
                analysis_files.append(dest_path)

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'output_bam_cram_list': output_bam_cram_list
                })  # pass on samtools output list
            message = \
              'finished samtools {0} for {1} {2}'.\
                format(
                  samtools_command,
                  project_igf_id,
                  sample_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            message = \
              'finished samtools {0} for {1} {2}: {3}'.\
                format(
                  samtools_command,
                  project_igf_id,
                  sample_igf_id,
                  samtools_cmdline)
            #self.comment_asana_task(task_name=project_igf_id, comment=message)        # send comment to Asana
        except Exception as e:
            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  project_igf_id,
                  sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
    def run(self):
        '''
    A runnable method for running deeptools analysis
    '''
        try:
            project_igf_id = self.param_required('project_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            input_files = self.param_required('input_files')
            threads = self.param('threads')
            base_work_dir = self.param_required('base_work_dir')
            base_results_dir = self.param_required('base_results_dir')
            deeptools_command = self.param_required('deeptools_command')
            analysis_files = self.param_required('analysis_files')
            output_prefix = self.param_required('output_prefix')
            load_signal_bigwig = self.param('load_signal_bigwig')
            signal_collection_type = self.param('signal_collection_type')
            blacklist_reference_type = self.param('blacklist_reference_type')
            species_name = self.param('species_name')
            deeptools_params = self.param('deeptools_params')
            deeptools_bamCov_params = self.param('deeptools_bamCov_params')
            collection_table = self.param('collection_table')
            remove_existing_file = self.param('remove_existing_file')
            withdraw_exisitng_collection = self.param(
                'withdraw_exisitng_collection')
            analysis_name = self.param('analysis_name')
            use_ephemeral_space = self.param('use_ephemeral_space')
            seed_date_stamp = self.param_required('date_stamp')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if output_prefix is not None:
                output_prefix = \
                  '{0}_{1}'.format(
                    output_prefix,
                    seed_date_stamp)                                                    # adding datestamp to the output file prefix

            if not isinstance(input_files, list) or \
               len(input_files) == 0:
                raise ValueError('No input file found')

            signal_files = list()
            work_dir_prefix = \
              os.path.join(\
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = self.get_job_work_dir(
                work_dir=work_dir_prefix)  # get a run work dir
            ref_genome = \
              Reference_genome_utils(\
                genome_tag=species_name,
                dbsession_class=igf_session_class,
                blacklist_interval_type=blacklist_reference_type)                     # setup ref genome utils
            blacklist_bed = \
              ref_genome.get_blacklist_region_bed()                                 # get blacklist interval bed
            if deeptools_command == 'plotCoverage':
                output_raw_counts = \
                  '{0}_{1}.raw.txt'.format(output_prefix,'plotCoverage')
                output_raw_counts = \
                  os.path.join(\
                    work_dir,
                    output_raw_counts)
                plotcov_stdout = \
                  '{0}_{1}.stdout.txt'.format(output_prefix,'plotCoverage')
                plotcov_stdout = \
                  os.path.join(\
                    work_dir,
                    plotcov_stdout)
                output_plot = \
                  '{0}_{1}.pdf'.format(output_prefix,'plotCoverage')
                output_plot = \
                  os.path.join(\
                    work_dir,
                    output_plot)
                deeptools_args = \
                  run_plotCoverage(\
                    bam_files=input_files,
                    output_raw_counts=output_raw_counts,
                    plotcov_stdout=plotcov_stdout,
                    output_plot=output_plot,
                    blacklist_file=blacklist_bed,
                    thread=threads,
                    use_ephemeral_space=use_ephemeral_space,
                    params_list=deeptools_params)
                analysis_files.extend(\
                  [output_raw_counts,plotcov_stdout,output_plot])
            elif deeptools_command == 'bamCoverage':
                output_file = \
                  '{0}_{1}.bw'.format(output_prefix,'bamCoverage')
                output_file = \
                  os.path.join(\
                    work_dir,
                    output_file)
                if deeptools_params is None:
                    deeptools_params = deeptools_bamCov_params
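                    # Note (an assumption, not from the source): deeptools_bamCov_params
                    # would typically carry bamCoverage CLI flags, for example
                    # ['--binSize=1', '--normalizeUsing=RPKM']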

                deeptools_args = \
                  run_bamCoverage(\
                    bam_files=input_files,
                    output_file=output_file,
                    blacklist_file=blacklist_bed,
                    thread=threads,
                    use_ephemeral_space=use_ephemeral_space,
                    params_list=deeptools_params)
                if load_signal_bigwig:
                    au = \
                      Analysis_collection_utils(\
                        dbsession_class=igf_session_class,
                        analysis_name=analysis_name,
                        base_path=base_results_dir,
                        tag_name=species_name,
                        collection_name=experiment_igf_id,
                        collection_type=signal_collection_type,
                        collection_table=collection_table)                                # initiate analysis file loading
                    output_file_list = \
                      au.load_file_to_disk_and_db(\
                        input_file_list=[output_file],
                        remove_file=remove_existing_file,
                        file_suffix='bw',
                        withdraw_exisitng_collection=withdraw_exisitng_collection)        # load file to db and disk
                    analysis_files.extend(output_file_list)
                    signal_files.extend(output_file_list)
                else:
                    analysis_files.append(output_file)
            elif deeptools_command == 'plotFingerprint':
                output_raw_counts = \
                  '{0}_{1}.raw.txt'.format(output_prefix,'plotFingerprint')
                output_raw_counts = \
                  os.path.join(\
                    work_dir,
                    output_raw_counts)
                output_matrics = \
                  '{0}_{1}.metrics.txt'.format(output_prefix,'plotFingerprint')
                output_matrics = \
                  os.path.join(\
                    work_dir,
                    output_matrics)
                output_plot = \
                  '{0}_{1}.pdf'.format(output_prefix,'plotFingerprint')
                output_plot = \
                  os.path.join(\
                    work_dir,
                    output_plot)
                deeptools_args = \
                  run_plotFingerprint(\
                    bam_files=input_files,
                    output_raw_counts=output_raw_counts,
                    output_matrics=output_matrics,
                    output_plot=output_plot,
                    blacklist_file=blacklist_bed,
                    thread=threads,
                    use_ephemeral_space=use_ephemeral_space,
                    params_list=deeptools_params)
                analysis_files.extend(\
                  [output_raw_counts,output_matrics,output_plot])
            else:
                raise ValueError('Deeptools command {0} is not implemented yet'.\
                                 format(deeptools_command))

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'signal_files': signal_files,
                    'seed_date_stamp': seed_date_stamp
                })  # pass on deeptools output list
            message = \
              'finished deeptools {0} for {1} {2}'.format(
                deeptools_command,
                project_igf_id,
                sample_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            message = \
              'Deeptools {0} command: {1}'.format(
                deeptools_command,
                deeptools_args)
            #self.comment_asana_task(task_name=project_igf_id, comment=message)       # send commandline to Asana
        except Exception as e:
            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  project_igf_id,
                  sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
Example #14
    def run(self):
        '''
    A runnable method for running PPQT analysis
    '''
        try:
            project_igf_id = self.param_required('project_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            input_files = self.param_required('input_files')
            rscript_path = self.param_required('rscript_path')
            ppqt_exe = self.param_required('ppqt_exe')
            base_work_dir = self.param_required('base_work_dir')
            base_result_dir = self.param_required('base_result_dir')
            library_strategy = self.param_required('library_strategy')
            analysis_files = self.param_required('analysis_files')
            output_prefix = self.param_required('output_prefix')
            species_name = self.param_required('species_name')
            analysis_name = self.param('analysis_name')
            seed_date_stamp = self.param_required('date_stamp')
            load_metrics_to_cram = self.param('load_metrics_to_cram')
            ppqt_collection_type = self.param('ppqt_collection_type')
            cram_collection_type = self.param('cram_collection_type')
            collection_table = self.param('collection_table')
            force_overwrite = self.param('force_overwrite')
            use_ephemeral_space = self.param('use_ephemeral_space')
            threads = self.param('threads')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if output_prefix is not None:
                output_prefix = '{0}_{1}'.format(
                    output_prefix, seed_date_stamp
                )  # adding datestamp to the output file prefix

            if not isinstance(input_files, list) or \
               len(input_files) == 0:
                raise ValueError('No input file found')

            if len(input_files) > 1:
                raise ValueError('More than one input file found: {0}'.\
                                 format(input_files))

            if analysis_name is None:
                analysis_name = library_strategy  # use library_strategy as default analysis_name

            input_file = input_files[0]
            work_dir_prefix = \
              os.path.join(\
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = self.get_job_work_dir(
                work_dir=work_dir_prefix)  # get a run work dir
            ppqt_obj = \
              Ppqt_tools(\
                rscript_path=rscript_path,
                ppqt_exe=ppqt_exe,
                use_ephemeral_space=use_ephemeral_space,
                threads=threads)
            ppqt_cmd, spp_output, pdf_output, spp_data = \
              ppqt_obj.run_ppqt(\
                input_bam=input_file,
                output_dir=work_dir,
                output_spp_name='{0}_{1}.spp.out'.format(output_prefix,'PPQT'),
                output_pdf_name='{0}_{1}.spp.pdf'.format(output_prefix,'PPQT'))
            analysis_files.append(spp_output)
            au = \
              Analysis_collection_utils(\
                dbsession_class=igf_session_class,
                analysis_name=analysis_name,
                tag_name=species_name,
                collection_name=experiment_igf_id,
                collection_type=ppqt_collection_type,
                collection_table=collection_table,
                base_path=base_result_dir)
            output_ppqt_list = \
              au.load_file_to_disk_and_db(\
                input_file_list=[pdf_output],
                file_suffix='pdf',
                withdraw_exisitng_collection=force_overwrite)                         # load file to db and disk
            if load_metrics_to_cram and \
               len(spp_data) > 0:
                ca = CollectionAdaptor(**{'session_class': igf_session_class})
                attribute_data = \
                  ca.prepare_data_for_collection_attribute(\
                    collection_name=experiment_igf_id,
                    collection_type=cram_collection_type,
                    data_list=spp_data)
                ca.start_session()
                try:
                    ca.create_or_update_collection_attributes(\
                      data=attribute_data,
                      autosave=False)
                    ca.commit_session()
                    ca.close_session()
                except Exception as e:
                    ca.rollback_session()
                    ca.close_session()
                    raise ValueError('Failed to load data to db: {0}'.\
                                     format(e))

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'output_ppqt_list': output_ppqt_list
                })  # pass on PPQT output list
            message='finished PPQT for {0} {1}'.\
                    format(project_igf_id,
                           sample_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            message='finished PPQT for {0} {1}: {2}'.\
                    format(project_igf_id,
                           sample_igf_id,
                           ppqt_cmd)
            self.comment_asana_task(task_name=project_igf_id,
                                    comment=message)  # send comment to Asana
        except Exception as e:
            message='project: {2}, sample:{3}, Error in {0}: {1}'.\
                    format(self.__class__.__name__,
                           e,
                           project_igf_id,
                           sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
Example #15
    def run(self):
        '''
    A method for running BWA alignment
    '''
        try:
            project_igf_id = self.param_required('project_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            run_igf_id = self.param_required('run_igf_id')
            bwa_exe = self.param_required('bwa_exe')
            samtools_exe = self.param_required('samtools_exe')
            r1_read_file = self.param_required('r1_read_file')
            r2_read_file = self.param('r2_read_file')
            run_thread = self.param('run_thread')
            output_prefix = self.param_required('output_prefix')
            igf_session_class = self.param_required('igf_session_class')
            species_name = self.param('species_name')
            reference_type = self.param('reference_type')
            base_work_dir = self.param_required('base_work_dir')
            parameter_options = self.param('parameter_options')
            seed_date_stamp = self.param_required('date_stamp')
            use_ephemeral_space = self.param('use_ephemeral_space')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            input_fastq_list = list()
            input_fastq_list.append(r1_read_file[0])
            if r2_read_file is not None and \
               len(r2_read_file)>0:
                input_fastq_list.append(r2_read_file[0])

            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id,
                run_igf_id)
            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)                         # get a run work dir
            ref_genome = \
              Reference_genome_utils(
                genome_tag=species_name,
                dbsession_class=igf_session_class,
                bwa_ref_type=reference_type)                                          # setup ref genome utils
            bwa_ref = ref_genome.get_genome_bwa()  # get bwa ref
            bwa_obj = \
              BWA_util(
                bwa_exe=bwa_exe,
                samtools_exe=samtools_exe,
                ref_genome=bwa_ref,
                input_fastq_list=input_fastq_list,
                output_dir=work_dir,
                output_prefix=output_prefix,
                bam_output=True,
                use_ephemeral_space=use_ephemeral_space,
                thread=run_thread)                                                    # set up bwa for run
            if isinstance(parameter_options, str):
                parameter_options = json.loads(
                    parameter_options)  # convert string param to dict

            final_output_file,bwa_cmd = \
              bwa_obj.\
                run_mem(parameter_options=parameter_options)                          # run bwa mem
            self.param('dataflow_params', {
                'bwa_bam': final_output_file,
                'seed_date_stamp': seed_date_stamp
            })  # pass on bwa output list
            message = \
              'finished bwa {0} {1}'.\
                format(
                  project_igf_id,
                  run_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            self.comment_asana_task(task_name=project_igf_id,
                                    comment=message)  # send comment to Asana
            message = \
              'Bwa {0} {1}'.\
                format(
                  run_igf_id,
                  bwa_cmd)
            self.comment_asana_task(
                task_name=project_igf_id,
                comment=message)  # send commandline to Asana
        except Exception as e:
            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  project_igf_id,
                  sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
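As an aside, the json.loads handling above means parameter_options may arrive either as a dict or as its JSON-encoded string form; a hypothetical illustration (the bwa flag shown is an example only, not taken from the source):

import json

parameter_options = '{"-M": ""}'                                                    # hypothetical JSON-encoded option string from a pipeline config
if isinstance(parameter_options, str):
    parameter_options = json.loads(parameter_options)                               # convert string param to dict, as in run() above
print(parameter_options)                                                            # {'-M': ''}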
Example #16
    def run(self):
        '''
    A runnable method for running RSEM expression analysis
    '''
        try:
            project_igf_id = self.param_required('project_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            rsem_exe_dir = self.param_required('rsem_exe_dir')
            library_layout = self.param_required('library_layout')
            reference_type = self.param_required('reference_type')
            igf_session_class = self.param_required('igf_session_class')
            output_prefix = self.param_required('output_prefix')
            base_work_dir = self.param_required('base_work_dir')
            input_bams = self.param_required('input_bams')
            strandedness = self.param('strandedness')
            threads = self.param('threads')
            use_ephemeral_space = self.param('use_ephemeral_space')
            memory_limit = self.param('memory_limit')
            rsem_options = self.param('rsem_options')
            force_overwrite = self.param('force_overwrite')
            species_name = self.param('species_name')
            seed_date_stamp = self.param_required('date_stamp')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if not isinstance(input_bams,list) or \
               len(input_bams) != 1:
                raise ValueError('Expecting one input bam for rsem, got: {0}'.\
                                 format(len(input_bams)))

            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)                         # get a run work dir
            ref_genome = \
              Reference_genome_utils(
                genome_tag=species_name,
                dbsession_class=igf_session_class,
                gene_rsem_type=reference_type)
            rsem_ref = \
              ref_genome.get_transcriptome_rsem()                                   # fetch rsem reference
            paired_end = (library_layout == 'PAIRED')                               # check for paired-end library layout

            rsem_obj = \
              RSEM_utils(
                rsem_exe_dir=rsem_exe_dir,
                reference_rsem=rsem_ref,
                input_bam=input_bams[0],
                threads=threads,
                use_ephemeral_space=use_ephemeral_space,
                memory_limit=memory_limit)                                            # prepare rsem for run
            rsem_cmd,rsem_output_list,rsem_log_file = \
              rsem_obj.\
                run_rsem_calculate_expression(
                  output_dir=work_dir,
                  output_prefix=output_prefix,
                  paired_end=paired_end,
                  strandedness=strandedness,
                  options=rsem_options,
                  force=force_overwrite)
            if not isinstance(rsem_output_list,list) or \
               len(rsem_output_list)==0:
                raise ValueError(
                    'No RSEM output files found')  # check output files

            self.param(
                'dataflow_params', {
                    'rsem_output': rsem_output_list,
                    'rsem_log_file': rsem_log_file,
                    'seed_date_stamp': seed_date_stamp
                })  # pass on rsem output list
            message = \
              'Finished RSEM {0} for {1}'.format(
                project_igf_id,
                sample_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            message = \
              'RSEM {0} command: {1}'.format(
                experiment_igf_id,
                rsem_cmd)
            #self.comment_asana_task(task_name=project_igf_id, comment=message)        # send commandline to Asana
        except Exception as e:
            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  project_igf_id,
                  sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
 def test_get_datestamp_label(self):
     date_str = '2018-08-23 15:15:01'
     self.assertEqual(get_datestamp_label(date_str), '20180823')
     self.assertEqual(get_datestamp_label(parse(date_str)), '20180823')
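For reference, a minimal sketch of a get_datestamp_label-style helper that would satisfy the assertions above (an assumption for illustration, not the library's actual implementation):

from datetime import datetime
from dateutil.parser import parse

def get_datestamp_label(datestamp=None):
    '''
    Sketch: accept a datetime, a parseable date string, or None (use now)
    and return a zero-padded YYYYMMDD label, e.g. '20180823'
    '''
    if datestamp is None:
        datestamp = datetime.now()
    elif isinstance(datestamp, str):
        datestamp = parse(datestamp)                                                # e.g. '2018-08-23 15:15:01'
    return datestamp.strftime('%Y%m%d')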
 def run(self):
     '''
 A method for running the featureCounts tool
 '''
     try:
         project_igf_id = self.param_required('project_igf_id')
         experiment_igf_id = self.param_required('experiment_igf_id')
         sample_igf_id = self.param_required('sample_igf_id')
         featurecounts_exe = self.param_required('featurecounts_exe')
         input_files = self.param_required('input_files')
         reference_gtf = self.param('reference_gtf')
         base_work_dir = self.param_required('base_work_dir')
         igf_session_class = self.param_required('igf_session_class')
         species_name = self.param_required('species_name')
         parameter_options = self.param('parameter_options')
         run_thread = self.param('run_thread')
         use_ephemeral_space = self.param('use_ephemeral_space')
         output_prefix = self.param_required('output_prefix')
         seed_date_stamp = self.param_required('date_stamp')
         seed_date_stamp = get_datestamp_label(seed_date_stamp)
         work_dir_prefix = \
           os.path.join(
             base_work_dir,
             project_igf_id,
             sample_igf_id,
             experiment_igf_id)
         work_dir = \
           self.get_job_work_dir(work_dir=work_dir_prefix)                         # get a run work dir
         output_prefix = \
           '{0}_{1}'.format(
             output_prefix,
             seed_date_stamp)
         output_file = \
           os.path.join(
             work_dir,
             output_prefix)
         ref_genome = \
           Reference_genome_utils(
             genome_tag=species_name,
             dbsession_class=igf_session_class,
             gene_gtf_type=reference_gtf)                                          # setup ref genome utils
         gene_gtf = ref_genome.get_gene_gtf()  # get gtf file
         summary_file,featureCount_cmd = \
           run_featureCounts(
             featurecounts_exe=featurecounts_exe,
             input_gtf=gene_gtf,
             input_bams=input_files,
             output_file=output_file,
             thread=run_thread,
             use_ephemeral_space=use_ephemeral_space,
             options=parameter_options)
         self.param(
             'dataflow_params', {
                 'featureCounts_output': output_file,
                 'featureCounts_summary': summary_file,
                 'seed_date_stamp': seed_date_stamp
             })
         message = \
           'finished featureCounts for {0} {1}'.format(
             project_igf_id,
             experiment_igf_id)
         self.post_message_to_slack(message,
                                    reaction='pass')  # send log to slack
         message = \
           'featureCounts {0} command: {1}'.format(
             experiment_igf_id,
             featureCount_cmd)
         self.comment_asana_task(
             task_name=project_igf_id,
             comment=message)  # send commandline to Asana
     except Exception as e:
         message = \
           'project: {2}, sample:{3}, Error in {0}: {1}'.\
             format(
               self.__class__.__name__,
               e,
               project_igf_id,
               sample_igf_id)
         self.warning(message)
         self.post_message_to_slack(
             message, reaction='fail')  # post msg to slack for failed jobs
         raise
    def run(self):
        '''
    A method for running Fastp commands
    
    :param project_igf_id: A project_igf_id from dataflow
    :param experiment_igf_id: A experiment_igf_id from dataflow
    :param sample_igf_id: A sample_igf_id from dataflow
    :param fastp_exe: Fastp exe path from analysis config
    :param input_fastq_list: Input fastq list from dataflow
    :param base_work_dir: Base work dir path from analysis config
    :param run_thread: Number of threads for fastp run, default 1
    :param split_fastq: Enable splitting fastq files, default None
    :param split_by_lines_count: Number of fastq lines to be used if split_fastq is True, default 5000000
    :param fastp_options_list: A list of fastp tool options, default ['-a=auto','--qualified_quality_phred=15','--length_required=15']
    :param platform_name: Sequencing platform name from dataflow
    :param use_ephemeral_space: A toggle for temp dir setting, default 0
    :param polyg_platform_list: A list of Illumina platforms which emit poly Gs for empty cycles, default ['NextSeq','NOVASEQ6000']
    :param enable_polyg_trim: Enable Fastp poly G trim, default False
    '''
        try:
            project_igf_id = self.param_required('project_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            run_igf_id = self.param_required('run_igf_id')
            fastp_exe = self.param_required('fastp_exe')
            input_fastq_list = self.param_required('input_fastq_list')
            base_work_dir = self.param_required('base_work_dir')
            run_thread = self.param('run_thread')
            split_fastq = self.param('split_fastq')
            split_by_lines_count = self.param('split_by_lines_count')
            fastp_options_list = self.param('fastp_options_list')
            platform_name = self.param_required('platform_name')
            polyg_platform_list = self.param('polyg_platform_list')
            enable_polyg_trim = self.param('enable_polyg_trim')
            use_ephemeral_space = self.param('use_ephemeral_space')
            seed_date_stamp = self.param_required('date_stamp')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            work_dir_prefix = \
              os.path.join(\
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = self.get_job_work_dir(
                work_dir=work_dir_prefix)  # get a run work dir
            split_fastq = \
              bool(split_fastq)                                                     # default to False when unset
            if platform_name in polyg_platform_list:
                enable_polyg_trim = True  # enable poly G trim for newer Illumina platforms

            fastp_obj = \
              Fastp_utils(\
                fastp_exe=fastp_exe,
                input_fastq_list=input_fastq_list,
                log_output_prefix=run_igf_id,
                output_dir=work_dir,
                run_thread=run_thread,
                use_ephemeral_space=use_ephemeral_space,
                enable_polyg_trim=enable_polyg_trim,
                split_by_lines_count=split_by_lines_count,
                fastp_options_list=fastp_options_list)                                # setup fastp tool for run
            output_read1, output_read2, output_html_file, output_json_file, _ = \
              fastp_obj.\
                run_adapter_trimming(split_fastq=split_fastq)                         # run fastp trimming
            self.param(
                'dataflow_params', {
                    'output_read1': output_read1,
                    'output_read2': output_read2,
                    'output_html_file': output_html_file,
                    'output_json_file': output_json_file,
                    'seed_date_stamp': seed_date_stamp
                })  # pass on fastp output list
            message = 'finished fastp for {0} {1}'.\
                      format(project_igf_id,
                             sample_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
        except Exception as e:
            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.\
              format(\
                self.__class__.__name__,
                e,
                project_igf_id,
                sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
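The defaults listed in the fastp docstring above would typically be declared via the runnable's param_defaults; a minimal sketch, assuming the usual eHive convention of overriding param_defaults() (the class and base-class names are hypothetical, the values are taken from the docstring):

class RunFastp(IGFBaseProcess):                                                     # hypothetical names
    def param_defaults(self):
        params_dict = super(RunFastp, self).param_defaults()
        params_dict.update({
            'run_thread': 1,
            'split_fastq': None,
            'split_by_lines_count': 5000000,
            'fastp_options_list': ['-a=auto',
                                   '--qualified_quality_phred=15',
                                   '--length_required=15'],
            'use_ephemeral_space': 0,
            'polyg_platform_list': ['NextSeq', 'NOVASEQ6000'],
            'enable_polyg_trim': False})
        return params_dict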