except:
            raise


if __name__ == '__main__':
    from igf_data.igfdb.igfTables import Base
    from igf_data.utils.dbutils import read_dbconf_json
    from igf_data.utils.fileutils import get_temp_dir
    from igf_data.utils.fileutils import remove_dir

    dbparams = read_dbconf_json('data/dbconfig.json')
    dbname = dbparams['dbname']
    if os.path.exists(dbname):
        os.remove(dbname)

    temp_dir = get_temp_dir()
    base = BaseAdaptor(**dbparams)
    Base.metadata.create_all(base.engine)
    base.start_session()
    collection_data = [{
        'name': 'IGF001_MISEQ',
        'type': 'ALIGNMENT_CRAM',
        'table': 'experiment'
    }, {
        'name': 'IGF002_MISEQ',
        'type': 'ALIGNMENT_CRAM',
        'table': 'experiment'
    }]

    ca = CollectionAdaptor(**{'session': base.session})
    ca.store_collection_and_attribute_data(data=collection_data, autosave=True)
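    # Hedged completion of this test harness: close the db session and clean
    # up the temp dir and sqlite file created above (not part of the original
    # snippet; uses only helpers already imported here).
    base.close_session()
    remove_dir(temp_dir)
    if os.path.exists(dbname):
        os.remove(dbname)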
    def check_lane_effect_and_log_report(self, project_name, sample_name,
                                         output_file):
        '''
    A function for generating a batch effect report for a sample and project

    :param project_name: A project name for the report file
    :param sample_name: A sample name for the report file
    :param output_file: Path of the output report file
    '''
        try:
            if self.strand_info not in self.allowed_strands:
                raise ValueError('{0} is not a valid strand'.format(
                    self.strand_info))

            temp_dir = get_temp_dir(use_ephemeral_space=False)
            temp_merged_output = os.path.join(temp_dir, 'merged.csv')
            temp_cpm_output = os.path.join(temp_dir, 'merged_cpm.csv')
            temp_png_output = os.path.join(temp_dir, 'plot.png')
            temp_clustermap = os.path.join(temp_dir, 'clustermap.png')
            temp_corr = os.path.join(temp_dir, 'corr.png')
            temp_pca_flowcell = os.path.join(temp_dir, 'pca_flowcell.png')
            temp_pca_flowcell_lane = os.path.join(temp_dir,
                                                  'pca_flowcell_lane.png')
            temp_html_report = os.path.join(
                temp_dir, os.path.basename(self.template_file))
            check_file_path(self.input_json_file)
            check_file_path(self.rscript_path)
            check_file_path(self.batch_effect_rscript_path)
            with open(self.input_json_file, 'r') as json_data:
                input_list = json.load(json_data)
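            # input_list is expected to be a list of dicts, one entry per lane, e.g.
            # [{'file': '...ReadsPerGene.out.tab', 'flowcell': 'HXXXXXX', 'lane': 1}, ...]
            # (keys inferred from the entry.get() calls below; values are illustrative)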

            if len(input_list) < 2:
                raise ValueError(
                    'At least two input files are required for lane-level batch effect checking'
                )

            gene_name_label = 'gene_name'
            final_df = pd.DataFrame()
            for entry in input_list:
                file = entry.get('file')
                flowcell = entry.get('flowcell')
                lane = entry.get('lane')
                if file is None or \
                   flowcell is None or \
                   lane is None:
                    raise ValueError('Missing required info for batch effect check: {0}'.\
                                     format(entry))
                unstranded_label = 'unstranded_{0}_{1}'.format(flowcell, lane)
                reverse_strand_label = 'reverse_strand_{0}_{1}'.format(
                    flowcell, lane)
                forward_strand_label = 'forward_strand_{0}_{1}'.format(
                    flowcell, lane)
                data=pd.read_csv(\
                          file,
                          sep='\t',
                          header=None,
                          skiprows=4,
                          index_col=False,
                          names=[gene_name_label,
                                 unstranded_label,
                                 forward_strand_label,
                                 reverse_strand_label])
                if self.strand_info == 'reverse_strand':
                    data = data[[gene_name_label, reverse_strand_label]]
                    data = data[data[reverse_strand_label] > self.read_threshold]     # filter out low count genes
                elif self.strand_info == 'forward_strand':
                    data = data[[gene_name_label, forward_strand_label]]
                    data = data[data[forward_strand_label] > self.read_threshold]     # filter out low count genes
                elif self.strand_info == 'unstranded':
                    data = data[[gene_name_label, unstranded_label]]
                    data = data[data[unstranded_label] > self.read_threshold]         # filter out low count genes
                if len(final_df.index) == 0:
                    final_df = copy(data)
                else:
                    final_df=final_df.\
                             merge(data,
                                   how='outer',
                                   on=gene_name_label)

            final_df = final_df.dropna().set_index(
                gene_name_label)  # remove any row with NA values from df
            final_df.\
            astype(float).\
            to_csv(temp_merged_output,index=True)                                     # dump raw counts as csv file
            rscript_cmd = [
                quote(self.rscript_path),
                quote(self.batch_effect_rscript_path),
                quote(temp_merged_output),
                quote(temp_cpm_output),
                quote(temp_png_output)
            ]
            subprocess.check_call(' '.join(rscript_cmd),
                                  shell=True)  # run r script for cpm counts
            check_file_path(temp_cpm_output)  # check output file
            mod_data=pd.read_csv(temp_cpm_output).\
                     rename(columns={'Unnamed: 0':gene_name_label}).\
                     set_index(gene_name_label)                                       # read output file
            sns_fig = sns.clustermap(mod_data, figsize=(10, 10))
            sns_fig.fig.savefig(temp_clustermap)
            check_file_path(temp_clustermap)  # plot clustermap
            corr_df = mod_data.corr()
            cmap = sns.diverging_palette(220, 10, as_cmap=True)
            fig, ax = plt.subplots(figsize=(7, 7))
            sns.heatmap(corr_df,
                        cmap=cmap,
                        square=True,
                        linewidths=.5,
                        cbar_kws={"shrink": .4},
                        ax=ax)
            plt.savefig(temp_corr)
            check_file_path(temp_corr)  # plot correlation values
            pca = PCA(n_components=2)
            X_r = pca.fit(mod_data.T).transform(mod_data.T)
            pattern1 = re.compile(
                r'(reverse_strand|forward_strand|unstranded)_(\S+)_([1-8])')
            pattern2 = re.compile(
                r'(reverse_strand|forward_strand|unstranded)_(\S+_[1-8])')
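            # e.g. an illustrative column label 'unstranded_HXXXXXX_3' yields
            # flowcell 'HXXXXXX' via pattern1 and flowcell_lane 'HXXXXXX_3'
            # via pattern2; unmatched labels fall back to the raw label below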
            results_df=pd.DataFrame(\
                           {'PCA1':X_r[:,0],
                            'PCA2':X_r[:,1],
                            'flowcell':[re.match(pattern1,label).group(2)
                                        if re.match(pattern1,label) else label
                                          for label in mod_data.T.index],
                            'flowcell_lane':[re.match(pattern2,label).group(2)
                                             if re.match(pattern2,label) else label
                                               for label in mod_data.T.index]
                           })
            pca_plot = sns.lmplot(x='PCA1',
                                  y='PCA2',
                                  hue='flowcell',
                                  data=results_df,
                                  fit_reg=False)
            pca_plot.fig.savefig(temp_pca_flowcell)  # plot flowcell level pca
            pca_plot = sns.lmplot(x='PCA1',
                                  y='PCA2',
                                  hue='flowcell_lane',
                                  data=results_df,
                                  fit_reg=False)
            pca_plot.fig.savefig(
                temp_pca_flowcell_lane)  # plot flowcell-lane level pca
            template_env=Environment(\
                           loader=FileSystemLoader(\
                                    searchpath=os.path.dirname(self.template_file)),
                           autoescape=select_autoescape(['xml']))
            template_file=template_env.\
                          get_template(os.path.basename(self.template_file))
            template_file.\
              stream(ProjectName=project_name,
                     SampleName=sample_name,
                     mdsPlot=self._encode_png_image(png_file=temp_png_output),
                     clustermapPlot=self._encode_png_image(png_file=temp_clustermap),
                     corrPlot=self._encode_png_image(png_file=temp_corr),
                     pca1Plot=self._encode_png_image(png_file=temp_pca_flowcell),
                     pca2Plot=self._encode_png_image(png_file=temp_pca_flowcell_lane),
                    ).\
              dump(temp_html_report)
            copy_local_file(temp_html_report, output_file, force=True)
        except:
            raise
    def run(self):
        try:
            samplesheet_file = self.param_required('original_samplesheet')
            seqrun_igf_id = self.param_required('seqrun_igf_id')
            fastq_dir = self.param_required('fastq_dir')
            model_name = self.param_required('model_name')
            project_name = self.param_required('project_name')
            stats_filename = self.param('stats_filename')
            strict_check = self.param('strict_check')
            use_ephemeral_space = self.param('use_ephemeral_space')

            work_dir = \
              get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get work directory name
            stats_json_file = \
              os.path.join(
                fastq_dir,
                stats_filename)                                                       # get stats file path
            barcode_stat = \
              CheckSequenceIndexBarcodes(
                stats_json_file=stats_json_file,
                samplesheet_file=samplesheet_file,
                platform_name=model_name)                                             # create check instance
            barcode_stat.\
              validate_barcode_stats(
                work_dir=work_dir, \
                strict_check=strict_check)                                            # validate seqrun stats
            self.param('dataflow_params',
                       {'barcode_qc_stats': 'PASS'})                              # seed dataflow param for the qc passed lanes
        except IndexBarcodeValidationError as e:
            self.param(
                'dataflow_params',
                {'barcode_qc_stats': 'FAIL'})  # seed dataflow for failed lanes
            message = \
              'project: {0}, message:{1}'.\
                format(
                  project_name,
                  e.message)
            if len(e.plots) == 0:
                self.post_message_to_slack(\
                  message=e.message,
                  reaction='fail')                                                      # only post msg to slack if no plots
                self.comment_asana_task(\
                  task_name=seqrun_igf_id,
                  comment=e.message)                                                    # log to asana task
            else:
                for plot_file in e.plots:
                    self.post_file_to_slack(
                        message=message,
                        filepath=plot_file)  # posting plot files to slack
                    self.upload_file_to_asana_task(\
                      task_name=seqrun_igf_id,
                      filepath=plot_file, \
                      comment=message)                                                    # upload plots to asana
        except Exception as e:
            message = \
              'seqrun: {2}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  seqrun_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
    def run_mem(self,
                mem_cmd='mem',
                parameter_options=("-M", ""),
                samtools_cmd='view',
                dry_run=False):
        '''
    A method for running Bwa mem and generating the output alignment

    :param mem_cmd: Bwa mem command, default mem
    :param parameter_options: Bwa mem options as a tuple or dict, default ("-M", "")
    :param samtools_cmd: Samtools view command, default view
    :param dry_run: A toggle for returning the bwa cmd without running it, default False
    :returns: An alignment file path and bwa run cmd
    '''
        try:
            self._run_checks()  # check input params
            read1_list,read2_list = \
              identify_fastq_pair(\
                input_list=self.input_fastq_list)                                     # fetch input files
            temp_dir = \
              get_temp_dir(
                use_ephemeral_space=self.use_ephemeral_space)
            bwa_cmd = [
                quote(self.bwa_exe),
                quote(mem_cmd), '-t',
                quote(str(self.thread))
            ]
            if isinstance(parameter_options,tuple) and \
               len(parameter_options)>0 :
                parameter_options = \
                  {item:parameter_options[index+1]
                     for index, item in enumerate(parameter_options)
                       if index %2==0}                                                  # convert default param tuple to a dict
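                # e.g. ("-M", "", "-k", "19") becomes {"-M": "", "-k": "19"};
                # empty-string values are dropped when the dict is flattened below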

            if isinstance(parameter_options,dict) and \
               len(parameter_options)>0:
                parameter_options = [
                    quote(str(field))
                    for key, val in parameter_options.items()
                    for field in [key, val] if field != ''
                ]  # flatten param list
                bwa_cmd.extend(parameter_options)  # add mem specific options

            bwa_cmd.append(quote(self.ref_genome))
            bwa_cmd.append(quote(read1_list[0]))  # add read 1
            if len(read2_list) > 0:
                bwa_cmd.append(quote(read2_list[0]))  # add read 2

            if self.bam_output:
                temp_output_path = \
                  os.path.join(
                    temp_dir,
                    '{0}.bam'.format(self.output_prefix))                               # bam output
                samtools_cmd = [
                    quote(self.samtools_exe),
                    quote(samtools_cmd),
                    quote('--threads'),
                    quote(str(self.thread)),
                    quote('-bo'),
                    quote(temp_output_path)
                ]
                if dry_run:
                    return bwa_cmd, samtools_cmd  # return bwa and samtools cmd

                with subprocess.Popen(bwa_cmd, stdout=subprocess.PIPE) as proc:
                    subprocess.check_call(
                        ' '.join(samtools_cmd),
                        shell=True,
                        stdin=proc.stdout)                                            # wait for samtools to finish writing the bam

            else:
                temp_output_path = \
                  os.path.join(
                    temp_dir,
                    '{0}.sam'.format(self.output_prefix))                               # sam output
                if dry_run:
                    return bwa_cmd

                with open(temp_output_path, 'w') as sam:
                    with subprocess.Popen(bwa_cmd,
                                          stdout=subprocess.PIPE) as proc:
                        sam.write(proc.stdout.read().decode(
                            'utf-8'))  # writing sam output

            if os.path.exists(temp_output_path):
                final_output_file = \
                  os.path.join(
                    self.output_dir,
                    os.path.basename(temp_output_path))
                copy_local_file(source_path=temp_output_path,
                                destinationa_path=final_output_file)
            else:
                raise IOError('Alignment temp output missing')

            return final_output_file, bwa_cmd
        except:
            raise
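    # Hedged usage sketch: with dry_run=True, run_mem returns the command(s)
    # without executing anything, e.g.
    #   bwa_cmd, samtools_cmd = self.run_mem(dry_run=True)   # when bam_output is set
    #   bwa_cmd = self.run_mem(dry_run=True)                 # sam output path
    # (illustrative calls; return shapes follow the dry_run branches above)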
    def run(self):
        try:
            seqrun_igf_id = self.param_required('seqrun_igf_id')
            project_name = self.param_required('project_name')
            seqrun_date = self.param_required('seqrun_date')
            flowcell_id = self.param_required('flowcell_id')
            remote_project_path = self.param_required('remote_project_path')
            remote_user = self.param_required('remote_user')
            remote_host = self.param_required('remote_host')
            template_dir = self.param_required('template_dir')
            page_type = self.param_required('page_type')
            fastq_dir = self.param('fastq_dir')
            multiqc_remote_file = self.param('multiqc_remote_file')
            lane_index_info = self.param('lane_index_info')
            qc_template_path = self.param('qc_template_path')
            project_template = self.param('project_template')
            undetermined_template = self.param('undetermined_template')
            sample_template = self.param('sample_template')
            project_filename = self.param('project_filename')
            sample_filename = self.param('sample_filename')
            undetermined_filename = self.param('undetermined_filename')
            report_html = self.param('report_html')
            remote_ftp_base = self.param('remote_ftp_base')
            use_ephemeral_space = self.param('use_ephemeral_space')

            if page_type not in ['project', 'sample', 'undetermined']:
                raise ValueError(
                    'Page type {0} is not defined yet'.format(page_type))

            qc_template_path = \
              os.path.join(template_dir,qc_template_path)
            remote_file_path = \
              os.path.join(\
                remote_project_path,
                project_name,
                seqrun_date,
                flowcell_id)
            if lane_index_info is not None:
                remote_file_path = \
                  os.path.join(\
                    remote_file_path,
                    lane_index_info)                                                    # generic remote path, lane info is none for project

            template_env = \
              Environment(
                loader=FileSystemLoader(searchpath=qc_template_path),
                autoescape=select_autoescape(['xml']))                                # set template env

            #remote_chk_cmd=['ssh',\
            #                '{0}@{1}'.\
            #                format(remote_user,\
            #                       remote_host),\
            #                'ls']

            #remote_rm_cmd=['ssh',\
            #                '{0}@{1}'.\
            #                format(remote_user,\
            #                       remote_host),\
            #                'rm', \
            #                '-f']

            temp_work_dir = \
              get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get a temp dir
            report_output_file = None
            qc_file_info = dict()
            qc_file_info.\
              update({
                'project_name':project_name,
                'flowcell': flowcell_id,
              })
            if page_type == 'project':  # prepare project page
                (headerdata, qcmain) = \
                  self._process_projects_data()                                       # get required data for project qc page

                template_file = \
                  template_env.get_template(project_template)
                report_output_file = \
                  os.path.join(\
                    temp_work_dir,
                    project_filename)
                template_file.\
                stream(\
                  ProjectName=project_name,
                  SeqrunDate=seqrun_date,
                  FlowcellId=flowcell_id,
                  headerdata=headerdata,
                  qcmain=qcmain).\
                dump(report_output_file)
                os.chmod(report_output_file, mode=0o754)

                #remote_chk_cmd.append(os.path.join(remote_file_path,project_filename))
                #remote_rm_cmd.append(os.path.join(remote_file_path,project_filename))

            elif page_type == 'undetermined':  # prepare undetermined fastq page
                (headerdata, qcmain) = \
                  self._process_undetermined_data(remote_file_path)                     # get required data for undetermined qc page
                template_file = \
                  template_env.get_template(undetermined_template)
                report_output_file = \
                  os.path.join(\
                    temp_work_dir,
                    undetermined_filename)
                template_file.\
                stream(
                  ProjectName=project_name,
                  SeqrunDate=seqrun_date,
                  FlowcellId=flowcell_id,
                  headerdata=headerdata,
                  qcmain=qcmain).\
                dump(report_output_file)
                os.chmod(report_output_file, mode=0o754)
                #remote_chk_cmd.append(os.path.join(remote_file_path,undetermined_filename))
                #remote_rm_cmd.append(os.path.join(remote_file_path,undetermined_filename))

            elif page_type == 'sample':  # prepare sample page
                if lane_index_info is None:
                    raise ValueError('Missing lane and index information')

                if fastq_dir is None:
                    raise ValueError('Missing required fastq_dir')

                (headerdata, qcmain) = \
                  self._process_samples_data()                                          # get required data for sample qc page
                (lane_id,index_length) = \
                  lane_index_info.split('_',1)                                          # get lane and index info
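                # e.g. a lane_index_info value like '1_8' splits into lane_id
                # '1' and index_length '8' (illustrative; format assumed from
                # the split above)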
                template_file = \
                  template_env.get_template(sample_template)                            # get template file
                report_output_file = \
                  os.path.join(\
                    temp_work_dir,
                    sample_filename)
                template_file.\
                  stream(
                    ProjectName=project_name,
                    SeqrunDate=seqrun_date,
                    FlowcellId=flowcell_id,
                    Lane=lane_id,
                    IndexBarcodeLength=index_length,
                    headerdata=headerdata,
                    qcmain=qcmain).\
                  dump(report_output_file)                                                # dump data to template file
                os.chmod(report_output_file, mode=0o754)

                #remote_chk_cmd.append(os.path.join(remote_file_path,sample_filename))
                #remote_rm_cmd.append(os.path.join(remote_file_path,sample_filename))

                remote_sample_qc_path = \
                  os.path.join(\
                    remote_file_path,
                    os.path.basename(report_output_file))
                if multiqc_remote_file is None:
                    raise ValueError(
                        'A valid remote path is required for multiqc')

                remote_path = \
                  os.path.join(\
                    remote_project_path,
                    project_name,
                    seqrun_date,
                    flowcell_id)                                                        # get remote base path
                remote_sample_qc_path = \
                  os.path.relpath(\
                    remote_sample_qc_path,
                    start=remote_path)                                                  # relative path for sample qc
                multiqc_remote_file = \
                  os.path.relpath(\
                    multiqc_remote_file,
                    start=remote_path)                                                  # relative path for multiqc

                report_htmlname = os.path.basename(report_html)
                reports = list()
                for root, _, files in os.walk(top=fastq_dir):
                    if report_htmlname in files:
                        reports.\
                          extend([os.path.join(os.path.abspath(root),file) \
                                   for file in files \
                                     if fnmatch.fnmatch(os.path.join(root,file),report_html)]) # get all html reports

                if len(reports) == 0:
                    raise ValueError('No demultiplexing report found for fastq dir {0}'.\
                                     format(fastq_dir))

                os.chmod(reports[0],
                         mode=0o774)  # added read permission for report html
                copy_remote_file(source_path=reports[0],
                                 destinationa_path=remote_file_path,
                                 destination_address='{0}@{1}'.format(
                                     remote_user,
                                     remote_host))  # copy file to remote
                remote_report_file = \
                  os.path.join(\
                    remote_file_path,
                    os.path.basename(reports[0]))                                       # get remote path for report file
                remote_report_file = \
                  os.path.relpath(\
                    remote_report_file,
                    start=remote_path)                                                  # get relative path for demultiplexing report

                qc_file_info = \
                  {'lane_id':lane_id,
                   'index_length':index_length,
                   'sample_qc_page':remote_sample_qc_path,
                   'multiqc_page':multiqc_remote_file,
                   'demultiplexing_report':remote_report_file,
                   'fastq_dir':fastq_dir,
                   'project_name':project_name,
                  }

            #response=subprocess.call(remote_chk_cmd)
            #if response!=0:
            #  subprocess.check_call(remote_rm_cmd)                                    # remove existing remote file

            if not os.path.exists(report_output_file):
                raise IOError('file {0} not found'.format(report_output_file))

            copy_remote_file(\
              source_path=report_output_file,
              destinationa_path=remote_file_path,
              destination_address='{0}@{1}'.format(remote_user,remote_host))          # copy file to remote
            remote_qc_page = \
              os.path.join(\
                remote_file_path,
                os.path.basename(report_output_file))
            qc_file_info.\
              update({'remote_qc_page':remote_qc_page})
            self.param('dataflow_params', {'qc_file_info': qc_file_info})

            remote_url_path = \
              'http://{0}/{1}'.\
              format(remote_host,
                     os.path.relpath(\
                       remote_qc_page,
                       start=remote_ftp_base))
            message = \
              'QC page {0}, {1},{2}: {3}'.\
                format(
                  seqrun_igf_id,
                  project_name,
                  page_type,
                  remote_url_path)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send msg to slack
            self.comment_asana_task(\
              task_name=seqrun_igf_id,
              comment=message)                                                        # send msg to asana
        except Exception as e:
            message = \
              'seqrun: {2}, Error in {0}: {1}'.\
                format(\
                  self.__class__.__name__,
                  e,
                  seqrun_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
    def run(self):
        try:
            fastq_file = self.param_required('fastq_file')
            fastq_dir = self.param_required('fastq_dir')
            igf_session_class = self.param_required('igf_session_class')
            seqrun_igf_id = self.param_required('seqrun_igf_id')
            base_results_dir = self.param_required('base_results_dir')
            project_name = self.param_required('project_name')
            seqrun_date = self.param_required('seqrun_date')
            flowcell_id = self.param_required('flowcell_id')
            fastqscreen_exe = self.param_required('fastqscreen_exe')
            fastqscreen_conf = self.param_required('fastqscreen_conf')
            tag = self.param_required('tag')
            lane_index_info = self.param_required('lane_index_info')
            sample_name = self.param('sample_name')
            fastqscreen_options = self.param('fastqscreen_options')
            force_overwrite = self.param('force_overwrite')
            fastqscreen_dir_label = self.param('fastqscreen_dir_label')
            fastqs_collection_type = self.param('fastqs_collection_type')
            hpc_location = self.param('hpc_location')
            store_file = self.param('store_file')
            required_collection_table = self.param('required_collection_table')
            use_ephemeral_space = self.param('use_ephemeral_space')

            if lane_index_info is None:
                lane_index_info = os.path.basename(
                    fastq_dir)  # get the lane and index length info

            fastq_file_label = \
              os.path.basename(fastq_file).replace('.fastq.gz','')
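            # e.g. 'IGF001_S1_L001_R1_001.fastq.gz' -> 'IGF001_S1_L001_R1_001'
            # (illustrative bcl2fastq-style file name)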

            if tag == 'known' and store_file:  # fetch sample name for known fastq, if it's not defined
                base = BaseAdaptor(**{'session_class': igf_session_class})
                base.start_session()  # connect to db

                ca = CollectionAdaptor(**{'session': base.session})
                (collection_name,collection_table) = \
                  ca.fetch_collection_name_and_table_from_file_path(\
                    file_path=fastq_file)                                               # fetch collection name and table info

                if collection_table != required_collection_table:
                    raise ValueError(
                      'Expected collection table {0} and got {1}, {2}'.\
                        format(
                          required_collection_table,
                          collection_table,
                          fastq_file))

                ra = RunAdaptor(**{'session': base.session})
                sample = ra.fetch_sample_info_for_run(
                    run_igf_id=collection_name)
                sample_name = sample['sample_igf_id']
                base.close_session()

            fastqscreen_result_dir = \
              os.path.join(\
                base_results_dir,
                project_name,
                seqrun_date,
                flowcell_id,
                lane_index_info,
                tag)                                                                  # result dir path is generic

            if sample_name is not None:
                fastqscreen_result_dir = \
                  os.path.join(\
                    fastqscreen_result_dir,
                    sample_name)                                                        # add sample name to dir path only if its available

            fastqscreen_result_dir = \
              os.path.join(\
                fastqscreen_result_dir,
                fastq_file_label,
                fastqscreen_dir_label)                                                # keep multiple files under same dir
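            # resulting layout: <base_results_dir>/<project>/<seqrun_date>/
            # <flowcell_id>/<lane_index_info>/<tag>[/<sample_name>]/
            # <fastq_file_label>/<fastqscreen_dir_label>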

            if os.path.exists(fastqscreen_result_dir) and force_overwrite:
                remove_dir(
                    fastqscreen_result_dir
                )  # remove existing output dir if force_overwrite is true

            if not os.path.exists(fastqscreen_result_dir):
                os.makedirs(fastqscreen_result_dir,
                            mode=0o775)  # create output dir if its not present

            temp_work_dir = \
              get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get a temp work dir
            if not os.path.exists(fastq_file):
                raise IOError('fastq file {0} not readable'.format(
                    fastq_file))  # raise if fastq file path is not readable

            fastqscreen_output = os.path.join(temp_work_dir, fastq_file_label)
            os.mkdir(fastqscreen_output)  # create fastqscreen output dir

            fastqscreen_param = self.format_tool_options(
                fastqscreen_options)  # format fastqscreen options
            fastqscreen_cmd = \
              [fastqscreen_exe,
               '-conf',fastqscreen_conf,
               '--outdir',fastqscreen_output,
              ]                                                                       # fastqscreen base parameters
            fastqscreen_cmd.extend(
                fastqscreen_param)  # add additional parameters
            fastqscreen_cmd.append(fastq_file)  # fastqscreen input file
            subprocess.check_call(fastqscreen_cmd)  # run fastqscreen

            fastqscreen_stat = None
            fastqscreen_html = None
            fastqscreen_png = None
            for root, _, files in os.walk(top=fastqscreen_output):
                for file in files:
                    if fnmatch.fnmatch(file, '*.txt'):
                        input_fastqs_txt = os.path.join(root, file)
                        copy2(input_fastqs_txt, fastqscreen_result_dir)
                        fastqscreen_stat = os.path.join(
                            fastqscreen_result_dir, file)

                    if fnmatch.fnmatch(file, '*.html'):
                        input_fastqs_html = os.path.join(root, file)
                        copy2(input_fastqs_html, fastqscreen_result_dir)
                        fastqscreen_html = os.path.join(
                            fastqscreen_result_dir, file)

                    if fnmatch.fnmatch(file, '*.png'):
                        input_fastqs_png = os.path.join(root, file)
                        copy2(input_fastqs_png, fastqscreen_result_dir)
                        fastqscreen_png = os.path.join(fastqscreen_result_dir,
                                                       file)

            if fastqscreen_stat is None or fastqscreen_html is None or \
               fastqscreen_png is None:
                raise ValueError('Missing required file, stat: {0}, html: {1}, png: {2}'.\
                                 format(fastqscreen_stat,
                                        fastqscreen_html,
                                        fastqscreen_png))

            if tag == 'known' and store_file:
                fastqs_files = \
                  [{'name':collection_name,
                    'type':fastqs_collection_type,
                    'table':required_collection_table,
                    'file_path':fastqscreen_stat,
                    'location':hpc_location},
                   {'name':collection_name,
                    'type':fastqs_collection_type,
                    'table':required_collection_table,
                    'file_path':fastqscreen_html,
                    'location':hpc_location},
                   {'name':collection_name,
                    'type':fastqs_collection_type,
                    'table':required_collection_table,
                    'file_path':fastqscreen_png,
                    'location':hpc_location},
                  ]
                ca = CollectionAdaptor(**{'session_class': igf_session_class})
                ca.start_session()
                ca.load_file_and_create_collection(
                    data=fastqs_files)  # store fastqs files to db
                ca.close_session()

            self.param('dataflow_params',
                       {'fastqscreen_html':fastqscreen_html,
                        'lane_index_info':lane_index_info,
                        'sample_name':sample_name,
                        'fastqscreen': \
                          {'fastq_dir':fastq_dir,
                           'fastqscreen_stat':fastqscreen_stat,
                           'fastqscreen_html':fastqscreen_html,
                      }})                                                             # set dataflow params
        except Exception as e:
            message = \
              'seqrun: {2}, Error in {0}: {1}'.\
              format(\
                self.__class__.__name__,
                e,
                seqrun_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
  def run(self):
    try:
      project_igf_id = self.param_required('project_igf_id')
      experiment_igf_id=self.param_required('experiment_igf_id')
      sample_igf_id = self.param_required('sample_igf_id')
      input_files = self.param_required('input_files')
      igf_session_class = self.param_required('igf_session_class')
      template_report_file = self.param_required('template_report_file')
      rscript_path = self.param_required('rscript_path')
      batch_effect_rscript_path = self.param_required('batch_effect_rscript_path')
      base_result_dir = self.param_required('base_result_dir')
      strand_info = self.param('strand_info')
      read_threshold = self.param('read_threshold')
      collection_type = self.param('collection_type')
      collection_table = self.param('collection_table')
      analysis_name = self.param('analysis_name')
      tag_name = self.param('tag_name')
      use_ephemeral_space = self.param('use_ephemeral_space')

      output_file_list = None
      if len(input_files)==0:
        raise ValueError('No input files found for batch effect checking')
      elif len(input_files) < 3:
        output_file_list = ''                                                   # can't run batch effect checking on less than 3 lanes
      else:
        for file in input_files:
          check_file_path(file)                                                 # check input filepath

        file_data = list()
        ra = RunAdaptor(**{'session_class':igf_session_class})
        ra.start_session()
        for file in input_files:
          run_igf_id = os.path.basename(file).\
                       replace('ReadsPerGene.out.tab','')                       # using simple string match to fetch run igf ids
          flowcell_id, lane_id = \
            ra.fetch_flowcell_and_lane_for_run(run_igf_id=run_igf_id)           # fetch flowcell id and lane info
          file_data.append({'file':file,
                            'flowcell':flowcell_id,
                            'lane':lane_id
                          })
        ra.close_session()
        temp_dir = \
          get_temp_dir(use_ephemeral_space=use_ephemeral_space)
        temp_json_file = \
          os.path.join(temp_dir,'star_gene_counts.json')                        # temp json file path
        temp_output_file = \
          os.path.join(\
            temp_dir,
            os.path.basename(template_report_file))                             # temp report file path
        with open(temp_json_file,'w') as jp:
          json.dump(file_data,jp,indent=2)                                      # dumping json output

        br = Batch_effect_report(\
               input_json_file=temp_json_file,
               template_file=template_report_file,
               rscript_path=rscript_path,
               batch_effect_rscript_path=batch_effect_rscript_path,
               strand_info=strand_info,
               read_threshold=read_threshold
             )                                                                  # set up batch effect run
        br.check_lane_effect_and_log_report(\
             project_name=project_igf_id,
             sample_name=sample_igf_id,
             output_file=temp_output_file)                                      # generate report file
        au = Analysis_collection_utils(\
               dbsession_class=igf_session_class,
               analysis_name=analysis_name,
               base_path=base_result_dir,
               tag_name=tag_name,
               collection_name=experiment_igf_id,
               collection_type=collection_type,
               collection_table=collection_table
             )                                                                  # prepare to load file
        output_file_list = \
          au.load_file_to_disk_and_db(\
               input_file_list=[temp_output_file])                              # load file to db and disk

      self.param('dataflow_params',
                 {'batch_effect_reports':output_file_list})                     # populating data flow only if report is present
    except Exception as e:
      message = \
        'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(\
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')                       # post msg to slack for failed jobs
      raise
    def run(self):
        '''
    A method for running samtools commands

    :param project_igf_id: A project igf id
    :param sample_igf_id: A sample igf id
    :param experiment_igf_id: An experiment igf id
    :param igf_session_class: A database session class
    :param reference_type: Reference genome collection type, default GENOME_FASTA
    :param threads: Number of threads to use for Bam to Cram conversion, default 4
    :param base_work_dir: Base work directory
    :param samtools_command: Samtools command
    :param samFlagInclude: Sam flags to include in filtered bam, default None
    :param samFlagExclude: Sam flags to exclude from the filtered bam, default None
    :param mapq_threshold: Skip alignments with MAPQ smaller than this value, default None
    :param use_encode_filter: For samtools filter, use Encode epigenome filter, i.e. samFlagExclude 1804(PE) / 1796(SE), default False
    :param encodePeExcludeFlag: For samtools filter, Encode exclude flag for PE reads, default 1804
    :param encodeSeExcludeFlag: For samtools filter, Encode exclude flag for SE reads, default 1796
    :param use_ephemeral_space: A toggle for temp dir settings, default 0
    :param copy_input: A toggle for copying input file to temp, 1 for True default 0 for False
    '''
        try:
            temp_output_dir = False
            project_igf_id = self.param_required('project_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            input_files = self.param_required('input_files')
            samtools_exe = self.param_required('samtools_exe')
            reference_type = self.param('reference_type')
            threads = self.param('threads')
            base_work_dir = self.param_required('base_work_dir')
            samtools_command = self.param_required('samtools_command')
            analysis_files = self.param_required('analysis_files')
            output_prefix = self.param_required('output_prefix')
            load_metrics_to_cram = self.param('load_metrics_to_cram')
            cram_collection_type = self.param('cram_collection_type')
            collection_table = self.param('collection_table')
            base_result_dir = self.param('base_result_dir')
            analysis_name = self.param('analysis_name')
            force_overwrite = self.param('force_overwrite')
            samFlagInclude = self.param('samFlagInclude')
            samFlagExclude = self.param('samFlagExclude')
            mapq_threshold = self.param('mapq_threshold')
            library_layout = self.param_required('library_layout')
            use_encode_filter = self.param('use_encode_filter')
            species_name = self.param_required('species_name')
            seed_date_stamp = self.param_required('date_stamp')
            use_ephemeral_space = self.param('use_ephemeral_space')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if output_prefix is not None:
                output_prefix = \
                  '{0}_{1}'.\
                    format(
                      output_prefix,
                      seed_date_stamp)                                               # adding datestamp to the output file prefix

            if use_encode_filter:
                samFlagInclude = None
                if library_layout == 'PAIRED':
                    samFlagExclude = 1804
                else:
                    samFlagExclude = 1796
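                # 1804 = unmapped(4) + mate unmapped(8) + not primary(256) +
                # fails QC(512) + duplicate(1024); 1796 drops the mate-unmapped
                # bit for single-end data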

            if not isinstance(input_files, list) or \
               len(input_files) == 0:
                raise ValueError('No input file found')

            if len(input_files) > 1:
                raise ValueError('More than one input file found: {0}'.\
                                 format(input_files))

            output_bam_cram_list = list()
            input_file = input_files[0]
            temp_output_dir = \
              get_temp_dir(
                use_ephemeral_space=use_ephemeral_space)                              # get temp work dir
            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)                         # get a run work dir
            samtools_cmdline = ''
            temp_output = None
            if samtools_command == 'idxstats':
                temp_output,samtools_cmdline = \
                  run_bam_idxstat(
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    force=True)                                                         # run samtools idxstats
            elif samtools_command == 'flagstat':
                temp_output,samtools_cmdline = \
                  run_bam_flagstat(\
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    threads=threads,
                    force=True)                                                         # run samtools flagstat
            elif samtools_command == 'stats':
                temp_output,samtools_cmdline,stats_metrics = \
                  run_bam_stats(\
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    threads=threads,
                    force=True)                                                         # run samtools stats
                if load_metrics_to_cram and \
                   len(stats_metrics) > 0:
                    ca = CollectionAdaptor(
                        **{'session_class': igf_session_class})
                    attribute_data = \
                      ca.prepare_data_for_collection_attribute(\
                        collection_name=experiment_igf_id,
                        collection_type=cram_collection_type,
                        data_list=stats_metrics)
                    ca.start_session()
                    try:
                        ca.create_or_update_collection_attributes(\
                          data=attribute_data,
                          autosave=False)
                        ca.commit_session()
                        ca.close_session()
                    except Exception as e:
                        ca.rollback_session()
                        ca.close_session()
                        raise ValueError('Failed to load data to db: {0}'.\
                                         format(e))

            elif samtools_command == 'merge':
                if output_prefix is None:
                    raise ValueError(
                        'Missing output filename prefix for merged bam')

                sorted_by_name = self.param('sorted_by_name')
                temp_output = \
                  os.path.join(\
                    work_dir,
                    '{0}_merged.bam'.format(output_prefix))
                samtools_cmdline = \
                  merge_multiple_bam(\
                    samtools_exe=samtools_exe,
                    input_bam_list=input_file,
                    output_bam_path=temp_output,
                    sorted_by_name=sorted_by_name,
                    threads=threads,
                    use_ephemeral_space=use_ephemeral_space,
                    force=True)
            elif samtools_command == 'view_bamToCram':
                if base_result_dir is None:
                    raise ValueError(
                        'base_result_dir is required for CRAM file loading')

                if analysis_name is None:
                    raise ValueError(
                        'analysis_name is required for CRAM file loading')

                ref_genome = \
                  Reference_genome_utils(\
                    genome_tag=species_name,
                    dbsession_class=igf_session_class,
                    genome_fasta_type=reference_type)
                genome_fasta = \
                  ref_genome.get_genome_fasta()                                       # get genome fasta
                cram_file = \
                  os.path.basename(input_file).\
                    replace('.bam','.cram')                                             # get base cram file name
                cram_file = os.path.join(
                    temp_output_dir,
                    cram_file)  # get cram file path in work dir
                samtools_cmdline = \
                  convert_bam_to_cram(\
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    reference_file=genome_fasta,
                    cram_path=cram_file,
                    use_ephemeral_space=use_ephemeral_space,
                    threads=threads,
                    force=True,
                    dry_run=False)
                au = \
                  Analysis_collection_utils(\
                    dbsession_class=igf_session_class,
                    analysis_name=analysis_name,
                    tag_name=species_name,
                    collection_name=experiment_igf_id,
                    collection_type=cram_collection_type,
                    collection_table=collection_table,
                    base_path=base_result_dir)
                temp_output_bam_cram_list = \
                  au.load_file_to_disk_and_db(\
                    input_file_list=[cram_file],
                    file_suffix='cram',
                    withdraw_exisitng_collection=force_overwrite)                       # load file to db and disk
                for cram in temp_output_bam_cram_list:
                    index_bam_or_cram(\
                      samtools_exe=samtools_exe,
                      input_path=cram,
                      threads=threads,
                      dry_run=False)
                    index_path = '{0}.crai'.format(cram)
                    output_bam_cram_list.append(cram)
                    output_bam_cram_list.append(index_path)

                if len(output_bam_cram_list) == 0:
                    raise ValueError('No output cram file found')

            elif samtools_command == 'view_filterBam':
                temp_output_bam = \
                  os.path.join(\
                    temp_output_dir,
                    os.path.basename(input_file).replace('.bam','.filtered.bam'))
                samtools_cmdline = \
                  filter_bam_file(
                    samtools_exe=samtools_exe,
                    input_bam=input_file,
                    output_bam=temp_output_bam,
                    samFlagInclude=samFlagInclude,
                    samFlagExclude=samFlagExclude,
                    threads=threads,
                    mapq_threshold=mapq_threshold,
                    index_output=False,
                    dry_run=False)
                dest_path = \
                  os.path.join(\
                    work_dir,
                    os.path.basename(temp_output_bam))
                move_file(\
                  source_path=temp_output_bam,
                  destinationa_path=dest_path,
                  force=True)
                index_bam_or_cram(\
                  samtools_exe=samtools_exe,
                  input_path=dest_path,
                  threads=threads,
                  dry_run=False)
                index_path = '{0}.bai'.format(dest_path)
                output_bam_cram_list.append(dest_path)
                output_bam_cram_list.append(index_path)
            else:
                raise ValueError('Samtools command {0} not supported'.\
                                 format(samtools_command))

            if temp_output is not None:
                dest_path = \
                  os.path.join(\
                    work_dir,
                    os.path.basename(temp_output))
                if dest_path != temp_output:
                    move_file(\
                      source_path=temp_output,
                      destinationa_path=dest_path,
                      force=True)
                analysis_files.append(dest_path)

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'output_bam_cram_list': output_bam_cram_list
                })  # pass on samtools output list
            message = \
              'finished samtools {0} for {1} {2}'.\
                format(
                  samtools_command,
                  project_igf_id,
                  sample_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            message = \
              'finished samtools {0} for {1} {2}: {3}'.\
                format(
                  samtools_command,
                  project_igf_id,
                  sample_igf_id,
                  samtools_cmdline)
            #self.comment_asana_task(task_name=project_igf_id, comment=message)        # send comment to Asana
        except Exception as e:
            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  project_igf_id,
                  sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
    def run(self):
        try:
            seqrun_igf_id = self.param_required('seqrun_igf_id')
            project_name = self.param_required('project_name')
            remote_project_path = self.param_required('remote_project_path')
            igf_session_class = self.param_required('igf_session_class')
            remote_user = self.param_required('remote_user')
            remote_host = self.param_required('remote_host')
            seqruninfofile = self.param('seqruninfofile')
            samplereadcountfile = self.param('samplereadcountfile')
            samplereadcountcsvfile = self.param('samplereadcountcsvfile')
            status_data_json = self.param('status_data_json')
            pipeline_name = self.param_required('pipeline_name')
            analysis_pipeline_name = self.param_required(
                'analysis_pipeline_name')
            sample_column = self.param('sample_column')
            use_ephemeral_space = self.param('use_ephemeral_space')

            temp_work_dir = \
              get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get a temp dir
            temp_read_count_output = \
              os.path.join(\
                temp_work_dir,
                samplereadcountfile)                                                  # get path for temp read count file
            temp_read_count_csv_output = \
              os.path.join(\
                temp_work_dir,
                samplereadcountcsvfile)                                               # get path for temp read count csv file
            temp_seqrun_info = \
              os.path.join(\
                temp_work_dir,
                seqruninfofile)                                                       # get path for temp seqrun info file
            raw_read_count = \
              get_project_read_count(\
                session_class=igf_session_class,
                project_igf_id=project_name)                                          # get raw read count for project
            (description,read_count_data,column_order) = \
              convert_project_data_gviz_data(input_data=raw_read_count)               # convert read count to gviz requirements
            convert_to_gviz_json_for_display(\
              description=description,
              data=read_count_data,
              columns_order=column_order,
              output_file=temp_read_count_output)                                     # write data to output json file
            if not isinstance(read_count_data, pd.DataFrame):
                read_count_data = pd.DataFrame(read_count_data)                       # ensure a pandas dataframe before csv export

            read_count_data.\
              set_index(sample_column).\
              to_csv(\
                temp_read_count_csv_output,
                index=True)                                                           # create csv output for project data
            seqrun_data = \
              get_seqrun_info_for_project(\
                session_class=igf_session_class,
                project_igf_id=project_name)                                          # fetch seqrun info for the project
            add_seqrun_path_info(\
              input_data=seqrun_data,
              output_file=temp_seqrun_info)                                           # write seqrun info json
            remote_project_dir = \
              os.path.join(\
                remote_project_path,
                project_name)                                                         # get remote project directory path
            self._check_and_copy_remote_file(\
              remote_user=remote_user,
              remote_host=remote_host,
              source_file=temp_seqrun_info,
              remote_file=os.path.join(remote_project_dir,
                                       seqruninfofile))                               # copy seqrun info file to remote
            self._check_and_copy_remote_file(\
              remote_user=remote_user,
              remote_host=remote_host,
              source_file=temp_read_count_output,
              remote_file=os.path.join(remote_project_dir,
                                       samplereadcountfile))                          # copy sample read count json file to remote
            os.chmod(temp_read_count_csv_output,
                     mode=0o754)  # change file permission before copy
            self._check_and_copy_remote_file(\
              remote_user=remote_user,
              remote_host=remote_host,
              source_file=temp_read_count_csv_output,
              remote_file=os.path.join(remote_project_dir,
                                       samplereadcountcsvfile))                       # copy sample read count csv file to remote
            ps = Project_status(\
                  igf_session_class=igf_session_class,
                  project_igf_id=project_name)
            temp_status_output = \
              os.path.join(\
                temp_work_dir,
                status_data_json)                                                     # get path for temp status file
            ps.generate_gviz_json_file(\
                 output_file=temp_status_output,
                 demultiplexing_pipeline=pipeline_name,
                 analysis_pipeline=analysis_pipeline_name,
                 active_seqrun_igf_id=seqrun_igf_id)                                  # write data to output json file
            self._check_and_copy_remote_file(\
              remote_user=remote_user,
              remote_host=remote_host,
              source_file=temp_status_output,
              remote_file=os.path.join(remote_project_dir,
                                       status_data_json))                             # copy project status file to remote
            self.param('dataflow_params', {'remote_project_info': 'done'})
            remove_dir(temp_work_dir)  # remove temp dir
        except Exception as e:
            message = \
              'seqrun: {2}, Error in {0}: {1}'.\
              format(\
                self.__class__.__name__,
                e,
                seqrun_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
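
Note: a hedged sketch of the read-count export flow used in the run() method
above. The helper names and keyword arguments come from the snippet itself;
the import paths, the project id and the pre-built igf_session_class are
assumptions, not verified against the package layout.

from igf_data.utils.projectutils import get_project_read_count            # assumed module path
from igf_data.utils.project_data_display_utils import \
    convert_project_data_gviz_data                                        # assumed module path
from igf_data.utils.gviz_utils import convert_to_gviz_json_for_display    # assumed module path

raw_read_count = \
  get_project_read_count(
    session_class=igf_session_class,                                      # an existing SQLAlchemy session class
    project_igf_id='IGFQ000001')                                          # hypothetical project id
(description, read_count_data, column_order) = \
  convert_project_data_gviz_data(input_data=raw_read_count)               # reshape counts for gviz
convert_to_gviz_json_for_display(
  description=description,
  data=read_count_data,
  columns_order=column_order,
  output_file='read_count.json')                                          # gviz JSON consumed by the project page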
Example #10
  def run(self):
    try:
      project_igf_id = self.param_required('project_igf_id')
      sample_igf_id = self.param_required('sample_igf_id')
      file_list = self.param_required('file_list')
      remote_user = self.param_required('remote_user')
      remote_host = self.param_required('remote_host')
      remote_project_path = self.param_required('remote_project_path')
      dir_labels = self.param_required('dir_labels')
      igf_session_class = self.param_required('igf_session_class')
      force_overwrite = self.param('force_overwrite')
      collect_remote_file = self.param('collect_remote_file')
      collection_name = self.param('collection_name')
      collection_type = self.param('collection_type')
      collection_table = self.param('collection_table')
      file_location = self.param('file_location')
      use_ephemeral_space = self.param('use_ephemeral_space')
      destination_output_path = \
        os.path.join(
          remote_project_path,
          project_igf_id)                                                       # get base destination path
      if isinstance(dir_labels, list) and \
         len(dir_labels) > 0:
        destination_output_path=\
          os.path.join(destination_output_path,
                       *dir_labels)

      if collect_remote_file:
        if collection_name is None or \
           collection_type is None:
          raise ValueError('Name and type are required for db collection')

      output_file_list = list()
      temp_work_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get temp dir
      for file in file_list:
        if not os.path.exists(file):
          raise IOError('file {0} not found'.\
                        format(file))

        if os.path.isfile(file):
          copy2(
            file,
            os.path.join(
              temp_work_dir,
              os.path.basename(file)))                                          # copy file to a temp dir
          dest_file_path = \
            os.path.join(
              destination_output_path,
              os.path.basename(file))                                           # get destination file path
          os.chmod(
            os.path.join(
              temp_work_dir,
              os.path.basename(file)),
            mode=0o764)                                                         # set file permission
        elif os.path.isdir(file):
          copytree(\
            file,
            os.path.join(
              temp_work_dir,
              os.path.basename(file)))                                          # copy dir to a temp dir
          dest_file_path=destination_output_path
          for root,dirs,files in os.walk(temp_work_dir):
            for dir_name in dirs:
              os.chmod(
                os.path.join(root,dir_name),
                mode=0o775)
            for file_name in files:
              os.chmod(
                os.path.join(root,file_name),
                mode=0o764)                                                     # changing file and dir permissions for remote files
        else:
          raise ValueError('Unknown source file type: {0}'.\
                           format(file))

        #os.chmod(
        #  os.path.join(
        #    temp_work_dir,
        #    os.path.basename(file)),
        #  mode=0o754)                                                                       # set file permission
        copy_remote_file(\
          source_path=os.path.join(temp_work_dir,
                                   os.path.basename(file)),
          destinationa_path=dest_file_path,
          destination_address='{0}@{1}'.format(remote_user,remote_host),
          force_update=force_overwrite
        )                                                                       # copy file to remote
        if os.path.isdir(file):
          dest_file_path=\
            os.path.join(\
              dest_file_path,
              os.path.basename(file))                                           # fix for dir input

        output_file_list.append(dest_file_path)

      remove_dir(dir_path=temp_work_dir)                                        # remove temp dir
      self.param('dataflow_params',
                 {'status': 'done',
                  'output_list':output_file_list})                              # add dataflow params
      if collect_remote_file:
        data=list()
        remove_data_list=[{'name':collection_name,
                           'type':collection_type}]
        for file in output_file_list:
          data.append(
            {'name':collection_name,
             'type':collection_type,
             'table':collection_table,
             'file_path':file,
             'location':file_location
            }
          )

        ca = CollectionAdaptor(**{'session_class':igf_session_class})
        ca.start_session()
        try:
          ca.remove_collection_group_info(
            data=remove_data_list,
            autosave=False)                                                     # remove existing data before loading new collection
          ca.load_file_and_create_collection(
            data=data,
            autosave=False,
            calculate_file_size_and_md5=False)                                  # load remote files to db
          ca.commit_session()                                                   # commit changes
          ca.close_session()
        except:
          ca.rollback_session()                                                 # rollback changes
          ca.close_session()
          raise

    except Exception as e:
      message = \
        'project: {2}, sample: {3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')                       # post msg to slack for failed jobs
      raise
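
Note: a hedged sketch of calling copy_remote_file directly, as in the loop
above; the host and paths are placeholders. The destination keyword is
spelled destinationa_path throughout these snippets, so the sketch keeps
that spelling.

copy_remote_file(
  source_path='/tmp/report/multiqc_report.html',                           # hypothetical local file
  destinationa_path='/www/projects/IGFQ000001/multiqc_report.html',        # hypothetical remote path
  destination_address='{0}@{1}'.format('igf', 'remote.example.com'),       # user@host, placeholders
  force_update=True)                                                       # overwrite any existing remote copy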
Example #11
    def run(self):
        try:
            project_igf_id = self.param_required('project_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            collection_type_list = self.param_required('collection_type_list')
            analysis_data_json = self.param_required('analysis_data_json')
            igf_session_class = self.param_required('igf_session_class')
            remote_project_path = self.param_required('remote_project_path')
            remote_user = self.param_required('remote_user')
            remote_host = self.param_required('remote_host')
            remote_analysis_dir = self.param('remote_analysis_dir')
            pipeline_name = self.param_required('pipeline_name')
            attribute_collection_file_type = self.param(
                'attribute_collection_file_type')
            pipeline_seed_table = self.param('pipeline_seed_table')
            pipeline_finished_status = self.param('pipeline_finished_status')
            chart_data_json = self.param('chart_data_json')
            chart_data_csv = self.param('chart_data_csv')
            sample_id_label = self.param('sample_id_label')
            use_ephemeral_space = self.param('use_ephemeral_space')

            temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)
            output_file = os.path.join(temp_dir, analysis_data_json)
            chart_json_output_file = os.path.join(temp_dir, chart_data_json)
            csv_output_file = os.path.join(temp_dir, chart_data_csv)
            prj_data = \
              Project_analysis(\
                igf_session_class=igf_session_class,
                collection_type_list=collection_type_list,
                remote_analysis_dir=remote_analysis_dir,
                attribute_collection_file_type=attribute_collection_file_type,
                pipeline_name=pipeline_name,
                pipeline_seed_table=pipeline_seed_table,
                pipeline_finished_status=pipeline_finished_status,
                use_ephemeral_space=use_ephemeral_space,
                sample_id_label=sample_id_label)
            prj_data.\
              get_analysis_data_for_project(\
                project_igf_id=project_igf_id,
                output_file=output_file,
                chart_json_output_file=chart_json_output_file,
                csv_output_file=csv_output_file)
            remote_file_path = \
              os.path.join(\
                remote_project_path,
                project_igf_id,
                analysis_data_json)
            self._check_and_copy_remote_file(\
              remote_user=remote_user,
              remote_host=remote_host,
              source_file=output_file,
              remote_file=remote_file_path)
            remote_chart_file_path = \
              os.path.join(\
                remote_project_path,
                project_igf_id,
                chart_data_json)
            self._check_and_copy_remote_file(\
              remote_user=remote_user,
              remote_host=remote_host,
              source_file=chart_json_output_file,
              remote_file=remote_chart_file_path)
            remote_csv_file_path = \
              os.path.join(\
                remote_project_path,
                project_igf_id,
                chart_data_csv)
            self._check_and_copy_remote_file(\
              remote_user=remote_user,
              remote_host=remote_host,
              source_file=csv_output_file,
              remote_file=remote_csv_file_path)
            self.param('dataflow_params',
                       {'remote_file_path': remote_file_path})
        except Exception as e:
            message = \
              'project: {2}, sample: {3}, Error in {0}: {1}'.\
              format(\
                self.__class__.__name__,
                e,
                project_igf_id,
                sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
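
Note: the runnables above rely on a private helper,
_check_and_copy_remote_file. A minimal sketch of what such a helper could
look like, assuming a plain ssh/scp transport; the real implementation in
the pipeline may differ (for example, it may skip the copy when checksums
match).

import os
import subprocess

def check_and_copy_remote_file(remote_user, remote_host,
                               source_file, remote_file):
    '''Copy source_file to remote_user@remote_host:remote_file.'''
    if not os.path.exists(source_file):
        raise IOError('file {0} not found'.format(source_file))
    address = '{0}@{1}'.format(remote_user, remote_host)
    remote_dir = os.path.dirname(remote_file)
    subprocess.check_call(['ssh', address, 'mkdir', '-p', remote_dir])      # ensure the remote dir exists
    subprocess.check_call(
        ['scp', '-p', source_file,
         '{0}:{1}'.format(address, remote_file)])                           # copy, preserving permissions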
Example #12
def run_plotCoverage(bam_files,
                     output_raw_counts,
                     plotcov_stdout,
                     output_plot=None,
                     blacklist_file=None,
                     thread=1,
                     params_list=None,
                     dry_run=False,
                     use_ephemeral_space=0):
    '''
  A function for running Deeptools plotCoverage

  :param bam_files: A list of indexed bam files
  :param output_raw_counts: Output raw count filepath
  :param plotcov_stdout: Output path of plotCoverage stdout logs
  :param output_plot: Output plots filepath, default None
  :param blacklist_file: Input blacklist region filepath, default None
  :param thread: Number of threads to use, default 1
  :param params_list: Additional deeptools plotCoverage params as list, default None
  :param dry_run: If True, return the Deeptools command list without running it, default False
  :param use_ephemeral_space: A toggle for using the ephemeral temp dir, default 0
  :returns: Deeptools command list
  '''
    try:
        if len(bam_files) == 0:
            raise ValueError(
                'No bamfiles found to generate coverage plot data')

        plotcov_args = ['--bamfiles']  # prepare to add input bams to args
        for path in bam_files:
            check_file_path(path)  # check input bams
            plotcov_args.append(quote(path))  # adding input bams

        temp_dir = \
          get_temp_dir(
            use_ephemeral_space=use_ephemeral_space)
        temp_output_raw_counts = \
          os.path.join(
            temp_dir,
            os.path.basename(output_raw_counts))                                    # path for temp raw counts
        temp_plotcov_stdout = \
          os.path.join(
            temp_dir,
            os.path.basename(plotcov_stdout))                                       # path for temp stdout log

        plotcov_args.extend([
            "--numberOfProcessors", quote(str(thread)),
            "--outRawCounts", temp_output_raw_counts])
        if output_plot is not None:
            temp_output_plot = \
              os.path.join(
                temp_dir,
                os.path.basename(output_plot))                                        # path for temp plot
            plotcov_args.extend(["--plotFile", temp_output_plot])

        if blacklist_file is not None:
            check_file_path(blacklist_file)
            plotcov_args.extend(["--blackListFileName", quote(blacklist_file)])

        if params_list is not None and \
           isinstance(params_list, list) and \
           len(params_list) > 0:
            params_list = [quote(param) for param in params_list]
            plotcov_args.extend(params_list)  # add additional params to the list

        if dry_run:
            return plotcov_args

        from deeptools.plotCoverage import main as plotCoverage_main
        f = io.StringIO()
        with redirect_stdout(f):
            plotCoverage_main(plotcov_args)

        stdout_logs = f.getvalue()
        with open(temp_plotcov_stdout, 'w') as fp:
            fp.write(stdout_logs)

        copy_local_file(source_path=temp_plotcov_stdout,
                        destinationa_path=plotcov_stdout)
        copy_local_file(source_path=temp_output_raw_counts,
                        destinationa_path=output_raw_counts)
        if output_plot is not None:
            copy_local_file(source_path=temp_output_plot,
                            destinationa_path=output_plot)

        remove_dir(temp_dir)  # clean up temp dir
        plotcov_args.insert(0, 'plotCoverage')  # fix for deeptools commandline
        return plotcov_args
    except:
        raise
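
Note: a hypothetical invocation of run_plotCoverage defined above; the bam
and output paths are placeholders and must exist, since check_file_path runs
on the inputs even in dry-run mode. With dry_run=True the function only
returns the assembled deeptools argument list.

cmd = run_plotCoverage(
    bam_files=['sample1.bam', 'sample2.bam'],                               # indexed bams, placeholders
    output_raw_counts='coverage_counts.tsv',
    plotcov_stdout='plotCoverage.log',
    output_plot='coverage.png',
    thread=4,
    dry_run=True)                                                           # build the command without running it
print(' '.join(cmd))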
Example #13
def run_bamCoverage(bam_files,
                    output_file,
                    blacklist_file=None,
                    thread=1,
                    dry_run=False,
                    params_list=("--outFileFormat", "bigwig"),
                    use_ephemeral_space=0):
    '''
  A function for running Deeptools bamCoverage

  :param bam_files: A list of bam files, expecting only one file
  :param output_file: Output filepath for the coverage track
  :param blacklist_file: Input blacklist region filepath, default None
  :param thread: Number of threads to use, default 1
  :param dry_run: If True, return the Deeptools command list without running it, default False
  :param params_list: Additional deeptools bamCoverage params as list, default ("--outFileFormat","bigwig")
  :param use_ephemeral_space: A toggle for using the ephemeral temp dir, default 0
  :returns: Deeptools command as list
  '''
    try:
        if len(bam_files) == 0:
            raise ValueError('No bamfiles found to generate coverage data')

        if len(bam_files) > 1:
            raise ValueError(
              'Expecting only one bam for bamCoverage tool, found: {0}'.\
                format(len(bam_files)))

        bamcov_args = ['--bam']  # prepare to add the input bam to args
        for path in bam_files:
            check_file_path(path)  # check the input bam
            bamcov_args.append(quote(path))  # add the input bam

        temp_dir = \
          get_temp_dir(use_ephemeral_space=use_ephemeral_space)
        temp_output = \
          os.path.join(
            temp_dir,
            os.path.basename(output_file))
        bamcov_args.extend([
            "--numberOfProcessors", quote(str(thread)),
            "--outFileName", temp_output])
        if blacklist_file is not None:
            check_file_path(blacklist_file)
            bamcov_args.extend(["--blackListFileName", quote(blacklist_file)])


        if (params_list is not None or \
            params_list != '') and \
            (isinstance(params_list,list) or \
             isinstance(params_list,tuple)) and \
            len(params_list)>0:
            params_list = list(params_list)
            if len(params_list) > 0:
                params_list = [quote(param) for param in params_list]
                bamcov_args.extend(
                    params_list)  # add additional params to the list

        if dry_run:
            return bamcov_args

        from deeptools.bamCoverage import main as bamCoverage_main
        bamCoverage_main(bamcov_args)  # generate bam coverage file
        copy_local_file(source_path=temp_output,
                        destinationa_path=output_file)  # copy output file
        remove_dir(temp_dir)  # clean up temp dir
        bamcov_args.insert(0, 'bamCoverage')  # fix for deeptools commandline
        return bamcov_args
    except:
        raise
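
Note: a hypothetical invocation of run_bamCoverage defined above; the file
paths are placeholders and the input bam must exist, since check_file_path
runs before the dry-run check. The default params_list already requests
bigwig output.

cmd = run_bamCoverage(
    bam_files=['sample1.bam'],                                              # a single indexed bam, placeholder
    output_file='sample1.bw',
    thread=4,
    dry_run=True)                                                           # build the command without running it
print(' '.join(cmd))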
Example #14
    def run(self):
        try:
            seqrun_igf_id = self.param_required('seqrun_igf_id')
            project_name = self.param_required('project_name')
            remote_project_path = self.param_required('remote_project_path')
            remote_user = self.param_required('remote_user')
            remote_host = self.param_required('remote_host')
            template_dir = self.param_required('template_dir')
            igf_session_class = self.param_required('igf_session_class')
            htaccess_template_path = self.param('htaccess_template_path')
            htaccess_template = self.param('htaccess_template')
            htpasswd_template = self.param('htpasswd_template')
            htaccess_filename = self.param('htaccess_filename')
            htpasswd_filename = self.param('htpasswd_filename')
            project_template = self.param('project_template')
            status_template = self.param('status_template')
            analysis_template = self.param('analysis_template')
            analysis_viewer_template = self.param('analysis_viewer_template')
            seqruninfofile = self.param('seqruninfofile')
            samplereadcountfile = self.param('samplereadcountfile')
            samplereadcountcsvfile = self.param('samplereadcountcsvfile')
            status_data_json = self.param('status_data_json')
            analysis_data_json = self.param('analysis_data_json')
            analysis_data_csv = self.param('analysis_data_csv')
            analysis_chart_data_csv = self.param('analysis_chart_data_csv')
            analysis_chart_data_json = self.param('analysis_chart_data_json')
            analysis_view_js = self.param('analysis_view_js')
            image_height = self.param('image_height')
            sample_count_threshold = self.param('sample_count_threshold')
            use_ephemeral_space = self.param('use_ephemeral_space')

            htaccess_template_path = \
              os.path.join(\
                template_dir,
                htaccess_template_path)                                               # set path for htaccess template dir
            project_template_path = \
              os.path.join(\
                template_dir,
                project_template)                                                     # set path for project template
            status_template_path = \
              os.path.join(\
                template_dir,
                status_template)                                                      # set path for project status template
            analysis_template_path = \
              os.path.join(\
                template_dir,
                analysis_template)                                                    # set path for project analysis template
            analysis_viewer_template = \
              os.path.join(\
                template_dir,
                analysis_viewer_template)                                             # set path for analysis viewer template
            pa = ProjectAdaptor(**{'session_class': igf_session_class})
            pa.start_session()
            user_info = \
              pa.get_project_user_info(project_igf_id=project_name)                   # fetch user info from db
            sample_counts = \
              pa.count_project_samples(\
                project_igf_id=project_name,
                only_active=True)                                                     # get sample counts for the project
            pa.close_session()

            image_height = \
              self._calculate_image_height(\
                sample_count=sample_counts,
                height=image_height,
                threshold=sample_count_threshold)                                     # change image height based on sample count

            user_info = user_info.to_dict(
                orient='records')  # convert dataframe to list of dictionaries
            if len(user_info) == 0:
                raise ValueError('No user found for project {0}'.\
                                 format(project_name))

            user_list = list()
            user_passwd_dict = dict()
            hpc_user = True  # by default, load hpc user settings
            for user in user_info:
                username = user['username']  # get username for irods
                user_list.append(username)
                if 'ht_password' in user.keys():
                    ht_passwd = user['ht_password']  # get htaccess passwd
                    user_passwd_dict.update({username: ht_passwd})

                if 'category' in user.keys() and \
                   'data_authority' in user.keys() and \
                   user['category'] == 'NON_HPC_USER' and \
                   user['data_authority']=='T':
                    hpc_user = False  # switch to non-hpc settings if primary user is non-hpc
            temp_work_dir = \
              get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get a temp dir
            template_env = \
              Environment(\
                loader=FileSystemLoader(\
                         searchpath=htaccess_template_path),
                autoescape=select_autoescape(['html', 'xml']))                        # set template env
            htaccess = template_env.get_template(
                htaccess_template)  # read htaccess template
            htpasswd = template_env.get_template(
                htpasswd_template)  # read htpasswd template
            htaccess_output = \
              os.path.join(\
                temp_work_dir,
                htaccess_filename)
            htpasswd_output = \
              os.path.join(\
                temp_work_dir,
                htpasswd_filename)

            htaccess.\
            stream(\
              remote_project_dir=remote_project_path,
              project_tag=project_name,
              hpcUser=hpc_user,
              htpasswd_filename=htpasswd_filename,
              customerUsernameList=' '.join(user_list)).\
            dump(htaccess_output)                                                     # write new htaccess file

            htpasswd.\
            stream(userDict=user_passwd_dict).\
            dump(htpasswd_output)                                                     # write new htpass file
            template_prj = \
              Environment(\
                loader=FileSystemLoader(\
                         searchpath=os.path.dirname(project_template_path)),
                autoescape=select_autoescape(['txt', 'xml']))                         # set template env for project
            project_index = \
              template_prj.\
                get_template(os.path.basename(project_template_path))                 # read project index template
            project_output = \
                os.path.join(\
                  temp_work_dir,
                  os.path.basename(project_template_path))
            project_index.\
            stream(\
              ProjectName=project_name,
              seqrunInfoFile=seqruninfofile,
              sampleReadCountFile=samplereadcountfile,
              sampleReadCountCsvFile=samplereadcountcsvfile,
              ImageHeight=image_height).\
            dump(project_output)                                                      # write new project file

            template_status = \
              Environment(\
                loader=FileSystemLoader(\
                         searchpath=os.path.dirname(status_template_path)),
                autoescape=select_autoescape(['txt', 'xml']))                         # set template env for status page
            project_status = \
              template_status.\
              get_template(os.path.basename(status_template_path))                    # read status page template
            status_output = \
              os.path.join(\
                temp_work_dir,
                os.path.basename(status_template_path))
            project_status.\
            stream(\
              ProjectName=project_name,
              status_data_json=status_data_json).\
            dump(status_output)                                                       # write new project status file

            template_analysis = \
              Environment(\
                loader=FileSystemLoader(\
                         searchpath=os.path.dirname(analysis_template_path)),
                autoescape=select_autoescape(['txt', 'xml']))                         # set template env for analysis
            project_analysis = \
              template_analysis.\
                get_template(os.path.basename(analysis_template_path))                # read analysis page template
            analysis_output = \
              os.path.join(\
                temp_work_dir,
                os.path.basename(analysis_template_path))
            project_analysis.\
            stream(\
              ProjectName=project_name,
              analysisInfoFile=analysis_data_json,
              analysisInfoCsvFile=analysis_data_csv,
              analysisCsvDataFile=analysis_chart_data_csv,
              analysisPlotFile=analysis_chart_data_json).\
            dump(analysis_output)                                                     # write new project analysis file

            template_analysis_viewer = \
              Environment(\
                loader=FileSystemLoader(\
                         searchpath=os.path.dirname(analysis_viewer_template)),
                autoescape=select_autoescape(['txt', 'xml']))                         # set template env for analysis viewer
            project_analysis_viewer = \
              template_analysis_viewer.\
                get_template(os.path.basename(analysis_viewer_template))              # read analysis viewer page template
            analysis_viewer_output = \
              os.path.join(\
                temp_work_dir,
                os.path.basename(analysis_viewer_template))
            project_analysis_viewer.\
            stream(\
              ProjectName=project_name,
              analysisJsFile=analysis_view_js).\
            dump(analysis_viewer_output)                                              # write new project analysis viewer file

            remote_project_dir = \
              os.path.join(\
                remote_project_path,
                project_name)                                                         # get remote project dir path
            remote_htaccess_file = \
              os.path.join(\
                remote_project_dir,
                htaccess_filename)                                                    # remote htaccess filepath
            self._check_and_copy_remote_file(\
              remote_user=remote_user,
              remote_host=remote_host,
              source_file=htaccess_output,
              remote_file=remote_htaccess_file)                                       # copy htaccess file to remote dir
            remote_htpasswd_file = \
              os.path.join(\
                remote_project_dir,
                htpasswd_filename)                                                    # remote htpasswd filepath
            self._check_and_copy_remote_file(\
              remote_user=remote_user,
              remote_host=remote_host,
              source_file=htpasswd_output,
              remote_file=remote_htpasswd_file)                                       # copy htpasswd file to remote dir
            remote_project_output_file = \
              os.path.join(\
                remote_project_dir,
                os.path.basename(project_output))                                     # remote project output filepath
            self._check_and_copy_remote_file(\
              remote_user=remote_user,
              remote_host=remote_host,
              source_file=project_output,
              remote_file=remote_project_output_file)                                 # copy project output file to remote dir
            remote_status_output_file = \
              os.path.join(\
                remote_project_dir,
                os.path.basename(status_output))                                      # remote project status output filepath
            self._check_and_copy_remote_file(\
              remote_user=remote_user,
              remote_host=remote_host,
              source_file=status_output,
              remote_file=remote_status_output_file)                                  # copy project status output file to remote dir
            remote_analysis_output_file = \
              os.path.join(\
                remote_project_dir,
                os.path.basename(analysis_output))                                    # remote project analysis output filepath
            self._check_and_copy_remote_file(\
              remote_user=remote_user,
              remote_host=remote_host,
              source_file=analysis_output,
              remote_file=remote_analysis_output_file)                                # copy project analysis output file to remote dir
            remote_analysis_viewer_output_file = \
              os.path.join(\
                remote_project_dir,
                os.path.basename(analysis_viewer_output))                             # remote project analysis viewer output filepath
            self._check_and_copy_remote_file(\
              remote_user=remote_user,
              remote_host=remote_host,
              source_file=analysis_viewer_output,
              remote_file=remote_analysis_viewer_output_file)                         # copy project analysis viewer output file to remote dir
            self.param('dataflow_params', {'remote_dir_status': 'done'})
            remove_dir(temp_work_dir)
        except Exception as e:
            message = \
              'seqrun: {2}, Error in {0}: {1}'.\
              format(\
                self.__class__.__name__,
                e,
                seqrun_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
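
Note: the template writes above all follow the same Jinja2 stream/dump
pattern. A minimal, self-contained sketch of that pattern; the inline
template string and the variable values here are illustrative only.

from jinja2 import Environment, BaseLoader

env = Environment(loader=BaseLoader())                                      # in-memory loader, enough for a sketch
template = env.from_string(
    'Project: {{ ProjectName }}\n'
    'Status data: {{ status_data_json }}\n')
template.stream(
    ProjectName='IGFQ000001',                                               # hypothetical project name
    status_data_json='status.json').dump('status_page.txt')                 # render lazily and write to disk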