def _post_process(output_spp_name, output_pdf_name, output_dir, temp_dir): ''' A static method for post processing ppqt analysis :param output_spp_name: Output spp filename :param output_pdf_name: Output pdf filename :param output_dir: Destination output dir :param temp_dir: Source temp dir :returns: spp output path and pdf output path ''' try: tmp_spp_file = os.path.join(temp_dir, output_spp_name) dest_spp_file = os.path.join(output_dir, output_spp_name) tmp_pdf_file = os.path.join(temp_dir, output_pdf_name) dest_pdf_file = os.path.join(output_dir, output_pdf_name) check_file_path(tmp_spp_file) check_file_path(tmp_pdf_file) copy_local_file(\ source_path=tmp_spp_file, destinationa_path=dest_spp_file, force=True) copy_local_file(\ source_path=tmp_pdf_file, destinationa_path=dest_pdf_file, force=True) return dest_spp_file, dest_pdf_file except: raise
def _parse_spp_output(spp_file):
  '''
  An internal static method for parsing the PPQT spp output file

  :param spp_file: A spp.out filepath
  :returns: A list of dictionaries
  '''
  try:
    check_file_path(spp_file)
    column_names = [
      "PPQT_Filename", "PPQT_numReads", "PPQT_estFragLen",
      "PPQT_corr_estFragLen", "PPQT_PhantomPeak", "PPQT_corr_phantomPeak",
      "PPQT_argmin_corr", "PPQT_min_corr", "PPQT_Normalized_SCC_NSC",
      "PPQT_Relative_SCC_RSC", "PPQT_QualityTag"]
    data = \
      pd.read_csv(
        spp_file,
        sep='\t',
        dtype=object,
        names=column_names)
    return data.to_dict(orient='records')
  except Exception as e:
    raise ValueError(
      'Failed to parse file {0}, got error {1}'.format(spp_file, e))
def _generate_ipynb_from_template(self, param_map): ''' An internal method to generate notebook from template :param param_map: A dictionary for parameter substitution in output notebook :returns: A output notebook path ''' try: check_file_path(self.template_ipynb_path) check_file_path(self.temp_dir) if not isinstance(param_map, dict): raise TypeError( "Expecting a dictionary for notebook param substitution, got {0}".\ format(type(param_map))) notebook_output = \ os.path.join( self.temp_dir, os.path.basename(self.template_ipynb_path)) template_env = \ Environment( loader=\ FileSystemLoader( searchpath=os.path.dirname(self.template_ipynb_path)), autoescape=select_autoescape(['html', 'xml'])) notebook = \ template_env.\ get_template( os.path.basename(self.template_ipynb_path)) notebook.\ stream(**param_map).\ dump(notebook_output) return notebook_output except Exception as e: raise ValueError("Failed to generate notebook for template: {0}, error, {1}".\ format(self.template_ipynb_path,e))
def _pre_process(self, input_bam, output_spp_name, output_pdf_name, output_dir, temp_dir): ''' An internal method for preprocessing before the exe run :param input_bam: Input bam file :param output_spp_name: Output spp filename :param output_pdf_name: Output pdf filename :param output_dir: Destination output dir :param temp_dir: Source temp dir ''' try: check_file_path(self.rscript_path) check_file_path(self.ppqt_exe) if not os.path.exists(output_dir): os.makedirs(output_dir, mode=0o770) output_pdf = os.path.join(temp_dir, output_pdf_name) output_spp = os.path.join(temp_dir, output_spp_name) run_cmd = \ [quote(self.rscript_path), quote(self.ppqt_exe), quote('-c={0}'.format(input_bam)), quote('-rf'), quote('-p={0}'.format(str(self.threads))), quote('-savp={0}'.format(output_pdf)), quote('-out={0}'.format(output_spp)), quote('-tmpdir={0}'.format(temp_dir)), quote('-odir={0}'.format(output_dir))] return run_cmd except: raise
def _copy_container_output_and_update_map(self, temp_notebook_path): ''' An internal method to copy output files from container output dir and update the output map dictionary :returns: A new dictionary with updated filepath in the values ''' try: new_output_map = dict() if self.output_file_map is not None and \ isinstance(self.output_file_map,dict): for key, container_path in self.output_file_map.items(): mount_path = \ os.path.join( self.temp_dir, os.path.basename(container_path)) # get output path in container mounted dir check_file_path(mount_path) # check if its present final_path = \ os.path.join( self.output_dir, os.path.basename(mount_path)) # get target path if os.path.isfile(mount_path): copy_local_file(mount_path, final_path) # copy file with filename elif os.path.isdir(mount_path): copy_local_file( mount_path, self.output_dir) # copy dir to target dir new_output_map.\ update({key:final_path}) # update output map if self.output_format == 'html': temp_notebook_path = \ temp_notebook_path.replace('.ipynb','.html') elif self.output_format == 'markdown': temp_notebook_path = \ temp_notebook_path.replace('.ipynb','.md') elif self.output_format == 'notebook': temp_notebook_path = temp_notebook_path elif self.output_format == 'pdf': temp_notebook_path = \ temp_notebook_path.replace('.ipynb','.pdf') elif self.output_format == 'python': temp_notebook_path = \ temp_notebook_path.replace('.ipynb','.py') elif self.output_format == 'slide': temp_notebook_path = \ temp_notebook_path.replace('.ipynb','.html') check_file_path(temp_notebook_path) output_notebook_path = \ os.path.join( self.output_dir, os.path.basename(temp_notebook_path)) copy_local_file(temp_notebook_path, output_notebook_path) # copy notbook file new_output_map.\ update({self.notebook_tag:output_notebook_path}) return new_output_map except Exception as e: raise ValueError( "Failed to copy files from container mount dir, error: {0}".\ format(e))
def generate_ipynb_from_template(template_ipynb_path, output_dir, param_dictionary,
                                 date_tag='date_tag', use_ephemeral_space=False):
  '''
  A function for generating a notebook (ipynb) file from a template file with param substitution

  :param template_ipynb_path: A template IPYNB file path
  :param output_dir: Output path
  :param param_dictionary: A dictionary containing the params for the final notebook
  :param date_tag: A text for date tag name, default date_tag
  :param use_ephemeral_space: Toggle for using ephemeral space for temp dir, default False
  :returns: None
  '''
  try:
    check_file_path(template_ipynb_path)
    check_file_path(output_dir)
    if not isinstance(param_dictionary, dict):
      raise TypeError(
        "Expecting a dictionary, got {0}".format(type(param_dictionary)))
    date_tag_value = \
      datetime.strftime(
        datetime.now(), '%Y-%b-%d %H:%M')  # date tag value
    param_dictionary.update(dict(date_tag=date_tag_value))  # adding date tag value to params
    temp_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)
    temp_output = \
      os.path.join(
        temp_dir, os.path.basename(template_ipynb_path))
    final_output = \
      os.path.join(
        output_dir, os.path.basename(template_ipynb_path))
    template_env = \
      Environment(
        loader=FileSystemLoader(
          searchpath=os.path.dirname(template_ipynb_path)),
        autoescape=select_autoescape(['html', 'xml']))
    notebook = \
      template_env.get_template(
        os.path.basename(template_ipynb_path))
    notebook.stream(**param_dictionary).dump(temp_output)  # write temp ipynb file with param substitution
    copy_local_file(temp_output, final_output)
    remove_dir(temp_dir)
  except Exception as e:
    raise ValueError(
      "Failed to generate ipynb file from template {1}, error: {0}".format(e, template_ipynb_path))
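# A minimal usage sketch for generate_ipynb_from_template(); the template path,
# output dir and parameter names below are placeholders, not values from this codebase.
def _example_generate_ipynb_from_template():
  params = {
    'PROJECT_IGF_ID': 'IGFQ000001',
    'INPUT_MATRIX': '/path/to/matrix.csv'}  # hypothetical template params
  generate_ipynb_from_template(
    template_ipynb_path='/path/to/template.ipynb',
    output_dir='/path/to/report_dir',
    param_dictionary=params,
    use_ephemeral_space=False)  # writes the template basename under output_dir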
def extract_cellranger_count_metrics_summary(
      cellranger_tar, collection_name=None, collection_type=None,
      attribute_name='attribute_name', attribute_value='attribute_value',
      attribute_prefix=None, target_filename='metrics_summary.csv'):
  '''
  A function for extracting the metrics summary file from a cellranger output tar and parsing it.
  Optionally it can add the collection name and type info to the output dictionary.

  :param cellranger_tar: A cellranger output tar file
  :param target_filename: A filename for the metrics summary file lookup, default metrics_summary.csv
  :param collection_name: Optional collection name, default None
  :param collection_type: Optional collection type, default None
  :param attribute_name: Column name for attribute names in the output, default attribute_name
  :param attribute_value: Column name for attribute values in the output, default attribute_value
  :param attribute_prefix: An optional string to add as prefix of the attribute names, default None
  :returns: A list of dictionaries containing the metrics values
  '''
  try:
    check_file_path(cellranger_tar)
    temp_work_dir = get_temp_dir(use_ephemeral_space=False)
    metrics_file = None
    with tarfile.open(cellranger_tar, mode='r') as tar:
      for file_name in tar.getnames():
        if os.path.basename(file_name) == target_filename:
          tar.extract(file_name, path=temp_work_dir)
          metrics_file = os.path.join(temp_work_dir, file_name)
    if metrics_file is None:
      raise IOError(
        'Required file {0} not found in tar {1}'.format(target_filename, cellranger_tar))
    attribute_data = pd.read_csv(metrics_file).T.reset_index()
    attribute_data.columns = [attribute_name, attribute_value]
    if attribute_prefix is None:
      attribute_data[attribute_name] = \
        attribute_data[attribute_name].\
          map(lambda x: x.replace(' ', '_'))
    else:
      attribute_data[attribute_name] = \
        attribute_data[attribute_name].\
          map(lambda x: '{0}_{1}'.format(attribute_prefix, x.replace(' ', '_')))
    if collection_name is not None:
      attribute_data['name'] = collection_name
    if collection_type is not None:
      attribute_data['type'] = collection_type
    attribute_data = attribute_data.to_dict(orient='records')
    remove_dir(temp_work_dir)
    return attribute_data
  except:
    raise
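# A usage sketch for extract_cellranger_count_metrics_summary(); the tar path and
# collection labels are placeholders. The call returns a list of dictionaries with
# attribute_name / attribute_value pairs parsed from metrics_summary.csv.
def _example_extract_cellranger_metrics():
  metrics = \
    extract_cellranger_count_metrics_summary(
      cellranger_tar='/path/to/cellranger_count_output.tar',
      collection_name='IGF_experiment_001',
      collection_type='CELLRANGER_RESULTS',
      attribute_prefix='cellranger_count')
  for row in metrics:
    print(row)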
def run_HaplotypeCaller(self, input_bam, output_vcf_path, dbsnp_vcf, emit_gvcf=True, force=False, dry_run=False, gatk_param_list=None): ''' A method for running GATK HaplotypeCaller :param input_bam: A input bam file :param output_vcf_path: A output vcf filepath :param dbsnp_vcf: A dbsnp vcf file :param emit_gvcf: A toggle for GVCF generation, default True :param force: Overwrite output file, if force is True :param dry_run: Return GATK command, if its true, default False :param gatk_param_list: List of additional params for BQSR, default None :returns: GATK commandline ''' try: self._run_gatk_checks() # run initial checks check_file_path(input_bam) check_file_path(dbsnp_vcf) temp_dir = \ get_temp_dir(use_ephemeral_space=self.use_ephemeral_space) # get temp dir temp_output = \ os.path.join( temp_dir, os.path.basename(output_vcf_path)) gatk_cmd = [ quote(self.gatk_exe), "HaplotypeCaller", "-I", quote(input_bam), "-O", quote(temp_output), "--reference", quote(self.ref_fasta), "--dbsnp", quote(dbsnp_vcf), "--java-options", quote(self.java_param) ] if emit_gvcf: gatk_cmd.extend(["--emit-ref-confidence", "GVCF"]) if gatk_param_list is not None and \ isinstance(gatk_param_list,list) and \ len(gatk_param_list) > 0: gatk_cmd.extend(gatk_param_list) # additional params gatk_cmd = ' '.join(gatk_cmd) if dry_run: return gatk_cmd subprocess.check_call(gatk_cmd, shell=True) copy_local_file(source_path=temp_output, destinationa_path=output_vcf_path, force=force) remove_dir(temp_dir) return gatk_cmd except Exception as e: raise ValueError( "Failed to run GATK HaplotypeCaller, error: {0}".\ format(e))
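# A dry-run sketch for run_HaplotypeCaller(); it assumes `gatk` is an instance of the
# surrounding GATK wrapper class (constructed elsewhere with gatk_exe, ref_fasta,
# java_param and use_ephemeral_space set) and the file paths are placeholders.
def _example_run_haplotypecaller(gatk):
  gatk_cmd = \
    gatk.run_HaplotypeCaller(
      input_bam='/path/to/sample.analysisReady.bam',
      output_vcf_path='/path/to/sample.g.vcf.gz',
      dbsnp_vcf='/path/to/dbsnp.vcf.gz',
      emit_gvcf=True,
      dry_run=True)  # returns the command line without running GATK
  print(gatk_cmd)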
def run_bam_idxstat(samtools_exe, bam_file, output_dir, output_prefix=None,
                    force=False, dry_run=False):
  '''
  A function for running samtools index stats generation

  :param samtools_exe: samtools executable path
  :param bam_file: A bam filepath with / without index. Index file will be created if it's missing
  :param output_dir: Bam idxstats output directory path
  :param output_prefix: Output file prefix, default None
  :param force: Output idxstats file will be overwritten if force is True, default False
  :param dry_run: A toggle for returning the samtools command without actually running it, default False
  :returns: Output file path and a list containing the samtools command
  '''
  try:
    check_file_path(samtools_exe)
    _check_bam_file(bam_file=bam_file)  # check bam file
    if not dry_run:
      _check_bam_index(
        samtools_exe=samtools_exe,
        bam_file=bam_file)  # generate bam index
    if output_prefix is None:
      output_prefix = os.path.basename(bam_file)
    output_path = \
      '{0}.{1}.{2}'.format(output_prefix, 'idxstats', 'txt')  # get output filename
    output_path = \
      os.path.join(
        output_dir, output_path)  # get complete output path
    if not os.path.exists(output_dir):
      raise IOError('Output path {0} not found'.format(output_dir))
    if os.path.exists(output_path) and not force:
      raise ValueError(
        'Output file {0} already present, use force to overwrite'.format(output_path))
    idxstat_cmd = [
      quote(samtools_exe),
      'idxstats',
      quote(bam_file)]
    if dry_run:
      return idxstat_cmd
    with open(output_path, 'w') as fp:
      with subprocess.Popen(idxstat_cmd, stdout=subprocess.PIPE) as proc:
        fp.write(proc.stdout.read().decode('utf-8'))  # write bam idxstats output
    return output_path, idxstat_cmd
  except:
    raise
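# A usage sketch for run_bam_idxstat(); the samtools path, bam path and output dir
# are placeholders. The function writes <prefix>.idxstats.txt under output_dir and
# returns the output path together with the samtools command list.
def _example_run_bam_idxstat():
  output_path, idxstat_cmd = \
    run_bam_idxstat(
      samtools_exe='/usr/bin/samtools',
      bam_file='/path/to/sample.bam',
      output_dir='/path/to/qc_dir',
      output_prefix='sample',
      force=True)
  print(output_path, idxstat_cmd)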
def run(self): ''' A method for running the cellranger count metrics extraction :param project_igf_id: A project igf id :param experiment_igf_id: An experiment igf id :param sample_igf_id: A sample igf id :param igf_session_class: A database session class :param analysis_output_list: Cellranger analysis tar output path :param collection_type: Cellranger results collection type :param metrics_filename: Name of the metrics file, default metrics_summary.csv :returns: None ''' try: project_igf_id = self.param_required('project_igf_id') experiment_igf_id = self.param_required('experiment_igf_id') sample_igf_id = self.param_required('sample_igf_id') igf_session_class = self.param_required('igf_session_class') analysis_output_list = self.param_required('analysis_output_list') collection_type = self.param('collection_type') metrics_filename = self.param('metrics_filename') attribute_prefix = self.param('attribute_prefix') for infile in analysis_output_list: check_file_path(infile) # check input file path cellranger_tar = analysis_output_list[0] cellranger_metrics = extract_cellranger_count_metrics_summary(\ cellranger_tar=cellranger_tar, target_filename=metrics_filename, collection_name=experiment_igf_id, collection_type=collection_type, attribute_prefix=attribute_prefix ) # extract cellranger metrics stats as dictionary ca = CollectionAdaptor(**{'session_class':igf_session_class}) ca.start_session() try: ca.create_or_update_collection_attributes(\ data=cellranger_metrics, autosave=False) # load cellranger metrics to collection attribute table ca.commit_session() ca.close_session() except: ca.rollback_session() ca.close_session() raise self.param('dataflow_params',{'cellranger_attribute':'done'}) except Exception as e: message='project: {2}, sample:{3}, Error in {0}: {1}'.\ format(self.__class__.__name__, e, project_igf_id, sample_igf_id) self.warning(message) self.post_message_to_slack(message,reaction='fail') # post msg to slack for failed jobs raise
def singularity_run(image_path, path_bind, args_list, container_dir='/tmp',
                    return_results=True, use_ephemeral_space=False, dry_run=False):
  '''
  A wrapper module for running singularity based containers

  :param image_path: Singularity image path
  :param path_bind: Host path to bind to the singularity container dir
  :param args_list: List of args for the singularity run
  :param container_dir: Container mount path for path_bind, default /tmp
  :param return_results: Return singularity run results, default True
  :param use_ephemeral_space: Toggle for using ephemeral space for temp dir, default False
  :param dry_run: Return the singularity command without run, default False
  :returns: A response from the container run and a string containing the singularity command line
  '''
  try:
    check_file_path(image_path)
    check_file_path(path_bind)
    temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)
    res = None
    temp_image_path = \
      os.path.join(
        temp_dir, os.path.basename(image_path))
    copy_local_file(image_path, temp_image_path)  # copy image to tmp dir
    if not isinstance(args_list, list) or \
       len(args_list) == 0:
      raise ValueError(
        'No args provided for singularity run')  # safemode
    args = ' '.join(args_list)  # flatten args
    singularity_run_cmd = \
      'singularity run {0} --bind {1}:{2} {3}'.\
        format(
          temp_image_path,
          path_bind,
          container_dir,
          args)
    if dry_run:
      return res, singularity_run_cmd
    else:
      res = \
        Client.run(
          image=temp_image_path,
          bind='{0}:{1}'.format(path_bind, container_dir),
          args=args,
          return_result=return_results)
    remove_dir(temp_dir)  # remove copied image after run
    return res, singularity_run_cmd
  except Exception as e:
    raise ValueError(
      'Failed to run image {0}, error: {1}'.format(image_path, e))
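# A usage sketch for singularity_run(); the image path, bind path and args are
# placeholders. With dry_run=True the function only returns the assembled
# singularity command line, which is useful for logging before a real run.
def _example_singularity_run():
  res, run_cmd = \
    singularity_run(
      image_path='/path/to/tool_image.sif',
      path_bind='/path/to/work_dir',
      args_list=['--input', '/tmp/input.txt', '--output', '/tmp/output.txt'],
      dry_run=True)
  print(run_cmd)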
def _run_gatk_checks(self): ''' An internal method for running checks before GATK run ''' try: check_file_path(self.gatk_exe) check_file_path(self.ref_fasta) except Exception as e: raise ValueError( "Failed to run GATK checks, error: {0}".\ format(e))
def run_AnalyzeCovariates(self, before_report_file, after_report_file, output_pdf_path, force=False, dry_run=False, gatk_param_list=None): ''' A method for running GATK AnalyzeCovariates tool :param before_report_file: A file containing bqsr output before recalibration :param after_report_file: A file containing bqsr output after recalibration :param output_pdf_path: An output pdf filepath :param force: Overwrite output file, if force is True :param dry_run: Return GATK command, if its true, default False :param gatk_param_list: List of additional params for BQSR, default None :returns: GATK commandline ''' try: self._run_gatk_checks() # run initial checks check_file_path(before_report_file) check_file_path(after_report_file) temp_dir = \ get_temp_dir(use_ephemeral_space=self.use_ephemeral_space) # get temp dir temp_output = \ os.path.join( temp_dir, os.path.basename(output_pdf_path)) gatk_cmd = [ quote(self.gatk_exe), "AnalyzeCovariates", "--before-report-file", quote(before_report_file), "--after-report-file", quote(after_report_file), "--plots-report-file", quote(temp_output), "--java-options", quote(self.java_param) ] if gatk_param_list is not None and \ isinstance(gatk_param_list,list) and \ len(gatk_param_list) > 0: gatk_cmd.extend(gatk_param_list) # additional params gatk_cmd = ' '.join(gatk_cmd) if dry_run: return gatk_cmd subprocess.check_call(gatk_cmd, shell=True) copy_local_file(source_path=temp_output, destinationa_path=output_pdf_path, force=force) remove_dir(temp_dir) return gatk_cmd except Exception as e: raise ValueError( "Failed to run GATK AnalyzeCovariates, error: {0}".\ format(e))
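# A dry-run sketch for run_AnalyzeCovariates(); as above, `gatk` is assumed to be an
# instance of the surrounding GATK wrapper class and the report paths are placeholders.
def _example_run_analyzecovariates(gatk):
  gatk_cmd = \
    gatk.run_AnalyzeCovariates(
      before_report_file='/path/to/sample.bqsr_before.table',
      after_report_file='/path/to/sample.bqsr_after.table',
      output_pdf_path='/path/to/sample.bqsr_plots.pdf',
      dry_run=True)
  print(gatk_cmd)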
def _check_bam_file(bam_file):
  '''
  An internal method for checking bam file

  :param bam_file: A bam file path
  :raises ValueError: It raises ValueError if the bam file doesn't have a '.bam' extension
  :raises IOError: It raises IOError if the bam_file doesn't exist
  '''
  try:
    check_file_path(bam_file)
    if not fnmatch.fnmatch(bam_file, '*.bam'):
      raise ValueError(
        'Bam file extension is not correct: {0}'.format(bam_file))
  except:
    raise
def _check_cram_file(cram_path):
  '''
  An internal method for checking cram file

  :param cram_path: A cram file path
  :raises ValueError: It raises ValueError if the cram file doesn't have a '.cram' extension
  :raises IOError: It raises IOError if the cram_path doesn't exist
  '''
  try:
    check_file_path(cram_path)
    if not fnmatch.fnmatch(cram_path, '*.cram'):
      raise ValueError(
        'Cram file extension is not correct: {0}'.format(cram_path))
  except:
    raise
def compare_fastq_files_read_counts(r1_file, r2_file): ''' A method for comparing read counts for fastq pairs :param r1_file: Fastq pair R1 file path :param r2_file: Fastq pair R2 file path :raises: ValueError if counts are not same ''' try: check_file_path(r1_file) check_file_path(r2_file) r1_count = count_fastq_lines(r1_file) r2_count = count_fastq_lines(r2_file) if r1_count != r2_count: raise ValueError('Fastq pair does not have same number of reads: {0} {1}'.\ format(r1_file,r2_file)) except: raise
def count_fastq_lines(fastq_file):
  '''
  A method for counting fastq lines

  :param fastq_file: A gzipped or unzipped fastq file
  :returns: Fastq read count (line count divided by 4)
  '''
  try:
    gzipped_pattern = re.compile(r'\S+\.(fastq|fq)\.gz$')
    unzipped_pattern = re.compile(r'\S+\.(fastq|fq)$')
    lines = 0
    check_file_path(fastq_file)
    if re.match(gzipped_pattern, fastq_file):  # read gzipped file
      with gzip.open(fastq_file, 'rb') as f:
        buf_size = 1024 * 1024
        read_f = f.read
        buf = read_f(buf_size)
        while buf:
          lines += buf.count(b'\n')
          buf = read_f(buf_size)
    elif re.match(unzipped_pattern, fastq_file):  # read unzipped file
      with open(fastq_file, 'rb') as f:
        buf_size = 1024 * 1024
        read_f = f.raw.read
        buf = read_f(buf_size)
        while buf:
          lines += buf.count(b'\n')
          buf = read_f(buf_size)
    else:
      raise ValueError(
        'Failed to detect read mode for fastq file {0}'.format(fastq_file))
    if lines >= 4:
      if lines % 4 != 0:
        raise ValueError(
          'Fastq file {0} is missing a complete block of 4 lines'.format(fastq_file))
      lines = int(lines / 4)  # each fastq record is a block of 4 lines
    return lines
  except:
    raise
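# A usage sketch for count_fastq_lines() and compare_fastq_files_read_counts();
# the fastq paths are placeholders. count_fastq_lines() returns the number of reads
# for gzipped or plain fastq files.
def _example_fastq_read_counts():
  r1_reads = count_fastq_lines('/path/to/sample_R1.fastq.gz')
  r2_reads = count_fastq_lines('/path/to/sample_R2.fastq.gz')
  print(r1_reads, r2_reads)
  compare_fastq_files_read_counts(
    r1_file='/path/to/sample_R1.fastq.gz',
    r2_file='/path/to/sample_R2.fastq.gz')  # raises ValueError if counts differ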
def _check_bam_index(samtools_exe, bam_file, dry_run=False): ''' An internal method for checking bam index files. It will generate a new index if its not found. :param samtools_exe: samtools executable path :param bam_file: A bam file path :param dry_run: A toggle for returning the samtools command without actually running it, default False ''' try: check_file_path(samtools_exe) bam_index = '{0}.bai'.format(bam_file) if not os.path.exists(bam_index): index_bam_or_cram(\ samtools_exe=samtools_exe, input_path=bam_file, dry_run=dry_run ) except: raise
def _fetch_project_info_from_db(self): ''' An internal method for fetching data from db :returns: A dataframe containing following columns project_igf_id, sample_igf_id, expected_read, total_read ''' try: check_file_path(self.dbconfig_file) dbconf = read_dbconf_json(self.dbconfig_file) sa = SampleAdaptor(**dbconf) sa.start_session() query = sa.session.\ query(Project.project_igf_id, Sample.sample_igf_id, func.max(Sample_attribute.attribute_value).label(self.expected_read_tag), func.sum(Run_attribute.attribute_value).label(self.total_read_tag) ).\ outerjoin(Sample,Project.project_id==Sample.project_id).\ outerjoin(Sample_attribute, Sample.sample_id==Sample_attribute.sample_id).\ outerjoin(Experiment, Sample.sample_id==Experiment.sample_id).\ outerjoin(Run,Experiment.experiment_id==Run.experiment_id).\ outerjoin(Run_attribute,Run.run_id==Run_attribute.run_id).\ filter((Experiment.platform_name.in_(self.platform_list))|(Experiment.platform_name.is_(None))).\ filter(Sample_attribute.attribute_name==self.expected_read_tag).\ filter((Run_attribute.attribute_name==self.r1_read_tag)|(Run_attribute.attribute_name.is_(None))).\ group_by(Sample.sample_igf_id) records = sa.fetch_records(query=query, output_mode='dataframe') sa.close_session() records[self.total_read_tag] = records[self.total_read_tag].fillna(0).astype(int) return records except: raise
def run_sync(self): ''' A method for running the sequencing run sync ''' try: check_file_path(self.output_dir) all_seqrun_dir = \ list_remote_file_or_dirs(\ remote_server=self.seqrun_server, remote_path=self.seqrun_path, only_dirs=True) all_seqrun_dir = \ list(map(os.path.basename,all_seqrun_dir)) # convert paths to dirname new_seqrun_dirs = \ check_seqrun_dir_in_db(\ all_seqrun_dir=all_seqrun_dir, dbconfig=self.database_config_file) # filter existing seqruns for seqrun in new_seqrun_dirs: try: new_seqruns = \ check_seqrun_dir_in_db(\ all_seqrun_dir=[seqrun], dbconfig=self.database_config_file) # filter existing seqrun again if len(new_seqruns)>0: copy_remote_file(\ source_path=os.path.join(self.seqrun_path,seqrun), destinationa_path=self.output_dir, source_address=self.seqrun_server) # sync dirs if its still new except Exception as e: raise ValueError('Failed to sync seqrun {0}, got error {1}'.\ format(seqrun,e)) except Exception as e: raise ValueError('Stopped syncing seqrun data, got error: {0}'.\ format(e))
def _copy_to_container_temp(mount_dir, container_path_prefix, filepath): ''' An internal static method for copying files to container temp dir :param mount_dir: A dir path to mount in container :param container_path_prefix: Temp dir path in container :param filepath: File or dir path to copy :returns: A path in mounted temp dir and a path in the container temp dir ''' try: check_file_path(filepath) container_path = \ os.path.join( container_path_prefix, os.path.basename(filepath)) mount_dir_path = \ os.path.join( mount_dir, os.path.basename(filepath)) copy_local_file(filepath, mount_dir_path, force=True) return mount_dir_path, container_path except Exception as e: raise ValueError("Failed to copy path {0} to temp dir: {1}, error: {2}".\ format(filepath,mount_dir,e))
def _run_checks(self): ''' An internal method for running initial checks before bwa run ''' try: check_file_path(self.bwa_exe) if self.bam_output: check_file_path(self.samtools_exe) for file in self.input_fastq_list: check_file_path(file) check_file_path(self.output_dir) if len(self.input_fastq_list) > 2: raise ValueError('Expecting max 2 fastq files, got {0}'.\ format(len(self.input_fastq_list))) except: raise
def _run_checks(self): ''' An internal method for running initial checks before star run ''' try: check_file_path(self.star_exe) # checking star exe if not isinstance(self.input_files, list) or \ len(self.input_files)==0: raise ValueError('No input file list found for star') for file in self.input_files: check_file_path(file_path=file) # checking input file paths check_file_path(file_path=self.reference_gtf) # checking input gtf filepath except: raise
def convert_scanpy_h5ad_to_cellbrowser_dir(cbImportScanpy_path, h5ad_path, project_name, cellbrowser_htmldir, use_ephemeral_space=0): ''' A wrapper function for Scanpy h5ad file to UCSC cellbrowser html dir conversion :param cbImportScanpy_path: Path for cbImportScanpy executable :param h5ad_path: Path of input Scanpy h5ad file :param project_name: Project name for cellbrowser :param cellbrowser_htmldir: Output cellbrowser htmldir path :param use_ephemeral_space: A toggle for temp dir setting, default 0 ''' try: if os.path.exists(cellbrowser_htmldir): raise IOError('Cellbrowser output path already present') check_file_path(os.path.dirname(cellbrowser_htmldir)) check_file_path(cbImportScanpy_path) check_file_path(h5ad_path) temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space) temp_cellbrowser_html = \ os.path.join(\ temp_dir, os.path.basename(cellbrowser_htmldir)) temp_cellbrowser_dir = \ os.path.join(\ temp_dir, 'out') cbImportScanpy_cmd = \ [quote(cbImportScanpy_path), '-n',quote(project_name), '-i',quote(h5ad_path), '-o',temp_cellbrowser_dir, '--htmlDir',temp_cellbrowser_html ] subprocess.check_call(' '.join(cbImportScanpy_cmd), shell=True) copytree(\ temp_cellbrowser_html, cellbrowser_htmldir) except: raise
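# A usage sketch for convert_scanpy_h5ad_to_cellbrowser_dir(); the cbImportScanpy
# path, h5ad file and output html dir are placeholders. The output dir must not
# already exist, since the function raises IOError for existing paths.
def _example_convert_h5ad_to_cellbrowser():
  convert_scanpy_h5ad_to_cellbrowser_dir(
    cbImportScanpy_path='/path/to/cbImportScanpy',
    h5ad_path='/path/to/scanpy_results.h5ad',
    project_name='IGF_project_001',
    cellbrowser_htmldir='/path/to/cellbrowser_html')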
def _run_checks(self): ''' An internal method for running initial checks before fastp run ''' try: check_file_path(self.fastp_exe) if not isinstance(self.input_fastq_list, list): raise ValueError('No input fastq list found: {0}'.format(self.input_fastq_list)) if isinstance(self.run_thread, int): self.run_thread = str(self.run_thread) # convert run thread param to str for file in self.input_fastq_list: check_file_path(file) check_file_path(self.output_dir) if len(self.input_fastq_list) > 2: raise ValueError('Expecting max 2 fastq files, got {0}'.\ format(len(self.input_fastq_list))) except: raise
def run(self): ''' A method for running the cellranger count for a given sample using ehive pipeline :param project_igf_id: A project igf id :param experiment_igf_id: An experiment igf id :param sample_igf_id: A sample igf id :param biomaterial_type: Biomaterial type for samples, required for nuclei samples :param nuclei_biomaterial_type: Required keywords for nuclei samples, default 'SINGLE_NUCLEI' :param igf_session_class: A database session class :param cellranger_exe: Cellranger executable path :param cellranger_options: Cellranger parameters List of default parameters --jobmode=pbspro --localcores=1 --localmem=4 --mempercore=4 --maxjobs=20 :param base_work_dir: Base work directory path :param fastq_collection_type: Collection type name for input fastq files, default demultiplexed_fastq :param species_name: Reference genome collection name :param reference_type: Reference genome collection type, default TRANSCRIPTOME_TENX :param nuclei_reference_type: Reference genome collection type for pre-mRNA samples, default TRANSCRIPTOME_TENX_NUCLEI :param job_timeout: Timeout for cellranger job, default 24hrs :returns: Adding cellranger_output to the dataflow_params ''' try: project_igf_id = self.param_required('project_igf_id') experiment_igf_id = self.param_required('experiment_igf_id') sample_igf_id = self.param_required('sample_igf_id') igf_session_class = self.param_required('igf_session_class') cellranger_exe = self.param_required('cellranger_exe') cellranger_options = self.param_required('cellranger_options') base_work_dir = self.param_required('base_work_dir') fastq_collection_type = self.param_required( 'fastq_collection_type') biomaterial_type = self.param_required('biomaterial_type') job_timeout = self.param_required('job_timeout') nuclei_biomaterial_type = self.param('nuclei_biomaterial_type') species_name = self.param('species_name') reference_type = self.param('reference_type') nuclei_reference_type = self.param('nuclei_reference_type') # setup work dir for run work_dir = False work_dir_prefix = \ os.path.join(\ base_work_dir, project_igf_id, sample_igf_id, experiment_igf_id) work_dir = self.get_job_work_dir( work_dir=work_dir_prefix ) # replace this with temp dir while running in queue # setup env for run os.chdir(work_dir) # move to work dir os.environ['PATH'] += '{0}{1}'.format( os.pathsep, os.path.dirname( cellranger_exe)) # add cellranger location to env PATH # collect reference genome for run if biomaterial_type == nuclei_biomaterial_type: ref_genome = \ Reference_genome_utils(\ genome_tag=species_name, dbsession_class=igf_session_class, tenx_ref_type=nuclei_reference_type) # fetch ref genome for pre-mRNA samples else: ref_genome = \ Reference_genome_utils(\ genome_tag=species_name, dbsession_class=igf_session_class, tenx_ref_type=reference_type) # collect fastq input for run cellranger_ref_transcriptome = ref_genome.get_transcriptome_tenx( ) # fetch tenx ref transcriptome from db input_fastq_dirs = \ get_cellranger_count_input_list(\ db_session_class=igf_session_class, experiment_igf_id=experiment_igf_id, fastq_collection_type=fastq_collection_type) # fetch fastq dir paths as list for run # configure cellranger count command for run cellranger_options = \ self.format_tool_options(\ cellranger_options, separator='=') cellranger_cmd = \ [cellranger_exe, 'count', '{0}={1}'.format('--fastqs', quote(','.join(input_fastq_dirs))), '{0}={1}'.format('--id', quote(experiment_igf_id)), '{0}={1}'.format('--transcriptome', quote(cellranger_ref_transcriptome)), ] # set initial parameters 
cellranger_cmd.extend( cellranger_options) # add optional parameters # log before job submission message = \ 'started cellranger count for {0}, {1} {2}'.\ format(\ project_igf_id, sample_igf_id, experiment_igf_id) self.post_message_to_slack(message, reaction='pass') # send log to slack self.comment_asana_task(task_name=project_igf_id, comment=message) # send comment to Asana message = ' '.join(cellranger_cmd) self.comment_asana_task( task_name=project_igf_id, comment=message) # send cellranger command to Asana # start job execution cellranger_cmd = ' '.join( cellranger_cmd) # create shell command string subprocess.\ check_call(\ cellranger_cmd, shell=True, timeout=job_timeout) # run cellranger count using shell # prepare output after cellranger run cellranger_output = \ os.path.join(\ work_dir, experiment_igf_id, 'outs') # get cellranger output path message = \ 'finished cellranger count for {0}, {1} {2} : {3}'.\ format(\ project_igf_id, sample_igf_id, experiment_igf_id, cellranger_output) self.post_message_to_slack(message, reaction='pass') # send log to slack self.comment_asana_task(task_name=project_igf_id, comment=message) # send comment to Asana # validate output files after cellranger run check_cellranger_count_output( output_path=cellranger_output) # check output file cellranger_report = \ os.path.join(\ cellranger_output, 'web_summary.html') check_file_path(cellranger_report) self.param('dataflow_params',\ {'cellranger_output':cellranger_output, 'cellranger_report':cellranger_report}) # pass on cellranger output path except Exception as e: message = \ 'project: {2}, sample:{3}, Error in {0}: {1}'.\ format(\ self.__class__.__name__, e, project_igf_id, sample_igf_id) self.warning(message) self.post_message_to_slack( message, reaction='fail') # post msg to slack for failed jobs if work_dir: remove_dir(work_dir) raise
def run_samtools_view(samtools_exe, input_file, output_file, reference_file=None, force=True, cram_out=False, threads=1, samtools_params=None, index_output=True, dry_run=False, use_ephemeral_space=0): ''' A function for running samtools view command :param samtools_exe: samtools executable path :param input_file: An input bam filepath with / without index. Index file will be created if its missing :param output_file: An output file path :param reference_file: Reference genome fasta filepath, default None :param force: Output file will be overwritten if force is True, default True :param threads: Number of threads to use for conversion, default 1 :param samtools_params: List of samtools param, default None :param index_output: Index output file, default True :param dry_run: A toggle for returning the samtools command without actually running it, default False :param use_ephemeral_space: A toggle for temp dir settings, default 0 :returns: Samtools command as list ''' try: check_file_path(samtools_exe) _check_bam_file(bam_file=input_file) # check bam file if not dry_run: _check_bam_index(\ samtools_exe=samtools_exe, bam_file=input_file) # check bam index temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space) temp_file = \ os.path.join(\ temp_dir, os.path.basename(output_file)) # get temp output file path view_cmd = \ [quote(samtools_exe), 'view', '-o',quote(temp_file) ] # convert bam to cram using samtools if reference_file is not None: check_file_path(reference_file) view_cmd.extend(['-T', quote(reference_file)]) if threads is not None: view_cmd.append('-@{0}'.format(quote(str(threads)))) if cram_out: view_cmd.append('-C') if reference_file is None: raise ValueError('Reference file is required for cram output') else: view_cmd.append('-b') if samtools_params is not None and \ isinstance(samtools_params, list) and \ len(samtools_params) > 0: view_cmd.extend(\ [quote(i) for i in samtools_params]) # add additional params view_cmd.append(quote(input_file)) if dry_run: return view_cmd subprocess.check_call(\ ' '.join(view_cmd), shell=True) if cram_out: _check_cram_file(cram_path=temp_file) # check cram output copy_local_file(\ source_path=temp_file, destinationa_path=output_file, force=force) # move cram file to original path remove_dir(temp_dir) # remove temp directory if index_output: index_bam_or_cram(\ samtools_exe=samtools_exe, input_path=output_file, threads=threads) return view_cmd except: raise
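# A usage sketch for run_samtools_view() showing bam to cram conversion; the samtools,
# bam, cram and reference fasta paths are placeholders. cram_out=True requires a
# reference fasta, and the output cram is indexed when index_output is True.
def _example_bam_to_cram():
  view_cmd = \
    run_samtools_view(
      samtools_exe='/usr/bin/samtools',
      input_file='/path/to/sample.bam',
      output_file='/path/to/sample.cram',
      reference_file='/path/to/genome.fa',
      cram_out=True,
      threads=4,
      force=True)
  print(view_cmd)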
def merge_multiple_bam(samtools_exe, input_bam_list, output_bam_path, sorted_by_name=False, use_ephemeral_space=0, threads=1, force=False, dry_run=False, index_output=True): ''' A function for merging multiple input bams to a single output bam :param samtools_exe: samtools executable path :param input_bam_list: A file containing list of bam filepath :param output_bam_path: A bam output filepath :param sorted_by_name: Sort bam file by read_name, default False (for coordinate sorted bams) :param threads: Number of threads to use for merging, default 1 :param force: Output bam file will be overwritten if force is True, default False :param index_output: Index output bam, default True :param use_ephemeral_space: A toggle for temp dir settings, default 0 :param dry_run: A toggle for returning the samtools command without actually running it, default False :return: samtools command ''' try: check_file_path(samtools_exe) check_file_path(input_bam_list) with open(input_bam_list, 'r') as fp: for bam in fp: check_file_path(bam.strip()) temp_dir = \ get_temp_dir(use_ephemeral_space=use_ephemeral_space) temp_bam = \ os.path.join(\ temp_dir, os.path.basename(output_bam_path)) merge_cmd = \ [quote(samtools_exe), 'merge', '--output-fmt','BAM', '--threads',quote(str(threads)), '-b',quote(input_bam_list) ] if sorted_by_name: merge_cmd.append('-n') # Input files are sorted by read name merge_cmd.append(temp_bam) if dry_run: return merge_cmd subprocess.check_call(merge_cmd) # run samtools merge copy_local_file(\ source_path=temp_bam, destinationa_path=output_bam_path, force=force) # copy bamfile remove_dir(temp_dir) # remove temp dir _check_bam_file(output_bam_path) if index_output and \ not sorted_by_name: index_bam_or_cram(\ samtools_exe=samtools_exe, input_path=output_bam_path, threads=threads) return merge_cmd except: raise
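# A usage sketch for merge_multiple_bam(); the list file is a plain text file with one
# bam path per line, and all paths below are placeholders.
def _example_merge_bams():
  merge_cmd = \
    merge_multiple_bam(
      samtools_exe='/usr/bin/samtools',
      input_bam_list='/path/to/bam_list.txt',
      output_bam_path='/path/to/merged.bam',
      threads=4,
      force=True)
  print(merge_cmd)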
def run_sort_bam(samtools_exe, input_bam_path, output_bam_path, sort_by_name=False,
                 use_ephemeral_space=0, threads=1, force=False, dry_run=False,
                 cram_out=False, index_output=True):
  '''
  A function for sorting an input bam file and generating a sorted output bam

  :param samtools_exe: samtools executable path
  :param input_bam_path: A bam filepath
  :param output_bam_path: A bam output filepath
  :param sort_by_name: Sort bam file by read_name, default False (for coordinate sorting)
  :param threads: Number of threads to use for sorting, default 1
  :param force: Output bam file will be overwritten if force is True, default False
  :param cram_out: Output cram file, default False
  :param index_output: Index output bam, default True
  :param use_ephemeral_space: A toggle for temp dir settings, default 0
  :param dry_run: A toggle for returning the samtools command without actually running it, default False
  :returns: None (the samtools command list is returned when dry_run is True)
  '''
  try:
    check_file_path(samtools_exe)
    _check_bam_file(bam_file=input_bam_path)
    sort_cmd = [
      quote(samtools_exe),
      'sort',
      '-@{0}'.format(quote(str(threads)))]
    if sort_by_name:
      sort_cmd.append('-n')  # sorting by read name
    if cram_out:
      sort_cmd.append('--output-fmt CRAM')
    else:
      sort_cmd.append('--output-fmt BAM')
    temp_dir = get_temp_dir(use_ephemeral_space=use_ephemeral_space)
    temp_bam = \
      os.path.join(
        temp_dir, os.path.basename(output_bam_path))
    sort_cmd.extend(['-o', quote(temp_bam)])
    sort_cmd.append(quote(input_bam_path))
    if dry_run:
      return sort_cmd
    subprocess.check_call(
      ' '.join(sort_cmd), shell=True)  # run samtools sort before copying output
    copy_local_file(
      source_path=temp_bam,
      destinationa_path=output_bam_path,
      force=force)  # copy output bam
    remove_dir(temp_dir)  # remove temp dir
    if cram_out:
      _check_cram_file(output_bam_path)
    else:
      _check_bam_file(output_bam_path)
    if index_output:
      index_bam_or_cram(
        samtools_exe=samtools_exe,
        input_path=output_bam_path,
        threads=threads)
  except:
    raise
def run_bam_stats(samtools_exe, bam_file, output_dir, threads=1, force=False,
                  output_prefix=None, dry_run=False):
  '''
  A method for generating samtools stats output

  :param samtools_exe: samtools executable path
  :param bam_file: A bam filepath with / without index. Index file will be created if it's missing
  :param output_dir: Bam stats output directory path
  :param output_prefix: Output file prefix, default None
  :param threads: Number of threads to use, default 1
  :param force: Output stats file will be overwritten if force is True, default False
  :param dry_run: A toggle for returning the samtools command without actually running it, default False
  :returns: Output file path, a list containing the samtools command and a list containing the SN metrics of the report
  '''
  try:
    check_file_path(samtools_exe)
    _check_bam_file(bam_file=bam_file)
    if not dry_run:
      _check_bam_index(
        samtools_exe=samtools_exe,
        bam_file=bam_file)
    if output_prefix is None:
      output_prefix = os.path.basename(bam_file)
    output_path = \
      '{0}.{1}.{2}'.format(output_prefix, 'stats', 'txt')
    output_path = \
      os.path.join(
        output_dir, output_path)
    if not os.path.exists(output_dir):
      raise IOError('Output path {0} not found'.format(output_dir))
    if os.path.exists(output_path) and not force:
      raise ValueError(
        'Output file {0} already present, use force to overwrite'.format(output_path))
    stats_cmd = [
      quote(samtools_exe),
      'stats',
      '-@{0}'.format(quote(str(threads))),
      quote(bam_file)]
    if dry_run:
      return stats_cmd
    with open(output_path, 'w') as fp:
      with subprocess.Popen(stats_cmd, stdout=subprocess.PIPE) as proc:
        fp.write(proc.stdout.read().decode('utf-8'))  # write bam stats output
    stats_data_list = \
      _parse_samtools_stats_output(stats_file=output_path)  # parse stats output file
    return output_path, stats_cmd, stats_data_list
  except:
    raise
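# A usage sketch for run_bam_stats(); samtools, bam and output dir paths are
# placeholders. The function returns the stats file path, the samtools command and
# the parsed SN metrics list.
def _example_run_bam_stats():
  output_path, stats_cmd, stats_data = \
    run_bam_stats(
      samtools_exe='/usr/bin/samtools',
      bam_file='/path/to/sample.bam',
      output_dir='/path/to/qc_dir',
      threads=4,
      output_prefix='sample',
      force=True)
  print(output_path, stats_data)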