Пример #1
0
 def test_find_fastq_and_build_db_collection(self):
     """Build the fastq collections and check one file's collection name.

     Runs ``Collect_seqrun_fastq_to_db.find_fastq_and_build_db_collection``
     with the fixture values stored on the test case, then asks
     ``CollectionAdaptor`` which collection a known fastq path belongs to.
     """
     ci = Collect_seqrun_fastq_to_db(
         fastq_dir=self.fastq_dir,
         session_class=self.session_class,
         seqrun_igf_id=self.seqrun_igf_id,
         flowcell_id=self.flowcell_id,
         model_name=self.model_name,
         file_location=self.file_location,
         samplesheet_file=self.samplesheet_file,
         manifest_name=self.manifest_name,
     )
     ci.find_fastq_and_build_db_collection()
     file_path = 'data/collect_fastq_dir/sc_1_8/IGFP0001_test_22-8-2017_rna_sc/IGF00001/IGF00001-1_S1_L003_R1_001.fastq.gz'
     ca = CollectionAdaptor(**{'session_class': self.session_class})
     ca.start_session()
     try:
         # Only the collection name is asserted; the table value is discarded
         # instead of being bound to a name that shadows the builtin ``type``.
         name, _ = ca.fetch_collection_name_and_table_from_file_path(file_path)
     finally:
         ca.close_session()  # release the session even if the lookup raises
     self.assertEqual(name, 'IGF00001_NEXTSEQ_TESTABC_3')
Пример #2
0
 def test_find_fastq_and_build_db_collection(self):
     """Build the fastq collections and check one file's collection name.

     Runs ``Collect_seqrun_fastq_to_db.find_fastq_and_build_db_collection``
     with the fixture values stored on the test case, then asks
     ``CollectionAdaptor`` which collection a known fastq path belongs to.
     """
     ci = Collect_seqrun_fastq_to_db(
         fastq_dir=self.fastq_dir,
         session_class=self.session_class,
         seqrun_igf_id=self.seqrun_igf_id,
         flowcell_id=self.flowcell_id,
         model_name=self.model_name,
         file_location=self.file_location,
         samplesheet_file=self.samplesheet_file,
         manifest_name=self.manifest_name,
     )
     ci.find_fastq_and_build_db_collection()
     file_path = 'data/collect_fastq_dir/1_16/IGFP0001_test_22-8-2017_rna/IGF00002/IGF00002-2_S1_L001_R1_001.fastq.gz'
     ca = CollectionAdaptor(**{'session_class': self.session_class})
     ca.start_session()
     try:
         # The previously built (and never executed) Collection query was
         # dropped: its result was unused and it played no part in the check.
         # The table value is discarded rather than shadowing builtin ``type``.
         name, _ = ca.fetch_collection_name_and_table_from_file_path(file_path)
     finally:
         ca.close_session()  # release the session even if the lookup raises
     self.assertEqual(name, 'IGF00002_MISEQ_000000000-D0YLK_1')
  def run(self):
    '''
    Run fastqc for a single fastq file and publish the report paths

    Steps:

    * read the job parameters
    * when tag is 'known' and store_file is set, look up the collection name
      of the fastq file in the db and use it to fetch the sample id
    * build the result dir as base_results_dir / project_name / seqrun_date /
      flowcell_id / basename(fastq_dir) / tag [/ sample_name] /
      fastq file label / fastqc_dir_label
    * run the fastqc executable in a temp work dir and copy the generated
      zip and html files to the result dir
    * when tag is 'known' and store_file is set, load both report files to db
    * set the 'dataflow_params' param

    :raises ValueError: if the collection table differs from
                        required_collection_table, if the zip or html report
                        is not produced, or if no collection name was found
    :raises IOError: if the fastq file path doesn't exist

    Any exception is sent to self.warning and to slack and is then re-raised
    '''
    from shlex import quote                                                     # local import, used for quoting paths in the shell command

    seqrun_igf_id = None                                                        # defined before try, so the error handler can always format its message
    try:
      fastq_file = self.param_required('fastq_file')
      fastq_dir = self.param_required('fastq_dir')
      igf_session_class = self.param_required('igf_session_class')
      fastqc_exe = self.param_required('fastqc_exe')
      tag = self.param_required('tag')
      seqrun_igf_id = self.param_required('seqrun_igf_id')
      seqrun_date = self.param_required('seqrun_date')
      flowcell_id = self.param_required('flowcell_id')
      fastqc_options = self.param('fastqc_options')
      base_results_dir = self.param_required('base_results_dir')
      project_name = self.param_required('project_name')
      force_overwrite = self.param('force_overwrite')
      fastqc_dir_label = self.param('fastqc_dir_label')
      required_collection_table = self.param('required_collection_table')
      sample_name = self.param('sample_name')
      hpc_location = self.param('hpc_location')
      fastqc_collection_type = self.param('fastqc_collection_type')
      use_ephemeral_space = self.param('use_ephemeral_space')
      store_file = self.param('store_file')

      lane_index_info = os.path.basename(fastq_dir)                             # last path component of fastq dir, used as a result dir level
      fastq_file_label = os.path.basename(fastq_file).replace('.fastq.gz','')
      collection_name = None
      collection_table = None
      if tag=='known' and store_file:                                           # fetch sample name from db for known fastq
        base = BaseAdaptor(**{'session_class':igf_session_class})
        base.start_session()                                                    # connect to db
        try:
          ca = CollectionAdaptor(**{'session':base.session})
          (collection_name,collection_table) = \
            ca.fetch_collection_name_and_table_from_file_path(\
              file_path=fastq_file)                                             # fetch collection name and table info

          if collection_table != required_collection_table:
            raise ValueError(
              'Expected collection table {0} and got {1}, {2}'.\
                format(
                  required_collection_table,
                  collection_table,
                  fastq_file))

          ra = RunAdaptor(**{'session':base.session})
          sample = ra.fetch_sample_info_for_run(run_igf_id=collection_name)
          sample_name = sample['sample_igf_id']
        finally:
          base.close_session()                                                  # always release the db session, also on failure

      fastqc_result_dir = \
        os.path.join(\
          base_results_dir,
          project_name,
          seqrun_date,
          flowcell_id,
          lane_index_info,
          tag)                                                                  # result dir path is generic
      if sample_name is not None:
        fastqc_result_dir = \
          os.path.join(\
            fastqc_result_dir,
            sample_name)                                                        # add sample name to dir path if its available

      fastqc_result_dir = \
        os.path.join(\
          fastqc_result_dir,
          fastq_file_label,
          fastqc_dir_label)                                                     # keep multiple files under same dir

      if os.path.exists(fastqc_result_dir) and force_overwrite:
        remove_dir(fastqc_result_dir)                                           # remove existing output dir if force_overwrite is true

      if not os.path.exists(fastqc_result_dir):
        os.makedirs(fastqc_result_dir,mode=0o775)                               # create output dir if its not present

      temp_work_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get a temp work dir
      if not os.path.exists(fastq_file):
        raise IOError('fastq file {0} not readable'.format(fastq_file))         # raise if fastq file path doesn't exist

      fastqc_output = \
        os.path.join(\
          temp_work_dir,
          fastq_file_label)
      os.mkdir(fastqc_output)                                                   # create fastqc output dir
      fastqc_param = \
        self.format_tool_options(fastqc_options)                                # format fastqc params
      fastqc_cmd = \
        [fastqc_exe,
         '-o',quote(fastqc_output),
         '-d',quote(temp_work_dir)]                                             # fastqc base parameters, paths are quoted for the shell
      fastqc_cmd.extend(fastqc_param)                                           # add additional parameters, passed as given
      fastqc_cmd.append(quote(fastq_file))                                      # fastqc input file, quoted so spaces can't split the path
      subprocess.check_call(' '.join(fastqc_cmd),shell=True)                    # run fastqc

      fastqc_zip = None
      fastqc_html = None
      for root, _, files in os.walk(top=fastqc_output):
        for file_name in files:
          if fnmatch.fnmatch(file_name, '*.zip'):
            copy2(os.path.join(root,file_name),fastqc_result_dir)
            fastqc_zip = os.path.join(fastqc_result_dir,file_name)

          if fnmatch.fnmatch(file_name, '*.html'):
            copy2(os.path.join(root,file_name),fastqc_result_dir)
            fastqc_html = os.path.join(fastqc_result_dir,file_name)

      if fastqc_html is None or fastqc_zip is None:
        raise ValueError('Missing required values, fastqc zip: {0}, fastqc html: {1}'.\
                         format(fastqc_zip,fastqc_html))

      if tag=='known' and store_file:
        if collection_name is None:
          raise ValueError('couldn\'t retrieve collection name for {0}'.\
                           format(fastq_file))

        fastqc_files = \
          [{'name':collection_name,
            'type':fastqc_collection_type,
            'table':required_collection_table,
            'file_path':report_path,
            'location':hpc_location}
           for report_path in (fastqc_zip,fastqc_html)]                         # one entry per report file, zip first
        ca = CollectionAdaptor(**{'session_class':igf_session_class})
        ca.start_session()
        try:
          ca.load_file_and_create_collection(data=fastqc_files)                 # store fastqc files to db
        finally:
          ca.close_session()                                                    # always release the db session, also on failure

      self.param('dataflow_params',
                 {'fastqc_html':fastqc_html,
                  'lane_index_info':lane_index_info,
                  'sample_name':sample_name,
                  'fastqc':{'fastq_dir':fastq_dir,
                            'fastqc_zip':fastqc_zip,
                            'fastqc_html':fastqc_html}})                        # set dataflow params
    except Exception as e:
      message = \
        'seqrun: {2}, Error in {0}: {1}'.\
        format(\
          self.__class__.__name__,
          e,
          seqrun_igf_id)                                                        # seqrun id is None if the failure happened before it was read
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')                       # post msg to slack for failed jobs
      raise
    def _process_samples_data(self):
        '''
        An internal method for processing samples data

        Combines three sources for the current fastq_dir:

        * fastqc entries from qc_files['fastqc'] (read counts come from the
          fastqc zip, sample ids are looked up in the db)
        * fastqscreen entries from qc_files['fastqscreen']
        * samplesheet rows from fastq_dir / samplesheet_filename, where rows
          whose Description equals singlecell_tag are mapped back to their
          Original_* columns

        :returns: A list of column headers and a list of dictionaries, one
                  per merged fastq file row, restricted to those headers
        :raises ValueError: if no fastqc or fastqscreen entry matches
                            fastq_dir, or if the two don't share a fastq file
        :raises IOError: if the samplesheet file is not found
        '''
        fastq_dir = self.param_required('fastq_dir')
        qc_files = self.param_required('qc_files')
        samplesheet_filename = self.param('samplesheet_filename')
        igf_session_class = self.param_required('igf_session_class')
        remote_project_path = self.param_required('remote_project_path')
        project_name = self.param_required('project_name')
        seqrun_date = self.param_required('seqrun_date')
        flowcell_id = self.param_required('flowcell_id')
        lane_index_info = self.param_required('lane_index_info')
        singlecell_tag = self.param('singlecell_tag')

        remote_path = \
            os.path.join(
                remote_project_path,
                project_name,
                seqrun_date,
                flowcell_id,
                lane_index_info)  # get remote base path

        fastqc_data = list()
        base = BaseAdaptor(**{'session_class': igf_session_class})
        base.start_session()  # connect to db
        try:
            ca = CollectionAdaptor(**{'session': base.session})
            ra = RunAdaptor(**{'session': base.session})
            for fastqc_file in qc_files['fastqc']:  # get fastqc files for fastq_dir
                fastqc_zip = fastqc_file['fastqc_zip']
                fastq_file = fastqc_file['fastq_file']
                qc_fastq_dir = fastqc_file['fastq_dir']
                if qc_fastq_dir != fastq_dir:  # skip entries of other fastq dirs
                    continue

                remote_fastqc_path = \
                    os.path.relpath(
                        fastqc_file['remote_fastqc_path'],
                        start=remote_path)  # get relative path
                (total_reads, _) = \
                    get_fastq_info_from_fastq_zip(fastqc_zip)
                (collection_name, _) = \
                    ca.fetch_collection_name_and_table_from_file_path(
                        file_path=fastq_file)  # fetch collection name and table info
                sample = \
                    ra.fetch_sample_info_for_run(run_igf_id=collection_name)
                fastqc_data.append(
                    {'Sample_ID': sample['sample_igf_id'],
                     'Fastqc': remote_fastqc_path,
                     'FastqFile': fastq_file,
                     'TotalReads': total_reads})
        finally:
            base.close_session()  # close db connection, also on failure

        fastqs_data = list()
        for fastqs_file in qc_files['fastqscreen']:  # get fastqs files for fastq_dir
            fastq_file = fastqs_file['fastq_file']
            remote_fastqs_path = fastqs_file['remote_fastqscreen_path']
            # Read the dir from the fastqscreen entry itself. The stale
            # loop variable of the fastqc loop was used here before, which
            # compared every entry against the last fastqc file's dir.
            # NOTE(review): assumes fastqscreen entries carry a 'fastq_dir'
            # key like the fastqc entries do - confirm against the producer.
            qs_fastq_dir = fastqs_file['fastq_dir']
            if qs_fastq_dir != fastq_dir:  # skip entries of other fastq dirs
                continue

            fastqs_data.append(
                {'Fastqscreen': os.path.relpath(
                    remote_fastqs_path,
                    start=remote_path),  # get relative path
                 'FastqFile': fastq_file})

        if not fastqc_data or not fastqs_data:
            raise ValueError('Value not found for fastqc: {0} or fastqscreen:{1}'.\
                             format(len(fastqc_data), len(fastqs_data)))

        fastqc_data = pd.DataFrame(fastqc_data)
        fastqs_data = \
            pd.DataFrame(fastqs_data).set_index('FastqFile')  # convert to dataframe
        merged_qc_info = \
            fastqc_data.join(
                fastqs_data,
                how='inner',
                on='FastqFile',
                lsuffix='',
                rsuffix='_s')  # merge fastqc and fastqscreen info
        if len(merged_qc_info) == 0:
            raise ValueError('No QC data found for merging, fastqc:{0}, fastqscreen: {1}'.\
                             format(len(fastqc_data), len(fastqs_data)))

        samplesheet_file = \
            os.path.join(
                fastq_dir,
                samplesheet_filename)
        if not os.path.exists(samplesheet_file):
            raise IOError('samplesheet file {0} not found'.\
                          format(samplesheet_file))

        final_samplesheet_data = list()
        samplesheet_sc = \
            SampleSheet(infile=samplesheet_file)  # read samplesheet for single cell check
        samplesheet_sc.filter_sample_data(
            condition_key='Description',
            condition_value=singlecell_tag,
            method='include')  # keep only single cell samples
        if len(samplesheet_sc._data) > 0:
            # Restructure single cell data: drop the per-run id columns and
            # promote the Original_* columns in their place. 'records' is
            # required to get a list of row dictionaries; the orient value
            # used before is not accepted by pandas.
            sc_data = \
                pd.DataFrame(samplesheet_sc._data).\
                drop(['Sample_ID', 'Sample_Name', 'index'], axis=1).\
                drop_duplicates().\
                rename(columns={'Original_Sample_ID': 'Sample_ID',
                                'Original_Sample_Name': 'Sample_Name',
                                'Original_index': 'index'}).\
                to_dict(orient='records')
            final_samplesheet_data.extend(sc_data)  # add single cell samples to final data

        sa = SampleSheet(infile=samplesheet_file)
        sa.filter_sample_data(
            condition_key='Description',
            condition_value=singlecell_tag,
            method='exclude')  # remove only single cell samples
        if len(sa._data) > 0:
            final_samplesheet_data.extend(sa._data)  # add non single cell samples info to final data

        sample_data = \
            pd.DataFrame(final_samplesheet_data).\
            set_index('Sample_ID')  # get sample info from final data
        merged_data = \
            merged_qc_info.join(
                sample_data,
                how='inner',
                on='Sample_ID',
                lsuffix='',
                rsuffix='_sa')  # merge sample data with qc data
        required_headers = \
            ['Sample_ID',
             'Sample_Name',
             'FastqFile',
             'TotalReads',
             'index']
        if 'index2' in list(sample_data.columns):
            required_headers.append('index2')  # second index is optional

        required_headers.extend(
            ['Fastqc',
             'Fastqscreen'])  # create header order
        merged_data['FastqFile'] = \
            merged_data['FastqFile'].map(os.path.basename)  # keep only fastq filename
        qc_merged_data = \
            merged_data.loc[:, required_headers].\
            to_dict(orient='records')  # extract final data
        return required_headers, qc_merged_data