def test_load_file_to_disk_and_db2(self):
     au = Analysis_collection_utils(dbsession_class=self.session_class,
                                    analysis_name='AnalysisA',
                                    tag_name='TagA',
                                    collection_name='ProjectA',
                                    collection_type='AnalysisA_Files',
                                    collection_table='project')
     input_file_list = [
         os.path.join(self.temp_work_dir, file_name)
         for file_name in self.input_list
     ]
     output_list = au.load_file_to_disk_and_db(
         input_file_list=input_file_list, withdraw_exisitng_collection=True
     )  # withdrawing existing collection group before loading new
     base = BaseAdaptor(**{'session_class': self.session_class})
     base.start_session()
     ca = CollectionAdaptor(**{'session': base.session})
     ca_files = ca.get_collection_files(collection_name='ProjectA',
                                        collection_type='AnalysisA_Files',
                                        output_mode='dataframe')
     self.assertEqual(len(ca_files.index),
                      1)  # check for unique collection group
     fa = FileAdaptor(**{'session': base.session})
     query = fa.session.query(File)
     fa_records = fa.fetch_records(query=query, output_mode='dataframe')
     self.assertEqual(
         len(fa_records['file_path'].to_dict()), 3
     )  # check if all files are present although only one collection group exists
     self.assertEqual(len(output_list), 3)
     base.close_session()
 def test_load_file_to_disk_and_db1(self):
     au = Analysis_collection_utils(dbsession_class=self.session_class,
                                    analysis_name='AnalysisA',
                                    tag_name='TagA',
                                    collection_name='ProjectA',
                                    collection_type='AnalysisA_Files',
                                    collection_table='project')
     input_file_list = [
         os.path.join(self.temp_work_dir, file_name)
         for file_name in self.input_list
     ]
     output_list = au.load_file_to_disk_and_db(
         input_file_list=input_file_list,
         withdraw_exisitng_collection=False
     )  # loading all files to same collection
     base = BaseAdaptor(**{'session_class': self.session_class})
     base.start_session()
     ca = CollectionAdaptor(**{'session': base.session})
     ca_files = ca.get_collection_files(collection_name='ProjectA',
                                        collection_type='AnalysisA_Files',
                                        output_mode='dataframe')
     self.assertEqual(len(ca_files.index),
                      len(self.input_list))  # compare with input list
     self.assertEqual(len(output_list),
                      len(self.input_list))  # compare with output list
     base.close_session()
 def test_load_file_to_disk_and_db7(self):
     au = Analysis_collection_utils(dbsession_class=self.session_class,
                                    analysis_name='AnalysisA',
                                    tag_name='TagA',
                                    collection_name='RunA',
                                    collection_type='AnalysisA_Files',
                                    collection_table='run',
                                    base_path=self.temp_base_dir)
     input_file_list = [
         os.path.join(self.temp_work_dir, file_name)
         for file_name in self.input_list
     ]
     output_list = au.load_file_to_disk_and_db(
         input_file_list=input_file_list,
         withdraw_exisitng_collection=False
     )  # loading all files to same collection
     base = BaseAdaptor(**{'session_class': self.session_class})
     base.start_session()
     ca = CollectionAdaptor(**{'session': base.session})
     ca_files = ca.get_collection_files(collection_name='RunA',
                                        collection_type='AnalysisA_Files',
                                        output_mode='dataframe')
     file_list = list(ca_files['file_path'].to_dict().values())
     datestamp = get_datestamp_label()
     test_file = os.path.join(
         self.temp_base_dir, 'ProjectA', 'SampleA', 'ExperimentA', 'RunA',
         'AnalysisA', '{0}_{1}_{2}_{3}.{4}'.format('RunA', 'AnalysisA',
                                                   'TagA', datestamp,
                                                   'cram'))
     test_file = preprocess_path_name(input_path=test_file)
     self.assertTrue(test_file in file_list)
     self.assertTrue(test_file in output_list)
     base.close_session()
 def test_load_file_to_disk_and_db4(self):
     au = Analysis_collection_utils(dbsession_class=self.session_class,
                                    analysis_name='AnalysisA',
                                    tag_name='TagA',
                                    collection_name='ProjectA',
                                    collection_type='AnalysisA_Files',
                                    collection_table='project',
                                    rename_file=False)
     input_file_list = [
         os.path.join(self.temp_work_dir, file_name)
         for file_name in self.input_list
     ]
     output_list = au.load_file_to_disk_and_db(
         input_file_list=input_file_list,
         withdraw_exisitng_collection=False
     )  # loading all files to same collection, without rename
     base = BaseAdaptor(**{'session_class': self.session_class})
     base.start_session()
     ca = CollectionAdaptor(**{'session': base.session})
     ca_files = ca.get_collection_files(collection_name='ProjectA',
                                        collection_type='AnalysisA_Files',
                                        output_mode='dataframe')
     file_list = list(ca_files['file_path'].to_dict().values())
     self.assertTrue(input_file_list[0] in file_list)
     self.assertTrue(input_file_list[0] in output_list)
     base.close_session()
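     # Hedged usage sketch (not part of the original test suite): this only
     # restates how the tests above drive Analysis_collection_utils; the file
     # paths and the base_path value are illustrative assumptions.
     #
     # au = Analysis_collection_utils(
     #     dbsession_class=session_class,        # SQLAlchemy session class, as in setUp
     #     analysis_name='AnalysisA',
     #     tag_name='TagA',
     #     collection_name='ProjectA',
     #     collection_type='AnalysisA_Files',
     #     collection_table='project',
     #     rename_file=True,                     # False keeps the original file names (db4)
     #     base_path='/path/results')            # optional: output is copied under this path (db7)
     # output_list = au.load_file_to_disk_and_db(
     #     input_file_list=['/path/a.cram'],
     #     withdraw_exisitng_collection=True)    # True replaces the existing collection group (db2),
     #                                           # False appends to the same collection (db1)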
    def create_or_update_analysis_collection(self,
                                             file_path,
                                             dbsession,
                                             withdraw_exisitng_collection=True,
                                             autosave_db=True,
                                             force=True,
                                             remove_file=False):
        '''
    A method for creating or updating an analysis file collection in the database.
    Required elements are collected from the database if the base_path element is given.

    :param file_path: File path to load as a db collection
    :param dbsession: An active database session
    :param withdraw_exisitng_collection: Remove existing collection group, default True
    :param autosave_db: Save changes to database, default True
    :param remove_file: A toggle for removing the existing file from disk, default False
    :param force: A toggle for removing the existing file collection record, default True
    '''
        try:
            ca = CollectionAdaptor(**{'session': dbsession})

            collection_exists = \
              ca.get_collection_files(
                collection_name=self.collection_name,
                collection_type=self.collection_type)
            if len(collection_exists.index) >0 and \
                withdraw_exisitng_collection:
                remove_data = [{
                    'name': self.collection_name,
                    'type': self.collection_type
                }]
                ca.remove_collection_group_info(
                    data=remove_data, autosave=autosave_db
                )  # removing all existing collection groups for the collection name and type

            fa = FileAdaptor(**{'session': dbsession})
            file_exists = fa.check_file_records_file_path(
                file_path=file_path)  # check if file already present in db
            if file_exists and force:
                fa.remove_file_data_for_file_path(
                    file_path=file_path,
                    remove_file=remove_file,
                    autosave=autosave_db
                )  # remove entry from file table and disk

            collection_data = [{
                'name': self.collection_name,
                'type': self.collection_type,
                'table': self.collection_table,
                'file_path': file_path
            }]
            ca.load_file_and_create_collection(
                data=collection_data,
                calculate_file_size_and_md5=True,
                autosave=autosave_db
            )  # load file, collection and create collection group
        except:
            raise
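     # Hedged usage sketch for the method above: driving it from an active
     # session, as the tests on this page do. 'au' is an Analysis_collection_utils
     # instance and the cram path is an illustrative assumption.
     #
     # base = BaseAdaptor(**{'session_class': session_class})
     # base.start_session()
     # au.create_or_update_analysis_collection(
     #     file_path='/path/a.cram',
     #     dbsession=base.session,
     #     withdraw_exisitng_collection=True,    # drop any existing collection group first
     #     autosave_db=True,                     # commit the new collection and file records
     #     force=True,                           # replace an existing file record for the same path
     #     remove_file=False)                    # keep the old file on disk
     # base.close_session()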
Example #6
 def setUp(self):
   self.dbconfig = 'data/dbconfig.json'
   dbparam=read_dbconf_json(self.dbconfig)
   base = BaseAdaptor(**dbparam)
   self.engine = base.engine
   self.dbname=dbparam['dbname']
   Base.metadata.drop_all(self.engine)
   if os.path.exists(self.dbname):
     os.remove(self.dbname)
   Base.metadata.create_all(self.engine)
   self.session_class=base.get_session_class()
   base.start_session()
   project_data=[{'project_igf_id':'ProjectA'}]
   pa=ProjectAdaptor(**{'session':base.session})
   pa.store_project_and_attribute_data(data=project_data)                      # load project data
   sample_data=[{'sample_igf_id':'SampleA',
                 'project_igf_id':'ProjectA'}]                                 # sample data
   sa=SampleAdaptor(**{'session':base.session})
   sa.store_sample_and_attribute_data(data=sample_data)                        # store sample data
   experiment_data=[{'experiment_igf_id':'ExperimentA',
                     'sample_igf_id':'SampleA',
                     'library_name':'SampleA',
                     'platform_name':'MISEQ',
                     'project_igf_id':'ProjectA'}]                             # experiment data
   ea=ExperimentAdaptor(**{'session':base.session})
   ea.store_project_and_attribute_data(data=experiment_data)
   self.temp_dir=get_temp_dir()
   temp_files=['a.csv','b.csv']
   for temp_file in temp_files:
     with open(os.path.join(self.temp_dir,temp_file),'w') as fp:
       fp.write('A')
   collection_data=[{'name':'ExperimentA',
                     'type':'AnalysisA_html',
                     'table':'experiment',
                     'file_path':os.path.join(self.temp_dir,temp_file)}
                     for temp_file in temp_files]
   ca=CollectionAdaptor(**{'session':base.session})
   ca.load_file_and_create_collection(data=collection_data,
                                      calculate_file_size_and_md5=False)
   base.close_session()
Example #7
 def test_find_fastq_and_build_db_collection(self):
     ci = Collect_seqrun_fastq_to_db(
         fastq_dir=self.fastq_dir,
         session_class=self.session_class,
         seqrun_igf_id=self.seqrun_igf_id,
         flowcell_id=self.flowcell_id,
         model_name=self.model_name,
         file_location=self.file_location,
         samplesheet_file=self.samplesheet_file,
         manifest_name=self.manifest_name,
     )
     ci.find_fastq_and_build_db_collection()
     ca = CollectionAdaptor(**{'session_class': self.session_class})
     ca.start_session()
     file_path = 'data/collect_fastq_dir/sc_1_8/IGFP0001_test_22-8-2017_rna_sc/IGF00001/IGF00001-1_S1_L003_R1_001.fastq.gz'
     (name,
      type) = ca.fetch_collection_name_and_table_from_file_path(file_path)
     ca.close_session()
     self.assertEqual(name, 'IGF00001_NEXTSEQ_TESTABC_3')
    def run(self):
        try:
            project_igf_id = self.param_required('project_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            run_igf_id = self.param_required('run_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            fastq_collection_type = self.param('fastq_collection_type')
            fastq_collection_table = self.param('fastq_collection_table')
            ca = CollectionAdaptor(**{'session_class': igf_session_class})
            ca.start_session()
            fastq_files = ca.get_collection_files(
                collection_name=run_igf_id,
                collection_type=fastq_collection_type,
                collection_table=fastq_collection_table,
                output_mode='dataframe')
            ca.close_session()
            fastq_counts = len(fastq_files.index)
            fastq_list = list(fastq_files['file_path'].values
                              )  # converting fastq filepaths to a list
            if not isinstance(fastq_list, list) or \
               len(fastq_list)==0:
                raise ValueError(
                    'No fastq file found for run {0}'.format(run_igf_id))

            for file in fastq_list:
                if not os.path.exists(file):
                    raise IOError('Fastq file path {0} not found for run {1}'.\
                                  format(file,run_igf_id))

            self.param('dataflow_params',
                       {'fastq_files_list': fastq_list
                        })  # add fastq filepaths to dataflow
        except Exception as e:
            message='project: {2}, sample:{3}, Error in {0}: {1}'.\
                    format(self.__class__.__name__,
                           e,
                           project_igf_id,
                           sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
 def setUp(self):
     self.dbconfig = 'data/dbconfig.json'
     dbparam = read_dbconf_json(self.dbconfig)
     base = BaseAdaptor(**dbparam)
     self.engine = base.engine
     self.dbname = dbparam['dbname']
     Base.metadata.create_all(self.engine)
     self.session_class = base.get_session_class()
     self.json_file_path = 'data/reset_samplesheet_md5/seqrun1_file_md5.json'
     json_data = pd.DataFrame([{
         'file_md5': '1e7531158974b5a5b7cbb7dde09ac779',
         'seqrun_file_name': 'SampleSheet.csv'
     }, {
         'file_md5': '2b22f945bc9e7e390af5432425783a03',
         'seqrun_file_name': 'RTAConfiguration.xml'
     }])
     with open(self.json_file_path, 'w') as jp:
         json.dump(json_data.to_dict(orient='records'), jp, indent=4)
     self.initial_json_md5 = calculate_file_checksum(
         filepath=self.json_file_path)
     self.correct_samplesheet_md5 = '259ed03f2e8c45980de121f7c3a70565'
     self.json_collection_name = 'seqrun1'
     self.json_collection_type = 'ILLUMINA_BCL_MD5'
     self.seqrun_path = 'data/reset_samplesheet_md5'
     self.seqrun_input_list = 'data/reset_samplesheet_md5/seqrun_input_list.txt'
     ca = CollectionAdaptor(**{'session_class': self.session_class})
     ca.start_session()
     data = pd.DataFrame([{
         'name': self.json_collection_name,
         'type': self.json_collection_type,
         'table': 'seqrun',
         'file_path': self.json_file_path,
     }])
     ca.load_file_and_create_collection(data, autosave=True, hasher='md5')
     ca.close_session()
     with open(self.seqrun_input_list, 'w') as fp:
         fp.write(self.json_collection_name)
Example #10
 def test_find_fastq_and_build_db_collection(self):
     ci = Collect_seqrun_fastq_to_db(
         fastq_dir=self.fastq_dir,
         session_class=self.session_class,
         seqrun_igf_id=self.seqrun_igf_id,
         flowcell_id=self.flowcell_id,
         model_name=self.model_name,
         file_location=self.file_location,
         samplesheet_file=self.samplesheet_file,
         manifest_name=self.manifest_name,
     )
     ci.find_fastq_and_build_db_collection()
     ca = CollectionAdaptor(**{'session_class': self.session_class})
     ca.start_session()
     query = ca.session.query(Collection).filter(
         Collection.name == 'IGF00001_MISEQ_000000000-D0YLK_1')
     file_path = 'data/collect_fastq_dir/1_16/IGFP0001_test_22-8-2017_rna/IGF00002/IGF00002-2_S1_L001_R1_001.fastq.gz'
     (name,
      type) = ca.fetch_collection_name_and_table_from_file_path(file_path)
     ca.close_session()
     self.assertEqual(name, 'IGF00002_MISEQ_000000000-D0YLK_1')
    def test_create_or_update_analysis_collection_rename(self):
        au = Analysis_collection_utils(dbsession_class=self.session_class,
                                       analysis_name='AnalysisA',
                                       tag_name='TagA',
                                       collection_name='ProjectA',
                                       collection_type='AnalysisA_Files',
                                       collection_table='project')
        base = BaseAdaptor(**{'session_class': self.session_class})
        base.start_session()
        au.create_or_update_analysis_collection(file_path=os.path.join(
            self.temp_work_dir, 'a.cram'),
                                                dbsession=base.session,
                                                autosave_db=True)
        base.close_session()
        base.start_session()
        ca = CollectionAdaptor(**{'session': base.session})
        ca_files = ca.get_collection_files(collection_name='ProjectA',
                                           collection_type='AnalysisA_Files',
                                           output_mode='dataframe')
        self.assertEqual(len(ca_files.index), 1)
        au.create_or_update_analysis_collection(
            file_path=os.path.join(self.temp_work_dir, 'a.cram'),
            dbsession=base.session,
            autosave_db=True,
            force=True)  # overwriting file collection
        base.close_session()
        base.start_session()
        ca = CollectionAdaptor(**{'session': base.session})
        ca_files = ca.get_collection_files(collection_name='ProjectA',
                                           collection_type='AnalysisA_Files',
                                           output_mode='dataframe')
        self.assertEqual(len(ca_files.index), 1)

        with self.assertRaises(sqlalchemy.exc.IntegrityError):  # re-loading the same file collection without force should fail
            au.create_or_update_analysis_collection(\
              file_path=os.path.join(self.temp_work_dir,
                                     'a.cram'),
              dbsession=base.session,
              autosave_db=True,
              force=False
            )
        base.close_session()
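        # Hedged sketch: as the test above shows, re-loading the same file
        # collection with force=False raises sqlalchemy.exc.IntegrityError. A
        # caller that wants to tolerate duplicates could catch it and roll back;
        # the session handling here is an assumption, not the tested behaviour.
        #
        # try:
        #     au.create_or_update_analysis_collection(
        #         file_path=cram_path,
        #         dbsession=base.session,
        #         force=False)
        # except sqlalchemy.exc.IntegrityError:
        #     base.rollback_session()            # keep the existing file record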
Example #12
    def _fetch_collection_files(self,
                                collection_type,
                                check_missing=False,
                                unique_file=True,
                                file_path_label='file_path'):
        '''
    An internal method for fetching collection group files from the database

    :param collection_type: Collection type information for database lookup
    :param check_missing: A toggle for raising an error when no file is found, default False
    :param unique_file: A toggle for keeping only a single collection file, default True
    :param file_path_label: Name of the file_path column in the File table, default file_path
    :returns: A single file path if unique_file is True, else a list of file paths
    '''
        try:
            ref_file = None
            ca = \
              CollectionAdaptor(**{'session_class':self.dbsession_class})
            ca.start_session()
            collection_files = \
              ca.\
                get_collection_files(
                  collection_name=self.genome_tag,
                  collection_type=collection_type,
                  output_mode='dataframe')                                            # fetch collection files from db
            ca.close_session()
            if len(collection_files.index) > 0:
                files = list(collection_files[file_path_label].values)
                if unique_file:
                    ref_file = files[
                        0]  # select the first file from db results
                else:
                    ref_file = files

            if ref_file is None and check_missing:
                raise ValueError(
                        'No file collection found for reference genome {0}:{1}'.\
                          format(self.genome_tag,collection_type))
            return ref_file
        except:
            raise
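        # Hedged usage sketch for the method above: fetching reference files for
        # the configured genome tag. 'GENOME_FASTA' is an assumed collection type
        # (it only appears as a documented default elsewhere on this page).
        #
        # genome_fasta = self._fetch_collection_files(
        #     collection_type='GENOME_FASTA',
        #     check_missing=True)                # raise ValueError if nothing is found
        # all_fasta_files = self._fetch_collection_files(
        #     collection_type='GENOME_FASTA',
        #     unique_file=False)                 # return the full list instead of the first hit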
Example #13
  def run(self):
    '''
    A method for running the cellranger count metrics extraction
    
    :param project_igf_id: A project igf id
    :param experiment_igf_id: An experiment igf id
    :param sample_igf_id: A sample igf id
    :param igf_session_class: A database session class
    :param analysis_output_list: A list of Cellranger analysis tar output paths
    :param collection_type: Cellranger results collection type
    :param metrics_filename: Name of the metrics file, default metrics_summary.csv
    :returns: None
    '''
    try:
      project_igf_id = self.param_required('project_igf_id')
      experiment_igf_id = self.param_required('experiment_igf_id')
      sample_igf_id = self.param_required('sample_igf_id')
      igf_session_class = self.param_required('igf_session_class')
      analysis_output_list = self.param_required('analysis_output_list')
      collection_type = self.param('collection_type')
      metrics_filename = self.param('metrics_filename')
      attribute_prefix = self.param('attribute_prefix')
      for infile in analysis_output_list:
        check_file_path(infile)                                                 # check input file path

      cellranger_tar = analysis_output_list[0]
      cellranger_metrics = extract_cellranger_count_metrics_summary(\
                            cellranger_tar=cellranger_tar,
                            target_filename=metrics_filename,
                            collection_name=experiment_igf_id,
                            collection_type=collection_type,
                            attribute_prefix=attribute_prefix
                            )                                                   # extract cellranger metrics stats as dictionary
      ca = CollectionAdaptor(**{'session_class':igf_session_class})
      ca.start_session()
      try:
        ca.create_or_update_collection_attributes(\
           data=cellranger_metrics,
           autosave=False)                                                      # load cellranger metrics to collection attribute table
        ca.commit_session()
        ca.close_session()
      except:
          ca.rollback_session()
          ca.close_session()
          raise

      self.param('dataflow_params',{'cellranger_attribute':'done'})
    except Exception as e:
      message='project: {2}, sample:{3}, Error in {0}: {1}'.\
              format(self.__class__.__name__,
                     e,
                     project_igf_id,
                     sample_igf_id)
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')                       # post msg to slack for failed jobs
      raise
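      # Hedged sketch: the attribute-loading step above follows a load / commit /
      # rollback pattern that can be reused for any collection attribute data.
      # 'metrics_rows' is a placeholder for the dictionary list returned by
      # extract_cellranger_count_metrics_summary; its exact shape is not shown here.
      #
      # ca = CollectionAdaptor(**{'session_class': igf_session_class})
      # ca.start_session()
      # try:
      #     ca.create_or_update_collection_attributes(data=metrics_rows, autosave=False)
      #     ca.commit_session()                  # persist attributes only if the load succeeds
      # except:
      #     ca.rollback_session()                # discard partially loaded attribute rows
      #     raise
      # finally:
      #     ca.close_session()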
 def setUp(self):
     self.dbconfig = 'data/dbconfig.json'
     dbparam = read_dbconf_json(self.dbconfig)
     base = BaseAdaptor(**dbparam)
     self.engine = base.engine
     self.dbname = dbparam['dbname']
     Base.metadata.create_all(self.engine)
     self.session_class = base.get_session_class()
     base.start_session()
     platform_data = [{
         "platform_igf_id": "M03291",
         "model_name": "MISEQ",
         "vendor_name": "ILLUMINA",
         "software_name": "RTA",
         "software_version": "RTA1.18.54"
     }, {
         "platform_igf_id": "NB501820",
         "model_name": "NEXTSEQ",
         "vendor_name": "ILLUMINA",
         "software_name": "RTA",
         "software_version": "RTA2"
     }, {
         "platform_igf_id": "K00345",
         "model_name": "HISEQ4000",
         "vendor_name": "ILLUMINA",
         "software_name": "RTA",
         "software_version": "RTA2"
     }]
     flowcell_rule_data = [{
         "platform_igf_id": "K00345",
         "flowcell_type": "HiSeq 3000/4000 SR",
         "index_1": "NO_CHANGE",
         "index_2": "NO_CHANGE"
     }, {
         "platform_igf_id": "K00345",
         "flowcell_type": "HiSeq 3000/4000 PE",
         "index_1": "NO_CHANGE",
         "index_2": "REVCOMP"
     }, {
         "platform_igf_id": "NB501820",
         "flowcell_type": "NEXTSEQ",
         "index_1": "NO_CHANGE",
         "index_2": "REVCOMP"
     }, {
         "platform_igf_id": "M03291",
         "flowcell_type": "MISEQ",
         "index_1": "NO_CHANGE",
         "index_2": "NO_CHANGE"
     }]
     pl = PlatformAdaptor(**{'session': base.session})
     pl.store_platform_data(data=platform_data)
     pl.store_flowcell_barcode_rule(data=flowcell_rule_data)
     seqrun_data = [{
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'flowcell_id': '000000000-BRN47',
         'platform_igf_id': 'M03291',
         'flowcell': 'MISEQ',
     }, {
         'seqrun_igf_id': '180416_NB03291_013_000000001-BRN47',
         'flowcell_id': '000000001-BRN47',
         'platform_igf_id': 'NB501820',
         'flowcell': 'NEXTSEQ',
     }]
     sra = SeqrunAdaptor(**{'session': base.session})
     sra.store_seqrun_and_attribute_data(data=seqrun_data)
     project_data = [{'project_igf_id': 'projectA'}]
     pa = ProjectAdaptor(**{'session': base.session})
     pa.store_project_and_attribute_data(data=project_data)
     sample_data = [
         {
             'sample_igf_id': 'sampleA',
             'project_igf_id': 'projectA',
             'species_name': 'HG38'
         },
         {
             'sample_igf_id': 'sampleB',
             'project_igf_id': 'projectA',
             'species_name': 'UNKNOWN'
         },
     ]
     sa = SampleAdaptor(**{'session': base.session})
     sa.store_sample_and_attribute_data(data=sample_data)
     experiment_data = [
         {
             'project_igf_id': 'projectA',
             'sample_igf_id': 'sampleA',
             'experiment_igf_id': 'sampleA_MISEQ',
             'library_name': 'sampleA',
             'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
             'library_strategy': 'RNA-SEQ',
             'experiment_type': 'TENX-TRANSCRIPTOME-3P',
             'library_layout': 'PAIRED',
             'platform_name': 'MISEQ',
         },
         {
             'project_igf_id': 'projectA',
             'sample_igf_id': 'sampleA',
             'experiment_igf_id': 'sampleA_NEXTSEQ',
             'library_name': 'sampleA',
             'library_source': 'UNKNOWN',
             'library_strategy': 'RNA-SEQ',
             'experiment_type': 'TENX-TRANSCRIPTOME-3P',
             'library_layout': 'PAIRED',
             'platform_name': 'NEXTSEQ',
         },
         {
             'project_igf_id': 'projectA',
             'sample_igf_id': 'sampleB',
             'experiment_igf_id': 'sampleB_MISEQ',
             'library_name': 'sampleB',
             'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
             'library_strategy': 'RNA-SEQ',
             'experiment_type': 'TENX-TRANSCRIPTOME-3P',
             'library_layout': 'PAIRED',
             'platform_name': 'MISEQ',
         },
     ]
     ea = ExperimentAdaptor(**{'session': base.session})
     ea.store_project_and_attribute_data(data=experiment_data)
     run_data = [{
         'experiment_igf_id': 'sampleA_MISEQ',
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'run_igf_id': 'sampleA_MISEQ_000000000-BRN47_1',
         'lane_number': '1'
     }, {
         'experiment_igf_id': 'sampleA_NEXTSEQ',
         'seqrun_igf_id': '180416_NB03291_013_000000001-BRN47',
         'run_igf_id': 'sampleA_NEXTSEQ_000000001-BRN47_2',
         'lane_number': '2'
     }, {
         'experiment_igf_id': 'sampleB_MISEQ',
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'run_igf_id': 'sampleB_MISEQ_HVWN7BBXX_1',
         'lane_number': '1'
     }]
     ra = RunAdaptor(**{'session': base.session})
     ra.store_run_and_attribute_data(data=run_data)
     file_data = [
         {
             'file_path':
             '/path/sampleA_MISEQ_000000000-BRN47_1_R1.fastq.gz',
             'location': 'HPC_PROJECT',
             'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
             'size': '1528121404',
         },
         {
             'file_path':
             '/path/sampleA_NEXTSEQ_000000001-BRN47_2_R1.fastq.gz',
             'location': 'HPC_PROJECT',
             'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
             'size': '1528121404',
         },
         {
             'file_path': '/path/sampleB_MISEQ_HVWN7BBXX_1_R1.fastq.gz',
             'location': 'HPC_PROJECT',
             'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
             'size': '1528121404',
         },
     ]
     fa = FileAdaptor(**{'session': base.session})
     fa.store_file_and_attribute_data(data=file_data)
     collection_data = [{
         'name': 'sampleA_MISEQ_000000000-BRN47_1',
         'type': 'demultiplexed_fastq',
         'table': 'run'
     }, {
         'name': 'sampleA_NEXTSEQ_000000001-BRN47_2',
         'type': 'demultiplexed_fastq',
         'table': 'run'
     }, {
         'name': 'sampleB_MISEQ_HVWN7BBXX_1',
         'type': 'demultiplexed_fastq',
         'table': 'run'
     }]
     collection_files_data = [{
         'name': 'sampleA_MISEQ_000000000-BRN47_1',
         'type': 'demultiplexed_fastq',
         'file_path': '/path/sampleA_MISEQ_000000000-BRN47_1_R1.fastq.gz'
     }, {
         'name': 'sampleA_NEXTSEQ_000000001-BRN47_2',
         'type': 'demultiplexed_fastq',
         'file_path': '/path/sampleA_NEXTSEQ_000000001-BRN47_2_R1.fastq.gz'
     }, {
         'name': 'sampleB_MISEQ_HVWN7BBXX_1',
         'type': 'demultiplexed_fastq',
         'file_path': '/path/sampleB_MISEQ_HVWN7BBXX_1_R1.fastq.gz'
     }]
     ca = CollectionAdaptor(**{'session': base.session})
     ca.store_collection_and_attribute_data(data=collection_data)
     ca.create_collection_group(data=collection_files_data)
     base.close_session()
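      # Hedged sketch: reading back one of the collection groups created above,
      # using the same adaptor calls exercised elsewhere on this page. The run
      # name and collection type come from collection_data / collection_files_data.
      #
      # ca = CollectionAdaptor(**{'session_class': self.session_class})
      # ca.start_session()
      # fastqs = ca.get_collection_files(
      #     collection_name='sampleA_MISEQ_000000000-BRN47_1',
      #     collection_type='demultiplexed_fastq',
      #     output_mode='dataframe')
      # ca.close_session()
      # # fastqs['file_path'] should list /path/sampleA_MISEQ_000000000-BRN47_1_R1.fastq.gz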
    def _process_samples_data(self):
        '''
    An internal method for processing samples data
    '''
        try:
            fastq_dir = self.param_required('fastq_dir')
            qc_files = self.param_required('qc_files')
            samplesheet_filename = self.param('samplesheet_filename')
            igf_session_class = self.param_required('igf_session_class')
            remote_project_path = self.param_required('remote_project_path')
            project_name = self.param_required('project_name')
            seqrun_date = self.param_required('seqrun_date')
            flowcell_id = self.param_required('flowcell_id')
            lane_index_info = self.param_required('lane_index_info')
            singlecell_tag = self.param('singlecell_tag')

            remote_path = \
              os.path.join(\
                remote_project_path,
                project_name,
                seqrun_date,
                flowcell_id,
                lane_index_info)                                                      # get remote base path

            base = BaseAdaptor(**{'session_class': igf_session_class})
            base.start_session()  # connect to db
            ca = CollectionAdaptor(**{'session': base.session})
            ra = RunAdaptor(**{'session': base.session})
            fastqc_data = list()
            for fastqc_file in qc_files[
                    'fastqc']:  # get fastqc files for fastq_dir
                fastqc_zip = fastqc_file['fastqc_zip']
                fastq_file = fastqc_file['fastq_file']
                qc_fastq_dir = fastqc_file['fastq_dir']

                if qc_fastq_dir == fastq_dir:  # check for fastq dir
                    remote_fastqc_path = fastqc_file['remote_fastqc_path']
                    remote_fastqc_path = \
                      os.path.relpath(\
                        remote_fastqc_path,
                        start=remote_path)                                                # get relative path
                    (total_reads, _) = \
                      get_fastq_info_from_fastq_zip(fastqc_zip)
                    (collection_name,_) = \
                      ca.fetch_collection_name_and_table_from_file_path(\
                        file_path=fastq_file)                                             # fetch collection name and table info
                    sample = ra.fetch_sample_info_for_run(
                        run_igf_id=collection_name)
                    sample_name = sample['sample_igf_id']
                    fastqc_data.\
                      append(\
                        {'Sample_ID':sample_name,
                         'Fastqc':remote_fastqc_path,
                         'FastqFile':fastq_file,
                         'TotalReads':total_reads})

            base.close_session()  # close db connection
            fastqs_data = list()
            for fastqs_file in qc_files[
                    'fastqscreen']:  # get fastqs files for fastq_dir
                fastq_file = fastqs_file['fastq_file']
                remote_fastqs_path = fastqs_file['remote_fastqscreen_path']
                qs_fastq_dir = fastqs_file['fastq_dir']

                if qs_fastq_dir == fastq_dir:  # check for accu data
                    remote_fastqs_path = \
                      os.path.relpath(\
                        remote_fastqs_path,
                        start=remote_path)                                                # get relative path
                    fastqs_data.\
                      append(\
                        {'Fastqscreen':remote_fastqs_path,
                         'FastqFile':fastq_file})

            if len(fastqc_data) == 0 or len(fastqs_data) == 0:
                raise ValueError('Value not found for fastqc: {0} or fastqscreen:{1}'.\
                                 format(len(fastqc_data), len(fastqs_data)))

            fastqc_data = pd.DataFrame(fastqc_data)
            fastqs_data = pd.DataFrame(fastqs_data).set_index(
                'FastqFile')  # convert to dataframe
            merged_qc_info = \
              fastqc_data.\
                join(\
                  fastqs_data,
                  how='inner',
                  on='FastqFile',
                  lsuffix='',
                  rsuffix='_s'
                )                                                                     # merge fastqc and fastqscreen info
            if len(merged_qc_info) == 0:
                raise ValueError('No QC data found for merging, fastqc:{0}, fastqscreen: {1}'.\
                                 format(len(fastqc_data), len(fastqs_data)))

            samplesheet_file = \
              os.path.join(\
                fastq_dir,
                samplesheet_filename)
            if not os.path.exists(samplesheet_file):
                raise IOError('samplesheet file {0} not found'.\
                              format(samplesheet_file))

            final_samplesheet_data = list()
            samplesheet_sc = SampleSheet(
                infile=samplesheet_file
            )  # read samplesheet for single cell check
            samplesheet_sc.\
              filter_sample_data(\
                condition_key='Description',
                condition_value=singlecell_tag,
                method='include')                                                     # keep only single cell samples
            if len(samplesheet_sc._data) > 0:
                sc_data = \
                  pd.DataFrame(samplesheet_sc._data).\
                  drop(['Sample_ID','Sample_Name','index'],axis=1).\
                  drop_duplicates().\
                  rename(columns={'Original_Sample_ID':'Sample_ID',
                                  'Original_Sample_Name':'Sample_Name',
                                  'Original_index':'index'}).\
                  to_dict(orient='records')                                             # restructure single cell data; sc data doesn't have index2
                final_samplesheet_data.extend(
                    sc_data)  # add single cell samples to final data

            sa = SampleSheet(infile=samplesheet_file)
            sa.filter_sample_data(\
              condition_key='Description',
              condition_value=singlecell_tag,
              method='exclude')                                                       # remove only single cell samples
            if len(sa._data) > 0:
                final_samplesheet_data.extend(
                    sa._data)  # add non single cell samples info to final data

            sample_data = \
              pd.DataFrame(final_samplesheet_data).\
              set_index('Sample_ID')                                                  # get sample info from final data
            merged_data = \
              merged_qc_info.\
                join(\
                  sample_data,
                  how='inner',
                  on='Sample_ID',
                  lsuffix='',
                  rsuffix='_sa')                                                      # merge sample data with qc data
            required_headers = \
              ['Sample_ID',
               'Sample_Name',
               'FastqFile',
               'TotalReads',
               'index']
            if 'index2' in list(sample_data.columns):
                required_headers.append('index2')

            required_headers.\
              extend(\
                ['Fastqc',
                 'Fastqscreen'])                                                      # create header order
            merged_data['FastqFile'] = \
              merged_data['FastqFile'].\
              map(lambda path: os.path.basename(path))                                # keep only fastq filename
            qc_merged_data = \
              merged_data.loc[:,required_headers].\
              to_dict(orient='records')                                               #  extract final data
            return required_headers, qc_merged_data
        except:
            raise
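        # Hedged, self-contained sketch of the merge logic used above: fastqc rows
        # are joined to fastqscreen rows on the FastqFile column, then sample data
        # is joined on Sample_ID. The toy values are illustrative only.
        #
        # import pandas as pd
        # fastqc_data = pd.DataFrame([
        #     {'Sample_ID': 'S1', 'Fastqc': 'qc/S1_fastqc.html',
        #      'FastqFile': '/path/S1_R1.fastq.gz', 'TotalReads': 1000}])
        # fastqs_data = pd.DataFrame([
        #     {'Fastqscreen': 'qs/S1_screen.html',
        #      'FastqFile': '/path/S1_R1.fastq.gz'}]).set_index('FastqFile')
        # merged_qc_info = fastqc_data.join(
        #     fastqs_data, how='inner', on='FastqFile', lsuffix='', rsuffix='_s')
        # # one row per fastq file, carrying both the Fastqc and Fastqscreen paths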
Example #16
# NOTE: the imports and the argparse setup for this script are missing from the
# snippet; the block below is a hedged reconstruction, and the flag names and
# help strings are assumptions. The igf_data imports for read_dbconf_json,
# read_json_data and CollectionAdaptor are also required but omitted here.
import os
import argparse

parser = argparse.ArgumentParser(
    description='Load a list of files as a collection group into the database')
parser.add_argument('-d', '--dbconfig_path', required=True,
                    help='Database configuration json file')
parser.add_argument('-f', '--collection_file_data', required=True,
                    help='Collection data json file')
parser.add_argument('-s', '--calculate_checksum', action='store_true',
                    help='Calculate file size and md5 for the collection files')
args = parser.parse_args()

dbconfig_path = args.dbconfig_path
collection_file_data = args.collection_file_data
calculate_checksum = args.calculate_checksum

if __name__ == '__main__':
    try:
        dbconnected = False
        if not os.path.exists(dbconfig_path):
            raise IOError('Dbconfig file {0} not found'.format(dbconfig_path))

        if not os.path.exists(collection_file_data):
            raise IOError('Collection data json file {0} not found'.format(
                collection_file_data))

        dbparam = read_dbconf_json(dbconfig_path)  # read db config
        collection_data = read_json_data(
            collection_file_data)  # read collection data json
        ca = CollectionAdaptor(**dbparam)
        ca.start_session()  # connect to database
        dbconnected = True
        ca.load_file_and_create_collection(
            data=collection_data,
            calculate_file_size_and_md5=calculate_checksum,
            autosave=True)  # load data and commit changes
        ca.close_session()
        dbconnected = False
    except Exception as e:
        if dbconnected:
            ca.rollback_session()
            ca.close_session()
        raise ValueError('Error: {0}'.format(e))
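# Hedged usage sketch for the loader script above. The script name and flag names
# are assumptions (the original argparse setup is not shown), and the json layout
# mirrors the name/type/table/file_path dictionaries passed to
# load_file_and_create_collection elsewhere on this page.
#
# $ python load_file_collection.py -d data/dbconfig.json -f collection_data.json -s
#
# collection_data.json:
# [
#   {"name": "ProjectA", "type": "AnalysisA_Files",
#    "table": "project", "file_path": "/path/a.cram"}
# ]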
    def _build_and_store_exp_run_and_collection_in_db(self,fastq_files_list, \
                                                      restricted_list=('10X',)):
        '''
    An internal method for building db collections for the raw fastq files
    '''
        session_class = self.session_class
        db_connected = False
        try:
            restricted_list = list(restricted_list)
            dataframe = pd.DataFrame(fastq_files_list)
            # calculate additional detail
            dataframe=dataframe.apply(lambda data: \
                                      self._calculate_experiment_run_and_file_info(data,
                                                                     restricted_list),\
                                      axis=1)
            # get file data
            file_group_columns = [
                'name', 'type', 'location', 'R1', 'R1_md5', 'R1_size', 'R2',
                'R2_md5', 'R2_size'
            ]
            file_group_data = dataframe.loc[:, file_group_columns]
            file_group_data = file_group_data.drop_duplicates()
            (file_data, file_group_data) = self._reformat_file_group_data(
                data=file_group_data)
            # get base session
            base = BaseAdaptor(**{'session_class': session_class})
            base.start_session()
            db_connected = True
            # get experiment data
            experiment_columns=base.get_table_columns(table_name=Experiment, \
                                                      excluded_columns=['experiment_id',
                                                                        'project_id',
                                                                        'sample_id' ])
            experiment_columns.extend(['project_igf_id', 'sample_igf_id'])
            exp_data = dataframe.loc[:, experiment_columns]
            exp_data = exp_data.drop_duplicates()
            if exp_data.index.size > 0:
                exp_data=exp_data.apply(lambda x: \
                                        self._check_existing_data(\
                                              data=x,\
                                              dbsession=base.session,\
                                              table_name='experiment',\
                                              check_column='EXISTS'),\
                                        axis=1)
                exp_data = exp_data[exp_data['EXISTS'] ==
                                    False]  # filter existing experiments
                exp_data.drop('EXISTS', axis=1,
                              inplace=True)  # remove extra columns
                exp_data = exp_data[pd.isnull(exp_data['experiment_igf_id']) ==
                                    False]  # filter exp with null values
            # get run data
            run_columns=base.get_table_columns(table_name=Run, \
                                               excluded_columns=['run_id',
                                                                 'seqrun_id',
                                                                 'experiment_id',
                                                                 'date_created',
                                                                 'status'
                                                                ])
            run_columns.extend([
                'seqrun_igf_id', 'experiment_igf_id', 'R1_READ_COUNT',
                'R2_READ_COUNT'
            ])
            run_data = dataframe.loc[:, run_columns]
            run_data = run_data.drop_duplicates()
            if run_data.index.size > 0:
                run_data=run_data.apply(lambda x: \
                                        self._check_existing_data(\
                                              data=x,\
                                              dbsession=base.session,\
                                              table_name='run',\
                                              check_column='EXISTS'),\
                                        axis=1)
                run_data = run_data[run_data['EXISTS'] ==
                                    False]  # filter existing runs
                run_data.drop('EXISTS', axis=1,
                              inplace=True)  # remove extra columns
                run_data = run_data[pd.isnull(run_data['run_igf_id']) ==
                                    False]  # filter run with null values
            # get collection data
            collection_columns = ['name', 'type', 'table']
            collection_data = dataframe.loc[:, collection_columns]
            collection_data = collection_data.drop_duplicates()
            if collection_data.index.size > 0:
                collection_data=collection_data.apply(lambda x: \
                                        self._check_existing_data( \
                                              data=x, \
                                              dbsession=base.session, \
                                              table_name='collection', \
                                              check_column='EXISTS'), \
                                        axis=1)
                collection_data = collection_data[collection_data[
                    'EXISTS'] == False]  # filter existing collection
                collection_data.drop('EXISTS', axis=1,
                                     inplace=True)  # remove extra columns
                collection_data = collection_data[pd.isnull(
                    collection_data['name']
                ) == False]  # filter collection with null values
            # store experiment to db
            if exp_data.index.size > 0:
                ea = ExperimentAdaptor(**{'session': base.session})
                ea.store_project_and_attribute_data(data=exp_data,
                                                    autosave=False)
                base.session.flush()
            # store run to db
            if run_data.index.size > 0:
                ra = RunAdaptor(**{'session': base.session})
                ra.store_run_and_attribute_data(data=run_data, autosave=False)
                base.session.flush()
            # store file to db

            fa = FileAdaptor(**{'session': base.session})
            fa.store_file_and_attribute_data(data=file_data, autosave=False)
            base.session.flush()
            # store collection to db
            ca = CollectionAdaptor(**{'session': base.session})
            if collection_data.index.size > 0:
                ca.store_collection_and_attribute_data(data=collection_data,\
                                                       autosave=False)
                base.session.flush()
            ca.create_collection_group(data=file_group_data, autosave=False)
            base.commit_session()
            self._write_manifest_file(file_data)
        except:
            if db_connected:
                base.rollback_session()
            raise
        finally:
            if db_connected:
                base.close_session()
    @staticmethod
    def _check_existing_data(data,
                             dbsession,
                             table_name,
                             check_column='EXISTS'):
        '''
    An internal static method for flagging existing experiment, run or collection records with an EXISTS column
    '''
        try:
            if not isinstance(data, pd.Series):
                raise ValueError('Expecting a data series and got {0}'.format(
                    type(data)))

            if table_name == 'experiment':
                if 'experiment_igf_id' in data and \
                   not pd.isnull(data['experiment_igf_id']):
                    experiment_igf_id = data['experiment_igf_id']
                    ea = ExperimentAdaptor(**{'session': dbsession})
                    experiment_exists = ea.check_experiment_records_id(
                        experiment_igf_id)
                    if experiment_exists:  # store data only if experiment is not existing
                        data[check_column] = True
                    else:
                        data[check_column] = False
                    return data
                else:
                    raise ValueError(
                        'Missing or empty required column experiment_igf_id')
            elif table_name == 'run':
                if 'run_igf_id' in data and \
                   not pd.isnull(data['run_igf_id']):
                    run_igf_id = data['run_igf_id']
                    ra = RunAdaptor(**{'session': dbsession})
                    run_exists = ra.check_run_records_igf_id(run_igf_id)
                    if run_exists:  # store data only if run is not existing
                        data[check_column] = True
                    else:
                        data[check_column] = False
                    return data
                else:
                    raise ValueError(
                        'Missing or empty required column run_igf_id')

            elif table_name == 'collection':
                if 'name' in data and 'type' in data and \
                   not pd.isnull(data['name']) and \
                   not pd.isnull(data['type']):
                    ca = CollectionAdaptor(**{'session': dbsession})
                    collection_exists=ca.check_collection_records_name_and_type(\
                                           collection_name=data['name'], \
                                           collection_type=data['type'])
                    if collection_exists:
                        data[check_column] = True
                    else:
                        data[check_column] = False
                    return data
                else:
                    raise ValueError(
                        'Missing or empty required column name or type')

            else:
                raise ValueError(
                    'table {0} not supported yet'.format(table_name))
        except:
            raise
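        # Hedged, self-contained sketch of how the static check above is used:
        # each row gets an EXISTS flag via DataFrame.apply and existing records
        # are filtered out. The toy dataframe and the 'already_known' set stand in
        # for the real adaptor lookups.
        #
        # import pandas as pd
        # runs = pd.DataFrame([{'run_igf_id': 'run1'}, {'run_igf_id': 'run2'}])
        # already_known = {'run1'}
        # def flag_existing(row, check_column='EXISTS'):
        #     row[check_column] = row['run_igf_id'] in already_known
        #     return row
        # runs = runs.apply(flag_existing, axis=1)
        # new_runs = runs[runs['EXISTS'] == False].drop('EXISTS', axis=1)
        # # only run2 remains and would be stored via RunAdaptor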
Example #19
 def run(self):
     '''
 A method for resetting md5 values in the samplesheet json files for all seqrun ids
 '''
     try:
         db_connected = False
         seqrun_list = self._read_seqrun_list(
             self.seqrun_igf_list
         )  # fetch list of seqrun ids from input file
         if len(seqrun_list) > 0:
             base = self.base_adaptor
             base.start_session()  # connect to database
             db_connected = True
             ca = CollectionAdaptor(**{'session': base.session
                                       })  # connect to collection table
             fa = FileAdaptor(**{'session':
                                 base.session})  # connect to file table
             for seqrun_id in seqrun_list:
                 try:
                     files_data = ca.get_collection_files(
                         collection_name=seqrun_id,
                         collection_type=self.json_collection_type,
                         output_mode='one_or_none'
                     )  # check for existing md5 json file in db
                     # TO DO: skip seqrun_id if pipeline is still running
                     if files_data is not None:
                         json_file_path = [
                             element.file_path for element in files_data
                             if isinstance(element, File)
                         ][0]  # get md5 json file path from sqlalchemy collection results
                         samplesheet_md5 = self._get_samplesheet_md5(
                             seqrun_id
                         )  # get md5 value for new samplesheet file
                         new_json_path = self._get_updated_json_file(
                             json_file_path, samplesheet_md5,
                             self.samplesheet_name
                         )  # get updated md5 json file if samplesheet has been changed
                         if new_json_path is not None:
                             new_json_file_md5 = calculate_file_checksum(
                                 filepath=new_json_path, hasher='md5')
                             fa.update_file_table_for_file_path(
                                 file_path=json_file_path,
                                 tag='md5',
                                 value=new_json_file_md5,
                                 autosave=False
                             )  # update json file md5 in db, don't commit yet
                             move_file(source_path=new_json_path,
                                       destinationa_path=json_file_path,
                                       force=True)  # overwrite json file
                             base.commit_session()  # save changes in db
                             message='Setting new Samplesheet info for run {0}'.\
                                     format(seqrun_id)
                             if self.log_slack:
                                 self.igf_slack.post_message_to_channel(
                                     message,
                                     reaction='pass')  # send log to slack
                             if self.log_asana:
                                 self.igf_asana.comment_asana_task(
                                     task_name=seqrun_id,
                                     comment=message)  # send log to asana
                         else:
                             message = 'no change in samplesheet for seqrun {0}'.format(
                                 seqrun_id)
                             warnings.warn(message)
                             if self.log_slack:
                                 self.igf_slack.post_message_to_channel(
                                     message, reaction='pass')
                     else:
                         message='No md5 json file found for seqrun_igf_id: {0}'.\
                                 format(seqrun_id)
                         warnings.warn(
                             message
                         )  # not raising any exception if seqrun id is not found
                         if self.log_slack:
                             self.igf_slack.post_message_to_channel(
                                 message, reaction='fail')
                 except Exception as e:
                     base.rollback_session()
                     message='Failed to update  json file for seqrun id {0}, error : {1}'.\
                             format(seqrun_id,e)
                     warnings.warn(message)
                     if self.log_slack:
                         self.igf_slack.post_message_to_channel(
                             message, reaction='fail')
             base.close_session()  # close db connection
             if self.clean_up:
                 self._clear_seqrun_list(
                     self.seqrun_igf_list)  # clear input file
         else:
             if self.log_slack:
                 message = 'No new seqrun id found for changing samplesheet md5'
                 warnings.warn(message)
                 self.igf_slack.post_message_to_channel(
                     message, reaction='sleep')
     except:
         if db_connected:
             base.rollback_session()
             base.close_session()
         raise
  def run(self):
    try:
      seqrun_igf_id=self.param_required('seqrun_igf_id')
      seqrun_source=self.param_required('seqrun_source')
      seqrun_server=self.param_required('seqrun_server')
      seqrun_user=self.param_required('seqrun_user')
      igf_session_class=self.param_required('igf_session_class')
      seqrun_md5_type=self.param_required('seqrun_md5_type')
      hpc_location=self.param_required('hpc_location')
      db_file_location_label=self.param_required('db_file_location_label')
      db_file_path_label=self.param_required('db_file_path_label')

      seqrun_path=os.path.join(seqrun_source,seqrun_igf_id)                     # get new seqrun path
      seqrun_server_login='{0}@{1}'.format(seqrun_user, seqrun_server)           # get remote login for ssh
      subprocess.check_call(['ssh', 
                             seqrun_server_login,
                             'ls', 
                             seqrun_path])                                      # check remote file
      ca=CollectionAdaptor(**{'session_class':igf_session_class})               # get the md5 list from db
      ca.start_session()
      files=ca.get_collection_files(collection_name=seqrun_igf_id,
                                    collection_type=seqrun_md5_type)            # fetch file collection
      files=files.to_dict(orient='records')
      ca.close_session()

      if len(files)>1:
        raise ValueError('sequencing run {0} has more than one md5 json file'.\
                         format(seqrun_igf_id))

      if len(files)==0:
        raise ValueError('sequencing run {0} does not have any md5 json file'.\
                         format(seqrun_igf_id))
      
      md5_json_location=files[0][db_file_location_label]
      md5_json_path=files[0][db_file_path_label]
      if md5_json_location !=hpc_location:
        temp_dir=get_temp_dir(work_dir=os.getcwd())                             # create a temp directory
        destination_path=os.path.join(temp_dir,os.path.basename(md5_json_path)) # get destination path for md5 file
        copy_remote_file(source_path=md5_json_path,
                         destinationa_path=destination_path,
                         source_address=seqrun_server_login)                    # copy remote file to local disk
        md5_json_path=destination_path                                          # set md5 json filepath

      with open(md5_json_path) as json_data:
        md5_json=json.load(json_data)                                           # read json data, get all file and md5 from json file
      self.param('sub_tasks',md5_json)                                          # seed dataflow
      if md5_json_location != hpc_location:
        remove_dir(temp_dir)                                                    # remove local copy of the md5 json once it has been read
      
      message='seqrun: {0}, seeded {1} files for copy'.format(seqrun_igf_id, \
                                                              len(md5_json))
      self.warning(message)
      self.post_message_to_slack(message,reaction='pass')
      self.comment_asana_task(task_name=seqrun_igf_id, \
                              comment=message)

    except Exception as e:
      message='Error in {0}: {1}, seqrun: {2}'.format(self.__class__.__name__,\
                                                      e,\
                                                      seqrun_igf_id)
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')
      self.comment_asana_task(task_name=seqrun_igf_id, \
                              comment=message)
      raise
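
# A minimal standalone sketch (not part of the original pipeline) of the remote-path
# existence check performed above with subprocess.check_call(['ssh', ...]); it assumes
# passwordless ssh and a POSIX remote shell, and inspects the return code instead of
# relying on the raised CalledProcessError.
import subprocess

def remote_path_exists(login, path):
    '''Return True if path is listable on the remote host login (user@host)'''
    result = \
      subprocess.run(
        ['ssh', login, 'ls', path],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL)
    return result.returncode == 0
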
    def run(self):
        '''
    A method for running samtools commands
    
    :param project_igf_id: A project igf id
    :param sample_igf_id: A sample igf id
    :param experiment_igf_id: An experiment igf id
    :param igf_session_class: A database session class
    :param reference_type: Reference genome collection type, default GENOME_FASTA
    :param threads: Number of threads to use for Bam to Cram conversion, default 4
    :param base_work_dir: Base work directory
    :param samtools_command: Samtools command
    :param samFlagInclude: Sam flags to include in filtered bam, default None
    :param samFlagExclude: Sam flags to exclude from the filtered bam, default None
    :param mapq_threshold: Skip alignments with MAPQ smaller than this value, default None
    :param use_encode_filter: For samtools filter, use Encode epigenome filter, i.e. samFlagExclude 1804(PE) / 1796(SE), default False
    :param encodePeExcludeFlag: For samtools filter, Encode exclude flag for PE reads, default 1804
    :param encodeSeExcludeFlag: For samtools filter, Encode exclude flag for SE reads, default 1796
    :param use_ephemeral_space: A toggle for temp dir settings, default 0
    :param copy_input: A toggle for copying input file to temp dir, 1 for True, 0 for False, default 0
    '''
        try:
            temp_output_dir = False
            project_igf_id = self.param_required('project_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            input_files = self.param_required('input_files')
            samtools_exe = self.param_required('samtools_exe')
            reference_type = self.param('reference_type')
            threads = self.param('threads')
            base_work_dir = self.param_required('base_work_dir')
            samtools_command = self.param_required('samtools_command')
            analysis_files = self.param_required('analysis_files')
            output_prefix = self.param_required('output_prefix')
            load_metrics_to_cram = self.param('load_metrics_to_cram')
            cram_collection_type = self.param('cram_collection_type')
            collection_table = self.param('collection_table')
            base_result_dir = self.param('base_result_dir')
            analysis_name = self.param('analysis_name')
            force_overwrite = self.param('force_overwrite')
            samFlagInclude = self.param('samFlagInclude')
            samFlagExclude = self.param('samFlagExclude')
            mapq_threshold = self.param('mapq_threshold')
            library_layout = self.param_required('library_layout')
            use_encode_filter = self.param('use_encode_filter')
            species_name = self.param_required('species_name')
            seed_date_stamp = self.param_required('date_stamp')
            use_ephemeral_space = self.param('use_ephemeral_space')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if output_prefix is not None:
                output_prefix = \
                  '{0}_{1}'.\
                    format(
                      output_prefix,
                      seed_date_stamp)                                               # adding datestamp to the output file prefix

            if use_encode_filter:
                samFlagInclude = None
                if library_layout == 'PAIRED':
                    samFlagExclude = 1804
                else:
                    samFlagExclude = 1796
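            # Note: 1804 = 4 + 8 + 256 + 512 + 1024, i.e. read unmapped, mate unmapped,
            # secondary alignment, QC fail and duplicate; 1796 drops the mate-unmapped
            # bit (8) for single-end data.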

            if not isinstance(input_files, list) or \
               len(input_files) == 0:
                raise ValueError('No input file found')

            if len(input_files) > 1:
                raise ValueError('More than one input file found: {0}'.\
                                 format(input_files))

            output_bam_cram_list = list()
            input_file = input_files[0]
            temp_output_dir = \
              get_temp_dir(
                use_ephemeral_space=use_ephemeral_space)                              # get temp work dir
            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)                         # get a run work dir
            samtools_cmdline = ''
            temp_output = None
            if samtools_command == 'idxstats':
                temp_output,samtools_cmdline = \
                  run_bam_idxstat(
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    force=True)                                                         # run samtools idxstats
            elif samtools_command == 'flagstat':
                temp_output,samtools_cmdline = \
                  run_bam_flagstat(\
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    threads=threads,
                    force=True)                                                         # run samtools flagstat
            elif samtools_command == 'stats':
                temp_output,samtools_cmdline,stats_metrics = \
                  run_bam_stats(\
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    output_dir=temp_output_dir,
                    output_prefix=output_prefix,
                    threads=threads,
                    force=True)                                                         # run samtools stats
                if load_metrics_to_cram and \
                   len(stats_metrics) > 0:
                    ca = CollectionAdaptor(
                        **{'session_class': igf_session_class})
                    attribute_data = \
                    ca.prepare_data_for_collection_attribute(\
                      collection_name=experiment_igf_id,
                      collection_type=cram_collection_type,
                      data_list=stats_metrics)
                    ca.start_session()
                    try:
                        ca.create_or_update_collection_attributes(\
                          data=attribute_data,
                          autosave=False)
                        ca.commit_session()
                        ca.close_session()
                    except Exception as e:
                        ca.rollback_session()
                        ca.close_session()
                        raise ValueError('Failed to load data to db: {0}'.\
                                       format(e))

            elif samtools_command == 'merge':
                if output_prefix is None:
                    raise ValueError(
                        'Missing output filename prefix for merged bam')

                sorted_by_name = self.param('sorted_by_name')
                temp_output = \
                  os.path.join(\
                    work_dir,
                    '{0}_merged.bam'.format(output_prefix))
                samtools_cmdline = \
                  merge_multiple_bam(\
                    samtools_exe=samtools_exe,
                    input_bam_list=input_file,
                    output_bam_path=temp_output,
                    sorted_by_name=sorted_by_name,
                    threads=threads,
                    use_ephemeral_space=use_ephemeral_space,
                    force=True)
            elif samtools_command == 'view_bamToCram':
                if base_result_dir is None:
                    raise ValueError(
                        'base_result_dir is required for CRAM file loading')

                if analysis_name is None:
                    raise ValueError(
                        'analysis_name is required for CRAM file loading')

                ref_genome = \
                  Reference_genome_utils(\
                    genome_tag=species_name,
                    dbsession_class=igf_session_class,
                    genome_fasta_type=reference_type)
                genome_fasta = ref_genome.get_genome_fasta()                    # get genome fasta
                cram_file = \
                  os.path.basename(input_file).\
                    replace('.bam','.cram')                                             # get base cram file name
                cram_file = os.path.join(temp_output_dir, cram_file)            # get cram file path in temp dir
                samtools_cmdline = \
                  convert_bam_to_cram(\
                    samtools_exe=samtools_exe,
                    bam_file=input_file,
                    reference_file=genome_fasta,
                    cram_path=cram_file,
                    use_ephemeral_space=use_ephemeral_space,
                    threads=threads,
                    force=True,
                    dry_run=False)
                au = \
                  Analysis_collection_utils(\
                    dbsession_class=igf_session_class,
                    analysis_name=analysis_name,
                    tag_name=species_name,
                    collection_name=experiment_igf_id,
                    collection_type=cram_collection_type,
                    collection_table=collection_table,
                    base_path=base_result_dir)
                temp_output_bam_cram_list = \
                  au.load_file_to_disk_and_db(\
                    input_file_list=[cram_file],
                    file_suffix='cram',
                    withdraw_exisitng_collection=force_overwrite)                       # load file to db and disk
                for cram in temp_output_bam_cram_list:
                    index_bam_or_cram(\
                      samtools_exe=samtools_exe,
                      input_path=cram,
                      threads=threads,
                      dry_run=False)
                    index_path = '{0}.crai'.format(cram)
                    output_bam_cram_list.append(cram)
                    output_bam_cram_list.append(index_path)

                if len(output_bam_cram_list) == 0:
                    raise ValueError('No output cram file found')

            elif samtools_command == 'view_filterBam':
                temp_output_bam = \
                  os.path.join(\
                    temp_output_dir,
                    os.path.basename(input_file).replace('.bam','.filtered.bam'))
                samtools_cmdline = \
                  filter_bam_file(
                    samtools_exe=samtools_exe,
                    input_bam=input_file,
                    output_bam=temp_output_bam,
                    samFlagInclude=samFlagInclude,
                    samFlagExclude=samFlagExclude,
                    threads=threads,
                    mapq_threshold=mapq_threshold,
                    index_output=False,
                    dry_run=False)
                dest_path = \
                  os.path.join(\
                    work_dir,
                    os.path.basename(temp_output_bam))
                move_file(\
                  source_path=temp_output_bam,
                  destinationa_path=dest_path,
                  force=True)
                index_bam_or_cram(\
                  samtools_exe=samtools_exe,
                  input_path=dest_path,
                  threads=threads,
                  dry_run=False)
                index_path = '{0}.bai'.format(dest_path)
                output_bam_cram_list.append(dest_path)
                output_bam_cram_list.append(index_path)
            else:
                raise ValueError('Samtools command {0} not supported'.\
                                 format(samtools_command))

            if temp_output is not None:
                dest_path = \
                  os.path.join(\
                    work_dir,
                    os.path.basename(temp_output))
                if dest_path != temp_output:
                    move_file(\
                      source_path=temp_output,
                      destinationa_path=dest_path,
                      force=True)
                analysis_files.append(dest_path)

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'output_bam_cram_list': output_bam_cram_list
                })  # pass on samtools output list
            message = \
              'finished samtools {0} for {1} {2}'.\
                format(
                  samtools_command,
                  project_igf_id,
                  sample_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            message = \
              'finished samtools {0} for {1} {2}: {3}'.\
                format(
                  samtools_command,
                  project_igf_id,
                  sample_igf_id,
                  samtools_cmdline)
            #self.comment_asana_task(task_name=project_igf_id, comment=message)        # send comment to Asana
        except Exception as e:
            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  project_igf_id,
                  sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
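
# A minimal standalone sketch (assuming a samtools binary on PATH; file names are
# illustrative) of the plain commandlines that helpers such as run_bam_idxstat,
# run_bam_flagstat and run_bam_stats wrap for the branches above.
import subprocess

bam = 'sample.bam'
with open('sample.idxstats.txt', 'w') as out:
    subprocess.check_call(['samtools', 'idxstats', bam], stdout=out)
with open('sample.flagstat.txt', 'w') as out:
    subprocess.check_call(['samtools', 'flagstat', bam], stdout=out)
with open('sample.stats.txt', 'w') as out:
    subprocess.check_call(['samtools', 'stats', bam], stdout=out)
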
Example #22
    def run(self):
        '''
    A method for generating a scanpy report from cellranger analysis output
    
    :param project_igf_id: A project igf id
    :param sample_igf_id: A sample igf id
    :param experiment_igf_id: An experiment igf id
    :param igf_session_class: A database session class
    :param species_name: Species name tag used for Ensembl species name lookup
    :param base_result_dir: Base results directory
    :param report_template_file: A template file for writing scanpy report
    :param analysis_name: Analysis name, default scanpy
    :param species_name_lookup: A dictionary for ensembl species name lookup
    :param cellranger_collection_type: Cellranger analysis collection type, default CELLRANGER_RESULTS
    :param scanpy_collection_type: Scanpy report collection type, default SCANPY_RESULTS
    :param collection_table: Collection table name for loading scanpy report, default experiment
    '''
        try:
            project_igf_id = self.param_required('project_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            species_name = self.param_required('species_name')
            report_template_file = self.param_required('report_template_file')
            analysis_name = self.param_required('analysis_name')
            base_result_dir = self.param_required('base_result_dir')
            base_work_dir = self.param_required('base_work_dir')
            species_name_lookup = self.param('species_name_lookup')
            cellranger_collection_type = self.param(
                'cellranger_collection_type')
            scanpy_collection_type = self.param('scanpy_collection_type')
            collection_table = self.param('collection_table')
            cellbrowser_dir_prefix = self.param('cellbrowser_dir_prefix')
            use_ephemeral_space = self.param('use_ephemeral_space')
            cellranger_tarfile = ''
            output_report = ''
            cellbrowser_h5ad = ''
            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = self.get_job_work_dir(
                work_dir=work_dir_prefix)  # get a run work dir
            if species_name in species_name_lookup.keys():                      # check for human or mouse
                ensembl_species_name = species_name_lookup[species_name]        # get ensembl species name
                # fetch cellranger tar path from db
                if cellranger_tarfile == '':
                    ca = CollectionAdaptor(
                        **{'session_class': igf_session_class})
                    ca.start_session()  # connect to database
                    cellranger_tarfiles = \
                      ca.get_collection_files(\
                        collection_name=experiment_igf_id,
                        collection_type=cellranger_collection_type,
                        output_mode='dataframe')                                          # fetch collection files
                    ca.close_session()
                    if len(cellranger_tarfiles.index) == 0:
                        raise ValueError('No cellranger analysis output found for exp {0}'.\
                                         format(experiment_igf_id))

                    cellranger_tarfile = cellranger_tarfiles['file_path'].values[0]  # select first file as analysis file

                # extract filtered metrics files from tar
                output_dir = \
                  get_temp_dir(use_ephemeral_space=use_ephemeral_space)                 # get a temp dir
                datestamp = get_datestamp_label()
                cellbrowser_dir = \
                  os.path.join( \
                    work_dir,
                    '{0}_{1}'.\
                      format( \
                        cellbrowser_dir_prefix,
                        datestamp))
                cellbrowser_h5ad = \
                  os.path.join(\
                    cellbrowser_dir,
                    'scanpy.h5ad')
                output_report = \
                  os.path.join(\
                    output_dir,
                    'report.html')                                                      # get temp report path
                matrix_file,gene_file,barcode_file = \
                  self._extract_cellranger_filtered_metrics(\
                    tar_file=cellranger_tarfile,
                    output_dir=output_dir)                                              # get cellranger output files
                sp = \
                  Scanpy_tool(\
                    project_name=project_igf_id,
                    sample_name=sample_igf_id,
                    matrix_file=matrix_file,
                    features_tsv=gene_file,
                    barcode_tsv=barcode_file,
                    html_template_file=report_template_file,
                    species_name=ensembl_species_name,
                    output_file=output_report,
                    use_ephemeral_space=use_ephemeral_space,
                    cellbrowser_h5ad=cellbrowser_h5ad)
                sp.generate_report()  # generate scanpy report
                # load files to db and disk
                au = \
                  Analysis_collection_utils(\
                    dbsession_class=igf_session_class,
                    analysis_name=analysis_name,
                    tag_name=species_name,
                    collection_name=experiment_igf_id,
                    collection_type=scanpy_collection_type,
                    collection_table=collection_table,
                    base_path=base_result_dir)                                          # initiate loading of report file
                output_file_list = \
                  au.load_file_to_disk_and_db(\
                    input_file_list=[output_report],
                    withdraw_exisitng_collection=True)                                  # load file to db and disk
                output_report = output_file_list[0]

            self.param(
                'dataflow_params', {
                    'output_report': output_report,
                    'scanpy_h5ad_path': cellbrowser_h5ad
                })  # pass on output report filepath
        except Exception as e:
            message = 'project: {2}, sample:{3}, Error in {0}: {1}'.\
                      format(self.__class__.__name__,
                             e,
                             project_igf_id,
                             sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
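
# A minimal standalone sketch (an assumption, not the pipeline's private
# _extract_cellranger_filtered_metrics helper) of pulling the filtered matrix triplet
# out of a cellranger count tarball; the member names follow the standard
# filtered_feature_bc_matrix layout and the paths are illustrative.
import os
import tarfile

def extract_filtered_matrix(tar_file, output_dir):
    '''Return (matrix, features, barcodes) paths extracted from a cellranger tarball'''
    targets = ('matrix.mtx.gz', 'features.tsv.gz', 'barcodes.tsv.gz')
    extracted = {}
    with tarfile.open(tar_file, 'r') as tar:
        for member in tar.getmembers():
            name = os.path.basename(member.name)
            if name in targets and 'filtered_feature_bc_matrix' in member.name:
                tar.extract(member, path=output_dir)                             # unpack only the three matrix files
                extracted[name] = os.path.join(output_dir, member.name)
    return (extracted.get('matrix.mtx.gz'),
            extracted.get('features.tsv.gz'),
            extracted.get('barcodes.tsv.gz'))
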
Example #23
  def run(self):
    try:
      fastq_file = self.param_required('fastq_file')
      fastq_dir = self.param_required('fastq_dir')
      igf_session_class = self.param_required('igf_session_class')
      fastqc_exe = self.param_required('fastqc_exe')
      tag = self.param_required('tag')
      seqrun_igf_id = self.param_required('seqrun_igf_id')
      seqrun_date = self.param_required('seqrun_date')
      flowcell_id = self.param_required('flowcell_id')
      fastqc_options = self.param('fastqc_options')
      base_results_dir = self.param_required('base_results_dir')
      project_name = self.param_required('project_name')
      force_overwrite = self.param('force_overwrite')
      fastqc_dir_label = self.param('fastqc_dir_label')
      required_collection_table = self.param('required_collection_table')
      sample_name = self.param('sample_name')
      hpc_location = self.param('hpc_location')
      fastqc_collection_type = self.param('fastqc_collection_type')
      use_ephemeral_space = self.param('use_ephemeral_space')
      store_file = self.param('store_file')

      lane_index_info = os.path.basename(fastq_dir)                             # get the lane and index length info
      fastq_file_label = os.path.basename(fastq_file).replace('.fastq.gz','')
      collection_name = None
      collection_table = None
      if tag=='known' and store_file:                                           # fetch sample name for known fastq, if its not defined
        base = BaseAdaptor(**{'session_class':igf_session_class})
        base.start_session()                                                    # connect to db

        ca = CollectionAdaptor(**{'session':base.session})
        (collection_name,collection_table) = \
          ca.fetch_collection_name_and_table_from_file_path(\
            file_path=fastq_file)                                               # fetch collection name and table info

        if collection_table != required_collection_table:
          raise ValueError(
            'Expected collection table {0} and got {1}, {2}'.\
              format(
                required_collection_table,
                collection_table,
                fastq_file))

        ra = RunAdaptor(**{'session':base.session})
        sample = ra.fetch_sample_info_for_run(run_igf_id=collection_name)
        sample_name = sample['sample_igf_id']
        base.close_session()

      fastqc_result_dir = \
        os.path.join(\
          base_results_dir,
          project_name,
          seqrun_date,
          flowcell_id,
          lane_index_info,
          tag)                                                                  # result dir path is generic
      if sample_name is not None:
        fastqc_result_dir = \
          os.path.join(\
            fastqc_result_dir,
            sample_name)                                                        # add sample name to dir path if its available

      fastqc_result_dir = \
        os.path.join(\
          fastqc_result_dir,
          fastq_file_label,
          fastqc_dir_label)                                                     # keep multiple files under same dir

      if os.path.exists(fastqc_result_dir) and force_overwrite:
        remove_dir(fastqc_result_dir)                                           # remove existing output dir if force_overwrite is true

      if not os.path.exists(fastqc_result_dir):
        os.makedirs(fastqc_result_dir,mode=0o775)                               # create output dir if its not present

      temp_work_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get a temp work dir
      if not os.path.exists(fastq_file):
        raise IOError('fastq file {0} not readable'.format(fastq_file))         # raise if fastq file path is not readable

      fastqc_output = \
        os.path.join(\
          temp_work_dir,
          fastq_file_label)
      os.mkdir(fastqc_output)                                                   # create fastqc output dir
      fastqc_param = \
        self.format_tool_options(fastqc_options)                                # format fastqc params
      fastqc_cmd = \
        [fastqc_exe, '-o',fastqc_output, '-d',temp_work_dir ]                   # fastqc base parameters
      fastqc_cmd.extend(fastqc_param)                                           # add additional parameters
      fastqc_cmd.append(fastq_file)                                             # fastqc input file
      subprocess.check_call(' '.join(fastqc_cmd),shell=True)                    # run fastqc

      fastqc_zip = None
      fastqc_html = None
      for root, _, files in os.walk(top=fastqc_output):
        for file in files:
          if fnmatch.fnmatch(file, '*.zip'):
            input_fastqc_zip = os.path.join(root,file)
            copy2(input_fastqc_zip,fastqc_result_dir)
            fastqc_zip = os.path.join(fastqc_result_dir,file)

          if fnmatch.fnmatch(file, '*.html'):
            input_fastqc_html = os.path.join(root,file)
            copy2(input_fastqc_html,fastqc_result_dir)
            fastqc_html = os.path.join(fastqc_result_dir,file)

      if fastqc_html is None or fastqc_zip is None:
        raise ValueError('Missing required values, fastqc zip: {0}, fastqc html: {1}'.\
                         format(fastqc_zip,fastqc_html))

      if tag=='known' and store_file:
        if collection_name is None:
          raise ValueError('couldn\'t retrieve collection name for {0}'.\
                           format(fastq_file))

        fastqc_files = \
          [{'name':collection_name,
            'type':fastqc_collection_type,
            'table':required_collection_table,
            'file_path':fastqc_zip,
            'location':hpc_location},
           {'name':collection_name,
            'type':fastqc_collection_type,
            'table':required_collection_table,
            'file_path':fastqc_html,
            'location':hpc_location},
          ]
        ca = CollectionAdaptor(**{'session_class':igf_session_class})
        ca.start_session()
        ca.load_file_and_create_collection(data=fastqc_files)                 # store fastqc files to db
        ca.close_session()

      self.param('dataflow_params',
                 {'fastqc_html':fastqc_html,
                  'lane_index_info':lane_index_info,
                  'sample_name':sample_name,
                  'fastqc':{'fastq_dir':fastq_dir,
                            'fastqc_zip':fastqc_zip,
                            'fastqc_html':fastqc_html}})                        # set dataflow params
    except Exception as e:
      message = \
        'seqrun: {2}, Error in {0}: {1}'.\
        format(\
          self.__class__.__name__,
          e,
          seqrun_igf_id)
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')                       # post msg to slack for failed jobs
      raise
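
# A minimal standalone sketch (assuming a fastqc binary on PATH; paths are illustrative)
# of the FastQC invocation built above, passed as an argument list instead of a
# shell=True string so that paths containing spaces need no extra quoting.
import subprocess

fastqc_cmd = [
    'fastqc',
    '-o', '/tmp/fastqc_out',
    '-d', '/tmp/fastqc_tmp',
    '--noextract',
    'sample_R1_001.fastq.gz']
subprocess.check_call(fastqc_cmd)
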
Example #24
    def run(self):
        '''
    A method for running picard commands
    
    :param project_igf_id: A project igf id
    :param sample_igf_id: A sample igf id
    :param experiment_igf_id: An experiment igf id
    :param igf_session_class: A database session class
    :param reference_type: Reference genome collection type, default GENOME_FASTA
    :param reference_refFlat: Reference genome collection type, default GENE_REFFLAT
    :param ribosomal_interval_type: Collection type for ribosomal interval list, default RIBOSOMAL_INTERVAL
    :param species_name: Species name tag for reference genome lookup
    :param java_exe: Java path
    :param java_param: Java run parameters
    :param picard_jar: Picard jar path
    :param picard_command: Picard command
    :param base_work_dir: Base work directory
    :param copy_input: A toggle for copying input file to temp dir, 1 for True, 0 for False, default 0
    :param use_ephemeral_space: A toggle for temp dir setting, default 0
    :param patterned_flowcell_list: A list of patterned flowcells, default ['HISEQ4000','NEXTSEQ']
    '''
        try:
            temp_output_dir = False
            project_igf_id = self.param_required('project_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            java_exe = self.param_required('java_exe')
            java_param = self.param_required('java_param')
            picard_jar = self.param_required('picard_jar')
            input_files = self.param_required('input_files')
            picard_command = self.param_required('picard_command')
            igf_session_class = self.param_required('igf_session_class')
            species_name = self.param('species_name')
            reference_type = self.param('reference_type')
            reference_refFlat = self.param('reference_refFlat')
            ribosomal_interval_type = self.param('ribosomal_interval_type')
            base_work_dir = self.param_required('base_work_dir')
            analysis_files = self.param_required('analysis_files')
            picard_option = self.param('picard_option')
            patterned_flowcell_list = self.param('patterned_flowcell_list')
            platform_name = self.param_required('platform_name')
            output_prefix = self.param('output_prefix')
            load_metrics_to_cram = self.param('load_metrics_to_cram')
            cram_collection_type = self.param('cram_collection_type')
            seed_date_stamp = self.param_required('date_stamp')
            use_ephemeral_space = self.param('use_ephemeral_space')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if output_prefix is not None:
                output_prefix = \
                  '{0}_{1}'.\
                    format(
                      output_prefix,
                      seed_date_stamp)                                                  # adding seed datestamp to output prefix

            work_dir_prefix = \
              os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = \
              self.get_job_work_dir(work_dir=work_dir_prefix)                         # get a run work dir
            temp_output_dir = \
              get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get temp work dir
            ref_genome = \
              Reference_genome_utils(
                genome_tag=species_name,
                dbsession_class=igf_session_class,
                genome_fasta_type=reference_type,
                gene_reflat_type=reference_refFlat,
                ribosomal_interval_type=ribosomal_interval_type)                      # setup ref genome utils
            genome_fasta = ref_genome.get_genome_fasta()  # get genome fasta
            ref_flat_file = ref_genome.get_gene_reflat()  # get refFlat file
            ribosomal_interval_file = ref_genome.get_ribosomal_interval()        # get ribosomal interval file
            patterned_flowcell = False
            if platform_name in patterned_flowcell_list:  # check for patterned flowcell
                patterned_flowcell = True

            if load_metrics_to_cram and \
               not cram_collection_type:
                raise ValueError(
                    'Cram file collection type is required for loading picard metrics to db'
                )

            picard=\
              Picard_tools(\
                java_exe=java_exe,
                java_param=java_param,
                picard_jar=picard_jar,
                input_files=input_files,
                output_dir=temp_output_dir,
                ref_fasta=genome_fasta,
                patterned_flowcell=patterned_flowcell,
                ref_flat_file=ref_flat_file,
                picard_option=picard_option,
                output_prefix=output_prefix,
                use_ephemeral_space=use_ephemeral_space,
                ribisomal_interval=ribosomal_interval_file)                           # setup picard tool
            temp_output_files,picard_command_line,picard_metrics = \
              picard.run_picard_command(command_name=picard_command)                  # run picard command
            output_file_list = list()
            for source_path in temp_output_files:
                dest_path=\
                  os.path.join(
                    work_dir,
                    os.path.basename(source_path))                                      # get destination filepath
                move_file(source_path=source_path,
                          destinationa_path=dest_path,
                          force=True)  # move files to work dir
                output_file_list.append(dest_path)
            remove_dir(temp_output_dir)
            analysis_files.extend(output_file_list)
            bam_files = list()
            for file in output_file_list:
                if file.endswith('.bam'):
                    bam_files.append(file)

            if load_metrics_to_cram and \
               len(picard_metrics)>0:
                ca = CollectionAdaptor(**{'session_class': igf_session_class})
                attribute_data = \
                  ca.prepare_data_for_collection_attribute(
                    collection_name=experiment_igf_id,
                    collection_type=cram_collection_type,
                    data_list=picard_metrics)                                           # format data for collection attribute table
                ca.start_session()
                try:
                    ca.create_or_update_collection_attributes(\
                      data=attribute_data,
                      autosave=False
                    )                                                                     # load data to collection attribute table
                    ca.commit_session()
                    ca.close_session()
                except:
                    ca.rollback_session()
                    ca.close_session()
                    raise

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'bam_files': bam_files,
                    'seed_date_stamp': seed_date_stamp
                })  # pass on picard output list
            message = \
              'finished picard {0} for {1} {2}'.\
                format(
                  picard_command,
                  project_igf_id,
                  sample_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            message = \
              'Picard {0} command: {1}'.\
                format(
                  picard_command,
                  picard_command_line)
            #self.comment_asana_task(task_name=project_igf_id, comment=message)        # send commandline to Asana
        except Exception as e:
            if temp_output_dir and \
               os.path.exists(temp_output_dir):
                remove_dir(temp_output_dir)

            message = \
              'project: {2}, sample:{3}, Error in {0}: {1}'.\
                format(
                  self.__class__.__name__,
                  e,
                  project_igf_id,
                  sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
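
# A minimal standalone sketch (paths and heap size are illustrative) of the kind of
# commandline the Picard_tools wrapper above is expected to assemble for a single
# metrics command such as CollectAlignmentSummaryMetrics.
import subprocess

picard_cmd = [
    'java', '-Xmx4g', '-jar', 'picard.jar',
    'CollectAlignmentSummaryMetrics',
    'I=sample.bam',
    'O=sample.alignment_summary_metrics.txt',
    'R=genome.fa']
subprocess.check_call(picard_cmd)
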
Example #25
    def run(self):
        '''
    A runnable method for running PPQT analysis
    '''
        try:
            project_igf_id = self.param_required('project_igf_id')
            sample_igf_id = self.param_required('sample_igf_id')
            experiment_igf_id = self.param_required('experiment_igf_id')
            igf_session_class = self.param_required('igf_session_class')
            input_files = self.param_required('input_files')
            rscript_path = self.param_required('rscript_path')
            ppqt_exe = self.param_required('ppqt_exe')
            base_work_dir = self.param_required('base_work_dir')
            base_result_dir = self.param_required('base_result_dir')
            library_strategy = self.param_required('library_strategy')
            analysis_files = self.param_required('analysis_files')
            output_prefix = self.param_required('output_prefix')
            species_name = self.param_required('species_name')
            analysis_name = self.param('analysis_name')
            seed_date_stamp = self.param_required('date_stamp')
            load_metrics_to_cram = self.param('load_metrics_to_cram')
            ppqt_collection_type = self.param('ppqt_collection_type')
            cram_collection_type = self.param('cram_collection_type')
            collection_table = self.param('collection_table')
            force_overwrite = self.param('force_overwrite')
            use_ephemeral_space = self.param('use_ephemeral_space')
            threads = self.param('threads')
            seed_date_stamp = get_datestamp_label(seed_date_stamp)
            if output_prefix is not None:
                output_prefix = '{0}_{1}'.format(
                    output_prefix, seed_date_stamp
                )  # adding datestamp to the output file prefix

            if not isinstance(input_files, list) or \
               len(input_files) == 0:
                raise ValueError('No input file found')

            if len(input_files) > 1:
                raise ValueError('More than one input file found: {0}'.\
                                 format(input_files))

            if analysis_name is None:
                analysis_name = library_strategy  # use library_strategy as default analysis_name

            input_file = input_files[0]
            work_dir_prefix = \
              os.path.join(\
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
            work_dir = self.get_job_work_dir(
                work_dir=work_dir_prefix)  # get a run work dir
            ppqt_obj = \
              Ppqt_tools(\
                rscript_path=rscript_path,
                ppqt_exe=ppqt_exe,
                use_ephemeral_space=use_ephemeral_space,
                threads=threads)
            ppqt_cmd,spp_output, pdf_output,spp_data = \
              ppqt_obj.run_ppqt(\
                input_bam=input_file,
                output_dir=work_dir,
                output_spp_name='{0}_{1}.spp.out'.format(output_prefix,'PPQT'),
                output_pdf_name='{0}_{1}.spp.pdf'.format(output_prefix,'PPQT'))
            analysis_files.append(spp_output)
            au = \
              Analysis_collection_utils(\
                dbsession_class=igf_session_class,
                analysis_name=analysis_name,
                tag_name=species_name,
                collection_name=experiment_igf_id,
                collection_type=ppqt_collection_type,
                collection_table=collection_table,
                base_path=base_result_dir)
            output_ppqt_list = \
              au.load_file_to_disk_and_db(\
                input_file_list=[pdf_output],
                file_suffix='pdf',
                withdraw_exisitng_collection=force_overwrite)                         # load file to db and disk
            if load_metrics_to_cram and \
               len(spp_data) > 0:
                ca = CollectionAdaptor(**{'session_class': igf_session_class})
                attribute_data = \
                  ca.prepare_data_for_collection_attribute(\
                    collection_name=experiment_igf_id,
                    collection_type=cram_collection_type,
                    data_list=spp_data)
                ca.start_session()
                try:
                    ca.create_or_update_collection_attributes(\
                      data=attribute_data,
                      autosave=False)
                    ca.commit_session()
                    ca.close_session()
                except Exception as e:
                    ca.rollback_session()
                    ca.close_session()
                    raise ValueError('Failed to load data to db: {0}'.\
                                     format(e))

            self.param(
                'dataflow_params', {
                    'analysis_files': analysis_files,
                    'output_ppqt_list': output_ppqt_list
                })  # pass on PPQT output list
            message='finished PPQT for {0} {1}'.\
                    format(project_igf_id,
                           sample_igf_id)
            self.post_message_to_slack(message,
                                       reaction='pass')  # send log to slack
            message='finished PPQT for {0} {1}: {2}'.\
                    format(project_igf_id,
                           sample_igf_id,
                           ppqt_cmd)
            self.comment_asana_task(task_name=project_igf_id,
                                    comment=message)  # send comment to Asana
        except Exception as e:
            message='project: {2}, sample:{3}, Error in {0}: {1}'.\
                    format(self.__class__.__name__,
                           e,
                           project_igf_id,
                           sample_igf_id)
            self.warning(message)
            self.post_message_to_slack(
                message, reaction='fail')  # post msg to slack for failed jobs
            raise
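
# A minimal standalone sketch (assuming Rscript and phantompeakqualtools' run_spp.R
# are available; file names are illustrative) of the underlying PPQT call that the
# Ppqt_tools wrapper above is expected to build.
import subprocess

ppqt_cmd = [
    'Rscript', 'run_spp.R',
    '-c=sample.bam',
    '-savp=sample_PPQT.spp.pdf',
    '-out=sample_PPQT.spp.out']
subprocess.check_call(ppqt_cmd)
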
Example #26
 def setUp(self):
     self.dbconfig = 'data/dbconfig.json'
     dbparam = read_dbconf_json(self.dbconfig)
     base = BaseAdaptor(**dbparam)
     self.engine = base.engine
     self.dbname = dbparam['dbname']
     Base.metadata.create_all(self.engine)
     self.session_class = base.get_session_class()
     base.start_session()
     platform_data = [
         {
             "platform_igf_id": "M03291",
             "model_name": "MISEQ",
             "vendor_name": "ILLUMINA",
             "software_name": "RTA",
             "software_version": "RTA1.18.54"
         },
     ]
     flowcell_rule_data = [{
         "platform_igf_id": "M03291",
         "flowcell_type": "MISEQ",
         "index_1": "NO_CHANGE",
         "index_2": "NO_CHANGE"
     }]
     pl = PlatformAdaptor(**{'session': base.session})
     pl.store_platform_data(data=platform_data)
     pl.store_flowcell_barcode_rule(data=flowcell_rule_data)
     project_data = [{'project_igf_id': 'IGFQ000123_avik_10-4-2018_Miseq'}]
     pa = ProjectAdaptor(**{'session': base.session})
     pa.store_project_and_attribute_data(data=project_data)
     sample_data = [{
         'sample_igf_id': 'IGF103923',
         'project_igf_id': 'IGFQ000123_avik_10-4-2018_Miseq',
         'species_name': 'HG38'
     }]
     sa = SampleAdaptor(**{'session': base.session})
     sa.store_sample_and_attribute_data(data=sample_data)
     seqrun_data = [
         {
             'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
             'flowcell_id': '000000000-BRN47',
             'platform_igf_id': 'M03291',
             'flowcell': 'MISEQ'
         },
     ]
     sra = SeqrunAdaptor(**{'session': base.session})
     sra.store_seqrun_and_attribute_data(data=seqrun_data)
     pipeline_data = [
         {
             "pipeline_name": "PrimaryAnalysis",
             "pipeline_db": "sqlite:////bcl2fastq.db"
         },
         {
             "pipeline_name": "DemultiplexIlluminaFastq",
             "pipeline_db": "sqlite:////bcl2fastq.db"
         },
     ]
     pla = PipelineAdaptor(**{'session': base.session})
     pla.store_pipeline_data(data=pipeline_data)
     file_data = [
         {
             'file_path': '/path/S20180405S_S1_L001_R1_001.fastq.gz',
             'location': 'HPC_PROJECT',
             'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
             'size': '1528121404'
         },
         {
             'file_path': '/path/S20180405S_S1_L001_R2_001.fastq.gz',
             'location': 'HPC_PROJECT',
             'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
             'size': '1467047580'
         },
         {
             'file_path': '/path/S20180405S_S3_L001_R2_001.fastq.gz',
             'location': 'HPC_PROJECT',
             'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
             'size': '1467047580'
         },
     ]
     fa = FileAdaptor(**{'session': base.session})
     fa.store_file_and_attribute_data(data=file_data)
     collection_data = [
         {
             'name': 'IGF103923_MISEQ_000000000-BRN47_1',
             'type': 'demultiplexed_fastq',
             'table': 'run'
         },
         {
             'name': 'IGF103923_MISEQ1_000000000-BRN47_1',
             'type': 'demultiplexed_fastq',
             'table': 'run'
         },
     ]
     collection_files_data = [
         {
             'name': 'IGF103923_MISEQ_000000000-BRN47_1',
             'type': 'demultiplexed_fastq',
             'file_path': '/path/S20180405S_S1_L001_R1_001.fastq.gz'
         },
         {
             'name': 'IGF103923_MISEQ_000000000-BRN47_1',
             'type': 'demultiplexed_fastq',
             'file_path': '/path/S20180405S_S1_L001_R2_001.fastq.gz'
         },
         {
             'name': 'IGF103923_MISEQ1_000000000-BRN47_1',
             'type': 'demultiplexed_fastq',
             'file_path': '/path/S20180405S_S3_L001_R2_001.fastq.gz'
         },
     ]
     ca = CollectionAdaptor(**{'session': base.session})
     ca.store_collection_and_attribute_data(data=collection_data)
     ca.create_collection_group(data=collection_files_data)
     experiment_data = [{
         'project_igf_id': 'IGFQ000123_avik_10-4-2018_Miseq',
         'sample_igf_id': 'IGF103923',
         'experiment_igf_id': 'IGF103923_MISEQ',
         'library_name': 'IGF103923',
         'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
         'library_strategy': 'RNA-SEQ',
         'experiment_type': 'TENX-TRANSCRIPTOME-3P',
         'library_layout': 'PAIRED',
         'platform_name': 'MISEQ'
     }, {
         'project_igf_id': 'IGFQ000123_avik_10-4-2018_Miseq',
         'sample_igf_id': 'IGF103923',
         'experiment_igf_id': 'IGF103923_MISEQ1',
         'library_name': 'IGF103923_1',
         'library_source': 'GENOMIC_SINGLE_CELL',
         'library_strategy': 'WGS',
         'experiment_type': 'UNKNOWN',
         'library_layout': 'PAIRED',
         'platform_name': 'MISEQ'
     }]
     ea = ExperimentAdaptor(**{'session': base.session})
     ea.store_project_and_attribute_data(data=experiment_data)
     run_data = [{
         'experiment_igf_id': 'IGF103923_MISEQ',
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'run_igf_id': 'IGF103923_MISEQ_000000000-BRN47_1',
         'lane_number': '1'
     }, {
         'experiment_igf_id': 'IGF103923_MISEQ1',
         'seqrun_igf_id': '180416_M03291_0139_000000000-BRN47',
         'run_igf_id': 'IGF103923_MISEQ1_000000000-BRN47_1',
         'lane_number': '1'
     }]
     ra = RunAdaptor(**{'session': base.session})
     ra.store_run_and_attribute_data(data=run_data)
     base.close_session()
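
 # A minimal companion tearDown sketch (an assumption, not shown in the original) that
 # drops the schema created by the setUp fixture above and removes the sqlite database
 # file named in data/dbconfig.json.
 def tearDown(self):
     Base.metadata.drop_all(self.engine)
     if os.path.exists(self.dbname):
         os.remove(self.dbname)
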
Example #27
  def run(self):
    try:
      project_igf_id = self.param_required('project_igf_id')
      sample_igf_id = self.param_required('sample_igf_id')
      file_list = self.param_required('file_list')
      remote_user = self.param_required('remote_user')
      remote_host = self.param_required('remote_host')
      remote_project_path = self.param_required('remote_project_path')
      dir_labels = self.param_required('dir_labels')
      igf_session_class = self.param_required('igf_session_class')
      force_overwrite = self.param('force_overwrite')
      collect_remote_file = self.param('collect_remote_file')
      collection_name = self.param('collection_name')
      collection_type = self.param('collection_type')
      collection_table = self.param('collection_table')
      file_location = self.param('file_location')
      use_ephemeral_space = self.param('use_ephemeral_space')
      destination_output_path = \
        os.path.join(
          remote_project_path,
          project_igf_id)                                                       # get base destination path
      if isinstance(dir_labels, list) and \
         len(dir_labels) > 0:
        destination_output_path=\
          os.path.join(destination_output_path,
                       *dir_labels)

      if collect_remote_file:
        if collection_name is None or \
           collection_type is None:
           raise ValueError('Name and type are required for db collection')

      output_file_list = list()
      temp_work_dir = \
        get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get temp dir
      for file in file_list:
        if not os.path.exists(file):
          raise IOError('file {0} not found'.\
                        format(file))

        if os.path.isfile(file):
          copy2(
            file,
            os.path.join(
              temp_work_dir,
              os.path.basename(file)))                                          # copy file to a temp dir
          dest_file_path = \
            os.path.join(
              destination_output_path,
              os.path.basename(file))                                           # get destination file path
          os.chmod(
            os.path.join(
              temp_work_dir,
              os.path.basename(file)),
            mode=0o764)                                                         # set file permission
        elif os.path.isdir(file):
          copytree(\
            file,
            os.path.join(
              temp_work_dir,
              os.path.basename(file)))                                          # copy dir to a temp dir
          dest_file_path=destination_output_path
          for root,dirs,files in os.walk(temp_work_dir):
            for dir_name in dirs:
              os.chmod(
                os.path.join(root,dir_name),
                mode=0o775)
            for file_name in files:
              os.chmod(
                os.path.join(root,file_name),
                mode=0o764)                                                     # changing file and dir permissions for remote files
        else:
          raise ValueError('Unknown source file type: {0}'.\
                           format(file))

        #os.chmod(
        #  os.path.join(
        #    temp_work_dir,
        #    os.path.basename(file)),
        #  mode=0o754)                                                                       # set file permission
        copy_remote_file(\
          source_path=os.path.join(temp_work_dir,
                                   os.path.basename(file)),
          destinationa_path=dest_file_path,
          destination_address='{0}@{1}'.format(remote_user,remote_host),
          force_update=force_overwrite
        )                                                                       # copy file to remote
        if os.path.isdir(file):
          dest_file_path=\
            os.path.join(\
              dest_file_path,
              os.path.basename(file))                                           # fix for dir input

        output_file_list.append(dest_file_path)

      remove_dir(dir_path=temp_work_dir)                                        # remove temp dir
      self.param('dataflow_params',
                 {'status': 'done',
                  'output_list':output_file_list})                              # add dataflow params
      if collect_remote_file:
        data=list()
        remove_data_list=[{'name':collection_name,
                           'type':collection_type}]
        for file in output_file_list:
          data.append(
            {'name':collection_name,
             'type':collection_type,
             'table':collection_table,
             'file_path':file,
             'location':file_location
            }
          )

        ca = CollectionAdaptor(**{'session_class':igf_session_class})
        ca.start_session()
        try:
          ca.remove_collection_group_info(
            data=remove_data_list,
            autosave=False)                                                     # remove existing data before loading new collection
          ca.load_file_and_create_collection(
            data=data,
            autosave=False,
            calculate_file_size_and_md5=False)                                  # load remote files to db
          ca.commit_session()                                                   # commit changes
          ca.close_session()
        except:
          ca.rollback_session()                                                 # rollback changes
          ca.close_session()
          raise

    except Exception as e:
      message = \
        'project: {2}, sample:{3}, Error in {0}: {1}'.\
        format(
          self.__class__.__name__,
          e,
          project_igf_id,
          sample_igf_id)
      self.warning(message)
      self.post_message_to_slack(message,reaction='fail')                       # post msg to slack for failed jobs
      raise
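
# A minimal standalone sketch (an assumption, not the library's implementation) of a
# remote copy helper with the same call shape as copy_remote_file used above, built on
# rsync over ssh; the parameter name destinationa_path mirrors the spelling used in the
# calls above, and all paths are illustrative.
import subprocess

def copy_remote_file_sketch(source_path, destinationa_path,
                            destination_address=None, force_update=False):
    '''Copy source_path to destinationa_path, optionally on destination_address (user@host)'''
    dest = destinationa_path
    if destination_address is not None:
        dest = '{0}:{1}'.format(destination_address, destinationa_path)
    cmd = ['rsync', '-az']
    if not force_update:
        cmd.append('--ignore-existing')                                          # skip files already present unless forced
    cmd.extend([source_path, dest])
    subprocess.check_call(cmd)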