def test_load_file_to_disk_and_db7(self):
    au = Analysis_collection_utils(
        dbsession_class=self.session_class,
        analysis_name='AnalysisA',
        tag_name='TagA',
        collection_name='RunA',
        collection_type='AnalysisA_Files',
        collection_table='run',
        base_path=self.temp_base_dir)
    input_file_list = [
        os.path.join(self.temp_work_dir, file_name)
        for file_name in self.input_list]
    output_list = au.load_file_to_disk_and_db(
        input_file_list=input_file_list,
        withdraw_exisitng_collection=False)                      # loading all files to same collection
    base = BaseAdaptor(**{'session_class': self.session_class})
    base.start_session()
    ca = CollectionAdaptor(**{'session': base.session})
    ca_files = ca.get_collection_files(
        collection_name='RunA',
        collection_type='AnalysisA_Files',
        output_mode='dataframe')
    file_list = list(ca_files['file_path'].to_dict().values())
    datestamp = get_datestamp_label()
    test_file = os.path.join(
        self.temp_base_dir,
        'ProjectA',
        'SampleA',
        'ExperimentA',
        'RunA',
        'AnalysisA',
        '{0}_{1}_{2}_{3}.{4}'.format('RunA', 'AnalysisA', 'TagA', datestamp, 'cram'))
    test_file = preprocess_path_name(input_path=test_file)
    self.assertTrue(test_file in file_list)
    self.assertTrue(test_file in output_list)
    base.close_session()
def test_load_file_to_disk_and_db2(self):
    au = Analysis_collection_utils(
        dbsession_class=self.session_class,
        analysis_name='AnalysisA',
        tag_name='TagA',
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        collection_table='project')
    input_file_list = [
        os.path.join(self.temp_work_dir, file_name)
        for file_name in self.input_list]
    output_list = au.load_file_to_disk_and_db(
        input_file_list=input_file_list,
        withdraw_exisitng_collection=True)                       # withdrawing existing collection group before loading new
    base = BaseAdaptor(**{'session_class': self.session_class})
    base.start_session()
    ca = CollectionAdaptor(**{'session': base.session})
    ca_files = ca.get_collection_files(
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        output_mode='dataframe')
    self.assertEqual(len(ca_files.index), 1)                     # check for unique collection group
    fa = FileAdaptor(**{'session': base.session})
    query = fa.session.query(File)
    fa_records = fa.fetch_records(query=query, output_mode='dataframe')
    self.assertEqual(
        len(fa_records['file_path'].to_dict()), 3)               # check if all files are present although only one collection group exists
    self.assertEqual(len(output_list), 3)
    base.close_session()
def test_load_file_to_disk_and_db4(self):
    au = Analysis_collection_utils(
        dbsession_class=self.session_class,
        analysis_name='AnalysisA',
        tag_name='TagA',
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        collection_table='project',
        rename_file=False)
    input_file_list = [
        os.path.join(self.temp_work_dir, file_name)
        for file_name in self.input_list]
    output_list = au.load_file_to_disk_and_db(
        input_file_list=input_file_list,
        withdraw_exisitng_collection=False)                      # loading all files to same collection, without rename
    base = BaseAdaptor(**{'session_class': self.session_class})
    base.start_session()
    ca = CollectionAdaptor(**{'session': base.session})
    ca_files = ca.get_collection_files(
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        output_mode='dataframe')
    file_list = list(ca_files['file_path'].to_dict().values())
    self.assertTrue(input_file_list[0] in file_list)
    self.assertTrue(input_file_list[0] in output_list)
    base.close_session()
def test_load_file_to_disk_and_db1(self):
    au = Analysis_collection_utils(
        dbsession_class=self.session_class,
        analysis_name='AnalysisA',
        tag_name='TagA',
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        collection_table='project')
    input_file_list = [
        os.path.join(self.temp_work_dir, file_name)
        for file_name in self.input_list]
    output_list = au.load_file_to_disk_and_db(
        input_file_list=input_file_list,
        withdraw_exisitng_collection=False)                      # loading all files to same collection
    base = BaseAdaptor(**{'session_class': self.session_class})
    base.start_session()
    ca = CollectionAdaptor(**{'session': base.session})
    ca_files = ca.get_collection_files(
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        output_mode='dataframe')
    self.assertEqual(len(ca_files.index), len(self.input_list))  # compare with input list
    self.assertEqual(len(output_list), len(self.input_list))     # compare with output list
    base.close_session()
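# The load tests above rely on fixtures not shown in this excerpt: self.session_class,
# self.temp_work_dir, self.temp_base_dir and self.input_list. Below is a minimal,
# hypothetical sketch of such a setUp/tearDown pair, assuming a sqlite dbconfig json
# similar to the repo's test configs; the real fixture also seeds project, sample,
# experiment and run records so that the 'ProjectA' and 'RunA' collections can point
# at existing rows. File names other than 'a.cram' are placeholders.
import os
import unittest
from igf_data.igfdb.igfTables import Base
from igf_data.igfdb.baseadaptor import BaseAdaptor
from igf_data.utils.dbutils import read_dbconf_json
from igf_data.utils.fileutils import get_temp_dir, remove_dir

class Analysis_collection_utils_fixture_sketch(unittest.TestCase):
    def setUp(self):
        self.dbconfig = 'data/dbconfig.json'                     # assumed sqlite db config path
        dbparam = read_dbconf_json(self.dbconfig)
        base = BaseAdaptor(**dbparam)
        self.engine = base.engine
        self.dbname = dbparam['dbname']
        Base.metadata.create_all(self.engine)                    # build schema in the throwaway test db
        self.session_class = base.get_session_class()
        self.temp_work_dir = get_temp_dir()                      # staging dir holding the input files
        self.temp_base_dir = get_temp_dir()                      # destination base path for loaded files
        self.input_list = ['a.cram', 'a.vcf.gz', 'b.tar.gz']     # 'a.cram' is used by the rename test; the rest are placeholders
        for file_name in self.input_list:
            with open(os.path.join(self.temp_work_dir, file_name), 'w') as fp:
                fp.write('ATCG')                                 # write dummy file contents

    def tearDown(self):
        Base.metadata.drop_all(self.engine)
        if os.path.exists(self.dbname):
            os.remove(self.dbname)                               # drop the sqlite db file
        remove_dir(self.temp_work_dir)
        remove_dir(self.temp_base_dir)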
def create_or_update_analysis_collection(
        self, file_path, dbsession,
        withdraw_exisitng_collection=True,
        autosave_db=True, force=True, remove_file=False):
    '''
    A method for creating or updating an analysis file collection in the database.
    Required elements will be collected from the database if a base_path element is given.

    :param file_path: File path to load as a db collection
    :param dbsession: An active database session
    :param withdraw_exisitng_collection: Remove existing collection group, default True
    :param autosave_db: Save changes to database, default True
    :param remove_file: A toggle for removing existing file from disk, default False
    :param force: Toggle for removing existing file collection, default True
    '''
    try:
        ca = CollectionAdaptor(**{'session': dbsession})
        collection_exists = \
            ca.get_collection_files(
                collection_name=self.collection_name,
                collection_type=self.collection_type)
        if len(collection_exists.index) > 0 and \
           withdraw_exisitng_collection:
            remove_data = [{
                'name': self.collection_name,
                'type': self.collection_type}]
            ca.remove_collection_group_info(
                data=remove_data,
                autosave=autosave_db)                            # removing all existing collection groups for the collection name and type
        fa = FileAdaptor(**{'session': dbsession})
        file_exists = fa.check_file_records_file_path(
            file_path=file_path)                                 # check if file is already present in db
        if file_exists and force:
            fa.remove_file_data_for_file_path(
                file_path=file_path,
                remove_file=remove_file,
                autosave=autosave_db)                            # remove entry from file table and disk
        collection_data = [{
            'name': self.collection_name,
            'type': self.collection_type,
            'table': self.collection_table,
            'file_path': file_path}]
        ca.load_file_and_create_collection(
            data=collection_data,
            calculate_file_size_and_md5=True,
            autosave=autosave_db)                                # load file, collection and create collection group
    except:
        raise
def test_create_or_update_analysis_collection_rename(self):
    au = Analysis_collection_utils(
        dbsession_class=self.session_class,
        analysis_name='AnalysisA',
        tag_name='TagA',
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        collection_table='project')
    base = BaseAdaptor(**{'session_class': self.session_class})
    base.start_session()
    au.create_or_update_analysis_collection(
        file_path=os.path.join(self.temp_work_dir, 'a.cram'),
        dbsession=base.session,
        autosave_db=True)
    base.close_session()
    base.start_session()
    ca = CollectionAdaptor(**{'session': base.session})
    ca_files = ca.get_collection_files(
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        output_mode='dataframe')
    self.assertEqual(len(ca_files.index), 1)
    au.create_or_update_analysis_collection(
        file_path=os.path.join(self.temp_work_dir, 'a.cram'),
        dbsession=base.session,
        autosave_db=True,
        force=True)                                              # overwriting existing file collection
    base.close_session()
    base.start_session()
    ca = CollectionAdaptor(**{'session': base.session})
    ca_files = ca.get_collection_files(
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        output_mode='dataframe')
    self.assertEqual(len(ca_files.index), 1)
    with self.assertRaises(sqlalchemy.exc.IntegrityError):       # re-loading the same file collection without force should fail
        au.create_or_update_analysis_collection(
            file_path=os.path.join(self.temp_work_dir, 'a.cram'),
            dbsession=base.session,
            autosave_db=True,
            force=False)
    base.close_session()
def run(self):
    try:
        project_igf_id = self.param_required('project_igf_id')
        experiment_igf_id = self.param_required('experiment_igf_id')
        sample_igf_id = self.param_required('sample_igf_id')
        run_igf_id = self.param_required('run_igf_id')
        igf_session_class = self.param_required('igf_session_class')
        fastq_collection_type = self.param('fastq_collection_type')
        fastq_collection_table = self.param('fastq_collection_table')
        ca = CollectionAdaptor(**{'session_class': igf_session_class})
        ca.start_session()
        fastq_files = ca.get_collection_files(
            collection_name=run_igf_id,
            collection_type=fastq_collection_type,
            collection_table=fastq_collection_table,
            output_mode='dataframe')
        ca.close_session()
        fastq_counts = len(fastq_files.index)
        fastq_list = list(fastq_files['file_path'].values)       # converting fastq filepaths to a list
        if not isinstance(fastq_list, list) or \
           len(fastq_list) == 0:
            raise ValueError(
                'No fastq file found for run {0}'.format(run_igf_id))
        for file in fastq_list:
            if not os.path.exists(file):
                raise IOError('Fastq file path {0} not found for run {1}'.\
                              format(file, run_igf_id))
        self.param(
            'dataflow_params',
            {'fastq_files_list': fastq_list})                    # add fastq filepaths to dataflow
    except Exception as e:
        message = 'project: {2}, sample:{3}, Error in {0}: {1}'.\
                  format(self.__class__.__name__, e, project_igf_id, sample_igf_id)
        self.warning(message)
        self.post_message_to_slack(
            message, reaction='fail')                            # post msg to slack for failed jobs
        raise
def run(self):
    '''
    A method for generating a scanpy report from cellranger output

    :param project_igf_id: A project igf id
    :param sample_igf_id: A sample igf id
    :param experiment_igf_id: An experiment igf id
    :param igf_session_class: A database session class
    :param species_name: species_name
    :param base_result_dir: Base results directory
    :param report_template_file: A template file for writing scanpy report
    :param analysis_name: Analysis name, default scanpy
    :param species_name_lookup: A dictionary for ensembl species name lookup
    :param cellranger_collection_type: Cellranger analysis collection type, default CELLRANGER_RESULTS
    :param scanpy_collection_type: Scanpy report collection type, default SCANPY_RESULTS
    :param collection_table: Collection table name for loading scanpy report, default experiment
    '''
    try:
        project_igf_id = self.param_required('project_igf_id')
        sample_igf_id = self.param_required('sample_igf_id')
        experiment_igf_id = self.param_required('experiment_igf_id')
        igf_session_class = self.param_required('igf_session_class')
        species_name = self.param_required('species_name')
        report_template_file = self.param_required('report_template_file')
        analysis_name = self.param_required('analysis_name')
        base_result_dir = self.param_required('base_result_dir')
        base_work_dir = self.param_required('base_work_dir')
        species_name_lookup = self.param('species_name_lookup')
        cellranger_collection_type = self.param('cellranger_collection_type')
        scanpy_collection_type = self.param('scanpy_collection_type')
        collection_table = self.param('collection_table')
        cellbrowser_dir_prefix = self.param('cellbrowser_dir_prefix')
        use_ephemeral_space = self.param('use_ephemeral_space')
        cellranger_tarfile = ''
        output_report = ''
        work_dir_prefix = \
            os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
        work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)          # get a run work dir
        if species_name in species_name_lookup.keys():                      # check for human or mice
            ensembl_species_name = species_name_lookup[species_name]        # get ensembl species name
            # fetch cellranger tar path from db
            if cellranger_tarfile == '':
                ca = CollectionAdaptor(**{'session_class': igf_session_class})
                ca.start_session()                                          # connect to database
                cellranger_tarfiles = \
                    ca.get_collection_files(
                        collection_name=experiment_igf_id,
                        collection_type=cellranger_collection_type,
                        output_mode='dataframe')                            # fetch collection files
                ca.close_session()
                if len(cellranger_tarfiles.index) == 0:
                    raise ValueError('No cellranger analysis output found for exp {0}'.\
                                     format(experiment_igf_id))
                cellranger_tarfile = \
                    cellranger_tarfiles['file_path'].values[0]              # select first file as analysis file
            # extract filtered metrics files from tar
            output_dir = \
                get_temp_dir(use_ephemeral_space=use_ephemeral_space)       # get a temp dir
            datestamp = get_datestamp_label()
            cellbrowser_dir = \
                os.path.join(
                    work_dir,
                    '{0}_{1}'.format(cellbrowser_dir_prefix, datestamp))
            cellbrowser_h5ad = \
                os.path.join(cellbrowser_dir, 'scanpy.h5ad')
            output_report = \
                os.path.join(output_dir, 'report.html')                     # get temp report path
            matrix_file, gene_file, barcode_file = \
                self._extract_cellranger_filtered_metrics(
                    tar_file=cellranger_tarfile,
                    output_dir=output_dir)                                  # get cellranger output files
            sp = \
                Scanpy_tool(
                    project_name=project_igf_id,
                    sample_name=sample_igf_id,
                    matrix_file=matrix_file,
                    features_tsv=gene_file,
                    barcode_tsv=barcode_file,
                    html_template_file=report_template_file,
                    species_name=ensembl_species_name,
                    output_file=output_report,
                    use_ephemeral_space=use_ephemeral_space,
                    cellbrowser_h5ad=cellbrowser_h5ad)
            sp.generate_report()                                            # generate scanpy report
            # load files to db and disk
            au = \
                Analysis_collection_utils(
                    dbsession_class=igf_session_class,
                    analysis_name=analysis_name,
                    tag_name=species_name,
                    collection_name=experiment_igf_id,
                    collection_type=scanpy_collection_type,
                    collection_table=collection_table,
                    base_path=base_result_dir)                              # initiate loading of report file
            output_file_list = \
                au.load_file_to_disk_and_db(
                    input_file_list=[output_report],
                    withdraw_exisitng_collection=True)                      # load file to db and disk
            output_report = output_file_list[0]
            self.param(
                'dataflow_params',
                {'output_report': output_report,
                 'scanpy_h5ad_path': cellbrowser_h5ad})                     # pass on output report filepath
    except Exception as e:
        message = 'project: {2}, sample:{3}, Error in {0}: {1}'.\
                  format(self.__class__.__name__, e, project_igf_id, sample_igf_id)
        self.warning(message)
        self.post_message_to_slack(
            message, reaction='fail')                                       # post msg to slack for failed jobs
        raise
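# The run() method above calls self._extract_cellranger_filtered_metrics(), which is
# not shown in this excerpt. Below is a minimal, hypothetical sketch of such a helper,
# not the repo's implementation: it untars the cellranger output and returns the
# filtered matrix, features and barcodes files. The member paths assume the standard
# cellranger filtered_feature_bc_matrix layout.
def _extract_cellranger_filtered_metrics(self, tar_file, output_dir):
    import fnmatch
    import os
    import tarfile
    matrix_file = gene_file = barcode_file = None
    with tarfile.open(tar_file, 'r') as tar:
        for member in tar.getmembers():
            if fnmatch.fnmatch(member.name, '*filtered_feature_bc_matrix/matrix.mtx*'):
                tar.extract(member, path=output_dir)             # extract count matrix
                matrix_file = os.path.join(output_dir, member.name)
            elif fnmatch.fnmatch(member.name, '*filtered_feature_bc_matrix/features.tsv*'):
                tar.extract(member, path=output_dir)             # extract gene / feature list
                gene_file = os.path.join(output_dir, member.name)
            elif fnmatch.fnmatch(member.name, '*filtered_feature_bc_matrix/barcodes.tsv*'):
                tar.extract(member, path=output_dir)             # extract cell barcodes
                barcode_file = os.path.join(output_dir, member.name)
    if matrix_file is None or gene_file is None or barcode_file is None:
        raise ValueError(
            'Missing filtered matrix files in {0}'.format(tar_file))
    return matrix_file, gene_file, barcode_file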
def run(self):
    try:
        seqrun_igf_id = self.param_required('seqrun_igf_id')
        seqrun_source = self.param_required('seqrun_source')
        seqrun_server = self.param_required('seqrun_server')
        seqrun_user = self.param_required('seqrun_user')
        igf_session_class = self.param_required('igf_session_class')
        seqrun_md5_type = self.param_required('seqrun_md5_type')
        hpc_location = self.param_required('hpc_location')
        db_file_location_label = self.param_required('db_file_location_label')
        db_file_path_label = self.param_required('db_file_path_label')
        seqrun_path = os.path.join(seqrun_source, seqrun_igf_id)             # get new seqrun path
        seqrun_server_login = '{0}@{1}'.format(seqrun_user, seqrun_server)   # build remote login as user@host
        subprocess.check_call(
            ['ssh', seqrun_server_login, 'ls', seqrun_path])                 # check remote seqrun path
        ca = CollectionAdaptor(**{'session_class': igf_session_class})       # get the md5 list from db
        ca.start_session()
        files = ca.get_collection_files(
            collection_name=seqrun_igf_id,
            collection_type=seqrun_md5_type)                                 # fetch file collection
        files = files.to_dict(orient='records')
        ca.close_session()
        if len(files) > 1:
            raise ValueError('sequencing run {0} has more than one md5 json file'.\
                             format(seqrun_igf_id))
        if len(files) == 0:
            raise ValueError('sequencing run {0} does not have any md5 json file'.\
                             format(seqrun_igf_id))
        md5_json_location = files[0][db_file_location_label]
        md5_json_path = files[0][db_file_path_label]
        temp_dir = None
        if md5_json_location != hpc_location:
            temp_dir = get_temp_dir(work_dir=os.getcwd())                    # create a temp directory
            destination_path = os.path.join(
                temp_dir, os.path.basename(md5_json_path))                   # get destination path for md5 file
            copy_remote_file(
                source_path=md5_json_path,
                destinationa_path=destination_path,
                source_address=seqrun_server_login)                          # copy remote file to local disk
            md5_json_path = destination_path                                 # set md5 json filepath
        with open(md5_json_path) as json_data:
            md5_json = json.load(json_data)                                  # read json data, get all file and md5 from json file
        self.param('sub_tasks', md5_json)                                    # seed dataflow
        if temp_dir is not None:
            remove_dir(temp_dir)                                             # remove temp dir once it is no longer required
        message = 'seqrun: {0}, seeded {1} files for copy'.format(
            seqrun_igf_id, len(md5_json))
        self.warning(message)
        self.post_message_to_slack(message, reaction='pass')
        self.comment_asana_task(task_name=seqrun_igf_id, comment=message)
    except Exception as e:
        message = 'Error in {0}: {1}, seqrun: {2}'.format(
            self.__class__.__name__, e, seqrun_igf_id)
        self.warning(message)
        self.post_message_to_slack(message, reaction='fail')
        self.comment_asana_task(task_name=seqrun_igf_id, comment=message)
        raise
def run(self):
    '''
    A method for resetting md5 values in the samplesheet json files for all seqrun ids
    '''
    try:
        db_connected = False
        seqrun_list = self._read_seqrun_list(self.seqrun_igf_list)           # fetch list of seqrun ids from input file
        if len(seqrun_list) > 0:
            base = self.base_adaptor
            base.start_session()                                             # connect to database
            db_connected = True
            ca = CollectionAdaptor(**{'session': base.session})              # connect to collection table
            fa = FileAdaptor(**{'session': base.session})                    # connect to file table
            for seqrun_id in seqrun_list:
                try:
                    files_data = ca.get_collection_files(
                        collection_name=seqrun_id,
                        collection_type=self.json_collection_type,
                        output_mode='one_or_none')                           # check for existing md5 json file in db
                    # TO DO: skip seqrun_id if pipeline is still running
                    if files_data is not None:
                        json_file_path = [
                            element.file_path
                            for element in files_data
                            if isinstance(element, File)][0]                 # get md5 json file path from sqlalchemy collection results
                        samplesheet_md5 = self._get_samplesheet_md5(seqrun_id)  # get md5 value for new samplesheet file
                        new_json_path = self._get_updated_json_file(
                            json_file_path,
                            samplesheet_md5,
                            self.samplesheet_name)                           # get updated md5 json file if samplesheet has been changed
                        if new_json_path is not None:
                            new_json_file_md5 = calculate_file_checksum(
                                filepath=new_json_path,
                                hasher='md5')
                            fa.update_file_table_for_file_path(
                                file_path=json_file_path,
                                tag='md5',
                                value=new_json_file_md5,
                                autosave=False)                              # update json file md5 in db, don't commit yet
                            move_file(
                                source_path=new_json_path,
                                destinationa_path=json_file_path,
                                force=True)                                  # overwrite json file
                            base.commit_session()                            # save changes in db
                            message = 'Setting new Samplesheet info for run {0}'.\
                                      format(seqrun_id)
                            if self.log_slack:
                                self.igf_slack.post_message_to_channel(
                                    message, reaction='pass')                # send log to slack
                            if self.log_asana:
                                self.igf_asana.comment_asana_task(
                                    task_name=seqrun_id,
                                    comment=message)                         # send log to asana
                        else:
                            message = 'no change in samplesheet for seqrun {0}'.format(seqrun_id)
                            warnings.warn(message)
                            if self.log_slack:
                                self.igf_slack.post_message_to_channel(
                                    message, reaction='pass')
                    else:
                        message = 'No md5 json file found for seqrun_igf_id: {0}'.\
                                  format(seqrun_id)
                        warnings.warn(message)                               # not raising any exception if seqrun id is not found
                        if self.log_slack:
                            self.igf_slack.post_message_to_channel(
                                message, reaction='fail')
                except Exception as e:
                    base.rollback_session()
                    message = 'Failed to update json file for seqrun id {0}, error : {1}'.\
                              format(seqrun_id, e)
                    warnings.warn(message)
                    if self.log_slack:
                        self.igf_slack.post_message_to_channel(
                            message, reaction='fail')
            base.close_session()                                             # close db connection
            if self.clean_up:
                self._clear_seqrun_list(self.seqrun_igf_list)                # clear input file
        else:
            message = 'No new seqrun id found for changing samplesheet md5'
            warnings.warn(message)
            if self.log_slack:
                self.igf_slack.post_message_to_channel(
                    message, reaction='sleep')
    except:
        if db_connected:
            base.rollback_session()
            base.close_session()
        raise
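# The run() method above uses self._read_seqrun_list() to fetch seqrun ids from the
# input file. A minimal, hypothetical sketch of that helper is shown below (not the
# repo's code), assuming one seqrun id per line with blank lines ignored.
def _read_seqrun_list(self, seqrun_igf_list):
    if not os.path.exists(seqrun_igf_list):
        raise IOError('seqrun list file {0} not found'.format(seqrun_igf_list))
    with open(seqrun_igf_list, 'r') as fp:
        seqrun_list = [line.strip() for line in fp if line.strip() != '']   # one seqrun id per non-empty line
    return seqrun_list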