def test_load_file_to_disk_and_db7(self):
    au = Analysis_collection_utils(
        dbsession_class=self.session_class,
        analysis_name='AnalysisA',
        tag_name='TagA',
        collection_name='RunA',
        collection_type='AnalysisA_Files',
        collection_table='run',
        base_path=self.temp_base_dir)
    input_file_list = [
        os.path.join(self.temp_work_dir, file_name)
        for file_name in self.input_list]
    output_list = au.load_file_to_disk_and_db(
        input_file_list=input_file_list,
        withdraw_exisitng_collection=False)                      # loading all files to same collection
    base = BaseAdaptor(**{'session_class': self.session_class})
    base.start_session()
    ca = CollectionAdaptor(**{'session': base.session})
    ca_files = ca.get_collection_files(
        collection_name='RunA',
        collection_type='AnalysisA_Files',
        output_mode='dataframe')
    file_list = list(ca_files['file_path'].to_dict().values())
    datestamp = get_datestamp_label()
    test_file = os.path.join(
        self.temp_base_dir,
        'ProjectA',
        'SampleA',
        'ExperimentA',
        'RunA',
        'AnalysisA',
        '{0}_{1}_{2}_{3}.{4}'.format('RunA', 'AnalysisA', 'TagA', datestamp, 'cram'))
    test_file = preprocess_path_name(input_path=test_file)
    self.assertTrue(test_file in file_list)
    self.assertTrue(test_file in output_list)
    base.close_session()
def test_load_file_to_disk_and_db2(self):
    au = Analysis_collection_utils(
        dbsession_class=self.session_class,
        analysis_name='AnalysisA',
        tag_name='TagA',
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        collection_table='project')
    input_file_list = [
        os.path.join(self.temp_work_dir, file_name)
        for file_name in self.input_list]
    output_list = au.load_file_to_disk_and_db(
        input_file_list=input_file_list,
        withdraw_exisitng_collection=True)                       # withdrawing existing collection group before loading new
    base = BaseAdaptor(**{'session_class': self.session_class})
    base.start_session()
    ca = CollectionAdaptor(**{'session': base.session})
    ca_files = ca.get_collection_files(
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        output_mode='dataframe')
    self.assertEqual(len(ca_files.index), 1)                     # check for unique collection group
    fa = FileAdaptor(**{'session': base.session})
    query = fa.session.query(File)
    fa_records = fa.fetch_records(query=query, output_mode='dataframe')
    self.assertEqual(
        len(fa_records['file_path'].to_dict()), 3)               # check if all files are present although only one collection group exists
    self.assertEqual(len(output_list), 3)
    base.close_session()
def test_load_file_to_disk_and_db4(self):
    au = Analysis_collection_utils(
        dbsession_class=self.session_class,
        analysis_name='AnalysisA',
        tag_name='TagA',
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        collection_table='project',
        rename_file=False)
    input_file_list = [
        os.path.join(self.temp_work_dir, file_name)
        for file_name in self.input_list]
    output_list = au.load_file_to_disk_and_db(
        input_file_list=input_file_list,
        withdraw_exisitng_collection=False)                      # loading all files to same collection, without rename
    base = BaseAdaptor(**{'session_class': self.session_class})
    base.start_session()
    ca = CollectionAdaptor(**{'session': base.session})
    ca_files = ca.get_collection_files(
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        output_mode='dataframe')
    file_list = list(ca_files['file_path'].to_dict().values())
    self.assertTrue(input_file_list[0] in file_list)
    self.assertTrue(input_file_list[0] in output_list)
    base.close_session()
def test_load_file_to_disk_and_db1(self):
    au = Analysis_collection_utils(
        dbsession_class=self.session_class,
        analysis_name='AnalysisA',
        tag_name='TagA',
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        collection_table='project')
    input_file_list = [
        os.path.join(self.temp_work_dir, file_name)
        for file_name in self.input_list]
    output_list = au.load_file_to_disk_and_db(
        input_file_list=input_file_list,
        withdraw_exisitng_collection=False)                      # loading all files to same collection
    base = BaseAdaptor(**{'session_class': self.session_class})
    base.start_session()
    ca = CollectionAdaptor(**{'session': base.session})
    ca_files = ca.get_collection_files(
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        output_mode='dataframe')
    self.assertEqual(len(ca_files.index), len(self.input_list))  # compare with input list
    self.assertEqual(len(output_list), len(self.input_list))     # compare with output list
    base.close_session()
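# The load tests above rely on fixtures not shown in this excerpt: self.session_class,
# self.temp_work_dir, self.temp_base_dir and self.input_list. Below is a minimal,
# hypothetical sketch of such a setUp/tearDown pair, assuming a sqlite dbconfig json
# similar to the repo's test configs; the real fixture also seeds project, sample,
# experiment and run records so that the 'ProjectA' and 'RunA' collections can point
# at existing rows. File names other than 'a.cram' are placeholders.
import os
import unittest
from igf_data.igfdb.igfTables import Base
from igf_data.igfdb.baseadaptor import BaseAdaptor
from igf_data.utils.dbutils import read_dbconf_json
from igf_data.utils.fileutils import get_temp_dir, remove_dir

class Analysis_collection_utils_fixture_sketch(unittest.TestCase):
    def setUp(self):
        self.dbconfig = 'data/dbconfig.json'                     # assumed sqlite db config path
        dbparam = read_dbconf_json(self.dbconfig)
        base = BaseAdaptor(**dbparam)
        self.engine = base.engine
        self.dbname = dbparam['dbname']
        Base.metadata.create_all(self.engine)                    # build schema in the throwaway test db
        self.session_class = base.get_session_class()
        self.temp_work_dir = get_temp_dir()                      # staging dir holding the input files
        self.temp_base_dir = get_temp_dir()                      # destination base path for loaded files
        self.input_list = ['a.cram', 'a.vcf.gz', 'b.tar.gz']     # 'a.cram' is used by the rename test; the rest are placeholders
        for file_name in self.input_list:
            with open(os.path.join(self.temp_work_dir, file_name), 'w') as fp:
                fp.write('ATCG')                                 # write dummy file contents

    def tearDown(self):
        Base.metadata.drop_all(self.engine)
        if os.path.exists(self.dbname):
            os.remove(self.dbname)                               # drop the sqlite db file
        remove_dir(self.temp_work_dir)
        remove_dir(self.temp_base_dir)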
def create_or_update_analysis_collection(
        self, file_path, dbsession,
        withdraw_exisitng_collection=True,
        autosave_db=True, force=True, remove_file=False):
    '''
    A method for creating or updating an analysis file collection in the database.
    Required elements will be collected from the database if a base_path element is given.

    :param file_path: File path to load as a db collection
    :param dbsession: An active database session
    :param withdraw_exisitng_collection: Remove existing collection group, default True
    :param autosave_db: Save changes to database, default True
    :param remove_file: A toggle for removing existing file from disk, default False
    :param force: Toggle for removing existing file collection, default True
    '''
    try:
        ca = CollectionAdaptor(**{'session': dbsession})
        collection_exists = \
            ca.get_collection_files(
                collection_name=self.collection_name,
                collection_type=self.collection_type)
        if len(collection_exists.index) > 0 and \
           withdraw_exisitng_collection:
            remove_data = [{
                'name': self.collection_name,
                'type': self.collection_type}]
            ca.remove_collection_group_info(
                data=remove_data,
                autosave=autosave_db)                            # removing all existing collection groups for the collection name and type
        fa = FileAdaptor(**{'session': dbsession})
        file_exists = fa.check_file_records_file_path(
            file_path=file_path)                                 # check if file is already present in db
        if file_exists and force:
            fa.remove_file_data_for_file_path(
                file_path=file_path,
                remove_file=remove_file,
                autosave=autosave_db)                            # remove entry from file table and disk
        collection_data = [{
            'name': self.collection_name,
            'type': self.collection_type,
            'table': self.collection_table,
            'file_path': file_path}]
        ca.load_file_and_create_collection(
            data=collection_data,
            calculate_file_size_and_md5=True,
            autosave=autosave_db)                                # load file, collection and create collection group
    except:
        raise
def test_create_or_update_analysis_collection_rename(self):
    au = Analysis_collection_utils(
        dbsession_class=self.session_class,
        analysis_name='AnalysisA',
        tag_name='TagA',
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        collection_table='project')
    base = BaseAdaptor(**{'session_class': self.session_class})
    base.start_session()
    au.create_or_update_analysis_collection(
        file_path=os.path.join(self.temp_work_dir, 'a.cram'),
        dbsession=base.session,
        autosave_db=True)
    base.close_session()
    base.start_session()
    ca = CollectionAdaptor(**{'session': base.session})
    ca_files = ca.get_collection_files(
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        output_mode='dataframe')
    self.assertEqual(len(ca_files.index), 1)
    au.create_or_update_analysis_collection(
        file_path=os.path.join(self.temp_work_dir, 'a.cram'),
        dbsession=base.session,
        autosave_db=True,
        force=True)                                              # overwriting existing file collection
    base.close_session()
    base.start_session()
    ca = CollectionAdaptor(**{'session': base.session})
    ca_files = ca.get_collection_files(
        collection_name='ProjectA',
        collection_type='AnalysisA_Files',
        output_mode='dataframe')
    self.assertEqual(len(ca_files.index), 1)
    with self.assertRaises(sqlalchemy.exc.IntegrityError):       # re-loading the same file collection without force should fail
        au.create_or_update_analysis_collection(
            file_path=os.path.join(self.temp_work_dir, 'a.cram'),
            dbsession=base.session,
            autosave_db=True,
            force=False)
    base.close_session()
def run(self):
    try:
        project_igf_id = self.param_required('project_igf_id')
        experiment_igf_id = self.param_required('experiment_igf_id')
        sample_igf_id = self.param_required('sample_igf_id')
        run_igf_id = self.param_required('run_igf_id')
        igf_session_class = self.param_required('igf_session_class')
        fastq_collection_type = self.param('fastq_collection_type')
        fastq_collection_table = self.param('fastq_collection_table')
        ca = CollectionAdaptor(**{'session_class': igf_session_class})
        ca.start_session()
        fastq_files = ca.get_collection_files(
            collection_name=run_igf_id,
            collection_type=fastq_collection_type,
            collection_table=fastq_collection_table,
            output_mode='dataframe')
        ca.close_session()
        fastq_counts = len(fastq_files.index)
        fastq_list = list(fastq_files['file_path'].values)       # converting fastq filepaths to a list
        if not isinstance(fastq_list, list) or \
           len(fastq_list) == 0:
            raise ValueError(
                'No fastq file found for run {0}'.format(run_igf_id))
        for file in fastq_list:
            if not os.path.exists(file):
                raise IOError('Fastq file path {0} not found for run {1}'.\
                              format(file, run_igf_id))
        self.param(
            'dataflow_params',
            {'fastq_files_list': fastq_list})                    # add fastq filepaths to dataflow
    except Exception as e:
        message = 'project: {2}, sample:{3}, Error in {0}: {1}'.\
                  format(self.__class__.__name__, e, project_igf_id, sample_igf_id)
        self.warning(message)
        self.post_message_to_slack(
            message, reaction='fail')                            # post msg to slack for failed jobs
        raise
def run(self):
    '''
    A method for generating a scanpy report from cellranger output

    :param project_igf_id: A project igf id
    :param sample_igf_id: A sample igf id
    :param experiment_igf_id: An experiment igf id
    :param igf_session_class: A database session class
    :param species_name: species_name
    :param base_result_dir: Base results directory
    :param report_template_file: A template file for writing scanpy report
    :param analysis_name: Analysis name, default scanpy
    :param species_name_lookup: A dictionary for ensembl species name lookup
    :param cellranger_collection_type: Cellranger analysis collection type, default CELLRANGER_RESULTS
    :param scanpy_collection_type: Scanpy report collection type, default SCANPY_RESULTS
    :param collection_table: Collection table name for loading scanpy report, default experiment
    '''
    try:
        project_igf_id = self.param_required('project_igf_id')
        sample_igf_id = self.param_required('sample_igf_id')
        experiment_igf_id = self.param_required('experiment_igf_id')
        igf_session_class = self.param_required('igf_session_class')
        species_name = self.param_required('species_name')
        report_template_file = self.param_required('report_template_file')
        analysis_name = self.param_required('analysis_name')
        base_result_dir = self.param_required('base_result_dir')
        base_work_dir = self.param_required('base_work_dir')
        species_name_lookup = self.param('species_name_lookup')
        cellranger_collection_type = self.param('cellranger_collection_type')
        scanpy_collection_type = self.param('scanpy_collection_type')
        collection_table = self.param('collection_table')
        cellbrowser_dir_prefix = self.param('cellbrowser_dir_prefix')
        use_ephemeral_space = self.param('use_ephemeral_space')
        cellranger_tarfile = ''
        output_report = ''
        work_dir_prefix = \
            os.path.join(
                base_work_dir,
                project_igf_id,
                sample_igf_id,
                experiment_igf_id)
        work_dir = self.get_job_work_dir(work_dir=work_dir_prefix)          # get a run work dir
        if species_name in species_name_lookup.keys():                      # check for human or mice
            ensembl_species_name = species_name_lookup[species_name]        # get ensembl species name
            # fetch cellranger tar path from db
            if cellranger_tarfile == '':
                ca = CollectionAdaptor(**{'session_class': igf_session_class})
                ca.start_session()                                          # connect to database
                cellranger_tarfiles = \
                    ca.get_collection_files(
                        collection_name=experiment_igf_id,
                        collection_type=cellranger_collection_type,
                        output_mode='dataframe')                            # fetch collection files
                ca.close_session()
                if len(cellranger_tarfiles.index) == 0:
                    raise ValueError('No cellranger analysis output found for exp {0}'.\
                                     format(experiment_igf_id))
                cellranger_tarfile = \
                    cellranger_tarfiles['file_path'].values[0]              # select first file as analysis file
            # extract filtered metrics files from tar
            output_dir = \
                get_temp_dir(use_ephemeral_space=use_ephemeral_space)       # get a temp dir
            datestamp = get_datestamp_label()
            cellbrowser_dir = \
                os.path.join(
                    work_dir,
                    '{0}_{1}'.format(cellbrowser_dir_prefix, datestamp))
            cellbrowser_h5ad = \
                os.path.join(cellbrowser_dir, 'scanpy.h5ad')
            output_report = \
                os.path.join(output_dir, 'report.html')                     # get temp report path
            matrix_file, gene_file, barcode_file = \
                self._extract_cellranger_filtered_metrics(
                    tar_file=cellranger_tarfile,
                    output_dir=output_dir)                                  # get cellranger output files
            sp = \
                Scanpy_tool(
                    project_name=project_igf_id,
                    sample_name=sample_igf_id,
                    matrix_file=matrix_file,
                    features_tsv=gene_file,
                    barcode_tsv=barcode_file,
                    html_template_file=report_template_file,
                    species_name=ensembl_species_name,
                    output_file=output_report,
                    use_ephemeral_space=use_ephemeral_space,
                    cellbrowser_h5ad=cellbrowser_h5ad)
            sp.generate_report()                                            # generate scanpy report
            # load files to db and disk
            au = \
                Analysis_collection_utils(
                    dbsession_class=igf_session_class,
                    analysis_name=analysis_name,
                    tag_name=species_name,
                    collection_name=experiment_igf_id,
                    collection_type=scanpy_collection_type,
                    collection_table=collection_table,
                    base_path=base_result_dir)                              # initiate loading of report file
            output_file_list = \
                au.load_file_to_disk_and_db(
                    input_file_list=[output_report],
                    withdraw_exisitng_collection=True)                      # load file to db and disk
            output_report = output_file_list[0]
            self.param(
                'dataflow_params',
                {'output_report': output_report,
                 'scanpy_h5ad_path': cellbrowser_h5ad})                     # pass on output report filepath
    except Exception as e:
        message = 'project: {2}, sample:{3}, Error in {0}: {1}'.\
                  format(self.__class__.__name__, e, project_igf_id, sample_igf_id)
        self.warning(message)
        self.post_message_to_slack(
            message, reaction='fail')                                       # post msg to slack for failed jobs
        raise
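# The run() method above calls self._extract_cellranger_filtered_metrics(), which is
# not shown in this excerpt. Below is a minimal, hypothetical sketch of such a helper,
# not the repo's implementation: it untars the cellranger output and returns the
# filtered matrix, features and barcodes files. The member paths assume the standard
# cellranger filtered_feature_bc_matrix layout.
def _extract_cellranger_filtered_metrics(self, tar_file, output_dir):
    import fnmatch
    import os
    import tarfile
    matrix_file = gene_file = barcode_file = None
    with tarfile.open(tar_file, 'r') as tar:
        for member in tar.getmembers():
            if fnmatch.fnmatch(member.name, '*filtered_feature_bc_matrix/matrix.mtx*'):
                tar.extract(member, path=output_dir)             # extract count matrix
                matrix_file = os.path.join(output_dir, member.name)
            elif fnmatch.fnmatch(member.name, '*filtered_feature_bc_matrix/features.tsv*'):
                tar.extract(member, path=output_dir)             # extract gene / feature list
                gene_file = os.path.join(output_dir, member.name)
            elif fnmatch.fnmatch(member.name, '*filtered_feature_bc_matrix/barcodes.tsv*'):
                tar.extract(member, path=output_dir)             # extract cell barcodes
                barcode_file = os.path.join(output_dir, member.name)
    if matrix_file is None or gene_file is None or barcode_file is None:
        raise ValueError(
            'Missing filtered matrix files in {0}'.format(tar_file))
    return matrix_file, gene_file, barcode_file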
def run(self):
    try:
        seqrun_igf_id = self.param_required('seqrun_igf_id')
        seqrun_source = self.param_required('seqrun_source')
        seqrun_server = self.param_required('seqrun_server')
        seqrun_user = self.param_required('seqrun_user')
        igf_session_class = self.param_required('igf_session_class')
        seqrun_md5_type = self.param_required('seqrun_md5_type')
        hpc_location = self.param_required('hpc_location')
        db_file_location_label = self.param_required('db_file_location_label')
        db_file_path_label = self.param_required('db_file_path_label')
        seqrun_path = os.path.join(seqrun_source, seqrun_igf_id)             # get new seqrun path
        seqrun_server_login = '{0}@{1}'.format(seqrun_user, seqrun_server)   # build remote login as user@host
        subprocess.check_call(
            ['ssh', seqrun_server_login, 'ls', seqrun_path])                 # check remote seqrun path
        ca = CollectionAdaptor(**{'session_class': igf_session_class})       # get the md5 list from db
        ca.start_session()
        files = ca.get_collection_files(
            collection_name=seqrun_igf_id,
            collection_type=seqrun_md5_type)                                 # fetch file collection
        files = files.to_dict(orient='records')
        ca.close_session()
        if len(files) > 1:
            raise ValueError('sequencing run {0} has more than one md5 json file'.\
                             format(seqrun_igf_id))
        if len(files) == 0:
            raise ValueError('sequencing run {0} does not have any md5 json file'.\
                             format(seqrun_igf_id))
        md5_json_location = files[0][db_file_location_label]
        md5_json_path = files[0][db_file_path_label]
        temp_dir = None
        if md5_json_location != hpc_location:
            temp_dir = get_temp_dir(work_dir=os.getcwd())                    # create a temp directory
            destination_path = os.path.join(
                temp_dir, os.path.basename(md5_json_path))                   # get destination path for md5 file
            copy_remote_file(
                source_path=md5_json_path,
                destinationa_path=destination_path,
                source_address=seqrun_server_login)                          # copy remote file to local disk
            md5_json_path = destination_path                                 # set md5 json filepath
        with open(md5_json_path) as json_data:
            md5_json = json.load(json_data)                                  # read json data, get all file and md5 from json file
        self.param('sub_tasks', md5_json)                                    # seed dataflow
        if temp_dir is not None:
            remove_dir(temp_dir)                                             # remove temp dir once it is no longer required
        message = 'seqrun: {0}, seeded {1} files for copy'.format(
            seqrun_igf_id, len(md5_json))
        self.warning(message)
        self.post_message_to_slack(message, reaction='pass')
        self.comment_asana_task(task_name=seqrun_igf_id, comment=message)
    except Exception as e:
        message = 'Error in {0}: {1}, seqrun: {2}'.format(
            self.__class__.__name__, e, seqrun_igf_id)
        self.warning(message)
        self.post_message_to_slack(message, reaction='fail')
        self.comment_asana_task(task_name=seqrun_igf_id, comment=message)
        raise
def run(self):
    '''
    A method for resetting md5 values in the samplesheet json files for all seqrun ids
    '''
    try:
        db_connected = False
        seqrun_list = self._read_seqrun_list(self.seqrun_igf_list)           # fetch list of seqrun ids from input file
        if len(seqrun_list) > 0:
            base = self.base_adaptor
            base.start_session()                                             # connect to database
            db_connected = True
            ca = CollectionAdaptor(**{'session': base.session})              # connect to collection table
            fa = FileAdaptor(**{'session': base.session})                    # connect to file table
            for seqrun_id in seqrun_list:
                try:
                    files_data = ca.get_collection_files(
                        collection_name=seqrun_id,
                        collection_type=self.json_collection_type,
                        output_mode='one_or_none')                           # check for existing md5 json file in db
                    # TO DO: skip seqrun_id if pipeline is still running
                    if files_data is not None:
                        json_file_path = [
                            element.file_path
                            for element in files_data
                            if isinstance(element, File)][0]                 # get md5 json file path from sqlalchemy collection results
                        samplesheet_md5 = self._get_samplesheet_md5(seqrun_id)  # get md5 value for new samplesheet file
                        new_json_path = self._get_updated_json_file(
                            json_file_path,
                            samplesheet_md5,
                            self.samplesheet_name)                           # get updated md5 json file if samplesheet has been changed
                        if new_json_path is not None:
                            new_json_file_md5 = calculate_file_checksum(
                                filepath=new_json_path,
                                hasher='md5')
                            fa.update_file_table_for_file_path(
                                file_path=json_file_path,
                                tag='md5',
                                value=new_json_file_md5,
                                autosave=False)                              # update json file md5 in db, don't commit yet
                            move_file(
                                source_path=new_json_path,
                                destinationa_path=json_file_path,
                                force=True)                                  # overwrite json file
                            base.commit_session()                            # save changes in db
                            message = 'Setting new Samplesheet info for run {0}'.\
                                      format(seqrun_id)
                            if self.log_slack:
                                self.igf_slack.post_message_to_channel(
                                    message, reaction='pass')                # send log to slack
                            if self.log_asana:
                                self.igf_asana.comment_asana_task(
                                    task_name=seqrun_id,
                                    comment=message)                         # send log to asana
                        else:
                            message = 'no change in samplesheet for seqrun {0}'.format(seqrun_id)
                            warnings.warn(message)
                            if self.log_slack:
                                self.igf_slack.post_message_to_channel(
                                    message, reaction='pass')
                    else:
                        message = 'No md5 json file found for seqrun_igf_id: {0}'.\
                                  format(seqrun_id)
                        warnings.warn(message)                               # not raising any exception if seqrun id is not found
                        if self.log_slack:
                            self.igf_slack.post_message_to_channel(
                                message, reaction='fail')
                except Exception as e:
                    base.rollback_session()
                    message = 'Failed to update json file for seqrun id {0}, error : {1}'.\
                              format(seqrun_id, e)
                    warnings.warn(message)
                    if self.log_slack:
                        self.igf_slack.post_message_to_channel(
                            message, reaction='fail')
            base.close_session()                                             # close db connection
            if self.clean_up:
                self._clear_seqrun_list(self.seqrun_igf_list)                # clear input file
        else:
            message = 'No new seqrun id found for changing samplesheet md5'
            warnings.warn(message)
            if self.log_slack:
                self.igf_slack.post_message_to_channel(
                    message, reaction='sleep')
    except:
        if db_connected:
            base.rollback_session()
            base.close_session()
        raise
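# The run() method above uses self._read_seqrun_list() to fetch seqrun ids from the
# input file. A minimal, hypothetical sketch of that helper is shown below (not the
# repo's code), assuming one seqrun id per line with blank lines ignored.
def _read_seqrun_list(self, seqrun_igf_list):
    if not os.path.exists(seqrun_igf_list):
        raise IOError('seqrun list file {0} not found'.format(seqrun_igf_list))
    with open(seqrun_igf_list, 'r') as fp:
        seqrun_list = [line.strip() for line in fp if line.strip() != '']   # one seqrun id per non-empty line
    return seqrun_list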