def create_or_update_analysis_collection(
      self, file_path, dbsession, withdraw_exisitng_collection=True,
      autosave_db=True, force=True, remove_file=False):
  '''
  A method for creating or updating an analysis file collection in db.
  Required elements are collected from the instance (collection_name,
  collection_type, collection_table).

  :param file_path: File path to load as a db collection
  :param dbsession: An active database session
  :param withdraw_exisitng_collection: Remove existing collection group, default True
  :param autosave_db: Save changes to database, default True
  :param force: Toggle for removing existing file collection, default True
  :param remove_file: A toggle for removing existing file from disk, default False
  '''
  ca = CollectionAdaptor(**{'session': dbsession})
  collection_exists = \
    ca.get_collection_files(
      collection_name=self.collection_name,
      collection_type=self.collection_type)
  if len(collection_exists.index) > 0 and \
     withdraw_exisitng_collection:
    remove_data = [{
      'name': self.collection_name,
      'type': self.collection_type}]
    ca.remove_collection_group_info(
      data=remove_data,
      autosave=autosave_db)                                                   # removing all existing collection groups for the collection name and type
  fa = FileAdaptor(**{'session': dbsession})
  file_exists = \
    fa.check_file_records_file_path(
      file_path=file_path)                                                    # check if file already present in db
  if file_exists and force:
    fa.remove_file_data_for_file_path(
      file_path=file_path,
      remove_file=remove_file,
      autosave=autosave_db)                                                   # remove entry from file table (and disk, when remove_file is set)
  collection_data = [{
    'name': self.collection_name,
    'type': self.collection_type,
    'table': self.collection_table,
    'file_path': file_path}]
  ca.load_file_and_create_collection(
    data=collection_data,
    calculate_file_size_and_md5=True,
    autosave=autosave_db)                                                     # load file, collection and create collection group
def setUp(self):
  '''
  Build a fresh sqlite test db, load one project / sample / experiment and
  register two analysis html files as an experiment collection.
  '''
  self.dbconfig = 'data/dbconfig.json'
  dbparam = read_dbconf_json(self.dbconfig)
  base = BaseAdaptor(**dbparam)
  self.engine = base.engine
  self.dbname = dbparam['dbname']
  Base.metadata.drop_all(self.engine)
  if os.path.exists(self.dbname):
    os.remove(self.dbname)                                                    # always start from an empty db file
  Base.metadata.create_all(self.engine)
  self.session_class = base.get_session_class()
  base.start_session()
  pa = ProjectAdaptor(**{'session': base.session})
  pa.store_project_and_attribute_data(
    data=[{'project_igf_id': 'ProjectA'}])                                    # load project data
  sa = SampleAdaptor(**{'session': base.session})
  sa.store_sample_and_attribute_data(
    data=[{'sample_igf_id': 'SampleA',
           'project_igf_id': 'ProjectA'}])                                    # store sample data
  ea = ExperimentAdaptor(**{'session': base.session})
  ea.store_project_and_attribute_data(
    data=[{'experiment_igf_id': 'ExperimentA',
           'sample_igf_id': 'SampleA',
           'library_name': 'SampleA',
           'platform_name': 'MISEQ',
           'project_igf_id': 'ProjectA'}])                                    # store experiment data; NOTE(review): uses store_project_and_attribute_data — confirm this is the intended adaptor method
  self.temp_dir = get_temp_dir()
  temp_files = ['a.csv', 'b.csv']
  for temp_file in temp_files:
    with open(os.path.join(self.temp_dir, temp_file), 'w') as fp:
      fp.write('A')                                                           # minimal on-disk fixture files
  collection_data = [
    {'name': 'ExperimentA',
     'type': 'AnalysisA_html',
     'table': 'experiment',
     'file_path': os.path.join(self.temp_dir, temp_file)}
      for temp_file in temp_files]
  ca = CollectionAdaptor(**{'session': base.session})
  ca.load_file_and_create_collection(
    data=collection_data,
    calculate_file_size_and_md5=False)
  base.close_session()
def setUp(self):
  '''
  Create db tables, write a seqrun file-md5 json fixture, register it as an
  ILLUMINA_BCL_MD5 collection and write the seqrun input list file.
  '''
  self.dbconfig = 'data/dbconfig.json'
  dbparam = read_dbconf_json(self.dbconfig)
  base = BaseAdaptor(**dbparam)
  self.engine = base.engine
  self.dbname = dbparam['dbname']
  Base.metadata.create_all(self.engine)
  self.session_class = base.get_session_class()
  self.json_file_path = 'data/reset_samplesheet_md5/seqrun1_file_md5.json'
  json_data = pd.DataFrame([
    {'file_md5': '1e7531158974b5a5b7cbb7dde09ac779',
     'seqrun_file_name': 'SampleSheet.csv'},
    {'file_md5': '2b22f945bc9e7e390af5432425783a03',
     'seqrun_file_name': 'RTAConfiguration.xml'}])
  with open(self.json_file_path, 'w') as jp:
    json.dump(json_data.to_dict(orient='records'), jp, indent=4)              # FIX: valid orient is 'records'; 'record' relied on removed abbreviation matching and raises on modern pandas
  self.initial_json_md5 = \
    calculate_file_checksum(filepath=self.json_file_path)
  self.correct_samplesheet_md5 = '259ed03f2e8c45980de121f7c3a70565'
  self.json_collection_name = 'seqrun1'
  self.json_collection_type = 'ILLUMINA_BCL_MD5'
  self.seqrun_path = 'data/reset_samplesheet_md5'
  self.seqrun_input_list = 'data/reset_samplesheet_md5/seqrun_input_list.txt'
  ca = CollectionAdaptor(**{'session_class': self.session_class})
  ca.start_session()
  data = pd.DataFrame([
    {'name': self.json_collection_name,
     'type': self.json_collection_type,
     'table': 'seqrun',
     'file_path': self.json_file_path}])
  ca.load_file_and_create_collection(data, autosave=True, hasher='md5')
  ca.close_session()
  with open(self.seqrun_input_list, 'w') as fp:
    fp.write(self.json_collection_name)
def run(self):
  '''
  Run FastQC on a single fastq file, copy the zip and html outputs to the
  result dir and, for known fastq files, register them as a db collection.
  Sets 'dataflow_params' on success; posts to slack and re-raises on error.
  '''
  seqrun_igf_id = None                                                        # bound before try so the except block can always format its message
  try:
    fastq_file = self.param_required('fastq_file')
    fastq_dir = self.param_required('fastq_dir')
    igf_session_class = self.param_required('igf_session_class')
    fastqc_exe = self.param_required('fastqc_exe')
    tag = self.param_required('tag')
    seqrun_igf_id = self.param_required('seqrun_igf_id')
    seqrun_date = self.param_required('seqrun_date')
    flowcell_id = self.param_required('flowcell_id')
    fastqc_options = self.param('fastqc_options')
    base_results_dir = self.param_required('base_results_dir')
    project_name = self.param_required('project_name')
    force_overwrite = self.param('force_overwrite')
    fastqc_dir_label = self.param('fastqc_dir_label')
    required_collection_table = self.param('required_collection_table')
    sample_name = self.param('sample_name')
    hpc_location = self.param('hpc_location')
    fastqc_collection_type = self.param('fastqc_collection_type')
    use_ephemeral_space = self.param('use_ephemeral_space')
    store_file = self.param('store_file')
    lane_index_info = os.path.basename(fastq_dir)                             # get the lane and index length info
    fastq_file_label = os.path.basename(fastq_file).replace('.fastq.gz', '')
    collection_name = None
    collection_table = None
    if tag == 'known' and store_file:                                         # fetch sample name for known fastq, if its not defined
      base = BaseAdaptor(**{'session_class': igf_session_class})
      base.start_session()                                                    # connect to db
      ca = CollectionAdaptor(**{'session': base.session})
      (collection_name, collection_table) = \
        ca.fetch_collection_name_and_table_from_file_path(
          file_path=fastq_file)                                               # fetch collection name and table info
      if collection_table != required_collection_table:
        raise ValueError(
          'Expected collection table {0} and got {1}, {2}'.format(
            required_collection_table, collection_table, fastq_file))
      ra = RunAdaptor(**{'session': base.session})
      sample = ra.fetch_sample_info_for_run(run_igf_id=collection_name)
      sample_name = sample['sample_igf_id']
      base.close_session()
    fastqc_result_dir = \
      os.path.join(
        base_results_dir,
        project_name,
        seqrun_date,
        flowcell_id,
        lane_index_info,
        tag)                                                                  # result dir path is generic
    if sample_name is not None:
      fastqc_result_dir = \
        os.path.join(fastqc_result_dir, sample_name)                          # add sample name to dir path if its available
    fastqc_result_dir = \
      os.path.join(
        fastqc_result_dir,
        fastq_file_label,
        fastqc_dir_label)                                                     # keep multiple files under same dir
    if os.path.exists(fastqc_result_dir) and force_overwrite:
      remove_dir(fastqc_result_dir)                                           # remove existing output dir if force_overwrite is true
    if not os.path.exists(fastqc_result_dir):
      os.makedirs(fastqc_result_dir, mode=0o775)                              # create output dir if its not present
    temp_work_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get a temp work dir
    if not os.path.exists(fastq_file):
      raise IOError('fastq file {0} not readable'.format(fastq_file))         # raise if fastq file path is not readable
    fastqc_output = \
      os.path.join(temp_work_dir, fastq_file_label)
    os.mkdir(fastqc_output)                                                   # create fastqc output dir
    fastqc_param = \
      self.format_tool_options(fastqc_options)                                # format fastqc params
    fastqc_cmd = [
      fastqc_exe,
      '-o', fastqc_output,
      '-d', temp_work_dir]                                                    # fastqc base parameters
    fastqc_cmd.extend(fastqc_param)                                           # add additional parameters
    fastqc_cmd.append(fastq_file)                                             # fastqc input file
    subprocess.check_call(fastqc_cmd)                                         # FIX: list form without shell=True — safe for paths with spaces or shell metacharacters
    fastqc_zip = None
    fastqc_html = None
    for root, _, files in os.walk(top=fastqc_output):
      for file in files:
        if fnmatch.fnmatch(file, '*.zip'):
          input_fastqc_zip = os.path.join(root, file)
          copy2(input_fastqc_zip, fastqc_result_dir)
          fastqc_zip = os.path.join(fastqc_result_dir, file)
        if fnmatch.fnmatch(file, '*.html'):
          input_fastqc_html = os.path.join(root, file)
          copy2(input_fastqc_html, fastqc_result_dir)
          fastqc_html = os.path.join(fastqc_result_dir, file)
    if fastqc_html is None or fastqc_zip is None:
      raise ValueError(
        'Missing required values, fastqc zip: {0}, fastqc html: {1}'.format(
          fastqc_zip, fastqc_html))
    if tag == 'known' and store_file:
      if collection_name is None:
        raise ValueError(
          'couldn\'t retrieve collection name for {0}'.format(fastq_file))
      fastqc_files = [
        {'name': collection_name,
         'type': fastqc_collection_type,
         'table': required_collection_table,
         'file_path': fastqc_zip,
         'location': hpc_location},
        {'name': collection_name,
         'type': fastqc_collection_type,
         'table': required_collection_table,
         'file_path': fastqc_html,
         'location': hpc_location}]
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      ca.start_session()
      ca.load_file_and_create_collection(data=fastqc_files)                   # store fastqc files to db
      ca.close_session()
    self.param('dataflow_params',
               {'fastqc_html': fastqc_html,
                'lane_index_info': lane_index_info,
                'sample_name': sample_name,
                'fastqc': {'fastq_dir': fastq_dir,
                           'fastqc_zip': fastqc_zip,
                           'fastqc_html': fastqc_html}})                      # set dataflow params
  except Exception as e:
    message = \
      'seqrun: {2}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, seqrun_igf_id)                            # seqrun_igf_id is always bound here (None if params failed early)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                      # post msg to slack for failed jobs
    raise
if __name__ == '__main__':
  # FIX: read cli args only when run as a script, not on import
  dbconfig_path = args.dbconfig_path
  collection_file_data = args.collection_file_data
  calculate_checksum = args.calculate_checksum
  try:
    dbconnected = False
    if not os.path.exists(dbconfig_path):
      raise IOError('Dbconfig file {0} not found'.format(dbconfig_path))
    if not os.path.exists(collection_file_data):
      raise IOError('Collection data json file {0} not found'.format(
        collection_file_data))
    dbparam = read_dbconf_json(dbconfig_path)                                 # read db config
    collection_data = read_json_data(collection_file_data)                    # read collection data json
    ca = CollectionAdaptor(**dbparam)
    ca.start_session()                                                        # connect to database
    dbconnected = True
    ca.load_file_and_create_collection(
      data=collection_data,
      calculate_file_size_and_md5=calculate_checksum,
      autosave=True)                                                          # load data and commit changes
    ca.close_session()
    dbconnected = False
  except Exception as e:
    if dbconnected:
      ca.rollback_session()                                                   # undo partial load before surfacing the error
      ca.close_session()
    raise ValueError('Error: {0}'.format(e))
def run(self):
  '''
  Copy a list of files or directories to a remote host under the project
  path and optionally register the remote paths as a db collection.
  Sets 'dataflow_params' on success; posts to slack and re-raises on error.
  '''
  project_igf_id = None
  sample_igf_id = None                                                        # bound before try so the except block can always format its message
  try:
    project_igf_id = self.param_required('project_igf_id')
    sample_igf_id = self.param_required('sample_igf_id')
    file_list = self.param_required('file_list')
    remote_user = self.param_required('remote_user')
    remote_host = self.param_required('remote_host')
    remote_project_path = self.param_required('remote_project_path')
    dir_labels = self.param_required('dir_labels')
    igf_session_class = self.param_required('igf_session_class')
    force_overwrite = self.param('force_overwrite')
    collect_remote_file = self.param('collect_remote_file')
    collection_name = self.param('collection_name')
    collection_type = self.param('collection_type')
    collection_table = self.param('collection_table')
    file_location = self.param('file_location')
    use_ephemeral_space = self.param('use_ephemeral_space')
    destination_output_path = \
      os.path.join(
        remote_project_path,
        project_igf_id)                                                       # get base destination path
    if isinstance(dir_labels, list) and \
       len(dir_labels) > 0:
      destination_output_path = \
        os.path.join(destination_output_path, *dir_labels)
    if collect_remote_file:
      if collection_name is None or \
         collection_type is None:
        raise ValueError('Name and type are required for db collection')
    output_file_list = list()
    temp_work_dir = \
      get_temp_dir(use_ephemeral_space=use_ephemeral_space)                   # get temp dir
    for file in file_list:
      if not os.path.exists(file):
        raise IOError('file {0} not found'.format(file))
      if os.path.isfile(file):
        copy2(
          file,
          os.path.join(
            temp_work_dir,
            os.path.basename(file)))                                          # copy file to a temp dir
        dest_file_path = \
          os.path.join(
            destination_output_path,
            os.path.basename(file))                                           # get destination file path
        os.chmod(
          os.path.join(
            temp_work_dir,
            os.path.basename(file)),
          mode=0o764)                                                         # set file permission
      elif os.path.isdir(file):
        copytree(
          file,
          os.path.join(
            temp_work_dir,
            os.path.basename(file)))                                          # copy dir to a temp dir
        dest_file_path = destination_output_path
        for root, dirs, files in os.walk(temp_work_dir):
          for dir_name in dirs:
            os.chmod(
              os.path.join(root, dir_name),
              mode=0o775)
          for file_name in files:
            os.chmod(
              os.path.join(root, file_name),
              mode=0o764)                                                     # changing file and dir permissions for remote files
      else:
        raise ValueError('Unknown source file type: {0}'.format(file))
      copy_remote_file(
        source_path=os.path.join(temp_work_dir, os.path.basename(file)),
        destinationa_path=dest_file_path,                                     # NOTE(review): keyword spelling matches copy_remote_file's signature — confirm before renaming
        destination_address='{0}@{1}'.format(remote_user, remote_host),
        force_update=force_overwrite)                                         # copy file to remote
      if os.path.isdir(file):
        dest_file_path = \
          os.path.join(
            dest_file_path,
            os.path.basename(file))                                          # fix for dir input: remote path includes the dir name
      output_file_list.append(dest_file_path)
    remove_dir(dir_path=temp_work_dir)                                        # remove temp dir
    self.param('dataflow_params',
               {'status': 'done',
                'output_list': output_file_list})                             # add dataflow params
    if collect_remote_file:
      data = list()
      remove_data_list = [
        {'name': collection_name,
         'type': collection_type}]
      for file in output_file_list:
        data.append(
          {'name': collection_name,
           'type': collection_type,
           'table': collection_table,
           'file_path': file,
           'location': file_location})
      ca = CollectionAdaptor(**{'session_class': igf_session_class})
      ca.start_session()
      try:
        ca.remove_collection_group_info(
          data=remove_data_list,
          autosave=False)                                                     # remove existing data before loading new collection
        ca.load_file_and_create_collection(
          data=data,
          autosave=False,
          calculate_file_size_and_md5=False)                                  # load remote files to db
        ca.commit_session()                                                   # commit changes
        ca.close_session()
      except Exception:                                                       # FIX: narrowed from bare except
        ca.rollback_session()                                                 # rollback changes
        ca.close_session()
        raise
  except Exception as e:
    message = \
      'project: {2}, sample:{3}, Error in {0}: {1}'.format(
        self.__class__.__name__, e, project_igf_id, sample_igf_id)            # ids are always bound here (None if params failed early)
    self.warning(message)
    self.post_message_to_slack(message, reaction='fail')                      # post msg to slack for failed jobs
    raise