def setUp(self):
    self.dbconfig = 'data/dbconfig.json'
    dbparam = read_dbconf_json(self.dbconfig)
    base = BaseAdaptor(**dbparam)
    self.engine = base.engine
    self.dbname = dbparam['dbname']
    Base.metadata.drop_all(self.engine)
    if os.path.exists(self.dbname):
        os.remove(self.dbname)
    Base.metadata.create_all(self.engine)
    self.session_class = base.get_session_class()
    base = BaseAdaptor(**{'session_class': self.session_class})
    base.start_session()
    platform_data = [{
        "platform_igf_id": "M001",
        "model_name": "MISEQ",
        "vendor_name": "ILLUMINA",
        "software_name": "RTA",
        "software_version": "RTA1.18.54"}]                           # platform data
    flowcell_rule_data = [{
        "platform_igf_id": "M001",
        "flowcell_type": "MISEQ",
        "index_1": "NO_CHANGE",
        "index_2": "NO_CHANGE"}]                                     # flowcell rule data
    pl = PlatformAdaptor(**{'session': base.session})
    pl.store_platform_data(data=platform_data)                       # loading platform data
    pl.store_flowcell_barcode_rule(data=flowcell_rule_data)          # loading flowcell rules data
    project_data = [{'project_igf_id': 'ProjectA'}]                  # project data
    pa = ProjectAdaptor(**{'session': base.session})
    pa.store_project_and_attribute_data(data=project_data)           # load project data
    sample_data = [{
        'sample_igf_id': 'SampleA',
        'project_igf_id': 'ProjectA'}]                               # sample data
    sa = SampleAdaptor(**{'session': base.session})
    sa.store_sample_and_attribute_data(data=sample_data)             # store sample data
    seqrun_data = [{
        'seqrun_igf_id': 'SeqrunA',
        'flowcell_id': '000000000-D0YLK',
        'platform_igf_id': 'M001',
        'flowcell': 'MISEQ'}]                                        # seqrun data
    sra = SeqrunAdaptor(**{'session': base.session})
    sra.store_seqrun_and_attribute_data(data=seqrun_data)            # load seqrun data
    experiment_data = [{
        'experiment_igf_id': 'ExperimentA',
        'sample_igf_id': 'SampleA',
        'library_name': 'SampleA',
        'platform_name': 'MISEQ',
        'project_igf_id': 'ProjectA'}]                               # experiment data
    ea = ExperimentAdaptor(**{'session': base.session})
    ea.store_project_and_attribute_data(data=experiment_data)        # load experiment data
    base.commit_session()
    base.close_session()
def _check_and_register_data(self, data, project_info_file):
    '''
    An internal method for checking and registering data

    :param data: A dictionary containing the following keys
                   project_data
                   user_data
                   project_user_data
                   sample_data
    :param project_info_file: A filepath for project info
    '''
    try:
        db_connected = False
        project_data = pd.DataFrame(data['project_data'])
        user_data = pd.DataFrame(data['user_data'])
        project_user_data = pd.DataFrame(data['project_user_data'])
        sample_data = pd.DataFrame(data['sample_data'])
        base = BaseAdaptor(**{'session_class': self.session_class})
        base.start_session()                                              # connect to db
        db_connected = True
        project_data = project_data[project_data[self.project_lookup_column].isnull() == False]
        project_data = project_data.drop_duplicates()
        if project_data.index.size > 0:
            project_data = \
                project_data.apply(
                    lambda x: self._check_existing_data(
                        data=x,
                        dbsession=base.session,
                        table_name='project',
                        check_column='EXISTS'),
                    axis=1)                                               # get project map
            project_data = project_data[project_data['EXISTS'] == False]  # filter existing projects
            project_data.drop('EXISTS', axis=1, inplace=True)             # remove extra column
        user_data = user_data[user_data[self.user_lookup_column].isnull() == False]
        user_data = user_data.drop_duplicates()
        if user_data.index.size > 0:
            user_data = \
                user_data.apply(
                    lambda x: self._assign_username_and_password(x),
                    axis=1)                                               # check for user account and password
            user_data = \
                user_data.apply(
                    lambda x: self._check_existing_data(
                        data=x,
                        dbsession=base.session,
                        table_name='user',
                        check_column='EXISTS'),
                    axis=1)                                               # get user map
            user_data = user_data[user_data['EXISTS'] == False]           # filter existing users
            user_data.drop('EXISTS', axis=1, inplace=True)                # remove extra column
        sample_data = sample_data[sample_data[self.sample_lookup_column].isnull() == False]
        sample_data = sample_data.drop_duplicates()
        if sample_data.index.size > 0:
            sample_data = \
                sample_data.apply(
                    lambda x: self._check_existing_data(
                        data=x,
                        dbsession=base.session,
                        table_name='sample',
                        check_column='EXISTS'),
                    axis=1)                                               # get sample map
            sample_data = sample_data[sample_data['EXISTS'] == False]     # filter existing samples
            sample_data.drop('EXISTS', axis=1, inplace=True)              # remove extra column
        project_user_data = project_user_data.drop_duplicates()
        project_user_data_mask = \
            (project_user_data[self.project_lookup_column].isnull() == False) & \
            (project_user_data[self.user_lookup_column].isnull() == False)
        project_user_data = project_user_data[project_user_data_mask]     # do not allow any empty values for project or user lookup
        if project_user_data.index.size > 0:
            project_user_data = \
                self._add_default_user_to_project(project_user_data)      # update project_user_data with default users
            project_user_data = \
                project_user_data.apply(
                    lambda x: self._check_existing_data(
                        data=x,
                        dbsession=base.session,
                        table_name='project_user',
                        check_column='EXISTS'),
                    axis=1)                                               # get project user map
            project_user_data = project_user_data[project_user_data['EXISTS'] == False]  # filter existing project users
            project_user_data.drop('EXISTS', axis=1, inplace=True)        # remove extra column
        if len(project_data.index) > 0:                                   # store new projects
            pa1 = ProjectAdaptor(**{'session': base.session})             # connect to project adaptor
            pa1.store_project_and_attribute_data(
                data=project_data, autosave=False)                        # load project data
        if len(user_data.index) > 0:                                      # store new users
            ua = UserAdaptor(**{'session': base.session})
            ua.store_user_data(data=user_data, autosave=False)            # load user data
        if len(project_user_data.index) > 0:                              # store new project users
            pa2 = ProjectAdaptor(**{'session': base.session})             # connect to project adaptor
            project_user_data = project_user_data.to_dict(orient='records')  # convert dataframe to a list of dictionaries
            pa2.assign_user_to_project(
                data=project_user_data, autosave=False)                   # load project user data
        if len(sample_data.index) > 0:                                    # store new samples
            sa = SampleAdaptor(**{'session': base.session})               # connect to sample adaptor
            sa.store_sample_and_attribute_data(
                data=sample_data, autosave=False)                         # load samples data
        if self.setup_irods:
            user_data.apply(
                lambda x: self._setup_irods_account(data=x),
                axis=1)                                                   # create irods account
        file_checksum = calculate_file_checksum(filepath=project_info_file)
        file_size = os.path.getsize(project_info_file)
        file_data = [{
            'file_path': project_info_file,
            'location': 'ORWELL',
            'md5': file_checksum,
            'size': file_size}]
        fa = FileAdaptor(**{'session': base.session})                     # connect to file adaptor
        fa.store_file_data(data=file_data, autosave=False)
    except:
        if db_connected:
            base.rollback_session()                                       # rollback session
        raise
    else:
        if db_connected:
            base.commit_session()                                         # commit changes to db
            if len(user_data.index) > 0 and self.notify_user:
                user_data.apply(
                    lambda x: self._notify_about_new_user_account(x),
                    axis=1)                                               # send mail to new users with their password, then forget it
    finally:
        if db_connected:
            base.close_session()                                          # close db connection
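# A minimal sketch of the `data` dictionary expected by _check_and_register_data,
# based only on the keys listed in its docstring. The column names shown here
# (project_igf_id, name, email_id, sample_igf_id) are illustrative assumptions;
# the real lookup columns come from self.project_lookup_column,
# self.user_lookup_column and self.sample_lookup_column.
#
# data = {
#     'project_data': [{'project_igf_id': 'ProjectA'}],
#     'user_data': [{'name': 'User A', 'email_id': 'usera@example.com'}],
#     'project_user_data': [{'project_igf_id': 'ProjectA',
#                            'email_id': 'usera@example.com'}],
#     'sample_data': [{'sample_igf_id': 'SampleA',
#                      'project_igf_id': 'ProjectA'}]}
# self._check_and_register_data(
#     data=data,
#     project_info_file='project_info.csv')   # hypothetical project info filepath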
def setUp(self):
    self.path = 'data/seqrun_dir'
    self.dbconfig = 'data/dbconfig.json'
    self.md5_out_path = 'data/md5_dir'
    self.pipeline_name = 'demultiplexing_fastq'
    seqrun_json = 'data/seqrun_db_data.json'
    platform_json = 'data/platform_db_data.json'
    pipeline_json = 'data/pipeline_data.json'
    os.mkdir(self.md5_out_path)
    dbparam = None
    with open(self.dbconfig, 'r') as json_data:
        dbparam = json.load(json_data)
    base = BaseAdaptor(**dbparam)
    self.engine = base.engine
    self.dbname = dbparam['dbname']
    self.pipeline_name = ''
    Base.metadata.create_all(self.engine)
    base.start_session()
    user_data = [{
        'name': 'user1',
        'email_id': '*****@*****.**',
        'username': '******'
    }]
    ua = UserAdaptor(**{'session': base.session})
    ua.store_user_data(data=user_data)
    project_data = [{
        'project_igf_id': 'project_1',
        'project_name': 'test_22-8-2017_rna',
        'description': 'Its project 1',
        'project_deadline': 'Before August 2017',
        'comments': 'Some samples are treated with drug X',
    }]
    pa = ProjectAdaptor(**{'session': base.session})
    pa.store_project_and_attribute_data(data=project_data)
    project_user_data = [{
        'project_igf_id': 'project_1',
        'email_id': '*****@*****.**',
        'data_authority': True
    }]
    pa.assign_user_to_project(data=project_user_data)
    sample_data = [
        {'sample_igf_id': 'IGF0001', 'project_igf_id': 'project_1'},
        {'sample_igf_id': 'IGF0002', 'project_igf_id': 'project_1'},
        {'sample_igf_id': 'IGF0003', 'project_igf_id': 'project_1'},
    ]
    sa = SampleAdaptor(**{'session': base.session})
    sa.store_sample_and_attribute_data(data=sample_data)
    base.commit_session()
    with open(pipeline_json, 'r') as json_data:       # store pipeline data to db
        pipeline_data = json.load(json_data)
        pa = PipelineAdaptor(**{'session': base.session})
        pa.store_pipeline_data(data=pipeline_data)
    with open(platform_json, 'r') as json_data:       # store platform data to db
        platform_data = json.load(json_data)
        pl = PlatformAdaptor(**{'session': base.session})
        pl.store_platform_data(data=platform_data)
    with open(seqrun_json, 'r') as json_data:         # store seqrun data to db
        seqrun_data = json.load(json_data)
        sra = SeqrunAdaptor(**{'session': base.session})
        sra.store_seqrun_and_attribute_data(data=seqrun_data)
    base.close_session()
def load_file_to_disk_and_db(
        self,
        input_file_list,
        withdraw_exisitng_collection=True,
        autosave_db=True,
        file_suffix=None,
        force=True,
        remove_file=False):
    '''
    A method for loading analysis results to disk and database. Files will be
    moved to a new path if base_path is present. The directory structure of the
    final path is based on the collection_table information.

    Following will be the final directory structure if base_path is present

    project - base_path/project_igf_id/analysis_name
    sample - base_path/project_igf_id/sample_igf_id/analysis_name
    experiment - base_path/project_igf_id/sample_igf_id/experiment_igf_id/analysis_name
    run - base_path/project_igf_id/sample_igf_id/experiment_igf_id/run_igf_id/analysis_name

    :param input_file_list: A list of input files to load, all using the same collection info
    :param withdraw_exisitng_collection: Remove existing collection group, DO NOT use this while loading a list of files
    :param autosave_db: Save changes to database, default True
    :param file_suffix: Use a specific file suffix, use None if it should be the same as the original file,
                        e.g. input.vcf.gz to output.vcf.gz
    :param force: Toggle for removing existing files, default True
    :param remove_file: A toggle for removing the existing file from disk, default False
    :returns: A list of final filepaths
    '''
    try:
        project_igf_id = None
        sample_igf_id = None
        experiment_igf_id = None
        run_igf_id = None
        output_path_list = list()                                     # define empty output list
        dbconnected = False
        if self.collection_name is None or \
           self.collection_type is None or \
           self.collection_table is None:
            raise ValueError('File collection information is incomplete')  # check for collection information
        base = BaseAdaptor(**{'session_class': self.dbsession_class})
        base.start_session()                                          # connect to db
        dbconnected = True
        if self.base_path is not None:
            if self.collection_table == 'sample':
                sa = SampleAdaptor(**{'session': base.session})
                sample_igf_id = self.collection_name
                sample_exists = sa.check_sample_records_igf_id(
                    sample_igf_id=sample_igf_id)
                if not sample_exists:
                    raise ValueError('Sample {0} not found in db'.format(sample_igf_id))
                project_igf_id = \
                    sa.fetch_sample_project(sample_igf_id=sample_igf_id)  # fetch project id for sample
            elif self.collection_table == 'experiment':
                ea = ExperimentAdaptor(**{'session': base.session})
                experiment_igf_id = self.collection_name
                experiment_exists = \
                    ea.check_experiment_records_id(
                        experiment_igf_id=experiment_igf_id)
                if not experiment_exists:
                    raise ValueError('Experiment {0} not present in database'.format(experiment_igf_id))
                (project_igf_id, sample_igf_id) = \
                    ea.fetch_project_and_sample_for_experiment(
                        experiment_igf_id=experiment_igf_id)          # fetch project and sample id for experiment
            elif self.collection_table == 'run':
                ra = RunAdaptor(**{'session': base.session})
                run_igf_id = self.collection_name
                run_exists = ra.check_run_records_igf_id(run_igf_id=run_igf_id)
                if not run_exists:
                    raise ValueError('Run {0} not found in database'.format(run_igf_id))
                (project_igf_id, sample_igf_id, experiment_igf_id) = \
                    ra.fetch_project_sample_and_experiment_for_run(
                        run_igf_id=run_igf_id)                        # fetch project, sample and experiment id for run
            elif self.collection_table == 'project':
                pa = ProjectAdaptor(**{'session': base.session})
                project_igf_id = self.collection_name
                project_exists = \
                    pa.check_project_records_igf_id(
                        project_igf_id=project_igf_id)
                if not project_exists:
                    raise ValueError('Project {0} not found in database'.format(project_igf_id))
        if self.rename_file and self.analysis_name is None:
            raise ValueError('Analysis name is required for renaming file')  # check analysis name
        for input_file in input_file_list:
            final_path = ''
            if self.base_path is None:                                # do not move file if base_path is absent
                final_path = os.path.dirname(input_file)
            else:                                                     # move file path
                if self.collection_table == 'project':
                    if project_igf_id is None:
                        raise ValueError('Missing project id for collection {0}'.format(self.collection_name))
                    final_path = \
                        os.path.join(
                            self.base_path,
                            project_igf_id,
                            self.analysis_name)                       # final path for project
                elif self.collection_table == 'sample':
                    if project_igf_id is None or \
                       sample_igf_id is None:
                        raise ValueError('Missing project and sample id for collection {0}'.format(self.collection_name))
                    final_path = \
                        os.path.join(
                            self.base_path,
                            project_igf_id,
                            sample_igf_id,
                            self.analysis_name)                       # final path for sample
                elif self.collection_table == 'experiment':
                    if project_igf_id is None or \
                       sample_igf_id is None or \
                       experiment_igf_id is None:
                        raise ValueError('Missing project, sample and experiment id for collection {0}'.format(self.collection_name))
                    final_path = \
                        os.path.join(
                            self.base_path,
                            project_igf_id,
                            sample_igf_id,
                            experiment_igf_id,
                            self.analysis_name)                       # final path for experiment
                elif self.collection_table == 'run':
                    if project_igf_id is None or \
                       sample_igf_id is None or \
                       experiment_igf_id is None or \
                       run_igf_id is None:
                        raise ValueError('Missing project, sample, experiment and run id for collection {0}'.format(self.collection_name))
                    final_path = \
                        os.path.join(
                            self.base_path,
                            project_igf_id,
                            sample_igf_id,
                            experiment_igf_id,
                            run_igf_id,
                            self.analysis_name)                       # final path for run
            if self.rename_file:
                new_filename = \
                    self.get_new_file_name(
                        input_file=input_file,
                        file_suffix=file_suffix)
                final_path = \
                    os.path.join(
                        final_path,
                        new_filename)                                 # get new filepath
            else:
                final_path = \
                    os.path.join(
                        final_path,
                        os.path.basename(input_file))
            if final_path != input_file:                              # move file if required
                final_path = preprocess_path_name(
                    input_path=final_path)                            # remove unexpected characters from file path
                move_file(
                    source_path=input_file,
                    destinationa_path=final_path,
                    force=force)                                      # move or overwrite file to destination dir
            output_path_list.append(final_path)                       # add final path to the output list
            self.create_or_update_analysis_collection(
                file_path=final_path,
                dbsession=base.session,
                withdraw_exisitng_collection=withdraw_exisitng_collection,
                remove_file=remove_file,
                autosave_db=autosave_db)                              # load new file collection in db
            if autosave_db:
                base.commit_session()                                 # save changes to db for each file
        base.commit_session()                                         # save changes to db
        base.close_session()                                          # close db connection
        return output_path_list
    except:
        if dbconnected:
            base.rollback_session()
            base.close_session()
        raise
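# A minimal usage sketch for load_file_to_disk_and_db. The class name used here
# ('AnalysisCollectionUtils') is a placeholder, since the surrounding class is not
# shown in this fragment; the constructor arguments simply mirror the attributes
# referenced above (dbsession_class, collection_name, collection_type,
# collection_table, base_path, analysis_name, rename_file).
#
# au = AnalysisCollectionUtils(
#     dbsession_class=session_class,       # SQLAlchemy session class from BaseAdaptor
#     collection_name='SampleA',           # matches an existing sample_igf_id
#     collection_type='ANALYSIS_CRAM',     # hypothetical collection type label
#     collection_table='sample',           # one of project / sample / experiment / run
#     base_path='/path/to/analysis',       # files are moved under this directory
#     analysis_name='primary_analysis',
#     rename_file=True)
# output_files = au.load_file_to_disk_and_db(
#     input_file_list=['/tmp/SampleA.cram'],
#     withdraw_exisitng_collection=True,   # only safe when loading a single file
#     autosave_db=True)
# # With collection_table='sample', each file would land under
# # base_path/<project_igf_id>/SampleA/primary_analysis/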
def setUp(self):
    self.dbconfig = 'data/dbconfig.json'
    dbparam = read_dbconf_json(self.dbconfig)
    base = BaseAdaptor(**dbparam)
    self.engine = base.engine
    self.dbname = dbparam['dbname']
    Base.metadata.drop_all(self.engine)
    if os.path.exists(self.dbname):
        os.remove(self.dbname)
    Base.metadata.create_all(self.engine)
    self.session_class = base.get_session_class()
    self.temp_work_dir = get_temp_dir()
    self.temp_base_dir = get_temp_dir()
    self.input_list = ['a.cram', 'a.vcf.gz', 'b.tar.gz']
    for file_name in self.input_list:
        file_path = os.path.join(self.temp_work_dir, file_name)
        with open(file_path, 'w') as fq:
            fq.write('AAAA')                                          # create input files
    base = BaseAdaptor(**{'session_class': self.session_class})
    base.start_session()
    platform_data = [{
        "platform_igf_id": "M001",
        "model_name": "MISEQ",
        "vendor_name": "ILLUMINA",
        "software_name": "RTA",
        "software_version": "RTA1.18.54"
    }]                                                                # platform data
    flowcell_rule_data = [{
        "platform_igf_id": "M001",
        "flowcell_type": "MISEQ",
        "index_1": "NO_CHANGE",
        "index_2": "NO_CHANGE"
    }]                                                                # flowcell rule data
    pl = PlatformAdaptor(**{'session': base.session})
    pl.store_platform_data(data=platform_data)                        # loading platform data
    pl.store_flowcell_barcode_rule(
        data=flowcell_rule_data)                                      # loading flowcell rules data
    project_data = [{'project_igf_id': 'ProjectA'}]                   # project data
    pa = ProjectAdaptor(**{'session': base.session})
    pa.store_project_and_attribute_data(
        data=project_data)                                            # load project data
    sample_data = [{
        'sample_igf_id': 'SampleA',
        'project_igf_id': 'ProjectA'
    }]                                                                # sample data
    sa = SampleAdaptor(**{'session': base.session})
    sa.store_sample_and_attribute_data(
        data=sample_data)                                             # store sample data
    seqrun_data = [{
        'seqrun_igf_id': 'SeqrunA',
        'flowcell_id': '000000000-D0YLK',
        'platform_igf_id': 'M001',
        'flowcell': 'MISEQ'
    }]                                                                # seqrun data
    sra = SeqrunAdaptor(**{'session': base.session})
    sra.store_seqrun_and_attribute_data(
        data=seqrun_data)                                             # load seqrun data
    experiment_data = [{
        'experiment_igf_id': 'ExperimentA',
        'sample_igf_id': 'SampleA',
        'library_name': 'SampleA',
        'platform_name': 'MISEQ',
        'project_igf_id': 'ProjectA'
    }]                                                                # experiment data
    ea = ExperimentAdaptor(**{'session': base.session})
    ea.store_project_and_attribute_data(
        data=experiment_data)                                         # load experiment data
    run_data = [{
        'run_igf_id': 'RunA',
        'experiment_igf_id': 'ExperimentA',
        'seqrun_igf_id': 'SeqrunA',
        'lane_number': '1'
    }]                                                                # run data
    ra = RunAdaptor(**{'session': base.session})
    ra.store_run_and_attribute_data(data=run_data)                    # load run data
    base.commit_session()
    base.close_session()
pipeline_seed_data = [
    {'pipeline_name': 'PrimaryAnalysis', 'seed_id': 1, 'seed_table': 'experiment'},
    {'pipeline_name': 'PrimaryAnalysis', 'seed_id': 2, 'seed_table': 'experiment'},
    {'pipeline_name': 'PrimaryAnalysis', 'seed_id': 3, 'seed_table': 'experiment'},
]
pla.store_pipeline_data(data=pipeline_data)
pla.create_pipeline_seed(data=pipeline_seed_data)
base.commit_session()
base.close_session()
ps = Project_status(
    igf_session_class=base.get_session_class(),
    project_igf_id='ProjectA')
#print(ps.get_seqrun_info(demultiplexing_pipeline='DemultiplexIlluminaFastq'))
#print(ps.get_seqrun_info(active_seqrun_igf_id='SeqrunA'))
#print(ps.get_seqrun_info(demultiplexing_pipeline='DemultiplexIlluminaFastq',
#                         active_seqrun_igf_id='180410_K00345_0063_AHWL7CBBXX'))
#print(ps.get_status_description())
#print(ps.get_status_column_order())
#print(ps.get_analysis_info(analysis_pipeline='PrimaryAnalysis'))
#ps.generate_gviz_json_file(output_file='a',
#                           demultiplexing_pipeline='DemultiplexIlluminaFastq',
#                           analysis_pipeline='PrimaryAnalysis',
#                           active_seqrun_igf_id='180410_K00345_0063_AHWL7CBBXX')
Base.metadata.drop_all(engine)
def _build_and_store_exp_run_and_collection_in_db(
        self, fastq_files_list, restricted_list=('10X',)):
    '''
    An internal method for building db collections for the raw fastq files
    '''
    session_class = self.session_class
    db_connected = False
    try:
        restricted_list = list(restricted_list)
        dataframe = pd.DataFrame(fastq_files_list)
        # calculate additional details
        dataframe = dataframe.apply(
            lambda data: self._calculate_experiment_run_and_file_info(
                data, restricted_list),
            axis=1)
        # get file data
        file_group_columns = [
            'name', 'type', 'location',
            'R1', 'R1_md5', 'R1_size',
            'R2', 'R2_md5', 'R2_size']
        file_group_data = dataframe.loc[:, file_group_columns]
        file_group_data = file_group_data.drop_duplicates()
        (file_data, file_group_data) = \
            self._reformat_file_group_data(data=file_group_data)
        # get base session
        base = BaseAdaptor(**{'session_class': session_class})
        base.start_session()
        db_connected = True
        # get experiment data
        experiment_columns = \
            base.get_table_columns(
                table_name=Experiment,
                excluded_columns=['experiment_id', 'project_id', 'sample_id'])
        experiment_columns.extend(['project_igf_id', 'sample_igf_id'])
        exp_data = dataframe.loc[:, experiment_columns]
        exp_data = exp_data.drop_duplicates()
        if exp_data.index.size > 0:
            exp_data = exp_data.apply(
                lambda x: self._check_existing_data(
                    data=x,
                    dbsession=base.session,
                    table_name='experiment',
                    check_column='EXISTS'),
                axis=1)
            exp_data = exp_data[exp_data['EXISTS'] == False]              # filter existing experiments
            exp_data.drop('EXISTS', axis=1, inplace=True)                 # remove extra columns
            exp_data = exp_data[pd.isnull(exp_data['experiment_igf_id']) == False]  # filter experiments with null values
        # get run data
        run_columns = \
            base.get_table_columns(
                table_name=Run,
                excluded_columns=[
                    'run_id', 'seqrun_id', 'experiment_id',
                    'date_created', 'status'])
        run_columns.extend([
            'seqrun_igf_id', 'experiment_igf_id',
            'R1_READ_COUNT', 'R2_READ_COUNT'])
        run_data = dataframe.loc[:, run_columns]
        run_data = run_data.drop_duplicates()
        if run_data.index.size > 0:
            run_data = run_data.apply(
                lambda x: self._check_existing_data(
                    data=x,
                    dbsession=base.session,
                    table_name='run',
                    check_column='EXISTS'),
                axis=1)
            run_data = run_data[run_data['EXISTS'] == False]              # filter existing runs
            run_data.drop('EXISTS', axis=1, inplace=True)                 # remove extra columns
            run_data = run_data[pd.isnull(run_data['run_igf_id']) == False]  # filter runs with null values
        # get collection data
        collection_columns = ['name', 'type', 'table']
        collection_data = dataframe.loc[:, collection_columns]
        collection_data = collection_data.drop_duplicates()
        if collection_data.index.size > 0:
            collection_data = collection_data.apply(
                lambda x: self._check_existing_data(
                    data=x,
                    dbsession=base.session,
                    table_name='collection',
                    check_column='EXISTS'),
                axis=1)
            collection_data = collection_data[collection_data['EXISTS'] == False]  # filter existing collections
            collection_data.drop('EXISTS', axis=1, inplace=True)          # remove extra columns
            collection_data = collection_data[pd.isnull(collection_data['name']) == False]  # filter collections with null values
        # store experiment to db
        if exp_data.index.size > 0:
            ea = ExperimentAdaptor(**{'session': base.session})
            ea.store_project_and_attribute_data(data=exp_data, autosave=False)
            base.session.flush()
        # store run to db
        if run_data.index.size > 0:
            ra = RunAdaptor(**{'session': base.session})
            ra.store_run_and_attribute_data(data=run_data, autosave=False)
            base.session.flush()
        # store file to db
        fa = FileAdaptor(**{'session': base.session})
        fa.store_file_and_attribute_data(data=file_data, autosave=False)
        base.session.flush()
        # store collection to db
        ca = CollectionAdaptor(**{'session': base.session})
        if collection_data.index.size > 0:
            ca.store_collection_and_attribute_data(
                data=collection_data, autosave=False)
            base.session.flush()
        ca.create_collection_group(data=file_group_data, autosave=False)
        base.commit_session()
        self._write_manifest_file(file_data)
    except:
        if db_connected:
            base.rollback_session()
        raise
    finally:
        if db_connected:
            base.close_session()
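# A rough sketch of what one entry of fastq_files_list might look like, inferred
# only from the columns consumed above (file group, collection, and the extended
# experiment/run columns). The exact input keys actually depend on
# _calculate_experiment_run_and_file_info, which is not shown here, so treat the
# field names and values below as assumptions for illustration.
#
# fastq_files_list = [{
#     'project_igf_id': 'ProjectA',
#     'sample_igf_id': 'SampleA',
#     'seqrun_igf_id': 'SeqrunA',
#     'R1': '/path/to/SampleA_S1_L001_R1_001.fastq.gz',
#     'R1_md5': 'd41d8cd98f00b204e9800998ecf8427e',
#     'R1_size': 1024,
#     'R2': '/path/to/SampleA_S1_L001_R2_001.fastq.gz',
#     'R2_md5': 'd41d8cd98f00b204e9800998ecf8427e',
#     'R2_size': 1024,
#     'R1_READ_COUNT': 1000,
#     'R2_READ_COUNT': 1000}]
# self._build_and_store_exp_run_and_collection_in_db(
#     fastq_files_list=fastq_files_list)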