def setUp(self):
    '''
    Create the database tables and seed them with a small MISEQ fixture.

    The database settings are read from ``data/dbconfig.json``. All records
    are stored through one shared session in this order: platform and
    flowcell barcode rule, project, sample, seqrun, pipelines, files,
    collections and their file groups, experiments and finally runs.
    The session is closed at the end; no explicit commit is issued here
    (presumably the store methods save their own changes - TODO confirm).
    '''
    self.dbconfig = 'data/dbconfig.json'
    db_settings = read_dbconf_json(self.dbconfig)
    base = BaseAdaptor(**db_settings)
    self.engine = base.engine
    self.dbname = db_settings['dbname']
    Base.metadata.create_all(self.engine)
    self.session_class = base.get_session_class()
    base.start_session()

    # identifiers shared by several fixture records
    platform_id = 'M03291'
    project_id = 'IGFQ000123_avik_10-4-2018_Miseq'
    sample_id = 'IGF103923'
    seqrun_id = '180416_M03291_0139_000000000-BRN47'
    first_run_id = 'IGF103923_MISEQ_000000000-BRN47_1'
    second_run_id = 'IGF103923_MISEQ1_000000000-BRN47_1'
    fastq_type = 'demultiplexed_fastq'
    fastq_md5 = 'fd5a95c18ebb7145645e95ce08d729e4'
    pipeline_db = 'sqlite:////bcl2fastq.db'

    # platform and flowcell barcode rule
    platform_records = [{
        'platform_igf_id': platform_id,
        'model_name': 'MISEQ',
        'vendor_name': 'ILLUMINA',
        'software_name': 'RTA',
        'software_version': 'RTA1.18.54',
    }]
    flowcell_rule_records = [{
        'platform_igf_id': platform_id,
        'flowcell_type': 'MISEQ',
        'index_1': 'NO_CHANGE',
        'index_2': 'NO_CHANGE',
    }]
    platform_adaptor = PlatformAdaptor(session=base.session)
    platform_adaptor.store_platform_data(data=platform_records)
    platform_adaptor.store_flowcell_barcode_rule(data=flowcell_rule_records)

    # project
    project_records = [{'project_igf_id': project_id}]
    project_adaptor = ProjectAdaptor(session=base.session)
    project_adaptor.store_project_and_attribute_data(data=project_records)

    # sample
    sample_records = [{
        'sample_igf_id': sample_id,
        'project_igf_id': project_id,
        'species_name': 'HG38',
    }]
    sample_adaptor = SampleAdaptor(session=base.session)
    sample_adaptor.store_sample_and_attribute_data(data=sample_records)

    # seqrun
    seqrun_records = [{
        'seqrun_igf_id': seqrun_id,
        'flowcell_id': '000000000-BRN47',
        'platform_igf_id': platform_id,
        'flowcell': 'MISEQ',
    }]
    seqrun_adaptor = SeqrunAdaptor(session=base.session)
    seqrun_adaptor.store_seqrun_and_attribute_data(data=seqrun_records)

    # pipelines
    pipeline_records = [
        {'pipeline_name': pipeline_name, 'pipeline_db': pipeline_db}
        for pipeline_name in ('PrimaryAnalysis', 'DemultiplexIlluminaFastq')
    ]
    pipeline_adaptor = PipelineAdaptor(session=base.session)
    pipeline_adaptor.store_pipeline_data(data=pipeline_records)

    # fastq files: (path, size) pairs, all sharing location and md5
    fastq_files = [
        ('/path/S20180405S_S1_L001_R1_001.fastq.gz', '1528121404'),
        ('/path/S20180405S_S1_L001_R2_001.fastq.gz', '1467047580'),
        ('/path/S20180405S_S3_L001_R2_001.fastq.gz', '1467047580'),
    ]
    file_records = [
        {
            'file_path': fastq_path,
            'location': 'HPC_PROJECT',
            'md5': fastq_md5,
            'size': fastq_size,
        }
        for fastq_path, fastq_size in fastq_files
    ]
    file_adaptor = FileAdaptor(session=base.session)
    file_adaptor.store_file_and_attribute_data(data=file_records)

    # one collection per run id, then attach the fastq files to them
    collection_records = [
        {'name': run_id, 'type': fastq_type, 'table': 'run'}
        for run_id in (first_run_id, second_run_id)
    ]
    collection_members = [
        (first_run_id, '/path/S20180405S_S1_L001_R1_001.fastq.gz'),
        (first_run_id, '/path/S20180405S_S1_L001_R2_001.fastq.gz'),
        (second_run_id, '/path/S20180405S_S3_L001_R2_001.fastq.gz'),
    ]
    collection_file_records = [
        {'name': run_id, 'type': fastq_type, 'file_path': fastq_path}
        for run_id, fastq_path in collection_members
    ]
    collection_adaptor = CollectionAdaptor(session=base.session)
    collection_adaptor.store_collection_and_attribute_data(
        data=collection_records)
    collection_adaptor.create_collection_group(data=collection_file_records)

    # experiments
    experiment_records = [
        {
            'project_igf_id': project_id,
            'sample_igf_id': sample_id,
            'experiment_igf_id': 'IGF103923_MISEQ',
            'library_name': 'IGF103923',
            'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
            'library_strategy': 'RNA-SEQ',
            'experiment_type': 'TENX-TRANSCRIPTOME-3P',
            'library_layout': 'PAIRED',
            'platform_name': 'MISEQ',
        },
        {
            'project_igf_id': project_id,
            'sample_igf_id': sample_id,
            'experiment_igf_id': 'IGF103923_MISEQ1',
            'library_name': 'IGF103923_1',
            'library_source': 'GENOMIC_SINGLE_CELL',
            'library_strategy': 'WGS',
            'experiment_type': 'UNKNOWN',
            'library_layout': 'PAIRED',
            'platform_name': 'MISEQ',
        },
    ]
    experiment_adaptor = ExperimentAdaptor(session=base.session)
    experiment_adaptor.store_project_and_attribute_data(
        data=experiment_records)

    # runs: (experiment id, run id) pairs on the single seqrun, lane 1
    run_members = [
        ('IGF103923_MISEQ', first_run_id),
        ('IGF103923_MISEQ1', second_run_id),
    ]
    run_records = [
        {
            'experiment_igf_id': experiment_id,
            'seqrun_igf_id': seqrun_id,
            'run_igf_id': run_id,
            'lane_number': '1',
        }
        for experiment_id, run_id in run_members
    ]
    run_adaptor = RunAdaptor(session=base.session)
    run_adaptor.store_run_and_attribute_data(data=run_records)

    base.close_session()
def setUp(self):
    '''
    Create the database tables and seed a three-platform fixture.

    The database settings are read from ``data/dbconfig.json``. Records are
    stored through one shared session in this order: platforms and flowcell
    barcode rules, seqruns, project, samples, experiments, runs, files,
    collections and finally the collection/file groups. The session is
    closed at the end; no explicit commit is issued here (presumably the
    store methods save their own changes - TODO confirm).
    '''
    self.dbconfig = 'data/dbconfig.json'
    db_settings = read_dbconf_json(self.dbconfig)
    base = BaseAdaptor(**db_settings)
    self.engine = base.engine
    self.dbname = db_settings['dbname']
    Base.metadata.create_all(self.engine)
    self.session_class = base.get_session_class()
    base.start_session()

    # identifiers shared by several fixture records
    project_id = 'projectA'
    miseq_seqrun_id = '180416_M03291_0139_000000000-BRN47'
    nextseq_seqrun_id = '180416_NB03291_013_000000001-BRN47'
    fastq_type = 'demultiplexed_fastq'

    # platforms: (platform id, model, software version)
    platform_specs = [
        ('M03291', 'MISEQ', 'RTA1.18.54'),
        ('NB501820', 'NEXTSEQ', 'RTA2'),
        ('K00345', 'HISEQ4000', 'RTA2'),
    ]
    platform_records = [
        {
            'platform_igf_id': platform_id,
            'model_name': model_name,
            'vendor_name': 'ILLUMINA',
            'software_name': 'RTA',
            'software_version': software_version,
        }
        for platform_id, model_name, software_version in platform_specs
    ]
    # flowcell rules: (platform id, flowcell type, index_2 rule)
    flowcell_rule_specs = [
        ('K00345', 'HiSeq 3000/4000 SR', 'NO_CHANGE'),
        ('K00345', 'HiSeq 3000/4000 PE', 'REVCOMP'),
        ('NB501820', 'NEXTSEQ', 'REVCOMP'),
        ('M03291', 'MISEQ', 'NO_CHANGE'),
    ]
    flowcell_rule_records = [
        {
            'platform_igf_id': platform_id,
            'flowcell_type': flowcell_type,
            'index_1': 'NO_CHANGE',
            'index_2': index_2_rule,
        }
        for platform_id, flowcell_type, index_2_rule in flowcell_rule_specs
    ]
    platform_adaptor = PlatformAdaptor(session=base.session)
    platform_adaptor.store_platform_data(data=platform_records)
    platform_adaptor.store_flowcell_barcode_rule(data=flowcell_rule_records)

    # seqruns
    seqrun_records = [
        {
            'seqrun_igf_id': miseq_seqrun_id,
            'flowcell_id': '000000000-BRN47',
            'platform_igf_id': 'M03291',
            'flowcell': 'MISEQ',
        },
        {
            'seqrun_igf_id': nextseq_seqrun_id,
            'flowcell_id': '000000001-BRN47',
            'platform_igf_id': 'NB501820',
            'flowcell': 'NEXTSEQ',
        },
    ]
    seqrun_adaptor = SeqrunAdaptor(session=base.session)
    seqrun_adaptor.store_seqrun_and_attribute_data(data=seqrun_records)

    # project
    project_records = [{'project_igf_id': project_id}]
    project_adaptor = ProjectAdaptor(session=base.session)
    project_adaptor.store_project_and_attribute_data(data=project_records)

    # samples: (sample id, species)
    sample_records = [
        {
            'sample_igf_id': sample_id,
            'project_igf_id': project_id,
            'species_name': species_name,
        }
        for sample_id, species_name in (
            ('sampleA', 'HG38'),
            ('sampleB', 'UNKNOWN'),
        )
    ]
    sample_adaptor = SampleAdaptor(session=base.session)
    sample_adaptor.store_sample_and_attribute_data(data=sample_records)

    # experiments: (sample id, experiment id, library source, platform name)
    experiment_specs = [
        ('sampleA', 'sampleA_MISEQ', 'TRANSCRIPTOMIC_SINGLE_CELL', 'MISEQ'),
        ('sampleA', 'sampleA_NEXTSEQ', 'UNKNOWN', 'NEXTSEQ'),
        ('sampleB', 'sampleB_MISEQ', 'TRANSCRIPTOMIC_SINGLE_CELL', 'MISEQ'),
    ]
    experiment_records = [
        {
            'project_igf_id': project_id,
            'sample_igf_id': sample_id,
            'experiment_igf_id': experiment_id,
            'library_name': sample_id,
            'library_source': library_source,
            'library_strategy': 'RNA-SEQ',
            'experiment_type': 'TENX-TRANSCRIPTOME-3P',
            'library_layout': 'PAIRED',
            'platform_name': platform_name,
        }
        for sample_id, experiment_id, library_source, platform_name
        in experiment_specs
    ]
    experiment_adaptor = ExperimentAdaptor(session=base.session)
    experiment_adaptor.store_project_and_attribute_data(
        data=experiment_records)

    # runs: (experiment id, seqrun id, run id, lane)
    run_specs = [
        ('sampleA_MISEQ', miseq_seqrun_id,
         'sampleA_MISEQ_000000000-BRN47_1', '1'),
        ('sampleA_NEXTSEQ', nextseq_seqrun_id,
         'sampleA_NEXTSEQ_000000001-BRN47_2', '2'),
        ('sampleB_MISEQ', miseq_seqrun_id,
         'sampleB_MISEQ_HVWN7BBXX_1', '1'),
    ]
    run_records = [
        {
            'experiment_igf_id': experiment_id,
            'seqrun_igf_id': seqrun_id,
            'run_igf_id': run_id,
            'lane_number': lane_number,
        }
        for experiment_id, seqrun_id, run_id, lane_number in run_specs
    ]
    run_adaptor = RunAdaptor(session=base.session)
    run_adaptor.store_run_and_attribute_data(data=run_records)

    # every run gets one R1 fastq file and one collection named after it
    run_ids = [run_id for _, _, run_id, _ in run_specs]
    fastq_paths = [
        '/path/{0}_R1.fastq.gz'.format(run_id) for run_id in run_ids
    ]

    file_records = [
        {
            'file_path': fastq_path,
            'location': 'HPC_PROJECT',
            'md5': 'fd5a95c18ebb7145645e95ce08d729e4',
            'size': '1528121404',
        }
        for fastq_path in fastq_paths
    ]
    file_adaptor = FileAdaptor(session=base.session)
    file_adaptor.store_file_and_attribute_data(data=file_records)

    collection_records = [
        {'name': run_id, 'type': fastq_type, 'table': 'run'}
        for run_id in run_ids
    ]
    collection_file_records = [
        {'name': run_id, 'type': fastq_type, 'file_path': fastq_path}
        for run_id, fastq_path in zip(run_ids, fastq_paths)
    ]
    collection_adaptor = CollectionAdaptor(session=base.session)
    collection_adaptor.store_collection_and_attribute_data(
        data=collection_records)
    collection_adaptor.create_collection_group(data=collection_file_records)

    base.close_session()
def load_file_to_disk_and_db(self, input_file_list,
                             withdraw_exisitng_collection=True,
                             autosave_db=True, file_suffix=None,
                             force=True, remove_file=False):
    '''
    A method for loading analysis results to disk and database. File will be
    moved to a new path if base_path is present. Directory structure of the
    final path is based on the collection_table information.

    Following will be the final directory structure if base_path is present

    project - base_path/project_igf_id/analysis_name
    sample - base_path/project_igf_id/sample_igf_id/analysis_name
    experiment - base_path/project_igf_id/sample_igf_id/experiment_igf_id/analysis_name
    run - base_path/project_igf_id/sample_igf_id/experiment_igf_id/run_igf_id/analysis_name

    If base_path is None, each file stays in its own directory (and is only
    moved when renaming gives it a new name).

    :param input_file_list: A list of input file to load, all using the same
                            collection info
    :param withdraw_exisitng_collection: Remove existing collection group,
                            DO NOT use this while loading a list of files
    :param autosave_db: Passed to create_or_update_analysis_collection and,
                        when True, commits after every file, default True.
                        NOTE: a final commit is always issued before the
                        session is closed, regardless of this flag
    :param file_suffix: Use a specific file suffix, use None if it should be
                        same as original file
                        e.g. input.vcf.gz to output.vcf.gz
    :param force: Passed to move_file; toggle for overwriting an existing
                  file at the destination, default True
    :param remove_file: Passed to create_or_update_analysis_collection; a
                        toggle for removing existing file from disk,
                        default False
    :returns: A list of final filepath
    :raises ValueError: if the collection information is incomplete, the
                        collection record is not found in the database, the
                        analysis name is missing while renaming, or an id
                        needed for the target path could not be resolved
    '''
    base = None
    dbconnected = False
    try:
        project_igf_id = None
        sample_igf_id = None
        experiment_igf_id = None
        run_igf_id = None
        output_path_list = list()                                   # define empty output list
        if self.collection_name is None or \
           self.collection_type is None or \
           self.collection_table is None:
            raise ValueError(
                'File collection information is incomplete')       # check for collection information

        base = BaseAdaptor(**{'session_class': self.dbsession_class})
        base.start_session()                                        # connect to db
        dbconnected = True
        if self.base_path is not None:
            # ids are only needed for building the target directory
            if self.collection_table == 'sample':
                sa = SampleAdaptor(**{'session': base.session})
                sample_igf_id = self.collection_name
                sample_exists = \
                    sa.check_sample_records_igf_id(
                        sample_igf_id=sample_igf_id)
                if not sample_exists:
                    raise ValueError(
                        'Sample {0} not found in db'.
                        format(sample_igf_id))
                project_igf_id = \
                    sa.fetch_sample_project(
                        sample_igf_id=sample_igf_id)                # fetch project id for sample
            elif self.collection_table == 'experiment':
                ea = ExperimentAdaptor(**{'session': base.session})
                experiment_igf_id = self.collection_name
                experiment_exists = \
                    ea.check_experiment_records_id(
                        experiment_igf_id=experiment_igf_id)
                if not experiment_exists:
                    raise ValueError(
                        'Experiment {0} not present in database'.
                        format(experiment_igf_id))
                (project_igf_id, sample_igf_id) = \
                    ea.fetch_project_and_sample_for_experiment(
                        experiment_igf_id=experiment_igf_id)        # fetch project and sample id for experiment
            elif self.collection_table == 'run':
                ra = RunAdaptor(**{'session': base.session})
                run_igf_id = self.collection_name
                run_exists = \
                    ra.check_run_records_igf_id(
                        run_igf_id=run_igf_id)
                if not run_exists:
                    raise ValueError(
                        'Run {0} not found in database'.
                        format(run_igf_id))
                (project_igf_id, sample_igf_id, experiment_igf_id) = \
                    ra.fetch_project_sample_and_experiment_for_run(
                        run_igf_id=run_igf_id)                      # fetch project, sample and experiment id for run
            elif self.collection_table == 'project':
                pa = ProjectAdaptor(**{'session': base.session})
                project_igf_id = self.collection_name
                project_exists = \
                    pa.check_project_records_igf_id(
                        project_igf_id=project_igf_id)
                if not project_exists:
                    raise ValueError(
                        'Project {0} not found in database'.
                        format(project_igf_id))

        if self.rename_file and self.analysis_name is None:
            raise ValueError(
                'Analysis name is required for renaming file')     # check analysis name

        for input_file in input_file_list:
            final_path = ''
            if self.base_path is None:                              # do not move file if base_path is absent
                final_path = os.path.dirname(input_file)
            else:                                                   # move file path
                if self.collection_table == 'project':
                    if project_igf_id is None:
                        raise ValueError(
                            'Missing project id for collection {0}'.
                            format(self.collection_name))
                    final_path = \
                        os.path.join(
                            self.base_path,
                            project_igf_id,
                            self.analysis_name)                     # final path for project
                elif self.collection_table == 'sample':
                    if project_igf_id is None or \
                       sample_igf_id is None:
                        raise ValueError(
                            'Missing project and sample id for collection {0}'.
                            format(self.collection_name))
                    final_path = \
                        os.path.join(
                            self.base_path,
                            project_igf_id,
                            sample_igf_id,
                            self.analysis_name)                     # final path for sample
                elif self.collection_table == 'experiment':
                    if project_igf_id is None or \
                       sample_igf_id is None or \
                       experiment_igf_id is None:
                        raise ValueError(
                            'Missing project,sample and experiment id for collection {0}'.
                            format(self.collection_name))
                    final_path = \
                        os.path.join(
                            self.base_path,
                            project_igf_id,
                            sample_igf_id,
                            experiment_igf_id,
                            self.analysis_name)                     # final path for experiment
                elif self.collection_table == 'run':
                    if project_igf_id is None or \
                       sample_igf_id is None or \
                       experiment_igf_id is None or \
                       run_igf_id is None:
                        raise ValueError(
                            'Missing project,sample,experiment and run id for collection {0}'.
                            format(self.collection_name))
                    final_path = \
                        os.path.join(
                            self.base_path,
                            project_igf_id,
                            sample_igf_id,
                            experiment_igf_id,
                            run_igf_id,
                            self.analysis_name)                     # final path for run

            if self.rename_file:
                new_filename = \
                    self.get_new_file_name(
                        input_file=input_file,
                        file_suffix=file_suffix)
                final_path = \
                    os.path.join(
                        final_path,
                        new_filename)                               # get new filepath
            else:
                final_path = \
                    os.path.join(
                        final_path,
                        os.path.basename(input_file))

            if final_path != input_file:                            # move file if its required
                final_path = \
                    preprocess_path_name(
                        input_path=final_path)                      # remove unexpected characters from file path
                move_file(
                    source_path=input_file,
                    destinationa_path=final_path,
                    force=force)                                    # move or overwrite file to destination dir

            output_path_list.append(final_path)                     # add final path to the output list
            self.create_or_update_analysis_collection(
                file_path=final_path,
                dbsession=base.session,
                withdraw_exisitng_collection=withdraw_exisitng_collection,
                remove_file=remove_file,
                autosave_db=autosave_db)                            # load new file collection in db
            if autosave_db:
                base.commit_session()                               # save changes to db for each file

        base.commit_session()                                       # save changes to db
        return output_path_list
    except BaseException:
        # roll back on any failure (including interrupts) before re-raising
        if dbconnected:
            base.rollback_session()
        raise
    finally:
        # always release the db connection, even if commit or rollback fails
        if dbconnected:
            base.close_session()
def run(self):
    '''
    Run FastQC on a single fastq file and publish the zip and html reports.

    Required parameters (via param_required): fastq_file, fastq_dir,
    igf_session_class, fastqc_exe, tag, seqrun_igf_id, seqrun_date,
    flowcell_id, base_results_dir, project_name.
    Optional parameters (via param): fastqc_options, force_overwrite,
    fastqc_dir_label, required_collection_table, sample_name, hpc_location,
    fastqc_collection_type, use_ephemeral_space, store_file.

    Steps:

    * for tag 'known' with store_file set, look up the collection of the
      fastq file in the database and take the sample name from its run
    * build the result dir as
      base_results_dir/project_name/seqrun_date/flowcell_id/lane_index_info/tag/[sample_name/]fastq_file_label/fastqc_dir_label
    * run FastQC in a temp dir and copy the zip and html reports over
    * for tag 'known' with store_file set, register both reports in the
      database as a file collection
    * set 'dataflow_params' with the report paths

    Any error is reported through self.warning and slack, then re-raised.
    '''
    import shlex

    seqrun_igf_id = None                                            # keep defined for the error handler, even if parameter lookup fails
    try:
        fastq_file = self.param_required('fastq_file')
        fastq_dir = self.param_required('fastq_dir')
        igf_session_class = self.param_required('igf_session_class')
        fastqc_exe = self.param_required('fastqc_exe')
        tag = self.param_required('tag')
        seqrun_igf_id = self.param_required('seqrun_igf_id')
        seqrun_date = self.param_required('seqrun_date')
        flowcell_id = self.param_required('flowcell_id')
        fastqc_options = self.param('fastqc_options')
        base_results_dir = self.param_required('base_results_dir')
        project_name = self.param_required('project_name')
        force_overwrite = self.param('force_overwrite')
        fastqc_dir_label = self.param('fastqc_dir_label')
        required_collection_table = self.param('required_collection_table')
        sample_name = self.param('sample_name')
        hpc_location = self.param('hpc_location')
        fastqc_collection_type = self.param('fastqc_collection_type')
        use_ephemeral_space = self.param('use_ephemeral_space')
        store_file = self.param('store_file')

        lane_index_info = os.path.basename(fastq_dir)               # get the lane and index length info
        fastq_file_label = \
            os.path.basename(fastq_file).replace('.fastq.gz', '')
        collection_name = None
        collection_table = None
        if tag == 'known' and store_file:                           # fetch sample name for known fastq, if its not defined
            base = BaseAdaptor(**{'session_class': igf_session_class})
            base.start_session()                                    # connect to db
            try:
                ca = CollectionAdaptor(**{'session': base.session})
                (collection_name, collection_table) = \
                    ca.fetch_collection_name_and_table_from_file_path(
                        file_path=fastq_file)                       # fetch collection name and table info
                if collection_table != required_collection_table:
                    raise ValueError(
                        'Expected collection table {0} and got {1}, {2}'.
                        format(
                            required_collection_table,
                            collection_table,
                            fastq_file))
                ra = RunAdaptor(**{'session': base.session})
                sample = \
                    ra.fetch_sample_info_for_run(
                        run_igf_id=collection_name)
                sample_name = sample['sample_igf_id']
            finally:
                base.close_session()                                # release db connection even if the lookup fails

        fastqc_result_dir = \
            os.path.join(
                base_results_dir,
                project_name,
                seqrun_date,
                flowcell_id,
                lane_index_info,
                tag)                                                # result dir path is generic
        if sample_name is not None:
            fastqc_result_dir = \
                os.path.join(
                    fastqc_result_dir,
                    sample_name)                                    # add sample name to dir path if its available

        fastqc_result_dir = \
            os.path.join(
                fastqc_result_dir,
                fastq_file_label,
                fastqc_dir_label)                                   # keep multiple files under same dir
        if os.path.exists(fastqc_result_dir) and force_overwrite:
            remove_dir(fastqc_result_dir)                           # remove existing output dir if force_overwrite is true

        if not os.path.exists(fastqc_result_dir):
            os.makedirs(fastqc_result_dir, mode=0o775)              # create output dir if its not present

        temp_work_dir = \
            get_temp_dir(use_ephemeral_space=use_ephemeral_space)   # get a temp work dir
        if not os.path.exists(fastq_file):
            raise IOError(
                'fastq file {0} not readable'.format(fastq_file))   # raise if fastq file path is not readable

        fastqc_output = \
            os.path.join(
                temp_work_dir,
                fastq_file_label)
        os.mkdir(fastqc_output)                                     # create fastqc output dir
        fastqc_param = \
            self.format_tool_options(fastqc_options)                # format fastqc params
        # the command runs through a shell, so the paths are quoted to
        # survive spaces and shell metacharacters; fastqc_exe and the
        # formatted options are passed through unchanged
        fastqc_cmd = [
            fastqc_exe,
            '-o', shlex.quote(fastqc_output),
            '-d', shlex.quote(temp_work_dir)]                       # fastqc base parameters
        fastqc_cmd.extend(fastqc_param)                             # add additional parameters
        fastqc_cmd.append(shlex.quote(fastq_file))                  # fastqc input file
        subprocess.check_call(' '.join(fastqc_cmd), shell=True)     # run fastqc

        fastqc_zip = None
        fastqc_html = None
        for root, _, files in os.walk(top=fastqc_output):
            for file_name in files:
                if fnmatch.fnmatch(file_name, '*.zip'):
                    input_fastqc_zip = os.path.join(root, file_name)
                    copy2(input_fastqc_zip, fastqc_result_dir)
                    fastqc_zip = \
                        os.path.join(fastqc_result_dir, file_name)

                if fnmatch.fnmatch(file_name, '*.html'):
                    input_fastqc_html = os.path.join(root, file_name)
                    copy2(input_fastqc_html, fastqc_result_dir)
                    fastqc_html = \
                        os.path.join(fastqc_result_dir, file_name)

        if fastqc_html is None or fastqc_zip is None:
            raise ValueError(
                'Missing required values, fastqc zip: {0}, fastqc html: {1}'.
                format(fastqc_zip, fastqc_html))

        if tag == 'known' and store_file:
            if collection_name is None:
                raise ValueError(
                    "couldn't retrieve collection name for {0}".
                    format(fastq_file))

            fastqc_files = [
                {'name': collection_name,
                 'type': fastqc_collection_type,
                 'table': required_collection_table,
                 'file_path': fastqc_zip,
                 'location': hpc_location},
                {'name': collection_name,
                 'type': fastqc_collection_type,
                 'table': required_collection_table,
                 'file_path': fastqc_html,
                 'location': hpc_location},
            ]
            ca = CollectionAdaptor(**{'session_class': igf_session_class})
            ca.start_session()
            try:
                ca.load_file_and_create_collection(
                    data=fastqc_files)                              # store fastqc files to db
            finally:
                ca.close_session()                                  # release db connection even if loading fails

        self.param(
            'dataflow_params',
            {'fastqc_html': fastqc_html,
             'lane_index_info': lane_index_info,
             'sample_name': sample_name,
             'fastqc': {
                 'fastq_dir': fastq_dir,
                 'fastqc_zip': fastqc_zip,
                 'fastqc_html': fastqc_html}})                      # set dataflow params
    except Exception as e:
        message = \
            'seqrun: {2}, Error in {0}: {1}'.\
            format(
                self.__class__.__name__,
                e,
                seqrun_igf_id)
        self.warning(message)
        self.post_message_to_slack(message, reaction='fail')        # post msg to slack for failed jobs
        raise
def setUp(self):
    '''
    Rebuild the database, create input files and seed a minimal fixture.

    The database settings are read from ``data/dbconfig.json``. All tables
    are dropped, the database file named by ``dbname`` is deleted if it
    exists, and the tables are created again. Two temp dirs are requested
    and three small input files are written to the work dir. Finally one
    platform, flowcell rule, project, sample, seqrun, experiment and run are
    stored through a new session, which is committed and closed.
    '''
    self.dbconfig = 'data/dbconfig.json'
    db_settings = read_dbconf_json(self.dbconfig)
    base = BaseAdaptor(**db_settings)
    self.engine = base.engine
    self.dbname = db_settings['dbname']

    # start from an empty database
    Base.metadata.drop_all(self.engine)
    if os.path.exists(self.dbname):
        os.remove(self.dbname)
    Base.metadata.create_all(self.engine)
    self.session_class = base.get_session_class()

    # create input files
    self.temp_work_dir = get_temp_dir()
    self.temp_base_dir = get_temp_dir()
    self.input_list = ['a.cram', 'a.vcf.gz', 'b.tar.gz']
    for input_name in self.input_list:
        input_path = os.path.join(self.temp_work_dir, input_name)
        with open(input_path, 'w') as handle:
            handle.write('AAAA')

    # fixture records
    platform_records = [{
        'platform_igf_id': 'M001',
        'model_name': 'MISEQ',
        'vendor_name': 'ILLUMINA',
        'software_name': 'RTA',
        'software_version': 'RTA1.18.54',
    }]
    flowcell_rule_records = [{
        'platform_igf_id': 'M001',
        'flowcell_type': 'MISEQ',
        'index_1': 'NO_CHANGE',
        'index_2': 'NO_CHANGE',
    }]
    project_records = [{'project_igf_id': 'ProjectA'}]
    sample_records = [{
        'sample_igf_id': 'SampleA',
        'project_igf_id': 'ProjectA',
    }]
    seqrun_records = [{
        'seqrun_igf_id': 'SeqrunA',
        'flowcell_id': '000000000-D0YLK',
        'platform_igf_id': 'M001',
        'flowcell': 'MISEQ',
    }]
    experiment_records = [{
        'experiment_igf_id': 'ExperimentA',
        'sample_igf_id': 'SampleA',
        'library_name': 'SampleA',
        'platform_name': 'MISEQ',
        'project_igf_id': 'ProjectA',
    }]
    run_records = [{
        'run_igf_id': 'RunA',
        'experiment_igf_id': 'ExperimentA',
        'seqrun_igf_id': 'SeqrunA',
        'lane_number': '1',
    }]

    # load everything through one session, parents before children
    base = BaseAdaptor(session_class=self.session_class)
    base.start_session()

    platform_adaptor = PlatformAdaptor(session=base.session)
    platform_adaptor.store_platform_data(data=platform_records)
    platform_adaptor.store_flowcell_barcode_rule(data=flowcell_rule_records)

    project_adaptor = ProjectAdaptor(session=base.session)
    project_adaptor.store_project_and_attribute_data(data=project_records)

    sample_adaptor = SampleAdaptor(session=base.session)
    sample_adaptor.store_sample_and_attribute_data(data=sample_records)

    seqrun_adaptor = SeqrunAdaptor(session=base.session)
    seqrun_adaptor.store_seqrun_and_attribute_data(data=seqrun_records)

    experiment_adaptor = ExperimentAdaptor(session=base.session)
    experiment_adaptor.store_project_and_attribute_data(
        data=experiment_records)

    run_adaptor = RunAdaptor(session=base.session)
    run_adaptor.store_run_and_attribute_data(data=run_records)

    base.commit_session()
    base.close_session()
}] # run data base.start_session() pl = PlatformAdaptor(**{'session': base.session}) pl.store_platform_data(data=platform_data) # loading platform data pl.store_flowcell_barcode_rule( data=flowcell_rule_data) # loading flowcell rules data pa = ProjectAdaptor(**{'session': base.session}) pa.store_project_and_attribute_data(data=project_data) # load project data sa = SampleAdaptor(**{'session': base.session}) sa.store_sample_and_attribute_data(data=sample_data) # store sample data sra = SeqrunAdaptor(**{'session': base.session}) sra.store_seqrun_and_attribute_data(data=seqrun_data) # load seqrun data ea = ExperimentAdaptor(**{'session': base.session}) ea.store_project_and_attribute_data( data=experiment_data) # load experiment data ra = RunAdaptor(**{'session': base.session}) ra.store_run_and_attribute_data(data=run_data) # load run data pipeline_data = [{ "pipeline_name": "DemultiplexIlluminaFastq", "pipeline_db": "sqlite:////bcl2fastq.db", }] pipeline_seed_data = [ { 'pipeline_name': 'DemultiplexIlluminaFastq', 'seed_id': 1, 'seed_table': 'seqrun' }, { 'pipeline_name': 'DemultiplexIlluminaFastq', 'seed_id': 2,
def setUp(self):
    '''
    Create the database tables and seed a HISEQ4000 read-count fixture.

    The database settings are read from ``data/dbconfig.json``. Platforms
    and flowcell rules, one project, seven samples, one seqrun, six
    experiments and six runs (each with an R1_READ_COUNT value) are stored.
    Every adaptor opens and closes its own session created from the shared
    session class.
    '''
    self.dbconfig = 'data/dbconfig.json'
    db_settings = read_dbconf_json(self.dbconfig)
    base = BaseAdaptor(**db_settings)
    self.engine = base.engine
    self.dbname = db_settings['dbname']
    Base.metadata.create_all(self.engine)
    self.session_class = base.get_session_class()

    # identifiers shared by several fixture records
    project_id = 'IGFQ000472_avik_28-3-2018_RNA'
    seqrun_id = '180518_K00345_0047_BHV2GJBBXX'
    sequenced_samples = [
        'IGF109792',
        'IGF109793',
        'IGF109794',
        'IGF109795',
        'IGF109796',
        'IGF109797',
    ]

    # load platform data: (platform id, model, software version)
    platform_specs = [
        ('M03291', 'MISEQ', 'RTA1.18.54'),
        ('NB501820', 'NEXTSEQ', 'RTA2'),
        ('K00345', 'HISEQ4000', 'RTA2'),
    ]
    platform_records = [
        {
            'platform_igf_id': platform_id,
            'model_name': model_name,
            'vendor_name': 'ILLUMINA',
            'software_name': 'RTA',
            'software_version': software_version,
        }
        for platform_id, model_name, software_version in platform_specs
    ]
    # flowcell rules: (platform id, flowcell type, index_2 rule)
    flowcell_rule_specs = [
        ('K00345', 'HiSeq 3000/4000 SR', 'NO_CHANGE'),
        ('K00345', 'HiSeq 3000/4000 PE', 'REVCOMP'),
        ('NB501820', 'NEXTSEQ', 'REVCOMP'),
        ('M03291', 'MISEQ', 'NO_CHANGE'),
    ]
    flowcell_rule_records = [
        {
            'platform_igf_id': platform_id,
            'flowcell_type': flowcell_type,
            'index_1': 'NO_CHANGE',
            'index_2': index_2_rule,
        }
        for platform_id, flowcell_type, index_2_rule in flowcell_rule_specs
    ]
    platform_adaptor = PlatformAdaptor(session_class=base.session_class)
    platform_adaptor.start_session()
    platform_adaptor.store_platform_data(data=platform_records)
    platform_adaptor.store_flowcell_barcode_rule(data=flowcell_rule_records)
    platform_adaptor.close_session()

    # load project data
    project_records = [{'project_igf_id': project_id}]
    project_adaptor = ProjectAdaptor(session_class=base.session_class)
    project_adaptor.start_session()
    project_adaptor.store_project_and_attribute_data(data=project_records)
    project_adaptor.close_session()

    # load samples; the last one has no experiment or run in this fixture
    sample_records = [
        {
            'project_igf_id': project_id,
            'sample_igf_id': sample_id,
            'expected_read': 40000000,
        }
        for sample_id in sequenced_samples + ['IGF109797_1']
    ]
    sample_adaptor = SampleAdaptor(session_class=base.session_class)
    sample_adaptor.start_session()
    sample_adaptor.store_sample_and_attribute_data(data=sample_records)
    sample_adaptor.close_session()

    # load seqrun data
    seqrun_records = [{
        'flowcell_id': 'HV2GJBBXX',
        'platform_igf_id': 'K00345',
        'seqrun_igf_id': seqrun_id,
    }]
    seqrun_adaptor = SeqrunAdaptor(session_class=base.session_class)
    seqrun_adaptor.start_session()
    seqrun_adaptor.store_seqrun_and_attribute_data(data=seqrun_records)
    seqrun_adaptor.close_session()

    # load experiment data
    experiment_records = [
        {
            'experiment_igf_id': '{0}_HISEQ4000'.format(sample_id),
            'library_name': sample_id,
            'platform_name': 'HISEQ4000',
            'project_igf_id': project_id,
            'sample_igf_id': sample_id,
        }
        for sample_id in sequenced_samples
    ]
    experiment_adaptor = ExperimentAdaptor(session_class=base.session_class)
    experiment_adaptor.start_session()
    experiment_adaptor.store_project_and_attribute_data(
        data=experiment_records)
    experiment_adaptor.close_session()

    # load run data: (experiment id, run id, R1 read count)
    run_specs = [
        ('IGF109792_HISEQ4000', 'IGF109792_HISEQ4000_H2N3MBBXY_7', 288046541),
        ('IGF109793_HISEQ4000', 'IGF109793_HISEQ4000_H2N3MBBXY_7', 14666330),
        ('IGF109794_HISEQ4000', 'IGF109794_HISEQ4000_H2N3MBBXY_7', 5009143),
        ('IGF109795_HISEQ4000', 'IGF109795_HISEQ4000_H2N3MBBXY_7', 1391747),
        # NOTE(review): the leading space in this run id is kept exactly as
        # in the original fixture - confirm whether it is intentional
        ('IGF109796_HISEQ4000', ' IGF109796_HISEQ4000_H2N3MBBXY_7', 1318008),
        ('IGF109797_HISEQ4000', 'IGF109797_HISEQ4000_H2N3MBBXY_7', 1216324),
    ]
    run_records = [
        {
            'experiment_igf_id': experiment_id,
            'lane_number': '7',
            'run_igf_id': run_id,
            'seqrun_igf_id': seqrun_id,
            'R1_READ_COUNT': read_count,
        }
        for experiment_id, run_id, read_count in run_specs
    ]
    run_adaptor = RunAdaptor(session_class=base.session_class)
    run_adaptor.start_session()
    run_adaptor.store_run_and_attribute_data(data=run_records)
    run_adaptor.close_session()
def _build_and_store_exp_run_and_collection_in_db(self, fastq_files_list,
                                                  restricted_list=('10X',)):
    '''
    An internal method for building db collections for the raw fastq files

    Experiment, run, file and collection records are derived from the fastq
    file list, records already present in the db are skipped, and everything
    is stored within a single transaction which is rolled back on failure.
    The manifest file is written after the commit.

    :param fastq_files_list: A list of per fastq records, loaded into a
                             pandas dataframe
    :param restricted_list: A collection of restricted values which is passed
                            on to _calculate_experiment_run_and_file_info,
                            default ('10X',). A single string is accepted and
                            treated as a list with one entry
    :returns: None
    '''
    session_class = self.session_class
    db_connected = False
    try:
        # ('10X') is a plain string, not a tuple; calling list() on it would
        # split it into single characters, so wrap strings explicitly
        if isinstance(restricted_list, str):
            restricted_list = [restricted_list]
        else:
            restricted_list = list(restricted_list)
        dataframe = pd.DataFrame(fastq_files_list)
        # calculate additional detail
        dataframe = dataframe.apply(
            lambda data: self._calculate_experiment_run_and_file_info(
                data, restricted_list),
            axis=1)
        # get file data
        file_group_columns = [
            'name', 'type', 'location',
            'R1', 'R1_md5', 'R1_size',
            'R2', 'R2_md5', 'R2_size',
        ]
        file_group_data = dataframe.loc[:, file_group_columns]
        file_group_data = file_group_data.drop_duplicates()
        (file_data, file_group_data) = \
            self._reformat_file_group_data(data=file_group_data)
        # get base session
        base = BaseAdaptor(**{'session_class': session_class})
        base.start_session()
        db_connected = True

        def _keep_new_records(records, table_name, id_column):
            # drop the rows which are already in the db or have no identifier
            if records.index.size == 0:
                return records
            records = records.apply(
                lambda row: self._check_existing_data(
                    data=row,
                    dbsession=base.session,
                    table_name=table_name,
                    check_column='EXISTS'),
                axis=1)
            records = records[records['EXISTS'].eq(False)]  # filter existing records
            # non-inplace drop, the filtered frame may be a view of its parent
            records = records.drop('EXISTS', axis=1)
            return records[records[id_column].notnull()]  # filter null ids

        # get experiment data
        experiment_columns = base.get_table_columns(
            table_name=Experiment,
            excluded_columns=['experiment_id', 'project_id', 'sample_id'])
        experiment_columns.extend(['project_igf_id', 'sample_igf_id'])
        exp_data = dataframe.loc[:, experiment_columns].drop_duplicates()
        exp_data = _keep_new_records(
            exp_data, 'experiment', 'experiment_igf_id')
        # get run data
        run_columns = base.get_table_columns(
            table_name=Run,
            excluded_columns=[
                'run_id', 'seqrun_id', 'experiment_id',
                'date_created', 'status'])
        run_columns.extend([
            'seqrun_igf_id', 'experiment_igf_id',
            'R1_READ_COUNT', 'R2_READ_COUNT'])
        run_data = dataframe.loc[:, run_columns].drop_duplicates()
        run_data = _keep_new_records(run_data, 'run', 'run_igf_id')
        # get collection data
        collection_columns = ['name', 'type', 'table']
        collection_data = \
            dataframe.loc[:, collection_columns].drop_duplicates()
        collection_data = _keep_new_records(
            collection_data, 'collection', 'name')
        # store experiment to db
        if exp_data.index.size > 0:
            ea = ExperimentAdaptor(**{'session': base.session})
            ea.store_project_and_attribute_data(data=exp_data, autosave=False)
            base.session.flush()
        # store run to db
        if run_data.index.size > 0:
            ra = RunAdaptor(**{'session': base.session})
            ra.store_run_and_attribute_data(data=run_data, autosave=False)
            base.session.flush()
        # store file to db
        fa = FileAdaptor(**{'session': base.session})
        fa.store_file_and_attribute_data(data=file_data, autosave=False)
        base.session.flush()
        # store collection to db
        ca = CollectionAdaptor(**{'session': base.session})
        if collection_data.index.size > 0:
            ca.store_collection_and_attribute_data(
                data=collection_data, autosave=False)
            base.session.flush()
        ca.create_collection_group(data=file_group_data, autosave=False)
        base.commit_session()
        self._write_manifest_file(file_data)
    except BaseException:
        # undo partial writes for any failure, then let the caller see it
        if db_connected:
            base.rollback_session()
        raise
    finally:
        if db_connected:
            base.close_session()
def _check_existing_data(data, dbsession, table_name, check_column='EXISTS'):
    '''
    Flag a single record as already present in the db or not

    The lookup depends on the table name: experiments are checked by
    experiment_igf_id, runs by run_igf_id and collections by name and type.
    The result is written to the check column of the input series as a
    boolean and the same series is returned.

    NOTE(review): this function takes no self although it is called through
    an instance with keyword arguments elsewhere; presumably a @staticmethod
    decorator sits outside this view, please confirm.

    :param data: A pandas series holding one record
    :param dbsession: An active db session which is handed to the adaptors
    :param table_name: One of experiment, run or collection
    :param check_column: Name of the column receiving the flag, default EXISTS
    :returns: The input series with the check column set
    :raises ValueError: If data is not a series, a required column is missing
                        or null, or the table name is not supported
    '''
    if not isinstance(data, pd.Series):
        raise ValueError('Expecting a data series and got {0}'.format(
            type(data)))

    if table_name == 'experiment':
        if 'experiment_igf_id' not in data or \
           pd.isnull(data['experiment_igf_id']):
            raise ValueError(
                'Missing or empty required column experiment_igf_id')
        adaptor = ExperimentAdaptor(session=dbsession)
        record_found = \
            adaptor.check_experiment_records_id(data['experiment_igf_id'])
    elif table_name == 'run':
        if 'run_igf_id' not in data or pd.isnull(data['run_igf_id']):
            raise ValueError(
                'Missing or empty required column run_igf_id')
        adaptor = RunAdaptor(session=dbsession)
        record_found = adaptor.check_run_records_igf_id(data['run_igf_id'])
    elif table_name == 'collection':
        if 'name' not in data or \
           'type' not in data or \
           pd.isnull(data['name']) or \
           pd.isnull(data['type']):
            raise ValueError(
                'Missing or empty required column name or type')
        adaptor = CollectionAdaptor(session=dbsession)
        record_found = adaptor.check_collection_records_name_and_type(
            collection_name=data['name'],
            collection_type=data['type'])
    else:
        raise ValueError(
            'table {0} not supported yet'.format(table_name))

    # callers keep only the rows where this flag is False
    data[check_column] = bool(record_found)
    return data
def _process_samples_data(self):
    '''
    An internal method for processing samples data

    It collects the fastqc and fastqscreen entries of the current fastq_dir,
    looks up the sample id of each fastq file in the db, merges both qc
    tables on the fastq file and then joins the samplesheet rows on
    Sample_ID. Single cell samplesheet rows (Description matching
    singlecell_tag) are restored to their Original_* columns before joining.

    :returns: A list of column names in output order and a list of
              dictionaries, one per merged row, restricted to these columns
    :raises ValueError: If no fastqc or fastqscreen entry matches fastq_dir,
                        or if the merged qc table is empty
    :raises IOError: If the samplesheet file is not found in fastq_dir
    '''
    fastq_dir = self.param_required('fastq_dir')
    qc_files = self.param_required('qc_files')
    samplesheet_filename = self.param('samplesheet_filename')
    igf_session_class = self.param_required('igf_session_class')
    remote_project_path = self.param_required('remote_project_path')
    project_name = self.param_required('project_name')
    seqrun_date = self.param_required('seqrun_date')
    flowcell_id = self.param_required('flowcell_id')
    lane_index_info = self.param_required('lane_index_info')
    singlecell_tag = self.param('singlecell_tag')
    remote_path = os.path.join(
        remote_project_path,
        project_name,
        seqrun_date,
        flowcell_id,
        lane_index_info)  # get remote base path

    fastqc_data = list()
    base = BaseAdaptor(**{'session_class': igf_session_class})
    base.start_session()  # connect to db
    try:
        ca = CollectionAdaptor(**{'session': base.session})
        ra = RunAdaptor(**{'session': base.session})
        for fastqc_file in qc_files['fastqc']:  # get fastqc files for fastq_dir
            fastqc_zip = fastqc_file['fastqc_zip']
            fastq_file = fastqc_file['fastq_file']
            qc_fastq_dir = fastqc_file['fastq_dir']
            if qc_fastq_dir == fastq_dir:  # check for fastq dir
                remote_fastqc_path = os.path.relpath(
                    fastqc_file['remote_fastqc_path'],
                    start=remote_path)  # get relative path
                (total_reads, _) = get_fastq_info_from_fastq_zip(fastqc_zip)
                (collection_name, _) = \
                    ca.fetch_collection_name_and_table_from_file_path(
                        file_path=fastq_file)  # fetch collection name and table info
                sample = ra.fetch_sample_info_for_run(
                    run_igf_id=collection_name)
                fastqc_data.append({
                    'Sample_ID': sample['sample_igf_id'],
                    'Fastqc': remote_fastqc_path,
                    'FastqFile': fastq_file,
                    'TotalReads': total_reads})
    finally:
        base.close_session()  # close db connection, even after a failed lookup

    fastqs_data = list()
    for fastqs_file in qc_files['fastqscreen']:  # get fastqs files for fastq_dir
        fastq_file = fastqs_file['fastq_file']
        remote_fastqs_path = fastqs_file['remote_fastqscreen_path']
        # read the dir from the fastqscreen entry itself, not from the
        # leftover loop variable of the fastqc loop
        qs_fastq_dir = fastqs_file['fastq_dir']
        if qs_fastq_dir == fastq_dir:  # check for fastq dir
            remote_fastqs_path = os.path.relpath(
                remote_fastqs_path,
                start=remote_path)  # get relative path
            fastqs_data.append({
                'Fastqscreen': remote_fastqs_path,
                'FastqFile': fastq_file})

    if len(fastqc_data) == 0 or len(fastqs_data) == 0:
        raise ValueError('Value not found for fastqc: {0} or fastqscreen:{1}'.\
                         format(len(fastqc_data), len(fastqs_data)))

    fastqc_data = pd.DataFrame(fastqc_data)
    fastqs_data = \
        pd.DataFrame(fastqs_data).set_index('FastqFile')  # convert to dataframe
    merged_qc_info = fastqc_data.join(
        fastqs_data,
        how='inner',
        on='FastqFile',
        lsuffix='',
        rsuffix='_s')  # merge fastqc and fastqscreen info
    if len(merged_qc_info) == 0:
        raise ValueError('No QC data found for merging, fastqc:{0}, fastqscreen: {1}'.\
                         format(len(fastqc_data), len(fastqs_data)))

    samplesheet_file = os.path.join(fastq_dir, samplesheet_filename)
    if not os.path.exists(samplesheet_file):
        raise IOError('samplesheet file {0} not found'.\
                      format(samplesheet_file))

    final_samplesheet_data = list()
    samplesheet_sc = \
        SampleSheet(infile=samplesheet_file)  # read samplesheet for single cell check
    samplesheet_sc.filter_sample_data(
        condition_key='Description',
        condition_value=singlecell_tag,
        method='include')  # keep only single cell samples
    if len(samplesheet_sc._data) > 0:
        # restructure single cell data. sc data doesn't have index2
        # 'records' is the valid orient name for a list of row dictionaries
        sc_data = \
            pd.DataFrame(samplesheet_sc._data).\
            drop(['Sample_ID', 'Sample_Name', 'index'], axis=1).\
            drop_duplicates().\
            rename(columns={'Original_Sample_ID': 'Sample_ID',
                            'Original_Sample_Name': 'Sample_Name',
                            'Original_index': 'index'}).\
            to_dict(orient='records')
        final_samplesheet_data.extend(sc_data)  # add single cell samples to final data

    sa = SampleSheet(infile=samplesheet_file)
    sa.filter_sample_data(
        condition_key='Description',
        condition_value=singlecell_tag,
        method='exclude')  # remove only single cell samples
    if len(sa._data) > 0:
        final_samplesheet_data.extend(sa._data)  # add non single cell samples info to final data

    sample_data = \
        pd.DataFrame(final_samplesheet_data).\
        set_index('Sample_ID')  # get sample info from final data
    merged_data = merged_qc_info.join(
        sample_data,
        how='inner',
        on='Sample_ID',
        lsuffix='',
        rsuffix='_sa')  # merge sample data with qc data

    required_headers = [
        'Sample_ID', 'Sample_Name', 'FastqFile', 'TotalReads', 'index']
    if 'index2' in list(sample_data.columns):
        required_headers.append('index2')
    required_headers.extend(['Fastqc', 'Fastqscreen'])  # create header order

    merged_data['FastqFile'] = \
        merged_data['FastqFile'].map(os.path.basename)  # keep only fastq filename
    qc_merged_data = \
        merged_data.loc[:, required_headers].\
        to_dict(orient='records')  # extract final data
    return required_headers, qc_merged_data
def run(self):
    '''
    Run the batch effect check for one experiment and load the report

    With fewer than three input files no report is produced and an empty
    string is seeded as the report list. Otherwise the flowcell and lane of
    each input file are fetched from the db, written to a temp json file,
    passed to Batch_effect_report, and the generated report is loaded to disk
    and db through Analysis_collection_utils. The report list is stored in
    the dataflow_params parameter under the key batch_effect_reports.

    On any error a message is sent to the log and to slack before the
    original exception is raised again.

    :raises ValueError: If the input file list is empty
    '''
    # bound before the try block so that the error handler can always build
    # its message, even if a required parameter lookup itself fails
    project_igf_id = None
    sample_igf_id = None
    try:
        project_igf_id = self.param_required('project_igf_id')
        experiment_igf_id = self.param_required('experiment_igf_id')
        sample_igf_id = self.param_required('sample_igf_id')
        input_files = self.param_required('input_files')
        igf_session_class = self.param_required('igf_session_class')
        template_report_file = self.param_required('template_report_file')
        rscript_path = self.param_required('rscript_path')
        batch_effect_rscript_path = \
            self.param_required('batch_effect_rscript_path')
        base_result_dir = self.param_required('base_result_dir')
        strand_info = self.param('strand_info')
        read_threshold = self.param('read_threshold')
        collection_type = self.param('collection_type')
        collection_table = self.param('collection_table')
        analysis_name = self.param('analysis_name')
        tag_name = self.param('tag_name')
        use_ephemeral_space = self.param('use_ephemeral_space')
        output_file_list = None
        if len(input_files) == 0:
            raise ValueError('No input files found for bactch effect checking')
        elif len(input_files) < 3:
            output_file_list = ''  # can't run batch effect checking on less than 3 lanes
        else:
            for input_file in input_files:
                check_file_path(input_file)  # check input filepath

            file_data = list()
            ra = RunAdaptor(**{'session_class': igf_session_class})
            ra.start_session()
            try:
                for input_file in input_files:
                    run_igf_id = \
                        os.path.basename(input_file).\
                        replace('ReadsPerGene.out.tab', '')  # using simple string match to fetch run igf ids
                    flowcell_id, lane_id = \
                        ra.fetch_flowcell_and_lane_for_run(
                            run_igf_id=run_igf_id)  # fetch flowcell id and lane info
                    file_data.append({
                        'file': input_file,
                        'flowcell': flowcell_id,
                        'lane': lane_id})
            finally:
                ra.close_session()  # release the session after a failed lookup too

            temp_dir = \
                get_temp_dir(use_ephemeral_space=use_ephemeral_space)
            temp_json_file = \
                os.path.join(temp_dir, 'star_gene_counts.json')  # temp json file path
            temp_output_file = os.path.join(
                temp_dir,
                os.path.basename(template_report_file))  # temp report file path
            with open(temp_json_file, 'w') as jp:
                json.dump(file_data, jp, indent=2)  # dumping json output

            br = Batch_effect_report(
                input_json_file=temp_json_file,
                template_file=template_report_file,
                rscript_path=rscript_path,
                batch_effect_rscript_path=batch_effect_rscript_path,
                strand_info=strand_info,
                read_threshold=read_threshold)  # set up batch effect run
            br.check_lane_effect_and_log_report(
                project_name=project_igf_id,
                sample_name=sample_igf_id,
                output_file=temp_output_file)  # generate report file
            au = Analysis_collection_utils(
                dbsession_class=igf_session_class,
                analysis_name=analysis_name,
                base_path=base_result_dir,
                tag_name=tag_name,
                collection_name=experiment_igf_id,
                collection_type=collection_type,
                collection_table=collection_table)  # prepare to load file
            output_file_list = au.load_file_to_disk_and_db(
                input_file_list=[temp_output_file])  # load file to db and disk

        self.param(
            'dataflow_params',
            {'batch_effect_reports': output_file_list})  # report list is an empty string if no report was made
    except Exception as e:
        message = \
            'project: {2}, sample:{3}, Error in {0}: {1}'.\
            format(
                self.__class__.__name__,
                e,
                project_igf_id,
                sample_igf_id)
        self.warning(message)
        self.post_message_to_slack(message, reaction='fail')  # post msg to slack for failed jobs
        raise
def setUp(self):
    '''
    Rebuild the test db from scratch and seed it with fixture records

    All tables are dropped, the db file named in the config is removed if it
    is present, and the tables are created again. Platform, seqrun, project,
    sample, experiment, run and pipeline records are then stored in this
    order, followed by the pipeline seeds, of which the second seed of each
    pipeline is updated to FINISHED.
    '''
    self.dbconfig = 'data/dbconfig.json'
    dbparam = read_dbconf_json(self.dbconfig)
    base = BaseAdaptor(**dbparam)
    self.engine = base.engine
    self.dbname = dbparam['dbname']
    # start from a clean db
    Base.metadata.drop_all(self.engine)
    if os.path.exists(self.dbname):
        os.remove(self.dbname)
    Base.metadata.create_all(self.engine)
    self.session_class = base.get_session_class()

    # identifiers shared between the fixture records
    platform_igf_id = 'M03291'
    project_igf_id = 'IGFQ000123_test_10-4-2018_Miseq'
    sample_igf_ids = ['IGF00123', 'IGF00124']
    seqrun_igf_ids = [
        '180416_M03291_0139_000000000-TEST',
        '180416_M03291_0140_000000000-TEST',
    ]
    seeded_pipelines = [
        ('PrimaryAnalysis', 'experiment'),
        ('DemultiplexingFastq', 'seqrun'),
    ]

    base.start_session()
    db_session = base.session

    # PLATFORM
    platform_adaptor = PlatformAdaptor(session=db_session)
    platform_adaptor.store_platform_data(
        data=[{
            "platform_igf_id": platform_igf_id,
            "model_name": "MISEQ",
            "vendor_name": "ILLUMINA",
            "software_name": "RTA",
            "software_version": "RTA1.18.54",
        }])
    platform_adaptor.store_flowcell_barcode_rule(
        data=[{
            "platform_igf_id": platform_igf_id,
            "flowcell_type": "MISEQ",
            "index_1": "NO_CHANGE",
            "index_2": "NO_CHANGE",
        }])

    # SEQRUN
    seqrun_records = [
        {
            'seqrun_igf_id': seqrun_igf_id,
            'flowcell_id': '000000000-TEST',
            'platform_igf_id': platform_igf_id,
            'flowcell': 'MISEQ',
        }
        for seqrun_igf_id in seqrun_igf_ids
    ]
    seqrun_adaptor = SeqrunAdaptor(session=db_session)
    seqrun_adaptor.store_seqrun_and_attribute_data(data=seqrun_records)

    # PROJECT
    project_adaptor = ProjectAdaptor(session=db_session)
    project_adaptor.store_project_and_attribute_data(
        data=[{'project_igf_id': project_igf_id}])

    # SAMPLE
    sample_records = [
        {
            'sample_igf_id': sample_igf_id,
            'project_igf_id': project_igf_id,
        }
        for sample_igf_id in sample_igf_ids
    ]
    sample_adaptor = SampleAdaptor(session=db_session)
    sample_adaptor.store_sample_and_attribute_data(data=sample_records)

    # EXPERIMENT, one per sample
    experiment_records = [
        {
            'project_igf_id': project_igf_id,
            'sample_igf_id': sample_igf_id,
            'experiment_igf_id': '{0}_MISEQ'.format(sample_igf_id),
            'library_name': sample_igf_id,
            'library_source': 'TRANSCRIPTOMIC_SINGLE_CELL',
            'library_strategy': 'RNA-SEQ',
            'experiment_type': 'POLYA-RNA',
            'library_layout': 'PAIRED',
            'platform_name': 'MISEQ',
            'singlecell_chemistry': 'TENX',
        }
        for sample_igf_id in sample_igf_ids
    ]
    experiment_adaptor = ExperimentAdaptor(session=db_session)
    experiment_adaptor.store_project_and_attribute_data(
        data=experiment_records)

    # RUN, only for the first experiment on the first seqrun
    run_adaptor = RunAdaptor(session=db_session)
    run_adaptor.store_run_and_attribute_data(
        data=[{
            'experiment_igf_id': 'IGF00123_MISEQ',
            'seqrun_igf_id': seqrun_igf_ids[0],
            'run_igf_id': 'IGF00123_MISEQ_000000000-TEST_1',
            'lane_number': '1',
        }])

    # PIPELINE
    pipeline_records = [
        {
            "pipeline_name": "PrimaryAnalysis",
            "pipeline_db": "sqlite:////aln.db",
        },
        {
            "pipeline_name": "DemultiplexingFastq",
            "pipeline_db": "sqlite:////fastq.db",
        },
    ]
    # two seeds per pipeline, with seed ids 1 and 2
    seed_records = [
        {
            'pipeline_name': pipeline_name,
            'seed_id': seed_id,
            'seed_table': seed_table,
        }
        for pipeline_name, seed_table in seeded_pipelines
        for seed_id in (1, 2)
    ]
    # the second seed of each pipeline is marked as finished
    finished_seed_records = [
        {
            'pipeline_name': pipeline_name,
            'seed_id': 2,
            'seed_table': seed_table,
            'status': 'FINISHED',
        }
        for pipeline_name, seed_table in seeded_pipelines
    ]
    pipeline_adaptor = PipelineAdaptor(session=db_session)
    pipeline_adaptor.store_pipeline_data(data=pipeline_records)
    pipeline_adaptor.create_pipeline_seed(data=seed_records)
    pipeline_adaptor.update_pipeline_seed(finished_seed_records)
    base.close_session()