import logging
import os
import time

import pandas as pd


def import_dlp_realign_bams(
        storage_name,
        storage_type,
        bam_filenames,
        tantalus_api,
        tag_name=None,
        analysis_id=None,
        **kwargs):
    """Import realigned DLP BAM files into Tantalus from blob or server storage."""
    metadata = []

    storage = tantalus_api.get("storage", name=storage_name)

    if storage_type == "blob":
        for bam_filename in bam_filenames:
            blob_container_name = os.path.join(
                storage["storage_account"], storage["storage_container"])
            metadata.extend(
                import_dlp_realign_bam_blob(bam_filename, blob_container_name))

    elif storage_type == "server":
        for bam_filename in bam_filenames:
            metadata.extend(
                import_dlp_realign_bam_server(
                    bam_filename, storage["storage_directory"]))

    else:
        raise ValueError("unsupported storage type {}".format(storage_type))

    create_sequence_dataset_models(
        metadata, storage_name, tag_name, tantalus_api, analysis_id)
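
# A minimal usage sketch for import_dlp_realign_bams (illustration only: the
# storage name, BAM path, and tag below are hypothetical placeholders, and the
# Tantalus client is assumed to be configured elsewhere).
def _example_import_dlp_realign_bams(tantalus_api):
    import_dlp_realign_bams(
        storage_name="shahlab",                      # hypothetical storage record name
        storage_type="server",
        bam_filenames=["/data/realign/sample.bam"],  # hypothetical path
        tantalus_api=tantalus_api,
        tag_name="realign_import",                   # hypothetical tag
    )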
def load_brc_fastqs(
        flowcell_id,
        storage_name,
        storage_directory,
        output_dir,
        tantalus_api,
        tag_name=None):
    """Load BRC fastqs from an output directory within a server storage."""
    # Check for .. in file path
    if ".." in output_dir:
        raise Exception("Invalid path for output_dir. '..' detected")

    # Check that output_dir is actually in storage
    if not output_dir.startswith(storage_directory):
        raise Exception(
            "Invalid path for output_dir. {} doesn't seem to be in the "
            "specified storage".format(output_dir))

    # Check that path is valid.
    if not os.path.isdir(output_dir):
        raise Exception(
            "output directory {} not a directory".format(output_dir))

    fastq_file_info = get_fastq_info(output_dir, flowcell_id, storage_directory)

    fastq_paired_end_check(fastq_file_info)

    create_sequence_dataset_models(
        fastq_file_info, storage_name, tag_name, tantalus_api)
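
# The startswith() containment check above compares paths as plain strings, so
# a sibling directory such as "/genomes2" would pass a check against
# "/genomes". A stricter variant, sketched here as an illustration (assuming
# Python 3's os.path.commonpath; this helper is not part of the original
# module):
def _is_within_directory(path, directory):
    # Resolve symlinks and relative components before comparing.
    path = os.path.realpath(path)
    directory = os.path.realpath(directory)
    return os.path.commonpath([path, directory]) == directory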
def load_brc_fastqs(
        flowcell_id,
        output_dir,
        storage_name,
        storage,
        tantalus_api,
        storage_client,
        tag_name=None,
        update=False,
        threshold=20):
    """Load BRC fastqs, validate paired ends and DLP indices, and register them in Tantalus."""
    if not os.path.isdir(output_dir):
        raise Exception(
            "output directory {} not a directory".format(output_dir))

    fastq_file_info = get_fastq_info(
        output_dir, flowcell_id, storage, storage_client, threshold)

    fastq_paired_end_check(fastq_file_info)

    fastq_dlp_index_check(fastq_file_info)

    create_sequence_dataset_models(
        fastq_file_info,
        storage_name,
        tag_name,
        tantalus_api,
        update=update,
    )

    update_ticket(flowcell_id)

    logging.info('import succeeded')
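
# A minimal usage sketch for the newer load_brc_fastqs variant (illustration
# only: the flowcell id and output directory are hypothetical, and the storage
# record, Tantalus client, and storage client are assumed to come from the
# surrounding deployment).
def _example_load_brc_fastqs(tantalus_api, storage, storage_client):
    load_brc_fastqs(
        flowcell_id="HTESTFLOWCELL",  # hypothetical flowcell id
        output_dir=os.path.join(storage["storage_directory"], "output"),  # hypothetical
        storage_name=storage["name"],
        storage=storage,
        tantalus_api=tantalus_api,
        storage_client=storage_client,
        tag_name=None,
        update=False,
        threshold=20,
    )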
def import_gsc_dlp_paired_fastqs(colossus_api, tantalus_api, dlp_library_id, storage, tag_name=None):
    ''' Import dlp fastq data from the GSC.

    Args:
        colossus_api: Basic client for colossus
        tantalus_api: Basic client for tantalus
        dlp_library_id: library id for the dlp run
        storage: storage details for the transfer
        tag_name: a tag to add to imported data
    '''

    logging.info('importing data for {}'.format(dlp_library_id))

    # Existing fastqs in tantalus as a set of tuples of
    # the form (flowcell_id, lane_number, index_sequence, read_end)
    existing_data = get_existing_fastq_data(tantalus_api, dlp_library_id)

    primary_sample_id = colossus_api.query_libraries_by_library_id(
        dlp_library_id)['sample']['sample_id']

    cell_samples = query_colossus_dlp_cell_info(colossus_api, dlp_library_id)

    rev_comp_overrides = query_colossus_dlp_rev_comp_override(
        colossus_api, dlp_library_id)

    external_identifier = "{}_{}".format(primary_sample_id, dlp_library_id)

    gsc_api = GSCAPI()

    library_infos = gsc_api.query(
        "library?external_identifier={}".format(external_identifier))

    if len(library_infos) == 0:
        logging.error(
            'no libraries with external_identifier {} in gsc api'.format(
                external_identifier))
        return
    elif len(library_infos) > 1:
        raise Exception(
            "multiple libraries with external_identifier {} in gsc api".format(
                external_identifier))

    library_info = library_infos[0]

    gsc_library_id = library_info["name"]

    fastq_infos = gsc_api.query(
        "fastq?parent_library={}".format(gsc_library_id))

    fastq_file_info = []

    flowcells_to_be_created = []

    for fastq_info in fastq_infos:
        fastq_path = fastq_info["data_path"]

        if fastq_info["status"] != "production":
            logging.info("skipping file {} marked as {}".format(
                fastq_info["data_path"], fastq_info["status"]))
            continue

        flowcell_id = str(
            fastq_info['libcore']['run']['flowcell']['lims_flowcell_code'])
        lane_number = fastq_info['libcore']['run']['lane_number']

        if fastq_info['removed_datetime'] is not None:
            logging.info('skipping file {} marked as removed {}'.format(
                fastq_info['data_path'], fastq_info['removed_datetime']))
            continue

        sequencing_instrument = get_sequencing_instrument(
            fastq_info["libcore"]["run"]["machine"])

        solexa_run_type = fastq_info["libcore"]["run"]["solexarun_type"]
        read_type = solexa_run_type_map[solexa_run_type]

        primer_id = fastq_info["libcore"]["primer_id"]
        primer_info = gsc_api.query("primer/{}".format(primer_id))
        raw_index_sequence = primer_info["adapter_index_sequence"]

        logging.info(
            "loading fastq %s, index %s, %s",
            fastq_info["id"],
            raw_index_sequence,
            fastq_path,
        )

        flowcell_lane = flowcell_id
        if lane_number is not None:
            flowcell_lane = flowcell_lane + "_" + str(lane_number)

        rev_comp_override = rev_comp_overrides.get(flowcell_lane)

        index_sequence = decode_raw_index_sequence(
            raw_index_sequence, sequencing_instrument, rev_comp_override)

        filename_pattern = fastq_info["file_type"]["filename_pattern"]
        read_end, passed = filename_pattern_map.get(
            filename_pattern, (None, None))

        if read_end is None:
            raise Exception(
                "Unrecognized file type: {}".format(filename_pattern))

        if not passed:
            continue

        if (flowcell_id, str(lane_number), index_sequence, read_end) in existing_data:
            logging.info(
                'skipping file {} that has already been imported'.format(
                    fastq_info['data_path']))
            continue

        try:
            cell_sample_id = cell_samples[index_sequence]
        except KeyError:
            raise Exception(
                'unable to find index {} for flowcell lane {} for library {}'.format(
                    index_sequence, flowcell_lane, dlp_library_id))

        extension = ''
        compression = 'UNCOMPRESSED'
        if fastq_path.endswith('.gz'):
            extension = '.gz'
            compression = 'GZIP'
        elif not fastq_path.endswith('.fastq'):
            raise ValueError(
                'unknown extension for filename {}'.format(fastq_path))

        tantalus_filename = dlp_fastq_template.format(
            primary_sample_id=primary_sample_id,
            dlp_library_id=dlp_library_id,
            flowcell_id=flowcell_id,
            lane_number=lane_number,
            cell_sample_id=cell_sample_id,
            index_sequence=index_sequence,
            read_end=read_end,
            extension=extension,
        )
        tantalus_path = os.path.join(
            storage["storage_directory"], tantalus_filename)

        rsync_file(fastq_path, tantalus_path)

        fastq_file_info.append(
            dict(
                dataset_type="FQ",
                sample_id=cell_sample_id,
                library_id=dlp_library_id,
                library_type="SC_WGS",
                index_format="D",
                sequence_lanes=[
                    dict(
                        flowcell_id=flowcell_id,
                        lane_number=lane_number,
                        sequencing_centre="GSC",
                        sequencing_instrument=sequencing_instrument,
                        sequencing_library_id=gsc_library_id,
                        read_type=read_type,
                    )
                ],
                size=os.path.getsize(fastq_path),
                created=pd.Timestamp(
                    time.ctime(os.path.getmtime(fastq_path)),
                    tz="Canada/Pacific"),
                file_type="FQ",
                read_end=read_end,
                index_sequence=index_sequence,
                compression=compression,
                filename=tantalus_filename,
            ))

        flowcells_to_be_created.append(flowcell_id + '_' + str(lane_number))

    fastq_paired_end_check(fastq_file_info)

    create_sequence_dataset_models(
        fastq_file_info, storage["name"], tag_name, tantalus_api)

    logging.info('import succeeded')

    return flowcells_to_be_created
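
# filename_pattern_map is defined elsewhere in the module. From the way it is
# consumed above -- .get(filename_pattern, (None, None)) unpacked into
# (read_end, passed) -- it maps each GSC filename pattern to the read end it
# represents and whether that file passed chastity filtering. A hypothetical
# illustration of its shape (the actual patterns live in the real module):
_example_filename_pattern_map = {
    "_1.fastq.gz": (1, True),
    "_1_chastity_passed.fastq.gz": (1, True),
    "_1_chastity_failed.fastq.gz": (1, False),
    "_2.fastq.gz": (2, True),
    "_2_chastity_passed.fastq.gz": (2, True),
    "_2_chastity_failed.fastq.gz": (2, False),
}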