import logging

import pysam

# Assumed package layout for the Tantalus client: NotFoundError and
# TantalusApi ship with the dbclients package used throughout this repo.
from dbclients.basicclient import NotFoundError
from dbclients.tantalus import TantalusApi

# get_bam_header_info, get_bam_ref_genome, get_bam_aligner_name,
# add_sequence_dataset and the transfer_files module are assumed to be
# defined elsewhere in this package.


def import_bam(storage_name,
               bam_file_path,
               sample=None,
               library=None,
               lane_infos=None,
               read_type=None,
               ref_genome=None,
               tag_name=None,
               update=False):
    """
    Imports bam into tantalus

    Args:
        storage_name: (string) name of destination storage
        bam_file_path: (string) filepath to bam on destination storage
        sample: (dict) contains sample_id
        library: (dict) contains library_id, library_type, index_format
        lane_infos: (list of dict) each contains flowcell_id, lane_number,
            adapter_index_sequence, sequencing_centre, read_type,
            reference_genome, aligner
        read_type: (string) read type for the run
        ref_genome: (string) reference genome; inferred from the bam header
            if not given
        tag_name: (string) tag to apply to the created dataset
        update: (boolean) update existing file resources if True

    Returns:
        sequence_dataset: (dict) sequence dataset created on tantalus
    """
    tantalus_api = TantalusApi()

    # Get a url allowing access regardless of whether the file
    # is in cloud or local storage
    storage_client = tantalus_api.get_storage_client(storage_name)
    bam_filename = tantalus_api.get_file_resource_filename(storage_name, bam_file_path)
    bam_url = storage_client.get_url(bam_filename)

    bam_header = pysam.AlignmentFile(bam_url).header
    bam_header_info = get_bam_header_info(bam_header)

    if ref_genome is None:
        ref_genome = get_bam_ref_genome(bam_header)

    aligner_name = get_bam_aligner_name(bam_header)

    logging.info(
        f"bam header shows reference genome {ref_genome} and aligner {aligner_name}"
    )

    bai_file_path = None
    if storage_client.exists(bam_filename + ".bai"):
        bai_file_path = bam_file_path + ".bai"
    else:
        logging.info(f"no bam index found at {bam_filename + '.bai'}")

    # If no sample was specified, assume it exists in tantalus and
    # search for it based on header info
    if sample is None:
        if len(bam_header_info["sample_ids"]) != 1:
            raise ValueError(
                f"found sample_ids={bam_header_info['sample_ids']}, please specify an override sample id"
            )
        sample_id = list(bam_header_info["sample_ids"])[0]
        sample = tantalus_api.get('sample', sample_id=sample_id)

    # If no library was specified, assume it exists in tantalus and
    # search for it based on header info
    if library is None:
        if len(bam_header_info["library_ids"]) != 1:
            raise ValueError(
                f"found library_ids={bam_header_info['library_ids']}, please specify an override library id"
            )
        library_id = list(bam_header_info["library_ids"])[0]
        library = tantalus_api.get('dna_library', library_id=library_id)

    # Default to paired end reads
    if read_type is None:
        read_type = 'P'

    # If no lane infos were specified, create them from header info
    if lane_infos is None:
        lane_infos = []
        for lane in bam_header_info["sequence_lanes"]:
            lane_info = {
                "flowcell_id": lane["flowcell_id"],
                "lane_number": lane["lane_number"],
                "library_id": lane["library_id"],
                "sequencing_centre": lane["sequencing_centre"],
                "read_type": read_type,
            }
            lane_infos.append(lane_info)

    # Add the sequence dataset to Tantalus
    sequence_dataset = add_sequence_dataset(
        tantalus_api,
        storage_name=storage_name,
        sample=sample,
        library=library,
        dataset_type="BAM",
        sequence_lanes=lane_infos,
        bam_file_path=bam_file_path,
        reference_genome=ref_genome,
        aligner=aligner_name,
        bai_file_path=bai_file_path,
        tag_name=tag_name,
        update=update,
    )

    return sequence_dataset
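# Usage sketch for import_bam (a minimal example; the storage name, bam
# path, and tag below are hypothetical, not from a real deployment):
#
#   dataset = import_bam(
#       storage_name="singlecellblob",           # hypothetical storage
#       bam_file_path="/data/SA123/sample.bam",  # hypothetical path
#       read_type="P",                           # paired end
#       tag_name="bam_import_2019",              # hypothetical tag
#   )
#
# Sample, library, and lane info are inferred from the bam header when not
# passed explicitly; pass sample/library dicts to override the header values.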
def add_generic_results(filepaths,
                        storage_name,
                        results_name,
                        results_type,
                        results_version,
                        sample_ids=(),
                        library_ids=(),
                        analysis_pk=None,
                        recursive=False,
                        tag_name=None,
                        update=False,
                        remote_storage_name=None):
    """
    Creates a results dataset in tantalus for a set of files

    Args:
        filepaths: (list) filepaths of the results on the source storage
        storage_name: (string) name of the storage containing the files
        results_name: (string) unique name of the results dataset
        results_type: (string) type of the results
        results_version: (string) version of the results
        sample_ids: (iterable) sample ids associated with the results
        library_ids: (iterable) library ids associated with the results
        analysis_pk: (int) primary key of the analysis that produced the results
        recursive: (boolean) treat each filepath as a directory prefix and
            add every file under it
        tag_name: (string) tag to apply to the created dataset
        update: (boolean) update existing file resources and dataset if True
        remote_storage_name: (string) if given, transfer the dataset to this storage

    Returns:
        results_dataset: (dict) results dataset created on tantalus
    """
    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    sample_pks = []
    for sample_id in sample_ids:
        sample = tantalus_api.get(
            "sample",
            sample_id=sample_id,
        )
        sample_pks.append(sample['id'])

    library_pks = []
    for library_id in library_ids:
        library = tantalus_api.get(
            "dna_library",
            library_id=library_id,
        )
        library_pks.append(library['id'])

    # Add the file resources to tantalus
    file_resource_pks = []
    for filepath in filepaths:
        if recursive:
            logging.info("Recursing directory {}".format(filepath))
            filename_prefix = tantalus_api.get_file_resource_filename(storage_name, filepath)
            add_filepaths = []
            for filename in storage_client.list(filename_prefix):
                add_filepaths.append(tantalus_api.get_filepath(storage_name, filename))
        else:
            add_filepaths = [filepath]

        for add_filepath in add_filepaths:
            logging.info("Adding file resource for {} to Tantalus".format(add_filepath))
            resource, instance = tantalus_api.add_file(
                storage_name=storage_name,
                filepath=add_filepath,
                update=update,
            )
            file_resource_pks.append(resource["id"])

    results_dataset_fields = dict(
        name=results_name,
        results_type=results_type,
        results_version=results_version,
        analysis=analysis_pk,
        samples=sample_pks,
        libraries=library_pks,
        file_resources=file_resource_pks,
    )

    # Add the results dataset to tantalus, updating an existing dataset
    # of the same name if requested
    try:
        results_id = tantalus_api.get("results", name=results_dataset_fields["name"])["id"]
    except NotFoundError:
        results_id = None

    if update and results_id is not None:
        logging.warning("results dataset {} exists, updating".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.update("results", id=results_id, **results_dataset_fields)
    else:
        logging.info("creating results dataset {}".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.get_or_create("results", **results_dataset_fields)

    # Tag with the id of the created or updated dataset; results_id may be
    # None here if the dataset was newly created
    if tag_name is not None:
        tantalus_api.tag(tag_name, resultsdataset_set=[results_dataset["id"]])

    logging.info("Successfully created results dataset with ID {}".format(
        results_dataset["id"]))

    if remote_storage_name is not None:
        transfer_files.transfer_dataset(
            tantalus_api,
            results_dataset['id'],
            "resultsdataset",
            storage_name,
            remote_storage_name,
        )

    return results_dataset
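# Usage sketch for add_generic_results (all argument values below are
# hypothetical, chosen only to illustrate the call):
#
#   results = add_generic_results(
#       filepaths=["/results/align/"],  # hypothetical results directory
#       storage_name="shahlab",         # hypothetical storage
#       results_name="align_SA123",     # must be unique in tantalus
#       results_type="align",
#       results_version="v0.1.0",
#       sample_ids=["SA123"],
#       recursive=True,                 # add every file under the directory
#   )
#
# Passing remote_storage_name additionally transfers the new dataset to
# that storage after creation.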