def create_output_datasets(self, storages, update=False):
    """ Create the set of output sequence datasets produced by this analysis. """
    assert len(self.analysis['input_datasets']) == 1
    input_dataset = self.tantalus_api.get(
        'sequence_dataset', id=self.analysis['input_datasets'][0])

    storage_client = self.tantalus_api.get_storage_client(
        storages["working_inputs"])

    metadata_yaml_path = os.path.join(self.bams_dir, "metadata.yaml")
    metadata_yaml = yaml.safe_load(
        storage_client.open_file(metadata_yaml_path))

    name = templates.WGS_SPLIT_BAM_NAME_TEMPLATE.format(
        dataset_type="BAM",
        sample_id=input_dataset["sample"]["sample_id"],
        library_type=input_dataset["library"]["library_type"],
        library_id=input_dataset["library"]["library_id"],
        lanes_hash=get_lanes_hash(input_dataset["sequence_lanes"]),
        aligner=input_dataset['aligner'],
        reference_genome=input_dataset['reference_genome'],
        split_length=self.split_size,
    )

    file_resources = []
    for filename in metadata_yaml["filenames"] + ['metadata.yaml']:
        filepath = os.path.join(
            storage_client.prefix, self.bams_dir, filename)
        file_resource, file_instance = self.tantalus_api.add_file(
            storages["working_inputs"], filepath, update=update)
        file_resources.append(file_resource["id"])

    data = {
        'name': name,
        'version_number': 1,
        'dataset_type': "BAM",
        'sample': input_dataset["sample"]["id"],
        'library': input_dataset["library"]["id"],
        'sequence_lanes': [a["id"] for a in input_dataset["sequence_lanes"]],
        'file_resources': file_resources,
        'aligner': input_dataset["aligner"],
        'reference_genome': input_dataset["reference_genome"],
        'region_split_length': self.split_size,
        'analysis': self.analysis['id'],
    }

    keys = [
        'name',
        'version_number',
    ]

    output_dataset, _ = self.tantalus_api.create(
        'sequencedataset', data, keys, get_existing=True, do_update=update)

    logging.info("Created sequence dataset {}".format(name))

    return [output_dataset]
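# A minimal usage sketch for create_output_datasets, assuming `analysis` is an
# instance of the analysis class this method belongs to, with self.analysis,
# self.bams_dir and self.split_size already populated. The storage name
# "singlecellblob" is a hypothetical placeholder, not a confirmed storage.
def _example_create_output_datasets(analysis):
    storages = {"working_inputs": "singlecellblob"}  # hypothetical storage name
    output_datasets = analysis.create_output_datasets(storages, update=False)
    print(output_datasets[0]["name"])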
@classmethod
def generate_unique_name(cls, tantalus_api, jira, version, args, input_datasets, input_results):
    assert len(input_datasets) == 1
    dataset = tantalus_api.get('sequence_dataset', id=input_datasets[0])
    name = templates.SC_PSEUDOBULK_ANALYSIS_NAME_TEMPLATE.format(
        analysis_type=cls.analysis_type_,
        aligner=dataset['aligner'],
        ref_genome=dataset['reference_genome'],
        library_id=dataset['library']['library_id'],
        sample_id=dataset['sample']['sample_id'],
        lanes_hashed=get_lanes_hash(dataset["sequence_lanes"]),
    )
    return name
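# Hedged usage sketch for the single-input-dataset variant of
# generate_unique_name; `PseudobulkAnalysis` is a hypothetical subclass that
# defines analysis_type_, and the jira ticket, version, and dataset id are
# illustrative values.
def _example_generate_unique_name_single(tantalus_api):
    return PseudobulkAnalysis.generate_unique_name(
        tantalus_api, "SC-0000", "v0.0.1", {},
        input_datasets=[1234], input_results=[])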
def get_tantalus_bam_filename(sample, library, lane_infos):
    """ Creates filename for bam that matches current naming conventions in Tantalus

    Args:
        sample (dict): the sample associated with the bam
        library (dict): the library associated with the bam
        lane_infos (list of dict): lane info associated with the bam

    Returns:
        bam_path (str): the filename for the bam following naming conventions
    """
    lanes_str = get_lanes_hash(lane_infos)

    bam_path = WGS_BAM_NAME_TEMPLATE.format(
        sample_id=sample["sample_id"],
        library_type=library["library_type"],
        library_id=library["library_id"],
        lanes_str=lanes_str,
    )

    return bam_path
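# Hedged usage sketch for get_tantalus_bam_filename: the sample, library and
# lane dictionaries below are illustrative values, not real Tantalus records;
# the exact returned path depends on WGS_BAM_NAME_TEMPLATE.
def _example_get_tantalus_bam_filename():
    sample = {"sample_id": "SA1234"}
    library = {"library_type": "WGS", "library_id": "A12345"}
    lane_infos = [
        {"flowcell_id": "H3LGGCCXY", "lane_number": "1"},
        {"flowcell_id": "H3LGGCCXY", "lane_number": "2"},
    ]
    return get_tantalus_bam_filename(sample, library, lane_infos)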
@classmethod
def generate_unique_name(cls, tantalus_api, jira, version, args, input_datasets, input_results):
    assert len(input_datasets) == 2

    # identify the tumour dataset among the two inputs; fail loudly if
    # neither input matches the requested sample
    tumour_dataset = None
    for dataset_id in input_datasets:
        dataset = tantalus_api.get('sequencedataset', id=dataset_id)
        if dataset['sample']['sample_id'] == args['sample_id']:
            tumour_dataset = dataset

    assert tumour_dataset is not None
    assert tumour_dataset['aligner'].startswith(args['aligner'])
    assert tumour_dataset['reference_genome'] == args['ref_genome']
    assert tumour_dataset['library']['library_id'] == args['library_id']
    assert tumour_dataset['sample']['sample_id'] == args['sample_id']

    name = templates.SC_PSEUDOBULK_ANALYSIS_NAME_TEMPLATE.format(
        analysis_type=cls.analysis_type_,
        aligner=args['aligner'],
        ref_genome=args['ref_genome'],
        library_id=args['library_id'],
        sample_id=args['sample_id'],
        lanes_hashed=get_lanes_hash(tumour_dataset["sequence_lanes"]),
    )
    return name
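# Hedged usage sketch for the tumour/normal variant: args identify the tumour
# dataset among the two inputs. `TumourNormalAnalysis` is a hypothetical
# subclass and all ids and argument values are illustrative.
def _example_generate_unique_name_pair(tantalus_api):
    args = {
        "sample_id": "SA1234",
        "library_id": "A12345",
        "aligner": "BWA_MEM",
        "ref_genome": "HG19",
    }
    return TumourNormalAnalysis.generate_unique_name(
        tantalus_api, "SC-0000", "v0.0.1", args,
        input_datasets=[1234, 5678], input_results=[])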
def import_tenx_fastqs(storage_name, sequencing, no_comments=False, update=False):
    storage_client = tantalus_api.get_storage_client(storage_name)

    # get colossus sequencing id
    sequencing_id = sequencing["id"]

    # get pool id from sequencing
    pool_id = sequencing["tenx_pool"]

    # get colossus tenx pool object
    pool = colossus_api.get("tenxpool", id=pool_id)

    # get pool name
    pool_name = pool['pool_name']

    # get gsc id (this may not have been filled out)
    gsc_pool_id = sequencing["gsc_library_id"]

    # query gsc by gsc pool id
    gsc_pool_infos = gsc_api.query(f"library?name={gsc_pool_id}")

    # if no results were returned, query gsc by our identifier
    # instead i.e. colossus pool name
    if not gsc_pool_infos:
        gsc_pool_infos = gsc_api.query(
            f"library?external_identifier={pool_name}")

        if gsc_pool_infos:
            # get name used internally at gsc
            gsc_pool_id = gsc_pool_infos[0]["name"]

    # try to fetch gsc pool info again
    gsc_pool = gsc_api.query(f"library?name={gsc_pool_id}")

    # if no results found for a second time, exit
    if not gsc_pool:
        logging.info(f"cannot find data for {pool_name}, {gsc_pool_id}")
        return None

    # get id of gsc pool
    pool_id = gsc_pool[0]["id"]

    # get information about sequencing runs of this pool
    run_info = gsc_api.query(f"run?library_id={pool_id}")

    logging.info(f"Importing {pool_name}")

    # init dictionary used for collecting library index pairs
    index_lib = dict()
    pool_libraries = []

    # for each library in the pool, collect the sample and index of the library
    for library in pool["libraries"]:
        # get colossus tenx library
        tenxlib = colossus_api.get("tenxlibrary", id=library)
        library = tenxlib['name']
        pool_libraries.append(library)

        # get sample name
        sample = tenxlib["sample"]["sample_id"]

        # get index name
        index_used = tenxlib["tenxlibraryconstructioninformation"]["index_used"]

        # index always ends with a comma, so strip the comma from the name
        index = index_used.split(",")[0]

        print(f"{tenxlib['name']} {tenxlib['sample']['sample_id']} {index}")

        # add info keyed by index
        index_lib[index] = dict(
            tenxlib=tenxlib,
            library=tenxlib['name'],
            sample=tenxlib['sample']['sample_id'],
        )

    # iterate through all sequencing runs of this pool
    for run in run_info:
        run_id = run["id"]

        # get all libcores of run; in the case of tenx, a libcore
        # represents a colossus tenxlibrary
        libcore = gsc_api.query(
            f"libcore?run_id={run_id}&relations=primer%2Crun%2Clibrary&primer_columns=name"
        )

        gsc_sublibraries = []
        dataset_ids = []

        # skip run if no libcores found
        if not libcore:
            logging.info(f"no libcore for run {run_id}")
            continue

        for lib in libcore:
            lanes = []
            lane_pks = []
            filenames = []

            index = lib["primer"]["name"]
            flowcell_id = lib["run"]["flowcell_id"]
            flowcell = gsc_api.query(f"flowcell?id={flowcell_id}")

            # check that the libcore is associated with a library in the pool
            try:
                tenxlib = index_lib[index]["tenxlib"]
                library = index_lib[index]["library"]
                sample = index_lib[index]["sample"]
            except KeyError as e:
                logging.error(f"Index not found: {e}")
                raise Exception(f"Index not found: {e}")

            # collect sequencing info
            flowcell_id = str(flowcell[0]['lims_flowcell_code'])
            lane_number = str(lib['run']['lane_number'])
            sequencing_date = str(lib["run"]["run_datetime"])
            sequencing_instrument = get_sequencing_instrument(
                lib["run"]["machine"])
            sequencing_instrument = sequencing_instrument_map[
                sequencing_instrument]
            flowcell_lane = f"{flowcell_id}_{lane_number}"

            # get existing data
            existing_data = get_existing_fastq_data(tantalus_api, library)
            if flowcell_lane in existing_data:
                logging.info(f"skipping {flowcell_lane} since already imported")
                continue

            # get internal gsc library name
            gsc_library_id = lib["library"]["name"]

            # update library's gsc name
            colossus_api.update(
                "tenxlibrary", id=tenxlib["id"], gsc_library_id=gsc_library_id)
            gsc_sublibraries.append(gsc_library_id)

            # query for fastqs of the library
            fastqs = gsc_api.query(f"concat_fastq?libcore_id={lib['id']}")
            print(fastqs)

            for fastq in fastqs:
                filename_pattern = fastq["file_type"]["filename_pattern"]
                read_end, passed = filename_pattern_map.get(
                    filename_pattern, (None, None))

                if read_end is None:
                    logging.info(
                        "Unrecognized file type: {}".format(filename_pattern))
                    continue

                # construct fastq name
                new_filename = "_".join([
                    library, sample, "S1", f"L00{lane_number}",
                    f"R{read_end}", "001.fastq.gz"
                ])
                fullpath = os.path.join(
                    storage_client.prefix, library, flowcell_lane, new_filename)
                filenames.append(fullpath)

                # add fastq to cloud storage
                storage_client.create(
                    os.path.join(library, flowcell_lane, new_filename),
                    fastq["data_path"],
                    update=True,
                )

            # if no files were found move onto next library
            if not filenames:
                print(f"no data for run_id: {run_id}; lane {flowcell_id}_{lane_number}")
                continue

            # collect and add lane info
            lane = dict(flowcell_id=flowcell_id, lane_number=str(lane_number))
            lanes.append(lane)

            # create tantalus library
            dna_library = tantalus_api.get_or_create(
                "dna_library",
                library_id=library,
                library_type="SC_RNASEQ",
                index_format="TENX",
            )

            try:
                lane_object = tantalus_api.get(
                    "sequencing_lane",
                    flowcell_id=flowcell_id,
                    lane_number=str(lane_number),
                    dna_library=dna_library["id"],
                )
                tantalus_api.update(
                    "sequencing_lane",
                    id=lane_object["id"],
                    sequencing_centre="GSC",
                    sequencing_instrument=sequencing_instrument,
                    read_type="TENX",
                )
            except NotFoundError:
                lane_object, _ = tantalus_api.create(
                    "sequencing_lane",
                    fields=dict(
                        flowcell_id=flowcell_id,
                        lane_number=str(lane_number),
                        sequencing_centre="GSC",
                        sequencing_instrument=sequencing_instrument,
                        read_type="TENX",
                        dna_library=dna_library["id"],
                    ),
                    keys=[
                        "flowcell_id",
                        "lane_number",
                        "sequencing_centre",
                        "dna_library",
                    ],
                    get_existing=True,
                )

            lane_pks.append(lane_object["id"])

            dataset_name = TENX_SCRNA_DATASET_TEMPLATE.format(
                dataset_type="FQ",
                sample_id=sample,
                library_type="SC_RNASEQ",
                library_id=library,
                lanes_hash=get_lanes_hash(lanes),
            )

            sequence_dataset = add_generic_dataset(
                filepaths=filenames,
                sample_id=sample,
                library_id=library,
                storage_name="scrna_fastq",
                dataset_name=dataset_name,
                dataset_type="FQ",
                sequence_lane_pks=lane_pks,
                reference_genome="HG38",
                update=True,
            )
            dataset_ids.append(sequence_dataset)

            url = f"https://colossus.canadacentral.cloudapp.azure.com/tenx/sequencing/{sequencing_id}"
            comment = f"Import successful:\n\nLane: {flowcell_lane}\nGSC Library ID: {gsc_library_id}\n{url}"

            # comment on the library's jira ticket unless suppressed,
            # skipping if an identical comment already exists
            if not no_comments:
                comments = jira_api.comments(tenxlib["jira_ticket"])
                commented = False
                for c in comments:
                    if c.body == comment:
                        commented = True
                        break

                if not commented:
                    comment_jira(tenxlib["jira_ticket"], comment)

            # create jira ticket
            jira_ticket = create_analysis_jira_ticket(
                library, sample, tenxlib['jira_ticket'])

            # create colossus analysis
            analysis, _ = colossus_api.create(
                "tenxanalysis",
                fields={
                    "version": "vm",
                    "jira_ticket": jira_ticket,
                    "run_status": "idle",
                    "tenx_library": tenxlib["id"],
                    "submission_date": str(datetime.date.today()),
                    "tenxsequencing_set": [],
                },
                keys=["jira_ticket"],
            )

            # create tantalus analysis
            create_tenx_analysis_from_library(jira_ticket, library)

            # check if data has been imported
            if filenames:
                # add lanes to colossus
                colossus_lane = colossus_api.get_or_create(
                    "tenxlane",
                    flow_cell_id=flowcell_lane,
                    sequencing=sequencing_id,
                )

                # update lane with gsc id and date
                colossus_api.update(
                    "tenxlane",
                    id=colossus_lane["id"],
                    tantalus_datasets=list(set(dataset_ids)),
                    gsc_sublibrary_names=gsc_sublibraries,
                    sequencing_date=sequencing_date,
                )

    # check if gsc id hasn't been added correctly
    if sequencing["gsc_library_id"] != gsc_pool_id:
        logging.info("Updating gsc library id of sequencing {} from {} to {}".format(
            sequencing["id"], sequencing["gsc_library_id"], gsc_pool_id))
        colossus_api.update(
            "tenxsequencing", sequencing["id"], gsc_library_id=gsc_pool_id)

    logging.info("Successfully imported {} {}".format(pool_name, gsc_pool_id))

    import_info = dict(
        pool_name=pool_name,
        libraries=pool_libraries,
        gsc_library_id=gsc_pool_id,
    )

    return import_info
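# Hedged usage sketch: fetch a colossus tenx sequencing record and import its
# fastqs. The sequencing id and the "scrna_fastq" storage name are
# illustrative placeholders.
def _example_import_tenx_fastqs():
    sequencing = colossus_api.get("tenxsequencing", id=123)  # hypothetical id
    import_info = import_tenx_fastqs("scrna_fastq", sequencing, update=False)
    if import_info is not None:
        print(import_info["pool_name"], import_info["gsc_library_id"])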
def create_sequence_dataset_models(file_info,
                                   storage_name,
                                   tag_name,
                                   tantalus_api,
                                   analysis_id=None,
                                   update=False):
    """Create tantalus sequence models for a list of files."""
    analysis = None
    if analysis_id is not None:
        analysis = tantalus_api.get('analysis', id=analysis_id)

    # Get storage and tag PKs
    storage = tantalus_api.get("storage", name=storage_name)
    storage_pk = storage["id"]

    # Sort files by dataset
    dataset_info = collections.defaultdict(list)
    for info in file_info:
        if info["dataset_type"] == 'BAM':
            dataset_name = templates.SC_WGS_BAM_NAME_TEMPLATE.format(
                dataset_type=info["dataset_type"],
                sample_id=info["sample_id"],
                library_type=info["library_type"],
                library_id=info["library_id"],
                lanes_hash=get_lanes_hash(info["sequence_lanes"]),
                aligner=info["aligner_name"],
                reference_genome=info["ref_genome"],
                jira_ticket=analysis["jira_ticket"],
            )
        elif info["dataset_type"] == 'FQ':
            dataset_name = templates.SC_WGS_FQ_NAME_TEMPLATE.format(
                dataset_type=info["dataset_type"],
                sample_id=info["sample_id"],
                library_type=info["library_type"],
                library_id=info["library_id"],
                lane=get_lane_str(info["sequence_lanes"][0]),
            )
        else:
            # guard against dataset_name silently carrying over from the
            # previous iteration
            raise ValueError("unrecognized dataset type {}".format(
                info["dataset_type"]))
        dataset_info[dataset_name].append(info)

    # Create datasets
    dataset_ids = set()
    for dataset_name, infos in dataset_info.items():
        # Get library PK
        library = tantalus_api.get_or_create(
            "dna_library",
            library_id=infos[0]["library_id"],
            library_type=infos[0]["library_type"],
            index_format=infos[0]["index_format"],
        )
        library_pk = library["id"]

        # Get sample PK
        sample = tantalus_api.get_or_create(
            "sample",
            sample_id=infos[0]["sample_id"],
        )
        sample_pk = sample["id"]

        # Build up sequence dataset attrs; we'll add to this as we
        # proceed throughout the function
        sequence_dataset = dict(
            name=dataset_name,
            dataset_type=infos[0]["dataset_type"],
            sample=sample_pk,
            library=library_pk,
            sequence_lanes=[],
            file_resources=[],
        )

        # Add in the analysis id if it's provided
        if analysis_id is not None:
            sequence_dataset["analysis"] = analysis_id

        # Add in BAM specific items
        if infos[0]["dataset_type"] == "BAM":
            sequence_dataset["aligner"] = infos[0]["aligner_name"]
            sequence_dataset["reference_genome"] = infos[0]["ref_genome"]

        for info in infos:
            # Check consistency for fields used for dataset
            check_fields = (
                "dataset_type",
                "sample_id",
                "library_id",
                "library_type",
                "index_format",
            )
            for field_name in check_fields:
                if info[field_name] != infos[0][field_name]:
                    raise Exception("error with field {}".format(field_name))

            for sequence_lane in info["sequence_lanes"]:
                sequence_lane = dict(sequence_lane)
                sequence_lane["dna_library"] = library_pk
                sequence_lane["lane_number"] = str(sequence_lane["lane_number"])
                sequence_lane = tantalus_api.get_or_create(
                    "sequencing_lane", **sequence_lane)
                sequence_dataset["sequence_lanes"].append(sequence_lane["id"])

            sequence_file_info = dict(index_sequence=info["index_sequence"])
            if "read_end" in info:
                sequence_file_info["read_end"] = info["read_end"]

            file_resource, file_instance = tantalus_api.add_file(
                storage_name,
                info["filepath"],
                update=update,
            )

            sequence_file_info = tantalus_api.get_or_create(
                "sequence_file_info",
                file_resource=file_resource["id"],
                **sequence_file_info)

            sequence_dataset["file_resources"].append(file_resource["id"])

        try:
            dataset_id = tantalus_api.get(
                "sequence_dataset", name=sequence_dataset["name"])["id"]
        except NotFoundError:
            dataset_id = None

        if update and dataset_id is not None:
            log.warning("sequence dataset {} has changed, updating".format(
                sequence_dataset["name"]))
            dataset = tantalus_api.update(
                "sequence_dataset", id=dataset_id, **sequence_dataset)
        else:
            log.info("creating sequence dataset {}".format(
                sequence_dataset["name"]))
            dataset = tantalus_api.get_or_create(
                "sequence_dataset", **sequence_dataset)

        if tag_name is not None:
            tantalus_api.tag(tag_name, sequencedataset_set=[dataset['id']])

        dataset_ids.add(dataset['id'])

    return dataset_ids
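# Hedged usage sketch for create_sequence_dataset_models: the file_info entry
# below shows the fields the function reads for an FQ dataset; all values,
# the storage name and the omitted tag are illustrative.
def _example_create_sequence_dataset_models(tantalus_api):
    file_info = [dict(
        dataset_type="FQ",
        sample_id="SA1234",
        library_id="A12345",
        library_type="SC_WGS",
        index_format="D",
        index_sequence="ACGT-TGCA",
        read_end=1,
        filepath="/path/to/SA1234_R1.fastq.gz",
        sequence_lanes=[dict(flowcell_id="H3LGGCCXY", lane_number="1")],
    )]
    return create_sequence_dataset_models(
        file_info, "singlecellblob", None, tantalus_api)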
def add_sequence_dataset(tantalus_api,
                         storage_name,
                         sample,
                         library,
                         dataset_type,
                         sequence_lanes,
                         bam_file_path,
                         reference_genome,
                         aligner,
                         bai_file_path=None,
                         tag_name=None,
                         update=False):
    """ Add a sequence dataset, getting or creating the required sample,
    library, and sequence lanes for the dataset.

    Args:
        storage_name (str)
        sample (dict): contains: sample_id
        library (dict): contains: library_id, library_type, index_format
        dataset_type (str)
        sequence_lanes (list of dict): contains: flowcell_id, read_type,
            lane_number, sequencing_centre, sequencing_instrument, library_id
        bam_file_path (str): bam file path to data included in dataset
        reference_genome (str)
        aligner (str)
        bai_file_path (str): bam index file path to data included in dataset (optional)
        tag_name (str): name of tag to apply to the dataset (optional)

    Returns:
        sequence_dataset (dict)
    """
    # Create the sample
    sample = tantalus_api.get_or_create(
        "sample",
        sample_id=sample['sample_id'],
    )

    # Create the library
    library = tantalus_api.get_or_create(
        "dna_library",
        library_id=library["library_id"],
        library_type=library["library_type"],
        index_format=library["index_format"],
    )

    # Create the sequence lanes
    sequence_lane_pks = []
    for lane in sequence_lanes:
        # Get library ID associated with each lane
        lane_library_pk = tantalus_api.get_or_create(
            "dna_library",
            library_id=lane["library_id"],
            library_type=library["library_type"],
            index_format=library["index_format"],
        )["id"]

        lane_fields = dict(
            dna_library=lane_library_pk,
            flowcell_id=lane["flowcell_id"],
            lane_number=str(lane["lane_number"]),
        )

        # Optional fields for create
        for field_name in ("read_type", "sequencing_centre", "sequencing_instrument"):
            if field_name in lane:
                lane_fields[field_name] = lane[field_name]
            else:
                logging.warning(
                    f"field {field_name} missing for lane {lane['flowcell_id']}_{lane['lane_number']}"
                )

        lane_pk = tantalus_api.get_or_create("sequencing_lane", **lane_fields)["id"]
        sequence_lane_pks.append(lane_pk)

    # Create the tag
    if tag_name is not None:
        tag_pk = tantalus_api.get_or_create("tag", name=tag_name)["id"]
        tags = [tag_pk]
    else:
        tags = []

    # Create the file resources
    file_resource_pks = []
    file_resource, file_instance = tantalus_api.add_file(
        storage_name, bam_file_path, update=update)
    file_resource_pks.append(file_resource["id"])

    if bai_file_path is not None:
        file_resource, file_instance = tantalus_api.add_file(
            storage_name, bai_file_path, update=update)
        file_resource_pks.append(file_resource["id"])

    dataset_name = templates.WGS_BAM_NAME_TEMPLATE.format(
        dataset_type="BAM",
        sample_id=sample["sample_id"],
        library_type=library["library_type"],
        library_id=library["library_id"],
        lanes_hash=get_lanes_hash(sequence_lanes),
        aligner=aligner,
        reference_genome=reference_genome,
    )

    # Find all similarly named datasets
    similar_datasets = list(
        tantalus_api.list(
            "sequence_dataset",
            name=dataset_name,
        ))

    # Filter for a similarly named dataset with the same files
    existing_dataset = None
    for dataset in similar_datasets:
        if set(dataset['file_resources']) == set(file_resource_pks):
            existing_dataset = dataset
            logging.info(
                f"found existing dataset {dataset['id']} with identical file list")
            break
        elif set(dataset['file_resources']).intersection(set(file_resource_pks)):
            raise ValueError(
                f"dataset {dataset['id']} has files {dataset['file_resources']} "
                f"partially intersecting with {list(file_resource_pks)}")

    if existing_dataset is not None:
        # Get or create to check field consistency
        sequence_dataset = tantalus_api.get_or_create(
            "sequence_dataset",
            name=dataset_name,
            version_number=existing_dataset['version_number'],
            dataset_type=dataset_type,
            sample=sample["id"],
            library=library["id"],
            sequence_lanes=sequence_lane_pks,
            file_resources=file_resource_pks,
            reference_genome=reference_genome,
            aligner=aligner,
        )

        # Update the existing dataset tags
        tag_ids = tags + existing_dataset["tags"]
        sequence_dataset = tantalus_api.update(
            "sequence_dataset",
            id=existing_dataset["id"],
            tags=tag_ids,
        )

    else:
        # Find a new version number if necessary
        version_number = 1
        if len(similar_datasets) > 0:
            version_number = max(d['version_number'] for d in similar_datasets) + 1
            logging.info(
                f"creating new version of dataset {dataset_name} "
                f"with version number {version_number}")

        fields = {
            'name': dataset_name,
            'version_number': version_number,
            'dataset_type': dataset_type,
            'sample': sample["id"],
            'library': library["id"],
            'sequence_lanes': sequence_lane_pks,
            'file_resources': file_resource_pks,
            'reference_genome': reference_genome,
            'aligner': aligner,
            'tags': tags,
        }

        sequence_dataset, is_updated = tantalus_api.create(
            "sequence_dataset", fields, keys=["name", "version_number"])

    return sequence_dataset
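# Hedged usage sketch for add_sequence_dataset; the sample/library ids, file
# paths, storage name, and aligner string are illustrative values only.
def _example_add_sequence_dataset(tantalus_api):
    return add_sequence_dataset(
        tantalus_api,
        storage_name="shahlab",
        sample={"sample_id": "SA1234"},
        library={"library_id": "A12345", "library_type": "WGS", "index_format": "N"},
        dataset_type="BAM",
        sequence_lanes=[dict(
            flowcell_id="H3LGGCCXY",
            lane_number="1",
            read_type="PAIRED",
            sequencing_centre="GSC",
            sequencing_instrument="HiSeqX",
            library_id="A12345",
        )],
        bam_file_path="/path/to/SA1234.bam",
        reference_genome="HG19",
        aligner="BWA_MEM_0_7_6A",
        bai_file_path="/path/to/SA1234.bam.bai",
    )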
def tantalus_import(library_id,
                    sample_id,
                    lane_infos,
                    blob_paths,
                    sequencing_centre,
                    dataset_type,
                    storage_name,
                    tag_name=None,
                    update=False):
    """ Import a tenx sequence dataset and its file resources into tantalus

    Args:
        library_id (str): internal name for the library
        sample_id (str): internal name for the sample
        lane_infos (list): a list of dictionaries containing flowcell ID,
            lane number, and sequencing instrument
        blob_paths (list): a list of filepaths to the FASTQs on azure storage
        sequencing_centre (str): GSC or BRC according to where the library was sequenced
        dataset_type (str): FQ, BAM, or BCL
        storage_name (str): name of the azure storage in tantalus
        tag_name (str): name of the tag associated with the dataset
        update (bool): whether to update any information already in tantalus

    Returns:
        sequence_dataset["id"]: ID of the newly created sequence dataset
    """
    file_resource_ids, file_instance_ids, sequence_lanes, sequence_lanes_pks = [], [], [], []

    sample_pk = tantalus_api.get_or_create(
        "sample",
        sample_id=sample_id,
    )["id"]

    library_pk = tantalus_api.get_or_create(
        "dna_library",
        library_id=library_id,
        library_type="SC_RNASEQ",
        index_format="TENX",
    )["id"]

    # Add the file resources to tantalus
    for blob_path in blob_paths:
        file_resource, file_instance = tantalus_api.add_file(
            storage_name, blob_path, update=update)
        file_resource_ids.append(file_resource["id"])
        file_instance_ids.append(file_instance["id"])

    logging.info("Adding lanes to Tantalus")
    for lane_info in lane_infos:
        # Try to find a match for the sequencing instrument
        try:
            sequencing_instrument = TANTALUS_SEQUENCING_MAP[
                lane_info["sequencing_instrument"]]
        except KeyError:
            sequencing_instrument = lane_info["sequencing_instrument"]

        lane = tantalus_api.get_or_create(
            "sequencing_lane",
            flowcell_id=lane_info["flowcell_id"],
            dna_library=library_pk,
            read_type="TENX",
            lane_number=str(lane_info["lane_number"]),
            sequencing_centre=sequencing_centre,
            sequencing_instrument=sequencing_instrument,
        )
        sequence_lanes.append(lane)
        sequence_lanes_pks.append(lane["id"])

    dataset_name = TENX_SCRNA_DATASET_TEMPLATE.format(
        dataset_type=dataset_type,
        sample_id=sample_id,
        library_type="SC_RNASEQ",
        library_id=library_id,
        lanes_hash=get_lanes_hash(sequence_lanes),
    )

    # Create tags
    if tag_name is not None:
        tag_pk = tantalus_api.get_or_create("tag", name=tag_name)["id"]
        tags = [tag_pk]
    else:
        tags = []

    # Add the sequence dataset to tantalus
    sequence_dataset = tantalus_api.get_or_create(
        "sequence_dataset",
        name=dataset_name,
        dataset_type=dataset_type,
        sample=sample_pk,
        library=library_pk,
        sequence_lanes=sequence_lanes_pks,
        file_resources=file_resource_ids,
        tags=tags,
    )

    logging.info("Sequence dataset has ID {}".format(sequence_dataset["id"]))

    return sequence_dataset["id"]
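# Hedged usage sketch for tantalus_import; the flowcell, blob paths and
# storage name are illustrative values, not real records.
def _example_tantalus_import():
    lane_infos = [dict(
        flowcell_id="H3LGGCCXY",
        lane_number="1",
        sequencing_instrument="NovaSeq",
    )]
    blob_paths = [
        "A12345/H3LGGCCXY_1/A12345_SA1234_S1_L001_R1_001.fastq.gz",
        "A12345/H3LGGCCXY_1/A12345_SA1234_S1_L001_R2_001.fastq.gz",
    ]
    return tantalus_import(
        "A12345", "SA1234", lane_infos, blob_paths,
        sequencing_centre="GSC",
        dataset_type="FQ",
        storage_name="scrna_fastq",
    )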