def main(storage_name, bam_file_path, **kwargs):
    """
    Imports the bam into tantalus by creating a sequence dataset and
    file resources
    """
    logging.basicConfig(format=LOGGING_FORMAT, stream=sys.stderr, level=logging.INFO)

    tantalus_api = TantalusApi()

    sample = None
    if kwargs.get('sample_id') is not None:
        sample = tantalus_api.get_or_create(
            'sample',
            sample_id=kwargs['sample_id'],
        )

    library = None
    if kwargs.get('library_id') is not None:
        if kwargs.get('library_type') is not None and kwargs.get('index_format') is not None:
            library = tantalus_api.get_or_create(
                'dna_library',
                library_id=kwargs['library_id'],
                library_type=kwargs['library_type'],
                index_format=kwargs['index_format'],
            )
        else:
            library = tantalus_api.get(
                'dna_library',
                library_id=kwargs['library_id'],
            )

    dataset = import_bam(
        storage_name,
        bam_file_path,
        sample=sample,
        library=library,
        read_type=kwargs.get('read_type'),
        ref_genome=kwargs.get('ref_genome'),
        update=kwargs.get('update'),
        tag_name=kwargs.get('tag_name'),
    )

    print("dataset {}".format(dataset["id"]))
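
# A minimal usage sketch of the importer above. The storage name, bam path, and
# sample/library identifiers below are hypothetical placeholders, not values
# defined elsewhere in this repo.
def _example_import_bam():
    main(
        "shahlab",
        "/path/to/example.bam",
        sample_id="SA123",
        library_id="A90000",
        library_type="WGS",
        index_format="N",
        ref_genome="HG19",
        update=False,
    )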
def add_generic_dataset(**kwargs):
    tantalus_api = TantalusApi()

    file_resource_pks = []

    sample = tantalus_api.get_or_create("sample", sample_id=kwargs['sample_id'])

    library = tantalus_api.get_or_create("dna_library", library_id=kwargs['library_id'])

    # Add the file resources to tantalus
    for filepath in kwargs['filepaths']:
        logging.info("Adding file resource for {} to Tantalus".format(filepath))
        resource, instance = tantalus_api.add_file(
            storage_name=kwargs['storage_name'],
            filepath=filepath,
            update=kwargs['update'],
        )
        file_resource_pks.append(resource["id"])

    if "tag_name" in kwargs:
        tag = tantalus_api.get("tag", name=kwargs["tag_name"])
        tags = [tag["id"]]
    else:
        tags = []

    ref_genome = kwargs.get("reference_genome")
    aligner = kwargs.get("aligner")

    # Add the dataset to tantalus
    sequence_dataset = tantalus_api.get_or_create(
        "sequence_dataset",
        name=kwargs['dataset_name'],
        dataset_type=kwargs['dataset_type'],
        sample=sample["id"],
        library=library["id"],
        sequence_lanes=kwargs["sequence_lane_pks"],
        file_resources=file_resource_pks,
        reference_genome=ref_genome,
        aligner=aligner,
        tags=tags,
    )

    logging.info("Successfully created sequence dataset with ID {}".format(sequence_dataset["id"]))
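
# A hedged usage sketch of add_generic_dataset. All identifiers, paths, lane
# primary keys, and the storage name below are hypothetical placeholders; the
# required kwargs are passed explicitly because the function indexes them directly.
def _example_add_generic_dataset():
    add_generic_dataset(
        sample_id="SA123",
        library_id="A90000",
        storage_name="shahlab",
        filepaths=["/path/to/example.bam", "/path/to/example.bam.bai"],
        dataset_name="BAM-SA123-A90000-example",
        dataset_type="BAM",
        sequence_lane_pks=[1, 2],
        reference_genome="HG19",
        aligner="BWA_MEM",
        update=False,
    )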
def add_analysis(**kwargs):
    tantalus_api = TantalusApi()

    # Create a new analysis object
    analysis = tantalus_api.get_or_create(
        "analysis",
        name=kwargs['name'],
        jira_ticket=kwargs['jira_id'],
        analysis_type=kwargs['type'],
        version=kwargs['version'],
    )

    logging.info("Successfully created analysis with ID {}".format(analysis["id"]))
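
# A hedged usage sketch of add_analysis; the name, JIRA ticket, analysis type,
# and version are hypothetical placeholders.
def _example_add_analysis():
    add_analysis(
        name="example_analysis_run",
        jira_id="SC-0000",
        type="align",
        version="v0.0.1",
    )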
def main(**kwargs):
    """ Queries the GSC for WGS bams. Transfers bams to the specified storage if
    necessary and uploads metadata to tantalus.

    Args:
        ids: (list) internal IDs to query the GSC for
        storage: (string) destination storage to transfer bams to
        id_type: (string) type of ID specified (either sample or library)
        skip_older_than: (string) skip bams older than this date
        tag_name: (string) tag name to associate the resulting sequence datasets with when importing into tantalus
        update: (flag) specifies whether metadata in tantalus is to be updated or not
        skip_file_import: (flag) import only new lanes into tantalus
        query_only: (flag) only query for the bam paths on the GSC
    """
    # Check if this script is being run on this host
    # If not, connect to an ssh client to access /projects/files
    if socket.gethostname() != "txshah":
        ssh_client = connect_to_client("10.9.208.161")
        sftp = ssh_client.open_sftp()
    else:
        sftp = None

    # Connect to the Tantalus API
    tantalus_api = TantalusApi()
    storage = tantalus_api.get_storage(kwargs["storage"])

    # Convert the date to the format we want
    skip_older_than = None
    if kwargs["skip_older_than"]:
        skip_older_than = valid_date(kwargs["skip_older_than"])

    # Check that an ID type was specified
    if not kwargs["id_type"]:
        raise Exception("Please specify an ID type (sample or library)")

    details = []
    for identifier in kwargs["ids"]:
        # Query the GSC to see if the ID exists
        infos = query_gsc(identifier, kwargs["id_type"])

        if not infos:
            logging.info("No results for {} {}. Skipping import".format(
                kwargs["id_type"], identifier))
        else:
            logging.info("{} {} exists on the GSC".format(
                kwargs["id_type"], identifier))

            # Get the data from GSC
            details = get_gsc_details(
                infos,
                skip_older_than=skip_older_than,
            )

        # Import and transfer each file
        for detail in details:
            # Rename the bams according to internal templates
            bam_paths = rename_bam_paths(detail, storage, sftp)

            # If the bam path does not exist at the source, skip
            # the transfer and import
            if not bam_paths["source_bam_path"]:
                break

            # Skip import if we only wanted to query for paths
            if kwargs["query_only"]:
                continue

            if not kwargs["skip_file_import"]:
                # Transfer the bam to the specified storage
                transfer_gsc_bams(detail, bam_paths, storage, sftp)

                # Add the files to Tantalus
                logging.info("Importing {} to Tantalus".format(
                    bam_paths["tantalus_bam_path"]))
                dataset = import_bam(
                    storage_name=storage["name"],
                    bam_file_path=bam_paths["tantalus_bam_path"],
                    sample=detail["sample"],
                    library=detail["library"],
                    lane_infos=detail["lane_info"],
                    read_type=detail["read_type"],
                    tag_name=kwargs["tag_name"],
                    update=kwargs["update"],
                )
                logging.info("Successfully added sequence dataset with ID {}".format(
                    dataset["id"]))
            else:
                logging.info("Importing library {} to tantalus".format(
                    detail["library"]["library_id"]))
                library_pk = tantalus_api.get_or_create(
                    "dna_library",
                    library_id=detail["library"]["library_id"],
                    library_type=detail["library"]["library_type"],
                    index_format=detail["library"]["index_format"],
                )["id"]

                # Only add lanes, libraries, and samples to tantalus
                logging.info("Importing lanes for library {} to tantalus".format(
                    detail["library"]["library_id"]))
                for lane in detail["lane_info"]:
                    lane = tantalus_api.get_or_create(
                        "sequencing_lane",
                        flowcell_id=lane["flowcell_id"],
                        dna_library=library_pk,
                        read_type=lane["read_type"],
                        lane_number=str(lane["lane_number"]),
                        sequencing_centre="GSC",
                        sequencing_instrument=lane["sequencing_instrument"],
                    )
                    logging.info("Successfully created lane {} in tantalus".format(
                        lane["id"]))
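
# A hedged usage sketch of the GSC import entry point above. The IDs, storage
# name, and date are hypothetical placeholders; every flag is passed explicitly
# because the function indexes kwargs directly rather than using kwargs.get().
def _example_import_gsc_wgs_bams():
    main(
        ids=["A90000", "A90001"],
        storage="shahlab",
        id_type="library",
        skip_older_than="2018-01-01",
        tag_name=None,
        update=False,
        skip_file_import=False,
        query_only=False,
    )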
def add_generic_results(filepaths,
                        storage_name,
                        results_name,
                        results_type,
                        results_version,
                        sample_ids=(),
                        library_ids=(),
                        analysis_pk=None,
                        recursive=False,
                        tag_name=None,
                        update=False,
                        remote_storage_name=None):

    tantalus_api = TantalusApi()

    sample_pks = []
    for sample_id in sample_ids:
        samples = tantalus_api.get(
            "sample",
            sample_id=sample_id,
        )
        sample_pks.append(samples['id'])

    library_pks = []
    for library_id in library_ids:
        librarys = tantalus_api.get(
            "dna_library",
            library_id=library_id,
        )
        library_pks.append(librarys['id'])

    # Add the file resources to tantalus
    file_resource_pks = []
    for filepath in filepaths:
        if recursive:
            logging.info("Recursing directory {}".format(filepath))
            add_filepaths = []
            for (dirpath, dirnames, filenames) in os.walk(filepath):
                for filename in filenames:
                    add_filepaths.append(os.path.join(dirpath, filename))
        else:
            add_filepaths = [filepath]

        for add_filepath in add_filepaths:
            logging.info(
                "Adding file resource for {} to Tantalus".format(add_filepath))
            resource, instance = tantalus_api.add_file(
                storage_name=storage_name,
                filepath=add_filepath,
                update=update,
            )
            file_resource_pks.append(resource["id"])

    results_dataset_fields = dict(
        name=results_name,
        results_type=results_type,
        results_version=results_version,
        analysis=analysis_pk,
        samples=sample_pks,
        libraries=library_pks,
        file_resources=file_resource_pks,
    )

    # Add the results dataset to tantalus
    try:
        results_id = tantalus_api.get(
            "results", name=results_dataset_fields["name"])["id"]
    except NotFoundError:
        results_id = None

    if update and results_id is not None:
        logging.warning("results dataset {} exists, updating".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.update("results", id=results_id, **results_dataset_fields)
    else:
        logging.info("creating results dataset {}".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.get_or_create("results", **results_dataset_fields)

    if tag_name is not None:
        tantalus_api.tag(tag_name, resultsdataset_set=[results_dataset["id"]])

    logging.info("Successfully created results dataset with ID {}".format(
        results_dataset["id"]))

    if remote_storage_name is not None:
        transfer_files.transfer_dataset(
            tantalus_api, results_dataset['id'], "resultsdataset",
            storage_name, remote_storage_name)

    return results_dataset
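
# A hedged usage sketch of add_generic_results. The paths, names, result type,
# and storage name below are hypothetical placeholders.
def _example_add_generic_results():
    add_generic_results(
        filepaths=["/path/to/results_dir"],
        storage_name="shahlab",
        results_name="example_results",
        results_type="alignment_metrics",
        results_version="v0.0.1",
        sample_ids=["SA123"],
        library_ids=["A90000"],
        recursive=True,
        update=False,
    )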
import json
import os

from dbclients.tantalus import TantalusApi


if __name__ == "__main__":
    tantalusApi = TantalusApi()

    print(os.getcwd())

    # Load the dummy file resource metadata
    with open('docker/dummy.json', "r") as json_file:
        data = json.load(json_file)

    storage = tantalusApi.get_or_create(
        "storage_azure_blob",
        name="singlecellblob",
        storage_account="singlecelldata",
        storage_container="data")

    ids = []
    for i in range(1, 81):
        ids.append(i)
        resource = tantalusApi.get_or_create(
            'file_resource',
            file_type=data[i]['file_type'],
            last_updated=data[i]['last_updated'],
            size=data[i]['size'],
            created=data[i]['created'],
            compression=data[i]['compression'],
            filename=data[i]['filename'],
            is_folder=data[i]['is_folder'],
            owner=data[i]['owner'])