def process_cellenone_dataset(dataset, storage_name, tag_name=None, update=False, remote_storage_name=None):
    """Process Cellenone images for a single-library results dataset.

    Checks that the dataset is present on the given storage and that every
    file in it lives under one directory of the form
    single_cell_indexing/Cellenone/Cellenone_images/{date}_{library_id},
    then hands that directory off to process_cellenone_images.

    Args:
        dataset: results dataset dict with exactly one library
        storage_name: tantalus storage holding the raw image files
        tag_name: optional tag forwarded to process_cellenone_images
        update: forwarded to process_cellenone_images
        remote_storage_name: forwarded to process_cellenone_images

    Raises:
        ValueError: dataset not on storage, or a file is outside the
            expected Cellenone images directory.
    """
    assert len(dataset['libraries']) == 1
    library_id = dataset['libraries'][0]['library_id']

    tantalus_api = TantalusApi()

    if not tantalus_api.is_dataset_on_storage(dataset['id'], 'resultsdataset', storage_name):
        raise ValueError(
            f"dataset {dataset['id']} not on storage {storage_name}")

    # Every raw file is assumed to sit under:
    #   single_cell_indexing/Cellenone/Cellenone_images/{date}_{library_id}
    filename_prefix = 'single_cell_indexing/Cellenone/Cellenone_images/'

    source_dir = None
    for file_resource in tantalus_api.get_dataset_file_resources(dataset['id'], 'resultsdataset'):
        filename = file_resource['filename']

        if source_dir is not None:
            # All subsequent files must share the directory found first.
            if not filename.startswith(source_dir):
                raise ValueError(
                    f"file {file_resource['filename']} is not in directory {source_dir}"
                )
            continue

        # First file: derive the source directory and validate its shape.
        if not filename.startswith(filename_prefix):
            raise ValueError(
                f"file {file_resource['filename']} is not in directory {filename_prefix}"
            )

        path_parts = filename.split('/')
        library_subdir = path_parts[3]
        if not library_subdir.endswith(library_id):
            raise ValueError(
                f"file {file_resource['filename']} is not in a directory ending with {library_id}"
            )

        source_dir = '/'.join(path_parts[:4])

    assert source_dir is not None

    source_dir = tantalus_api.get_filepath(storage_name, source_dir)

    process_cellenone_images(
        library_id,
        source_dir,
        storage_name,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
def create_fastq_metadata_yaml(library_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for all FQ datasets for a library id.

    Writes one metadata.yaml per lane-fastq dataset group to the given
    storage and, unless dry_run, registers the file in tantalus and
    attaches it to each associated sequence dataset.

    Args:
        library_id: library whose fastq datasets get metadata files
        storage_name: tantalus storage to write metadata.yaml files to
        dry_run: if True, write the files but skip tantalus registration
    """
    tantalus_api = TantalusApi()

    # Looked up (but not otherwise used) so a bad storage name fails early.
    tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    for dataset_info, metadata in create_lane_fastq_metadata(tantalus_api, library_id):
        metadata_filename = os.path.join(dataset_info['base_dir'], 'metadata.yaml')
        metadata_filepath = tantalus_api.get_filepath(storage_name, metadata_filename)

        metadata_io = io.BytesIO()
        metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

        logging.info(f'writing metadata to file {metadata_filepath}')
        client.write_data(metadata_filename, metadata_io)

        logging.info(f'adding {metadata_filepath} to tantalus')

        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(storage_name, metadata_filepath, update=True)

            # Attach the new metadata file to every dataset it describes.
            for dataset_id in dataset_info['dataset_ids']:
                dataset = tantalus_api.get('sequencedataset', id=dataset_id)

                new_file_resources = set(dataset['file_resources'])
                new_file_resources.add(file_resource['id'])

                tantalus_api.update('sequencedataset', id=dataset_id, file_resources=list(new_file_resources))
def add_fastq_metadata_yaml(dataset_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for a dataset and add to tantalus.

    Args:
        dataset_id: sequence dataset to generate metadata for
        storage_name: tantalus storage to write metadata.yaml to
        dry_run: if True, write the file but skip tantalus registration
    """
    tantalus_api = TantalusApi()

    # Looked up (but not otherwise used) so a bad storage name fails early.
    tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    metadata, base_dir = create_lane_fastq_metadata(tantalus_api, dataset_id)

    metadata_filename = os.path.join(base_dir, 'metadata.yaml')
    metadata_filepath = tantalus_api.get_filepath(storage_name, metadata_filename)

    metadata_io = io.BytesIO()
    metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

    # Use logging rather than print for consistency with the sibling
    # create_fastq_metadata_yaml in this module.
    logging.info(f'writing metadata to file {metadata_filepath}')
    client.write_data(metadata_filename, metadata_io)

    logging.info(f'adding {metadata_filepath} to tantalus')

    if not dry_run:
        file_resource, file_instance = tantalus_api.add_file(storage_name, metadata_filepath, update=True)

        dataset = tantalus_api.get('sequencedataset', id=dataset_id)

        new_file_resources = set(dataset['file_resources'])
        new_file_resources.add(file_resource['id'])

        tantalus_api.update('sequencedataset', id=dataset_id, file_resources=list(new_file_resources))
def add_generic_results(filepaths, storage_name, results_name, results_type, results_version, sample_ids=(), library_ids=(), analysis_pk=None, recursive=False, tag_name=None, update=False, remote_storage_name=None):
    """Register a set of existing files as a results dataset in tantalus.

    Args:
        filepaths: file paths (or directories if recursive) on the storage
        storage_name: tantalus storage holding the files
        results_name: unique name for the results dataset
        results_type: results dataset type
        results_version: results dataset version
        sample_ids: sample ids to associate with the dataset
        library_ids: dna library ids to associate with the dataset
        analysis_pk: optional analysis primary key
        recursive: if True, treat each filepath as a directory prefix and
            add every file beneath it
        tag_name: optional tag to apply to the dataset
        update: if True, update existing file resources and dataset
        remote_storage_name: optional storage to transfer the dataset to

    Returns:
        The created or updated results dataset.
    """
    tantalus_api = TantalusApi()
    storage_client = tantalus_api.get_storage_client(storage_name)

    # Resolve sample/library ids to tantalus primary keys.
    sample_pks = []
    for sample_id in sample_ids:
        samples = tantalus_api.get(
            "sample",
            sample_id=sample_id,
        )
        sample_pks.append(samples['id'])

    library_pks = []
    for library_id in library_ids:
        librarys = tantalus_api.get(
            "dna_library",
            library_id=library_id,
        )
        library_pks.append(librarys['id'])

    # Add the file resources to tantalus
    file_resource_pks = []
    for filepath in filepaths:
        if recursive:
            logging.info("Recursing directory {}".format(filepath))
            filename_prefix = tantalus_api.get_file_resource_filename(storage_name, filepath)
            add_filepaths = []
            for filename in storage_client.list(filename_prefix):
                add_filepaths.append(tantalus_api.get_filepath(storage_name, filename))
        else:
            add_filepaths = [filepath]

        for add_filepath in add_filepaths:
            logging.info("Adding file resource for {} to Tantalus".format(add_filepath))
            resource, instance = tantalus_api.add_file(
                storage_name=storage_name,
                filepath=add_filepath,
                update=update,
            )
            file_resource_pks.append(resource["id"])

    results_dataset_fields = dict(
        name=results_name,
        results_type=results_type,
        results_version=results_version,
        analysis=analysis_pk,
        samples=sample_pks,
        libraries=library_pks,
        file_resources=file_resource_pks,
    )

    # Add the dataset to tantalus
    try:
        results_id = tantalus_api.get("results", name=results_dataset_fields["name"])["id"]
    except NotFoundError:
        results_id = None

    if update and results_id is not None:
        logging.warning("results dataset {} exists, updating".format(results_dataset_fields["name"]))
        results_dataset = tantalus_api.update("results", id=results_id, **results_dataset_fields)
    else:
        logging.info("creating results dataset {}".format(results_dataset_fields["name"]))
        results_dataset = tantalus_api.get_or_create("results", **results_dataset_fields)

    if tag_name is not None:
        # Bug fix: tag the dataset actually created/updated. results_id is
        # None when the dataset was newly created, so tagging must use the
        # id from results_dataset, not the pre-lookup results_id.
        tantalus_api.tag(tag_name, resultsdataset_set=[results_dataset['id']])

    logging.info("Successfully created results dataset with ID {}".format(results_dataset["id"]))

    if remote_storage_name is not None:
        transfer_files.transfer_dataset(tantalus_api, results_dataset['id'], "resultsdataset", storage_name, remote_storage_name)

    return results_dataset