Example #1
def glob_cellenone_data(filepaths, storage_name, tag_name=None, update=False, remote_storage_name=None):

    tantalus_api = TantalusApi()

    for filepath in filepaths:
        match = re.match(r".*/single_cell_indexing/Cellenone/Cellenone_images/(\d+)_(A\d+[A-Z]*)", filepath)
        if match is None:
            logging.warning('skipping malformed {}'.format(filepath))
            continue

        fields = match.groups()
        date = fields[0]
        library_id = fields[1]

        try:
            tantalus_api.get('dna_library', library_id=library_id)
        except NotFoundError:
            logging.warning('skipping file with unknown library {}'.format(filepath))
            continue

        try:
            process_cellenone_images(
                library_id,
                filepath,
                storage_name,
                tag_name=tag_name,
                update=update,
                remote_storage_name=remote_storage_name,
            )
        except ValueError:
            logging.exception(f'unable to process {library_id}, {filepath}')
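A minimal, self-contained sketch of what the path regex above extracts; the example path is illustrative, not taken from the source:

import re

# Illustrative path following the layout the regex above expects.
example_path = "/data/single_cell_indexing/Cellenone/Cellenone_images/20190501_A96213A"

match = re.match(
    r".*/single_cell_indexing/Cellenone/Cellenone_images/(\d+)_(A\d+[A-Z]*)",
    example_path)
if match is not None:
    date, library_id = match.groups()
    print(date, library_id)  # 20190501 A96213A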
Example #2
def process_cellenone_images(
    library_id,
    source_dir,
    storage_name,
    tag_name=None,
    update=False,
    remote_storage_name=None,
):

    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_IMAGES_{}'.format(library_id)
    results_type = 'CELLENONE_IMAGES'
    results_version = 'v1'

    try:
        existing_results = tantalus_api.get('results', name=results_name)
    except NotFoundError:
        existing_results = None

    if existing_results is not None and not update:
        logging.info(f'results for {library_id} exist, not processing')
        return

    storage = tantalus_api.get('storage', name=storage_name)
    storage_directory = storage['storage_directory']

    destination_dir = os.path.join(
        storage_directory,
        'single_cell_indexing',
        'Cellenone',
        'Cellenone_processed',
        library_id,
        results_version,
    )

    os.makedirs(destination_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        filepaths = catalog_images(library_id, source_dir, destination_dir,
                                   temp_dir)

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=False,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
Example #3
def cache_tagged_datasets(tag_name,
                          from_storage_name,
                          cache_directory,
                          suffix_filter=None):
    """ Cache a set of tagged datasets
    """

    tantalus_api = TantalusApi()

    tag = tantalus_api.get("tag", name=tag_name)

    for dataset_id in tag['sequencedataset_set']:
        cache_dataset(tantalus_api,
                      dataset_id,
                      "sequencedataset",
                      from_storage_name,
                      cache_directory,
                      suffix_filter=suffix_filter)

    for dataset_id in tag['resultsdataset_set']:
        cache_dataset(tantalus_api,
                      dataset_id,
                      "resultsdataset",
                      from_storage_name,
                      cache_directory,
                      suffix_filter=suffix_filter)
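A hedged usage sketch for the function above; the tag name, storage name, cache directory and suffix are placeholders, not values from the source:

# Illustrative invocation of cache_tagged_datasets; all argument values are placeholders.
cache_tagged_datasets(
    tag_name='my_analysis_tag',
    from_storage_name='singlecellblob',
    cache_directory='/tmp/tantalus_cache',
    suffix_filter='.bam',  # presumably restricts caching to files ending with .bam
)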
Example #4
def add_microscope_results(filepaths,
                           chip_id,
                           library_ids,
                           storage_name,
                           tag_name=None,
                           update=False,
                           remote_storage_name=None):
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'MICROSCOPE_{}'.format(chip_id)
    results_type = 'MICROSCOPE'
    results_version = None

    try:
        existing_results = tantalus_api.get('results', name=results_name)
    except NotFoundError:
        existing_results = None

    if existing_results is not None and not update:
        logging.info(f'results for {chip_id} exist, not processing')
        return

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=library_ids,
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
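A hedged usage sketch for add_microscope_results; the chip id, library id, glob pattern and storage name below are illustrative placeholders:

import glob

# Illustrative invocation; the glob pattern and argument values are placeholders.
filepaths = glob.glob('/data/microscope/CHIP0001/*')
add_microscope_results(
    filepaths,
    chip_id='CHIP0001',
    library_ids=['A96213A'],
    storage_name='singlecellblob',
    update=False,
)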
Example #5
def add_generic_dataset(**kwargs):
    tantalus_api = TantalusApi()

    file_resource_pks = []

    sample = tantalus_api.get("sample", sample_id=kwargs['sample_id'])

    library = tantalus_api.get("dna_library", library_id=kwargs['library_id'])

    #Add the file resource to tantalus
    for filepath in kwargs['filepaths']:
        logging.info(
            "Adding file resource for {} to Tantalus".format(filepath))
        resource, instance = tantalus_api.add_file(
            storage_name=kwargs['storage_name'],
            filepath=filepath,
            update=kwargs['update'])
        file_resource_pks.append(resource["id"])

    if "tag_name" in kwargs:
        tag = tantalus_api.get("tag", name=kwargs["tag_name"])
        tags = [tag["id"]]
    else:
        tags = []

    ref_genome = kwargs.get("reference_genome")
    aligner = kwargs.get("aligner")

    if "sequence_lane_pks" in kwargs:
        sequence_pks = map(str, kwargs["sequence_lane_pks"])

    #Add the dataset to tantalus
    sequence_dataset = tantalus_api.get_or_create(
        "sequence_dataset",
        name=kwargs['dataset_name'],
        dataset_type=kwargs['dataset_type'],
        sample=sample["id"],
        library=library["id"],
        sequence_lanes=sequence_pks,
        file_resources=file_resource_pks,
        reference_genome=ref_genome,
        aligner=aligner,
        tags=tags,
    )

    logging.info("Succesfully created sequence dataset with ID {}".format(
        sequence_dataset["id"]))
Example #6
def glob_cellenone_data(filepaths,
                        storage_name,
                        tag_name=None,
                        update=False,
                        skip_existing=False,
                        remote_storage_name=None):

    tantalus_api = TantalusApi()

    library_paths = collections.defaultdict(set)

    for filepath in filepaths:
        match = re.match(
            r".*/single_cell_indexing/Cellenone/Cellenone_images/(\d+)_(A\d+[A-Z]*)/?$",
            filepath)
        if match is None:
            logging.warning('skipping malformed {}'.format(filepath))
            continue

        fields = match.groups()
        date = fields[0]
        library_id = fields[1]

        try:
            tantalus_api.get('dna_library', library_id=library_id)
        except NotFoundError:
            logging.warning(
                'skipping file with unknown library {}'.format(filepath))
            continue

        logging.info(f'queueing library {library_id} data from {filepath}')
        library_paths[library_id].add(filepath)

    for library_id in library_paths:
        add_cellenone_results(
            library_paths[library_id],
            library_id,
            storage_name,
            tag_name=tag_name,
            update=update,
            skip_existing=skip_existing,
            remote_storage_name=remote_storage_name,
        )
Example #7
def transfer_tagged_datasets(tag_name, from_storage_name, to_storage_name):
    """ Transfer a set of tagged datasets
    """

    tantalus_api = TantalusApi()

    tag = tantalus_api.get("tag", name=tag_name)

    for dataset_id in tag['sequencedataset_set']:
        transfer_dataset(tantalus_api, dataset_id, "sequencedataset",
                         from_storage_name, to_storage_name)

    for dataset_id in tag['resultsdataset_set']:
        transfer_dataset(tantalus_api, dataset_id, "resultsdataset",
                         from_storage_name, to_storage_name)
Example #8
def catalog_cellenone_dataset(library_id,
                              storage_name,
                              tag_name=None,
                              update=False,
                              remote_storage_name=None):

    tantalus_api = TantalusApi()

    dataset = tantalus_api.get('resultsdataset',
                               results_type='CELLENONE',
                               libraries__library_id=library_id)

    process_cellenone_dataset(dataset,
                              storage_name,
                              tag_name=tag_name,
                              update=update,
                              remote_storage_name=remote_storage_name)
Example #9
def main(storage_name, bam_file_path, **kwargs):
    """
    Imports the bam into tantalus by creating a sequence dataset and 
    file resources 
    """
    logging.basicConfig(format=LOGGING_FORMAT,
                        stream=sys.stderr,
                        level=logging.INFO)

    tantalus_api = TantalusApi()

    sample = None
    if kwargs.get('sample_id') is not None:
        sample = tantalus_api.get_or_create(
            'sample',
            sample_id=kwargs['sample_id'],
        )

    library = None
    if kwargs.get('library_id') is not None:
        if kwargs.get('library_type') is not None and kwargs.get(
                'index_format') is not None:
            library = tantalus_api.get_or_create(
                'dna_library',
                library_id=kwargs['library_id'],
                library_type=kwargs['library_type'],
                index_format=kwargs['index_format'],
            )
        else:
            library = tantalus_api.get(
                'dna_library',
                library_id=kwargs['library_id'],
            )

    dataset = import_bam(
        storage_name,
        bam_file_path,
        sample=sample,
        library=library,
        read_type=kwargs.get('read_type'),
        ref_genome=kwargs.get('ref_genome'),
        update=kwargs.get('update'),
        tag_name=kwargs.get('tag_name'),
    )

    print("dataset {}".format(dataset["id"]))
Example #10
def fix_bams(jira_ticket=None, dry_run=False):

    tantalus_api = TantalusApi()

    analyses_list = []
    storage_name = "singlecellresults"

    if jira_ticket is not None:
        analyses_list.append(tantalus_api.get('analysis', jira_ticket=jira_ticket, analysis_type__name="align", status="complete"))
    
    else:
        # Get all completed align analyses ran with specific version
        # the bams associated to these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3'):
            analyses = tantalus_api.list('analysis', analysis_type__name="align", status="complete", version=version)
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]

        filename = f'{jira_ticket}/results/bams/metadata.yaml'

        logging.info(f'adding file {filename}')
        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(storage_name, filename)

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            dataset_id = dataset['id']

            logging.info(f'adding file to dataset {dataset_id}')
            if not dry_run:
                file_resource_ids = dataset['file_resources']
                file_resource_ids.append(file_resource['id'])
                tantalus_api.update('sequencedataset', id=dataset['id'], file_resources=file_resource_ids)
Example #11
def add_cellenone_results(filepaths,
                          library_id,
                          storage_name,
                          tag_name=None,
                          update=False,
                          skip_existing=False,
                          remote_storage_name=None):
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_{}'.format(library_id)
    results_type = 'CELLENONE'
    results_version = None

    try:
        existing_results = tantalus_api.get('resultsdataset',
                                            name=results_name,
                                            results_type=results_type)
    except NotFoundError:
        existing_results = None

    if skip_existing and existing_results is not None:
        return existing_results

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )

    return results_dataset
Example #12
def create_fastq_metadata_yaml(library_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for all FASTQ datasets for a library id.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    for dataset_info, metadata in create_lane_fastq_metadata(
            tantalus_api, library_id):
        metadata_filename = os.path.join(dataset_info['base_dir'],
                                         'metadata.yaml')
        metadata_filepath = tantalus_api.get_filepath(storage_name,
                                                      metadata_filename)

        metadata_io = io.BytesIO()
        metadata_io.write(
            yaml.dump(metadata, default_flow_style=False).encode())

        logging.info(f'writing metadata to file {metadata_filepath}')
        client.write_data(metadata_filename, metadata_io)

        logging.info(f'adding {metadata_filepath} to tantalus')

        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(
                storage_name, metadata_filepath, update=True)

            for dataset_id in dataset_info['dataset_ids']:
                dataset = tantalus_api.get('sequencedataset', id=dataset_id)

                new_file_resources = set(dataset['file_resources'])
                new_file_resources.add(file_resource['id'])

                tantalus_api.update('sequencedataset',
                                    id=dataset_id,
                                    file_resources=list(new_file_resources))
Example #13
def catalog_cellenone_datasets(storage_name,
                               tag_name=None,
                               update=False,
                               remote_storage_name=None):

    tantalus_api = TantalusApi()

    for dataset in tantalus_api.list('resultsdataset',
                                     results_type='CELLENONE'):
        # HACK: Check for metadata yaml file in dataset
        found_metadata = False
        try:
            file_resource = tantalus_api.get(
                'file_resource',
                resultsdataset__id=dataset['id'],
                filename__endswith='metadata.yaml')
            found_metadata = True
        except NotFoundError:
            logging.info(f"no metadata for dataset {dataset['id']}")

        if found_metadata:
            logging.info(
                f"found metadata for dataset {dataset['id']}, skipping")
            continue

        try:
            process_cellenone_dataset(dataset,
                                      storage_name,
                                      tag_name=tag_name,
                                      update=update,
                                      remote_storage_name=remote_storage_name)

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception(f"catalog failed for dataset {dataset['id']}")
Example #14
def add_fastq_metadata_yaml(dataset_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for a dataset and add to tantalus.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    metadata, base_dir = create_lane_fastq_metadata(tantalus_api, dataset_id)

    metadata_filename = os.path.join(base_dir, 'metadata.yaml')
    metadata_filepath = tantalus_api.get_filepath(storage_name,
                                                  metadata_filename)

    metadata_io = io.BytesIO()
    metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

    print(f'writing metadata to file {metadata_filepath}')
    client.write_data(metadata_filename, metadata_io)

    print(f'adding {metadata_filepath} to tantalus')

    if not dry_run:
        file_resource, file_instance = tantalus_api.add_file(storage_name,
                                                             metadata_filepath,
                                                             update=True)

        dataset = tantalus_api.get('sequencedataset', id=dataset_id)

        new_file_resources = set(dataset['file_resources'])
        new_file_resources.add(file_resource['id'])

        tantalus_api.update('sequencedataset',
                            id=dataset_id,
                            file_resources=list(new_file_resources))
Example #15
def main(
    storage_name,
    dataset_type=None,
    dataset_id=None,
    tag_name=None,
    all_file_instances=False,
    dry_run=False,
    fix_corrupt=False,
    remove_missing=False,
):
    logging.info('checking integrity of storage {}'.format(storage_name))

    tantalus_api = TantalusApi()

    if all_file_instances:
        file_resources = tantalus_api.list(
            'file_resource', fileinstance__storage__name=storage_name)

    else:
        file_resources = get_dataset_file_instances(tantalus_api,
                                                    dataset_type,
                                                    dataset_id=dataset_id,
                                                    tag_name=tag_name)

    for file_resource in file_resources:
        try:
            file_instance = tantalus_api.get('file_instance',
                                             file_resource=file_resource['id'],
                                             storage__name=storage_name)
        except NotFoundError:
            logging.exception(
                f'file {file_resource["filename"]} not on storage')
            continue

        logging.info('checking file instance {} with path {}'.format(
            file_instance['id'], file_instance['filepath']))

        if file_instance['is_deleted']:
            logging.info('file instance {} marked as deleted'.format(
                file_instance['id']))
            continue

        file_corrupt = False
        file_missing = False
        try:
            tantalus_api.check_file(file_instance)
        except DataCorruptionError:
            file_corrupt = True
            logging.exception('check file failed')
        except DataMissingError:
            file_missing = True
            logging.exception('missing file')

        if file_corrupt and fix_corrupt:
            logging.info('updating file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))

            if not dry_run:
                tantalus_api.update_file(file_instance)

        if file_missing and remove_missing:
            logging.info('deleting file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))

            if not dry_run:
                file_instance = tantalus_api.update(
                    'file_instance',
                    id=file_instance['id'],
                    is_deleted=True,
                )
Example #16
def process_cellenone_images(
    library_id,
    source_dir,
    storage_name,
    tag_name=None,
    update=False,
    remote_storage_name=None,
):
    """ Catalog cellenone images for a library and add to tantalus.

    Args:
        library_id (str): library id associated with the images
        source_dir (str): source cellenone directory
        storage_name (str): local storage in which to organize the results
    
    KwArgs:
        tag_name (str): tantalus tag
        update (bool): update and possibly overwrite an existing dataset in tantalus
        remote_storage_name (str): upload to a remote storage
    """

    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_IMAGES_{}'.format(library_id)
    results_type = 'CELLENONE_IMAGES'
    results_version = 'v1'

    try:
        existing_results = tantalus_api.get('results', name=results_name)
    except NotFoundError:
        existing_results = None

    if existing_results is not None and not update:
        logging.info(f'results for {library_id} exist, not processing')
        return

    storage = tantalus_api.get('storage', name=storage_name)
    storage_directory = storage['storage_directory']

    destination_dir = os.path.join(
        storage_directory,
        'single_cell_indexing',
        'Cellenone',
        'Cellenone_processed',
        library_id,
        results_version,
    )

    os.makedirs(destination_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        filepaths = catalog_images(library_id, source_dir, destination_dir,
                                   temp_dir)

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=False,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
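A hedged usage sketch for process_cellenone_images as documented above; the library id, source directory and storage names are illustrative:

# Illustrative invocation; argument values are placeholders.
process_cellenone_images(
    'A96213A',
    '/data/single_cell_indexing/Cellenone/Cellenone_images/20190501_A96213A',
    'local_storage',                       # local storage in which to organize results
    update=False,
    remote_storage_name='singlecellblob',  # optional remote upload target
)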
Example #17
def fix():
    tantalus_api = TantalusApi()

    datasets = list(
        tantalus_api.list(
            'sequence_dataset',
            dataset_type='BAM',
            library__library_type__name='WGS',
        ))

    for dataset in datasets:
        bams = {}
        bais = {}
        specs = {}
        for file_resource_id in dataset['file_resources']:
            file_resource = tantalus_api.get('file_resource',
                                             id=file_resource_id)
            if file_resource['filename'].endswith('.bam'):
                bams[file_resource_id] = file_resource['filename']
            elif file_resource['filename'].endswith('.spec'):
                specs[file_resource_id] = file_resource['filename']
            elif file_resource['filename'].endswith('.bam.bai'):
                bais[file_resource_id] = file_resource['filename']

        if len(bams) == 0 and len(specs) == 0:
            print(dataset['id'])

        elif len(bams) > 1:
            logging.info(f"fixing {dataset['name']}, {bams}")

            to_remove_bam_id = max(bams.keys())
            to_remove_bai_id = None
            for id_, bai in bais.items():
                if bai.startswith(bams[to_remove_bam_id]):
                    assert to_remove_bai_id is None
                    to_remove_bai_id = id_
                    break
            assert to_remove_bai_id is not None

            logging.info((to_remove_bam_id, bams[to_remove_bam_id],
                          to_remove_bai_id, bais[to_remove_bai_id]))

            new_file_resources = dataset['file_resources']
            new_file_resources.remove(to_remove_bam_id)
            new_file_resources.remove(to_remove_bai_id)

            logging.info(
                f"updating {dataset['id']} to have files {new_file_resources}")

            tantalus_api.update('sequencedataset',
                                id=dataset['id'],
                                file_resources=new_file_resources)

            assert dataset["name"].endswith(str(dataset["version_number"]))

            similar_datasets = list(
                tantalus_api.list(
                    "sequence_dataset",
                    name=dataset["name"],
                ))
            new_version_number = max(d['version_number']
                                     for d in similar_datasets) + 1

            new_dataset_params = dict(
                sample=dataset['sample']['id'],
                library=dataset['library']['id'],
                sequence_lanes=[l['id'] for l in dataset['sequence_lanes']],
                aligner=dataset['aligner'],
                reference_genome=dataset['reference_genome'],
                name=dataset['name'][:-1] + str(new_version_number),
                dataset_type=dataset['dataset_type'],
                version_number=new_version_number,
                file_resources=[to_remove_bam_id, to_remove_bai_id],
            )

            logging.info(new_dataset_params)

            new_dataset = tantalus_api.create('sequencedataset',
                                              **new_dataset_params)

            logging.info(new_dataset)
Example #18
def run_h5_convert(cache_dir,
                   dataset_id=None,
                   results_type=None,
                   redo=False,
                   dry_run=False,
                   check_done=False):
    tantalus_api = TantalusApi()

    local_cache_client = tantalus_api.get_cache_client(cache_dir)
    # NOTE: remote_storage_name is not a parameter of this function; it is
    # assumed to be defined at module scope.
    remote_storage_client = tantalus_api.get_storage_client(
        remote_storage_name)

    if dataset_id is not None:
        results_list = [tantalus_api.get("resultsdataset", id=dataset_id)]
        logging.info('converting results with id {}'.format(dataset_id))

    elif results_type is not None:
        results_list = tantalus_api.list("resultsdataset",
                                         results_type=results_type)
        logging.info(
            'converting results with results type {}'.format(results_type))

    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set(
                [i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                # Destruct outputs csv.yaml directly, check non destruct files
                if 'destruct' in existing_filename:
                    continue
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml and check_done:
                logging.info('found filename {}, skipping conversion'.format(
                    existing_filename))
                continue

            file_resource_ids = []

            filepaths_to_clean = []

            for file_instance in file_instances:
                if not file_instance['file_resource']['filename'].endswith(
                        '.h5'):
                    continue

                datamanagement.transfer_files.cache_file(
                    tantalus_api, file_instance, cache_dir)

                h5_filepath = local_cache_client.get_url(
                    file_instance['file_resource']['filename'])

                filepaths_to_clean.append(h5_filepath)

                logging.info('converting {}'.format(h5_filepath))

                for key, csv_filepath in get_h5_csv_info(h5_filepath):
                    if not csv_filepath.startswith(cache_dir):
                        raise Exception(
                            'unexpected csv path {}'.format(csv_filepath))

                    csv_filename = csv_filepath[len(cache_dir):]
                    csv_filename = csv_filename.lstrip('/')

                    if csv_filename in existing_filenames and not redo:
                        logging.info(
                            'file {} already exists, not converting'.format(
                                csv_filename))
                        continue

                    if dry_run:
                        logging.info('would convert {}, key {} to {}'.format(
                            h5_filepath, key, csv_filepath))
                        continue

                    logging.info('converting {}, key {} to {}'.format(
                        h5_filepath, key, csv_filepath))
                    convert_h5(h5_filepath, key, csv_filepath)

                    yaml_filename = csv_filename + '.yaml'
                    yaml_filepath = csv_filepath + '.yaml'

                    fileinfo_to_add = [
                        (csv_filename, csv_filepath),
                        (yaml_filename, yaml_filepath),
                    ]

                    for filename, filepath in fileinfo_to_add:
                        logging.info('creating file {} from path {}'.format(
                            filename, filepath))

                        remote_storage_client.create(filename,
                                                     filepath,
                                                     update=redo)
                        remote_filepath = os.path.join(
                            remote_storage_client.prefix, filename)

                        logging.info('adding file {} from path {}'.format(
                            filename, remote_filepath))

                        (file_resource, file_instance) = tantalus_api.add_file(
                            remote_storage_name, remote_filepath,
                            update=True)  #redo)

                        file_resource_ids.append(file_resource["id"])
                        filepaths_to_clean.append(filepath)

            if len(file_resource_ids) == 0:
                logging.warning('no files added')
                continue

            logging.info('adding file resources {} to dataset {}'.format(
                file_resource_ids, result["id"]))

            tantalus_api.update(
                "resultsdataset",
                result["id"],
                file_resources=result["file_resources"] + file_resource_ids,
            )

            for filepath in filepaths_to_clean:
                logging.info('removing file {}'.format(filepath))
                os.remove(filepath)

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
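convert_h5 and get_h5_csv_info are referenced above but not shown in this listing. Below is a minimal sketch of what an HDF5-to-CSV conversion step could look like with pandas, offered purely as an assumption about the missing helper (the project's real implementation may differ and apparently also emits a companion .csv.gz.yaml file):

import pandas as pd

def convert_h5_sketch(h5_filepath, key, csv_filepath):
    # Read a single table out of the HDF5 store and write it as gzipped CSV;
    # pandas infers gzip compression from the .csv.gz extension.
    # NOTE: this sketch does not write the .csv.gz.yaml metadata that the
    # surrounding code expects alongside each converted file.
    df = pd.read_hdf(h5_filepath, key=key)
    df.to_csv(csv_filepath, index=False)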
Example #19
def main(storage_name,
         dlp_library_id=None,
         internal_id=None,
         tag_name=None,
         all=False,
         update=False,
         check_library=False,
         dry_run=False):

    # Set up the root logger
    logging.basicConfig(format=LOGGING_FORMAT,
                        stream=sys.stderr,
                        level=logging.INFO)

    # Connect to the Tantalus API (this requires appropriate environment)
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    # initialize lists to store successful and failed libraries
    successful_libs = []
    failed_libs = []

    storage = tantalus_api.get("storage", name=storage_name)
    sequencing_list = list()

    if dry_run:
        logging.info("This is a dry run. No lanes will be imported.")

    # Importing a single library
    if dlp_library_id is not None:
        sequencing_list = list(
            colossus_api.list('sequencing',
                              sequencing_center='BCCAGSC',
                              library__pool_id=dlp_library_id))
    # importing all libraries from the gsc
    elif all:
        sequencing_list = list(
            colossus_api.list('sequencing', sequencing_center='BCCAGSC'))
    # importing only sequencing expecting more lanes
    else:
        sequencing_list = list(
            colossus_api.list('sequencing', sequencing_center='BCCAGSC'))
        sequencing_list = list(
            filter(
                lambda s: s['number_of_lanes_requested'] != len(s[
                    'dlplane_set']), sequencing_list))

    for sequencing in sequencing_list:
        # import library
        try:
            import_info = import_gsc_dlp_paired_fastqs(
                colossus_api,
                tantalus_api,
                sequencing,
                storage,
                internal_id,
                tag_name,
                update=update,
                check_library=check_library,
                dry_run=dry_run,
            )

            # check if no import information exists, if so, library does not exist on GSC
            if import_info is None:
                lane_requested_date = sequencing["lane_requested_date"]
                failed_libs.append(
                    dict(
                        dlp_library_id=sequencing["library"],
                        gsc_library_id="None",
                        lane_requested_date=lane_requested_date,
                        error="Doesn't exist on GSC",
                    ))
                continue

            # check if library excluded from import
            elif import_info is False:
                continue

            # update lanes in sequencing
            update_colossus_lane(colossus_api, sequencing,
                                 import_info['lanes'])
            # get the sequencing object again since it may have been updated with new info
            updated_sequencing = colossus_api.get("sequencing",
                                                  id=sequencing["id"])
            # check if lanes have been imported
            check_lanes(colossus_api, updated_sequencing,
                        len(updated_sequencing["dlplane_set"]))

            # add lane_requested_date to import info for import status report
            import_info['lane_requested_date'] = sequencing[
                'lane_requested_date']

            # add library to list of successfully imported libraries
            successful_libs.append(import_info)

            # create jira ticket and analyses with new lanes and datasets
            create_tickets_and_analyses(import_info)

        except Exception as e:
            # add lane_requested_date to import info for import status report
            lane_requested_date = sequencing["lane_requested_date"]
            updated_sequencing = colossus_api.get("sequencing",
                                                  id=sequencing["id"])
            # add library to list of libraries that failed to import
            failed_libs.append(
                dict(
                    dlp_library_id=sequencing["library"],
                    gsc_library_id=updated_sequencing["gsc_library_id"],
                    lane_requested_date=lane_requested_date,
                    error=str(e),
                ))

            logging.exception(
                f"Library {sequencing['library']} failed to import: {e}")
            continue

    # Only write import statuses for bulk imports
    if all or dlp_library_id is None:
        # Sort lists by date in descending order
        successful_libs.sort(
            key=lambda x: datetime.datetime.strptime(x['lane_requested_date'],
                                                     '%Y-%m-%d'),
            reverse=True,
        )
        failed_libs.sort(
            key=lambda x: datetime.datetime.strptime(x['lane_requested_date'],
                                                     '%Y-%m-%d'),
            reverse=True,
        )
        # write import report
        write_import_statuses(successful_libs, failed_libs)
Example #20
def rename_fastqs(dataset_id, storage_name, dry_run=False, check_only=False):
    logging.info(f'dataset: {dataset_id}')
    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()
    storage_client = tantalus_api.get_storage_client(storage_name)

    dataset = tantalus_api.get('sequencedataset', id=dataset_id)

    file_instances = tantalus_api.get_dataset_file_instances(
        dataset['id'],
        'sequencedataset',
        storage_name,
    )

    for file_instance in file_instances:
        filename = file_instance['file_resource']['filename']

        if os.path.basename(filename) == 'metadata.yaml':
            continue

        assert len(dataset['sequence_lanes']) == 1

        parts = filename.split('/')
        basename = os.path.basename(filename)

        non_conforming = False
        try:
            assert parts[0] == 'single_cell_indexing'
            assert parts[1] == 'fastq'
            assert parts[2] == dataset['library']['library_id']
            assert parts[3].split(
                '_')[0] == dataset['sequence_lanes'][0]['flowcell_id']
            assert parts[3].split(
                '_')[1] == dataset['sequence_lanes'][0]['lane_number']
            assert parts[4] == dataset['sample']['sample_id']
        except AssertionError:
            non_conforming = True

        if check_only:
            if non_conforming:
                raise Exception(f'filename {filename} does not conform')
            continue

        new_filename = SC_WGS_FQ_TEMPLATE.format(
            dlp_library_id=dataset['library']['library_id'],
            flowcell_id=dataset['sequence_lanes'][0]['flowcell_id'],
            lane_number=dataset['sequence_lanes'][0]['lane_number'],
            cell_sample_id=dataset['sample']['sample_id'],
            cell_filename=basename,
        )

        if new_filename == filename:
            logging.info(f'skipping conforming {filename} on {storage_name}')
            continue

        logging.info(
            f'renaming {filename} to {new_filename} on {storage_name}')

        if not dry_run:
            if not storage_client.exists(new_filename):
                storage_client.copy(filename, new_filename, wait=True)
            tantalus_api.swap_file(file_instance, new_filename)
            storage_client.delete(filename)
Example #21
if __name__ == '__main__':
    logging.basicConfig(format=LOGGING_FORMAT,
                        stream=sys.stderr,
                        level=logging.INFO)

    gsc_api = GSCAPI()

    tantalus_api = TantalusApi()

    # List of relevant libraries from GSC lanes
    lanes = list(tantalus_api.list('sequencing_lane', sequencing_centre='GSC'))

    libraries = set()
    for lane in lanes:
        library = tantalus_api.get('dna_library', id=lane['dna_library'])
        if library['library_type'] == 'WGS':
            libraries.add(library['library_id'])

    lane_fixes = []

    for library_id in libraries:
        infos = gsc_api.query("library?name={}".format(library_id))

        if len(infos) == 0:
            logging.warning('unable to find {}'.format(library_id))

        elif len(infos) > 1:
            raise Exception('found {} libraries for {}'.format(
                len(infos), library_id))
Example #22
def main(
    storage_name,
    dry_run=False,
    check_remote=None,
):
    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_storage_client = None
    if check_remote is not None:
        remote_storage_client = tantalus_api.get_storage_client(check_remote)

    file_instances = tantalus_api.list('file_instance',
                                       storage__name=storage_name,
                                       is_deleted=True)

    # DEBUG: check whether we are getting back
    # consistent ordered results from tantalus
    file_instances = list(file_instances)
    file_instance_ids = set([f['id'] for f in file_instances])
    if len(file_instances) != len(file_instance_ids):
        raise Exception('received duplicate results from tantalus')

    logging.info('processing {} file instances'.format(len(file_instance_ids)))
    logging.info('processing the following file instances: {}'.format(
        str(file_instance_ids)))

    for file_instance in file_instances:
        file_resource = tantalus_api.get(
            'file_resource', id=file_instance['file_resource']['id'])
        all_file_instances = list(
            tantalus_api.list('file_instance',
                              file_resource=file_resource['id']))

        logging.info(
            'checking file instance {}, file resource {}, filepath {}'.format(
                file_instance['id'], file_resource['id'],
                file_instance['filepath']))

        sequencedatasets = tantalus_api.list(
            'sequencedataset', file_resources__id=file_resource['id'])
        resultsdatasets = tantalus_api.list(
            'resultsdataset', file_resources__id=file_resource['id'])

        sequencedataset_ids = list(set([a['id'] for a in sequencedatasets]))
        resultsdataset_ids = list(set([a['id'] for a in resultsdatasets]))

        logging.info(
            'file resource {} belongs to sequencedataset {} and resultsdatasets {}'
            .format(file_resource['id'], sequencedataset_ids,
                    resultsdataset_ids))

        # Optionally check for a remote version
        if remote_storage_client:
            remote_instance = None
            for other_instance in file_resource['file_instances']:
                if other_instance['storage']['name'] == check_remote:
                    remote_instance = other_instance

            if not remote_instance:
                logging.info(
                    'not deleting file instance {}, no other instance'.format(
                        file_instance['id']))
                continue

            if remote_instance['is_deleted']:
                logging.info(
                    'not deleting file instance {}, other instance {} deleted'.
                    format(file_instance['id'], remote_instance['id']))
                continue

            if not remote_storage_client.exists(file_resource['filename']):
                logging.info(
                    'not deleting file instance {}, other instance {} does not exist'
                    .format(file_instance['id'], remote_instance['id']))
                continue

            logging.info(
                'deletion ok for file instance {}, found other instance {}'.
                format(file_instance['id'], remote_instance['id']))

        # Delete the file from the filesystem
        logging.info('deleting file {}'.format(file_instance['filepath']))
        if not dry_run:
            try:
                storage_client.delete(file_resource['filename'])
            except FileNotFoundError:
                logging.exception('file already deleted')

        # Delete the instance model from tantalus
        logging.info('deleting file instance {}'.format(file_instance['id']))
        if not dry_run:
            tantalus_api.delete('file_instance', id=file_instance['id'])

        # If this is the only file instance for this file resource, delete the file resource
        if len(all_file_instances) == 1:
            assert all_file_instances[0]['id'] == file_instance['id']
            logging.info('deleting file resource {}'.format(
                file_resource['id']))
            if not dry_run:
                tantalus_api.delete('file_resource', id=file_resource['id'])
Example #23
def fix_bams(jira_ticket=None, dry_run=False):

    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()

    SC_WGS_BAM_DIR_TEMPLATE = os.path.join(
        'single_cell_indexing',
        'bam',
        '{library_id}',
        '{ref_genome}',
        '{aligner_name}',
        'numlanes_{number_lanes}',
        '{jira_ticket}',
    )

    reference_genome_map = {
        'HG19': 'grch37',
        'MM10': 'mm10',
    }

    analyses_list = []
    from_storage_name = "singlecellresults"
    to_storage_name = "singlecellblob"
    from_storage_client = tantalus_api.get_storage_client(from_storage_name)
    to_storage_client = tantalus_api.get_storage_client(to_storage_name)
    to_storage_id = tantalus_api.get('storage', name=to_storage_name)['id']

    if jira_ticket is not None:
        analyses_list.append(
            tantalus_api.get('analysis',
                             jira_ticket=jira_ticket,
                             analysis_type__name="align",
                             status="complete"))

    else:
        # Get all completed align analyses ran with specific version
        # the bams associated to these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3', 'v0.5.4'):
            analyses = tantalus_api.list('analysis',
                                         analysis_type__name="align",
                                         status="complete",
                                         version=version)
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]
        print(f"moving bams for {jira_ticket}")

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            # Get number of lanes from dataset for use with filepath
            lanes = set()
            for sequence_lane in dataset['sequence_lanes']:
                lane = "{}_{}".format(sequence_lane['flowcell_id'],
                                      sequence_lane['lane_number'])
                lanes.add(lane)
            number_lanes = len(lanes)

            try:
                file_instances = tantalus_api.get_dataset_file_instances(
                    dataset["id"],
                    "sequencedataset",
                    from_storage_name,
                )
            except dbclients.tantalus.DataNotOnStorageError:
                logging.info(
                    f'dataset {dataset["id"]} not on {from_storage_name}, skipping'
                )
                continue

            for file_instance in file_instances:
                blobname = file_instance["file_resource"]["filename"]

                # get url of source blob
                blob_url = from_storage_client.get_url(blobname)

                bam_filename = blobname.split("/bams/")[1]
                new_blobname = os.path.join(
                    SC_WGS_BAM_DIR_TEMPLATE.format(
                        library_id=dataset["library"]["library_id"],
                        ref_genome=reference_genome_map[
                            dataset["reference_genome"]],
                        aligner_name=dataset["aligner"],
                        number_lanes=number_lanes,
                        jira_ticket=jira_ticket,
                    ),
                    bam_filename,
                )

                # copy blob to desired storage account with new blobname
                blob_filepath = f"{to_storage_client.prefix}/{new_blobname}"
                logging.info(
                    f'copying {new_blobname} to storage {to_storage_name} from {blob_url} to {blob_filepath}'
                )
                if not dry_run:
                    to_storage_client.blob_service.copy_blob(
                        container_name="data",
                        blob_name=new_blobname,
                        copy_source=blob_url,
                    )

                file_resource_id = file_instance['file_resource']['id']
                file_instance_id = file_instance['id']

                logging.info(
                    f'updating file resource {file_resource_id} to have filename {new_blobname}'
                )
                if not dry_run:
                    tantalus_api.update('file_resource',
                                        id=file_resource_id,
                                        filename=new_blobname)

                logging.info(
                    f'updating file instance {file_instance_id} to have storage with id {to_storage_id}'
                )
                if not dry_run:
                    tantalus_api.update('file_instance',
                                        id=file_instance_id,
                                        storage=to_storage_id)
Example #24
def import_bam(storage_name,
               bam_file_path,
               sample=None,
               library=None,
               lane_infos=None,
               read_type=None,
               ref_genome=None,
               tag_name=None,
               update=False):
    """
    Imports bam into tantalus

    Args:
        storage_name:   (string) name of destination storage
        bam_file_path:  (string) filepath to bam on destination storage
        sample:         (dict) contains sample_id
        library:        (dict) contains library_id, library_type, index_format
        lane_infos:     (list of dicts) each contains flowcell_id, lane_number,
                        adapter_index_sequence, sequencing_centre, read_type,
                        reference_genome, aligner
        read_type:      (string) read type for the run
        tag_name:       (string)
        update:         (boolean)
    Returns:
        sequence_dataset:   (dict) sequence dataset created on tantalus
    """
    tantalus_api = TantalusApi()

    # Get a url allowing access regardless of whether the file
    # is in cloud or local storage
    storage_client = tantalus_api.get_storage_client(storage_name)
    bam_filename = tantalus_api.get_file_resource_filename(
        storage_name, bam_file_path)
    bam_url = storage_client.get_url(bam_filename)

    bam_header = pysam.AlignmentFile(bam_url).header
    bam_header_info = get_bam_header_info(bam_header)

    if ref_genome is None:
        ref_genome = get_bam_ref_genome(bam_header)

    aligner_name = get_bam_aligner_name(bam_header)

    logging.info(
        f"bam header shows reference genome {ref_genome} and aligner {aligner_name}"
    )

    bai_file_path = None
    if storage_client.exists(bam_filename + ".bai"):
        bai_file_path = bam_file_path + ".bai"
    else:
        logging.info(f"no bam index found at {bam_filename + '.bai'}")

    # If no sample was specified assume it exists in tantalus and
    # search for it based on header info
    if sample is None:
        if len(bam_header_info["sample_ids"]) != 1:
            raise ValueError(
                f"found sample_ids={bam_header_info['sample_ids']}, please specify override sample id"
            )
        sample_id = list(bam_header_info["sample_ids"])[0]
        sample = tantalus_api.get('sample', sample_id=sample_id)

    # If no library was specified assume it exists in tantalus and
    # search for it based on header info
    if library is None:
        if len(bam_header_info["library_ids"]) != 1:
            raise ValueError(
                f"found library_ids={bam_header_info['library_ids']}, please specify override library id"
            )
        library_id = list(bam_header_info["library_ids"])[0]
        library = tantalus_api.get('dna_library', library_id=library_id)

    # Default paired end reads
    if read_type is None:
        read_type = 'P'

    # If no lane infos were specified create them from header info
    if lane_infos is None:
        lane_infos = []
        for lane in bam_header_info["sequence_lanes"]:
            lane_info = {
                "flowcell_id": lane["flowcell_id"],
                "lane_number": lane["lane_number"],
                "library_id": lane["library_id"],
                "sequencing_centre": lane["sequencing_centre"],
                "read_type": read_type,
            }
            lane_infos.append(lane_info)

    # Add the sequence dataset to Tantalus
    sequence_dataset = add_sequence_dataset(
        tantalus_api,
        storage_name=storage_name,
        sample=sample,
        library=library,
        dataset_type="BAM",
        sequence_lanes=lane_infos,
        bam_file_path=bam_file_path,
        reference_genome=ref_genome,
        aligner=aligner_name,
        bai_file_path=bai_file_path,
        tag_name=tag_name,
        update=update,
    )

    return sequence_dataset
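get_bam_header_info, get_bam_ref_genome and get_bam_aligner_name are not shown in this listing. The following is a rough sketch of the kind of information a BAM header exposes via pysam, offered as an assumption rather than the project's implementation (field names follow the SAM spec: RG/SM for sample, RG/LB for library, PG for the aligner program records):

import pysam

def sketch_bam_header_info(bam_path):
    # Read-group (RG) records carry sample (SM) and library (LB) tags;
    # program (PG) records name the aligner that produced the BAM.
    header = pysam.AlignmentFile(bam_path, "rb").header.to_dict()
    sample_ids = {rg.get("SM") for rg in header.get("RG", [])}
    library_ids = {rg.get("LB") for rg in header.get("RG", [])}
    aligners = {pg.get("PN") for pg in header.get("PG", [])}
    return {"sample_ids": sample_ids, "library_ids": library_ids, "aligners": aligners}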
Example #25
tantalus_api = TantalusApi()
colossus_api = ColossusApi()


if __name__ == '__main__':
    print "STARTING"
    colossus_analyses = colossus_api.list('analysis_information')
    tantalus_analyses = tantalus_api.list('analysis', analysis_type__name="align")

    analysis_lane_dict = {}

    for analysis in tantalus_analyses:
        lane_set = set()
        for input_dataset in analysis['input_datasets']:
            dataset = tantalus_api.get('sequencedataset', id=input_dataset)
            for lane in dataset['sequence_lanes']:
                lane_set.add(str(lane['flowcell_id'] + "_" + str(lane['lane_number'])))

        analysis_lane_dict[analysis['name']] = lane_set

    print(analysis_lane_dict)

    for analysis in colossus_analyses:
        key = analysis['analysis_jira_ticket'] + '_align'
        if key in analysis_lane_dict:
            lanes = []
            print "ID" + str(analysis['id'])
            for lane in analysis_lane_dict[key]:
                try:
                    print "Getting lane: " + lane
Example #26
def add_generic_results(filepaths,
                        storage_name,
                        results_name,
                        results_type,
                        results_version,
                        sample_ids=(),
                        library_ids=(),
                        analysis_pk=None,
                        recursive=False,
                        tag_name=None,
                        update=False,
                        remote_storage_name=None):

    tantalus_api = TantalusApi()

    sample_pks = []
    for sample_id in sample_ids:
        sample = tantalus_api.get(
            "sample",
            sample_id=sample_id,
        )
        sample_pks.append(sample['id'])

    library_pks = []
    for library_id in library_ids:
        library = tantalus_api.get(
            "dna_library",
            library_id=library_id,
        )
        library_pks.append(library['id'])

    #Add the file resource to tantalus
    file_resource_pks = []
    for filepath in filepaths:
        if recursive:
            logging.info("Recursing directory {}".format(filepath))
            add_filepaths = []
            for (dirpath, dirnames, filenames) in os.walk(filepath):
                for filename in filenames:
                    add_filepaths.append(os.path.join(dirpath, filename))

        else:
            add_filepaths = [filepath]

        for add_filepath in add_filepaths:
            logging.info(
                "Adding file resource for {} to Tantalus".format(add_filepath))
            resource, instance = tantalus_api.add_file(
                storage_name=storage_name,
                filepath=add_filepath,
                update=update,
            )
            file_resource_pks.append(resource["id"])

    results_dataset_fields = dict(
        name=results_name,
        results_type=results_type,
        results_version=results_version,
        analysis=analysis_pk,
        samples=sample_pks,
        libraries=library_pks,
        file_resources=file_resource_pks,
    )

    #Add the dataset to tantalus
    try:
        results_id = tantalus_api.get(
            "results", name=results_dataset_fields["name"])["id"]
    except NotFoundError:
        results_id = None

    if update and results_id is not None:
        logging.warning("results dataset {} exists, updating".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.update("results",
                                              id=results_id,
                                              **results_dataset_fields)

    else:
        logging.info("creating results dataset {}".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.get_or_create("results",
                                                     **results_dataset_fields)

    if tag_name is not None:
        tantalus_api.tag(tag_name, resultsdataset_set=[results_dataset['id']])

    logging.info("Succesfully created sequence dataset with ID {}".format(
        results_dataset["id"]))

    if remote_storage_name is not None:
        transfer_files.transfer_dataset(tantalus_api, results_dataset['id'],
                                        "resultsdataset", storage_name,
                                        remote_storage_name)

    return results_dataset
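A hedged usage sketch for add_generic_results; every argument value below is an illustrative placeholder:

# Illustrative invocation; names, paths and storage names are placeholders.
add_generic_results(
    filepaths=['/data/results/MY_RESULTS_A96213A'],
    storage_name='local_storage',
    results_name='MY_RESULTS_A96213A',
    results_type='MY_RESULTS',
    results_version='v1',
    library_ids=['A96213A'],
    recursive=True,   # walk the directory and register every file it contains
    update=False,
)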