Example #1
def add_microscope_results(filepaths,
                           chip_id,
                           library_ids,
                           storage_name,
                           tag_name=None,
                           update=False,
                           remote_storage_name=None):
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'MICROSCOPE_{}'.format(chip_id)
    results_type = 'MICROSCOPE'
    results_version = None

    try:
        existing_results = tantalus_api.get('results', name=results_name)
    except NotFoundError:
        existing_results = None

    if existing_results is not None and not update:
        logging.info(f'results for {chip_id} exist, not processing')
        return

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=library_ids,
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
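
A minimal usage sketch; the paths, chip id, library id, and storage name below are hypothetical:

# Hypothetical invocation of add_microscope_results
add_microscope_results(
    filepaths=['/path/to/microscope/images'],
    chip_id='CHIP0001',
    library_ids=['A90000A'],
    storage_name='singlecellresults',
)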
Example #2
def add_cellenone_results(filepaths,
                          library_id,
                          storage_name,
                          tag_name=None,
                          update=False,
                          remote_storage_name=None):
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_{}'.format(library_id)
    results_type = 'CELLENONE'
    results_version = None

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
Example #3
def fastq_dlp_index_check(file_info):
    """ Check consistency between colossus indices and file indices. """

    colossus_api = ColossusApi()

    # Assumption: only 1 library per imported set of fastqs
    dlp_library_ids = list(set([a['library_id'] for a in file_info]))
    if len(dlp_library_ids) != 1:
        raise ValueError(
            'Expected 1 library_id, received {}'.format(dlp_library_ids))
    dlp_library_id = dlp_library_ids[0]

    cell_samples = query_colossus_dlp_cell_info(colossus_api, dlp_library_id)

    cell_index_sequences = set(cell_samples.keys())

    fastq_lane_index_sequences = collections.defaultdict(set)

    # Check that all fastq files refer to indices known in colossus
    for info in file_info:
        if info['index_sequence'] not in cell_index_sequences:
            raise Exception(
                'fastq {} with index {}, flowcell {}, lane {}: index not in colossus'
                .format(info['filepath'], info['index_sequence'],
                        info['sequence_lanes'][0]['flowcell_id'],
                        info['sequence_lanes'][0]['lane_number']))
        flowcell_lane = (info['sequence_lanes'][0]['flowcell_id'],
                         info['sequence_lanes'][0]['lane_number'])
        fastq_lane_index_sequences[flowcell_lane].add(info['index_sequence'])
    log.info('all fastq files refer to indices known in colossus')

    # Check that all index sequences in colossus have fastq files
    for flowcell_lane in fastq_lane_index_sequences:
        for index_sequence in cell_index_sequences:
            if index_sequence not in fastq_lane_index_sequences[flowcell_lane]:
                raise Exception(
                    'no fastq found for index sequence {}, flowcell {}, lane {}'
                    .format(index_sequence, flowcell_lane[0],
                            flowcell_lane[1]))
    log.info('all indices in colossus have fastq files')
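
A sketch of the file_info record shape this check expects, inferred from the fields accessed above; all values are hypothetical:

# Hypothetical file_info records mirroring the fields accessed above
file_info = [
    {
        'filepath': '/data/fastq/A90000A_1_R1.fastq.gz',
        'library_id': 'A90000A',
        'index_sequence': 'ACGTACGT-TGCATGCA',
        'sequence_lanes': [{'flowcell_id': 'HXXXXXXXX', 'lane_number': '1'}],
    },
]
fastq_dlp_index_check(file_info)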
Example #4
def add_cellenone_results(filepaths,
                          library_id,
                          storage_name,
                          tag_name=None,
                          update=False,
                          skip_existing=False,
                          remote_storage_name=None):
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_{}'.format(library_id)
    results_type = 'CELLENONE'
    results_version = None

    try:
        existing_results = tantalus_api.get('resultsdataset',
                                            name=results_name,
                                            results_type=results_type)
    except NotFoundError:
        existing_results = None

    if skip_existing and existing_results is not None:
        return existing_results

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )

    return results_dataset
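
A usage sketch showing the skip_existing short-circuit; the paths, library id, and storage name are hypothetical:

# Returns the existing dataset, if any, without re-adding files
results = add_cellenone_results(
    filepaths=['/path/to/cellenone/output'],
    library_id='A90000A',
    storage_name='singlecellresults',
    skip_existing=True,
)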
Example #5
import logging
from collections import defaultdict

from workflows.unanalyzed_data import *

import datamanagement.templates as templates

from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi
from dbclients.basicclient import NotFoundError

from workflows.utils import file_utils
from workflows.utils import saltant_utils
from workflows.utils.colossus_utils import get_ref_genome

tantalus_api = TantalusApi()
colossus_api = ColossusApi()

log = logging.getLogger('sisyphus')
log.setLevel(logging.DEBUG)
stream_handler = logging.StreamHandler()
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
log.addHandler(stream_handler)
log.propagate = False


def get_sequencings(library_info):
    '''
    Given a Colossus library record (library_info), return list of sequencings
    '''
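    # Sketch of a plausible body (an assumption, not the original): treat
    # library_info as a Colossus library record and filter sequencings by
    # its pool id, mirroring the query pattern used in Example #8 below.
    sequencings = list(
        colossus_api.list('sequencing',
                          library__pool_id=library_info['pool_id']))
    return sequencings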
Example #6
from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi
import logging


tantalus_api = TantalusApi()
colossus_api = ColossusApi()


if __name__ == '__main__':
    print "STARTING"
    colossus_analyses = colossus_api.list('analysis_information')
    tantalus_analyses = tantalus_api.list('analysis', analysis_type__name="align")

    analysis_lane_dict = {}

    for analysis in tantalus_analyses:
        lane_set = set()
        for input_dataset in analysis['input_datasets']:
            dataset = tantalus_api.get('sequencedataset', id=input_dataset)
            for lane in dataset['sequence_lanes']:
                lane_set.add('{}_{}'.format(lane['flowcell_id'], lane['lane_number']))

        analysis_lane_dict[analysis['name']] = lane_set

    print(analysis_lane_dict)

    for analysis in colossus_analyses:
        key = analysis['analysis_jira_ticket'] + '_align'
        if key in analysis_lane_dict:
            lanes = []
Example #7
def check_indices(library_id=None):
    tantalus_api = TantalusApi()
    colossus_api = ColossusApi()

    if library_id is None:
        library_ids = set([a['pool_id'] for a in colossus_api.list('library')])

    else:
        library_ids = [library_id]

    for library_id in library_ids:

        # Get colossus sublibrary indices
        sublibraries = colossus_api.list('sublibraries',
                                         library__pool_id=library_id)
        colossus_indices = set(
            [a['primer_i7'] + '-' + a['primer_i5'] for a in sublibraries])

        datasets = tantalus_api.list(
            'sequence_dataset',
            library__library_id=library_id,
            library__library_type__name='SC_WGS',
            dataset_type='FQ',
        )

        lane_datasets = collections.defaultdict(list)

        for dataset in datasets:

            assert len(dataset['sequence_lanes']) == 1

            flowcell_lane = '_'.join([
                dataset['sequence_lanes'][0]['flowcell_id'],
                dataset['sequence_lanes'][0]['lane_number'],
            ])

            lane_datasets[flowcell_lane].append(dataset)

        for flowcell_lane in lane_datasets:

            # Get tantalus sublibraries and indices
            tantalus_indices = set()
            tantalus_dataset_ids = []
            tantalus_sequencing_centre = set()
            for dataset in lane_datasets[flowcell_lane]:
                file_resources = list(
                    tantalus_api.list('file_resource',
                                      sequencedataset__id=dataset['id']))
                tantalus_indices.update(
                    set([
                        a['sequencefileinfo']['index_sequence']
                        for a in file_resources
                    ]))
                tantalus_dataset_ids.append(dataset['id'])
                tantalus_sequencing_centre.update([
                    a['sequencing_centre'] for a in dataset['sequence_lanes']
                ])

            assert len(tantalus_sequencing_centre) == 1
            tantalus_sequencing_centre = list(tantalus_sequencing_centre)[0]

            if len(colossus_indices - tantalus_indices) > 0:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: {} in colossus but not tantalus'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre,
                            len(colossus_indices - tantalus_indices)))

            if len(tantalus_indices - colossus_indices) > 0:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: {} in tantalus but not colossus'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre,
                            len(tantalus_indices - colossus_indices)))

            if tantalus_indices == colossus_indices:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: OK'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre))
Example #8
def main(storage_name,
         dlp_library_id=None,
         internal_id=None,
         tag_name=None,
         all=False,
         update=False,
         check_library=False,
         dry_run=False):

    # Set up the root logger
    logging.basicConfig(format=LOGGING_FORMAT,
                        stream=sys.stderr,
                        level=logging.INFO)

    # Connect to the Tantalus API (this requires appropriate environment)
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    # initialize lists to store successful and failed libraries
    successful_libs = []
    failed_libs = []

    storage = tantalus_api.get("storage", name=storage_name)
    sequencing_list = list()

    if dry_run:
        logging.info("This is a dry run. No lanes will be imported.")

    # Importing a single library
    if dlp_library_id is not None:
        sequencing_list = list(
            colossus_api.list('sequencing',
                              sequencing_center='BCCAGSC',
                              library__pool_id=dlp_library_id))
    # importing all libraries from the gsc
    elif all:
        sequencing_list = list(
            colossus_api.list('sequencing', sequencing_center='BCCAGSC'))
    # importing only sequencing expecting more lanes
    else:
        sequencing_list = list(
            colossus_api.list('sequencing', sequencing_center='BCCAGSC'))
        sequencing_list = list(
            filter(
                lambda s: s['number_of_lanes_requested'] != len(s['dlplane_set']),
                sequencing_list))

    for sequencing in sequencing_list:
        # import library
        try:
            import_info = import_gsc_dlp_paired_fastqs(
                colossus_api,
                tantalus_api,
                sequencing,
                storage,
                internal_id,
                tag_name,
                update=update,
                check_library=check_library,
                dry_run=dry_run,
            )

            # if no import information exists, the library does not exist on the GSC
            if import_info is None:
                lane_requested_date = sequencing["lane_requested_date"]
                failed_libs.append(
                    dict(
                        dlp_library_id=sequencing["library"],
                        gsc_library_id="None",
                        lane_requested_date=lane_requested_date,
                        error="Doesn't exist on GSC",
                    ))
                continue

            # check if library excluded from import
            elif import_info is False:
                continue

            # update lanes in sequencing
            update_colossus_lane(colossus_api, sequencing,
                                 import_info['lanes'])
            # get sequencing object again since it may have been updated with new info
            updated_sequencing = colossus_api.get("sequencing",
                                                  id=sequencing["id"])
            # check if lanes have been imported
            check_lanes(colossus_api, updated_sequencing,
                        len(updated_sequencing["dlplane_set"]))

            # add lane_requested_date to import info for import status report
            import_info['lane_requested_date'] = sequencing[
                'lane_requested_date']

            # add library to list of successfully imported libraries
            successful_libs.append(import_info)

            # create jira ticket and analyses with new lanes and datasets
            create_tickets_and_analyses(import_info)

        except Exception as e:
            # add lane_requested_date to import info for import status report
            lane_requested_date = sequencing["lane_requested_date"]
            updated_sequencing = colossus_api.get("sequencing",
                                                  id=sequencing["id"])
            # add library to list of libraries that failed to import
            failed_libs.append(
                dict(
                    dlp_library_id=sequencing["library"],
                    gsc_library_id=updated_sequencing["gsc_library_id"],
                    lane_requested_date=lane_requested_date,
                    error=str(e),
                ))

            logging.exception(
                f"Library {sequencing['library']} failed to import: {e}")
            continue

    # Only write import statuses for bulk imports
    if all or dlp_library_id is None:
        # Sort lists by date in descending order
        successful_libs.sort(
            key=lambda x: datetime.datetime.strptime(x['lane_requested_date'],
                                                     '%Y-%m-%d'),
            reverse=True,
        )
        failed_libs.sort(
            key=lambda x: datetime.datetime.strptime(x['lane_requested_date'],
                                                     '%Y-%m-%d'),
            reverse=True,
        )
        # write import report
        write_import_statuses(successful_libs, failed_libs)
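
A sketch of a typical invocation; the storage name and library id are hypothetical:

# Dry-run import of a single library
main('singlecellblob', dlp_library_id='A90000A', dry_run=True)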
Example #9
def create_lane_fastq_metadata(tantalus_api, dataset_id):
    """
    Get metadata per lane of sequencing for a given dataset.
    """
    colossus_api = ColossusApi()

    dataset = tantalus_api.get("sequencedataset", id=dataset_id)
    library_id = dataset['library']['library_id']
    sample_id = dataset['sample']['sample_id']
    assert len(dataset['sequence_lanes']) == 1
    flowcell_id = dataset['sequence_lanes'][0]['flowcell_id']
    lane_number = dataset['sequence_lanes'][0]['lane_number']

    sample_info = generate_inputs.generate_sample_info(library_id)
    index_sequence_cell_id = sample_info.set_index(
        'index_sequence')['cell_id'].to_dict()

    metadata = {'files': {}, 'meta': {}}

    metadata['meta']['type'] = DATASET_TYPE
    metadata['meta']['version'] = DATASET_VERSION

    metadata['meta']['sample_id'] = sample_id
    metadata['meta']['library_id'] = library_id

    base_dirs = set()
    cell_ids = set()

    file_resources = list(
        tantalus_api.list('file_resource', sequencedataset__id=dataset['id']))

    for file_resource in file_resources:
        filename = os.path.basename(file_resource['filename'])
        dirname = os.path.dirname(file_resource['filename'])

        if filename.endswith('metadata.yaml'):
            continue

        index_sequence = file_resource['sequencefileinfo']['index_sequence']
        cell_id = index_sequence_cell_id[index_sequence]
        read_end = file_resource['sequencefileinfo']['read_end']

        if filename in metadata['files']:
            raise ValueError(f'duplicate filename {filename}')

        metadata['files'][filename] = {
            'cell_id': cell_id,
            'read_end': read_end,
            'flowcell_id': flowcell_id,
            'lane_number': lane_number,
        }

        base_dirs.add(dirname)
        cell_ids.add(cell_id)

    if len(base_dirs) != 1:
        raise ValueError(
            f'found files in zero or multiple directories {base_dirs}')

    assert not sample_info['cell_id'].duplicated().any()

    metadata['meta']['cells'] = {}
    for idx, row in sample_info.iterrows():
        cell_id = row['cell_id']

        if cell_id not in cell_ids:
            continue

        metadata['meta']['cells'][cell_id] = {
            'library_id': row['library_id'],
            'sample_id': row['sample_id'],
            'pick_met': row['pick_met'],
            'condition': row['condition'],
            'sample_type': row['sample_type'],
            'img_col': row['img_col'],
            'row': row['row'],
            'column': row['column'],
            'primer_i5': row['primer_i5'],
            'index_i5': row['index_i5'],
            'primer_i7': row['primer_i7'],
            'index_i7': row['index_i7'],
            'index_sequence': row['index_sequence'],
        }

    metadata['meta']['lanes'] = {
        flowcell_id: {
            lane_number: {
                'sequencing_centre':
                dataset['sequence_lanes'][0]['sequencing_centre'],
                'sequencing_instrument':
                dataset['sequence_lanes'][0]['sequencing_instrument'],
                'sequencing_library_id':
                dataset['sequence_lanes'][0]['sequencing_library_id'],
                'read_type':
                dataset['sequence_lanes'][0]['read_type'],
            }
        }
    }

    return metadata, base_dirs.pop()
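
A usage sketch that serializes the returned metadata; the dataset id and the choice of PyYAML are assumptions:

import os
import yaml

# Hypothetical dataset id; assumes base_dir resolves to a writable local path
metadata, base_dir = create_lane_fastq_metadata(tantalus_api, 1234)
with open(os.path.join(base_dir, 'metadata.yaml'), 'w') as f:
    yaml.safe_dump(metadata, f, default_flow_style=False)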
Example #10
def create_lane_fastq_metadata(tantalus_api, library_id):
    """
    Get metadata per lane of sequencing for a given library.
    """
    colossus_api = ColossusApi()

    sample_info = generate_inputs.generate_sample_info(library_id)
    index_sequence_cell_id = sample_info.set_index(
        'index_sequence')['cell_id'].to_dict()

    datasets = list(
        tantalus_api.list("sequencedataset",
                          dataset_type='FQ',
                          library__library_id=library_id))

    datasets_by_lane = collections.defaultdict(list)

    for dataset in datasets:
        assert len(dataset['sequence_lanes']) == 1
        flowcell_id = dataset['sequence_lanes'][0]['flowcell_id']
        lane_number = dataset['sequence_lanes'][0]['lane_number']
        datasets_by_lane[(flowcell_id, lane_number)].append(dataset)

    for (flowcell_id, lane_number), lane_datasets in datasets_by_lane.items():
        metadata = {'files': {}, 'meta': {}}

        metadata['meta']['type'] = DATASET_TYPE
        metadata['meta']['version'] = DATASET_VERSION

        dataset_ids = set()
        base_dirs = set()
        sequence_lane_ids = set()
        for dataset in lane_datasets:
            file_resources = list(
                tantalus_api.list('file_resource',
                                  sequencedataset__id=dataset['id']))

            dataset_ids.add(dataset['id'])
            sequence_lane_ids.add(dataset['sequence_lanes'][0]['id'])

            for file_resource in file_resources:
                filename = os.path.basename(file_resource['filename'])

                # Find common directory as subdirectory ending with flowcell/lane
                flowcell_lane = f'{flowcell_id}_{lane_number}'
                flowcell_idx = file_resource['filename'].index(flowcell_lane +
                                                               '/')
                flowcell_idx += len(flowcell_lane)
                base_dir = file_resource['filename'][:flowcell_idx]
                filename = file_resource['filename'][flowcell_idx + 1:]
                base_dirs.add(base_dir)

                index_sequence = file_resource['sequencefileinfo'][
                    'index_sequence']
                cell_id = index_sequence_cell_id[index_sequence]
                read_end = file_resource['sequencefileinfo']['read_end']

                if filename in metadata['files']:
                    raise ValueError(f'duplicate filename {filename}')

                metadata['files'][filename] = {
                    'cell_id': cell_id,
                    'read_end': read_end,
                    'flowcell_id': flowcell_id,
                    'lane_number': lane_number,
                }

        if len(base_dirs) != 1:
            raise ValueError(
                f'found files in zero or multiple directories {base_dirs}')

        if len(sequence_lane_ids) != 1:
            raise ValueError(
                f'found zero or multiple lanes {sequence_lane_ids}')

        assert not sample_info['cell_id'].duplicated().any()

        metadata['meta']['cells'] = {}
        for idx, row in sample_info.iterrows():
            metadata['meta']['cells'][row['cell_id']] = {
                'library_id': row['library_id'],
                'sample_id': row['sample_id'],
                'pick_met': row['pick_met'],
                'condition': row['condition'],
                'sample_type': row['sample_type'],
                'img_col': row['img_col'],
                'row': row['row'],
                'column': row['column'],
                'primer_i5': row['primer_i5'],
                'index_i5': row['index_i5'],
                'primer_i7': row['primer_i7'],
                'index_i7': row['index_i7'],
                'index_sequence': row['index_sequence'],
            }

        metadata['meta']['lanes'] = {
            flowcell_id: {
                lane_number: {
                    'sequencing_centre':
                    lane_datasets[0]['sequence_lanes'][0]['sequencing_centre'],
                    'sequencing_instrument':
                    lane_datasets[0]['sequence_lanes'][0]
                    ['sequencing_instrument'],
                    'sequencing_library_id':
                    lane_datasets[0]['sequence_lanes'][0]
                    ['sequencing_library_id'],
                    'read_type':
                    lane_datasets[0]['sequence_lanes'][0]['read_type'],
                }
            }
        }

        dataset_info = {
            'dataset_ids': dataset_ids,
            'flowcell_id': flowcell_id,
            'lane_number': lane_number,
            'base_dir': list(base_dirs)[0],
        }

        yield dataset_info, metadata
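
A usage sketch iterating the generator; the library id is hypothetical:

# One (dataset_info, metadata) pair is yielded per flowcell/lane
for dataset_info, metadata in create_lane_fastq_metadata(tantalus_api, 'A90000A'):
    print(dataset_info['flowcell_id'], dataset_info['lane_number'],
          len(metadata['files']))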