def import_dlp_realign_bams(
    storage_name,
    storage_type,
    bam_filenames,
    tantalus_api,
    tag_name=None,
    analysis_id=None,
    **kwargs
):
    metadata = []

    storage = tantalus_api.get("storage", name=storage_name)

    if storage_type == "blob":
        # The container path is the same for every BAM, so compute it once.
        blob_container_name = os.path.join(
            storage["storage_account"], storage["storage_container"]
        )
        for bam_filename in bam_filenames:
            metadata.extend(
                import_dlp_realign_bam_blob(bam_filename, blob_container_name)
            )
    elif storage_type == "server":
        for bam_filename in bam_filenames:
            metadata.extend(
                import_dlp_realign_bam_server(
                    bam_filename, storage["storage_directory"]
                )
            )
    else:
        raise ValueError("unsupported storage type {}".format(storage_type))

    create_sequence_dataset_models(
        metadata, storage_name, tag_name, tantalus_api, analysis_id
    )
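
A minimal usage sketch for the function above (the storage name, BAM path, and client construction are placeholders; the TantalusApi import path is an assumption from the surrounding project, not shown in this snippet):

from dbclients.tantalus import TantalusApi  # assumed client location

tantalus_api = TantalusApi()
import_dlp_realign_bams(
    storage_name="shahlab",  # placeholder storage name
    storage_type="server",
    bam_filenames=["/shahlab/archive/realign/sample.bam"],  # placeholder path
    tantalus_api=tantalus_api,
    tag_name="realign_import",
)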
Example #2
def load_brc_fastqs(
    flowcell_id,
    storage_name,
    storage_directory,
    output_dir,
    tantalus_api,
    tag_name=None,
):
    # Reject ".." segments so output_dir cannot escape the storage directory
    if ".." in output_dir:
        raise Exception("Invalid path for output_dir. '..' detected")

    # Check that output_dir is actually in storage
    if not output_dir.startswith(storage_directory):
        raise Exception(
            "Invalid path for output_dir. {} doesn't seem to be in the specified storage"
            .format(output_dir))

    # Check that output_dir exists and is a directory
    if not os.path.isdir(output_dir):
        raise Exception(
            "output directory {} not a directory".format(output_dir))

    fastq_file_info = get_fastq_info(output_dir, flowcell_id,
                                     storage_directory)

    fastq_paired_end_check(fastq_file_info)

    create_sequence_dataset_models(fastq_file_info, storage_name, tag_name,
                                   tantalus_api)
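
The ".." and prefix checks above are a lightweight containment guard; a stricter variant (a sketch, not part of the source) canonicalizes both paths before comparing, so symlinks and "dir/../.." tricks cannot slip past a plain string test:

import os

def is_within_directory(path, directory):
    # Resolve symlinks and ".." segments, then require that the storage
    # directory is a path prefix of the canonical output path.
    real_path = os.path.realpath(path)
    real_directory = os.path.realpath(directory)
    return os.path.commonpath([real_path, real_directory]) == real_directory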
Example #3
def load_brc_fastqs(
    flowcell_id,
    output_dir,
    storage_name,
    storage,
    tantalus_api,
    storage_client,
    tag_name=None,
    update=False,
    threshold=20,
):
    if not os.path.isdir(output_dir):
        raise Exception(
            "output directory {} not a directory".format(output_dir))

    fastq_file_info = get_fastq_info(output_dir, flowcell_id, storage,
                                     storage_client, threshold)

    fastq_paired_end_check(fastq_file_info)

    fastq_dlp_index_check(fastq_file_info)

    create_sequence_dataset_models(
        fastq_file_info,
        storage_name,
        tag_name,
        tantalus_api,
        update=update,
    )

    update_ticket(flowcell_id)

    logging.info('import succeeded')
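
Usage sketch for this variant (the flowcell id, paths, and storage record are placeholders; get_storage_client is assumed to exist on the Tantalus client):

tantalus_api = TantalusApi()
storage = tantalus_api.get("storage", name="gsc")  # placeholder storage record
storage_client = tantalus_api.get_storage_client(storage["name"])  # assumed helper
load_brc_fastqs(
    flowcell_id="HGGJNBCX2",  # placeholder flowcell id
    output_dir="/brc/output/HGGJNBCX2",
    storage_name=storage["name"],
    storage=storage,
    tantalus_api=tantalus_api,
    storage_client=storage_client,
    update=False,
)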
def import_gsc_dlp_paired_fastqs(colossus_api,
                                 tantalus_api,
                                 dlp_library_id,
                                 storage,
                                 tag_name=None):
    ''' Import DLP fastq data from the GSC.

    Args:
        colossus_api: basic client for colossus
        tantalus_api: basic client for tantalus
        dlp_library_id: library id for the dlp run
        storage: destination storage details for the transfer
        tag_name: a tag to add to imported data
    '''

    logging.info('importing data for {}'.format(dlp_library_id))

    # Existing fastqs in tantalus as a set of tuples of
    # the form (flowcell_id, lane_number, index_sequence, read_end)
    existing_data = get_existing_fastq_data(tantalus_api, dlp_library_id)

    primary_sample_id = colossus_api.query_libraries_by_library_id(
        dlp_library_id)['sample']['sample_id']
    cell_samples = query_colossus_dlp_cell_info(colossus_api, dlp_library_id)
    rev_comp_overrides = query_colossus_dlp_rev_comp_override(
        colossus_api, dlp_library_id)

    external_identifier = "{}_{}".format(primary_sample_id, dlp_library_id)

    gsc_api = GSCAPI()

    library_infos = gsc_api.query(
        "library?external_identifier={}".format(external_identifier))

    if len(library_infos) == 0:
        logging.error(
            'no libraries with external_identifier {} in gsc api'.format(
                external_identifier))
        return
    elif len(library_infos) > 1:
        raise Exception(
            "multiple libraries with external_identifier {} in gsc api".format(
                external_identifier))

    library_info = library_infos[0]

    gsc_library_id = library_info["name"]

    fastq_infos = gsc_api.query(
        "fastq?parent_library={}".format(gsc_library_id))

    fastq_file_info = []

    flowcells_to_be_created = []

    for fastq_info in fastq_infos:
        fastq_path = fastq_info["data_path"]

        # Skip files that are not production data or have been removed.
        if fastq_info["status"] != "production":
            logging.info("skipping file {} marked as {}".format(
                fastq_path, fastq_info["status"]))
            continue

        if fastq_info['removed_datetime'] is not None:
            logging.info('skipping file {} marked as removed {}'.format(
                fastq_path, fastq_info['removed_datetime']))
            continue

        flowcell_id = str(
            fastq_info['libcore']['run']['flowcell']['lims_flowcell_code'])
        lane_number = fastq_info['libcore']['run']['lane_number']

        sequencing_instrument = get_sequencing_instrument(
            fastq_info["libcore"]["run"]["machine"])
        solexa_run_type = fastq_info["libcore"]["run"]["solexarun_type"]
        read_type = solexa_run_type_map[solexa_run_type]

        primer_id = fastq_info["libcore"]["primer_id"]
        primer_info = gsc_api.query("primer/{}".format(primer_id))
        raw_index_sequence = primer_info["adapter_index_sequence"]

        logging.info(
            "loading fastq %s, index %s, %s",
            fastq_info["id"],
            raw_index_sequence,
            fastq_path,
        )

        flowcell_lane = flowcell_id
        if lane_number is not None:
            flowcell_lane = flowcell_lane + "_" + str(lane_number)

        rev_comp_override = rev_comp_overrides.get(flowcell_lane)

        index_sequence = decode_raw_index_sequence(raw_index_sequence,
                                                   sequencing_instrument,
                                                   rev_comp_override)

        filename_pattern = fastq_info["file_type"]["filename_pattern"]
        # filename_pattern_map yields (read_end, passed); unknown patterns
        # fall through as (None, None).
        read_end, passed = filename_pattern_map.get(filename_pattern,
                                                    (None, None))

        if read_end is None:
            raise Exception(
                "Unrecognized file type: {}".format(filename_pattern))

        # Skip fastqs whose filename pattern indicates failed reads.
        if not passed:
            continue

        if (flowcell_id, str(lane_number), index_sequence,
                read_end) in existing_data:
            logging.info(
                'skipping file {} that has already been imported'.format(
                    fastq_info['data_path']))
            continue

        try:
            cell_sample_id = cell_samples[index_sequence]
        except KeyError:
            raise Exception(
                'unable to find index {} for flowcell lane {} for library {}'.format(
                    index_sequence, flowcell_lane, dlp_library_id))

        # Infer compression from the file extension; only .fastq and
        # .fastq.gz inputs are expected here.
        extension = ''
        compression = 'UNCOMPRESSED'
        if fastq_path.endswith('.gz'):
            extension = '.gz'
            compression = 'GZIP'
        elif not fastq_path.endswith('.fastq'):
            raise ValueError(
                'unknown extension for filename {}'.format(fastq_path))

        tantalus_filename = dlp_fastq_template.format(
            primary_sample_id=primary_sample_id,
            dlp_library_id=dlp_library_id,
            flowcell_id=flowcell_id,
            lane_number=lane_number,
            cell_sample_id=cell_sample_id,
            index_sequence=index_sequence,
            read_end=read_end,
            extension=extension,
        )

        tantalus_path = os.path.join(storage["storage_directory"],
                                     tantalus_filename)

        rsync_file(fastq_path, tantalus_path)

        fastq_file_info.append(
            dict(
                dataset_type="FQ",
                sample_id=cell_sample_id,
                library_id=dlp_library_id,
                library_type="SC_WGS",
                index_format="D",
                sequence_lanes=[
                    dict(
                        flowcell_id=flowcell_id,
                        lane_number=lane_number,
                        sequencing_centre="GSC",
                        sequencing_instrument=sequencing_instrument,
                        sequencing_library_id=gsc_library_id,
                        read_type=read_type,
                    )
                ],
                size=os.path.getsize(fastq_path),
                created=pd.Timestamp(time.ctime(os.path.getmtime(fastq_path)),
                                     tz="Canada/Pacific"),
                file_type="FQ",
                read_end=read_end,
                index_sequence=index_sequence,
                compression=compression,
                filename=tantalus_filename,
            ))

        flowcells_to_be_created.append(flowcell_id + '_' + str(lane_number))

    fastq_paired_end_check(fastq_file_info)

    create_sequence_dataset_models(fastq_file_info, storage["name"], tag_name,
                                   tantalus_api)

    logging.info('import succeeded')

    return flowcells_to_be_created
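
A hedged end-to-end sketch for the GSC import (the client constructors and storage record are assumptions from the surrounding project; the library id is a placeholder):

import logging
from dbclients.colossus import ColossusApi  # assumed client location
from dbclients.tantalus import TantalusApi  # assumed client location

colossus_api = ColossusApi()
tantalus_api = TantalusApi()
storage = tantalus_api.get("storage", name="gsc")  # placeholder storage record

new_flowcells = import_gsc_dlp_paired_fastqs(
    colossus_api, tantalus_api, "A96213A", storage, tag_name="gsc_dlp_import")
logging.info("created flowcells: %s", new_flowcells)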