Example #1

import logging

import pysam

# TantalusApi and the helper functions (get_bam_header_info,
# get_bam_ref_genome, get_bam_aligner_name, add_sequence_dataset)
# are assumed to be imported from the surrounding project.
def import_bam(storage_name,
               bam_file_path,
               sample=None,
               library=None,
               lane_infos=None,
               read_type=None,
               ref_genome=None,
               tag_name=None,
               update=False):
    """
    Imports bam into tantalus

    Args:
        storage_name:   (string) name of destination storage
        bam_file_path:  (string) filepath to bam on destination storage
        sample:         (dict) contains sample_id
        library:        (dict) contains library_id, library_type, index_format
        lane_infos:     (dict) contains flowcell_id, lane_number, 
                        adapter_index_sequence, sequencing_cenre, read_type, 
                        reference_genome, aligner
        read_type:      (string) read type for the run
        tag_name:       (string)
        update:         (boolean)
    Returns:
        sequence_dataset:   (dict) sequence dataset created on tantalus
    """
    tantalus_api = TantalusApi()

    # Get a url allowing access regardless of whether the file
    # is in cloud or local storage
    storage_client = tantalus_api.get_storage_client(storage_name)
    bam_filename = tantalus_api.get_file_resource_filename(
        storage_name, bam_file_path)
    bam_url = storage_client.get_url(bam_filename)

    bam_header = pysam.AlignmentFile(bam_url).header
    bam_header_info = get_bam_header_info(bam_header)

    if ref_genome is None:
        ref_genome = get_bam_ref_genome(bam_header)

    aligner_name = get_bam_aligner_name(bam_header)

    logging.info(
        f"bam header shows reference genome {ref_genome} and aligner {aligner_name}"
    )

    bai_file_path = None
    if storage_client.exists(bam_filename + ".bai"):
        bai_file_path = bam_file_path + ".bai"
    else:
        logging.info(f"no bam index found at {bam_filename + '.bai'}")

    # If no sample was specified assume it exists in tantalus and
    # search for it based on header info
    if sample is None:
        if len(bam_header_info["sample_ids"]) != 1:
            raise ValueError(
                f"found sample_ids={bam_header_info['sample_ids']}, please specify override sample id"
            )
        sample_id = list(bam_header_info["sample_ids"])[0]
        sample = tantalus_api.get('sample', sample_id=sample_id)

    # If no library was specified assume it exists in tantalus and
    # search for it based on header info
    if library is None:
        if len(bam_header_info["library_ids"]) != 1:
            raise ValueError(
                f"found library_ids={bam_header_info['library_ids']}, please specify override library id"
            )
        library_id = list(bam_header_info["library_ids"])[0]
        library = tantalus_api.get('dna_library', library_id=library_id)

    # Default to paired end reads
    if read_type is None:
        read_type = 'P'

    # If no lane infos were specified create them from header info
    if lane_infos is None:
        lane_infos = []
        for lane in bam_header_info["sequence_lanes"]:
            lane_info = {
                "flowcell_id": lane["flowcell_id"],
                "lane_number": lane["lane_number"],
                "library_id": lane["library_id"],
                "sequencing_centre": lane["sequencing_centre"],
                "read_type": read_type,
            }
            lane_infos.append(lane_info)

    # Add the sequence dataset to Tantalus
    sequence_dataset = add_sequence_dataset(
        tantalus_api,
        storage_name=storage_name,
        sample=sample,
        library=library,
        dataset_type="BAM",
        sequence_lanes=lane_infos,
        bam_file_path=bam_file_path,
        reference_genome=ref_genome,
        aligner=aligner_name,
        bai_file_path=bai_file_path,
        tag_name=tag_name,
        update=update,
    )

    return sequence_dataset
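
A minimal usage sketch follows; the storage name, bam path, and tag below are hypothetical and assume a storage already registered in Tantalus:

# Hypothetical invocation: register an existing bam on a known storage,
# letting import_bam infer the sample, library, and lane information
# from the bam header.
dataset = import_bam(
    storage_name="singlecellblob",             # assumed registered storage
    bam_file_path="/data/SA501/SA501X3F.bam",  # hypothetical bam location
    tag_name="sa501_bams",                     # optional dataset tag
)
print(dataset["id"])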
Example #2

import logging

# TantalusApi, NotFoundError, and the transfer_files module are assumed
# to be imported from the surrounding project.
def add_generic_results(filepaths,
                        storage_name,
                        results_name,
                        results_type,
                        results_version,
                        sample_ids=(),
                        library_ids=(),
                        analysis_pk=None,
                        recursive=False,
                        tag_name=None,
                        update=False,
                        remote_storage_name=None):
    """
    Creates a results dataset in Tantalus from a list of files.

    Args:
        filepaths:      (list of string) paths to the result files
        storage_name:   (string) name of the storage holding the files
        results_name:   (string) unique name for the results dataset
        results_type:   (string) type of the results
        results_version:    (string) version of the results
        sample_ids:     (iterable of string) sample ids to link to the dataset
        library_ids:    (iterable of string) library ids to link to the dataset
        analysis_pk:    (int) primary key of the associated analysis
        recursive:      (boolean) treat each filepath as a directory prefix
                        and add every file beneath it
        tag_name:       (string) optional tag to apply to the dataset
        update:         (boolean) whether to update existing records
        remote_storage_name:    (string) if given, transfer the dataset to
                        this storage after creation
    Returns:
        results_dataset:    (dict) results dataset created in Tantalus
    """

    tantalus_api = TantalusApi()
    storage_client = tantalus_api.get_storage_client(storage_name)

    sample_pks = []
    for sample_id in sample_ids:
        sample = tantalus_api.get(
            "sample",
            sample_id=sample_id,
        )
        sample_pks.append(sample['id'])

    library_pks = []
    for library_id in library_ids:
        library = tantalus_api.get(
            "dna_library",
            library_id=library_id,
        )
        library_pks.append(library['id'])

    # Add the file resources to Tantalus
    file_resource_pks = []
    for filepath in filepaths:
        if recursive:
            logging.info("Recursing directory {}".format(filepath))
            filename_prefix = tantalus_api.get_file_resource_filename(
                storage_name, filepath)
            add_filepaths = []
            for filename in storage_client.list(filename_prefix):
                add_filepaths.append(
                    tantalus_api.get_filepath(storage_name, filename))

        else:
            add_filepaths = [filepath]

        for add_filepath in add_filepaths:
            logging.info(
                "Adding file resource for {} to Tantalus".format(add_filepath))
            resource, instance = tantalus_api.add_file(
                storage_name=storage_name,
                filepath=add_filepath,
                update=update,
            )
            file_resource_pks.append(resource["id"])

    results_dataset_fields = dict(
        name=results_name,
        results_type=results_type,
        results_version=results_version,
        analysis=analysis_pk,
        samples=sample_pks,
        libraries=library_pks,
        file_resources=file_resource_pks,
    )

    # Add the dataset to Tantalus
    try:
        results_id = tantalus_api.get(
            "results", name=results_dataset_fields["name"])["id"]
    except NotFoundError:
        results_id = None

    if update and results_id is not None:
        logging.warning("results dataset {} exists, updating".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.update("results",
                                              id=results_id,
                                              **results_dataset_fields)

    else:
        logging.info("creating results dataset {}".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.get_or_create("results",
                                                     **results_dataset_fields)

    if tag_name is not None:
        # Use the id from the returned dataset: results_id is None when
        # the dataset was just created rather than updated.
        tantalus_api.tag(tag_name, resultsdataset_set=[results_dataset['id']])

    logging.info("Succesfully created sequence dataset with ID {}".format(
        results_dataset["id"]))

    if remote_storage_name is not None:
        transfer_files.transfer_dataset(tantalus_api, results_dataset['id'],
                                        "resultsdataset", storage_name,
                                        remote_storage_name)

    return results_dataset
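
A similarly hedged usage sketch; the file paths, storage names, sample id, and analysis primary key below are placeholders:

# Hypothetical invocation: register two result files as a single
# results dataset, link it to an existing sample and analysis, then
# mirror the dataset to a remote storage.
results = add_generic_results(
    filepaths=["/results/qc/metrics.csv", "/results/qc/plots.pdf"],
    storage_name="shahlab",                # assumed registered storage
    results_name="qc_metrics_SA501",       # must be unique among results datasets
    results_type="qc",
    results_version="v1",
    sample_ids=("SA501",),                 # hypothetical sample id
    analysis_pk=1234,                      # hypothetical analysis primary key
    remote_storage_name="singlecellblob",  # optional mirror target
)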