예제 #1
0
def test_fetch_ensembl_transcripts(mocker):
    """Fetching ensembl transcripts should hand back a truthy result."""

    # GIVEN that the EnsemblBiomartClient attribute on scout_requests is patched
    mocker.patch.object(scout_requests, "EnsemblBiomartClient")

    # WHEN the transcripts resource is requested with default arguments
    fetched = scout_requests.fetch_ensembl_transcripts()

    # THEN whatever comes back must be truthy
    assert fetched
예제 #2
0
def generate_ensembl_transcripts(ensembl_genes, build=None):
    """Yield tab separated lines with reduced ensembl transcript information

    The first line yielded is a header. After that one line is yielded for
    every parsed transcript whose ensembl gene id is found in ``ensembl_genes``;
    all other transcripts are skipped.

    Args:
        ensembl_genes(dict): Container of ensembl gene ids to keep. Only a
            membership test (``in``) is performed on it here.
        build(str): What build to use. Defaults to 37

    Yields:
        str: Tab separated lines of the reduced file, header first

    """
    if not build:
        build = "37"

    transcript_lines = fetch_ensembl_transcripts(build=build)

    header_fields = (
        "Chromosome/scaffold name",
        "Gene stable ID",
        "Transcript stable ID",
        "Transcript start (bp)",
        "Transcript end (bp)",
        "RefSeq mRNA ID",
        "RefSeq mRNA predicted ID",
        "RefSeq ncRNA ID",
    )
    yield "\t".join(header_fields)

    for transcript in parse_ensembl_transcripts(transcript_lines):
        # Only transcripts that belong to a requested gene are written
        if transcript["ensembl_gene_id"] not in ensembl_genes:
            continue

        columns = (
            transcript["chrom"],
            transcript["ensembl_gene_id"],
            transcript["ensembl_transcript_id"],
            str(transcript["transcript_start"]),
            str(transcript["transcript_end"]),
            # Missing refseq identifiers are written as empty columns
            transcript["refseq_mrna"] or "",
            transcript["refseq_mrna_predicted"] or "",
            transcript["refseq_ncrna"] or "",
        )
        yield "\t".join(columns)
예제 #3
0
파일: setup.py 프로젝트: terestahl/scout
def setup_scout(
    adapter,
    institute_id="cust000",
    user_name="Clark Kent",
    user_mail="*****@*****.**",
    api_key=None,
    demo=False,
    resource_files=None,
):
    """Set up a working scout instance from scratch.

    WARNING: Every collection whose name does not start with "system" is
    dropped before anything is loaded.

    An institute and an admin user are inserted first. After that the
    reference data is loaded: cytobands, genes and transcripts for builds 37
    and 38, and HPO information. Each resource is taken from the path found in
    ``resource_files`` when one is configured, otherwise the matching fetch
    function is called.

    Genes:
         Scout uses HGNC as the source for gene identifiers and ensembl as source for coordinates.
         Additional information on disease connections for genes is fetched from OMIM.
         The link between hpo terms and genes is fetched from HPO.
         For more details check the documentation.

    Args:
        adapter: Database adapter; its ``db`` attribute and the ``add_*`` /
            ``load_*`` methods are used here.
        institute_id(str): Used as both internal id and display name of the institute
        user_name(str): Name of the admin user
        user_mail(str): Used as id and email of the admin user and as sanger recipient
        api_key(str): OMIM api key, only used when the OMIM files are not both
            given in ``resource_files``
        demo(bool): If True the demo resource files replace ``resource_files``
            and a demo panel and case are loaded
        resource_files(dict): Map from resource key (e.g. "hgnc_path") to file path

    """

    LOG.info("Check if there was a database, delete if existing")
    dropped_any = False
    for name in adapter.db.list_collection_names():
        # Collections with a "system" prefix are left alone
        if not name.startswith("system"):
            LOG.info("Deleting collection %s", name)
            adapter.db.drop_collection(name)
            dropped_any = True

    if dropped_any:
        LOG.info("Database deleted")

    adapter.add_institute(
        build_institute(
            internal_id=institute_id,
            display_name=institute_id,
            sanger_recipients=[user_mail],
        )
    )

    adapter.add_user(
        {
            "_id": user_mail,
            "email": user_mail,
            "name": user_name,
            "roles": ["admin"],
            "institutes": [institute_id],
        }
    )

    # In demo mode the bundled demo files take precedence over anything passed in
    resource_files = demo_files if demo else (resource_files or {})

    def read_or_fetch(key, fetch):
        """Return all lines of the file configured under key, else the result of fetch()."""
        path = resource_files.get(key)
        return list(get_file_handle(path)) if path else fetch()

    def open_if_configured(key):
        """Return a handle for the file configured under key, or None if there is none."""
        path = resource_files.get(key)
        return get_file_handle(path) if path else None

    # OMIM: use local files only if BOTH are given, else fall back to the api
    # when a key is available. Without either, both stay None.
    mim2gene_lines = genemap_lines = None
    mim2gene_path = resource_files.get("mim2gene_path")
    genemap_path = resource_files.get("genemap_path")
    if genemap_path and mim2gene_path:
        mim2gene_lines = list(get_file_handle(mim2gene_path))
        genemap_lines = list(get_file_handle(genemap_path))
    elif api_key:
        try:
            mim_files = fetch_mim_files(api_key, mim2genes=True, genemap2=True)
        except Exception as err:
            LOG.warning(err)
            raise
        mim2gene_lines = mim_files["mim2genes"]
        genemap_lines = mim_files["genemap2"]

    hpo_gene_lines = read_or_fetch("hpogenes_path", fetch_genes_to_hpo_to_disease)
    hgnc_lines = read_or_fetch("hgnc_path", fetch_hgnc)
    exac_lines = read_or_fetch("exac_path", fetch_exac_constraint)

    # Load cytobands into cytoband collection
    for genome_build, cytobands_path in cytoband_files.items():
        load_cytobands(cytobands_path, genome_build, adapter)

    for build in ("37", "38"):
        genes_key = "genes{}_path".format(build)
        genes_file = resource_files.get(genes_key)
        if genes_file:
            gene_lines = get_file_handle(genes_file)
        else:
            gene_lines = fetch_ensembl_genes(build=build)

        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=gene_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_gene_lines,
            build=build,
        )

        # Create a map from ensembl ids to gene objects
        genes_by_ensembl_id = {
            gene_obj["ensembl_id"]: gene_obj for gene_obj in hgnc_genes
        }

        tx_key = "transcripts{}_path".format(build)
        tx_file = resource_files.get(tx_key)
        if tx_file:
            transcript_lines = get_file_handle(tx_file)
        else:
            transcript_lines = fetch_ensembl_transcripts(build=build)

        # Load the transcripts for a certain build
        load_transcripts(adapter, transcript_lines, build, genes_by_ensembl_id)

    hpo_terms_handle = open_if_configured("hpoterms_path")
    hpo_to_genes_handle = open_if_configured("hpo_to_genes_path")
    # NOTE(review): this handle is opened but not forwarded to load_hpo below;
    # confirm whether it should be passed on.
    hpo_disease_handle = open_if_configured("hpo_disease_path")

    load_hpo(
        adapter=adapter,
        disease_lines=genemap_lines,
        hpo_lines=hpo_terms_handle,
        hpo_gene_lines=hpo_to_genes_handle,
    )

    # If demo we load a gene panel and some case information
    if demo:
        adapter.load_panel(
            parse_gene_panel(
                path=panel_path,
                institute="cust000",
                panel_id="panel1",
                version=1.0,
                display_name="Test panel",
            )
        )

        case_data = yaml.load(get_file_handle(load_path), Loader=yaml.FullLoader)
        adapter.load_case(parse_case_data(config=case_data))

    LOG.info("Creating indexes")
    adapter.load_indexes()
    LOG.info("Scout instance setup successful")
예제 #4
0
def genes(build, api_key):
    """
    Drop and reload gene and transcript information in the mongo database.

    Existing genes and transcripts are dropped for ``build`` and then
    reloaded for that build, or for both "37" and "38" when ``build`` is
    falsy. OMIM files are only fetched when an api key is given or found in
    the app config under "OMIM_API_KEY"; a failing OMIM fetch aborts the
    command.
    """
    LOG.info("Running scout update genes")
    db_adapter = store

    # Fetch the omim information
    if not api_key:
        api_key = current_app.config.get("OMIM_API_KEY")

    mim_files = {}
    if api_key:
        try:
            mim_files = fetch_mim_files(api_key, mim2genes=True, morbidmap=True, genemap2=True)
        except Exception as err:
            LOG.warning(err)
            raise click.Abort()
    else:
        LOG.warning("No omim api key provided, Please not that some information will be missing")

    LOG.warning("Dropping all gene information")
    db_adapter.drop_genes(build)
    LOG.info("Genes dropped")
    LOG.warning("Dropping all transcript information")
    db_adapter.drop_transcripts(build)
    LOG.info("transcripts dropped")

    # Resources shared by every build are fetched once up front
    hpo_genes = fetch_genes_to_hpo_to_disease()
    hgnc_lines = fetch_hgnc()
    exac_lines = fetch_exac_constraint()

    builds = [build] if build else ["37", "38"]

    for genome_build in builds:
        gene_lines = fetch_ensembl_genes(build=genome_build)

        # load the genes
        hgnc_genes = load_hgnc_genes(
            adapter=db_adapter,
            ensembl_lines=gene_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim_files.get("mim2genes"),
            genemap_lines=mim_files.get("genemap2"),
            hpo_lines=hpo_genes,
            build=genome_build,
        )

        # Map from ensembl id to the gene object that was just loaded
        genes_by_ensembl_id = {gene_obj["ensembl_id"]: gene_obj for gene_obj in hgnc_genes}

        # Fetch the transcripts from ensembl
        transcript_lines = fetch_ensembl_transcripts(build=genome_build)

        load_transcripts(db_adapter, transcript_lines, genome_build, genes_by_ensembl_id)

    db_adapter.update_indexes()

    LOG.info("Genes, transcripts and Exons loaded")
예제 #5
0
def load_transcripts(adapter,
                     transcripts_lines=None,
                     build="37",
                     ensembl_genes=None):
    """Load all the transcripts

    Transcript information is from ensembl. Transcripts whose gene is not
    found in ``ensembl_genes`` are skipped. For the remaining transcripts one
    refseq identifier is selected (see comment in the code), a transcript
    object is built and all objects are bulk loaded through the adapter.

    Args:
        adapter(MongoAdapter)
        transcripts_lines(iterable): iterable with ensembl transcript lines.
            If None the lines are fetched with fetch_ensembl_transcripts
        build(str)
        ensembl_genes(dict): Map from ensembl_id -> HgncGene. If falsy the
            map is collected from adapter.ensembl_genes(build)

    Returns:
        transcript_objs(list): A list with all transcript objects
    """
    # Fetch all genes with ensemblid as keys
    ensembl_genes = ensembl_genes or adapter.ensembl_genes(build)

    if transcripts_lines is None:
        transcripts_lines = fetch_ensembl_transcripts(build=build)

    # Map with all transcripts enstid -> parsed transcript
    transcripts_dict = parse_transcripts(transcripts_lines)
    # Iterate over a copy of the keys since entries are removed on the way
    for ens_tx_id in list(transcripts_dict):
        parsed_tx = transcripts_dict[ens_tx_id]
        # Get the ens gene id
        ens_gene_id = parsed_tx["ensembl_gene_id"]

        # Fetch the internal gene object to find out the correct hgnc id
        gene_obj = ensembl_genes.get(ens_gene_id)
        # If the gene is non existing in scout we skip the transcript
        if not gene_obj:
            transcripts_dict.pop(ens_tx_id)
            LOG.debug("Gene %s does not exist in build %s", ens_gene_id, build)
            continue

        # Add the correct hgnc id
        parsed_tx["hgnc_id"] = gene_obj["hgnc_id"]
        # Primary transcript information is collected from HGNC
        parsed_tx["primary_transcripts"] = set(
            gene_obj.get("primary_transcripts", []))

    # Both counters count TRANSCRIPTS (at most one increment per transcript),
    # matching what the log messages at the end report.
    ref_seq_transcripts = 0
    nr_primary_transcripts = 0
    nr_transcripts = len(transcripts_dict)

    transcript_objs = []

    with progressbar(transcripts_dict.values(),
                     label="Building transcripts",
                     length=nr_transcripts) as bar:
        for tx_data in bar:

            #################### Get the correct refseq identifier ####################
            # We need to decide one refseq identifier for each transcript, if there are any to
            # choose from. The categories are visited in the order of TRANSCRIPT_CATEGORIES:
            # - The first identifier encountered is used as a fallback
            # - An identifier that is in 'primary_transcripts' replaces the current choice
            #   and marks the transcript as primary
            # We will store all refseq identifiers in a "refseq_identifiers" list as well
            tx_data["is_primary"] = False
            primary_transcripts = tx_data["primary_transcripts"]
            refseq_identifier = None
            refseq_identifiers = []
            for category in TRANSCRIPT_CATEGORIES:
                identifiers = tx_data[category]
                if not identifiers:
                    continue

                for refseq_id in identifiers:
                    # Add all refseq identifiers to refseq_identifiers
                    refseq_identifiers.append(refseq_id)

                    if refseq_id in primary_transcripts:
                        refseq_identifier = refseq_id
                        tx_data["is_primary"] = True

                    if not refseq_identifier:
                        refseq_identifier = refseq_id

            if refseq_identifier:
                tx_data["refseq_id"] = refseq_identifier
            if refseq_identifiers:
                tx_data["refseq_identifiers"] = refseq_identifiers
                # Count the transcript once, no matter how many identifiers it has
                ref_seq_transcripts += 1
            if tx_data["is_primary"]:
                nr_primary_transcripts += 1

            ####################  ####################  ####################
            # Build the transcript object
            tx_obj = build_transcript(tx_data, build)
            transcript_objs.append(tx_obj)

    # Load all transcripts
    LOG.info("Loading transcripts...")
    # Skip the bulk call when there is nothing to load
    if transcript_objs:
        adapter.load_transcript_bulk(transcript_objs)

    LOG.info("Number of transcripts in build %s: %s", build, nr_transcripts)
    LOG.info("Number of transcripts with refseq identifier: %s",
             ref_seq_transcripts)
    LOG.info("Number of primary transcripts: %s", nr_primary_transcripts)

    return transcript_objs