示例#1
0
def print_hpo(out_dir):
    """Write the HPO resource files into a directory.

    Four files are produced, one after the other: the HPO terms, the
    genes-to-phenotype table, the phenotype-to-genes table and the
    diseases-to-genes-to-phenotypes table. Each file is opened in write
    mode (utf-8) and every fetched line is written followed by a newline.

    Args:
        out_dir(Path): Directory where the files are created
    """

    def _dump(file_name, message, fetch_lines):
        """Log the destination and write all lines from fetch_lines() to it."""
        destination = out_dir / file_name
        LOG.info(message, destination)
        with destination.open("w", encoding="utf-8") as outfile:
            # The fetcher is invoked only after the file has been opened
            outfile.writelines(line + "\n" for line in fetch_lines())

    _dump("hpo.obo", "Download HPO terms to %s", fetch_hpo_terms)
    _dump(
        "ALL_SOURCES_ALL_FREQUENCIES_genes_to_phenotype.txt",
        "Download HPO genes to %s",
        fetch_hpo_genes,
    )
    _dump(
        "ALL_SOURCES_ALL_FREQUENCIES_phenotype_to_genes.txt",
        "Download HPO TO genes to %s",
        fetch_hpo_to_genes,
    )
    _dump(
        "ALL_SOURCES_ALL_FREQUENCIES_diseases_to_genes_to_phenotypes.txt",
        "Download HPO disease %s",
        fetch_hpo_phenotype_to_terms,
    )
def generate_hpo_terms(genes):
    """Generate the lines from a reduced hpo terms file

    Iterates over the lines returned by ``fetch_hpo_genes()``. Lines that are
    at most one character long after stripping trailing whitespace are
    skipped. The very first line of the input (index 0) is treated as the
    header and is always yielded. Every other line is kept only if its second
    tab separated column is found in ``genes``.

    When the input is exhausted the number of kept (non header) lines is
    logged.

    Args:
        genes(dict): A map from hgnc_symbol to hgnc_id

    Yields:
        line(str): Lines from hpo with connection to genes
    """
    nr_terms = 0

    for i, line in enumerate(fetch_hpo_genes()):
        line = line.rstrip()
        if len(line) <= 1:
            continue
        # Header line; only recognised when it is the first line of the input
        if i == 0:
            yield line
            continue

        # The gene symbol is expected in the second column
        hgnc_symbol = line.split("\t")[1]

        if hgnc_symbol in genes:
            # The original had a bare `nr_terms` expression here, which did
            # nothing; the counter was clearly meant to be incremented.
            nr_terms += 1
            yield line

    LOG.info("Number of HPO lines connected to the genes: %s", nr_terms)
示例#3
0
def genes(build, api_key):
    """
    Reload gene and transcript information in the mongo database.

    Existing genes and transcripts for ``build`` are dropped before the new
    ones are loaded. If ``build`` is falsy both "37" and "38" are loaded.
    ``api_key`` falls back to the "OMIM_API_KEY" entry of the application
    config; without a key the OMIM files are not fetched. A failure while
    fetching the OMIM files is logged and aborts the command.
    """
    LOG.info("Running scout update genes")
    db_adapter = store

    # Fetch the omim information, only possible when a key is available
    api_key = api_key or current_app.config.get("OMIM_API_KEY")
    mim_files = {}
    if api_key:
        try:
            mim_files = fetch_mim_files(
                api_key, mim2genes=True, morbidmap=True, genemap2=True
            )
        except Exception as err:
            LOG.warning(err)
            raise click.Abort()
    else:
        LOG.warning(
            "No omim api key provided, Please not that some information will be missing"
        )

    # Remove what is already stored before loading the new information
    LOG.warning("Dropping all gene information")
    db_adapter.drop_genes(build)
    LOG.info("Genes dropped")
    LOG.warning("Dropping all transcript information")
    db_adapter.drop_transcripts(build)
    LOG.info("transcripts dropped")

    hpo_genes = fetch_hpo_genes()

    builds = [build] if build else ["37", "38"]

    hgnc_lines = fetch_hgnc()
    exac_lines = fetch_exac_constraint()

    for genome_build in builds:
        ensembl_gene_lines = fetch_ensembl_genes(build=genome_build)

        # load the genes
        hgnc_genes = load_hgnc_genes(
            adapter=db_adapter,
            ensembl_lines=ensembl_gene_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim_files.get("mim2genes"),
            genemap_lines=mim_files.get("genemap2"),
            hpo_lines=hpo_genes,
            build=genome_build,
        )

        # Map ensembl ids to the loaded gene objects
        genes_by_ensembl_id = {
            gene_obj["ensembl_id"]: gene_obj for gene_obj in hgnc_genes
        }

        # Fetch the transcripts from ensembl and load them
        ensembl_transcripts = fetch_ensembl_transcripts(build=genome_build)
        load_transcripts(
            db_adapter, ensembl_transcripts, genome_build, genes_by_ensembl_id
        )

    db_adapter.update_indexes()

    LOG.info("Genes, transcripts and Exons loaded")
示例#4
0
def setup_scout(
    adapter,
    institute_id="cust000",
    user_name="Clark Kent",
    user_mail="*****@*****.**",
    api_key=None,
    demo=False,
    resource_files=None,
):
    """Set up a working scout instance.

    WARNING: Every existing collection whose name does not start with
    "system" is dropped before anything is loaded.

    The steps are, in order:
        1. Drop existing collections
        2. Add an institute and an admin user
        3. Load genes and transcripts for builds "37" and "38"
        4. Load the HPO information
        5. If demo, load a gene panel and a case
        6. Create the indexes

    Resources are read from the paths given in ``resource_files`` when
    present, otherwise they are fetched. The OMIM files are only fetched if
    no local genemap/mim2gene pair was given and ``api_key`` is set.

    Genes:
         Scout uses HGNC as the source for gene identifiers and ensembl as
         source for coordinates. Additional information on disease
         connections for genes is fetched from OMIM. The link between hpo
         terms and genes is fetched from HPO.
         For more details check the documentation.

    Args:
        adapter: Database adapter used for all reads and writes
        institute_id(str): Used as internal id and display name of the institute
        user_name(str): Name of the admin user
        user_mail(str): Mail of the admin user, also used as user id and
                        as sanger recipient of the institute
        api_key(str): OMIM api key
        demo(bool): If true the demo resource files replace resource_files
        resource_files(dict): Map from resource name to file path
    """
    LOG.info("Check if there was a database, delet if existing")
    dropped_any = False
    for name in adapter.db.collection_names():
        if not name.startswith("system"):
            LOG.info("Deleting collection %s", name)
            adapter.db.drop_collection(name)
            dropped_any = True

    if dropped_any:
        LOG.info("Database deleted")

    adapter.add_institute(
        build_institute(
            internal_id=institute_id,
            display_name=institute_id,
            sanger_recipients=[user_mail],
        )
    )

    adapter.add_user(
        {
            "_id": user_mail,
            "email": user_mail,
            "name": user_name,
            "roles": ["admin"],
            "institutes": [institute_id],
        }
    )

    # In demo mode the demo files take precedence over anything passed in
    # (demo_files is a name defined outside this function)
    files = demo_files if demo else (resource_files or {})

    # OMIM: local files are used only when both paths are given
    mim2gene_lines = genemap_lines = None
    mim2gene_path = files.get("mim2gene_path")
    genemap_path = files.get("genemap_path")
    if genemap_path and mim2gene_path:
        mim2gene_lines = list(get_file_handle(mim2gene_path))
        genemap_lines = list(get_file_handle(genemap_path))

    if genemap_lines is None and api_key:
        try:
            mim_files = fetch_mim_files(api_key, mim2genes=True, genemap2=True)
        except Exception as err:
            LOG.warning(err)
            raise err
        else:
            mim2gene_lines = mim_files["mim2genes"]
            genemap_lines = mim_files["genemap2"]

    # For each remaining resource: read the local file if given, else fetch
    hpogenes_path = files.get("hpogenes_path")
    hpo_gene_lines = (
        list(get_file_handle(hpogenes_path)) if hpogenes_path else fetch_hpo_genes()
    )

    hgnc_path = files.get("hgnc_path")
    hgnc_lines = list(get_file_handle(hgnc_path)) if hgnc_path else fetch_hgnc()

    exac_path = files.get("exac_path")
    exac_lines = (
        list(get_file_handle(exac_path)) if exac_path else fetch_exac_constraint()
    )

    for build in ("37", "38"):
        genes_path = files.get("genes{}_path".format(build))
        if genes_path:
            ensembl_gene_lines = get_file_handle(genes_path)
        else:
            ensembl_gene_lines = fetch_ensembl_genes(build=build)

        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_gene_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_gene_lines,
            build=build,
        )

        # Create a map from ensembl ids to gene objects
        genes_by_ensembl_id = {
            gene_obj["ensembl_id"]: gene_obj for gene_obj in hgnc_genes
        }

        tx_path = files.get("transcripts{}_path".format(build))
        if tx_path:
            ensembl_transcripts = get_file_handle(tx_path)
        else:
            ensembl_transcripts = fetch_ensembl_transcripts(build=build)

        # Load the transcripts for a certain build
        load_transcripts(adapter, ensembl_transcripts, build, genes_by_ensembl_id)

    # HPO resources are optional, a missing path is passed on as None
    hpoterms_path = files.get("hpoterms_path")
    hpo_terms_handle = get_file_handle(hpoterms_path) if hpoterms_path else None

    hpo_to_genes_path = files.get("hpo_to_genes_path")
    hpo_to_genes_handle = (
        get_file_handle(hpo_to_genes_path) if hpo_to_genes_path else None
    )

    hpo_disease_path = files.get("hpo_disease_path")
    hpo_disease_handle = (
        get_file_handle(hpo_disease_path) if hpo_disease_path else None
    )

    load_hpo(
        adapter=adapter,
        hpo_lines=hpo_terms_handle,
        hpo_gene_lines=hpo_to_genes_handle,
        disease_lines=genemap_lines,
        hpo_disease_lines=hpo_disease_handle,
    )

    # If demo we load a gene panel and some case information
    # (panel_path and load_path are names defined outside this function)
    if demo:
        adapter.load_panel(
            parse_gene_panel(
                path=panel_path,
                institute="cust000",
                panel_id="panel1",
                version=1.0,
                display_name="Test panel",
            )
        )

        case_data = yaml.load(get_file_handle(load_path), Loader=yaml.FullLoader)
        adapter.load_case(case_data)

    LOG.info("Creating indexes")
    adapter.load_indexes()
    LOG.info("Scout instance setup successful")