def print_hgnc(out_dir):
    """Write the fetched HGNC gene lines to ``hgnc.txt`` in a directory.

    Args:
        out_dir(Path): Directory where ``hgnc.txt`` is created (overwritten if present)
    """
    file_path = out_dir / "hgnc.txt"
    LOG.info("Downloads HGNC genes to %s", file_path)

    with file_path.open("w", encoding="utf-8") as outfile:
        # Every fetched line gets a trailing newline appended before it is written
        outfile.writelines(line + "\n" for line in fetch_hgnc())
def test_fetch_hgnc(hgnc_file, mocker):
    """Check that fetch_hgnc returns the HGNC content served by a patched urlopen"""
    # GIVEN a patched urlopen and the raw bytes of a file with hgnc info
    urlopen_mock = mocker.patch.object(scout_requests.urllib.request, "urlopen")
    with open(hgnc_file, "rb") as infile:
        hgnc_bytes = infile.read()

    with tempfile.TemporaryFile() as fake_response:
        # The temporary file stands in for the response object returned by urlopen
        fake_response.write(hgnc_bytes)
        fake_response.seek(0)
        urlopen_mock.return_value = fake_response

        # WHEN fetching the resource
        hgnc_lines = scout_requests.fetch_hgnc()

        # THEN assert that the HGNC header is there
        assert "hgnc_id\tsymbol" in hgnc_lines[0]
def generate_hgnc(genes):
    """Generate the lines of a reduced HGNC file.

    The latest HGNC lines are fetched with ``fetch_hgnc``. The header line is
    always yielded, followed by every gene line whose hgnc_id is found in ``genes``.

    Args:
        genes(dict): A dictionary with hgnc_id as key and hgnc_symbol as value

    Yields:
        print_line(str): Lines from the reduced file (right-stripped)
    """
    LOG.info("Generating new hgnc reduced file")

    header = None
    genes_found = 0

    # Loop over all lines of the latest hgnc file
    for line in fetch_hgnc():
        line = line.rstrip()
        # Skip lines that are empty
        if not line:
            continue

        # The first NON-EMPTY line is the header. Checking the raw line index
        # instead would leave the header unset whenever the file starts with
        # blank lines, and every following line would be parsed without a header.
        if header is None:
            header = line.split("\t")
            yield line
            continue

        # Parse the hgnc gene line; lines that do not parse to a gene are ignored
        gene = parse_hgnc_line(line, header)
        if not gene:
            continue

        # Only keep genes that are part of the reduced set
        if int(gene["hgnc_id"]) in genes:
            genes_found += 1
            yield line

    LOG.info("Number of genes printed to file: %s", genes_found)
def load_hgnc_genes(
    adapter,
    genes=None,
    ensembl_lines=None,
    hgnc_lines=None,
    exac_lines=None,
    mim2gene_lines=None,
    genemap_lines=None,
    hpo_lines=None,
    build="37",
    omim_api_key="",
):
    """Build gene objects and bulk load them into the database.

    When ``genes`` is empty or missing, any resource that was not supplied is
    fetched and all resources are merged with ``link_genes``. Genes without a
    chromosome are skipped; the rest are built with ``build_hgnc_gene`` and
    loaded in one call to ``adapter.load_hgnc_bulk``.

    Args:
        adapter(scout.adapter.MongoAdapter)
        genes(dict): Already linked genes. If given, no resources are fetched or linked
        ensembl_lines(iterable(str)): Lines formated with ensembl gene information
        hgnc_lines(iterable(str)): Lines with gene information from genenames.org
        exac_lines(iterable(str)): Lines with information pLi-scores from ExAC
        mim2gene_lines(iterable(str)): Lines with map from omim id to gene symbol
        genemap_lines(iterable(str)): Lines with information of omim entries
        hpo_lines(iterable(str)): Lines information about map from hpo terms to genes
        build(str): What build to use. Defaults to '37'
        omim_api_key(str): Used to fetch the OMIM files when mim2gene_lines or
            genemap_lines are missing. Without it only a warning is logged

    Returns:
        gene_objects(list): A list with all gene_objects that was loaded into database
    """
    if not genes:
        # Fetch the resources if not provided
        if ensembl_lines is None:
            ensembl_lines = fetch_ensembl_genes(build=build)
        if not hgnc_lines:
            hgnc_lines = fetch_hgnc()
        if not exac_lines:
            exac_lines = fetch_exac_constraint()

        # Both OMIM resources are needed, otherwise they are fetched together
        omim_incomplete = not mim2gene_lines or not genemap_lines
        if omim_incomplete and omim_api_key:
            mim_files = fetch_mim_files(omim_api_key, mim2genes=True, genemap2=True)
            mim2gene_lines = mim_files["mim2genes"]
            genemap_lines = mim_files["genemap2"]
        elif omim_incomplete:
            LOG.warning("No omim api key provided!")

        if not hpo_lines:
            hpo_lines = fetch_hpo_files(hpogenes=True)["hpogenes"]

        # Link the resources
        genes = link_genes(
            ensembl_lines=ensembl_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            hpo_lines=hpo_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
        )

    gene_objects = []
    without_coordinates = 0
    total_genes = len(genes)

    with progressbar(genes.values(), label="Building genes", length=total_genes) as bar:
        for gene_info in bar:
            if gene_info.get("chromosome"):
                gene_objects.append(build_hgnc_gene(gene_info, build=build))
                continue

            # Genes without a chromosome are counted but never loaded
            LOG.debug(
                "skipping gene: %s. No coordinates found",
                gene_info.get("hgnc_symbol", "?"),
            )
            without_coordinates += 1

    LOG.info("Loading genes build %s", build)
    adapter.load_hgnc_bulk(gene_objects)

    LOG.info("Loading done. %s genes loaded", len(gene_objects))
    LOG.info("Nr of genes without coordinates in build %s: %s", build, without_coordinates)

    return gene_objects
def setup_scout(
    adapter,
    institute_id="cust000",
    user_name="Clark Kent",
    user_mail="*****@*****.**",
    api_key=None,
    demo=False,
    resource_files=None,
):
    """Set up a working scout instance.

    WARNING: If the instance is populated, all collections whose name does not
    start with "system" are dropped before anything is loaded.

    After the wipe, an institute and an admin user are inserted. Then cytobands,
    genes and transcripts (for builds 37 and 38) and HPO information are loaded,
    and finally the indexes are created.

    Gene resources are read from the paths in ``resource_files`` when present
    and fetched otherwise. According to the original documentation, HGNC is the
    source of gene identifiers, ensembl the source of coordinates, OMIM adds
    disease connections and HPO links hpo terms to genes.

    Args:
        adapter: Database adapter; must expose ``db`` and the load/add methods used below
        institute_id(str): Used as both internal id and display name of the institute
        user_name(str): Name of the admin user
        user_mail(str): Used as user id, user email and sanger recipient of the institute
        api_key(str): OMIM api key, used only when the OMIM lines were not read from files
        demo(bool): If True, ``demo_files`` replaces ``resource_files`` and a demo
            panel and case are loaded
        resource_files(dict): Optional paths. Keys read here: mim2gene_path,
            genemap_path, hpogenes_path, hgnc_path, exac_path, genes37_path,
            genes38_path, transcripts37_path, transcripts38_path, hpoterms_path,
            hpo_to_genes_path, hpo_disease_path

    Raises:
        Exception: Whatever ``fetch_mim_files`` raises is logged and re-raised
    """
    LOG.info("Check if there was a database, delete if existing")
    existing_database = False
    for collection_name in adapter.db.list_collection_names():
        # Collections with a "system" prefix are left untouched
        if collection_name.startswith("system"):
            continue
        LOG.info("Deleting collection %s", collection_name)
        adapter.db.drop_collection(collection_name)
        existing_database = True

    if existing_database:
        LOG.info("Database deleted")

    # Insert the institute; the institute id doubles as its display name
    institute_obj = build_institute(
        internal_id=institute_id,
        display_name=institute_id,
        sanger_recipients=[user_mail],
    )
    adapter.add_institute(institute_obj)

    # Insert an admin user that belongs to the institute
    user_obj = dict(
        _id=user_mail,
        email=user_mail,
        name=user_name,
        roles=["admin"],
        institutes=[institute_id],
    )
    adapter.add_user(user_obj)

    resource_files = resource_files or {}
    if demo:
        # In demo mode any resource files passed by the caller are ignored
        resource_files = demo_files

    # OMIM lines: read from files only when BOTH paths are given
    mim2gene_lines = None
    genemap_lines = None
    mim2gene_path = resource_files.get("mim2gene_path")
    genemap_path = resource_files.get("genemap_path")
    if genemap_path and mim2gene_path:
        mim2gene_lines = [line for line in get_file_handle(mim2gene_path)]
        genemap_lines = [line for line in get_file_handle(genemap_path)]

    # Fall back to fetching the OMIM files if no files were read and a key exists.
    # Without files and without a key both variables stay None.
    if (genemap_lines is None) and api_key:
        try:
            mim_files = fetch_mim_files(api_key, mim2genes=True, genemap2=True)
        except Exception as err:
            LOG.warning(err)
            raise err
        mim2gene_lines = mim_files["mim2genes"]
        genemap_lines = mim_files["genemap2"]

    # The remaining gene resources: use the local file if a path is given, else fetch
    if resource_files.get("hpogenes_path"):
        hpo_gene_lines = [
            line for line in get_file_handle(resource_files.get("hpogenes_path"))
        ]
    else:
        hpo_gene_lines = fetch_genes_to_hpo_to_disease()

    if resource_files.get("hgnc_path"):
        hgnc_lines = [
            line for line in get_file_handle(resource_files.get("hgnc_path"))
        ]
    else:
        hgnc_lines = fetch_hgnc()

    if resource_files.get("exac_path"):
        exac_lines = [
            line for line in get_file_handle(resource_files.get("exac_path"))
        ]
    else:
        exac_lines = fetch_exac_constraint()

    # Load cytobands into cytoband collection
    # (cytoband_files is defined outside this function; it is iterated as
    # genome build -> path)
    for genome_build, cytobands_path in cytoband_files.items():
        load_cytobands(cytobands_path, genome_build, adapter)

    # Genes and transcripts are loaded once per genome build. The hgnc, exac,
    # omim and hpo lines are shared between the builds; only the ensembl
    # resources are build specific.
    builds = ["37", "38"]
    for build in builds:
        genes_path = "genes{}_path".format(build)
        if resource_files.get(genes_path):
            ensembl_genes = get_file_handle(resource_files[genes_path])
        else:
            ensembl_genes = fetch_ensembl_genes(build=build)

        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_genes,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_gene_lines,
            build=build,
        )

        # Create a map from ensembl ids to gene objects
        # (the name ensembl_genes is reused: it held the ensembl gene lines above)
        ensembl_genes = {}
        for gene_obj in hgnc_genes:
            ensembl_id = gene_obj["ensembl_id"]
            ensembl_genes[ensembl_id] = gene_obj

        tx_path = "transcripts{}_path".format(build)
        if resource_files.get(tx_path):
            ensembl_transcripts = get_file_handle(resource_files[tx_path])
        else:
            ensembl_transcripts = fetch_ensembl_transcripts(build=build)

        # Load the transcripts for a certain build
        # NOTE(review): the return value is never used in this function
        transcripts = load_transcripts(adapter, ensembl_transcripts, build, ensembl_genes)

    # Optional HPO resources; a handle stays None when no path is given
    hpo_terms_handle = None
    if resource_files.get("hpoterms_path"):
        hpo_terms_handle = get_file_handle(resource_files["hpoterms_path"])

    hpo_to_genes_handle = None
    if resource_files.get("hpo_to_genes_path"):
        hpo_to_genes_handle = get_file_handle(
            resource_files["hpo_to_genes_path"])

    # NOTE(review): hpo_disease_handle is opened here but never passed to
    # load_hpo below and never closed. Confirm whether load_hpo should
    # receive it or whether this lookup can be removed.
    hpo_disease_handle = None
    if resource_files.get("hpo_disease_path"):
        hpo_disease_handle = get_file_handle(
            resource_files["hpo_disease_path"])

    # genemap_lines may still be None here (no OMIM files and no api key)
    load_hpo(
        adapter=adapter,
        disease_lines=genemap_lines,
        hpo_lines=hpo_terms_handle,
        hpo_gene_lines=hpo_to_genes_handle,
    )

    # If demo we load a gene panel and some case information
    if demo:
        # NOTE(review): the institute is hard coded to "cust000" and does not
        # follow institute_id. Presumably this matches the demo data; confirm.
        parsed_panel = parse_gene_panel(
            path=panel_path,
            institute="cust000",
            panel_id="panel1",
            version=1.0,
            display_name="Test panel",
        )
        adapter.load_panel(parsed_panel)

        # The case config is a YAML file located at load_path (defined
        # outside this function)
        case_handle = get_file_handle(load_path)
        case_data = yaml.load(case_handle, Loader=yaml.FullLoader)
        config_data = parse_case_data(config=case_data)
        adapter.load_case(config_data)

    LOG.info("Creating indexes")
    adapter.load_indexes()
    LOG.info("Scout instance setup successful")
def genes(build, api_key):
    """
    Load the hgnc aliases to the mongo database.
    """
    # NOTE(review): the docstring above is kept unchanged because it may serve
    # as CLI help text for this command; confirm before rewording it.
    #
    # Drops the genes and transcripts for `build`, then reloads them for the
    # requested build, or for builds 37 and 38 when no build is given.
    LOG.info("Running scout update genes")
    adapter = store

    # Fetch the omim information
    api_key = api_key or current_app.config.get("OMIM_API_KEY")
    mim_files = {}
    if api_key:
        try:
            mim_files = fetch_mim_files(api_key, mim2genes=True, morbidmap=True, genemap2=True)
        except Exception as err:
            LOG.warning(err)
            raise click.Abort()
    else:
        LOG.warning("No omim api key provided, Please not that some information will be missing")

    LOG.warning("Dropping all gene information")
    adapter.drop_genes(build)
    LOG.info("Genes dropped")

    LOG.warning("Dropping all transcript information")
    adapter.drop_transcripts(build)
    LOG.info("transcripts dropped")

    # Resources shared by every build are fetched once, before the loop
    hpo_genes = fetch_genes_to_hpo_to_disease()
    builds = [build] if build else ["37", "38"]
    hgnc_lines = fetch_hgnc()
    exac_lines = fetch_exac_constraint()

    # Both are None when no omim files were fetched
    mim2gene_lines = mim_files.get("mim2genes")
    genemap_lines = mim_files.get("genemap2")

    for genome_build in builds:
        ensembl_gene_lines = fetch_ensembl_genes(build=genome_build)

        # load the genes
        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_gene_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_genes,
            build=genome_build,
        )

        # Map from ensembl id to the loaded gene object
        genes_by_ensembl_id = {gene_obj["ensembl_id"]: gene_obj for gene_obj in hgnc_genes}

        # Fetch the transcripts from ensembl
        ensembl_transcripts = fetch_ensembl_transcripts(build=genome_build)
        load_transcripts(adapter, ensembl_transcripts, genome_build, genes_by_ensembl_id)

    adapter.update_indexes()

    LOG.info("Genes, transcripts and Exons loaded")