def generate_mim2genes(genes, api_key):
    """Generate a reduced version of the omim mim2gene file.

    The leading comment lines (starting with '#') are passed through as they
    are. After that the raw line of every parsed entry whose hgnc symbol is a
    key in ``genes`` is yielded.

    Args:
        genes(dict): A dictionary with hgnc_symbol as key and hgnc_id as value
        api_key(str): Key that is handed to fetch_mim_files

    Yields:
        print_line(str): Lines from the reduced file
    """
    mim2gene_lines = fetch_mim_files(api_key, mim2genes=True)['mim2genes']

    # The header ends at the first line that is not a comment
    for header_line in mim2gene_lines:
        if not header_line.startswith('#'):
            break
        yield header_line

    for entry in parse_mim2gene(mim2gene_lines):
        symbol = entry.get('hgnc_symbol')
        # Entries without a symbol can never match a gene of interest
        if symbol and symbol in genes:
            yield entry['raw']
def diseases(context, api_key):
    """
    Update disease terms in mongo database.
    """
    settings = context.obj
    adapter = settings['adapter']

    # A key given on the command line wins over the one stored on the context
    omim_key = api_key if api_key else settings.get('omim_api_key')
    if not omim_key:
        LOG.warning("Please provide a omim api key to load the omim gene panel")
        context.abort()

    # Fetch the omim information before anything is removed from the database
    try:
        mim_files = fetch_mim_files(omim_key, genemap2=True)
    except Exception as err:
        LOG.warning(err)
        context.abort()

    # Replace the complete collection: drop first, then load the new terms
    LOG.info("Dropping DiseaseTerms")
    adapter.disease_term_collection.drop()
    LOG.debug("DiseaseTerms dropped")

    genemap_lines = mim_files['genemap2']
    load_disease_terms(adapter=adapter, genemap_lines=genemap_lines)

    LOG.info("Successfully loaded all disease terms")
def diseases(api_key):
    """
    Update disease terms in mongo database.
    """
    adapter = store

    # A key given by the caller wins over the one in the app configuration
    omim_key = api_key if api_key else current_app.config.get('OMIM_API_KEY')
    if not omim_key:
        LOG.warning("Please provide a omim api key to load the omim gene panel")
        raise click.Abort()

    # Fetch the omim information before anything is removed from the database
    try:
        mim_files = fetch_mim_files(omim_key, genemap2=True)
    except Exception as err:
        LOG.warning(err)
        raise click.Abort()

    # Replace the complete collection: drop first, then load the new terms
    LOG.info("Dropping DiseaseTerms")
    adapter.disease_term_collection.drop()
    LOG.debug("DiseaseTerms dropped")

    genemap_lines = mim_files['genemap2']
    load_disease_terms(adapter=adapter, genemap_lines=genemap_lines)

    LOG.info("Successfully loaded all disease terms")
def genes(context, build, api_key):
    """
    Load the hgnc aliases to the mongo database.
    """
    settings = context.obj
    adapter = settings['adapter']

    # A key given on the command line wins over the one stored on the context
    omim_key = api_key if api_key else settings.get('omim_api_key')
    if not omim_key:
        LOG.warning("Please provide a omim api key to load the omim gene panel")
        context.abort()

    # Fetch the omim information
    try:
        mim_files = fetch_mim_files(
            omim_key, mim2genes=True, morbidmap=True, genemap2=True
        )
    except Exception as err:
        LOG.warning(err)
        context.abort()

    LOG.warning("Dropping all gene information")
    adapter.drop_genes(build)
    LOG.info("Genes dropped")

    hpo_genes = fetch_hpo_genes()

    # Without an explicit build both supported builds are loaded
    builds = [build] if build else ['37', '38']

    # NOTE(review): the handles opened in this loop are never closed here;
    # confirm whether the result of get_file_handle needs explicit closing.
    for genome_build in builds:
        LOG.info("Loading hgnc file from {0}".format(hgnc_path))
        hgnc_handle = get_file_handle(hgnc_path)

        # Only build 37 and 38 have a transcripts file, otherwise None is used
        ensembl_handle = None
        if genome_build == '37':
            ensembl_handle = get_file_handle(transcripts37_path)
        elif genome_build == '38':
            ensembl_handle = get_file_handle(transcripts38_path)

        LOG.info("Loading exac gene file from {0}".format(exac_path))
        exac_handle = get_file_handle(exac_path)

        # Merge all sources and load the result for this build
        linked_genes = link_genes(
            ensembl_lines=ensembl_handle,
            hgnc_lines=hgnc_handle,
            exac_lines=exac_handle,
            mim2gene_lines=mim_files['mim2genes'],
            genemap_lines=mim_files['genemap2'],
            hpo_lines=hpo_genes,
        )
        load_hgnc_genes(adapter=adapter, genes=linked_genes, build=genome_build)
def genes(build, api_key):
    """
    Load the hgnc aliases to the mongo database.
    """
    LOG.info("Running scout update genes")
    adapter = store

    # A key given by the caller wins over the one in the app configuration
    omim_key = api_key if api_key else current_app.config.get('OMIM_API_KEY')
    if not omim_key:
        LOG.warning("Please provide a omim api key to load the omim gene panel")
        raise click.Abort()

    # Fetch the omim information
    try:
        mim_files = fetch_mim_files(
            omim_key, mim2genes=True, morbidmap=True, genemap2=True
        )
    except Exception as err:
        LOG.warning(err)
        raise click.Abort()

    LOG.warning("Dropping all gene information")
    adapter.drop_genes(build)
    LOG.info("Genes dropped")

    LOG.warning("Dropping all transcript information")
    adapter.drop_transcripts(build)
    LOG.info("transcripts dropped")

    hpo_genes = fetch_hpo_genes()

    # Without an explicit build both supported builds are loaded
    builds = [build] if build else ['37', '38']

    # These resources are the same for every build, fetch them once
    hgnc_lines = fetch_hgnc()
    exac_lines = fetch_exac_constraint()

    for genome_build in builds:
        ensembl_lines = fetch_ensembl_genes(build=genome_build)

        # load the genes
        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim_files['mim2genes'],
            genemap_lines=mim_files['genemap2'],
            hpo_lines=hpo_genes,
            build=genome_build,
        )

        # Map ensembl ids to the gene objects that were just loaded
        genes_by_ensembl_id = {
            gene_obj['ensembl_id']: gene_obj for gene_obj in hgnc_genes
        }

        # Fetch the transcripts from ensembl and load them for this build
        ensembl_transcripts = fetch_ensembl_transcripts(build=genome_build)
        load_transcripts(
            adapter, ensembl_transcripts, genome_build, genes_by_ensembl_id
        )

    adapter.update_indexes()

    LOG.info("Genes, transcripts and Exons loaded")
def load_omim_panel(self, api_key, institute=None):
    """Create and load the OMIM-AUTO panel

    A new panel is built from freshly fetched omim files. If a OMIM-AUTO
    panel already exists, nothing is loaded when the omim release date is
    unchanged or when the comparison with the existing panel finds no new
    genes.

    Args:
        api_key(str): Key that is handed to fetch_mim_files
        institute(str): Owner of the panel. 'cust002' is used if not given
    """
    existing_panel = self.gene_panel(panel_id='OMIM-AUTO')

    if existing_panel:
        # Bump to the next whole version number
        version = float(math.floor(existing_panel['version']) + 1)
    else:
        LOG.warning("OMIM-AUTO does not exists in database")
        LOG.info('Creating a first version')
        version = 1.0

    LOG.info("Setting version to %s", version)

    # Any error from the download is passed on to the caller
    mim_files = fetch_mim_files(api_key=api_key, genemap2=True, mim2genes=True)

    # Get the date when the omim files were released. The text after the last
    # ':' is used; if several lines mention 'Generated' the last one wins.
    date_string = None
    for genemap_line in mim_files['genemap2']:
        if 'Generated' in genemap_line:
            date_string = genemap_line.rpartition(':')[2].strip()

    date_obj = get_date(date_string)

    if existing_panel and existing_panel['date'] == date_obj:
        LOG.warning("There is no new version of OMIM")
        return

    panel_data = {
        'path': None,
        'type': 'clinical',
        'date': date_obj,
        'panel_id': 'OMIM-AUTO',
        'institute': institute or 'cust002',
        'version': version,
        'display_name': 'OMIM-AUTO',
        'genes': [],
    }

    omim_genes = get_omim_panel_genes(
        genemap2_lines=mim_files['genemap2'],
        mim2gene_lines=mim_files['mim2genes'],
        alias_genes=self.genes_by_alias(),
    )
    panel_data['genes'].extend(omim_genes)

    panel_obj = build_panel(panel_data, self)

    if existing_panel:
        new_genes = self.compare_mim_panels(existing_panel, panel_obj)
        if not new_genes:
            LOG.info("The new version of omim does not differ from the old one")
            LOG.info("No update is added")
            return
        self.update_mim_version(
            new_genes, panel_obj, old_version=existing_panel['version']
        )

    self.add_gene_panel(panel_obj)
def setup_scout(adapter, institute_id='cust000', user_name='Clark Kent',
                user_mail='*****@*****.**', api_key=None, demo=False):
    """Set up a scout instance from scratch.

    All non system collections are dropped. After that an institute, an admin
    user, genes and transcripts for build 37 and 38 and the hpo information
    are loaded. In demo mode the reduced files are used instead of fetched
    resources, and a test panel and a case are loaded as well.

    Args:
        adapter: Database adapter used for all operations
        institute_id(str): Used as internal id and display name of the institute
        user_name(str): Name of the admin user
        user_mail(str): Id and email of the admin user, also sanger recipient
        api_key(str): Handed to fetch_mim_files. Only used if demo is False
        demo(bool): If True, set up a demo instance from the reduced files

    Raises:
        Exception: Whatever fetch_mim_files raises is logged and re-raised
    """
    ########################## Delete previous information ##########################
    LOG.info("Deleting previous database")
    for collection_name in adapter.db.collection_names():
        if not collection_name.startswith('system'):
            LOG.info("Deleting collection %s", collection_name)
            adapter.db.drop_collection(collection_name)
    LOG.info("Database deleted")

    ########################## Add a institute ##########################
    #####################################################################
    # Build a institute with id institute_name
    institute_obj = build_institute(
        internal_id=institute_id,
        display_name=institute_id,
        sanger_recipients=[user_mail],
    )

    # Add the institute to database
    adapter.add_institute(institute_obj)

    ########################## Add a User ###############################
    #####################################################################
    # Build a user obj
    user_obj = dict(
        _id=user_mail,
        email=user_mail,
        name=user_name,
        roles=['admin'],
        institutes=[institute_id],
    )
    adapter.add_user(user_obj)

    ### Get the mim information ###
    if not demo:
        # Fetch the mim files
        try:
            mim_files = fetch_mim_files(api_key, mim2genes=True, morbidmap=True,
                                        genemap2=True)
        except Exception as err:
            LOG.warning(err)
            # There is no click context in this function, so the error is
            # passed on to the caller instead of aborting a context.
            raise
        mim2gene_lines = mim_files['mim2genes']
        genemap_lines = mim_files['genemap2']

        # Fetch the genes to hpo information
        hpo_gene_lines = fetch_hpo_genes()
        # Fetch the latest version of the hgnc information
        hgnc_lines = fetch_hgnc()
        # Fetch the latest exac pli score information
        exac_lines = fetch_exac_constraint()
    else:
        # The lines are read into lists since they are used for both builds
        mim2gene_lines = list(get_file_handle(mim2gene_reduced_path))
        genemap_lines = list(get_file_handle(genemap2_reduced_path))

        # Fetch the genes to hpo information
        hpo_gene_lines = list(get_file_handle(hpogenes_reduced_path))
        # Fetch the reduced hgnc information
        hgnc_lines = list(get_file_handle(hgnc_reduced_path))
        # Fetch the latest exac pli score information
        exac_lines = list(get_file_handle(exac_reduced_path))

    builds = ['37', '38']

    ################## Load Genes and transcripts #######################
    #####################################################################
    for build in builds:
        # Fetch the ensembl information
        # NOTE(review): in demo mode the build 37 reduced files are used for
        # both builds - confirm that this is intended.
        if not demo:
            ensembl_genes = fetch_ensembl_genes(build=build)
        else:
            ensembl_genes = get_file_handle(genes37_reduced_path)

        # load the genes
        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_genes,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_gene_lines,
            build=build,
        )

        # Create a map from ensembl ids to gene objects
        ensembl_genes = {}
        for gene_obj in hgnc_genes:
            ensembl_genes[gene_obj['ensembl_id']] = gene_obj

        # Fetch the transcripts from ensembl
        if not demo:
            ensembl_transcripts = fetch_ensembl_transcripts(build=build)
        else:
            ensembl_transcripts = get_file_handle(transcripts37_reduced_path)

        # Load the transcripts for a certain build
        load_transcripts(adapter, ensembl_transcripts, build, ensembl_genes)

    # Outside demo mode load_hpo gets None for all three hpo resources
    hpo_terms_handle = None
    hpo_to_genes_handle = None
    hpo_disease_handle = None
    if demo:
        hpo_terms_handle = get_file_handle(hpoterms_reduced_path)
        hpo_to_genes_handle = get_file_handle(hpo_to_genes_reduced_path)
        hpo_disease_handle = get_file_handle(hpo_phenotype_to_terms_reduced_path)

    load_hpo(
        adapter=adapter,
        hpo_lines=hpo_terms_handle,
        hpo_gene_lines=hpo_to_genes_handle,
        disease_lines=genemap_lines,
        hpo_disease_lines=hpo_disease_handle,
    )

    # If demo we load a gene panel and some case information
    if demo:
        parsed_panel = parse_gene_panel(
            path=panel_path,
            institute='cust000',
            panel_id='panel1',
            version=1.0,
            display_name='Test panel',
        )
        adapter.load_panel(parsed_panel)

        case_handle = get_file_handle(load_path)
        # An explicit Loader is given: yaml.load without one is deprecated
        # and a TypeError in PyYAML 6.
        case_data = yaml.load(case_handle, Loader=yaml.FullLoader)
        adapter.load_case(case_data)

    LOG.info("Creating indexes")
    adapter.load_indexes()
    LOG.info("Scout instance setup successful")
def load_hgnc_genes(adapter, genes=None, ensembl_lines=None, hgnc_lines=None,
                    exac_lines=None, mim2gene_lines=None, genemap_lines=None,
                    hpo_lines=None, build='37', omim_api_key=''):
    """Build gene objects and load them into the database

    If no parsed genes are given, link_genes is used to merge the resources
    into a dictionary of genes. Every resource that is not provided is
    fetched first. Genes without a chromosome are skipped and counted.

    Args:
        adapter(scout.adapter.MongoAdapter)
        genes(dict): If genes are already parsed
        ensembl_lines(iterable(str)): Lines formated with ensembl gene information
        hgnc_lines(iterable(str)): Lines with gene information from genenames.org
        exac_lines(iterable(str)): Lines with information pLi-scores from ExAC
        mim2gene_lines(iterable(str)): Lines with map from omim id to gene symbol
        genemap_lines(iterable(str)): Lines with information of omim entries
        hpo_lines(iterable(str)): Lines information about map from hpo terms to genes
        build(str): What build to use. Defaults to '37'
        omim_api_key(str): Used to fetch the omim files if mim2gene_lines or
            genemap_lines is missing

    Returns:
        gene_objects(list): A list with all gene_objects that was loaded into database

    Raises:
        SyntaxError: If omim files have to be fetched and no omim_api_key is given
    """
    if not genes:
        # Fetch the resources if not provided
        if ensembl_lines is None:
            ensembl_lines = fetch_ensembl_genes(build=build)

        if not hgnc_lines:
            hgnc_lines = fetch_hgnc()

        if not exac_lines:
            exac_lines = fetch_exac_constraint()

        # Both omim files are fetched again as soon as one of them is missing
        if not mim2gene_lines or not genemap_lines:
            if not omim_api_key:
                raise SyntaxError("Need to provide omim api key")
            mim_files = fetch_mim_files(omim_api_key, mim2genes=True, genemap2=True)
            mim2gene_lines = mim_files['mim2genes']
            genemap_lines = mim_files['genemap2']

        if not hpo_lines:
            hpo_lines = fetch_hpo_files(hpogenes=True)['hpogenes']

        # Link the resources
        genes = link_genes(
            ensembl_lines=ensembl_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_lines,
        )

    gene_objects = []
    skipped = 0

    with progressbar(genes.values(), label="Building genes", length=len(genes)) as bar:
        for gene_info in bar:
            if gene_info.get('chromosome'):
                gene_objects.append(build_hgnc_gene(gene_info, build=build))
            else:
                LOG.debug("skipping gene: %s. No coordinates found",
                          gene_info.get('hgnc_symbol', '?'))
                skipped += 1

    LOG.info("Loading genes build %s", build)
    adapter.load_hgnc_bulk(gene_objects)

    LOG.info("Loading done. %s genes loaded", len(gene_objects))
    LOG.info("Nr of genes without coordinates in build %s: %s", build, skipped)

    return gene_objects
def setup_scout(adapter, institute_id='cust000', user_name='Clark Kent',
                user_mail='*****@*****.**', api_key=None, demo=False):
    """Set up a scout instance from scratch.

    All non system collections are dropped. After that an institute, an admin
    user, genes and transcripts for build 37 and 38 and the hpo information
    are loaded. In demo mode the reduced files are used instead of fetched
    resources, and a test panel and a case are loaded as well.

    Args:
        adapter: Database adapter used for all operations
        institute_id(str): Used as internal id and display name of the institute
        user_name(str): Name of the admin user
        user_mail(str): Id and email of the admin user, also sanger recipient
        api_key(str): Handed to fetch_mim_files. Only used if demo is False
        demo(bool): If True, set up a demo instance from the reduced files

    Raises:
        Exception: Whatever fetch_mim_files raises is logged and re-raised
    """
    # --- Delete previous information ---
    LOG.info("Deleting previous database")
    for name in adapter.db.collection_names():
        if name.startswith('system'):
            continue
        LOG.info("Deleting collection %s", name)
        adapter.db.drop_collection(name)
    LOG.info("Database deleted")

    # --- Add a institute, the institute id is also the display name ---
    institute_obj = build_institute(
        internal_id=institute_id,
        display_name=institute_id,
        sanger_recipients=[user_mail],
    )
    adapter.add_institute(institute_obj)

    # --- Add a admin user that belongs to the institute ---
    user_obj = {
        '_id': user_mail,
        'email': user_mail,
        'name': user_name,
        'roles': ['admin'],
        'institutes': [institute_id],
    }
    adapter.add_user(user_obj)

    # --- Collect the gene resources ---
    if demo:
        # The reduced files are read into lists, they are used for both builds
        mim2gene_lines = list(get_file_handle(mim2gene_reduced_path))
        genemap_lines = list(get_file_handle(genemap2_reduced_path))
        hpo_gene_lines = list(get_file_handle(hpogenes_reduced_path))
        hgnc_lines = list(get_file_handle(hgnc_reduced_path))
        exac_lines = list(get_file_handle(exac_reduced_path))
    else:
        try:
            mim_files = fetch_mim_files(
                api_key, mim2genes=True, morbidmap=True, genemap2=True
            )
        except Exception as err:
            LOG.warning(err)
            raise
        mim2gene_lines = mim_files['mim2genes']
        genemap_lines = mim_files['genemap2']
        # Genes to hpo information, latest hgnc and exac pli scores
        hpo_gene_lines = fetch_hpo_genes()
        hgnc_lines = fetch_hgnc()
        exac_lines = fetch_exac_constraint()

    # --- Load genes and transcripts for both builds ---
    for build in ('37', '38'):
        # NOTE(review): in demo mode the build 37 reduced files are used for
        # both builds - confirm that this is intended.
        if demo:
            ensembl_lines = get_file_handle(genes37_reduced_path)
        else:
            ensembl_lines = fetch_ensembl_genes(build=build)

        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim2gene_lines,
            genemap_lines=genemap_lines,
            hpo_lines=hpo_gene_lines,
            build=build,
        )

        # Map ensembl ids to the gene objects that were just loaded
        genes_by_ensembl_id = {
            gene_obj['ensembl_id']: gene_obj for gene_obj in hgnc_genes
        }

        if demo:
            ensembl_transcripts = get_file_handle(transcripts37_reduced_path)
        else:
            ensembl_transcripts = fetch_ensembl_transcripts(build=build)

        load_transcripts(adapter, ensembl_transcripts, build, genes_by_ensembl_id)

    # --- Load hpo, outside demo mode all three handles are None ---
    hpo_terms_handle = get_file_handle(hpoterms_reduced_path) if demo else None
    hpo_to_genes_handle = (
        get_file_handle(hpo_to_genes_reduced_path) if demo else None
    )
    hpo_disease_handle = (
        get_file_handle(hpo_phenotype_to_terms_reduced_path) if demo else None
    )

    load_hpo(
        adapter=adapter,
        hpo_lines=hpo_terms_handle,
        hpo_gene_lines=hpo_to_genes_handle,
        disease_lines=genemap_lines,
        hpo_disease_lines=hpo_disease_handle,
    )

    # --- A demo instance also gets a gene panel and a case ---
    if demo:
        demo_panel = parse_gene_panel(
            path=panel_path,
            institute='cust000',
            panel_id='panel1',
            version=1.0,
            display_name='Test panel',
        )
        adapter.load_panel(demo_panel)

        case_data = yaml.load(get_file_handle(load_path), Loader=yaml.FullLoader)
        adapter.load_case(case_data)

    LOG.info("Creating indexes")
    adapter.load_indexes()
    LOG.info("Scout instance setup successful")
def database(context, institute_name, user_name, user_mail, api_key):
    """Setup a scout database"""
    LOG.info("Running scout setup database")
    settings = context.obj

    # A key given on the command line wins over the one stored on the context
    omim_key = api_key if api_key else settings.get('omim_api_key')
    if not omim_key:
        LOG.warning("Please provide a omim api key to load the omim gene panel")
        context.abort()

    # Fetch the omim information before the old database is deleted
    try:
        mim_files = fetch_mim_files(
            omim_key, mim2genes=True, morbidmap=True, genemap2=True
        )
    except Exception as err:
        LOG.warning(err)
        context.abort()

    # Values that are not given fall back to the ones stored on the context
    institute_name = institute_name or settings['institute_name']
    user_name = user_name or settings['user_name']
    user_mail = user_mail or settings['user_mail']

    adapter = settings['adapter']

    LOG.info("Deleting previous database")
    for name in adapter.db.collection_names():
        if name.startswith('system'):
            continue
        LOG.info("Deleting collection %s", name)
        adapter.db.drop_collection(name)
    LOG.info("Database deleted")

    LOG.info("Setting up database %s", settings['mongodb'])

    # Build a institute with id institute_name and add it to the database
    institute_obj = build_institute(
        internal_id=institute_name,
        display_name=institute_name,
        sanger_recipients=[user_mail],
    )
    adapter.add_institute(institute_obj)

    # Build a admin user that belongs to the institute
    user_obj = {
        '_id': user_mail,
        'email': user_mail,
        'name': user_name,
        'roles': ['admin'],
        'institutes': [institute_name],
    }
    adapter.add_user(user_obj)

    # Fetch the genes to hpo information
    hpo_genes = fetch_hpo_genes()

    # Link and load the genes, first for build 37 and then for build 38.
    # The hgnc and exac files are opened again for every build.
    transcript_paths = (('37', transcripts37_path), ('38', transcripts38_path))
    for genome_build, transcripts_path in transcript_paths:
        linked_genes = link_genes(
            ensembl_lines=get_file_handle(transcripts_path),
            hgnc_lines=get_file_handle(hgnc_path),
            exac_lines=get_file_handle(exac_path),
            mim2gene_lines=mim_files['mim2genes'],
            genemap_lines=mim_files['genemap2'],
            hpo_lines=hpo_genes,
        )
        load_hgnc_genes(adapter, linked_genes, build=genome_build)

    load_hpo(adapter=adapter, disease_lines=mim_files['genemap2'])

    LOG.info("Creating indexes")
    adapter.load_indexes()
    LOG.info("Scout instance setup successful")
def genes(context, build, api_key):
    """
    Load the hgnc aliases to the mongo database.
    """
    LOG.info("Running scout update genes")
    settings = context.obj
    adapter = settings['adapter']

    # A key given on the command line wins over the one stored on the context
    omim_key = api_key if api_key else settings.get('omim_api_key')
    if not omim_key:
        LOG.warning("Please provide a omim api key to load the omim gene panel")
        context.abort()

    # Fetch the omim information
    try:
        mim_files = fetch_mim_files(
            omim_key, mim2genes=True, morbidmap=True, genemap2=True
        )
    except Exception as err:
        LOG.warning(err)
        context.abort()

    LOG.warning("Dropping all gene information")
    adapter.drop_genes(build)
    LOG.info("Genes dropped")

    LOG.warning("Dropping all transcript information")
    adapter.drop_transcripts(build)
    LOG.info("transcripts dropped")

    hpo_genes = fetch_hpo_genes()

    # Without an explicit build both supported builds are loaded
    builds = [build] if build else ['37', '38']

    # These resources are the same for every build, fetch them once
    hgnc_lines = fetch_hgnc()
    exac_lines = fetch_exac_constraint()

    for genome_build in builds:
        ensembl_lines = fetch_ensembl_genes(build=genome_build)

        # load the genes
        hgnc_genes = load_hgnc_genes(
            adapter=adapter,
            ensembl_lines=ensembl_lines,
            hgnc_lines=hgnc_lines,
            exac_lines=exac_lines,
            mim2gene_lines=mim_files['mim2genes'],
            genemap_lines=mim_files['genemap2'],
            hpo_lines=hpo_genes,
            build=genome_build,
        )

        # Map ensembl ids to the gene objects that were just loaded
        genes_by_ensembl_id = {
            gene_obj['ensembl_id']: gene_obj for gene_obj in hgnc_genes
        }

        # Fetch the transcripts from ensembl and load them for this build
        ensembl_transcripts = fetch_ensembl_transcripts(build=genome_build)
        load_transcripts(
            adapter, ensembl_transcripts, genome_build, genes_by_ensembl_id
        )

    adapter.update_indexes()

    LOG.info("Genes, transcripts and Exons loaded")