def index_downloads():
    bulk_data = []
    dbentity_file_obj = IndexESHelper.get_file_dbentity_keyword()
    # Only index public files that live on S3 and have a README attached.
    files = DBSession.query(Filedbentity).filter(
        Filedbentity.is_public == True, Filedbentity.s3_url != None,
        Filedbentity.readme_file_id != None).all()
    print('Indexing ' + str(len(files)) + ' download files')
    for x in files:
        keyword = []
        status = ''
        temp = dbentity_file_obj.get(x.dbentity_id)
        if temp:
            keyword = temp
        if x.dbentity_status in ("Active", "Archived"):
            status = x.dbentity_status
        obj = {
            'name': x.display_name,
            'href': x.s3_url,
            'category': 'download',
            'description': x.description,
            'keyword': keyword,
            'format': str(x.format.display_name),
            'status': str(status),
            'file_size': str(IndexESHelper.convertBytes(x.file_size))
            if x.file_size is not None else x.file_size,
            'year': str(x.year),
            'readme_url': x.readme_file[0].s3_url,
            'topic': x.topic.display_name,
            'data': x.data.display_name,
            'path_id': x.get_path_id()
        }
        bulk_data.append({
            'index': {
                '_index': INDEX_NAME,
                '_type': DOC_TYPE,
                '_id': x.sgdid
            }
        })
        bulk_data.append(obj)
        if len(bulk_data) == 50:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []
    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
def index_chemicals():
    all_chebi_data = DBSession.query(Chebi).all()
    _result = IndexESHelper.get_chebi_annotations(all_chebi_data)
    bulk_data = []
    print("Indexing " + str(len(all_chebi_data)) + " chemicals")
    for item_key, item_v in _result.items():
        if item_v is not None:
            obj = {
                "name": item_v.display_name,
                "href": item_v.obj_url,
                "description": item_v.description,
                "category": "chemical",
                "keys": []
            }
            bulk_data.append({
                'index': {
                    '_index': INDEX_NAME,
                    '_type': DOC_TYPE,
                    '_id': 'chemical_' + str(item_key)
                }
            })
            bulk_data.append(obj)
            if len(bulk_data) == 300:
                es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
                bulk_data = []
    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
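# The "append action line + source line, flush every N entries" batching used
# in index_downloads()/index_chemicals() above recurs in every indexer below.
# A minimal consolidation sketch (hypothetical helper, not part of this
# module; assumes the module-level `es` client and INDEX_NAME, and that
# callers pass (doc_id, source) pairs):
def flush_in_batches(pairs, batch_docs=500):
    bulk_data = []
    for doc_id, source in pairs:
        bulk_data.append({"index": {"_index": INDEX_NAME, "_id": doc_id}})
        bulk_data.append(source)
        # The indexers above compare len(bulk_data) to an *entry* count, and
        # each document contributes two entries, so flush at 2 * batch_docs.
        if len(bulk_data) >= 2 * batch_docs:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []
    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)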
def index_colleagues():
    colleagues = DBSession.query(Colleague).all()
    _locus_ids = IndexESHelper.get_colleague_locus()
    _locus_names = IndexESHelper.get_colleague_locusdbentity()
    _combined_list = IndexESHelper.combine_locusdbentity_colleague(
        colleagues, _locus_names, _locus_ids)
    print("Indexing " + str(len(colleagues)) + " colleagues")
    bulk_data = []
    for item_k, item_v in _combined_list.items():
        bulk_data.append(
            {"index": {
                "_index": INDEX_NAME,
                "_id": str(uuid.uuid4())
            }})
        bulk_data.append(item_v)
        if len(bulk_data) == 1000:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []
    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
def index_phenotypes():
    phenotypes = DBSession.query(Phenotype).all()
    _phenos_annotation = IndexESHelper.get_phenotypes_phenotypeannotation()
    _annotation_cond = IndexESHelper.get_phenotypes_condition("chemical")
    _result = IndexESHelper.get_combined_phenotypes(
        phenotypes, _phenos_annotation, _annotation_cond)
    bulk_data = []
    print("Indexing " + str(len(phenotypes)) + " phenotypes")
    for obj_k, obj_v in _result.items():
        bulk_data.append({
            'index': {
                '_index': INDEX_NAME,
                '_type': DOC_TYPE,
                '_id': obj_v["format_name"]
            }
        })
        bulk_data.append(obj_v)
        if len(bulk_data) == 500:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []
    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
def index_references():
    _ref_loci = IndexESHelper.get_dbentity_locus_note()
    _references = DBSession.query(Referencedbentity).all()
    _abstracts = IndexESHelper.get_ref_abstracts()
    _authors = IndexESHelper.get_ref_authors()
    _aliases = IndexESHelper.get_ref_aliases()
    bulk_data = []
    print('Indexing ' + str(len(_references)) + ' references')
    for reference in _references:
        reference_loci = []
        if len(_ref_loci) > 0:
            temp_loci = _ref_loci.get(reference.dbentity_id)
            if temp_loci is not None:
                reference_loci = list(
                    set([
                        x.display_name
                        for x in IndexESHelper.flattern_list(temp_loci)
                    ]))
        abstract = _abstracts.get(reference.dbentity_id)
        if abstract is not None:
            abstract = abstract[0]
        sec_sgdids = _aliases.get(reference.dbentity_id)
        sec_sgdid = None
        authors = _authors.get(reference.dbentity_id)
        if sec_sgdids is not None:
            sec_sgdid = sec_sgdids[0]
        if authors is None:
            authors = []
        journal = reference.journal
        if journal:
            journal = journal.display_name
        # Index every common way of writing a PMID so quick search matches
        # "pmid: N", "pmid:N", and "pmid N" alike.
        key_values = [
            reference.pmcid, reference.pmid,
            "pmid: " + str(reference.pmid),
            "pmid:" + str(reference.pmid),
            "pmid " + str(reference.pmid),
            reference.sgdid
        ]
        keys = set([])
        for k in key_values:
            if k is not None:
                keys.add(str(k).lower())
        obj = {
            'name': reference.citation,
            'href': reference.obj_url,
            'description': abstract,
            'author': authors,
            'journal': journal,
            'year': str(reference.year),
            'reference_loci': reference_loci,
            'secondary_sgdid': sec_sgdid,
            'category': 'reference',
            'keys': list(keys)
        }
        bulk_data.append({
            'index': {
                '_index': INDEX_NAME,
                '_type': DOC_TYPE,
                '_id': reference.sgdid
            }
        })
        bulk_data.append(obj)
        if len(bulk_data) == 1000:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []
    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
def index_genes():
    # Index just the S288C genes.
    # dbentity: 1364643 (id) -> straindbentity -> 274901 (taxonomy_id)
    # The list of dbentities comes from table Dnasequenceannotation with
    # taxonomy_id 274901; feature_type comes from Dnasequenceannotation as well.
    gene_ids_so = DBSession.query(
        Dnasequenceannotation.dbentity_id, Dnasequenceannotation.so_id).filter(
            Dnasequenceannotation.taxonomy_id == 274901).all()
    dbentity_ids_to_so = {}
    dbentity_ids = set([])
    so_ids = set([])
    for gis in gene_ids_so:
        dbentity_ids.add(gis[0])
        so_ids.add(gis[1])
        dbentity_ids_to_so[gis[0]] = gis[1]

    # Add some non-S288C genes.
    not_s288c = DBSession.query(Locusdbentity.dbentity_id).filter(
        Locusdbentity.not_in_s288c == True).all()
    for id in not_s288c:
        dbentity_ids.add(id[0])
        # Assume non-S288C features to be ORFs.
        dbentity_ids_to_so[id[0]] = 263757

    all_genes = DBSession.query(Locusdbentity).filter(
        Locusdbentity.dbentity_id.in_(list(dbentity_ids))).all()

    # Make a list of merged/deleted genes so they don't redirect when they
    # show up as an alias.
    merged_deleted_r = DBSession.query(Locusdbentity.format_name).filter(
        Locusdbentity.dbentity_status.in_(['Merged', 'Deleted'])).all()
    merged_deleted = [d[0] for d in merged_deleted_r]

    feature_types_db = DBSession.query(
        So.so_id, So.display_name).filter(So.so_id.in_(list(so_ids))).all()
    feature_types = {}
    for ft in feature_types_db:
        feature_types[ft[0]] = ft[1]

    tc_numbers_db = DBSession.query(LocusAlias).filter_by(
        alias_type="TC number").all()
    tc_numbers = {}
    for tc in tc_numbers_db:
        if tc.locus_id in tc_numbers:
            tc_numbers[tc.locus_id].append(tc.display_name)
        else:
            tc_numbers[tc.locus_id] = [tc.display_name]

    ec_numbers_db = DBSession.query(LocusAlias).filter_by(
        alias_type="EC number").all()
    ec_numbers = {}
    for ec in ec_numbers_db:
        if ec.locus_id in ec_numbers:
            ec_numbers[ec.locus_id].append(ec.display_name)
        else:
            ec_numbers[ec.locus_id] = [ec.display_name]

    secondary_db = DBSession.query(LocusAlias).filter_by(
        alias_type="SGDID Secondary").all()
    secondary_sgdids = {}
    for sid in secondary_db:
        if sid.locus_id in secondary_sgdids:
            secondary_sgdids[sid.locus_id].append(sid.display_name)
        else:
            secondary_sgdids[sid.locus_id] = [sid.display_name]

    bulk_data = []
    print('Indexing ' + str(len(all_genes)) + ' genes')

    # Batch-load annotations through the newer helper methods instead of
    # issuing per-gene queries inside the loop.
    _summary = IndexESHelper.get_locus_dbentity_summary()
    _protein = IndexESHelper.get_locus_dbentity_alias(["NCBI protein name"])
    _phenos = IndexESHelper.get_locus_phenotypeannotation()
    _goids = IndexESHelper.get_locus_go_annotation()
    _aliases_raw = IndexESHelper.get_locus_dbentity_alias(
        ["Uniform", "Non-uniform", "Retired name", "UniProtKB ID"])

    not_mapped_genes = IndexESHelper.get_not_mapped_genes()
    is_quick_flag = True
    for gene in all_genes:
        if gene.gene_name:
            _name = gene.gene_name
            if gene.systematic_name and gene.gene_name != gene.systematic_name:
                _name += " / " + gene.systematic_name
        else:
            _name = gene.systematic_name

        # summary = DBSession.query(Locussummary.text).filter_by(locus_id=gene.dbentity_id).all()
        summary = []
        if _summary is not None:
            summary = _summary.get(gene.dbentity_id)

        # protein = DBSession.query(LocusAlias.display_name).filter_by(locus_id=gene.dbentity_id, alias_type="NCBI protein name").one_or_none()
        protein = _protein.get(gene.dbentity_id)
        if protein is not None:
            protein = protein[0].display_name

        # TEMP don't index due to schema change
        # sequence_history = DBSession.query(Locusnoteannotation.note).filter_by(dbentity_id=gene.dbentity_id, note_type="Sequence").all()
        # gene_history = DBSession.query(Locusnoteannotation.note).filter_by(dbentity_id=gene.dbentity_id, note_type="Locus").all()

        # phenotype_ids = DBSession.query(Phenotypeannotation.phenotype_id).filter_by(dbentity_id=gene.dbentity_id).all()
        phenotype_ids = []
        if _phenos is not None:
            temp = _phenos.get(gene.dbentity_id)
            if temp is not None:
                phenotype_ids = [x.phenotype_id for x in temp]
        if len(phenotype_ids) > 0:
            phenotypes = DBSession.query(Phenotype.display_name).filter(
                Phenotype.phenotype_id.in_(phenotype_ids)).all()
        else:
            phenotypes = []

        # go_ids = DBSession.query(Goannotation.go_id).filter(and_(Goannotation.go_qualifier != 'NOT', Goannotation.dbentity_id == gene.dbentity_id)).all()
        go_ids = _goids.get(gene.dbentity_id)
        if go_ids is not None:
            go_ids = [x.go_id for x in go_ids]
        else:
            go_ids = []
        go_annotations = {
            'cellular component': set([]),
            'molecular function': set([]),
            'biological process': set([])
        }
        if len(go_ids) > 0:
            # go_ids = [g[0] for g in go_ids]
            go = DBSession.query(
                Go.display_name,
                Go.go_namespace).filter(Go.go_id.in_(go_ids)).all()
            for g in go:
                go_annotations[g[1]].add(g[0] + ' (direct)')

        go_slim_ids = DBSession.query(Goslimannotation.goslim_id).filter(
            Goslimannotation.dbentity_id == gene.dbentity_id).all()
        if len(go_slim_ids) > 0:
            go_slim_ids = [g[0] for g in go_slim_ids]
            go_slim = DBSession.query(
                Goslim.go_id, Goslim.display_name).filter(
                    Goslim.goslim_id.in_(go_slim_ids)).all()
            go_ids = [g[0] for g in go_slim]
            go = DBSession.query(
                Go.go_id, Go.go_namespace).filter(Go.go_id.in_(go_ids)).all()
            for g in go:
                for gs in go_slim:
                    if gs[0] == g[0]:
                        go_annotations[g[1]].add(gs[1])

        # Add "quick direct" keys such as aliases, SGDID, UniProt ID and
        # format aliases.
        # aliases_raw = DBSession.query(LocusAlias.display_name, LocusAlias.alias_type).filter(and_(LocusAlias.locus_id==gene.dbentity_id, LocusAlias.alias_type.in_())).all()
        aliases_raw = _aliases_raw.get(gene.dbentity_id)
        alias_quick_direct_keys = []
        aliases = []
        if aliases_raw is not None:
            for alias_item in aliases_raw:
                name = alias_item.display_name
                if name not in merged_deleted:
                    alias_quick_direct_keys.append(name)
                    if alias_item.alias_type != "UniProtKB ID":
                        aliases.append(name)

        # Make everything in keys lowercase to ignore case.
        keys = []
        _keys = [gene.gene_name, gene.systematic_name, gene.sgdid
                 ] + alias_quick_direct_keys
        # Add SGD:<gene SGDID> to the list of keywords for quick search.
        _keys.append('SGD:{}'.format(gene.sgdid))
        # If this gene has a reservedname associated with it, add that
        # reservedname to the keywords used for the quick search of this gene.
        reservedname = DBSession.query(Reservedname).filter_by(
            locus_id=gene.dbentity_id).one_or_none()
        if reservedname:
            _keys.append(reservedname.display_name)
        for k in _keys:
            if k:
                keys.append(k.lower())

        obj = {
            'name': _name,
            'href': gene.obj_url,
            'description': gene.description,
            'category': 'locus',
            'feature_type': feature_types[dbentity_ids_to_so[gene.dbentity_id]],
            'name_description': gene.name_description,
            'summary': summary,
            'phenotypes': [p[0] for p in phenotypes],
            'aliases': aliases,
            'cellular_component': list(
                go_annotations["cellular component"] - set([
                    "cellular component", "cellular component (direct)",
                    "cellular_component", "cellular_component (direct)"
                ])),
            'biological_process': list(
                go_annotations["biological process"] - set([
                    "biological process (direct)", "biological process",
                    "biological_process (direct)", "biological_process"
                ])),
            'molecular_function': list(
                go_annotations["molecular function"] - set([
                    "molecular function (direct)", "molecular function",
                    "molecular_function (direct)", "molecular_function"
                ])),
            'ec_number': ec_numbers.get(gene.dbentity_id),
            'protein': protein,
            'tc_number': tc_numbers.get(gene.dbentity_id),
            'secondary_sgdid': secondary_sgdids.get(gene.dbentity_id),
            'status': gene.dbentity_status,
            # TEMP don't index due to schema change
            # 'sequence_history': [s[0] for s in sequence_history],
            # 'gene_history': [g[0] for g in gene_history],
            'bioentity_id': gene.dbentity_id,
            'keys': list(keys),
            'is_quick_flag': str(is_quick_flag)
        }
        bulk_data.append({
            'index': {
                '_index': INDEX_NAME,
                '_type': DOC_TYPE,
                '_id': gene.sgdid
            }
        })
        bulk_data.append(obj)
        if len(bulk_data) == 1000:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []
    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
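# Alternative sketch: elasticsearch-py ships a bulk helper that does the same
# chunk-and-flush work as the manual batching above (assumption: the
# `elasticsearch` package backing `es` exposes `elasticsearch.helpers`; this
# is an option, not what the indexers above do).
def index_docs_with_helper(docs, chunk_size=500):
    from elasticsearch import helpers  # local import keeps the sketch self-contained
    # `docs` yields (doc_id, source) pairs; helpers.bulk chunks, serializes,
    # and sends them, raising on errors instead of silently dropping them.
    actions = ({"_index": INDEX_NAME, "_id": doc_id, "_source": source}
               for doc_id, source in docs)
    helpers.bulk(es, actions, chunk_size=chunk_size)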