Example #1
def index_downloads():
    bulk_data = []
    dbentity_file_obj = IndexESHelper.get_file_dbentity_keyword()
    files = DBSession.query(Filedbentity).filter(Filedbentity.is_public == True,
                                                 Filedbentity.s3_url != None,
                                                 Filedbentity.readme_file_id != None).all()
    print('indexing ' + str(len(files)) + ' download files')
    for x in files:
        keyword = []
        status = ''
        temp = dbentity_file_obj.get(x.dbentity_id)
        if temp:
            keyword = temp
        if x.dbentity_status in ("Active", "Archived"):
            status = x.dbentity_status
        obj = {
            'name': x.display_name,
            'href': x.s3_url,
            'category': 'download',
            'description': x.description,
            'keyword': keyword,
            'format': str(x.format.display_name),
            'status': str(status),
            'file_size': str(IndexESHelper.convertBytes(x.file_size))
                if x.file_size is not None else x.file_size,
            'year': str(x.year),
            'readme_url': x.readme_file[0].s3_url,
            'topic': x.topic.display_name,
            'data': x.data.display_name,
            'path_id': x.get_path_id()
        }
        bulk_data.append({
            'index': {
                '_index': INDEX_NAME,
                '_type': DOC_TYPE,
                '_id': x.sgdid
            }
        })

        bulk_data.append(obj)
        if len(bulk_data) == 50:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
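
Every indexer in these examples repeats the same buffering pattern: append an action/document pair, flush when the buffer reaches a fixed size, then flush the remainder. A minimal sketch of that pattern as a reusable helper (hypothetical, not part of the codebase shown here; assumes the module-level es client, INDEX_NAME, and DOC_TYPE used above):

def bulk_index(docs, chunk_size=100):
    """Bulk-index an iterable of (doc_id, document) pairs in fixed-size chunks."""
    bulk_data = []
    for doc_id, doc in docs:
        # each document contributes two buffer entries:
        # an action line and a source line
        bulk_data.append({
            'index': {
                '_index': INDEX_NAME,
                '_type': DOC_TYPE,
                '_id': doc_id
            }
        })
        bulk_data.append(doc)
        if len(bulk_data) >= chunk_size:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []
    if bulk_data:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)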
Example #2
def index_chemicals():
    all_chebi_data = DBSession.query(Chebi).all()
    _result = IndexESHelper.get_chebi_annotations(all_chebi_data)
    bulk_data = []
    print("Indexing " + str(len(all_chebi_data)) + " chemicals")
    for item_key, item_v in _result.items():
        if item_v is not None:
            obj = {
                "name": item_v.display_name,
                "href": item_v.obj_url,
                "description": item_v.description,
                "category": "chemical",
                "keys": []
            }
            bulk_data.append({
                'index': {
                    '_index': INDEX_NAME,
                    '_type': DOC_TYPE,
                    '_id': 'chemical_' + str(item_key)
                }
            })

            bulk_data.append(obj)
            if len(bulk_data) == 300:
                es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
                bulk_data = []
    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
Example #3
def index_colleagues():
    colleagues = DBSession.query(Colleague).all()
    _locus_ids = IndexESHelper.get_colleague_locus()
    _locus_names = IndexESHelper.get_colleague_locusdbentity()
    _combined_list = IndexESHelper.combine_locusdbentity_colleague(
        colleagues, _locus_names, _locus_ids)
    print(("Indexing " + str(len(colleagues)) + " colleagues"))
    bulk_data = []
    for item_k, item_v in list(_combined_list.items()):
        bulk_data.append(
            {"index": {
                "_index": INDEX_NAME,
                "_id": str(uuid.uuid4())
            }})

        bulk_data.append(item_v)
        if len(bulk_data) == 1000:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []
    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
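
Note that this version of index_colleagues omits '_type' from the bulk action and generates a random UUID per document. Mapping types were removed in Elasticsearch 7, so against a 7.x or newer cluster every action must take this two-key shape; for instance, the downloads action from Example #1 would become (a sketch, same assumed globals):

bulk_data.append({'index': {'_index': INDEX_NAME, '_id': x.sgdid}})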
Example #4
def index_phenotypes():
    phenotypes = DBSession.query(Phenotype).all()
    _phenos_annotation = IndexESHelper.get_phenotypes_phenotypeannotation()
    _annotation_cond = IndexESHelper.get_phenotypes_condition("chemical")
    _result = IndexESHelper.get_combined_phenotypes(
        phenotypes, _phenos_annotation, _annotation_cond)
    bulk_data = []
    print("Indexing " + str(len(phenotypes)) + " phenotypes")
    for obj_k, obj_v in _result.items():
        bulk_data.append({
            'index': {
                '_index': INDEX_NAME,
                '_type': DOC_TYPE,
                '_id': obj_v["format_name"]
            }
        })
        bulk_data.append(obj_v)
        if len(bulk_data) == 500:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []
    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
Example #5
def index_colleagues():
    colleagues = DBSession.query(Colleague).all()
    _locus_ids = IndexESHelper.get_colleague_locus()
    _locus_names = IndexESHelper.get_colleague_locusdbentity()
    _combined_list = IndexESHelper.combine_locusdbentity_colleague(
        colleagues, _locus_names, _locus_ids)
    print("Indexing " + str(len(colleagues)) + " colleagues")
    bulk_data = []
    for item_k, item_v in _combined_list.items():
        bulk_data.append({
            'index': {
                '_index': INDEX_NAME,
                '_type': DOC_TYPE,
                '_id': item_v["format_name"]
            }
        })

        bulk_data.append(item_v)
        if len(bulk_data) == 1000:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []
    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
Example #6
def index_phenotypes():
    bulk_data = []
    phenotypes = DBSession.query(Phenotype).all()
    _result = IndexESHelper.get_pheno_annotations(phenotypes)
    print(("Indexing " + str(len(_result)) + " phenotypes"))
    for phenotype_item in _result:
        bulk_data.append(
            {"index": {
                "_index": INDEX_NAME,
                "_id": str(uuid.uuid4())
            }})
        bulk_data.append(phenotype_item)
        if len(bulk_data) == 50:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []
    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
Example #7
def index_references():
    _ref_loci = IndexESHelper.get_dbentity_locus_note()
    _references = DBSession.query(Referencedbentity).all()
    _abstracts = IndexESHelper.get_ref_abstracts()
    _authors = IndexESHelper.get_ref_authors()
    _aliases = IndexESHelper.get_ref_aliases()

    bulk_data = []
    print('Indexing ' + str(len(_references)) + ' references')

    for reference in _references:
        reference_loci = []
        if len(_ref_loci) > 0:
            temp_loci = _ref_loci.get(reference.dbentity_id)
            if temp_loci is not None:
                reference_loci = list(
                    set([
                        x.display_name
                        for x in IndexESHelper.flattern_list(temp_loci)
                    ]))

        abstract = _abstracts.get(reference.dbentity_id)
        if abstract is not None:
            abstract = abstract[0]
        sec_sgdids = _aliases.get(reference.dbentity_id)
        sec_sgdid = None
        authors = _authors.get(reference.dbentity_id)
        if sec_sgdids is not None:
            sec_sgdid = sec_sgdids[0]

        if authors is None:
            authors = []

        journal = reference.journal
        if journal:
            journal = journal.display_name
        key_values = [
            reference.pmcid, reference.pmid, "pmid: " + str(reference.pmid),
            "pmid:" + str(reference.pmid), "pmid " + str(reference.pmid),
            reference.sgdid
        ]

        keys = set([])
        for k in key_values:
            if k is not None:
                keys.add(str(k).lower())
        obj = {
            'name': reference.citation,
            'href': reference.obj_url,
            'description': abstract,
            'author': authors,
            'journal': journal,
            'year': str(reference.year),
            'reference_loci': reference_loci,
            'secondary_sgdid': sec_sgdid,
            'category': 'reference',
            'keys': list(keys)
        }

        bulk_data.append({
            'index': {
                '_index': INDEX_NAME,
                '_type': DOC_TYPE,
                '_id': reference.sgdid
            }
        })
        bulk_data.append(obj)
        if len(bulk_data) == 1000:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
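
Passing refresh=True on every bulk call forces a segment refresh after each batch, which keeps the index immediately searchable but slows large loads. A common alternative is to disable automatic refresh for the duration of the load and refresh once at the end; a sketch, assuming the same es client and INDEX_NAME (the '1s' restore value is the Elasticsearch default, adjust if the index is configured differently):

es.indices.put_settings(index=INDEX_NAME, body={'index': {'refresh_interval': '-1'}})
try:
    index_references()
finally:
    es.indices.put_settings(index=INDEX_NAME, body={'index': {'refresh_interval': '1s'}})
    es.indices.refresh(index=INDEX_NAME)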
Example #8
def index_genes():
    # Indexing just the S288C genes
    # dbentity: 1364643 (id) -> straindbentity -> 274901 (taxonomy_id)
    # list of dbentities comes from table DNASequenceAnnotation with taxonomy_id 274901
    # feature_type comes from DNASequenceAnnotation as well
    gene_ids_so = DBSession.query(
        Dnasequenceannotation.dbentity_id, Dnasequenceannotation.so_id).filter(
            Dnasequenceannotation.taxonomy_id == 274901).all()
    dbentity_ids_to_so = {}
    dbentity_ids = set([])
    so_ids = set([])
    for gis in gene_ids_so:
        dbentity_ids.add(gis[0])
        so_ids.add(gis[1])
        dbentity_ids_to_so[gis[0]] = gis[1]
    # add some non S288C genes
    not_s288c = DBSession.query(Locusdbentity.dbentity_id).filter(
        Locusdbentity.not_in_s288c == True).all()
    for row in not_s288c:
        dbentity_ids.add(row[0])
        # assume non-S288C features to be ORFs
        dbentity_ids_to_so[row[0]] = 263757
    all_genes = DBSession.query(Locusdbentity).filter(
        Locusdbentity.dbentity_id.in_(list(dbentity_ids))).all()

    # make list of merged/deleted genes so they don't redirect when they show up as an alias
    merged_deleted_r = DBSession.query(Locusdbentity.format_name).filter(
        Locusdbentity.dbentity_status.in_(['Merged', 'Deleted'])).all()
    merged_deleted = [d[0] for d in merged_deleted_r]

    feature_types_db = DBSession.query(
        So.so_id, So.display_name).filter(So.so_id.in_(list(so_ids))).all()
    feature_types = {}
    for ft in feature_types_db:
        feature_types[ft[0]] = ft[1]

    tc_numbers_db = DBSession.query(LocusAlias).filter_by(
        alias_type="TC number").all()
    tc_numbers = {}
    for tc in tc_numbers_db:
        tc_numbers.setdefault(tc.locus_id, []).append(tc.display_name)

    ec_numbers_db = DBSession.query(LocusAlias).filter_by(
        alias_type="EC number").all()
    ec_numbers = {}
    for ec in ec_numbers_db:
        ec_numbers.setdefault(ec.locus_id, []).append(ec.display_name)

    secondary_db = DBSession.query(LocusAlias).filter_by(
        alias_type="SGDID Secondary").all()
    secondary_sgdids = {}
    for sid in secondary_db:
        secondary_sgdids.setdefault(sid.locus_id, []).append(sid.display_name)

    bulk_data = []

    print('Indexing ' + str(len(all_genes)) + ' genes')
    # prefetch lookup tables once to avoid per-gene queries
    _summary = IndexESHelper.get_locus_dbentity_summary()
    _protein = IndexESHelper.get_locus_dbentity_alias(["NCBI protein name"])
    _phenos = IndexESHelper.get_locus_phenotypeannotation()
    _goids = IndexESHelper.get_locus_go_annotation()
    _aliases_raw = IndexESHelper.get_locus_dbentity_alias(
        ["Uniform", "Non-uniform", "Retired name", "UniProtKB ID"])

    not_mapped_genes = IndexESHelper.get_not_mapped_genes()
    is_quick_flag = True

    for gene in all_genes:
        if gene.gene_name:
            _name = gene.gene_name
            if gene.systematic_name and gene.gene_name != gene.systematic_name:
                _name += " / " + gene.systematic_name
        else:
            _name = gene.systematic_name

        # summary = DBSession.query(Locussummary.text).filter_by(locus_id=gene.dbentity_id).all()
        summary = []
        if _summary is not None:
            summary = _summary.get(gene.dbentity_id) or []
        #protein = DBSession.query(LocusAlias.display_name).filter_by(locus_id=gene.dbentity_id, alias_type="NCBI protein name").one_or_none()
        protein = _protein.get(gene.dbentity_id)
        if protein is not None:
            protein = protein[0].display_name

        # TEMP: don't index due to schema change
        # sequence_history = DBSession.query(Locusnoteannotation.note).filter_by(dbentity_id=gene.dbentity_id, note_type="Sequence").all()
        # gene_history = DBSession.query(Locusnoteannotation.note).filter_by(dbentity_id=gene.dbentity_id, note_type="Locus").all()

        #phenotype_ids = DBSession.query(Phenotypeannotation.phenotype_id).filter_by(dbentity_id=gene.dbentity_id).all()
        phenotype_ids = []
        if _phenos is not None:
            temp = _phenos.get(gene.dbentity_id)
            if temp is not None:
                phenotype_ids = [x.phenotype_id for x in temp]
        if len(phenotype_ids) > 0:
            phenotypes = DBSession.query(Phenotype.display_name).filter(
                Phenotype.phenotype_id.in_(phenotype_ids)).all()
        else:
            phenotypes = []
        #go_ids = DBSession.query(Goannotation.go_id).filter(and_(Goannotation.go_qualifier != 'NOT', Goannotation.dbentity_id == gene.dbentity_id)).all()
        go_ids = _goids.get(gene.dbentity_id)
        if go_ids is not None:
            go_ids = [x.go_id for x in go_ids]
        else:
            go_ids = []
        go_annotations = {
            'cellular component': set([]),
            'molecular function': set([]),
            'biological process': set([])
        }
        if len(go_ids) > 0:
            go = DBSession.query(
                Go.display_name,
                Go.go_namespace).filter(Go.go_id.in_(go_ids)).all()
            for g in go:
                go_annotations[g[1]].add(g[0] + ' (direct)')
        go_slim_ids = DBSession.query(Goslimannotation.goslim_id).filter(
            Goslimannotation.dbentity_id == gene.dbentity_id).all()
        if len(go_slim_ids) > 0:
            go_slim_ids = [g[0] for g in go_slim_ids]
            go_slim = DBSession.query(Goslim.go_id, Goslim.display_name).filter(
                Goslim.goslim_id.in_(go_slim_ids)).all()
            go_ids = [g[0] for g in go_slim]
            go = DBSession.query(
                Go.go_id, Go.go_namespace).filter(Go.go_id.in_(go_ids)).all()
            for g in go:
                for gs in go_slim:
                    if (gs[0] == g[0]):
                        go_annotations[g[1]].add(gs[1])

        # add "quick direct" keys such as aliases, SGD, UniProt ID and format aliases
        #aliases_raw = DBSession.query(LocusAlias.display_name, LocusAlias.alias_type).filter(and_(LocusAlias.locus_id==gene.dbentity_id, LocusAlias.alias_type.in_())).all()
        aliases_raw = _aliases_raw.get(gene.dbentity_id)
        alias_quick_direct_keys = []
        aliases = []
        if aliases_raw is not None:
            for alias_item in aliases_raw:
                name = alias_item.display_name
                if name not in merged_deleted:
                    alias_quick_direct_keys.append(name)
                if alias_item.alias_type != "UniProtKB ID":
                    aliases.append(name)
        # make everything in keys lowercase to ignore case
        keys = []
        _keys = [gene.gene_name, gene.systematic_name,
                 gene.sgdid] + alias_quick_direct_keys
        # Add SGD:<gene SGDID> to list of keywords for quick search
        _keys.append('SGD:{}'.format(gene.sgdid))
        # If this gene has a reservedname associated with it, add that reservedname to
        # the list of keywords used for the quick search of this gene
        reservedname = DBSession.query(Reservedname).filter_by(locus_id=gene.dbentity_id).one_or_none()
        if reservedname:
            _keys.append(reservedname.display_name)
        for k in _keys:
            if k:
                keys.append(k.lower())

        obj = {
            'name': _name,
            'href': gene.obj_url,
            'description': gene.description,
            'category': 'locus',
            'feature_type': feature_types[dbentity_ids_to_so[gene.dbentity_id]],
            'name_description': gene.name_description,
            'summary': summary,
            'phenotypes': [p[0] for p in phenotypes],
            'aliases': aliases,
            'cellular_component': list(go_annotations["cellular component"] - set([
                "cellular component", "cellular component (direct)",
                "cellular_component", "cellular_component (direct)"
            ])),
            'biological_process': list(go_annotations["biological process"] - set([
                "biological process (direct)", "biological process",
                "biological_process (direct)", "biological_process"
            ])),
            'molecular_function': list(go_annotations["molecular function"] - set([
                "molecular function (direct)", "molecular function",
                "molecular_function (direct)", "molecular_function"
            ])),
            'ec_number': ec_numbers.get(gene.dbentity_id),
            'protein': protein,
            'tc_number': tc_numbers.get(gene.dbentity_id),
            'secondary_sgdid': secondary_sgdids.get(gene.dbentity_id),
            'status': gene.dbentity_status,
            # TEMP: don't index due to schema change
            # 'sequence_history': [s[0] for s in sequence_history],
            # 'gene_history': [g[0] for g in gene_history],
            'bioentity_id': gene.dbentity_id,
            'keys': list(keys),
            'is_quick_flag': str(is_quick_flag)
        }

        bulk_data.append({
            'index': {
                '_index': INDEX_NAME,
                '_type': DOC_TYPE,
                '_id': gene.sgdid
            }
        })

        bulk_data.append(obj)

        if len(bulk_data) == 1000:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
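
The commented-out DBSession queries inside the loop above show what the IndexESHelper.get_* calls replaced: one database round-trip per gene. The helpers evidently prefetch each table once into a dict keyed by dbentity/locus id, so the loop only does constant-time lookups. A minimal sketch of that shape (hypothetical helper, not the actual IndexESHelper implementation):

def prefetch_aliases_by_locus(alias_types):
    # one query for all loci, then group rows by locus_id
    rows = DBSession.query(LocusAlias).filter(
        LocusAlias.alias_type.in_(alias_types)).all()
    grouped = {}
    for row in rows:
        grouped.setdefault(row.locus_id, []).append(row)
    return grouped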
def index_downloads():
    bulk_data = []
    dbentity_file_obj = IndexESHelper.get_file_dbentity_keyword()
    files = DBSession.query(Filedbentity).filter(
        Filedbentity.is_public == True, Filedbentity.s3_url != None,
        Filedbentity.readme_file_id != None).all()
    print("indexing " + str(len(files)) + " download files")
    for x in files:
        keyword = []
        status = ""
        temp = dbentity_file_obj.get(x.dbentity_id)
        if temp:
            keyword = temp
        if x.dbentity_status in ("Active", "Archived"):
            status = x.dbentity_status
        obj = {
            "name": x.display_name,
            "href": x.s3_url,
            "category": "download",
            "description": x.description,
            "keyword": keyword,
            "format": str(x.format.display_name),
            "status": str(status),
            "file_size": str(IndexESHelper.convertBytes(x.file_size))
                if x.file_size is not None else x.file_size,
            "year": str(x.year),
            "readme_url": x.readme_file[0].s3_url,
            "topic": x.topic.display_name,
            "data": x.data.display_name,
            "path_id": x.get_path_id()
        }
        bulk_data.append({
            "index": {
                "_index": INDEX_NAME,
                "_type": DOC_TYPE,
                "_id": str(uuid.uuid4())
            }
        })

        bulk_data.append(obj)
        if len(bulk_data) == 50:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)

def index_references():
    _ref_loci = IndexESHelper.get_dbentity_locus_note()
    _references = DBSession.query(Referencedbentity).all()
    _abstracts = IndexESHelper.get_ref_abstracts()
    _authors = IndexESHelper.get_ref_authors()
    _aliases = IndexESHelper.get_ref_aliases()

    bulk_data = []
    print("Indexing " + str(len(_references)) + " references")

    for reference in _references:
        reference_loci = []
        if len(_ref_loci) > 0:
            temp_loci = _ref_loci.get(reference.dbentity_id)
            if temp_loci is not None:
                reference_loci = list(
                    set([
                        x.display_name
                        for x in IndexESHelper.flattern_list(temp_loci)
                    ]))

        abstract = _abstracts.get(reference.dbentity_id)
        if abstract is not None:
            abstract = abstract[0]
        sec_sgdids = _aliases.get(reference.dbentity_id)
        sec_sgdid = None
        authors = _authors.get(reference.dbentity_id)
        if sec_sgdids is not None:
            sec_sgdid = sec_sgdids[0]

        if authors is None:
            authors = []

        journal = reference.journal
        if journal:
            journal = journal.display_name
        key_values = [
            reference.pmcid, reference.pmid, "pmid: " + str(reference.pmid),
            "pmid:" + str(reference.pmid), "pmid " + str(reference.pmid),
            reference.sgdid
        ]

        keys = set([])
        for k in key_values:
            if k is not None:
                keys.add(str(k).lower())
        obj = {
            "name": reference.citation,
            "href": reference.obj_url,
            "description": abstract,
            "author": authors,
            "journal": journal,
            "year": str(reference.year),
            "reference_loci": reference_loci,
            "secondary_sgdid": sec_sgdid,
            "category": "reference",
            "keys": list(keys)
        }

        bulk_data.append({
            "index": {
                "_index": INDEX_NAME,
                "_type": DOC_TYPE,
                "_id": str(uuid.uuid4())
            }
        })
        bulk_data.append(obj)
        if len(bulk_data) == 1000:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
            bulk_data = []

    if len(bulk_data) > 0:
        es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
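
These functions read as excerpts from a single indexing module: DBSession, IndexESHelper, es, INDEX_NAME, DOC_TYPE, uuid, and the model classes (Filedbentity, Chebi, Colleague, Phenotype, Referencedbentity, Locusdbentity, and friends) are assumed to be imported at module level. A hypothetical driver would simply run the indexers in sequence:

if __name__ == '__main__':
    index_genes()
    index_phenotypes()
    index_colleagues()
    index_chemicals()
    index_references()
    index_downloads()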