示例#1
0
def browse_subcat(category):
    """
    Function to give subcategories results from button click
    :param category:  string text from website
    :return: category links for browse
    """
    #split cateogry text on tilde (future update would make these into
    # URL variables however too complicated to do in time remaining.)
    table, field = category.split("~")
    #get database field for query
    dbfield = tabledict[table][1][field]

    # Use query output or curated categories depending on which field etc.
    if table == 'Kinase' and field == 'Cellular_Location':
        return location_cats

    elif table == 'Kinase' and field == 'Family':
        return kin_family_cats

    elif table == 'Substrate' and field == 'All':
        results = browse_substrates()
        return results

    # run query for all distinct results from table and field name
    else:
        session = create_sqlsession()
        subcats = session.query(dbfield.distinct()).all()
        links = [subcat[0] for subcat in subcats if subcat[0] != None]
        #remove forward and black slash that cause problems in links
        return links
示例#2
0
def get_table_values_for_search(class_name):
    """
    Produces a list of the key values in class_name table that will be used in
    the API search. Works for classes with single primary key only.

    :param class_name: a sqlalchemy declarative class (sqlalchemy class object)
    :return: list of key values for search (list)
    """
    print('Obtaining key values for %s records in DB' % class_name.__name__)
    # get the name of the primary key attribute in the class's corresponding
    # table
    key_attr = get_class_key_attrs(class_name, single_key=True)

    # create a DB session
    session = create_sqlsession()
    # list of the value for the key field for all records [('val1',),...]
    records = session.query(getattr(class_name, key_attr)).all()
    # close the session
    session.close()

    # convert into list of str ['val1', ...]
    keys_list = [val[0] for val in records]

    print('Retrieved key values for %i %s records'
          % (len(keys_list), class_name.__name__))

    return keys_list
示例#3
0
def inhib_kin_query(inhib_pubchem_cid):
    """
    Query to pull targeted kinases using inhib CID
    :param inhib_pubchem_cid: string inhib CID
    :return: Flask_Table Kinase object
    """
    session = create_sqlsession()
    q = session.query(Inhibitor).filter_by(inhib_pubchem_cid=inhib_pubchem_cid)
    inh = q.first()
    #subset of information about substrate phosphosites sites.
    subsets = inh.inhib_target_kinases
    table = Kinase_results(subsets)
    session.close()
    return table
示例#4
0
def kin_inhib_query(kin_accession):
    """
    Query to pull related inhibitors using kinase accession
    :param kin_accession: string kinase accession
    :return: Flask_Table Phosphosite_results object
    """
    session = create_sqlsession()
    q = session.query(Kinase).filter_by(kin_accession=kin_accession)
    kin = q.first()
    #subset of information about substrate phosphosites sites.
    subsets = kin.kin_inhibitors
    table = Inhibitor_results(subsets)
    session.close()
    return table
示例#5
0
def phos_kin_query(phos_group_id):
    """
    Query to pull related kinases using phosphosite accession
    :param phos_group_id: string phosphosite group ID
    :return: Flask_Table Kinase object
    """
    session = create_sqlsession()
    q = session.query(Phosphosite).filter_by(phos_group_id=phos_group_id)
    phos = q.first()
    #subset of information about related kinases.
    subsets = phos.phosphorylated_by
    table = Kinase_results(subsets)
    session.close()
    return table
示例#6
0
def subs_phos_query(subs_accession):
    """
    Query to pull related phosphosites using substrate accession
    :param subs_accession: string substrate accession
    :return: Flask_Table Phosphosite_results object
    """
    session = create_sqlsession()
    q = session.query(Substrate).filter_by(subs_accession=subs_accession)
    sub = q.first()
    #subset of information about substrate phosphosites sites.
    subsites = sub.subs_sites
    table = Phosphosite_results(subsites)
    session.close()
    return table
示例#7
0
def all_table(table):
    """
    Function to return all results from one db table
    :param table: dbtable object
    :return: query output
    """
    session = create_sqlsession()
    results = session.query(table).all()
    session.close()
    # check if query has returned results
    if results:
        return results
    else:
        return ['No results found']
示例#8
0
def searchexact(text, table, fieldname):
    """
    Universal exact search function for table/field name
    :param text: search text (string)
    :param table: db table class object
    :param fieldname: dbtable field object
    :return: query results
    """
    session = create_sqlsession()
    results = session.query(table).filter(fieldname == text).all()
    session.close()
    # check if query has returned results
    if results:
        return results
    else:
        return ['No results found']
示例#9
0
def searchlike(text, table, fieldname):
    """
    Universal LIKE search function for table/field name,returns all fields
    :param text: search text (string)
    :param table: db table class object
    :param fieldname: dbtable field object
    :return: query results
    """
    text = '%' + text + '%'  # add wildcards for LIKE search
    session = create_sqlsession()
    results = session.query(table).filter(fieldname\
                                          .like(text)).all()
    session.close()
    # check if query has returned results
    if results:
        return results
    else:
        return ['No results found']
def link_ud_to_db(user_data_frame):
    """
    Check substrates and phosphosites in the user data against the database and
    return DB info of substrates, phosphosites and kinases records matching user
    data.

    :param user_data_frame: a data frame containing the significant hits in the
                            user csv file (pandas df)
    :return: data frame column containing user sites > db entry / kinases
             data frame with individual kinases >
    """
    # open sqlite session
    session = create_sqlsession()

    # create dictionary to link user phosphosites to db entries
    # 'Substrate Entry in DB': [{'ACC1'}, 'not in DB', ...] accessions are str
    # 'Phosphosite Entry in DB': [{id1}, 'not in DB', ...] ids are integers
    # 'Associated Kinases': [{'ACCa', 'ACCb'}, 'not in DB', ...]
    db_links = {Substrate: [], Phosphosite: [], Kinase: []}

    # create dictionary for kinase-centric analysis data frame
    # 'KIN_ACC': {('SUB_GENE', 'RSD'),...}
    kin_to_ud = {}

    # not found in DB message
    not_in_db = 'not in DB'

    # 3 table query left outer joining on substrate
    # one substrate can have multiple phosphosites; outer join allows for
    # return of a substrate if user modified residue has not been included in
    # DB
    # Kinases are associated with phosphosites, there may be no kinases for a
    # site
    query = session.query(Substrate, Phosphosite, Kinase)\
        .outerjoin(Phosphosite)\
        .outerjoin(kinases_phosphosites_table)\
        .outerjoin(Kinase)\
        .options(Load(Substrate).load_only("subs_gene"),
                 Load(Phosphosite).load_only("phos_modified_residue"),
                 Load(Kinase).load_only("kin_gene"))

    # iterate through each line of the user data frame
    for index, row in user_data_frame.iterrows():
        # substrate gene in the line
        # some substrates may have 'GENE_species', hence split on '_'
        s_gene = row['Substrate (gene name)'].split('_')[0]
        # modified residue in the line
        residue = row['Phospho site ID']
        # filter the query based on substrate gene and modified residue
        # or_ None allows modified residue not to be present in DB if substrate
        # is
        # query_res format is [(substrate instance, phosphosite instance /
        # None, kinase instance 1 / None), (subs, phos, kin2)]
        # the kinase instance is the only one that changes in
        # each substrate/phosphosite/kinase tuple in query_res
        query_res = query.filter(
            and_(
                Substrate.subs_gene == s_gene,
                or_(Phosphosite.phos_modified_residue == residue,
                    Phosphosite.phos_modified_residue == None))).all()
        # if the substrate is not in the DB, 'not in DB' is added to the three
        # columns in db_links dict
        if len(query_res) == 0:
            for key in db_links:
                db_links[key].append(not_in_db)
        else:
            # dict of classes to unique instances set from the query results
            # {Class: {inst1, inst2}}
            class_records = records_from_join_res(query_res)

            # add relevant instance information to dictionaries
            record_info = {}
            # substrates (always present)
            record_info[Substrate] = extract_record_info(
                class_records[Substrate], ('subs_gene', 'subs_accession'))
            # if phosphosites were found add info to substrate-centric dict
            if Phosphosite in class_records:
                record_info[Phosphosite] = extract_record_info(
                    class_records[Phosphosite],
                    ('phos_modified_residue', 'phos_group_id'))
            # if kinases were found add the kinase gene(s) to both
            # substrate-centric and kinase-centric dictionaries
            if Kinase in class_records:
                record_info[Kinase] = extract_record_info(
                    class_records[Kinase], ('kin_gene', 'kin_accession'))
                for kin in record_info[Kinase]:
                    kinase_gene = kin[0]
                    new_set = kin_to_ud.setdefault(kinase_gene, set())
                    new_set.add((s_gene, residue))

            # append the new values to the columns dict db_links
            for class_obj in db_links:
                if class_obj in record_info:
                    to_append = record_info[class_obj]
                else:
                    to_append = not_in_db
                db_links[class_obj].append(to_append)
        # remove all objects found in loop from session to reduce memory usage
        session.expire_all()
    session.close()

    # change key/column names
    db_links['Substrate/Isoform in DB (gene name)'] = \
        db_links.pop(Substrate)
    db_links['Phosphosite in DB (ID)'] = \
        db_links.pop(Phosphosite)
    db_links['Kinase in DB\n(gene name)'] = \
        db_links.pop(Kinase)

    return db_links, kin_to_ud
示例#11
0
def link_ud_to_db(user_data_frame):
    """
    Check substrates and phosphosites in the user data against the database and
    return DB info of substrates, phosphosites and kinases records matching the
    user data.

    :param user_data_frame: a data frame containing the significant hits in the
                            user csv file (pandas df)
    :return db_links: dictionary that will be converted into df columns linking
                      to DB of the same length as user_data_frame. Dict keys
                      are column headings and dict values are lists that will
                      become the column values, each element in the list
                      becoming a line (dict)
    :return kin_to_ud: dictionary that will be used for the kinase-centric
                       analyses, where keys are unique kinases associated with
                       user sites and values are the user substrates/sites
                       associated with the kinase (dict)
    """
    # open sqlite session
    session = create_sqlsession()

    # create dictionary to link user phosphosites to db entries
    # substrates col: [{('gene1', 'ACC1'),...}, 'not in DB', ...]
    # sites col: [{('rsd1', id1),...}, 'not in DB', ...] ids are integers
    # kinases col: [{('gene1', 'ACC1'),...}, 'not in DB', ...]
    db_links = {
        'Substrate/Isoform in DB (accession)': [],
        'Phosphosite in DB (DB ID)': [],
        'Kinase in DB\n(gene)': []
    }

    # create dictionary for kinase-centric analysis data frame
    # 'kin_gene': {('subs_gene', 'residue'),...}
    kin_to_ud = {}

    # not found in DB message
    not_in_db = 'not in DB'

    # 3 table query left joining on substrate
    # given that we need to filter on phosphosite residue and to reduce number
    # of entries returned, inner join on phosphosite is used
    # Kinases are associated with phosphosites, there may be no kinases for a
    # site;
    # one substrate/phosphosite can be associated with multiple kinases: outer
    # join allows for return of a substrate/site if no kinases are found in DB
    query = session.query(
        Substrate.subs_gene, Substrate.subs_accession,
        Phosphosite.phos_modified_residue, Phosphosite.phos_group_id,
        Kinase.kin_gene, Kinase.kin_accession)\
        .join(Phosphosite)\
        .outerjoin(kinases_phosphosites_table)\
        .outerjoin(Kinase)\
        .options(Load(Substrate).load_only("subs_gene"),
                 Load(Phosphosite).load_only("phos_modified_residue"),
                 Load(Kinase).load_only("kin_gene"))

    # iterate through each line of the user data frame
    for index, row in user_data_frame.iterrows():
        # substrate gene in the line
        # some substrates may have 'GENE_species', hence split on '_'
        s_gene = row['Substrate (gene name)'].split('_')[0]
        # modified residue in the line
        residue = row['Phospho site ID']
        # filter the query based on substrate gene and modified residue
        # query_res format is
        # [(subs_gene, subs_acc, site_rsd, site_id, kin_gene, kin_acc),...]
        # or []
        query_res = query.filter(
            and_(Substrate.subs_gene == s_gene,
                 Phosphosite.phos_modified_residue == residue)).all()

        # query_res can be [], [one], [several, records]

        # if query_res is [] because the phosphosite is not in DB, we want to
        # retrieve any substrate info available in the DB
        if not query_res:
            subs_only = session.query(
                Substrate.subs_gene, Substrate.subs_accession)\
                .filter(Substrate.subs_gene == s_gene).all()
            query_res = subs_only

        # if the substrate is not in the DB 'not in DB' is added to the
        # three entries/columns in db_links dict
        if len(query_res) == 0:
            for col in db_links:
                db_links[col].append(not_in_db)

        else:
            # de-duplicate records found
            unique_subs, unique_phos, unique_kin = distinct_records(query_res)

            # if kinases were found in the query, map kinase genes to the user
            # gene and residue
            for kin in unique_kin:
                kinase_gene = kin[0]
                kin_set = kin_to_ud.setdefault(kinase_gene, set())
                kin_set.add((s_gene, residue))

            # append the new values to the columns dict db_links
            # if there are no records in any of the sets containing unique
            # records replace the empty set with 'not in DB'
            db_links['Substrate/Isoform in DB (accession)'].append(
                unique_subs if unique_subs else not_in_db)
            db_links['Phosphosite in DB (DB ID)'].append(
                unique_phos if unique_phos else not_in_db)
            db_links['Kinase in DB\n(gene)'].append(
                unique_kin if unique_kin else not_in_db)

        # remove all objects found in loop from session to reduce memory usage
        session.expire_all()
    session.close()

    return db_links, kin_to_ud
示例#12
0
kinases_in_table = get_table_values_for_search(Kinase)

# filter BindingDB df based on existing kinases in DB
filtered_bdb_df = bindingDB_human[
    bindingDB_human['UniProt_(SwissProt)_Primary_ID_of_Target_Chain']\
        .isin(kinases_in_table)]

# import filtered BindingDB data into DB
import_data_from_data_frame(filtered_bdb_df, bindingDB_human_to_class)

# =========================================================================== #
# Manual data curating

# Inhibitor InChI key stored in InChI field as well
# open a SQLite session
session = create_sqlsession()
inh_to_correct = session.query(Inhibitor).filter(
    Inhibitor.inhib_pubchem_cid == 9549284).first()
inh_to_correct.inhib_int_chem_id = 'InChI=1S/C16H12F3N3S/c17-16(18,' \
                                   '19)14-4-2-1-3-12(14)13(9-20)15(22)' \
                                   '23-11-7-5-10(21)6-8-11/h1-8H,21-22H2/' \
                                   'b15-13+'

# --------------------------------------------------------------------------- #
# Inhibitor PubChem SID in place of CID; CID already in DB
inh_to_del = session.query(Inhibitor).filter(
    Inhibitor.inhib_pubchem_cid == 160968186).first()
session.delete(inh_to_del)

session.commit()
session.close()
示例#13
0
def link_ud_to_db(gene, residue):
    # TODO docstr

    # not found in DB message
    not_in_db = 'not in DB'

    # create dictionary for kinase-centric analysis data frame
    # 'KIN_ACC': {('SUB_GENE', 'RSD'),...}
    kin_to_ud = {}

    # open sqlite session
    session = create_sqlsession()

    # 3 table query left outer joining on substrate
    # one substrate can have multiple phosphosites; outer join allows for
    # return of a substrate if user modified residue has not been included in
    # DB
    # Kinases are associated with phosphosites, there may be no kinases for a
    # site
    query = session.query(Substrate.subs_gene, Substrate.subs_accession,
                          Phosphosite.phos_modified_residue,
                          Phosphosite.phos_group_id,
                          Kinase.kin_gene, Kinase.kin_accession)\
        .outerjoin(Phosphosite)\
        .outerjoin(kinases_phosphosites_table)\
        .outerjoin(Kinase)\
        .options(Load(Substrate).load_only("subs_gene"),
                 Load(Phosphosite).load_only("phos_modified_residue"),
                 Load(Kinase).load_only("kin_gene"))

    # substrate gene in the line
    # some substrates may have 'GENE_species', hence split on '_'
    s_gene = gene.split('_')[0]

    # filter the query based on substrate gene and modified residue
    # query_res format is [(gene, acc, rsd, grp id,
    # kin gene / None, kin acc / None)]
    query_res = query.filter(and_(
        Substrate.subs_gene == s_gene,
        Phosphosite.phos_modified_residue == residue)).all()

    # query_res can be [], [one], [several, records]
    # where each record is a tuple
    # (gene, accession, residue, grp id, kin gene, kin acc)

    # if query_res is [] because the phosphosite is not in DB, we still want
    # the substrate info available in the DB
    if len(query_res) == 0:
        subs_only = session.query(
            Substrate.subs_gene, Substrate.subs_accession)\
            .filter(Substrate.subs_gene == s_gene).all()
        query_res = subs_only

    # if the substrate is not in the DB 'not in DB' is added to the
    # three link entries/columns
    if len(query_res) == 0:
        new_line = (not_in_db, not_in_db, not_in_db)
        # print(new_line)

    else:
        unique_subs, unique_phos, unique_kin = distinct_records(query_res)
        if unique_kin:
            # if kinases were found in the query, map kinase genes to the user
            # gene and residue
            for kin in unique_kin:
                kinase_gene = kin[0]
                new_set = kin_to_ud.setdefault(kinase_gene, set())
                new_set.add((s_gene, residue))


        # append the new values to the columns dict db_links
        # if there are no records in any of the sets containing unique
        # records replace the empty set with 'not in DB'
        sub_line = sorted(unique_subs) if unique_subs else not_in_db
        phos_line = sorted(unique_phos) if unique_phos else not_in_db
        kin_line = sorted(unique_kin) if unique_kin else not_in_db
        new_line = list((sub_line, phos_line, kin_line))

    # remove all objects found in loop from session to reduce memory usage
    session.expire_all()
    session.close()
    if len(new_line) != 3:
        print(new_line)
    return new_line