Exemplo n.º 1
0
def mk_toxcastfp(idrow):
    """Build a ToxCast hit-call fingerprint table for the requested compounds.

    Parameters
    ----------
    idrow : pandas.DataFrame
        DataFrame whose column ``0`` holds the dsstox compound ids to keep.

    Returns
    -------
    pandas.DataFrame
        One row per compound id, one column per aeid, sorted by aeid.
        Cell values are 1 (hit), 0 (miss) or '' (assay not run).
    """
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    # pull (compound id, aeid, dtxcid, hitc) rows from invitrodb
    CID1 = mysession.execute(
        'SELECT compounds.id, mc5.aeid, compounds.dsstox_compound_id, mc5.hitc FROM invitrodb.mc5'
        ' JOIN invitrodb.mc4 ON mc4.m4id = mc5.m4id'
        ' JOIN invitrodb.sample ON sample.spid = mc4.spid'
        ' JOIN ro_stg_dsstox.generic_substances ON generic_substances.id = sample.chid'
        ' JOIN ro_stg_dsstox.generic_substance_compounds ON generic_substance_compounds.fk_generic_substance_id = generic_substances.id'
        ' JOIN ro_stg_dsstox.compounds ON compounds.id = generic_substance_compounds.fk_compound_id'
    )

    CID1 = pd.DataFrame(list(CID1))

    # keep only rows whose compound id appears in idrow (inner join on column 0)
    output_df = pd.merge(idrow, CID1, on=0, how='inner')

    # sort by compound id then aeid so the pivot below is deterministic
    output_df = output_df.sort_values(
        [output_df.columns.values[0], output_df.columns.values[1]])
    output_df = output_df.reset_index(drop=True)

    # hitc of -1 is treated as a miss (0)
    output_df = output_df.replace(-1, 0)

    # pivot: rows = compound id, columns = aeid, values = summed hit calls;
    # '' marks (compound, aeid) pairs with no data
    output_df = pd.pivot_table(output_df,
                               index=0,
                               columns=1,
                               values=3,
                               fill_value='',
                               aggfunc=np.sum)

    # collapse duplicate measurements: if ANY duplicate is a hit (sum > 1),
    # the fingerprint cell is considered a hit (1)
    output_df = output_df.apply(
        lambda col: [v if v == '' else (int(v) if v <= 1 else 1) for v in col])

    # not all aeid columns are present depending on the query;
    # pull the full aeid list and add any missing ones as empty columns
    full_columns = [
        x[0] for x in mysession.execute(
            'SELECT assay_component_endpoint.aeid FROM invitrodb.assay_component_endpoint'
        )
    ]

    for aeid in full_columns:
        if aeid not in output_df.columns.values:
            output_df[aeid] = ''

    # DataFrame.reindex_axis was removed in pandas 1.0;
    # reindex(..., axis=1) is the drop-in equivalent for sorting columns
    output_df = output_df.reindex(sorted(output_df.columns), axis=1)

    return output_df
Exemplo n.º 2
0
def check(f):
    """Return (DTXCIDs missing toxprints, all DTXCIDs).

    When nothing is missing: exits if ``f`` is falsy-False, otherwise
    returns an empty list alongside the full DTXCID list.
    """
    session = SQLSession(Schemas.qsar_schema).get_session()

    # all compound ids + DTXCIDs; smiles containing '|' are blacklisted
    id_df = pd.DataFrame(
        list(
            session.execute(
                'SELECT compounds.id, compounds.dsstox_compound_id FROM ro_stg_dsstox.compounds WHERE compounds.smiles NOT LIKE "%|%"'
            )))
    all_dtxcids = id_df[1].tolist()
    all_ids = id_df[0].tolist()

    # compound ids that already have toxprints (descriptor set 1445)
    have_prints = [
        row[0] for row in list(
            session.execute(
                'SELECT efk_dsstox_compound_id FROM sbox_rlougee_qsar.compound_descriptor_sets WHERE fk_descriptor_set_id = 1445'
            ))
    ]

    # ids still lacking toxprints
    missing_ids = list(set(all_ids) - set(have_prints))

    # map missing ids back to their DTXCIDs
    merged = pd.merge(pd.DataFrame(missing_ids), id_df, on=[0], how='left')
    missing_dtxcids = list(merged[1])

    ### IF QSAR.COMPOUND_DESCRIPTOR_SETS IS MISSING EITHER THEN GENERATE TOXPRINTS FOR THIS COMPOUND ###
    if not missing_dtxcids and f == False:
        print('SQL query is empty: All toxprints available or no DTXCIDs')
        sys.exit(0)
    elif not missing_dtxcids and f == True:
        return [], all_dtxcids
    else:
        return missing_dtxcids, all_dtxcids
Exemplo n.º 3
0
def check(f):
    """Return (ids missing ToxcastFPs, all invitrodb compound ids) as DataFrames."""
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    # compound ids present in invitrodb (joined through to dsstox compounds)
    invitro_rows = mysession.execute(
        'SELECT compounds.id FROM invitrodb.mc5'
        ' JOIN invitrodb.mc4 ON mc4.m4id = mc5.m4id'
        ' JOIN invitrodb.sample ON sample.spid = mc4.spid'
        ' JOIN ro_stg_dsstox.generic_substances ON generic_substances.id = sample.chid'
        ' JOIN ro_stg_dsstox.generic_substance_compounds ON generic_substance_compounds.fk_generic_substance_id = generic_substances.id'
        ' JOIN ro_stg_dsstox.compounds ON compounds.id = generic_substance_compounds.fk_compound_id'
    )
    invitro_ids = set(row[0] for row in list(invitro_rows))

    # compound ids that already have a ToxcastFP (descriptor set 1449)
    fp_query = mysession.query(CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == '1449')
    fp_ids = [row[0] for row in list(fp_query)]

    # # # CHECKS FOR ID AND ToxcastFPs IN QSAR.COMPOUND_DESCRIPTOR_SETS # # #
    # # # MAKE A LIST OF THE DSSTOX.COMPOUNDS.ID THAT DON'T HAVE SPECIAL ToxcastFPs # # #
    missing = list(invitro_ids - set(fp_ids))

    # # # # IF QSAR.COMPOUND_DESCRIPTOR_SETS IS MISSING EITHER THEN GENERATE ToxcastFPs FOR THIS COMPOUND # # #
    if missing == [] and f is False:
        print(Fore.RED +
              'SQL query is empty: All ToxcastFP available or no DTXCIDs')
        sys.exit(0)

    elif missing == [] and f is True:
        return pd.DataFrame([]), pd.DataFrame(invitro_ids)

    else:
        return pd.DataFrame(missing), pd.DataFrame(invitro_ids)
Exemplo n.º 4
0
def main():
    """List dataset names matching a search string (CLI entry point).

    Reads the search string from the argument ``<InputString>`` or, when
    absent, from stdin. Prints one matching dataset name per line.
    """
    args = docopt(__doc__)

    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    # set input arguments and options to variables
    InputString = args['<InputString>']
    if not InputString:
        InputString = sys.stdin.read()

    mysession = SQLSession(Schemas.dsstox_schema).get_session()
    # Bind the search term as a SQL parameter instead of interpolating it
    # into the statement with .format(), which was vulnerable to SQL
    # injection via the user-supplied string.
    query = mysession.execute(
        'SELECT datasets.name FROM sbox_rlougee_qsar.datasets WHERE datasets.name LIKE :pat',
        {'pat': '%{}%'.format(InputString)})
    # iterate rows directly: no pandas needed, and an empty result set
    # no longer raises a KeyError on the missing column 0
    for row in query:
        print(row[0])
Exemplo n.º 5
0
def main():
    """Queue enrichment calculations for datasets lacking univariate stats.

    Finds dataset names with no rows in univariate_calculations and fans
    them out to ``fillnewenrich`` across 20 worker processes.
    """
    args = docopt(__doc__)

    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    ### QUERY THE MYSQL DB 4 A COMPLETE LIST OF AEIDS, ENDPOINTS & DTXCIDS ###
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    # dataset names that have no univariate_calculations rows yet
    query0 = mysession.execute(
        'SELECT DISTINCT(name) FROM sbox_rlougee_qsar.datasets WHERE id NOT IN (SELECT fk_dataset_id FROM sbox_rlougee_qsar.univariate_calculations)'
    )
    query0 = [x[0] for x in list(query0)]

    # nothing to do: exit cleanly (empty-list truthiness, not len() == 0)
    if not query0:
        print(Fore.RED + 'Enrichments are up to date')
        sys.exit(0)

    # fan the dataset names out across 20 worker processes
    p = Pool(20)
    p.map(fillnewenrich, query0)
Exemplo n.º 6
0
from PIL import Image
from AssaySpaceIdentifier import assayspaceidentifier
from ImageMaker import imagemaker
from fpenrich import getenrichfp
from docopt import docopt
from database.database_schemas import Schemas
from database.session import SQLSession
import sys
import os
import pandas as pd
from colorama import init, Fore

# Module-level session against the qsar schema; both queries below run at
# import time (this file is a one-off analysis script, not a library).
mysession = SQLSession(Schemas.qsar_schema).get_session()
# Wide-form enrichment data for the aeid_9 invitrodbv2 dataset: one row per
# (dataset, descriptor), with GROUP_CONCAT pivoting the per-statistic rows
# into named columns (CT-Tot, TP, FP, FN, TN, BA, OR, P-Val, Inv OR, Inv P-Val).
enrichment_data_1 = mysession.execute(
    'SELECT datasets.name, descriptors.descriptors_name, descriptors.label, GROUP_CONCAT(CASE WHEN abbreviation = \'CT-Total\' THEN value ELSE NULL END) as \'CT-Tot\',GROUP_CONCAT(CASE WHEN abbreviation = \'TP\' THEN value ELSE NULL END) as \'TP\',GROUP_CONCAT(CASE WHEN abbreviation = \'FP\' THEN value ELSE NULL END) as \'FP\', GROUP_CONCAT(CASE WHEN abbreviation = \'FN\' THEN value ELSE NULL END) as \'FN\', GROUP_CONCAT(CASE WHEN abbreviation = \'TN\' THEN value ELSE NULL END) as \'TN\', GROUP_CONCAT(CASE WHEN abbreviation = \'BA\' THEN value ELSE NULL END) as \'BA\', GROUP_CONCAT(CASE WHEN abbreviation = \'OR\' THEN value ELSE NULL END) as \'OR\', GROUP_CONCAT(CASE WHEN abbreviation = \'P-Val\' THEN value ELSE NULL END) as \'P-Val\', GROUP_CONCAT(CASE WHEN abbreviation = \'Inv OR\' THEN value ELSE NULL END) as \'Inv OR\',GROUP_CONCAT(CASE WHEN abbreviation = \'Inv P-Val\' THEN value ELSE NULL END) as \'Inv P-Val\' FROM sbox_rlougee_qsar.datasets JOIN sbox_rlougee_qsar.univariate_calculations ON sbox_rlougee_qsar.univariate_calculations.fk_dataset_id = sbox_rlougee_qsar.datasets.id JOIN sbox_rlougee_qsar.uc_statistics ON sbox_rlougee_qsar.uc_statistics.fk_univ_calc_id = sbox_rlougee_qsar.univariate_calculations.id JOIN sbox_rlougee_qsar.statistics ON sbox_rlougee_qsar.statistics.id = sbox_rlougee_qsar.uc_statistics.fk_statistic_id JOIN sbox_rlougee_qsar.descriptors ON sbox_rlougee_qsar.descriptors.id = sbox_rlougee_qsar.univariate_calculations.fk_descriptor_id WHERE datasets.name LIKE \'%aeid\_9\_invitrodbv2%\' AND fk_descriptor_set_id = 1445 GROUP BY datasets.name, descriptors.descriptors_name, descriptors.label'
)

ed1 = pd.DataFrame(list(enrichment_data_1))

# Human-readable column names, matching the GROUP_CONCAT column order above.
ed1.columns = [
    'Data Table', 'Chemotype ID', 'Chemotype Label', 'Total Chemotypes',
    'True Positives', 'False Positives', 'False Negatives', 'True Negatives',
    'Balanced Accuracy', 'Odds Ratio', 'P-Value', 'Inverse Odds Ratio',
    'Inverse P-Value'
]

# print(pd.DataFrame(list(enrichment_data_1)).head())

# The same enrichment data in long form: one row per
# (dataset, descriptor, statistic abbreviation, value).
enrichment_data_2 = mysession.execute(
    'SELECT datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value FROM sbox_rlougee_qsar.datasets JOIN sbox_rlougee_qsar.univariate_calculations ON sbox_rlougee_qsar.univariate_calculations.fk_dataset_id = sbox_rlougee_qsar.datasets.id JOIN sbox_rlougee_qsar.uc_statistics ON sbox_rlougee_qsar.uc_statistics.fk_univ_calc_id = sbox_rlougee_qsar.univariate_calculations.id JOIN sbox_rlougee_qsar.statistics ON sbox_rlougee_qsar.statistics.id = sbox_rlougee_qsar.uc_statistics.fk_statistic_id JOIN sbox_rlougee_qsar.descriptors ON sbox_rlougee_qsar.descriptors.id = sbox_rlougee_qsar.univariate_calculations.fk_descriptor_id WHERE datasets.name LIKE \'%aeid\_9\_invitrodbv2%\' AND fk_descriptor_set_id = 1445 GROUP BY datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value'
)
Exemplo n.º 7
0
def main():
    """Pull chemotype enrichment statistics for a dataset and write reports.

    Queries univariate enrichment data for ``<DataSetName>``, selects
    significant (and inverse-significant) chemotypes by OR/P-value/TP
    thresholds, and writes stats tables plus optional fingerprint, image,
    and assay-space outputs into a fresh CTEW_<name> folder.

    Exits 1 when the dataset name matches nothing or the output folder
    already exists; exits 0 after printing the version.
    """
    args = docopt(__doc__)
    # print(args)

    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    # set input arguments and options to variables
    DataSetName = args['<DataSetName>'].lower()
    outputpath = args['--output']
    if outputpath is None:  # identity check, not `== None`
        outputpath = ''
    image = args['--image']
    allassays = args['--allassays']
    fullenrich = args['--fullenrich']
    fpenrich = args['--fpenrich']
    a = args['--all']
    OR = float(args['--OR'])
    PV = float(args['--PV'])
    TP = float(args['--TP'])

    # --all implies image, assay-space and fingerprint-enrichment outputs
    if a:
        image = True
        allassays = True
        fpenrich = True

    # Toxprint descriptor set
    descriptor_set_id = 1445

    # # # QUERY ENRICHMENT DATA FOR YOUR DATASETNAME # # #

    mysession = SQLSession(Schemas.qsar_schema).get_session()
    enrichment_data = mysession.execute(
        'SELECT datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value FROM sbox_rlougee_qsar.datasets JOIN sbox_rlougee_qsar.univariate_calculations ON sbox_rlougee_qsar.univariate_calculations.fk_dataset_id = sbox_rlougee_qsar.datasets.id JOIN sbox_rlougee_qsar.uc_statistics ON sbox_rlougee_qsar.uc_statistics.fk_univ_calc_id = sbox_rlougee_qsar.univariate_calculations.id JOIN sbox_rlougee_qsar.statistics ON sbox_rlougee_qsar.statistics.id = sbox_rlougee_qsar.uc_statistics.fk_statistic_id JOIN sbox_rlougee_qsar.descriptors ON sbox_rlougee_qsar.descriptors.id = sbox_rlougee_qsar.univariate_calculations.fk_descriptor_id WHERE datasets.name LIKE \'%{}%\' AND fk_descriptor_set_id = {} GROUP BY datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value'
        .format(DataSetName, descriptor_set_id))

    enrichment_data = pd.DataFrame(list(enrichment_data))

    if enrichment_data.empty:
        print(Fore.RED +
              'ERROR: DataSetName string returned no results {}'.format(
                  DataSetName))
        sys.exit(1)

    # format table, pivot columns, reorder
    # null columns are dropped without dropna=False but this adds empty rows
    enrichment_data = pd.pivot_table(enrichment_data,
                                     values=4,
                                     index=[0, 1, 2],
                                     columns=[3],
                                     dropna=False)
    enrichment_data = enrichment_data.reset_index(level=[0, 1, 2])
    # pivot emits statistic columns in the order listed here
    enrichment_data.columns = [
        'Data Table', 'Chemotype ID', 'Chemotype Label', 'Balanced Accuracy',
        'Total Chemotypes', 'False Negatives', 'False Positives',
        'Inverse Odds Ratio', 'Inverse P-Value', 'Odds Ratio', 'P-Value',
        'True Negatives', 'True Positives'
    ]

    # drop rows with fewer than 4 non-null values (empty rows from dropna=False)
    enrichment_data = enrichment_data.dropna(thresh=4)

    # reorder to the conventional confusion-matrix-first layout
    enrichment_data = enrichment_data[[
        'Data Table', 'Chemotype ID', 'Chemotype Label', 'Total Chemotypes',
        'True Positives', 'False Positives', 'False Negatives',
        'True Negatives', 'Balanced Accuracy', 'Odds Ratio', 'P-Value',
        'Inverse Odds Ratio', 'Inverse P-Value'
    ]]

    # FIND THE BASE FOR FILE NAMES (portion after the last ':')
    file_name = enrichment_data['Data Table'][0]
    file_name = file_name.split(':')[-1]

    # CREATE FOLDERS FOR RESULTS AND EDIT PATH
    if not os.path.exists(outputpath + 'CTEW_' + file_name):
        os.makedirs(outputpath + 'CTEW_' + file_name)
        outputpath = outputpath + 'CTEW_' + file_name
        os.makedirs(outputpath + '/CTEW_Results')
        os.makedirs(outputpath + '/CTEW-INV_Results')
        regoutputpath = outputpath + '/CTEW_Results/'
        invoutputpath = outputpath + '/CTEW-INV_Results/'
    else:
        print(Fore.RED +
              'ERROR: output folder {} already exists'.format(outputpath +
                                                              'CTEW_' +
                                                              file_name))
        sys.exit(1)

    # # # PULL SIGNIFICANT TOXPRINTS # # #
    enrichment_data2 = enrichment_data.dropna()
    enrichment_data2.loc[:, 'Odds Ratio'] = pd.to_numeric(
        enrichment_data2.loc[:, 'Odds Ratio'], errors='coerce')
    enrichment_data2.loc[:, 'P-Value'] = pd.to_numeric(
        enrichment_data2.loc[:, 'P-Value'], errors='coerce')
    enrichment_data2.loc[:, 'True Positives'] = pd.to_numeric(
        enrichment_data2.loc[:, 'True Positives'], errors='coerce')

    # a chemotype is significant when it clears all three thresholds
    sigtxp = []
    for i, x in enrichment_data2.iterrows():
        if x['Odds Ratio'] >= OR and x['P-Value'] <= PV and x[
                'True Positives'] >= TP:
            sigtxp.append(x['Chemotype ID'])

    # # # PULL SIGNIFICANT INVERSE TOXPRINTS # # #
    enrichment_data3 = enrichment_data.dropna()
    enrichment_data3.loc[:, 'Inverse Odds Ratio'] = pd.to_numeric(
        enrichment_data3.loc[:, 'Inverse Odds Ratio'], errors='coerce')
    enrichment_data3.loc[:, 'Inverse P-Value'] = pd.to_numeric(
        enrichment_data3.loc[:, 'Inverse P-Value'], errors='coerce')

    invsigtxp = []
    for i, x in enrichment_data3.iterrows():
        if x['Inverse Odds Ratio'] >= OR and x['Inverse P-Value'] <= PV:
            invsigtxp.append(x['Chemotype ID'])

    # CREATE SIGNIFICANT TOXPRINT EXPORT TABLE AND INVERSE SIGNIFICANT TOXPRINT TABLE
    sigtxp = pd.DataFrame(sigtxp, columns=['Chemotype ID'])
    OutputTable = pd.merge(sigtxp,
                           enrichment_data,
                           on='Chemotype ID',
                           how='inner')

    invsigtxp = pd.DataFrame(invsigtxp, columns=['Chemotype ID'])
    InvOutputTable = pd.merge(invsigtxp,
                              enrichment_data,
                              on='Chemotype ID',
                              how='inner')

    # CREATE FULL ENRICHMENT STATISTICS TABLE
    if fullenrich:
        enrichment_data.to_csv('{}Full_Enrichment_Stats_{}.tsv'.format(
            outputpath, file_name),
                               sep='\t',
                               index=False)

    # # # ENRICHED CHEMOTYPES # # #
    # return a table containing enriched chemotypes X DTXCID;
    # best-effort: a failed fingerprint build must not abort the report.
    # Narrowed from bare `except:` so Ctrl-C / SystemExit still propagate.
    if fpenrich:
        try:
            ct_model_row = getenrichfp(DataSetName, sigtxp, regoutputpath,
                                       'CT-Enriched_FP_' + file_name)
            # DataFrame.append was removed in pandas 2.0; concat is equivalent
            OutputTable = pd.concat([OutputTable, ct_model_row])
        except Exception:
            pass
        try:
            ct_inv_model_row = getenrichfp(DataSetName, invsigtxp,
                                           invoutputpath,
                                           'CT-INV-Enriched_FP_' + file_name)
            InvOutputTable = pd.concat([InvOutputTable, ct_inv_model_row])
        except Exception:
            pass

    # # # CREATE CT IMAGES # # #
    if image:
        imagemaker(sigtxp, 'CT-Enriched_Images_' + file_name, regoutputpath)
        imagemaker(invsigtxp, 'CT-INV-Enriched_Images_' + file_name,
                   invoutputpath)

    # # # CREATE ADDITIONAL ASSAY INFO # # #
    if allassays:
        assayspaceidentifier(sigtxp, 'CT-Enriched_TCAssays_' + file_name,
                             regoutputpath)
        assayspaceidentifier(invsigtxp,
                             'CT-INV-Enriched_TCAssays_' + file_name,
                             invoutputpath)

    # # # EXPORT CT-ENRICHED_STATS & CT-INV-ENRICHED_STATS # # #
    # best-effort exports, same narrowing rationale as above
    try:
        OutputTable.to_csv('{}CT-Enriched_Stats_{}.tsv'.format(
            regoutputpath, file_name),
                           sep='\t',
                           index=False)
    except Exception:
        pass
    try:
        InvOutputTable.to_csv('{}CT-INV-Enriched_Stats_{}.tsv'.format(
            invoutputpath, file_name),
                              sep='\t',
                              index=False)
    except Exception:
        pass
Exemplo n.º 8
0
from database.database_schemas import Schemas
from database.session import SQLSession
from database.qsar.descriptors import Descriptors
import sys
import os
import pandas as pd
import datetime

# THIS CODE IS TO BE USED ONE TIME TO FILL sbox_rlougee_qsar.descriptors table for the fingerprint labels, ids, MySQL_setup

# add Toxcast fingerprint data to MySQL
# query AEID data
# One-time script body: read all assay endpoints so they can be inserted
# into sbox_rlougee_qsar.descriptors (insert loop is truncated below).
mysession = SQLSession(Schemas.qsar_schema).get_session()
aeid_data = mysession.execute(
    'SELECT aeid, assay_component_endpoint_name FROM invitrodb.assay_component_endpoint'
)

aeid_data = pd.DataFrame(list(aeid_data))
# force aeid to int so the sort below is numeric, not lexicographic
aeid_data[0] = [int(x) for x in aeid_data[0]]
aeid_data = aeid_data.sort_values(0, axis=0)

aeid_data = aeid_data.reset_index(drop=True)
# print(aeid_data)
# sys.exit(1)

# add ASSAY_COMPONENT_ENDPOINT_NAME as descriptors.label
# use aeid as descriptors_name

for i, row in aeid_data.iterrows():
    username = '******'  # redacted author/username for the insert
    # create a new datasets name entry
Exemplo n.º 9
0
# Load the non-burst and burst full-enrichment result tables.
nonburst = pd.read_csv("/home/rlougee/Desktop/primary_data/Non-Burst_MC_v2_2018_full_enrichment_results.tsv", sep='\t')
burst = pd.read_csv("/home/rlougee/Desktop/primary_data/Burst_MC_v2_2018_full_enrichment_results.tsv", sep='\t')

# Strip dataset-name boilerplate: nonburst names reduce to an integer aeid,
# burst names reduce to the endpoint-name string.
nonburst['name'] = nonburst['name'].str.replace('Imported_DataTable:aeid_','').str.replace('_invitrodbv2_20180918', '').apply(int)
burst['name'] = burst['name'].str.replace('Imported_DataTable:','').str.replace('_Burst_MC_v2_20181120', '')

print(nonburst.tail())
print(burst.tail())

# QUERY AEID AND CORRECT ASSAY NAMES

mysession = SQLSession(Schemas.information_schema).get_session()


# q0: column 0 = aeid (int), column 1 = assay_component_endpoint_name
q0 = mysession.execute('SELECT aeid, assay_component_endpoint_name FROM sbox_rlougee_invitrodb.assay_component_endpoint')
q0 = pd.DataFrame(list(q0))
# q0[1] = q0[1].apply(str)
print('q0 shape', q0.shape)
print('nonburst shape', nonburst.shape)
print('burst shape', burst.shape)

# nonburst keys on aeid (q0 column 0); burst keys on endpoint name (q0 column 1)
nonburst2 = pd.merge(nonburst, q0, how='inner', left_on=['name'], right_on=[0])
print(nonburst2.shape)

burst2 = pd.merge(burst, q0, how='inner', left_on=['name'], right_on=[1])
print(burst2.shape)

# ok merges look good now

nonburst2.to_csv('/home/rlougee/Desktop/nonburst.tsv', sep='\t')
Exemplo n.º 10
0
import glob
from multiprocessing import Pool

def com(i):
    """Run pullenrichment for dataset-name pattern *i* in a subshell."""
    print(i)
    cmd = 'pullenrichment "{}%" --fpenrich -o "/share/home/rlougee/Desktop/invitrodb_v2_enrichments/"'.format(i)
    proc = subp.Popen(cmd, shell=True)
    # block until the subshell finishes
    proc.communicate()

# create a tsv with dataset name x TP/TOTAL_CHEM so that we can see the coverage of our Toxprint Models

# get all relevant datasets
# get all relevant datasets (invitrodbv2 per-aeid data tables)
mysession = SQLSession(Schemas.qsar_schema).get_session()

datasets = [x[0] for x in mysession.execute('SELECT datasets.name FROM sbox_rlougee_qsar.datasets'
                                 ' WHERE sbox_rlougee_qsar.datasets.name LIKE "Imported_DataTable:aeid\_%\_invitrodbv2\_20180918"')]

# print(len(datasets))
# print(datasets[180:])
# sys.exit(1)
# # get TP stats file for each dataset
# for i in datasets[38:]:
#     print(i)
#     sys.exit(0)
#     command = subp.Popen('pullenrichment "{}%" --fpenrich -o "/share/home/rlougee/Desktop/invitrodb_v2_enrichments/"'.format(i), shell=True)
#     command.communicate()


# Run `com` for every dataset name across 15 worker processes.
p = Pool(15)
p.map(com, datasets)
Exemplo n.º 11
0
def getenrichfp(DataSetName, sigtxp, mypath, myfilename, dsi=1445):
    """Get Enrichment data for a combined set of chemotypes.

    Builds a compound x significant-chemotype matrix for the dataset, then
    scores the combined "any significant chemotype fires" model against the
    measured hit calls.

    Parameters
    ----------
    DataSetName : substring matched against sbox_rlougee_qsar.datasets.name
    sigtxp : sequence of significant chemotype descriptor names
    mypath : output directory (only used by the commented-out export)
    myfilename : label placed in the returned row's 'Data Table' column
    dsi : descriptor set id (default 1445 = Toxprints)

    Returns
    -------
    One-row DataFrame holding the combined model's confusion matrix and
    Fisher-exact statistics, or None when ``sigtxp`` is empty.
    """
    # aborts if no significant chemotypes
    if len(sigtxp) == 0:
        return None

    mysession = SQLSession(Schemas.qsar_schema).get_session()
    MyDataSet = mysession.execute(
        'SELECT dsstox_compound_id, measured_value_dn, descriptor_string_tsv FROM sbox_rlougee_qsar.datasets'
        ' JOIN sbox_rlougee_qsar.dataset_datapoints ON sbox_rlougee_qsar.dataset_datapoints.fk_dataset_id = sbox_rlougee_qsar.datasets.id'
        ' JOIN sbox_rlougee_qsar.datapoints ON sbox_rlougee_qsar.datapoints.id = sbox_rlougee_qsar.dataset_datapoints.fk_datapoint_id'
        ' JOIN ro_stg_dsstox.compounds ON sbox_rlougee_qsar.datapoints.efk_dsstox_compound_id = ro_stg_dsstox.compounds.id'
        ' JOIN sbox_rlougee_qsar.compound_descriptor_sets ON ro_stg_dsstox.compounds.id = sbox_rlougee_qsar.compound_descriptor_sets.efk_dsstox_compound_id'
        ' WHERE sbox_rlougee_qsar.datasets.name LIKE \'%{}%\' AND sbox_rlougee_qsar.compound_descriptor_sets.fk_descriptor_set_id = {}'.format(DataSetName, dsi))
    MyDataSet = pd.DataFrame(list(MyDataSet))

    MyDataSet.columns = ['Dsstox_Compound_ID', 'Hit_Call', 'Toxprint']

    # split the tab-separated fingerprint string into one column per descriptor
    MyDataSet = pd.concat([MyDataSet, MyDataSet['Toxprint'].str[:].str.split('\t', expand=True)], axis=1)
    MyDataSet = MyDataSet.drop('Toxprint', axis=1)

    # rename the positional fingerprint columns (0, 1, 2, ...) to their
    # descriptor labels in a single pass instead of one rename per column
    query3 = mysession.query(Descriptors.descriptors_name, Descriptors.label).filter(Descriptors.fk_descriptor_set_id == dsi)
    descriptornames = pd.DataFrame(list(query3))
    MyDataSet = MyDataSet.rename(columns=dict(enumerate(descriptornames['label'])))

    # map significant descriptor names to their labels, then drop every
    # fingerprint column that is not significant (set gives O(1) membership)
    sigtxp = pd.DataFrame(sigtxp)
    sigtxp.columns = ['descriptors_name']
    siglabel = pd.merge(sigtxp, descriptornames, on='descriptors_name', how='inner')
    siglabel = set(siglabel['label'])
    MyDataSet = MyDataSet.drop(
        [col for col in MyDataSet.columns[2:] if col not in siglabel], axis=1)

    # MyDataSet.to_csv('{}{}.tsv'.format(mypath, myfilename), sep='\t', index=False)

    # one-row stats table for the combined significant-chemotype model
    model_row = pd.DataFrame([['Chemotype Full Model Coverage', myfilename, " ".join(sigtxp['descriptors_name']), 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0]], columns = ['Chemotype ID','Data Table','Chemotype Label','Total Chemotypes','True Positives','False Positives','False Negatives','True Negatives','Balanced Accuracy','Odds Ratio','P-Value','Inverse Odds Ratio','Inverse P-Value'])

    # fill model_row confusion matrix: the model "predicts hit" for a
    # compound when any significant chemotype fires (row sum > 0)
    for index, row in MyDataSet.iterrows():
        rowsum = sum([int(x) for x in row.iloc[2:]])
        if row['Hit_Call'] == 1 and rowsum > 0:
            model_row['True Positives'] += 1
        elif row['Hit_Call'] == 1 and rowsum == 0:
            model_row['False Negatives'] += 1
        elif row['Hit_Call'] == 0 and rowsum > 0:
            model_row['False Positives'] += 1
        elif row['Hit_Call'] == 0 and rowsum == 0:
            model_row['True Negatives'] += 1

    # fill model_row statistics (one-sided Fisher exact, both orientations)
    oddsratio, pvalue = stats.fisher_exact([ [int(model_row['True Positives']), int(model_row['False Positives'])], [int(model_row['False Negatives']), int(model_row['True Negatives'])]], alternative='greater')
    model_row['P-Value'] = pvalue
    model_row['Odds Ratio'] = oddsratio
    model_row['Total Chemotypes'] = (model_row['True Positives'] + model_row['False Positives'])
    # NOTE(review): raises ZeroDivisionError when TP+FN or TN+FP is 0 —
    # callers currently swallow this via try/except; confirm that is intended
    BA = (((model_row['True Positives'] / (model_row['True Positives'] + model_row['False Negatives'])) + (model_row['True Negatives'] / (model_row['True Negatives'] + model_row['False Positives']))) / 2)
    model_row['Balanced Accuracy'] = float(BA)
    inv_oddsratio, inv_pvalue = stats.fisher_exact([ [int(model_row['False Positives']), int(model_row['True Positives'])], [int(model_row['True Negatives']), int(model_row['False Negatives'])] ],alternative='greater')
    model_row['Inverse P-Value'] = inv_pvalue
    model_row['Inverse Odds Ratio'] = inv_oddsratio

    return model_row
Exemplo n.º 12
0
import glob
import pandas as pd
from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.session import SQLSession
import sys

# Re-export each per-aeid assay file under its human-readable endpoint name.
for f in glob.glob(
        "/home/rlougee/Desktop/invitrodb_v2_burst_splits/clean_assays/*"):
    # the file's basename is the aeid
    aeid = f.split('/')[-1]
    a = pd.read_csv(f, sep='\t')

    mysession = SQLSession(Schemas.qsar_schema).get_session()

    # look up the endpoint name for this aeid
    query0 = mysession.execute(
        'SELECT assay_component_endpoint_name FROM sbox_rlougee_invitrodb.assay_component_endpoint WHERE sbox_rlougee_invitrodb.assay_component_endpoint.aeid = {}'
        .format(aeid))

    output_name = list(query0)[0][0]
    # drop columns containing NaN before writing the renamed copy
    a = a.dropna(axis=1)
    a.to_csv(
        '/home/rlougee/Desktop/invitrodb_v2_burst_splits/clean_assays2/{}.tsv'.
        format(output_name),
        sep='\t',
        index=False)
from multiprocessing import Pool

# pull enrichments for the BURST assays

def com(i):
    """Run pullenrichment for burst dataset pattern *i* with thread limits set."""
    print(i)
    shell_cmd = 'export OPM_NUM_THREADS=5 & export USE_SIMPLE_THREADED_LEVEL3=1 &  pullenrichment "{}%" --fpenrich -o "/share/home/rlougee/Desktop/invitrodb_v2_burst_enrichments/"'.format(i)
    proc = subp.Popen(shell_cmd, shell=True)
    # wait for the subshell to finish before returning
    proc.communicate()

# create a tsv with dataset name x TP/TOTAL_CHEM so that we can see the coverage of our Toxprint Models

# get all relevant datasets
# get all relevant datasets (Burst MC v2 data tables)
mysession = SQLSession(Schemas.qsar_schema).get_session()

datasets = [x[0] for x in mysession.execute('SELECT datasets.name FROM sbox_rlougee_qsar.datasets'
                                 ' WHERE sbox_rlougee_qsar.datasets.name LIKE "%\_Burst\_MC\_v2\_2018112%"')]


# print(len(datasets))
# print(datasets[180:])
# sys.exit(1)
# # get TP stats file for each dataset
# for i in datasets[38:]:
#     print(i)
#     sys.exit(0)
#     command = subp.Popen('pullenrichment "{}%" --fpenrich -o "/share/home/rlougee/Desktop/invitrodb_v2_enrichments/"'.format(i), shell=True)
#     command.communicate()

# Run `com` across 5 worker processes, resuming from dataset index 384
# (earlier entries were presumably processed in a previous run).
p = Pool(5)
p.map(com, datasets[384:])