def mk_toxcastfp(idrow):
    # query existing toxprints
    dsi = 1449
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    # pull compound ids in invitrodb
    CID1 = mysession.execute(
        'SELECT compounds.id, mc5.aeid, compounds.dsstox_compound_id, mc5.hitc FROM invitrodb.mc5'
        ' JOIN invitrodb.mc4 ON mc4.m4id = mc5.m4id'
        ' JOIN invitrodb.sample ON sample.spid = mc4.spid'
        ' JOIN ro_stg_dsstox.generic_substances ON generic_substances.id = sample.chid'
        ' JOIN ro_stg_dsstox.generic_substance_compounds ON generic_substance_compounds.fk_generic_substance_id = generic_substances.id'
        ' JOIN ro_stg_dsstox.compounds ON compounds.id = generic_substance_compounds.fk_compound_id'
    )
    CID1 = pd.DataFrame(list(CID1))
    # keep only rows whose compound id appears in idrow
    output_df = pd.merge(idrow, CID1, on=0, how='inner')
    # sort by compound id, then aeid
    output_df = output_df.sort_values(
        [output_df.columns.values[0], output_df.columns.values[1]])
    output_df = output_df.reset_index(drop=True)
    # replace -1 hitcall values with 0 !!!
    output_df = output_df.replace(-1, 0)
    # pivot to one row per DTXCID, one column per aeid
    output_df = pd.pivot_table(output_df,
                               index=0,
                               columns=1,
                               values=3,
                               fill_value='',
                               aggfunc=np.sum)
    # set the correct hitcall:
    # if any duplicate is a hit, the fingerprint is considered a hit !!!
    output_df = output_df.apply(
        lambda x: [y if y == '' else (int(y) if y <= 1 else 1) for y in x])
    # not all aeid columns are present, depending on the query;
    # pull the full aeid list and add the missing ones as empty columns
    full_columns = [
        x[0] for x in mysession.execute(
            'SELECT assay_component_endpoint.aeid FROM invitrodb.assay_component_endpoint'
        )
    ]
    for x in full_columns:
        if x not in output_df.columns.values:
            output_df[x] = ''
    # sort columns by aeid
    # output_df = output_df.sort_values(list(output_df.columns), axis=1, ascending=True)
    output_df = output_df.reindex(sorted(output_df.columns), axis=1)
    return output_df
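
# A minimal sketch of the duplicate-collapse step above on toy data
# (hypothetical compound ids, aeids, and hitcalls, not from invitrodb):
# pivoting with aggfunc=np.sum adds duplicate hitcalls together, and the
# clamp then maps any sum > 1 back to a single hit.
if __name__ == '__main__':
    import numpy as np
    import pandas as pd
    toy = pd.DataFrame({0: ['C1', 'C1', 'C2'],  # compound id
                        1: [10, 10, 10],        # aeid
                        3: [1, 1, 0]})          # hitc; C1 was assayed twice
    piv = pd.pivot_table(toy, index=0, columns=1, values=3, fill_value='', aggfunc=np.sum)
    piv = piv.apply(lambda x: [y if y == '' else (int(y) if y <= 1 else 1) for y in x])
    print(piv)  # C1 -> 1 (sum of 2 clamped to 1), C2 -> 0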
def mk_special_fingerprints(idrow):
    # query existing toxprints
    dsi = 1445
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    query2 = mysession.query(Compounds.dsstox_compound_id, CompoundDescriptorSets.descriptor_string_tsv) \
        .join(CompoundDescriptorSets, Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == dsi)
    df2 = pd.DataFrame(list(query2))
    # filtering by an IN list of >1000 ids won't work in many databases (and
    # was breaking the script here on a full update), so do the full query
    # and merge afterwards
    df2 = pd.merge(pd.DataFrame(idrow, columns=['dsstox_compound_id']),
                   df2, on='dsstox_compound_id')
    # separate and name the fingerprint columns
    df2 = pd.concat(
        [df2, df2['descriptor_string_tsv'].str.split('\t', expand=True)],
        axis=1)
    df2 = df2.drop('descriptor_string_tsv', axis=1)

    # # # GENERATE SPECIFIC NEW FINGERPRINTS # # #
    # create an empty column for the new fingerprint
    df2['Special_Toxprints'] = ""
    # iterate through the datatable and create the new special toxprints;
    # make sure you are looking at the right index when combining toxprints.
    # iterrows() yields copies, so write back with df2.at rather than
    # assigning to the row object (assignments to `row` never reach df2).
    for index, row in df2.iterrows():
        # each special toxprint has to be coded this way
        special = '1' if row[480] == '1' and row[489] == '1' else '0'
        # tab-separate each additional special toxprint
        special += '\t1' if row[489] == '1' and row[480] == '0' else '\t0'
        special += '\t1' if row[480] == '1' and row[489] == '0' else '\t0'
        df2.at[index, 'Special_Toxprints'] = special
    # remove everything but fingerprints and DTXCIDs
    output_df = df2[['dsstox_compound_id', 'Special_Toxprints']]
    return output_df
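
# A vectorized sketch of the same special-toxprint rules (hypothetical
# helper, not part of the pipeline): boolean masks on columns 480 and 489
# replace the per-row loop and avoid the iterrows write-back pitfall.
def mk_special_fingerprints_vectorized(df2):
    both = ((df2[480] == '1') & (df2[489] == '1')).astype(int).astype(str)
    only_489 = ((df2[489] == '1') & (df2[480] == '0')).astype(int).astype(str)
    only_480 = ((df2[480] == '1') & (df2[489] == '0')).astype(int).astype(str)
    df2 = df2.assign(Special_Toxprints=both + '\t' + only_489 + '\t' + only_480)
    return df2[['dsstox_compound_id', 'Special_Toxprints']]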
def updatedatabase(fingerprintdf):
    username = '******'
    descriptors_index = 1448
    # query for compound_descriptor_sets.id
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    query3 = mysession.query(Compounds.dsstox_compound_id, CompoundDescriptorSets.id) \
        .join(CompoundDescriptorSets, Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == descriptors_index)
    df3 = pd.DataFrame(list(query3))
    # join query with fingerprintdf
    mytable = pd.merge(df3, fingerprintdf, on='dsstox_compound_id')

    # # # CODE FOR UPDATING # # #
    for index, row in mytable.iterrows():
        id = row[1]
        descriptor_string_tsv = row[2]
        updated_by = username
        mysession.query(CompoundDescriptorSets.id,
                        CompoundDescriptorSets.descriptor_string_tsv,
                        CompoundDescriptorSets.updated_by).filter(
                            CompoundDescriptorSets.id == id).update({
                                "descriptor_string_tsv": descriptor_string_tsv,
                                "updated_by": updated_by
                            })
    mysession.commit()
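
# The loop above issues one UPDATE per fingerprint. A sketch of a batched
# alternative using SQLAlchemy's bulk_update_mappings (hypothetical helper;
# assumes the same mytable layout as above, with CompoundDescriptorSets.id
# in column 1 and the tsv string in column 2, and that id is the mapped
# primary key).
def bulk_update_fingerprints(mysession, mytable, username='******'):
    # one mapping dict per row; the primary key selects the row to update
    mappings = [{'id': row[1],
                 'descriptor_string_tsv': row[2],
                 'updated_by': username} for _, row in mytable.iterrows()]
    mysession.bulk_update_mappings(CompoundDescriptorSets, mappings)
    mysession.commit()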
def check(f):
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    # pull compound ids in invitrodb
    CID1 = mysession.execute(
        'SELECT compounds.id FROM invitrodb.mc5'
        ' JOIN invitrodb.mc4 ON mc4.m4id = mc5.m4id'
        ' JOIN invitrodb.sample ON sample.spid = mc4.spid'
        ' JOIN ro_stg_dsstox.generic_substances ON generic_substances.id = sample.chid'
        ' JOIN ro_stg_dsstox.generic_substance_compounds ON generic_substance_compounds.fk_generic_substance_id = generic_substances.id'
        ' JOIN ro_stg_dsstox.compounds ON compounds.id = generic_substance_compounds.fk_compound_id'
    )
    CID1 = set(x[0] for x in CID1)
    # pull compound ids in compound_descriptor_sets
    CID2 = mysession.query(CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == '1449')
    CID2 = [x[0] for x in list(CID2)]

    # # # CHECK FOR IDS AND ToxcastFPs IN QSAR.COMPOUND_DESCRIPTOR_SETS # # #
    # # # MAKE A LIST OF THE DSSTOX.COMPOUNDS.ID THAT DON'T HAVE ToxcastFPs # # #
    CID3 = list(CID1 - set(CID2))
    # check dataframes
    # print("\n set CID1:", len(set(CID1)))
    # print("\n set CID2:", len(set(CID2)))
    # print("\n CID3", len(CID3), CID3)
    # print("\n CID3df", pd.DataFrame(CID3).head())
    # print("\n CID1df", pd.DataFrame(list(CID1)).head())

    # # # IF QSAR.COMPOUND_DESCRIPTOR_SETS IS MISSING EITHER, GENERATE ToxcastFPs FOR THAT COMPOUND # # #
    if CID3 == [] and f is False:
        print(Fore.RED + 'SQL query is empty: All ToxcastFP available or no DTXCIDs')
        sys.exit(0)
    elif CID3 == [] and f is True:
        # pd.DataFrame rejects sets, so convert CID1 to a list first
        return pd.DataFrame([]), pd.DataFrame(list(CID1))
    else:
        return pd.DataFrame(CID3), pd.DataFrame(list(CID1))
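
# The missing-fingerprint check above is a plain set difference; a toy
# illustration with hypothetical ids:
# in_invitrodb = {101, 102, 103}   # compounds with mc5 hitcalls
# have_fp = {102}                  # compounds that already have ToxcastFPs
# sorted(in_invitrodb - have_fp)   # -> [101, 103]: still need fingerprints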
def main():
    args = docopt(__doc__)
    # print(args)
    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)
    # set input arguments and options to variables
    InputString = args['<InputString>']
    if not InputString:
        InputString = sys.stdin.read()
    mysession = SQLSession(Schemas.dsstox_schema).get_session()
    query = mysession.execute(
        'SELECT datasets.name FROM sbox_rlougee_qsar.datasets WHERE datasets.name LIKE "%{}%"'
        .format(InputString))
    for i in pd.DataFrame(list(query))[0]:
        print(i)
def fillnewenrich(x_aeid):
    print(x_aeid)
    # retrieve the latest dataset for the aeid
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    new_df = mysession.query(Compounds.dsstox_compound_id, Datapoints.measured_value_dn, Datasets.name, Datasets.id) \
        .join(Datapoints, Datapoints.efk_dsstox_compound_id == Compounds.id) \
        .join(DatasetDatapoints, Datapoints.id == DatasetDatapoints.fk_datapoint_id) \
        .join(Datasets, DatasetDatapoints.fk_dataset_id == Datasets.id) \
        .filter(Datasets.name == x_aeid)
    new_df = pd.DataFrame(list(new_df))
    # new_df = query1[query1['name'].isin([x_aeid])]
    # rename columns
    new_df.columns = ['dsstox_compound_id', 'hitc', 'name', 'dataset_id']
    my_dataset_id = new_df['dataset_id'].iloc[0]
    # make the enrichment table
    filluc(new_df, my_dataset_id)
def filldatabase(yourlist, desc_set_id, prefix):
    username = '******'
    descriptors_index = desc_set_id
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    descriptor_objects = []
    for index, val in enumerate(yourlist):
        fk_descriptor_set_id = descriptors_index
        index_number = index + 1
        descriptors_name = str(prefix.format(index + 1))
        created_by = username
        updated_by = username
        long_description = ''
        short_description = ''
        label = val
        descriptor = Descriptors(index_number=index_number,
                                 descriptors_name=descriptors_name,
                                 long_description=long_description,
                                 short_description=short_description,
                                 label=label,
                                 created_by=created_by,
                                 fk_descriptor_set_id=fk_descriptor_set_id,
                                 updated_by=updated_by)
        descriptor_objects.append(descriptor)
    mysession.bulk_save_objects(descriptor_objects)
    mysession.commit()
def filldatabase(fingerprintdf):
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    username = '******'
    descriptors_index = 1448
    compound_descriptor_set_objects = []
    # print(fingerprintdf)
    for index, row in fingerprintdf.iterrows():
        # strip the DTXCID prefix at the first '0' to recover the compound id
        efk_dsstox_compound_id = row[0].split('0', 1)[1]
        fk_descriptor_set_id = descriptors_index
        xx = [str(x) for x in list(row[1])]
        descriptor_string_tsv = '\t'.join(xx)
        created_by = username
        updated_by = username
        compound_descriptor_sets = CompoundDescriptorSets(
            efk_dsstox_compound_id=efk_dsstox_compound_id,
            descriptor_string_tsv=descriptor_string_tsv,
            created_by=created_by,
            fk_descriptor_set_id=fk_descriptor_set_id,
            updated_by=updated_by)
        compound_descriptor_set_objects.append(compound_descriptor_sets)
    mysession.bulk_save_objects(compound_descriptor_set_objects)
    mysession.commit()
def check(f):
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    # get all dtxcids;
    # remove blacklisted smiles here with NOT LIKE '%|%'
    df1 = pd.DataFrame(
        list(
            mysession.execute(
                'SELECT compounds.id, compounds.dsstox_compound_id FROM ro_stg_dsstox.compounds WHERE compounds.smiles NOT LIKE "%|%"'
            )))
    l0 = df1[1].tolist()
    l1 = df1[0].tolist()
    # get all compound ids that already have toxprints
    l2 = [
        i[0] for i in mysession.execute(
            'SELECT efk_dsstox_compound_id FROM sbox_rlougee_qsar.compound_descriptor_sets WHERE fk_descriptor_set_id = 1445'
        )
    ]
    # find the ids missing toxprints
    l3 = list(set(l1) - set(l2))
    # merge df1 and l3 to recover the matching dtxcids
    df2 = pd.merge(pd.DataFrame(l3), df1, on=[0], how='left')
    l4 = list(df2[1])

    ### IF QSAR.COMPOUND_DESCRIPTOR_SETS IS MISSING EITHER, GENERATE TOXPRINTS FOR THAT COMPOUND ###
    if not l4 and f is False:
        print('SQL query is empty: All toxprints available or no DTXCIDs')
        sys.exit(0)
    elif not l4 and f is True:
        return [], l0
    else:
        return l4, l0
def main():
    args = docopt(__doc__)
    # print(args)
    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    ### QUERY THE MYSQL DB FOR A COMPLETE LIST OF AEIDS, ENDPOINTS & DTXCIDS ###
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    # find datasets that do not yet have univariate calculations
    query0 = mysession.execute(
        'SELECT DISTINCT(name) FROM sbox_rlougee_qsar.datasets WHERE id NOT IN (SELECT fk_dataset_id FROM sbox_rlougee_qsar.univariate_calculations)'
    )
    query0 = [x[0] for x in list(query0)]
    # if the query is empty, exit
    if len(query0) == 0:
        print(Fore.RED + 'Enrichments are up to date')
        sys.exit(0)
    p = Pool(20)
    p.map(fillnewenrich, query0)
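
# Note on the Pool above: each worker builds its own SQLSession inside
# fillnewenrich, which is the safe pattern here because MySQL connections
# generally cannot be shared across forked processes. A minimal sketch of
# that pattern (hypothetical worker, for illustration only):
#
# def worker(dataset_name):
#     session = SQLSession(Schemas.qsar_schema).get_session()  # per-process session
#     ...  # query and process dataset_name with this session
#
# with Pool(4) as p:
#     p.map(worker, query0)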
def update_toxprint_database(fingerprintdf):
    username = '******'
    descriptors_index = 1445
    # query for compound_descriptor_sets.id
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    query3 = mysession.query(CompoundDescriptorSets.id, CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == descriptors_index)
    df3 = pd.DataFrame(list(query3))
    # join query with fingerprintdf
    fingerprintdf = fingerprintdf.reset_index()
    fingerprintdf = fingerprintdf.rename(
        columns={'M_NAME': 'efk_dsstox_compound_id'})  # not sure this will work
    # convert 'DTXCID...' strings to the integer compound id
    fingerprintdf['efk_dsstox_compound_id'] = [
        int(x[8:]) for x in fingerprintdf['efk_dsstox_compound_id']
    ]
    mytable = pd.merge(df3, fingerprintdf, on='efk_dsstox_compound_id')

    # # # CODE FOR UPDATING # # #
    for index, row in mytable.iterrows():
        id = str(row[0])
        xx = [str(x) for x in list(row[2:])]
        descriptor_string_tsv = '\t'.join(xx)
        updated_by = username
        mysession.query(CompoundDescriptorSets.id,
                        CompoundDescriptorSets.descriptor_string_tsv,
                        CompoundDescriptorSets.updated_by).filter(
                            CompoundDescriptorSets.id == id).update({
                                "descriptor_string_tsv": descriptor_string_tsv,
                                "updated_by": updated_by
                            })
    mysession.commit()
from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptors import CompoundDescriptors
from database.session import SQLSession
from database.dsstox.generic_substance_compounds import GenericSubstanceCompounds
from database.dsstox.generic_substances import GenericSubstances
import pandas as pd

mytable = pd.read_csv('~/Desktop/Katies_data.tsv', sep='\t')
dtxsid = mytable.iloc[:, 4]

mysession = SQLSession(Schemas.qsar_schema).get_session()
query = mysession.query(GenericSubstances.dsstox_substance_id, Compounds.id, Compounds.dsstox_compound_id) \
    .join(GenericSubstanceCompounds) \
    .join(Compounds) \
    .filter(GenericSubstances.dsstox_substance_id.in_(dtxsid))
df = pd.DataFrame(list(query))
myids = [int(x) for x in df.iloc[:, 1]]

# query1 = mysession.query(Compounds.id, Compounds.dsstox_compound_id).filter(Compounds.dsstox_compound_id.in_(dtxsid))
# df1 = pd.DataFrame(list(query1))

query2 = mysession.query(CompoundDescriptors.efk_dsstox_compound_id,
                         CompoundDescriptors.descriptor_string_tsv,
                         CompoundDescriptors.fk_descriptor_set_id).filter(
                             CompoundDescriptors.efk_dsstox_compound_id.in_(myids))
df2 = pd.DataFrame(list(query2))
from Enrichment_MySQL.enrich_mysql0.duplicate_handler_0 import handle_duplicates
from Enrichment_MySQL.enrich_mysql0.fillfp_0 import fillfp
from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.dsstox.generic_substance_compounds import GenericSubstanceCompounds
from database.dsstox.generic_substances import GenericSubstances
from database.invitrodb.mc4 import Mc4
from database.invitrodb.mc5 import Mc5
from database.invitrodb.sample import Sample
from database.session import SQLSession
import pandas as pd

# GET ALL ASSAYS FROM MC5
# query MC5 data for hitcalls and chemical IDs
mysession = SQLSession(Schemas.information_schema).get_session()
query0 = mysession.query(Compounds.dsstox_compound_id, Mc5.hitc, Mc5.aeid) \
    .join(GenericSubstanceCompounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id) \
    .join(GenericSubstances, GenericSubstances.id == GenericSubstanceCompounds.fk_generic_substance_id) \
    .join(Sample, Sample.chid == GenericSubstances.id) \
    .join(Mc4, Mc4.spid == Sample.spid) \
    .join(Mc5, Mc5.m4id == Mc4.m4id)
mc5_table = pd.DataFrame(list(query0))
# print(mc5_table.shape)
# print(mc5_table[mc5_table['aeid'] == 1086])
# sys.exit(1)

# run xgboost for files in test_aeids
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.dsstox.compounds import Compounds
from database.database_schemas import Schemas
from database.session import SQLSession

mysession = SQLSession(Schemas.qsar_schema).get_session()

x_aeid = 926
query3 = mysession.query(Compounds.dsstox_compound_id, CompoundDescriptorSets.descriptor_string_tsv) \
    .join(CompoundDescriptorSets, Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
    .filter(CompoundDescriptorSets.fk_descriptor_set_id == 1445)
# .filter(Compounds.dsstox_compound_id.in_(str(new_df.iloc[0]))) \
print(list(query3))
import pandas as pd
import subprocess as subp
import glob
from multiprocessing import Pool

from database.database_schemas import Schemas
from database.session import SQLSession


def com(i):
    print(i)
    command = subp.Popen(
        'pullenrichment "{}%" --fpenrich -o "/share/home/rlougee/Desktop/invitrodb_v2_enrichments/"'.format(i),
        shell=True)
    command.communicate()


# create a tsv of dataset name x TP/TOTAL_CHEM so that we can see the
# coverage of our Toxprint models
# get all relevant datasets
mysession = SQLSession(Schemas.qsar_schema).get_session()
datasets = [x[0] for x in mysession.execute(
    'SELECT datasets.name FROM sbox_rlougee_qsar.datasets'
    ' WHERE sbox_rlougee_qsar.datasets.name LIKE "Imported_DataTable:aeid\_%\_invitrodbv2\_20180918"')]
# print(len(datasets))
# print(datasets[180:])
# sys.exit(1)

# # get TP stats file for each dataset
# for i in datasets[38:]:
#     print(i)
#     command = subp.Popen('pullenrichment "{}%" --fpenrich -o "/share/home/rlougee/Desktop/invitrodb_v2_enrichments/"'.format(i), shell=True)
#     command.communicate()
        fk_datapoint_id = int(datapoints.id)
        dataset_datapoints = DatasetDatapoints(fk_dataset_id=fk_dataset_id,
                                               fk_datapoint_id=fk_datapoint_id,
                                               updated_by=username,
                                               created_by=username)
        mysession.add(dataset_datapoints)
    mysession.commit()


####################################################################################################################
if __name__ == "__main__":
    # QUERY MC5 data for hitcalls, chemical IDs, and assay subcategories
    mysession = SQLSession(Schemas.information_schema).get_session()
    query0 = mysession.query(Compounds.id, Compounds.dsstox_compound_id, Mc5.hitc, Mc5.aeid,
                             AssayComponentEndpoint.assay_component_endpoint_name,
                             AssayComponent.assay_component_desc,
                             AssayComponent.assay_component_target_desc,
                             AssayComponentEndpoint.assay_component_endpoint_desc,
                             AssayComponentEndpoint.assay_function_type,
                             AssayComponentEndpoint.normalized_data_type,
                             AssayComponentEndpoint.analysis_direction,
                             AssayComponentEndpoint.burst_assay,
                             AssayComponentEndpoint.key_positive_control,
                             AssayComponentEndpoint.signal_direction,
                             AssayComponentEndpoint.intended_target_type,
                             AssayComponentEndpoint.intended_target_type_sub,
                             AssayComponentEndpoint.intended_target_family,
                             AssayComponentEndpoint.intended_target_family_sub,
                             AssayComponent.assay_design_type,
                             AssayComponent.assay_design_type_sub,
                             AssayComponent.biological_process_target,
                             AssayComponent.detection_technology_type,
                             AssayComponent.detection_technology_type_sub,
                             AssayComponent.detection_technology,
                             AssayComponent.signal_direction_type,
                             AssayComponent.key_assay_reagent,
                             AssayComponent.key_assay_reagent_type,
                             AssayComponent.technological_target_type,
                             AssayComponent.technological_target_type_sub) \
        .join(GenericSubstanceCompounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id) \
        .join(GenericSubstances, GenericSubstances.id == GenericSubstanceCompounds.fk_generic_substance_id) \
        .join(Sample, Sample.chid == GenericSubstances.id) \
        .join(Mc4, Mc4.spid == Sample.spid) \
        .join(Mc5, Mc5.m4id == Mc4.m4id) \
        .join(AssayComponentEndpoint, AssayComponentEndpoint.aeid == Mc5.aeid) \
        .join(AssayComponent, AssayComponent.acid == AssayComponentEndpoint.acid)
    mytable = pd.DataFrame(list(query0))

    ####################################################################################################################
    ### MAKE TABLES FOR SUBCATEGORIES ###
# # retrieve the latest dataset for the aeid
# new_df = query1[query1['name'].isin([x_aeid])]
# # rename columns
# new_df.columns = ['dsstox_compound_id', 'hitc', 'name', 'dataset_id']
# my_dataset_id = new_df['dataset_id'].iloc[0]
# # make the enrichment table
# filluc(new_df, my_dataset_id)

####################################################################################################################
# create a connection pool for multiprocessing with sqlalchemy
if __name__ == '__main__':
    ### QUERY THE MYSQL DB FOR A COMPLETE LIST OF AEIDS, ENDPOINTS & DTXCIDS ###
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    # query the unique enrichment-table IDs
    query0 = mysession.query(UnivariateCalculations.fk_dataset_id)
    query0 = pd.DataFrame(list(query0))
    if query0.empty:
        pass
    else:
        query0 = query0.fk_dataset_id.unique()
    # query the full set of data
    # slow: already ~3.5 minutes
    query1 = mysession.query(Compounds.dsstox_compound_id, Datapoints.measured_value_dn, Datasets.name, Datasets.id) \
        .join(Datapoints, Datapoints.efk_dsstox_compound_id == Compounds.id) \
        .join(DatasetDatapoints, Datapoints.id == DatasetDatapoints.fk_datapoint_id) \
import glob
import sys

import pandas as pd

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.session import SQLSession

for f in glob.glob(
        "/home/rlougee/Desktop/invitrodb_v2_burst_splits/clean_assays/*"):
    aeid = f.split('/')[-1]
    a = pd.read_csv(f, sep='\t')
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    query0 = mysession.execute(
        'SELECT assay_component_endpoint_name FROM sbox_rlougee_invitrodb.assay_component_endpoint WHERE sbox_rlougee_invitrodb.assay_component_endpoint.aeid = {}'
        .format(aeid))
    output_name = list(query0)[0][0]
    a = a.dropna(axis=1)
    a.to_csv(
        '/home/rlougee/Desktop/invitrodb_v2_burst_splits/clean_assays2/{}.tsv'.format(output_name),
        sep='\t',
        index=False)
def main():
    args = docopt(__doc__)
    # print(args)
    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)
    # set input arguments and options to variables
    InID = args['<InID>'].lower()
    OutID = args['<OutID>'].lower()
    tsv_input = args['<tsv_file>']
    o = args['--output']
    noerror = args['--noerror']
    # validate InID & OutID
    if InID not in acceptedID:
        print(Fore.RED +
              'Invalid InID: {}\n InID must be {}'.format(InID, acceptedID))
        sys.exit(1)
    if OutID not in acceptedID:
        print(Fore.RED +
              'Invalid OutID: {}\n OutID must be {}'.format(OutID, acceptedID))
        sys.exit(1)
    # creates a table from the .tsv file;
    # takes stdin if the argument is not given directly
    if not tsv_input:
        tsv_input = sys.stdin.read()
        mytable = pd.read_csv(StringIO(tsv_input), sep="\t")
    else:
        mytable = pd.read_csv(tsv_input, sep="\t")
    # take the chemical ID column
    idrow = mytable.iloc[:, 0]
    colname = mytable.columns.values[0]
    # make an SQL query table for relevant SIDs & CIDs
    mysession = SQLSession(Schemas.dsstox_schema).get_session()
    query = mysession.query(GenericSubstances.dsstox_substance_id,
                            GenericSubstances.casrn,
                            Compounds.dsstox_compound_id) \
        .join(GenericSubstanceCompounds, GenericSubstanceCompounds.fk_generic_substance_id == GenericSubstances.id) \
        .join(Compounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id)
    df = pd.DataFrame(list(query))
    df.columns = ['dtxsid', 'casrn', 'dtxcid']
    idrow = pd.DataFrame(idrow)
    idrow.columns = [InID]
    # join to filter out unwanted IDs (idrow's column is already named InID,
    # so one merge covers the dtxcid, casrn, and dtxsid cases)
    df = pd.merge(idrow, df, on=InID, how='inner')
    df = df.drop_duplicates(InID)
    # if no IDs were returned
    if df.empty and not noerror:
        print(Fore.RED + "Error: No valid {} or no associated {}\n{}".format(
            InID, OutID, list(idrow)))
        sys.exit(1)
    elif df.empty:
        sys.exit(1)
    # creates a list of unconverted IDs
    noid = list(set(idrow.iloc[:, 0]) - set(df.iloc[:, 0]))
    # creates the new ID table
    mytable = mytable.rename(columns={colname: InID})
    mytable = pd.merge(df, mytable, on=InID, how='left')
    for i in acceptedID:
        if i != OutID:
            mytable = mytable.drop(i, axis=1)
    outputtable = mytable
    # generates a tab-separated string with line breaks at row ends
    output = '\t'.join(mytable.columns.values) + '\n'
    for i in mytable.values.tolist():
        output += '\t'.join(str(x) for x in i) + '\n'
    # output options
    if not o:
        print(output[:-1])
    else:
        outputtable.to_csv(o, sep='\t', index=False)
    # ERROR message
    # not actual STDERR; this is for the user
    if not noerror:
        print(Fore.RED + "Error: Invalid {} or no associated {}\n{}".format(
            InID, OutID, noid))
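
# An equivalent output step using pandas directly (a sketch; keeps the same
# tab-separated layout without assembling the string by hand):
# outputtable.to_csv(sys.stdout, sep='\t', index=False)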
from database.database_schemas import Schemas
from database.session import SQLSession
from database.qsar.descriptors import Descriptors
import sys
import os
import pandas as pd
import datetime

# THIS CODE IS TO BE USED ONE TIME to fill the sbox_rlougee_qsar.descriptors
# table with the fingerprint labels and ids (MySQL setup).
# add Toxcast fingerprint data to MySQL

# query AEID data
mysession = SQLSession(Schemas.qsar_schema).get_session()
aeid_data = mysession.execute(
    'SELECT aeid, assay_component_endpoint_name FROM invitrodb.assay_component_endpoint'
)
aeid_data = pd.DataFrame(list(aeid_data))
aeid_data[0] = [int(x) for x in aeid_data[0]]
aeid_data = aeid_data.sort_values(0, axis=0)
aeid_data = aeid_data.reset_index(drop=True)
# print(aeid_data)
# sys.exit(1)

# add assay_component_endpoint_name as descriptors.label;
# use aeid as descriptors_name
for i, row in aeid_data.iterrows():
    username = '******'
    # create a new datasets name entry
from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.session import SQLSession
import sys
import pandas as pd
import subprocess as subp
import glob

# create a tsv of dataset name x TP/TOTAL_CHEM so that we can see the
# coverage of our Toxprint models
# get all relevant datasets
mysession = SQLSession(Schemas.qsar_schema).get_session()
datasets = [
    x[0] for x in mysession.execute(
        'SELECT datasets.name FROM sbox_rlougee_qsar.datasets'
        ' WHERE sbox_rlougee_qsar.datasets.name LIKE "%Ori\_MC%"')
]
print(len(datasets))

# get TP stats file for each dataset
# for i in datasets[1053:]:
#     print(i)
#     command = subp.Popen('pullenrichment {} --fpenrich -o "/share/home/rlougee/Desktop/ori_mc_percent_coverage/"'.format(i), shell=True)
#     command.communicate()
# IMPORT AND CLEAN UP DATATABLES
nonburst = pd.read_csv("/home/rlougee/Desktop/primary_data/Non-Burst_MC_v2_2018_full_enrichment_results.tsv", sep='\t')
burst = pd.read_csv("/home/rlougee/Desktop/primary_data/Burst_MC_v2_2018_full_enrichment_results.tsv", sep='\t')
nonburst['name'] = nonburst['name'].str.replace('Imported_DataTable:aeid_', '') \
    .str.replace('_invitrodbv2_20180918', '').apply(int)
burst['name'] = burst['name'].str.replace('Imported_DataTable:', '') \
    .str.replace('_Burst_MC_v2_20181120', '')
print(nonburst.tail())
print(burst.tail())

# QUERY AEID AND CORRECT ASSAY NAMES
mysession = SQLSession(Schemas.information_schema).get_session()
q0 = mysession.execute('SELECT aeid, assay_component_endpoint_name FROM sbox_rlougee_invitrodb.assay_component_endpoint')
q0 = pd.DataFrame(list(q0))
# q0[1] = q0[1].apply(str)
print('q0 shape', q0.shape)
print('nonburst shape', nonburst.shape)
print('burst shape', burst.shape)

# nonburst names are integer aeids, so join on q0 column 0 (aeid);
# burst names are endpoint names, so join on q0 column 1
nonburst2 = pd.merge(nonburst, q0, how='inner', left_on=['name'], right_on=[0])
print(nonburst2.shape)
burst2 = pd.merge(burst, q0, how='inner', left_on=['name'], right_on=[1])
print(burst2.shape)
def main():
    args = docopt(__doc__)
    # print(args)
    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)
    # set input arguments and options to variables
    DataSetName = args['<DataSetName>'].lower()
    outputpath = args['--output']
    if outputpath is None:
        outputpath = ''
    image = args['--image']
    allassays = args['--allassays']
    fullenrich = args['--fullenrich']
    fpenrich = args['--fpenrich']
    a = args['--all']
    OR = float(args['--OR'])
    PV = float(args['--PV'])
    TP = float(args['--TP'])
    if a:
        image = True
        allassays = True
        fpenrich = True
    descriptor_set_id = 1445

    # # # QUERY ENRICHMENT DATA FOR YOUR DATASETNAME # # #
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    enrichment_data = mysession.execute(
        'SELECT datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value'
        ' FROM sbox_rlougee_qsar.datasets'
        ' JOIN sbox_rlougee_qsar.univariate_calculations ON sbox_rlougee_qsar.univariate_calculations.fk_dataset_id = sbox_rlougee_qsar.datasets.id'
        ' JOIN sbox_rlougee_qsar.uc_statistics ON sbox_rlougee_qsar.uc_statistics.fk_univ_calc_id = sbox_rlougee_qsar.univariate_calculations.id'
        ' JOIN sbox_rlougee_qsar.statistics ON sbox_rlougee_qsar.statistics.id = sbox_rlougee_qsar.uc_statistics.fk_statistic_id'
        ' JOIN sbox_rlougee_qsar.descriptors ON sbox_rlougee_qsar.descriptors.id = sbox_rlougee_qsar.univariate_calculations.fk_descriptor_id'
        " WHERE datasets.name LIKE '%{}%' AND fk_descriptor_set_id = {}"
        ' GROUP BY datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value'
        .format(DataSetName, descriptor_set_id))
    enrichment_data = pd.DataFrame(list(enrichment_data))
    if enrichment_data.empty:
        print(Fore.RED +
              'ERROR: DataSetName string returned no results {}'.format(DataSetName))
        sys.exit(1)

    # format the table, pivot columns, reorder;
    # null columns are dropped without dropna=False, but this adds empty rows
    enrichment_data = pd.pivot_table(enrichment_data,
                                     values=4,
                                     index=[0, 1, 2],
                                     columns=[3],
                                     dropna=False)
    enrichment_data = enrichment_data.reset_index(level=[0, 1, 2])
    enrichment_data.columns = [
        'Data Table', 'Chemotype ID', 'Chemotype Label', 'Balanced Accuracy',
        'Total Chemotypes', 'False Negatives', 'False Positives',
        'Inverse Odds Ratio', 'Inverse P-Value', 'Odds Ratio', 'P-Value',
        'True Negatives', 'True Positives'
    ]
    # drop empty rows
    enrichment_data = enrichment_data.dropna(thresh=4)
    enrichment_data = enrichment_data[[
        'Data Table', 'Chemotype ID', 'Chemotype Label', 'Total Chemotypes',
        'True Positives', 'False Positives', 'False Negatives',
        'True Negatives', 'Balanced Accuracy', 'Odds Ratio', 'P-Value',
        'Inverse Odds Ratio', 'Inverse P-Value'
    ]]

    # FIND THE BASE FOR FILE NAMES
    file_name = enrichment_data['Data Table'][0]
    file_name = file_name.split(':')[-1]

    # CREATE FOLDERS FOR RESULTS AND EDIT PATH
    if not os.path.exists(outputpath + 'CTEW_' + file_name):
        os.makedirs(outputpath + 'CTEW_' + file_name)
        outputpath = outputpath + 'CTEW_' + file_name
        os.makedirs(outputpath + '/CTEW_Results')
        os.makedirs(outputpath + '/CTEW-INV_Results')
        regoutputpath = outputpath + '/CTEW_Results/'
        invoutputpath = outputpath + '/CTEW-INV_Results/'
    else:
        print(Fore.RED + 'ERROR: output folder {} already exists'.format(
            outputpath + 'CTEW_' + file_name))
        sys.exit(1)

    # # # PULL SIGNIFICANT TOXPRINTS # # #
    enrichment_data2 = enrichment_data.dropna()
    enrichment_data2.loc[:, 'Odds Ratio'] = pd.to_numeric(
        enrichment_data2.loc[:, 'Odds Ratio'], errors='coerce')
    enrichment_data2.loc[:, 'P-Value'] = pd.to_numeric(
        enrichment_data2.loc[:, 'P-Value'], errors='coerce')
    enrichment_data2.loc[:, 'True Positives'] = pd.to_numeric(
        enrichment_data2.loc[:, 'True Positives'], errors='coerce')
    sigtxp = []
    for i, x in enrichment_data2.iterrows():
        if x['Odds Ratio'] >= OR and x['P-Value'] <= PV and x['True Positives'] >= TP:
            sigtxp.append(x['Chemotype ID'])

    # # # PULL SIGNIFICANT INVERSE TOXPRINTS # # #
    enrichment_data3 = enrichment_data.dropna()
    enrichment_data3.loc[:, 'Inverse Odds Ratio'] = pd.to_numeric(
        enrichment_data3.loc[:, 'Inverse Odds Ratio'], errors='coerce')
    enrichment_data3.loc[:, 'Inverse P-Value'] = pd.to_numeric(
        enrichment_data3.loc[:, 'Inverse P-Value'], errors='coerce')
    invsigtxp = []
    for i, x in enrichment_data3.iterrows():
        if x['Inverse Odds Ratio'] >= OR and x['Inverse P-Value'] <= PV:
            invsigtxp.append(x['Chemotype ID'])

    # CREATE SIGNIFICANT TOXPRINT AND INVERSE SIGNIFICANT TOXPRINT EXPORT TABLES
    sigtxp = pd.DataFrame(sigtxp, columns=['Chemotype ID'])
    OutputTable = pd.merge(sigtxp, enrichment_data, on='Chemotype ID', how='inner')
    invsigtxp = pd.DataFrame(invsigtxp, columns=['Chemotype ID'])
    InvOutputTable = pd.merge(invsigtxp, enrichment_data, on='Chemotype ID', how='inner')

    # CREATE FULL ENRICHMENT STATISTICS TABLE
    if fullenrich:
        enrichment_data.to_csv('{}Full_Enrichment_Stats_{}.tsv'.format(outputpath, file_name),
                               sep='\t', index=False)

    # # # ENRICHED CHEMOTYPES # # #
    # return a table containing enriched chemotypes X DTXCID
    if fpenrich:
        try:
            ct_model_row = getenrichfp(DataSetName, sigtxp, regoutputpath,
                                       'CT-Enriched_FP_' + file_name)
            OutputTable = pd.concat([OutputTable, ct_model_row])
        except Exception:
            pass
        try:
            ct_inv_model_row = getenrichfp(DataSetName, invsigtxp, invoutputpath,
                                           'CT-INV-Enriched_FP_' + file_name)
            InvOutputTable = pd.concat([InvOutputTable, ct_inv_model_row])
        except Exception:
            pass

    # # # CREATE CT IMAGES # # #
    if image:
        imagemaker(sigtxp, 'CT-Enriched_Images_' + file_name, regoutputpath)
        imagemaker(invsigtxp, 'CT-INV-Enriched_Images_' + file_name, invoutputpath)

    # # # CREATE ADDITIONAL ASSAY INFO # # #
    if allassays:
        assayspaceidentifier(sigtxp, 'CT-Enriched_TCAssays_' + file_name, regoutputpath)
        assayspaceidentifier(invsigtxp, 'CT-INV-Enriched_TCAssays_' + file_name, invoutputpath)

    # # # EXPORT CT-ENRICHED_STATS & CT-INV-ENRICHED_STATS # # #
    try:
        OutputTable.to_csv('{}CT-Enriched_Stats_{}.tsv'.format(regoutputpath, file_name),
                           sep='\t', index=False)
    except Exception:
        pass
    try:
        InvOutputTable.to_csv('{}CT-INV-Enriched_Stats_{}.tsv'.format(invoutputpath, file_name),
                              sep='\t', index=False)
    except Exception:
        pass
def filluc(invitrodbdf, mydatasetid):
    # set start time
    starttime = time.time()
    username = '******'
    descriptor_set_id = [1445, 1447, 1446, 1448]
    # descriptor_set_id = [1448]
    # 1445 = toxprints
    # 1446 = MACCS
    # 1447 = pubchem
    # 1448 = special toxprints
    for id in descriptor_set_id:
        # add the enrichment table to the mysql database
        try:
            filled_table = handle_duplicates(
                invitrodbdf.loc[:, ['dsstox_compound_id', 'hitc']])
        except Exception:
            print("DUPLICATE HANDLER FAILED: {}".format(mydatasetid))
            sys.exit(1)
        try:
            filled_table = fillfp(filled_table, id)
        except Exception:
            print("FILLFP FAILED: {}".format(mydatasetid))
            sys.exit(1)
        # filled_table = pd.DataFrame(filled_table)
        try:
            my_enrichment_table = enrich(filled_table)
        except Exception:
            print("ENRICH FAILED: {}".format(mydatasetid))
            print(filled_table.head())
            sys.exit(1)
        # add fk_descriptor_id
        ### CHECK THAT THESE ARE MATCHING! ###
        mysession2 = SQLSession(Schemas.qsar_schema).get_session()
        query3 = mysession2.query(Descriptors.id).filter(
            Descriptors.fk_descriptor_set_id == id)
        query3 = [int(i[0]) for i in query3]
        my_enrichment_table.insert(0, 'fk_descriptor_id', query3)
        for index, row in my_enrichment_table.iterrows():
            fk_dataset_id = int(mydatasetid)
            fk_descriptor_id = int(row['fk_descriptor_id'])
            univariate_calc = UnivariateCalculations(fk_dataset_id=fk_dataset_id,
                                                     fk_descriptor_id=fk_descriptor_id,
                                                     updated_by=username,
                                                     created_by=username)
            mysession2.add(univariate_calc)
            mysession2.flush()
            fk_univ_calc_id = int(univariate_calc.id)
            ### NEED TO CHANGE THIS LOOP & stat_list IF THE STATISTICS ARE CHANGED IN Enrichment_Table_Generator ###
            stat_list = [9, 10, 11, 12, 13, 4, 8, 7, 14, 15]
            count = 0
            for i in row[1:]:
                if math.isnan(i):
                    value = None
                elif math.isinf(i):
                    value = 99999999.9
                else:
                    value = float(i)
                fk_statistic_id = int(stat_list[count])
                uc_statistics = UcStatistics(value=value,
                                             fk_univ_calc_id=int(fk_univ_calc_id),
                                             fk_statistic_id=int(fk_statistic_id),
                                             created_by=username,
                                             updated_by=username)
                mysession2.add(uc_statistics)
                count += 1
        mysession2.commit()
        # mysession2.close()
    endtime = time.time()
    print('run time: {}'.format(endtime - starttime))
def cli(tsv_input, o, noerror):
    ### HELP DOCUMENTATION ###
    """
    SIDtoCID takes in a .tsv datatable with a dsstox_substance_id column
    (must be the index or one of the first two columns).
    The dsstox_substance_id column is converted to dsstox_compound_id.
    Can use a .tsv file as stdin. Default output is stdout as .tsv.

    Warning: column names are needed in the input .tsv!
    Otherwise the first row will be skipped.

    -- EXAMPLE I/O TABLES --

    INPUT: .tsv file

    | DTXSID COLUMN | ENDPOINT COLUMN |
    -----------------------------------
    | DTXSID123456  | 0               |
    | DTXSID234567  | 1               |
    | DTXSID345678  | 0               |

    EXPORT: .tsv file

    | DTXCID COLUMN | ENDPOINT COLUMN |
    -----------------------------------
    | DTXCID891011  | 0               |
    | DTXCID910111  | 1               |
    | DTXCID101112  | 0               |
    """
    # creates a table from the .tsv file;
    # takes stdin if the argument is not given directly
    if not tsv_input:
        tsv_input = sys.stdin.read()
        mytable = pd.read_csv(StringIO(tsv_input), sep="\t")
    else:
        mytable = pd.read_csv(tsv_input, sep="\t")
    # check the index and the first two columns for DTXSIDs;
    # the input table should already be in the correct format
    try:
        if mytable.iloc[0, 0][0:6] == 'DTXSID':
            idrow = mytable.iloc[:, 0]
            colname = mytable.columns.values[0]
    except Exception:
        pass
    try:
        if mytable.iloc[0, 1][0:6] == 'DTXSID':
            idrow = mytable.iloc[:, 1]
            colname = mytable.columns.values[1]
    except Exception:
        pass
    try:
        if mytable.index.values[0][0:6] == 'DTXSID':
            idrow = mytable.index.values
            mytable.index.name = 'DTXSID'
            colname = mytable.index.name
    except Exception:
        pass
    # drop empty columns
    mytable = mytable.dropna(axis='columns', how='all')
    # click.echo(mytable.columns.values)
    # make an SQL query table for relevant SIDs & CIDs
    mysession = SQLSession(Schemas.dsstox_schema).get_session()
    query = mysession.query(GenericSubstances.dsstox_substance_id,
                            Compounds.dsstox_compound_id) \
        .join(GenericSubstanceCompounds) \
        .join(Compounds)
    df = pd.DataFrame(list(query))
    idrow = pd.DataFrame(idrow)
    idrow.columns = ['dsstox_substance_id']
    df = pd.merge(idrow, df, on='dsstox_substance_id', how='inner')
    # if no DTXCIDs returned
    if df.empty and noerror:
        click.secho("Error: No valid DTXSIDs or no associated DTXCIDs\n{}".format(list(idrow)),
                    fg='red', bold=True)
        sys.exit(1)
    elif df.empty:
        sys.exit(1)
    # creates the new CID table
    mytable = mytable.rename(columns={colname: "dsstox_substance_id"})
    mytable = pd.merge(df, mytable, on='dsstox_substance_id')
    mytable = mytable.drop('dsstox_substance_id', axis=1)
    outputtable = mytable
    # generates a tab-separated string with line breaks at row ends
    output = '\t'.join(mytable.columns.values) + '\n'
    for i in mytable.values.tolist():
        output += '\t'.join(str(x) for x in i) + '\n'
    # output options
    if o == '':
        click.echo(output)
    else:
        outputtable.to_csv(o, sep='\t', index=False)
    # get IDs that were not converted
    noid = list(set(idrow.iloc[:, 0]) - set(df.iloc[:, 0]))
    # ERROR message
    # not actual STDERR; this is for the user
    if noerror:
        click.secho("Error: Invalid DTXSID or no associated DTXCID\n{}".format(noid),
                    fg='red', bold=True)
# print(wow.head())
for i in wowie['name'].unique():
    bla = wowie[wowie['name'] == i]
    blabla = pd.DataFrame(bla.iloc[:, 1:])
    print(blabla.head(), blabla.shape)
    # for idx, row in blabla.iterrows():
    #     if row[1] >= 40:
    #         # print(row[1])
    #         blabla.iloc[idx, 1] = 10
    #         # print(blabla.iloc[idx, 1])

    # get Txp index
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    query3 = mysession.query(Descriptors.index_number, Descriptors.descriptors_name) \
        .filter(Descriptors.fk_descriptor_set_id == 1445)
    descriptornames = pd.DataFrame(list(query3))
    # sort by TXP number (sorted_df avoids shadowing the builtin sorted)
    sorted_df = pd.merge(descriptornames, blabla, on='descriptors_name')
    sorted_df = sorted_df.drop('index_number', axis=1)
    # print(sorted_df.head())
    name = i.split('_')[0]
    from zMisc_Code.DATA_VISUALIZATION.barplot import barplot
    barplot(sorted_df.iloc[:, 1], sorted_df.iloc[:, 0], name)
    plt.tight_layout()
    # plt.show()
from PIL import Image
from AssaySpaceIdentifier import assayspaceidentifier
from ImageMaker import imagemaker
from fpenrich import getenrichfp
from docopt import docopt
from database.database_schemas import Schemas
from database.session import SQLSession
import sys
import os
import pandas as pd
from colorama import init, Fore

mysession = SQLSession(Schemas.qsar_schema).get_session()

# wide format: pivot the uc_statistics rows into one column per statistic
# with GROUP_CONCAT(CASE ...)
enrichment_data_1 = mysession.execute(
    "SELECT datasets.name, descriptors.descriptors_name, descriptors.label,"
    " GROUP_CONCAT(CASE WHEN abbreviation = 'CT-Total' THEN value ELSE NULL END) as 'CT-Tot',"
    " GROUP_CONCAT(CASE WHEN abbreviation = 'TP' THEN value ELSE NULL END) as 'TP',"
    " GROUP_CONCAT(CASE WHEN abbreviation = 'FP' THEN value ELSE NULL END) as 'FP',"
    " GROUP_CONCAT(CASE WHEN abbreviation = 'FN' THEN value ELSE NULL END) as 'FN',"
    " GROUP_CONCAT(CASE WHEN abbreviation = 'TN' THEN value ELSE NULL END) as 'TN',"
    " GROUP_CONCAT(CASE WHEN abbreviation = 'BA' THEN value ELSE NULL END) as 'BA',"
    " GROUP_CONCAT(CASE WHEN abbreviation = 'OR' THEN value ELSE NULL END) as 'OR',"
    " GROUP_CONCAT(CASE WHEN abbreviation = 'P-Val' THEN value ELSE NULL END) as 'P-Val',"
    " GROUP_CONCAT(CASE WHEN abbreviation = 'Inv OR' THEN value ELSE NULL END) as 'Inv OR',"
    " GROUP_CONCAT(CASE WHEN abbreviation = 'Inv P-Val' THEN value ELSE NULL END) as 'Inv P-Val'"
    " FROM sbox_rlougee_qsar.datasets"
    " JOIN sbox_rlougee_qsar.univariate_calculations ON sbox_rlougee_qsar.univariate_calculations.fk_dataset_id = sbox_rlougee_qsar.datasets.id"
    " JOIN sbox_rlougee_qsar.uc_statistics ON sbox_rlougee_qsar.uc_statistics.fk_univ_calc_id = sbox_rlougee_qsar.univariate_calculations.id"
    " JOIN sbox_rlougee_qsar.statistics ON sbox_rlougee_qsar.statistics.id = sbox_rlougee_qsar.uc_statistics.fk_statistic_id"
    " JOIN sbox_rlougee_qsar.descriptors ON sbox_rlougee_qsar.descriptors.id = sbox_rlougee_qsar.univariate_calculations.fk_descriptor_id"
    " WHERE datasets.name LIKE '%aeid\_9\_invitrodbv2%' AND fk_descriptor_set_id = 1445"
    " GROUP BY datasets.name, descriptors.descriptors_name, descriptors.label"
)
ed1 = pd.DataFrame(list(enrichment_data_1))
ed1.columns = [
    'Data Table', 'Chemotype ID', 'Chemotype Label', 'Total Chemotypes',
    'True Positives', 'False Positives', 'False Negatives', 'True Negatives',
    'Balanced Accuracy', 'Odds Ratio', 'P-Value', 'Inverse Odds Ratio',
    'Inverse P-Value'
]
# print(pd.DataFrame(list(enrichment_data_1)).head())

# long format: one row per (dataset, descriptor, statistic)
enrichment_data_2 = mysession.execute(
    "SELECT datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value"
    " FROM sbox_rlougee_qsar.datasets"
    " JOIN sbox_rlougee_qsar.univariate_calculations ON sbox_rlougee_qsar.univariate_calculations.fk_dataset_id = sbox_rlougee_qsar.datasets.id"
    " JOIN sbox_rlougee_qsar.uc_statistics ON sbox_rlougee_qsar.uc_statistics.fk_univ_calc_id = sbox_rlougee_qsar.univariate_calculations.id"
    " JOIN sbox_rlougee_qsar.statistics ON sbox_rlougee_qsar.statistics.id = sbox_rlougee_qsar.uc_statistics.fk_statistic_id"
    " JOIN sbox_rlougee_qsar.descriptors ON sbox_rlougee_qsar.descriptors.id = sbox_rlougee_qsar.univariate_calculations.fk_descriptor_id"
    " WHERE datasets.name LIKE '%aeid\_9\_invitrodbv2%' AND fk_descriptor_set_id = 1445"
    " GROUP BY datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value"
)
def getenrichfp(DataSetName, sigtxp, mypath, myfilename, dsi=1445):
    """Get enrichment data for a combined set of chemotypes."""
    # abort if there are no significant chemotypes
    if len(sigtxp) == 0:
        return None
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    MyDataSet = mysession.execute(
        'SELECT dsstox_compound_id, measured_value_dn, descriptor_string_tsv FROM sbox_rlougee_qsar.datasets'
        ' JOIN sbox_rlougee_qsar.dataset_datapoints ON sbox_rlougee_qsar.dataset_datapoints.fk_dataset_id = sbox_rlougee_qsar.datasets.id'
        ' JOIN sbox_rlougee_qsar.datapoints ON sbox_rlougee_qsar.datapoints.id = sbox_rlougee_qsar.dataset_datapoints.fk_datapoint_id'
        ' JOIN ro_stg_dsstox.compounds ON sbox_rlougee_qsar.datapoints.efk_dsstox_compound_id = ro_stg_dsstox.compounds.id'
        ' JOIN sbox_rlougee_qsar.compound_descriptor_sets ON ro_stg_dsstox.compounds.id = sbox_rlougee_qsar.compound_descriptor_sets.efk_dsstox_compound_id'
        " WHERE sbox_rlougee_qsar.datasets.name LIKE '%{}%' AND sbox_rlougee_qsar.compound_descriptor_sets.fk_descriptor_set_id = {}"
        .format(DataSetName, dsi))
    MyDataSet = pd.DataFrame(list(MyDataSet))
    MyDataSet.columns = ['Dsstox_Compound_ID', 'Hit_Call', 'Toxprint']
    # separate and name the fingerprint columns
    MyDataSet = pd.concat(
        [MyDataSet, MyDataSet['Toxprint'].str.split('\t', expand=True)], axis=1)
    MyDataSet = MyDataSet.drop('Toxprint', axis=1)
    # name the columns correctly
    query3 = mysession.query(Descriptors.descriptors_name, Descriptors.label) \
        .filter(Descriptors.fk_descriptor_set_id == dsi)
    descriptornames = pd.DataFrame(list(query3))
    for num, name in enumerate(descriptornames['label'], start=0):
        MyDataSet = MyDataSet.rename(columns={num: name})
    # drop columns that are not significant
    sigtxp = pd.DataFrame(sigtxp)
    sigtxp.columns = ['descriptors_name']
    siglabel = pd.merge(sigtxp, descriptornames, on='descriptors_name', how='inner')
    siglabel = list(siglabel['label'])
    for i in MyDataSet.columns[2:]:
        if i not in siglabel:
            MyDataSet = MyDataSet.drop(i, axis=1)
    # MyDataSet.to_csv('{}{}.tsv'.format(mypath, myfilename), sep='\t', index=False)

    # return overall balanced-accuracy calculations: build a single confusion
    # matrix for the union of significant toxprints and add it to the
    # CT-Enriched stats file
    # print(MyDataSet.head())
    model_row = pd.DataFrame(
        [['Chemotype Full Model Coverage', myfilename,
          " ".join(sigtxp['descriptors_name']), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
        columns=['Chemotype ID', 'Data Table', 'Chemotype Label',
                 'Total Chemotypes', 'True Positives', 'False Positives',
                 'False Negatives', 'True Negatives', 'Balanced Accuracy',
                 'Odds Ratio', 'P-Value', 'Inverse Odds Ratio', 'Inverse P-Value'])
    # fill the model_row confusion matrix: a compound counts as predicted
    # positive if it has any of the significant toxprints
    for index, row in MyDataSet.iterrows():
        rowsum = sum(int(x) for x in row.iloc[2:])
        if row['Hit_Call'] == 1 and rowsum > 0:
            model_row['True Positives'] += 1
        elif row['Hit_Call'] == 1 and rowsum == 0:
            model_row['False Negatives'] += 1
        elif row['Hit_Call'] == 0 and rowsum > 0:
            model_row['False Positives'] += 1
        elif row['Hit_Call'] == 0 and rowsum == 0:
            model_row['True Negatives'] += 1
    # fill the model_row statistics
    oddsratio, pvalue = stats.fisher_exact(
        [[int(model_row['True Positives']), int(model_row['False Positives'])],
         [int(model_row['False Negatives']), int(model_row['True Negatives'])]],
        alternative='greater')
    model_row['P-Value'] = pvalue
    model_row['Odds Ratio'] = oddsratio
    model_row['Total Chemotypes'] = (model_row['True Positives'] +
                                     model_row['False Positives'])
    BA = ((model_row['True Positives'] /
           (model_row['True Positives'] + model_row['False Negatives']) +
           model_row['True Negatives'] /
           (model_row['True Negatives'] + model_row['False Positives'])) / 2)
    model_row['Balanced Accuracy'] = float(BA)
    inv_oddsratio, inv_pvalue = stats.fisher_exact(
        [[int(model_row['False Positives']), int(model_row['True Positives'])],
         [int(model_row['True Negatives']), int(model_row['False Negatives'])]],
        alternative='greater')
    model_row['Inverse P-Value'] = inv_pvalue
    model_row['Inverse Odds Ratio'] = inv_oddsratio
    # print(model_row)
    return model_row
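
# A quick sanity check of the statistics above on a toy confusion matrix
# (hypothetical counts, not from any dataset): TP=30, FP=10, FN=20, TN=140.
# from scipy import stats
# oddsratio, pvalue = stats.fisher_exact([[30, 10], [20, 140]], alternative='greater')
# ba = ((30 / (30 + 20)) + (140 / (140 + 10))) / 2  # (sensitivity + specificity) / 2 ~= 0.77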
from database.database_schemas import Schemas
from database.session import SQLSession
from database.qsar.statistics import Statistics
from database.qsar.uc_statistics import UcStatistics
from database.qsar.univariate_calculations import UnivariateCalculations
from database.invitrodb.assay_component import AssayComponent
from database.invitrodb.assay_component_endpoint import AssayComponentEndpoint
from sqlalchemy import or_, and_
import sys
import click
import pandas as pd
from io import StringIO
import re

# FIRST, GET A TABLE with index = AEID and columns = assay categories plus
# toxprint enrichments (discrete hitcalls for each assay).
# This is done in two parts.
# part 1: get assay cats X aeids
mysession = SQLSession(Schemas.information_schema).get_session()
query0 = mysession.query(AssayComponentEndpoint.aeid,
                         AssayComponentEndpoint.assay_component_endpoint_name,
                         AssayComponent.assay_component_desc,
                         AssayComponent.assay_component_target_desc,
                         AssayComponentEndpoint.assay_component_endpoint_desc,
                         AssayComponentEndpoint.assay_function_type,
                         AssayComponentEndpoint.normalized_data_type,
                         AssayComponentEndpoint.analysis_direction,
                         AssayComponentEndpoint.burst_assay,
                         AssayComponentEndpoint.key_positive_control,
                         AssayComponentEndpoint.signal_direction,
                         AssayComponentEndpoint.intended_target_type,
                         AssayComponentEndpoint.intended_target_type_sub,
                         AssayComponentEndpoint.intended_target_family,
                         AssayComponentEndpoint.intended_target_family_sub,
                         AssayComponent.assay_design_type,
                         AssayComponent.assay_design_type_sub,
                         AssayComponent.biological_process_target,
                         AssayComponent.detection_technology_type,
                         AssayComponent.detection_technology_type_sub,
                         AssayComponent.detection_technology,
                         AssayComponent.signal_direction_type,
                         AssayComponent.key_assay_reagent,
                         AssayComponent.key_assay_reagent_type,
def cli():
    ### HELP DOCUMENTATION ###
    """
    Checks for updates to AEIDs (new compounds / new endpoints / new AEIDs).
    If there are any updates, qsar.datapoints, qsar.datasets, and
    qsar.dataset_datapoints are updated to include this information.
    """
    ####################################################################################################################
    # QUERY MC5 data for hitcalls and chemical IDs
    mysession = SQLSession(Schemas.information_schema).get_session()
    query0 = mysession.query(Compounds.id, Compounds.dsstox_compound_id, Mc5.hitc, Mc5.aeid) \
        .join(GenericSubstanceCompounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id) \
        .join(GenericSubstances, GenericSubstances.id == GenericSubstanceCompounds.fk_generic_substance_id) \
        .join(Sample, Sample.chid == GenericSubstances.id) \
        .join(Mc4, Mc4.spid == Sample.spid) \
        .join(Mc5, Mc5.m4id == Mc4.m4id)
    mc5_table = pd.DataFrame(list(query0))

    ####################################################################################################################
    ### QUERY DATE WHEN THE TABLE WAS LAST UPDATED ###
    # Not very useful: datasets.name dates will always differ, so it is not
    # worth querying all of the dates
    # query1 = mysession.query(Tables.UPDATE_TIME) \
    #     .filter(or_(Tables.TABLE_SCHEMA == 'invitrodb'), (Tables.TABLE_NAME == 'mc5'))
    # # format last_update query
    # last_update = str(list(query1)[0][0])[:10].replace('-', '')

    ####################################################################################################################
    def filldatasets(invitrodbdf, fd_aeid):
        username = '******'
        # create a new datasets name entry
        datasets_name = str('aeid:{}_{}'.format(
            fd_aeid, datetime.datetime.today().strftime("%Y%m%d")))
        description = "The set of hit calls from the toxcast AEID: {} taken on the date: {}" \
            .format(fd_aeid, datetime.datetime.today().strftime("%Y%m%d"))
        datasets = Datasets(name=datasets_name,
                            label=datasets_name,
                            updated_by=username,
                            created_by=username,
                            long_description=description,
                            short_description=description)
        mysession.add(datasets)
        mysession.flush()
        fk_dataset_id = int(datasets.id)
        # add the datatable to the mysql database
        for index, row in invitrodbdf.iterrows():
            efk_dsstox_compound_id = row.loc['id']
            efk_chemprop_measured_property_id = None  # leave null -CG # not nullable
            measured_value_dn = row.loc['hitc']
            created_by = username
            updated_by = username
            datapoints = Datapoints(
                efk_dsstox_compound_id=efk_dsstox_compound_id,
                efk_chemprop_measured_property_id=efk_chemprop_measured_property_id,
                measured_value_dn=measured_value_dn,
                created_by=created_by,
                updated_by=updated_by)
            mysession.add(datapoints)
            mysession.flush()
            fk_datapoint_id = int(datapoints.id)
            dataset_datapoints = DatasetDatapoints(
                fk_dataset_id=fk_dataset_id,
                fk_datapoint_id=fk_datapoint_id,
                updated_by=username,
                created_by=username)
            mysession.add(dataset_datapoints)
        mysession.commit()

    ####################################################################################################################
    ### CHECK 1) IF A TABLE EXISTS FOR THE AEID AND 2) IF THE TABLE HAS CHANGED ###
    # begin a for loop for each unique aeid
    for x_aeid in mc5_table.aeid.unique():
        # query the latest dataset for this aeid
        aeid_query = mysession.query(Datasets.name) \
            .filter(Datasets.name.like("aeid:{}/_%".format(str(x_aeid)), escape='/'))
        aeid_query = [x[0] for x in aeid_query]
        # get the latest values for the aeid
        new_df = mc5_table[mc5_table['aeid'].isin([x_aeid])]
        if not aeid_query or aeid_query == ['']:
            print("New AEID, filling mysql database for aeid: {}".format(x_aeid))
            filldatasets(new_df, x_aeid)
        else:
            # find and retrieve the newest dataset name;
            # YYYYMMDD strings sort lexicographically, so the last is the newest
            aeid_query_dates = [x.split('_')[1] for x in aeid_query]
            newest_aeid_date = sorted(aeid_query_dates)[-1]
            newest_aeid = [x for x in aeid_query if str(newest_aeid_date) in x]
            # pull the stored table and compare
            old_df = mysession.query(Datapoints.efk_dsstox_compound_id, Datapoints.measured_value_dn) \
                .join(DatasetDatapoints, Datapoints.id == DatasetDatapoints.fk_datapoint_id) \
                .join(Datasets, DatasetDatapoints.fk_dataset_id == Datasets.id) \
                .filter(Datasets.name == newest_aeid[0])
            old_df = pd.DataFrame(list(old_df))
            ## FORMAT DFs FOR COMPARING ##
            # rename columns
            old_df.columns = ['id', 'hitc']
            my_new_df = new_df.loc[:, ['id', 'hitc']]
            old_df['hitc'] = old_df['hitc'].astype(int)
            # sort dataframes
            my_new_df = my_new_df.sort_values(['id', 'hitc'])
            old_df = old_df.sort_values(['id', 'hitc'])
            # reset index
            my_new_df = my_new_df.reset_index(drop=True)
            old_df = old_df.reset_index(drop=True)
            if my_new_df.equals(old_df):
                print("no change for aeid: {}".format(x_aeid))
            else:
                print("Update, filling mysql database for aeid: {}".format(x_aeid))
                filldatasets(new_df, x_aeid)
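
# Why the sort + reset_index above matters before .equals: DataFrame.equals
# is order- and index-sensitive. A toy illustration (hypothetical values):
# a = pd.DataFrame({'id': [1, 2], 'hitc': [0, 1]})
# b = pd.DataFrame({'id': [2, 1], 'hitc': [1, 0]})
# a.equals(b)  # False: same rows, different order
# b = b.sort_values(['id', 'hitc']).reset_index(drop=True)
# a.equals(b)  # True once both are sorted with a fresh index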