def allAgesGeneralSUD(logger):
    '''Counts, for each race, the rows with any SUD and with two or more SUDs

    Both figures are read from ``tejas.sud_race_age``: the first is a plain
    row count per race, the second is the number of rows whose eleven
    substance flags sum to at least 2.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- ``{"any_sud": [...], "morethan2_sud": [...]}``. ``any_sud``
            follows the order of ``table2_config["inputs"]["races"]``;
            ``morethan2_sud`` is always ordered AA, NHPI, MR. If a query
            fails the error is logged and the lists hold whatever had been
            collected up to that point.
    '''
    try:
        countDict = {"any_sud": [], "morethan2_sud": []}
        races = table2_config["inputs"]["races"]

        # One row count per configured race
        for race in races:
            anySudQuery = SQL(
                " WITH subQ AS ( SELECT * FROM tejas.sud_race_age "
                "WHERE sud_race_age.race = {} ) "
                "SELECT count(*) FROM subQ "
            ).format(Literal(race))
            firstColumn = [row[0] for row in pgIO.getAllData(anySudQuery)]
            countDict["any_sud"].append(firstColumn[0])

        # Tally the rows that carry at least two substance flags
        tally = dict.fromkeys(("AA", "NHPI", "MR"), 0)
        for race in races:
            flagsQuery = SQL(
                " SELECT alc, cannabis, amphe, halluc, nicotin, cocaine, "
                "opioids, sedate, others, polysub, inhalant "
                "FROM tejas.sud_race_age "
                "WHERE sud_race_age.race = {} "
            ).format(Literal(race))
            for flags in pgIO.getAllData(flagsQuery):
                if sum(list(flags)) >= 2:
                    tally[race] += 1
        countDict["morethan2_sud"].extend(tally.values())
    except Exception as e:
        logger.error('Cannot find general SUD counts because of {}'.format(e))
    return countDict
def countRaceSetting(logger):
    '''Counts users of each main race (AA, NHPI, MR) per treatment setting

    For every race in ``table1_config["inputs"]["races"]`` and every
    setting in ``table1_config["params"]["settings"]["all"]`` one
    ``count(*)`` is run over ``tejas.race_age_t1new`` joined to
    ``tejas.restofusers``.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    dict
        Race -> list of counts, one per setting in configuration order.
        Races not reached before a failure keep an empty list.
    '''
    try:
        rd = {"AA": [], "NHPI": [], "MR": []}
        for race in table1_config["inputs"]["races"]:
            settings = table1_config["params"]["settings"]["all"]
            counts = [0] * len(settings)
            for position, setting in enumerate(settings):
                query = SQL(
                    " SELECT count(*) FROM tejas.race_age_t1new t1 "
                    "INNER JOIN tejas.restofusers t2 "
                    "ON t1.siteid = t2.siteid "
                    "AND t1.backgroundid = t2.backgroundid "
                    "WHERE t1.visit_type = {} AND t1.race = {} "
                ).format(Literal(setting), Literal(race))
                firstColumn = [row[0] for row in pgIO.getAllData(query)]
                counts[position] += firstColumn[0]
            rd[race] = counts
    except Exception as e:
        logger.error('countRaceSetting failed because of {}'.format(e))
    return rd
def genSUDUserKeys(logger):
    '''Writes the patientid of every SUD user to a .csv file

    Selects ``patientid`` from ``sarah.test3`` where ``sud = true`` and
    writes one row per result to ``../data/raw_data/SUDUser_keys.csv``,
    replacing any previous content. Failures are logged, not raised.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information
    '''
    try:
        query = " SELECT patientid FROM sarah.test3 WHERE sud = true "
        data = pgIO.getAllData(query)

        csvfile = "../data/raw_data/SUDUser_keys.csv"
        # newline='' is what the csv module expects; without it, platforms
        # that translate line endings emit an extra carriage return per row.
        # The with-block closes the file, so no explicit close() is needed.
        with open(csvfile, 'w+', newline='') as output:
            csv.writer(output).writerows(data)
    except Exception as e:
        logger.error(
            'Failed to generate list of SUD users because of {}'.format(e))
    return
def genAllKeys(logger):
    '''Writes every patientid in sarah.test2 to a .csv file

    Selects ``patientid`` from ``sarah.test2`` and writes one row per
    result to ``../data/raw_data/firstfilter_allkeys.csv``, replacing any
    previous content. According to the original notes, the file feeds the
    second filter (by dsmno). Failures are logged, not raised.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information
    '''
    try:
        query = " SELECT patientid FROM sarah.test2 "
        data = pgIO.getAllData(query)

        csvfile = "../data/raw_data/firstfilter_allkeys.csv"
        # newline='' is what the csv module expects; without it, platforms
        # that translate line endings emit an extra carriage return per row.
        # The with-block closes the file, so no explicit close() is needed.
        with open(csvfile, 'w+', newline='') as output:
            csv.writer(output).writerows(data)
    except Exception as e:
        logger.error(
            'Failed to generate list of patients because of {}'.format(e))
    return
def getRace(logger):
    '''Generates the data behind raceCount.csv

    Returns the race and count(race) for ALL races in
    ``raw_data.background``. According to the original notes, the result
    was used once to hand-pick the races grouped under AA, NHPI and MR in
    the json config file, after which the call was removed from the main
    module.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    list
        Whatever ``pgIO.getAllData`` returns for the query (rows of
        ``(race, count)``), or an empty list if the query fails.
    '''
    # Bound before the try so a failed query is logged and yields an empty
    # result instead of raising UnboundLocalError at the return statement.
    data = []
    try:
        query = (
            " SELECT race, COUNT(race) FROM raw_data.background "
            "GROUP BY race "
        )
        data = pgIO.getAllData(query)
    except Exception as e:
        logger.error('getRace failed because of {}'.format(e))
    return data
def countMainRace(logger):
    '''Counts the users of each main race: AA, NHPI, MR

    Runs one ``COUNT(*)`` over ``sarah.test2`` joined to ``sarah.test3``
    for every race in ``table1_config["inputs"]["races"]``.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    list
        One count per race, in configuration order; shorter (possibly
        empty) if a query failed part-way through.
    '''
    try:
        total = []
        raceCountQuery = SQL(
            " SELECT COUNT(*) FROM sarah.test2 t1 "
            "INNER JOIN sarah.test3 t2 ON t1.patientid = t2.patientid "
            "WHERE race = {} "
        )
        for race in table1_config["inputs"]["races"]:
            rows = pgIO.getAllData(raceCountQuery.format(Literal(race)))
            firstColumn = [row[0] for row in rows]
            total.append(firstColumn[0])
    except Exception as e:
        logger.error('countMainRace failed because of {}'.format(e))
    return total
def createDF_byRace_morethan2SUD(logger, race):
    '''Creates dataframe for a sample from a specified race, dependent
    variable = at least 2 sud

    Reads ``morethan2sud, age, sex, visit_type`` from
    ``tejas.sud_race_age`` for the given race (ages 12-100) and turns it
    into a design matrix: a binary ``sud`` column, dummy columns for the
    age bins 12-17 / 18-34 / 35-49, the sex dummies from 'M' onwards, the
    setting dummies up to and including 'Inpatient', and a constant
    ``intercept`` column.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information
        race {str} -- 'AA', 'NHPI', or 'MR'

    Returns:
        pandas.DataFrame -- the design matrix, or None when anything fails
            (the error is logged)
    '''
    try:
        query = SQL(
            " SELECT morethan2sud,age,sex,visit_type "
            "FROM tejas.sud_race_age "
            "WHERE age BETWEEN 12 AND 100 AND race = {} "
        ).format(Literal(race))
        data = pgIO.getAllData(query)

        main = pd.DataFrame(data={
            'sud': [d[0] for d in data],
            'age': [d[1] for d in data],
            'sex': [d[2] for d in data],
            'setting': [d[3] for d in data],
        })
        df = main.copy()

        # Change sud column to binary, dummify the other columns
        df.replace({False: 0, True: 1}, inplace=True)

        # (first age, one past last age, label). The last bin stops at 101
        # because the SQL filter admits age 100, which would otherwise stay
        # unbinned and become a stray numeric dummy column.
        ageBins = (
            (12, 18, "12-17"),
            (18, 35, "18-34"),
            (35, 50, "35-49"),
            (50, 101, "50+"),
        )
        for first, stop, label in ageBins:
            main.replace(to_replace=list(range(first, stop)),
                         value=label, inplace=True)

        # .ix was removed in pandas 1.0; these are label slices, so .loc is
        # the direct replacement.
        dummy_ages = pd.get_dummies(main['age'])
        df = df[['sud']].join(dummy_ages.loc[:, :'35-49'])

        dummy_sexes = pd.get_dummies(main['sex'])
        df = df[['sud', '12-17', '18-34', '35-49']].join(
            dummy_sexes.loc[:, 'M':])

        dummy_setting = pd.get_dummies(main['setting'])
        df = df[['sud', '12-17', '18-34', '35-49', 'M']].join(
            dummy_setting.loc[:, :'Inpatient'])

        df['intercept'] = 1.0
        return df
    except Exception as e:
        logger.error(
            'createDF_byRace_morethan2SUD failed because of {}'.format(e))
    # Explicit result for the failure path; previously this raised
    # UnboundLocalError or handed back a half-built frame.
    return None
def allAgesCategorisedSUD(logger):
    '''Finds, per substance category, the share of each race sample with
    that SUD

    For every race in ``table2_config["inputs"]["races"]`` and every
    category in ``table2_config["params"]["sudcats"]`` the matching rows of
    ``sarah.test2`` joined to ``sarah.test4`` are counted. Each category's
    list of counts is then passed through ``divByAllAges`` (defined
    elsewhere; presumably it converts counts to percentages of the race
    sample, as the original comment states).

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- category -> value returned by ``divByAllAges``; empty or
            partial if a failure was logged
    '''
    # Bound before the try so a failed query is logged and yields an empty
    # result instead of raising UnboundLocalError at the return statement.
    resultsDict = {}
    try:
        countDict = {
            "alc": [], "cannabis": [], "amphe": [], "halluc": [],
            "nicotin": [], "cocaine": [], "opioids": [], "sedate": [],
            "others": [], "polysub": [], "inhalant": []
        }
        for race in table2_config["inputs"]["races"]:
            for sudcat in table2_config["params"]["sudcats"]:
                query = SQL(
                    " SELECT count(*) FROM sarah.test2 t1 "
                    "INNER JOIN sarah.test4 t2 "
                    "ON t1.patientid = t2.patientid "
                    "WHERE t1.race = {} AND t2.{} = true "
                ).format(Literal(race), Identifier(sudcat))
                data = [d[0] for d in pgIO.getAllData(query)]
                countDict[sudcat].append(data[0])

        # Change counts to percentage of the race sample
        for row in countDict:
            resultsDict[row] = divByAllAges(countDict[row])
    except Exception as e:
        logger.error(
            'Failed to find categorised SUD counts because of {}'.format(e))
    return resultsDict
def make_dataset(logger, csvfile, query):
    '''Runs a query, dumps the rows to a .csv file and loads it as a
    dataframe

    The file is written with the fixed header
    ``sud, race, age, sex, setting`` followed by the query rows, then read
    back with ``pandas.read_csv``.

    Arguments:
        logger {logging.Logger} -- not used by this function
        csvfile {str} -- path of the .csv file to (over)write
        query -- query handed to ``pgIO.getAllData``; its rows are expected
            to line up with the five header columns

    Returns:
        pandas.DataFrame -- the content of the freshly written file
    '''
    data = pgIO.getAllData(query)  # returns list of tuples (T/F,.......)
    # newline='' is what the csv module expects; without it, platforms that
    # translate line endings emit an extra carriage return per row. The
    # with-block closes the file before it is read back.
    with open(csvfile, 'w+', newline='') as f:
        csv_out = csv.writer(f)
        csv_out.writerow(['sud', 'race', 'age', 'sex', 'setting'])
        csv_out.writerows(data)
    dataset = pd.read_csv(csvfile)
    return dataset
def ageBinnedCategorisedSUD(logger):
    '''Finds, per substance category, race and age bin, the share of the
    sample with that SUD

    For every category in ``table2_config["params"]["sudcats"]``, every
    race in ``table2_config["inputs"]["races"]`` and each of the age bins
    1-11, 12-17, 18-34, 35-49 and 50-100, the matching rows of
    ``sarah.test2`` joined to ``sarah.test4`` are counted. Each category's
    race-by-bin table is then passed through ``divByAgeBins`` (defined
    elsewhere; presumably it converts counts to percentages, as the
    original comment states).

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- category -> value returned by ``divByAgeBins``; empty or
            partial if a failure was logged
    '''
    # Bound before the try so a failed query is logged and yields an empty
    # result instead of raising UnboundLocalError at the return statement.
    resultsDict = {}
    try:
        countDict = {}
        for sudcat in table2_config["params"]["sudcats"]:
            perRace = []
            for race in table2_config["inputs"]["races"]:
                perBin = []
                for lower, upper in zip(['1', '12', '18', '35', '50'],
                                        ['11', '17', '34', '49', '100']):
                    query = SQL(
                        " SELECT count(*) FROM sarah.test2 t1 "
                        "INNER JOIN sarah.test4 t2 "
                        "ON t1.patientid = t2.patientid "
                        "WHERE t1.race = {} "
                        "AND t1.age BETWEEN {} AND {} "
                        "AND t2.{} = true "
                    ).format(
                        Literal(race),
                        Literal(lower),
                        Literal(upper),
                        Identifier(sudcat)
                    )
                    data = [d[0] for d in pgIO.getAllData(query)]
                    perBin.append(data[0])
                perRace.append(perBin)
            countDict[sudcat] = perRace

        # Change counts to percentage of the race sample
        for row in countDict:
            resultsDict[row] = divByAgeBins(countDict[row])
    except Exception as e:
        logger.error(
            'Failed to find categorised SUD counts because of {}'.format(e))
    return resultsDict
def addmorethan2sudcolumn(logger):
    '''Populates the 'morethan2sud' column in tejas.sud_race_age

    Every row of ``tejas.sud_race_age`` whose eleven substance flags sum to
    at least 2 is written to
    ``../data/raw_data/atleast2suduser_keys.csv``. The file is then read
    back and ``morethan2sud`` is set to true for each listed
    (siteid, backgroundid); finally every remaining NULL is set to false.
    Failures are logged, not raised.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information
    '''
    try:
        query = (
            " SELECT siteid, backgroundid, alc, cannabis, amphe, halluc, "
            "nicotin, cocaine, opioids, sedate, others, polysub, inhalant "
            "FROM tejas.sud_race_age "
        )
        data = pgIO.getAllData(query)

        csvfile = '../data/raw_data/atleast2suduser_keys.csv'

        # The writer handle must be closed (and therefore flushed) before
        # the file is read back, otherwise buffered rows can be missing
        # from the second pass. newline='' is what the csv module expects.
        with open(csvfile, 'w+', newline='') as output:
            csv_output = csv.writer(output)
            csv_output.writerows(
                row for row in data if sum(row[2:]) >= 2)

        with open(csvfile, newline='') as keys:
            for user in tqdm(csv.reader(keys, delimiter=",")):
                updateQuery = SQL(
                    " UPDATE tejas.sud_race_age "
                    "SET morethan2sud = true "
                    "WHERE siteid = {} AND backgroundid = {} "
                ).format(Literal(user[0]), Literal(str(user[1])))
                pgIO.commitData(updateQuery)

        # Update column's null values to false
        updateQuery2 = (
            " UPDATE tejas.sud_race_age SET morethan2sud = false "
            "WHERE morethan2sud is null "
        )
        print(pgIO.commitData(updateQuery2))
    except Exception as e:
        logger.error(
            'adding morethan2sud column to the database failed because of {}'
            .format(e))
    return
def checkTableExistence(logger, schemaName, tableName):
    '''Reports whether a table exists in the given schema

    Looks the pair up in ``information_schema.tables`` on the database
    named by the module-level ``dbName``.

    Arguments:
        logger {logging.Logger} -- not used by this function
        schemaName {str} -- schema to look in
        tableName {str} -- table to look for

    Returns:
        The first column of the first row returned for the
        ``SELECT EXISTS (...)`` query (a boolean for this statement)
    '''
    # Local import keeps the block self-contained; the rest of the file
    # already builds its queries with these psycopg2 helpers.
    from psycopg2.sql import SQL, Literal

    # Literal() quotes and escapes the two names, replacing the
    # hand-written '{}' quoting that broke (or could be abused) as soon as
    # a name contained a single quote.
    doesExistQueryString = SQL(
        " SELECT EXISTS ( SELECT 1 FROM information_schema.tables "
        "WHERE table_schema = {} AND table_name = {} ); "
    ).format(Literal(schemaName), Literal(tableName))

    doesExistFlag = pgIO.getAllData(
        doesExistQueryString, dbName=dbName)[0][0]
    return doesExistFlag
def createDF_allRaces_anySUD(logger):
    '''Creates dataframe for total sample, dependent variable = any sud

    Reads every row of ``tejas.restofusers_t3_p1`` (columns are taken
    positionally as sud, race, age, sex, setting) and turns it into a
    design matrix: a binary ``sud`` column, the race dummies ``MR`` and
    ``NHPI``, dummy columns for the age bins 12-17 / 18-34 / 35-49, the sex
    dummies from 'M' onwards, the setting dummies up to and including
    'Inpatient', and a constant ``intercept`` column.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        pandas.DataFrame -- the design matrix, or None when anything fails
            (the error is logged)
    '''
    try:
        query = " SELECT * from tejas.restofusers_t3_p1 "
        data = pgIO.getAllData(query)

        main = pd.DataFrame(data={
            'sud': [d[0] for d in data],
            'race': [d[1] for d in data],
            'age': [d[2] for d in data],
            'sex': [d[3] for d in data],
            'setting': [d[4] for d in data],
        })
        df = main.copy()

        # Change sud column to binary, dummify the other columns
        df.replace({False: 0, True: 1}, inplace=True)

        # .ix was removed in pandas 1.0; every slice below is label based,
        # so .loc is the direct replacement.
        dummy_races = pd.get_dummies(main['race'])
        df = df[['sud']].join(dummy_races.loc[:, 'MR':])

        # (first age, one past last age, label)
        # NOTE(review): ages outside 12-99 are left unbinned here, as in
        # the original - confirm the source table never contains them.
        ageBins = (
            (12, 18, "12-17"),
            (18, 35, "18-34"),
            (35, 50, "35-49"),
            (50, 100, "50+"),
        )
        for first, stop, label in ageBins:
            main.replace(to_replace=list(range(first, stop)),
                         value=label, inplace=True)

        dummy_ages = pd.get_dummies(main['age'])
        df = df[['sud', 'MR', 'NHPI']].join(dummy_ages.loc[:, :'50+'])

        dummy_sexes = pd.get_dummies(main['sex'])
        df = df[['sud', 'MR', 'NHPI', '12-17', '18-34', '35-49', '50+']].join(
            dummy_sexes.loc[:, 'M':])

        # '50+' is dropped again at this step, exactly as before.
        dummy_setting = pd.get_dummies(main['setting'])
        df = df[['sud', 'MR', 'NHPI', '12-17', '18-34', '35-49', 'M']].join(
            dummy_setting.loc[:, :'Inpatient'])

        df['intercept'] = 1.0
        return df
    except Exception as e:
        logger.error(
            'createDF_allRaces_anySUD failed because of {}'.format(e))
    # Explicit result for the failure path; previously this raised
    # UnboundLocalError or handed back a half-built frame.
    return None
def countRaceSUDppl(logger):
    '''Counts the rows of tejas.sud_race_age for each configured race

    Arguments:
        logger {logging.Logger} -- not used by this function

    Returns:
        dict -- race -> row count. AA, NHPI and MR start at 0 and keep that
            value when the query for the race yields no rows.
    '''
    result = {"AA": 0, "NHPI": 0, "MR": 0}
    for race in table2_config["inputs"]["races"]:
        query = SQL(
            " WITH subQ AS ( SELECT * FROM tejas.sud_race_age "
            "WHERE sud_race_age.race = {} ) "
            "SELECT count(*) FROM subQ "
        ).format(Literal(race))
        rows = pgIO.getAllData(query)
        # The old `data != None` test ran on a list comprehension and so
        # could never be false; testing the rows themselves is what
        # actually guards the [0] lookups.
        if rows:
            result[race] = rows[0][0]
    return result
def getData(logger, self, query, columns=None, saveData=True,
            savePath='../data/intermediate', saveName='temp'):
    '''Runs a query and returns the result as a dataframe, optionally
    pickling it

    Arguments:
        logger {logging.Logger} -- not used by this function
        self -- object providing ``dbName``, the database to query
        query -- query handed to ``pgIO.getAllData``

    Keyword Arguments:
        columns {list} -- column names to assign to the dataframe
            (default: {None}, keep the default integer labels)
        saveData {bool} -- pickle the dataframe when True (default: {True})
        savePath {str} -- folder for the pickle, created when missing
            (default: {'../data/intermediate'})
        saveName {str} -- pickle file name without the ``.pkl`` extension
            (default: {'temp'})

    Returns:
        pandas.DataFrame -- the query result
    '''
    data = pgIO.getAllData(query, dbName=self.dbName)
    df = pd.DataFrame(data)

    if columns is not None:
        df.columns = columns

    if saveData:
        os.makedirs(savePath, exist_ok=True)
        # The frame built above is the one to save and return; the
        # previous code referenced the undefined names `dataOut` and
        # `dfmake` and therefore always raised NameError.
        df.to_pickle(os.path.join(savePath, saveName + '.pkl'))
        print(saveName + '.pkl saved.')

    return df
def allAgesCategorisedSUD(logger):
    '''Counts, per substance category and race, the rows flagged with
    that SUD

    For every category in ``table2_config["params"]["sudcats"]`` and every
    race in ``table2_config["inputs"]["races"]`` the rows of
    ``tejas.sud_race_age`` with that flag set are counted.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- category -> list of raw counts, one per race in
            configuration order; the lists are shorter (possibly empty) if
            a failure was logged
    '''
    categories = (
        "alc", "cannabis", "amphe", "halluc", "nicotin", "cocaine",
        "opioids", "sedate", "others", "polysub", "inhalant",
    )
    try:
        countDict = {name: [] for name in categories}
        for sudcat in table2_config["params"]["sudcats"]:
            for race in table2_config["inputs"]["races"]:
                flaggedQuery = SQL(
                    " WITH subQ AS ( SELECT * FROM tejas.sud_race_age "
                    "WHERE sud_race_age.race = {} "
                    "AND sud_race_age.{} = true ) "
                    "SELECT count(*) from subQ "
                ).format(Literal(race), Identifier(sudcat))
                firstColumn = [
                    row[0] for row in pgIO.getAllData(flaggedQuery)]
                countDict[sudcat].append(firstColumn[0])
    except Exception as e:
        logger.error(
            'Failed to find categorised SUD counts because of {}'.format(e))
    return countDict
def ageBinnedCategorisedSUD(logger, race):
    '''Finds, for one race, the age-bin breakdown of each substance
    category

    For every category in ``table2_config["params"]["sudcats"]`` the rows
    of ``tejas.sud_race_age`` with that flag set are counted in the age
    bins 1-11, 12-17, 18-34, 35-49 and 50-100. Each bin count is then
    combined with the race's all-ages count for the category through
    ``genPC`` (defined elsewhere; presumably a percentage).

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information
        race {str} -- 'AA', 'NHPI', or 'MR'

    Returns:
        dict -- category -> list of five ``genPC`` results; empty for an
            unknown race, empty or partial if a query failure was logged
    '''
    # Position of each race in the per-category lists returned by
    # allAgesCategorisedSUD.
    # NOTE(review): this assumes the configured races are ordered
    # AA, NHPI, MR - confirm against the config.
    racePositions = {"AA": 0, "NHPI": 1, "MR": 2}
    if race not in racePositions:
        # Previously the position stayed unbound and the function only
        # failed after the totals query and five bin queries had run; the
        # result ({} plus a logged error) is unchanged.
        logger.error(
            'Failed to find categorised SUD counts because of {}'.format(
                'unknown race {!r}'.format(race)))
        return {}
    x = racePositions[race]

    # Called without arguments exactly as before; presumably the lD.log
    # decorator named above supplies the logger - confirm.
    allAgesCatSUD = allAgesCategorisedSUD()

    countDict = {}
    try:
        for sudcat in table2_config["params"]["sudcats"]:
            binCounts = []
            for lower, upper in zip(['1', '12', '18', '35', '50'],
                                    ['11', '17', '34', '49', '100']):
                query = SQL(
                    " WITH subQ AS ( SELECT * FROM tejas.sud_race_age "
                    "WHERE sud_race_age.race = {} "
                    "AND sud_race_age.age >= {} "
                    "AND sud_race_age.age <= {} "
                    "AND sud_race_age.{} = true ) "
                    "SELECT count(*) from subQ "
                ).format(Literal(race), Literal(lower), Literal(upper),
                         Identifier(sudcat))
                data = [d[0] for d in pgIO.getAllData(query)]
                binCounts.append(data[0])
            countDict[sudcat] = [
                genPC(binCount, allAgesCatSUD[sudcat][x])
                for binCount in binCounts
            ]
    except Exception as e:
        logger.error(
            'Failed to find categorised SUD counts because of {}'.format(e))
    return countDict
def countRaceAge(logger):
    '''Counts users of each main race (AA, NHPI, MR) per age bin

    For every race in ``table1_config["inputs"]["races"]`` one count is
    run over ``tejas.race_age_t1new`` joined to ``tejas.restofusers`` for
    each of the age bins 1-11, 12-17, 18-34, 35-49 and 50-100.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    dict
        Race -> list of five counts, one per age bin. Races not reached
        before a failure keep an empty list.
    '''
    ageBins = (('1', '11'), ('12', '17'), ('18', '34'),
               ('35', '49'), ('50', '100'))
    try:
        rd = {"AA": [], "NHPI": [], "MR": []}
        for race in table1_config["inputs"]["races"]:
            counts = [0, 0, 0, 0, 0]
            for slot, (lower, upper) in enumerate(ageBins):
                query = SQL(
                    " WITH subQ as ( SELECT * "
                    "FROM tejas.race_age_t1new t1 "
                    "INNER JOIN tejas.restofusers t2 "
                    "ON t1.siteid = t2.siteid "
                    "AND t1.backgroundid = t2.backgroundid "
                    "WHERE (cast (t1.age as int) >= {}) "
                    "AND (cast (t1.age as int) <= {}) "
                    "and t1.race = {} ) "
                    "SELECT count(*) FROM subQ "
                ).format(Literal(lower), Literal(upper), Literal(race))
                # Rows come back as tuples; only the first field matters
                firstColumn = [row[0] for row in pgIO.getAllData(query)]
                counts[slot] += firstColumn[0]
            rd[race] = counts
    except Exception as e:
        logger.error('countRaceAge failed because of {}'.format(e))
    return rd
def LoadData(logger, argParam):
    '''download data

    Reads ``../config/modules/loadData.json``, selects every row of the
    schema/table named under its ``saveData`` key, converts the result to
    a numpy array and saves it as ``raw_data.npy`` in the configured
    ``saveFolder``.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information
    argParam :
        Not used by this function

    Returns
    -------
    numpy.ndarray or None
        The downloaded data, or None when anything fails (the error is
        logged)
    '''
    print('We are in LoadData module.')

    try:
        # Close the config file deterministically instead of leaking the
        # handle returned by a bare open().
        with open('../config/modules/loadData.json') as configFile:
            jsonConfig = jsonref.load(configFile)

        schema = jsonConfig['saveData']['schema']
        table = jsonConfig['saveData']['table']
        saveFolder = jsonConfig['saveData']['saveFolder']

        query = sql.SQL(
            " SELECT * FROM {schema}.{table} "
        ).format(schema=sql.Identifier(schema),
                 table=sql.Identifier(table))

        data = pgIO.getAllData(query)

        # Check that the data is properly loaded
        print("-" * 10)
        data = np.array(data)

        # Save the data to the /data/raw folder
        np.save(os.path.join(saveFolder, 'raw_data.npy'), data)

        return data
    except Exception as e:
        logger.error(f'Unable to run LoadData \n {e}')
    return None
def countRaceAge(logger):
    '''Counts users of each main race (AA, NHPI, MR) per age bin

    For every race in ``table1_config["inputs"]["races"]`` one
    ``count(*)`` is run over ``sarah.test2`` joined to ``sarah.test3`` for
    each of the age bins 1-11, 12-17, 18-34, 35-49 and 50-100.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    list
        One list of five counts per race, in configuration order; shorter
        (possibly empty) if a query failed part-way through.
    '''
    ageBins = (('1', '11'), ('12', '17'), ('18', '34'),
               ('35', '49'), ('50', '100'))
    try:
        total = []
        for race in table1_config["inputs"]["races"]:
            perBin = []
            for lower, upper in ageBins:
                query = SQL(
                    " SELECT count(*) FROM sarah.test2 t1 "
                    "INNER JOIN sarah.test3 t2 "
                    "ON t1.patientid = t2.patientid "
                    "WHERE t1.age >= {} AND t1.age <= {} "
                    "and t1.race = {} "
                ).format(Literal(lower), Literal(upper), Literal(race))
                firstColumn = [row[0] for row in pgIO.getAllData(query)]
                perBin.append(firstColumn[0])
            total.append(perBin)
    except Exception as e:
        logger.error('countRaceAge failed because of {}'.format(e))
    return total
def genAllKeys(logger):
    '''Writes the (siteid, backgroundid) of every selected user to a
    .csv file

    For each race in ``table1_config["params"]["races"]["all"]`` the
    distinct (siteid, backgroundid) pairs of ``raw_data.background`` rows
    of that race that also appear in ``raw_data.pdiagnose`` are appended
    to ``../data/raw_data/allUserKeys.csv``. The file is overwritten on
    every call. According to the original notes, it feeds the second
    filter (by dsmno).

    Parameters
    ----------
    logger : {logging.Logger}
        Not used by this function
    '''
    all_userkeys = "../data/raw_data/allUserKeys.csv"

    # newline='' is what the csv module expects; without it, platforms that
    # translate line endings emit an extra carriage return per row. The
    # with-block closes the file, so no explicit close() is needed.
    with open(all_userkeys, 'w', newline='') as f:
        filewriter = csv.writer(f, delimiter=',')

        for race in table1_config["params"]["races"]["all"]:
            print("currently getting data for the " + race + " race")
            query = SQL(
                " select t1.siteid, t2.backgroundid from ( "
                "select id, siteid from raw_data.background "
                "where race = {} "
                ") as t1 inner join raw_data.pdiagnose t2 "
                "on t1.siteid = t2.siteid and t1.id = t2.backgroundid "
                "group by (t1.siteid, t2.backgroundid) "
            ).format(Literal(race))
            data = pgIO.getAllData(query)
            print("data is " + str(len(data)) + " items long")
            # An empty result simply writes nothing
            filewriter.writerows([d[0], d[1]] for d in data)
    return
def countRaceSetting(logger):
    '''Counts users of each main race (AA, NHPI, MR) per treatment setting

    For every race in ``table1_config["inputs"]["races"]`` and every
    setting in ``table1_config["inputs"]["settings"]`` one ``count(*)`` is
    run over ``sarah.test2`` joined to ``sarah.test3``.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    list
        One list of per-setting counts per race, in configuration order;
        shorter (possibly empty) if a query failed part-way through.
    '''
    try:
        total = []
        for race in table1_config["inputs"]["races"]:
            perSetting = []
            for setting in table1_config["inputs"]["settings"]:
                query = SQL(
                    " SELECT count(*) FROM sarah.test2 t1 "
                    "INNER JOIN sarah.test3 t2 "
                    "ON t1.patientid = t2.patientid "
                    "WHERE t1.visit_type = {} AND t1.race = {} "
                ).format(Literal(setting), Literal(race))
                firstColumn = [row[0] for row in pgIO.getAllData(query)]
                perSetting.append(firstColumn[0])
            total.append(perSetting)
    except Exception as e:
        logger.error('countRaceSetting failed because of {}'.format(e))
    return total
def addmorethan2sudcolumn(logger):
    '''Populates the 'morethan2sud' column in sarah.test4

    Every patient of ``sarah.test2`` joined to ``sarah.test4`` whose eleven
    substance flags sum to at least 2 is written to
    ``../data/raw_data/morethan2suduser_keys.csv``. The file is then read
    back and ``morethan2sud`` is set to True for each listed patientid;
    finally every remaining NULL is set to False. Failures are logged, not
    raised.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information
    '''
    try:
        query = (
            " SELECT t1.patientid, t2.alc, t2.cannabis, t2.amphe, "
            "t2.halluc, t2.nicotin, t2.cocaine, t2.opioids, t2.sedate, "
            "t2.others, t2.polysub, t2.inhalant "
            "FROM sarah.test2 t1 INNER JOIN sarah.test4 t2 "
            "ON t1.patientid = t2.patientid "
        )
        data = pgIO.getAllData(query)

        csvfile = '../data/raw_data/morethan2suduser_keys.csv'

        # newline='' is what the csv module expects; the with-block closes
        # (and flushes) the file before it is read back below.
        with open(csvfile, 'w+', newline='') as output:
            csv_output = csv.writer(output)
            csv_output.writerows(
                row for row in data if sum(row[1:12]) >= 2)

        # Composed like the other queries in this file instead of pasting
        # the CSV text straight into the statement.
        updateTemplate = SQL(
            " UPDATE sarah.test4 SET morethan2sud = True "
            "WHERE patientid = {} "
        )
        with open(csvfile, newline='') as f:
            for user in tqdm(csv.reader(f, delimiter=",")):
                updateQuery = updateTemplate.format(Literal(int(user[0])))
                print(pgIO.commitData(updateQuery))

        # Update column's null values to false
        updateQuery2 = (
            " UPDATE sarah.test4 SET morethan2sud = False "
            "WHERE morethan2sud is null "
        )
        print(pgIO.commitData(updateQuery2))
    except Exception as e:
        logger.error(
            'adding morethan2sud column to the databse failed because of {}'.
            format(e))
    return
def getData(logger):
    '''get data from mindlinc

    Pulls CGI severities (the target) and medications (the features) for
    patient ids 0-1000 and days 0-365 from the configured database
    version, averages the CGI per patient, one-hot encodes the distinct
    medications per patient and inner-joins the two on the patient id.
    Each intermediate frame and the combined frame are pickled under
    ``../data/raw_data`` only when the pickle does not exist yet.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    pandas.DataFrame
        One row per patient, indexed by ``patientID``, holding the
        medication dummy columns and the mean ``cgi``.
    '''
    dbName = projConfig['inputs']['dbName']
    dbVersion = projConfig['inputs']['dbVersion']

    cohortWindow = [0, 1000]
    daysWindow = [0, 365]
    windowArgs = (dbVersion,
                  cohortWindow[0], cohortWindow[1],
                  daysWindow[0], daysWindow[1])

    # get CGI data - target
    cgi_query = (
        " SELECT distinct on (severity, {0}.cgi.patientid, days) "
        "severity, {0}.cgi.patientid, days from ( "
        "select * from {0}.typepatient where age is not null "
        "and patientid >= {1} and patientid <= {2} "
        "and days >= {3} and days <= {4} ) as temp1 "
        "inner join {0}.cgi "
        "on {0}.cgi.typepatientid = temp1.typepatientid "
    ).format(*windowArgs)
    cgi_rows = pgIO.getAllData(cgi_query, dbName=dbName)
    cgi_df = pd.DataFrame(cgi_rows, columns=['cgi', 'patientID', 'days'])
    cgi_pickle = '../data/raw_data/cgi.pkl'
    if not os.path.exists(cgi_pickle):
        cgi_df.to_pickle(cgi_pickle)

    # get meds data - Features
    meds_query = (
        " SELECT distinct on (medication, {0}.meds.patientid, days) "
        "medication, {0}.meds.patientid, days from ( "
        "select * from {0}.typepatient where age is not null "
        "and patientid >= {1} and patientid <= {2} "
        "and days >= {3} and days <= {4} ) as temp1 "
        "inner join {0}.meds "
        "on {0}.meds.typepatientid = temp1.typepatientid "
    ).format(*windowArgs)
    meds_rows = pgIO.getAllData(meds_query, dbName=dbName)
    meds_df = pd.DataFrame(meds_rows, columns=['meds', 'patientID', 'days'])
    meds_pickle = '../data/raw_data/meds.pkl'
    if not os.path.exists(meds_pickle):
        meds_df.to_pickle(meds_pickle)

    # Mean CGI per patient
    cgiOut = cgi_df.drop('days', axis=1).groupby(
        ['patientID'], sort=False, as_index=False)['cgi'].mean()

    # Distinct medications per patient, then one dummy column per drug
    medsPerPatient = meds_df.drop('days', axis=1).groupby(
        'patientID', sort=False, as_index=False).agg(
            lambda x: list(x.unique()))
    medDummies = medsPerPatient['meds'].str.join('|').str.get_dummies()
    medsOut = medDummies.join(medsPerPatient[['patientID']])

    dataOut = pd.merge(medsOut, cgiOut, how='inner', on='patientID')
    dataOut.set_index('patientID', inplace=True)

    combined_pickle = '../data/raw_data/combined.pkl'
    if not os.path.exists(combined_pickle):
        dataOut.to_pickle(combined_pickle)

    return dataOut
def createDF_allRaces_morethan2SUD(logger):
    '''Creates dataframe for total sample, dependent variable = at least
    2 sud

    Reads ``morethan2sud, race, age, sex, visit_type`` from ``sarah.test2``
    joined to ``sarah.test4`` (ages 12-100) and turns it into a design
    matrix: a binary ``sud`` column, the race dummies ``MR`` and ``NHPI``,
    dummy columns for the age bins 12-17 / 18-34 / 35-49, the sex dummies
    from 'M' onwards, the setting dummies up to and including 'Hospital',
    and a constant ``intercept`` column.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        pandas.DataFrame -- the design matrix, or None when anything fails
            (the error is logged)
    '''
    try:
        query = (
            " SELECT t2.morethan2sud, t1.race, t1.age, t1.sex, "
            "t1.visit_type "
            "FROM sarah.test2 t1 INNER JOIN sarah.test4 t2 "
            "ON t1.patientid = t2.patientid "
            "WHERE t1.age BETWEEN 12 AND 100 "
        )
        data = pgIO.getAllData(query)

        main = pd.DataFrame(data={
            'sud': [d[0] for d in data],
            'race': [d[1] for d in data],
            'age': [d[2] for d in data],
            'sex': [d[3] for d in data],
            'setting': [d[4] for d in data],
        })
        df = main.copy()

        # Change sud column to binary, dummify the other columns
        df.replace({False: 0, True: 1}, inplace=True)

        # .ix was removed in pandas 1.0; every slice below is label based,
        # so .loc is the direct replacement.
        dummy_races = pd.get_dummies(main['race'])
        df = df[['sud']].join(dummy_races.loc[:, 'MR':])

        # (first age, one past last age, label). The last bin stops at 101
        # because the SQL filter admits age 100, which would otherwise stay
        # unbinned and become a stray numeric dummy column.
        ageBins = (
            (12, 18, "12-17"),
            (18, 35, "18-34"),
            (35, 50, "35-49"),
            (50, 101, "50+"),
        )
        for first, stop, label in ageBins:
            main.replace(to_replace=list(range(first, stop)),
                         value=label, inplace=True)

        dummy_ages = pd.get_dummies(main['age'])
        df = df[['sud', 'MR', 'NHPI']].join(dummy_ages.loc[:, :'35-49'])

        dummy_sexes = pd.get_dummies(main['sex'])
        df = df[['sud', 'MR', 'NHPI', '12-17', '18-34', '35-49']].join(
            dummy_sexes.loc[:, 'M':])

        dummy_setting = pd.get_dummies(main['setting'])
        df = df[['sud', 'MR', 'NHPI', '12-17', '18-34', '35-49', 'M']].join(
            dummy_setting.loc[:, :'Hospital'])

        df['intercept'] = 1.0
        return df
    except Exception as e:
        logger.error(
            'createDF_allRaces_morethan2SUD failed because of {}'.format(e))
    # Explicit result for the failure path; previously this raised
    # UnboundLocalError or handed back a half-built frame.
    return None
def popTest4(logger):
    '''Populates test4

    For every patientid listed in ``../data/raw_data/SUDUser_keys.csv``
    this asks ``rwe_version1_1.pdiagnose`` whether the patient's dsmno
    values overlap the code list configured for each substance category
    (``table2_config["params"]["sudcats"]``) and inserts the resulting row
    of booleans into ``sarah.test4``. Once every patient is inserted,
    duplicate patientid rows are removed, keeping the row with the highest
    ctid. Failures are logged, not raised.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information
    '''
    categories = (
        "alc", "cannabis", "amphe", "halluc", "nicotin", "cocaine",
        "opioids", "sedate", "others", "polysub", "inhalant",
    )
    try:
        # Everything that does not depend on the patient is built once.
        sudcats = table2_config["params"]["sudcats"]
        categoryLiterals = [Literal(sudcats[name]) for name in categories]
        getTemplate = SQL(
            " SELECT patientid, "
            + ", ".join(
                "array_agg(distinct cast(dsmno as text)) && array[{}] as "
                + name
                for name in categories)
            + " FROM rwe_version1_1.pdiagnose "
            "WHERE patientid = {} GROUP BY patientid "
        )
        pushQuery = (
            " INSERT INTO \n"
            "sarah.test4(patientid, alc, cannabis, amphe, halluc, nicotin, "
            "cocaine, opioids, sedate, others, polysub, inhalant) "
            "VALUES %s "
        )
        deleteDupliQuery = (
            " DELETE FROM sarah.test4 a USING ( "
            "SELECT MAX(ctid) as ctid, patientid FROM sarah.test4 "
            "GROUP BY patientid HAVING count(*) > 1 ) b "
            "WHERE a.patientid = b.patientid AND a.ctid <> b.ctid "
        )

        all_userkeys = "../data/raw_data/SUDUser_keys.csv"
        with open(all_userkeys, newline='') as f:
            for user in tqdm(csv.reader(f, delimiter=",")):
                getQuery = getTemplate.format(
                    *categoryLiterals, Literal(int(user[0])))
                data = pgIO.getAllData(getQuery)
                print(pgIO.commitDataList(pushQuery, data))

        # Deduplicate once, after the inserts. Running this whole-table
        # DELETE for every patient, and before that patient's insert, was
        # quadratic and never cleaned up after the final insert.
        value = pgIO.commitData(deleteDupliQuery)
        # commitData's return type is not visible here, so the original
        # equality test is kept rather than a plain truthiness check.
        if value == True:
            print("Duplicate values succesfully deleted")
    except Exception as e:
        logger.error(
            'Failed to populate test4 table because of {}'.format(e))
    return
def ageBinnedGeneralSUD(logger):
    '''Finds, per race and age bin, the share of the sample with any SUD
    and with two or more SUDs

    For every race in ``table2_config["inputs"]["races"]`` and each of the
    age bins 1-11, 12-17, 18-34, 35-49 and 50-100 this counts the
    ``sarah.test2``/``sarah.test3`` rows with ``sud = true`` and the
    ``sarah.test2``/``sarah.test4`` rows whose eleven substance flags sum
    to at least 2. Both race-by-bin tables are then passed through
    ``divByAgeBins`` (defined elsewhere; presumably it converts counts to
    percentages, as the original comment states).

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- ``{"any_sud": ..., "morethan2_sud": ...}`` holding the
            values returned by ``divByAgeBins``; empty or partial if a
            failure was logged
    '''
    lowers = ['1', '12', '18', '35', '50']
    uppers = ['11', '17', '34', '49', '100']

    # Bound before the try so a failed query is logged and yields an empty
    # result instead of raising UnboundLocalError at the return statement.
    resultsDict = {}
    try:
        countDict = {"any_sud": [], "morethan2_sud": []}

        # Find number of users in each race who have any SUD, separated
        # into age bins
        for race in table2_config["inputs"]["races"]:
            counts = []
            for lower, upper in zip(lowers, uppers):
                query = SQL(
                    " SELECT count(*) FROM sarah.test2 t1 "
                    "INNER JOIN sarah.test3 t2 "
                    "ON t1.patientid = t2.patientid "
                    "WHERE t1.race = {} "
                    "AND t1.age BETWEEN {} AND {} "
                    "AND t2.sud = true "
                ).format(Literal(race), Literal(lower), Literal(upper))
                data = [d[0] for d in pgIO.getAllData(query)]
                counts.append(data[0])
            countDict["any_sud"].append(counts)

        # Find number of users in each race who have at least 2 SUD,
        # separated into age bins (keyed by the bin's lower bound)
        count = {
            race: dict.fromkeys(lowers, 0)
            for race in ("AA", "NHPI", "MR")
        }
        for race in table2_config["inputs"]["races"]:
            for lower, upper in zip(lowers, uppers):
                query = SQL(
                    " SELECT t2.alc, t2.cannabis, t2.amphe, t2.halluc, "
                    "t2.nicotin, t2.cocaine, t2.opioids, t2.sedate, "
                    "t2.others, t2.polysub, t2.inhalant "
                    "FROM sarah.test2 t1 INNER JOIN sarah.test4 t2 "
                    "ON t1.patientid = t2.patientid "
                    "WHERE t1.race = {} "
                    "AND t1.age BETWEEN {} AND {} "
                ).format(Literal(race), Literal(lower), Literal(upper))
                for flags in pgIO.getAllData(query):
                    if sum(flags) >= 2:
                        count[race][lower] += 1
        for race in count:
            countDict["morethan2_sud"].append(list(count[race].values()))

        # Change counts to percentage of the race sample
        for row in countDict:
            resultsDict[row] = divByAgeBins(countDict[row])
    except Exception as e:
        logger.error(
            'Failed to find general SUD counts because of {}'.format(e))
    return resultsDict
def allAgesGeneralSUD(logger):
    '''Finds, per race, the share of the total sample with any SUD and
    with two or more SUDs

    For every race in ``table2_config["inputs"]["races"]`` this counts the
    rows of ``sarah.test2`` joined to ``sarah.test4`` and, separately, the
    rows whose eleven substance flags sum to at least 2. Both lists are
    then passed through ``divByAllAges`` (defined elsewhere; presumably it
    converts counts to percentages, as the original comment states).

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- ``{"any_sud": ..., "morethan2_sud": ...}`` holding the
            values returned by ``divByAllAges``; empty or partial if a
            failure was logged
    '''
    # Bound before the try so a failed query is logged and yields an empty
    # result instead of raising UnboundLocalError at the return statement.
    resultsDict = {}
    try:
        countDict = {"any_sud": [], "morethan2_sud": []}

        # Find number of users in each race who have any SUD
        for race in table2_config["inputs"]["races"]:
            query = SQL(
                " SELECT count(*) FROM sarah.test2 t1 "
                "INNER JOIN sarah.test4 t2 "
                "ON t1.patientid = t2.patientid "
                "WHERE t1.race = {} "
            ).format(Literal(race))
            data = [d[0] for d in pgIO.getAllData(query)]
            countDict["any_sud"].append(data[0])

        # Find number of users in each race who have at least 2 SUD
        count = {"AA": 0, "NHPI": 0, "MR": 0}
        for race in table2_config["inputs"]["races"]:
            query = SQL(
                " SELECT t2.alc, t2.cannabis, t2.amphe, t2.halluc, "
                "t2.nicotin, t2.cocaine, t2.opioids, t2.sedate, "
                "t2.others, t2.polysub, t2.inhalant "
                "FROM sarah.test2 t1 INNER JOIN sarah.test4 t2 "
                "ON t1.patientid = t2.patientid "
                "WHERE t1.race = {} "
            ).format(Literal(race))
            for flags in pgIO.getAllData(query):
                if sum(flags) >= 2:
                    count[race] += 1
        for race in count:
            countDict["morethan2_sud"].append(count[race])

        # Change counts to percentage of the race sample
        for row in countDict:
            resultsDict[row] = divByAllAges(countDict[row])
    except Exception as e:
        logger.error(
            'Failed to find general SUD counts because of {}'.format(e))
    return resultsDict
def genDiagCount(logger, filePath):
    '''Generates the percentage of users per race that have a certain
    diagnosis

    For each diagnosis category and each race in
    ``fig1_config["inputs"]["races"]`` the matching rows of ``sarah.test3``
    joined to ``sarah.test2`` are counted and expressed as a percentage
    (one decimal) of ``table1results[race][0]``, where ``table1results``
    is the JSON document stored at ``filePath``.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information
        filePath {str} -- path of the JSON file holding the race totals

    Returns:
        dict -- category -> list of percentages, one per race in
            configuration order; lists are shorter (possibly empty) if a
            failure was logged
    '''
    categories = (
        "mood", "anxiety", "adjustment", "adhd", "sud", "psyc", "pers",
        "childhood", "impulse", "cognitive", "eating", "smtf", "disso",
        "sleep", "fd",
    )
    try:
        resultsDict = {name: [] for name in categories}

        with open(filePath) as json_file:
            table1results = json.load(json_file)

        for category in resultsDict:
            for race in fig1_config["inputs"]["races"]:
                query = SQL(
                    " SELECT count(*) FROM sarah.test3 t1 "
                    "INNER JOIN sarah.test2 t2 "
                    "ON t1.patientid = t2.patientid "
                    "WHERE t1.{} is true AND t2.race = {} "
                ).format(Identifier(category), Literal(race))
                firstColumn = [row[0] for row in pgIO.getAllData(query)]
                share = firstColumn[0] / table1results[race][0]
                # percentages
                resultsDict[category].append(round(share * 100, 1))
    except Exception as e:
        logger.error('Failed to generate count {}'.format(e))
    return resultsDict
def ageBinnedGeneralSUD(logger):
    '''Counts, per race and age bin, the rows with any SUD and with two or
    more SUDs

    For every race in ``table2_config["inputs"]["races"]`` and each of the
    age bins 1-11, 12-17, 18-34, 35-49 and 50-100 this counts the rows of
    ``tejas.sud_race_age`` and, separately, the rows whose eleven substance
    flags sum to at least 2.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- ``{"any_sud": {race: [5 counts]}, "morethan2_sud":
            {race: [5 counts]}}`` holding raw counts. If a failure is
            logged, ``morethan2_sud`` stays ``{}`` and races not yet
            processed keep an empty list under ``any_sud``.
    '''
    ageBins = (('1', '11'), ('12', '17'), ('18', '34'),
               ('35', '49'), ('50', '100'))
    try:
        countDict = {
            "any_sud": {"AA": [], "NHPI": [], "MR": []},
            "morethan2_sud": {}
        }

        # Rows per race and age bin
        for race in table2_config["inputs"]["races"]:
            perBin = [0, 0, 0, 0, 0]
            for slot, (lower, upper) in enumerate(ageBins):
                query = SQL(
                    " WITH subQ AS ( SELECT * FROM tejas.sud_race_age "
                    "WHERE sud_race_age.race = {} "
                    "AND sud_race_age.age BETWEEN {} AND {} ) "
                    "SELECT count(*) FROM subQ "
                ).format(Literal(race), Literal(lower), Literal(upper))
                firstColumn = [row[0] for row in pgIO.getAllData(query)]
                perBin[slot] = firstColumn[0]
            countDict["any_sud"][race] = perBin

        # Rows with at least two substance flags per race and age bin
        multiple = {"AA": [], "NHPI": [], "MR": []}
        for race in table2_config["inputs"]["races"]:
            perBin = [0, 0, 0, 0, 0]
            for slot, (lower, upper) in enumerate(ageBins):
                query = SQL(
                    " SELECT alc, cannabis, amphe, halluc, nicotin, "
                    "cocaine, opioids, sedate, others, polysub, inhalant "
                    "FROM tejas.sud_race_age "
                    "WHERE sud_race_age.age >= {} "
                    "AND sud_race_age.age <= {} "
                    "AND sud_race_age.race = {} "
                ).format(Literal(lower), Literal(upper), Literal(race))
                for flags in pgIO.getAllData(query):
                    if sum(list(flags)) >= 2:
                        perBin[slot] += 1
            multiple[race] = perBin
        countDict["morethan2_sud"] = multiple
    except Exception as e:
        logger.error(
            'Failed to find general SUD counts because of {}'.format(e))
    return countDict