# NOTE(review): whitespace-mangled paste of a cohort-preparation script. It
# mixes Python 3 print() with a Python 2 `print` statement (so it cannot run
# under either interpreter as-is), shells out to external csv_split/csv_select
# tools via os.system, and is TRUNCATED mid-expression inside
# filter_table_for_cohort() — the tail of the os.system("csv_select ...") call
# is missing from this chunk. Kept byte-identical; cannot be safely reformatted
# or repaired without the missing continuation.
print("loading..") for f in files: assert_exists(f) # check for files # extract studyid cohort_id_file = cohort_file + "_studyid" a = os.system("csv_split " + cohort_file) if not os.path.exists(cohort_id_file) else None assert_exists(cohort_id_file) # make sure we got the result # load filtered student credit table dat_cohort, datf_cohort = None, None if not os.path.exists('dat_cohort.p'): print "dat_cohort.p not found. Creating.." dat_cohort, datf_cohort = load_fields([cohort_file, 'dob']) pickle.dump([dat_cohort, datf_cohort], open('dat_cohort.p', 'wb')) else: dat_cohort, datf_cohort = pickle.load(open('dat_cohort.p', 'rb')) studyid, dob = list(dat_cohort.keys()), {} fdat_cohort = {datf_cohort[i]: i for i in range(0, len(datf_cohort))} # express dob as a function of studyid for i in studyid: dob[i] = dat_cohort[i][fdat_cohort['dob']][0] def filter_table_for_cohort(cohort_id_file, table_file): select_file = table_file + "_select.csv" if not os.path.exists(select_file): a = os.system("csv_select " + cohort_id_file + " studyid " +
# Guess data types of the columns of a CSV file, ignoring columns that carry
# no information (only one distinct value).
#
# Usage: script.py [csv_path] [str|float] [verbose]
#   - no args:   load ../test/merge.csv and print the full {type: [cols]} dict
#   - 1 arg:     load that CSV instead of the default
#   - 2 args:    print only the comma-joined column names of the given type
#   - 3+ args:   additionally print per-column detail while scanning
import os
import sys

from misc import load_fields

args = sys.argv
# Any fourth CLI argument (its value is ignored) turns on verbose reporting.
info = len(args) > 3

# Column name -> list of cell values (strings), as produced by load_fields.
dat = load_fields("../test/merge.csv" if len(args) < 2 else args[1])

# Maps "float" / "str" -> column names that have more than one distinct value.
types = {}
for k, d in dat.items():
    # A column counts as "float" only if every cell parses as a float.
    is_float = True
    for cell in d:
        try:
            float(cell)
        except Exception:
            is_float = False
            break  # one unparsable cell settles it; no need to scan the rest
    t = "float" if is_float else "str"

    distinct = len(set(d))
    # Single-valued columns carry no information (one outcome only): skip.
    if distinct > 1:
        if info:
            # Small value sets are printed verbatim; large ones as a count.
            print(t, k, distinct if distinct > 12 else set(d))
        types.setdefault(t, []).append(k)

if len(args) < 3:
    print(types)
elif args[2] in ['str', 'float']:
    # .get with a default: an empty result prints a blank line instead of
    # raising KeyError when no column of the requested type was found.
    print(','.join(types.get(args[2], [])))