def get_chisquare(self, pat_var, pat_cls, name=None, state=None):
    """Chi-square test of independence between a feature value and the cluster.

    A contingency table (feature value x cluster) is built from the patients
    that appear in both mappings and handed to ``stats.chi2_contingency``.

    Args:
        pat_var: dict mapping patient id to the feature value. It is not
            modified; negative values are shifted on a private copy.
        pat_cls: dict mapping patient id to the cluster label.
        name: feature name, only used in the printed report.
        state: time-point label, only used in the printed report.

    Returns:
        The p-value returned by ``stats.chi2_contingency``.

    Raises:
        ValueError: if ``pat_var`` is empty.
    """
    if not pat_var:
        raise ValueError("pat_var is empty: no values to test")

    # Drop patients of pat_cls that have no value in pat_var.
    pat_cls_new = {pat: cls for pat, cls in pat_cls.items() if pat in pat_var}

    # Deal with negative values: shift everything so the minimum becomes 0.
    # Work on a copy so the caller's dict keeps its original values.
    min_fval = min(pat_var.values())
    if isint(min_fval) and min_fval < 0:
        shift = math.fabs(min_fval)
        values = {pat: int(fval + shift) for pat, fval in pat_var.items()}
    else:
        values = dict(pat_var)

    # Construct the observed table. Margins are deliberately not requested:
    # row/column totals must not be part of the tested frequencies, and the
    # table then has exactly one column per cluster that is actually present.
    pats = pandas.DataFrame({'variable': values, 'cluster': pat_cls_new})
    observed = pandas.crosstab(pats.variable, pats.cluster)

    # Testing.
    ch2, p, dof, expected = stats.chi2_contingency(observed=observed)
    print("%s: The p-value of %s is: %f" % (state, name, p))
    return p
def get_concatenation(self):
    """Write every listed feature into the per-visit vectors of each patient.

    For each name in ``self.feature_list`` the (patient id, time, value)
    triples are taken from the matching ``self.dataio.feature.<category>``
    container. Each numeric value is stored at index
    ``self.feature_dict[fname]`` of ``patient.patient_rec[time]``. Vectors are
    created on demand with length ``self.feature_len`` and filled with -1.

    Features that belong to none of the known categories are skipped, so they
    never pick up the triples of the previously processed feature.

    Returns:
        Tuple ``(self.patient_info, self.feature_len)``.
    """
    # (key in self.feature_name, attribute on self.dataio.feature), checked
    # in this order.
    categories = (
        ('motor', 'motor'),
        ('non-motor', 'nonmotor'),
        ('biospecimen', 'biospecimen'),
        ('image', 'image'),
        ('medication', 'medication'),
    )

    for fname in self.feature_list:
        # Get the triple list (patient id, time, feature value).
        for category, attr in categories:
            if fname in self.feature_name[category]:
                source = getattr(self.dataio.feature, attr)
                tpl_list = source.feature_info[fname]
                break
        else:
            # Unknown category: nothing to read for this feature.
            continue

        # patient id -> list of (time stamp, feature value)
        pat_record = dict()
        for tpl in tpl_list:
            raw = tpl[2]
            if isint(raw):
                fval = int(raw)
            elif isfloat(raw):
                fval = float(raw)
            else:
                continue  # non-numeric values are ignored
            pat_id = tpl[0]
            time = convert_int(tpl[1])
            if pat_id not in self.patient_id:
                continue
            pat_record.setdefault(pat_id, list()).append((time, fval))

        # Store the records into the Patient objects.
        fidx = self.feature_dict[fname]  # index of the feature dimension
        for pat_id, tf_list in pat_record.items():
            patient = self.patient_info[pat_id]
            for time, fval in tf_list:
                if time not in patient.patient_rec:
                    # -1 marks dimensions that have no value at this time.
                    patient.patient_rec[time] = numpy.zeros(
                        self.feature_len, dtype='float32') - 1
                patient.patient_rec[time][fidx] = fval
            self.patient_info[pat_id] = patient

    return (self.patient_info, self.feature_len)
def get_hy_stage(self):
    """Return the latest recorded H&Y value of every patient.

    Only features contained in
    ``self.get_feature_set('MDS UPDRS PartIII', 'H&Y')`` are read from
    ``self.feature_info``. For each patient the numeric records are ordered
    by time stamp and the value of the last one is kept. The resulting dict
    (patient id -> value) is printed and returned.
    """
    stage_by_patient = dict()
    all_features = self.feature_info
    group = 'MDS UPDRS PartIII'
    item = 'H&Y'

    for feature_key, triples in all_features.items():
        if feature_key not in self.get_feature_set(group, item):
            continue

        # patient id -> list of (time stamp, value)
        visits = dict()
        for triple in triples:
            raw = triple[2]
            if isint(raw):
                value = int(raw)
            elif isfloat(raw):
                value = float(raw)
            else:
                continue
            patient = triple[0]
            stamp = convert_int(triple[1])
            visits.setdefault(patient, list()).append((stamp, value))

        # Order each patient's visits by time stamp and keep the last value.
        for patient, timeline in visits.items():
            timeline.sort(key=operator.itemgetter(0))
            stage_by_patient[patient] = timeline[-1][1]

    print(stage_by_patient)
    return stage_by_patient
def get_biospecimen(self, dataio, K, fname=None, featname=None):
    """Summarize a biospecimen feature per cluster and test it across clusters.

    Records of every feature in ``self.get_feature_set(fname, featname)`` are
    read from ``dataio.feature.biospecimen.feature_info`` for the patients
    listed in ``dataio.patient_cluster``. First / median / last values and
    the last-vs-first difference are collected through
    ``self.get_feature_value`` and ``self.get_feature_diff``.

    Side effects: resets ``self.BIO``, ``self.BIO_first`` and
    ``self.BIO_median`` for clusters ``'1'..str(K)``; fills ``self.mean``,
    ``self.mean_first``, ``self.mean_median`` and ``self.mean_total``;
    appends one row to ``self.p_value``; prints a report.

    Args:
        dataio: object providing ``patient_cluster`` (patient id -> cluster)
            and ``feature.biospecimen.feature_info``.
        K: number of clusters; cluster labels are 1..K.
        fname: feature group name; ``'CSF'`` selects the CSF-specific tests.
        featname: optional feature name inside the group.
    """
    pat_cluster = dataio.patient_cluster
    feat_info = dataio.feature.biospecimen.feature_info

    # Initialization for each cluster.
    for i in range(1, K + 1):
        self.BIO[str(i)] = list()
        self.BIO_first[str(i)] = list()
        self.BIO_median[str(i)] = list()

    # Initialization for patients and the corresponding feature values.
    pat_fval_first = dict()   # patient id : first feature value
    pat_fval_median = dict()  # patient id : median feature value
    pat_fval_last = dict()    # patient id : last feature value
    pat_fval_diff = dict()    # patient id : difference between last and first

    for fn, tpl_list in feat_info.items():
        if fn not in self.get_feature_set(fname, featname):
            continue

        # patient id -> list of (time stamp, feature value)
        pat_record = dict()
        for tpl in tpl_list:
            raw = tpl[2]
            if isint(raw):
                fv = int(raw)
            elif isfloat(raw):
                fv = float(raw)
            else:
                continue  # non-numeric values are ignored
            pat = tpl[0]
            time = convert_int(tpl[1])
            if pat not in pat_cluster:
                continue
            pat_record.setdefault(pat, list()).append((time, fv))

        # Sort each patient's records by time stamp, then store the
        # first / median / last values and the difference.
        for pat, tf_list in pat_record.items():
            tf_list = sorted(tf_list, key=operator.itemgetter(0))
            pat_fval_first, pat_fval_median, pat_fval_last = \
                self.get_feature_value(pat, tf_list, pat_fval_first,
                                       pat_fval_median, pat_fval_last)
            pat_fval_diff = self.get_feature_diff(pat, tf_list, pat_fval_diff)

    # Store feature values according to subtypes.
    for pat, cls in pat_cluster.items():
        if pat in pat_fval_last:
            self.BIO[str(cls)].append(pat_fval_last[pat])
        if pat in pat_fval_first:
            self.BIO_first[str(cls)].append(pat_fval_first[pat])
        if pat in pat_fval_median:
            self.BIO_median[str(cls)].append(pat_fval_median[pat])

    # Compute statistics: mean, std.
    stats = Statistics(K)
    mean_BIO, std_BIO = stats.get_mean_std(self.BIO, is_total=False)
    mean_total_BIO, std_total_BIO = stats.get_mean_std(self.BIO,
                                                       is_total=True)
    mean_BIO_first, std_BIO_first = stats.get_mean_std(self.BIO_first,
                                                       is_total=False)
    mean_total_BIO_first, std_total_BIO_first = stats.get_mean_std(
        self.BIO_first, is_total=True)
    mean_BIO_median, std_BIO_median = stats.get_mean_std(self.BIO_median,
                                                         is_total=False)
    mean_total_BIO_median, std_total_BIO_median = stats.get_mean_std(
        self.BIO_median, is_total=True)

    # Key under which results are stored: "<fname>-<featname>" or fname.
    if featname is not None:
        fname_ = fname + '-' + featname
    else:
        fname_ = fname

    self.mean[fname_] = list()
    self.mean_first[fname_] = list()
    self.mean_median[fname_] = list()
    for i in range(1, K + 1):
        self.mean[fname_].append((mean_BIO[str(i)], std_BIO[str(i)]))
        self.mean_first[fname_].append(
            (mean_BIO_first[str(i)], std_BIO_first[str(i)]))
        self.mean_median[fname_].append(
            (mean_BIO_median[str(i)], std_BIO_median[str(i)]))

    # Display.
    for i in range(1, K + 1):
        print('### CLUSTER %d ####' % i)
        print(
            "The average %s value in the %d-th clusters at follow-up is: %f (%f)"
            % (fname, i, mean_BIO[str(i)], std_BIO[str(i)]))
        print(
            "The average value in the %d-th clusters at baseline is: %f (%f)"
            % (i, mean_BIO_first[str(i)], std_BIO_first[str(i)]))
        print(
            "The average value in the %d-th clusters at median is: %f (%f)"
            % (i, mean_BIO_median[str(i)], std_BIO_median[str(i)]))
    print("The total average %s at follow-up is: %f (%f)" %
          (fname, mean_total_BIO, std_total_BIO))
    print("The total average %s at baseline is: %f (%f)" %
          (fname, mean_total_BIO_first, std_total_BIO_first))
    print("The total average %s at median is: %f (%f)" %
          (fname, mean_total_BIO_median, std_total_BIO_median))
    print("##########")

    # Hypothesis testing.
    print("hypothesis testing...")
    # Stays None when no test applies, so the code below never reads an
    # unbound name.
    p_last = None
    if fname == 'CSF':
        stats.get_distribution(pat_fval_last, is_num=True,
                               is_discretization=True)
        if featname in ('Total tau', 'Abeta 42'):
            p_last = stats.get_f_oneway(pat_fval_last, pat_cluster, fname_,
                                        'STATIC')
        elif featname in ('p-Tau181P', 'CSF Alpha-synuclein'):
            p_last = stats.get_kruskal(pat_fval_last, pat_cluster, fname_,
                                       'STATIC')
        # Any other CSF feature has no test assigned; p_last is recorded
        # as None.
        self.p_value.append([fname_, None, None, p_last, None])
    else:
        stats.get_distribution(pat_fval_first, is_num=True)
        p_first = stats.get_chisquare(pat_fval_first, pat_cluster, fname_,
                                      'FIRST')
        stats.get_distribution(pat_fval_median, is_num=True)
        p_median = stats.get_chisquare(pat_fval_median, pat_cluster, fname_,
                                       'MEDIAN')
        stats.get_distribution(pat_fval_last, is_num=True)
        p_last = stats.get_chisquare(pat_fval_last, pat_cluster, fname_,
                                     'LAST')
        stats.get_distribution(pat_fval_diff, is_num=True)
        p_diff = stats.get_chisquare(pat_fval_diff, pat_cluster, fname_,
                                     'DIFFERENCE')
        # Store into self.p_value.
        self.p_value.append([fname_, p_first, p_median, p_last, p_diff])

    # Post hoc test; only the LAST values are followed up here.
    if p_last is not None and p_last <= 0.05:
        stats.get_tukeyhsd(pat_fval_last, pat_cluster, fname_, 'LAST')

    self.mean_total[fname_] = (mean_total_BIO, std_total_BIO)
    print('-----------------------')
def get_motor(self, dataio, K, fname=None, featname=None):
    """Summarize a motor feature per cluster and test it across clusters.

    Records of every feature in ``self.get_feature_set(fname, featname)`` are
    read from ``dataio.feature.motor.feature_info`` for the patients listed
    in ``dataio.patient_cluster``. First / median / last values and the
    last-vs-first difference come from ``self.get_feature_value`` and
    ``self.get_feature_diff``.

    Side effects: resets ``self.MOTOR``, ``self.MOTOR_first`` and
    ``self.MOTOR_median`` for clusters ``'1'..str(K)``; fills ``self.mean``,
    ``self.mean_first``, ``self.mean_median`` and ``self.mean_total``;
    appends one row to ``self.p_value``; prints a report.

    Args:
        dataio: object providing ``patient_cluster`` (patient id -> cluster)
            and ``feature.motor.feature_info``.
        K: number of clusters; cluster labels are 1..K.
        fname: feature group name; ``'MDS UPDRS PartIV'`` is tested with
            ``get_fisher_exact``, every other group with ``get_chisquare``.
        featname: optional feature name inside the group.
    """
    clusters = dataio.patient_cluster
    motor_records = dataio.feature.motor.feature_info
    cluster_keys = [str(idx) for idx in range(1, K + 1)]

    # Fresh accumulators for every cluster.
    for key in cluster_keys:
        self.MOTOR[key] = list()
        self.MOTOR_first[key] = list()
        self.MOTOR_median[key] = list()

    # patient id -> value at the first / median / last visit, and the
    # difference between last and first.
    first_vals = dict()
    median_vals = dict()
    last_vals = dict()
    diff_vals = dict()

    # Read and store.
    for feature_key, triples in motor_records.items():
        if feature_key not in self.get_feature_set(fname, featname):
            continue

        # patient id -> list of (time stamp, value)
        visits = dict()
        for triple in triples:
            raw = triple[2]
            if isint(raw):
                value = int(raw)
            elif isfloat(raw):
                value = float(raw)
            else:
                continue
            patient = triple[0]
            stamp = convert_int(triple[1])
            if patient not in clusters:
                continue
            visits.setdefault(patient, list()).append((stamp, value))

        # Chronological order per patient.
        ordered = {
            patient: sorted(timeline, key=operator.itemgetter(0))
            for patient, timeline in visits.items()
        }

        for patient, timeline in ordered.items():
            first_vals, median_vals, last_vals = self.get_feature_value(
                patient, timeline, first_vals, median_vals, last_vals)
            diff_vals = self.get_feature_diff(patient, timeline, diff_vals)

    # Distribute the values over the clusters.
    for patient, cluster in clusters.items():
        key = str(cluster)
        if patient in last_vals:
            self.MOTOR[key].append(last_vals[patient])
        if patient in median_vals:
            self.MOTOR_median[key].append(median_vals[patient])
        if patient in first_vals:
            self.MOTOR_first[key].append(first_vals[patient])

    # Mean and standard deviation, per cluster and overall.
    stats = Statistics(K)
    mean_last, std_last = stats.get_mean_std(self.MOTOR, is_total=False)
    mean_mid, std_mid = stats.get_mean_std(self.MOTOR_median,
                                           is_total=False)
    mean_base, std_base = stats.get_mean_std(self.MOTOR_first,
                                             is_total=False)
    total_mean_last, total_std_last = stats.get_mean_std(self.MOTOR,
                                                         is_total=True)
    total_mean_mid, total_std_mid = stats.get_mean_std(self.MOTOR_median,
                                                       is_total=True)
    total_mean_base, total_std_base = stats.get_mean_std(self.MOTOR_first,
                                                         is_total=True)

    # Key under which results are stored: "<fname>-<featname>" or fname.
    label = fname if featname is None else fname + '-' + featname

    self.mean[label] = [(mean_last[key], std_last[key])
                        for key in cluster_keys]
    self.mean_first[label] = [(mean_base[key], std_base[key])
                              for key in cluster_keys]
    self.mean_median[label] = [(mean_mid[key], std_mid[key])
                               for key in cluster_keys]

    # Display.
    print('feature name: %s' % (fname if featname is None else featname))
    for number, key in enumerate(cluster_keys, start=1):
        print('### CLUSTER %d ####' % number)
        print(
            "The average value in the %d-th clusters at follow-up is: %f (%f)"
            % (number, mean_last[key], std_last[key]))
        print(
            "The average value in the %d-th clusters at baseline is: %f (%f)"
            % (number, mean_base[key], std_base[key]))
        print(
            "The average value in the %d-th clusters at median is: %f (%f)"
            % (number, mean_mid[key], std_mid[key]))
    print("The total average %s at follow-up is: %f (%f)" %
          (fname, total_mean_last, total_std_last))
    print("The total average %s at median is: %f (%f)" %
          (fname, total_mean_mid, total_std_mid))
    print("The total average %s at baseline is: %f (%f)" %
          (fname, total_mean_base, total_std_base))
    print("##########")

    # Hypothesis testing: Fisher exact for Part IV, chi-square otherwise.
    print("hypothesis testing...")
    if fname == 'MDS UPDRS PartIV':
        run_test = stats.get_fisher_exact
    else:
        run_test = stats.get_chisquare

    stages = (
        (first_vals, 'FIRST'),
        (median_vals, 'MEDIAN'),
        (last_vals, 'LAST'),
        (diff_vals, 'DIFFERENCE'),
    )
    p_values = list()
    for values, stage in stages:
        stats.get_distribution(values, is_num=True)
        p_values.append(run_test(values, clusters, label, stage))
    self.p_value.append([label] + p_values)

    # Post hoc test for every significant stage.
    for (values, stage), p in zip(stages, p_values):
        if p <= 0.05:
            stats.get_tukeyhsd(values, clusters, label, stage)

    self.mean_total[label] = (total_mean_last, total_std_last)
    print('-----------------------')
def load_subtype(self, patient_id, patient_cluster, timestamp='baseline'):
    """Label patients with a motor subtype from their TD / PIGD score ratio.

    For every feature in ``self.get_feature_set('Motor Subtype')`` one value
    per patient is picked according to ``timestamp``. The value is added to
    the patient's PIGD sum when the feature is one of the five PIGD items,
    otherwise to the TD sum. Patients that have both sums get
    ``patient_cluster[pat]`` set to:

        '1'  TD subtype            (ratio >= 1.15)
        '2'  PIGD subtype          (ratio <= 0.90)
        '3'  indeterminate subtype (anything in between)

    where ratio = (TD sum / 11) / (PIGD sum / 5 + epsilon).

    Args:
        patient_id: collection of patient ids to consider.
        patient_cluster: dict updated in place (patient id -> subtype label).
        timestamp: ``'baseline'`` (earliest record), ``'follow-up'`` (latest
            record) or ``'median'`` (middle record, or the mean of the two
            middle records when the count is even).

    Raises:
        ValueError: if a value has to be selected and ``timestamp`` is not
            one of the three supported names.
    """
    epsilon = sys.float_info.epsilon  # keeps the ratio finite for PIGD == 0
    feat_info = self.feature_info
    fname = 'Motor Subtype'

    # Divisors of the two means. 5 equals the size of PIGD_set below; 11 is
    # presumably the number of TD items in the feature set - TODO confirm.
    td_item_count = 11
    pigd_item_count = 5

    pat_TD_sum = dict()    # pid: summed feature values related with TD
    pat_PIGD_sum = dict()  # pid: summed feature values related with PIGD
    PIGD_set = {'NP2WALK', 'NP2FREZ', 'NP3GAIT', 'NP3FRZGT', 'NP3PSTBL'}

    for fn, tpl_list in feat_info.items():
        if fn not in self.get_feature_set(fname):
            continue

        # patient id -> list of (time stamp, feature value)
        pat_record = dict()
        for tpl in tpl_list:
            raw = tpl[2]
            if isint(raw):
                fv = int(raw)
            elif isfloat(raw):
                fv = float(raw)
            else:
                continue  # non-numeric values are ignored
            pat = tpl[0]
            time = convert_int(tpl[1])
            if pat not in patient_id:
                continue
            pat_record.setdefault(pat, list()).append((time, fv))

        sums = pat_PIGD_sum if fn in PIGD_set else pat_TD_sum

        for pat, tf_list in pat_record.items():
            # Sort according to time stamp before picking the value.
            tf_list = sorted(tf_list, key=operator.itemgetter(0))

            if timestamp == 'baseline':
                pat_fval = tf_list[0][1]
            elif timestamp == 'follow-up':
                pat_fval = tf_list[-1][1]
            elif timestamp == 'median':
                middle = len(tf_list) // 2
                if len(tf_list) % 2 == 1:
                    pat_fval = tf_list[middle][1]
                else:
                    pat_fval = (tf_list[middle][1] +
                                tf_list[middle - 1][1]) / 2
                    # NOTE(review): unlabelled debug output kept from the
                    # original implementation.
                    print(len(tf_list))
            else:
                # Without this the previous patient's value would be reused.
                raise ValueError(
                    "timestamp must be 'baseline', 'follow-up' or 'median', "
                    "got %r" % (timestamp,))

            sums[pat] = sums.get(pat, 0) + pat_fval

    # Categorization according to motor subtypes. The means are computed
    # without touching the sums, so an id listed twice gets the same result.
    for pat in patient_id:
        if pat in pat_PIGD_sum and pat in pat_TD_sum:
            td_mean = pat_TD_sum[pat] / td_item_count
            pigd_mean = pat_PIGD_sum[pat] / pigd_item_count
            motor_ratio = td_mean / (pigd_mean + epsilon)
            if motor_ratio >= 1.15:
                patient_cluster[pat] = '1'  # TD subtype
            elif motor_ratio <= 0.90:
                patient_cluster[pat] = '2'  # PIGD subtype
            else:
                patient_cluster[pat] = '3'  # indeterminate subtype
    print(patient_cluster)