def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False, counter=0):
    '''inserts data from the specified csv and corresponding columns

    Marshall-predictor aggregation variant: counts, per patient, how often
    each (truncated) code matching regex_string occurs inside the patient's
    registration window, appends the counts to id2data[...]['data'], pickles
    the results, and returns the attribute names plus row statistics.

    Returns:
        (attribute_names, num_total, num_pos, suffix)
    '''
    # BUG FIX: original read "important_featx ures" (broken identifier, a
    # syntax error); all later references use "important_features"
    important_features = ['CHOLBMT', 'RRDIKA', 'RRSYKA']

    # make convenient reference to the dictionary
    dct = self.id2data
    # replace NaN cells with None so missing values are easy to test for
    rows = rows.where((pd.notnull(rows)), None)

    # get the index of the relevant columns; +1 because itertuples() puts
    # the dataframe index at position 0
    code_idx = headers.index(code_column) + 1
    date_idx = headers.index(date_column[0]) + 1

    # regex pattern to match (ATC/ICPC standards)
    pattern = re.compile(regex_string)

    if 'lab_results' in suffix:
        # first pass: collect all numeric values per lab code so that
        # low/high bounds can be derived from the data distribution
        values_dict = dict()
        # patient -> {measurement name -> [(date, value), ...]}
        # (hoisted out of the loop; the original created it lazily via a
        # locals() check and would NameError below if no row ever matched)
        ID2abstractions = dict()
        for row in rows.itertuples():  # line in de data
            code = row[code_idx]
            # if we do not know the high and low values, determine by data distribution
            if code not in important_features:
                try:
                    value = float(row.valuen)
                except (ValueError, TypeError):
                    continue
                if code not in values_dict:
                    values_dict[code] = [value]
                else:
                    values_dict[code].append(value)
        minmax_dict = self.calculate_minmax(values_dict, pattern, limit)

    if incorporate_SOEP:
        SOEP_idx = headers.index(incorporate_SOEP)

    # number of rows attributed to a positive target patient, and total rows seen
    num_pos = 0
    num_total = 0
    attribute_count = dict()
    # attribute name -> {patient ID -> occurrence count}
    attribute2ids = dict()

    row_cap = 1000000000000000000  # effectively unlimited; was builtin-shadowing 'max'
    current = 0
    for row in tqdm(rows.itertuples()):
        current += 1
        if current > row_cap:
            break
        else:
            num_total += 1

        # if key is not in the data dictionary, we skip it
        key = row.Index
        if key not in dct:
            continue
        if dct[key]['stroke_dates'][0] != 'negative':
            num_pos += 1

        # init other vars
        date = str2date(row[date_idx], give_default_begin=True, give_default_end=True)
        begin = dct[key]['stroke_dates'][1]
        end = dct[key]['stroke_dates'][2]
        if code_column == 'specialisme':
            # presumably specialist data near the window end is unreliable -- TODO confirm
            end = end - four_weeks()

        original_code = row[code_idx]
        if original_code is None:
            continue
        truncated_code = self.generate_code(original_code, limit)
        # skip missing codes and the stroke/TIA target codes themselves
        if truncated_code is None or truncated_code in ['K90', 'K89', 'k90', 'k89']:
            continue
        # this variant only keeps Marshall predictors
        if not self.marshall_predictor(truncated_code, code_column):
            continue

        # if in the required interval and code is valid
        if (begin <= date and date <= end) and pattern.match(truncated_code):
            if 'lab_results' in suffix:  # if we prepare for lab result abstraction
                try:
                    val = float(row.valuen)
                    if original_code not in important_features:
                        min_val = minmax_dict[truncated_code]['low_bound']
                        max_val = minmax_dict[truncated_code]['high_bound']
                    else:
                        min_val, max_val = self.determine_minmax(original_code)
                except (ValueError, TypeError):
                    continue

                util.init_key(ID2abstractions, key, dict())
                util.init_key(ID2abstractions[key], original_code, [])
                ID2abstractions[key][original_code].append((date, val))

                if '' not in [val, min_val, max_val]:
                    attr = get_value(val, min_val, max_val, original_code)
                    if attr not in attribute_count:
                        attribute_count[attr] = 0
                    # check if attribute name and ID instance already exist, if not, make them
                    util.init_key(attribute2ids, attr, dict())
                    util.init_key(attribute2ids[attr], key, 0)
                    # add 1 to the occurrence of the attribute in the instance
                    attribute2ids[attr][key] += 1
                    attribute_count[attr] += 1
            else:
                # no lab result collection: regular aggregation
                if 'cardiometabolism' in suffix:
                    value = str(row.valuec)
                else:
                    value = None
                attributes = self.generate_attributes(original_code, limit, suffix, value, src=code_column)
                # this loop allows multiple attributes to be created in the previous code line
                # this allows for other classes to subclass this class, e.g. StandardEnrichProcess
                for attr in attributes:
                    if attr not in attribute_count:
                        attribute_count[attr] = 0
                    # check if attribute name and ID instance already exist, if not, make them
                    util.init_key(attribute2ids, attr, dict())
                    util.init_key(attribute2ids[attr], key, 0)
                    # smoking attributes are binary: never count beyond 1
                    if 'smoking' in suffix:
                        if attribute2ids[attr][key] == 1:
                            continue
                    if 'allergies' in suffix:
                        # only a positive test ('POS') counts as having the allergy;
                        # negative or untested is recorded as 0
                        if row.flag == 'POS':
                            attribute2ids[attr][key] = 1
                        else:
                            attribute2ids[attr][key] = 0
                    else:
                        attribute2ids[attr][key] += 1
                    attribute_count[attr] += 1

    for attr, count in attribute_count.items():
        # min_val/max_val only exist when the lab-results branch ran at least once
        try:
            self.statistics[attr + '_count/min/max'] = [count, min_val, max_val]
        except UnboundLocalError:
            self.statistics[attr + '_count'] = count

    if 'lab_results' in suffix:
        # convert the collected measurements to trends PER lab result
        for ID in ID2abstractions:
            for k, points in ID2abstractions[ID].items():
                # the values are deduplicated and sorted before abstraction
                points = sorted(set(points))
                # if only 1 measurement was done, we cannot do time series analysis
                if len(points) > 1 and ID in dct:
                    abstractions = get_trends(k, points)
                    for attr in abstractions:
                        attr = attr[0]  # get the state
                        util.init_key(attribute2ids, attr, dict())
                        util.init_key(attribute2ids[attr], ID, 0)
                        attribute2ids[attr][ID] += 1

    # add the per-attribute counts to each instance (0 when absent)
    to_save = {ID: [] for ID in dct}
    for ID in dct:
        data = dct[ID]['data']
        for id2occurrences in attribute2ids.values():
            occurrences = id2occurrences.get(ID, 0)
            data.append(occurrences)
            to_save[ID].append(occurrences)

    save_obj(self.statistics, self.in_dir + suffix[0] + '_statistics.pkl')
    if self.survival == True:
        save_obj(to_save, self.in_dir + suffix[0] + '_dict_marshall' + str(counter) + '_survival' + '.pkl')
    else:
        save_obj(to_save, self.in_dir + suffix[0] + '_dict_marshall' + str(counter) + '.pkl')
    # both branches saved identical headers; hoisted out of the if/else
    save_obj(list(attribute2ids.keys()), self.in_dir + suffix[0] + '_headers' + str(counter) + '.pkl')

    # return the keys to be used as headers when writing the processed data
    return list(attribute2ids.keys()), num_total, num_pos, suffix
def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
    '''inserts data from the specified csv and corresponding columns

    State-interval variant: rows are ';'-separated strings. Each code
    matching regex_string inside the patient's registration window is
    inserted via self.insert_state_interval(); lab results are additionally
    abstracted into value trends. Marshall predictors are skipped here.
    Returns ([], -1, -1) to satisfy the superclass 'process' contract.
    '''
    # make convenient reference to the dictionary
    dct = self.id2data

    # # get data and corresponding headers
    # rows, headers = util.import_data(f, delim=self.delim)

    # get the index of the relevant columns
    ID_idx = headers.index(self.ID_column)
    code_idx = headers.index(code_column)
    b_date_idx = headers.index(date_column[0])
    e_date_idx = headers.index(date_column[1])
    if suffix == 'lab_results':
        # lab-result rows carry a measured value plus its reference bounds
        val_idx = headers.index('waarde')
        min_idx = headers.index('referentie_minimum')
        max_idx = headers.index('referentie_maximum')
    if incorporate_SOEP:
        SOEP_idx = headers.index(incorporate_SOEP)

    # get the right suffix to append for the attribute name
    if suffix == '':
        suffix = code_column

    # regex pattern to match (ATC/ICPC standards)
    pattern = re.compile(regex_string)

    # NOTE(review): only the first 5000 rows are processed -- looks like a
    # leftover debug cap, confirm; also shadows builtin 'max'
    max = 5000
    current = 0

    # iterate over all instances
    for row in rows:
        if current > max:
            break
        row = row.split(';')

        original_code = row[code_idx]
        if original_code == None:
            continue
        truncated_code = self.generate_code(original_code, limit)
        if truncated_code == None:
            continue

        ### is in Marshall Predictors check ###
        ### if it is a marshall predictor, we skip this line.
        if self.marshall_predictor(truncated_code, code_column):
            continue

        # if key is not in the data dictionary, we skip it
        key = row[ID_idx]
        if not key in dct:
            continue

        # init other vars
        b_date = str2date(row[b_date_idx], give_default_begin=True)  # begin of event
        e_date = str2date(row[e_date_idx], give_default_end=True)  # end of event
        b_reg = dct[key]['stroke_dates'][1]  # beginning of registration
        e_reg = dct[key]['stroke_dates'][2]  # ending of registration
        if code_column == 'specialisme':
            e_reg = e_reg - four_weeks()

        if suffix == 'lab_results':
            val, min_val, max_val = self.make_lab_values(row[val_idx], row[min_idx], row[max_idx])
            if val == '':
                continue

        # if in the required interval (either beginning or ending date) AND code is valid
        if ( (b_reg <= b_date and b_date <= e_reg) or (b_reg <= e_date and e_date <= e_reg) ) and pattern.match(truncated_code):
            # if we need to take the SOEP code of consults into account
            if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):
                # generate attribute names
                if suffix == 'lab_results':  # if we prepare for lab result abstraction
                    if not 'ID2abstractions' in locals():
                        # dict (patient) of dict (lab measurement name) of list of tuples (all value/date combinations of measurement)
                        ID2abstractions = defaultdict(dict)
                    util.init_key(ID2abstractions, key, defaultdict(dict))
                    util.init_key(ID2abstractions[key], original_code, [])
                    ID2abstractions[key][original_code].append((b_date, val))

                    if '' not in [val, min_val, max_val]:
                        attributes = [abstracts.get_value(val, min_val, max_val, original_code)]
                        # # add value abstraction as state interval
                        # self.insert_state_interval(key, attr, b_date, e_date)
                    else:
                        attributes = []
                else:
                    attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)

                # this loop allows multiple attributes to be created in the previous code line
                # this allows for other classes to subclass this class, e.g. SequenceEnrichProcess
                for attr in attributes:
                    # insert a StateInterval object with the specified parameters
                    self.insert_state_interval(key, attr, b_date, e_date, original_code, code_column)
        current += 1

    if suffix == 'lab_results':
        # convert the collected measurements to trends PER lab result
        for ID in ID2abstractions:
            for k, points in ID2abstractions[ID].items():
                # the values are sorted before abstraction
                points = sorted(list(set(points)))
                # abstract the values and append to the current patient's sequence
                # if only 1 measurement was done, we cannot do time series analysis
                if len(points) > 1 and ID in dct:
                    abstractions = abstracts.get_trends(k, points)
                    for abstraction in abstractions:
                        # NOTE(review): original_code here is whatever the last
                        # processed row left behind -- confirm this is intended
                        self.insert_state_interval(ID, *abstraction, original_code=original_code, src=code_column)
                    # self.id2data[ID]['data'] = self.id2data[ID]['data'] + abstractions

    # to satisfy return value requirement for the method 'process' in the superclass
    return [], -1, -1
def insert_start_baseline(self, rows, headers):
    '''Appends, per patient, the date of their earliest ICPC action
    (from 2007 onward) to id2data[patient]['stroke_dates'].

    Also prints the bookkeeping counters used during development.
    '''
    dct = self.id2data
    # replace NaN cells with None so missing values are easy to test for
    rows = rows.where((pd.notnull(rows)), None)

    actions_dict = dict()  # patient -> {icpc category -> [dates]}
    # +1 because itertuples() puts the dataframe index at position 0
    code_idx = headers.index('icpc_cat') + 1
    date_idx = headers.index('dicpc_startdate') + 1
    # patterns = ['12000','12001', '12002', '12004']

    row_cap = 5000000000000000000  # effectively unlimited; was builtin-shadowing 'max'
    current = 0
    amount_x = 0  # rows whose key is known in id2data
    amount_y = 0  # patients with at least one recorded action
    f = 0  # unused development counters, kept because the prints below report them
    g = 0
    i = 0  # rows with a missing icpc category
    z = 0  # rows whose key is not in id2data
    key_list = []  # distinct keys seen among known patients

    for row in tqdm(rows.itertuples()):
        current += 1
        if current > row_cap:
            break
        key = row.Index
        if key not in dct:
            z += 1
            continue
        if key not in key_list:
            key_list.append(key)
        amount_x += 1
        date = str2date(row[date_idx], give_default_begin=True, give_default_end=True)
        # only consider actions from 2007 onward
        if int(str(date).split('-')[0]) < 2007:
            continue
        original_code = row[code_idx]
        if original_code is None:
            i += 1
            continue
        string_code = str(original_code)
        # the original had two byte-identical if/else branches here;
        # collapsed into a single setdefault chain with the same effect
        actions_dict.setdefault(key, {}).setdefault(string_code, []).append(date)

    to_remove = []
    for patient, action_codes in actions_dict.items():
        amount_y += 1
        lowest_dict = dict()  # icpc category -> earliest date
        count = 0
        for action_code, dates in action_codes.items():
            if not dates:
                count += 1
                continue
            lowest_dict[action_code] = min(dates)
        # the overall earliest visit becomes the patient's baseline date
        earliest_visit = min(lowest_dict, key=lowest_dict.get)
        visit_date = lowest_dict[earliest_visit]
        self.id2data[patient]['stroke_dates'].append(visit_date)
        print(self.id2data[patient]['stroke_dates'])

    print(amount_x, amount_y, z)
    print(f, g, i, g + f + i)
    print(len(to_remove))
    print(len(key_list))
    # to_remove is never populated (the code that filled it is commented out
    # in the original); the loop is kept for structural compatibility
    for key in to_remove:
        del self.id2data[key]
def get_stroke_occurrences(self, rows, headers):
    '''sets all stroke cases to initial diagnosis date values in id2data[patient][stroke_dates][0]

    Scans the ICPC rows for stroke/TIA codes (K90*/K89) and records the
    earliest matching diagnosis date per patient; marks the target as
    [True] (survival mode) or 'positive' (classification mode).
    '''
    print('...getting all target (stroke) occurrences')
    stroke_count = 0

    # get the index of the relevant columns
    # (+1 because itertuples() puts the dataframe index at position 0)
    stroke_idx = headers.index('icpc') + 1  # NOTE(review): unused; row.icpc is read below
    date_idx = headers.index('dicpc_startdate') + 1

    # regex patterns to match
    # NOTE(review): '.' is an unescaped regex wildcard, and codes are
    # truncated to 3 characters below, so the K90.0x sub-patterns can never
    # match anything the general K90 pattern does not -- confirm intent
    general_stroke_pattern = re.compile('K90')
    ischemic_stroke_pattern = re.compile('K90.03')
    intracerebral_hem_pattern = re.compile('K90.02')
    subarchnoid_hem_pattern = re.compile('K90.01')
    tia_stroke_pattern = re.compile('K89')

    max = 500000000000000000  # row cap (effectively unlimited); shadows builtin 'max'
    current = 0
    # replace NaN cells with None so missing values are easy to test for
    rows = rows.where((pd.notnull(rows)), None)

    # pair IDs with a dict corresponding to data and dates
    print(len(rows))
    for row in tqdm(rows.itertuples()):  # line in de data
        if current > max:
            break
        if row[date_idx] == " ":
            continue
        else:
            # get key and if it's in the dict, the current corresponding stroke value
            key = row.Index
            if key in self.id2data:
                stroke = self.id2data[key]['stroke_dates'][0]
                # if self.survival == True and not isinstance(stroke, datetime.date):
                #     stroke = stroke[0]

                # get ICPC code and its date
                code = row.icpc
                if code == None:
                    continue
                elif type(code) == str:
                    # normalize and keep only the 3-character ICPC root
                    code = code.strip().upper()[0:3]
                code_date = str2date(
                    date_str=row.dicpc_startdate,
                    mdy=False,
                    give_default_begin=True,
                    give_default_end=True
                )

                # add stroke case if code matches, AND corresponding date is earlier than the currently recorded
                if self.survival:
                    if (general_stroke_pattern.match(code) or
                            ischemic_stroke_pattern.match(code) or
                            intracerebral_hem_pattern.match(code) or
                            subarchnoid_hem_pattern.match(code) or
                            tia_stroke_pattern.match(code)):
                        # target starts as [False]; keep the earliest diagnosis date
                        if (isinstance(stroke, list) and stroke[0] == False) or stroke > code_date:
                            self.id2data[key]['stroke_dates'][0] = code_date
                            self.id2data[key]['data'][0] = [True]
                            stroke_count += 1
                if not self.survival:
                    # 'negative' placeholder short-circuits before the date comparison,
                    # avoiding a str-vs-date TypeError
                    if (general_stroke_pattern.match(code) or
                            ischemic_stroke_pattern.match(code) or
                            intracerebral_hem_pattern.match(code) or
                            subarchnoid_hem_pattern.match(code) or
                            tia_stroke_pattern.match(code)
                            ) and (stroke == 'negative' or stroke > code_date):
                        self.id2data[key]['stroke_dates'][0] = code_date
                        self.id2data[key]['data'][0] = 'positive'
                        stroke_count += 1
                    else:
                        continue
        current += 1

    save_obj(self.id2data, self.in_dir + 'stroke_dict')
    self.statistics['stroke count'] = stroke_count
def get_IDs(self, rows, headers):
    '''sets all IDs as keys to a dict. Additionally adds gender/age data and date registration data

    For every usable row, creates id2data[key] = {'data': [...], 'stroke_dates': [...]}
    with a 'negative' (or [False] in survival mode) target placeholder, the
    patient's age/gender, and the registration/unregistration dates.
    Rows born after 2000, with unparsable dates, or unregistered before 2007
    are skipped. Returns the header names for the ID attributes.
    '''
    print('...getting all record IDs')
    print(self.ID_column)
    print(headers)

    ID_amount = set()  # distinct keys seen (was a list: O(n) membership per row)
    too_young = []  # born after 2000
    registration_none = []  # unparsable registration date
    unregistration_none = []  # unparsable unregistration date
    before_07 = 0  # unregistered before the study window (2007)
    avg_age = []
    row_cap = 5000000000000000000000  # effectively unlimited; was builtin-shadowing 'max'
    current = 0
    # replace NaN cells with None so missing values are easy to test for
    rows = rows.where((pd.notnull(rows)), None)

    # pair IDs with a dict corresponding to data and dates
    for row in tqdm(rows.itertuples()):  # line in de data
        if current > row_cap:
            break
        if len(row) < 1:
            print('row < 1')
            break
        # key is the dataframe index (may contain letters, so no int() cast)
        key = row.Index
        ID_amount.add(key)
        try:
            # skip if instance is outside the specified age limits
            if int(row.birthyear) > 2000:
                too_young.append(key)
                continue
            ID_age = 2018 - int(row.birthyear)
            avg_age.append(ID_age)

            # val is a new dict with keys 'data' and 'stroke_dates' containing
            # the processed data and registration dates, respectively
            val = dict()
            if self.survival == False:
                val['data'] = ['negative', key, ID_age, row.dgender]  # ['negative', ID, age, gender]
            else:
                val['data'] = [[False], key, ID_age, row.dgender]

            # str2date converts to dd/mm/yyyy format (see date_math.py)
            registration = str2date(row.dentrdate, give_default_begin=False)
            unregistration = str2date(row.dexitdate, ymd=False, give_default_end=True)
            if registration is None:
                registration_none.append(key)
                continue
            if unregistration is None:
                unregistration_none.append(key)
                continue
            if int(str(unregistration).split('-')[0]) < 2007:
                before_07 += 1
                continue

            if self.survival == False:
                val['stroke_dates'] = ['negative', registration, unregistration]  # ['negative', begindate, enddate]
            else:
                val['stroke_dates'] = [[False], registration, unregistration]

            # add key/value pair
            self.id2data[key] = val
        except (ValueError, TypeError):
            continue
        current += 1

    self.statistics['unique ids'] = len(ID_amount)
    self.statistics['too old ids'] = len(too_young)  # NOTE(review): actually counts patients born AFTER 2000
    self.statistics['in database before study started'] = len(registration_none)
    self.statistics['in database before until'] = len(unregistration_none)
    # BUG FIX: this previously reused the key 'in database before study
    # started', silently overwriting the registration_none count above
    self.statistics['unregistered before 2007'] = before_07
    self.statistics['len id2data '] = len(self.id2data)
    self.statistics['average age'] = np.mean(avg_age)

    save_obj(self.id2data, self.in_dir + 'patient_dict')
    print('it worked!')
    return ['ID', 'age', 'gender']
def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
    '''inserts data from the specified csv and corresponding columns

    Counts, per attribute, how often each patient has a matching code and
    appends those counts to every patient's 'data' list.
    Returns the attribute names to be used as headers.
    '''
    # make convenient reference to the dictionary
    dct = self.id2data

    # get the index of the relevant columns
    ID_idx = headers.index(self.ID_column)
    code_idx = headers.index(code_column)
    date_idx = headers.index(date_column[0])
    if incorporate_SOEP:
        SOEP_idx = headers.index(incorporate_SOEP)

    # get the right suffix to append for the attribute name
    if suffix == '':
        suffix = code_column

    # regex pattern to match (ATC/ICPC standards)
    pattern = re.compile(regex_string)

    # iterate over all instances, making a new dict with the new attributes as keys
    attribute2counts = defaultdict(dict)
    for row in rows:
        # if key is not in the data dictionary, we skip it
        key = row[ID_idx]
        if key not in dct:
            continue

        # init other vars
        date = str2date(row[date_idx])
        begin = dct[key]['stroke_dates'][3]
        end = dct[key]['stroke_dates'][4]
        original_code = row[code_idx]
        # TODO(review): date/begin/end and 'pattern' are computed but never
        # used -- sibling implementations filter on the registration interval
        # and pattern.match(); confirm whether that filter is missing here

        # if we do not care about SOEPcode (always except for journaal case) or the SOEPcode is E
        if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):
            # generate attribute names
            attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)
            # this loop allows multiple attributes to be created in the previous code line
            # this allows for other classes to subclass this class, e.g. StandardEnrichProcess
            for attr in attributes:
                # check if attribute name and ID instance already exist, if not, make them
                util.init_key(attribute2counts, attr, defaultdict(dict))
                util.init_key(attribute2counts[attr], key, 0)
                # add 1 to the occurrence of the attribute in the instance
                # BUG FIX: was 'attribute2counts[attr] += 1', a TypeError
                # (dict += int) that also failed to count per patient
                attribute2counts[attr][key] += 1

    # add data to each instance
    # BUG FIX: the loop and return below referenced the undefined name
    # 'attribute2ids'; the dict populated above is attribute2counts
    for ID in dct:
        data = dct[ID]['data']
        for id2occurrences in attribute2counts.values():
            # if patient has occurrences for the attribute, add that number, else add 0
            if ID in id2occurrences:
                data.append(id2occurrences[ID])
            else:
                data.append(0)

    # return the keys to be used as headers when writing the processed data
    return attribute2counts.keys()
def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False, counter=0):
    '''inserts data from the specified csv and corresponding columns

    Temporal variant: every code matching regex_string inside the patient's
    registration window is inserted as a state interval via
    self.insert_state_interval(); lab results are additionally abstracted
    into value trends. Returns ([], -1, -1) to satisfy the return-value
    requirement of 'process' in the superclass.
    '''
    important_features = ['CHOLBMT', 'RRDIKA', 'RRSYKA']

    # replace NaN cells with None so missing values are easy to test for
    rows = rows.where((pd.notnull(rows)), None)
    # make convenient reference to the dictionary
    dct = self.id2data

    # get the index of the relevant columns; +1 because itertuples() puts
    # the dataframe index at position 0
    # (duplicate date_idx/pattern assignments of the original were removed)
    code_idx = headers.index(code_column) + 1
    b_date_idx = headers.index(date_column[0]) + 1
    e_date_idx = headers.index(date_column[1]) + 1

    # regex pattern to match (ATC/ICPC standards)
    pattern = re.compile(regex_string)

    if 'lab_results' in suffix:
        # first pass: collect all numeric values per lab code so that
        # low/high bounds can be derived from the data distribution
        values_dict = dict()
        # patient -> {measurement name -> [(date, value), ...]}
        # (hoisted out of the loop; the original created it lazily via a
        # locals() check and would NameError below if no row ever matched)
        ID2abstractions = dict()
        for row in rows.itertuples():  # line in de data
            code = row[code_idx]
            # if we do not know the high and low values, determine by data distribution
            if code not in important_features:
                try:
                    value = float(row.valuen)
                except (ValueError, TypeError):
                    continue
                if code not in values_dict:
                    values_dict[code] = [value]
                else:
                    values_dict[code].append(value)
        minmax_dict = self.calculate_minmax(values_dict, pattern, limit)

    # row/positive-patient counters (num_pos is never incremented here;
    # kept for structural parity with the aggregation variant)
    num_pos = 0
    num_total = 0
    attribute_count = dict()
    # NOTE(review): attribute2ids is never filled in this variant (state
    # intervals are stored via insert_state_interval instead), so the pickles
    # below save empty structures -- confirm this is intended
    attribute2ids = dict()
    row_cap = 100000000000000000  # effectively unlimited; was builtin-shadowing 'max'
    current = 0

    # iterate over all instances
    for row in tqdm(rows.itertuples()):
        current += 1
        if current > row_cap:
            break
        else:
            num_total += 1

        # if key is not in the data dictionary, we skip it
        key = row.Index
        if key not in dct:
            continue

        # init other vars
        b_date = str2date(row[b_date_idx], give_default_begin=True)  # begin of event
        e_date = str2date(row[e_date_idx], give_default_end=True)  # end of event
        b_reg = dct[key]['stroke_dates'][1]  # beginning of registration
        e_reg = dct[key]['stroke_dates'][2]  # ending of registration

        original_code = row[code_idx]
        if original_code is None:
            continue
        truncated_code = self.generate_code(original_code, limit)
        # skip missing codes and the stroke/TIA target codes themselves
        if truncated_code is None or truncated_code in ['K90', 'K89', 'k90', 'k89']:
            continue
        # (removed a leftover per-row debug print of b_reg/b_date/e_date)

        # if in the required interval (either beginning or ending date) AND code is valid
        if ((b_reg <= b_date and b_date <= e_reg) or (b_reg <= e_date and e_date <= e_reg)) and pattern.match(truncated_code):
            if 'lab_results' in suffix:  # if we prepare for lab result abstraction
                try:
                    val = float(row.valuen)
                    if original_code not in important_features:
                        min_val = minmax_dict[truncated_code]['low_bound']
                        max_val = minmax_dict[truncated_code]['high_bound']
                    else:
                        min_val, max_val = self.determine_minmax(original_code)
                except (ValueError, TypeError):
                    continue
                val, min_val, max_val = self.make_lab_values(val, min_val, max_val)

                util.init_key(ID2abstractions, key, dict())
                util.init_key(ID2abstractions[key], original_code, [])
                ID2abstractions[key][original_code].append((b_date, val))

                if '' not in [val, min_val, max_val]:
                    attributes = [get_value(val, min_val, max_val, original_code)]
                else:
                    attributes = []
            else:
                if 'cardiometabolism' in suffix:
                    # BUG FIX: itertuples() rows are offset by 1 (index at
                    # position 0); the original indexed without the +1
                    value = str(row[headers.index('valuec') + 1])
                else:
                    value = None
                attributes = self.generate_attributes(original_code, limit, suffix, value, src=code_column)

            # this loop allows multiple attributes to be created in the previous code line
            # this allows for other classes to subclass this class, e.g. SequenceEnrichProcess
            for attr in attributes:
                if 'allergies' in suffix:
                    # only a positive test ('POS') counts as having the allergy;
                    # BUG FIX: the original inserted POS intervals twice (once
                    # inside the flag check and once again after it)
                    if row.flag != 'POS':
                        continue
                # insert a StateInterval object with the specified parameters
                self.insert_state_interval(key, attr, b_date, e_date, original_code, code_column)

    if 'lab_results' in suffix:
        # convert the collected measurements to trends PER lab result
        # (BUG FIX: the original guard was "suffix == 'lab_results'", which
        # never matches when suffix is a sequence; aligned with the entry
        # condition used above)
        for ID in ID2abstractions:
            for k, points in ID2abstractions[ID].items():
                # the values are deduplicated and sorted before abstraction
                points = sorted(set(points))
                # if only 1 measurement was done, we cannot do time series analysis
                if len(points) > 1 and ID in dct:
                    abstractions = get_trends(k, points)
                    for abstraction in abstractions:
                        self.insert_state_interval(ID, *abstraction, original_code=original_code, src=code_column)

    # add data to each instance (attribute2ids is empty here, so this is a
    # no-op kept for structural parity with the aggregation variant)
    to_save = {ID: [] for ID in dct}
    for ID in dct:
        data = dct[ID]['data']
        for id2occurrences in attribute2ids.values():
            occurrences = id2occurrences.get(ID, 0)
            data.append(occurrences)
            to_save[ID].append(occurrences)

    if self.survival == True:
        save_obj(to_save, self.in_dir + suffix[0] + '_dict_temporal' + str(counter) + '_survival' + '.pkl')
    else:
        save_obj(to_save, self.in_dir + suffix[0] + '_dict_temporal' + str(counter) + '.pkl')
    # both branches saved identical headers; hoisted out of the if/else
    save_obj(list(attribute2ids.keys()), self.in_dir + suffix[0] + 'temporal_headers' + str(counter) + '.pkl')

    # to satisfy return value requirement for the method 'process' in the superclass
    return [], -1, -1