def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
    '''inserts data from the specified csv and corresponding columns'''

    # make convenient reference to the dictionary
    dct = self.id2data

    # # get data and corresponding headers
    # rows, headers = util.import_data(f, delim=self.delim)

    # get the index of the relevant columns
    ID_idx = headers.index(self.ID_column)
    code_idx = headers.index(code_column)
    b_date_idx = headers.index(date_column[0])
    e_date_idx = headers.index(date_column[1])

    if suffix == 'lab_results':
        val_idx = headers.index('waarde')
        min_idx = headers.index('referentie_minimum')
        max_idx = headers.index('referentie_maximum')

    if incorporate_SOEP:
        SOEP_idx = headers.index(incorporate_SOEP)

    # get the right suffix to append for the attribute name
    if suffix == '':
        suffix = code_column

    # regex pattern to match (ATC/ICPC standards)
    pattern = re.compile(regex_string)

    max = 5000
    current = 0

    # iterate over all instances
    for row in rows:
        if current > max:
            break

        row = row.split(';')

        original_code = row[code_idx]
        if original_code == None:
            continue
        truncated_code = self.generate_code(original_code, limit)
        if truncated_code == None:
            continue

        ### is in Marshall Predictors check ###
        ### if it is a marshall predictor, we skip this line.
        if self.marshall_predictor(truncated_code, code_column):
            continue

        # if key is not in the data dictionary, we skip it
        key = row[ID_idx]
        if not key in dct:
            continue

        # init other vars
        b_date = str2date(row[b_date_idx], give_default_begin=True)  # begin of event
        e_date = str2date(row[e_date_idx], give_default_end=True)  # end of event
        b_reg = dct[key]['stroke_dates'][1]  # beginning of registration
        e_reg = dct[key]['stroke_dates'][2]  # ending of registration
        if code_column == 'specialisme':
            e_reg = e_reg - four_weeks()

        if suffix == 'lab_results':
            val, min_val, max_val = self.make_lab_values(row[val_idx], row[min_idx], row[max_idx])
            if val == '':
                continue

        # if in the required interval (either beginning or ending date) AND code is valid
        if ((b_reg <= b_date and b_date <= e_reg) or (b_reg <= e_date and e_date <= e_reg)) and pattern.match(truncated_code):

            # if we need to take the SOEP code of consults into account
            if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):

                # generate attribute names
                if suffix == 'lab_results':  # if we prepare for lab result abstraction

                    if not 'ID2abstractions' in locals():
                        # dict (patient) of dict (lab measurement name) of list of tuples
                        # (all value/date combinations of measurement)
                        ID2abstractions = defaultdict(dict)

                    util.init_key(ID2abstractions, key, defaultdict(dict))
                    util.init_key(ID2abstractions[key], original_code, [])

                    ID2abstractions[key][original_code].append((b_date, val))

                    if '' not in [val, min_val, max_val]:
                        attributes = [abstracts.get_value(val, min_val, max_val, original_code)]
                        # # add value abstraction as state interval
                        # self.insert_state_interval(key, attr, b_date, e_date)
                    else:
                        attributes = []

                else:
                    attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)

                # this loop allows multiple attributes to be created in the previous code line
                # this allows for other classes to subclass this class, e.g. SequenceEnrichProcess
                for attr in attributes:
                    # insert a StateInterval object with the specified parameters
                    self.insert_state_interval(key, attr, b_date, e_date, original_code, code_column)

        current += 1

    if suffix == 'lab_results':  # do funky stuff with trends and abstractions

        # convert to trends PER lab result
        for ID in ID2abstractions:
            # print ID2abstractions[ID]
            for k, points in ID2abstractions[ID].items():

                # the values are sorted before abstraction
                points = sorted(list(set(points)))

                # abstract the values and append to the current patient's sequence
                # if only 1 measurement was done, we cannot do time series analysis
                if len(points) > 1 and ID in dct:
                    abstractions = abstracts.get_trends(k, points)
                    for abstraction in abstractions:
                        self.insert_state_interval(ID, *abstraction, original_code=original_code, src=code_column)
                    # self.id2data[ID]['data'] = self.id2data[ID]['data'] + abstractions

    # to satisfy return value requirement for the method 'process' in the superclass
    return [], -1, -1
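# Hypothetical driver (illustration only, not part of the original module): shows how a
# ';'-delimited export could be fed to the sequence-based insert_data above. The file
# name, column names, regex and the `process` object are assumptions, not the project's
# real configuration.
def _example_run_insert_data(process, path='medicatie.csv'):
    # read the header line and the raw data lines; insert_data splits each row on ';' itself
    with open(path) as f:
        headers = f.readline().strip().split(';')
        rows = f.readlines()
    # date_column is a (begin, end) pair; regex_string restricts codes to an ATC/ICPC-like shape
    return process.insert_data(rows, headers,
                               code_column='atc_code',                     # assumed column name
                               date_column=('begin_datum', 'eind_datum'),  # assumed column names
                               regex_string=r'[A-Z][0-9]{2}',
                               limit=3)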
def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
    '''inserts data from the specified csv and corresponding columns'''

    # make convenient reference to the dictionary
    dct = self.id2data

    # # get data and corresponding headers
    # rows, headers = util.import_data(f, delim=self.delim)

    # get the index of the relevant columns
    ID_idx = headers.index(self.ID_column)
    code_idx = headers.index(code_column)
    date_idx = headers.index(date_column[0])

    if suffix == 'lab_results':
        val_idx = headers.index('waarde')
        min_idx = headers.index('referentie_minimum')
        max_idx = headers.index('referentie_maximum')

    if incorporate_SOEP:
        SOEP_idx = headers.index(incorporate_SOEP)

    # get the right suffix to append for the attribute name
    if suffix == '':
        suffix = code_column

    # regex pattern to match (ATC/ICPC standards)
    pattern = re.compile(regex_string)

    # keep track of number of times the row is attributed to a positive CRC patient
    # (or patient where the target instance = 'positive')
    num_pos = 0
    num_total = 0

    # iterate over all instances, making a new dict with the new attributes as keys
    attribute2ids = dict()
    for row in rows:

        original_code = row[code_idx]
        if original_code == None:
            continue
        truncated_code = self.generate_code(original_code, limit)
        if truncated_code == None:
            continue

        ### is in Marshall Predictors check ###
        ### if it is a marshall predictor, we skip this line.
        if self.marshall_predictor(truncated_code, code_column):
            continue

        num_total += 1

        # if key is not in the data dictionary, we skip it
        key = int(row[ID_idx])
        if not key in dct:
            continue

        if dct[key]['CRC_dates'][0] != 'negative':
            num_pos += 1

        # init other vars
        date = str2date(row[date_idx], give_default_begin=True)
        begin = dct[key]['CRC_dates'][3]
        end = dct[key]['CRC_dates'][4]
        if code_column == 'specialisme':
            end = end - four_weeks()

        if suffix == 'lab_results':
            val, min_val, max_val = self.make_lab_values(row[val_idx], row[min_idx], row[max_idx])
            if val == '':
                continue

        # if in the required interval and code is valid
        if (begin <= date and date <= end) and pattern.match(truncated_code):

            # if we do not care about SOEPcode (always except for journaal case) or the SOEPcode is E
            if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):

                if suffix == 'lab_results':  # if we prepare for lab result abstraction

                    if not 'ID2abstractions' in locals():
                        # dict (patient) of dict (lab measurement name) of list of tuples
                        # (all value/date combinations of measurement)
                        ID2abstractions = dict()

                    util.init_key(ID2abstractions, key, dict())
                    util.init_key(ID2abstractions[key], original_code, [])

                    ID2abstractions[key][original_code].append((date, val))

                    if '' not in [val, min_val, max_val]:
                        attr = abstracts.get_value(val, min_val, max_val, original_code)

                        # check if attribute name and ID instance already exist, if not, make them
                        util.init_key(attribute2ids, attr, dict())
                        util.init_key(attribute2ids[attr], key, 0)

                        # add 1 to the occurrence of the attribute in the instance
                        attribute2ids[attr][key] += 1

                else:  # else no lab result collection, regular aggregation

                    # generate attribute names
                    attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)

                    # this loop allows multiple attributes to be created in the previous code line
                    # this allows for other classes to subclass this class, e.g. StandardEnrichProcess
                    for attr in attributes:
                        # print truncated_code, attr

                        # check if attribute name and ID instance already exist, if not, make them
                        util.init_key(attribute2ids, attr, dict())
                        util.init_key(attribute2ids[attr], key, 0)

                        # add 1 to the occurrence of the attribute in the instance
                        attribute2ids[attr][key] += 1

    if suffix == 'lab_results':  # do funky stuff with trends and abstractions

        # convert to trends PER lab result
        for ID in ID2abstractions:
            # print ID2abstractions[ID]
            for k, points in ID2abstractions[ID].iteritems():

                # the values are sorted before abstraction
                points = sorted(list(set(points)))

                # abstract the values and count the occurrences per measurement-trend per patient
                # if only 1 measurement was done, we cannot do time series analysis
                if len(points) > 1 and ID in dct:
                    abstractions = abstracts.get_trends(k, points)
                    for attr in abstractions:
                        attr = attr[0]  # get the state
                        util.init_key(attribute2ids, attr, dict())
                        util.init_key(attribute2ids[attr], ID, 0)
                        attribute2ids[attr][ID] += 1

    # print len(attribute2ids)
    # print attribute2ids.keys()[0:5]

    # add data to each instance
    for ID in dct:
        data = dct[ID]['data']

        for id2occurrences in attribute2ids.values():
            # if patient has occurrences for the attribute, add that number, else add 0
            if ID in id2occurrences:
                data.append(id2occurrences[ID])
            else:
                data.append(0)

    # return the keys to be used as headers when writing the processed data
    return attribute2ids.keys(), num_total, num_pos
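# util.init_key is used throughout these methods but its definition is not included here.
# A minimal sketch of what it presumably does, inferred from how it is called (an
# assumption, not the project's verified helper): set a default for a key only when that
# key is missing, leaving existing entries untouched.
def init_key_sketch(dictionary, key, default):
    # only create the entry if it does not exist yet
    if key not in dictionary:
        dictionary[key] = default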
def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False, counter=0):
    '''inserts data from the specified csv and corresponding columns'''

    important_features = ['CHOLBMT', 'RRDIKA', 'RRSYKA']

    # make convenient reference to the dictionary
    dct = self.id2data

    rows = rows.where((pd.notnull(rows)), None)

    # # get data and corresponding headers
    # rows, headers = util.import_data(f, delim=self.delim)

    # get the index of the relevant columns
    # ID_idx = headers.index(self.ID_column)
    code_idx = headers.index(code_column) + 1
    date_idx = headers.index(date_column[0]) + 1

    # regex pattern to match (ATC/ICPC standards)
    pattern = re.compile(regex_string)

    if 'lab_results' in suffix:
        values_dict = dict()
        # val_idx = headers.index('valuen') + 1

        # pair IDs with a dict corresponding to data and dates
        for row in rows.itertuples():  # line in the data
            code = row[code_idx]

            # if we do not know the high and low values, determine by data distribution
            if code not in important_features:
                if not code in values_dict:
                    try:
                        values_dict[code] = [float(row.valuen)]
                    except ValueError:
                        continue
                    except TypeError:
                        continue
                else:
                    try:
                        values_dict[code].append(float(row.valuen))
                    except ValueError:
                        continue
                    except TypeError:
                        continue

        minmax_dict = self.calculate_minmax(values_dict, pattern, limit)

    if incorporate_SOEP:
        SOEP_idx = headers.index(incorporate_SOEP)

    # keep track of number of times the row is attributed to a positive stroke patient
    # (or patient where the target instance = 'positive')
    num_pos = 0
    num_total = 0
    attribute_count = dict()

    # iterate over all instances, making a new dict with the new attributes as keys
    attribute2ids = dict()

    max = 1000000000000000000
    current = 0

    for row in tqdm(rows.itertuples()):
        current += 1
        # row = row.split(';')
        if current > max:
            break
        else:
            num_total += 1

        # if key is not in the data dictionary, we skip it
        key = row.Index
        if not key in dct:
            continue

        if dct[key]['stroke_dates'][0] != 'negative':
            num_pos += 1

        # init other vars
        date = str2date(row[date_idx], give_default_begin=True, give_default_end=True)
        begin = dct[key]['stroke_dates'][1]
        end = dct[key]['stroke_dates'][2]
        if code_column == 'specialisme':
            end = end - four_weeks()

        original_code = row[code_idx]
        if original_code == None:
            continue
        truncated_code = self.generate_code(original_code, limit)
        if truncated_code == None or truncated_code in ['K90', 'K89', 'k90', 'k89']:
            continue

        if not self.marshall_predictor(truncated_code, code_column):
            continue

        # if in the required interval and code is valid
        if (begin <= date and date <= end) and pattern.match(truncated_code):
            # if we do not care about SOEPcode (always except for journaal case) or the SOEPcode is E
            # if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):

            if 'lab_results' in suffix:  # if we prepare for lab result abstraction
                try:
                    val = float(row.valuen)
                    if not original_code in important_features:
                        min_val = minmax_dict[truncated_code]['low_bound']
                        max_val = minmax_dict[truncated_code]['high_bound']
                    else:
                        min_val, max_val = self.determine_minmax(original_code)
                except ValueError:
                    continue
                except TypeError:
                    continue

                if not 'ID2abstractions' in locals():
                    # dict (patient) of dict (lab measurement name) of list of tuples
                    # (all value/date combinations of measurement)
                    ID2abstractions = dict()

                util.init_key(ID2abstractions, key, dict())
                util.init_key(ID2abstractions[key], original_code, [])

                ID2abstractions[key][original_code].append((date, val))

                if '' not in [val, min_val, max_val]:
                    attr = get_value(val, min_val, max_val, original_code)

                    if not attr in attribute_count:
                        attribute_count[attr] = 0

                    # check if attribute name and ID instance already exist, if not, make them
                    util.init_key(attribute2ids, attr, dict())
                    util.init_key(attribute2ids[attr], key, 0)

                    # add 1 to the occurrence of the attribute in the instance
                    attribute2ids[attr][key] += 1
                    attribute_count[attr] += 1

            else:  # else no lab result collection, regular aggregation

                # generate attribute names
                if 'cardiometabolism' in suffix:
                    # val_idx = headers.index('valuec')
                    value = str(row.valuec)
                else:
                    value = None

                attributes = self.generate_attributes(original_code, limit, suffix, value, src=code_column)

                # this loop allows multiple attributes to be created in the previous code line
                # this allows for other classes to subclass this class, e.g. StandardEnrichProcess
                for attr in attributes:
                    if not attr in attribute_count:
                        attribute_count[attr] = 0
                    # print truncated_code, attr

                    # check if attribute name and ID instance already exist, if not, make them
                    util.init_key(attribute2ids, attr, dict())
                    util.init_key(attribute2ids[attr], key, 0)

                    # add 1 to the occurrence of the attribute in the instance, except if attribute is binary
                    if 'smoking' in suffix:
                        if attribute2ids[attr][key] == 1:
                            continue
                    if 'allergies' in suffix:
                        # val_idx = headers.index('flag')
                        value = row.flag
                        # check if the person actually has the allergy for which was tested
                        if value == 'POS':
                            attribute2ids[attr][key] = 1
                        # if negative or not tested, it is assumed that person does not have that particular allergy
                        else:
                            attribute2ids[attr][key] = 0
                    else:
                        attribute2ids[attr][key] += 1

                    attribute_count[attr] += 1

    for attr, count in attribute_count.items():
        try:
            self.statistics[attr + '_count/min/max'] = [count, min_val, max_val]
        except UnboundLocalError:
            self.statistics[attr + '_count'] = count

    if 'lab_results' in suffix:  # do funky stuff with trends and abstractions

        # convert to trends PER lab result
        for ID in ID2abstractions:
            # print ID2abstractions[ID]
            for k, points in ID2abstractions[ID].items():

                # the values are sorted before abstraction
                points = sorted(list(set(points)))

                # abstract the values and count the occurrences per measurement-trend per patient
                # if only 1 measurement was done, we cannot do time series analysis
                if len(points) > 1 and ID in dct:
                    abstractions = get_trends(k, points)
                    for attr in abstractions:
                        attr = attr[0]  # get the state
                        util.init_key(attribute2ids, attr, dict())
                        util.init_key(attribute2ids[attr], ID, 0)
                        attribute2ids[attr][ID] += 1

    # print len(attribute2ids)
    # print attribute2ids.keys()[0:5]

    # add data to each instance
    to_save = {}
    for ID in dct:
        to_save[ID] = []

    for ID in dct:
        data = dct[ID]['data']
        # to_save[ID] = []

        for id2occurrences in attribute2ids.values():
            # if patient has occurrences for the attribute, add that number, else add 0
            if ID in id2occurrences:
                data.append(id2occurrences[ID])
                to_save[ID].append(id2occurrences[ID])
            else:
                data.append(0)
                to_save[ID].append(0)

    save_obj(self.statistics, self.in_dir + suffix[0] + '_statistics.pkl')

    if self.survival == True:
        save_obj(to_save, self.in_dir + suffix[0] + '_dict_marshall' + str(counter) + '_survival' + '.pkl')
        save_obj(list(attribute2ids.keys()), self.in_dir + suffix[0] + '_headers' + str(counter) + '.pkl')
    else:
        save_obj(to_save, self.in_dir + suffix[0] + '_dict_marshall' + str(counter) + '.pkl')
        save_obj(list(attribute2ids.keys()), self.in_dir + suffix[0] + '_headers' + str(counter) + '.pkl')

    # return the keys to be used as headers when writing the processed data
    return list(attribute2ids.keys()), num_total, num_pos, suffix
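# get_value / abstracts.get_value is imported from elsewhere in the project; judging by
# the call sites above, it presumably maps a numeric lab value onto a discrete state
# relative to its reference range. A minimal sketch under that assumption (function and
# attribute names here are illustrative, not the project's actual ones):
def get_value_sketch(val, min_val, max_val, code):
    # below the reference minimum -> low, above the maximum -> high, otherwise normal
    if val < min_val:
        return '{}_low'.format(code)
    if val > max_val:
        return '{}_high'.format(code)
    return '{}_normal'.format(code)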
def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
    '''inserts data from the specified csv and corresponding columns'''

    # make convenient reference to the dictionary
    dct = self.id2data

    # # get data and corresponding headers
    # rows, headers = util.import_data(f, delim=self.delim)

    # get the index of the relevant columns
    ID_idx = headers.index(self.ID_column)
    code_idx = headers.index(code_column)
    date_idx = headers.index(date_column[0])

    if suffix == 'lab_results':
        val_idx = headers.index('waarde')
        min_idx = headers.index('referentie_minimum')
        max_idx = headers.index('referentie_maximum')

    if incorporate_SOEP:
        SOEP_idx = headers.index(incorporate_SOEP)

    # get the right suffix to append for the attribute name
    if suffix == '':
        suffix = code_column

    # regex pattern to match (ATC/ICPC standards)
    pattern = re.compile(regex_string)

    # keep track of number of times the row is attributed to a positive CRC patient
    # (or patient where the target instance = 'positive')
    num_pos = 0
    num_total = 0

    # iterate over all instances, making a new dict with the new attributes as keys
    attribute2ids = dict()
    for row in rows:
        num_total += 1

        # if key is not in the data dictionary, we skip it
        key = int(row[ID_idx])
        if not key in dct:
            continue

        if dct[key]['CRC_dates'][0] != 'negative':
            num_pos += 1

        # init other vars
        date = str2date(row[date_idx], give_default_begin=True)
        begin = dct[key]['CRC_dates'][3]
        end = dct[key]['CRC_dates'][4]
        if code_column == 'specialisme':
            end = end - four_weeks()

        original_code = row[code_idx]
        if original_code == None:
            continue
        truncated_code = self.generate_code(original_code, limit)
        if truncated_code == None:
            continue

        if suffix == 'lab_results':
            val, min_val, max_val = self.make_lab_values(row[val_idx], row[min_idx], row[max_idx])
            if val == '':
                continue

        # if in the required interval and code is valid
        if (begin <= date and date <= end) and pattern.match(truncated_code):

            # if we do not care about SOEPcode (always except for journaal case) or the SOEPcode is E
            if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):

                if suffix == 'lab_results':  # if we prepare for lab result abstraction

                    if not 'ID2abstractions' in locals():
                        # dict (patient) of dict (lab measurement name) of list of tuples
                        # (all value/date combinations of measurement)
                        ID2abstractions = dict()

                    util.init_key(ID2abstractions, key, dict())
                    util.init_key(ID2abstractions[key], original_code, [])

                    ID2abstractions[key][original_code].append((date, val))

                    if '' not in [val, min_val, max_val]:
                        attr = abstracts.get_value(val, min_val, max_val, original_code)

                        # check if attribute name and ID instance already exist, if not, make them
                        util.init_key(attribute2ids, attr, dict())
                        util.init_key(attribute2ids[attr], key, 0)

                        # add 1 to the occurrence of the attribute in the instance
                        attribute2ids[attr][key] += 1

                else:  # else no lab result collection, regular aggregation

                    # generate attribute names
                    attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)

                    # this loop allows multiple attributes to be created in the previous code line
                    # this allows for other classes to subclass this class, e.g. StandardEnrichProcess
                    for attr in attributes:
                        # print truncated_code, attr

                        # check if attribute name and ID instance already exist, if not, make them
                        util.init_key(attribute2ids, attr, dict())
                        util.init_key(attribute2ids[attr], key, 0)

                        # add 1 to the occurrence of the attribute in the instance
                        attribute2ids[attr][key] += 1

    if suffix == 'lab_results':  # do funky stuff with trends and abstractions

        # convert to trends PER lab result
        for ID in ID2abstractions:
            # print ID2abstractions[ID]
            for k, points in ID2abstractions[ID].iteritems():

                # the values are sorted before abstraction
                points = sorted(list(set(points)))

                # abstract the values and count the occurrences per measurement-trend per patient
                # if only 1 measurement was done, we cannot do time series analysis
                if len(points) > 1 and ID in dct:
                    abstractions = abstracts.get_trends(k, points)
                    for attr in abstractions:
                        attr = attr[0]  # get the state
                        util.init_key(attribute2ids, attr, dict())
                        util.init_key(attribute2ids[attr], ID, 0)
                        attribute2ids[attr][ID] += 1

    # print len(attribute2ids)
    # print attribute2ids.keys()[0:5]

    # add data to each instance
    for ID in dct:
        data = dct[ID]['data']

        for id2occurrences in attribute2ids.values():
            # if patient has occurrences for the attribute, add that number, else add 0
            if ID in id2occurrences:
                data.append(id2occurrences[ID])
            else:
                data.append(0)

    # return the keys to be used as headers when writing the processed data
    return attribute2ids.keys(), num_total, num_pos
def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
    '''inserts data from the specified csv and corresponding columns'''

    # make convenient reference to the dictionary
    dct = self.id2data

    # # get data and corresponding headers
    # rows, headers = util.import_data(f, delim=self.delim)

    # get the index of the relevant columns
    ID_idx = headers.index(self.ID_column)
    code_idx = headers.index(code_column)
    date_idx = headers.index(date_column[0])

    if incorporate_SOEP:
        SOEP_idx = headers.index(incorporate_SOEP)

    # get the right suffix to append for the attribute name
    if suffix == '':
        suffix = code_column

    # regex pattern to match (ATC/ICPC standards)
    pattern = re.compile(regex_string)

    # iterate over all instances, making a new dict with the new attributes as keys
    attribute2counts = defaultdict(dict)
    for row in rows:

        # if key is not in the data dictionary, we skip it
        key = row[ID_idx]
        if not key in dct:
            continue

        # init other vars
        date = str2date(row[date_idx])
        begin = dct[key]['stroke_dates'][3]
        end = dct[key]['stroke_dates'][4]

        original_code = row[code_idx]

        # if we do not care about SOEPcode (always except for journaal case) or the SOEPcode is E
        if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):

            # generate attribute names
            attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)

            # this loop allows multiple attributes to be created in the previous code line
            # this allows for other classes to subclass this class, e.g. StandardEnrichProcess
            for attr in attributes:

                # check if attribute name and ID instance already exist, if not, make them
                util.init_key(attribute2counts, attr, defaultdict(dict))
                util.init_key(attribute2counts[attr], key, 0)

                # add 1 to the occurrence of the attribute in the instance
                attribute2counts[attr][key] += 1

    # add data to each instance
    for ID in dct:
        data = dct[ID]['data']

        for id2occurrences in attribute2counts.values():
            # if patient has occurrences for the attribute, add that number, else add 0
            if ID in id2occurrences:
                data.append(id2occurrences[ID])
            else:
                data.append(0)

    # return the keys to be used as headers when writing the processed data
    return attribute2counts.keys()
def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
    '''inserts data from the specified csv and corresponding columns'''

    # make convenient reference to the dictionary
    dct = self.id2data

    # # get data and corresponding headers
    # rows, headers = util.import_data(f, delim=self.delim)

    # get the index of the relevant columns
    ID_idx = headers.index(self.ID_column)
    code_idx = headers.index(code_column)
    date_idx = headers.index(date_column[0])

    if incorporate_SOEP:
        SOEP_idx = headers.index(incorporate_SOEP)

    # get the right suffix to append for the attribute name
    if suffix == '':
        suffix = code_column

    # regex pattern to match (ATC/ICPC standards)
    pattern = re.compile(regex_string)

    # iterate over all instances, making a new dict with the new attributes as keys
    attribute2counts = dict()
    for row in rows:

        # if key is not in the data dictionary, we skip it
        key = int(row[ID_idx])
        if not key in dct:
            continue

        # init other vars
        date = str2date(row[date_idx])
        begin = dct[key]['CRC_dates'][3]
        end = dct[key]['CRC_dates'][4]

        original_code = row[code_idx]

        # if we do not care about SOEPcode (always except for journaal case) or the SOEPcode is E
        if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):

            # generate attribute names
            attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)

            # this loop allows multiple attributes to be created in the previous code line
            # this allows for other classes to subclass this class, e.g. StandardEnrichProcess
            for attr in attributes:

                # check if attribute name and ID instance already exist, if not, make them
                util.init_key(attribute2counts, attr, dict())
                util.init_key(attribute2counts[attr], key, 0)

                # add 1 to the occurrence of the attribute in the instance
                attribute2counts[attr][key] += 1

    # add data to each instance
    for ID in dct:
        data = dct[ID]['data']

        for id2occurrences in attribute2counts.values():
            # if patient has occurrences for the attribute, add that number, else add 0
            if ID in id2occurrences:
                data.append(id2occurrences[ID])
            else:
                data.append(0)

    # return the keys to be used as headers when writing the processed data
    return attribute2counts.keys()
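# The two counting variants above accumulate occurrences as a dict of
# {attribute: {patient_id: count}} and then append one column per attribute to every
# patient's 'data' list. A minimal self-contained illustration of that append step
# (toy attribute names and counts, not project data):
def _example_append_counts():
    attribute2counts = {'atc_A10': {1: 2}, 'atc_C07': {1: 1, 2: 3}}
    id2data = {1: {'data': [1, 'positive']}, 2: {'data': [2, 'negative']}}
    for ID in id2data:
        data = id2data[ID]['data']
        for id2occurrences in attribute2counts.values():
            # patients without an occurrence get an explicit 0 so all rows stay aligned
            data.append(id2occurrences.get(ID, 0))
    return id2data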
def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
    '''inserts data from the specified csv and corresponding columns'''

    # make convenient reference to the dictionary
    dct = self.id2data

    # # get data and corresponding headers
    # rows, headers = util.import_data(f, delim=self.delim)

    # get the index of the relevant columns
    ID_idx = headers.index(self.ID_column)
    code_idx = headers.index(code_column)
    b_date_idx = headers.index(date_column[0])
    e_date_idx = headers.index(date_column[1])

    if suffix == 'lab_results':
        val_idx = headers.index('waarde')
        min_idx = headers.index('referentie_minimum')
        max_idx = headers.index('referentie_maximum')

    if incorporate_SOEP:
        SOEP_idx = headers.index(incorporate_SOEP)

    # get the right suffix to append for the attribute name
    if suffix == '':
        suffix = code_column

    # regex pattern to match (ATC/ICPC standards)
    pattern = re.compile(regex_string)

    # iterate over all instances
    for row in rows:

        # if key is not in the data dictionary, we skip it
        key = int(row[ID_idx])
        if not key in dct:
            continue

        # init other vars
        b_date = str2date(row[b_date_idx], give_default_begin=True)  # begin of event
        e_date = str2date(row[e_date_idx], give_default_end=True)  # end of event
        b_reg = dct[key]['CRC_dates'][3]  # beginning of registration
        e_reg = dct[key]['CRC_dates'][4]  # ending of registration
        if code_column == 'specialisme':
            e_reg = e_reg - four_weeks()

        original_code = row[code_idx]
        if original_code == None:
            continue
        truncated_code = self.generate_code(original_code, limit)
        if truncated_code == None:
            continue

        if suffix == 'lab_results':
            val, min_val, max_val = self.make_lab_values(row[val_idx], row[min_idx], row[max_idx])
            if val == '':
                continue

        # if in the required interval (either beginning or ending date) AND code is valid
        if ((b_reg <= b_date and b_date <= e_reg) or (b_reg <= e_date and e_date <= e_reg)) and pattern.match(truncated_code):

            # if we need to take the SOEP code of consults into account
            if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):

                # generate attribute names
                if suffix == 'lab_results':  # if we prepare for lab result abstraction

                    if not 'ID2abstractions' in locals():
                        # dict (patient) of dict (lab measurement name) of list of tuples
                        # (all value/date combinations of measurement)
                        ID2abstractions = dict()

                    util.init_key(ID2abstractions, key, dict())
                    util.init_key(ID2abstractions[key], original_code, [])

                    ID2abstractions[key][original_code].append((b_date, val))

                    if '' not in [val, min_val, max_val]:
                        attributes = [abstracts.get_value(val, min_val, max_val, original_code)]
                        # # add value abstraction as state interval
                        # self.insert_state_interval(key, attr, b_date, e_date)
                    else:
                        attributes = []

                else:
                    attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)

                # this loop allows multiple attributes to be created in the previous code line
                # this allows for other classes to subclass this class, e.g. SequenceEnrichProcess
                for attr in attributes:
                    # insert a StateInterval object with the specified parameters
                    self.insert_state_interval(key, attr, b_date, e_date, original_code, code_column)

    if suffix == 'lab_results':  # do funky stuff with trends and abstractions

        # convert to trends PER lab result
        for ID in ID2abstractions:
            # print ID2abstractions[ID]
            for k, points in ID2abstractions[ID].iteritems():

                # the values are sorted before abstraction
                points = sorted(list(set(points)))

                # abstract the values and append to the current patient's sequence
                # if only 1 measurement was done, we cannot do time series analysis
                if len(points) > 1 and ID in dct:
                    abstractions = abstracts.get_trends(k, points)
                    for abstraction in abstractions:
                        self.insert_state_interval(ID, *abstraction, original_code=original_code, src=code_column)
                    # self.id2data[ID]['data'] = self.id2data[ID]['data'] + abstractions

    # to satisfy return value requirement for the method 'process' in the superclass
    return [], -1, -1
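# Note on the shape of the trend abstractions used above: insert_state_interval is called
# elsewhere as insert_state_interval(key, attr, b_date, e_date, original_code, code_column),
# and the counting variants take attr = abstraction[0]. So each item yielded by
# abstracts.get_trends(code, points) is presumably a (state_attribute, begin_date, end_date)
# tuple that the *abstraction unpacking fills in; this is inferred from the call sites, not
# from the get_trends implementation itself.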
def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False, counter=0):
    '''inserts data from the specified csv and corresponding columns'''

    important_features = ['CHOLBMT', 'RRDIKA', 'RRSYKA']

    # read rows into list to re-use
    rows = rows.where((pd.notnull(rows)), None)

    # make convenient reference to the dictionary
    dct = self.id2data

    # # get data and corresponding headers
    # rows, headers = util.import_data(f, delim=self.delim)

    # get the index of the relevant columns
    # ID_idx = headers.index(self.ID_column)
    code_idx = headers.index(code_column) + 1
    date_idx = headers.index(date_column[0]) + 1
    b_date_idx = headers.index(date_column[0]) + 1
    e_date_idx = headers.index(date_column[1]) + 1

    # if incorporate_SOEP:
    #     SOEP_idx = headers.index(incorporate_SOEP)

    # regex pattern to match (ATC/ICPC standards)
    pattern = re.compile(regex_string)

    if 'lab_results' in suffix:
        values_dict = dict()
        # val_idx = headers.index('valuen') + 1

        # pair IDs with a dict corresponding to data and dates
        for row in rows.itertuples():  # line in the data
            code = row[code_idx]

            # if we do not know the high and low values, determine by data distribution
            if code not in important_features:
                if not code in values_dict:
                    try:
                        values_dict[code] = [float(row.valuen)]
                    except ValueError:
                        continue
                    except TypeError:
                        continue
                else:
                    try:
                        values_dict[code].append(float(row.valuen))
                    except ValueError:
                        continue
                    except TypeError:
                        continue

        minmax_dict = self.calculate_minmax(values_dict, pattern, limit)

    # keep track of number of times the row is attributed to a positive stroke patient
    # (or patient where the target instance = 'positive')
    num_pos = 0
    num_total = 0
    attribute_count = dict()

    # iterate over all instances, making a new dict with the new attributes as keys
    attribute2ids = dict()

    max = 100000000000000000
    current = 0

    # iterate over all instances
    for row in tqdm(rows.itertuples()):
        current += 1
        # row = row.split(';')
        if current > max:
            break
        else:
            num_total += 1

        # if key is not in the data dictionary, we skip it
        key = row.Index
        if not key in dct:
            continue

        # init other vars
        b_date = str2date(row[b_date_idx], give_default_begin=True)  # begin of event
        e_date = str2date(row[e_date_idx], give_default_end=True)  # end of event
        b_reg = dct[key]['stroke_dates'][1]  # beginning of registration
        e_reg = dct[key]['stroke_dates'][2]  # ending of registration
        # print('wddup')
        # print(b_reg, e_reg)
        # print('xxx')
        # print(dct[key]['stroke_dates'][3], dct[key]['stroke_dates'][4])

        original_code = row[code_idx]
        if original_code == None:
            continue
        truncated_code = self.generate_code(original_code, limit)
        if truncated_code == None or truncated_code in ['K90', 'K89', 'k90', 'k89']:
            continue

        print(b_reg, b_date, e_date)
        # print(b_reg <= b_date)
        # print(b_date <= e_reg)
        # print(b_reg <= e_date)
        # print(e_date <= e_reg)

        # if in the required interval (either beginning or ending date) AND code is valid
        if ((b_reg <= b_date and b_date <= e_reg) or (b_reg <= e_date and e_date <= e_reg)) and pattern.match(truncated_code):

            # if we need to take the SOEP code of consults into account
            # if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):

            # generate attribute names
            if 'lab_results' in suffix:  # if we prepare for lab result abstraction
                try:
                    val = float(row.valuen)
                    if not original_code in important_features:
                        min_val = minmax_dict[truncated_code]['low_bound']
                        max_val = minmax_dict[truncated_code]['high_bound']
                    else:
                        min_val, max_val = self.determine_minmax(original_code)
                except ValueError:
                    continue
                except TypeError:
                    continue

                val, min_val, max_val = self.make_lab_values(val, min_val, max_val)

                if not 'ID2abstractions' in locals():
                    # dict (patient) of dict (lab measurement name) of list of tuples
                    # (all value/date combinations of measurement)
                    ID2abstractions = dict()

                util.init_key(ID2abstractions, key, dict())
                util.init_key(ID2abstractions[key], original_code, [])

                ID2abstractions[key][original_code].append((b_date, val))

                if '' not in [val, min_val, max_val]:
                    attributes = [get_value(val, min_val, max_val, original_code)]
                    # # add value abstraction as state interval
                    # self.insert_state_interval(key, attr, b_date, e_date)
                else:
                    attributes = []

            else:
                if 'cardiometabolism' in suffix:
                    val_idx = headers.index('valuec')
                    value = str(row[val_idx])
                else:
                    value = None

                attributes = self.generate_attributes(original_code, limit, suffix, value, src=code_column)

            # this loop allows multiple attributes to be created in the previous code line
            # this allows for other classes to subclass this class, e.g. SequenceEnrichProcess
            for attr in attributes:
                if 'allergies' in suffix:
                    # val_idx = headers.index('flag')
                    value = row.flag
                    # check if the person actually has the allergy for which was tested
                    if value == 'POS':
                        self.insert_state_interval(key, attr, b_date, e_date, original_code, code_column)
                    # if negative or not tested, it is assumed that person does not have that particular allergy
                    else:
                        continue

                # insert a StateInterval object with the specified parameters
                self.insert_state_interval(key, attr, b_date, e_date, original_code, code_column)

    if suffix == 'lab_results':  # do funky stuff with trends and abstractions

        # convert to trends PER lab result
        for ID in ID2abstractions:
            # print ID2abstractions[ID]
            for k, points in ID2abstractions[ID].items():

                # the values are sorted before abstraction
                points = sorted(list(set(points)))

                # abstract the values and append to the current patient's sequence
                # if only 1 measurement was done, we cannot do time series analysis
                if len(points) > 1 and ID in dct:
                    abstractions = get_trends(k, points)
                    for abstraction in abstractions:
                        self.insert_state_interval(ID, *abstraction, original_code=original_code, src=code_column)
                    # self.id2data[ID]['data'] = self.id2data[ID]['data'] + abstractions

    # add data to each instance
    to_save = {}
    for ID in dct:
        to_save[ID] = []

    for ID in dct:
        data = dct[ID]['data']
        # to_save[ID] = []

        for id2occurrences in attribute2ids.values():
            # if patient has occurrences for the attribute, add that number, else add 0
            if ID in id2occurrences:
                data.append(id2occurrences[ID])
                to_save[ID].append(id2occurrences[ID])
            else:
                data.append(0)
                to_save[ID].append(0)

    if self.survival == True:
        save_obj(to_save, self.in_dir + suffix[0] + '_dict_temporal' + str(counter) + '_survival' + '.pkl')
        save_obj(list(attribute2ids.keys()), self.in_dir + suffix[0] + 'temporal_headers' + str(counter) + '.pkl')
    else:
        save_obj(to_save, self.in_dir + suffix[0] + '_dict_temporal' + str(counter) + '.pkl')
        save_obj(list(attribute2ids.keys()), self.in_dir + suffix[0] + 'temporal_headers' + str(counter) + '.pkl')

    # to satisfy return value requirement for the method 'process' in the superclass
    return [], -1, -1
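# save_obj is used above to persist the statistics, per-patient dictionaries and headers,
# but its definition is not included here. A minimal sketch of the pickle-based helper this
# name commonly refers to (an assumption, not the project's verified code):
import pickle

def save_obj_sketch(obj, path):
    # write the object to the given path as a pickle file
    with open(path, 'wb') as f:
        pickle.dump(obj, f)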