Example #1
	def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
		'''inserts data from the specified csv and corresponding columns'''

		# make convenient reference to the dictionary
		dct = self.id2data

		# # get data and corresponding headers
		# rows, headers = util.import_data(f, delim=self.delim)

		# get the index of the relevant columns
		ID_idx = headers.index(self.ID_column)
		code_idx = headers.index(code_column)
		b_date_idx = headers.index(date_column[0])
		e_date_idx = headers.index(date_column[1])
		if suffix == 'lab_results':
			val_idx = headers.index('waarde')
			min_idx = headers.index('referentie_minimum')
			max_idx = headers.index('referentie_maximum')
		if incorporate_SOEP:
			SOEP_idx = headers.index(incorporate_SOEP)

		# get the right suffix to append for the attribute name
		if suffix == '':
			suffix = code_column

		# regex pattern to match (ATC/ICPC standards)
		pattern = re.compile(regex_string)

		max_rows = 5000
		current = 0

		# iterate over all instances
		for row in rows:
			if current > max_rows:
				break


			row = row.split(';')
			
			original_code = row[code_idx]
			if original_code is None:
				continue
			truncated_code = self.generate_code(original_code, limit) 
			if truncated_code is None:
				continue

			### is in Marshall Predictors check ###
			### if it is a marshall predictor, we skip this line.
			if self.marshall_predictor(truncated_code, code_column):
				continue

			# if key is not in the data dictionary, we skip it
			key = row[ID_idx]
			if not key in dct:
				continue

			# init other vars
			b_date = str2date(row[b_date_idx], give_default_begin=True) # begin of event
			e_date = str2date(row[e_date_idx], give_default_end=True) # end of event
			b_reg = dct[key]['stroke_dates'][1] # beginning of registration
			e_reg = dct[key]['stroke_dates'][2] # ending of registration
			if code_column == 'specialisme':
				e_reg = e_reg - four_weeks()

			if suffix == 'lab_results':
				val, min_val, max_val = self.make_lab_values(row[val_idx], row[min_idx], row[max_idx])
				if val == '':
					continue

			# if in the required interval (either beginning or ending date) AND code is valid
			if ( (b_reg <= b_date and b_date <= e_reg) or (b_reg <= e_date and e_date <= e_reg) ) and pattern.match(truncated_code):
				
				# if we need to take the SOEP code of consults into account
				if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):

					# generate attribute names
					if suffix == 'lab_results': # if we prepare for lab result abstraction
						if not 'ID2abstractions' in locals():
							# dict (patient) of dict (lab measurement name) of list of tuples (all value/date combinations of measurement)
							ID2abstractions = defaultdict(dict)
						
						util.init_key(ID2abstractions, key, defaultdict(dict))
						util.init_key(ID2abstractions[key], original_code, [])

						ID2abstractions[key][original_code].append((b_date, val))
					
						if '' not in [val, min_val, max_val]:
							attributes = [abstracts.get_value(val, min_val, max_val, original_code)]

							# # add value abstraction as state interval
							# self.insert_state_interval(key, attr, b_date, e_date)
						else:
							attributes = []

					else:
						attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)

					# this loop allows multiple attributes to be created in the previous code line
					# this allows for other classes to subclass this class, e.g. SequenceEnrichProcess
					for attr in attributes:

						# insert a StateInterval object with the specified parameters
						self.insert_state_interval(key, attr, b_date, e_date, original_code, code_column)

			current += 1

		if suffix == 'lab_results': # do funky stuff with trends and abstractions
			# convert to trends PER lab result
			for ID in ID2abstractions:
				# print ID2abstractions[ID]
				for k, points in ID2abstractions[ID].items():
					
					# the values are sorted before abstraction
					points = sorted(list(set(points)))

					# abstract the values and append to the current patient's sequence
					# if only 1 measurement was done, we cannot do time series analysis
					if len(points) > 1 and ID in dct: 
						abstractions = abstracts.get_trends(k, points)
						for abstraction in abstractions:
							self.insert_state_interval(ID, *abstraction, original_code=original_code, src=code_column)
						# self.id2data[ID]['data'] = self.id2data[ID]['data'] + abstractions
		
		# to satisfy return value requirement for the method 'process' in the superclass
		return [], -1, -1
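
# --- Illustrative sketch (not part of the original source) ---
# The interval test above keeps an event when either its begin or its end date
# falls inside the patient's registration window. A minimal standalone version
# of that check, with illustrative names and dates, could look like this:

from datetime import date

def event_in_registration(b_date, e_date, b_reg, e_reg):
    """True if the event's begin OR end date lies within [b_reg, e_reg]."""
    return (b_reg <= b_date <= e_reg) or (b_reg <= e_date <= e_reg)

# an event that starts before registration but ends inside it still counts
assert event_in_registration(date(2010, 1, 1), date(2011, 6, 1),
                             date(2011, 1, 1), date(2012, 1, 1))
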
	def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
		'''inserts data from the specified csv and corresponding columns'''

		# make convenient reference to the dictionary
		dct = self.id2data

		# # get data and corresponding headers
		# rows, headers = util.import_data(f, delim=self.delim)

		# get the index of the relevant columns
		ID_idx = headers.index(self.ID_column)
		code_idx = headers.index(code_column)
		date_idx = headers.index(date_column[0])
		
		if suffix == 'lab_results':
			val_idx = headers.index('waarde')
			min_idx = headers.index('referentie_minimum')
			max_idx = headers.index('referentie_maximum')
		if incorporate_SOEP:
			SOEP_idx = headers.index(incorporate_SOEP)
		
		# get the right suffix to append for the attribute name
		if suffix == '':
			suffix = code_column

		# regex pattern to match (ATC/ICPC standards)
		pattern = re.compile(regex_string)

		# keep track of number of times the row is attributed to a positive CRC patient (or patient where the target instance = 'positive')
		num_pos = 0
		num_total = 0

		# iterate over all instances, making a new dict with the new attributes as keys
		attribute2ids = dict()
		for row in rows:
			original_code = row[code_idx]
			if original_code is None:
				continue
			truncated_code = self.generate_code(original_code, limit)
			if truncated_code is None:
				continue

			### is in Marshall Predictors check ###
			### if it is a marshall predictor, we skip this line.
			if self.marshall_predictor(truncated_code, code_column):
				continue
			num_total+=1

			# if key is not in the data dictionary, we skip it
			key = int(row[ID_idx])
			if not key in dct:
				continue

			if dct[key]['CRC_dates'][0] != 'negative':
				num_pos+=1

			# init other vars
			date = str2date(row[date_idx], give_default_begin=True)
			begin = dct[key]['CRC_dates'][3]
			end = dct[key]['CRC_dates'][4]
			if code_column == 'specialisme':
				end = end - four_weeks()

			if suffix == 'lab_results':
				val, min_val, max_val = self.make_lab_values(row[val_idx], row[min_idx], row[max_idx])
				if val == '':
					continue

			# if in the required interval and code is valid
			if (begin <= date and date <= end) and pattern.match(truncated_code):
				
				# if we do not care about SOEPcode (always except for journaal case) or the SOEPcode is E
				if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):
				
					if suffix == 'lab_results': # if we prepare for lab result abstraction
						if not 'ID2abstractions' in locals():
							# dict (patient) of dict (lab measurement name) of list of tuples (all value/date combinations of measurement)
							ID2abstractions = dict()
						
						util.init_key(ID2abstractions, key, dict())
						util.init_key(ID2abstractions[key], original_code, [])

						ID2abstractions[key][original_code].append((date, val))

						if '' not in [val, min_val, max_val]:
							attr = abstracts.get_value(val, min_val, max_val, original_code)

							# check if attribute name and ID instance already exist, if not, make them
							util.init_key(attribute2ids, attr, dict())
							util.init_key(attribute2ids[attr], key, 0)

							# add 1 to the occurrence of the attribute in the instance
							attribute2ids[attr][key] += 1

					else: # else no lab result collection, regular aggregation
						# generate attribute names
						attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)
						
						# this loop allows multiple attributes to be created in the previous code line
						# this allows for other classes to subclass this class, e.g. StandardEnrichProcess
						for attr in attributes:
							# print truncated_code, attr
							# check if attribute name and ID instance already exist, if not, make them
							util.init_key(attribute2ids, attr, dict())
							util.init_key(attribute2ids[attr], key, 0)

							# add 1 to the occurrence of the attribute in the instance
							attribute2ids[attr][key] += 1

		if suffix == 'lab_results': # do funky stuff with trends and abstractions
			# convert to trends PER lab result
			for ID in ID2abstractions:
				# print ID2abstractions[ID]
				for k, points in ID2abstractions[ID].items():
					
					# the values are sorted before abstraction
					points = sorted(list(set(points)))

					# abstract the values and count the occurrences per measurement-trend per patient
					# if only 1 measurement was done, we cannot do time series analysis
					if len(points) > 1 and ID in dct: 
						abstractions = abstracts.get_trends(k, points)
						for attr in abstractions:
							attr = attr[0] # get the state
							util.init_key(attribute2ids, attr, dict())
							util.init_key(attribute2ids[attr], ID, 0)
							attribute2ids[attr][ID] += 1
		# print len(attribute2ids)
		# print attribute2ids.keys()[0:5]
		
		# add data to each instance
		for ID in dct:
			data = dct[ID]['data']

			for id2occurrences in attribute2ids.values():
				
				# if patient has occurrences for the attribute, add that number, else add 0
				if ID in id2occurrences: 
					data.append(id2occurrences[ID])
				else:
					data.append(0)

		# return the keys to be used as headers when writing the processed data
		return attribute2ids.keys(), num_total, num_pos
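
# --- Illustrative sketch (not part of the original source) ---
# The final loop above turns attribute2ids into per-patient feature columns: for
# every attribute, each patient's 'data' list receives its occurrence count, or
# 0 when the patient never matched that attribute. A toy version with made-up
# IDs and attribute names (dict.get(ID, 0) is equivalent to the if/else above):

id2data = {1: {'data': ['positive']}, 2: {'data': ['negative']}}
attribute2ids = {'atc_A10': {1: 3}, 'icpc_K86': {1: 1, 2: 2}}

for ID in id2data:
    data = id2data[ID]['data']
    for id2occurrences in attribute2ids.values():
        data.append(id2occurrences.get(ID, 0))

# patient 1 -> ['positive', 3, 1]; patient 2 -> ['negative', 0, 2]
print(id2data)
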
Example #3
	def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False, counter=0):
		'''inserts data from the specified csv and corresponding columns'''
	
		important_features = ['CHOLBMT', 'RRDIKA', 'RRSYKA']

		# make convenient reference to the dictionary
		dct = self.id2data
		rows = rows.where((pd.notnull(rows)), None)

		# # get data and corresponding headers
		# rows, headers = util.import_data(f, delim=self.delim)

		# get the index of the relevant columns
		# ID_idx = headers.index(self.ID_column)
		code_idx = headers.index(code_column) + 1
		date_idx = headers.index(date_column[0]) + 1
		
		
		# regex pattern to match (ATC/ICPC standards)
		pattern = re.compile(regex_string)

		if 'lab_results' in suffix:
			values_dict = dict()
			# val_idx = headers.index('valuen') + 1

			# pair IDs with a dict corresponding to data and dates
			for row in rows.itertuples():  # line in the data
				code = row[code_idx]
				# if we do not know the high and low values, determine by data distribution
				if code not in important_features:
					if not code in values_dict:
						try:
							values_dict[code] = [float(row.valuen)]
						except ValueError:
							continue
						except TypeError:
							continue
					else:
						try:
							values_dict[code].append(float(row.valuen))
						except ValueError:
							continue
						except TypeError:
							continue
							
			minmax_dict = self.calculate_minmax(values_dict, pattern, limit)
			

		if incorporate_SOEP:
			SOEP_idx = headers.index(incorporate_SOEP)

		# keep track of number of times the row is attributed to a positive stroke patient (or patient where the target instance = 'positive')
		num_pos = 0
		num_total = 0
		attribute_count = dict()
		# iterate over all instances, making a new dict with the new attributes as keys
		attribute2ids = dict()

		max_rows = 1000000000000000000
		current = 0 

		for row in tqdm(rows.itertuples()):
			current += 1	
			# row = row.split(';')

			if current > max_rows:
				break
			else:
				num_total+=1

				# if key is not in the data dictionary, we skip it
				key = row.Index
				
				if not key in dct:
					continue

				if dct[key]['stroke_dates'][0] != 'negative':
					num_pos+=1

				# init other vars
				date = str2date(row[date_idx], give_default_begin=True, give_default_end=True)
				begin = dct[key]['stroke_dates'][1]
				end = dct[key]['stroke_dates'][2]

				if code_column == 'specialisme':
					end = end - four_weeks()

				original_code = row[code_idx]
				if original_code is None:
					continue

				truncated_code = self.generate_code(original_code, limit)
				if truncated_code is None or truncated_code in ['K90', 'K89', 'k90', 'k89']:
					continue
				
				if not self.marshall_predictor(truncated_code, code_column):
					continue
				
				# if in the required interval and code is valid
				if (begin <= date and date <= end) and pattern.match(truncated_code):
					# if we do not care about SOEPcode (always except for journaal case) or the SOEPcode is E
					# if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):
					
						if 'lab_results' in suffix: # if we prepare for lab result abstraction						
							try:
								val = float(row.valuen)
								if not original_code in important_features:
									min_val = minmax_dict[truncated_code]['low_bound']
									max_val = minmax_dict[truncated_code]['high_bound']
								else:
									min_val, max_val = self.determine_minmax(original_code)

							except ValueError:
								continue

							except TypeError:
								continue

							if not 'ID2abstractions' in locals():
								# dict (patient) of dict (lab measurement name) of list of tuples (all value/date combinations of measurement)
								ID2abstractions = dict()
							
							util.init_key(ID2abstractions, key, dict())
							util.init_key(ID2abstractions[key], original_code, [])

							ID2abstractions[key][original_code].append((date, val))

							if '' not in [val, min_val, max_val]:
								attr = get_value(val, min_val, max_val, original_code)

								if not attr in attribute_count:
									attribute_count[attr] = 0

								# check if attribute name and ID instance already exist, if not, make them
								util.init_key(attribute2ids, attr, dict())
								util.init_key(attribute2ids[attr], key, 0)
								
								# add 1 to the occurrence of the attribute in the instance
								attribute2ids[attr][key] += 1
								attribute_count[attr] += 1

						else: # else no lab result collection, regular aggregation
							# generate attribute names

							if 'cardiometabolism' in suffix:
								# val_idx = headers.index('valuec')
								value = str(row.valuec)
							
							else:
								value = None

							attributes = self.generate_attributes(original_code, limit, suffix, value, src=code_column)
							# this loop allows multiple attributes to be created in the previous code line
							# this allows for other classes to subclass this class, e.g. StandardEnrichProcess
							for attr in attributes:
								if not attr in attribute_count:
									attribute_count[attr] = 0

								# print truncated_code, attr
								# check if attribute name and ID instance already exist, if not, make them
								util.init_key(attribute2ids, attr, dict())
								util.init_key(attribute2ids[attr], key, 0)

								# add 1 to the occurrence of the attribute in the instance, except if attribute is binary
								if 'smoking' in suffix:
									if attribute2ids[attr][key] == 1:
										continue

								if 'allergies' in suffix:
									# val_idx = headers.index('flag')
									value = row.flag

									# check if the person actually has the allergy that was tested for
									if value == 'POS':
										attribute2ids[attr][key] = 1
									# if negative or not tested, it is assumed the person does not have that particular allergy
									else:
										attribute2ids[attr][key] = 0

								else:
									attribute2ids[attr][key] += 1
									attribute_count[attr] += 1
		
		for attr, count in attribute_count.items():
			try:
				self.statistics[attr + '_count/min/max'] = [count, min_val, max_val]
			except UnboundLocalError:
				self.statistics[attr + '_count'] = count

		if 'lab_results' in suffix: # do funky stuff with trends and abstractions
			# convert to trends PER lab result
			for ID in ID2abstractions:
				# print ID2abstractions[ID]
				for k, points in ID2abstractions[ID].items():
					
					# the values are sorted before abstraction
					points = sorted(list(set(points)))

					# abstract the values and count the occurrences per measurement-trend per patient
					# if only 1 measurement was done, we cannot do time series analysis
					if len(points) > 1 and ID in dct: 
						abstractions = get_trends(k, points)
						for attr in abstractions:
							attr = attr[0] # get the state
							util.init_key(attribute2ids, attr, dict())
							util.init_key(attribute2ids[attr], ID, 0)
							attribute2ids[attr][ID] += 1
		# print len(attribute2ids)
		# print attribute2ids.keys()[0:5]
		
		

		# add data to each instance
		to_save = {}

		for ID in dct:
			to_save[ID] = []

		for ID in dct:
			data = dct[ID]['data']
			# to_save[ID] = []

			for id2occurrences in attribute2ids.values():
				
				# if patient has occurrences for the attribute, add that number, else add 0
				if ID in id2occurrences: 
					data.append(id2occurrences[ID])
					to_save[ID].append(id2occurrences[ID])

				else:
					data.append(0)
					to_save[ID].append(0)

		save_obj(self.statistics, self.in_dir + suffix[0]+ '_statistics.pkl')

		if self.survival:
			save_obj(to_save, self.in_dir + suffix[0] + '_dict_marshall' + str(counter)+ '_survival' + '.pkl')
			save_obj(list(attribute2ids.keys()), self.in_dir + suffix[0]  + '_headers'+ str(counter) + '.pkl')
		else:
			save_obj(to_save, self.in_dir + suffix[0] + '_dict_marshall' + str(counter) + '.pkl')
			save_obj(list(attribute2ids.keys()), self.in_dir + suffix[0] + '_headers'+  str(counter) + '.pkl')


		# return the keys to be used as headers when writing the processed data
		return list(attribute2ids.keys()), num_total, num_pos, suffix
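
# --- Illustrative sketch (not part of the original source) ---
# calculate_minmax is not shown in this excerpt. Based on the comment "if we do
# not know the high and low values, determine by data distribution", one
# plausible (assumed) implementation derives a low and high bound per lab code
# from the observed values, e.g. via percentiles. The name, signature, and
# percentile choice below are illustrative only; the real helper also takes the
# regex pattern and the truncation limit.

import numpy as np

def calculate_minmax_sketch(values_dict, low_pct=5, high_pct=95):
    """Assumed behaviour: map each lab code to distribution-based bounds."""
    minmax = {}
    for code, values in values_dict.items():
        minmax[code] = {
            'low_bound': float(np.percentile(values, low_pct)),
            'high_bound': float(np.percentile(values, high_pct)),
        }
    return minmax

# e.g. calculate_minmax_sketch({'GLUC': [4.2, 5.1, 6.3, 11.0]})
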
	def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
		'''inserts data from the specified csv and corresponding columns'''

		# make convenient reference to the dictionary
		dct = self.id2data

		# # get data and corresponding headers
		# rows, headers = util.import_data(f, delim=self.delim)

		# get the index of the relevant columns
		ID_idx = headers.index(self.ID_column)
		code_idx = headers.index(code_column)
		date_idx = headers.index(date_column[0])
		
		if suffix == 'lab_results':
			val_idx = headers.index('waarde')
			min_idx = headers.index('referentie_minimum')
			max_idx = headers.index('referentie_maximum')
		if incorporate_SOEP:
			SOEP_idx = headers.index(incorporate_SOEP)
		
		# get the right suffix to append for the attribute name
		if suffix == '':
			suffix = code_column

		# regex pattern to match (ATC/ICPC standards)
		pattern = re.compile(regex_string)

		# keep track of number of times the row is attributed to a positive CRC patient (or patient where the target instance = 'positive')
		num_pos = 0
		num_total = 0

		# iterate over all instances, making a new dict with the new attributes as keys
		attribute2ids = dict()
		for row in rows:
			num_total+=1

			# if key is not in the data dictionary, we skip it
			key = int(row[ID_idx])
			if not key in dct:
				continue

			if dct[key]['CRC_dates'][0] != 'negative':
				num_pos+=1

			# init other vars
			date = str2date(row[date_idx], give_default_begin=True)
			begin = dct[key]['CRC_dates'][3]
			end = dct[key]['CRC_dates'][4]
			if code_column == 'specialisme':
				end = end - four_weeks()

			original_code = row[code_idx]
			if original_code is None:
				continue
			truncated_code = self.generate_code(original_code, limit)
			if truncated_code is None:
				continue
			if suffix == 'lab_results':
				val, min_val, max_val = self.make_lab_values(row[val_idx], row[min_idx], row[max_idx])
				if val == '':
					continue

			# if in the required interval and code is valid
			if (begin <= date and date <= end) and pattern.match(truncated_code):
				
				# if we do not care about SOEPcode (always except for journaal case) or the SOEPcode is E
				if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):
				
					if suffix == 'lab_results': # if we prepare for lab result abstraction
						if not 'ID2abstractions' in locals():
							# dict (patient) of dict (lab measurement name) of list of tuples (all value/date combinations of measurement)
							ID2abstractions = dict()
						
						util.init_key(ID2abstractions, key, dict())
						util.init_key(ID2abstractions[key], original_code, [])

						ID2abstractions[key][original_code].append((date, val))

						if '' not in [val, min_val, max_val]:
							attr = abstracts.get_value(val, min_val, max_val, original_code)

							# check if attribute name and ID instance already exist, if not, make them
							util.init_key(attribute2ids, attr, dict())
							util.init_key(attribute2ids[attr], key, 0)

							# add 1 to the occurrence of the attribute in the instance
							attribute2ids[attr][key] += 1

					else: # else no lab result collection, regular aggregation
						# generate attribute names
						attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)
						
						# this loop allows multiple attributes to be created in the previous code line
						# this allows for other classes to subclass this class, e.g. StandardEnrichProcess
						for attr in attributes:
							# print truncated_code, attr
							# check if attribute name and ID instance already exist, if not, make them
							util.init_key(attribute2ids, attr, dict())
							util.init_key(attribute2ids[attr], key, 0)

							# add 1 to the occurrence of the attribute in the instance
							attribute2ids[attr][key] += 1

		if suffix == 'lab_results': # do funky stuff with trends and abstractions
			# convert to trends PER lab result
			for ID in ID2abstractions:
				# print ID2abstractions[ID]
				for k, points in ID2abstractions[ID].items():
					
					# the values are sorted before abstraction
					points = sorted(list(set(points)))

					# abstract the values and count the occurrences per measurement-trend per patient
					# if only 1 measurement was done, we cannot do time series analysis
					if len(points) > 1 and ID in dct: 
						abstractions = abstracts.get_trends(k, points)
						for attr in abstractions:
							attr = attr[0] # get the state
							util.init_key(attribute2ids, attr, dict())
							util.init_key(attribute2ids[attr], ID, 0)
							attribute2ids[attr][ID] += 1
		# print len(attribute2ids)
		# print attribute2ids.keys()[0:5]
		
		# add data to each instance
		for ID in dct:
			data = dct[ID]['data']

			for id2occurrences in attribute2ids.values():
				
				# if patient has occurrences for the attribute, add that number, else add 0
				if ID in id2occurrences: 
					data.append(id2occurrences[ID])
				else:
					data.append(0)

		# return the keys to be used as headers when writing the processed data
		return attribute2ids.keys(), num_total, num_pos
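
# --- Illustrative sketch (not part of the original source) ---
# util.init_key is used throughout to create a key with a default value only
# when it is not present yet, before the counters are incremented. Its actual
# implementation is not part of this excerpt; judging by how it is called, it
# presumably behaves like dict.setdefault. A minimal assumed version, with a
# toy attribute name and patient ID:

def init_key(d, key, default):
    """Assumed behaviour: initialise d[key] with default only if key is absent."""
    if key not in d:
        d[key] = default

attribute2ids = {}
init_key(attribute2ids, 'atc_A10', {})
init_key(attribute2ids['atc_A10'], 12345, 0)
attribute2ids['atc_A10'][12345] += 1
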
Example #5
    def insert_data(self,
                    rows,
                    headers,
                    code_column,
                    date_column,
                    regex_string,
                    limit,
                    suffix='',
                    incorporate_SOEP=False):
        '''inserts data from the specified csv and corresponding columns'''

        # make convenient reference to the dictionary
        dct = self.id2data

        # # get data and corresponding headers
        # rows, headers = util.import_data(f, delim=self.delim)

        # get the index of the relevant columns
        ID_idx = headers.index(self.ID_column)
        code_idx = headers.index(code_column)
        date_idx = headers.index(date_column[0])

        if incorporate_SOEP:
            SOEP_idx = headers.index(incorporate_SOEP)

        # get the right suffix to append for the attribute name
        if suffix == '':
            suffix = code_column

        # regex pattern to match (ATC/ICPC standards)
        pattern = re.compile(regex_string)

        # iterate over all instances, making a new dict with the new attributes as keys
        attribute2counts = defaultdict(dict)
        for row in rows:

            # if key is not in the data dictionary, we skip it
            key = row[ID_idx]
            if not key in dct:
                continue

            # init other vars
            date = str2date(row[date_idx])
            begin = dct[key]['stroke_dates'][3]
            end = dct[key]['stroke_dates'][4]
            original_code = row[code_idx]

            # if we do not care about SOEPcode (always except for journaal case) or the SOEPcode is E
            if (not incorporate_SOEP) or (incorporate_SOEP
                                          and row[SOEP_idx] == 'E'):

                # generate attribute names
                attributes = self.generate_attributes(original_code,
                                                      limit,
                                                      suffix,
                                                      src=code_column)

                # this loop allows multiple attributes to be created in the previous code line
                # this allows for other classes to subclass this class, e.g. StandardEnrichProcess
                for attr in attributes:

                    # check if attribute name and ID instance already exist, if not, make them
                    util.init_key(attribute2counts, attr, defaultdict(dict))
                    util.init_key(attribute2counts[attr], key, 0)

                    # add 1 to the occurrence of the attribute in the instance
                    attribute2counts[attr][key] += 1

        # add data to each instance
        for ID in dct:
            data = dct[ID]['data']

            for id2occurrences in attribute2counts.values():

                # if patient has occurrences for the attribute, add that number, else add 0
                if ID in id2occurrences:
                    data.append(id2occurrences[ID])
                else:
                    data.append(0)

        # return the keys to be used as headers when writing the processed data
        return list(attribute2counts.keys())
	def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
		'''inserts data from the specified csv and corresponding columns'''

		# make convenient reference to the dictionary
		dct = self.id2data

		# # get data and corresponding headers
		# rows, headers = util.import_data(f, delim=self.delim)

		# get the index of the relevant columns
		ID_idx = headers.index(self.ID_column)
		code_idx = headers.index(code_column)
		date_idx = headers.index(date_column[0])
		
		if incorporate_SOEP:
			SOEP_idx = headers.index(incorporate_SOEP)
		
		# get the right suffix to append for the attribute name
		if suffix == '':
			suffix = code_column

		# regex pattern to match (ATC/ICPC standards)
		pattern = re.compile(regex_string)

		# iterate over all instances, making a new dict with the new attributes as keys
		attribute2counts = dict()
		for row in rows:

			# if key is not in the data dictionary, we skip it
			key = int(row[ID_idx])
			if not key in dct:
				continue

			# init other vars
			date = str2date(row[date_idx])
			begin = dct[key]['CRC_dates'][3]
			end = dct[key]['CRC_dates'][4]
			original_code = row[code_idx]
				
			# if we do not care about SOEPcode (always except for journaal case) or the SOEPcode is E
			if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):
			
				# generate attribute names
				attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)
				
				# this loop allows multiple attributes to be created in the previous code line
				# this allows for other classes to subclass this class, e.g. StandardEnrichProcess
				for attr in attributes:

					# check if attribute name and ID instance already exist, if not, make them
					util.init_key(attribute2counts, attr, dict())
					util.init_key(attribute2counts[attr], key, 0)

					# add 1 to the occurrence of the attribute in the instance
					attribute2counts[attr][key] += 1
		
		# add data to each instance
		for ID in dct:
			data = dct[ID]['data']

			for id2occurrences in attribute2counts.values():
				
				# if patient has occurrences for the attribute, add that number, else add 0
				if ID in id2occurrences: 
					data.append(id2occurrences[ID])
				else:
					data.append(0)

		# return the keys to be used as headers when writing the processed data
		return list(attribute2counts.keys())
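
# --- Illustrative sketch (not part of the original source) ---
# abstracts.get_value is called as get_value(val, min_val, max_val, code) and
# its result is used as a single attribute name. Its real implementation is not
# part of this excerpt; an assumed version that maps a measurement onto a
# low/normal/high state label relative to the reference range could look like:

def get_value_sketch(val, min_val, max_val, code):
    """Assumed behaviour: label a lab value relative to its reference range."""
    val, min_val, max_val = float(val), float(min_val), float(max_val)
    if val < min_val:
        return '{}_low'.format(code)
    if val > max_val:
        return '{}_high'.format(code)
    return '{}_normal'.format(code)

# e.g. get_value_sketch(7.9, 4.0, 6.5, 'GLUC') -> 'GLUC_high'
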
	def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
		'''inserts data from the specified csv and corresponding columns'''

		# make convenient reference to the dictionary
		dct = self.id2data

		# # get data and corresponding headers
		# rows, headers = util.import_data(f, delim=self.delim)

		# get the index of the relevant columns
		ID_idx = headers.index(self.ID_column)
		code_idx = headers.index(code_column)
		b_date_idx = headers.index(date_column[0])
		e_date_idx = headers.index(date_column[1])
		if suffix == 'lab_results':
			val_idx = headers.index('waarde')
			min_idx = headers.index('referentie_minimum')
			max_idx = headers.index('referentie_maximum')
		if incorporate_SOEP:
			SOEP_idx = headers.index(incorporate_SOEP)

		# get the right suffix to append for the attribute name
		if suffix == '':
			suffix = code_column

		# regex pattern to match (ATC/ICPC standards)
		pattern = re.compile(regex_string)

		# iterate over all instances
		for row in rows:

			# if key is not in the data dictionary, we skip it
			key = int(row[ID_idx])
			if not key in dct:
				continue

			# init other vars
			b_date = str2date(row[b_date_idx], give_default_begin=True) # begin of event
			e_date = str2date(row[e_date_idx], give_default_end=True) # end of event
			b_reg = dct[key]['CRC_dates'][3] # beginning of registration
			e_reg = dct[key]['CRC_dates'][4] # ending of registration
			if code_column == 'specialisme':
				e_reg = e_reg - four_weeks()

			original_code = row[code_idx]
			if original_code is None:
				continue
			truncated_code = self.generate_code(original_code, limit) 
			if truncated_code is None:
				continue
			if suffix == 'lab_results':
				val, min_val, max_val = self.make_lab_values(row[val_idx], row[min_idx], row[max_idx])
				if val == '':
					continue

			# if in the required interval (either beginning or ending date) AND code is valid
			if ( (b_reg <= b_date and b_date <= e_reg) or (b_reg <= e_date and e_date <= e_reg) ) and pattern.match(truncated_code):
				
				# if we need to take the SOEP code of consults into account
				if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):

					# generate attribute names
					if suffix == 'lab_results': # if we prepare for lab result abstraction
						if not 'ID2abstractions' in locals():
							# dict (patient) of dict (lab measurement name) of list of tuples (all value/date combinations of measurement)
							ID2abstractions = dict()
						
						util.init_key(ID2abstractions, key, dict())
						util.init_key(ID2abstractions[key], original_code, [])

						ID2abstractions[key][original_code].append((b_date, val))
					
						if '' not in [val, min_val, max_val]:
							attributes = [abstracts.get_value(val, min_val, max_val, original_code)]

							# # add value abstraction as state interval
							# self.insert_state_interval(key, attr, b_date, e_date)
						else:
							attributes = []

					else:
						attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)

					# this loop allows multiple attributes to be created in the previous code line
					# this allows for other classes to subclass this class, e.g. SequenceEnrichProcess
					for attr in attributes:

						# insert a StateInterval object with the specified parameters
						self.insert_state_interval(key, attr, b_date, e_date, original_code, code_column)

		if suffix == 'lab_results': # do funky stuff with trends and abstractions
			# convert to trends PER lab result
			for ID in ID2abstractions:
				# print ID2abstractions[ID]
				for k, points in ID2abstractions[ID].items():
					
					# the values are sorted before abstraction
					points = sorted(list(set(points)))

					# abstract the values and append to the current patient's sequence
					# if only 1 measurement was done, we cannot do time series analysis
					if len(points) > 1 and ID in dct: 
						abstractions = abstracts.get_trends(k, points)
						for abstraction in abstractions:
							self.insert_state_interval(ID, *abstraction, original_code=original_code, src=code_column)
						# self.id2data[ID]['data'] = self.id2data[ID]['data'] + abstractions
		
		# to satisfy return value requirement for the method 'process' in the superclass
		return [], -1, -1
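
# --- Illustrative sketch (not part of the original source) ---
# str2date is used with give_default_begin/give_default_end flags, which
# suggests it substitutes a very early or very late date when the field is
# empty, so that open-ended events still pass the interval checks. The real
# helper (and the date format it expects) lives elsewhere; the version below,
# including the sentinel dates and the '%Y-%m-%d' format, is an assumption.

from datetime import date, datetime

def str2date_sketch(s, give_default_begin=False, give_default_end=False, fmt='%Y-%m-%d'):
    """Assumed behaviour: parse a date string, falling back to a sentinel date."""
    if not s or str(s).strip() == '':
        if give_default_begin:
            return date(1900, 1, 1)   # assumed "long before registration" sentinel
        if give_default_end:
            return date(2100, 1, 1)   # assumed "long after registration" sentinel
        return None
    return datetime.strptime(str(s), fmt).date()
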
Example #8
    def insert_data(self,
                    rows,
                    headers,
                    code_column,
                    date_column,
                    regex_string,
                    limit,
                    suffix='',
                    incorporate_SOEP=False,
                    counter=0):
        '''inserts data from the specified csv and corresponding columns'''

        important_features = ['CHOLBMT', 'RRDIKA', 'RRSYKA']

        # read rows into list to re-use
        rows = rows.where((pd.notnull(rows)), None)

        # make convenient reference to the dictionary
        dct = self.id2data

        # # get data and corresponding headers
        # rows, headers = util.import_data(f, delim=self.delim)

        # get the index of the relevant columns
        # ID_idx = headers.index(self.ID_column)
        code_idx = headers.index(code_column) + 1
        b_date_idx = headers.index(date_column[0]) + 1
        e_date_idx = headers.index(date_column[1]) + 1

        # if incorporate_SOEP:
        # 	SOEP_idx = headers.index(incorporate_SOEP)

        # regex pattern to match (ATC/ICPC standards)
        pattern = re.compile(regex_string)

        if 'lab_results' in suffix:
            values_dict = dict()
            # val_idx = headers.index('valuen') + 1

            # pair IDs with a dict corresponding to data and dates
            for row in rows.itertuples():  #line in de data
                code = row[code_idx]
                # if we do not know the high and low values, determine by data distribution
                if code not in important_features:
                    if not code in values_dict:
                        try:
                            values_dict[code] = [float(row.valuen)]
                        except ValueError:
                            continue
                        except TypeError:
                            continue
                    else:
                        try:
                            values_dict[code].append(float(row.valuen))
                        except ValueError:
                            continue
                        except TypeError:
                            continue

            minmax_dict = self.calculate_minmax(values_dict, pattern, limit)

        # keep track of number of times the row is attributed to a positive stroke patient (or patient where the target instance = 'positive')
        num_pos = 0
        num_total = 0
        attribute_count = dict()
        # iterate over all instances, making a new dict with the new attributes as keys
        attribute2ids = dict()

        max_rows = 100000000000000000
        current = 0

        # iterate over all instances
        for row in tqdm(rows.itertuples()):
            current += 1
            # row = row.split(';')

            if current > max_rows:
                break
            else:
                num_total += 1

                # if key is not in the data dictionary, we skip it
                key = row.Index

                if not key in dct:
                    continue

            # init other vars
            b_date = str2date(row[b_date_idx],
                              give_default_begin=True)  # begin of event
            e_date = str2date(row[e_date_idx],
                              give_default_end=True)  # end of event
            b_reg = dct[key]['stroke_dates'][1]  # beginning of registration
            e_reg = dct[key]['stroke_dates'][2]  # ending of registration
            # print('wddup')
            # print(b_reg, e_reg)
            # print('xxx')

            # print(dct[key]['stroke_dates'][3], dct[key]['stroke_dates'][4])
            original_code = row[code_idx]
            if original_code is None:
                continue

            truncated_code = self.generate_code(original_code, limit)
            if truncated_code is None or truncated_code in [
                    'K90', 'K89', 'k90', 'k89'
            ]:
                continue

            # print(b_reg, b_date, e_date)
            # print(b_reg <= b_date)
            # print(b_date <= e_reg)
            # print(b_reg <= e_date)
            # print(e_date <= e_reg)
            # if in the required interval (either beginning or ending date) AND code is valid
            if ((b_reg <= b_date and b_date <= e_reg) or
                (b_reg <= e_date
                 and e_date <= e_reg)) and pattern.match(truncated_code):

                # if we need to take the SOEP code of consults into account
                # if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):

                # generate attribute names
                if 'lab_results' in suffix:  # if we prepare for lab result abstraction

                    try:
                        val = float(row.valuen)
                        if not original_code in important_features:
                            min_val = minmax_dict[truncated_code]['low_bound']
                            max_val = minmax_dict[truncated_code]['high_bound']

                        else:
                            min_val, max_val = self.determine_minmax(
                                original_code)

                    except ValueError:
                        continue

                    except TypeError:
                        continue

                    val, min_val, max_val = self.make_lab_values(
                        val, min_val, max_val)

                    if not 'ID2abstractions' in locals():
                        # dict (patient) of dict (lab measurement name) of list of tuples (all value/date combinations of measurement)
                        ID2abstractions = dict()

                    util.init_key(ID2abstractions, key, dict())
                    util.init_key(ID2abstractions[key], original_code, [])

                    ID2abstractions[key][original_code].append((b_date, val))

                    if '' not in [val, min_val, max_val]:
                        attributes = [
                            get_value(val, min_val, max_val, original_code)
                        ]

                        # # add value abstraction as state interval
                        # self.insert_state_interval(key, attr, b_date, e_date)
                    else:
                        attributes = []

                else:
                    if 'cardiometabolism' in suffix:
                        val_idx = headers.index('valuec') + 1
                        value = str(row[val_idx])

                    else:
                        value = None

                    attributes = self.generate_attributes(original_code,
                                                          limit,
                                                          suffix,
                                                          value,
                                                          src=code_column)

                # this loop allows multiple attributes to be created in the previous code line
                # this allows for other classes to subclass this class, e.g. SequenceEnrichProcess
                for attr in attributes:
                    if 'allergies' in suffix:
                        # val_idx = headers.index('flag')
                        value = row.flag

                        # check if the person actually has the allergy that was tested for;
                        # if negative or not tested, it is assumed the person does not
                        # have that particular allergy, so the event is skipped
                        if value != 'POS':
                            continue

                    # insert a StateInterval object with the specified parameters
                    self.insert_state_interval(key, attr, b_date, e_date,
                                               original_code, code_column)

        if 'lab_results' in suffix:  # do funky stuff with trends and abstractions
            # convert to trends PER lab result
            for ID in ID2abstractions:
                # print ID2abstractions[ID]
                for k, points in ID2abstractions[ID].items():

                    # the values are sorted before abstraction
                    points = sorted(list(set(points)))

                    # abstract the values and append to the current patient's sequence
                    # if only 1 measurement was done, we cannot do time series analysis
                    if len(points) > 1 and ID in dct:
                        abstractions = get_trends(k, points)
                        for abstraction in abstractions:
                            self.insert_state_interval(
                                ID,
                                *abstraction,
                                original_code=original_code,
                                src=code_column)
                        # self.id2data[ID]['data'] = self.id2data[ID]['data'] + abstractions

        # add data to each instance
        to_save = {}

        for ID in dct:
            to_save[ID] = []

        for ID in dct:
            data = dct[ID]['data']
            # to_save[ID] = []

            for id2occurrences in attribute2ids.values():

                # if patient has occurrences for the attribute, add that number, else add 0
                if ID in id2occurrences:
                    data.append(id2occurrences[ID])
                    to_save[ID].append(id2occurrences[ID])

                else:
                    data.append(0)
                    to_save[ID].append(0)

        if self.survival:
            save_obj(
                to_save, self.in_dir + suffix[0] + '_dict_temporal' +
                str(counter) + '_survival' + '.pkl')
            save_obj(
                list(attribute2ids.keys()), self.in_dir + suffix[0] +
                'temporal_headers' + str(counter) + '.pkl')
        else:
            save_obj(
                to_save, self.in_dir + suffix[0] + '_dict_temporal' +
                str(counter) + '.pkl')
            save_obj(
                list(attribute2ids.keys()), self.in_dir + suffix[0] +
                'temporal_headers' + str(counter) + '.pkl')
        # to satisfy return value requirement for the method 'process' in the superclass
        return [], -1, -1
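
# --- Illustrative sketch (not part of the original source) ---
# save_obj is not defined in this excerpt; given the '.pkl' file names it is
# presumably a thin pickle wrapper. An assumed minimal version:

import pickle

def save_obj_sketch(obj, path):
    """Assumed behaviour: pickle obj to the given path."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)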