def temporal(self, dct, now, args):
		needs_processing = {k: bool(v.get()) for k, v in dct['temporal_specific'].items()}

		out_dir = dct['out_dir'].get() + '/' + now + '/data/'
		util.make_dir(out_dir)
		min_sup = float(dct['temporal_specific']['support'].get())
		
		if not dct['temporal_specific']['sequences_available'].get():
			# if enrichment is enabled, we create a different object instance than usual
			if dct['enrich'].get():
				seq_p = SequenceEnrichProcess(*args, mapping_files_dir=dct['mapping_dir'].get())
				name = 'sequences_enriched'
			elif dct['temporal_specific']['anti-knowledge-driven'].get():
				seq_p = NonMarshallSequenceProcess(*args)
				name = 'sequences_excl_marshall'				
			else:
				seq_p = SequenceProcess(*args)
				name = 'sequences'

			seq_p.process(needs_processing)
			seq_p.sort_sequences()
			seq_p.save_output(sequence_file=True, sub_dir='data/tmprl', name=name)

			generate_pattern_occurrences_per_patient(out_dir, seq_p.id2data, min_sup, dct['mapping_dir'].get())
			sequence_f = out_dir + 'tmprl/{}.csv'.format(name)
		else:
			sequence_f = dct['temporal_specific']['sequence_file'].get()
			generate_pattern_occurrences_per_patient(out_dir, sequence_f, min_sup, dct['mapping_dir'].get())
	def export(self, out_dir):
		'''export results'''
		util.make_dir(out_dir)
		
		io.dict2csv(self.code2indications, out_dir + 'indication.csv')
		io.dict2csv(self.code2effects, out_dir + 'effect.csv')
		io.dict2csv(self.code2ingredients, out_dir + 'ingredient.csv')
Example #3
    def export(self, out_dir):
        '''export results'''
        util.make_dir(out_dir)

        io.dict2csv(self.code2manifestation_of,
                    out_dir + 'manifestationof.csv')
        io.dict2csv(self.code2association, out_dir + 'association.csv')
Example #4
    def export(self, out_dir):
        '''export results'''
        util.make_dir(out_dir)

        io.dict2csv(self.code2indications, out_dir + 'indication.csv')
        io.dict2csv(self.code2effects, out_dir + 'effect.csv')
        io.dict2csv(self.code2ingredients, out_dir + 'ingredient.csv')
Example #5
    def save_statistics(self, sub_dir='data', name='unnamed'):
        out_dir = self.out_dir + '/' + sub_dir + '/'
        util.make_dir(out_dir)
        f_out = out_dir + name + '.csv'

        with open(f_out, 'w') as f:
            for key, value in self.statistics.items():
                f.write('%s:%s\n' % (key, value))
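A minimal sketch of reading this statistics file back, assuming the key:value-per-line format written by save_statistics above (the helper name load_statistics is hypothetical):

def load_statistics(f_in):
    '''hypothetical reader for the key:value lines written by save_statistics'''
    stats = {}
    with open(f_in) as f:
        for line in f:
            # split at the first colon, matching the '%s:%s' write format
            key, _, value = line.rstrip('\n').partition(':')
            stats[key] = value
    return stats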
	def go(self, button):
		'''initiates the associated algorithms '''
		dct = self.user_input

		button.config(text='Running', state=DISABLED)
		if dct['in_dir'].get() == 'input folder':
			dct['in_dir'].set('sql')
		if dct['delimiter'].get() == '':
			dct['delimiter'].set(',')
		if dct['out_dir'].get() == 'output folder':
			dct['out_dir'].set('./out')
		if dct['min_age'].get() == '':
			dct['min_age'].set(30)
		if dct['max_age'].get() == '':
			dct['max_age'].set(150)
		if dct['begin_interval'].get() == '':
			dct['begin_interval'].set(int(365./52*26+1))
		if dct['end_interval'].get() == '':
			dct['end_interval'].set(int(365./52*0+1))
		if dct['ID_column'].get() == '':
			dct['ID_column'].set('patientnummer')
		if dct['temporal_specific']['support'].get() == '':
			dct['temporal_specific']['support'].set(0.1)
		if dct['mapping_dir'].get() == 'semantic enrichment dir':
			dct['mapping_dir'].set('./out/semantics/')


		self.master.update_idletasks()

		now = util.get_current_datetime()
		util.make_dir(dct['out_dir'].get() + '/' + now + '/')

		HISes = [dct['PMO'].get(), dct['MDM'].get(), dct['LUMC'].get(), 
				 dct['VUMH'].get(), dct['VUMD'].get(), dct['VUSC'].get()]

		args = [dct['in_dir'].get(), 
				dct['delimiter'].get(),
				dct['out_dir'].get() + '/' + now, 
				dct['ID_column'].get(),
				int(dct['min_age'].get()),
				int(dct['max_age'].get()),
				[int(dct['end_interval'].get()), int(dct['begin_interval'].get())],
				dct['in_dir'].get().lower() == 'sql',
				HISes]
		
		if dct['process_temporal'].get(): # process temporally
			self.temporal(dct, now, args)
		else: # process atemporally
			self.regular(dct, now, args)

		pretty_dct = util.tkinter2var(dct)
		try:
			io.pprint_to_file(dct['out_dir'].get() + '/' + now + '/settings.txt', pretty_dct)
		except IOError as e:
			print(e)
Example #7
    def save_output(self,
                    benchmark=False,
                    sequence_file=False,
                    sub_dir='',
                    name='unnamed',
                    target=False):
        '''saves processed data to the specified output directory'''
        print('...saving processed data')  # to {}'.format('sql' if self.from_sql else 'file')

        headers = self.headers
        # if we didn't get the data from sql database, just save to .csv
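        # NOTE: the 'True or' below forces the CSV branch regardless of source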
        if True or not self.from_sql:
            # possibly make new directories
            out_dir = self.out_dir + '/' + sub_dir + '/'
            util.make_dir(out_dir)

            f_out = out_dir + name + '.csv'
            out = write_csv(f_out)

            # write headers where required
            if benchmark:
                out.writerow(headers[0:3])
            elif target:
                out.writerow([headers[0], headers[-1]])
            elif sequence_file:
                pass
            else:
                out.writerow([headers[0]] + headers[3:-1])

            # write data
            for value in self.id2data.values():
                data = value['data']
                if benchmark:
                    data = data[0:3]
                    data[2] = 1 if data[2] == 'V' else 0
                elif target:
                    if self.survival == False:
                        data = [data[0], 0 if data[-1] == 'negative' else 1]
                    else:
                        data = [data[0], data[-1]]
                elif sequence_file:
                    pass
                else:
                    data = [data[0]] + data[3:-1]
                out.writerow(data)
Example #8
    def temporal(self, dct, now, args):
        needs_processing = {
            k: bool(v.get())
            for k, v in dct['temporal_specific'].items()
        }

        out_dir = dct['out_dir'].get() + '/' + now + '/data/'
        util.make_dir(out_dir)
        # minimal support is set here
        min_sup = float(dct['temporal_specific']['support'].get())

        # if there are no sequences available
        if not dct['temporal_specific']['sequences_available'].get():
            # if enrichment is enabled, we create a different object instance than usual

            # if enriched
            # if dct['enrich'].get():
            # 	seq_p = SequenceEnrichProcess(*args, mapping_files_dir=dct['mapping_dir'].get())
            # 	name = 'sequences_enriched'
            # if not enriched and no marshall predictors
            if dct['temporal_specific']['anti-knowledge-driven'].get():
                seq_p = NonMarshallSequenceProcess(*args)
                name = 'sequences_excl_marshall'
            else:
                seq_p = SequenceProcess(*args)
                name = 'sequences'

            seq_p.process(needs_processing)
            seq_p.sort_sequences()
            seq_p.save_output(sequence_file=True,
                              sub_dir='data/tmprl',
                              name=name)

            generate_pattern_occurrences_per_patient(out_dir, seq_p.id2data,
                                                     min_sup,
                                                     dct['mapping_dir'].get())
            sequence_f = out_dir + 'tmprl/{}.csv'.format(name)
        else:
            sequence_f = dct['temporal_specific']['sequence_file'].get()
            generate_pattern_occurrences_per_patient(out_dir, sequence_f,
                                                     min_sup,
                                                     dct['mapping_dir'].get())
    def go(self, button):
        '''initiates the associated algorithms '''
        dct = self.user_input

        button.config(text='Running', state=DISABLED)
        if dct['in_dir'].get() == 'input folder':
            dct['in_dir'].set('/Users/Reiny/Documents/UI_CRC/playground')
        if dct['delimiter'].get() == '':
            dct['delimiter'].set(',')
        if dct['out_dir'].get() == 'output folder':
            dct['out_dir'].set('/Users/Reiny/Documents/UI_CRC/out')

        self.master.update_idletasks()

        util.make_dir(dct['out_dir'].get() + '/')

        HISes = [
            dct['PMO'].get(), dct['MDM'].get(), dct['LUMC'].get(),
            dct['VUMH'].get(), dct['VUMD'].get(), dct['VUSC'].get()
        ]

        args = [
            dct['in_dir'].get(), dct['delimiter'].get(),
            dct['out_dir'].get() + '/' + dct['output_id'].get() + '.csv',
            dct['age+gender'].get(), dct['counts_med'].get(),
            dct['counts_med_enrich'].get(), dct['counts_consult'].get(),
            dct['counts_consult_enrich'].get(), dct['counts_referral'].get(),
            dct['counts_lab'].get(), dct['tmprl'].get(),
            dct['enriched_tmprl'].get(), dct['knowledge_driven'].get(),
            dct['anti_knowledge_driven'].get(),
            dct['anti_knowledge_driven_tmprl'].get(), dct['separate'].get(),
            HISes
        ]

        # merge
        combine.execute(*args)

        button.config(text='Done')
        self.master.update_idletasks()
        time.sleep(0.5)
        button.config(text='Run!', state=NORMAL)
Example #10
    def go(self, button):
        '''initiates the associated algorithms '''
        dct = self.user_input

        button.config(text='Running', state=DISABLED)

        self.master.update_idletasks()

        util.make_dir(dct['f_out'].get())

        report = Report(dct['f_general'].get(), dct['f_data'].get(),
                        dct['f_predictors'].get(), dct['f_out'].get(),
                        float(dct['feature-threshold'].get()))
        report.compile()
        report.export()

        print('### Done processing ###')
        button.config(text='Done')
        self.master.update_idletasks()
        time.sleep(0.5)
        button.config(text='Run!', state=NORMAL)
	def go(self, button):
		'''initiates the associated algorithms '''
		dct = self.user_input

		button.config(text='Running', state=DISABLED)

		self.master.update_idletasks()

		util.make_dir(dct['f_out'].get())

		report = Report(dct['f_general'].get(),
						dct['f_data'].get(),
						dct['f_predictors'].get(),
						dct['f_out'].get(),
						float(dct['feature-threshold'].get())
			)
		report.compile()
		report.export()

		print('### Done processing ###')
		button.config(text='Done')
		self.master.update_idletasks()
		time.sleep(0.5)	
		button.config(text='Run!', state=NORMAL)
Example #12
    def go(self, button):
        '''initiates the associated algorithms '''
        dct = self.user_input

        button.config(text='Running', state=DISABLED)
        if dct['in_dir'].get() == 'input folder':
            dct['in_dir'].set('sql')
        if dct['delimiter'].get() == '':
            dct['delimiter'].set(',')
        if dct['out_dir'].get() == 'output folder':
            dct['out_dir'].set('./out')
        if dct['min_age'].get() == '':
            dct['min_age'].set(30)
        if dct['max_age'].get() == '':
            dct['max_age'].set(150)
        if dct['begin_interval'].get() == '':
            dct['begin_interval'].set(int(365. / 52 * 26 + 1))
        if dct['end_interval'].get() == '':
            dct['end_interval'].set(int(365. / 52 * 0 + 1))
        if dct['ID_column'].get() == '':
            dct['ID_column'].set('patientnummer')
        if dct['temporal_specific']['support'].get() == '':
            dct['temporal_specific']['support'].set(0.1)
        # if dct['mapping_dir'].get() == 'semantic enrichment dir':
        # 	dct['mapping_dir'].set('./out/semantics/')

        self.master.update_idletasks()

        now = util.get_current_datetime()
        util.make_dir(dct['out_dir'].get() + '/' + now + '/')

        # HISes = [dct['PMO'].get(), dct['MDM'].get(), dct['LUMC'].get(),
        # 		 dct['VUMH'].get(), dct['VUMD'].get(), dct['VUSC'].get()]

        args = [
            dct['in_dir'].get(), dct['delimiter'].get(),
            dct['out_dir'].get() + '/' + now, dct['ID_column'].get(),
            int(dct['min_age'].get()),
            int(dct['max_age'].get()),
            [int(dct['end_interval'].get()),
             int(dct['begin_interval'].get())],
            dct['in_dir'].get().lower() == 'sql', False,
            dct['survival'].get(), dct['already_processed'].get()
        ]

        if dct['process_temporal'].get():  # process temporally
            self.temporal(dct, now, args)
        else:  # process atemporally
            self.regular(dct, now, args)

        pretty_dct = util.tkinter2var(dct)
        try:
            io.pprint_to_file(
                dct['out_dir'].get() + '/' + now + '/settings.txt', pretty_dct)
        except IOError as e:
            print(e)

        print('### Done processing ###')
        button.config(text='Done')
        self.master.update_idletasks()
        time.sleep(0.5)
        button.config(text='Run!', state=NORMAL)
Example #13
def execute(in_dir, out_dir, record_id, target_id, day_id, day, algorithms, feature_selection, separate_testset, in_dir_test):
	'''executes the learning task on the data in in_dir with the algorithms in algorithms.
		The results are written to out_dir and subdirectories,
	    and the record_ and target_ids are used to differentiate attributes and non-attributes'''
	print('### executing learning algorithms on... ###')

	# get the files
	files = util.list_dir_csv(in_dir)

	# stop if no files found
	if not files:
		print('No appropriate csv files found. Select an input directory with appropriate files')
		return

	if separate_testset:
		files_test = util.list_dir_csv(in_dir_test)
	else:
		files_test = files

	# create directory
	util.make_dir(out_dir)

	# execute each algorithm
	for alg in algorithms:
		print('...{}'.format(alg))

		util.make_dir(out_dir+'/'+alg+'/')
		results_list = []
		if separate_testset:
			results_list2 = []
			util.make_dir(out_dir+'/'+alg+'_test/')

		# list which will contain the results

		# run algorithm alg for each file f
		for f, f_test in zip(files,files_test):
			fname = in_out.get_file_name(f, extension=False)
			print(' ...{}'.format(fname))

			# get data, split in features/target. If invalid stuff happened --> exit
			X, y, headers = in_out.import_data(f, record_id, target_id) # assumption: first column is patientnumber and is pruned, last is target
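			# in_out.import_data returns False on failure, hence the bool check below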
			if type(X) == bool: return

			# if separate_testset:
			# 	X, X_te = X
			# 	y, y_te = y
			# 	print '  ...train instances: {}, attributes: {}'.format(X.shape[0], X.shape[1])
			# 	print '  ...test instances: {}, attributes: {}'.format(X_te.shape[0], X_te.shape[1])
			# else:

			# Now remove the ones without a relevant day:

			new_headers = [h for h in headers if not h == day_id]
			day_index = headers.index(day_id)
			new_X = np.zeros((0, len(headers)))
			new_y = []

			for i in range(0, X.shape[0]):
				if X[i,headers.index(day_id)] == day:
					row = np.array(X[i,:]).reshape(-1)
					new_X = np.append(new_X, np.column_stack(row), axis=0)
					new_y.append(int(y[i]))
			new_X = np.delete(new_X, day_index, 1)
			X = new_X
			y = np.squeeze(np.asarray(new_y))

			print('  ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))


			model, best_features, results = execute_with_algorithm(alg, X, y, fname, headers, out_dir+'/'+alg+'/', record_id, target_id, feature_selection)
			results_list.append(results)

			if separate_testset:
				X, y, headers = in_out.import_data(f_test, record_id, target_id) # assumption: first column is patientnumber and is pruned, last is target
				if type(X) == bool: return

				print('  ...instances: {}, attributes: {} (test set)'.format(X.shape[0], X.shape[1]))

				results = predict_separate(X, y, fname, out_dir+'/'+alg+'_test/', record_id, target_id, feature_selection, model, best_features)
				results_list2.append(results)

		try:
			in_out.save_ROC(out_dir+'/'+alg+'/'+"roc.png", results_list, title='ROC curve')
		except IndexError:
			pass

		try:
			in_out.save_ROC(out_dir+'/'+alg+'_test/'+"roc.png", results_list2, title='ROC curve')
		except NameError:
			pass

	# notify user
	print('## Learning Finished ##')
Example #14
def execute(in_dir, out_dir, record_id, target_id, algorithms, feature_selection, separate_testset, in_dir_test, survival, oversampling, undersampling, aggregation):
	'''executes the learning task on the data in in_dir with the algorithms in algorithms.
		The results are written to out_dir and subdirectories,
	    and the record_ and target_ids are used to differentiate attributes and non-attributes'''
	print ('### executing learning algorithms on... ###')
	
	# get the files
	files = util.list_dir_csv(in_dir)

	# stop if no files found
	if not files:
		print ('No appropriate csv files found. Select an input directory with appropriate files')
		return

	if separate_testset:
		files_test = util.list_dir_csv(in_dir_test)
	else:
		files_test = files

	# create directory
	util.make_dir(out_dir)

	# execute each algorithm
	for alg in algorithms:
		print ('...{}'.format(alg))
	
		util.make_dir(out_dir+'/'+alg+'/')
		results_list = []	
		if separate_testset:
			results_list2 = []
			util.make_dir(out_dir+'/'+alg+'_test/')

		# list which will contain the results
	
		# run algorithm alg for each file f
		for f, f_test in zip(files,files_test):
			fname = in_out.get_file_name(f, extension=False)
			print (' ...{}'.format(fname))
	
			# get data, split in features/target. If invalid stuff happened --> exit
			X, y, headers, target_list = in_out.import_data(f, record_id, target_id, survival) # assumption: first column is patientnumber and is pruned, last is target
			if type(X) == bool: return
		
			if aggregation:
				X, headers = aggregations(f, target_list, survival)

			print ('  ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

			model, best_features, results = execute_with_algorithm(alg, X, y, fname, headers, out_dir+'/'+alg+'/', record_id, target_id, feature_selection, oversampling, survival, undersampling, aggregation)
			results_list.append(results)

			if separate_testset:
				X, y, headers = in_out.import_data(f_test, record_id, target_id) # assumption: first column is patientnumber and is pruned, last is target
				if type(X) == bool: return
				
				print ('  ...instances: {}, attributes: {} (test set)'.format(X.shape[0], X.shape[1]))			

				results = predict_separate(X, y, fname, out_dir+'/'+alg+'_test/', record_id, target_id, feature_selection, model, best_features)
				results_list2.append(results)

		try:
			in_out.save_ROC(out_dir+'/'+alg+'/'+"roc.png", results_list, title='ROC curve')
		except IndexError:
			pass
		
		try:
			in_out.save_ROC(out_dir+'/'+alg+'_test/'+"roc.png", results_list2, title='ROC curve')
		except NameError:
			pass

	# notify user
	print ('## Learning Finished ##')
Example #15
def execute_knn(in_dir, out_dir, record_id, target_id, day_id, day, k):
    '''executes the learning task on the data in in_dir with the algorithms in algorithms.
		The results are written to out_dir and sub_directories,
	    and the record_ and target_ids are used to differentiate attributes and non-attributes'''
    print('### executing learning algorithms on... ###')

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    # create directory
    util.make_dir(out_dir)

    # execute each algorithm

    # run algorithm alg for each file f
    for f in files:
        results_list = []
        fname = in_out.get_file_name(f, extension=False)
        print(' ...{}'.format(fname))

        # get data, split in features/target. If invalid stuff happened --> exit
        X, y, headers = in_out.import_data(
            f, record_id, target_id,
            True)  # assumption: first column is patientnumber
        if type(X) == bool: return

        day_index = headers.index(day_id)
        new_X = np.zeros((0, len(headers)))
        new_y = []

        IDs = []
        IDrows = {}

        # ordering of time points and complete data (filled with nan's if not available) assumed!

        #  		features_to_be_removed   =    [ "pvc_bin","pnc_bin","pac_bin","ect_freq_bin","full_code_bin","comfort_meas_bin","other_code_bin","no_cpr_bin",
        # 										"dnr_bin","dni_bin","fall_risk_bin","orientation_ord","orient_unable_ass_bin","riker_sas_ord","vent_bin",
        # 										"vent_mode_ord","pacemaker_bin","trach_bin","flush_skin_bin","jaundice_skin_bin","pale_skin_bin","impaired_skin_bin",
        # 										"iabp_ord","iabp_bin","svnsicu_bin","svcsicu_bin","svcsru_bin","svmicu_bin","svmsicu_bin","svother_bin","svccu_bin",
        # 										"gender"]

        exclude = [
            146, 140, 95, 123, 88, 133, 22, 65, 49, 114, 178, 55, 133, 138, 34,
            186, 20, 73
        ]
        new_index = 0
        for i in range(0, X.shape[0]):
            if X[i, headers.index(day_id)] == day or day == -1:
                row = np.array(X[i, :]).reshape(-1)

                if not row[0] in IDs and not row[0] in exclude:
                    IDs.append(row[0])
                    new_y.append(int(y[i]))
                    IDrows[row[0]] = [new_index]
                elif not row[0] in exclude:
                    IDrows[row[0]].append(new_index)
                new_X = np.append(new_X, np.column_stack(row), axis=0)
                new_index += 1

        ID_column = new_X[:, 0]

        # Remove the id, the day, and the time stamp from the data and headers.
        new_X = np.delete(new_X, 2, 1)
        new_X = np.delete(new_X, 1, 1)
        new_X = np.delete(new_X, 0, 1)
        new_headers = headers[3:len(headers)]

        dtw_attr = ['hr', 'resp', 'nbp', 'sbp', 'dbp', 'so2']

        X = new_X
        print(len(X))

        non_singular_rows = [
            i for i in range(0, X.shape[1])
            if len(set(util.get_non_nans(X[:, i].tolist()))) > 1
        ]
        #print str(len(non_singular_rows)) + ' ' + str(X.shape[1])
        #print non_singular_rows

        X = X[:, non_singular_rows]
        new_headers = np.array(new_headers)[non_singular_rows].tolist()
        print('{} headers remain after removing singular columns'.format(len(new_headers)))
        print(new_headers)
        print('Removed columns with only nans or a single value')
        max_values = np.nanmax(X, axis=0)
        min_values = np.nanmin(X, axis=0)

        ranges = []
        for i in range(0, len(min_values)):
            diff = max_values[i] - min_values[i]
            if diff == 0:
                print('difference of zero encountered in ' + str(i))
                print('Max values: ' + str(max_values[i]))
                print('Min values: ' + str(min_values[i]))
                ranges.append(1)
            else:
                ranges.append(diff)

        # Now do some scaling to get the values to the same order of magnitude;
        # use 'ranges' so constant columns cannot cause division by zero
        scaled_X = (X - min_values) / np.array(ranges)
        X = scaled_X
        y = np.squeeze(np.asarray(new_y))

        print "Scaling done!"

        new_IDrows = {}
        for ID in IDs:
            IDrows[ID] = {
                'first_row': min(IDrows[ID]),
                'last_row': max(IDrows[ID])
            }

        print('  ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

        # Now we are going to build the similarity matrix. We also store how
        # many attributes we were actually able to compare.

        similarity_matrix = np.ones((len(IDs), len(IDs)))
        matching_number_matrix = np.ones((len(IDs), len(IDs)))

        for attr in range(0, len(new_headers)):
            print str(attr) + "attribute in KNN loop"
            print str(attr) + "/" + str(len(new_headers))

            temp = np.ones((len(IDs), len(IDs)))
            temp[:] = 2
            for i in range(0, len(IDs)):
                for j in range(i + 1, len(IDs)):

                    i_data = X[IDrows[IDs[i]]['first_row']:
                               IDrows[IDs[i]]['last_row'] + 1, attr].tolist()
                    j_data = X[IDrows[IDs[j]]['first_row']:
                               IDrows[IDs[j]]['last_row'] + 1, attr].tolist()

                    if new_headers[attr] in dtw_attr:
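                        # lb_keogh computes a cheap lower bound on the DTW
                        # distance; 'window' is assumed to be a module-level setting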
                        dtw_distance = dtw.lb_keogh(i_data, j_data, window)

                        if not dtw_distance == -1:
                            temp[i, j] += dtw_distance
                            matching_number_matrix[i, j] += 1
                            matching_number_matrix[
                                j, i] = matching_number_matrix[i, j]
                            temp[j, i] = temp[i, j]
                    else:
                        i_data = util.get_non_nans(i_data)
                        j_data = util.get_non_nans(j_data)
                        if len(i_data) > 0 and len(j_data) > 0:
                            simple_distance = math.pow(
                                np.mean(i_data) - np.mean(j_data), 2)
                            temp[i, j] += simple_distance
                            matching_number_matrix[i, j] += 1
                            matching_number_matrix[
                                j, i] = matching_number_matrix[i, j]
                            temp[j, i] = temp[i, j]

            if np.max(temp) != 0:
                temp = temp / np.max(temp)
            similarity_matrix += temp

        # We calculate the average score per item matched
        # Best might be to apply a weighting scheme now.
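        # the extra 1/matching term below appears to penalize pairs with few
        # comparable attributes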
        similarity_matrix = (similarity_matrix / matching_number_matrix) + (
            1 / matching_number_matrix)

        print(len(IDs))
        results = perform_classification(similarity_matrix, y, out_dir, k)
        results_list.append(results)
        print(results)
        in_out.save_results(out_dir + str(k) + '.csv',
                            ["fpr", "tpr", "auc", "cm"],
                            results[1:len(results)], [sum(y), len(y)])
        in_out.save_ROC(out_dir + '/roc.png', results_list, title='ROC curve')

        # notify user
    print('## Learning Finished ##')
    print(similarity_matrix)
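A standalone sketch of the scaling step used in execute_knn above, assuming numpy is available; it min-max scales each column and substitutes a range of 1 for constant columns, mirroring the 'ranges' guard in the snippet (the helper name min_max_scale is hypothetical):

import numpy as np

def min_max_scale(X):
    '''min-max scale each column, guarding constant columns against a zero range'''
    max_values = np.nanmax(X, axis=0)
    min_values = np.nanmin(X, axis=0)
    diffs = max_values - min_values
    ranges = np.where(diffs == 0, 1, diffs)  # avoid division by zero
    return (X - min_values) / ranges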
def execute(in_dir, delim, out_file, age_gender=False, counts_med=False, 
				counts_med_enrich=False, counts_consult=False, counts_consult_enrich=False,
				counts_referral=False, counts_lab=False, tmprl=False, 
				enriched_tmprl=False, knowledge_driven=False, counts_no_knowledge=False, tmprl_no_knowledge=False,
				separate=False, HISes=[]):
	'''merge the in files to produce the out file'''
	merged = defaultdict(list)
	headers = ['ID']

	# we may not need this.
	ID2HIS = {}
	merged_test = defaultdict(list)

	# if we wish to separate, get dictionary of patient HIS sources using SQL.
	if separate:
		c = util.sql_connect().cursor()
		HISes_str = "','".join(HISes)
		q = '''SELECT patientnummer 
				FROM patienten
				WHERE PRAKTIJKCODE IN ('{}')'''.format(HISes_str)
		c.execute(q)
		
		ID2HIS = {row[0] : row[0] for row in c}
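		# note: this maps each patientnummer to itself; membership in ID2HIS
		# marks patients whose practice is in the selected HISes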

	if age_gender:
		headers = merge_file(in_dir+'/AG.csv', merged, headers, delim, separate, ID2HIS, merged_test)
		
	if counts_med:
		headers = merge_file(in_dir+'/C_M.csv', merged, headers, delim, separate, ID2HIS, merged_test)

	if counts_med_enrich:
		headers = merge_file(in_dir+'/C_M_enrich.csv', merged, headers, delim, separate, ID2HIS, merged_test)

	if counts_consult:
		headers = merge_file(in_dir+'/C_C.csv', merged, headers, delim, separate, ID2HIS, merged_test)
	
	if counts_consult_enrich:
		headers = merge_file(in_dir+'/C_C_enrich.csv', merged, headers, delim, separate, ID2HIS, merged_test)

	if counts_referral:
		headers = merge_file(in_dir+'/C_R.csv', merged, headers, delim, separate, ID2HIS, merged_test)
	
	if counts_lab:
		headers = merge_file(in_dir+'/C_L.csv', merged, headers, delim, separate, ID2HIS, merged_test)

	if tmprl:
		headers = merge_file(in_dir+'/T.csv', merged, headers, delim, separate, ID2HIS, merged_test)

	if enriched_tmprl:
		headers = merge_file(in_dir+'/T_enrich.csv', merged, headers, delim, separate, ID2HIS, merged_test)

	if knowledge_driven:
		headers = merge_file(in_dir+'/K.csv', merged, headers, delim, separate, ID2HIS, merged_test)

	if counts_no_knowledge:
		headers = merge_file(in_dir+'/C_NK.csv', merged, headers, delim, separate, ID2HIS, merged_test)

	if tmprl_no_knowledge:
		headers = merge_file(in_dir+'/T_NK.csv', merged, headers, delim, separate, ID2HIS, merged_test)

	headers = merge_file(in_dir+'/CRC.csv', merged, headers, delim, separate, ID2HIS, merged_test)
	
	# now write to new file (also check whether all results have same length)
	make_dir(out_file)
	out = io.write_csv(out_file)

	out.writerow(headers)
	skip = 0
	for key in merged:
		if len(headers) != 1+len(merged[key]):
			print('unequal to header amount ({} vs {})! watch out.'.format(len(headers), len(merged[key])))
			# skip+=1
			# continue
		out.writerow([key] + merged[key])

	if separate:
		out_file_test = out_file[:out_file.rfind('/')+1] + 'test' + out_file[out_file.rfind('/'):]
		make_dir(out_file_test)

		out = io.write_csv(out_file_test)
		
		out.writerow(headers)
		for key in merged_test:
			if len(headers) != 1+len(merged_test[key]):
				print('unequal to header amount ({} vs {})! watch out.'.format(len(headers), len(merged_test[key])))
				# skip+=1
				# continue
			out.writerow([key] + merged_test[key])

	print('## Done Merging ##')
Example #17
def execute(in_dir,
            delim,
            out_file,
            age_gender=False,
            counts_med=False,
            counts_med_enrich=False,
            counts_consult=False,
            counts_consult_enrich=False,
            counts_referral=False,
            counts_lab=False,
            all_counts=False,
            tmprl=False,
            enriched_tmprl=False,
            knowledge_driven=False,
            counts_no_knowledge=False,
            tmprl_no_knowledge=False,
            separate=False,
            HISes=[]):
    '''merge the in files to produce the out file'''
    merged = defaultdict(list)
    headers = ['ID']

    # we may not need this.
    ID2HIS = {}
    merged_test = defaultdict(list)

    # if we wish to separate, get dictionary of patient HIS sources using SQL.
    if separate:
        c = util.sql_connect().cursor()
        HISes_str = "','".join(HISes)
        q = '''SELECT patientnummer 
				FROM patienten
				WHERE PRAKTIJKCODE IN ('{}')'''.format(HISes_str)
        c.execute(q)

        ID2HIS = {row[0]: row[0] for row in c}

    if age_gender:
        headers = merge_file(in_dir + '/AG.csv', merged, headers, delim,
                             separate, ID2HIS, merged_test)

    if counts_med:
        headers = merge_file(in_dir + '/C_M.csv', merged, headers, delim,
                             separate, ID2HIS, merged_test)

    if counts_med_enrich:
        headers = merge_file(in_dir + '/C_M_enrich.csv', merged, headers,
                             delim, separate, ID2HIS, merged_test)

    if counts_consult:
        headers = merge_file(in_dir + '/C_C.csv', merged, headers, delim,
                             separate, ID2HIS, merged_test)

    if counts_consult_enrich:
        headers = merge_file(in_dir + '/C_C_enrich.csv', merged, headers,
                             delim, separate, ID2HIS, merged_test)

    if counts_referral:
        headers = merge_file(in_dir + '/C_R.csv', merged, headers, delim,
                             separate, ID2HIS, merged_test)

    if counts_lab:
        headers = merge_file(in_dir + '/C_L.csv', merged, headers, delim,
                             separate, ID2HIS, merged_test)

    if tmprl:
        headers = merge_file(in_dir + '/T.csv', merged, headers, delim,
                             separate, ID2HIS, merged_test)

    if enriched_tmprl:
        headers = merge_file(in_dir + '/T_enrich.csv', merged, headers, delim,
                             separate, ID2HIS, merged_test)

    if knowledge_driven:
        headers = merge_file(in_dir + '/K.csv', merged, headers, delim,
                             separate, ID2HIS, merged_test)

    if counts_no_knowledge:
        headers = merge_file(in_dir + '/C_NK.csv', merged, headers, delim,
                             separate, ID2HIS, merged_test)

    if tmprl_no_knowledge:
        headers = merge_file(in_dir + '/T_NK.csv', merged, headers, delim,
                             separate, ID2HIS, merged_test)

    if all_counts:
        print('ja')
        headers = merge_file(in_dir + '/counts.csv', merged, headers, delim,
                             separate, ID2HIS, merged_test)

    headers = merge_file(in_dir + '/stroke.csv', merged, headers, delim,
                         separate, ID2HIS, merged_test)

    # now write to new file (also check whether all results have same length)
    make_dir(out_file)
    out = io.write_csv(out_file)

    out.writerow(headers)
    skip = 0

    for key in merged:
        if len(headers) != 1 + len(merged[key]):
            print('unequal to header amount ({} vs {})! watch out.'.format(
                len(headers), len(merged[key])))
            # skip+=1
            # continue
        out.writerow([key] + merged[key])

    if separate:
        out_file_test = out_file[:out_file.rfind('/') +
                                 1] + 'test' + out_file[out_file.rfind('/'):]
        make_dir(out_file_test)

        out = io.write_csv(out_file_test)

        out.writerow(headers)
        for key in merged_test:
            if len(headers) != 1 + len(merged_test[key]):
                print('unequal to header amount ({} vs {})! watch out.'.format(
                    len(headers), len(merged_test[key])))
                # skip+=1
                # continue
            out.writerow([key] + merged_test[key])

    print('## Done Merging ##')
Example #18
def execute_knn(in_dir, out_dir, record_id, target_id, day_id, day, k):
    '''executes the learning task on the data in in_dir with the algorithms in algorithms.
		The results are written to out_dir and subdirectories,
	    and the record_ and target_ids are used to differentiate attributes and non-attributes'''
    print('### executing learning algorithms on... ###')

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    # create directory
    util.make_dir(out_dir)

    # execute each algorithm

    # run algorithm alg for each file f
    for f in files:
        results_list = []
        fname = in_out.get_file_name(f, extension=False)
        print(' ...{}'.format(fname))

        # get data, split in features/target. If invalid stuff happened --> exit
        X, y, headers = in_out.import_data(
            f, record_id, target_id,
            True)  # assumption: first column is patientnumber
        if type(X) == bool: return

        day_index = headers.index(day_id)
        new_X = np.zeros((0, len(headers)))
        new_y = []

        IDs = []
        IDrows = {}

        # ordering of time points and complete data (filled with nan's if not available) assumed!

        # Select the right day and normalize the columns
        new_index = 0
        for i in range(0, X.shape[0]):
            if X[i, headers.index(day_id)] == day or day == -1:
                row = np.array(X[i, :]).reshape(-1)

                if not row[0] in IDs:
                    IDs.append(row[0])
                    new_y.append(int(y[i]))
                    IDrows[row[0]] = [new_index]
                else:
                    IDrows[row[0]].append(new_index)
                new_X = np.append(new_X, np.column_stack(row), axis=0)
                new_index += 1

        # Remove the id, the day, and the time stamp from the data and headers.
        new_X = np.delete(new_X, 2, 1)
        new_X = np.delete(new_X, 1, 1)
        new_X = np.delete(new_X, 0, 1)
        new_headers = headers[3:len(headers)]
        X = new_X

        # Remove columns with only a single value or all nans

        non_singular_rows = [
            i for i in range(0, X.shape[1])
            if len(set(util.get_non_nans(X[:, i].tolist()))) > 1
        ]
        #print str(len(non_singular_rows)) + ' ' + str(X.shape[1])
        #print non_singular_rows

        X = X[:, non_singular_rows]
        new_headers = np.array(new_headers)[non_singular_rows].tolist()

        max_values = np.nanmax(X, axis=0)
        min_values = np.nanmin(X, axis=0)

        ranges = []
        for i in range(0, len(min_values)):
            diff = max_values[i] - min_values[i]
            if diff == 0:
                print('difference of zero encountered in ' + str(i))
                print('Max values: ' + str(max_values[i]))
                print('Min values: ' + str(min_values[i]))
                ranges.append(1)
            else:
                ranges.append(diff)

        # Now do some scaling to get the values to the same order of magnitude;
        # use 'ranges' so constant columns cannot cause division by zero
        scaled_X = (X - min_values) / np.array(ranges)
        X = scaled_X
        y = np.squeeze(np.asarray(new_y))

        new_IDrows = {}
        for ID in IDs:
            IDrows[ID] = {
                'first_row': min(IDrows[ID]),
                'last_row': max(IDrows[ID])
            }

        print('  ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

        # Now we are going to build the similarity matrix. We also store how
        # many attributes we were actually able to compare.

        similarity_matrix = np.zeros((len(IDs), len(IDs)))
        matching_number_matrix = np.ones((len(IDs), len(IDs)))

        for i in range(0, len(IDs)):
            for j in range(i + 1, len(IDs)):
                for attr in range(0, len(new_headers)):
                    i_data = X[IDrows[IDs[i]]['first_row']:
                               IDrows[IDs[i]]['last_row'] + 1, attr].tolist()
                    j_data = X[IDrows[IDs[j]]['first_row']:
                               IDrows[IDs[j]]['last_row'] + 1, attr].tolist()
                    #print i_data
                    #print j_data
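                    # dtw_attr and window are assumed to be module-level settings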
                    if new_headers[attr] in dtw_attr:
                        dtw_distance = dtw.lb_keogh(i_data, j_data, window)
                        # print dtw_distance
                        if not dtw_distance == -1:
                            similarity_matrix[i, j] += dtw_distance
                            matching_number_matrix[i, j] += 1
                    else:
                        i_data = util.get_non_nans(i_data)
                        j_data = util.get_non_nans(j_data)
                        if len(i_data) > 0 and len(j_data) > 0:
                            simple_distance = math.pow(
                                np.mean(i_data) - np.mean(j_data), 2)
                            similarity_matrix[i, j] += simple_distance
                            matching_number_matrix[i, j] += 1
                similarity_matrix[j, i] = similarity_matrix[i, j]
                matching_number_matrix[j, i] = matching_number_matrix[i, j]

        similarity_matrix = similarity_matrix / matching_number_matrix  # We calculate the average score per item matched
        # Best might be to apply a weighting scheme now.

        results = perform_classification(similarity_matrix, y, out_dir, k)
        results_list.append(results)

        in_out.save_results(out_dir + str(k) + '.csv',
                            ["fpr", "tpr", "auc", "cm"],
                            results[1:len(results)], [sum(y), len(y)])
        in_out.save_ROC(out_dir + '/roc.png', results_list, title='ROC curve')

    # notify user
    print('## Learning Finished ##')
	def export(self, out_dir):
		'''export results'''
		util.make_dir(out_dir)
		
		io.dict2csv(self.code2manifestation_of, out_dir + 'manifestationof.csv')
		io.dict2csv(self.code2association, out_dir + 'association.csv')
def execute(in_dir, out_dir, record_id, target_id, algorithms, feature_selection, separate_testset, in_dir_test):
	'''executes the learning task on the data in in_dir with the algorithms in algorithms.
		The results are written to out_dir and subdirectories,
	    and the record_ and target_ids are used to differentiate attributes and non-attributes'''
	print('### executing learning algorithms on... ###')
	
	# get the files
	files = util.list_dir_csv(in_dir)

	# stop if no files found
	if not files:
		print('No appropriate csv files found. Select an input directory with appropriate files')
		return

	if separate_testset:
		files_test = util.list_dir_csv(in_dir_test)
	else:
		files_test = files

	# create directory
	util.make_dir(out_dir)

	# execute each algorithm
	for alg in algorithms:
		print('...{}'.format(alg))
	
		util.make_dir(out_dir+'/'+alg+'/')
		results_list = []	
		if separate_testset:
			results_list2 = []
			util.make_dir(out_dir+'/'+alg+'_test/')

		# list which will contain the results
	
		# run algorithm alg for each file f
		for f, f_test in zip(files,files_test):
			fname = in_out.get_file_name(f, extension=False)
			print(' ...{}'.format(fname))
	
			# get data, split in features/target. If invalid stuff happened --> exit
			X, y, headers = in_out.import_data(f, record_id, target_id) # assumption: first column is patientnumber and is pruned, last is target
			if type(X) == bool: return

			# if separate_testset:
			# 	X, X_te = X
			# 	y, y_te = y
			# 	print '  ...train instances: {}, attributes: {}'.format(X.shape[0], X.shape[1])
			# 	print '  ...test instances: {}, attributes: {}'.format(X_te.shape[0], X_te.shape[1])
			# else:
			print('  ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

			model, best_features, results = execute_with_algorithm(alg, X, y, fname, headers, out_dir+'/'+alg+'/', record_id, target_id, feature_selection)
			results_list.append(results)

			if separate_testset:
				X, y, headers = in_out.import_data(f_test, record_id, target_id) # assumption: first column is patientnumber and is pruned, last is target
				if type(X) == bool: return
				
				print('  ...instances: {}, attributes: {} (test set)'.format(X.shape[0], X.shape[1]))

				results = predict_separate(X, y, fname, out_dir+'/'+alg+'_test/', record_id, target_id, feature_selection, model, best_features)
				results_list2.append(results)

		try:
			in_out.save_ROC(out_dir+'/'+alg+'/'+"roc.png", results_list, title='ROC curve')
		except IndexError:
			pass
		
		try:
			in_out.save_ROC(out_dir+'/'+alg+'_test/'+"roc.png", results_list2, title='ROC curve')
		except NameError:
			pass

	# notify user
	print('## Learning Finished ##')