def analyze_classifiers(region_key, classifiers, x_train, y_train, x_test, y_test,
                        out_file, preprocessor=text_preprocessor):
    printers.printsf('{0}Analysis for {1} ear region{0}'.format(40*'-', region_key), out_file)
    for key, value in classifiers.items():
        clf = value[0]         # the classifier
        usa = value[1]         # use sparse array
        ubf = value[2]         # use binary features (this is to support NB)
        parameters = value[3]  # grid-search parameter grid
        vectorizer = CountVectorizer(input='content', decode_error='ignore',
                                     preprocessor=preprocessor, binary=ubf)
        # keep the vectorizer output sparse when the classifier supports it;
        # otherwise densify with sklx.SparseToArray before classification
        pipeline = (Pipeline(steps=[('vect', vectorizer), ('clf', clf)])
                    if usa else
                    Pipeline(steps=[('vect', vectorizer), ('sa', sklx.SparseToArray()), ('clf', clf)]))
        gs = sklx.grid_analysis(pipeline, parameters, x_train, y_train)
        printers.print_grid_search_results(gs, key, out_file, x_test, y_test)
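# A minimal sketch of the registry shape analyze_classifiers expects: each entry maps a
# display name to (estimator, use_sparse_array, use_binary_features, param_grid). The
# estimators and grids below are illustrative assumptions, not the project's settings.
#
#   example_classifiers = {
#       'LogisticRegression': (linear_model.LogisticRegression(), True, False,
#                              {'clf__C': [1, 10, 100]}),
#       'BernoulliNB': (BernoulliNB(), True, True,
#                       {'clf__alpha': [0.1, 1.0]}),
#   }
#   analyze_classifiers('inner', example_classifiers, x_train, y_train,
#                       x_test, y_test, out_file)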
# keyword-matching baseline: expects region_keys, keywords, test_labels, test_reports,
# and standard_out_file to already be defined (they are built in the __main__ block below)

# create an empty patient record array to hold predicted values
num_patients = len(test_labels)
patients = np.empty((num_patients,), dtype=[('pid', 'S7'), ('inner', 'i4'), ('middle', 'i4'),
                                            ('outer', 'i4'), ('mastoid', 'i4')])
# initialize the region columns of the patients array
for k in region_keys:
    patients[k] = 0

# predict patient values based on keyword matches in each report
cnt = 0
for _, row in test_labels.iterrows():
    pid = row['pid']
    patients['pid'][cnt] = pid
    report = test_reports[cnt]
    for region in region_keys:
        for keyword in keywords[region]:
            if keyword in report:
                patients[region][cnt] = 1
    cnt += 1

# compare predicted and actual labels for each ear region
for k in region_keys:
    printers.printsf('{0}Analysis for {1} ear region{0}'.format(40*'-', k), standard_out_file)
    y_pred = patients[k]
    y_act = test_labels[k]
    pm = PerformanceMetrics(y_act, y_pred)
    printers.printsfPerformanceMetrics(pm, standard_out_file)
    cm = sklearn.metrics.confusion_matrix(y_act, y_pred)
    printers.printTwoClassConfusion(cm, standard_out_file)
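# A minimal sketch of the keywords mapping the loop above assumes: one list of
# substrings per ear region, matched verbatim against the report text. The terms
# below are hypothetical examples, not the study's actual keyword lists.
#
#   keywords = {
#       'inner':   ['cochlea', 'labyrinth'],
#       'middle':  ['tympanic', 'ossicle'],
#       'outer':   ['auricle', 'external auditory canal'],
#       'mastoid': ['mastoid'],
#   }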
if __name__ == '__main__':
    # training/test data and output files
    label_file = 'data/input/SDS_PV2_combined/SDS_PV2_class_labels.txt'
    report_path = 'data/input/SDS_PV2_combined/reports'
    output_path = 'data/output/{0}'
    label_data = pd.read_csv(label_file)
    region_keys = label_data.columns[2:6]
    standard_out_file = output_path.format('Model_Persistence_Report.txt')
    if not os.path.exists(os.path.dirname(standard_out_file)):
        os.makedirs(os.path.dirname(standard_out_file))

    # timestamp the output file
    now = time.localtime()
    printers.printsf('{6}{0}-{1}-{2} {3}:{4}:{5}{6}'.format(now.tm_year, now.tm_mon, now.tm_mday,
                                                            now.tm_hour, now.tm_min, now.tm_sec,
                                                            40 * '-'),
                     standard_out_file, 'a', False)

    # set the numpy random seed so results are reproducible
    rs = RandomState(__seed__)

    # partition the data: wrangle.partion returns, for the positive and negative cases,
    # index arrays split per the given ratios (here 80% training / 20% test)
    pos_cases, neg_cases = wrangle.partion(label_data['doc_norm'] == 1, label_data,
                                           ratios=[0.8, 0.2])
    train_mask = np.concatenate((pos_cases[0], neg_cases[0]))
    test_mask = np.concatenate((pos_cases[1], neg_cases[1]))
    rs.shuffle(train_mask)
    rs.shuffle(test_mask)
    train_labels = label_data.iloc[train_mask]
    test_labels = label_data.iloc[test_mask]

    # print partition stats
    printers.printsf('{0}Data Partition Stats{0}'.format(40*'-'), standard_out_file)
    printers.print_data_stats(train_labels['doc_norm'], test_labels['doc_norm'],
                              '{0}Document{0}'.format(40*'-'), standard_out_file)
    for key in region_keys:
        printers.print_data_stats(train_labels[key], test_labels[key],
                                  '{0}{1}{0}'.format(40*'-', key), standard_out_file)

    # read in the text reports
    print('loading reports ..............')
    train_reports = [load_report('{0}/{1}{2}'.format(report_path, pid, file_suffix))
                     for pid in train_labels['pid']]
    test_reports = [load_report('{0}/{1}{2}'.format(report_path, pid, file_suffix))
                    for pid in test_labels['pid']]

    # ------------------------------ BASELINE ANALYSIS ------------------------------
    if analyze_baseline:
        clf = linear_model.LogisticRegression(C=1000)
        #clf = BernoulliNB(alpha=1.0, binarize=None, fit_prior=True, class_prior=None)
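        # A hedged sketch of how the baseline classifier above could be exercised;
        # the vectorizer settings are assumptions mirroring analyze_classifiers:
        #
        #   vect = CountVectorizer(input='content', decode_error='ignore',
        #                          preprocessor=text_preprocessor)
        #   x = vect.fit_transform(train_reports)
        #   clf.fit(x, train_labels['doc_norm'])
        #   y_pred = clf.predict(vect.transform(test_reports))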
    # load the configuration
    print('loading configuration file .........')
    config_path = 'resources/config/models.ini'
    configuration = ConfigParser.ConfigParser()
    configuration.read(config_path)
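    # A hedged sketch of consuming the configuration with the standard ConfigParser API;
    # the section and option names in models.ini are not shown here, so 'parameters'
    # is an assumed option name:
    #
    #   for section in configuration.sections():
    #       raw_params = configuration.get(section, 'parameters')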