Example #1
# sklearn imports needed by this snippet; printers, sklx, and text_preprocessor
# are project-local modules assumed to be in scope.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

def analyze_classifiers(region_key, classifiers, x_train, y_train, x_test, y_test, out_file, preprocessor=text_preprocessor):
    printers.printsf('{0}Analysis for {1} ear region{0}'.format(40*'-', region_key), out_file)
    for key, value in classifiers.items():
        clf = value[0]         # the classifier
        usa = value[1]         # use sparse array (classifier accepts sparse input)
        ubf = value[2]         # use binary features (to support naive Bayes)
        parameters = value[3]  # grid-search parameter grid
        vectorizer = CountVectorizer(input='content', decode_error='ignore',
                                     preprocessor=preprocessor, binary=ubf)
        # densify the term matrix only when the classifier cannot take sparse input
        pipeline = (Pipeline(steps=[('vect', vectorizer), ('clf', clf)]) if usa
                    else Pipeline(steps=[('vect', vectorizer), ('sa', sklx.SparseToArray()), ('clf', clf)]))
        gs = sklx.grid_analysis(pipeline, parameters, x_train, y_train)
        printers.print_grid_search_results(gs, key, out_file, x_test, y_test)
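
A minimal usage sketch for analyze_classifiers, assuming the registry layout implied by the tuple unpacking above: (classifier, use-sparse flag, binary-features flag, parameter grid). The estimators and grids here are illustrative guesses, not the project's actual configuration:

from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC

# Hypothetical registry; grid keys target the 'clf' pipeline step defined above.
classifiers = {
    'SVM': (LinearSVC(), True, False, {'clf__C': [0.1, 1.0, 10.0]}),
    'NB': (BernoulliNB(), False, True, {'clf__alpha': [0.5, 1.0]}),
}
analyze_classifiers('inner', classifiers, train_reports, train_labels['inner'],
                    test_reports, test_labels['inner'], standard_out_file)
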
Example #2
    #create empty patient array to hold predicted values
    num_patients = len(test_labels)
    patients = np.empty((num_patients,),
                        dtype=[('pid', 'S7'), ('inner', 'i4'), ('middle', 'i4'),
                               ('outer', 'i4'), ('mastoid', 'i4')])

    #initialize patients array
    for k in region_keys:
        patients[k] = 0

    #get patient values based on icd9 codes
    cnt = 0
    for _, row in test_labels.iterrows():
        pid = row['pid']
        patients['pid'][cnt] = pid
        report = test_reports[cnt]
        for region in region_keys:
            for keyword in keywords[region]:
                if keyword in report:
                    patients[region][cnt] = 1
        cnt += 1


    #compare predicted and actual
    for k in region_keys:
        printers.printsf('{0}Analysis for {1} ear region{0}'.format(40*'-', k), standard_out_file)
        y_pred = patients[k]
        y_act = test_labels[k]
        pm = PerformanceMetrics(y_act, y_pred)
        printers.printsfPerformanceMetrics(pm, standard_out_file)
        cm = sklearn.metrics.confusion_matrix(y_act, y_pred)
        printers.printTwoClassConfusion(cm, standard_out_file)
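
For reference, a self-contained sketch of the structured-array pattern used above; the field names mirror the region keys, and the data are made up:

import numpy as np

regions = ['inner', 'middle', 'outer', 'mastoid']
patients = np.empty((3,), dtype=[('pid', 'S7')] + [(r, 'i4') for r in regions])
patients['pid'] = b''              # clear the string field
for r in regions:
    patients[r] = 0                # zero a whole column by field name
patients['pid'][0] = b'P000001'
patients['inner'][0] = 1           # flag one region for one patient
print(patients[0])                 # (b'P000001', 1, 0, 0, 0)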

Example #3

if __name__ == '__main__':
    # training/test data and output files
    label_file = 'data/input/SDS_PV2_combined/SDS_PV2_class_labels.txt'
    report_path = 'data/input/SDS_PV2_combined/reports'
    output_path = 'data/output/{0}'
    label_data = pd.read_csv(label_file)
    region_keys = label_data.columns[2:6]
    standard_out_file = output_path.format('Model_Persistence_Report.txt')
    if not os.path.exists(os.path.dirname(standard_out_file)):
        os.makedirs(os.path.dirname(standard_out_file))
    now = time.localtime()
    printers.printsf('{6}{0}-{1}-{2} {3}:{4}:{5}{6}'.format(now.tm_year, now.tm_mon,
                                                            now.tm_mday, now.tm_hour,
                                                            now.tm_min, now.tm_sec,
                                                            40 * '-'),
                     standard_out_file, 'a', False)

    # set the numpy random seed so results are reproducible
    rs = RandomState(__seed__)

    # partition the data
    pos_cases, neg_cases = wrangle.partion(label_data['doc_norm'] == 1,
                                           label_data,
                                           ratios=[0.8, 0.2])
    train_mask = np.concatenate((pos_cases[0], neg_cases[0]))
    test_mask = np.concatenate((pos_cases[1], neg_cases[1]))
    rs.shuffle(train_mask)
    rs.shuffle(test_mask)
    train_labels = label_data.iloc[train_mask]
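
wrangle.partion is not shown in these examples. A plausible sketch, consistent with how its return value is used above (one tuple of index arrays per class, split by the given ratios), is below; this is an assumption, not the project's implementation:

import numpy as np

def partion(condition, data, ratios=(0.8, 0.2)):
    # Split the row indices of `data` into len(ratios) consecutive groups,
    # separately for rows matching `condition` and for the rest.
    # Deterministic here; the caller shuffles the resulting masks.
    idx = np.arange(len(data))
    pos, neg = idx[condition.values], idx[~condition.values]
    def split(a):
        cuts = (np.cumsum(ratios)[:-1] * len(a)).astype(int)
        return tuple(np.split(a, cuts))
    return split(pos), split(neg)
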
Example #4
    printers.printsf('{6}{0}-{1}-{2} {3}:{4}:{5}{6}'.format(now.tm_year, now.tm_mon, now.tm_mday,
                                                            now.tm_hour, now.tm_min, now.tm_sec, 40*'-'),
                     standard_out_file, 'a', False)

    # partition the data
    pos_cases, neg_cases = wrangle.partion(label_data['doc_norm']==1, label_data, ratios=[0.8,0.2])
    train_mask = np.concatenate((pos_cases[0], neg_cases[0]))
    test_mask = np.concatenate((pos_cases[1], neg_cases[1]))
    rs.shuffle(train_mask)
    rs.shuffle(test_mask)
    train_labels = label_data.iloc[train_mask]
    test_labels = label_data.iloc[test_mask]

    # print partition stats
    printers.printsf('{0}Data Partition Stats{0}'.format(40*'-'), standard_out_file)
    printers.print_data_stats(train_labels['doc_norm'], test_labels['doc_norm'],
                              '{0}Document{0}'.format(40*'-'), standard_out_file)
    for key in region_keys:
        printers.print_data_stats(train_labels[key], test_labels[key],
                                  '{0}{1}{0}'.format(40*'-', key), standard_out_file)

    # read in the text reports
    train_reports = [load_report('{0}/{1}{2}'.format(report_path, pid, file_suffix)) for pid in train_labels['pid']]
    test_reports = [load_report('{0}/{1}{2}'.format(report_path, pid, file_suffix)) for pid in test_labels['pid']]

    #------------------------------ BASELINE ANALYSIS -----------------------------------------------------------------

    if analyze_baseline:

        clf = linear_model.LogisticRegression(C=1000)
        #clf = BernoulliNB(alpha=1.0, binarize=None, fit_prior=True, class_prior=None)
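
load_report is also external to these snippets. A minimal sketch of what it presumably does, assuming plain-text reports with occasional encoding noise:

def load_report(path):
    # Assumed helper: return the raw text of one report file.
    with open(path, 'rb') as f:
        return f.read().decode('utf-8', errors='ignore')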
Example #5
    printers.printsf('{6}{0}-{1}-{2} {3}:{4}:{5}{6}'.format(now.tm_year, now.tm_mon, now.tm_mday,
                                                            now.tm_hour, now.tm_min, now.tm_sec, 40*'-'),
                     standard_out_file, 'a', False)

    # set the numpy random seed so results are reproducible
    rs = RandomState(__seed__)

    # partition the data
    pos_cases, neg_cases = wrangle.partion(label_data['doc_norm']==1, label_data, ratios=[0.8,0.2])
    train_mask = np.concatenate((pos_cases[0], neg_cases[0]))
    test_mask = np.concatenate((pos_cases[1], neg_cases[1]))
    rs.shuffle(train_mask)
    rs.shuffle(test_mask)
    train_labels = label_data.iloc[train_mask]
    test_labels = label_data.iloc[test_mask]

    # print partition stats
    printers.printsf('{0}Data Partition Stats{0}'.format(40*'-'), standard_out_file)
    for key in region_keys:
        printers.print_data_stats(train_labels[key], test_labels[key],
                                  '{0}{1}{0}'.format(40*'-', key), standard_out_file)

    # read in the text reports
    print('loading reports ..............')
    train_reports = [load_report('{0}/{1}.txt'.format(report_path, pid)) for pid in train_labels['pid']]
    test_reports = [load_report('{0}/{1}.txt'.format(report_path, pid)) for pid in test_labels['pid']]

    print('loading configuration file .........')
    # load the configuration
    config_path = 'resources/config/models.ini'
    configuration = ConfigParser.ConfigParser()
    configuration.read(config_path)
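
The contents of resources/config/models.ini are not shown. A hypothetical layout that this ConfigParser call could consume, with one section per model (section and option names are assumptions), plus a loop enumerating whatever the file actually defines:

# [LogisticRegression]
# C = 1000
#
# [BernoulliNB]
# alpha = 1.0

for section in configuration.sections():
    for option in configuration.options(section):
        print('{0}.{1} = {2}'.format(section, option, configuration.get(section, option)))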