# Script section: load a grant-data CSV, keep either an explicit subset of
# columns or the "well-populated" columns, and convert dates to numbers so
# Weka can consume them.  (Python 2: uses print statements.)

# Require at least one positional argument: the input CSV file name.
if len(args) < 1:
    print parser.usage
    print 'options:', options
    print 'args', args
    exit()

inname = args[0]

# Optional comma-separated list of column names to keep (--subset_keys).
# Empty entries are stripped; an empty option leaves subset = None.
subset = None
if len(options.subset_keys):
    subset = [x.strip() for x in options.subset_keys.split(',') if len(x.strip()) > 0]

# Output file name records how the columns were selected.
if subset:
    outname = os.path.splitext(inname)[0] + '.subset[%s]' % ','.join(subset)
else:
    outname = os.path.splitext(inname)[0] + '.many'

# NOTE(review): elsewhere in this project readCsvAsDict is unpacked as
# `data_dict, num_instances = csv.readCsvAsDict(...)`; here its raw return
# value is passed straight to cleanDictKeysAndVals — confirm the two call
# sites expect the same return shape.
data_dict = cleanDictKeysAndVals(csv.readCsvAsDict(inname))

# Keep columns that are populated for at least half the rows...
data_dict_many = filterDict(data_dict, lambda k,v: getNumElements(v) >= len(data_dict['Person.ID'])/2)
# ...and that have at least 2 values occurring 3+ times each.
data_dict_many = filterDict(data_dict_many, lambda k,v: getNumElementsWithFreq(v,3) >= 2)

if subset:
    # An explicit subset overrides the frequency-based filtering above.
    # NOTE(review): this filters the ORIGINAL data_dict, discarding the two
    # filters above — presumably intentional, but confirm; also the
    # Grant.Status re-assignment below looks redundant since the lambda
    # already keeps 'Grant.Status'.
    data_dict_many = filterDict(data_dict, lambda k,v: k in subset + ['Grant.Status'])
    data_dict_many['Grant.Status'] = data_dict['Grant.Status']

#showStats(data_dict)
showStats(data_dict_many)

# Convert 'Start.date' strings to numbers that Weka can understand.
date_strings = data_dict['Start.date']
dates = ['%.2f' % stringToDate(x) for x in date_strings]
# Print every 100th date as a sanity check on the conversion.
print 'dates', sorted(dates)[::100]
data_dict_many['Start.date'] = dates
test_knn() if False: test_knn_probability0() if len(sys.argv) < 3: print 'usage:', sys.argv[0], '<training file name> <test file name>' exit() training_data_csv = sys.argv[1] test_data_csv = sys.argv[2] k = 4 print 'training_data_csv:', training_data_csv print 'test_data_csv:', test_data_csv print 'k:', k training_data_dict_str, _ = csv.readCsvAsDict(training_data_csv) training_data_dict = {} for k in training_data_dict_str.keys(): training_data_dict[k] = [float(x) for x in training_data_dict_str[k]] print 'training keys:', training_data_dict.keys() training_data_class = training_data_dict['Grant.Status'] training_data_keys = [k for k in sorted(training_data_dict.keys()) if k != 'Grant.Status'] training_data = misc.transpose([training_data_dict[k] for k in training_data_keys]) print 'training data:', len(training_data), len(training_data[0]) test_data_dict_str, _ = csv.readCsvAsDict(test_data_csv) test_data_dict = {} for k in test_data_dict_str.keys(): test_data_dict[k] = [float(x) for x in test_data_dict_str[k]] # Use training data column headers to ensure data matches test_data = misc.transpose([test_data_dict[k] for k in training_data_keys])
print 'options:', options print 'args:', args print 'has_class:', options.has_class print 'num_rules:', num_rules print 'weka_results_filename:', weka_results_filename print 'data_file_csv:', data_file_csv print 'output_dir:', options.output_dir print 'knn_file_csv:', knn_file_csv all_rules, compound_rules = get_rules_from_weka_results(weka_results_filename) sorted_keys = get_sorted_rules_keys(all_rules) attrs = sorted(list(set([attr for attr,_,_ in sorted_keys]))) print ' attrs:', len(attrs), attrs data_dict, num_instances = csv.readCsvAsDict(data_file_csv) header = [k for k in sorted(data_dict.keys()) if k != 'Grant.Status'] print 'header:', len(header), header for a in attrs: assert(a in header) evals_dict = {} if DO_COMPOUND_RULES: evals_header = [compound_rule_to_string(compound) for compound in compound_rules[:num_rules]] if False: for i,e in enumerate(evals_header): print i,e else: evals_header = [rule_to_string(rule) for rule in sorted_keys[:num_rules]]