# Excerpt from a larger routine (its header is not visible here).
# Require at least one positional argument; otherwise print the parser usage
# together with the parsed options/args and stop.
if len(args) < 1:
        print parser.usage
        print 'options:', options
        print 'args', args
        exit()

    # The first positional argument is the input file name (read below with
    # csv.readCsvAsDict).
    inname = args[0]
    # `subset` becomes the list of non-empty, whitespace-stripped entries of the
    # comma-separated options.subset_keys string; it stays None when that
    # string is empty.
    # NOTE(review): len(options.subset_keys) assumes the option defaults to a
    # string rather than None -- confirm against the parser setup.
    subset = None
    if len(options.subset_keys):
        subset = [x.strip() for x in options.subset_keys.split(',') if len(x.strip()) > 0]
    # Output base name: the input name without its extension, plus either
    # '.subset[<comma-joined keys>]' or '.many'.
    if subset:
        outname = os.path.splitext(inname)[0] + '.subset[%s]' % ','.join(subset)
    else:
        outname = os.path.splitext(inname)[0] + '.many'

    # Load the input and pass it through cleanDictKeysAndVals. The result is
    # indexed by column name below ('Person.ID', 'Grant.Status', 'Start.date').
    data_dict = cleanDictKeysAndVals(csv.readCsvAsDict(inname))
    # filterDict presumably keeps the (key, values) entries for which the
    # predicate returns true -- confirm against its definition.
    # First pass: columns where getNumElements(v) is at least half the length
    # of the 'Person.ID' column.
    # NOTE(review): the code uses print statements, so this looks like
    # Python 2, where int / int is floor division unless
    # `from __future__ import division` is in effect -- confirm.
    data_dict_many = filterDict(data_dict, lambda k,v: getNumElements(v) >= len(data_dict['Person.ID'])/2)
    # Second pass: columns where getNumElementsWithFreq(v, 3) is at least 2
    # (exact meaning of that helper is not visible here).
    data_dict_many = filterDict(data_dict_many, lambda k,v: getNumElementsWithFreq(v,3) >= 2)
    # When an explicit subset was requested, the two passes above are discarded:
    # the filter is re-run on the full data_dict, keeping only keys in `subset`
    # plus 'Grant.Status'.
    if subset:
        data_dict_many = filterDict(data_dict, lambda k,v: k in subset + ['Grant.Status'])
    # Always carry the 'Grant.Status' column over, whatever the filters did.
    data_dict_many['Grant.Status'] = data_dict['Grant.Status']

    #showStats(data_dict)
    showStats(data_dict_many)

    # Each 'Start.date' entry is run through stringToDate and formatted with
    # '%.2f', so `dates` is a list of strings with two decimal places.
    date_strings = data_dict['Start.date']
    dates = ['%.2f' % stringToDate(x) for x in date_strings]
    # Debug output: every 100th value of the sorted list. The values are
    # strings at this point, so the sort is lexicographic, not numeric.
    print 'dates', sorted(dates)[::100]
    # Convert dates to numbers that Weka can understand
    # (this sets 'Start.date' in the filtered dict even if the filters above
    # dropped that column).
    data_dict_many['Start.date'] = dates
Пример #2
0
        test_knn()
    if False:
        test_knn_probability0()
        
    if len(sys.argv) < 3:
        print 'usage:', sys.argv[0], '<training file name> <test file name>'
        exit()

    training_data_csv = sys.argv[1]
    test_data_csv = sys.argv[2]
    k = 4 
    print 'training_data_csv:', training_data_csv
    print 'test_data_csv:', test_data_csv
    print 'k:', k

    training_data_dict_str, _ = csv.readCsvAsDict(training_data_csv)
    training_data_dict = {}
    for k in training_data_dict_str.keys():
        training_data_dict[k] = [float(x) for x in training_data_dict_str[k]]
    print 'training keys:', training_data_dict.keys()
    training_data_class = training_data_dict['Grant.Status']
    training_data_keys = [k for k in sorted(training_data_dict.keys()) if k != 'Grant.Status']
    training_data = misc.transpose([training_data_dict[k] for k in training_data_keys])
    print 'training data:', len(training_data), len(training_data[0])

    test_data_dict_str, _ = csv.readCsvAsDict(test_data_csv)
    test_data_dict = {}
    for k in test_data_dict_str.keys():
        test_data_dict[k] = [float(x) for x in test_data_dict_str[k]]
    # Use training data column headers to ensure data matches
    test_data = misc.transpose([test_data_dict[k] for k in training_data_keys])
Пример #3
0
    # Excerpt from a larger routine (its header and the definitions of
    # num_rules, weka_results_filename, data_file_csv and knn_file_csv are not
    # visible here). Echo the run configuration first.
    print 'options:', options
    print 'args:', args
    print 'has_class:', options.has_class
    print 'num_rules:', num_rules
    print 'weka_results_filename:', weka_results_filename
    print 'data_file_csv:', data_file_csv
    print 'output_dir:', options.output_dir
    print 'knn_file_csv:', knn_file_csv

    # get_rules_from_weka_results yields two values: the individual rules and
    # the compound rules. get_sorted_rules_keys returns a sequence of 3-tuples
    # (see the unpacking below).
    all_rules, compound_rules = get_rules_from_weka_results(weka_results_filename)
    sorted_keys = get_sorted_rules_keys(all_rules)
    
    # Sorted, de-duplicated first elements of the 3-tuples in sorted_keys.
    attrs = sorted(list(set([attr for attr,_,_ in sorted_keys])))
    print ' attrs:', len(attrs), attrs

    # readCsvAsDict returns a pair, unpacked here as the column dict and
    # num_instances. `header` is every column name except 'Grant.Status', in
    # sorted order.
    data_dict, num_instances = csv.readCsvAsDict(data_file_csv)
    header = [k for k in sorted(data_dict.keys()) if k != 'Grant.Status']
    print 'header:', len(header), header

    # Sanity check: every attribute referenced by a rule must be a data column.
    # NOTE(review): `assert` is skipped when Python runs with -O, so this check
    # disappears in optimized mode -- consider an explicit raise if it is meant
    # as input validation.
    for a in attrs:
        assert(a in header)

    # evals_dict starts empty; nothing in the visible excerpt fills it.
    # evals_header holds string labels for the first num_rules compound rules
    # or single rules, depending on DO_COMPOUND_RULES.
    evals_dict = {}
    if DO_COMPOUND_RULES:
        evals_header = [compound_rule_to_string(compound) for compound in compound_rules[:num_rules]]
        # Disabled debug dump of the compound-rule labels.
        if False:
            for i,e in enumerate(evals_header):
                print i,e
    else:
        evals_header = [rule_to_string(rule) for rule in sorted_keys[:num_rules]]