Exemplo n.º 1
0
def main(**options):
    """Train and evaluate a classifier on every fold of a dataset.

    Keyword options consumed here (everything left over, plus the loaded
    ``schema``, is forwarded to ``get_classifier``):

    * ``dataset`` (required) and ``dataset_directory`` (default ``'.'``):
      handed to ``get_dataset``.
    * ``k`` (required): handed to ``get_folds``.
    * ``meta_algorithm``: when present, ``meta_iters`` must be present too.
    * ``fs_algorithm``: key into ``FS_ALGORITHMS``; requires ``fs_features``,
      which is passed to the selector as ``n``.

    Prints accuracy, precision, recall and area under ROC as reported by
    the ``StatisticsManager``.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``,
            or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError("Please indicate number of iterations for %s" %
                         options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" %
                             fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    for train_X, train_y, test_X, test_y in folds:
        # A fresh classifier is built for every fold.
        print(options)
        classifier = get_classifier(**options)

        # Feature selection (when requested) is fitted on the training
        # split only and counted as part of the training time.
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y)
        # Elapsed time is "now minus start"; the reverse order would give
        # a negative duration.
        train_time = time.time() - train_start

        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    # The two-placeholder formats expect get_statistic to return a pair
    # when pooled=False.
    print('      Accuracy: %.03f %.03f' %
          stats_manager.get_statistic('accuracy', pooled=False))

    print('     Precision: %.03f %.03f' %
          stats_manager.get_statistic('precision', pooled=False))

    print('        Recall: %.03f %.03f' %
          stats_manager.get_statistic('recall', pooled=False))

    print('Area under ROC: %.03f' %
          stats_manager.get_statistic('auc', pooled=True))
Exemplo n.º 2
0
def main(**options):
    """Train and evaluate a classifier on every fold of a dataset.

    Keyword options consumed here (everything left over is forwarded to
    ``get_classifier``):

    * ``dataset`` (required) and ``dataset_directory`` (default ``'.'``):
      handed to ``get_dataset``.
    * ``k`` (required): handed to ``get_folds``.
    * ``meta_algorithm``: when present, ``meta_iters`` must be present too.
    * ``fs_algorithm``: key into ``FS_ALGORITHMS``; requires ``fs_features``,
      which is passed to the selector as ``n``.

    The loaded schema is passed to the classifier's ``fit`` and ``predict``
    calls. Prints the per-fold training time and, at the end, accuracy,
    precision, recall and area under ROC.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``,
            or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError("Please indicate number of iterations for %s" % options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    for train_X, train_y, test_X, test_y in folds:
        # A fresh classifier is built for every fold.
        # Single-argument print(...) behaves the same as the print
        # statement on Python 2 and also parses on Python 3.
        print(options)
        classifier = get_classifier(**options)

        # Feature selection (when requested) is fitted on the training
        # split only and counted as part of the training time.
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y, schema)
        # Elapsed time is "now minus start"; the reverse order would give
        # a negative duration.
        train_time = time.time() - train_start
        print(train_time)

        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X, schema)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    # The two-placeholder formats expect get_statistic to return a pair
    # when pooled=False.
    print('      Accuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))

    print('     Precision: %.03f %.03f'
          % stats_manager.get_statistic('precision', pooled=False))

    print('        Recall: %.03f %.03f'
          % stats_manager.get_statistic('recall', pooled=False))

    print('Area under ROC: %.03f'
          % stats_manager.get_statistic('auc', pooled=True))
Exemplo n.º 3
0
def main(**options):
    """Evaluate a classifier on every fold of a dataset, one process per fold.

    Keyword options consumed here (everything left over, plus the loaded
    ``schema``, is handed to ``train_and_evaluate`` alongside each fold):

    * ``dataset`` (required) and ``dataset_directory`` (default ``'.'``):
      handed to ``get_dataset``.
    * ``k`` (required): handed to ``get_folds`` and used as the size of
      the process pool.
    * ``meta_algorithm``: when present, ``meta_iters`` must be present too.
    * ``fs_algorithm``: requires ``fs_features``; both are removed from
      ``options``.

    Prints accuracy, precision, recall and area under ROC as reported by
    the ``StatisticsManager``.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``,
            or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do
        # bagging/boosting.
        raise ValueError("Please indicate number of iterations for %s" %
                         options["meta_algorithm"])

    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" %
                             fs_alg)
        # NOTE(review): the feature-selection settings are popped here but
        # not forwarded to train_and_evaluate from this function -- confirm
        # whether feature selection is meant to be applied on this path.
        options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    pool = mp.Pool(k)  # one process per fold
    try:
        results = pool.map(
            train_and_evaluate,
            [(fold, options) for fold in folds],
        )
    finally:
        # Always release the worker processes, even if a fold raised.
        pool.close()
        pool.join()

    for test_y, predictions, scores, train_time in results:
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    accuracy, std_dev = stats_manager.get_statistic(
        'accuracy',
        pooled=False,
    )
    print('      Accuracy: %.03f %.03f' % (accuracy, std_dev))
    precision, std_dev = stats_manager.get_statistic(
        'precision',
        pooled=False,
    )
    print('     Precision: %.03f %.03f' % (precision, std_dev))
    recall, std_dev = stats_manager.get_statistic(
        'recall',
        pooled=False,
    )
    print('        Recall: %.03f %.03f' % (recall, std_dev))
    area = stats_manager.get_statistic('auc', pooled=True)
    print('Area under ROC: %.03f' % area)
Exemplo n.º 4
0
def main(**options):
    """Train and evaluate a classifier on every fold of a dataset.

    Keyword options consumed here (everything left over is forwarded to
    ``get_classifier``):

    * ``dataset`` (required) and ``dataset_directory`` (default ``'.'``):
      handed to ``get_dataset``.
    * ``k`` (required): handed to ``get_folds``.
    * ``meta_algorithm``: when present, ``meta_iters`` must be present too.
    * ``fs_algorithm``: key into ``FS_ALGORITHMS``; requires ``fs_features``,
      which is passed to the selector as ``n``.

    The classifier's ``fit`` receives a list with one
    ``schema.is_nominal(i)`` entry per feature name. For every fold the
    classifier's ``size()`` and ``depth()`` are printed; accuracy is
    printed at the end.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``,
            or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError("Please indicate number of iterations for %s" % options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)

    # One is_nominal flag per feature, in feature_names order.
    attr_set = [schema.is_nominal(i)
                for i in range(len(schema.feature_names))]

    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    for train_X, train_y, test_X, test_y in folds:
        # A fresh classifier is built for every fold.
        # Single-argument print(...) behaves the same as the print
        # statement on Python 2 and also parses on Python 3.
        print(options)
        classifier = get_classifier(**options)

        # Feature selection (when requested) is fitted on the training
        # split only and counted as part of the training time.
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y, attr_set)

        print('ff')
        # Elapsed time is "now minus start"; the reverse order would give
        # a negative duration.
        train_time = time.time() - train_start

        if fs_alg:
            test_X = selector.transform(test_X)

        # predict() is called once per test example.
        predictions = [classifier.predict(t) for t in test_X]

        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

        print(classifier.size())
        print(classifier.depth())

    # The two-placeholder format expects get_statistic to return a pair
    # when pooled=False.
    print('      Accuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
         
    
    '''
Exemplo n.º 5
0
def main(**options):
    """Train and evaluate a classifier on every fold of a dataset.

    Keyword options consumed here (everything left over is forwarded to
    ``get_classifier``):

    * ``dataset`` (required) and ``dataset_directory`` (default ``'.'``):
      handed to ``get_dataset``.
    * ``k`` (required): handed to ``get_folds``.
    * ``meta_algorithm``: when present, ``meta_iters`` must be present too.
    * ``fs_algorithm``: key into ``FS_ALGORITHMS``; requires ``fs_features``,
      which is passed to the selector as ``n``.

    The loaded schema is attached to each classifier and also passed to
    its ``fit``. Prints accuracy plus the largest ``classifier.size`` and
    ``classifier.depth`` seen across the folds.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``,
            or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError("Please indicate number of iterations for %s" %
                         options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" %
                             fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    # Largest size and depth over all folds, printed at the end.
    maxSize = -1
    maxDepth = -1
    for train_X, train_y, test_X, test_y in folds:
        # A fresh classifier is built for every fold.
        # Single-argument print(...) behaves the same as the print
        # statement on Python 2 and also parses on Python 3.
        print(options)
        classifier = get_classifier(**options)
        classifier.schema = schema

        # Feature selection (when requested) is fitted on the training
        # split only and counted as part of the training time.
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        # fit takes the schema as a third argument here.
        classifier.fit(train_X, train_y, schema)
        # Elapsed time is "now minus start"; the reverse order would give
        # a negative duration.
        train_time = time.time() - train_start

        # size and depth are read as plain attributes of the classifier.
        maxSize = max(maxSize, classifier.size)
        maxDepth = max(maxDepth, classifier.depth)

        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    # The two-placeholder format expects get_statistic to return a pair
    # when pooled=False.
    print('\tAccuracy: %.03f %.03f' %
          stats_manager.get_statistic('accuracy', pooled=False))
    print("\tMaximum Size: %d" % maxSize)
    print("\tMaximum Depth: %d" % maxDepth)
Exemplo n.º 6
0
def main(**options):
    """Train and evaluate a classifier on every fold of a dataset.

    Keyword options consumed here (everything left over is forwarded to
    ``get_classifier``):

    * ``dataset`` (required) and ``dataset_directory`` (default ``'.'``):
      handed to ``get_dataset``.
    * ``k`` (required): handed to ``get_folds``.
    * ``meta_algorithm``: when present, ``meta_iters`` must be present too.
    * ``fs_algorithm``: key into ``FS_ALGORITHMS``; requires ``fs_features``,
      which is passed to the selector as ``n``.

    The loaded schema is attached to each classifier and also passed to
    its ``fit``. Prints accuracy plus the largest ``classifier.size`` and
    ``classifier.depth`` seen across the folds.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``,
            or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError("Please indicate number of iterations for %s" % options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    # Largest size and depth over all folds, printed at the end.
    maxSize = -1
    maxDepth = -1
    for train_X, train_y, test_X, test_y in folds:
        # A fresh classifier is built for every fold.
        # Single-argument print(...) behaves the same as the print
        # statement on Python 2 and also parses on Python 3.
        print(options)
        classifier = get_classifier(**options)
        classifier.schema = schema

        # Feature selection (when requested) is fitted on the training
        # split only and counted as part of the training time.
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        # fit takes the schema as a third argument here.
        classifier.fit(train_X, train_y, schema)
        # Elapsed time is "now minus start"; the reverse order would give
        # a negative duration.
        train_time = time.time() - train_start

        # size and depth are read as plain attributes of the classifier.
        maxSize = max(maxSize, classifier.size)
        maxDepth = max(maxDepth, classifier.depth)

        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    # The two-placeholder format expects get_statistic to return a pair
    # when pooled=False.
    print('\tAccuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
    print("\tMaximum Size: %d" % maxSize)
    print("\tMaximum Depth: %d" % maxDepth)
Exemplo n.º 7
0
def main(**options):
    """Train and evaluate a classifier on every fold of a dataset.

    Keyword options consumed here (everything left over is forwarded to
    ``get_classifier``):

    * ``dataset`` (required) and ``dataset_directory`` (default ``'.'``):
      handed to ``get_dataset``.
    * ``k`` (required): handed to ``get_folds``.
    * ``meta_algorithm``: when present, ``meta_iters`` must be present too.
    * ``fs_algorithm``: key into ``FS_ALGORITHMS``; requires ``fs_features``,
      which is passed to the selector as ``n``.

    The classifier's ``fit`` receives a list with one
    ``schema.is_nominal(i)`` entry per feature name. For every fold the
    classifier's ``size()`` and ``depth()`` are printed; accuracy is
    printed at the end.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``,
            or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError("Please indicate number of iterations for %s" %
                         options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" %
                             fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)

    # One is_nominal flag per feature, in feature_names order.
    attr_set = [schema.is_nominal(i)
                for i in range(len(schema.feature_names))]

    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()

    for train_X, train_y, test_X, test_y in folds:
        # A fresh classifier is built for every fold.
        # Single-argument print(...) behaves the same as the print
        # statement on Python 2 and also parses on Python 3.
        print(options)
        classifier = get_classifier(**options)

        # Feature selection (when requested) is fitted on the training
        # split only and counted as part of the training time.
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y, attr_set)

        print('ff')
        # Elapsed time is "now minus start"; the reverse order would give
        # a negative duration.
        train_time = time.time() - train_start

        if fs_alg:
            test_X = selector.transform(test_X)

        # predict() is called once per test example.
        predictions = [classifier.predict(t) for t in test_X]

        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

        print(classifier.size())
        print(classifier.depth())

    # The two-placeholder format expects get_statistic to return a pair
    # when pooled=False.
    print('      Accuracy: %.03f %.03f' %
          stats_manager.get_statistic('accuracy', pooled=False))
    '''
Exemplo n.º 8
0
def main(**options):
    """Train and evaluate a classifier on every fold of a dataset.

    Keyword options consumed here (everything left over, plus the loaded
    ``schema``, is forwarded to ``get_classifier``):

    * ``dataset`` (required) and ``dataset_directory`` (default ``'.'``):
      handed to ``get_dataset``.
    * ``k`` (required): handed to ``get_folds``.
    * ``meta_algorithm``: when present, ``meta_iters`` must be present too.
    * ``fs_algorithm``: key into ``FS_ALGORITHMS``; requires ``fs_features``,
      which is passed to the selector as ``n``.

    For every fold the feature tested at ``classifier.root`` is printed.
    At the end, accuracy and the mean of the per-fold ``size()`` and
    ``depth()`` values are printed.

    Raises:
        KeyError: if ``dataset`` or ``k`` is missing.
        ValueError: if ``meta_algorithm`` is given without ``meta_iters``,
            or ``fs_algorithm`` without ``fs_features``.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError("Please indicate number of iterations for %s" %
                         options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" %
                             fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    # put schema in the options so the classifier has it
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    sizes = []
    depths = []
    for train_X, train_y, test_X, test_y in folds:
        # A fresh classifier is built for every fold.
        classifier = get_classifier(**options)

        # Feature selection (when requested) is fitted on the training
        # split only and counted as part of the training time.
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)

        classifier.fit(train_X, train_y)

        # Print the first test of each learned classifier.
        first_test = classifier.root.feature
        print_str = schema.feature_names[first_test]
        # NOTE(review): a truthiness check also skips a split value of
        # exactly 0 -- confirm whether "no split" is None or 0.
        if classifier.root.split:
            print_str += " <= %f" % classifier.root.split
        # Single-argument print(...) behaves the same as the print
        # statement on Python 2 and also parses on Python 3.
        print("First test: %s" % print_str)

        # Elapsed time is "now minus start"; the reverse order would give
        # a negative duration.
        train_time = time.time() - train_start
        sizes.append(classifier.size())
        depths.append(classifier.depth())

        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    # The two-placeholder format expects get_statistic to return a pair
    # when pooled=False.
    print('      Accuracy: %.03f %.03f' %
          stats_manager.get_statistic('accuracy', pooled=False))
    print('      Average Size: %.03f' % np.mean(sizes))
    print('      Average Depth: %.03f' % np.mean(depths))
    '''