Example 1
def walk_classifier(name, data_fn, ckargs=None):

    evaluation_sets = {}  # {year: [training_list, test_list, expected_list]}

    final_query_lines = []

    # Build training data.
    year = date.today().year
    fn = data_fn
    print('Reading %s...' % fn)
    fn2 = spreadsheet_to_csv(fn)
    i = 0
    for line in read_raw_csv(fn2):
        i += 1
        print('line:', i, line)

        year = int(line['Election'].value)

        evaluation_sets.setdefault(year, [[], [], []])

        if line['Won'].value != MISSING:

            # Add line to test set.
            test_line = copy.deepcopy(line)
            evaluation_sets[year][2].append(test_line['Won'].value)
            test_line['Won'].value = MISSING
            evaluation_sets[year][1].append(test_line)

            # Add line to all future sets.
            for other_year in evaluation_sets:
                if year < other_year:
                    evaluation_sets[other_year][0].append(line)

        else:
            final_query_lines.append(line)

    accuracy = []

    final_training_data = None
    final_year = None

    # Evaluate each evaluation set.
    #pprint(evaluation_sets, indent=4)
    print('%i evaluation_sets.' % len(evaluation_sets))
    for year, data in sorted(evaluation_sets.items()):
        raw_training_data, raw_testing_data, prediction_values = data
        print('Evaluation set:', year, len(raw_training_data),
              len(raw_testing_data), len(prediction_values))

        if not raw_training_data:
            print('No training data. Skipping.')
            continue

        # Create training set.
        training_data = ArffFile(relation='presidential-candidates')
        for _line in raw_training_data:
            training_data.append(_line)
        training_data.attribute_data['Won'].update([DEMOCRAT, REPUBLICAN])
        training_data.write(open('training_data_%i.arff' % year, 'w'))

        if not raw_testing_data:
            final_training_data = training_data
            final_year = year
            print('No testing data. Skipping.')
            continue

        # Create query set.
        query_data = training_data.copy(schema_only=True)
        for _line in raw_testing_data:
            query_data.append(_line)
        query_data.write(open('query_data_%i.arff' % year, 'w'))

        # Train
        print('=' * 80)
        c = Classifier(name=name, ckargs=ckargs)
        print('Training...')
        c.train(training_data, verbose=True)

        # Test
        print('Predicting...')
        predictions = c.predict(query_data, verbose=True, distribution=True)
        print('predictions:')
        for predicted_value, actual_value in zip(predictions,
                                                 prediction_values):
            print('predicted_value =', predicted_value, 'actual_value =',
                  actual_value)
            accuracy.append(predicted_value.predicted == actual_value)

    print('-' * 80)
    accuracy_history = accuracy
    if accuracy:
        accuracy = sum(accuracy) / float(len(accuracy))
    else:
        accuracy = None
    print('accuracy_history:', accuracy_history)
    print('accuracy:', accuracy)

    # Make final prediction.
    predicted_cls = None
    certainty = None
    if final_training_data:

        # Create final query set.
        query_data = final_training_data.copy(schema_only=True)
        for _line in final_query_lines:
            query_data.append(_line)
        query_data.write(open('query_data_%i.arff' % final_year, 'w'))

        # Train
        print('!' * 80)
        c = Classifier(name=name, ckargs=ckargs)
        print('Final Training...')
        c.train(final_training_data, verbose=True)

        # Test
        print('~' * 80)
        print('Final Predicting...')
        predictions = c.predict(query_data, verbose=True, distribution=True)
        print('final predictions:')
        for predicted_value in predictions:
            print('predicted_value:', predicted_value)
            with open('prediction_%i_%s.txt' % (final_year, name), 'w') as fout:
                print('stdout:', file=fout)
                print(c.last_training_stdout, file=fout)
                print(file=fout)
                print('stderr.begin:', file=fout)
                print(c.last_training_stderr, file=fout)
                print('stderr.end:', file=fout)
                print(file=fout)
                print('predicted_value.probability:',
                      predicted_value.probability,
                      file=fout)
                predicted_cls = predicted_value.predicted
                certainty = predicted_value.certainty

    else:
        raise Exception(
            'No final training data! Are there no rows with an empty "Won" value?')

    return accuracy, predicted_cls, certainty
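
A minimal usage sketch (not from the source): the classifier name is a standard Weka tree learner, while 'elections.ods' is a made-up spreadsheet assumed to contain the 'Election' and 'Won' columns read above.

if __name__ == '__main__':
    # Hypothetical input file; walk_classifier expects 'Election' and 'Won' columns.
    acc, winner, certainty = walk_classifier(
        name='weka.classifiers.trees.J48',
        data_fn='elections.ods',
    )
    print('accuracy:', acc)
    print('predicted winner:', winner, 'certainty:', certainty)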
Example 2
class WekaRandomForestClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, n_estimators=10,
                       max_depth=None,
                       max_features="auto",
                       random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state

    def fit(self, X, y):
        # Check params
        self.n_features_ = X.shape[1]
        random_state = check_random_state(self.random_state)

        if isinstance(self.max_features, str):
            if self.max_features == "auto":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        params = {}
        params["-I"] = self.n_estimators
        params["-K"] = max_features
        params["-depth"] = 0 if self.max_depth is None else self.max_depth
        params["-no-cv"] = None
        params["-s"] = random_state.randint(1000000)

        # Convert data
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        y = np.searchsorted(self.classes_, y)

        tf = tempfile.NamedTemporaryFile(mode="w", suffix=".arff", dir="/dev/shm", delete=False)
        to_arff(X, y, self.n_classes_, tf)
        tf.close()

        # Run
        self.model_ = Classifier(name="weka.classifiers.trees.RandomForest", ckargs=params)
        self.model_.train(tf.name)
        os.remove(tf.name)

        return self

    def predict(self, X):
        tf = tempfile.NamedTemporaryFile(mode="w", suffix=".arff", dir="/dev/shm", delete=False)
        to_arff(X, None, self.n_classes_, tf)
        tf.close()

        pred = np.zeros(len(X), dtype=np.int32)

        for i, r in enumerate(self.model_.predict(tf.name)):
            # r.predicted is the nominal class label written by to_arff
            # (presumably of the form 'classN'); index 5 extracts the digit N.
            pred[i] = int(r.predicted[5])

        os.remove(tf.name)

        return self.classes_[pred]
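
A rough usage sketch, assuming numpy and the module-level to_arff helper are importable, Weka is on the classpath, and /dev/shm exists (i.e. a Linux host); X and y are toy data.

import numpy as np

X = np.random.RandomState(0).rand(20, 4)  # toy feature matrix
y = np.array([0, 1] * 10)                 # toy binary labels

clf = WekaRandomForestClassifier(n_estimators=5, random_state=0)
clf.fit(X, y)
print(clf.predict(X[:5]))  # predictions mapped back through clf.classes_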
Example 3
    def test_IBk(self):

        # Train a classifier.
        print('Training IBk classifier...')
        c = Classifier(name='weka.classifiers.lazy.IBk', ckargs={'-K': 1})
        training_fn = os.path.join(BP, 'fixtures/abalone-train.arff')
        c.train(training_fn, verbose=1)
        self.assertTrue(c._model_data)

        # Make a valid query.
        print('Using IBk classifier...')
        query_fn = os.path.join(BP, 'fixtures/abalone-query.arff')
        predictions = list(c.predict(query_fn, verbose=1, cleanup=0))
        pred0 = predictions[0]
        print('pred0:', pred0)
        pred1 = PredictionResult(actual=None, predicted=7, probability=None)
        print('pred1:', pred1)
        self.assertEqual(pred0, pred1)

        # Make an invalid query (expect a PredictionError).
        with self.assertRaises(PredictionError):
            query_fn = os.path.join(BP, 'fixtures/abalone-query-bad.arff')
            predictions = list(c.predict(query_fn, verbose=1, cleanup=0))

        # Make a valid query manually.
        query = arff.ArffFile(relation='test',
                              schema=[
                                  ('Sex', ('M', 'F', 'I')),
                                  ('Length', 'numeric'),
                                  ('Diameter', 'numeric'),
                                  ('Height', 'numeric'),
                                  ('Whole weight', 'numeric'),
                                  ('Shucked weight', 'numeric'),
                                  ('Viscera weight', 'numeric'),
                                  ('Shell weight', 'numeric'),
                                  ('Class_Rings', 'integer'),
                              ])
        query.append(
            ['M', 0.35, 0.265, 0.09, 0.2255, 0.0995, 0.0485, 0.07, '?'])
        data_str0 = """% 
@relation test
@attribute 'Sex' {F,I,M}
@attribute 'Length' numeric
@attribute 'Diameter' numeric
@attribute 'Height' numeric
@attribute 'Whole weight' numeric
@attribute 'Shucked weight' numeric
@attribute 'Viscera weight' numeric
@attribute 'Shell weight' numeric
@attribute 'Class_Rings' integer
@data
M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,?
"""
        data_str1 = query.write(fmt=DENSE)
        # print(data_str0)
        # print(data_str1)
        self.assertEqual(data_str0, data_str1)
        predictions = list(c.predict(query, verbose=1, cleanup=0))
        self.assertEqual(
            predictions[0],
            PredictionResult(actual=None, predicted=7, probability=None))

        # Test pickling.
        fn = os.path.join(BP, 'fixtures/IBk.pkl')
        c.save(fn)
        c = Classifier.load(fn)
        predictions = list(c.predict(query, verbose=1, cleanup=0))
        self.assertEqual(
            predictions[0],
            PredictionResult(actual=None, predicted=7, probability=None))
        #print('Pickle verified.')

        # Make a valid dict query manually.
        query = arff.ArffFile(relation='test',
                              schema=[
                                  ('Sex', ('M', 'F', 'I')),
                                  ('Length', 'numeric'),
                                  ('Diameter', 'numeric'),
                                  ('Height', 'numeric'),
                                  ('Whole weight', 'numeric'),
                                  ('Shucked weight', 'numeric'),
                                  ('Viscera weight', 'numeric'),
                                  ('Shell weight', 'numeric'),
                                  ('Class_Rings', 'integer'),
                              ])
        query.append({
            'Sex': 'M',
            'Length': 0.35,
            'Diameter': 0.265,
            'Height': 0.09,
            'Whole weight': 0.2255,
            'Shucked weight': 0.0995,
            'Viscera weight': 0.0485,
            'Shell weight': 0.07,
            'Class_Rings': arff.MISSING,
        })
        predictions = list(c.predict(query, verbose=1, cleanup=0))
        self.assertEqual(
            predictions[0],
            PredictionResult(actual=None, predicted=7, probability=None))
Example 4
class LRWrapper:

    # k is the number of features (fixed at 4 here)
    def __init__(self, C):
        self.k = 4
        self.C = C
        self.trainingData = None
        self.testingData = None
        #jvm.start()
        #jvm.start(system_cp = True, packages = True)

    def retrain(self, examples, labels):

        f = open("trainingweka.arff", "w")
        f.write("@relation randomset\n")
        for j in range(len(examples[0])):
            f.write("@attribute feature%d real\n" % j)
        f.write("@attribute class {TRUE, FALSE}\n")
        f.write("@data\n")

        for (example, label) in zip(examples, labels):
            for feature in example:
                f.write("%f," % feature)
            if label == 1:
                f.write("TRUE\n")
            else:
                f.write("FALSE\n")
        f.close()

        loader = Loader(classname="weka.core.converters.ArffLoader")
        # options=["-H", "-B", "10000"])
        self.trainingData = loader.load_file("trainingweka.arff")
        self.trainingData.set_class_index(self.trainingData.num_attributes() -
                                          1)
        self.classifier = Classifier(
            classname="weka.classifiers.functions.Logistic",
            options=["-R", "%f" % (1.0 / self.C)])
        self.classifier.build_classifier(self.trainingData)

        #self.classifier = LogisticRegression(penalty = 'l2', C = self.C)
        #self.classifier = LogisticRegression()
        #self.classifier.fit(examples, labels)

    def predict(self, testExamples):
        # NOTE: predict()/predict_proba() in this class assume a scikit-learn
        # style estimator (see the commented-out LogisticRegression in retrain);
        # the Weka Classifier built above does not expose these methods.
        return self.classifier.predict(testExamples)

    def getParams(self):
        # coef_/intercept_ are scikit-learn attributes; they are only available
        # when the LogisticRegression path is used instead of the Weka classifier.
        return (self.classifier.coef_, self.classifier.intercept_)

    def score(self, testExamples, labels):
        f = open("testingweka.arff", "w")
        f.write("@relation randomset\n")
        for j in range(len(testExamples[0])):
            f.write("@attribute feature%d real\n" % j)
        f.write("@attribute class {TRUE, FALSE}\n")
        f.write("@data\n")
        for (example, label) in zip(testExamples, labels):
            for feature in example:
                f.write("%f," % feature)
            if label == 1:
                f.write("TRUE\n")
            else:
                f.write("FALSE\n")
        f.close()

        loader = Loader(classname="weka.core.converters.ArffLoader")
        #                        options=["-H", "-B", "10000"])
        self.testingData = loader.load_file("testingweka.arff")
        self.testingData.set_class_index(self.testingData.num_attributes() - 1)

        evaluation = Evaluation(self.trainingData)
        evaluation.test_model(self.classifier, self.testingData)

        #print evaluation.percent_correct()
        #jvm.stop()
        return evaluation.percent_correct()

    def fscore(self, testExamples, labels):
        # NOTE: this early return disables the F-score computation below.
        return 0
        predictions = self.predict(testExamples)
        predictions = self.predict(testExamples)
        precision = 0.0
        precisionD = 0.000000001
        recall = 0.0
        recallD = 0.000000001
        for (prediction, label) in zip(predictions, labels):
            if prediction == 1:
                if label == 1:
                    precision += 1
                precisionD += 1
            if label == 1:
                if prediction == 1:
                    recall += 1
                recallD += 1

        precision /= precisionD
        recall /= recallD

        return 2 * ((precision * recall) / (precision + recall + 0.000000001))

    #distance to the hyperplane
    def getUncertainty(self, example):
        probs = self.classifier.predict_proba([example])
        entropy = 0.0
        for p in probs[0]:
            entropy += p * log(p + 0.0000001)
        entropy *= -1

        return entropy

    def getAllUncertainties(self, examples):
        entropies = []
        probs = self.classifier.predict_proba(examples)
        for prob in probs:
            entropy = 0.0
            for p in prob:
                entropy += p * log(p + 0.0000001)
                #print "BOOP"
                #print p
                #print log(p)
            #print entropy
            entropy *= -1
            entropies.append(entropy)

        return entropies

    def getMostUncertainTask(self, tasks, taskIndices):
        highestUncertainty = float('-inf')
        highestEntropyDistribution = None
        mostUncertainTaskIndices = []
        mostUncertainTasks = []

        entropies = self.getAllUncertainties(tasks)
        for (task, i, uncertainty) in zip(tasks, taskIndices, entropies):
            if uncertainty > highestUncertainty:
                mostUncertainTaskIndices = [i]
                mostUncertainTasks = [task]
                highestUncertainty = uncertainty
            elif uncertainty == highestUncertainty:
                mostUncertainTaskIndices.append(i)
                mostUncertainTasks.append(task)

        #(mostUncertainTaskIndex,
        # mostUncertainTask) = sample(zip(mostUncertainTaskIndices,
        #                               mostUncertainTasks), 1)[0]

        mostUncertainTaskIndex = mostUncertainTaskIndices[0]
        mostUncertainTask = mostUncertainTasks[0]

        return (self.classifier.predict_proba([mostUncertainTask])[0],
                mostUncertainTaskIndex)

    def getTotalUncertainty(self, examples):

        totalUncertainty = 0.0
        for example in examples:
            #print "YO"
            #print self.getUncertainty(example)
            totalUncertainty += self.getUncertainty(example)

        totalUncertainty /= len(examples)

        #return max(self.getAllUncertainties(examples))
        return totalUncertainty
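
A hypothetical driver for LRWrapper, assuming the python-weka-wrapper package is installed and its JVM is started before any Weka call; the training data below is made up.

import weka.core.jvm as jvm

jvm.start()
train_X = [[0.1, 0.2, 0.3, 0.4], [0.9, 0.8, 0.7, 0.6]] * 5  # toy feature rows
train_y = [0, 1] * 5                                        # toy binary labels
model = LRWrapper(C=1.0)
model.retrain(train_X, train_y)       # writes trainingweka.arff and trains Logistic
print(model.score(train_X, train_y))  # percent correct via Weka's Evaluation
jvm.stop()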