Пример #1
0
def testRegression():
    y = data.loadSVMLightTarget("../feature_set/synthetic.reg")

    # we are using pseudo index instead of the real feature set to use the
    # precomputed kernel.
    x = range(y.shape[0])
    dataset = zip(x, y)
    random.shuffle(dataset)

    # this precomputed kernel reads the index value and returns the covariance
    # matrix
    kernel = kernel_factory.precomputed("../feature_set/synthetic.kernel")

    for train, test in data.kFolds(dataset):
        x_train, y_train = zip(*train)
        x_train = np.atleast_2d(np.array(x_train)).T
        y_train = np.atleast_2d(np.array(y_train)).T

        reg = regress.GPRegressor(kernel)
        reg.fit(x_train, y_train)
        print "fitted"

        # test
        x_test, y_test = zip(*test)
        x_test = np.atleast_2d(np.array(x_test)).T
        y_test = np.atleast_2d(np.array(y_test)).T
        y_predict = reg.predict(x_test)

        print x_test.T
        print y_predict.T

        break
Пример #2
0
    def run(self):
        #dataset = self.getFollowingData()
        dataset = self.loadData()

        for train, test in data.kFolds(dataset):
            # training
            m = self.trainModelOn(train)

            predicates = self.getPredicates(train)
            e = self.getMostEffective(m, predicates)
            print e
Пример #3
0
    def getSeeds(self):
        """Return the distinct user ids attached to the most effective
        predicates, aggregated across all k-fold splits."""
        collected = set()

        for train, _test in data.kFolds(self.loadData()):
            # train on the fold and rank its predicates
            model = self.trainModelOn(train)
            predicates = self.getPredicates(train)
            effective = self.getMostEffective(model, predicates)

            # each value is a list of tuples whose first item is a user id
            for pairs in effective.values():
                for pair in pairs:
                    collected.add(pair[0])

        return list(collected)
Пример #4
0
def outputOnlyMatched():
    maxent.set_verbose(1)

    text_dataset = getTextData(False)
    following_dataset = getFollowingData(False)
    dataset = zip(text_dataset, following_dataset)
    random.shuffle(dataset)
    print 'finished loading dataset'

    tester = tests.tester(4)

    n_total = 0
    n_emit = 0

    for train, test in data.kFolds(dataset):
        text_train, following_train = zip(*train)

        # training
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)

        # prediction
        trials = []

        for datum in test:
            text_datum, following_datum = datum
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum

            t_pre = t_model.predict(text_context)
            f_pre = f_model.predict(following_context)

            if t_pre == f_pre:
                trials.append((target, t_pre))
                n_emit += 1

            n_total += 1

        trials = zip(*trials)
        tester.record(trials[0], trials[1])

    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
    print 'emitted portion:', float(n_emit) / float(n_total)
Пример #5
0
    def getFreqplot(self, dataset):
        """Build a calibration histogram of model confidence scores.

        For every k-fold split a model is trained on the training part and
        evaluated on the held-out part; each (label, score) pair emitted by
        eval_all increments one of ``n_buckets`` equal-width score buckets on
        that label's row.

        Args:
            dataset: data accepted by data.kFolds; assumes 4 distinct labels
                (the histogram has 4 rows) -- TODO confirm

        Returns:
            np.ndarray of shape (4, n_buckets) holding raw counts.
        """
        n_buckets = 20

        cali_count = np.zeros((4, n_buckets))

        for train, test in data.kFolds(dataset):
            # training
            m = self.trainedModelOn(train)

            for datum in test:
                context, target, weight = datum
                pre = m.eval_all(context)

                for label, score in pre:
                    # BUG FIX: clamp so a perfect score of 1.0 lands in the
                    # last bucket instead of indexing one past the end
                    bucket = min(int(floor(n_buckets * score)), n_buckets - 1)
                    cali_count[int(label), bucket] += 1

        return cali_count
Пример #6
0
def scoreDistribution():
    maxent.set_verbose(1)
    text_dataset = getTextData()

    for train, test in data.kFolds(text_dataset):
        model = trainedModelOn(train)

        for datum in test:
            context, target, weight = datum
            pred = model.predict(context)
            model.eval_all(context)

            if pred != target:
                prob = map(itemgetter(1),
                        sorted(model.eval_all(context), key = itemgetter(0)))
                print prob, target

        break
Пример #7
0
    def doCrossValidation(self, dataset, size_limit):
        tester = tests.tester(4)

        for train, test in data.kFolds(dataset):
            # training
            train = random.sample(train, size_limit)
            m = self.trainedModelOn(train)

            # prediction
            trials = []

            for datum in test:
                context, target, weight = datum
                pre_target = m.predict(context)
                trials.append((target, pre_target))

            trials = zip(*trials)
            tester.record(trials[0], trials[1])

        print size_limit, tester.accuracy()
Пример #8
0
def seePredictionOnTrainingData():
    maxent.set_verbose(1)

    dataset = getTextData()
    print 'finished loading dataset'

    for train, test in data.kFolds(dataset):
        m = trainedModelOn(train)

        print "Accuracy on Training Set"
        for datum in train:
            context, target, weight = datum
            print m.eval_all(context)

        print "Accuracy on Test Set"
        for datum in test:
            context, target, weight = datum
            print m.eval_all(context)

        break
Пример #9
0
def regressAgeWithGP():
    y = data.loadSVMLightTarget("../feature_set/text.sreg")

    # we are using pseudo index instead of the real feature set to use the
    # precomputed kernel.
    x = range(y.shape[0])
    dataset = zip(x, y)
    random.shuffle(dataset)

    # this precomputed kernel reads the index value and returns the covariance
    # matrix
    for tau in [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3]:
        kernel = kernel_factory.precomputed("../feature_set/text.kernel", tau)

        rms_errors = []
        diviations = []

        for train, test in data.kFolds(dataset):
            x_train, y_train = zip(*train)
            x_train = np.atleast_2d(np.array(x_train)).T
            y_train = np.atleast_2d(np.array(y_train)).T

            reg = regress.GPRegressor(kernel)
            reg.fit(x_train, y_train)

            # test
            x_test, y_test = zip(*test)
            x_test = np.atleast_2d(np.array(x_test)).T
            y_test = np.atleast_2d(np.array(y_test)).T
            y_predict = reg.predict(x_test)

            rms_e = sqrt(np.mean((y_predict - y_test) ** 2))
            div_e = np.mean(np.absolute(y_predict - y_test))

            rms_errors.append(rms_e)
            diviations.append(div_e)

        print "tau:", tau
        print "rms error:", np.mean(np.array(rms_e))
        print "diviation:", np.mean(np.array(div_e))
Пример #10
0
def doCrossValidation(dataset):
    tester = tests.tester(4)

    for train, test in data.kFolds(dataset):
        # training
        m = trainedModelOn(train)
        print 'train size', len(train)

        # prediction
        trials = []

        for datum in test:
            context, target, weight = datum
            pre_target = m.predict(context)
            trials.append((target, pre_target))

        trials = zip(*trials)
        tester.record(trials[0], trials[1])

    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
Пример #11
0
def simpleEnsemble(pickup):
    maxent.set_verbose(1)

    text_dataset = getTextData(False)
    following_dataset = getFollowingData(False)
    dataset = zip(text_dataset, following_dataset)
    random.shuffle(dataset)
    print 'finished loading dataset'

    tester = tests.tester(4)

    for train, test in data.kFolds(dataset):
        text_train, following_train = zip(*train)

        # training
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)

        # prediction
        trials = []

        for datum in test:
            text_datum, following_datum = datum
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum

            t_conf = t_model.eval_all(text_context)
            f_conf = f_model.eval_all(following_context)

            pre_target = str(pickup(t_conf, f_conf))

            trials.append((target, pre_target))

        trials = zip(*trials)
        tester.record(trials[0], trials[1])

    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
Пример #12
0
    def run(self):
        """Dump hard-to-classify users to 'confused.txt' for manual review.

        Trains a model per k-fold split and, for every test datum whose top
        prediction is both low-confidence (0.2 to 0.4) and wrong, writes the
        expected/predicted labels, user metadata, and the user's raw text.
        """
        dataset = self.loadData()

        f = open('confused.txt', 'w')

        for train, test in data.kFolds(dataset):
            # training
            m = self.trainedModelOn(train)

            dbcon = mlm.DBConnector()

            for datum in test:
                context, target, weight, meta = datum
                # presumably (label, confidence) pairs, best first -- TODO confirm
                pre = m.eval_all(context)

                label = pre[0][0]
                confi = pre[0][1]

                # low-confidence wrong answers are the interesting cases
                if .2 <= confi <= .4 and label != target:
                    f.write('###')
                    f.write(' '.join(
                        map(str, [target, label,
                            meta['user_id'], meta['screen_name'], confi])))
                    f.write('\n')

                    #for predicate, value in context:
                    #    f.write(predicate + ' ' + str(value) + '\n')
                    user_id = meta['user_id']

                    # pull the user's raw text for manual inspection
                    text, length = dbcon.loadText(user_id)
                    f.write(text)

                    f.write('\n\n\n')

        f.close()
Пример #13
0
def classifierEnsemble():
    """Cross-validate a stacked ensemble of a text model and a following model.

    Per fold: train both base models on the first 80% of the training split,
    train a maxent "chooser" on the remaining 20% using the base models'
    confidence lists as features, then retrain the base models on the full
    training split and evaluate the chooser on the test split.
    """
    text_dataset = getTextData(False)
    following_dataset = getFollowingData(False)
    dataset = zip(text_dataset, following_dataset)
    random.shuffle(dataset)
    print 'finished loading dataset'

    tester = tests.tester(4)

    def _conf_to_feature(conf1, conf2):
        """Merge two (label, score) confidence lists into one feature list,
        prefixing labels with '0'/'1' so the two models' features stay distinct."""
        def _append_to_key(c):
            def _append(f):
                return (c + f[0], f[1])
            return _append

        conf1 = map(_append_to_key('0'), conf1)
        conf2 = map(_append_to_key('1'), conf2)

        confs = conf1
        confs.extend(conf2)

        return confs


    for train, test in data.kFolds(dataset):
        # hold out the last 20% of this fold's training data for the chooser
        coffset = int(len(train) * .8)
        text_train, following_train = zip(*train[:coffset])

        # training
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)

        # train a chooser
        chooser = cmaxent.MaxentModel()
        chooser.begin_add_event()

        for datum in train[coffset:]:
            text_datum, following_datum = datum
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum

            t_conf = t_model.eval_all(text_context)
            f_conf = f_model.eval_all(following_context)

            confs = _conf_to_feature(t_conf, f_conf)
            chooser.add_event(confs, target)

        chooser.end_add_event(0)
        # presumably (iterations, method, l2-penalty, tolerance) -- TODO confirm
        chooser.train(50, 'lbfgs', 1e-1, 1e-4)

        # retrain the underlying classifiers on the full training split so
        # the chooser consumes confidences from stronger models at test time
        text_train, following_train = zip(*train)
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)


        # prediction
        trials = []

        for datum in test:
            text_datum, following_datum = datum
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum

            t_conf = t_model.eval_all(text_context)
            f_conf = f_model.eval_all(following_context)

            confs = _conf_to_feature(t_conf, f_conf)
            pre_target = chooser.predict(confs)

            trials.append((target, pre_target))

        trials = zip(*trials)
        tester.record(trials[0], trials[1])

    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()