예제 #1
0
def learn_generative(y_data):
    """
    Uses Snorkel to learn a generative model of the relative accuracies of LFs.
    It learns one generative model for each class (13 classes), and combines
    the per-class marginals into a set of noisy labels.

    Parameters
    ----------
    y_data : iterable
        Each element is one example's LF votes: a sequence of per-LF vote
        vectors where vote[i] is that LF's vote for class i.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_examples, 13) of per-class marginal probabilities.
    """
    n_classes = 13
    # labels[i] collects, per example, the vector of all LF votes for class i.
    labels = [[] for _ in range(n_classes)]
    for ex in y_data:
        for i in range(n_classes):
            labels[i].append(np.array([int(vote[i]) for vote in ex]))
    # BUG FIX: under Python 3, `map(...)` returns a lazy iterator, so the
    # original `np.array(map(...))` produced a 0-d object array and the
    # per-class loop below broke. Materialize with a list comprehension.
    labels = np.array([np.array(x) for x in labels])
    n_labels = []
    n_stats = []
    for i, class_lbl in enumerate(labels):
        print("learning generative model for label: {}".format(i))
        # Opens a Snorkel DB session (side effect kept from the original).
        session = SnorkelSession()
        gen_model = GenerativeModel()
        gen_model.train(class_lbl,
                        epochs=100,
                        decay=0.95,
                        step_size=0.1 / class_lbl.shape[0],
                        reg_param=1e-6,
                        cardinality=2)
        train_marginals = gen_model.marginals(csr_matrix(class_lbl))
        n_labels.append(train_marginals)
        n_stats.append(gen_model.learned_lf_stats())
    # Persist per-class LF statistics; int_to_label maps class index -> name.
    for i, stats in enumerate(n_stats):
        stats.to_csv("./results/lf_stats/" + int_to_label[i],
                     sep=',',
                     encoding='utf-8')
    return np.array(n_labels).T
예제 #2
0
    def test_supervised(self):
        """Integration test: learn LF accuracies on synthetic binary data.

        Generates a 10000 x 5 label matrix with known LF accuracies plus a
        small supervised label set, then checks that GenerativeModel
        recovers accuracies/coverage (within ``tol``) under several
        settings: with/without accuracy priors, with/without supervision,
        and with deliberately bad priors at weak and strong regularization.
        """
        # A set of true priors
        tol = 0.1
        LF_acc_priors = [0.75, 0.75, 0.75, 0.75, 0.9]
        cardinality = 2
        # Map accuracy priors to the log-scale weights the model expects
        LF_acc_prior_weights = [
            0.5 * np.log((cardinality - 1.0) * x / (1 - x))
            for x in LF_acc_priors
        ]
        label_prior = 1

        # Defines a label matrix
        n = 10000
        L = sparse.lil_matrix((n, 5), dtype=np.int64)

        # Store the supervised gold labels separately
        labels = np.zeros(n, np.int64)

        for i in range(n):
            # True label y in {-1, +1}
            y = 2 * random.randint(0, 1) - 1
            # First four LFs always vote, and have decent acc
            L[i, 0] = y * (2 * (random.random() < LF_acc_priors[0]) - 1)
            L[i, 1] = y * (2 * (random.random() < LF_acc_priors[1]) - 1)
            L[i, 2] = y * (2 * (random.random() < LF_acc_priors[2]) - 1)
            L[i, 3] = y * (2 * (random.random() < LF_acc_priors[3]) - 1)

            # The fifth LF is very accurate but has a much smaller coverage
            if random.random() < 0.2:
                L[i, 4] = y * (2 * (random.random() < LF_acc_priors[4]) - 1)

            # The sixth LF is a small supervised set
            if random.random() < 0.1:
                labels[i] = y

        # Test with priors -- first check init vals are correct
        print("Testing init:")
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L,
                        LF_acc_prior_weights=LF_acc_prior_weights,
                        labels=labels,
                        reg_type=2,
                        reg_param=1,
                        epochs=0)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        print(accs)
        print(gen_model.weights.lf_propensity)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))

        # Now test that estimated LF accs are not too far off
        print("\nTesting estimated LF accs (TOL=%s)" % tol)
        gen_model.train(
            L,
            LF_acc_prior_weights=LF_acc_prior_weights,
            labels=labels,
            reg_type=0,
            reg_param=0.0,
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        # BUG FIX: the closing paren of np.abs was misplaced, so `< tol`
        # was evaluated inside np.abs and large negative coverage
        # deviations passed. Compare |diff| < tol instead.
        self.assertTrue(
            np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))

        # Test without supervised
        print("\nTesting without supervised")
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L, reg_type=0)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        # BUG FIX: same misplaced-paren issue as above.
        self.assertTrue(
            np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2])) < tol))

        # Test with supervised
        print("\nTesting with supervised, without priors")
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L, labels=labels, reg_type=0)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        # BUG FIX: same misplaced-paren issue as above.
        self.assertTrue(
            np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))

        # Test without supervised, and (intentionally) bad priors, but weak strength
        print("\nTesting without supervised, with bad priors (weak)")
        gen_model = GenerativeModel(lf_propensity=True)
        bad_prior = [0.9, 0.8, 0.7, 0.6, 0.5]
        bad_prior_weights = [
            0.5 * np.log((cardinality - 1.0) * x / (1 - x)) for x in bad_prior
        ]
        gen_model.train(
            L,
            LF_acc_prior_weights=bad_prior_weights,
            reg_type=0,
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))

        # Test without supervised, and (intentionally) bad priors
        print("\nTesting without supervised, with bad priors (strong)")
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(
            L,
            LF_acc_prior_weights=bad_prior_weights,
            reg_type=2,
            reg_param=100 * n,
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        self.assertTrue(np.all(np.abs(accs - np.array(bad_prior)) < tol))
    def test_compile_no_deps(self):
        """Test GenerativeModel._compile on a 5x3 label matrix with no deps.

        Builds a tiny label matrix (LF0 always votes 1, LF1 votes mixed,
        LF2 always abstains) and checks the compiled factor-graph arrays:
        weights, variables, factors, factor-to-variable edges, the domain
        mask, and the total edge count.
        """
        # Defines a label matrix
        L = sparse.lil_matrix((5, 3))

        # The first LF always says yes
        L[0, 0] = 1
        L[1, 0] = 1
        L[2, 0] = 1
        L[3, 0] = 1
        L[4, 0] = 1

        # The second LF votes differently
        L[0, 1] = 1
        L[2, 1] = -1
        L[4, 1] = 1

        # The third LF always abstains

        # Tests compilation
        gen_model = GenerativeModel(class_prior=True,
                                    lf_prior=False,
                                    lf_propensity=False,
                                    lf_class_propensity=False)
        gen_model._process_dependency_graph(L, ())
        m, n = L.shape
        # One (unfixed) accuracy prior weight per LF
        LF_acc_prior_weights = [1.0 for _ in range(n)]
        is_fixed = [False for _ in range(n)]
        gen_model.cardinality = 2
        cardinalities = 2 * np.ones(5)
        weight, variable, factor, ftv, domain_mask, n_edges =\
            gen_model._compile(L, 0.5, 0.0, LF_acc_prior_weights, is_fixed,
                cardinalities)
        #
        # Weights
        #
        # Should now be 3 for LFs + 3 (fixed) for LF priors + 1 class prior
        self.assertEqual(len(weight), 7)

        # Weight 0 is the (learnable) class prior
        self.assertFalse(weight[0]['isFixed'])
        self.assertEqual(weight[0]['initialValue'], 0.0)

        # The LF priors (odd indices: fixed at the given prior weight)
        for i in range(1, 7, 2):
            self.assertTrue(weight[i]['isFixed'])
            self.assertEqual(weight[i]['initialValue'], 1.0)

        # The LF weights (even indices: learnable, start at 0)
        for i in range(2, 7, 2):
            self.assertFalse(weight[i]['isFixed'])
            self.assertEqual(weight[i]['initialValue'], 0.0)

        #
        # Variables
        #
        # 5 latent label variables + 5 * 3 observed LF-vote variables
        self.assertEqual(len(variable), 20)

        # Latent variables: binary, non-evidence
        for i in range(5):
            self.assertEqual(variable[i]['isEvidence'], 0)
            self.assertTrue(variable[i]['initialValue'] == 0
                            or variable[i]['initialValue'] == 1)
            self.assertEqual(variable[i]["dataType"], 0)
            self.assertEqual(variable[i]["cardinality"], 2)

        # Observed LF-vote variables: evidence, cardinality 3 (vote or abstain)
        for i in range(5):
            for j in range(3):
                self.assertEqual(variable[5 + i * 3 + j]['isEvidence'], 1)
                # Remap label value; abstain is 0 in L, cardinality (= 2) in NS
                if L[i, j] == -1:
                    l = 0
                elif L[i, j] == 0:
                    l = 2
                elif L[i, j] == 1:
                    l = 1
                self.assertEqual(variable[5 + i * 3 + j]['initialValue'], l)
                self.assertEqual(variable[5 + i * 3 + j]["dataType"], 0)
                self.assertEqual(variable[5 + i * 3 + j]["cardinality"], 3)

        #
        # Factors
        #
        # 5 * 3 LF acc factors + 5 * 3 LF prior factors + 5 class prior factors
        self.assertEqual(len(factor), 35)

        # Class prior factors come first, one per candidate
        for i in range(5):
            self.assertEqual(factor[i]["factorFunction"],
                             FACTORS["DP_GEN_CLASS_PRIOR"])
            self.assertEqual(factor[i]["weightId"], 0)
            self.assertEqual(factor[i]["featureValue"], 1)
            self.assertEqual(factor[i]["arity"], 1)
            self.assertEqual(factor[i]["ftv_offset"], i)

        # Then, per candidate, 6 accuracy factors (prior + weight per LF)
        for i in range(5):
            for j in range(6):
                self.assertEqual(factor[5 + i * 6 + j]["factorFunction"],
                                 FACTORS["DP_GEN_LF_ACCURACY"])
                self.assertEqual(factor[5 + i * 6 + j]["weightId"], j + 1)
                self.assertEqual(factor[5 + i * 6 + j]["featureValue"], 1)
                self.assertEqual(factor[5 + i * 6 + j]["arity"], 2)
                self.assertEqual(factor[5 + i * 6 + j]["ftv_offset"],
                                 5 + 2 * (i * 6 + j))

        #
        # Factor to Var
        #
        # 5 class-prior edges + 5 * 3 * 2 * 2 accuracy/prior edges
        self.assertEqual(len(ftv), 65)

        # Class prior factor - var edges
        for i in range(5):
            self.assertEqual(ftv[i]["vid"], i)
            self.assertEqual(ftv[i]["dense_equal_to"], 0)

        # LF *and LF prior* factor - var edges
        for i in range(5):
            for j in range(3):
                # Each LF has one weight factor and one prior factor here
                for k in range(2):
                    idx = 4 * (i * 3 + j) + 2 * k
                    self.assertEqual(ftv[5 + idx]["vid"], i)
                    self.assertEqual(ftv[6 + idx]["vid"], 5 + i * 3 + j)
                    self.assertEqual(ftv[5 + idx]["dense_equal_to"], 0)
                    self.assertEqual(ftv[6 + idx]["dense_equal_to"], 0)

        #
        # Domain mask
        #
        self.assertEqual(len(domain_mask), 20)
        for i in range(20):
            self.assertFalse(domain_mask[i])

        # n_edges
        self.assertEqual(n_edges, 65)
    def test_compile_with_deps(self):
        """Test GenerativeModel._compile on a 5x3 label matrix with deps.

        Same tiny label matrix as the no-deps test, plus five pairwise LF
        dependencies (SIMILAR x2, FIXING, REINFORCING, EXCLUSIVE) and LF
        propensity enabled. Verifies the compiled factor-graph arrays:
        weights, variables, factors (including one dep factor per
        candidate per dependency), factor-to-variable edges, the domain
        mask, and the total edge count.
        """
        # Defines a label matrix
        L = sparse.lil_matrix((5, 3))

        # The first LF always says yes
        L[0, 0] = 1
        L[1, 0] = 1
        L[2, 0] = 1
        L[3, 0] = 1
        L[4, 0] = 1

        # The second LF votes differently
        L[0, 1] = 1
        L[2, 1] = -1
        L[4, 1] = 1

        # The third LF always abstains

        # Defined dependencies
        deps = []
        deps.append((0, 1, DEP_SIMILAR))
        deps.append((0, 2, DEP_SIMILAR))

        deps.append((0, 1, DEP_FIXING))
        deps.append((0, 2, DEP_REINFORCING))
        deps.append((1, 2, DEP_EXCLUSIVE))

        # Tests compilation
        gen_model = GenerativeModel(class_prior=False,
                                    lf_prior=False,
                                    lf_propensity=True,
                                    lf_class_propensity=False)
        gen_model._process_dependency_graph(L, deps)
        m, n = L.shape
        # One (unfixed) accuracy prior weight per LF
        LF_acc_prior_weights = [1.0 for _ in range(n)]
        is_fixed = [False for _ in range(n)]
        gen_model.cardinality = 2
        cardinalities = 2 * np.ones(5)
        weight, variable, factor, ftv, domain_mask, n_edges =\
            gen_model._compile(L, 0.5, -1.0, LF_acc_prior_weights, is_fixed,
                cardinalities)

        #
        # Weights
        #
        # Should now be 3 for LFs + 3 fixed for LF priors + 3 for LF propensity
        # + 5 for deps
        self.assertEqual(len(weight), 14)

        # The LF priors (even indices: fixed at the given prior weight)
        for i in range(0, 6, 2):
            self.assertTrue(weight[i]['isFixed'])
            self.assertEqual(weight[i]['initialValue'], 1.0)

        # The LF weights (odd indices: learnable, start at 0)
        for i in range(1, 6, 2):
            self.assertFalse(weight[i]['isFixed'])
            self.assertEqual(weight[i]['initialValue'], 0.0)

        # The dep weights (propensity + dependency weights, init 0.5)
        for i in range(6, 14):
            self.assertFalse(weight[i]['isFixed'])
            self.assertEqual(weight[i]['initialValue'], 0.5)

        #
        # Variables
        #
        # 5 latent label variables + 5 * 3 observed LF-vote variables
        self.assertEqual(len(variable), 20)

        # Latent variables: binary, non-evidence
        for i in range(5):
            self.assertEqual(variable[i]['isEvidence'], 0)
            self.assertTrue(variable[i]['initialValue'] == 0
                            or variable[i]['initialValue'] == 1)
            self.assertEqual(variable[i]["dataType"], 0)
            self.assertEqual(variable[i]["cardinality"], 2)

        # Observed LF-vote variables: evidence, cardinality 3 (vote or abstain)
        for i in range(5):
            for j in range(3):
                self.assertEqual(variable[5 + i * 3 + j]['isEvidence'], 1)
                # Remap label value; abstain is 0 in L, cardinality (= 2) in NS
                if L[i, j] == -1:
                    l = 0
                elif L[i, j] == 0:
                    l = 2
                elif L[i, j] == 1:
                    l = 1
                self.assertEqual(variable[5 + i * 3 + j]['initialValue'], l)
                self.assertEqual(variable[5 + i * 3 + j]["dataType"], 0)
                self.assertEqual(variable[5 + i * 3 + j]["cardinality"], 3)

        #
        # Factors
        #
        # 30 accuracy + 15 propensity + 5 * 5 dependency factors
        self.assertEqual(len(factor), 70)

        # Accuracy factors: per candidate, 6 factors (prior + weight per LF)
        f_offset = 0
        ftv_offset = 0
        for i in range(5):
            for j in range(6):
                self.assertEqual(
                    factor[f_offset + i * 6 + j]["factorFunction"],
                    FACTORS["DP_GEN_LF_ACCURACY"])
                self.assertEqual(factor[f_offset + i * 6 + j]["weightId"], j)
                self.assertEqual(factor[f_offset + i * 6 + j]["featureValue"],
                                 1)
                self.assertEqual(factor[f_offset + i * 6 + j]["arity"], 2)
                self.assertEqual(factor[f_offset + i * 6 + j]["ftv_offset"],
                                 ftv_offset + 2 * (i * 6 + j))

        # LF propensity factors: one per candidate per LF (arity 1)
        f_offset = 30
        ftv_offset = 60
        for i in range(5):
            for j in range(3):
                self.assertEqual(
                    factor[f_offset + i * 3 + j]["factorFunction"],
                    FACTORS["DP_GEN_LF_PROPENSITY"])
                self.assertEqual(factor[f_offset + i * 3 + j]["weightId"],
                                 6 + j)
                self.assertEqual(factor[f_offset + i * 3 + j]["featureValue"],
                                 1)
                self.assertEqual(factor[f_offset + i * 3 + j]["arity"], 1)
                self.assertEqual(factor[f_offset + i * 3 + j]["ftv_offset"],
                                 ftv_offset + (i * 3 + j))

        # First SIMILAR dependency (LF 0, LF 1): one factor per candidate
        f_offset = 45
        ftv_offset = 75
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"],
                             FACTORS["DP_GEN_DEP_SIMILAR"])
            self.assertEqual(factor[f_offset + i]["weightId"], 9)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 2)
            self.assertEqual(factor[f_offset + i]["ftv_offset"],
                             ftv_offset + 2 * i)

        # Second SIMILAR dependency (LF 0, LF 2)
        f_offset = 50
        ftv_offset = 85
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"],
                             FACTORS["DP_GEN_DEP_SIMILAR"])
            self.assertEqual(factor[f_offset + i]["weightId"], 10)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 2)
            self.assertEqual(factor[f_offset + i]["ftv_offset"],
                             ftv_offset + 2 * i)

        # FIXING dependency (LF 0, LF 1): arity 3 (includes latent label)
        f_offset = 55
        ftv_offset = 95
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"],
                             FACTORS["DP_GEN_DEP_FIXING"])
            self.assertEqual(factor[f_offset + i]["weightId"], 11)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 3)
            self.assertEqual(factor[f_offset + i]["ftv_offset"],
                             ftv_offset + 3 * i)

        # REINFORCING dependency (LF 0, LF 2): arity 3 (includes latent label)
        f_offset = 60
        ftv_offset = 110
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"],
                             FACTORS["DP_GEN_DEP_REINFORCING"])
            self.assertEqual(factor[f_offset + i]["weightId"], 12)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 3)
            self.assertEqual(factor[f_offset + i]["ftv_offset"],
                             ftv_offset + 3 * i)

        # EXCLUSIVE dependency (LF 1, LF 2)
        f_offset = 65
        ftv_offset = 125
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"],
                             FACTORS["DP_GEN_DEP_EXCLUSIVE"])
            self.assertEqual(factor[f_offset + i]["weightId"], 13)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 2)
            self.assertEqual(factor[f_offset + i]["ftv_offset"],
                             ftv_offset + 2 * i)

        #
        # Factor to Var
        #
        self.assertEqual(len(ftv), 135)

        # Accuracy factor edges: (latent label, LF vote) per factor
        ftv_offset = 0
        for i in range(5):
            for j in range(3):
                for k in range(2):
                    self.assertEqual(
                        ftv[ftv_offset + 4 * (i * 3 + j) + 2 * k]["vid"], i)
                    self.assertEqual(
                        ftv[ftv_offset + 4 * (i * 3 + j) +
                            2 * k]["dense_equal_to"], 0)
                    self.assertEqual(
                        ftv[ftv_offset + 4 * (i * 3 + j) + 2 * k + 1]["vid"],
                        5 + i * 3 + j)
                    self.assertEqual(
                        ftv[ftv_offset + 4 * (i * 3 + j) + 2 * k +
                            1]["dense_equal_to"], 0)

        # Propensity factor edges: each touches only the LF-vote variable
        ftv_offset = 60
        for i in range(5):
            for j in range(3):
                self.assertEqual(ftv[ftv_offset + (i * 3 + j)]["vid"],
                                 5 + i * 3 + j)
                self.assertEqual(
                    ftv[ftv_offset + (i * 3 + j)]["dense_equal_to"], 0)

        # SIMILAR(0, 1) edges: LF 0 vote, LF 1 vote
        ftv_offset = 75
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3)
            self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"], 5 + i * 3 + 1)
            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)

        # SIMILAR(0, 2) edges: LF 0 vote, LF 2 vote
        ftv_offset = 85
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3)
            self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"], 5 + i * 3 + 2)
            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)

        # FIXING(0, 1) edges: latent label, LF 0 vote, LF 1 vote
        ftv_offset = 95
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 3 * i]["vid"], i)
            self.assertEqual(ftv[ftv_offset + 3 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 3 * i + 1]["vid"], 5 + i * 3)
            self.assertEqual(ftv[ftv_offset + 3 * i + 1]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 3 * i + 2]["vid"], 5 + i * 3 + 1)
            self.assertEqual(ftv[ftv_offset + 3 * i + 2]["dense_equal_to"], 0)

        # REINFORCING(0, 2) edges: latent label, LF 0 vote, LF 2 vote
        ftv_offset = 110
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 3 * i]["vid"], i)
            self.assertEqual(ftv[ftv_offset + 3 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 3 * i + 1]["vid"], 5 + i * 3)
            self.assertEqual(ftv[ftv_offset + 3 * i + 1]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 3 * i + 2]["vid"], 5 + i * 3 + 2)
            self.assertEqual(ftv[ftv_offset + 3 * i + 2]["dense_equal_to"], 0)

        # EXCLUSIVE(1, 2) edges: LF 1 vote, LF 2 vote
        ftv_offset = 125
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3 + 1)
            self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"], 5 + i * 3 + 2)
            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)

        #
        # Domain mask
        #
        self.assertEqual(len(domain_mask), 20)
        for i in range(20):
            self.assertFalse(domain_mask[i])

        # n_edges
        self.assertEqual(n_edges, 135)
예제 #5
0
print("Commit to snorkel database done...")


# Label generator: emits crowd-worker labels for each candidate.
def worker_label_generator(t):
    """Yield (worker_id, label) pairs for candidate t, read from cand_dict.

    cand_dict is a module-level mapping keyed by the tweet's stable_id;
    each entry maps worker ids to that worker's label.
    """
    for worker_id in cand_dict[t.tweet.stable_id]:
        yield worker_id, cand_dict[t.tweet.stable_id][worker_id]


# Fixed seed for reproducible label-matrix construction.
np.random.seed(1701)
labeler = LabelAnnotator(label_generator=worker_label_generator)
L_train = labeler.apply(split=0)

print(L_train.lf_stats(session))

# BUG FIX: corrected message typo ("Creat" -> "Create").
print("Create training data done...")
print(" -train data shape", (L_train.shape))

print("Start to train a generative model")
gen_model = GenerativeModel(lf_propensity=True)
gen_model.train(L_train, reg_type=2, reg_param=0.1, epochs=30)

# Report the learned LF statistics.
print(gen_model.learned_lf_stats())

# BUG FIX: corrected message typo ("genetive" -> "generative").
print("Train a generative model done...!")
train_marginals = gen_model.marginals(L_train)
print("Number of examples:", len(train_marginals))
print(train_marginals)
예제 #6
0
    def _test_categorical(self, L, LF_acc_priors, labels, label_prior=1,
                          candidate_ranges=None, cardinality=4, tol=0.1,
                          n=10000):
        """Run a suite of tests on a categorical label matrix.

        Checks that GenerativeModel recovers the LF accuracies (within
        ``tol``) under several settings: with/without accuracy priors,
        with/without supervision, and with deliberately bad priors at
        weak and strong regularization.
        """
        # Map to log scale weights
        LF_acc_prior_weights = [0.5 * np.log((cardinality - 1.0) * x / (1 - x)) for x in LF_acc_priors]

        # Test with priors -- first check init vals are correct
        print("Testing init:")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(
            L,
            LF_acc_prior_weights=LF_acc_prior_weights,
            labels=labels,
            reg_type=2,
            reg_param=1,
            epochs=0,
            candidate_ranges=candidate_ranges
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        print(accs)
        print(gen_model.weights.lf_propensity)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        print("Finished in {0} sec.".format(time()-t0))

        # Now test that estimated LF accs are not too far off
        print("\nTesting estimated LF accs (TOL=%s)" % tol)
        t0 = time()
        gen_model.train(
            L,
            LF_acc_prior_weights=LF_acc_prior_weights,
            labels=labels,
            reg_type=0,
            reg_param=0.0,
            candidate_ranges=candidate_ranges
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        # BUG FIX: np.abs's closing paren was misplaced, so `< tol` was
        # evaluated inside np.abs and large negative coverage deviations
        # passed. Compare |diff| < tol instead.
        self.assertTrue(np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))
        print("Finished in {0} sec.".format(time()-t0))

        # Test without supervised
        print("\nTesting without supervised")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L, reg_type=0, candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        # BUG FIX: same misplaced-paren issue as above.
        self.assertTrue(np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2])) < tol))
        print("Finished in {0} sec.".format(time()-t0))

        # Test with supervised
        print("\nTesting with supervised, without priors")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(
            L,
            labels=labels,
            reg_type=0,
            candidate_ranges=candidate_ranges
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        # BUG FIX: same misplaced-paren issue as above.
        self.assertTrue(np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))
        print("Finished in {0} sec.".format(time()-t0))

        # Test without supervised, and (intentionally) bad priors, but weak strength
        print("\nTesting without supervised, with bad priors (weak)")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        bad_prior = [0.9, 0.8, 0.7, 0.6, 0.5]
        bad_prior_weights = [0.5 * np.log((cardinality - 1.0) * x / (1 - x)) for x in bad_prior]
        gen_model.train(
            L,
            LF_acc_prior_weights=bad_prior_weights,
            reg_type=0,
            candidate_ranges=candidate_ranges
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        print("Finished in {0} sec.".format(time()-t0))

        # Test without supervised, and (intentionally) bad priors
        print("\nTesting without supervised, with bad priors (strong)")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(
            L,
            LF_acc_prior_weights=bad_prior_weights,
            reg_type=2,
            reg_param=100 * n,
            candidate_ranges=candidate_ranges
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        self.assertTrue(np.all(np.abs(accs - np.array(bad_prior)) < tol))
        print("Finished in {0} sec.".format(time()-t0))
예제 #7
0
    def test_supervised(self):
        """Integration test: learn LF accuracies on synthetic binary data.

        Generates a 10000 x 5 label matrix with known LF accuracies plus a
        small supervised label set, then checks that GenerativeModel
        recovers accuracies/coverage (within ``tol``) under several
        settings: with/without accuracy priors, with/without supervision,
        and with deliberately bad priors at weak and strong regularization.
        """
        # A set of true priors
        tol = 0.1
        LF_acc_priors = [0.75, 0.75, 0.75, 0.75, 0.9]
        cardinality = 2
        # Map accuracy priors to the log-scale weights the model expects
        LF_acc_prior_weights = [0.5 * np.log((cardinality - 1.0) * x / (1 - x)) for x in LF_acc_priors]
        label_prior = 1

        # Defines a label matrix
        n = 10000
        L = sparse.lil_matrix((n, 5), dtype=np.int64)

        # Store the supervised gold labels separately
        labels = np.zeros(n, np.int64)

        for i in range(n):
            # True label y in {-1, +1}
            y = 2 * random.randint(0, 1) - 1
            # First four LFs always vote, and have decent acc
            L[i, 0] = y * (2 * (random.random() < LF_acc_priors[0]) - 1)
            L[i, 1] = y * (2 * (random.random() < LF_acc_priors[1]) - 1)
            L[i, 2] = y * (2 * (random.random() < LF_acc_priors[2]) - 1)
            L[i, 3] = y * (2 * (random.random() < LF_acc_priors[3]) - 1)

            # The fifth LF is very accurate but has a much smaller coverage
            if random.random() < 0.2:
                L[i, 4] = y * (2 * (random.random() < LF_acc_priors[4]) - 1)

            # The sixth LF is a small supervised set
            if random.random() < 0.1:
                labels[i] = y

        # Test with priors -- first check init vals are correct
        print("Testing init:")
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(
            L,
            LF_acc_prior_weights=LF_acc_prior_weights,
            labels=labels,
            reg_type=2,
            reg_param=1,
            epochs=0
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        print(accs)
        print(gen_model.weights.lf_propensity)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))

        # Now test that estimated LF accs are not too far off
        print("\nTesting estimated LF accs (TOL=%s)" % tol)
        gen_model.train(
            L,
            LF_acc_prior_weights=LF_acc_prior_weights,
            labels=labels,
            reg_type=0,
            reg_param=0.0,
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        # BUG FIX: np.abs's closing paren was misplaced, so `< tol` was
        # evaluated inside np.abs and large negative coverage deviations
        # passed. Compare |diff| < tol instead.
        self.assertTrue(np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))

        # Test without supervised
        print("\nTesting without supervised")
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L, reg_type=0)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        # BUG FIX: same misplaced-paren issue as above.
        self.assertTrue(np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2])) < tol))

        # Test with supervised
        print("\nTesting with supervised, without priors")
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(
            L,
            labels=labels,
            reg_type=0
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        # BUG FIX: same misplaced-paren issue as above.
        self.assertTrue(np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))

        # Test without supervised, and (intentionally) bad priors, but weak strength
        print("\nTesting without supervised, with bad priors (weak)")
        gen_model = GenerativeModel(lf_propensity=True)
        bad_prior = [0.9, 0.8, 0.7, 0.6, 0.5]
        bad_prior_weights = [0.5 * np.log((cardinality - 1.0) * x / (1 - x)) for x in bad_prior]
        gen_model.train(
            L,
            LF_acc_prior_weights=bad_prior_weights,
            reg_type=0,
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))

        # Test without supervised, and (intentionally) bad priors
        print("\nTesting without supervised, with bad priors (strong)")
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(
            L,
            LF_acc_prior_weights=bad_prior_weights,
            reg_type=2,
            reg_param=100 * n,
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        self.assertTrue(np.all(np.abs(accs - np.array(bad_prior)) < tol))
예제 #8
0
    def test_compile_no_deps(self):
        """Compile a 5x3 label matrix with a class prior and no LF
        dependencies, and check every array produced by ``_compile``:
        weights, variables, factors, factor-to-variable (ftv) records,
        the domain mask, and the total edge count.
        """
        # Defines a label matrix
        L = sparse.lil_matrix((5, 3))

        # The first LF always says yes
        L[0, 0] = 1
        L[1, 0] = 1
        L[2, 0] = 1
        L[3, 0] = 1
        L[4, 0] = 1

        # The second LF votes differently
        L[0, 1] = 1
        L[2, 1] = -1
        L[4, 1] = 1

        # The third LF always abstains

        # Tests compilation
        gen_model = GenerativeModel(class_prior=True,
                                    lf_prior=False,
                                    lf_propensity=False,
                                    lf_class_propensity=False)
        gen_model._process_dependency_graph(L, ())
        weight, variable, factor, ftv, domain_mask, n_edges = gen_model._compile(
            L, None, 1.0)

        #
        # Weights
        #
        # 1 class-prior weight + 3 LF accuracy weights
        self.assertEqual(len(weight), 4)

        self.assertFalse(weight[0]['isFixed'])
        self.assertEqual(weight[0]['initialValue'], 0.0)

        for i in range(1, 4):
            self.assertFalse(weight[i]['isFixed'])
            self.assertTrue(0.9 <= weight[i]['initialValue'] <= 1.1)

        #
        # Variables
        #
        # 5 latent class variables + 5*3 observed LF-output variables
        self.assertEqual(len(variable), 20)

        for i in range(5):
            self.assertEqual(variable[i]['isEvidence'], 0)
            self.assertTrue(variable[i]['initialValue'] == 0
                            or variable[i]['initialValue'] == 1)
            self.assertEqual(variable[i]["dataType"], 0)
            self.assertEqual(variable[i]["cardinality"], 2)

        for i in range(5):
            for j in range(3):
                self.assertEqual(variable[5 + i * 3 + j]['isEvidence'], 1)
                # LF votes in {-1, 0, 1} are shifted up by 1 to {0, 1, 2}
                self.assertEqual(variable[5 + i * 3 + j]['initialValue'],
                                 L[i, j] + 1)
                self.assertEqual(variable[5 + i * 3 + j]["dataType"], 0)
                self.assertEqual(variable[5 + i * 3 + j]["cardinality"], 3)

        #
        # Factors
        #
        # 5 class-prior factors + 5*3 LF accuracy factors
        self.assertEqual(len(factor), 20)

        for i in range(5):
            self.assertEqual(factor[i]["factorFunction"],
                             FACTORS["DP_GEN_CLASS_PRIOR"])
            self.assertEqual(factor[i]["weightId"], 0)
            self.assertEqual(factor[i]["featureValue"], 1)
            self.assertEqual(factor[i]["arity"], 1)
            self.assertEqual(factor[i]["ftv_offset"], i)

        for i in range(5):
            for j in range(3):
                self.assertEqual(factor[5 + i * 3 + j]["factorFunction"],
                                 FACTORS["DP_GEN_LF_ACCURACY"])
                self.assertEqual(factor[5 + i * 3 + j]["weightId"], j + 1)
                self.assertEqual(factor[5 + i * 3 + j]["featureValue"], 1)
                self.assertEqual(factor[5 + i * 3 + j]["arity"], 2)
                self.assertEqual(factor[5 + i * 3 + j]["ftv_offset"],
                                 5 + 2 * (i * 3 + j))

        #
        # Factor to Var
        #
        # 5 prior edges (arity 1) + 15 accuracy factors * 2 vars each
        self.assertEqual(len(ftv), 35)

        for i in range(5):
            self.assertEqual(ftv[i]["vid"], i)
            self.assertEqual(ftv[i]["dense_equal_to"], 0)

        for i in range(5):
            for j in range(3):
                # Each accuracy factor connects class var i and LF var (i, j)
                self.assertEqual(ftv[5 + 2 * (i * 3 + j)]["vid"], i)
                self.assertEqual(ftv[6 + 2 * (i * 3 + j)]["vid"],
                                 5 + i * 3 + j)
                self.assertEqual(ftv[5 + 2 * (i * 3 + j)]["dense_equal_to"], 0)
                self.assertEqual(ftv[6 + 2 * (i * 3 + j)]["dense_equal_to"], 0)

        #
        # Domain mask
        #
        self.assertEqual(len(domain_mask), 20)
        for i in range(20):
            self.assertFalse(domain_mask[i])

        # n_edges
        self.assertEqual(n_edges, 35)
예제 #9
0
    def test_compile_with_deps(self):
        """Compile a 5x3 label matrix with LF propensity enabled and five
        pairwise LF dependencies (SIMILAR x2, FIXING, REINFORCING,
        EXCLUSIVE), then check the compiled weights, variables, factors,
        ftv records, domain mask, and edge count section by section.
        """
        # Defines a label matrix
        L = sparse.lil_matrix((5, 3))

        # The first LF always says yes
        L[0, 0] = 1
        L[1, 0] = 1
        L[2, 0] = 1
        L[3, 0] = 1
        L[4, 0] = 1

        # The second LF votes differently
        L[0, 1] = 1
        L[2, 1] = -1
        L[4, 1] = 1

        # The third LF always abstains

        # Defined dependencies
        deps = []
        deps.append((0, 1, DEP_SIMILAR))
        deps.append((0, 2, DEP_SIMILAR))

        deps.append((0, 1, DEP_FIXING))
        deps.append((0, 2, DEP_REINFORCING))
        deps.append((1, 2, DEP_EXCLUSIVE))

        # Tests compilation
        gen_model = GenerativeModel(class_prior=False,
                                    lf_prior=False,
                                    lf_propensity=True,
                                    lf_class_propensity=False)
        gen_model._process_dependency_graph(L, deps)
        weight, variable, factor, ftv, domain_mask, n_edges = gen_model._compile(
            L, None, 1.0)

        #
        # Weights
        #
        # 3 LF accuracy + 3 LF propensity + 5 dependency weights
        self.assertEqual(len(weight), 11)

        for i in range(3):
            self.assertFalse(weight[i]['isFixed'])
            self.assertTrue(0.9 <= weight[i]['initialValue'] <= 1.1)
        for i in range(3, 11):
            self.assertFalse(weight[i]['isFixed'])
            self.assertEqual(weight[i]['initialValue'], 0.0)

        #
        # Variables
        #
        # 5 latent class variables + 5*3 observed LF-output variables
        self.assertEqual(len(variable), 20)

        for i in range(5):
            self.assertEqual(variable[i]['isEvidence'], 0)
            self.assertTrue(variable[i]['initialValue'] == 0
                            or variable[i]['initialValue'] == 1)
            self.assertEqual(variable[i]["dataType"], 0)
            self.assertEqual(variable[i]["cardinality"], 2)

        for i in range(5):
            for j in range(3):
                self.assertEqual(variable[5 + i * 3 + j]['isEvidence'], 1)
                # LF votes in {-1, 0, 1} are shifted up by 1 to {0, 1, 2}
                self.assertEqual(variable[5 + i * 3 + j]['initialValue'],
                                 L[i, j] + 1)
                self.assertEqual(variable[5 + i * 3 + j]["dataType"], 0)
                self.assertEqual(variable[5 + i * 3 + j]["cardinality"], 3)

        #
        # Factors
        #
        # 15 accuracy + 15 propensity + 5 factors per dependency (x5)
        self.assertEqual(len(factor), 55)

        f_offset = 0
        ftv_offset = 0
        for i in range(5):
            for j in range(3):
                self.assertEqual(
                    factor[f_offset + i * 3 + j]["factorFunction"],
                    FACTORS["DP_GEN_LF_ACCURACY"])
                self.assertEqual(factor[f_offset + i * 3 + j]["weightId"], j)
                self.assertEqual(factor[f_offset + i * 3 + j]["featureValue"],
                                 1)
                self.assertEqual(factor[f_offset + i * 3 + j]["arity"], 2)
                self.assertEqual(factor[f_offset + i * 3 + j]["ftv_offset"],
                                 ftv_offset + 2 * (i * 3 + j))

        f_offset = 15
        ftv_offset = 30
        for i in range(5):
            for j in range(3):
                self.assertEqual(
                    factor[f_offset + i * 3 + j]["factorFunction"],
                    FACTORS["DP_GEN_LF_PROPENSITY"])
                self.assertEqual(factor[f_offset + i * 3 + j]["weightId"],
                                 3 + j)
                self.assertEqual(factor[f_offset + i * 3 + j]["featureValue"],
                                 1)
                self.assertEqual(factor[f_offset + i * 3 + j]["arity"], 1)
                self.assertEqual(factor[f_offset + i * 3 + j]["ftv_offset"],
                                 ftv_offset + (i * 3 + j))

        # First DEP_SIMILAR dependency, compiled as EQUAL factors
        f_offset = 30
        ftv_offset = 45
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"],
                             FACTORS["EQUAL"])
            self.assertEqual(factor[f_offset + i]["weightId"], 6)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 2)
            self.assertEqual(factor[f_offset + i]["ftv_offset"],
                             ftv_offset + 2 * i)

        # Second DEP_SIMILAR dependency
        f_offset = 35
        ftv_offset = 55
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"],
                             FACTORS["EQUAL"])
            self.assertEqual(factor[f_offset + i]["weightId"], 7)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 2)
            self.assertEqual(factor[f_offset + i]["ftv_offset"],
                             ftv_offset + 2 * i)

        f_offset = 40
        ftv_offset = 65
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"],
                             FACTORS["DP_GEN_DEP_FIXING"])
            self.assertEqual(factor[f_offset + i]["weightId"], 8)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 3)
            self.assertEqual(factor[f_offset + i]["ftv_offset"],
                             ftv_offset + 3 * i)

        f_offset = 45
        ftv_offset = 80
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"],
                             FACTORS["DP_GEN_DEP_REINFORCING"])
            self.assertEqual(factor[f_offset + i]["weightId"], 9)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 3)
            self.assertEqual(factor[f_offset + i]["ftv_offset"],
                             ftv_offset + 3 * i)

        f_offset = 50
        ftv_offset = 95
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"],
                             FACTORS["DP_GEN_DEP_EXCLUSIVE"])
            self.assertEqual(factor[f_offset + i]["weightId"], 10)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 2)
            self.assertEqual(factor[f_offset + i]["ftv_offset"],
                             ftv_offset + 2 * i)

        #
        # Factor to Var
        #
        self.assertEqual(len(ftv), 105)

        # Accuracy factor edges: (class var i, LF var (i, j))
        ftv_offset = 0
        for i in range(5):
            for j in range(3):
                self.assertEqual(ftv[ftv_offset + 2 * (i * 3 + j)]["vid"], i)
                self.assertEqual(
                    ftv[ftv_offset + 2 * (i * 3 + j)]["dense_equal_to"], 0)
                self.assertEqual(ftv[ftv_offset + 2 * (i * 3 + j) + 1]["vid"],
                                 5 + i * 3 + j)
                self.assertEqual(
                    ftv[ftv_offset + 2 * (i * 3 + j) + 1]["dense_equal_to"], 0)

        # Propensity factor edges: LF var (i, j) only
        ftv_offset = 30
        for i in range(5):
            for j in range(3):
                self.assertEqual(ftv[ftv_offset + (i * 3 + j)]["vid"],
                                 5 + i * 3 + j)
                self.assertEqual(
                    ftv[ftv_offset + (i * 3 + j)]["dense_equal_to"], 0)

        # DEP_SIMILAR(0, 1) edges: LF vars 0 and 1 of each candidate
        ftv_offset = 45
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3)
            self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"], 5 + i * 3 + 1)
            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)

        # DEP_SIMILAR(0, 2) edges: LF vars 0 and 2 of each candidate
        ftv_offset = 55
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3)
            self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"], 5 + i * 3 + 2)
            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)

        # DEP_FIXING(0, 1) edges: class var plus LF vars 0 and 1
        ftv_offset = 65
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 3 * i]["vid"], i)
            self.assertEqual(ftv[ftv_offset + 3 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 3 * i + 1]["vid"], 5 + i * 3)
            self.assertEqual(ftv[ftv_offset + 3 * i + 1]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 3 * i + 2]["vid"], 5 + i * 3 + 1)
            self.assertEqual(ftv[ftv_offset + 3 * i + 2]["dense_equal_to"], 0)

        # DEP_REINFORCING(0, 2) edges: class var plus LF vars 0 and 2
        ftv_offset = 80
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 3 * i]["vid"], i)
            self.assertEqual(ftv[ftv_offset + 3 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 3 * i + 1]["vid"], 5 + i * 3)
            self.assertEqual(ftv[ftv_offset + 3 * i + 1]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 3 * i + 2]["vid"], 5 + i * 3 + 2)
            self.assertEqual(ftv[ftv_offset + 3 * i + 2]["dense_equal_to"], 0)

        # DEP_EXCLUSIVE(1, 2) edges: LF vars 1 and 2 of each candidate
        ftv_offset = 95
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3 + 1)
            self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"], 5 + i * 3 + 2)
            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)

        #
        # Domain mask
        #
        self.assertEqual(len(domain_mask), 20)
        for i in range(20):
            self.assertFalse(domain_mask[i])

        # n_edges
        self.assertEqual(n_edges, 105)
예제 #10
0
    def _test_categorical(self,
                          L,
                          LF_acc_priors,
                          labels,
                          label_prior=1,
                          candidate_ranges=None,
                          cardinality=4,
                          tol=0.1,
                          n=10000):
        """Run a suite of tests over a categorical label matrix.

        Trains the generative model under several configurations (with
        priors, with supervised labels, without either, and with
        intentionally bad priors at weak/strong regularization) and checks
        that the learned LF accuracies and coverages match expectations.

        Args:
            L: sparse label matrix of LF votes.
            LF_acc_priors: list of prior LF accuracies in (0, 1).
            labels: supervised (gold) labels array.
            label_prior: assumed accuracy of the supervised labels.
            candidate_ranges: optional per-candidate value ranges.
            cardinality: number of classes.
            tol: absolute tolerance for accuracy/coverage checks.
            n: number of candidates (used to scale reg_param).
        """
        # Map accuracy priors to log-scale weights. Use a list, not
        # `map`: in Python 3 a `map` iterator would be exhausted by the
        # first train() call and silently empty on every reuse below.
        LF_acc_prior_weights = [
            0.5 * np.log((cardinality - 1.0) * x / (1 - x))
            for x in LF_acc_priors
        ]

        # Test with priors -- first check init vals are correct
        print("Testing init:")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L,
                        LF_acc_prior_weights=LF_acc_prior_weights,
                        labels=labels,
                        reg_type=2,
                        reg_param=1,
                        epochs=0,
                        candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        print(accs)
        print(gen_model.weights.lf_propensity)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        print("Finished in {0} sec.".format(time() - t0))

        # Now test that estimated LF accs are not too far off
        print("\nTesting estimated LF accs (TOL=%s)" % tol)
        t0 = time()
        gen_model.train(L,
                        LF_acc_prior_weights=LF_acc_prior_weights,
                        labels=labels,
                        reg_type=0,
                        reg_param=0.0,
                        candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        # NOTE: `< tol` must apply to the absolute difference, not inside
        # np.abs (which would make the assertion vacuously true).
        self.assertTrue(
            np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))
        print("Finished in {0} sec.".format(time() - t0))

        # Test without supervised
        print("\nTesting without supervised")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L, reg_type=0, candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        self.assertTrue(
            np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2])) < tol))
        print("Finished in {0} sec.".format(time() - t0))

        # Test with supervised
        print("\nTesting with supervised, without priors")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L,
                        labels=labels,
                        reg_type=0,
                        candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        self.assertTrue(
            np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))
        print("Finished in {0} sec.".format(time() - t0))

        # Test without supervised, and (intentionally) bad priors, but weak strength
        print("\nTesting without supervised, with bad priors (weak)")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        bad_prior = [0.9, 0.8, 0.7, 0.6, 0.5]
        # List, not `map`: reused by the strong-regularization test below.
        bad_prior_weights = [
            0.5 * np.log((cardinality - 1.0) * x / (1 - x))
            for x in bad_prior
        ]
        gen_model.train(L,
                        LF_acc_prior_weights=bad_prior_weights,
                        reg_type=0,
                        candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        print("Finished in {0} sec.".format(time() - t0))

        # Test without supervised, and (intentionally) bad priors
        print("\nTesting without supervised, with bad priors (strong)")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L,
                        LF_acc_prior_weights=bad_prior_weights,
                        reg_type=2,
                        reg_param=100 * n,
                        candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        self.assertTrue(np.all(np.abs(accs - np.array(bad_prior)) < tol))
        print("Finished in {0} sec.".format(time() - t0))
예제 #11
0
    def test_compile_no_deps(self):
        """Compile a 5x3 label matrix with a class prior and no LF
        dependencies, using the `_compile` signature that takes explicit
        LF prior weights, and check the compiled weights, variables,
        factors, ftv records, domain mask, and edge count.
        """
        # Defines a label matrix
        L = sparse.lil_matrix((5, 3))

        # The first LF always says yes
        L[0, 0] = 1
        L[1, 0] = 1
        L[2, 0] = 1
        L[3, 0] = 1
        L[4, 0] = 1

        # The second LF votes differently
        L[0, 1] = 1
        L[2, 1] = -1
        L[4, 1] = 1

        # The third LF always abstains

        # Tests compilation
        gen_model = GenerativeModel(class_prior=True, lf_prior=False,
            lf_propensity=False, lf_class_propensity=False)
        gen_model._process_dependency_graph(L, ())
        m, n = L.shape
        # One fixed prior weight (value 1.0) per LF, none pinned
        LF_acc_prior_weights = [1.0 for _ in range(n)]
        is_fixed = [False for _ in range(n)]
        gen_model.cardinality = 2
        cardinalities = 2 * np.ones(5)
        weight, variable, factor, ftv, domain_mask, n_edges =\
            gen_model._compile(L, 0.5, 0.0, LF_acc_prior_weights, is_fixed, 
                cardinalities)
        #
        # Weights
        #
        # Should now be 3 for LFs + 3 (fixed) for LF priors + 1 class prior
        self.assertEqual(len(weight), 7)

        self.assertFalse(weight[0]['isFixed'])
        self.assertEqual(weight[0]['initialValue'], 0.0)

        # The LF priors (fixed, interleaved at odd indices)
        for i in range(1,7,2):
            self.assertTrue(weight[i]['isFixed'])
            self.assertEqual(weight[i]['initialValue'], 1.0)

        # The LF weights (learnable, at even indices)
        for i in range(2,7,2):
            self.assertFalse(weight[i]['isFixed'])
            self.assertEqual(weight[i]['initialValue'], 0.0)

        #
        # Variables
        #
        # 5 latent class variables + 5*3 observed LF-output variables
        self.assertEqual(len(variable), 20)

        for i in range(5):
            self.assertEqual(variable[i]['isEvidence'], 0)
            self.assertTrue(variable[i]['initialValue'] == 0 or variable[i]['initialValue'] == 1)
            self.assertEqual(variable[i]["dataType"], 0)
            self.assertEqual(variable[i]["cardinality"], 2)

        for i in range(5):
            for j in range(3):
                self.assertEqual(variable[5 + i * 3 + j]['isEvidence'], 1)
                # Remap label value; abstain is 0 in L, cardinality (= 2) in NS
                if L[i, j] == -1:
                    l = 0
                elif L[i, j] == 0:
                    l = 2
                elif L[i,j] == 1:
                    l = 1
                self.assertEqual(variable[5 + i * 3 + j]['initialValue'], l)
                self.assertEqual(variable[5 + i * 3 + j]["dataType"], 0)
                self.assertEqual(variable[5 + i * 3 + j]["cardinality"], 3)

        #
        # Factors
        #
        # 5 * 3 LF acc factors + 5 * 3 LF prior factors + 5 class prior factors
        self.assertEqual(len(factor), 35)

        for i in range(5):
            self.assertEqual(factor[i]["factorFunction"], FACTORS["DP_GEN_CLASS_PRIOR"])
            self.assertEqual(factor[i]["weightId"], 0)
            self.assertEqual(factor[i]["featureValue"], 1)
            self.assertEqual(factor[i]["arity"], 1)
            self.assertEqual(factor[i]["ftv_offset"], i)

        # 6 accuracy-style factors per candidate: prior + weight per LF
        for i in range(5):
            for j in range(6):
                self.assertEqual(factor[5 + i * 6 + j]["factorFunction"], FACTORS["DP_GEN_LF_ACCURACY"])
                self.assertEqual(factor[5 + i * 6 + j]["weightId"], j + 1)
                self.assertEqual(factor[5 + i * 6 + j]["featureValue"], 1)
                self.assertEqual(factor[5 + i * 6 + j]["arity"], 2)
                self.assertEqual(factor[5 + i * 6 + j]["ftv_offset"], 5 + 2 * (i * 6 + j))

        #
        # Factor to Var
        #
        # 5 prior edges + 30 binary factors * 2 vars each
        self.assertEqual(len(ftv), 65)

        # Class prior factor - var edges
        for i in range(5):
            self.assertEqual(ftv[i]["vid"], i)
            self.assertEqual(ftv[i]["dense_equal_to"], 0)

        # LF *and LF prior* factor - var edges
        for i in range(5):
            for j in range(3):
                # Each LF has one weight factor and one prior factor here
                for k in range(2):
                    idx = 4 * (i * 3 + j) + 2 * k
                    self.assertEqual(ftv[5 + idx]["vid"], i)
                    self.assertEqual(ftv[6 + idx]["vid"], 5 + i * 3 + j)
                    self.assertEqual(ftv[5 + idx]["dense_equal_to"], 0)
                    self.assertEqual(ftv[6 + idx]["dense_equal_to"], 0)

        #
        # Domain mask
        #
        self.assertEqual(len(domain_mask), 20)
        for i in range(20):
            self.assertFalse(domain_mask[i])

        # n_edges
        self.assertEqual(n_edges, 65)
예제 #12
0
    def test_compile_with_deps(self):
        """Compile a 5x3 label matrix with LF propensity and five pairwise
        LF dependencies (SIMILAR x2, FIXING, REINFORCING, EXCLUSIVE),
        using the `_compile` signature with explicit LF prior weights,
        and check each compiled section against expected offsets.
        """
        # Defines a label matrix
        L = sparse.lil_matrix((5, 3))

        # The first LF always says yes
        L[0, 0] = 1
        L[1, 0] = 1
        L[2, 0] = 1
        L[3, 0] = 1
        L[4, 0] = 1

        # The second LF votes differently
        L[0, 1] = 1
        L[2, 1] = -1
        L[4, 1] = 1

        # The third LF always abstains

        # Defined dependencies
        deps = []
        deps.append((0, 1, DEP_SIMILAR))
        deps.append((0, 2, DEP_SIMILAR))

        deps.append((0, 1, DEP_FIXING))
        deps.append((0, 2, DEP_REINFORCING))
        deps.append((1, 2, DEP_EXCLUSIVE))

        # Tests compilation
        gen_model = GenerativeModel(class_prior=False, lf_prior=False,
            lf_propensity=True, lf_class_propensity=False)
        gen_model._process_dependency_graph(L, deps)
        m, n = L.shape
        # One fixed prior weight (value 1.0) per LF, none pinned
        LF_acc_prior_weights = [1.0 for _ in range(n)]
        is_fixed = [False for _ in range(n)]
        gen_model.cardinality = 2
        cardinalities = 2 * np.ones(5)
        weight, variable, factor, ftv, domain_mask, n_edges =\
            gen_model._compile(L, 0.5, -1.0, LF_acc_prior_weights, is_fixed,
                cardinalities)

        #
        # Weights
        #
        # Should now be 3 for LFs + 3 fixed for LF priors + 3 for LF propensity
        # + 5 for deps
        self.assertEqual(len(weight), 14)

        # The LF priors (fixed, at even indices 0, 2, 4)
        for i in range(0,6,2):
            self.assertTrue(weight[i]['isFixed'])
            self.assertEqual(weight[i]['initialValue'], 1.0)

        # The LF weights (learnable, at odd indices 1, 3, 5)
        for i in range(1,6,2):
            self.assertFalse(weight[i]['isFixed'])
            self.assertEqual(weight[i]['initialValue'], 0.0)

        # The dep weights (init value comes from the 0.5 passed to _compile)
        for i in range(6, 14):
            self.assertFalse(weight[i]['isFixed'])
            self.assertEqual(weight[i]['initialValue'], 0.5)

        #
        # Variables
        #
        # 5 latent class variables + 5*3 observed LF-output variables
        self.assertEqual(len(variable), 20)

        for i in range(5):
            self.assertEqual(variable[i]['isEvidence'], 0)
            self.assertTrue(variable[i]['initialValue'] == 0 or variable[i]['initialValue'] == 1)
            self.assertEqual(variable[i]["dataType"], 0)
            self.assertEqual(variable[i]["cardinality"], 2)

        for i in range(5):
            for j in range(3):
                self.assertEqual(variable[5 + i * 3 + j]['isEvidence'], 1)
                # Remap label value; abstain is 0 in L, cardinality (= 2) in NS
                if L[i, j] == -1:
                    l = 0
                elif L[i, j] == 0:
                    l = 2
                elif L[i,j] == 1:
                    l = 1
                self.assertEqual(variable[5 + i * 3 + j]['initialValue'], l)
                self.assertEqual(variable[5 + i * 3 + j]["dataType"], 0)
                self.assertEqual(variable[5 + i * 3 + j]["cardinality"], 3)

        #
        # Factors
        #
        # 30 accuracy (prior + weight per LF) + 15 propensity + 25 dep factors
        self.assertEqual(len(factor), 70)

        f_offset = 0
        ftv_offset = 0
        for i in range(5):
            for j in range(6):
                self.assertEqual(factor[f_offset + i * 6+ j]["factorFunction"], FACTORS["DP_GEN_LF_ACCURACY"])
                self.assertEqual(factor[f_offset + i * 6 + j]["weightId"], j)
                self.assertEqual(factor[f_offset + i * 6 + j]["featureValue"], 1)
                self.assertEqual(factor[f_offset + i * 6 + j]["arity"], 2)
                self.assertEqual(factor[f_offset + i * 6 + j]["ftv_offset"], ftv_offset + 2 * (i * 6 + j))

        f_offset = 30
        ftv_offset = 60
        for i in range(5):
            for j in range(3):
                self.assertEqual(factor[f_offset + i * 3 + j]["factorFunction"], FACTORS["DP_GEN_LF_PROPENSITY"])
                self.assertEqual(factor[f_offset + i * 3 + j]["weightId"], 6 + j)
                self.assertEqual(factor[f_offset + i * 3 + j]["featureValue"], 1)
                self.assertEqual(factor[f_offset + i * 3 + j]["arity"], 1)
                self.assertEqual(factor[f_offset + i * 3 + j]["ftv_offset"], ftv_offset + (i * 3 + j))

        # First DEP_SIMILAR dependency
        f_offset = 45
        ftv_offset = 75
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"], FACTORS["DP_GEN_DEP_SIMILAR"])
            self.assertEqual(factor[f_offset + i]["weightId"], 9)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 2)
            self.assertEqual(factor[f_offset + i]["ftv_offset"], ftv_offset + 2 * i)

        # Second DEP_SIMILAR dependency
        f_offset = 50
        ftv_offset = 85
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"], FACTORS["DP_GEN_DEP_SIMILAR"])
            self.assertEqual(factor[f_offset + i]["weightId"], 10)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 2)
            self.assertEqual(factor[f_offset + i]["ftv_offset"], ftv_offset + 2 * i)

        f_offset = 55
        ftv_offset = 95
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"], FACTORS["DP_GEN_DEP_FIXING"])
            self.assertEqual(factor[f_offset + i]["weightId"], 11)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 3)
            self.assertEqual(factor[f_offset + i]["ftv_offset"], ftv_offset + 3 * i)

        f_offset = 60
        ftv_offset = 110
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"], FACTORS["DP_GEN_DEP_REINFORCING"])
            self.assertEqual(factor[f_offset + i]["weightId"], 12)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 3)
            self.assertEqual(factor[f_offset + i]["ftv_offset"], ftv_offset + 3 * i)

        f_offset = 65
        ftv_offset = 125
        for i in range(5):
            self.assertEqual(factor[f_offset + i]["factorFunction"], FACTORS["DP_GEN_DEP_EXCLUSIVE"])
            self.assertEqual(factor[f_offset + i]["weightId"], 13)
            self.assertEqual(factor[f_offset + i]["featureValue"], 1)
            self.assertEqual(factor[f_offset + i]["arity"], 2)
            self.assertEqual(factor[f_offset + i]["ftv_offset"], ftv_offset + 2 * i)

        #
        # Factor to Var
        #
        self.assertEqual(len(ftv), 135)

        # Accuracy edges: prior factor and weight factor per LF, each
        # connecting class var i and LF var (i, j)
        ftv_offset = 0
        for i in range(5):
            for j in range(3):
                for k in range(2):
                    self.assertEqual(ftv[ftv_offset + 4 * (i * 3 + j) + 2 * k]["vid"], i)
                    self.assertEqual(ftv[ftv_offset + 4 * (i * 3 + j) + 2 * k]["dense_equal_to"], 0)
                    self.assertEqual(ftv[ftv_offset + 4 * (i * 3 + j) + 2 * k + 1]["vid"], 5 + i * 3 + j)
                    self.assertEqual(ftv[ftv_offset + 4 * (i * 3 + j) + 2 * k + 1]["dense_equal_to"], 0)

        # Propensity edges: LF var (i, j) only
        ftv_offset = 60
        for i in range(5):
            for j in range(3):
                self.assertEqual(ftv[ftv_offset + (i * 3 + j)]["vid"], 5 + i * 3 + j)
                self.assertEqual(ftv[ftv_offset + (i * 3 + j)]["dense_equal_to"], 0)

        # DEP_SIMILAR(0, 1) edges
        ftv_offset = 75
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3)
            self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"], 5 + i * 3 + 1)
            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)

        # DEP_SIMILAR(0, 2) edges
        ftv_offset = 85
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3)
            self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"], 5 + i * 3 + 2)
            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)

        # DEP_FIXING(0, 1) edges: class var plus LF vars 0 and 1
        ftv_offset = 95
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 3 * i]["vid"], i)
            self.assertEqual(ftv[ftv_offset + 3 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 3 * i + 1]["vid"], 5 + i * 3)
            self.assertEqual(ftv[ftv_offset + 3 * i + 1]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 3 * i + 2]["vid"], 5 + i * 3 + 1)
            self.assertEqual(ftv[ftv_offset + 3 * i + 2]["dense_equal_to"], 0)

        # DEP_REINFORCING(0, 2) edges: class var plus LF vars 0 and 2
        ftv_offset = 110
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 3 * i]["vid"], i)
            self.assertEqual(ftv[ftv_offset + 3 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 3 * i + 1]["vid"], 5 + i * 3)
            self.assertEqual(ftv[ftv_offset + 3 * i + 1]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 3 * i + 2]["vid"], 5 + i * 3 + 2)
            self.assertEqual(ftv[ftv_offset + 3 * i + 2]["dense_equal_to"], 0)

        # DEP_EXCLUSIVE(1, 2) edges
        ftv_offset = 125
        for i in range(5):
            self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3 + 1)
            self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)

            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"], 5 + i * 3 + 2)
            self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)

        #
        # Domain mask
        #
        self.assertEqual(len(domain_mask), 20)
        for i in range(20):
            self.assertFalse(domain_mask[i])

        # n_edges
        self.assertEqual(n_edges, 135)