Example #1
# Imports assumed by this snippet (old Snorkel 0.6.x-style API); `int_to_label`
# (class index -> name) is assumed to be defined elsewhere in the module.
import numpy as np
from scipy.sparse import csr_matrix

from snorkel import SnorkelSession
from snorkel.learning import GenerativeModel


def learn_generative(y_data):
    """
    Uses Snorkel to learn a generative model of the relative accuracies of the LFs.
    Learns one generative model per class (13 binary classes) and stacks the resulting
    training marginals into a single (n_examples, n_classes) array of noisy labels.
    """
    # labels[i] collects, for class i, one vote vector (one vote per LF/worker) per example
    labels = [[] for _ in range(13)]
    for ex in y_data:
        for i in range(13):
            label_i = [int(vote[i]) for vote in ex]
            labels[i].append(np.array(label_i))
    # Build the array explicitly (in Python 3, map() returns an iterator, which
    # np.array() would not expand into a numeric array)
    labels = np.array([np.array(class_votes) for class_votes in labels])
    n_labels = []
    n_stats = []
    for i, class_lbl in enumerate(labels):
        print("learning generative model for label: {}".format(i))
        session = SnorkelSession()
        gen_model = GenerativeModel()
        gen_model.train(class_lbl,
                        epochs=100,
                        decay=0.95,
                        step_size=0.1 / class_lbl.shape[0],
                        reg_param=1e-6,
                        cardinality=2)
        train_marginals = gen_model.marginals(csr_matrix(class_lbl))
        n_labels.append(train_marginals)
        n_stats.append(gen_model.learned_lf_stats())
    for i, stats in enumerate(n_stats):
        stats.to_csv("./results/lf_stats/" + int_to_label[i],
                     sep=',',
                     encoding='utf-8')
    return np.array(n_labels).T
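# A minimal usage sketch with hypothetical random data: each element of y_data is
# one example holding one vote vector per LF/worker, with one vote per class in
# {-1, 0, 1}.  Assumes `int_to_label` and the ./results/lf_stats/ directory exist
# when the per-class LF statistics are written out.
rng = np.random.RandomState(0)
y_data = [
    [list(rng.choice([-1, 0, 1], size=13)) for _ in range(3)]   # 3 LFs/workers
    for _ in range(50)                                          # 50 toy examples
]
noisy_labels = learn_generative(y_data)
print(noisy_labels.shape)  # (50, 13)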
Example #2
    def test_supervised(self):
        # A set of true priors
        tol = 0.1
        LF_acc_priors = [0.75, 0.75, 0.75, 0.75, 0.9]
        cardinality = 2
        LF_acc_prior_weights = [
            0.5 * np.log((cardinality - 1.0) * x / (1 - x))
            for x in LF_acc_priors
        ]
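        # Each accuracy prior p is mapped to a log-odds weight
        # 0.5 * log((cardinality - 1) * p / (1 - p)); e.g. p = 0.75 with
        # cardinality 2 gives 0.5 * ln(3) ~= 0.55, and p = 0.9 gives ~= 1.10.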
        label_prior = 1

        # Defines a label matrix
        n = 10000
        L = sparse.lil_matrix((n, 5), dtype=np.int64)

        # Store the supervised gold labels separately
        labels = np.zeros(n, np.int64)

        for i in range(n):
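            # Draw a true label y uniformly from {-1, +1}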
            y = 2 * random.randint(0, 1) - 1
            # First four LFs always vote, and have decent acc
            L[i, 0] = y * (2 * (random.random() < LF_acc_priors[0]) - 1)
            L[i, 1] = y * (2 * (random.random() < LF_acc_priors[1]) - 1)
            L[i, 2] = y * (2 * (random.random() < LF_acc_priors[2]) - 1)
            L[i, 3] = y * (2 * (random.random() < LF_acc_priors[3]) - 1)

            # The fifth LF is very accurate but has a much smaller coverage
            if random.random() < 0.2:
                L[i, 4] = y * (2 * (random.random() < LF_acc_priors[4]) - 1)

            # The sixth LF is a small supervised set
            if random.random() < 0.1:
                labels[i] = y

        # Test with priors -- first check init vals are correct
        print("Testing init:")
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L,
                        LF_acc_prior_weights=LF_acc_prior_weights,
                        labels=labels,
                        reg_type=2,
                        reg_param=1,
                        epochs=0)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        print(accs)
        print(gen_model.weights.lf_propensity)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))

        # Now test that estimated LF accs are not too far off
        print("\nTesting estimated LF accs (TOL=%s)" % tol)
        gen_model.train(
            L,
            LF_acc_prior_weights=LF_acc_prior_weights,
            labels=labels,
            reg_type=0,
            reg_param=0.0,
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        self.assertTrue(
            np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))

        # Test without supervised
        print("\nTesting without supervised")
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L, reg_type=0)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        self.assertTrue(
            np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2])) < tol))

        # Test with supervised
        print("\nTesting with supervised, without priors")
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L, labels=labels, reg_type=0)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        self.assertTrue(
            np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))

        # Test without supervised, and (intentionally) bad priors, but weak strength
        print("\nTesting without supervised, with bad priors (weak)")
        gen_model = GenerativeModel(lf_propensity=True)
        bad_prior = [0.9, 0.8, 0.7, 0.6, 0.5]
        bad_prior_weights = [
            0.5 * np.log((cardinality - 1.0) * x / (1 - x)) for x in bad_prior
        ]
        gen_model.train(
            L,
            LF_acc_prior_weights=bad_prior_weights,
            reg_type=0,
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))

        # Test without supervised, and (intentionally) bad priors
        print("\nTesting without supervised, with bad priors (strong)")
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(
            L,
            LF_acc_prior_weights=bad_prior_weights,
            reg_type=2,
            reg_param=100 * n,
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        self.assertTrue(np.all(np.abs(accs - np.array(bad_prior)) < tol))
Example #3
print("Commit to snorkel database done...")


# Label generator: for each candidate tweet, yield one (worker_id, label) pair per worker vote
def worker_label_generator(t):
    for worker_id in cand_dict[t.tweet.stable_id]:
        yield worker_id, cand_dict[t.tweet.stable_id][worker_id]


np.random.seed(1701)
labeler = LabelAnnotator(label_generator=worker_label_generator)
L_train = labeler.apply(split=0)

print(L_train.lf_stats(session))

print("Creat training data done...")
print(" -train data shape", (L_train.shape))

print("Start to train a generative model")
gen_model = GenerativeModel(lf_propensity=True)
gen_model.train(L_train, reg_type=2, reg_param=0.1, epochs=30)

#doing statistics
print(gen_model.learned_lf_stats())

print("Train a genetive model done...!")
train_marginals = gen_model.marginals(L_train)
print("Number of examples:", len(train_marginals))
print(train_marginals)
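# A minimal downstream sketch (hypothetical): the marginals are probabilistic
# training labels, so one simple follow-up is to keep only confident candidates
# and threshold them into hard labels (the old Snorkel tutorials instead save the
# marginals with snorkel.annotations.save_marginals and train a noise-aware end model).
confident = (train_marginals > 0.8) | (train_marginals < 0.2)
hard_labels = np.where(train_marginals > 0.5, 1, -1)
print("Confident candidates:", int(confident.sum()), "of", len(train_marginals))
print("Hard labels for confident candidates:", hard_labels[confident])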
Example #4
    def _test_categorical(self, L, LF_acc_priors, labels, label_prior=1,
                          candidate_ranges=None, cardinality=4, tol=0.1, n=10000):
        """Run a suite of tests."""
        # Map to log scale weights
        LF_acc_prior_weights = [0.5 * np.log((cardinality - 1.0) * x / (1 - x)) for x in LF_acc_priors]

        # Test with priors -- first check init vals are correct
        print("Testing init:")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(
            L,
            LF_acc_prior_weights=LF_acc_prior_weights,
            labels=labels,
            reg_type=2,
            reg_param=1,
            epochs=0,
            candidate_ranges=candidate_ranges
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        print(accs)
        print(gen_model.weights.lf_propensity)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        print("Finished in {0} sec.".format(time()-t0))

        # Now test that estimated LF accs are not too far off
        print("\nTesting estimated LF accs (TOL=%s)" % tol)
        t0 = time()
        gen_model.train(
            L,
            LF_acc_prior_weights=LF_acc_prior_weights,
            labels=labels,
            reg_type=0,
            reg_param=0.0,
            candidate_ranges=candidate_ranges
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        self.assertTrue(np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))
        print("Finished in {0} sec.".format(time()-t0))

        # Test without supervised
        print("\nTesting without supervised")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L, reg_type=0, candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        self.assertTrue(np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2])) < tol))
        print("Finished in {0} sec.".format(time()-t0))

        # Test with supervised
        print("\nTesting with supervised, without priors")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(
            L,
            labels=labels,
            reg_type=0,
            candidate_ranges=candidate_ranges
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        self.assertTrue(np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))
        print("Finished in {0} sec.".format(time()-t0))

        # Test without supervised, and (intentionally) bad priors, but weak strength
        print("\nTesting without supervised, with bad priors (weak)")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        bad_prior = [0.9, 0.8, 0.7, 0.6, 0.5]
        bad_prior_weights = [0.5 * np.log((cardinality - 1.0) * x / (1 - x)) for x in bad_prior]
        gen_model.train(
            L,
            LF_acc_prior_weights=bad_prior_weights,
            reg_type=0,
            candidate_ranges=candidate_ranges
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        print("Finished in {0} sec.".format(time()-t0))

        # Test without supervised, and (intentionally) bad priors
        print("\nTesting without supervised, with bad priors (strong)")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(
            L,
            LF_acc_prior_weights=bad_prior_weights,
            reg_type=2,
            reg_param=100 * n,
            candidate_ranges=candidate_ranges
        )
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        self.assertTrue(np.all(np.abs(accs - np.array(bad_prior)) < tol))
        print("Finished in {0} sec.".format(time()-t0))
Example #5
    def _test_categorical(self,
                          L,
                          LF_acc_priors,
                          labels,
                          label_prior=1,
                          candidate_ranges=None,
                          cardinality=4,
                          tol=0.1,
                          n=10000):
        """Run a suite of tests."""
        # Map to log scale weights
        # Build a list (not a Python 3 map iterator): these weights are passed to
        # gen_model.train() more than once below
        LF_acc_prior_weights = [
            0.5 * np.log((cardinality - 1.0) * x / (1 - x))
            for x in LF_acc_priors
        ]

        # Test with priors -- first check init vals are correct
        print("Testing init:")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L,
                        LF_acc_prior_weights=LF_acc_prior_weights,
                        labels=labels,
                        reg_type=2,
                        reg_param=1,
                        epochs=0,
                        candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        print(accs)
        print(gen_model.weights.lf_propensity)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        print("Finished in {0} sec.".format(time() - t0))

        # Now test that estimated LF accs are not too far off
        print("\nTesting estimated LF accs (TOL=%s)" % tol)
        t0 = time()
        gen_model.train(L,
                        LF_acc_prior_weights=LF_acc_prior_weights,
                        labels=labels,
                        reg_type=0,
                        reg_param=0.0,
                        candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        self.assertTrue(
            np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))
        print("Finished in {0} sec.".format(time() - t0))

        # Test without supervised
        print("\nTesting without supervised")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L, reg_type=0, candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        self.assertTrue(
            np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2])) < tol))
        print("Finished in {0} sec.".format(time() - t0))

        # Test with supervised
        print("\nTesting with supervised, without priors")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L,
                        labels=labels,
                        reg_type=0,
                        candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors + [label_prior])
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        self.assertTrue(
            np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))
        print("Finished in {0} sec.".format(time() - t0))

        # Test without supervised, and (intentionally) bad priors, but weak strength
        print("\nTesting without supervised, with bad priors (weak)")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        bad_prior = [0.9, 0.8, 0.7, 0.6, 0.5]
        bad_prior_weights = [
            0.5 * np.log((cardinality - 1.0) * x / (1 - x)) for x in bad_prior
        ]
        gen_model.train(L,
                        LF_acc_prior_weights=bad_prior_weights,
                        reg_type=0,
                        candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        print(coverage)
        priors = np.array(LF_acc_priors)
        self.assertTrue(np.all(np.abs(accs - priors) < tol))
        print("Finished in {0} sec.".format(time() - t0))

        # Test without supervised, and (intentionally) bad priors
        print("\nTesting without supervised, with bad priors (strong)")
        t0 = time()
        gen_model = GenerativeModel(lf_propensity=True)
        gen_model.train(L,
                        LF_acc_prior_weights=bad_prior_weights,
                        reg_type=2,
                        reg_param=100 * n,
                        candidate_ranges=candidate_ranges)
        stats = gen_model.learned_lf_stats()
        accs = stats["Accuracy"]
        coverage = stats["Coverage"]
        print(accs)
        self.assertTrue(np.all(np.abs(accs - np.array(bad_prior)) < tol))
        print("Finished in {0} sec.".format(time() - t0))