示例#1
0
    def cluster_tuples(self, matched_tuples):
        """Cluster tuples into ``self.patterns`` in a single pass.

        Every tuple is scored against each existing pattern with
        ``self.similarity_all``. Among the patterns that accept the tuple, the
        one with the highest score is selected. If that score reaches
        ``self.config.threshold_similarity`` the tuple is added to the
        pattern; otherwise the tuple becomes the seed of a new pattern.

        :param matched_tuples: indexable sequence of tuples to cluster. An
            empty sequence leaves ``self.patterns`` untouched.
        """
        # Initialize: with no patterns yet, the first tuple seeds the first
        # cluster. An empty input has nothing to seed with, so seeding is
        # skipped instead of failing on matched_tuples[0].
        if not self.patterns and len(matched_tuples) > 0:
            self.patterns.append(Pattern(matched_tuples[0]))
        # NOTE(review): the seeding tuple is visited again by the loop below,
        # so it may be added to its own cluster a second time - confirm
        # whether that is intended.

        for count, t in enumerate(matched_tuples, start=1):
            # Emit a progress dot every 1000 tuples.
            if count % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()

            # Go through all patterns (clusters of tuples) and find the
            # accepted one with the highest similarity score.
            max_similarity = 0
            max_similarity_cluster_index = 0
            for i, extraction_pattern in enumerate(self.patterns):
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            if max_similarity < self.config.threshold_similarity:
                # Best score is below the threshold: create a new cluster
                # having this tuple as the centroid.
                self.patterns.append(Pattern(t))
            else:
                # Best score reaches the threshold: add the tuple to the
                # cluster with the highest similarity.
                self.patterns[max_similarity_cluster_index].add_tuple(t)
示例#2
0
    def test_update_selectivity(self):
        """A single selectivity update must record exactly one p-value.

        The recorded value is compared with the baseline's shortest path
        between the two entities the tuple was built from.
        """
        # Three separate placeholder context lists: before / between / after.
        before, between, after = ['dummy'], ['dummy'], ['dummy']

        # positive
        pattern = Pattern()
        candidate = Tuple(
            self.e1, self.e2, None, before, between, after, self.config,
        )
        pattern.update_selectivity(candidate, self.config, self.baseline)

        self.assertEqual(len(pattern.p_values), 1)
        self.assertEqual(
            pattern.p_values[0],
            self.baseline.shortest_path(self.e1, self.e2),
        )
示例#3
0
    def cluster_tuples_parallel(self, patterns, matched_tuples, child_conn,
                                discard_threshold=5):
        """Cluster tuples single-pass and send the surviving patterns back.

        Works on a shallow copy of ``patterns``. Each tuple is scored against
        every pattern with ``self.similarity_all``; it joins the accepted
        pattern with the highest score when that score reaches
        ``self.config.threshold_similarity``, otherwise it seeds a new
        pattern. Patterns holding ``discard_threshold`` or fewer tuples are
        then dropped and ``(pid, remaining_patterns)`` is sent through
        ``child_conn``.

        :param patterns: existing patterns to start from. The list itself is
            not modified, but the pattern objects in it are shared with the
            working copy and may receive new tuples.
        :param matched_tuples: iterable of tuples to cluster.
        :param child_conn: object with a ``send`` method that receives the
            ``(pid, patterns)`` result.
        :param discard_threshold: patterns must hold strictly more than this
            many tuples to be kept (default 5, the previously hard-coded
            value).
        """
        updated_patterns = list(patterns)
        for count, t in enumerate(matched_tuples, start=1):
            # Progress report every 500 tuples.
            if count % 500 == 0:
                print(multiprocessing.current_process(), count,
                      "tuples processed")

            # Go through all patterns (clusters of tuples) and find the
            # accepted one with the highest similarity score.
            max_similarity = 0
            max_similarity_cluster_index = 0
            for i, extraction_pattern in enumerate(updated_patterns):
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # With no pattern to join (empty list) or a best score below the
            # threshold, the tuple starts a new cluster.
            if (not updated_patterns
                    or max_similarity < self.config.threshold_similarity):
                updated_patterns.append(Pattern(t))

            # Otherwise add it to the cluster with the highest similarity.
            else:
                updated_patterns[max_similarity_cluster_index].add_tuple(t)

        # Eliminate clusters holding discard_threshold or fewer tuples.
        new_patterns = [p for p in updated_patterns
                        if len(p.tuples) > discard_threshold]
        pid = multiprocessing.current_process().pid
        print(multiprocessing.current_process(), "Patterns: ",
              len(new_patterns))
        child_conn.send((pid, new_patterns))
示例#4
0
    def test_update_selectivity(self):
        """Check the positive/negative/unknown counters after one update.

        Each case builds a fresh pattern, updates it with a single tuple made
        from the given entity pair, and checks the three counters in the
        listed order.
        """
        bef_words = ['dummy']
        bet_words = ['dummy']
        aft_words = ['dummy']

        # (first entity, second entity, counters to check in order).
        # The entity strings are fixture values and are kept verbatim,
        # including the trailing spaces in the first case.
        cases = (
            # positive
            ('seed_1 ', 'seed_2 ',
             (('positive', 1), ('negative', 0), ('unknown', 0))),
            # negative
            ('seed_1', 'seed_5',
             (('negative', 1), ('positive', 0), ('unknown', 0))),
            # negative
            ('seed_1', 'seed_3',
             (('unknown', 0), ('positive', 0), ('negative', 1))),
            # unknown
            ('seed_4', 'seed_5',
             (('negative', 0), ('positive', 0), ('unknown', 1))),
        )

        for first, second, expected_counts in cases:
            pattern = Pattern()
            t = Tuple(first, second, None, bef_words, bet_words, aft_words,
                      self.config)
            pattern.update_selectivity(t, self.config)
            for counter_name, expected in expected_counts:
                self.assertEqual(getattr(pattern, counter_name), expected)
示例#5
0
    def test_update_selectivity(self):
        """Verify which selectivity counter a single tuple increments.

        Four entity pairs are tried, each against a fresh pattern; exactly one
        of ``positive``, ``negative`` or ``unknown`` is expected to be 1 and
        the other two 0.
        """
        before = ['dummy']
        between = ['dummy']
        after = ['dummy']

        def updated_pattern(first, second):
            # Build a new pattern and feed it one tuple for the entity pair.
            fresh = Pattern()
            candidate = Tuple(first, second, None, before, between, after, self.config)
            fresh.update_selectivity(candidate, self.config)
            return fresh

        # positive (entity strings kept verbatim, trailing spaces included)
        outcome = updated_pattern('seed_1 ', 'seed_2 ')
        self.assertEqual(outcome.positive, 1)
        self.assertEqual(outcome.negative, 0)
        self.assertEqual(outcome.unknown, 0)

        # negative
        outcome = updated_pattern('seed_1', 'seed_5')
        self.assertEqual(outcome.negative, 1)
        self.assertEqual(outcome.positive, 0)
        self.assertEqual(outcome.unknown, 0)

        # negative
        outcome = updated_pattern('seed_1', 'seed_3')
        self.assertEqual(outcome.unknown, 0)
        self.assertEqual(outcome.positive, 0)
        self.assertEqual(outcome.negative, 1)

        # unknown
        outcome = updated_pattern('seed_4', 'seed_5')
        self.assertEqual(outcome.negative, 0)
        self.assertEqual(outcome.positive, 0)
        self.assertEqual(outcome.unknown, 1)
示例#6
0
    def test_update_confidence(self):
        """Check that pattern confidence falls on the expected side of 0.5.

        A tuple built as (e1, e2) must yield a confidence above 0.5, while
        the swapped pair (e2, e1) must yield a confidence below 0.5.
        """
        # Three separate placeholder context lists: before / between / after.
        before, between, after = ['dummy'], ['dummy'], ['dummy']

        # positive
        forward = Pattern()
        forward_tuple = Tuple(
            self.e1, self.e2, None, before, between, after, self.config,
        )
        forward.update_selectivity(forward_tuple, self.config, self.baseline)
        forward.update_confidence(self.config)
        print(forward.p_values[0])
        self.assertGreater(forward.confidence, .5)

        # negative
        swapped = Pattern()
        swapped_tuple = Tuple(
            self.e2, self.e1, None, before, between, after, self.config,
        )
        swapped.update_selectivity(swapped_tuple, self.config, self.baseline)
        swapped.update_confidence(self.config)
        self.assertLess(swapped.confidence, .5)