Exemplo n.º 1
0
class MiningDataRecord(object):
    """Extract data records from a data region.

    Basic assumption: the sub-trees of real data records resemble each other.
    Sub-trees are compared with ``SimpleTreeMatch``; a normalized score below
    ``threshold`` counts as "not similar".  When the compared nodes are similar
    enough the individual nodes are reported as records, otherwise every
    generalized node of the region is reported as one record.
    """

    def __init__(self, threshold):
        """Create the tree matcher and remember the similarity ``threshold``."""
        self.stm = SimpleTreeMatch()
        self.threshold = threshold

    def find_records(self, region):
        """Return the list of ``Record`` objects mined from ``region``.

        ``region`` must provide ``k`` (generalized node length), ``start``,
        ``covered`` (number of covered nodes), ``parent`` (indexable by child
        position) and ``iter(k)``.
        """
        if region.k == 1:
            # Every adjacent pair of children of each covered node has to be
            # similar; a single dissimilar pair makes the generalized nodes
            # themselves the records.
            for i in range(region.start, region.start + region.covered):
                for child1, child2 in pairwise(region.parent[i], 1, 0):
                    sim = self.stm.normalized_match_score(child1, child2)
                    if sim < self.threshold:
                        return self.slice_region(region)
            # each element of every generalized node is a data record
            return [Record(c) for gn in region.iter(1) for c in gn]

        children = [region.parent[region.start + i] for i in range(region.covered)]
        if not children:
            # Nothing is covered: there is no "typical" child to compare with
            # (most_common(1)[0] would raise IndexError) and no record to emit.
            return []

        # The reference node is the first child having the most frequent size.
        sized_children = [(tree_size(child), child) for child in children]
        size_counts = Counter(size for size, _ in sized_children)
        most_common_size = size_counts.most_common(1)[0][0]
        most_typical_child = next(
            child for size, child in sized_children if size == most_common_size)

        similarities = {
            child: self.stm.normalized_match_score([most_typical_child], [child])
            for child in children
        }
        if self.almost_similar(list(similarities.values()), self.threshold):
            # keep only the children that actually resemble the typical one
            return [Record(child) for child in children
                    if similarities[child] >= self.threshold]
        return self.slice_region(region)

    def slice_region(self, region):
        """Turn every generalized node of ``region`` into one ``Record``."""
        records = []
        for gn in region.iter(region.k):
            records.append(Record(*list(gn)))
        return records

    def almost_similar(self, similarities, threshold):
        """Return True when more than 80% of ``similarities`` reach ``threshold``.

        An empty sequence is never "almost similar".  The ratio is computed
        with float division so the result does not depend on the interpreter's
        integer-division rules.
        """
        if not similarities:
            return False
        hits = sum(1 for sim in similarities if sim >= threshold)
        return float(hits) / len(similarities) > 0.8
Exemplo n.º 2
0
class MiningDataRecord(object):
    """Extract data records from a data region.

    Basic assumption: the sub-trees of real data records resemble each other.
    If any adjacent pair is not similar (score below ``threshold``) each
    generalized node of the region is reported as one record; otherwise the
    individual elements of the generalized nodes are the records.
    """

    def __init__(self, threshold=0.3):
        """Create the tree matcher and remember the similarity ``threshold``."""
        self.stm = SimpleTreeMatch()
        self.threshold = threshold

    def find_records(self, region):
        """Return the list of ``Record`` objects mined from ``region``.

        For a region of single-node generalized nodes (``k == 1``) whose
        adjacent pairs are all similar, every element of every generalized
        node becomes its own record.  In all other cases the region is sliced
        with ``slice_region``.
        """
        # An empty region never entered the comparison loop before; keep that.
        if region.k == 1 and region.covered > 0:
            # The comparison does not depend on the covered index, so it is
            # evaluated once instead of once per covered node (which would
            # also have duplicated the collected records).
            # NOTE(review): this compares adjacent children of region.parent
            # from region.start onwards, as the original did -- confirm this
            # is the intended set of pairs.
            has_dissimilar_pair = any(
                self.stm.normalized_match_score(child1, child2) < self.threshold
                for child1, child2 in pairwise(region.parent, 1, region.start))
            if not has_dissimilar_pair:
                # Return the per-element records; previously they were built
                # and then discarded in favour of slice_region().
                return [Record(c) for gn in region.iter(1) for c in gn]

        return self.slice_region(region)

    def slice_region(self, region):
        """Turn every generalized node of ``region`` into one ``Record``."""
        records = []
        for gn in region.iter(region.k):
            records.append(Record(*list(gn)))
        return records
Exemplo n.º 3
0
 def __init__(self, root, max_generalized_nodes=3, threshold=0.3):
     """Set up the instance.

     Creates a ``SimpleTreeMatch`` matcher and stores the three arguments
     unchanged as attributes of the same name.
     """
     # tree matcher instance kept on the object
     self.stm = SimpleTreeMatch()

     # configuration, kept exactly as passed in
     self.threshold = threshold
     self.max_generalized_nodes = max_generalized_nodes
     self.root = root
Exemplo n.º 4
0
class MiningDataRegion(object):
    """Find data regions: runs of adjacent, mutually similar generalized nodes.

    A generalized node is a group of ``k`` adjacent children of one parent.
    A region is described by a ``Region`` with its ``parent``, the index of
    its first child (``start``), the generalized node length (``k``) and the
    number of children it spans (``covered``).
    """

    def __init__(self, root, max_generalized_nodes=3, threshold=0.3):
        """Store the settings and create the ``SimpleTreeMatch`` matcher."""
        self.root = root
        self.max_generalized_nodes = max_generalized_nodes
        self.threshold = threshold
        self.stm = SimpleTreeMatch()

    def find_regions(self, root):
        """Return all data regions found in the subtree rooted at ``root``.

        Regions are looked for among the children of ``root`` first; children
        that do not belong to any of those regions are searched recursively.
        Subtrees with a depth below 2 yield no regions.
        """
        data_regions = []
        if tree_depth(root) >= 2:
            scores = self.compare_generalized_nodes(root, self.max_generalized_nodes)
            data_regions.extend(self.identify_regions(0, root, self.max_generalized_nodes, self.threshold, scores))
            covered = set()
            for data_region in data_regions:
                # `covered` is a node count, not an end index: the region
                # spans the children [start, start + covered).
                for i in xrange(data_region.start, data_region.start + data_region.covered):
                    covered.add(data_region.parent[i])

            for child in root:
                if child not in covered:
                    data_regions.extend(self.find_regions(child))
        return data_regions


    def identify_regions(self, start, root, max_generalized_nodes, threshold, scores):
        """Return the regions among the children of ``root`` from index ``start`` on.

        Arguments:
        `start`: index of the first child to consider.
        `root`: the parent whose children are examined.
        `max_generalized_nodes`: the maximum length of a generalized node.
        `threshold`: minimum score for two generalized nodes to count as similar.
        `scores`: mapping from a pair of ``GeneralizedNode`` to its score, as
        built by ``compare_generalized_nodes``.

        The best region found is kept and the search continues recursively
        behind it.
        """
        cur_region = Region(parent=root, start=0, k=0, covered=0)
        max_region = Region(parent=root, start=0, k=0, covered=0)
        data_regions = []

        for k in xrange(1, max_generalized_nodes + 1):
            for i in xrange(start, k + start):
                # flag is True until the first similar pair of this run is seen
                flag = True
                for j in xrange(i, len(root) - k, k):
                    pair = GeneralizedNode(root[j], k), GeneralizedNode(root[j + k], k)
                    score = scores.get(pair)
                    # A pair without a recorded score is treated as "not
                    # similar"; the explicit None test avoids comparing None
                    # with a number.
                    if score is not None and score >= threshold:
                        if flag:
                            cur_region.k = k
                            cur_region.start = j
                            cur_region.covered = 2 * k
                            flag = False
                        else:
                            cur_region.covered += k
                    elif not flag:  # doesn't match but previous match
                        break
                # keep the candidate covering at least as many nodes, unless
                # it starts after an already chosen region
                if max_region.covered <= cur_region.covered and (
                        max_region.start == 0 or cur_region.start <= max_region.start):
                    max_region.k = cur_region.k
                    max_region.start = cur_region.start
                    max_region.covered = cur_region.covered

        if max_region.covered:
            data_regions.append(max_region)
            # children remain behind the region: look for further regions there
            if max_region.start + max_region.covered < len(max_region.parent):
                data_regions.extend(self.identify_regions(max_region.start + max_region.covered, root,
                                                          max_generalized_nodes, threshold, scores))

        return data_regions


    def compare_generalized_nodes(self, parent, k):
        """
         compare the adjacent children generalized nodes similarity of a given element

         Arguments:
         `parent`: the lxml element to compare children of.
         `k`: the maximum length of generalized node.

         Returns a dict mapping each pair of ``GeneralizedNode`` to its
         normalized match score; the first score seen for a pair is kept.
        """
        scores = {}
        for a, b in pairwise(parent, k):
            score = self.stm.normalized_match_score(a, b)
            gn1 = GeneralizedNode(a[0], len(a))
            gn2 = GeneralizedNode(b[0], len(b))
            scores.setdefault((gn1, gn2), score)
        return scores
Exemplo n.º 5
0
 def __init__(self, threshold=0.3):
     """Set up the instance.

     Stores ``threshold`` unchanged on the instance and creates the
     ``SimpleTreeMatch`` matcher kept as ``self.stm``.
     """
     # threshold value, kept exactly as passed in
     self.threshold = threshold

     # tree matcher instance kept on the object
     self.stm = SimpleTreeMatch()