class MiningDataRecord(object):
    """
    Mine the data records out of a data region.

    Basic assumption: the subtrees of data records are similar to each other.
    If some adjacent pair of sub-nodes scores below the threshold, every
    generalized node of the region is treated as one data record; otherwise
    the children are treated as individual data records.
    """

    def __init__(self, threshold):
        """
        Arguments:
        `threshold`: minimum normalized match score for two subtrees to be
                     considered similar.
        """
        self.stm = SimpleTreeMatch()
        self.threshold = threshold

    def find_records(self, region):
        """
        Return the list of `Record` objects found in `region`.

        Arguments:
        `region`: the data region to mine; its `k`, `start`, `covered`,
                  `parent` attributes and `iter()` method are used.
        """
        if region.k == 1:
            return self._records_of_single_node_region(region)
        return self._records_of_multi_node_region(region)

    def _records_of_single_node_region(self, region):
        """Mine a region whose generalized nodes have length 1."""
        for i in range(region.start, region.start + region.covered):
            # NOTE(review): pairwise(node, 1, 0) presumably yields adjacent
            # pairs of the node's children -- confirm against the helper.
            for child1, child2 in pairwise(region.parent[i], 1, 0):
                sim = self.stm.normalized_match_score(child1, child2)
                if sim < self.threshold:
                    # One dissimilar pair is enough to fall back to slicing
                    # the region per generalized node.
                    return self.slice_region(region)

        # Every compared pair was similar: each element of each generalized
        # node becomes its own record.
        records = []
        for gn in region.iter(1):
            records.extend(Record(c) for c in gn)
        return records

    def _records_of_multi_node_region(self, region):
        """Mine a region whose generalized nodes span more than one child."""
        children = [region.parent[region.start + i] for i in range(region.covered)]
        if not children:
            # Nothing to compare; take the same path as the "not similar"
            # case instead of failing on an empty Counter below.
            return self.slice_region(region)

        # Pick the first child whose size is the most frequent one as the
        # reference every other child is matched against.
        child_sizes = [tree_size(child) for child in children]
        most_common_size, _ = Counter(child_sizes).most_common(1)[0]
        most_typical_child = children[child_sizes.index(most_common_size)]

        similarities = {
            child: self.stm.normalized_match_score([most_typical_child], [child])
            for child in children
        }
        if self.almost_similar(list(similarities.values()), self.threshold):
            return [Record(child) for child in children
                    if similarities[child] >= self.threshold]
        return self.slice_region(region)

    def slice_region(self, region):
        """
        Slice every generalized node of the region into one data record.
        """
        records = []
        for gn in region.iter(region.k):
            records.append(Record(*list(gn)))
        return records

    def almost_similar(self, similarities, threshold):
        """
        Return True when more than 80% of `similarities` are >= `threshold`.

        An empty sequence is never "almost similar" (returns False).
        """
        if not similarities:
            return False
        matching = sum(1 for sim in similarities if sim >= threshold)
        # float() keeps this a true ratio even under Python 2 integer division.
        return float(matching) / len(similarities) > 0.8
class MiningDataRecord(object):
    """
    Mine the data records out of a data region.

    Basic assumption: the subtrees of data records are similar to each other.
    If some adjacent pair scores below the threshold, every generalized node
    of the region is treated as one data record; otherwise the children are
    treated as individual data records.
    """

    def __init__(self, threshold=0.3):
        """
        Arguments:
        `threshold`: minimum normalized match score for two subtrees to be
                     considered similar (default 0.3).
        """
        self.stm = SimpleTreeMatch()
        self.threshold = threshold

    def find_records(self, region):
        """
        Return the list of `Record` objects found in `region`.

        Arguments:
        `region`: the data region to mine; its `k`, `start`, `covered`,
                  `parent` attributes and `iter()` method are used.
        """
        if region.k != 1:
            return self.slice_region(region)

        # The pair scan does not depend on which covered child is being
        # visited, so it runs once, and only when the region covers at least
        # one child.
        if region.covered > 0:
            # NOTE(review): pairwise(parent, 1, start) presumably yields
            # adjacent children of the parent beginning at `start` -- confirm.
            for child1, child2 in pairwise(region.parent, 1, region.start):
                similarity = self.stm.normalized_match_score(child1, child2)
                if similarity < self.threshold:
                    return self.slice_region(region)

        # Every compared pair was similar: each element of each generalized
        # node is a data record. Return the records that were built instead
        # of discarding them.
        records = []
        for gn in region.iter(1):
            records.extend(Record(c) for c in gn)
        return records

    def slice_region(self, region):
        """
        Slice every generalized node of the region into one data record.
        """
        records = []
        for gn in region.iter(region.k):
            records.append(Record(*list(gn)))
        return records
def __init__(self, root, max_generalized_nodes=3, threshold=0.3):
    """
    Keep the mining configuration on the instance and create the matcher.

    Arguments:
    `root`: kept as `self.root`.
    `max_generalized_nodes`: kept as `self.max_generalized_nodes` (default 3).
    `threshold`: kept as `self.threshold` (default 0.3).
    """
    self.root = root
    self.max_generalized_nodes = max_generalized_nodes
    self.threshold = threshold
    # A fresh tree matcher is created for this instance.
    self.stm = SimpleTreeMatch()
class MiningDataRegion(object):
    """
    Find data regions: runs of adjacent, similar generalized nodes that share
    the same parent element.
    """

    def __init__(self, root, max_generalized_nodes=3, threshold=0.3):
        """
        Arguments:
        `root`: kept as `self.root`.
        `max_generalized_nodes`: the maximum length of a generalized node.
        `threshold`: minimum score for two generalized nodes to match.
        """
        self.root = root
        self.max_generalized_nodes = max_generalized_nodes
        self.threshold = threshold
        self.stm = SimpleTreeMatch()

    def find_regions(self, root):
        """
        Recursively collect the data regions below `root`.

        Regions among the direct children of `root` are identified first;
        children that are not covered by one of those regions are then
        searched recursively. Elements with `tree_depth` below 2 are skipped.

        Returns a list of `Region` objects.
        """
        data_regions = []
        if tree_depth(root) < 2:
            return data_regions

        scores = self.compare_generalized_nodes(root, self.max_generalized_nodes)
        data_regions.extend(
            self.identify_regions(0, root, self.max_generalized_nodes,
                                  self.threshold, scores))

        # `covered` is a count, not an end index: the region spans
        # [start, start + covered). Clamp to the parent's length so a region
        # can never index past the last child.
        covered = set()
        for data_region in data_regions:
            end = min(data_region.start + data_region.covered,
                      len(data_region.parent))
            for i in range(data_region.start, end):
                covered.add(data_region.parent[i])

        for child in root:
            if child not in covered:
                data_regions.extend(self.find_regions(child))
        return data_regions

    def identify_regions(self, start, root, max_generalized_nodes, threshold, scores):
        """
        Identify the data regions among the children of `root`, scanning from
        child index `start`.

        Arguments:
        `start`: index of the first child to consider.
        `root`: the parent element whose children are scanned.
        `max_generalized_nodes`: the maximum length of a generalized node.
        `threshold`: minimum score for two generalized nodes to match.
        `scores`: mapping of (GeneralizedNode, GeneralizedNode) pairs to
                  scores, as built by `compare_generalized_nodes`.

        Returns a list of `Region` objects; the search continues recursively
        after the end of the region that was found.
        """
        cur_region = Region(parent=root, start=0, k=0, covered=0)
        max_region = Region(parent=root, start=0, k=0, covered=0)
        data_regions = []

        for k in range(1, max_generalized_nodes + 1):
            for i in range(start, k + start):
                flag = True  # True until the first matching pair of this run
                for j in range(i, len(root) - k, k):
                    pair = GeneralizedNode(root[j], k), GeneralizedNode(root[j + k], k)
                    score = scores.get(pair)
                    # A pair without a recorded score counts as a non-match
                    # (None cannot be ordered against a number on Python 3).
                    if score is not None and score >= threshold:
                        if flag:
                            cur_region.k = k
                            cur_region.start = j
                            cur_region.covered = 2 * k
                            flag = False
                        else:
                            cur_region.covered += k
                    elif not flag:
                        # doesn't match but a previous pair did: the run ends
                        break

                if max_region.covered <= cur_region.covered and (
                        max_region.start == 0 or cur_region.start <= max_region.start):
                    max_region.k = cur_region.k
                    max_region.start = cur_region.start
                    max_region.covered = cur_region.covered

        if max_region.covered:
            data_regions.append(max_region)
            next_start = max_region.start + max_region.covered
            if next_start < len(max_region.parent):
                data_regions.extend(
                    self.identify_regions(next_start, root, max_generalized_nodes,
                                          threshold, scores))

        return data_regions

    def compare_generalized_nodes(self, parent, k):
        """
        Compare the similarity of adjacent generalized nodes among the
        children of a given element.

        Arguments:
        `parent`: the lxml element to compare children of.
        `k`: the maximum length of generalized node.

        Returns a dict mapping (GeneralizedNode, GeneralizedNode) pairs to
        their normalized match score.
        """
        scores = {}
        for a, b in pairwise(parent, k):
            score = self.stm.normalized_match_score(a, b)
            gn1 = GeneralizedNode(a[0], len(a))
            gn2 = GeneralizedNode(b[0], len(b))
            # setdefault keeps the first score seen for a given pair.
            scores.setdefault((gn1, gn2), score)
        return scores
def __init__(self, threshold=0.3):
    """
    Create the tree matcher and remember the similarity threshold.

    Arguments:
    `threshold`: kept as `self.threshold` (default 0.3).
    """
    # A fresh tree matcher is created for this instance.
    self.stm = SimpleTreeMatch()
    self.threshold = threshold