def __init__(self):
    """Set up the aligner used to align data records.

    ``self.pta`` is a ``PartialTreeAligner`` constructed around a fresh
    ``SimpleTreeAligner`` instance.
    """
    simple_aligner = SimpleTreeAligner()
    self.pta = PartialTreeAligner(simple_aligner)
def __init__(self, include_html=False):
    """Set up the partial and the simple tree aligners.

    ``self.pta`` is a ``PartialTreeAligner`` constructed around its own
    ``SimpleTreeAligner``; ``self.sta`` is a separate, stand-alone
    ``SimpleTreeAligner``.

    ``include_html`` is accepted for interface compatibility; this
    constructor does not read it.
    """
    wrapped_aligner = SimpleTreeAligner()
    self.pta = PartialTreeAligner(wrapped_aligner)
    self.sta = SimpleTreeAligner()
class MiningDataField(object):
    """
    Mining the data item from data records with tree alignment.
    """

    def __init__(self):
        """Create the partial tree aligner (wrapping a simple tree aligner)."""
        self.pta = PartialTreeAligner(SimpleTreeAligner())

    def align_records(self, records):
        """
        Partially align multiple records against a growing seed tree.

        The largest record (by ``Record.size``) is deep-copied and used as
        the seed; every other record is aligned against that copy with
        ``self.pta.align``. One item is extracted per input record, in the
        order of ``records``.

        Returns a tuple ``(items, seed_copy)``.

        for example (from paper Web Data Extraction Based on Partial Tree Alignment):

        >>> from lxml.html import fragment_fromstring
        >>> t1 = fragment_fromstring("<p> <x1></x1> <x2></x2> <x3></x3> <x></x> <b></b> <d></d> </p>")
        >>> t2 = fragment_fromstring("<p> <b></b> <n></n> <c></c> <k></k> <g></g> </p>")
        >>> t3 = fragment_fromstring("<p> <b></b> <c></c> <d></d> <h></h> <k></k> </p>")
        >>> mdf = MiningDataField()
        >>> _, seed = mdf.align_records([Record(t1), Record(t2), Record(t3)])
        >>> [e.tag for e in seed[0]]
        ['x1', 'x2', 'x3', 'x', 'b', 'n', 'c', 'd', 'h', 'k', 'g']
        >>> [e.tag for e in t1]
        ['x1', 'x2', 'x3', 'x', 'b', 'd']
        """
        # sort by the tree size
        sorted_records = sorted(records, key=Record.size)

        # seed is the largest tree
        seed = sorted_records.pop()

        # a dict like {'t2': {}, 't3': {}, etc}
        # the nested dictionary is like {'seed_element' : 'original_element'}
        mappings = defaultdict(dict)
        seed_copy = copy.deepcopy(seed)
        mappings[seed] = self._create_seed_mapping(seed_copy, seed)

        # records that matched only partially and are waiting for a retry
        deferred = []
        while sorted_records:
            record = sorted_records.pop()
            modified, partial_match, aligned = self.pta.align(seed_copy, record)

            # Always remember the latest alignment. Previously a partially
            # matched record that was never retried (because the seed did
            # not change again) ended up with no mapping at all and
            # produced an empty item.
            mappings[record] = aligned

            if modified:
                # the seed changed, so deferred records get another chance
                sorted_records.extend(deferred)
                deferred = []
            elif partial_match:
                # add it back to try it later since seed might change
                deferred.append(record)

        items = [self._extract_item(seed_copy, mappings[record])
                 for record in records]
        return items, seed_copy

    def _create_seed_mapping(self, copy_record, original_record):
        """
        Map every node of ``copy_record`` to its counterpart in
        ``original_record`` by walking both trees in lockstep.

        >>> from lxml.html import fragment_fromstring
        >>> t1 = fragment_fromstring("<p> <a></a> <b></b> </p>")
        >>> d1 = Record(t1)
        >>> d1_copy = copy.deepcopy(d1)
        >>> mdr = MiningDataField()
        >>> d = mdr._create_seed_mapping(d1_copy, d1)
        >>> len(d)
        3
        """
        d = {}
        for _copy, _original in zip(copy_record, original_record):
            d[_copy] = _original
            d.update(self._create_seed_mapping(_copy, _original))
        return d

    def _extract_item(self, seed, d):
        """
        extract data item from the tree.

        `seed`: the seed tree
        `d`: a seed element -> original element dictionary
        """
        # imported locally, as in the original code
        from depta import Item
        fields = self._extract_field(seed, d)
        return Item(fields)

    def _extract_field(self, iterable, d):
        """
        extract from the iterable recursively

        For each child that has an entry in ``d``, a ``Field`` is built from
        the mapped element's normalized text (skipped when that text is
        empty) and the element's HTML as returned by ``pq(e).html()``.
        """
        r = []
        from depta import Field
        for i in iterable:
            if i in d:
                e = d[i]
                text = self._normalize_text(e.text or '')
                if text:
                    r.append(Field(text, pq(e).html()))
            # NOTE(review): recursion is applied to every child, mapped or
            # not - confirm this matches the intended nesting.
            r.extend(self._extract_field(i, d))
        return r

    def _normalize_text(self, text):
        """Drop newline characters, then strip surrounding whitespace."""
        return text.replace('\n', '').strip()
class MiningDataField(object):
    """
    Mining the data item from data records with partial tree alignment.
    """

    def __init__(self, include_html=False):
        """Create the partial (``pta``) and simple (``sta``) tree aligners.

        ``include_html`` is accepted for interface compatibility; nothing
        in this class reads it.
        """
        self.pta = PartialTreeAligner(SimpleTreeAligner())
        self.sta = SimpleTreeAligner()

    def align_records(self, records):
        """partial align multiple records.

        The largest record (by ``Record.size``) is deep-copied and used as
        the seed; every other record is aligned against that copy with
        ``self.pta.align``. One item is extracted per input record, in the
        order of ``records``.

        Returns a tuple ``(items, seed_copy)``.

        for example (from paper Web Data Extraction Based on Partial Tree Alignment):

        >>> from lxml.html import fragment_fromstring
        >>> t1 = fragment_fromstring("<p> <x1></x1> <x2></x2> <x3></x3> <x></x> <b></b> <d></d> </p>")
        >>> t2 = fragment_fromstring("<p> <b></b> <n></n> <c></c> <k></k> <g></g> </p>")
        >>> t3 = fragment_fromstring("<p> <b></b> <c></c> <d></d> <h></h> <k></k> </p>")
        >>> mdf = MiningDataField()
        >>> _, seed = mdf.align_records([Record(t1), Record(t2), Record(t3)])
        >>> [e.tag for e in seed[0]]
        ['x1', 'x2', 'x3', 'x', 'b', 'n', 'c', 'd', 'h', 'k', 'g']
        >>> [e.tag for e in t1]
        ['x1', 'x2', 'x3', 'x', 'b', 'd']
        """
        # sort by the tree size
        sorted_records = sorted(records, key=Record.size)

        # seed is the largest tree
        seed = sorted_records.pop()

        # a dict like {'t2': {}, 't3': {}, etc}
        # the nested dictionary is like {'seed_element' : 'original_element'}
        mappings = defaultdict(dict)
        seed_copy = copy.deepcopy(seed)
        mappings[seed] = self._create_seed_mapping(seed_copy, seed)

        # records that matched only partially and are waiting for a retry
        deferred = []
        while sorted_records:
            record = sorted_records.pop()
            modified, partial_match, aligned = self.pta.align(seed_copy, record)
            # keep the most recent alignment, even for deferred records
            mappings[record] = aligned

            if modified:
                # the seed changed, so deferred records get another chance
                sorted_records.extend(deferred)
                deferred = []
            elif partial_match:
                # add it back to try it later since seed might change
                deferred.append(record)

        items = [self._extract_item(seed_copy, mappings[record])
                 for record in records]
        return items, seed_copy

    def align_record(self, seed, record):
        """simple align the given record with given seed

        Builds a seed-element -> record-element dict from the alignment
        returned by ``self.sta.align`` (its ``first``/``second`` pair plus
        one pair per entry of ``subs``) and extracts the item from it.
        """
        alignment = self.sta.align(seed, record)
        aligned = {alignment.first: alignment.second}
        for sub in alignment.subs:
            aligned[sub.first] = sub.second
        return self._extract_item(seed, aligned)

    def _create_seed_mapping(self, seed, record):
        """create a mapping from seed record to data record.

        Both trees are walked in lockstep; every seed node is mapped to the
        node at the same position in ``record``.

        for example:

        >>> from lxml.html import fragment_fromstring
        >>> t1 = fragment_fromstring("<p id='1'> <a></a> <b></b> </p>")
        >>> d1 = Record(t1)
        >>> p1 = t1
        >>> t2 = fragment_fromstring("<p id='2'> <a></a> <b></b> </p>")
        >>> d2 = Record(t2)
        >>> p2 = t2
        >>> mdr = MiningDataField()
        >>> d = mdr._create_seed_mapping(d1, d2)
        >>> d[p1] == p2
        True
        """
        d = {}
        for s, e in zip(seed, record):
            d[s] = e
            d.update(self._create_seed_mapping(s, e))
        return d

    def _extract_item(self, seed, mapping):
        """extract data item from the tree.

        seed: the seed tree
        mapping: a seed element to original element dict

        Returns a flat list with the fields of every element in ``seed``.
        """
        r = []
        for element in seed:
            r.extend(self._extract_element(element, mapping))
        return r

    def _extract_element(self, seed, mapping):
        """Extract the fields for one seed element and its descendants.

        Output order: one text field for the element itself (always
        emitted), then the fields of each child, then a tail field when the
        seed element has a non-blank tail.
        """
        r = []
        e = mapping.get(seed)
        matched = e is not None

        # tag/attributes of the mapped element, or empty placeholders
        pass_data = {
            "tag": e.tag if matched else None,
            "attr": e.attrib if matched else {},
        }

        # handle text: only a mapped element whose seed node carries
        # non-blank text contributes real text; all other cases yield u''
        if matched and seed.text and seed.text.strip():
            text = self._get_text(e.text)
        else:
            text = u''
        r.append(Field(text, pass_data))

        # handle children
        for child in seed:
            r.extend(self._extract_element(child, mapping))

        # handle tail: emitted only when the seed node has a non-blank tail
        if seed.tail and seed.tail.strip():
            tail = (self._get_text(e.tail) or u'') if matched else u''
            r.append(Field(tail, b''))
        return r

    def _get_text(self, text):
        """Return ``text`` as unicode; ``None`` becomes an empty string.

        Bytes are decoded as UTF-8, ignoring undecodable sequences.
        """
        if text is None:
            return u''
        if isinstance(text, bytes):
            return text.decode('utf8', 'ignore')
        return text