Exemplo n.º 1
0
 def __init__(self):
     """Set up the partial tree aligner, backed by a simple tree aligner."""
     simple_aligner = SimpleTreeAligner()
     self.pta = PartialTreeAligner(simple_aligner)
Exemplo n.º 2
0
 def __init__(self, include_html=False):
     """Set up the partial and the simple tree aligners.

     `include_html` is accepted but not read by this constructor.
     """
     # The partial aligner gets its own simple aligner; `self.sta` is a
     # second, independent instance.
     inner_aligner = SimpleTreeAligner()
     self.pta = PartialTreeAligner(inner_aligner)
     self.sta = SimpleTreeAligner()
Exemplo n.º 3
0
class MiningDataField(object):
    """
    Mining the data item from data records with tree alignment.

    The largest record is taken as the seed; every other record is aligned
    against a deep copy of that seed, and one item per input record is then
    read off the (possibly grown) seed copy.
    """
    def __init__(self):
        self.pta = PartialTreeAligner(SimpleTreeAligner())

    def align_records(self, records):
        """
        partial align multiple records.

        `records`: the data records to align; must contain at least one.

        Returns a tuple `(items, seed_copy)`: `items` holds one extracted
        item per input record, in the order of `records`, and `seed_copy` is
        the deep copy of the largest record that was passed to the aligner.

        for example (from paper Web Data Extraction Based on Partial Tree Alignment):
        >>> from lxml.html import fragment_fromstring
        >>> t1 = fragment_fromstring("<p> <x1></x1> <x2></x2> <x3></x3> <x></x> <b></b> <d></d> </p>")
        >>> t2 = fragment_fromstring("<p> <b></b> <n></n> <c></c> <k></k> <g></g> </p>")
        >>> t3 = fragment_fromstring("<p> <b></b> <c></c> <d></d> <h></h> <k></k> </p>")
        >>> mdf = MiningDataField()
        >>> _, seed = mdf.align_records([Record(t1), Record(t2), Record(t3)])
        >>> [e.tag for e in seed[0]]
        ['x1', 'x2', 'x3', 'x', 'b', 'n', 'c', 'd', 'h', 'k', 'g']
        >>> [e.tag for e in t1]
        ['x1', 'x2', 'x3', 'x', 'b', 'd']
        """
        # sort by the tree size
        sorted_records = sorted(records, key=Record.size)

        # seed is the largest tree
        seed = sorted_records.pop()

        # a dict like {'t2': {}, 't3': {}, etc}
        # the nested dictionary is like {'seed_element' : 'original_element'}
        mappings = defaultdict(dict)
        # Align against a copy so the caller's seed record is left untouched.
        seed_copy = copy.deepcopy(seed)
        mappings[seed] = self._create_seed_mapping(seed_copy, seed)

        # Records that only partially matched and are waiting for a retry.
        pending = []
        while sorted_records:
            record = sorted_records.pop()
            modified, partial_match, aligned = self.pta.align(seed_copy, record)

            # Always keep the most recent alignment. A record that is still
            # parked in `pending` when the loop ends would otherwise fall
            # back to an empty mapping and lose every field.
            mappings[record] = aligned

            if modified:
                # The seed changed, so the parked records get another try.
                sorted_records.extend(pending)
                pending = []
            elif partial_match:
                # add it back to try it later since seed might change
                pending.append(record)

        items = [self._extract_item(seed_copy, mappings[record])
                 for record in records]
        return items, seed_copy

    def _create_seed_mapping(self, copy_record, original_record):
        """
        Map every node of `copy_record` to the node at the same position in
        `original_record`, recursing into children.

        >>> from lxml.html import fragment_fromstring
        >>> t1 = fragment_fromstring("<p> <a></a> <b></b> </p>")
        >>> d1 = Record(t1)
        >>> d1_copy = copy.deepcopy(d1)
        >>> mdr = MiningDataField()
        >>> d = mdr._create_seed_mapping(d1_copy, d1)
        >>> len(d)
        3
        """
        mapping = {}
        # zip stops at the shorter of the two, so unpaired nodes are skipped.
        for copied, original in zip(copy_record, original_record):
            mapping[copied] = original
            mapping.update(self._create_seed_mapping(copied, original))
        return mapping

    def _extract_item(self, seed, d):
        """
        extract data item from the tree.
        `seed`: the seed tree
        `d`: a seed element -> original element dictionary

        Returns an `Item` built from the fields found under `seed`.
        """
        from depta import Item
        return Item(self._extract_field(seed, d))

    def _extract_field(self, iterable, d):
        """
        extract from the iterable recursively

        Walks `iterable` depth-first. Each node that has an entry in `d`
        yields a `Field` when the mapped element's normalized text is
        non-empty. Returns the list of fields in traversal order.
        """
        from depta import Field
        fields = []
        for node in iterable:
            if node in d:
                original = d[node]
                text = self._normalize_text(original.text or '')
                if text:
                    # `pq` is presumably pyquery's PyQuery -- confirm
                    # against the file's imports.
                    fields.append(Field(text, pq(original).html()))
            # Children are visited whether or not the node itself is mapped.
            fields.extend(self._extract_field(node, d))
        return fields

    def _normalize_text(self, text):
        """Drop all newlines, then strip leading/trailing whitespace."""
        return text.replace('\n', '').strip()
Exemplo n.º 4
0
class MiningDataField(object):
    """
    Mining the data item from data records with partial tree alignment.
    """
    def __init__(self, include_html=False):
        # NOTE(review): `include_html` is accepted but never read in this class.
        inner_aligner = SimpleTreeAligner()
        self.pta = PartialTreeAligner(inner_aligner)
        self.sta = SimpleTreeAligner()

    def align_records(self, records):
        """partial align multiple records.

        Returns `(items, seed_copy)`: one extracted item per input record,
        in input order, plus the deep copy of the largest record that was
        handed to the partial aligner.

        for example (from paper Web Data Extraction Based on Partial Tree Alignment):
        >>> from lxml.html import fragment_fromstring
        >>> t1 = fragment_fromstring("<p> <x1></x1> <x2></x2> <x3></x3> <x></x> <b></b> <d></d> </p>")
        >>> t2 = fragment_fromstring("<p> <b></b> <n></n> <c></c> <k></k> <g></g> </p>")
        >>> t3 = fragment_fromstring("<p> <b></b> <c></c> <d></d> <h></h> <k></k> </p>")
        >>> mdf = MiningDataField()
        >>> _, seed = mdf.align_records([Record(t1), Record(t2), Record(t3)])
        >>> [e.tag for e in seed[0]]
        ['x1', 'x2', 'x3', 'x', 'b', 'n', 'c', 'd', 'h', 'k', 'g']
        >>> [e.tag for e in t1]
        ['x1', 'x2', 'x3', 'x', 'b', 'd']
        """
        # Smallest first, so popping from the end yields the largest tree.
        queue = sorted(records, key=Record.size)
        seed = queue.pop()

        # record -> {seed element: original element}
        mappings = defaultdict(dict)
        seed_copy = copy.deepcopy(seed)
        mappings[seed] = self._create_seed_mapping(seed_copy, seed)

        # Partially matched records waiting for the seed to change.
        retry_later = []
        while queue:
            candidate = queue.pop()
            modified, partial_match, aligned = self.pta.align(seed_copy, candidate)
            mappings[candidate] = aligned

            if modified:
                # The seed changed: give the parked records another pass.
                queue.extend(retry_later)
                retry_later = []
            elif partial_match:
                retry_later.append(candidate)

        items = [self._extract_item(seed_copy, mappings[record])
                 for record in records]
        return items, seed_copy

    def align_record(self, seed, record):
        """simple align the given record with given seed
        """
        alignment = self.sta.align(seed, record)
        # Root pair first, then every sub-alignment pair.
        aligned = {alignment.first: alignment.second}
        aligned.update((sub.first, sub.second) for sub in alignment.subs)
        return self._extract_item(seed, aligned)

    def _create_seed_mapping(self, seed, record):
        """create a mapping from seed record to data record.

        Nodes are paired by position, recursively; pairing at each level
        stops at the shorter of the two sequences.

        for example:
        >>> from lxml.html import fragment_fromstring
        >>> t1 = fragment_fromstring("<p id='1'> <a></a> <b></b> </p>")
        >>> d1 = Record(t1)
        >>> p1 = t1

        >>> t2 = fragment_fromstring("<p id='2'> <a></a> <b></b> </p>")
        >>> d2 = Record(t2)
        >>> p2 = t2

        >>> mdr = MiningDataField()
        >>> d = mdr._create_seed_mapping(d1, d2)
        >>> d[p1] == p2
        True

        """
        pairs = {}
        for seed_node, record_node in zip(seed, record):
            pairs[seed_node] = record_node
            nested = self._create_seed_mapping(seed_node, record_node)
            pairs.update(nested)
        return pairs

    def _extract_item(self, seed, mapping):
        """extract data item from the tree.

        seed: the seed tree
        mapping: a seed element to original element dict

        Returns the flat list of fields from every top-level seed element.
        """
        return [field
                for element in seed
                for field in self._extract_element(element, mapping)]

    def _extract_element(self, seed, mapping):
        """Collect the fields for one seed element and its subtree.

        Emits one text field for the element itself, then the fields of
        its children, then a tail field when the seed element has
        non-blank tail text. Values come from the mapped original
        element, or are empty when no original is mapped.
        """
        original = mapping.get(seed, None)

        if original is None:
            attributes = {}
            tag = None
        else:
            attributes = original.attrib
            tag = original.tag
        pass_data = {"tag": tag, "attr": attributes}

        # handle text: a field is always emitted; it carries the original's
        # text only when the seed element itself has non-blank text.
        seed_has_text = bool(seed.text and seed.text.strip())
        if original is not None and seed_has_text:
            text_value = self._get_text(original.text)
        else:
            text_value = u''
        fields = [Field(text_value, pass_data)]

        # handle children
        for child in seed:
            fields.extend(self._extract_element(child, mapping))

        # handle tail: only emitted when the seed has non-blank tail text.
        if seed.tail and seed.tail.strip():
            if original is not None:
                tail_value = self._get_text(original.tail) or u''
            else:
                tail_value = u''
            fields.append(Field(tail_value, b''))

        return fields

    def _get_text(self, text):
        """Return `text` as-is, decoded from UTF-8 if bytes, or u'' for None."""
        if text is None:
            return u''
        if isinstance(text, bytes):
            # Undecodable bytes are dropped rather than raising.
            return text.decode('utf8', 'ignore')
        return text