Пример #1
0
    def test_full_align(self):
        """Align three records without an explicit seed and check the result.

        Three mappings are expected. For each of the tags ``root``, ``a``,
        ``b`` and ``c``, the element looked up in the returned seed must map
        to an element carrying the same tag in every mapping.
        """
        aligner = RecordAligner()
        seed, mappings = aligner.align([Record(t1), Record(t2), Record(t3)])

        self.assertEqual(3, len(mappings))

        # Every seed element should be matched in all three mappings.
        for tag in ['root', 'a', 'b', 'c']:
            seed_element = seed[0].xpath('//%s' % tag)[0]
            matched_tags = [
                mapping[seed_element].tag
                for _record, mapping in mappings.items()
            ]
            self.assertEqual([tag] * 3, matched_tags)
Пример #2
0
    def test_extract_with_seed2(self):
        """Extract records from ``fragment1`` with a seed from ``htmlpage1``.

        The seed record is built from the second and third items of the first
        candidate returned by ``MDR.list_candidates``. The test checks the
        length and class of the returned seed copy, the number of mappings,
        and the first and last extracted review dates and descriptions.
        """
        mdr = MDR()
        page1 = get_page('htmlpage1')
        candidates, _doc = mdr.list_candidates(page1, 'utf8')
        seed_record = Record(candidates[0][1], candidates[0][2])

        fragment1 = fragment_fromstring(get_page('fragment1'))
        seed_record_copy, mappings = mdr.extract(fragment1, seed_record)

        # assertEqual is used instead of the deprecated assertEquals alias,
        # which no longer exists on Python 3.12+.
        self.assertEqual(2, len(seed_record_copy))
        self.assertEqual('hreview', seed_record_copy[1].attrib.get('class'))
        # 27 items (records)
        self.assertEqual(27, len(mappings))

        extracted_dates = []
        extracted_texts = []

        # Collect the text of every element mapped from a seed element whose
        # class is 'dtreviewed' or 'description'.
        for _record, mapping in mappings.items():
            for seed_element, matched in mapping.items():
                css_class = seed_element.attrib.get('class')
                if css_class == 'dtreviewed':
                    extracted_dates.append(matched.text)
                elif css_class == 'description':
                    extracted_texts.append(matched.text)

        # Extracted items are expected to keep their original order.
        self.assertEqual(extracted_dates[0], '27-05-2014')
        self.assertEqual(extracted_dates[-1], '07-07-2013')
        self.assertEqual(extracted_texts[0], 'Kwaliteit van het eten matig')
        self.assertEqual(
            extracted_texts[-1],
            'Paviljoen Strand 90 te Domburg is een uiterst sfeervol restaurant. De inrichting is smaakvol met mooie kleuren. De bediening is vriendelijk en behulpzaam. Het eten was lekker. Kortom, we zullen er zeker terug komen.'
        )
Пример #3
0
    def test_extract_with_seed(self):
        """Extract records from ``fragment0`` with a seed from ``htmlpage0``.

        The seed record is built from the first item of the first candidate
        returned by ``MDR.list_candidates``. The test checks the length of the
        returned seed copy, the number of mappings, and the first and last
        extracted ``datePublished`` values.
        """
        mdr = MDR()

        page = get_page('htmlpage0')
        candidates, _doc = mdr.list_candidates(page, 'utf8')
        # We know the first element can be used as the seed.
        seed_record = Record(candidates[0][0])

        fragment = fragment_fromstring(get_page('fragment0'))
        seed_record_copy, mappings = mdr.extract(fragment, seed_record)

        # assertEqual is used instead of the deprecated assertEquals alias,
        # which no longer exists on Python 3.12+.
        # The record only has one <li> element.
        self.assertEqual(1, len(seed_record_copy))
        # 40 items (records)
        self.assertEqual(40, len(mappings))

        extracted_dates = []

        # Collect the 'content' attribute of every element mapped from a seed
        # element marked with itemprop="datePublished".
        for _record, mapping in mappings.items():
            for seed_element, matched in mapping.items():
                if seed_element.attrib.get('itemprop') == 'datePublished':
                    extracted_dates.append(matched.attrib.get('content'))

        self.assertEqual(extracted_dates[0], '2014-07-02')
        self.assertEqual(extracted_dates[-1], '2014-05-18')
Пример #4
0
    def test_align_with_record(self):
        """Align three records against an explicitly supplied seed record.

        Four mappings are expected. For each of the tags ``root``, ``a``,
        ``b`` and ``c``, the element looked up in the returned seed must map
        to an element carrying the same tag in every mapping except the one
        keyed by the seed record itself.
        """
        aligner = RecordAligner()
        seed_record = Record(t4)

        seed, mappings = aligner.align(
            [Record(t1), Record(t2), Record(t3)], seed_record
        )

        self.assertEqual(4, len(mappings))

        # Every seed element should be matched in the three other records.
        for tag in ['root', 'a', 'b', 'c']:
            seed_element = seed[0].xpath('//%s' % tag)[0]
            matched_tags = [
                mapping[seed_element].tag
                for record, mapping in mappings.items()
                if not (seed_record == record)  # skip the seed's own mapping
            ]
            self.assertEqual([tag] * 3, matched_tags)