def test_annotations(self):
    # A single annotate() call with best_match=False is expected to leave
    # two "field1" annotations on the page (presumably one per fragment of
    # self.PAGE matching the text -- verify against the fixture).
    maker = TemplateMaker(self.PAGE)
    maker.annotate("field1", best_match("text to annotate"), best_match=False)

    # Each item yielded by annotations() is indexable; only its first
    # element is compared.
    found = []
    for entry in maker.annotations():
        found.append(entry[0])

    expected = [
        {u"annotations": {u"content": u"field1"}},
        {u"annotations": {u"content": u"field1"}},
    ]
    self.assertEqual(found, expected)
def test_annotations(self):
    # Annotate 'field1' with best_match=False, then check that the first
    # element of every item from annotations() is the 'field1' annotation
    # and that there are exactly two of them.
    template_maker = TemplateMaker(self.PAGE)
    matcher = best_match('text to annotate')
    template_maker.annotate('field1', matcher, best_match=False)

    first_elements = [item[0] for item in template_maker.annotations()]

    field1_annotation = {u'annotations': {u'content': u'field1'}}
    self.assertEqual(first_elements, [field1_annotation, field1_annotation])
def test_annotate_multiple(self):
    # Build a template from self.PAGE annotated with best_match=False and
    # check that extracting from the same page yields both matching texts
    # under 'field1'.
    maker = TemplateMaker(self.PAGE)
    maker.annotate('field1', best_match('text to annotate'), best_match=False)
    template = maker.get_template()

    # The extractor is given (template, None) pairs in this variant.
    extractor = InstanceBasedLearningExtractor([(template, None)])
    first_result = extractor.extract(self.PAGE)[0]

    expected = [
        {
            u'field1': [
                u'Some text to annotate here',
                u'Another text to annotate there',
            ],
        },
    ]
    self.assertEqual(first_result, expected)
def test_annotate_ignore_unpaired(self):
    # Annotate the fragment containing "and that's" and check that the
    # extracted 'field1' value is the whole text, including the unpaired
    # <img /> tag that sits inside it.
    maker = TemplateMaker(self.PAGE)
    maker.annotate('field1', best_match("and that's"), best_match=False)
    template = maker.get_template()

    extractor = InstanceBasedLearningExtractor([(template, None)])
    first_result = extractor.extract(self.PAGE)[0]

    expected = [
        {u'field1': [u"More text with unpaired tag <img />and that's it"]},
    ]
    self.assertEqual(first_result, expected)
def test_annotate_multiple(self):
    # Annotate "field1" with best_match=False, turn the result into a
    # template, and check that extraction from the same page returns both
    # matching texts for that field.
    template_maker = TemplateMaker(self.PAGE)
    matcher = best_match("text to annotate")
    template_maker.annotate("field1", matcher, best_match=False)

    # This variant hands the bare template (not a tuple) to the extractor.
    templates = [template_maker.get_template()]
    extractor = InstanceBasedLearningExtractor(templates)

    extracted = extractor.extract(self.PAGE)
    self.assertEqual(
        extracted[0],
        [
            {
                u"field1": [
                    u"Some text to annotate here",
                    u"Another text to annotate there",
                ]
            }
        ],
    )
def train_from_htmlpage(self, htmlpage, data):
    """Annotate *htmlpage* with the values in *data* and add the template.

    Args:
        htmlpage: page object passed to ``TemplateMaker``; its ``encoding``
            attribute is handed to ``str_to_unicode`` for every value.
        data: non-empty mapping of field name to a single value or an
            iterable of values to annotate.

    Raises:
        AssertionError: if *data* is empty (when assertions are enabled).
    """
    assert data, "Cannot train with empty data"

    maker = TemplateMaker(htmlpage)
    for field, values in data.items():
        # Strings and bytes are iterable but count as one value here, as
        # does anything that lacks ``__iter__`` altogether.
        is_single = (
            isinstance(values, (bytes, str))
            or not hasattr(values, '__iter__')
        )
        candidates = [values] if is_single else values
        for raw in candidates:
            text = str_to_unicode(raw, htmlpage.encoding)
            maker.annotate(field, best_match(text))

    template = maker.get_template()
    self.add_template(template)
def train_from_htmlpage(self, htmlpage, data):
    """Annotate *htmlpage* with the values in *data* and add the template.

    Args:
        htmlpage: page object passed to ``TemplateMaker``; its ``encoding``
            attribute is used to decode byte-string values, falling back
            to UTF-8 when it is falsy.
        data: non-empty mapping of field name to a single value or an
            iterable of values to annotate.

    Raises:
        AssertionError: if *data* is empty (when assertions are enabled).
    """
    assert data, "Cannot train with empty data"
    tm = TemplateMaker(htmlpage)
    for field, values in data.items():
        # A lone string must be wrapped explicitly: on Python 3 ``str`` has
        # ``__iter__``, so the attribute probe alone would split the string
        # into single characters.
        if isinstance(values, (bytes, str)) or not hasattr(values, '__iter__'):
            values = [values]
        for value in values:
            # Only byte strings are decoded; text strings have no
            # ``decode`` method on Python 3. On Python 2 ``bytes`` is
            # ``str``, so the previous behaviour is unchanged there.
            if isinstance(value, bytes):
                value = value.decode(htmlpage.encoding or 'utf-8')
            tm.annotate(field, best_match(value))
    self.add_template(tm.get_template())
def train(self, url=None, data=None, html=None, encoding='utf-8'):
    """Annotate a page with the values in *data* and store the template.

    The page is obtained from ``self._get_page(url, encoding, html)`` and
    the resulting template is appended to ``self.templates``.

    Args:
        url: forwarded to ``self._get_page``.
        data: non-empty mapping of field name to a single value or an
            iterable of values to annotate.
        html: forwarded to ``self._get_page``.
        encoding: forwarded to ``self._get_page`` and also used to decode
            byte-string values in *data*.

    Raises:
        AssertionError: if *data* is empty (when assertions are enabled).
    """
    assert data, "Cannot train with empty data"
    page = self._get_page(url, encoding, html)
    tm = TemplateMaker(page)
    for field, values in data.items():
        # A lone string must be wrapped explicitly: on Python 3 ``str`` has
        # ``__iter__``, so the attribute probe alone would split the string
        # into single characters.
        if isinstance(values, (bytes, str)) or not hasattr(values, '__iter__'):
            values = [values]
        for value in values:
            # Only byte strings are decoded; text strings have no
            # ``decode`` method on Python 3. On Python 2 ``bytes`` is
            # ``str``, so the previous behaviour is unchanged there.
            if isinstance(value, bytes):
                value = value.decode(encoding)
            tm.annotate(field, best_match(value))
    self.templates.append(tm.get_template())
def test_annotate_fragment_already_annotated(self):
    # Annotating the same fragment a second time is expected to raise
    # FragmentAlreadyAnnotated.
    maker = TemplateMaker(self.PAGE)
    maker.annotate("field1", best_match("text to annotate"))

    # Build the second matcher outside the ``with`` block so that only the
    # annotate() call itself is covered by the exception check.
    repeated_matcher = best_match("text to annotate")
    with self.assertRaises(FragmentAlreadyAnnotated):
        maker.annotate("field1", repeated_matcher)
def test_annotate_fragment_already_annotated(self):
    # The first annotation of the fragment succeeds; repeating the same
    # annotate() call must raise FragmentAlreadyAnnotated.
    template_maker = TemplateMaker(self.PAGE)
    first_matcher = best_match('text to annotate')
    template_maker.annotate('field1', first_matcher)

    second_matcher = best_match("text to annotate")
    self.assertRaises(
        FragmentAlreadyAnnotated,
        template_maker.annotate,
        'field1',
        second_matcher,
    )