class ContextResolverTest(unittest.TestCase):
    """
    Unit tests for ContextResolver.

    The fixture document comes from the module-level ``html`` value, which is
    cleaned with ContentCleaner before the elements under test are located.
    """

    def setUp(self):
        """Create the resolver and locate the two elements used by the tests."""
        self.cr = ContextResolver()
        self.soup = ContentCleaner().clean_content(html)
        # The context is resolved for the parent of whatever find() returns
        # for the given value text.
        self.element01 = self.soup.find('td', text='Value 01').parent
        self.element02 = self.soup.find('td', text='Value 03').parent

    def tearDown(self):
        """Nothing to release."""
        pass

    def test_get_context(self):
        """The context of element01 must count u'Field 01:' exactly once."""
        context = self.cr.get_context(self.element01)
        # assertEqual (instead of failUnless(a == b)) reports both values
        # on failure and is not a deprecated alias.
        self.assertEqual(context[u'Field 01:'], 1)

    def test_get_tree_context(self):
        """The context of element02 must count u'Field 03' and u'33' once."""
        context = self.cr.get_context(self.element02)
        self.assertEqual(context[u'Field 03'], 1)
        self.assertEqual(context[u'33'], 1)

    def test_merge_contexts(self):
        """Merging adds the counts of shared keys and keeps the other keys."""
        context01 = {u'Field 01:': 1}
        context02 = {u'Field 01:': 3, u'Field 02:': 1, u'Field 03:': 4}
        merged = self.cr.merge_context(context01, context02)
        self.assertEqual(merged, {
            u'Field 02:': 1,
            u'Field 01:': 4,
            u'Field 03:': 4,
        })

    def test_clean_context(self):
        """
        Cleaning must keep only 'a' and 'b' from the sample context.

        NOTE(review): presumably 'c' is dropped for its low count and the
        last key for its length -- confirm against clean_context.
        """
        context = {
            'a': 2,
            'b': 3,
            'c': 1,
            'this string is quite long. yes indeed': 4,
        }
        result = self.cr.clean_context(context)
        self.assertEqual(result, {'a': 2, 'b': 3})

    def test_get_top_words(self):
        """Asking for 3 strings must return the keys with values 5, 4, 3."""
        context = {u'a': 3, 'b': 5, 'c': 1, u'd': 2, 'e': 4}
        expected = ['b', 'e', u'a']
        result = self.cr.get_top_strings(context, 3)
        self.assertEqual(result, expected)

    def test_check_context(self):
        """
        check_context must be truthy when the contexts share a key or the
        first context is empty, and falsy when they share nothing.
        """
        context01 = {'a': 3, 'b': 5, 'c': 1, 'd': 2, 'e': 4}

        # Shared key 'a'.
        context02 = {'a': 1, 'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)

        # No shared key.
        context02 = {'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertFalse(result)

        # Empty first context.
        context01 = {}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)
class PathRuler(Ruler):
    """
    Creates a rule described by the path to locate some piece of information
    in an HTML document.

    Content of the examples must be a BeautifulSoup object that describes an
    HTML document.
    """

    def __init__(self, value_guide='.*'):
        """
        value_guide: value placed at the front of every generated rule
        pattern (see rule()). Defaults to '.*'.
        """
        super(PathRuler, self).__init__()
        self.context_resolver = ContextResolver()
        self.value_guide = value_guide

    def rule(self, training):
        """
        Build the rules for the training set through the parent Ruler and
        post-process each resulting pattern.

        For every rule the value guide is inserted at position 0 and the
        entry that then sits at position 1 is replaced by its cleaned
        version. NOTE(review): position 1 is presumably the context dict
        that _rule_element put at the head of the pattern -- confirm that
        the parent's rule() preserves that layout.

        Returns the rules produced by the parent class, modified in place.
        """
        rules = super(PathRuler, self).rule(training)
        for path_rule in rules:
            path_rule.pattern.insert(0, self.value_guide)
            # Clean context
            path_rule.pattern[1] = self.context_resolver.clean_context(
                path_rule.pattern[1])
        return rules

    def _rule_example(self, example):
        """
        Create the rules for a single example.

        Every element returned by _get_content_elements for the example is
        ruled individually; elements that cannot be ruled are skipped. The
        per-element rules are then passed to _merge_rules together with the
        (initially empty) result list, which is returned.
        """
        # The value is formatted through a 1-tuple instead of str(): on
        # Python 2, str() raises UnicodeEncodeError for non-ASCII unicode
        # values, and the tuple keeps tuple values from being unpacked by %.
        log.debug('Ruling example with PathRuler. Value %s' % #@UndefinedVariable
                  (example.value,))
        rules = []
        element_rules = []
        for element in self._get_content_elements(example.value,
                                                  example.content):
            rule = self._rule_element(example, element)
            if rule:
                element_rules.append(rule)
        self._merge_rules(rules, element_rules)
        return rules

    def _rule_element(self, example, element):
        """
        Create a PathRule for one element of the example's content.

        The pattern is the path of the element's parent with the parent's
        context inserted at position 0.

        Returns the PathRule, or None (after logging a warning) when any
        step raises.
        """
        try:
            pattern = self._get_element_path(example.content, element.parent)
            context = self.context_resolver.get_context(element.parent)
            pattern.insert(0, context)
            return PathRule(pattern)
        # "as" form: valid on Python 2.6+ and 3.x, unlike "except X, e".
        except Exception as e:
            # Best effort: an element that cannot be ruled is skipped.
            log.warn('Path ruler cannot rule element %s: %s' #@UndefinedVariable
                     % (str(element), e))
            return None
def _choose_element(self, elements):
    """
    Collect the text of every candidate element accepted by this rule.

    An element is accepted when check_context() approves its context
    against self.context and its joined text matches self.value_guide
    (used as a regular expression with re.search).

    Returns a list with the text of each accepted element, in the order
    the elements were given; empty when none is accepted.
    """
    resolver = ContextResolver()
    accepted = []
    for candidate in elements:
        # Check field context
        candidate_context = resolver.get_context(candidate)
        if resolver.check_context(self.context, candidate_context):
            # Use value guide
            candidate_text = ''.join(
                candidate.findAll(name=True, text=True))
            if re.search(self.value_guide, candidate_text):
                accepted.append(candidate_text)
    return accepted
class ContextResolverTest(unittest.TestCase):
    """
    Unit tests for ContextResolver.

    The fixture document comes from the module-level ``html`` value, which is
    cleaned with ContentCleaner before the elements under test are located.
    """

    def setUp(self):
        """Create the resolver and locate the two elements used by the tests."""
        self.cr = ContextResolver()
        self.soup = ContentCleaner().clean_content(html)
        # The context is resolved for the parent of whatever find() returns
        # for the given value text.
        self.element01 = self.soup.find('td', text='Value 01').parent
        self.element02 = self.soup.find('td', text='Value 03').parent

    def tearDown(self):
        """Nothing to release."""
        pass

    def test_get_context(self):
        """The context of element01 must count u'Field 01:' exactly once."""
        context = self.cr.get_context(self.element01)
        # assertEqual replaces the deprecated failUnless(a == b) form and
        # shows both operands when the check fails.
        self.assertEqual(context[u'Field 01:'], 1)

    def test_get_tree_context(self):
        """The context of element02 must count u'Field 03' and u'33' once."""
        context = self.cr.get_context(self.element02)
        self.assertEqual(context[u'Field 03'], 1)
        self.assertEqual(context[u'33'], 1)

    def test_merge_contexts(self):
        """Merging adds the counts of shared keys and keeps the other keys."""
        context01 = {u'Field 01:': 1}
        context02 = {u'Field 01:': 3, u'Field 02:': 1, u'Field 03:': 4}
        merged = self.cr.merge_context(context01, context02)
        self.assertEqual(
            merged,
            {u'Field 02:': 1, u'Field 01:': 4, u'Field 03:': 4})

    def test_clean_context(self):
        """
        Cleaning must keep only 'a' and 'b' from the sample context.

        NOTE(review): presumably 'c' is dropped for its low count and the
        last key for its length -- confirm against clean_context.
        """
        context = {'a': 2, 'b': 3, 'c': 1,
                   'this string is quite long. yes indeed': 4}
        result = self.cr.clean_context(context)
        self.assertEqual(result, {'a': 2, 'b': 3})

    def test_get_top_words(self):
        """Asking for 3 strings must return the keys with values 5, 4, 3."""
        context = {u'a': 3, 'b': 5, 'c': 1, u'd': 2, 'e': 4}
        expected = ['b', 'e', u'a']
        result = self.cr.get_top_strings(context, 3)
        self.assertEqual(result, expected)

    def test_check_context(self):
        """
        check_context must be truthy when the contexts share a key or the
        first context is empty, and falsy when they share nothing.
        """
        context01 = {'a': 3, 'b': 5, 'c': 1, 'd': 2, 'e': 4}

        # Shared key 'a'.
        context02 = {'a': 1, 'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)

        # No shared key.
        context02 = {'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertFalse(result)

        # Empty first context.
        context01 = {}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)
def __init__(self, value_guide='.*'):
    """
    Initialise the ruler.

    value_guide: stored unchanged as self.value_guide; defaults to '.*'
    (presumably a regular expression -- confirm against its users).

    A fresh ContextResolver is kept as self.context_resolver.
    """
    super(PathRuler, self).__init__()
    self.value_guide = value_guide
    self.context_resolver = ContextResolver()
def setUp(self):
    """
    Prepare the fixtures shared by the tests.

    Creates the resolver, cleans the module-level ``html`` document and
    stores the parents of the 'Value 01' and 'Value 03' lookups as
    element01 and element02.
    """
    self.cr = ContextResolver()
    cleaner = ContentCleaner()
    self.soup = cleaner.clean_content(html)

    first_match = self.soup.find('td', text='Value 01')
    self.element01 = first_match.parent

    second_match = self.soup.find('td', text='Value 03')
    self.element02 = second_match.parent