class ContextResolverTest(unittest.TestCase):
    """Unit tests for ``ContextResolver``.

    The assertions use ``assertEqual`` / ``assertTrue`` / ``assertFalse``
    rather than the ``failUnless`` / ``failIf`` aliases, which are no longer
    available on current Python versions.
    """

    def setUp(self):
        """Build a resolver and locate two fixture elements.

        ``html`` is a module-level fixture defined outside this class; it is
        passed through ``ContentCleaner.clean_content`` before searching.
        """
        self.cr = ContextResolver()
        self.soup = ContentCleaner().clean_content(html)
        # Each element is the parent of the <td> whose text matches.
        self.element01 = self.soup.find('td', text='Value 01').parent
        self.element02 = self.soup.find('td', text='Value 03').parent

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_get_context(self):
        """The context of element01 contains u'Field 01:' with count 1."""
        context = self.cr.get_context(self.element01)
        self.assertEqual(context[u'Field 01:'], 1)

    def test_get_tree_context(self):
        """The context of element02 contains u'Field 03' and u'33' once each."""
        context = self.cr.get_context(self.element02)
        self.assertEqual(context[u'Field 03'], 1)
        self.assertEqual(context[u'33'], 1)

    def test_merge_contexts(self):
        """Merging adds the counts of shared keys and keeps the other keys."""
        context01 = {u'Field 01:': 1}
        context02 = {u'Field 01:': 3, u'Field 02:': 1, u'Field 03:': 4}
        merged = self.cr.merge_context(context01, context02)
        self.assertEqual(
            merged,
            {u'Field 02:': 1, u'Field 01:': 4, u'Field 03:': 4},
        )

    def test_clean_context(self):
        """Cleaning keeps only 'a' and 'b' from the sample context.

        NOTE(review): presumably the count-1 entry and the long string are
        dropped by count/length rules -- confirm against clean_context.
        """
        context = {
            'a': 2,
            'b': 3,
            'c': 1,
            'this string is quite long. yes indeed': 4,
        }
        result = self.cr.clean_context(context)
        self.assertEqual(result, {'a': 2, 'b': 3})

    def test_get_top_words(self):
        """get_top_strings(context, 3) returns the three highest-count keys."""
        context = {u'a': 3, 'b': 5, 'c': 1, u'd': 2, 'e': 4}
        expected = ['b', 'e', u'a']
        result = self.cr.get_top_strings(context, 3)
        self.assertEqual(result, expected)

    def test_check_context(self):
        """check_context is truthy on a shared key or an empty first context."""
        context01 = {'a': 3, 'b': 5, 'c': 1, 'd': 2, 'e': 4}

        # The contexts share the key 'a'.
        context02 = {'a': 1, 'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)

        # No key in common.
        context02 = {'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertFalse(result)

        # An empty first context is accepted.
        context01 = {}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)
class ContextResolverTest(unittest.TestCase):
    """Test case exercising the public methods of ``ContextResolver``.

    Written with the standard ``assert*`` methods; the older ``failUnless``
    and ``failIf`` spellings are not available on current Python versions.
    """

    def setUp(self):
        """Create the resolver under test and pick two fixture elements.

        The ``html`` fixture is a module-level name defined outside this
        class; it is cleaned with ``ContentCleaner`` before being searched.
        """
        self.cr = ContextResolver()
        self.soup = ContentCleaner().clean_content(html)
        # Use the parent of the matching <td>, not the cell itself.
        self.element01 = self.soup.find('td', text='Value 01').parent
        self.element02 = self.soup.find('td', text='Value 03').parent

    def tearDown(self):
        """No per-test cleanup is required."""
        pass

    def test_get_context(self):
        """get_context(element01) reports u'Field 01:' with a count of 1."""
        context = self.cr.get_context(self.element01)
        self.assertEqual(context[u'Field 01:'], 1)

    def test_get_tree_context(self):
        """get_context(element02) reports u'Field 03' and u'33' once each."""
        context = self.cr.get_context(self.element02)
        self.assertEqual(context[u'Field 03'], 1)
        self.assertEqual(context[u'33'], 1)

    def test_merge_contexts(self):
        """merge_context sums counts for shared keys and keeps the rest."""
        context01 = {u'Field 01:': 1}
        context02 = {u'Field 01:': 3, u'Field 02:': 1, u'Field 03:': 4}
        merged = self.cr.merge_context(context01, context02)
        expected = {u'Field 02:': 1, u'Field 01:': 4, u'Field 03:': 4}
        self.assertEqual(merged, expected)

    def test_clean_context(self):
        """clean_context reduces the sample context to 'a' and 'b'.

        NOTE(review): the dropped entries are the count-1 key and the long
        string; the exact filtering rules live in clean_context -- confirm.
        """
        context = {
            'a': 2,
            'b': 3,
            'c': 1,
            'this string is quite long. yes indeed': 4,
        }
        result = self.cr.clean_context(context)
        self.assertEqual(result, {'a': 2, 'b': 3})

    def test_get_top_words(self):
        """get_top_strings returns the 3 keys with the highest counts."""
        context = {u'a': 3, 'b': 5, 'c': 1, u'd': 2, 'e': 4}
        expected = ['b', 'e', u'a']
        result = self.cr.get_top_strings(context, 3)
        self.assertEqual(result, expected)

    def test_check_context(self):
        """check_context accepts a shared key or an empty first context."""
        context01 = {'a': 3, 'b': 5, 'c': 1, 'd': 2, 'e': 4}

        # Key 'a' appears in both contexts.
        context02 = {'a': 1, 'x': 3}
        self.assertTrue(self.cr.check_context(context01, context02))

        # The contexts have no key in common.
        context02 = {'x': 3}
        self.assertFalse(self.cr.check_context(context01, context02))

        # With an empty first context the check passes.
        context01 = {}
        self.assertTrue(self.cr.check_context(context01, context02))
def _choose_element(self, elements):
    """Collect the text of each element that passes both filters.

    An element is kept when ``ContextResolver.check_context`` accepts its
    context against ``self.context`` and ``self.value_guide`` (used as a
    regular expression) is found somewhere in its joined text.

    Returns a list of the joined text strings, in input order.
    """
    resolver = ContextResolver()
    selected = []
    for candidate in elements:
        # Filter 1: the element's field context must be compatible.
        candidate_context = resolver.get_context(candidate)
        if resolver.check_context(self.context, candidate_context):
            # Filter 2: the joined text must match the value guide.
            fragments = candidate.findAll(name=True, text=True)
            joined_text = ''.join(fragments)
            if re.search(self.value_guide, joined_text):
                selected.append(joined_text)
    return selected