def _rule_example(self, example):
    """
    Build path rules for an example whose value holds several values.

    The parent ruler is run on the first value and, when present, on the
    second one; any further values are not looked at. With a single value
    the parent's rules are re-wrapped as PathRule objects. With two or
    more, a rule found for both values is kept as is, and every pair of
    rules accepted by `_should_merge` contributes one merged pattern.
    Returns a list of PathRule objects (empty if there are no values).
    """
    log.debug('Ruling example with MultiValuePathRuler') #@UndefinedVariable
    parent_rule_example = super(MultiValuePathRuler, self)._rule_example

    values = list(example.value)
    if not values:
        return []

    first_rules = parent_rule_example(Example(values[0], example.content))
    if len(values) == 1:
        # Only one value: nothing to merge with
        return [PathRule(rule.pattern) for rule in first_rules]

    second_rules = parent_rule_example(Example(values[1], example.content))
    result = []
    for first in first_rules:
        pattern_copy = list(first.pattern)
        if first in second_rules:
            # Same rule reaches both values; keep it untouched
            result.append(PathRule(pattern_copy))
            continue

        for second in second_rules:
            if not self._should_merge(first, second):
                continue
            merged = self._merge_patterns(first.pattern, second.pattern)
            result.append(PathRule(merged))
    return result
def setUp(self):
    """
    Create the trainer under test plus the example fixtures used by the
    test methods: a dictionary of example sets from the mock manager and
    a flat list of three examples.
    """
    rulers = [MockRuler()]
    self.wt = WrapperTrainer(rulers, num_examples=3)

    self.nsets = 5
    manager = MockExampleManager()
    self.example_sets = manager._get_examples(self.nsets, 5, 10)

    pairs = (('v01', 'c01'), ('v02', 'c02'), ('v03', 'c03'))
    self.example_set = [Example(value, content) for value, content in pairs]
def test_evaluate_wrapper(self):
    """
    Evaluating a wrapper against three examples must record two upvotes
    and one downvote.
    """
    # NOTE(review): the first two values embed 'r22' and 'r33', presumably
    # what MockRule(22) and MockRule(33) produce -- confirm against MockRule.
    self.example_set = [
        Example('c01_r22_r33', 'c01'),
        Example('c02_r22_r33', 'c02'),
        Example('incorrect_value', 'c03'),
    ]
    wrapper = Wrapper(rules=[MockRule(22), MockRule(33)])

    self.wt._evaluate_wrapper(wrapper, self.example_set)

    # assertEqual reports both values on failure, unlike failUnless(a == b)
    self.assertEqual(wrapper.upvotes, 2)
    self.assertEqual(wrapper.downvotes, 1)
def get_examples(self, nexamples, url=u'', min_validity=0.5,
                 break_on_min=False):
    """
    Creates examples from the available references in the database. The
    references to use can be filtered depending on the validity and the
    url.

    nexamples is capped at self.max_examples. Only references whose
    validity is at least min_validity and whose extraction result_url
    starts with url are considered, most valid first.

    When break_on_min is true, the scan stops as soon as every field
    collected so far has at least nexamples examples; otherwise it stops
    when every field has reached self.max_examples.

    This method returns a dictionary of fields whose value is a list of
    examples with values for that field.
    """
    nexamples = min(nexamples, self.max_examples)
    url = unicode(url)
    examples = {}

    # The slice is applied to the list returned by all()
    m_results = (self.session.query(mappers.Extraction, mappers.Reference).
                 filter(mappers.Extraction.id ==
                        mappers.Reference.extraction_id).
                 filter(mappers.Reference.validity >= min_validity).
                 filter(mappers.Extraction.result_url.like(url + '%')). #@UndefinedVariable
                 order_by(desc(mappers.Reference.validity)).all()
                 [:self.max_examples_from_db])

    for m_extraction, m_result in m_results:
        content = self._get_content(m_extraction.result_url)

        # Check if the contents of the database are still valid
        if not self._check_still_valid(m_result, content, min_validity):
            continue

        for field in m_result.fields:
            if not field.valid:
                continue
            example = Example(field.value, content, m_extraction.result_url,
                              field.valid, m_result.id)
            examples.setdefault(field.name, []).append(example)

        # Authors and editors are special cases
        authors = self._get_name_regex_values(m_result.authors)
        if authors:
            examples.setdefault('author', []).append(
                Example(authors, content))

        editors = self._get_name_regex_values(m_result.editors)
        if editors:
            examples.setdefault('editor', []).append(
                Example(editors, content))

        # Nothing collected yet (no valid fields, authors or editors so
        # far): min() of an empty sequence would raise ValueError.
        if not examples:
            continue

        # Break if we already have enough examples for all of the fields
        fewest = min(len(found) for found in examples.values())
        if break_on_min and fewest >= nexamples:
            break
        if fewest >= self.max_examples:
            break

    return examples
def setUp(self):
    """
    Create the ruler under test, run the shared fixture setup and add two
    multi-value examples that use the same three name patterns against
    different lists of text fragments.
    """
    self.ruler = ElementsRegexRuler()
    super(TestElementsRegexRuler, self).setUp()

    name_patterns = (
        '(Alberto.*Angel|Angel.*Alberto)',
        '(Geurts.*Pierre|Pierre.*Geurts)',
        '(Damien.*Ernst|Ernst.*Damien)',
    )

    # Each example gets its own list so they never share a mutable value
    decorated_names = [
        'The author Alberto Del Angel',
        'Pierre Geurts The author',
        'Damien Ernst',
    ]
    self.example06 = Example(list(name_patterns), decorated_names)

    plain_names = ['Alberto Del Angel', 'Pierre Geurts', 'Damien Ernst']
    self.example07 = Example(list(name_patterns), plain_names)
def setUp(self):
    """
    Create the ruler under test, run the shared fixture setup and add
    three two-value examples, one per parsed document set up by the
    parent class.
    """
    self.ruler = MultiValuePathRuler()
    super(TestMultiValuePathRuler, self).setUp()

    # Raw strings: '\.' is a regex escape, not a Python string escape.
    # NOTE(review): 'Solona' in the second pattern looks like a typo for
    # 'Solsona'; left as is because fixing it changes what the regex
    # matches -- confirm against springer01.html.
    self.example06 = Example([
        r'.*(Botella.*P\.|P\..*Botella).*',
        r'.*(Solona.*B\.|B\..*Solsona).*',
    ], self.soup03)
    self.example07 = Example([
        r'.*(Alberto.*Angel|Angel.*Alberto).*',
        r'.*(Geurts.*Pierre|Pierre.*Geurts).*',
    ], self.soup01)
    self.example08 = Example([
        r'.*(Michael.*Sweredoski|Sweredoski.*Michael).*',
        r'.*(Pierre.*Baldi|Baldi.*Pierre).*',
    ], self.soup02)
def test_rule(self):
    """
    Check the pattern of the first rule generated from two similar
    examples, then that adding a third, differently shaped example yields
    exactly two rules with the expected first pattern.
    """
    example01 = Example(
        u'2007', u' Volume 22 , '
        'Issue 22-23 (May 2007)')
    example02 = Example(
        u'2009', u' Volume 11 , '
        'Issue 16-25 (May 2009)')
    example03 = Example(u'2008', u' Year of publication: 2008')

    results = self.ruler.rule([example01, example02])
    # assertEqual shows the generated pattern when the comparison fails
    self.assertEqual(
        results[0].pattern,
        u'Volume\\ (?:.*)\\ \\,\\ '
        '\\;\\ Issue\\ (?:.*)\\-2(?:.*)\\ \\ \\;\\(May\\ '
        '(.*)\\)')

    results = self.ruler.rule([example01, example02, example03])
    self.assertEqual(len(results), 2)
    self.assertEqual(
        results[0].pattern,
        u'Year\\ of\\ publication\\:'
        '\\ \\;(.*)')
def setUp(self):
    """
    Shared fixtures for the ruler tests: three documents obtained through
    get_soup, a few elements located inside the first one, the text
    values to look for, and one Example per text value.
    """
    # Documents
    self.soup01 = get_soup('acm01.html')
    self.soup02 = get_soup('acm02.html')
    self.soup03 = get_soup('springer01.html')

    # Elements picked from the first document
    first_doc = self.soup01
    self.element01 = first_doc.find(True, text='Neurocomputing ').parent
    self.element02 = first_doc.find('td', {'class': 'small-text'}).parent
    self.element03 = first_doc.find('col', {'width': '91%'})

    # Values the examples refer to
    self.text01 = '2007'
    self.text02 = '2668-2678'
    self.text03 = '2008'
    self.text04 = '1459-1460'
    self.text05 = '149-154'

    self.element_text = first_doc.find(True, text=re.compile(self.text01))

    # example02 is the only one created without a URL
    source_url = 'http://some_url'
    self.example01 = Example(self.text01, self.soup01, source_url)
    self.example02 = Example(self.text02, self.soup01)
    self.example03 = Example(self.text03, self.soup02, source_url)
    self.example04 = Example(self.text04, self.soup02, source_url)
    self.example05 = Example(self.text05, self.soup03, source_url)
def _get_examples(self, nsets, min, max):
    """
    Build a dictionary of mock example sets keyed by single lowercase
    letters ('a', 'b', ...).

    nsets is the number of sets; a value that is not below the number of
    available letters is replaced by that number minus one. Each set
    holds a random number of examples, between min and max inclusive,
    whose value and content are 'v_<k>' and 'c_<k>' for k counting from
    zero.
    """
    fields = 'abcdefghijklmnopqrstuvwxyz'
    if not nsets < len(fields):
        nsets = len(fields) - 1

    sets = {}
    for position in range(nsets):
        size = random.randint(min, max)
        sets[fields[position]] = [
            Example('v_%d' % k, 'c_%d' % k) for k in range(size)
        ]
    return sets
def setUp(self):
    """
    Create the ruler under test, run the shared fixture setup and add two
    multi-value examples made of name patterns and a text that lists
    several names.
    """
    self.ruler = SeparatorsRegexRuler()
    super(TestSeparatorsRegexRuler, self).setUp()

    # Raw strings: '\.' is a regex escape, not a Python string escape.
    # NOTE(review): 'Solona' in the second pattern looks like a typo for
    # 'Solsona'; left unchanged -- confirm.
    self.example06 = Example([
        r'(Botella.*P\.|P\..*Botella)',
        r'(Solona.*B\.|B\..*Solsona)',
        r'(A\..*Martinez-Arias|Martinez-Arias.*A\.)',
        r'(J\.M\..*Nieto|Nieto.*J\.M\.)',
    ], [
        u'P. Botella1, B. Solsona1, '
        'A. Martinez-Arias2 and J.M. '
        'Lopez Nieto1'
    ])

    # NOTE(review): the content here is a plain string, while example06
    # wraps its text in a list -- confirm that the difference is intended.
    self.example07 = Example([
        r'(Cabre.*L\.|L\..*Cabre)',
        r'(Mancebo.*J\.|J\..*Mancebo)',
        r'(J\..*Solsona|Solsona.*J\.)',
    ], u'L. Cabre1, J. Mancebo2, J. F. Solsona3, '
       ' and the Bioethics Working '
       'Group of the SEMICYUC')
def _get_new_example_set(self, rule, example_set):
    """
    Apply rule to the content of every example in example_set and return
    the resulting examples as a new list.

    Each returned example keeps the original value and carries the
    transformed content. An example is left out, and a warning logged,
    when either its value or the transformed content is empty.
    """
    transformed = []
    for example in example_set:
        value = example.value
        content = rule.apply(example.content)
        if not (value and content):
            log.warn('Example content is None after applying rule') #@UndefinedVariable
            continue
        transformed.append(Example(value, content))
    return transformed
def test_rule_example(self):
    """
    The first rule generated for a single example must be the escaped
    content with the value replaced by a capturing group.
    """
    example = Example('2007', 'Volume 31, Number 7 / July, 2007')

    rules = self.ruler._rule_example(example)

    # Raw string: the backslashes belong to the regex, so they must not
    # be read as (invalid) Python string escapes.
    expected = r'Volume\ 31\,\ Number\ 7\ \/\ July\,\ (.*)'
    self.assertEqual(rules[0].pattern, expected)
def test_get_invalid_content_element(self):
    """
    Looking up a value in an empty document must yield no content
    elements.
    """
    example = Example(value='random text', content=BeautifulSoup(''))

    elements = self.ruler._get_content_elements(example.value,
                                                example.content)

    # assertFalse replaces the deprecated failIf alias
    self.assertFalse(elements)