示例#1
0
    def _rule_example(self, example):
        """
        Create path rules for an example whose value is a sequence of values.

        Only the first two values are looked at. Each of them is wrapped in
        its own single-value Example (sharing the original content) and
        handed to the parent class' _rule_example.

        Returns a list of PathRule objects:
          * an empty list when the example has no values;
          * with one value, one PathRule per parent rule;
          * with two or more values, one PathRule per rule of the first
            value. A rule that also appears among the second value's rules
            keeps (a copy of) its own pattern; otherwise its pattern is
            merged with every second-value rule accepted by _should_merge.
            Each merge starts again from the first rule's own pattern, so
            when several rules are accepted the last merge is the one kept.
        """
        log.debug('Ruling example with MultiValuePathRuler') #@UndefinedVariable
        parent_rule_example = super(MultiValuePathRuler, self)._rule_example
        candidates = list(example.value)
        if len(candidates) == 0:
            return []

        head_rules = parent_rule_example(Example(candidates[0],
                                                 example.content))
        if len(candidates) == 1:
            # A single value needs no merging: re-wrap the parent's patterns
            return [PathRule(rule.pattern) for rule in head_rules]

        next_rules = parent_rule_example(Example(candidates[1],
                                                 example.content))
        merged_rules = []
        for head_rule in head_rules:
            pattern = list(head_rule.pattern)
            if head_rule not in next_rules:
                # No identical rule for the second value: try to generalize
                for next_rule in next_rules:
                    if self._should_merge(head_rule, next_rule):
                        pattern = self._merge_patterns(head_rule.pattern,
                                                       next_rule.pattern)
            merged_rules.append(PathRule(pattern))

        return merged_rules
 def setUp(self):
     """
     Prepare a WrapperTrainer built on a MockRuler, a dictionary of mock
     example sets and a small hand-written example set.
     """
     trainer = WrapperTrainer([MockRuler()], num_examples=3)
     self.wt = trainer
     self.nsets = 5
     manager = MockExampleManager()
     self.example_sets = manager._get_examples(self.nsets, 5, 10)
     value_content_pairs = (
         ('v01', 'c01'),
         ('v02', 'c02'),
         ('v03', 'c03'),
     )
     self.example_set = [
         Example(value, content) for value, content in value_content_pairs
     ]
 def test_evaluate_wrapper(self):
     """
     Evaluate a two-rule wrapper against three examples and expect two
     upvotes and one downvote.
     """
     # Presumably the first two values are what MockRule(22) followed by
     # MockRule(33) produce from 'c01' / 'c02' -- confirm against MockRule.
     self.example_set = [
         Example('c01_r22_r33', 'c01'),
         Example('c02_r22_r33', 'c02'),
         Example('incorrect_value', 'c03')
     ]
     wrapper = Wrapper(rules=[MockRule(22), MockRule(33)])
     self.wt._evaluate_wrapper(wrapper, self.example_set)
     # assertEqual reports both values on failure; failUnless is a
     # deprecated alias that only reports "False is not true".
     self.assertEqual(wrapper.upvotes, 2)
     self.assertEqual(wrapper.downvotes, 1)
示例#4
0
    def get_examples(self, nexamples, url=u'', min_validity=0.5,
                     break_on_min=False):
        """
        Creates examples from the available references in the database. The
        references to use can be filtered depending on the validity and the 
        url.
        
        This method returns a dictionary of fields whose value is a list of
        examples with values for that field.

        Parameters:
          nexamples    -- number of examples wanted per field; values above
                          self.max_examples are capped to it.
          url          -- only extractions whose result_url starts with this
                          string are considered.
          min_validity -- references with a lower validity are not queried.
          break_on_min -- when true, stop as soon as every field collected
                          so far has at least nexamples examples. Otherwise
                          stop only once every field has reached
                          self.max_examples (or the results run out).
        """
        nexamples = min(nexamples, self.max_examples)

        url = unicode(url)
        examples = {}
        m_results = (self.session.query(mappers.Extraction, mappers.Reference).
                     filter(mappers.Extraction.id == mappers.Reference.extraction_id).
                     filter(mappers.Reference.validity >= min_validity).
                     filter(mappers.Extraction.result_url.like(url + '%')). #@UndefinedVariable
                     order_by(desc(mappers.Reference.validity)).all()
                     [:self.max_examples_from_db])

        for m_extraction, m_result in m_results:
            content = self._get_content(m_extraction.result_url)

            # Check if the contents of the database are still valid
            if not self._check_still_valid(m_result, content, min_validity):
                continue

            for field in m_result.fields:
                if not field.valid:
                    continue
                example = Example(field.value, content, m_extraction.result_url,
                                  field.valid, m_result.id)
                examples.setdefault(field.name, []).append(example)

            # Authors and editors are special cases
            authors = self._get_name_regex_values(m_result.authors)
            if authors:
                examples.setdefault('author', []).append(
                    Example(authors, content))

            editors = self._get_name_regex_values(m_result.editors)
            if editors:
                examples.setdefault('editor', []).append(
                    Example(editors, content))

            # A reference may contribute nothing (no valid fields, authors
            # or editors). While the dictionary is still empty there is no
            # minimum to compute: min() of an empty sequence raises
            # ValueError.
            if not examples:
                continue

            # Break if we already have enough examples for all of the fields
            fewest = min(len(found) for found in examples.values())
            if break_on_min and fewest >= nexamples:
                break
            if fewest >= self.max_examples:
                break

        return examples
    def setUp(self):
        """
        Install an ElementsRegexRuler, run the parent fixture and add two
        multi-value examples sharing the same name patterns: one whose
        content elements carry extra text and one with the bare names.
        """
        self.ruler = ElementsRegexRuler()
        super(TestElementsRegexRuler, self).setUp()

        name_patterns = (
            '(Alberto.*Angel|Angel.*Alberto)',
            '(Geurts.*Pierre|Pierre.*Geurts)',
            '(Damien.*Ernst|Ernst.*Damien)',
        )
        contents_with_extra_text = [
            'The author Alberto Del Angel',
            'Pierre Geurts The author',
            'Damien Ernst',
        ]
        contents_names_only = [
            'Alberto Del Angel',
            'Pierre Geurts',
            'Damien Ernst',
        ]

        # Each example receives its own list of patterns
        self.example06 = Example(list(name_patterns),
                                 contents_with_extra_text)
        self.example07 = Example(list(name_patterns), contents_names_only)
    def setUp(self):
        """
        Install a MultiValuePathRuler, run the parent fixture and add three
        examples whose values are lists of name regular expressions, each
        paired with one of the soups created by the parent fixture.
        """
        self.ruler = MultiValuePathRuler()
        super(TestMultiValuePathRuler, self).setUp()

        # Raw strings keep the regex backslashes literal; in a plain string
        # "\." is an invalid escape sequence that newer interpreters warn
        # about. The runtime values are unchanged.
        # NOTE(review): 'Solona' in the second pattern looks like a typo of
        # 'Solsona'; left as is because it changes what the regex matches.
        self.example06 = Example([
            r'.*(Botella.*P\.|P\..*Botella).*',
            r'.*(Solona.*B\.|B\..*Solsona).*'
        ], self.soup03)

        self.example07 = Example([
            '.*(Alberto.*Angel|Angel.*Alberto).*',
            '.*(Geurts.*Pierre|Pierre.*Geurts).*'
        ], self.soup01)
        self.example08 = Example([
            '.*(Michael.*Sweredoski|Sweredoski.*Michael).*',
            '.*(Pierre.*Baldi|Baldi.*Pierre).*'
        ], self.soup02)
    def test_rule(self):
        """
        Rule first two and then three examples and check the patterns of
        the resulting rules.
        """
        example01 = Example(
            u'2007', u' Volume 22 ,&nbsp; '
            'Issue 22-23 &nbsp;(May 2007)')
        example02 = Example(
            u'2009', u' Volume 11 ,&nbsp; '
            'Issue 16-25 &nbsp;(May 2009)')
        example03 = Example(u'2008', u' Year of publication:&nbsp;2008')

        # assertEqual shows the generated pattern on failure, which the
        # deprecated failUnless(a == b) form does not.
        results = self.ruler.rule([example01, example02])
        self.assertEqual(results[0].pattern,
                         u'Volume\\ (?:.*)\\ \\,\\&nbsp'
                         '\\;\\ Issue\\ (?:.*)\\-2(?:.*)\\ \\&nbsp\\;\\(May\\ '
                         '(.*)\\)')

        results = self.ruler.rule([example01, example02, example03])
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0].pattern,
                         u'Year\\ of\\ publication\\:'
                         '\\&nbsp\\;(.*)')
 def setUp(self):
     """
     Load the three HTML fixtures and derive from them the elements,
     texts and examples used by the tests.
     """
     first_soup = get_soup('acm01.html')
     second_soup = get_soup('acm02.html')
     third_soup = get_soup('springer01.html')
     self.soup01 = first_soup
     self.soup02 = second_soup
     self.soup03 = third_soup

     # Elements looked up in the first soup
     self.element01 = first_soup.find(True, text='Neurocomputing ').parent
     self.element02 = first_soup.find('td', {'class': 'small-text'}).parent
     self.element03 = first_soup.find('col', {'width': '91%'})

     # Values the examples are built around
     self.text01 = '2007'
     self.text02 = '2668-2678'
     self.text03 = '2008'
     self.text04 = '1459-1460'
     self.text05 = '149-154'

     text01_regex = re.compile(self.text01)
     self.element_text = first_soup.find(True, text=text01_regex)

     some_url = 'http://some_url'
     self.example01 = Example(self.text01, first_soup, some_url)
     # The second example is deliberately built without a url, as before
     self.example02 = Example(self.text02, first_soup)
     self.example03 = Example(self.text03, second_soup, some_url)
     self.example04 = Example(self.text04, second_soup, some_url)
     self.example05 = Example(self.text05, third_soup, some_url)
 def _get_examples(self, nsets, min, max):
     """
     Build a dictionary of mock example lists keyed by one-letter field
     names ('a', 'b', ...).

     nsets is the number of fields to create; it is capped at the 26
     available letters. Each field gets a random number of examples
     between min and max (both inclusive, as passed to random.randint),
     with values 'v_<n>' and contents 'c_<n>' where <n> is the position
     of the example inside its list.
     """
     # NOTE: 'min' and 'max' shadow the builtins inside this method. The
     # names are kept because they are part of the method's signature.
     fields = 'abcdefghijklmnopqrstuvwxyz'
     # Cap at the number of letters. The previous cap of len(fields) - 1
     # made the last letter unreachable.
     if nsets > len(fields):
         nsets = len(fields)

     sets = {}
     for field_index in range(nsets):
         # A separate name for the inner index avoids clobbering the outer
         # loop variable (list comprehension variables leak in Python 2).
         sets[fields[field_index]] = [
             Example('v_%d' % example_index, 'c_%d' % example_index)
             for example_index in range(random.randint(min, max))
         ]
     return sets
    def setUp(self):
        """
        Install a SeparatorsRegexRuler, run the parent fixture and add two
        examples whose values are lists of name regular expressions and
        whose content is a line of separator-delimited author names.
        """
        self.ruler = SeparatorsRegexRuler()
        super(TestSeparatorsRegexRuler, self).setUp()

        # Raw strings keep the regex backslashes literal; in a plain string
        # "\." is an invalid escape sequence that newer interpreters warn
        # about. The runtime values are unchanged.
        # NOTE(review): 'Solona' in the second pattern looks like a typo of
        # 'Solsona'; left as is because it changes what the regex matches.
        self.example06 = Example([
            r'(Botella.*P\.|P\..*Botella)', r'(Solona.*B\.|B\..*Solsona)',
            r'(A\..*Martinez-Arias|Martinez-Arias.*A\.)',
            r'(J\.M\..*Nieto|Nieto.*J\.M\.)'
        ], [
            u'P. Botella1, B. Solsona1, '
            'A. Martinez-Arias2 and J.M. '
            'Lopez Nieto1'
        ])

        # NOTE(review): here the content is a plain string while example06
        # wraps its content in a list -- confirm this is intended.
        self.example07 = Example([
            r'(Cabre.*L\.|L\..*Cabre)', r'(Mancebo.*J\.|J\..*Mancebo)',
            r'(J\..*Solsona|Solsona.*J\.)'
        ], u'L. Cabre1, J. Mancebo2, J. F. Solsona3, '
                                 ' and the Bioethics Working '
                                 'Group of the SEMICYUC')
示例#11
0
 def _get_new_example_set(self, rule, example_set):
     """
     Build a new list of examples by applying rule to the content of every
     example in example_set. Values are carried over unchanged.

     An example is dropped, and a warning logged, when either its value or
     the content obtained from the rule is falsy.
     """
     transformed = []
     for original in example_set:
         kept_value = original.value
         new_content = rule.apply(original.content)
         if not (kept_value and new_content):
             log.warn('Example content is None after applying rule')  #@UndefinedVariable
             continue
         transformed.append(Example(kept_value, new_content))
     return transformed
 def test_rule_example(self):
     """
     Rule a single example and check the pattern of the first rule: the
     escaped text that precedes the value followed by a capturing group.
     """
     example = Example('2007', 'Volume 31, Number 7 / July, 2007')
     rules = self.ruler._rule_example(example)
     # Raw string: the backslashes belong to the expected regex. Written
     # as a plain literal they are invalid escape sequences.
     expected = r'Volume\ 31\,\ Number\ 7\ \/\ July\,\ (.*)'
     self.assertEqual(rules[0].pattern, expected)
 def test_get_invalid_content_element(self):
     """
     Looking for a value inside an empty document must yield no content
     elements.
     """
     example = Example(value='random text', content=BeautifulSoup(''))
     elements = self.ruler._get_content_elements(example.value,
                                                 example.content)
     # assertFalse replaces the deprecated failIf alias
     self.assertFalse(elements)