Пример #1
0
 def find_reference_by_id(self, id):
     """
     Load the stored reference with the given id and rebuild it as a
     Reference object.

     The id and validity of the mapped row are copied over, every mapped
     field is added through set_field, and the 'author' and 'editor'
     fields are set (flagged as valid) when the row has any authors or
     editors.

     Returns the rebuilt Reference, or None if no row matches the id.
     Raises ValueError if the id is empty.
     """
     if not id:
         raise ValueError('A reference id is required')

     log.debug('Querying the database. Reference with id %s' % str(id)) #@UndefinedVariable
     # first() yields None when nothing matches. one() raises instead, which
     # made the "not found" branch below unreachable.
     m_reference = (self.session.query(mappers.Reference).
                    filter(mappers.Reference.id == id).first())

     if not m_reference:
         return None

     log.debug('Creating new reference') #@UndefinedVariable
     reference = Reference()
     reference.id = m_reference.id
     reference.validity = m_reference.validity

     log.debug('Adding fields') #@UndefinedVariable
     for m_field in m_reference.fields:
         reference.set_field(m_field.name, m_field.value, m_field.valid)

     log.debug('Adding authors') #@UndefinedVariable
     authors = [m_author.to_name_dict() for m_author in m_reference.authors]
     if authors:
         reference.set_field(u'author', authors, True)

     log.debug('Adding editors') #@UndefinedVariable
     editors = [m_editor.to_name_dict() for m_editor in m_reference.editors]
     if editors:
         reference.set_field(u'editor', editors, True)

     return reference
Пример #2
0
 def test_validate_reference_fields(self):
     """
     Validate a reference against a raw text that contains its title but
     not its year: the title must end up flagged as valid and the year
     as invalid.
     """
     ref = Reference()
     ref.set_field('title', 'Some article title')
     ref.set_field('year', '32')
     raw_text = "Some article title and something else"
     self.iec._validate_reference_fields(ref, raw_text)
     # assertEqual keeps the original == comparison, reports both values on
     # failure and avoids the deprecated failUnless alias.
     self.assertEqual(ref.get_field('title').valid, True)
     self.assertEqual(ref.get_field('year').valid, False)
Пример #3
0
    def test_format_reference_different_format(self):
        """
        Format a reference built only from fields: the generated entry must
        start with '@article{Lmadsen99,' and the reference format must match
        the one configured in the controller under test.
        """
        ref = Reference()
        ref.set_field('reference_id', 'Lmadsen99')
        ref.set_field('title', 'Some article title')

        self.iec._format_reference(ref)

        # assertTrue/assertEqual replace the deprecated failUnless alias.
        self.assertTrue(ref.get_entry().startswith('@article{Lmadsen99,'))
        self.assertEqual(ref.get_format(), self.iec.format)
 def test_validate_incorrect_reference(self):
     """
     A reference whose title is flagged as invalid must get a validity
     below 0.5 after validation, even though its author is flagged valid.
     """
     incorrect_ref = Reference()
     incorrect_ref.set_field('title', 'some arbitrary text', False)
     incorrect_ref.set_field('author', [{
         'first_name': 'Jose-Luis',
         'last_name': 'Sancho',
         'middle_name': ''
     }], True)
     self.rv.validate(incorrect_ref)
     # assertTrue replaces the deprecated failUnless alias.
     self.assertTrue(incorrect_ref.validity < 0.5)
    def test_validate_correct_reference(self):
        """
        A reference whose author and title are both flagged as valid must
        get a validity of exactly 1.0 after validation.
        """
        correct_ref = Reference()
        correct_ref.set_field('author', [{
            'first_name': 'Jose-Luis',
            'last_name': 'Sancho',
            'middle_name': ''
        }], True)
        correct_ref.set_field('title',
                              ('Class separability estimation and '
                               'incremental learning using boundary methods'),
                              True)

        self.rv.validate(correct_ref)
        # assertEqual keeps the == comparison, reports both values on failure
        # and avoids the deprecated failUnless alias.
        self.assertEqual(correct_ref.validity, 1.0)
    def setUp(self):
        """
        Build the fixture shared by the tests: a sample article reference
        plus the formatter and the BibTeX generator.
        """
        sample_author = {
            'first_name': 'Lars',
            'last_name': 'Madsen',
            'middle_name': 'Lithen'
        }
        # Fields are set in this exact order, without a validity flag.
        sample_fields = (
            ('reference_id', 'Lmadsen99'),
            ('author', [sample_author]),
            ('title', 'Some article title'),
            ('pages', '133--144'),
            ('journal', 'Some journal'),
            ('year', '1999'),
        )

        self.ref = Reference()
        for field_name, field_value in sample_fields:
            self.ref.set_field(field_name, field_value)

        self.ref_formatter = ReferenceFormatter()
        self.format_generator = BibtexGenerator()
Пример #7
0
    def _use_reference_wrappers(self, source, page, raw_text):
        """
        Use a reference wrapper to get the reference from a given page.

        The entry extracted by the wrapper is checked against its format,
        split into single entries and parsed; every resulting reference is
        validated against raw_text.

        Returns a list of References with the full entry, format and a
        structure with the different fields. A single publication may need
        more than one reference (e.g. an inproceedings and its proceedings).
        The list is empty when nothing can be extracted, and it holds the
        references parsed so far when parsing fails halfway.
        """
        log.info('Attempting to extract reference with a reference wrapper'
                 )  #@UndefinedVariable
        references = []
        entry, entry_format = ReferenceWrapper().extract_info(source, page)
        if not entry:
            log.debug('Could not find any entry using a reference wrapper'
                      )  #@UndefinedVariable
            return references

        # Create a parser for the given reference format
        try:
            parser = self.util_factory.create_parser(entry_format)
        except UtilCreationError as e:
            log.error('Could not create a parser for %s: %s' %
                      (entry_format, e.args))  #@UndefinedVariable
            return references

        if not parser.check_format(entry):
            log.error('Given entry is not in %s' %
                      entry_format)  #@UndefinedVariable
            return references

        # There may be more than one entry for the same file.
        log.debug('Parsing extracted entries')  #@UndefinedVariable
        try:
            for single_entry in parser.split_source(entry):
                fields = parser.parse_entry(single_entry)
                reference = Reference(fields, entry_format, single_entry)
                self._validate_reference_fields(reference, raw_text)
                references.append(reference)
        except Exception as e:
            # Best effort: log the failure and keep what was parsed so far.
            log.error('Error parsing extracted entry: %s ' %
                      e)  #@UndefinedVariable
        return references
 def setUp(self):
     """Give every test a fresh, empty Reference in self.ref."""
     empty_reference = Reference()
     self.ref = empty_reference
Пример #9
0
    def _use_rule_wrappers(self, source, page, raw_text):
        """
        Look if there is any wrapper in the database for the given source
        and use it to extract the fields of a reference from the page.

        For every wrapper collection of the source, its wrappers are tried
        in order until one extracts a value accepted by the validator of
        that field (fields without a validator accept any value). Wrappers
        are upvoted or downvoted according to the validation result and
        updated through the gateway.

        Returns a list holding the single extracted Reference, or an empty
        list if the reference ends up with no fields.
        """
        log.info('Attempting to extract reference with ruled wrappers'
                 )  #@UndefinedVariable
        reference = Reference()
        wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
        wrapper_field_collections = wrapper_manager.find_wrapper_collections(
            source)

        for collection in wrapper_field_collections:
            # Get the wrappers for the current collection
            url, field = collection.url, collection.field
            wrappers = wrapper_manager.get_wrappers(url, field)
            log.debug('Collection %s:%s has %d wrappers' %
                      (url, field, len(wrappers)))  #@UndefinedVariable

            # Get field validator
            try:
                validator = self.field_validation[field][1]
            except KeyError:
                validator = None

            # Only these fields may legitimately hold a list of values
            accepts_list = field in ('author', 'editor')

            # Extract information using the wrappers we have
            for wrapper in wrappers:
                info = wrapper.extract_info(page)
                # we expect 'info' to be a string
                if isinstance(info, list) and not accepts_list:
                    continue
                log.debug('Info extracted by wrapper: %s' %
                          info)  #@UndefinedVariable

                valid = validator.validate(info,
                                           raw_text) if validator else True
                # Save the extracted info even if it's not correct. It will
                # be overwritten afterwards if necessary
                reference.set_field(field, info, valid)

                if valid:
                    log.debug('The extracted information is valid. '
                              'Upvoting wrapper')  #@UndefinedVariable
                    wrapper.upvotes += 1
                else:
                    log.debug('The extracted information is not valid. '
                              'Downvoting wrapper.')  #@UndefinedVariable
                    wrapper.downvotes += 1
                wrapper_manager.update_wrapper(wrapper)

                if valid:
                    # A valid value was found: skip the remaining wrappers
                    break

        if len(reference.fields) == 0:
            log.info('Could not extract reference using ruled wrappers'
                     )  #@UndefinedVariable
            return []

        log.info('Extracted reference')  #@UndefinedVariable
        return [reference]
Пример #10
0
        if not content:
            log.info('Empty entries file')  #@UndefinedVariable
            return references

        if not self.parser.check_format(content):
            log.error('Given entry is not in %s' % format)  #@UndefinedVariable
            return references

        # There may be more than one entry for the same file.
        log.debug('Parsing entries')  #@UndefinedVariable

        entries = self.parser.split_source(content)
        for entry in entries:
            fields = self.parser.parse_entry(entry)
            reference = Reference(fields, format, entry)
            reference.validity = 1.0
            references.append(reference)
        return references

    def persist_file_references(self, file_path):
        """
        Parses references from a file and stores them to the database
        """
        extraction_gw = ExtractionGateway()
        references = self._parse_entries_file(file_path)
        extractions = []

        # enumerate pairs every reference with its position directly
        for index, reference in enumerate(references):

            extraction = Extraction()
Пример #11
0
 def test_format_reference_same_format(self):
     """
     Formatting a reference that was created as BIBTEX with an existing
     entry must leave that entry untouched.
     """
     ref = Reference(format=ReferenceFormat.BIBTEX, entry='formatted entry')
     self.iec._format_reference(ref)
     # assertEqual keeps the == comparison, reports both values on failure
     # and avoids the deprecated failUnless alias.
     self.assertEqual(ref.get_entry(), 'formatted entry')