def changeAuthor(request):
    """
    Ajax-style handler updating the first and last name of an Author.

    Returns a response dict, optionally paired with an HTTP status code,
    including details of any paper merge triggered by the fingerprint change.
    """
    response = dict()
    try:
        author = Author.objects.get(pk=request.POST.get('pk'))

        # Sanitize the submitted name parts, when present
        first = request.POST.get('value[first]')
        last = request.POST.get('value[last]')
        if first:
            first = sanitize_html(first)
        if last:
            last = sanitize_html(last)
        if not (first and last):
            return {'message': 'First and last names are required.'}, 403

        name_changed = (author.name.first != first
                        or author.name.last != last)
        if name_changed:
            # Point the author at a (possibly new) Name record
            new_name = Name.lookup_name((first, last))
            new_name.save()
            author.name_id = new_name.pk
            author.save()
            author.paper.invalidate_cache()

        response['status'] = 'OK'
        # Falsy researcher ids are reported as False in the payload
        researcher_id = author.researcher_id or False
        response['value'] = {
            'first': first,
            'last': last,
            'researcher_id': researcher_id,
        }

        # The fingerprint might have changed and might collide with
        # another paper
        merged = author.paper.recompute_fingerprint_and_merge_if_needed()
        response['merged'] = ''
        if merged:
            response['merged'] = merged.pk
            response['merged_title'] = merged.title
        return response
    except ObjectDoesNotExist:
        return response, 404
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description from RoMEO

    :param romeo_xml_description: the <publisher> XML element from RoMEO
    :raises MetadataSourceException: if the publisher id or name is missing
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        # NOTE(review): 'request' is not a parameter of this function --
        # presumably a module-level variable holding the query URL; confirm.
        raise MetadataSourceException(
                'RoMEO did not provide a publisher id.\n' +
                'URL was: ' + request)

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
                'RoMEO did not provide the publisher\'s name.\n' +
                'URL was: ' + request)

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    # BUG FIX: "except KeyError, IndexError:" is legacy Python 2 syntax that
    # catches only KeyError (binding it to the name IndexError); a missing
    # <alias> element raises IndexError, which escaped. Catch both types
    # with a tuple, as done for the <name> handler above.
    except (KeyError, IndexError):
        pass
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description from RoMEO

    :param romeo_xml_description: the <publisher> XML element from RoMEO
    :raises MetadataSourceException: if the publisher id or name is missing
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        # NOTE(review): 'request' is not a parameter of this function --
        # presumably a module-level variable holding the query URL; confirm.
        raise MetadataSourceException('RoMEO did not provide a publisher id.\n'+
                'URL was: '+request)
    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException('RoMEO did not provide the publisher\'s name.\n'+
                'URL was: '+request)
    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    # BUG FIX: "except KeyError, IndexError:" is legacy Python 2 syntax that
    # catches only KeyError (binding it to the name IndexError); a missing
    # <alias> element raises IndexError, which escaped. Catch both types
    # with a tuple, as done for the <name> handler above.
    except (KeyError, IndexError):
        pass
def create(cls, first, last):
    """
    Builds a Name instance without saving it to the database.
    Useful for name lookups where we are not sure we want to keep
    the name in the model.
    """
    name = cls()
    # Truncate, trim and sanitize both parts before storing them
    name.first = sanitize_html(first[:MAX_NAME_LENGTH].strip())
    name.last = sanitize_html(last[:MAX_NAME_LENGTH].strip())
    # The 'full' field is the unaccented concatenation of both parts
    name.full = iunaccent(' '.join([name.first, name.last]))
    return name
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description from RoMEO

    :param romeo_xml_description: the <publisher> XML element from RoMEO
    :raises MetadataSourceException: if the publisher id or name is missing
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.')

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.')

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass

    # Check if we already have it
    matches = None
    if alias:
        matches = Publisher.objects.filter(romeo_id=romeo_id,
                                           name__iexact=name,
                                           alias__iexact=alias)
    else:
        matches = Publisher.objects.filter(romeo_id=romeo_id,
                                           name__iexact=name,
                                           alias__isnull=True)
    if matches:
        return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    # BUG FIX: "except KeyError, IndexError:" is legacy Python 2 syntax that
    # catches only KeyError (binding it to the name IndexError); a missing
    # <homeurl> element raises IndexError, which escaped. Use the tuple
    # form, as the other handlers in this function already do.
    except (KeyError, IndexError):
        pass
def process_ajax_change(request, model, allowedFields):
    """
    Updates one whitelisted field of a model instance from an ajax
    POST request, sanitizing string values before saving.

    Returns a response dict, optionally paired with an HTTP status code.
    """
    response = dict()
    try:
        instance = model.objects.get(pk=request.POST.get('pk'))
        field = request.POST.get('name')
        if field not in allowedFields:
            # Disallowed fields are handled like a missing object
            raise ObjectDoesNotExist
        val = request.POST.get('value')
        # TODO check that 'value' is actually present
        if type(val) == type(''):
            val = sanitize_html(val)
        setattr(instance, field, val)
        instance.save(update_fields=[field])
        if hasattr(instance, "invalidate_cache"):
            instance.invalidate_cache()
        if model == Paper:
            # The edit may have changed the fingerprint, possibly
            # merging this paper into another one
            merged = instance.recompute_fingerprint_and_merge_if_needed()
            response['merged'] = merged.pk if merged else ''
            if merged:
                response['merged_title'] = merged.title
        response['status'] = 'OK'
        response['value'] = val
        return response
    except ObjectDoesNotExist:
        return response, 404
def _get_abstract(data):
    """
    Returns the sanitized abstract from the record,
    or an empty string when no abstract is present.
    """
    return sanitize_html(data.get('abstract', ''))
def process_ajax_change(request, model, allowedFields):
    """
    General function used to change a CharField in a model with ajax

    :param request: the incoming request; 'pk', 'name' and 'value' are
                    read from request.POST
    :param model: the model class the instance belongs to
    :param allowedFields: whitelist of field names that may be edited
    :returns: a response dict, or (dict, 404) when the object is missing
              or the field is not allowed
    """
    response = dict()
    try:
        instance = model.objects.get(pk=request.POST.get('pk'))
        field = request.POST.get('name')
        if field in allowedFields:
            val = request.POST.get('value')
            # TODO check that 'value' is actually present
            if type(val) == type(''):
                # NOTE(review): only plain-str values are sanitized here;
                # under Python 2 a unicode value would bypass sanitizing --
                # confirm whether that is intended.
                val = sanitize_html(val)
            setattr(instance, field, val)
            instance.save(update_fields=[field])
            if hasattr(instance, "invalidate_cache"):
                instance.invalidate_cache()
            if model == Paper:
                # The edit may have changed the fingerprint, possibly
                # merging this paper into another one
                merged = instance.recompute_fingerprint_and_merge_if_needed()
                response['merged'] = ''
                if merged:
                    response['merged'] = merged.pk
                    response['merged_title'] = merged.title
            response['status'] = 'OK'
            response['value'] = val
            return response
        else:
            # Disallowed fields are handled like a missing object
            raise ObjectDoesNotExist
    except ObjectDoesNotExist:
        return response, 404
def cleanup_abstracts():
    """
    Run HTML sanitizing on the abstracts
    (this is normally done on creation of the papers,
    but not for old dumps of the database)
    """
    def resanitize(instance, field):
        # Re-sanitize one text field, saving only when it actually changed
        old_text = getattr(instance, field)
        if old_text:
            new_text = sanitize_html(old_text)
            if new_text != old_text:
                setattr(instance, field, new_text)
                instance.save()

    for publication in Publication.objects.all():
        resanitize(publication, 'abstract')
    for record in OaiRecord.objects.all():
        resanitize(record, 'description')
def cleanup_titles():
    """
    Run HTML sanitizing on all the titles of the papers
    (this is normally done on creation of the papers,
    but not for old dumps of the database)
    """
    for p in Paper.objects.all():
        new_title = sanitize_html(p.title)
        # Only hit the database when sanitizing actually changed the
        # title, avoiding one write per paper on already-clean dumps.
        if new_title != p.title:
            p.title = new_title
            p.save(update_fields=['title'])
def create(cls, title, author_names, pubdate, visible=True, affiliations=None, orcids=None):
    """
    Creates a (bare) paper. To save it to the database, we
    need to run the clustering algorithm to resolve Researchers for the authors,
    using `from_bare` from the (non-bare) :class:`Paper` subclass..

    :param title: The title of the paper (as a string). If it is too long for the database,
                  ValueError is raised.
    :param author_names: The ordered list of author names, as Name objects.
    :param pubdate: The publication date, as a python date object
    :param visible: The visibility of the paper if it is created. If another paper
                    exists, the visibility will be set to the maximum of the two possible
                    visibilities.
    :param affiliations: A list of (possibly None) affiliations for the authors. It has
                         to have the same length as the list of author names.
    :param orcids: same as affiliations, but for ORCID ids.
    """
    # Validate all arguments before building anything
    if not title or not author_names or not pubdate:
        raise ValueError(
            "A title, pubdate and authors have to be provided to create a paper.")

    if affiliations is not None and len(author_names) != len(affiliations):
        raise ValueError(
            "The number of affiliations and authors have to be equal.")
    if orcids is not None and len(author_names) != len(orcids):
        raise ValueError(
            "The number of ORCIDs (or Nones) and authors have to be equal.")
    if not isinstance(visible, bool):
        raise ValueError("Invalid paper visibility: %s" % str(visible))

    # Clean up the title before storing it
    title = sanitize_html(title)
    title = maybe_recapitalize_title(title)

    p = cls()
    p.title = title
    p.pubdate = pubdate  # pubdate will be checked in fingerprint computation
    p.visible = visible
    for idx, n in enumerate(author_names):
        a = BareAuthor()
        a.name = n
        if affiliations is not None:
            a.affiliation = affiliations[idx]
        if orcids is not None:
            # Invalid ORCIDs are silently dropped
            orcid = validate_orcid(orcids[idx])
            if orcid:
                a.orcid = orcid
        p.add_author(a, position=idx)

    # The fingerprint is computed only once all authors are attached
    p.fingerprint = p.new_fingerprint()

    return p
def changeAuthor(request):
    """
    Ajax-style handler updating the first and last name of an Author.
    Returns a response dict, optionally paired with an HTTP status code.
    """
    response = dict()
    try:
        author = Author.objects.get(pk=request.POST.get('pk'))
        # Sanitize the submitted name parts before using them
        first = request.POST.get('value[first]')
        if first:
            first = sanitize_html(first)
        last = request.POST.get('value[last]')
        if last:
            last = sanitize_html(last)
        if not first or not last:
            return {'message': 'First and last names are required.'}, 403
        if author.name.first != first or author.name.last != last:
            # Point the author at a (possibly new) Name record
            new_name = Name.lookup_name((first, last))
            new_name.save()
            author.name_id = new_name.pk
            author.save()
            author.paper.invalidate_cache()
        response['status'] = 'OK'
        researcher_id = author.researcher_id
        if not researcher_id:
            # Falsy researcher ids are reported as False in the payload
            researcher_id = False
        response['value'] = {
            'first': first,
            'last': last,
            'researcher_id': researcher_id
        }
        # The fingerprint might have changed and might collide with another paper
        merged = author.paper.recompute_fingerprint_and_merge_if_needed()
        response['merged'] = ''
        if merged:
            response['merged'] = merged.pk
            response['merged_title'] = merged.title
        return response
    except ObjectDoesNotExist:
        return response, 404
def add_oai_record(self, header, metadata, paper):
    """
    Add a record (from OAI-PMH) to the given paper

    :param header: the OAI-PMH record header (provides the identifier)
    :param metadata: mapping of OAI metadata fields to lists of values
    :param paper: the paper the resulting BareOaiRecord is attached to
    """
    identifier = header.identifier()
    # description in oai_dc means abstract: keep the longest one
    curdesc = ""
    for desc in metadata['description']:
        if len(desc) > len(curdesc):
            curdesc = desc
    curdesc = sanitize_html(curdesc)
    # Run extractor to find the URLs
    splash_url, pdf_url = self.extract_urls(header, metadata, self.oaisource.identifier)
    keywords = ' | '.join(metadata['subject'])
    contributors = ' '.join(metadata['contributor'])[:4096]
    typenorms = ['typenorm:' + tn for tn in metadata.get('typenorm', [])]
    # Plain 'type' values take precedence over 'typenorm:'-prefixed ones;
    # the first value with a known translation wins.
    pubtype_list = metadata.get('type', []) + typenorms
    pubtype = None
    for raw_pubtype in pubtype_list:
        pubtype = OAI_PUBTYPE_TRANSLATIONS.get(raw_pubtype)
        if pubtype is not None:
            break
    if pubtype is None:
        pubtype = self.oaisource.default_pubtype
    # Find the DOI, if any (the first one found is kept)
    doi = None
    for url in metadata['identifier'] + metadata['relation'] + metadata[
            'source']:
        if not doi:
            doi = to_doi(url)
    record = BareOaiRecord(source=self.oaisource,
                           identifier=identifier,
                           description=curdesc,
                           keywords=keywords,
                           contributors=contributors,
                           pubtype=pubtype,
                           pdf_url=pdf_url,
                           splash_url=splash_url,
                           doi=doi)
    paper.add_oairecord(record)
def add_oai_record(self, header, metadata, paper):
    """
    Add a record (from OAI-PMH) to the given paper
    """
    identifier = header.identifier()

    # In oai_dc, 'description' holds the abstract: keep the longest one
    descriptions = metadata['description']
    curdesc = max(descriptions, key=len) if descriptions else ""
    curdesc = sanitize_html(curdesc)

    # Run extractor to find the URLs
    splash_url, pdf_url = self.extract_urls(
        header, metadata, self.oaisource.identifier)

    keywords = ' | '.join(metadata['subject'])
    contributors = ' '.join(metadata['contributor'])[:4096]

    # Translate the publication type: plain 'type' values take precedence
    # over 'typenorm:'-prefixed ones; first known translation wins,
    # falling back on the source's default.
    typenorms = ['typenorm:'+tn for tn in metadata.get('typenorm', [])]
    pubtype = None
    for raw_pubtype in metadata.get('type', []) + typenorms:
        pubtype = OAI_PUBTYPE_TRANSLATIONS.get(raw_pubtype)
        if pubtype is not None:
            break
    if pubtype is None:
        pubtype = self.oaisource.default_pubtype

    # Find the DOI, if any (the first one found is kept)
    doi = None
    for url in metadata['identifier']+metadata['relation']+metadata['source']:
        doi = doi or to_doi(url)

    paper.add_oairecord(BareOaiRecord(
        source=self.oaisource,
        identifier=identifier,
        description=curdesc,
        keywords=keywords,
        contributors=contributors,
        pubtype=pubtype,
        pdf_url=pdf_url,
        splash_url=splash_url,
        doi=doi))
def consolidate_publication(publi):
    """
    Fetches the abstract from Zotero and adds it to the publication
    if it succeeds.
    """
    items = fetch_zotero_by_DOI(publi.doi)
    if items is None:
        return publi
    for item in items:
        # Store the abstract when Zotero provides one
        try:
            publi.description = sanitize_html(item['abstractNote'])
        except KeyError:
            pass
        else:
            publi.save(update_fields=['description'])
        # Pick up any PDF attachment URL
        for attachment in item.get('attachments', []):
            if attachment.get('mimeType') == 'application/pdf':
                publi.pdf_url = attachment.get('url')
                publi.save(update_fields=['pdf_url'])
                publi.about.update_availability()
    return publi
def consolidate_publication(publi):
    """
    Fetches the abstract from Zotero and adds it to the publication
    if it succeeds.
    """
    items = fetch_zotero_by_DOI(publi.doi)
    if items is None:
        return publi
    for item in items:
        # Store the abstract when Zotero provides one
        if 'abstractNote' in item:
            publi.abstract = sanitize_html(item['abstractNote'])
            publi.save(update_fields=['abstract'])
        # Pick up any PDF attachment URL
        attachments = item.get('attachments', [])
        pdf_attachments = [a for a in attachments
                           if a.get('mimeType') == 'application/pdf']
        for attachment in pdf_attachments:
            publi.pdf_url = attachment.get('url')
            publi.save(update_fields=['pdf_url'])
            publi.paper.update_availability()
    return publi
def add_oai_record(record, source, paper):
    """
    Add a record (from OAI-PMH) to the given paper

    :param record: pair where record[0] is the OAI header and record[1]
                   exposes the metadata through its _map attribute
    :param source: the source the record comes from (provides identifier,
                   name and default_pubtype)
    :param paper: the paper the resulting BareOaiRecord is attached to
    """
    header = record[0]
    identifier = header.identifier()
    # A description is useful: keep the longest one available
    curdesc = ""
    for desc in record[1]._map['description']:
        if len(desc) > len(curdesc):
            curdesc = desc
    curdesc = sanitize_html(curdesc)
    # Run extractor to find the URLs
    pdf_url = None
    splash_url = None
    if source.identifier:
        try:
            extractor = REGISTERED_EXTRACTORS[source.identifier]
            urls = extractor.extract(record)
            pdf_url = urls.get('pdf')
            splash_url = urls.get('splash')
        except KeyError:
            # No extractor registered for this source: warn and continue
            # without URLs
            print "Warning, invalid extractor for source "+source.name
    keywords = ' '.join(record[1]._map['subject'])
    contributors = ' '.join(record[1]._map['contributor'])[:4096]
    # Translate the first declared type, falling back on the source default
    pubtype_list = record[1]._map.get('type')
    pubtype = None
    if len(pubtype_list) > 0:
        pubtype = pubtype_list[0]
    #pubtype = source.default_pubtype
    pubtype = PUBTYPE_TRANSLATIONS.get(pubtype, source.default_pubtype)
    record = BareOaiRecord(
            source=source,
            identifier=identifier,
            description=curdesc,
            keywords=keywords,
            contributors=contributors,
            pubtype=pubtype,
            pdf_url=pdf_url,
            splash_url=splash_url)
    paper.add_oairecord(record)
def add_oai_record(record, source, paper): """ Add a record (from OAI-PMH) to the given paper """ header = record[0] identifier = header.identifier() # A description is useful curdesc = "" for desc in record[1]._map['description']: if len(desc) > len(curdesc): curdesc = desc curdesc = sanitize_html(curdesc) # Run extractor to find the URLs pdf_url = None splash_url = None if source.identifier: try: extractor = REGISTERED_EXTRACTORS[source.identifier] urls = extractor.extract(record) pdf_url = urls.get('pdf') splash_url = urls.get('splash') except KeyError: print "Warning, invalid extractor for source " + source.name keywords = ' '.join(record[1]._map['subject']) contributors = ' '.join(record[1]._map['contributor'])[:4096] pubtype_list = record[1]._map.get('type') pubtype = None if len(pubtype_list) > 0: pubtype = pubtype_list[0] #pubtype = source.default_pubtype pubtype = PUBTYPE_TRANSLATIONS.get(pubtype, source.default_pubtype) record = BareOaiRecord(source=source, identifier=identifier, description=curdesc, keywords=keywords, contributors=contributors, pubtype=pubtype, pdf_url=pdf_url, splash_url=splash_url) paper.add_oairecord(record)
def test_sanitize_html(self):
    # (input, expected output) pairs, checked in order
    cases = [
        ('My title<sub>is</sub><a href="http://dissem.in"><sup>nice</sup></a>',
         'My title<sub>is</sub><sup>nice</sup>'),
        ('$\\alpha$-conversion', '$\u03b1$-conversion'),
        ('$$\\eta + \\omega$$', '$\u03b7 + \u03c9$'),
        ('abc & def', 'abc & def'),
        ('Universitat Aut\\uFFFDnoma de Barcelona',
         'Universitat Aut�noma de Barcelona'),
    ]
    for raw, expected in cases:
        self.assertEqual(sanitize_html(raw), expected)
def get_or_create_publisher(self, romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description from RoMEO.
    If the data from RoMEO is more fresh than what we have in cache,
    we update our model.

    :param romeo_xml_description: the <publisher> XML element from RoMEO
    :returns: the up-to-date Publisher instance
    :raises MetadataSourceException: if the publisher id, name or one of
            the archiving policies is missing from the RoMEO data
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.')
    # The parent id is optional
    romeo_parent_id = None
    try:
        romeo_parent_id = xml.attrib['parentid']
    except KeyError:
        pass
    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.')
    # The alias is optional too
    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass

    last_update = self._get_romeo_date(xml, './dateupdated')

    # Check if we already have it.
    # Sadly the romeo_id is not unique (as publishers imported from doaj
    # all get the same id, so we have to use the name too).
    matches = None
    if re.match(r'\d+', romeo_id):  # numeric ids are unambiguous
        matches = Publisher.objects.filter(romeo_id=romeo_id)
    elif alias:
        matches = Publisher.objects.filter(
            romeo_id=romeo_id, name__iexact=name, alias__iexact=alias)
    else:
        matches = Publisher.objects.filter(
            romeo_id=romeo_id, name__iexact=name, alias__isnull=True)
    if matches:
        first_match = matches[0]
        # Keep the cached copy when it is at least as fresh as RoMEO's
        if first_match.last_updated is not None and first_match.last_updated >= last_update:
            return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    except (KeyError, IndexError):
        pass

    preprint = None
    try:
        preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the preprint policy.')

    postprint = None
    try:
        postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the postprint policy.')

    pdfversion = None
    try:
        pdfversion = xml.findall('./pdfversion/pdfarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the pdf archiving policy.')

    # Compute OA status of the publisher
    # (set to unknown for now; classified properly after the conditions
    # have been stored below)
    status = 'UNK'

    if not matches:
        publisher = Publisher()
    else:
        publisher = matches[0]
    publisher.name = name
    publisher.alias = alias
    publisher.url = url
    publisher.preprint = preprint
    publisher.postprint = postprint
    publisher.pdfversion = pdfversion
    publisher.romeo_id = romeo_id
    publisher.romeo_parent_id = romeo_parent_id
    publisher.oa_status = status
    publisher.last_updated = last_update
    publisher.save()

    # When updating an existing publisher, drop its old links,
    # restrictions and conditions before re-importing them
    if matches:
        publisher.publishercopyrightlink_set.all().delete()
        publisher.publisherrestrictiondetail_set.all().delete()
        publisher.publishercondition_set.all().delete()

    # Add the conditions, restrictions, and copyright
    for restriction in xml.findall('./preprints/prerestrictions/prerestriction'):
        self.add_restriction(restriction, 'preprint', publisher)

    for restriction in xml.findall('./postprints/postrestrictions/postrestriction'):
        self.add_restriction(restriction, 'postprint', publisher)

    for restriction in xml.findall('./pdfversion/pdfrestrictions/pdfrestriction'):
        self.add_restriction(restriction, 'pdfversion', publisher)

    for condition in xml.findall('./conditions/condition'):
        if condition.text:
            c = PublisherCondition(publisher=publisher,
                                   text=condition.text.strip())
            c.save()

    # Update the publisher status (now that the conditions are stored)
    publisher.oa_status = publisher.classify_oa_status()
    publisher.save(update_fields=['oa_status'])
    # TODO: if the OA status has changed, then we should update the journals and papers accordingly with the
    # adequate task

    for link in xml.findall('./copyrightlinks/copyrightlink'):
        text = None
        url = None
        texts = link.findall('./copyrightlinktext')
        if texts:
            text = nstrip(texts[0].text)
        urls = link.findall('./copyrightlinkurl')
        if urls:
            url = nstrip(urls[0].text)
        if url and text:
            cplink = PublisherCopyrightLink(
                text=text, url=url[:1024], publisher=publisher)
            cplink.save()
    return publisher