def perform_candidate_record_search(requestType, data):
    """Handle search requests."""
    max_results = 999
    too_many = False
    result = {'resultCode': 0,
              'resultText': ''}
    if requestType == "searchCandidates":
        recids = perform_request_search(p=data['query'])
        if len(recids) > max_results:
            too_many = True
        else:
            captions = [search_result_info(x) for x in recids]
            alternative_titles = [remove_html_markup(print_record(x, "hs"))
                                  for x in recids]
            search_results = [recids, captions, alternative_titles]
    elif requestType == "searchRevisions":
        revisions = get_record_revision_ids(data['recID1'])
        captions = [split_revid(x, 'datetext')[1] for x in revisions]
        search_results = [revisions, captions]
    if too_many:
        result['resultCode'] = 1
        result['resultText'] = 'Too many results'
    else:
        result['results'] = search_results
        result['resultText'] = '%s results' % len(search_results[0])
    return result
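# A minimal standalone sketch (not part of the handler above) of the same
# response envelope: resultCode 0 with a hit list on success, resultCode 1 and
# 'Too many results' once the candidate list exceeds max_results. fake_search
# below is a hypothetical stand-in for perform_request_search.
def fake_search(query):
    return [1, 2, 3, 4, 5]

def candidate_search_sketch(query, max_results=999):
    result = {'resultCode': 0, 'resultText': ''}
    recids = fake_search(query)
    if len(recids) > max_results:
        result['resultCode'] = 1
        result['resultText'] = 'Too many results'
    else:
        result['results'] = [recids]
        result['resultText'] = '%s results' % len(recids)
    return result

# candidate_search_sketch('title:higgs')['resultText'] -> '5 results'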
def get_as_text(record_id=0, xml_record=None, ln=CFG_SITE_LANG):
    """Return the record in a textual format"""
    _ = gettext_set_language(ln)
    out = ""
    if record_id != 0:
        rec_in_hb = format_record(record_id, of="hb")
    elif xml_record:
        rec_in_hb = format_record(0, of="hb", xml_record=xml_record)
    rec_in_hb = rec_in_hb.replace("\n", " ")
    htparser = RecordHTMLParser()
    try:
        htparser.feed(rec_in_hb)
        htparser.close()
        out = htparser.result
    except:
        out = remove_html_markup(rec_in_hb)
    # Remove trailing whitespace and linefeeds
    out = out.strip("\n").strip()
    # Merge consecutive whitespaces. Must be done here, once all HTML
    # tags have been removed
    out = whitespaces_pattern.sub(" ", out)
    # Now consider non-breakable spaces
    out = out.replace("&nbsp;", " ")
    out = re.sub(r"[\-:]?\s*%s\s*[\-:]?" % _("Detailed record"), "", out)
    out = re.sub(r"[\-:]?\s*%s\s*[\-:]?" % _("Similar records"), "", out)
    out = re.sub(r"[\-:]?\s*%s\s*[\-:]?" % _("Cited by"), "", out)
    return out.strip()
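# A self-contained sketch (standard library only) of the whitespace cleanup
# applied above once the markup is gone; whitespaces_pattern is assumed to be
# a precompiled r"\s+"-style regex, which is what this stands in for.
import re

def sketch_collapse_whitespace(text):
    text = re.sub(r"\s+", " ", text)  # merge consecutive whitespace
    return text.strip()

# sketch_collapse_whitespace("  A   record \n title  ") -> 'A record title'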
def tokenize_for_words(self, phrase):
    """Return list of words found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    words = {}
    formulas = []
    if self.remove_html_markup and phrase.find("</") > -1:
        phrase = remove_html_markup(phrase)
    if self.remove_latex_markup:
        formulas = latex_formula_re.findall(phrase)
        phrase = remove_latex_markup(phrase)
        phrase = latex_formula_re.sub(' ', phrase)
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    # 1st split phrase into blocks according to whitespace
    for block in strip_accents(phrase).split():
        # 2nd remove leading/trailing punctuation and add block:
        block = re_block_punctuation_begin.sub("", block)
        block = re_block_punctuation_end.sub("", block)
        if block:
            stemmed_block = remove_stopwords(block, self.remove_stopwords)
            stemmed_block = length_check(stemmed_block)
            stemmed_block = apply_stemming(stemmed_block, self.stemming_language)
            if stemmed_block:
                words[stemmed_block] = 1
            if re_arxiv.match(block):
                # special case for blocks like `arXiv:1007.5048' where
                # we would like to index the part after the colon
                # regardless of dot or other punctuation characters:
                words[block.split(':', 1)[1]] = 1
            # 3rd break each block into subblocks according to punctuation and add subblocks:
            for subblock in re_punctuation.split(block):
                stemmed_subblock = remove_stopwords(subblock, self.remove_stopwords)
                stemmed_subblock = length_check(stemmed_subblock)
                stemmed_subblock = apply_stemming(stemmed_subblock, self.stemming_language)
                if stemmed_subblock:
                    words[stemmed_subblock] = 1
                # 4th break each subblock into alphanumeric groups and add groups:
                for alphanumeric_group in re_separators.split(subblock):
                    stemmed_alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                    stemmed_alphanumeric_group = length_check(stemmed_alphanumeric_group)
                    stemmed_alphanumeric_group = apply_stemming(stemmed_alphanumeric_group, self.stemming_language)
                    if stemmed_alphanumeric_group:
                        words[stemmed_alphanumeric_group] = 1
    for block in formulas:
        words[block] = 1
    return words.keys()
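# A simplified, self-contained sketch of the same multi-granularity indexing
# idea (whitespace blocks, then punctuation subblocks, then alphanumeric
# groups). The regexes here are plain stand-ins for the config-driven
# re_punctuation / re_separators used above, not the actual Invenio
# definitions, and stopword removal and stemming are omitted.
import re

def sketch_tokenize_for_words(phrase):
    words = {}
    for block in phrase.lower().split():
        block = block.strip('.,;:!?')
        if not block:
            continue
        words[block] = 1
        for subblock in re.split(r"[.,;:!?'\"-]+", block):
            if subblock:
                words[subblock] = 1
                for group in re.split(r"[^0-9a-z]+", subblock):
                    if group:
                        words[group] = 1
    return sorted(words.keys())

# sketch_tokenize_for_words("anti-de Sitter space") ->
#   ['anti', 'anti-de', 'de', 'sitter', 'space']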
def tokenize_for_words(self, phrase, recid):
    """Return list of words found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    if not self.isAuthority(recid):
        return []
    words = {}
    formulas = []
    if self.remove_html_markup and phrase.find("</") > -1:
        phrase = remove_html_markup(phrase)
    if self.remove_latex_markup:
        formulas = latex_formula_re.findall(phrase)
        phrase = remove_latex_markup(phrase)
        phrase = latex_formula_re.sub(" ", phrase)
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    # 1st split phrase into blocks according to whitespace
    for block in strip_accents(phrase).split():
        # 2nd remove leading/trailing punctuation and add block:
        block = re_block_punctuation_begin.sub("", block)
        block = re_block_punctuation_end.sub("", block)
        if block:
            stemmed_block = remove_stopwords(block, self.remove_stopwords)
            stemmed_block = length_check(stemmed_block)
            stemmed_block = apply_stemming(stemmed_block, self.stemming_language)
            if stemmed_block:
                words[stemmed_block] = 1
            if re_arxiv.match(block):
                # special case for blocks like `arXiv:1007.5048' where
                # we would like to index the part after the colon
                # regardless of dot or other punctuation characters:
                words[block.split(":", 1)[1]] = 1
            # 3rd break each block into subblocks according to punctuation and add subblocks:
            for subblock in re_punctuation.split(block):
                stemmed_subblock = remove_stopwords(subblock, self.remove_stopwords)
                stemmed_subblock = length_check(stemmed_subblock)
                stemmed_subblock = apply_stemming(stemmed_subblock, self.stemming_language)
                if stemmed_subblock:
                    words[stemmed_subblock] = 1
                # 4th break each subblock into alphanumeric groups and add groups:
                for alphanumeric_group in re_separators.split(subblock):
                    stemmed_alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                    stemmed_alphanumeric_group = length_check(stemmed_alphanumeric_group)
                    stemmed_alphanumeric_group = apply_stemming(stemmed_alphanumeric_group, self.stemming_language)
                    if stemmed_alphanumeric_group:
                        words[stemmed_alphanumeric_group] = 1
    for block in formulas:
        words[block] = 1
    return words.keys()
def main():
    for journal in CFG_JOURNALS:
        name = get_coll_i18nname(journal)
        reclist = get_collection_reclist(journal)
        print "<h2>%s</h2>" % escape(name)
        if not reclist:
            print "<p>None yet.</p>"
            continue
        print "<p><ul>"
        for recid in reclist:
            record = get_record(recid)
            title = remove_html_markup(record_get_field_value(record, '245', code='a'),
                                       remove_escaped_chars_p=False).strip()
            doi = record_get_field_value(record, '024', '7', code='a')
            print '<li><a href="http://dx.doi.org/%s" target="_blank">%s</a>: %s</li>' % (
                escape(doi, True), escape(doi), title)
        print "</ul></p>"
def tokenize_for_pairs(self, phrase):
    """Return list of words found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    words = {}
    if self.remove_html_markup and phrase.find("</") > -1:
        phrase = remove_html_markup(phrase)
    if self.remove_latex_markup:
        phrase = remove_latex_markup(phrase)
        phrase = latex_formula_re.sub(' ', phrase)
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    # 1st split phrase into blocks according to whitespace
    last_word = ''
    for block in strip_accents(phrase).split():
        # 2nd remove leading/trailing punctuation and add block:
        block = re_block_punctuation_begin.sub("", block)
        block = re_block_punctuation_end.sub("", block)
        if block:
            block = remove_stopwords(block, self.remove_stopwords)
            block = length_check(block)
            block = apply_stemming(block, self.stemming_language)
            # 3rd break each block into subblocks according to punctuation and add subblocks:
            for subblock in re_punctuation.split(block):
                subblock = remove_stopwords(subblock, self.remove_stopwords)
                subblock = length_check(subblock)
                subblock = apply_stemming(subblock, self.stemming_language)
                if subblock:
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                        alphanumeric_group = length_check(alphanumeric_group)
                        alphanumeric_group = apply_stemming(alphanumeric_group, self.stemming_language)
                        if alphanumeric_group:
                            if last_word:
                                words['%s %s' % (last_word, alphanumeric_group)] = 1
                            last_word = alphanumeric_group
    return words.keys()
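# A simplified standalone sketch of the pair-indexing idea used above: after
# normalization, consecutive final-level tokens are joined into "w1 w2" keys.
# Tokenization here is plain whitespace splitting, not the config-driven
# regexes of the real tokenizer.
def sketch_tokenize_for_pairs(phrase):
    pairs = {}
    last_word = ''
    for word in phrase.lower().split():
        if last_word:
            pairs['%s %s' % (last_word, word)] = 1
        last_word = word
    return sorted(pairs.keys())

# sketch_tokenize_for_pairs("higgs boson decay") ->
#   ['boson decay', 'higgs boson']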
def tokenize_for_pairs(self, phrase, recid):
    """Return list of words found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    if not self.isAuthority(recid):
        return []
    words = {}
    if self.remove_html_markup and phrase.find("</") > -1:
        phrase = remove_html_markup(phrase)
    if self.remove_latex_markup:
        phrase = remove_latex_markup(phrase)
        phrase = latex_formula_re.sub(" ", phrase)
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    # 1st split phrase into blocks according to whitespace
    last_word = ""
    for block in strip_accents(phrase).split():
        # 2nd remove leading/trailing punctuation and add block:
        block = re_block_punctuation_begin.sub("", block)
        block = re_block_punctuation_end.sub("", block)
        if block:
            block = remove_stopwords(block, self.remove_stopwords)
            block = length_check(block)
            block = apply_stemming(block, self.stemming_language)
            # 3rd break each block into subblocks according to punctuation and add subblocks:
            for subblock in re_punctuation.split(block):
                subblock = remove_stopwords(subblock, self.remove_stopwords)
                subblock = length_check(subblock)
                subblock = apply_stemming(subblock, self.stemming_language)
                if subblock:
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                        alphanumeric_group = length_check(alphanumeric_group)
                        alphanumeric_group = apply_stemming(alphanumeric_group, self.stemming_language)
                        if alphanumeric_group:
                            if last_word:
                                words["%s %s" % (last_word, alphanumeric_group)] = 1
                            last_word = alphanumeric_group
    return words.keys()
def test_remove_html_markup_replacement(self):
    """htmlutils - remove HTML markup, some replacement"""
    test_input = 'This is <a href="test">test</a>.'
    test_expected = 'This is XtestX.'
    self.assertEqual(remove_html_markup(test_input, 'X'), test_expected)
def test_remove_html_markup_empty(self):
    """htmlutils - remove HTML markup, empty replacement"""
    test_input = 'This is <a href="test">test</a>.'
    test_expected = 'This is test.'
    self.assertEqual(remove_html_markup(test_input, ''), test_expected)
def _get_feature_text(record, language):
    """
    Looks for a text (header) that can be featured on the article overview
    page.
    """
    washer = HTMLWasher()
    header_text = ""
    # Check if there is a header
    if language == "fr":
        header = record.field('590__a')
        if header.strip() in \
               ['', '<br/>', '<!--HTML--><br />', '<!--HTML-->']:
            header = record.field('520__a')
    else:
        header = record.field('520__a')
        if header.strip() in \
               ['', '<br/>', '<!--HTML--><br />', '<!--HTML-->']:
            header = record.field('590__a')
    header = washer.wash(html_buffer=header,
                         allowed_tag_whitelist=[],
                         allowed_attribute_whitelist=[])
    if header != "":
        header_text = header
    else:
        if language == "fr":
            article = record.fields('590__b')
            if not article or \
                   (len(article) == 1 and article[0].strip() in \
                        ['', '<br />', '<!--HTML--><br />', '<!--HTML-->']):
                article = record.fields('520__b')
        else:
            article = record.fields('520__b')
            if not article or \
                   (len(article) == 1 and article[0].strip() in \
                        ['', '<br />', '<!--HTML--><br />', '<!--HTML-->']):
                article = record.fields('590__b')
        try:
            article = article[0]
        except:
            return ''
        match_obj = re.search(header_pattern, article)
        if not match_obj:
            match_obj = re.search(header_pattern2, article)
        try:
            header_text = match_obj.group("header")
            header_text = washer.wash(html_buffer=header_text,
                                      allowed_tag_whitelist=['a'],
                                      allowed_attribute_whitelist=['href',
                                                                   'target',
                                                                   'class'])
            if header_text == "":
                raise Exception
        except:
            article = article.replace(header_text, '')
            article = article.replace('<p/>', '')
            article = article.replace('<p>&nbsp;</p>', '')
            match_obj = re.search(para_pattern, article)
            try:
                # get the first paragraph
                header_text = match_obj.group("paragraph")
                try:
                    header_text = washer.wash(html_buffer=header_text,
                                              allowed_tag_whitelist=[],
                                              allowed_attribute_whitelist=[])
                except:
                    # was not able to parse correctly the HTML. Use
                    # this safer function, but producing less good
                    # results
                    header_text = remove_html_markup(header_text)
                if header_text.strip() == "":
                    raise Exception
                else:
                    if len(header_text) > 250:
                        header_text = _get_first_sentence_or_part(header_text)
            except:
                # in a last instance get the first sentence
                try:
                    article = washer.wash(article,
                                          allowed_tag_whitelist=[],
                                          allowed_attribute_whitelist=[])
                except:
                    # was not able to parse correctly the HTML. Use
                    # this safer function, but producing less good
                    # results
                    article = remove_html_markup(article)
                header_text = _get_first_sentence_or_part(article)
    return header_text
def add_basic_fields(rec, form, email):
    """
    Adds the basic fields from the form. Note that these fields are mapped
    to specific MARC fields. For information on the fields see the
    www.loc.gov website. For example
    http://www.loc.gov/marc/bibliographic/bd260.html contains information
    on field 260 for publication data.
    """
    # why aren't subfields a dictionary?!
    try:
        if form['title']:
            record_add_field(rec, '245',
                             subfields=[('a', remove_html_markup(form['title']))])
        if form['creator']:
            record_add_field(rec, '100',
                             subfields=[('a', remove_html_markup(form['creator']))])
        if form['domain']:
            record_add_field(rec, '980',
                             subfields=[('a', remove_html_markup(form['domain']))])
        pubfields = []
        if form['publisher']:
            pubfields.append(('b', remove_html_markup(form['publisher'])))
        if form.get('publication_date'):
            pubfields.append(('c', remove_html_markup(form['publication_date'])))
        if pubfields:
            record_add_field(rec, '260', subfields=pubfields)
        record_add_field(rec, '856', ind1='0', subfields=[('f', email)])
        if 'open_access' in form:
            record_add_field(rec, '542', subfields=[('l', 'open')])
        else:
            record_add_field(rec, '542', subfields=[('l', 'restricted')])
        if form['licence']:
            record_add_field(rec, '540',
                             subfields=[('a', remove_html_markup(form['licence']))])
        record_add_field(rec, '520',
                         subfields=[('a', remove_html_markup(form['description']))])
        if form['tags']:
            for kw in form['tags'].split(','):
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', remove_html_markup(kw.strip()))])
        if form['contributors']:
            for kw in form['contributors'].split(';'):
                record_add_field(rec, '700',
                                 subfields=[('a', remove_html_markup(kw.strip()))])
        if form['language']:
            record_add_field(rec, '546',
                             subfields=[('a', remove_html_markup(form['language']))])
        # copying zenodo here, but I don't think 980 is the right MARC field
        if form['resource_type']:
            record_add_field(rec, '980',
                             subfields=[('a', remove_html_markup(form['resource_type']))])
        if form['alternate_identifier']:
            record_add_field(rec, '024',
                             subfields=[('a', remove_html_markup(form['alternate_identifier']))])
        if form['version']:
            record_add_field(rec, '250',
                             subfields=[('a', remove_html_markup(form['version']))])
        record_add_field(rec, '264',
                         subfields=[('b', CFG_SITE_NAME),
                                    ('c', str(datetime.utcnow()) + " UTC")])
    except Exception as e:
        current_app.logger.error(e)
        raise
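# Hedged illustration of the form-to-MARC mapping applied above, shown on a
# plain list of (tag, subfields) tuples so it runs without the Invenio record
# helpers. sketch_basic_fields and its inputs are hypothetical; only a few of
# the form keys are included, with tags mirroring the function above.
from datetime import datetime

def sketch_basic_fields(form, email):
    rec = []
    if form.get('title'):
        rec.append(('245', [('a', form['title'])]))
    if form.get('creator'):
        rec.append(('100', [('a', form['creator'])]))
    rec.append(('856', [('f', email)]))
    rec.append(('542', [('l', 'open' if 'open_access' in form else 'restricted')]))
    rec.append(('264', [('c', str(datetime.utcnow()) + " UTC")]))
    return rec

# sketch_basic_fields({'title': 'My dataset', 'creator': 'Doe, J.',
#                      'open_access': True}, 'user@example.org')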