def linkify_soup(soup, new_tag):
    assert hasattr(soup, 'contents')
    tags = set()
    old_elements = [e for e in soup.contents]
    for element in old_elements:
        if not isinstance(element, NavigableString):
            tags = tags.union(linkify_soup(element, new_tag))
            continue
        segments = tag_re.split(element)
        if len(segments) <= 1:
            continue
        insertion_target = element
        for segment in segments:
            if len(segment) > 0:
                if tag_re.match(segment) is None:
                    new_e = NavigableString(segment)
                else:
                    new_e = new_tag("a", href='tag/{}.html'.format(segment))
                    new_e.string = segment
                    tags.add(segment[1:])
                insertion_target.insert_after(new_e)
                insertion_target = new_e
        element.extract()
    return tags

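# A minimal usage sketch for linkify_soup. tag_re is not defined in the
# snippet above, so the hashtag pattern below is an illustrative assumption:
import re
from bs4 import BeautifulSoup, NavigableString

tag_re = re.compile(r'(#\w+)')

soup = BeautifulSoup('<p>hello #world and #python</p>', 'html.parser')
found = linkify_soup(soup, soup.new_tag)
print(sorted(found))  # ['python', 'world']
print(soup)
# <p>hello <a href="tag/#world.html">#world</a> and <a href="tag/#python.html">#python</a></p>
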
def scrapeError(url: str, elem: str, attr: Tuple[str, str],
                field_err: List[str], auth: bool = False) -> str:
    """Scrapes input error from argument: url"""
    if auth:
        token = b64decode(session["token"])
        token = str(token, encoding="utf-8")
        web_page = requests.request(
            'GET', url,
            headers={'User-Agent': f"{request.user_agent}"},
            params={"token": token},
            allow_redirects=False)
    else:
        web_page = requests.request(
            'GET', url,
            headers={'User-Agent': f"{request.user_agent}"},
            allow_redirects=False)
    # Name the parser explicitly so bs4 does not warn and guess
    soup = BeautifulSoup(web_page.content, "html.parser")
    elem_tag = soup.find_all(elem, {attr[0]: attr[1]})
    for i in elem_tag:
        for err in field_err:
            i.insert(0, NavigableString(f"- {err}\n"))
        error = i
    return error

def test_ins5(self):
    self.soup.div.insert(0, NavigableString("Not a BeautifulSoup object"))
    self.assertEqual(
        str(self.soup.div),
        '<div class="second">Not a BeautifulSoup object'
        'Second element</div>')

def add_link_markup(tags):
    for tag in tags:
        added_icon = False
        if not tag.attrs.get('class', None):
            tag.attrs.update({'class': []})
        if tag['href'].startswith('/external-site/?'):
            components = urlparse(tag['href'])
            arguments = parse_qs(components.query)
            if 'ext_url' in arguments:
                external_url = arguments['ext_url'][0]
                tag['href'] = signed_redirect(external_url)
        elif NONCFPB_LINK_PATTERN.match(tag['href']):
            # Sets the icon to indicate you're leaving consumerfinance.gov
            tag.attrs['class'].append(EXTERNAL_A_CSS)
            if EXTERNAL_LINK_PATTERN.match(tag['href']):
                tag['href'] = signed_redirect(tag['href'])
            added_icon = True
        elif DOWNLOAD_LINK_PATTERN.search(tag['href']):
            # Sets the icon to indicate you're downloading a file
            tag.attrs['class'].append(DOWNLOAD_A_CSS)
            added_icon = True
        if added_icon:
            # Wraps the link text in a span that provides the underline
            contents = tag.contents
            span = BeautifulSoup('', 'html.parser').new_tag('span')
            span['class'] = EXTERNAL_SPAN_CSS
            span.contents = contents
            tag.contents = [span, NavigableString(' ')]
        elif not FILES_LINK_PATTERN.match(tag['href']):
            fix_link(tag)

def generate_stats_with_values(
        stats_with_values: List[cards.StatWithValue]) -> Iterator[Tag]:
    # This is a generator, so the return annotation is Iterator[Tag]
    # (the flattened original said -> Tag, which is wrong for a yield)
    for stat_with_value in stats_with_values:
        swv_section = Tag(name="section")
        swv_section['class'] = "stat-with-value"
        swv_stat = Tag(name="label")
        swv_section.append(swv_stat)
        swv_stat.append(NavigableString(stat_with_value.stat))
        swv_value = Tag(name="data")
        swv_section.append(swv_value)
        swv_value['value'] = stat_with_value.value
        swv_value.append(NavigableString(stat_with_value.value))
        if stat_with_value.unit is not None:
            small = Tag(name="small")
            swv_value.append(small)
            small.append(NavigableString(stat_with_value.unit))
        yield swv_section

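# Hypothetical smoke test for generate_stats_with_values. cards.StatWithValue
# is assumed to be a simple record with stat/value/unit fields; the stand-in
# dataclass below is illustrative, not the project's real class:
from dataclasses import dataclass
from typing import Optional

@dataclass
class StatWithValue:
    stat: str
    value: str
    unit: Optional[str] = None

for section in generate_stats_with_values([StatWithValue("Health", "10", "HP")]):
    print(section)
# <section class="stat-with-value"><label>Health</label><data value="10">10<small>HP</small></data></section>
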
def _recursive_replace(self, tag):
    if hasattr(tag, "contents"):  # noqa: WPS421 (builtin function call, special cases only)
        for index, child in enumerate(tag.contents):
            if child.name == "code":
                tag.contents[index] = NavigableString(self.store(str(child)))
            else:
                self._recursive_replace(child)

def insert_paragraph_breaks(soup):
    """Identify <br> and <hr> and split their parent element into multiple
    elements where appropriate."""
    # Indicator which is used as a placeholder to mark paragraph breaks
    BREAK_INDICATOR = "|BREAK_HERE|"

    # Find consecutive <br> elements and replace with a break marker
    for element in soup.find_all('br'):
        # When the next element is not another <br> count how long the chain is
        if (element.next_sibling is None) or (element.next_sibling.name != 'br'):
            br_element_chain = [element]
            while (br_element_chain[-1].previous_sibling is not None) and (
                    br_element_chain[-1].previous_sibling.name == 'br'):
                br_element_chain.append(br_element_chain[-1].previous_sibling)
            # If there's only one <br> then we replace it with a space
            if len(br_element_chain) == 1:
                br_element_chain[0].replace_with(' ')
            # If there are multiple <br>s then replace them with BREAK_INDICATOR
            else:
                br_element_chain[0].replace_with(BREAK_INDICATOR)
                for inner_element in br_element_chain[1:]:
                    inner_element.decompose()

    # Find consecutive <hr> elements and replace with a break marker
    # Use a list rather than the generator, since we are altering the tree as we traverse it
    for element in list(soup.find_all('hr')):
        element.replace_with(BREAK_INDICATOR)

    # Consolidate the text again now that we have added strings to the tree
    consolidate_text(soup)

    # Iterate through the tree, splitting string elements which contain BREAK_INDICATOR
    # Use a list rather than the generator, since we are altering the tree as we traverse it
    for element in list(soup.find_all(string=True)):
        if BREAK_INDICATOR in element:
            # Split the text into two or more fragments (there may be multiple BREAK_INDICATORs in the string)
            text_fragments = [s.strip() for s in str(element).split(BREAK_INDICATOR)]

            # Get the parent element
            parent_element = element.parent

            # If the parent is a paragraph then we want to close and reopen by creating a new tag
            if parent_element.name == "p":
                # Iterate in reverse order as we are repeatedly adding new elements directly after the original one
                for text_fragment in text_fragments[:0:-1]:
                    new_p_element = soup.new_tag("p")
                    new_p_element.string = text_fragment
                    parent_element.insert_after(new_p_element)
                # Replace this element by a navigable string containing the first text fragment
                element.replace_with(NavigableString(text_fragments[0]))
            # Otherwise we want to simply include all the text fragments as independent NavigableStrings (that will be wrapped later)
            else:
                # Iterate in reverse order as we are repeatedly adding new elements directly after the original one
                for text_fragment in text_fragments[:0:-1]:
                    element.insert_after(soup.new_string(text_fragment))
                element.string.replace_with(text_fragments[0])

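# Quick check for insert_paragraph_breaks. consolidate_text() lives elsewhere
# in the source module; bs4's built-in smooth(), which merges adjacent
# NavigableStrings, is assumed here as a reasonable stand-in:
from bs4 import BeautifulSoup

def consolidate_text(soup):
    soup.smooth()

soup = BeautifulSoup('<p>one<br/><br/>two</p><p>three<br/>four</p>', 'lxml')
insert_paragraph_breaks(soup)
print(soup.body)
# <body><p>one</p><p>two</p><p>three four</p></body>
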
def construct_fields_for_each_file(files, soup):
    for file in files:
        fields = etree.SubElement(file, 'fields')
        table = soup.find(attrs={'data-text': file[0].text}).findNext('table')
        tags = [
            format_tag(tag.text)
            for tag in table.find('thead').find('tr').find_all('th', recursive=False)
        ]
        for row in table.find('tbody').find_all('tr', recursive=False):
            field = etree.SubElement(fields, 'field')
            for tag, value in zip(tags, row.find_all('td', recursive=False)):
                # rename field_name
                if tag == 'field_name':
                    attribute = etree.SubElement(field, 'identifier')
                else:
                    attribute = etree.SubElement(field, tag)
                for unwanted_table in value.find_all('table'):
                    unwanted_div = soup.new_tag('div')
                    unwanted_div.append(' (table has been removed) ')
                    unwanted_table.insert_after(unwanted_div)
                    unwanted_table.decompose()
                for li in value.find_all('li'):
                    li.insert(0, NavigableString(' • '))
                attribute.text = format(value.get_text(separator=' '))
        # add names to each field
        for field in fields:
            attribute = etree.Element('name')
            attribute.text = format_name(field[0].text)
            field.insert(1, attribute)
    return files

def extract_text_from_is_html(html: Union[str, TextIO]) -> str:
    """Extract article text from Ilta-Sanomat article HTML

    Args:
        html (Union[str,TextIO]): a string or a file-like object containing
            the article HTML

    Raises:
        ValueError: The layout of the article was not recognized, or the
            article parsed as empty

    Returns:
        str: article text
    """
    soup = BeautifulSoup(html, 'lxml')
    elem = soup.select_one(
        'article.single-article,article.article--m,article.article--l,article.article--xl-picture-top,article.article--xl-title-top'
    )
    if elem is None:
        raise ValueError("Article layout not recognized")
    for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']:
        for block_elem in elem.find_all(tag):
            block_elem.insert_after(NavigableString('\n\n'))
    txt = elem.get_text().strip()
    if txt == "":
        raise ValueError("Parsing results in an empty article")
    return txt

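# Minimal smoke test; the CSS selectors target real Ilta-Sanomat layouts, so
# this toy document just mimics one of them:
toy = '<article class="single-article"><p>First.</p><p>Second.</p></article>'
print(extract_text_from_is_html(toy))
# First.
#
# Second.
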
def clean_tags(tag, post):
    if NavigableString == type(tag):
        tag.string.replace_with(escape(tag))
        return tag
    # Recurse into the children first; otherwise this node changes shape
    # after unwrap. Copy the list, since recursion may mutate tag.contents.
    for child in list(tag.contents):
        clean_tags(child, post)
    if 'img' == tag.name:
        if 'src' in tag.attrs:
            post['images'].append(fix_url(tag.attrs['src']))
        if 'title' in tag.attrs:
            title = tag.attrs['title']
            tag.insert_before(NavigableString('[' + title + ']'))
    if tag.name not in ['b', 'i', 'a', 'code', 'pre']:
        tag.unwrap()
    if tag.name == 'a':
        allowed_attrs = {}
        for key in tag.attrs:
            if key in KEEP_ATTRIBUTES:
                allowed_attrs[key] = tag.attrs[key]
        tag.attrs = allowed_attrs
    return tag

def urlize(data):
    """Urlize plain text links in the HTML contents.

    Do not urlize content of A and CODE tags.
    """
    soup = BeautifulSoup(data, 'lxml')
    for found_string in soup.find_all(string=exclude_code_tag):
        new_content = []
        strings_or_tags = found_string.parent.contents
        for string_or_tag in strings_or_tags:
            try:
                for string in PLAIN_LINK_RE.split(string_or_tag):
                    if string.startswith('http'):
                        # Apply an a-Tag
                        tag = soup.new_tag('a')
                        tag['href'] = string
                        tag.string = string
                        tag['nofollow'] = 'true'
                        new_content.append(tag)
                    else:
                        # This is just a string, apply a bs4-string
                        new_content.append(NavigableString(string))
            except TypeError:
                # Not splittable (e.g. a Tag), so keep it as-is
                new_content.append(string_or_tag)
        # Apply the new content
        found_string.parent.contents = new_content
    return str(soup)

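# urlize() leans on two module-level names not shown above; plausible
# definitions (assumptions, not the project's originals) and a demo:
import re
from bs4 import BeautifulSoup, NavigableString

PLAIN_LINK_RE = re.compile(r'(https?://\S+)')

def exclude_code_tag(s):
    # Skip text that already sits inside an <a> or <code> tag
    return s.parent.name not in ('a', 'code')

print(urlize('<p>see https://example.com for details</p>'))
# lxml wraps the fragment, so expect roughly:
# <html><body><p>see <a href="https://example.com" nofollow="true">https://example.com</a> for details</p></body></html>
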
def checklistInSoupToENML(soup):
    '''
    Transforms github style checklists `* [ ]` in the BeautifulSoup tree
    to ENML.
    '''
    checktodo_re = re.compile(r'\[([ x])\]')

    # To be more github compatible, if all elements in a list begin with '[ ]',
    # convert them to en-todo evernote elements
    for ul in soup.find_all('ul'):
        tasks = []
        istodo = True
        for li in ul.find_all('li'):
            task = soup.new_tag('div')
            todo_tag = soup.new_tag('en-todo')
            reg = checktodo_re.match(li.get_text())
            istodo = istodo and reg
            character = reg.group(1) if reg else None
            if character == "x":
                todo_tag['checked'] = "true"
            task.append(todo_tag)
            if reg:
                task.append(NavigableString(li.get_text()[3:].strip()))
            tasks.append(task)
        if istodo:
            for task in tasks[::-1]:
                ul.insert_after(task)
            ul.extract()

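# Example run (checklistInSoupToENML needs `re` imported at module level):
from bs4 import BeautifulSoup

soup = BeautifulSoup('<ul><li>[x] write tests</li><li>[ ] ship it</li></ul>',
                     'html.parser')
checklistInSoupToENML(soup)
print(soup)
# <div><en-todo checked="true"></en-todo>write tests</div><div><en-todo></en-todo>ship it</div>
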
def get_chapter_title(self, soup):
    heading = soup.new_tag('h2')
    heading['class'] = 'chapter-heading'
    title = ''
    chapter_number = None
    if self.chap_title_css:
        tag = soup.select_one(self.chap_title_css)
        chapter_details = re.match(
            r'(chapter\s+(\d+))[:\-\s]*([\w\s\'\-\d:.,]*)',
            tag.string,
            flags=re.IGNORECASE)
        try:
            chapter_number = chapter_details.group(2)
            title = chapter_details.group(3)
        except AttributeError:
            title = ''
    self.current_chapter = chapter_number if chapter_number else int(
        self.current_chapter) + 1
    chap_title = 'Chapter ' + str(self.current_chapter)
    chap_title += (' - ' + title) if title else ''
    heading.string = NavigableString(chap_title)
    if self.debug:
        print(chap_title)
    return heading

def replace_any_case(element: NavigableString, target_word: str) -> None:
    # Replace all instances of the word, but maintaining the same case in
    # the replacement
    if len(element) == len(target_word):
        return
    if not re.match('.*[a-zA-Z0-9].*', target_word) or (
            element.parent and element.parent.name == 'style'):
        return
    element.replace_with(BeautifulSoup(
        re.sub(fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b',
               r'<b>\1</b>',
               html.escape(element),
               flags=re.I),
        'html.parser'))

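# Hypothetical demo of replace_any_case highlighting every case variant:
import html
import re
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>Python and python and PYTHON</p>', 'html.parser')
replace_any_case(soup.p.string, 'python')
print(soup)
# <p><b>Python</b> and <b>python</b> and <b>PYTHON</b></p>
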
def link(self):
    soup = BeautifulSoup("", 'html5lib')
    link = soup.new_tag('a', href="#bibliography" + str(self.ordering))
    link['class'] = "bibliography-reference"
    link['data-ordering'] = str(self.ordering)
    link.insert(0, NavigableString(self.content_title))
    return link

def add_replacement_links(p, keys, soup, bib):
    """
    Given a paragraph object and possible bibtex keys, add a replacement
    link to the paragraph object

    :param Tag p: BS Paragraph object
    :param list(str) keys: List of citation keys
    :param BeautifulSoup soup: Beautiful Soup object
    :param dict(str, str) bib: Dictionary created from BibTeX
    """
    p.append(NavigableString('['))
    for i, key in enumerate(keys):
        p.append(create_link_from_entry(soup, bib, key))
        if i + 1 == len(keys):
            p.append(NavigableString('] '))
        else:
            p.append(NavigableString(', '))

def extract_text_from_svyle_html(html: Union[str, TextIO]) -> str:
    """Extract article text from Svenska YLE article HTML

    Args:
        html (Union[str,TextIO]): a string or a file-like object containing
            the article HTML

    Raises:
        ValueError: The layout of the article was not recognized, or the
            article parsed as empty

    Returns:
        str: article text
    """
    soup = BeautifulSoup(html, 'lxml')
    elem = soup.select_one('article#main-content')
    if elem is None:
        raise ValueError("Article layout not recognized")
    for elem_to_remove in soup.select('aside#id-article__tags'):
        elem_to_remove.extract()
    for elem_to_remove in soup.select('#comments'):
        elem_to_remove.extract()
    for elem_to_remove in soup.select('.ydd-share-buttons'):
        elem_to_remove.extract()
    for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']:
        for block_elem in elem.find_all(tag):
            block_elem.insert_after(NavigableString('\n\n'))
    txt = elem.get_text().strip()
    if txt == "":
        raise ValueError("Parsing results in an empty article")
    return txt

def format(self, article, subscriber, codes=None):
    formatted_article = deepcopy(article)
    pub_seq_num = superdesk.get_resource_service(
        'subscribers').generate_sequence_number(subscriber)
    doc = {}
    try:
        # If there is a dateline inject it into the body
        if (formatted_article.get(FORMAT) == FORMATS.HTML
                and formatted_article.get('dateline', {}).get('text')):
            soup = BeautifulSoup(formatted_article.get('body_html'), "html.parser")
            ptag = soup.find('p')
            if ptag is not None:
                ptag.insert(
                    0, NavigableString('{} '.format(
                        formatted_article.get('dateline').get('text'))))
                formatted_article['body_html'] = str(soup)
            doc['message_html'] = render_template(
                'email_article_body.html', article=formatted_article)
        else:
            doc['message_html'] = None
        doc['message_text'] = render_template('email_article_body.txt',
                                              article=formatted_article)
        doc['message_subject'] = render_template('email_article_subject.txt',
                                                 article=formatted_article)
    except Exception as ex:
        raise FormatterError.EmailFormatterError(ex, FormatterError)
    return [(pub_seq_num, json.dumps(doc))]

def _parse(self, html):
    # print(html)
    soup = BeautifulSoup(html.decode('utf-8'), "html5lib")
    for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    self.meta = soup.find_all('meta')
    try:
        soup.find('meta', attrs={'name': 'hdl'}).get('content')
        soup.find('meta', attrs={'name': 'dat'}).get('content')
        soup.find('meta', attrs={'name': 'byl'}).get('content')
    except AttributeError:
        self.real_article = False
        # return
    try:
        p_tags = list(soup.find("article", {"id": "story"}).find_all('p'))
    except:
        print(html)
        return
    div = soup.find('div', attrs={'class': 'story-addendum story-content theme-correction'})
    if div:
        p_tags += [div]
    footer = soup.find('footer', attrs={'class': 'story-footer story-content'})
    if footer:
        p_tags += list(footer.find_all(
            lambda x: x.get('class') != 'story-print-citation' and x.name == 'p'))

    p_contents = reduce(operator.concat,
                        [p.contents + [NavigableString('\n')] for p in p_tags], [])
    body_strings = []
    for node in p_contents:
        if type(node) is NavigableString:
            body_strings.append(node)
        else:
            if node.name == 'br':
                body_strings.append(' \n ')
            else:
                try:
                    body_strings.append(node.get_text())
                except:
                    body_strings.append(node)
    main_body = ''.join(body_strings)

    # authorids = soup.find('div', attrs={'class': 'authorIdentification'})
    # authorid = authorids.getText() if authorids else ''
    top_correction = ' '.join(
        x.getText() for x in soup.find_all('nyt_correction_top')) or ' '
    bottom_correction = ' '.join(
        x.getText() for x in soup.find_all('nyt_correction_bottom')) or ' '
    self.body = '\n'.join([top_correction,
                           main_body,
                           # authorid,
                           bottom_correction, ])
    # print(self.body)

def add_link_markup(tag):
    """Add necessary markup to the given link and return if modified.

    Add an external link icon if the input is not a CFPB (internal) link.
    Add an external link redirect if the input is not a gov link.
    If it contains a descendant that should not get an icon, return the link.
    If not, add a download icon if the input is a file.
    Otherwise (internal link that is not a file), return None.
    """
    icon = False
    tag = BeautifulSoup(tag, 'html.parser').find('a', href=True)

    if tag is None:
        return None

    if not tag.attrs.get('class', None):
        tag.attrs.update({'class': []})

    if tag['href'].startswith('/external-site/?'):
        # Sets the icon to indicate you're leaving consumerfinance.gov
        icon = 'external-link'
        components = urlparse(tag['href'])
        arguments = parse_qs(components.query)
        if 'ext_url' in arguments:
            external_url = arguments['ext_url'][0]
            # Add the redirect notice as well
            tag['href'] = signed_redirect(external_url)
    elif NON_CFPB_LINKS.match(tag['href']):
        # Sets the icon to indicate you're leaving consumerfinance.gov
        icon = 'external-link'
        if NON_GOV_LINKS.match(tag['href']):
            # Add the redirect notice as well
            tag['href'] = signed_redirect(tag['href'])
    elif DOWNLOAD_LINKS.search(tag['href']):
        # Sets the icon to indicate you're downloading a file
        icon = 'download'

    if tag.select(', '.join(ICONLESS_LINK_CHILD_ELEMENTS)):
        # If this tag has any children that are in our list of child elements
        # that should not get an icon, it doesn't get the icon. It might still
        # be an external link and modified accordingly above.
        return str(tag)

    if icon:
        tag.attrs['class'].append(LINK_ICON_CLASSES)
        # Wraps the link text in a span that provides the underline
        contents = tag.contents
        span = BeautifulSoup('', 'html.parser').new_tag('span')
        span['class'] = LINK_ICON_TEXT_CLASSES
        span.contents = contents
        tag.contents = [span, NavigableString(' ')]
        # Appends the SVG icon
        tag.contents.append(BeautifulSoup(svg_icon(icon), 'html.parser'))
        return str(tag)

    return None

def generate_sub_figure(typed_thing: Union[cards.Action, cards.Card]) -> Tag:
    figure = Tag(name="figure")
    figcaption = Tag(name="figcaption")
    h2 = Tag(name="h2", attrs={'class': 'type'})
    h2.append(NavigableString(typed_thing.sub_type.title()))
    figcaption.append(h2)
    figure.append(figcaption)
    return figure

def replace_a(cls, element):
    if isinstance(element, NavigableString):
        return
    if element.name == "a":
        label = "{removed href}"
        element.replaceWith(NavigableString(f"{label} {element.text}"))

def _inline_image(image_tag: PageElement, image_file: Path) -> None:
    """replacement callable to replace img tags with inline data
    (the flattened original was annotated -> bool but never returns a value)"""
    image_content = "data:image/png;base64," + base64.b64encode(
        image_file.read_bytes()).decode("utf-8")
    image_content = NavigableString(image_content)
    image_tag["src"] = image_content

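# Usage sketch for _inline_image; assumes a logo.png exists next to the script:
from pathlib import Path
from bs4 import BeautifulSoup

soup = BeautifulSoup('<img src="logo.png"/>', 'html.parser')
_inline_image(soup.img, Path('logo.png'))
print(soup.img['src'][:22])  # data:image/png;base64,
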
def create_link_from_entry(soup, bib, key):
    """
    Creates link/replacement text for bibtex entry

    :param BeautifulSoup soup: Beautiful Soup object
    :param dict bib: Dictionary created from BibTeX
    :param str key: Single citation key
    :return: Link or text to replace citation
    :rtype: Tag|NavigableString
    """
    # Define possible tags to use
    b_tag = soup.new_tag('b')
    try:
        entry = bib[key]
    except KeyError:
        click.echo('Entry {} not found in bibtex file!'.format(key))
        b_tag.append('{}'.format(key))
        return b_tag
    # Info for entry: <Author> et al. <Year>.
    try:
        author = NavigableString(entry['author'].split(',')[0] +
                                 ' et al. ' + entry['year'] + '. ')
    except KeyError:
        click.echo(
            'Author not found for bibtex key: "{}", using key instead'.format(
                key))
        author = NavigableString(key + ' ')
    # Use DOI if available
    if 'doi' in entry.keys():
        link = soup.new_tag('a', href='https://doi.org/' + entry['doi'])
        link.append(author)
        return link
    # Otherwise use URL
    elif 'url' in entry.keys():
        link = soup.new_tag('a', href=entry['url'])
        link.append(author)
        return link
    # Else return author
    else:
        b_tag.append(author)
        return b_tag

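# Worked example tying add_replacement_links and create_link_from_entry
# together; `bib` is assumed to be a plain dict of entry dicts:
from bs4 import BeautifulSoup, NavigableString

soup = BeautifulSoup('<p></p>', 'html.parser')
bib = {'smith2020': {'author': 'Smith, Jane', 'year': '2020',
                     'doi': '10.1000/xyz123'}}
add_replacement_links(soup.p, ['smith2020'], soup, bib)
print(soup.p)
# <p>[<a href="https://doi.org/10.1000/xyz123">Smith et al. 2020. </a>] </p>
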
def __call__(self, soup, style='tmp-json-style'):
    order_map = {identifier: index + 1
                 for index, identifier in enumerate(self.config.reference_counts.keys())}
    for bio_tag in soup.find_all('notex-bibliography'):
        if len(self.config.citations):
            ol_tag = BeautifulSoup.new_tag(bio_tag, 'ol', **{'class': 'bibliography-list'})
            for identifier, count in self.config.reference_counts.items():
                if identifier in self.config.citations:
                    li_tag = BeautifulSoup.new_tag(soup, 'li', **{
                        'id': 'cite-{0:d}'.format(order_map[identifier]),
                        'class': 'citation-details',
                        'ref-count': count,
                    })
                    li_tag.append(NavigableString(
                        self.config.citations[identifier].render(style)))
                    ol_tag.append(li_tag)
                else:
                    # todo: logging
                    print('reference to citation "{0:s}" which is not defined'.format(identifier))
                    li_tag = BeautifulSoup.new_tag(soup, 'li')
                    li_tag.append(NavigableString('unknown citation "{0:s}"'.format(identifier)))
                    ol_tag.append(li_tag)
            bio_tag.append(ol_tag)

    tag_names = set()
    if self.config.has_ci_tag:
        tag_names.add('reference-ci')
    if self.config.has_cite_tag:
        tag_names.add('reference-cite')
    if tag_names:
        for ref_tag in soup.find_all(tag_names):
            identifier = ref_tag.attrs['data-identifier']
            if identifier in self.config.citations:
                citation = self.config.citations[identifier]
                ref_tag.attrs['class'].append('citstyle-{0:s}'.format(style))
                index = order_map[identifier]
                ref_tag.parent.attrs['href'] = '#cite-{0:d}'.format(index)
                if ref_tag.name == 'reference-cite':
                    marker = BeautifulSoup.new_tag(ref_tag, 'cite')
                    marker.append(NavigableString(
                        self.config.citations[identifier].ref_title()))
                else:
                    marker = NavigableString(citation.ref_name(index, style))
                ref_tag.append(marker)
            else:
                # todo: logging
                print('reference "{0:s}" not found'.format(identifier))
                ref_tag.attrs['class'].extend(
                    ['citstyle-{0:s}'.format(style), 'citation-not-found', 'not-found'])
                ref_tag.append(NavigableString('[reference "{0:s}" ??]'.format(identifier)))

def clone_element(element: Union[Tag, NavigableString]) -> Union[Tag, NavigableString]:
    "Create a deep copy of an element from a BeautifulSoup tree."
    if isinstance(element, Tag):
        new_element = create_empty_element_copy(element)
        for child in element.children:
            new_element.append(clone_element(child))
        return new_element
    return NavigableString(str(element))

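# clone_element depends on create_empty_element_copy(), defined elsewhere; a
# plausible minimal version (an assumption, not the original) plus a demo:
import copy
from bs4 import BeautifulSoup, Tag

def create_empty_element_copy(element: Tag) -> Tag:
    return Tag(name=element.name, attrs=copy.deepcopy(element.attrs))

soup = BeautifulSoup('<div class="x"><span>hi</span></div>', 'html.parser')
twin = clone_element(soup.div)
print(twin)              # <div class="x"><span>hi</span></div>
print(twin is soup.div)  # False: an independent deep copy
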
def recursive_replace(tag):
    if hasattr(tag, "contents"):
        for i in range(len(tag.contents)):
            child = tag.contents[i]
            if child.name == "code":
                tag.contents[i] = NavigableString(self.store(str(child)))
            else:
                recursive_replace(child)

def format_indents(soup):
    """ Needs clean up """
    for indent in soup.find_all('indent'):
        if not len(indent.contents):
            indent.decompose()
    prev_left = None
    for indent in soup.find_all('indent'):
        match = None
        if indent.find('sml-image'):
            indent.find('sml-image').extract()
            indent.insert(0, NavigableString(u'• '))
        if isinstance(indent.contents[0], NavigableString):
            match = listlike_reg.match(indent.contents[0])
        left = get_left(indent)
        if not match or (prev_left and left > prev_left):
            if indent.previous_sibling and indent.previous_sibling.name == 'list':
                text = indent.previous_sibling.find_all('text')[-1]
                text.append(' ')
                for c in indent.contents[:]:
                    text.append(c)
                indent.decompose()
                continue
        indent.name = 'entry'
        if match:
            text = soup.new_tag('text')
            text.string = match.group(2)
            indent.contents[0].replace_with('')
            for c in indent.contents[:]:
                text.append(c)
            insert = 0
            if match.group(1) != u'•':
                label = soup.new_tag('label')
                label.string = match.group(1)
                indent.insert(insert, label)
                insert += 1
            indent.insert(insert, text)
        else:
            text = soup.new_tag('text')
            for c in indent.contents[:]:
                text.append(c)
            indent.insert(0, text)
        if not (indent.previous_sibling and indent.previous_sibling.name == 'list'):
            new_list = soup.new_tag('list')
            entry = indent.replace_with(new_list)
            new_list.append(entry)
        else:
            indent.previous_sibling.append(indent)
        indent.attrs = {}
        prev_left = left

def set_html(self):
    if self.mime_data.hasHtml():
        markup = self.mime_data.html()
        soup = BeautifulSoup(markup, "html.parser")
        for inner_text in list(soup.strings):
            inner_text.replace_with(
                NavigableString(self.modify_text(inner_text)))
        self.fin_mime_data.setHtml(str(soup))
        self.format_list.remove("text/html")

def generate_latex_from_element(element: NavigableString, payload: dict):
    # If this is not a tag, it is pure text
    if isinstance(element, str):
        if element.strip() == "":
            return ""
        return sanitize_string(element)
    data = process_symbols(element, payload, get_latex_for_element(element.name))
    return data

def process_bibtex2html_output(bibtex2html_output, d):
    """ From the bibtex2html output, get clean version. """
    # frag = bs(bibtex2html_output)
    frag = BeautifulSoup(bibtex2html_output, 'html.parser')
    with open(os.path.join(d, 'fixed_interpreted.html'), 'w') as f:
        f.write(str(frag))
    res = Tag(name='div')
    ids = []
    for dt in list(frag.select('dt')):
        assert dt.name == 'dt'
        name = dt.a.attrs['name']
        name = 'bib:' + name
        ids.append(name)
        dd = dt.findNext('dd')
        assert dd.name == 'dd'
        entry = dd.__copy__()
        entry.name = 'cite'
        entry.attrs['id'] = name
        try_to_replace_stuff = True
        if try_to_replace_stuff:
            for x in list(entry.descendants):
                if isinstance(x, NavigableString):
                    s = x.string.encode('utf-8')
                    s = s.replace('\n', ' ')
                    s = s.replace('[', '')
                    s = s.replace('|', '')
                    s = s.replace(']', '')
                    y = NavigableString(unicode(s, 'utf-8'))
                    x.replace_with(y)
                    # print('string %r' % x.string)
                if isinstance(x, Tag) and x.name == 'a' and x.string == 'bib':
                    x.extract()
        res.append(NavigableString('\n'))
        res.append(entry)
        res.append(NavigableString('\n'))
    res.attrs['id'] = 'bibliography_entries'
    logger.info('Found %d bib entries.' % len(ids))
    return str(res)
