def clean_html(html_text, javascript=True, scripts=True, style=True, embedded=True, links=True, forms=True, frames=True, comments=True, annoying_tags=True, meta=True, safe_attrs_only=True, remove_unknown_tags=True, processing_instructions=True):
    """Sanitize *html_text* and return the cleaned markup as a string.

    Every boolean flag maps one-to-one onto the attribute of the same name
    on ``lxml.html.clean.Cleaner`` (True = remove that kind of content).
    Page structure (html/head/body) is always preserved.
    """
    sanitizer = Cleaner()
    # Forward each flag straight onto the Cleaner instance.
    sanitizer.javascript = javascript
    sanitizer.scripts = scripts
    sanitizer.style = style
    sanitizer.embedded = embedded
    sanitizer.links = links
    sanitizer.forms = forms
    sanitizer.frames = frames
    sanitizer.comments = comments
    sanitizer.page_structure = False  # never strip the document skeleton
    sanitizer.annoying_tags = annoying_tags
    sanitizer.meta = meta
    sanitizer.safe_attrs_only = safe_attrs_only
    sanitizer.remove_unknown_tags = remove_unknown_tags
    sanitizer.processing_instructions = processing_instructions
    document = lxml.html.fromstring(html_text)
    cleaned = sanitizer.clean_html(document)
    return lxml.html.tostring(cleaned)
def clean_article_html(cls, node):
    """Keep only simple inline/paragraph tags in *node*; drop scripts and styles."""
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    # allow_tags is honoured only when unknown-tag removal is disabled.
    cleaner.remove_unknown_tags = False
    cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'em']
    return cleaner.clean_html(node)
def html2content(html, allowed_tags=[
        "a", "abbr", "article", "aside", "b", "base", "blockquote", "body",
        "br", "caption", "cite", "code", "col", "colgroup", "dd", "del",
        "dfn", "dl", "dt", "em", "embed", "figcaption", "figure", "footer",
        "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup",
        "hr", "html", "i", "img", "li", "map", "mark", "math", "meta",
        "meter", "nav", "noscript", "object", "ol", "optgroup", "option",
        "output", "p", "param", "pre", "progress", "q", "rp", "rt", "ruby",
        "s", "samp", "section", "small", "source", "span", "strong", "sub",
        "sup", "svg", "table", "tbody", "td", "th", "thead", "tfoot",
        "time", "title", "tr", "track", "u", "ul", "var", "video", "wbr"]):
    """Reduce *html* to content markup, keeping only *allowed_tags*.

    Styles are stripped; page structure and meta tags are kept so the
    document skeleton survives cleaning.
    """
    cleaner = Cleaner()
    cleaner.allow_tags = allowed_tags
    cleaner.remove_unknown_tags = False  # required for allow_tags to apply
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.style = True
    # BUGFIX: the attribute was misspelled "embeded", which silently created
    # a new attribute on the instance instead of configuring the cleaner.
    cleaner.embedded = False
    return cleaner.clean_html(html)
def clearTag_old(self, text: str) -> str:
    """Strip scripts, styles and most structural markup from *text*.

    Images are killed outright (tag and content); the tags in
    ``remove_tags`` are unwrapped, keeping their text.  Returns the cleaned
    document serialized as a UTF-8 string.
    """
    import lxml
    from lxml.html.clean import Cleaner
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.forms = True
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.remove_unknown_tags = True
    cleaner.kill_tags = ["img"]  # drop images together with their content
    cleaner.remove_tags = [      # unwrap these tags, keep their text
        "strong", "div", "body", "br", "a", "p", "blockquote", "h3",
        "ol", "li", "font",
    ]
    cleaned = cleaner.clean_html(lxml.html.document_fromstring(text))
    # BUGFIX: clean_html() on an Element returns an Element, which has no
    # .decode(); serialize it back to bytes before decoding.
    return lxml.html.tostring(cleaned).decode("utf-8")
def cleaner_parameters():
    """Return a Cleaner that kills noisy elements and unwraps text containers."""
    # Tags removed together with their entire content.
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    # Tags whose markup is removed but whose text is kept.
    # BUGFIX: a comma was missing between 'h6' and 'span', so implicit
    # string concatenation produced the bogus tag 'h6span' and neither
    # <h6> nor <span> was actually unwrapped.
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
def html_strict_cleaning(html, allow_tags=['p', 'br', 'a', 'img', 'div']):
    """Parse *html*, keep only *allow_tags*, and return unicode markup."""
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    # Unknown-tag removal must be off for the whitelist to take effect.
    cleaner.remove_unknown_tags = False
    cleaner.allow_tags = allow_tags
    tree = lxml.html.fromstring(html)
    cleaned = cleaner.clean_html(tree)
    return lxml.html.tostring(cleaned, encoding='unicode')
def clean_article_html(cls, node):
    """Sanitize *node* down to basic text-formatting markup for articles."""
    keep = ['a', 'span', 'p', 'br', 'strong', 'b', 'em', 'i', 'tt',
            'code', 'pre', 'blockquote', 'img',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    article_cleaner = Cleaner()
    article_cleaner.javascript = True  # strip scripts / event handlers
    article_cleaner.style = True       # strip style tags and attributes
    article_cleaner.remove_unknown_tags = False  # prerequisite for allow_tags
    article_cleaner.allow_tags = keep
    return article_cleaner.clean_html(node)
def clean_article_html(cls, node):
    """Strip *node* down to a whitelist of article-friendly tags (incl. lists)."""
    # javascript/style flags are passed via the constructor instead of
    # attribute assignment; the effect is identical.
    article_cleaner = Cleaner(javascript=True, style=True)
    article_cleaner.remove_unknown_tags = False
    article_cleaner.allow_tags = [
        'a', 'span', 'p', 'br', 'strong', 'b', 'ul', 'ol', 'li',
        'em', 'i', 'code', 'pre', 'blockquote',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    ]
    return article_cleaner.clean_html(node)
def get_cleaner():
    """Build a Cleaner that deletes noisy elements and unwraps text containers."""
    cleaner = Cleaner()
    # Removal filters that are switched on wholesale.
    for flag in ('embedded', 'frames', 'style', 'remove_unknown_tags',
                 'processing_instructions', 'annoying_tags'):
        setattr(cleaner, flag, True)
    # Unwrapped: markup dropped, text content preserved.
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b', 'a',
                           'u', 'i', 'body', 'div', 'span', 'p']
    # Killed: element removed together with its entire content.
    cleaner.kill_tags = ['table', 'img', 'semantics', 'script', 'noscript',
                         'style', 'meta', 'label', 'li', 'ul', 'ol', 'sup',
                         'math', 'nav', 'dl', 'dd', 'sub']
    return cleaner
def get_locations(self) -> list:
    """Scrape ``self.link`` and return a list of Location objects.

    Extracts anchors from one dropdown menu, then builds each Location from
    the tag text and href via the module-level regexes.
    """
    locations = []
    page = requests.get(self.link)
    tree = html.fromstring(page.text)
    # NOTE(review): index 7 hard-codes which dropdown-menu holds the
    # locations on the page — brittle if the site layout changes; confirm.
    tag = tree.xpath('//ul[@class="dropdown-menu"]')[7]
    c = Cleaner()
    # A bare string works here only because iterating 'a' yields 'a';
    # a list ['a'] would be the conventional spelling.
    c.allow_tags = 'a'
    c.remove_unknown_tags = False  # required for allow_tags to apply
    doc = etree.tostring(tag)
    # clean_html on bytes returns bytes; decode, drop the wrapper <div>
    # added by the cleaner, and process one anchor per line.
    for s in c.clean_html(doc).decode("utf-8").replace(
            "<div>", "").replace("</div>", "").strip().split('\n'):
        locations.append(
            Location(
                # Tag text with any residual angle brackets stripped.
                regexTagContent.findall(s)[0].replace("<", "").replace(">", ""),
                # First href found in the anchor.
                regexHref.findall(s)[0]))
    return locations
def get_menu(self, location: str) -> Menu:
    """Fetch and parse the menu page for *location*, returning a Menu.

    The menu block is located by its exact inline style attribute, cleaned
    down to text plus <img> tags, then split into date and food entries by
    the private helpers.
    """
    food = []
    c = Cleaner()
    c.allow_tags = ['img']         # keep only images among the tags
    c.remove_unknown_tags = False  # required for allow_tags to apply
    page = requests.get(self.link + location)
    tree = html.fromstring(page.text)
    # NOTE(review): matching on the literal style string is brittle —
    # any styling change on the site breaks this xpath; confirm.
    f = tree.xpath(
        '//div[@style="background-color:#ecf0f1;border-radius: 4px 4px 0px 0px; padding: 8px;"]'
    )
    doc = etree.tostring(f[0], pretty_print=True)
    # clean_html on bytes returns bytes; decode and drop the wrapper <div>.
    t = c.clean_html(doc).decode("utf-8").replace("<div>", "").replace(
        "</div>", "")
    # Date is parsed from the fragment preceding "Essen " (German: food).
    dtime = self.__extract_date(
        c.clean_html(doc).decode("utf-8").replace("<div>", "").replace(
            "</div>", "").strip().split("Essen "))
    food += self.__extract_food(regexFoodBlock.findall(t))
    # foodList.append(f)
    return Menu(dtime, food)
def clean(self: T) -> str:
    """Apply the configured lxml Cleaner to the stored input and return it.

    The cleaned markup also replaces the stored input, so repeated calls
    operate on the previous result.
    """
    cleaner = Cleaner()
    cleaner.style = self.__style
    cleaner.links = self.__links
    cleaner.page_structure = self.__page_structure
    cleaner.safe_attrs_only = self.__safe_attrs_only
    # lxml ignores allow_tags unless unknown-tag removal is switched off.
    if self.__allow_tags is not None:
        cleaner.remove_unknown_tags = False
        cleaner.allow_tags = self.__allow_tags
    # Optional overrides — forwarded only when explicitly configured.
    for attr, value in (('kill_tags', self.__kill_tags),
                        ('remove_tags', self.__remove_tags),
                        ('safe_attrs', self.__safe_attrs)):
        if value is not None:
            setattr(cleaner, attr, value)
    self.__input = cleaner.clean_html(self.__input)
    return self.__input
# NOTE(review): the `html` fixture below is a raw scraped Google Books page.
# As captured here the string literal spans several physical lines, which is
# not syntactically valid Python — presumably it was one long line (or had
# escaped newlines) in the original source; preserved byte-for-byte.
def main(): html = "<div>Bill Gates: Microsoft Founder and Philanthropist: Microsoft Founder and ... - Marylou Morano Kjelle - Google BooksA privacy reminder from Google<a class=\"gb_od gb_7c\" tabindex=\"0\">Review now</a><a class=\"gb_od gb_nd\" tabindex=\"0\">I'll read this later</a><a class=\"gb_b gb_5b\" href=\"https://www.google.co.uk/intl/en/options/\" title=\"Google apps\" tabindex=\"0\"></a><a class=\"gb_O\" href=\"https://myaccount.google.com/?utm_source=OGB&utm_medium=app\" id=\"gb192\">My Account</a><a class=\"gb_O\" href=\"https://www.google.co.uk/webhp?tab=pw&ei=MjliWbTaCcPq-AHcsbOYAQ&ved=0EKkuCAgoAQ\" id=\"gb1\">Search</a><a class=\"gb_O\" href=\"https://maps.google.co.uk/maps?hl=en&tab=pl\" id=\"gb8\">Maps</a><a class=\"gb_O\" href=\"https://www.youtube.com/?gl=GB\" id=\"gb36\">YouTube</a><a class=\"gb_O\" href=\"https://play.google.com/?hl=en&tab=p8\" id=\"gb78\">Play</a><a class=\"gb_O\" href=\"https://news.google.co.uk/nwshp?hl=en&tab=pn&ei=MjliWbTaCcPq-AHcsbOYAQ&ved=0EKkuCAwoBQ\" id=\"gb5\">News</a><a class=\"gb_O\" href=\"https://mail.google.com/mail/?tab=pm\" id=\"gb23\">Gmail</a><a class=\"gb_O\" href=\"https://drive.google.com/?tab=po\" id=\"gb49\">Drive</a><a class=\"gb_O\" href=\"https://www.google.com/calendar?tab=pc\" id=\"gb24\">Calendar</a><a class=\"gb_O\" href=\"https://plus.google.com/?gpsrc=ogpy0&tab=pX\" id=\"gb119\">Google+</a><a class=\"gb_O\" href=\"https://translate.google.co.uk/?hl=en&tab=pT\" id=\"gb51\">Translate</a><a class=\"gb_O\" href=\"https://photos.google.com/?tab=pq&pageId=none\" id=\"gb31\">Photos</a><a class=\"gb_ka gb_xf\" href=\"https://www.google.co.uk/intl/en/options/\">More</a><a class=\"gb_O\" href=\"http://www.google.co.uk/shopping?hl=en&tab=pf&ei=MjliWbTaCcPq-AHcsbOYAQ&ved=0EKkuCBMoDA\" id=\"gb6\">Shopping</a><a class=\"gb_O\" href=\"https://www.google.co.uk/finance?tab=pe\" id=\"gb27\">Finance</a><a class=\"gb_O\" href=\"https://docs.google.com/document/?usp=docs_alc\" id=\"gb25\">Docs</a><a 
class=\"gb_O\" href=\"https://books.google.co.uk/bkshp?hl=en&tab=pp&ei=MjliWbTaCcPq-AHcsbOYAQ&ved=0EKkuCBYoDw\" id=\"gb10\">Books</a><a class=\"gb_O\" href=\"https://www.blogger.com/?tab=pj\" id=\"gb30\">Blogger</a><a class=\"gb_O\" href=\"https://www.google.com/contacts/?hl=en&tab=pC\" id=\"gb53\">Contacts</a><a class=\"gb_O\" href=\"https://hangouts.google.com/\" id=\"gb300\">Hangouts</a><a class=\"gb_O\" href=\"https://keep.google.com/\" id=\"gb136\">Keep</a><a class=\"gb_la gb_sf\" href=\"https://www.google.co.uk/intl/en/options/\">Even more from Google</a><a class=\"gb_Bf gb_Ha gb_xb\" id=\"gb_70\" href=\"https://www.google.com/accounts/Login?service=print&continue=https://books.google.co.uk/books%3Fid%3Daf3PBQAAQBAJ%26pg%3DPA2%26lpg%3DPA2%26dq%3Dbillgatesmicrosoft%26source%3Dbl%26ots%3DkOoeyqnrmG%26sig%3DO8vNTHW0AmC039_nJsnKiEucONQ%26hl%3Den%26sa%3DX&hl=en\" target=\"_top\">Sign in</a><a class=\"gb_5d gb_3b\" href=\"https://books.google.co.uk/bkshp?hl=en&tab=pp\" title=\"Books\"></a>Hidden fields<a class=\"consentBumpSlowLink\" href=\"//consent.google.com/?hl=en&continue=https://books.google.co.uk/books%3Fid%3Daf3PBQAAQBAJ%26pg%3DPA2%26lpg%3DPA2%26dq%3Dbillgatesmicrosoft%26source%3Dbl%26ots%3DkOoeyqnrmG%26sig%3DO8vNTHW0AmC039_nJsnKiEucONQ%26hl%3Den%26sa%3DX%26ved%3D0ahUKEwi3oKipsPzUAhWcF8AKHSXwAc8Q6AEIkgEwFQ&pc=ogb&wp=71&l=1&if=1&fld=2&origin=https://books.google.co.uk\" target=\"_top\">Load basic HTML</a>\u00a0(for slow connections)<a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&pg=PA2&lpg=PA2&dq=billgatesmicrosoft&source=bl&ots=kOoeyqnrmG&sig=O8vNTHW0AmC039_nJsnKiEucONQ&hl=en&sa=X&output=html_text\" title=\"Screen reader users: click this link for accessible mode. 
Accessible mode has the same essential features but works better with your reader.\"></a><a href=\"/books\">Books</a><a id=\"appbar-write-review-link\" href=\"https://www.google.com/accounts/Login?service=print&continue=https://books.google.co.uk/books%3Fop%3Dlookup%26id%3Daf3PBQAAQBAJ%26continue%3Dhttps://books.google.co.uk/books%253Fid%253Daf3PBQAAQBAJ%2526pg%253DPA2%2526lpg%253DPA2%2526dq%253Dbillgatesmicrosoft%2526source%253Dbl%2526ots%253DkOoeyqnrmG%2526sig%253DO8vNTHW0AmC039_nJsnKiEucONQ%2526hl%253Den%2526sa%253DX&hl=en\"></a><a id=\"appbar-view-print-sample-link\" href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&printsec=frontcover&source=gbs_vpt_read\"></a><a id=\"appbar-view-ebook-sample-link\" href=\"https://play.google.com/books/reader?id=af3PBQAAQBAJ&printsec=frontcover&source=gbs_vpt_read\"></a><a id=\"appbar-patents-prior-art-finder-link\" href=\"\"></a><a id=\"appbar-patents-discuss-this-link\" href=\"\"></a><a id=\"appbar-read-patent-link\" href=\"\"></a><a id=\"appbar-download-pdf-link\" href=\"\"></a>books.google.co.uk - Learn who Bill Gates is, how Microsoft got its start, where it\u00e2\u0080\u0099s heading, and much more. Primary sources with accompanying questions, multiple prompts, timeline, index, and glossary also included. 
Core Library is an imprint of Abdo Publishing Company....https://books.google.co.uk/books/about/Bill_Gates_Microsoft_Founder_and_Philant.html?id=af3PBQAAQBAJ&utm_source=gb-gplus-shareBill Gates: Microsoft Founder and Philanthropist<a class=\"gbmt goog-menuitem-content\" id=\"\" href=\"https://www.google.com/accounts/Login?service=print&continue=https://books.google.co.uk/books%3Fop%3Dlibrary&hl=en\">My library</a><a class=\"gbmt goog-menuitem-content\" id=\"\" href=\"http://books.google.co.uk/support/topic/4359341?hl=en-GB\">Help</a><a class=\"gbmt goog-menuitem-content\" id=\"\" href=\"https://books.google.co.uk/advanced_book_search?q=billgatesmicrosoft\">Advanced Book Search</a><a href=\"https://play.google.com/store/books/details?id=af3PBQAAQBAJ&rdid=book-af3PBQAAQBAJ&rdot=1&source=gbs_vpt_read&pcampaignid=books_booksearch_viewport\" id=\"gb-get-book-content\">Buy eBook - \u00c2\u00a321.79</a><p id=\"gb-buy-options-trigger\" class=\"gb-buy-options-link\">Get this book in print</p><a name=\"buy_anchor\"></a><a href=\"http://abdopublishing.com/shop/show/6322\" dir=\"ltr\">ABDO</a><a href=\"http://www.amazon.co.uk/gp/search?index=books&linkCode=qs&keywords=9781629694603\" dir=\"ltr\">Amazon.co.uk</a><a href=\"http://www.bookdepository.com/book/9781629694603\" dir=\"ltr\">BookDepository</a><a href=\"http://www.waterstones.com/waterstonesweb/advancedSearch.do?buttonClicked=2&isbn=1629694606\" dir=\"ltr\">Waterstone's</a><a href=\"http://www.whsmith.co.uk/CatalogAndSearch/SearchWithinCategory.aspx?as_ISBN=1629694606\" dir=\"ltr\">WHSmith</a><a href=\"http://bookshop.blackwell.co.uk/bobuk/scripts/home.jsp?action=search&type=isbn&term=1629694606\" dir=\"ltr\">Blackwell</a><a href=\"https://books.google.co.uk/url?id=af3PBQAAQBAJ&pg=PA2&q=http://worldcat.org/isbn/1629694606&clientid=librarylink&usg=AFQjCNF30N2K8V8cLKFtxjpcHRSt5RPkwg&source=gbs_buy_r\">Find in a library</a><a class=\"secondary\" 
href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&sitesec=buy&source=gbs_buy_r\" id=\"get-all-sellers-link\">All sellers\u00a0\u00bb</a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&printsec=frontcover\"></a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&dq=billgatesmicrosoft&sitesec=reviews\"></a> <a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&dq=billgatesmicrosoft&sitesec=reviews\" class=\"sbs-count secondary\">0 Reviews</a><a id=\"write-review-link\" href=\"https://www.google.com/accounts/Login?service=print&continue=https://books.google.co.uk/books%3Fop%3Dlookup%26id%3Daf3PBQAAQBAJ%26continue%3Dhttps://books.google.co.uk/books%253Fid%253Daf3PBQAAQBAJ%2526pg%253DPA2%2526lpg%253DPA2%2526dq%253Dbillgatesmicrosoft%2526source%253Dbl%2526ots%253DkOoeyqnrmG%2526sig%253DO8vNTHW0AmC039_nJsnKiEucONQ%2526hl%253Den%2526sa%253DX&hl=en\" class=\"secondary sbs-link\">Write review</a>https://books.google.com/books/about/Bill_Gates_Microsoft_Founder_and_Philant.html?id=af3PBQAAQBAJBill Gates: Microsoft Founder and Philanthropist: Microsoft Founder and ...By Marylou Morano Kjelle \u00a0<p><a id=\"sidebar-atb-link\" href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&dq=billgatesmicrosoft&source=gbs_navlinks_s\">About this book</a></p><a href=\"/intl/en/googlebooks/tos.html\" target=\"_blank\">Terms\u00a0of\u00a0Service</a><a name=\"pub_info_anchor\"></a><a href=\"https://books.google.co.uk/url?id=af3PBQAAQBAJ&pg=PA2&q=http://www.abdopublishing.com/index.html&linkid=1&usg=AFQjCNEc5mprg92y9GPuvfY6s-WXez4caQ&source=gbs_pub_info_r\"></a>Pages displayed by permission of <a class=\"link_aux\" href=\"https://books.google.co.uk/url?id=af3PBQAAQBAJ&pg=PA2&q=http://www.abdopublishing.com/index.html&linkid=1&usg=AFQjCNEc5mprg92y9GPuvfY6s-WXez4caQ&source=gbs_pub_info_r\">ABDO</a>.\u00a0<a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&printsec=copyright&source=gbs_pub_info_r\">Copyright</a>.\u00a0Page 2<a 
href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&pg=PA1&lpg=PA2&ots=kOoeyqnrmG&focus=viewport&dq=billgatesmicrosoft\"></a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&pg=PA3&lpg=PA2&ots=kOoeyqnrmG&focus=viewport&dq=billgatesmicrosoft\"></a>\u00a0\u00a0<a name=\"page\" accesskey=\"c\"></a><a href=\"https://books.google.co.uk/books?id=af3PBQAAQBAJ&pg=PA3&lpg=PA2&ots=kOoeyqnrmG&focus=viewport&dq=billgatesmicrosoft\"></a></div>"
    #print('Original HTML: ' + html[0:80])
    # Remove every category of active/annoying content lxml knows about;
    # page_structure=True additionally drops html/head/body wrappers.
    c = Cleaner(
        scripts=True,
        javascript=True,
        comments=True,
        style=True,
        links=True,
        meta=True,
        page_structure=True,
        processing_instructions=True,
        embedded=True,
        frames=True,
        forms=True,
        annoying_tags=True,
    )
    c.allow_tags = None
    c.remove_unknown_tags = True
    # clean_html() on a str input returns the cleaned markup as a str.
    html = c.clean_html(html)
    print('Cleaned up HTML: ' + str(html))
def html2content(
        html,
        allowed_tags=[
            "a", "abbr", "article", "aside", "b", "base", "blockquote",
            "body", "br", "caption", "cite", "code", "col", "colgroup",
            "dd", "del", "dfn", "dl", "dt", "em", "embed", "figcaption",
            "figure", "footer", "h1", "h2", "h3", "h4", "h5", "h6", "head",
            "header", "hgroup", "hr", "html", "i", "img", "li", "map",
            "mark", "math", "meta", "meter", "nav", "noscript", "object",
            "ol", "optgroup", "option", "output", "p", "param", "pre",
            "progress", "q", "rp", "rt", "ruby", "s", "samp", "section",
            "small", "source", "span", "strong", "sub", "sup", "svg",
            "table", "tbody", "td", "th", "thead", "tfoot", "time", "title",
            "tr", "track", "u", "ul", "var", "video", "wbr"
        ]):
    """Reduce *html* to content markup, keeping only *allowed_tags*.

    Styles are stripped; page structure and meta tags are kept so the
    document skeleton survives cleaning.
    """
    cleaner = Cleaner()
    cleaner.allow_tags = allowed_tags
    cleaner.remove_unknown_tags = False  # required for allow_tags to apply
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.style = True
    # BUGFIX: the attribute was misspelled "embeded", which silently created
    # a new attribute on the instance instead of configuring the cleaner.
    cleaner.embedded = False
    return cleaner.clean_html(html)
if nb_upper > nb_lower: return titlecase(title) else: return title # HTML sanitizing for the title overescaped_re = re.compile(r'&#(\d+);') unicode4_re = re.compile(r'(\\u[0-9A-Z]{4})(?![0-9A-Z])') whitespace_re = re.compile(r'\s+') ltgt_re = re.compile(r'.*[<>&]') html_cleaner = Cleaner() html_cleaner.allow_tags = ['sub', 'sup', 'b', 'span'] html_cleaner.remove_unknown_tags = False html_killer = Cleaner() html_killer.allow_tags = ['div'] html_killer.remove_unknown_tags = False latexmath_re = re.compile(r'\$(\S[^$]*?\S|\S)\$') def remove_latex_math_dollars(string): """ Removes LaTeX dollar tags. >>> remove_latex_math_dollars(u'This is $\\\\beta$-reduction explained') u'This is \\\\beta-reduction explained' >>> remove_latex_math_dollars(u'Compare $\\\\frac{2}{3}$ to $\\\\pi$')
elif title[i].islower(): nb_lower += 1 if nb_upper > nb_lower: title = titlecase(title) return title ## HTML sanitizing for the title overescaped_re = re.compile(r'&#(\d+);') unicode4_re = re.compile(r'(\\u[0-9A-Z]{4})(?![0-9A-Z])') whitespace_re = re.compile(r'\s+') html_cleaner = Cleaner() html_cleaner.allow_tags = ['sub','sup','b','span'] html_cleaner.remove_unknown_tags = False html_killer = Cleaner() html_killer.allow_tags = ['div'] html_killer.remove_unknown_tags = False latexmath_re = re.compile(r'\$(\S[^$]*?\S|\S)\$') def remove_latex_math_dollars(string): return latexmath_re.sub(r'\1', string) latex_command_re = re.compile(r'(\\([a-zA-Z]+|[.=\'"])({[^}]*})*)') def unescape_latex(s): def conditional_replace(fragment): rep = unicode_tex.tex_to_unicode_map.get(fragment.group(0)) return rep if rep is not None else fragment.group(0)
# HTML_CLEANER config # http://lxml.de/api/lxml.html.clean.Cleaner-class.html # https://lxml.de/apidoc/lxml.html.clean.html HTML_CLEANER = Cleaner() HTML_CLEANER.annoying_tags = False # True HTML_CLEANER.comments = True HTML_CLEANER.embedded = False # True HTML_CLEANER.forms = False # True HTML_CLEANER.frames = False # True HTML_CLEANER.javascript = False HTML_CLEANER.links = False HTML_CLEANER.meta = False HTML_CLEANER.page_structure = False HTML_CLEANER.processing_instructions = True HTML_CLEANER.remove_unknown_tags = False HTML_CLEANER.safe_attrs_only = False HTML_CLEANER.scripts = False HTML_CLEANER.style = False #HTML_CLEANER.remove_tags = MANUALLY_STRIPPED #HTML_CLEANER.kill_tags = MANUALLY_CLEANED def tree_cleaning(tree, include_tables, include_images=False): '''Prune the tree by discarding unwanted elements''' # determine cleaning strategy cleaning_list, stripping_list = \ MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy() if include_tables is False: cleaning_list.append('table') if include_images is True:
} logger.debug('settings: %s %s %s', MIN_YEAR, TODAY, MAX_YEAR) logger.debug('dateparser configuration: %s', PARSERCONFIG) cleaner = Cleaner() cleaner.comments = True cleaner.embedded = True cleaner.forms = False cleaner.frames = True cleaner.javascript = False cleaner.links = False cleaner.meta = False cleaner.page_structure = True cleaner.processing_instructions = True cleaner.remove_unknown_tags = False cleaner.safe_attrs_only = False cleaner.scripts = False cleaner.style = False cleaner.kill_tags = [ 'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'table', 'svg', 'video' ] # 'embed', 'figure', 'img', def date_validator(datestring, outputformat): """Validate a string with respect to the chosen outputformat and basic heuristics""" # try if date can be parsed using chosen outputformat try: dateobject = datetime.datetime.strptime(datestring, outputformat)
def google_news_cut(link):
    """Scrape Google News headlines from *link*, fetch each linked article,
    strip its markup, and return a list of jieba token lists (one list of
    segmented words per article, with stop/noise words removed)."""
    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True  # This is True because we want to activate the styles & stylesheet filter
    page = get_web_page(link)
    soup = BeautifulSoup(page, 'html.parser')
    # all_news = soup.find_all('a', 'nuEeue hzdq5d ME7ew')
    # Headline anchors are matched by Google's (obfuscated) CSS class names.
    all_news = soup.find_all('a', 'ipQwMb Q7tWef')
    key_str = ""
    titles_link = []
    word_t_list = []
    documents = []
    # Collect (title, absolute link) pairs for every headline.
    # NOTE(review): `link` is rebound inside this loop, shadowing the
    # parameter — presumably intentional, as the parameter is not used again.
    for news in all_news:
        # print(news.string)
        # print(news['href'])
        if re.match('\./', news['href']) is None:
            link = news['href']
        else:
            # Relative "./..." hrefs are resolved against news.google.com.
            link = 'https://news.google.com/' + re.sub('\./', "", news['href'])
        titles_link.append({'title': news.string, 'link': link})
        key_str = key_str + news.string + "\n"
    # Noise words (site names, boilerplate) dropped after segmentation.
    remove_words = [
        'mlb', 'nba', '新聞網', '中央社', '報紙', '聯合', '時報', '全網', '自己',
        '中時', '年月日', '直播', '三立', '聞網', '使用者', '中國時報', '自由時報',
        '關鍵字', '網站', '發表', '留言', '發言', '網小時', '自由'
    ]
    # Custom dictionaries / stop-word lists for the jieba segmenter.
    jieba.load_userdict("my_dict.txt")
    jieba.load_userdict("news_dict.txt")
    jieba.analyse.set_stop_words("stop_words.txt")
    jieba.analyse.set_stop_words("stop_words_sport.txt")
    for t_link in titles_link:
        print('get_web_page: ', t_link['title'], " ", t_link['link'])
        try:
            page = get_web_page_html(t_link['link'])
            # page = get_web_page(t_link['link'])
        except requests.exceptions.SSLError:
            continue  # skip articles whose TLS handshake fails
        except lxml.etree.ParserError:
            continue  # skip pages lxml cannot parse
        if page is None:
            continue
        # Keep only <p> text: links/images killed, generic wrappers unwrapped.
        cleaner.kill_tags = ['a', 'img']
        cleaner.remove_tags = ['div', 'p']
        cleaner.remove_unknown_tags = False
        cleaner.allow_tags = ['p']
        result = html.tostring(cleaner.clean_html(page), encoding="utf-8",
                               pretty_print=True, method="html")
        article_content = re.sub(' ', "", result.decode('utf-8'))
        # article_content = re.sub(u'[^\u4E00-\u9FA5]', " ", article_content)
        # Drop whitespace, non-word characters and common pronouns.
        article_content = re.sub(r'[\n\xa0\W你妳我他她它們]', "", article_content)
        article_content = re.sub('自己', "", article_content)
        # print(article_content)
        words_t = jieba.cut(article_content, cut_all=False)
        word_t_list = [word for word in words_t if word
                       not in remove_words]
        print(word_t_list)
        documents.append(word_t_list)
    return documents