def clean(file_name, directory="."): content = open(file_name, "r").read() article = Extractor(content, loglevel=logging.INFO).extracted() #article = cgi.escape(article).encode('ascii', 'xmlcharrefreplace') #return article if article is None: print "Error processing html file" sys.exit(1) html_parser = html.HTMLParser(encoding="utf-8") html_doc = html.fromstring(content, parser=html_parser) head_doc = html_doc.find('head') source_url = head_doc.cssselect('link[rel="canonical"]')[0].get('href') reconstructed_body = u"<html><body>" + article.replace( "<h2", "<h1").replace("</h2>", "</h1>") + u"</body></html>" source_header_string = "<h3>Source</h3>" source_link = "<p><a href='" + source_url + "' rel='tag'>" + source_url + "</a></p>" # further remove useless stuff body_doc = html.fromstring(reconstructed_body).find('body') try: post_content_doc = body_doc.xpath("//div[@class='post-content']")[0] post_content_doc.append(lxml.etree.XML(source_header_string)) post_content_doc.append(lxml.etree.XML(source_link)) except: print file_name basename = os.path.basename(file_name) cleaned_file = os.path.splitext(basename)[0] + "_cleaned.html" #out = html.tostring(head_doc) + html.tostring(body_doc) result = html.tostring(body_doc) with codecs.open(directory + cleaned_file, 'w', 'utf-8') as cleaned_file_handle: cleaned_file_handle.write(result)
def clean(file_name, directory="."): content = codecs.open(file_name, "r", "utf-8").read() article = Extractor(content, loglevel=logging.INFO).extracted() if article is None: print "Error processing html file" sys.exit(1) html_parser = html.HTMLParser(encoding="utf-8") html_doc = html.fromstring(content, parser=html_parser) head_doc = html_doc.find('head') source_url = head_doc.cssselect('meta[property="og:url"]')[0].get( 'content') reconstructed_body = u"<html><body>" + article.replace( "<h2", "<h1").replace("</h2>", "</h1>") + u"</body></html>" source_header_string = "<h3>Source</h3>" source_link = "<p><a href='" + source_url + "' rel='tag'>" + source_url + "</a></p>" # further remove useless stuff body_doc = html.fromstring(reconstructed_body).find('body') for bad in body_doc.xpath("//div[@class='comments-main']"): bad.getparent().remove(bad) for ad_by_google in body_doc.xpath("//ins[@class='adsbygoogle']"): ad_by_google.getparent().remove(ad_by_google) for bad_h3 in body_doc.xpath("//h3"): bad_h3.getparent().remove(bad_h3) post_content_doc = body_doc.xpath("//div[@class='post-content']")[0] post_content_doc.append(lxml.etree.XML(source_header_string)) post_content_doc.append(lxml.etree.XML(source_link)) basename = os.path.basename(file_name) cleaned_file = os.path.splitext(basename)[0] + "_cleaned.html" result = html.tostring(body_doc) with codecs.open(directory + cleaned_file, 'w', 'utf-8') as cleaned_file_handle: cleaned_file_handle.write(result)
def clean(file_name, directory="."): cleaned_file = os.path.splitext(basename)[0] + "_cleaned.html" # don't clean files that already have been cleaned if os.path.isfile(cleaned_file): return content = codecs.open(file_name, "r", 'utf-8').read() head_pos = content.find('<head>') # insert the encoding of the file content = content[:head_pos + 6] + '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">' + content[ head_pos + 6:] article = Extractor(content, loglevel=logging.INFO).extracted() if article is None: print "Error processing html file" sys.exit(1) html_parser = html.HTMLParser(encoding="utf-8") html_doc = html.fromstring(content, parser=html_parser) head_doc = html_doc.find('head') source_url = head_doc.cssselect('meta[property="og:url"]')[0].get( 'content') title = html_doc.find('.//title').text_content() # if the title is unfortunately removed by boilerpipy, then add it back in if "h2" not in article: article = "<h1>" + title[:title.rfind('-')] + "</h1>" + article reconstructed_body = "<html><body>" + article.replace( "<h2", "<h1").replace("</h2>", "</h1>") + "</body></html>" source_header_string = "<h3>Source</h3>" source_link = "<p><a href='" + source_url + "' rel='tag'>" + source_url + "</a></p>" # further remove useless stuff body_doc = html.fromstring(reconstructed_body).find('body') for bad in body_doc.xpath("//div[@class='comments-main']"): bad.getparent().remove(bad) for ad_by_google in body_doc.xpath("//ins[@class='adsbygoogle']"): ad_by_google.getparent().remove(ad_by_google) for bad_h3 in body_doc.xpath("//h3"): bad_h3.getparent().remove(bad_h3) for pre_tag in body_doc.xpath("//pre"): if 'class' in pre_tag.attrib: pre_tag.attrib.pop('class') if 'title' in pre_tag.attrib: pre_tag.attrib.pop('title') post_content_doc = body_doc.xpath("//div[@class='entry-content']")[0] post_content_doc.append(lxml.etree.XML(source_header_string)) post_content_doc.append(lxml.etree.XML(source_link)) basename = os.path.basename(file_name) result = 
html.tostring(body_doc) # replace <code> with <code><pre> for styling later. result = result.replace('<pre>', '<pre> <code>').replace('</pre>', '</code> </pre>') with open(directory + cleaned_file, 'w') as cleaned_file_handle: cleaned_file_handle.write(result.encode('utf-8'))
def clean(file_name, directory="."): basename = os.path.basename(file_name) content = codecs.open(file_name, "r", 'utf-8').read() head_pos = content.find('<head>') # insert the encoding of the file content = content[:head_pos+6] + '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">' + content[head_pos+6:] article = Extractor(content, loglevel = logging.INFO).extracted() if article is None: print "Error processing html file" sys.exit(1) html_parser = html.HTMLParser(encoding="utf-8") html_doc = html.fromstring(content, parser=html_parser) head_doc = html_doc.find('head') published_time = head_doc.cssselect('meta[property="article:published_time"]')[0].get('content')[:-6] print published_time cleaned_file = os.path.splitext(basename)[0] + "_" + published_time + "_cleaned.html" # don't clean files that already have been cleaned if os.path.isfile(cleaned_file): return source_url = head_doc.cssselect('meta[property="og:url"]')[0].get('content') title = html_doc.find('.//title').text_content() # if the title is unfortunately removed by boilerpipy, then add it back in if "h2" not in article: article = "<h1>" + title[:title.rfind('-')] + "</h1>" + article reconstructed_body = "<html><body>" + article.replace("<h2", "<h1").replace("</h2>", "</h1>") + "</body></html>" source_header_string = "<h3>Source</h3>" source_link = "<p><a href='" + source_url +"' rel='tag'>" + source_url + "</a></p>" # further remove useless stuff body_doc = html.fromstring(reconstructed_body).find('body') for bad in body_doc.xpath("//div[@class='comments-main']"): bad.getparent().remove(bad) for ad_by_google in body_doc.xpath("//ins[@class='adsbygoogle']"): ad_by_google.getparent().remove(ad_by_google) for bad_h3 in body_doc.xpath("//h3"): bad_h3.getparent().remove(bad_h3) for pre_tag in body_doc.xpath("//pre"): if 'class' in pre_tag.attrib: pre_tag.attrib.pop('class') if 'title' in pre_tag.attrib: pre_tag.attrib.pop('title') post_content_doc = 
body_doc.xpath("//div[@class='entry-content']")[0] post_content_doc.append(lxml.etree.XML(source_header_string)) post_content_doc.append(lxml.etree.XML(source_link)) result = html.tostring(body_doc) # replace <code> with <code><pre> for styling later. result = result.replace('<pre>', '<pre> <code>').replace('</pre>', '</code> </pre>') with open(directory + cleaned_file, 'w') as cleaned_file_handle: cleaned_file_handle.write(result.encode('utf-8'))
def clean(content):
    """Return the cleaned article markup extracted from *content*.

    A UTF-8 charset meta tag is injected after <head>, the page is run
    through the boilerplate extractor, tab headings are demoted to bold
    paragraphs, comment/ad/h3 noise is pruned, and a "Source" link is
    appended to the entry-content div.  Exits with status 1 when no
    article can be extracted.
    """
    # Inject a charset declaration right after <head> so parsing is UTF-8.
    insert_at = content.find('<head>') + 6
    content = (content[:insert_at]
               + '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
               + content[insert_at:])
    article = Extractor(content, loglevel=logging.INFO).extracted()
    if article is None:
        print("Error processing html file.")
        sys.exit(1)
    parser = html.HTMLParser(encoding="utf-8")
    document = html.fromstring(content, parser=parser)
    head = document.find('head')
    source_url = head.cssselect('meta[property="og:url"]')[0].get('content')
    title = document.find('.//title').text_content()
    # Turn per-language tab headings into plain bold paragraphs.
    for label in ("C++", "C", "C/C++", "Java", "Python"):
        article = article.replace(
            '<h1 class="tabtitle">' + label + '</h1>',
            '<p><strong>' + label + '</strong></p>')
    # if the title is unfortunately removed by boilerpipy, then add it back in
    if "h2" not in article:
        article = "<h1>" + title[:title.rfind('-')] + "</h1>" + article
    reconstructed_body = ("<html><body>"
                          + article.replace("<h2", "<h1").replace("</h2>", "</h1>")
                          + "</body></html>")
    if "<body><h1>" not in reconstructed_body:
        reconstructed_body = reconstructed_body.replace(
            "<body>", "<body><h1>" + title[:title.rfind('-')] + "</h1>")
    source_header_string = "<h3>Source</h3>"
    source_link = "<p><a href='" + source_url + "' rel='tag'>" + source_url + "</a></p>"
    # further remove useless stuff
    body = html.fromstring(reconstructed_body).find('body')
    for selector in ("//div[@class='comments-main']",
                     "//ins[@class='adsbygoogle']",
                     "//h3"):
        for node in body.xpath(selector):
            node.getparent().remove(node)
    for pre in body.xpath("//pre"):
        if 'class' in pre.attrib:
            pre.attrib.pop('class')
        if 'title' in pre.attrib:
            pre.attrib.pop('title')
    target = body.xpath("//div[@class='entry-content']")[0]
    target.append(lxml.etree.XML(source_header_string))
    target.append(lxml.etree.XML(source_link))
    rendered = html.tostring(body)
    # replace <code> with <code><pre> for styling later.
    return rendered.replace('<pre>', '<pre> <code>').replace('</pre>', '</code> </pre>')
def clean(file_name, directory="."): content = codecs.open(file_name, "r", 'utf-8').read() head_pos = content.find('<head>') # HERE is the key: insert the encoding of the file and everything works out ;) content = content[:head_pos+6] + '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">' + content[head_pos+6:] article = Extractor(content, loglevel = logging.INFO).extracted() if article is None: print "Error processing html file" sys.exit(1) html_parser = html.HTMLParser(encoding="utf-8") html_doc = html.fromstring(content, parser=html_parser) head_doc = html_doc.find('head') source_url = head_doc.cssselect('meta[property="og:url"]')[0].get('content') title = html_doc.find('.//title').text_content() # if the title is unfortunately removed by boilerpipy, then add it back in if "h2" not in article: article = "<h1>" + title[:title.rfind('-')] + "</h1>" + article reconstructed_body = "<html><body>" + article.replace("<h2", "<h1").replace("</h2>", "</h1>") + "</body></html>" source_header_string = "<h3>Source</h3>" source_link = "<p><a href='" + source_url +"' rel='tag'>" + source_url + "</a></p>" # further remove useless stuff body_doc = html.fromstring(reconstructed_body).find('body') for bad in body_doc.xpath("//div[@class='comments-main']"): bad.getparent().remove(bad) for ad_by_google in body_doc.xpath("//ins[@class='adsbygoogle']"): ad_by_google.getparent().remove(ad_by_google) for bad_h3 in body_doc.xpath("//h3"): bad_h3.getparent().remove(bad_h3) post_content_doc = body_doc.xpath("//div[@class='post-content']")[0] post_content_doc.append(lxml.etree.XML(source_header_string)) post_content_doc.append(lxml.etree.XML(source_link)) basename = os.path.basename(file_name) cleaned_file = os.path.splitext(basename)[0] + "_cleaned.html" result = html.tostring(body_doc) with open(directory + cleaned_file, 'w') as cleaned_file_handle: cleaned_file_handle.write(result.encode('utf-8'))
def clean(file_name, directory="."): content = codecs.open(file_name, "r", 'utf-8').read() article = Extractor(content, loglevel=logging.INFO).extracted() if article is None: print "Error processing html file" sys.exit(1) html_parser = html.HTMLParser(encoding="utf-8") html_doc = html.fromstring(content, parser=html_parser) head_doc = html_doc.find('head') source_url = head_doc.cssselect('link[rel="canonical"]')[0].get('href') title = html_doc.find('.//title').text_content() # if the title is unfortunately removed by boilerpipy, then add it back in if "h2" not in article: article = "<h1>" + title[:title.rfind('-')] + "</h1>" + article reconstructed_body = "<html><body>" + article.replace( "<h2", "<h1").replace("</h2>", "</h1>") + "</body></html>" source_header_string = "<h3>Source</h3>" source_link = "<p><a href='" + source_url + "' rel='tag'>" + source_url + "</a></p>" # further remove useless stuff body_doc = html.fromstring(reconstructed_body).find('body') try: post_content_doc = body_doc.xpath("//div[@class='post-content']")[0] post_content_doc.append(lxml.etree.XML(source_header_string)) post_content_doc.append(lxml.etree.XML(source_link)) except: print file_name basename = os.path.basename(file_name) cleaned_file = os.path.splitext(basename)[0] + "_cleaned.html" result = html.tostring(body_doc) with open(directory + cleaned_file, 'w') as cleaned_file_handle: cleaned_file_handle.write(result.encode('utf-8'))
def clean(file_name, directory="."): content = codecs.open(file_name, "r", "utf-8").read() article = Extractor(content, loglevel=logging.INFO).extracted() if article is None: print "Error processing html file" sys.exit(1) html_parser = html.HTMLParser(encoding="utf-8") html_doc = html.fromstring(content, parser=html_parser) head_doc = html_doc.find("head") source_url = head_doc.cssselect('link[rel="canonical"]')[0].get("href") title = html_doc.find(".//title").text_content() # if the title is unfortunately removed by boilerpipy, then add it back in if "h2" not in article: article = "<h1>" + title[: title.rfind("-")] + "</h1>" + article reconstructed_body = "<html><body>" + article.replace("<h2", "<h1").replace("</h2>", "</h1>") + "</body></html>" source_header_string = "<h3>Source</h3>" source_link = "<p><a href='" + source_url + "' rel='tag'>" + source_url + "</a></p>" # further remove useless stuff body_doc = html.fromstring(reconstructed_body).find("body") try: post_content_doc = body_doc.xpath("//div[@class='post-content']")[0] post_content_doc.append(lxml.etree.XML(source_header_string)) post_content_doc.append(lxml.etree.XML(source_link)) except: print file_name basename = os.path.basename(file_name) cleaned_file = os.path.splitext(basename)[0] + "_cleaned.html" result = html.tostring(body_doc) with open(directory + cleaned_file, "w") as cleaned_file_handle: cleaned_file_handle.write(result.encode("utf-8"))
def clean(content):
    """Clean a raw HTML page string and return the reduced article markup.

    Injects a UTF-8 charset meta tag, extracts the article body, rewrites
    tab headings as bold paragraphs, prunes comments/ads/h3 noise and
    theme attributes, appends a "Source" link, and returns the serialized
    body.  Exits with status 1 when no article can be extracted.
    """
    # Make the declared encoding explicit right after <head>.
    head_end = content.find('<head>') + 6
    charset_meta = ('<meta http-equiv="Content-Type" '
                    'content="text/html; charset=UTF-8">')
    content = content[:head_end] + charset_meta + content[head_end:]
    article = Extractor(content, loglevel=logging.INFO).extracted()
    if article is None:
        print("Error processing html file.")
        sys.exit(1)
    page = html.fromstring(content,
                           parser=html.HTMLParser(encoding="utf-8"))
    source_url = page.find('head').cssselect(
        'meta[property="og:url"]')[0].get('content')
    title = page.find('.//title').text_content()
    # Rewrite per-language tab headings as plain bold paragraphs.
    heading_rewrites = [
        ('<h1 class="tabtitle">C++</h1>', '<p><strong>C++</strong></p>'),
        ('<h1 class="tabtitle">C</h1>', '<p><strong>C</strong></p>'),
        ('<h1 class="tabtitle">C/C++</h1>', '<p><strong>C/C++</strong></p>'),
        ('<h1 class="tabtitle">Java</h1>', '<p><strong>Java</strong></p>'),
        ('<h1 class="tabtitle">Python</h1>', '<p><strong>Python</strong></p>'),
    ]
    for old_markup, new_markup in heading_rewrites:
        article = article.replace(old_markup, new_markup)
    # if the title is unfortunately removed by boilerpipy, then add it back in
    if "h2" not in article:
        article = "<h1>" + title[:title.rfind('-')] + "</h1>" + article
    promoted = article.replace("<h2", "<h1").replace("</h2>", "</h1>")
    reconstructed_body = "<html><body>" + promoted + "</body></html>"
    if "<body><h1>" not in reconstructed_body:
        reconstructed_body = reconstructed_body.replace(
            "<body>", "<body><h1>" + title[:title.rfind('-')] + "</h1>")
    source_header_string = "<h3>Source</h3>"
    source_link = "<p><a href='" + source_url + "' rel='tag'>" + source_url + "</a></p>"
    # further remove useless stuff
    body = html.fromstring(reconstructed_body).find('body')
    for xpath_expr in ("//div[@class='comments-main']",
                       "//ins[@class='adsbygoogle']",
                       "//h3"):
        for unwanted in body.xpath(xpath_expr):
            unwanted.getparent().remove(unwanted)
    # Strip theme-added styling attributes from code blocks.
    for code_block in body.xpath("//pre"):
        if 'class' in code_block.attrib:
            code_block.attrib.pop('class')
        if 'title' in code_block.attrib:
            code_block.attrib.pop('title')
    entry = body.xpath("//div[@class='entry-content']")[0]
    entry.append(lxml.etree.XML(source_header_string))
    entry.append(lxml.etree.XML(source_link))
    serialized = html.tostring(body)
    # replace <code> with <code><pre> for styling later.
    serialized = serialized.replace('<pre>', '<pre> <code>')
    return serialized.replace('</pre>', '</code> </pre>')