def remove_code_block(s):
    """Return the plain text of HTML string ``s`` with <code> elements removed.

    The markup is parsed with Jsoup, every element matching the ``code``
    selector is detached from the document (together with its content),
    and the text of what remains is returned.
    """
    from org.jsoup import Jsoup

    parsed = Jsoup.parse(s)
    # Elements.remove() detaches every matched element from the DOM.
    parsed.select("code").remove()
    return parsed.text()
def so_text(s):
    """Strip <code> elements and all HTML markup from an SO body.

    The input is first passed through ``unescape_html`` (defined elsewhere;
    presumably decodes HTML entities -- confirm), then parsed with Jsoup.
    Every ``code`` element is removed along with its content and the
    remaining document text is returned.
    """
    from org.jsoup import Jsoup

    document = Jsoup.parse(unescape_html(s))
    for code_node in document.select("code"):
        code_node.remove()
    return document.text()
def so_tokenizer(s, remove_html=True, as_str=True):
    """Tokenize ``s`` into a de-duplicated collection of terms.

    Steps:
      1. If ``remove_html`` is true, run ``unescape_html`` on the input and
         reduce it to its plain text via Jsoup.
      2. Tokenize the text with ``tokenize`` and drop duplicate tokens.
      3. For each token, collect the pieces returned by
         ``camel_case_split`` (pieces are kept as returned, not lower-cased)
         plus the lower-cased token itself.
      4. Discard every term found in ``java_stopwords`` and de-duplicate.

    Returns a single space-joined string when ``as_str`` is true, otherwise
    the set of terms. The order of terms in the string follows set
    iteration order and is therefore unspecified.
    """
    if remove_html:
        from org.jsoup import Jsoup
        s = Jsoup.parse(unescape_html(s)).text()

    candidates = []
    for word in set(tokenize(s)):
        candidates += camel_case_split(word)
        candidates.append(word.lower())

    kept = set(term for term in candidates if term not in java_stopwords)
    return " ".join(kept) if as_str else kept
import sys
import os
import test1
from org.jsoup import Jsoup
from com.pixshow.framework.utils import HttpUtility

# Page to download.
url = "http://en.wikipedia.org/"

# Print whatever test1.workDir() returns.
print(test1.workDir())

# Fetch the page and parse it into a Jsoup document.
html = HttpUtility.get(url)
doc = Jsoup.parse(html)

# Serialise the elements matching '#mp-itn b a' back to an HTML string.
html = doc.select('#mp-itn b a').toString()

# NOTE(review): appContext is not defined in this script; presumably it is
# injected by the embedding host -- confirm.
appContext.get('testService').save(html)
def clean_question(html):
    """Return the plain text of ``html`` without the content of <code> tags.

    The markup is parsed with Jsoup, every ``code`` element is emptied
    (its child nodes are dropped; the element itself stays in the tree),
    and the document text is returned, which strips all remaining tags.
    """
    parsed = Jsoup.parse(html)
    for code_node in parsed.select("code"):
        code_node.empty()
    return parsed.text()
def __init__(self, answer):
    """Store the raw answer and its parsed Jsoup document.

    ``inline`` and ``block`` start out as two separate empty lists; this
    constructor does not populate them.
    """
    self.answer = answer
    self.inline, self.block = [], []
    # Parse last, after the plain attributes are in place.
    self.doc = Jsoup.parse(answer)
def remove_html_tags(s):
    """Return the text content of HTML string ``s`` with all tags stripped.

    Parsing and text extraction are delegated to Jsoup.
    """
    from org.jsoup import Jsoup

    document = Jsoup.parse(s)
    return document.text()
node["node"].replaceWith(new_div) break if len(argv) < 4: infile = "/Users/mac/Downloads/im" outfile = "/Users/mac/Downloads/dialogues.html" textfile = "/Users/mac/Downloads/dialogues.txt" else: infile = argv[1] outfile = argv[2] textfile = argv[3] with iopen(outfile, "w", encoding="utf-8", errors="ignore") as output: input = File(infile) soup = Jsoup.parse(input, "UTF-8", "") # First, create a new document new_doc = Jsoup.parse("<body></body>") new_doc.updateMetaCharsetElement(True) new_doc.charset(Charset.forName("UTF-8")) new_body = new_doc.select("body").first() for element in soup.select("*"): if (element.tag().toString() == "ul" and element.className() == "ui_clean_list im-mess-stack--mess _im_stack_messages") or ( element.tag().toString() == "div" and element.className() == "im-mess-stack--pname"): new_body.appendChild(element) # Then remove empty tags from it and transform the labels