Exemplo n.º 1
0
def remove_code_block(s):
    """Return the text of HTML string *s* without any <code> elements.

    The markup is parsed with jsoup, every element matched by the ``code``
    selector is detached from the parsed tree, and the text of what is left
    is returned.
    """
    # Function-scope import, so the jsoup package is only needed when called.
    from org.jsoup import Jsoup

    parsed = Jsoup.parse(s)
    code_nodes = parsed.select("code")
    # Detach the matches one at a time via Element.remove().
    for node in code_nodes:
        node.remove()
    return parsed.text()
Exemplo n.º 2
0
def so_text(s):
    """Return the plain text of a post body with <code> elements left out.

    *s* is first passed through ``unescape_html``, then parsed with jsoup.
    Every element matched by the ``code`` selector is detached from the
    parsed tree, and the text of the remaining document (no markup) is
    returned.
    """
    # Function-scope import, so the jsoup package is only needed when called.
    from org.jsoup import Jsoup

    parsed = Jsoup.parse(unescape_html(s))
    code_nodes = parsed.select("code")
    for node in code_nodes:
        node.remove()
    return parsed.text()
Exemplo n.º 3
0
def so_tokenizer(s, remove_html=True, as_str=True):
    """Tokenize *s* into a de-duplicated collection of non-stopword terms.

    :param s: text to tokenize; treated as HTML when *remove_html* is true.
    :param remove_html: when true, *s* is run through ``unescape_html`` and
        reduced to its jsoup text content before tokenizing.
    :param as_str: when true, return the terms joined by single spaces;
        otherwise return them as a ``set``.
    :return: the terms produced by ``camel_case_split`` for each distinct
        token, plus the lower-cased token itself, minus anything found in
        ``java_stopwords``. Terms are unique; their order is that of set
        iteration.
    """
    if remove_html:
        # Function-scope import, so jsoup is only needed on this path.
        from org.jsoup import Jsoup
        s = Jsoup.parse(unescape_html(s)).text()

    expanded = []
    for word in set(tokenize(s)):
        # Keep both the camel-case pieces and the lower-cased whole token.
        expanded.extend(camel_case_split(word))
        expanded.append(word.lower())

    kept = [term for term in expanded if term not in java_stopwords]
    unique_terms = set(kept)

    if not as_str:
        return unique_terms
    return " ".join(unique_terms)
Exemplo n.º 4
0
import sys
import os

import test1

from org.jsoup import Jsoup
from com.pixshow.framework.utils import HttpUtility

# Page whose markup is fetched below.
url = "http://en.wikipedia.org/"

# Print whatever the test1 helper reports as its working directory.
print(test1.workDir())

# Download the page and parse it with jsoup.
doc = Jsoup.parse(HttpUtility.get(url))

# Keep only the elements matching the selector, serialised as a string.
html = doc.select('#mp-itn b a').toString()

# Hand the extracted markup to the 'testService' object from appContext.
appContext.get('testService').save(html)
Exemplo n.º 5
0
def clean_question(html):
	"""Return the text of *html* with the contents of <code> elements dropped.

	The markup is parsed with jsoup, each element matched by the ``code``
	selector is emptied (its child nodes are removed), and the text of the
	resulting document is returned.
	"""
	parsed = Jsoup.parse(html)
	for code_tag in parsed.select("code"):
		code_tag.empty()
	return parsed.text()
Exemplo n.º 6
0
	def __init__(self, answer):
		"""Store *answer*, start with empty lists and parse the answer with jsoup.

		:param answer: value kept on ``self.answer`` and passed to
			``Jsoup.parse``  (presumably an HTML answer body -- confirm
			against callers).
		"""
		self.answer = answer
		# Both collections start empty; nothing in this method fills them.
		self.inline, self.block = [], []
		self.doc = Jsoup.parse(answer)
Exemplo n.º 7
0
def remove_html_tags(s):
    """Return the text content of *s* after parsing it as HTML with jsoup."""
    # Function-scope import, so the jsoup package is only needed when called.
    from org.jsoup import Jsoup

    parsed = Jsoup.parse(s)
    plain_text = parsed.text()
    return plain_text
                    node["node"].replaceWith(new_div)
                    break


# Take the input, HTML-output and text-output paths from argv when at least
# four entries are present (presumably sys.argv, i.e. script name plus three
# arguments -- confirm); otherwise fall back to the hard-coded defaults.
if len(argv) >= 4:
    infile, outfile, textfile = argv[1], argv[2], argv[3]
else:
    infile = "/Users/mac/Downloads/im"
    outfile = "/Users/mac/Downloads/dialogues.html"
    textfile = "/Users/mac/Downloads/dialogues.txt"

with iopen(outfile, "w", encoding="utf-8", errors="ignore") as output:
    input = File(infile)
    soup = Jsoup.parse(input, "UTF-8", "")

    # First, create a new document
    new_doc = Jsoup.parse("<body></body>")
    new_doc.updateMetaCharsetElement(True)
    new_doc.charset(Charset.forName("UTF-8"))
    new_body = new_doc.select("body").first()

    for element in soup.select("*"):
        if (element.tag().toString() == "ul" and element.className()
                == "ui_clean_list im-mess-stack--mess _im_stack_messages") or (
                    element.tag().toString() == "div"
                    and element.className() == "im-mess-stack--pname"):
            new_body.appendChild(element)

    # Then remove empty tags from it and transform the labels