Exemplo n.º 1
0
def parse(filename, window_width=1000):
    """Convert an MS Office file and parse the converted output.

    The file is first converted to HTML with the ``tika`` command-line tool
    and the result is handed to ``html.parse``.  If tika exits with a
    non-zero status, the file is converted to PDF with ``unoconv`` instead
    and handed to ``pdf.parse``.  The intermediate temporary file is removed
    before the function returns or raises.

    Args:
        filename: Path of the file to convert.
        window_width: Passed through unchanged to ``html.parse`` /
            ``pdf.parse``.

    Returns:
        Whatever ``html.parse`` or ``pdf.parse`` returns for the converted
        file.

    Raises:
        PreprocError: If both tika and unoconv exit with a non-zero status.
    """
    # Bound before the ``try`` so the ``finally`` clause never trips over an
    # unbound name (and masks the real error) when creating the temporary
    # file itself fails.
    tmp_fname = None
    try:
        # delete=False: the file must outlive the ``with`` block so the
        # parser (or unoconv) can reopen it by name.
        with tempfile.NamedTemporaryFile(delete=False) as f:
            tmp_fname = f.name
            subprocess.check_call(
                [
                    settings.TIKA_PREFIX + 'tika', '--encoding=utf-8',
                    '--html', filename
                ],
                stdout=f)
        return html.parse(tmp_fname, window_width)
    except subprocess.CalledProcessError as err:
        logger.warning(
            'Could not convert MSOffice file "%s" using tika because of %s, trying unoconv...'
            % (filename, err))
        try:
            # Reuse the same temporary path as unoconv's output file.
            subprocess.check_call(
                ['unoconv', '-fpdf', '-o', tmp_fname, filename])
            return pdf.parse(tmp_fname, window_width)
        except subprocess.CalledProcessError as unoconv_err:
            logger.error(
                'Could not convert MSOffice file "%s" using unoconv because of %s'
                % (filename, unoconv_err))
            raise PreprocError()
    finally:
        if tmp_fname and os.path.exists(tmp_fname):
            os.remove(tmp_fname)
Exemplo n.º 2
0
    def parse(self):
        """Rebuild the page state from ``self.body``.

        Lexes and parses the body, loads the default style rules, creates a
        fresh JavaScript interpreter, runs the runtime and every script
        found in the document, and finally triggers a relayout.
        """
        # Only the lexing step is timed under the "HTML" label; building the
        # node tree happens after the timer is stopped.
        self.timer.start("HTML")
        self.text = lex(self.body)
        self.timer.stop()
        self.nodes = parse(self.text)

        # Style rules, ordered by the score of each rule's first element.
        self.timer.start("Parse CSS")
        self.rules = parse_css(DEFAULT_STYLE)
        self.timer.stop()
        self.rules.sort(key=lambda rule: rule[0].score())

        # Everything from interpreter creation to the last script is timed
        # under the "JS" label.
        self.timer.start("JS")
        self.js = dukpy.JSInterpreter()
        self.js_handles = {}

        # Host functions must be exported before any script is evaluated.
        self.js.export_function("log", print)
        self.js.export_function("querySelectorAll", self.js_querySelectorAll)

        # The runtime goes first, then each script referenced by the page.
        self.js.evaljs(DEFAULT_JS)
        for script_ref in find_scripts(self.nodes, []):
            # Script references are resolved against the current page URL.
            script_url = relative_url(script_ref, self.history[-1])
            host, port, path, _fragment = parse_url(script_url)
            _headers, source = request('GET', host, port, path)
            self.js.evaljs(source)
        self.timer.stop()

        self.relayout()
Exemplo n.º 3
0
def get_links(url, page, domain=False, noquery=False):
    '''Collect the links to HTML pages found in ``page``.

    A link is kept only if its scheme is http, https, ftp or absent, it has
    a network location or a path, and its path has no extension or ends in
    ``.htm`` / ``.html``.  Links without a network location are resolved
    against ``url``; links with a network location but no scheme get the
    ``http`` scheme.

    If ``domain`` is True, every link that carries its own network location
    is skipped, so only links relative to ``url`` are returned.  If
    ``noquery`` is True, links with a query component are skipped.

    Returns ``(linkset, None)`` on success, where ``linkset`` is a set of
    URL strings, or ``(None, message)`` when ``page`` cannot be parsed.
    '''
    try:
        tree = html.parse(page, HTMLNode)
    except Exception as ex:
        return (None, str(ex))
    if tree is None:
        return (None, 'HTML Parsing Error')

    linkset = set()
    for link in tree.get_links():
        parts = up.urlsplit(link)
        ext = os.path.splitext(parts.path)[1].lower()
        if parts.scheme.lower() not in ('http', 'https', 'ftp', ''):
            continue
        if not (parts.netloc or parts.path):
            continue
        if ext not in ('', '.htm', '.html'):
            continue
        # The query test is applied to the link as written, before it is
        # resolved against the base URL.
        if noquery and parts.query:
            continue

        if not parts.netloc:
            parts = up.urlsplit(up.urljoin(url, link))
        elif domain:
            # NOTE(review): this also drops absolute links that point at the
            # same host as ``url`` -- confirm that is intended.
            continue
        elif not parts.scheme:
            # Scheme-less link with a host ("//host/path"): set the scheme on
            # the split result.  Prefixing the string with "http://" would
            # produce "http:////host/path" and lose the host.
            parts = parts._replace(scheme='http')
        linkset.add(parts.geturl())
    return (linkset, None)
Exemplo n.º 4
0
def render_to_image(html_source, css_source, height, width, renderer):
    """Render HTML styled by CSS into an image.

    Both sources are parsed, the tree is styled with the parsed rules, and
    a layout tree is built for every styled node whose display is not
    ``layout.Display.NONE``.  Each layout tree is laid out against a default
    ``layout.Dimensions`` whose content width is ``width``, and the result
    is rendered by a ``painting.Renderer``.

    Args:
        html_source: Input handed to ``html.parse``.
        css_source: Input handed to ``css.parse``.
        height: Passed to ``painting.Renderer`` as its second argument.
        width: Content width of the root box, and the first argument of
            ``painting.Renderer``.
        renderer: Passed to ``painting.Renderer`` as its third argument.

    Returns:
        The value returned by ``painting.Renderer.render``.
    """
    dom = html.parse(html_source)
    stylesheet = css.parse(css_source)
    styled_nodes = style.style(dom, stylesheet)

    # Nodes that are not displayed never get a layout box.
    boxes = []
    for styled in styled_nodes:
        if layout.get_display(styled) is layout.Display.NONE:
            continue
        boxes.append(layout.build_layout_tree(styled))

    # Only the width of the root box is constrained.
    viewport = layout.Dimensions.default()
    viewport.content.width = width
    for box in boxes:
        box.layout(viewport)

    painter = painting.Renderer(width, height, renderer)
    return painter.render(boxes)
def get_metrics_for_project(project_name, data_dir, output):
    """Load the metrics embedded in an Understand ``index.html`` report.

    Reads ``<data_dir>/index.html``, serializes the first child of the
    document root's first child (expected to be the ``<script>`` inside
    ``<head>``), takes the text between ``metrics=`` and the next ``;`` and
    decodes it as a JSON list of ``{"name": ..., "value": ...}`` entries.

    Args:
        project_name: Stored in ``output`` under ``'Project Name'``.
        data_dir: Directory containing the report's ``index.html``.
        output: Dictionary that is updated in place with the results.

    Returns:
        ``output``, extended with ``'Project Name'``, ``'Core'`` and one
        entry per non-empty metric (with ``%`` characters removed).
        ``'Core'`` is False when the ``'Architecture Type'`` metric is
        ``'Hierarchical'`` or ``'Multi-Core'``, True otherwise.

    Raises:
        RuntimeError: If ``data_dir`` is not a directory or contains no
            ``index.html``.
        KeyError: If the report has no ``'Architecture Type'`` metric.
    """
    if not os.path.isdir(data_dir):
        raise RuntimeError(
            "Could not access understand results in directory: " + data_dir)

    # INTENTIONALLY USING FORWARD SLASH, THIS WORKS FOR WINDOWS
    # DO NOT CHANGE TO OS.PATH STUFF. UNDERSTAND WANTS FORWARD SLASHES
    # EVEN IN WINDOWS. -DJC 2018-06-08
    index_file = data_dir + '/index.html'
    if not os.path.isfile(index_file):
        # Concatenate so the exception carries one readable message instead
        # of a (prefix, path) argument tuple.
        raise RuntimeError("Understand results not found: " + index_file)

    # Keep the file open only while the document is being parsed.
    with open(index_file) as htmlfile:
        logger.info("\tGathering metrics: " + index_file)

        output['Project Name'] = project_name
        # grab <head><script>
        block = html.tostring(html.parse(htmlfile).getroot()[0][0])

    script_text = block.decode()
    metrics = json.loads(script_text.split("metrics=")[1].split(';')[0])

    # Convert from name/value tags to dictionary
    metrics_dict = {}
    for metric in metrics:
        name = metric['name']
        val = metric['value']
        # Blank values are left out so that a missing metric is easier to
        # catch later.
        if name != "Project Name" and val:
            metrics_dict[name] = val.replace('%', '')

    # Remove percentage information and only show core or central as
    # appropriate
    arch_type = metrics_dict['Architecture Type']
    output['Core'] = arch_type not in ('Hierarchical', 'Multi-Core')

    output.update(metrics_dict)
    return output
Exemplo n.º 6
0
def tokenize_body(msg, config):
    """Turn the body of an email message into a list of tokens.

    A multipart message is tokenized part by part through ``tokenize``.
    For a single ``text/*`` part with a non-empty decoded payload the result
    contains, in order:

    * for ``text/html`` only: the "HTML"-mangled tag tokens returned by
      ``html.parse`` (attributes in ``banned_attrs`` removed, each token cut
      to 251 characters), or the single token ``"BUGhtml"`` if that fails;
    * the "BODY"-mangled words of 3 to 20 characters;
    * if ``config.double`` is set, the "BODY"-mangled lower-cased pairs of
      adjacent words that are both 3 to 20 characters long;
    * the tokens contributed by every entry of ``config.bodies``.

    Returns:
        A list of tokens.  The list is empty for non-text parts and for
        parts whose decoded payload is empty.
    """
    if msg.is_multipart():
        rv = []
        for part in msg.get_payload():
            rv += tokenize(part, config)
        return rv

    content_type = msg.get("content-type", "text/plain")
    if not content_type.startswith("text/"):
        # Always hand back a list so callers can concatenate the result.
        return []
    payload = msg.get_payload(decode=True)
    if not payload:
        return []

    tokens = []
    if content_type.startswith("text/html"):
        try:
            (payload, tags) = html.parse(payload)
            tags = [(x, y) for (x, y) in tags if x not in banned_attrs]
            tags = ["%s=%s" % (x, y) if y else x for (x, y) in tags]
            tokens += mangle("HTML", [x[:251] for x in tags])
        except Exception:
            # Broken HTML is itself a signal; record it and fall back to
            # stripping tags with a regular expression.
            tokens += ["BUGhtml"]
            try:
                payload = html_tag_re.sub("", payload)
            except Exception:
                # Best effort only: keep the payload as it is.
                pass

    words = word_re.findall(payload)
    tokens += mangle("BODY", [x for x in words if 3 <= len(x) <= 20])
    if len(words) > 1 and config.double:
        # Lower-case the formatted pair; calling .lower() on the format
        # string itself would leave the words untouched.
        tokens += mangle("BODY",
                         [("%s %s" % (x, y)).lower()
                          for (x, y) in zip(words[:-1], words[1:])
                          if 3 <= len(x) <= 20 and 3 <= len(y) <= 20])

    for _key, body in config.bodies.iteritems():
        tokens += body.get_tokens(payload)
    return tokens
Exemplo n.º 7
0
import requests, html

# Call into the imported ``html`` module while the name still refers to it.
# NOTE(review): the standard-library ``html`` module does not define
# ``parse`` -- confirm which ``html`` module this snippet expects.
print(html.parse("<body>"))
# From here on ``html`` is bound to the value returned by ``requests.get``,
# so the imported module is no longer reachable under that name.
html = requests.get(
    "https://example.com/")  ## bad: overwrites imported package name
# Prints the object returned by ``requests.get``, not the module.
print(html)
Exemplo n.º 8
0
# Compare the HTML extracted from two parsed copies of a project's files.
#
# Usage: <script> PROJECT_NAME PARSED_DIR1 PARSED_DIR2
#
# "<PROJECT_NAME>-modified.txt" in the current directory lists one path per
# line.  Each path is appended to both parsed directories; when both files
# exist, they are parsed and their HTML content is compared.
import json
import os
import subprocess
import sys

import eval_php
import html

project_name = sys.argv[1]
parsed_dir1 = sys.argv[2]
parsed_dir2 = sys.argv[3]

pwd = os.getcwd()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(pwd)

with open(project_name + "-modified.txt") as f:
    for line in f:
        # Drop the line terminator: with it left in place the paths below
        # would end in a newline and never match a file on disk.
        rel_path = line.rstrip("\r\n")
        parsed_path1 = pwd + "/" + parsed_dir1 + rel_path
        parsed_path2 = pwd + "/" + parsed_dir2 + rel_path
        print(parsed_path1 + " " + parsed_path2)

        if os.path.isfile(parsed_path1) and os.path.isfile(parsed_path2):
            root1 = eval_php.parse_php(parsed_path1)
            content1, parser1 = html.parse(root1)

            root2 = eval_php.parse_php(parsed_path2)
            content2, parser2 = html.parse(root2)

            if content1 == content2:
                print("===> " + rel_path)
            else:
                print("HTML : " + content1)