forked from maliubiao/simple_http
/
etree_util.py
102 lines (83 loc) · 2.61 KB
/
etree_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#! /usr/bin/env python
#-*-encoding=utf-8-*-
import re
import sys
from lxml import etree
import pdb
def _match_one(tree, nodes, selector):
    """Return the elements matched by one selector.

    The first character of ``selector`` chooses the matching mode:

    * ``.attr-regex`` -- elements whose ``attr`` attribute matches ``regex``.
      Without a ``-`` (``.regex``) the ``class`` attribute is searched.
    * ``>N`` or ``>N-M`` -- elements whose source line is ``N``, or lies in
      the inclusive range ``N``..``M``.
    * ``-regex`` -- elements whose text matches ``regex``.
    * ``,xpath`` -- the result of ``tree.xpath(xpath)``.
    * anything else -- elements whose tag equals the whole selector.

    Args:
        tree: parsed document root; used by the line, text and xpath modes.
        nodes: list of elements; scanned by the attribute and tag modes.
        selector: the selector string described above.

    Returns:
        A new list of matching elements; empty when ``selector`` is empty.

    Raises:
        ValueError: a line selector does not contain valid integers.
        re.error: an attribute or text selector is not a valid regex.
    """
    if not selector:
        # Nothing to dispatch on; indexing selector[0] would raise IndexError.
        return []

    prefix = selector[0]
    sc = selector[1:]

    # find by attr
    if prefix == ".":
        if "-" in sc:
            cat, _, value = sc.partition("-")
        else:
            # No attribute name given: fall back to the class attribute.
            cat, value = "class", sc
        pattern = re.compile(value)
        return [
            node for node in nodes
            if pattern.search(node.attrib.get(cat, ""))
        ]

    # find by line number
    if prefix == ">":
        low, dash, high = sc.partition("-")
        smin = int(low)
        smax = int(high) if dash else smin
        elements = []
        for _, node in etree.iterwalk(tree, tag="*", events=("start", )):
            line = node.sourceline
            # sourceline can be None; such nodes never fall inside a range.
            if line is not None and smin <= line <= smax:
                elements.append(node)
        return elements

    # find by text
    if prefix == "-":
        # Byte-string selectors (e.g. Python 2 argv) are decoded once so the
        # pattern can be matched against text; text selectors pass through.
        if isinstance(sc, bytes):
            sc = sc.decode("utf-8")
        pattern = re.compile(sc, re.UNICODE)
        return [
            node
            for _, node in etree.iterwalk(tree, tag="*", events=("start", ))
            if node.text and pattern.search(node.text)
        ]

    # find by xpath
    if prefix == ",":
        return list(tree.xpath(sc))

    # find by tag
    return [node for node in nodes if node.tag == selector]
def get_xpath(node):
    """Return the path that ``node``'s root tree reports for ``node``."""
    root_tree = node.getroottree()
    return root_tree.getpath(node)
def query_element(content, selector):
    """Parse ``content`` as HTML and return the elements ``selector`` matches.

    ``selector`` is either a single selector understood by ``_match_one`` or
    a bracketed, comma-separated list such as ``[div, .class-nav, >10-20]``.
    For a list, the matches of each sub-selector are concatenated in order,
    so an element matched by several sub-selectors appears more than once.

    Because ``,`` is the list separator, xpath selectors (which use ``,`` as
    their prefix) cannot be used inside a list; pass them on their own.

    Args:
        content: HTML document handed to ``etree.HTML``.
        selector: single selector or bracketed list of selectors.

    Returns:
        A list of matching elements; empty when the document has no root
        element or the selector is empty.
    """
    tree = etree.HTML(content)
    # The HTML parser can produce no root element for an empty document.
    if tree is None or not selector:
        return []

    nodes = [
        node
        for _, node in etree.iterwalk(tree, tag="*", events=("start", ))
    ]

    if not selector.startswith("["):
        return _match_one(tree, nodes, selector)

    # Remove exactly one enclosing bracket pair.  Stripping every trailing
    # "]" would also eat the end of a regex character class in the last
    # sub-selector, e.g. "[div, -item[0-9]]".
    inner = selector[1:]
    if inner.endswith("]"):
        inner = inner[:-1]

    elements = []
    for part in inner.split(","):
        part = part.strip()
        if part:  # skip blanks left by "[]", "[a,]" or "[a,,b]"
            elements.extend(_match_one(tree, nodes, part))
    return elements
def toutf8(s):
    """Encode text to UTF-8 bytes; return any other value unchanged.

    Args:
        s: a text string, a byte string, or any other object (e.g. ``None``).

    Returns:
        UTF-8 encoded bytes when ``s`` is text, otherwise ``s`` itself.
    """
    # The text type is ``unicode`` on Python 2; that name does not exist on
    # Python 3, where ``str`` is the text type.
    try:
        text_type = unicode
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s.encode("utf-8")
    return s
def dump_node(node):
    """Print a node's tag, source line, attributes, text and xpath.

    Args:
        node: element exposing ``tag``, ``sourceline``, ``attrib`` and
            ``text``, and accepted by ``get_xpath``.
    """
    # The line is formatted with %s rather than %d: sourceline is None when
    # the parser recorded no line number, and %d would raise TypeError.
    # print(...) with one parenthesised argument behaves exactly like the
    # Python 2 print statement.
    print("tag: %s\n, line: %s\n, attrib: %s\n, text: %s\n, xpath: %s" % (
        toutf8(node.tag),
        node.sourceline,
        str(node.attrib),
        toutf8(node.text),
        get_xpath(node),
    ))
def main():
    """Command-line entry point: dump every element matching a selector.

    Expects ``sys.argv[1]`` to be the path of an HTML file and
    ``sys.argv[2]`` a selector accepted by ``query_element``.  With fewer
    arguments it prints a usage line and exits with status 0.
    """
    if len(sys.argv) < 3:
        print("usage etree_util.py htmlfile selector")
        # sys.exit is always available; the bare exit() helper is only
        # installed by the site module.
        sys.exit(0)

    path, selector = sys.argv[1], sys.argv[2]
    # Read raw bytes so the HTML parser, not the platform's text mode,
    # decides how to decode the document; ``with`` closes the file even if
    # the read fails.
    with open(path, "rb") as f:
        content = f.read()

    for element in query_element(content, selector):
        print("===============")
        dump_node(element)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()