parse.py — 59 lines (48 loc) · 1.57 KB
#!/usr/bin/python
import sys
from BeautifulSoup import BeautifulSoup
def unescape_xhtml(s):
    """Decode XHTML entities in *s* and return the text UTF-8 encoded.

    The Livejournal stream contains &apos;, which the default HTML entity
    set does not cover, hence convertEntities=XHTML_ENTITIES.
    """
    soup = BeautifulSoup(
        '<html>' + s, convertEntities=BeautifulSoup.XHTML_ENTITIES)
    text = soup.contents[0].string
    # .string can come back empty/None; normalize before encoding so we
    # always hand callers a real byte string rather than a
    # BeautifulSoup NavigableString.
    if not text:
        text = u""
    return text.encode("utf-8")
def get_url(s):
    """Extract the feed URL from a feed-info markup line.

    The URL is everything after the first '>' (the end of the opening
    tag), once soft line-break markers are removed.
    """
    cleaned = s.replace("<wbr></wbr>", "")
    url = cleaned.split(">", 1)[1]
    # The payload must be plain text by now, not further markup.
    assert '<' not in url, url
    assert '>' not in url, url
    if '&' in url:
        url = unescape_xhtml(url)
    if url.startswith("feed://"):
        # Feed Directory has some feed:// results, but Reader treats feed://
        # and http:// as the same feed.
        url = url.replace("feed://", "http://", 1)
    return url
def yield_urls(fh, take_everything):
    """Yield feed URLs from Feed Directory result markup read from *fh*.

    Each feed-result-stats line decides whether the feed-info line that
    follows it is emitted; with take_everything, "Unknown" feeds are
    kept as well.
    """
    UNKNOWN, TAKE, SKIP = range(3)
    # Start in TAKE when taking everything, so the parser still works if
    # a grep upstream filtered out everything but the feed URL lines.
    state = TAKE if take_everything else UNKNOWN
    for line in fh:
        if '<div class="feed-result-stats"><span class="number">' in line:
            unknown = '<span class="number">Unknown</span>' in line
            state = SKIP if (unknown and not take_everything) else TAKE
        elif line.startswith('<div class="feed-info">'):
            if state == SKIP:
                continue
            if state == UNKNOWN:
                raise RuntimeError("Got feed-info %r before feed-result-stats?" % (line,))
            yield get_url(line.rstrip())
def main():
    """Read Feed Directory result markup from stdin, print one feed URL per line.

    Take everything because Google actually does have data for some
    "Unknown" feeds.
    """
    for url in yield_urls(sys.stdin, take_everything=True):
        # print(url) with a single argument produces identical output under
        # Python 2 (parenthesized print statement) and Python 3 (function
        # call), unlike the bare `print url` statement, which is a syntax
        # error on Python 3.
        print(url)


if __name__ == '__main__':
    main()