-
Notifications
You must be signed in to change notification settings - Fork 0
/
ch7.WebPageScrapper.py
57 lines (51 loc) · 1.71 KB
/
ch7.WebPageScrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# Ask once for the site to work on; every function below reads the
# module-level ``url`` that is built here.
_typed_address = input('Enter a website url: ').strip()
# Prepend the default scheme only when the user did not type one, so that
# "http://example.com" does not turn into "http://http://example.com".
if _typed_address.lower().startswith(('http://', 'https://')):
    url = _typed_address
else:
    url = 'http://' + _typed_address
def urlcontent(page_url=None):
    """Fetch a page and print its raw body, its lines, and its headers.

    Args:
        page_url: Address to fetch.  When omitted, the module-level ``url``
            (built from the user's input) is used, which is how the script
            itself calls this function.

    Raises:
        urllib.error.URLError: If the address cannot be opened; this
            includes ``urllib.error.HTTPError`` for HTTP error statuses.
    """
    import io
    import urllib.request

    if page_url is None:
        page_url = url

    with urllib.request.urlopen(page_url) as response:
        body = response.read()
        headerinfo = response.info()

    print(body)
    # The response is a one-shot stream: calling readlines() after read()
    # always produced an empty list.  Re-split the bytes that were already
    # read so the line list actually reflects the page.
    content = io.BytesIO(body).readlines()
    print(content)
    print('\n\nHeader:\n\n')
    print(headerinfo)
def urlretrieve(page_url=None, filename='urlcontent'):
    """Download a page and save it to a local file.

    Args:
        page_url: Address to download.  When omitted, the module-level
            ``url`` (built from the user's input) is used.
        filename: Path of the file to write.  The original code passed the
            *function* ``urlcontent`` here, which cannot be opened as a
            path; the string of the same name is used as the default on
            the assumption that this was the intended file name.

    Returns:
        The ``(local_path, headers)`` tuple produced by
        ``urllib.request.urlretrieve``.

    Raises:
        urllib.error.URLError: If the address cannot be opened.
    """
    import urllib.request

    if page_url is None:
        page_url = url

    # urlretrieve() returns a plain tuple, not a context manager, and it
    # closes its own connection, so there is nothing to wrap in ``with``.
    return urllib.request.urlretrieve(page_url, filename=filename)
def urlparser(page_url=None):
    """Fetch a page and print its visible text, with the markup removed.

    The Python 2 version of this demo relied on ``htmllib`` together with
    the ``formatter`` module; neither is usable on current Python 3
    (``formatter`` was removed in 3.10 and ``html.parser.HTMLParser`` takes
    no formatter argument), so the text extraction is done here with a
    small ``HTMLParser`` subclass.

    Output format: runs of whitespace are collapsed to single spaces, a new
    output line is started at block-level tags, and the contents of
    ``<script>`` and ``<style>`` elements are skipped.

    Args:
        page_url: Address to fetch.  When omitted, the module-level ``url``
            (built from the user's input) is used.

    Raises:
        urllib.error.URLError: If the address cannot be opened.
    """
    import sys
    import urllib.request
    from html.parser import HTMLParser

    class _TextWriter(HTMLParser):
        """Write the text nodes of a document to ``stream``, line by line."""

        # Elements whose content is code, not readable text.
        _HIDDEN = frozenset(('script', 'style'))
        # Elements that start or end a visual line.
        _BREAKS = frozenset((
            'address', 'article', 'aside', 'blockquote', 'br', 'dd', 'div',
            'dl', 'dt', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5',
            'h6', 'header', 'hr', 'li', 'main', 'nav', 'ol', 'p', 'pre',
            'section', 'table', 'td', 'th', 'title', 'tr', 'ul',
        ))

        def __init__(self, stream):
            super().__init__()
            self._stream = stream
            self._words = []   # words of the line currently being built
            self._hidden = 0   # nesting depth inside script/style

        def _flush(self):
            """Emit the pending line, if it has any words."""
            if self._words:
                self._stream.write(' '.join(self._words) + '\n')
                self._words = []

        def handle_starttag(self, tag, attrs):
            if tag in self._HIDDEN:
                self._hidden += 1
            elif tag in self._BREAKS:
                self._flush()

        def handle_endtag(self, tag):
            if tag in self._HIDDEN:
                # Tolerate a stray closing tag without going negative.
                self._hidden = max(0, self._hidden - 1)
            elif tag in self._BREAKS:
                self._flush()

        def handle_data(self, data):
            if not self._hidden:
                self._words.extend(data.split())

        def close(self):
            # The base class may deliver buffered text during close(), so
            # flush only after it has finished.
            super().close()
            self._flush()

    if page_url is None:
        page_url = url

    with urllib.request.urlopen(page_url) as response:
        raw = response.read()
        charset = response.info().get_content_charset() or 'utf-8'

    # Decode properly instead of str(bytes), which would hand the parser
    # the "b'...'" repr with escaped newlines rather than the page text.
    try:
        data = raw.decode(charset, errors='replace')
    except LookupError:
        # The server announced a charset name Python does not know.
        data = raw.decode('utf-8', errors='replace')

    ptext = _TextWriter(sys.stdout)
    ptext.feed(data)
    ptext.close()
def urlparser2(page_url=None):
    """Fetch a page and print the target of every hyperlink on it.

    The Python 2 version of this demo read ``anchorlist`` from
    ``htmllib.HTMLParser``; ``html.parser.HTMLParser`` in Python 3 has no
    such attribute and takes no formatter argument, so the links are
    collected here with a small ``HTMLParser`` subclass.

    Each non-empty ``href`` of an ``<a>`` tag is printed on its own line,
    in document order and exactly as written in the page (relative links
    are not resolved).

    Args:
        page_url: Address to fetch.  When omitted, the module-level ``url``
            (built from the user's input) is used.

    Raises:
        urllib.error.URLError: If the address cannot be opened.
    """
    import urllib.request
    from html.parser import HTMLParser

    class _AnchorCollector(HTMLParser):
        """Record the href of every <a> tag in ``anchorlist``."""

        def __init__(self):
            super().__init__()
            self.anchorlist = []

        def handle_starttag(self, tag, attrs):
            if tag != 'a':
                return
            for name, value in attrs:
                # Named anchors (<a name=...>) and empty hrefs carry no
                # link target, so they are left out.
                if name == 'href' and value:
                    self.anchorlist.append(value)

    if page_url is None:
        page_url = url

    # ``with`` guarantees the connection is closed even if read() fails.
    with urllib.request.urlopen(page_url) as response:
        raw = response.read()
        charset = response.info().get_content_charset() or 'utf-8'

    # HTMLParser.feed() needs text, not the bytes returned by read().
    try:
        data = raw.decode(charset, errors='replace')
    except LookupError:
        # The server announced a charset name Python does not know.
        data = raw.decode('utf-8', errors='replace')

    ptext = _AnchorCollector()
    ptext.feed(data)
    ptext.close()
    for link in ptext.anchorlist:
        print(link)
# Run every demo once, in the order the functions are defined; an
# exception raised by one demo stops the ones after it.
# NOTE: the original author marked the last two (urlparser, urlparser2)
# as failing on Python 3.5 while working on Python 2.7.
for demo in (urlcontent, urlretrieve, urlparser, urlparser2):
    demo()