/
Using Python to Access Web Data.py
1083 lines (933 loc) · 43.7 KB
/
Using Python to Access Web Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 23 16:17:00 2019
@author: D17911
"""
from os import chdir, getcwd
getcwd() #'C:\\Users\\D17911'
chdir('\\\\WIL-HOMEDRIVE01\D17911$\Desktop\Courses\Python for Data Science\Week 1')
getcwd() ## '\\\\WIL-HOMEDRIVE01\\D17911$\\Desktop\\Courses\\Python for Data Science\\Week 1'
## Using Python to Access Web Data
## Chapter 11 - Regular Expressions
## In computing, a regular expression, also referred to as 'regex' or 'regexp', provides a concise and flexible means for matching strings of text, such as particular characters, words, or patterns of characters.
## A regular expression is written in a formal language that can be interpreted by a regular expression processor.
## Really clever wild card expressions for matching and parsing strings
## Understanding Regular Expressions
## 1. Very powerful and quite cryptic
## 2. Fun once you understand them
## 3. Regular expressions are a language unto themselves
## 4. A language of 'marker characters' - programming with characters
## 5. It is kind of an 'old school' langugae - compact
## Regular Expression Quick Guide
^ ## Matches the beginning of a line
$ ## Matches the end of a line
. ### Matches any character
\s ## Matches whitespace
\S ## Matches any non-whitespace character
* ## Repeats a character zero or more times
*? ## Repeats a character zero or more times (non-greedy)
+ ## Repeats a character one or more times
+? ## Repeats a character one or more times(non-greedy)
[aeiou] ## Matches a single character in the listed set
[^XYZ] ## Matches a single character not in the listed set
[a-z0-9] ## The set of characters can include a range
( ## Indicates where string extraction is to start
) ## Indicates where string extraction is to end
## The Regular Expression Module
## Before you can use regular expressions in your program, you must import the library using 'import re'.
## You can use re.search() -- give yes or no answer -- to see if a string matches a regular expression, similar to using the find() method for strings
## You can use re.findall() to extract portions of a string that match your regular expression, similar to a combination of find() and slicing: var[5:10]
## Using re.search() Like find()
str.find() ## If substring exists inside the string, it returns the idnex of first occurence of the substring; If does not exist, return -1
hand = open('mbox-short.txt')
for line in hand:
line = line.rstrip()
if line.find('From:') >= 0 :
print(line)
## Using regular expression, the same as above
import re
hand = open('mbox-short.txt')
for line in hand :
line = line.rstrip()
if re.search('From:', line): ## This returns True or False
print(line)
## Using re.search() Like startswith(), start with string
hand = open('mbox-short.txt')
for line in hand:
line = line.rstrip()
if line.startswith('From:') :
print(line)
## We fine-tune what is matched by adding special characters to the string
import re
hand = open('mbox-short.txt')
for line in hand :
line = line.rstrip
if re.search('^From:', line): ## Tweek matching strings, ^ beginning of the line
print(line)
## Wild-Card Characters
## The dot character matches any character
## If you add the asterisk character, the character is 'any number of times', zero or more time
^X.*: ## Looking for lines that has 'X' at the beginning, followed by any number of characters, followed by a ":"
^ and .* are special characters
^ ## Match the start of the line
. ## Match any character
* ## Many times
Outputs:
X-Sieve: CMU Sieve 2.3
X-DSPAM-Result: Innocent
X-DSPAM-Confidence: 0.8475
X-Content-Type-Message-Body: text/plain
X-Plane is behind schedule: two weeks
## Now we do not want to have 'X-Plane is behind schedule: two weeks' because of space in strings
## Fine-Tuning Your Match
## Depending on how 'clean' your data is and the purpose of your application, you may want to narrow your match down a bit
X-Sieve: CMU Sieve 2.3
X-DSPAM-Result: Innocent
## We use:
^X-\S+:
^ ## Match the start of the line of 'X-'
\S ## Match any non-whitespace character
+ ## One or more times >= 1
## Matching and Extracting Data(Not True or False)
## re.search() returns a True/False depending on whether the string matches the regular expression
## If we actually want the matching strings to be extracted, we use re.findall()
[0-9]+ ## One or more digits, [0-9] means any single digit between 0 to 9, + means one or more digits
[0-9.]+ ## One or more digits and period, [0-9] means any single digit between 0 to 9, + means one or more digits
import re
x = 'My 2 favourite numbers are 19 and 42'
y = re.findall('[0-9]+', x) ## It gives back a list, it is like a split in a for loop
print(y) ## ['2', '19', '42'] it is a list of strings, if nothing it will give us a blank list
y = re.findall('[AEIOU]+', x) ## Find one or more than one uppercase vowels, answer: []
## Warning: Greedy Matching
## The repeat characters (* and +) push outward in both directions(greedy) to match the largest possible string
import re
x = 'From: Using the : character'
y = re.findall('^F.+:', x)
print(y) ## ['From: Using the :']
^F.+: ## First character in the match is an F, Last character in the match is a :, .+ means one or more characters
## Why not output 'From:', greedy matching will give the larger thing, '.+' is pushing as large as possible and then still match the entire expression
y = re.findall('^F\S+:', x)
print(y) ## ['From:']
## Non-Greedy Matching: prefer the shortest
## Not all regular expression repeat codes are greedy!
## If you add a '?' character, the + and * chill out a bit
import re
x = 'From: Using the : character'
y = re.findall('^F.+?:', x)
print(y) ## ['From:']
^F.+?: ## ^F first character in the match is an F, '.+?' one or more characters but not greedy, ':' last character in the match in a :
## Fine-Tuning String Extraction
## You can refine the match for re.findall() and separately determine which portion of the match is to be extracted by using parentheses
x = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'
y = re.findall('\S+@\S+', x)
print(y) ## ['stephen.marquard@uct.ac.za']
\S+@\S+ ## At least one non-whitespace character: \S@\S will have d@u outcome
y = re.findall('\S@\S+?', x)
print(y) ## ['d@u'] Non-Greedy
y = re.findall('\S+?@\S+', x)
print(y) ## ['stephen.marquard@uct.ac.za']
## Parentheses are not part of the match - but they tell where to start and stop what string to extract
y = re.findall('^From (\S+@\S+)', x)
print(y) ## ['stephen.marquard@uct.ac.za']
^From (\S+@\S+) ## ( means start extracting after the space, require that find starts of From and extracting strings I want
y = re.findall('^From \S+@\S+', x)
print(y) ## ['From stephen.marquard@uct.ac.za']
## Old ways of Extracting a host name - using find and string slicing and position index
data = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'
atpos = data.find('@')
print(atpos) #21
sppos = data.find(' ', atpos)
print(sppos) #31
host = data[atpos+1 : sppos]
print(host) ## uct.ac.za
## The Double Split Pattern
## Sometimes we split a line one way, and then grab one of the pieces of the line and split that piece again
words = data.split()
email = words[1]
pieces = words.split('@')
print(pieces[1]) ## uct.ac.za
## The Regex Version
import re
y = re.findall('@([^ ]*)', data)
print(y) ## ['uct.ac.za']
y = re.findall('@(\S*)', data)
print(y) ## ['uct.ac.za']
'@([^ ]*)' ## @ starts with @, () extract beginning and end, [^ ] Match non-blank character, * Match many of them
[^ ] ## ^ means Not Everything but -- not everything but space, which means non-blank, similar \S
## Even Cooler Regex Version, pick lines and extract data
import re
y = re.findall('^From .*@([^ ]*)', data) ## Not only extract the string but also the line filtering
print(y) ## ['uct.ac.za']
'^From .*@([^ ]*)'
## Starting at the beginning of the line, looking for the string 'From '
## Pick Lines and Extracting Data
## Spam Confidence
import re
handle = open('mbox-short.txt')
numlist = list()
for line in handle:
line = line.rstrip()
stuff = re.findall('^X-DSPAM-Confidence: ([0-9.]+)', line)
if len(stuff) != 1:
continue
num = float(stuff[0])
numlist.append(num)
print('Maximum:', max(numlist)) ## Maximum: 0.9907
## Match special(real) characters: A dollar sign, an asterisk, bracket, plus or a dot
## Escape Character
## If you want a special regular expression character to just behave normally (most of the time) you prefix it with '\'
import re
x = 'We just received $10.00 for cookies.'
y = re.findall('\$[0-9.]+', x)
print(y) ## ['$10.00']
\$[0-9.]+ ## \$ means A real dollar sign [0-9.] means A digit or period + means At least one or more
## $ has meaning of regular expressionand \$ means just a dollar sign
x = 'From a9 a 9 $ @'
y = re.findall('[a-z0-9]', x)
print(y)
## Chapter 12
## Networked Technology, Network Architecture
## Communications between two applications, computer program and server(PHP, java)
## TCP Connections/Sockets
## 'In computer networking, an Internet socket or network socket is an endpoint of a bidirectional inter-process comunication flow across an Internet Protocol-based computer network, such as the Internet'
## TCP Port Numbers
## A port is an application-specific or process-specific software communications endpoint
## It allows multiple networked applications to coexist on the same server
## There is a list of well-known TCP port numbers (like the extension phone numbers)
## We play with the Web Server which is port 80, port 443 is the secure HTTPS
## Common TCP Ports
## Telnet(23) - lOGIN IMAP(143/220/993) - Mail
## SSH(22) - Secure POP(109/110) - Mail Retrieval
## HTTP(80) DNS(53) - Domain Name
## SMTP(25)(Mail) FTP(21) - File Transfer
## Sockets in Python
## Python has built-in support for TCP Sockets
import socket
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) ## ready to connect to the far end, but not connected yet
## mysock is an object
mysock.connect(('data.pr4e.org', 80))
('data.pr4e.org', 80) ## (Host -- Domain Name, Port)
## Above is dialing the phone, make connection
## Communicate far off application in Python
## Hypertext Transfer Protocol(HTTP)
## Application Protocal: The socket have made us from transport layer to application layer
## Since TCP( and Python) gives us a reliable socket, what do we want to do with the socket? What problem do we want to solve?
## Application Protocols (Mail, World Wide Web)
## There are different rules of talking to different applications (Mail, World Wide Web)
# HTTP - Hypertext Transfer Protocol
## The dominant Application Layer Protocol on the Internet
## Invented for the Web - to Retrieve HTML, Images, Documents, etc
## Extended to be data in addition to documents - RSS, Web Services, etc.. Basic Concept - Make a Connection - Request a Document - Retrieve the Document - Close the Connection
## HTTP: is the set of rules to allow browsers to retieve web documents from servers over the Internet
## One of the things about HTTP protocol is look at the Uniform Resource Locator (URL):
http://www.dr-chuck.com/page1.htm
http:// ## use the HTTP Protocol
www.dr-chuck.com ## host name
/page1.htm ## get the document
## Getting Data From the Server : Get the document, retrieve it, parse it and display for you on the browser
## Each the user clicks on an anchor tag with an href= value to switch to a new page, the browser makes a connection to the web server and issues a 'GET' request - to GET the content of the page at the specific URL
## The server returns the HTML(Hypertext Markup Language) document to the browser which formats and displays the document to the user
## Internect Standards (Build the protocols)
## The standards for all of the internet protocols(inner workings) are developed by an organization
## Internet Engineering Task Forse(IETF)
www.ietf.org
## Standards are called ;RCFs' - 'Request for Comments'
## Making an HTTP request
## Connect to the server like www.dr-chuck.com
## Request a document (or the default document)
- GET http://www.dr-chuck.com/page 1.htm HTTP/1.0 ## GET URL protocol
- GET http://www.mlive/com/ann-arbor/ HTTP/1.0
- GET http://www.facebook.com HTTP/1.0
## Install Telnet on Windows which can connect to any server
$ telnet data.pr4e.org 80
## Connecting here....
GET http://www.dr-chuck.com/page 1.htm HTTP/1.0
## Press Enter
## Then it will give the information and outputs of this page
## Outputs: headers (blank line) contents
HTTP/1.1 200 OK
Date: Thu, 08 Jan 2015 01:57:52 GMT
Last-Modified: Sun, 19 Jan 2014 14:25:43 GMT
Connection: close
Content-Type: text/html
<h1>The First Page</h1>
<p>If you like, you can switch to
the <a href="http://data.pr4e.org/page2.htm">Second Page</a>.</p>
Connection closed by foreign host ## not text, shows connection close
## An HTTP Request in Python
import socket
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) ## ready to connect to the far end, but not connected yet
## mysock is an object
mysock.connect(('data.pr4e.org', 80)) ## two parentheses
cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode() ## call method, prepare for sending
## two new line r\n\r\n: means enter enter, is what I did when we are in Telnet
mysock.send(cmd) ## send the staff to the server
while True: ## The output of while loop is the METADATA
data = mysock.recv(512) ## receive up to 512 characters
if (len(data) < 1):
break ## ends the loop until go to the file final
print(data.decode()) ## decode means take the data outside work and interpret what it means internally for us
mysock.close() ## close the socket
## In program, socket has four functions: socket, conect, send, recv
## socket: doorway get out of your computer
## connects: extends out of your computer and could fail if server does not exist
## send: send the data first to the server, might need several sends to print stuff out
## recv: receive is a method in the socket object
## Worked Example: Sockets
## if you have problem with '\r\n\r\n', you can change to '\n\n'
## Do the things in telnet
## Hack the html of this link: http://data.pr4e.org/romeo.txt
telnet data.pr4e.org 80 ## Enter
http://data.pr4e.org/romeo.txt ## hit Enter Twice
## go to this link ib browser and use developer: Javascript Console, we will see the header and document body
## Do the thing in Python
import socket
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect(('data.pr4e.org', 80)) ## a tuple in the parentheses
cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode()
## Strings inside Python which is UNICODE, so we have to send them out in UTF-8 format
## encode() converts from unicode internally to UTF-8
mysock.send(cmd)
while True: ## The output of while loop is the METADATA
data = mysock.recv(512) ## receive up to 512 characters
if (len(data) < 1):
break ## ends the loop until go to the file final
print(data.decode()) ## decode means take the data outside work and interpret what it means internally for us
## decode() is the opposite of encode, converts UTF-8 to unicode
mysock.close()
## Using the Developer Console to Explore HTTP
## Open this website : http://data.pr4e.org/page1.htm
## Use the developer tools adn look at the 'Network'- Headers, Preview, Response, Timing
## Below is the contents for 'Web Applications'
https://www.wa4e.com/code/arrays/guess.php
## add parameters
https://www.wa4e.com/code/arrays/guess.php?guess=12
## these parameters that go on in addition to the URL, called get parameters or query string parameters, we can add more at the end
https://www.wa4e.com/code/arrays/guess.php?guess=12&x=14&abc=22 ## in the Query String Parameters we will get 3 strings
## Status Code can be changed
## Try a bad URL:
https://www.wa4e.com/zap
### Status Code changed to 404
## Explore another Status Code
https://www.wa4e.com/code/route/redir1.php
## Status Code is 302 means send other thing back to server and redirect to this other page. Now is the wrong page and will direct to right page
## The response of Content-Type can be image/jpeg or text/html, text/plain
## Programs that Surf the Web
## Unicode Characters and Strings
## ASCII Maps: American Standard Code for Information Interchange
## Example: From 0 to 128, which means you can not put every character into a 0 through 127
New line: LF number 10 Bin: 0001010
Letter: H number 72 Bin: 1001000
Letter: e number 101 Bin: 1100101
## Representing Simple Strings
## Each character is represented by a number between 0 and 256 stored in 8 bits of memory
## We refer to '8 bits of memory as one 'byte' of memory - (i.e. my disk drive contains 3 Terabytes of memory)
## The ord() function tells us the numeric value(real number) of a simple ASCII character
ord() ## ordinal()
print(ord('H')) ## 72
print(ord('e')) ## 101
print(ord('\n')) ## 10
'Hi' < 'aaa' ## True
## Unicode Characters and Strings
## Unicode -- Universal Code used for thousands of characters for computers to exchange data from different places
## Multi-Byte Characters
## To represent the wide range of characters computers must handle we represent characters with more than one byte
### UTF-16 -- Fixed Length - Two bytes
### UTF-32 -- Fixed Length - Four Bytes
### UFT-8 -- 1~4 Bytes
#### Upwards compatible with ASCII
#### Automatic detection between ASCII and UTF-8
#### UTF-8 is recommended practice for encoding data to be exchanged between systems
## In Python edition 2, there are two kinds of strings: str and unicode
## In Python 3, all strings are Unicode
Python 2.7.10
x = '编程' ## regular string
type(x) ## <type 'str'>
x = u'编程' ## unicode string
type(x) ## <type 'unicode'>
Python 3.5.1
x = '编程'
type(x) ## <class 'str'>
x = u'编程'
type(x) ## <class 'str'>
Python 2.7.10 ## Byte String is the same with Regular String and they are different from Unicode String
x = b'abc'
type(x) ## <type 'str'>
x = '编程' ## regular string
type(x) ## <type 'str'>
x = u'编程' ## unicode string
type(x) ## <type 'unicode'>
Python 3.5.1 ## Byte String is different from Regular String and Regular String is the same with Unicode String
x = b'abc'
type(x) ## <class 'bytes'>
x = '编程'
type(x) ## <class 'str'>
x = u'编程'
type(x) ## <class 'str'>
## Python 3 and Unicode
## In Python 3, all strings internally are UNICODE
## Working with string variables in Python programs and reading from files usually 'just works'
## When we talk to a network resource using sockets to talk to a database we have to encode(send) and decode(receive) data(usually to UTF-8)
## When we talk to an external resource like a network socket we send bytes, so we need to encode Python 3 strings into a given character encoding
## WWhen We read data from an external resource, we must decode it based on the character set so it is properly represented in Python 3 as a string
while True: ## The output of while loop is the METADATA
data = mysock.recv(512) ## receive up to 512 characters, data is Bytes
if (len(data) < 1):
break ## ends the loop until go to the file final
mystring = data.decode() ## decode means take the data outside work and interpret what it means internally for us
print(mystring) ## mystring is Unicode, decode() converts bytes(UTF-8 or ASCII) to Unicode
## decode() is the opposite of encode, converts UTF-8 to unicode
bytes.decode()
str.encode()
## Retrieving Web Pages
## Using urllib in Python
## Since HTTP is so common, we have a library that does all the socket work for us and mkaes web pages look like a file
import urllib.request, urllib.parse, urllib.error
fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')
for line in fhand:
print(line.decode().strip()) ## line is the byte array and decode it
## urllib treats things like a file
import urllib.request, urllib.parse, urllib.error
fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')
count=dict()
for line in fhand:
words = line.decode().split()
for word in words:
count[word] = count.get(word, 0) + 1
print(count)
## Reading Web Pages, not only yexy/plain file but also html file
## If you want to review the header, you need to ask for it in urllib by a different way
import urllib.request, urllib.parse, urllib.error
fhand = urllib.request.urlopen('http://www.dr-chuck.com/page1.htm')
for line in fhand:
print(line.decode().strip())
## HTML is ugly and does not work well with regular expressions
## Worked Example: Using Urllib -- Web Crawler @ Google
## Example 1
import urllib.request, urllib.parse, urllib.error
fhand=urllib.request.urlopen('http://data.pr4e.org/romeo.txt')
for line in fhand:
print(line.decode().strip())
## Example 2
import urllib.request, urllib.parse, urllib.error
fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')
counts = dict()
for line in fhand:
words = line.decode().rstrip().split()
for word in words:
counts[word] = counts.get(word, 0) + 1
print(counts)
## Parsing Web Pages (Web Scraping, HTML does not work well)
## What is Web Scraping?
## When a program or script pretends to be a browser and retrieves web pages(browser talk to servers),
## looks at those web pages, extracts information, and then looks at more web pages
## Search engines scrape web pages - we call this 'spidering the web' or 'web crawling'
## Why Scrape?
## Pull data - particularly social data - who links to who
## Get your own data back out of some system that has no 'export capability'
## Monitor a site for new information
## Spider the web to make a database for a search engine
## There is some controversy about web page scraping and some sites are a bit snippy about it
## Republishing copyrighted information is not allowed
## Violating terms of service is not allowed
## Parsing HTML -- The Easy Way - Beautiful Soup
## You could do string searches the hard way
## Or use the free software library called BeautifulSoup from www.crummy.com
## BeautifulSoup Installation
## Method 1: personal Computer
## To run this, you can install BeautifulSoup
## https://pypi.python.org/pypi/beautifulsoup4
## Method 2: Company Computer
## Or download the file
## http://www.py4e.com/code/bs4.zip
## and unzip it in the same directory as this file
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
html = urllib.request.urlopen(url) ## this is a for loop returns line by line
url = input('Enter - ') ## Enter - http://www.dr-chuck.com/page1.htm http://www.dr-chuck.com/page2.htm
html = urllib.request.urlopen(url).read() ## read() reads the whole blob with newlines at the end and comes in to one big string
soup = BeautifulSoup(html, 'html.parser') ## parse the whole string of HTML, and get back with an object
## Retrieve all of the anchor tags
tags = soup('a') ## ask Soup to give me a list of anchor tags
##[<a href="http://www.dr-chuck.com/page2.htm">
## Second Page</a>]
for tag in tags:
print(tag.get('href', None)) ## tags is a list and tag is a dictionary, get the href key
## Worked Example of BeautifulSoup
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
## Ignore SSL certificate errors, for the HTTPS application
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input('Enter - ') ## http://www.dr-chuck.com http://www.dr-chuck.com/page1.htm https://www.si.umich.edu/
html = urllib.request.urlopen(url, context=ctx).read() ## For loop of lines -- a whole string with newlines -- UTF-8 string
soup = BeautifulSoup(html, 'html.parser') ## clean soup object
## Retrieve all of the anchor tags, <a through /a>
tags = soup('a') ## list of anchor tags
for tag in tags:
print(tag.get('href', None))
## Exercise 4
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input('Enter the URL and I will count the paragraphs: ')
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
count = 0
tags = soup('p')
for tag in tags:
count += 1
print("The Number of Paragraphs on:", url, "is", count)
## Chapter 13 Web Services and XML
## Data on the Web
## Retrieve and parse XML(eXtensible Markup Language) data
## With the HTTP Request/Response well understood and well supported, there was a natural move towards exchanging data between programs using these protocols
## We needed to come up with an agreed way to represent data going between applications and across networks
## There are two commonly used formats: XML and JSON
## Sending Data across the 'Net'
## Two programs sending and receiving data(Python Dictionary and Java HashMap) communicate through Network
## a.k.a. 'Wire Protocol' - What we send on the 'wire'
## Step 1: Python producing the data(reading a database or a file), and then has a Python structured dictionary
## Step 2: Send that across the network,
## Step 3: Wire Protocol: How the data is put on the wire, how the data leaves one system, transits a network, and then enters another system
## Step 4: Destination System: Another program(Java HashMap)
## Agreeing on a 'Wire Format'
## XML is one of the wire formats
## Python Dictionary(internal data) 'Serialize' to a wire format and then 'De-Serialize'(taking data off the wire) to a new internal data structure
## Two types of serialization formats are XML and JSON
## Take Python Dictionary and we produce JSON and send JSON across the network as a string or document and turn into other programs
## XML: Marking up data to send across the network...
## XML "Elements' (or Nodes)
<people> ## Complex Element, start
<person> ## Complex Element, start
<name>Chuck</name> ## Simple Element
<phone>303 4456</phone> ## Simple Element, start and end tag
</person> ## Complex Element, end
<person> ## Complex Element, start
<name>Noah</name> ## Simple Elements
<phone>622 7421</phone> ## Simple Elements
</person> ## Complex Element, end
</people> ## Complex Element, end
## eXtensible Markup Language
## Primary purpose is to help information systems share structured data
## It started as a simplified subset of the Standard Generalized Markup Language (SGML), and is designed to be relatively human-legible
## XML Basics: Start Tag, End Tag, Text Content, Attribute, Self Closing Tag
<person> ## Start Tag
<name>Chuck</name> ## Chuck is Text Content, </name> is End tag
<phone type="intl"> ## type="intl" is the Attribute
+1 734 303 4456
</phone>
<email hide="yes" /> ## <email /> is Self Closing Tag
</person>
## White Space
## Line ends do not matter. White space is generally discarded on text elements. Except for the white space in the text content
## We indent only to be readable.
## Tags: indicate the beginning and ending of elements
## Attributes: - Keyword/value pairs on the opening tag of XML
## Serialize / De-Serialize - Convert data in one program into a common format that can be stored and/or transmitted between systems in a programming language-independent manner
## XML as a Tree (Text Node): Attributes child and Text child under a Node
## XML as Path: /a/b text: X /a/c/d text: Y /a/c/e text: Z
## XML Schema: Describing a 'contract' as to what is acceptable XML
## XML Schema(when data blow up in either of two sides, use XML Schema to decide)
## XML Schema
## Description of the legal format of an XML document
## Expressed in terms of constraints on the structure and content of documents
## Often used to specify a 'contract' between systems - 'My system will only accept XML that conforms to this particular Schema.'
## If a particular piece of XML meets the specification of the Schema - it is said to 'validate'
## XML Validation
## XML validation is the act of taking an document and a Schema Contract(also a XML document), send to validator
## Example
XML Document
<person>
<lastname>Severance</lastname>
<age>17</age>
<dateborn>2001-04-17</dateborn>
</person>
XML Schema Contract
<xs:complexType name="person">
<xs:sequence>
<xs:element name="lastname" type="xs:string"/>
<xs:element name="age" type="xs:integer"/>
<xs:element name="dateborn" type="xs:date"/>
</xs:sequence>
</xs:complexType>
## Many XML Schema Languages
## 1. Document Type Definition (DTD)
## 2. Standard Generalized Markup Language (ISO 8879:1986 SGML)
## 3. XML Schema from W3C(World Wide Web Consortium) - (XSD)
## XSD XML Schema (W3C spec)
## We will focus on the World Wide Web Consortium (W3C) version
## It is often called 'W3C Schema' because 'Schema' is considered generic
## More commonly it is called XSD because the file names end in .xsd
XSD Structure: xs:element xs:sequence xs:complexType
<person>
<lastname>Severance</lastname>
<age>17</age>
<dateborn>2001-04-17</dateborn>
</person>
XML Schema Contract
<xs:complexType name="person">
<xs:sequence>
<xs:element name="lastname" type="xs:string"/>
<xs:element name="age" type="xs:integer"/>
<xs:element name="dateborn" type="xs:date"/>
</xs:sequence>
</xs:complexType>
## XSD Constraints
<xs: element name="person">
<xs: complexType>
<xs:sequence>
<xs:element name="full_name" type="xs:string" minOccurs="1" maxOccurs="1" /> ## Exactly only one event/occur
<xs:element name="child_name" type="xs:string" minOccurs="0" maxOccurs="10" /> ## Can have 0 to 10 records
</xs: sequence>
</xs: complexType>
</xs:element>
## Example
<person>
<full_name>Tove Refsnes</full_name>
<child_name>Hege</child_name>
<child_name>Stale</child_name>
<child_name>Jim</child_name>
<child_name>Borge</child_name>
</person>
## XSD Data Types
<xs:element name="customer" type="xs:string"/>
<xs:element name="start" type="xs:date"/> ## the date formate is yyyy-mm-dd bc it can be sorted and computer likes this way
<xs:element name="startdate" type="xs:dateTime"/> ## yyyy-mm-ddThh:mm:ssZ
<xs:element name="prize" type="decimal"/> ## float
<xs:element name="weeks" type="xs:integer"/> ## integer
## It is common to represent time in UTC/GMT, given that servers are often scattered around the world
<customer>John Smith</customer>
<start>2002-09-24</date>
<startdate>2002-05-30T09:30:10Z</startdate> ## Time and Zone
<prize>999.50</prize>
<weeks>30</weeks>
## Time matters in web and computer
## ISO 8601 Date/Time format
2002-05-30T09:30:10Z
Year-month-day Time of day Z: Timezone - typically specified in UTC(Universal Time) / GMT(Greenwich Mean Time) rather than
## Parsing XML in Python
import xml.etree.ElementTree as ET # ET is alias
data = '''<person>
<name>Chuck</name>
<phone type="intl">
+1 734 303 4456
</phone>
<email hide="yes"/>
</person>''' ##### triple-quoted string ''' is a potentially multi-line string, has a newline at the end of each line
tree = ET.fromstring(data) ## fromstring() transform the string data to the tree pictures, this step can blow up if the string is bad
print('Name:', tree.find('name').text) ## in the XML, find the tag name and text is an attribute under the tag node
print('Attr:', tree.find('email').get('hide')) ## Email tag has an attribute of hide(value is yes) and do not have text
## Under a tag Node, there can be text child and multiple attributes child
## A self-closing tag do not have text child
## Child tags under tag, a tag can have multiple child tags
## Loops through a list of trees
import xml.etree.ElementTree as ET
input = '''<stuff>
<users>
<user x="2">
<id>001</id>
<name>Chuck</name>
</user>
<user x="7">
<id>009</id>
<name>Brent</name>
</user>
</users>
</stuff>'''
stuff = ET.fromstring(input)
lst = stuff.findall('users/user') ## find all of the user under the users, this is a list of tags with all child tag information
print(lst)
print('User count:', len(lst))
for item in lst:
print('Name', item.find('name').text)
print('Id', item.find('id').text)
print('Attribute', item.get('x')) ## Get the attribute in the tag
## JSON and the REST Architecture
## JavaScript Object Notation(JSON)
## Douglas Crockford - 'Discovered' JSON
## Object literal notation in JavaScript
## Dictionaries in JSON
import json
data = '''{
"name" : "Chuck",
"phone" : {
"type" : "intl",
"number" : "+1 734 303 4456"
},
"email" : {
"hide" : "yes"
}
}'''
## JSON represents data as nested "lists' and "dictionaries"
## JSON does not have start and end tags and there is no attributes
info = json.loads(data) ## This can have traceback, get back an object
## the thing get back 'info' is a python dictionary
print('Name:', info["name"])
print('Name:'), info.get('name')
print('Phone:', info.get('phone').get('number'))
print('Hide:', info["email"]["hide"])
print('Phone:', info["phone"]["number"])
## Another example, List in JSON
import json
input = '''[
{"id" : "001",
"x" : "2",
"name" : "Chuck"
},
{ "id" : "009",
"x" : "7",
"name" : "Chuck"
}
]'''
info = json.loads(input) ## info is a list
print('User count:', len(info)) ## two things in the list
for item in info: ## items in info is dictionary
print('Name', item["name"])
print('ID', item['id'])
print('Attribute', item['x'])
## Sevice Oriented Approach/Architectures
## Most non-trivial web applications use services
## They use services from other applications (Credit Card Charge, Hotel Reservation systems)
## Services publish the 'rules', applications must follow to make use of the service (API)
## API: Application Program Interfaces, which are ways to use web protocols to access data on systems, using well-defined and structured approaches
## Multiple Systems
## Initially - Two systems cooperate and split the problem
## As the data/service becomes useful - multiple applications want to use the information / application
## The Service Oriented Architectures (SOA)
## Using Application Programming Interfaces (API)
## We are the consumers of the data interchange
## The API is the specification for what the URL patterns are, what the syntax of the data we are supposed to send, what the syntax of data we can expect to get back
## Example, Google Geocoding API: addresses
## Example: Google APIs
## Get some JSON back from Google Geocoding APIs
http://maps.googleapis.com/maps/api/geocode/json?address=Ann+Arbor%2C+MI ## We type in Ann Arbor, MI in the search box
## '+' is a space, '%2C' is a common in the URL syntax
## JSON, data in the following code
{
"status": "OK",
"results": [
{
"geometry":
{
"location_type": "APPROXIMATE",
"location": {
"lat": 42.2808256,
"lng": -83.7430378
}
},
"address_components": [
{
"long_name": "Ann Arbor",
"types": [
"loyalty",
"political"
],
"short_name": "Ann Arbor"
}
],
"formatted_address": "Ann Arbor, MI, USA",
"types": [
"loyalty",
"political"
]
}
]
}
import urllib.request, urllib.parse, urllib.error
import json
serviceurl = 'http://maps.googleapis.com/maps/api/geocode/json?'
while True:
address = input('Enter location: ')
if len(address) < 1: break
url = serviceurl + urllib.parse.urlencode({'address': address}) ## This will turn the location in URL Syntax format
print('Retrieving', url)
uh = urllib.request.urlopen(url) ## This is a handle, not pull the data yet, the read() function read the data in a whole string
data = uh.read().decode() ## This gives back the previous JSON document in Unicode(UTF-8 transfer to Unicode),
## data is a string
print('Retrieved', len(data), 'characters')
try:
js = json.loads(data)
except:
js = None
if not js or 'status' not in js or js['status'] != 'OK': ## if js is false or status key not in js, js status is not 'OK'
print('==== Failure To Retrieve ====')
print(data)
continue
print(json.dump(js, indent=4))
lat = js["results"][0]["geometry"]["location"]["lat"] ## go over the tree
lng = js["results"][0]["geometry"]["location"]["lng"]
print('lat', lat, 'lng', lng)
location = js['results'][0]['formatted_address']
print(location)
## Results:
## Enter location: Ann Arbor, MI
## Retrieving http://maps.googleapis.com/maps/api/geocode/json?address=Ann+Arbor%2C+MI
## Retrieved 237 characters
## lat 42.2808256 lng -83.7430378
## Ann Arbor, MI, USA
## Securing API Requests
## API Security and Rate Limiting
## The compute resources to run these APIs are not 'free'
## The data provided by these APIs is usually valuable
## The data providers might limit the number of requests per dat, demand an API 'key', or even charge for usage
## They might change the rules as things progress...
## Twitter APIs: need authorized for each request unlike other APIs(google)
import urllib.request, urllib.parse, urllib.error
import twurl ## This is the code written by Chuck
import json
TWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json'
while True:
print('')
acct = input('Enter Twitter Account:')
if (len(acct) < 1): break
url = twurl.augment(TWITTER_URL, {'screen_name': acct, 'count' : '5'})
print('Retrieving', url)
connection = urllib.request.urlopen(url)
data = connection.read().decode()
headers = dict(connection.getheaders())
print('Remaining', headers['x-rate-limit-remaining'])
js = json.loads(data)
print(jspn.dumps(js, indent = 4))
for u in js['users']:
print(u['screen_name'])
s = u['status']['text']
print(' ', s[:50])
## Below is the Twitter JSON file
Enter Twitter Account: drchuck
Retrieving https://api.twitter.com/1.1/friends/...
Remaining 14
{
"users": [
{
"status": {
"text": "@jazzychad I just bought one ._.",
"created_at": "Fri Sep 20 08:36:34 +0000 2013", ...
},
"location": "San Francisco, California",
"screen_name": "leahculver",
"name": "Leah Culver", ...
},
{
"status": {
"text": "RT @WSJ: Big employers like Google ...",
"reated_at": "Sat Sep 28 19:36:37 +0000 2013", ...
},
"location": "Victoria Canada",
"screen_name": "_valeriei",
"name": "Valerie Irvine", ...
}, ... ## Can have a very long list
]
}
## This is the Twitter JSON outputs:
Leahculver
@jazzychad I just bought one .__.-
Valeriei
RT @WSJ: Big employers like Google, AT&T are h
Ericbollens
RT @lukew: sneak peek: my LONG take on the good &a
halherzog
Learning Objects is 10. We had a cake with the LO,
## twurl code which creates the Signed URL
## hidden.py is the four keys that we need to put in that we get from Twitter
import urllib
import oauth
import hidden
def augment(url, parameters):
secrets = hidden.oauth()
consumer = oauth.OAuthConsumer(secrets['consumer_key'], secrets['consumer_secret'])
token = oauth.OAuthToken(secrets['token_key'], secrets['token_secret'])
oauth_request = oauth.OAuthRequest.from_consumer_and_token(consumer, token = token, http_method='GET', http_url = url, parameters = parameters)
oauth_request.sign_request(oauth.OAuthSignatureMethod_HMAC_SHA1(), consumer, token)
return oauth_request.to_url()
## The above code will give us the true URL like:
https://api.twitter.com/1.1/statuses/user_timeline.json?count=2&oauth_version=1.0&oauth_token=101...SGI&screen_name=drchuck
&oauth_nonce=09239679&oauth_timestamp=1380395644&oauth_signature=rLK...BoD&oauth_consumer_key=h7Lu...GNg&oauth_signature_method=HMAC-SHA1
## Work Example: Twitter API
## Keep this file separate
## https://apps.twitter.com/
## Create new App and get the four strings
def oauth():
return {"consumer_key": "h7Lu...Ng".
"consumer_secret": "dNKenAC3New...mmn7Q",
"token_key": "10185562-eibxCp9n2...P4GEQQOSGI",
"token_secret": "H0ycCFemmC4wyf1...qoIpBo"}
## We need these four strings from the website
import urllib.request, urllib.parse, urllib.error
from twurl import augment ##OAuth website
import ssl
## https://apps.twitter.com/
## Create App and get the four strings, put them in hidden.py
print('* Calling Twitter...')
url = augment('https://api.twitter.com/1.1/statuses/user_timeline.json', {'screen_name': 'drchuck', 'cbunt':'2'})
print(url)
## Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
connection = urllib.request.urlopen(url, context=ctx)
data = connection.read()
print(data.decode()) ## byte array
headers = dict(connection.getheaders()) ## url does not show headers but we can get the headers by using getheaders
print(headers) ## dictionary
## Parse the JSON very interested part