def xml2(self):
    """Parse an inline XML document of users and print selected fields.

    Builds an element tree from the XML string, reports the number of
    <user> nodes, then prints each user's name, id and "x" attribute.
    """
    # Renamed from `input`, which shadowed the builtin of the same name.
    xml_data = ''' <stuff> <users> <user x="2"> <id>001</id> <name>Chuck</name> </user> <user x="7"> <id>009</id> <name>Brent</name> </user> </users> </stuff>'''
    exu = ExerciseUtils()
    stuff = exu.InternetTreeXML(xml_data)
    stuff.create_tree()
    lst = stuff.findall_users()
    print('User count:', len(lst))
    # (label, tag, kind, attribute-name) tuples consumed by print_element_tree.
    field_list = [("Name:", "name", "text", ""),
                  ("Id:", "id", "text", ""),
                  ("Attribute:", "", "attr", "x")]
    for tree in lst:
        stuff.replace_tree(tree)
        stuff.print_element_tree(field_list)
def urlwords(self):
    """Fetch a text document over HTTP and print its word-frequency map."""
    print("urlwords - compute the frequency of each word in the file")
    helper = ExerciseUtils()
    handle = helper.open_url(helper.url_text_doc, None)
    body = helper.get_url_page(handle)
    word_counts = helper.getwords(body)
    # The sample document is known to contain exactly 26 distinct words.
    assert len(word_counts) == 26
    print(word_counts)
def curl1(self):
    """Download a small image in a single read and write it to disk."""
    print("curl1 - get and image and write it to a file")
    helper = ExerciseUtils()
    image_bytes = helper.open_url_small_img(self.url_jpg, None)
    size = len(image_bytes)
    # Known byte length of the sample jpeg.
    assert size == 230210
    print("Length of " + self.url_jpg + " is:", size)
    result = helper.write_file(self.url_jpg, "wb", image_bytes)
    assert result is None
def urllib1(self):
    """Read a remote text document through urllib and print it."""
    print("urllib1 - use urllib to read a web page like a file")
    helper = ExerciseUtils()
    handle = helper.open_url(helper.url_text_doc, None)
    assert handle != ""
    body = helper.get_url_page(handle)
    assert len(body) > 0
    print(body)
def curl2(self):
    """Download an image through a buffered reader and save it to disk."""
    print("curl2 - get and image and write it to a file using a buffer to\ read any size file")
    helper = ExerciseUtils()
    stream = helper.open_url(self.url_jpg, None)
    assert stream != ""
    copied = helper.get_url_large_img_and_save(stream, self.url_jpg)
    # Known byte length of the sample jpeg.
    assert copied == 230210
    print(copied, 'characters copied.')
def socket1(self):
    """Fetch a text document over a raw socket, then close the socket."""
    print("socket1 - World's simplest web browser")
    helper = ExerciseUtils()
    sock, target = helper.init_socket_and_url(helper.url_prefix,
                                              helper.url_base,
                                              helper.url_text_doc)
    assert sock is not None
    page = helper.get_page(sock, target)
    assert len(page) == 2
    sock = helper.close_socket(sock)  # normal socket
def xml1(self):
    """Parse a single-person XML snippet and print text and attribute fields."""
    data = ''' <person> <name>James</name> <phone type="intl"> +1 503 851 8418 </phone> <email hide="yes" /> </person>'''
    helper = ExerciseUtils()
    person_tree = helper.InternetTreeXML(data)
    person_tree.create_tree()
    # Print a text field, an attribute field, then both together.
    person_tree.print_element_tree([("Name:", "name", "text", "")])
    person_tree.print_element_tree([("Attr:", "email", "attr", "hide")])
    person_tree.print_element_tree([("Name:", "name", "text", ""),
                                    ("Attr:", "email", "attr", "hide")])
def geojson(self):
    """Parse an inline JSON list of users and print selected fields."""
    data = ''' [ { "id" : "001", "x" : "2", "name" : "Chuck" } , { "id" : "009", "x" : "7", "name" : "Brent" } ]'''
    helper = ExerciseUtils()
    json_tree = helper.InternetTreeJSON(data)
    json_tree.create_tree_list()
    print('User count:', json_tree.tree_list_count())
    # (label, key, kind, attribute-name) tuples consumed by print_element_tree.
    fields = [("Name:", "name", "text", ""),
              ("Id:", "id", "text", ""),
              ("Attribute:", "", "attr", "x")]
    for entry in json_tree.tree_list:
        json_tree.replace_tree(entry)
        json_tree.print_element_tree(fields)
def urljpeg(self):
    """Fetch a jpeg over a raw socket and save it to a local file."""
    print("urljpeg - get a jpeg document")
    helper = ExerciseUtils()
    sock, target = helper.init_socket_and_url(helper.url_prefix,
                                              helper.url_base,
                                              self.url_jpg)
    assert sock is not None
    picture = helper.get_jpeg(sock, target)
    # Known length of the sample response (image plus headers).
    assert len(picture) == 230608
    sock = helper.close_socket(sock)  # normal socket
    saved = helper.save_picture(picture, self.local_jpg)
    assert saved > 0
"""Interactive urllib-based page fetcher: prompt for a URL, print up to 3000 chars."""
from utils import ExerciseUtils

def_url = "http://data.pr4e.org/mbox-short.txt"
print(
    "urllib1 - World's simplest web browser for any url and displays up to 3000 characters"
)
print(" Format of url must be http(s)://urlbase/page")
print(" Example: " + def_url)
url = input("Enter url to open(" + def_url + "): ")
if url == "":
    url = def_url
exu = ExerciseUtils()
url_prefix, url_base, url_page = exu.split_url(url)
# Idiom fix: compare to None with `is`, not `==`.
if url_base == "" or url_base is None:
    print("Bad URL")
else:
    char_count = exu.print_page_urllib(url, 3000)
    # Bug fix: the expected character count is only known for the default
    # document; asserting it unconditionally made every other URL fail.
    if url == def_url:
        assert char_count == 94626
    print("Total characters found:", char_count)
"""Interactive socket-based page fetcher: prompt for a URL, print up to 3000 chars."""
import re
from utils import ExerciseUtils

def_url = "http://data.pr4e.org/romeo.txt"
print("socket1 - World's simplest web browser for any url")
print(" Format of url must be http(s)://urlbase/page")
print(" Example: " + def_url)
url = input("Enter url to open(" + def_url + "): ")
if url == "":
    url = def_url
exu = ExerciseUtils()
prefix, base, page = exu.split_url(url)
total_chars = exu.print_page_limit(prefix, base, page, 3000,
                                   skipheaders=True)  # normal socket
"""Interactive socket browser: open a socket to a user-supplied URL and print the page."""
import re
from utils import ExerciseUtils

def_url = "http://data.pr4e.org/mbox-short.txt"
print("socket1 - World's simplest web browser for any url and displays up to 3000 characters")
print(" Format of url must be http(s)://urlbase/page")
print(" Example: " + def_url)
url = input("Enter url to open(" + def_url + "): ")
if url == "":
    url = def_url
exu = ExerciseUtils()
url_prefix, url_base, url_page = exu.split_url(url)
# Idiom fix: compare to None with `is`/`is not`, not `==`/`!=`.
if url_base == "" or url_base is None:
    print("Bad URL")
else:
    print("opening socket to:", url)
    mysock, url = exu.init_socket_and_url(url_prefix, url_base, url_page)
    if mysock is not None:
        total_chars = exu.print_page_socket(mysock, url, 3000)
        print("Total characters found:", total_chars)
        mysock = exu.close_socket(mysock)  # normal socket
def urllinks2(self):
    """Fetch the default page and show the parts of each anchor tag."""
    print("urllinks2 - Look at the parts of a tag")
    helper = ExerciseUtils()
    markup = helper.get_html(helper.url_default2)
    helper.bs4_tags(markup, 'a')
def urllinks(self):
    """Fetch the default page and print the link values of its anchor tags."""
    print("urllinks - Search for link values within URL page using\ BeatifulSoup to parse html")
    helper = ExerciseUtils()
    markup = helper.get_html(helper.url_default1)
    # pflags selects which tag parts bs4_tags prints (here: link values only).
    helper.bs4_tags(markup, 'a', pflags=[False, True, False, False])
def urlregex(self):
    """Fetch the default page and extract its links with a regular expression."""
    print("urlregex - Search for link values within URL input")
    helper = ExerciseUtils()
    markup = helper.get_html(helper.url_default1)
    helper.regexlinks(markup)
"""ex_11_02: average the numbers on 'New Revision: NNNNN' lines of each mailbox file."""
from utils import ExerciseUtils

print(
    'ex_11_02 - Write a program to look for lines of the form: New Revision: 39772 \n \ Extract the number from each of the lines using a regular expression and the findall() method. Compute the average of the numbers and print out the average as an integer.'
)
exu = ExerciseUtils()
# Run the same extraction over both sample mailbox files.
for fname in ("mbox.txt", "mbox-short.txt"):
    matched, average = exu.run_findall_avg(fname, "^New Revision: ([0-9]+)", True)
    print("Found ", str(matched), " Average is " + str(average) + " for file " + fname)
"""urlpara: count <p> tags on each of the three default pages via BeautifulSoup."""
from bs4 import BeautifulSoup
from utils import ExerciseUtils

print(
    "urlpara - Search for paragrapsh (tags beginning with <p>) within URL page using BeatifulSoup to parse html"
)
exu = ExerciseUtils()
for page_url in (exu.url_default1, exu.url_default2, exu.url_default3):
    markup = exu.get_html(page_url)
    assert len(markup) > 0
    paragraphs = exu.bs4_tags(markup, "p", [True, False, False, False])
    print("Reading: ", page_url)
    print("Number of paragraphs: ", len(paragraphs))
"""Interactive socket browser: fetch a page over a raw socket, then verify the socket closed."""
import re
from utils import ExerciseUtils

def_url = "http://data.pr4e.org/romeo.txt"
print("socket1 - World's simplest web browser for any url")
print(" Format of url must be http(s)://urlbase/page")
print(" Example: " + def_url)
url = input("Enter url to open(" + def_url + "): ")
if url == "":
    url = def_url
exu = ExerciseUtils()
url_prefix, url_base, url_page = exu.split_url(url)
# Idiom fix: compare to None with `is`/`is not`, not `==`/`!=`.
if url_base == "" or url_base is None:
    print("Bad URL")
else:
    print("opening socket to:", url)
    mysock, url = exu.init_socket_and_url(url_prefix, url_base, url_page)
    if mysock is not None:
        page = exu.get_page(mysock, url)
        mysock = exu.close_socket(mysock)  # normal socket
        # NOTE(review): `_closed` is a private CPython attribute of socket
        # objects, not public API — confirm close_socket's return type.
        assert mysock._closed
"""ex_11_01: grep-like line counter driven by a user-entered regular expression."""
from utils import ExerciseUtils

print(
    'ex_11_01 - Write a simple program to simulate the operation of the grep command on Unix. Ask the user to enter a regular expression and count the number of lines that matched the regular expression'
)
regex_list = list()
regex = input("Enter a regular expression: ")
regex_list.append(regex)
# Bug fix: the original test `len(regex_list) > 0` was always true (the
# user's input — even an empty string — had just been appended), so the
# entered expression was always discarded. Fall back to the sample
# patterns only when the user entered nothing.
if regex == "":
    regex_list = ["^Author", "^X-", "java$"]
exu = ExerciseUtils()
for regex in regex_list:
    count = exu.run_search1('mbox.txt', regex, False)
"""Regular-expression demos re01–re06 over the sample mailbox files."""
from utils import ExerciseUtils

# Each case: (banner text, helper method name, file, pattern, expected count).
SEARCH_CASES = [
    ("re01 - Search for lines that contain 'From'",
     "run_search1", 'mbox-short.txt', 'From:', 27),
    ("re02 - Search for lines that start with 'From'",
     "run_search1", 'mbox-short.txt', '^From:', 27),
    ("re03 - Search for lines that start with 'F', followed by 2 characters, followed by 'm:'",
     "run_search1", 'mbox-short.txt', '^F..m:', 27),
    ("re04 - Search for lines that start with From and have an '@' sign",
     "run_search1", 'mbox-short.txt', '^From:.+@', 27),
    ("re05 - Search for an address",
     "run_findall", 'mbox-short5.txt', '\\S+@\\S+', 5),
]
for banner, method, fname, pattern, expected in SEARCH_CASES:
    print(banner)
    exu = ExerciseUtils()  # fresh helper per case, as in the original demos
    count = getattr(exu, method)(fname, pattern, False)
    assert count == expected
print("re06 - Search for lines that have an at sign between characters")
exu = ExerciseUtils()