import helpers

# each time you crawl, clear out your database and start over:
helpers.create_or_replace_table()

# add the first link to crawl:
urls = ['https://www.mccormick.northwestern.edu/eecs/courses/']
visited = {}
counter = 0

while len(urls) > 0:
    # get the next url
    url = urls.pop(0)
    soup = helpers.get_webpage(url)
    counter += 1

    # extract urls from the web page (already done for you)
    webpage_urls = helpers.extract_links_from_webpage(soup, url)

    # extract key data from the web page (already done for you):
    # (print the row variable to understand it)
    row = helpers.extract_data_from_webpage(soup, url)
    print(row['body'])
    urls += webpage_urls

    # YOUR TASKS:
    # 1. Add the urls that you found to the urls list so that the
    #    webpage keeps crawling (b/c of the while loop condition),
    #    just like Tutorial 7.
    print('add webpage_urls to the urls list')

    # 2. Track how many times each url has been visited as you crawl,
    #    and don't crawl the same page twice.
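# ---------------------------------------------------------------------------
# A minimal sketch of one way tasks 1 and 2 above could be handled, assuming
# the helpers module behaves as in the starter code (get_webpage,
# extract_links_from_webpage, extract_data_from_webpage,
# create_or_replace_table). This is an illustration, not the official
# solution.
# ---------------------------------------------------------------------------
import helpers

helpers.create_or_replace_table()

urls = ['https://www.mccormick.northwestern.edu/eecs/courses/']
visited = {}   # maps url -> number of times it has been pulled off the queue
counter = 0

while len(urls) > 0:
    url = urls.pop(0)

    # task 2: count every time this url comes up, and skip it if it has
    # already been crawled once.
    visited[url] = visited.get(url, 0) + 1
    if visited[url] > 1:
        continue

    soup = helpers.get_webpage(url)
    counter += 1

    webpage_urls = helpers.extract_links_from_webpage(soup, url)
    row = helpers.extract_data_from_webpage(soup, url)
    print(row['body'])

    # task 1: queue the newly found links so the while loop keeps crawling.
    urls += webpage_urls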
import time
import helpers

# add the first link to crawl:
urls = ['https://www.northwestern.edu/']
pagerank = {}

while len(urls) > 0:
    #########################
    # Don't forget to sleep #
    #########################
    time.sleep(2)

    # removes the top url from the list
    url = urls.pop(0)
    print('\nretrieving ' + url + '...')
    soup = helpers.get_webpage(url)
    if soup is None:
        print('Error retrieving {url}'.format(url=url))
    else:
        website_summary = helpers.extract_website_summary_from_webpage(soup)
        links_on_page = helpers.extract_links_from_webpage(soup, url)
        helpers.write_links_to_file(links_on_page)
        print(website_summary)

# Goal: modify this code to crawl through all the links of the northwestern
# website, and track how many times each website is linked to, using a dictionary.
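# ---------------------------------------------------------------------------
# A minimal sketch of how the goal above might be approached: every time a
# link appears on a page, bump its count in the pagerank dictionary, and only
# queue links that have not been seen before. The helpers functions are the
# ones used in the starter code above; the MAX_PAGES cap and the final sorted
# printout are assumptions added here so the sketch terminates quickly.
# ---------------------------------------------------------------------------
import time
import helpers

urls = ['https://www.northwestern.edu/']
pagerank = {}        # maps url -> number of times it is linked to
MAX_PAGES = 100      # assumption: cap the crawl for demonstration purposes
pages_crawled = 0

while len(urls) > 0 and pages_crawled < MAX_PAGES:
    time.sleep(2)    # be polite to the server between requests

    url = urls.pop(0)
    pages_crawled += 1
    print('\nretrieving ' + url + '...')
    soup = helpers.get_webpage(url)
    if soup is None:
        print('Error retrieving {url}'.format(url=url))
        continue

    links_on_page = helpers.extract_links_from_webpage(soup, url)
    for link in links_on_page:
        if link not in pagerank:
            # first time this url has been seen: queue it for crawling
            urls.append(link)
            pagerank[link] = 0
        pagerank[link] += 1

# show the most frequently linked-to pages once the crawl finishes
for link, count in sorted(pagerank.items(), key=lambda item: item[1], reverse=True)[:20]:
    print(count, link)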