def find_tag_list(url, soup_object, request_obj, limiting_d):
    '''
    Function finds the list of hyperlink tags from given URL and collects
    the list of URLs.

    Inputs:
        url (string): URL to find list of further URLs from
        soup_object (soup object): Soup object obtained from using
            Beautiful Soup on given URL
        request_obj (request object): Request object obtained from the
            functions given in PA to obtain request object from URL
        limiting_d (string): Limiting domain for the URLs

    Outputs:
        url_list (list): List containing all URLs that are able to be
            followed from the given URL.
    '''
    https_url = util.get_request_url(request_obj)
    ahref_tag_list = soup_object.find_all('a', href=True)
    url_list = []
    for tag in ahref_tag_list:
        this_url = tag['href']
        newest_url = util.convert_if_relative_url(https_url, this_url)
        if util.is_url_ok_to_follow(newest_url, limiting_d):
            url_list.append(newest_url)
    return url_list

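# --- Hedged usage sketch (not part of the original assignment code) ---
# Shows how find_tag_list might be driven: build the request and soup
# objects the function expects, then collect followable links.  Assumes the
# course-provided util module and bs4 are imported as elsewhere in this
# file; the starting URL and limiting domain below are made-up examples.
def example_find_tag_list(start_url="https://www.example.edu/catalog/index.html",
                          limiting_domain="example.edu"):
    request = util.get_request(start_url)
    if request is None:
        return []
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, "html5lib")
    return find_tag_list(start_url, soup, request, limiting_domain)
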
def queue_children_sites(starting_url, queue):
    '''
    Given a url and a queue, adds all children urls of the start point to
    the queue.

    Inputs:
        starting_url -- string that corresponds to a url
        queue -- queue.Queue object

    Outputs:
        None, queue is modified in place to contain all child urls
    '''
    if starting_url[4] != 's':
        # turns http into https if not already
        starting_url = starting_url[:4] + 's' + starting_url[4:]
    request = util.get_request(starting_url)
    assert request is not None
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    URLs = soup.find_all("a")
    URLs = [URL["href"] for URL in URLs if URL.has_attr("href")]
    children = []
    for URL in URLs:
        if util.is_absolute_url(URL):
            children.append(URL)
        else:
            URL = util.convert_if_relative_url(starting_url, URL)
            children.append(URL)
    # limiting_domain is assumed to be defined at module level
    children = [
        child for child in children
        if util.is_url_ok_to_follow(child, limiting_domain)
    ]
    for child in children:
        queue.put(child)

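# --- Hedged driver sketch (not part of the original code) ---
# A bounded breadth-first walk built on queue_children_sites.  The seed URL
# and page cap are illustrative assumptions; queue_children_sites is assumed
# to rely on a module-level limiting_domain as written above.
import queue as _queue

def example_crawl(seed="https://www.example.edu/", max_pages=50):
    to_visit = _queue.Queue()
    to_visit.put(seed)
    visited = set()
    while not to_visit.empty() and len(visited) < max_pages:
        current = to_visit.get()
        if current in visited:
            continue
        visited.add(current)
        queue_children_sites(current, to_visit)
    return visited
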
def get_restaurant_links_chicago():
    # start from searching "Restaurant", "Chicago" from yelp main page
    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []
    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)
    links = []
    for url in url_list:
        request = util.get_request(url)
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")
        # extract href links to restaurants
        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            # Hardcoded filter; skip tags without a non-empty name attribute
            if link and link[-11:] == "Restaurants":
                if tag.get("name", "") != '':
                    if link not in links:
                        links.append(link)
    return links

def mini_crawler(url, page_parser_q, pull_info_q, links_visited,
                 limiting_domain, index_list, parsing_default_domain):
    '''
    Crawl the college catalog and adds to an index dictionary to map set of
    words with associated course identifier.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q: queue of page urls in line to be crawled
        pull_info_q: queue of urls whose information will be pulled
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: dictionary that maps words to course identifiers
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited,
               limiting_domain)
    tag_list = soup.find_all("ul", attrs={"class": "pagination"})
    current_page = tag_list[0].find_all("li", attrs={"class": "current"})
    next_page = current_page[0].next_sibling.next_sibling.findChild()
    next_page_href = next_page.get('href')
    next_page_href = util.convert_if_relative_url(post_url, next_page_href)
    page_parser_q.put(next_page_href)

def find_links(soup, url, post_url, pull_info_q, links_visited,
               limiting_domain):
    '''
    Adds links to be visited to the queue 'pull_info_q' and adds links
    visited to the list 'links_visited.'

    Inputs:
        soup: soup object from the text of the HTML document
        url: starting url to begin crawling with
        post_url: this is the processed absolute url
        pull_info_q: queue of urls that is being added to for each url crawled
        links_visited: list of visited links
        limiting_domain: domain name
    '''
    tag_list = soup.find_all(
        "div", attrs={"class": "Grid Grid--SpacingResponsiveLarge"})
    link_list = tag_list[0].find_all('h3', {"class": "Card__Title"})
    for link in link_list:
        possible_link = link.findChild().get("href")
        # possible_link = parsing_default_domain + possible_link
        actual_link = util.convert_if_relative_url(post_url, possible_link)
        if actual_link is not None and actual_link not in links_visited:
            if util.is_url_ok_to_follow(actual_link, limiting_domain):
                pull_info_q.put(actual_link)
    links_visited.append(url)
    if post_url != url:
        links_visited.append(post_url)

def build_search_engine(starting_url, limiting_domain, max_num_pages_to_visit):
    # queue module (Python 3) assumed imported at module level
    urls = queue.Queue()
    visited = []
    index = {}

    def search(word):
        rv = []
        matches = []
        words = re.findall(r"[a-zA-Z]\w*", word)
        if len(words) == 0:
            return []
        for url in index.keys():
            for title in index[url].keys():
                for word in words:
                    word = word.lower()
                    if word in title or word in index[url][title]:
                        matches.append((title, url))
        for pair in matches:
            if matches.count(pair) == len(words):
                if pair not in rv:
                    rv.append(pair)
        return rv

    if util.is_url_ok_to_follow(starting_url, limiting_domain):
        urls.put(starting_url)
    else:
        return None
    while not urls.empty() and len(visited) < max_num_pages_to_visit:
        top_queue = urls.get()
        if top_queue not in visited and util.is_url_ok_to_follow(
                top_queue, limiting_domain):
            request = util.get_request(top_queue)
            if request is None:
                visited.append(top_queue)
                continue
            new_page = util.get_request_url(request)
            if new_page != top_queue:
                if new_page not in visited:
                    visited.append(new_page)
                top_queue = new_page
            data = bs4.BeautifulSoup(util.read_request(request), "html5lib")
            visited.append(top_queue)
            index = indexer(index, top_queue, data)
            for link in data.find_all('a'):
                href = link.get('href')
                if href is None:
                    continue
                href = util.remove_fragment(href)
                if not util.is_absolute_url(href):
                    url = util.convert_if_relative_url(top_queue, href)
                    urls.put(url)
    return search

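# --- Hedged usage sketch (not part of the original code) ---
# build_search_engine returns a search closure (or None when the starting
# url is not followable).  The catalog URL, domain, page cap, and query
# below are illustrative assumptions; indexer() must exist in this module.
def example_search():
    search = build_search_engine("https://www.example.edu/catalog/index.html",
                                 "example.edu", 100)
    if search is None:
        return []
    # returns (title, url) pairs whose indexed words match every query word
    return search("computer science")
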
def url_check(url, parent_url):
    '''
    Takes a url and its parent url, and does various checks on the url,
    returning the url in the correct format if it is ok to use and
    returning None if not.
    '''
    if not util.is_absolute_url(url):
        url = util.convert_if_relative_url(parent_url, url)
    url = util.remove_fragment(url)
    if url:
        return url
    else:
        return None

def get_restaurant_links_cook():
    cities = get_cities()
    city_state = get_loc_cook()
    new_city_state = []
    for ele in city_state:
        if ele[0] in cities:
            new_city_state.append(ele)
    page_suffix = [i for i in range(0, 231, 10)]
    # print(city_state)
    url_list = []
    for city, state in city_state:
        html = ("https://www.yelp.com/search?find_desc=Restaurants&find_loc="
                + city.replace(" ", "") + "%2C%20" + state)
        for suffix in page_suffix:
            html_page = html + "&start=" + str(suffix)
            url_list.append(html_page)
    r'''
    with open(r"c:\Users\35653\Desktop\CS122\project\urls.txt", "w") as write_file:
        write_file.writelines(url_list)
        write_file.close()
    '''
    # NOTE: overrides the generated url_list with a single hardcoded page
    url_list = [
        "https://www.yelp.com/search?find_desc=Restaurants&find_loc=Lyons%2C%20IL&start=190"
    ]
    links = []
    for url in url_list:
        request = util.get_request(url)
        if request:
            text = util.read_request(request)
            soup = bs4.BeautifulSoup(text, "html5lib")
            tags = soup.find_all('a', href=True, target="", role="")
            # extract href links to restaurants
            for tag in tags:
                link = tag['href']
                link = util.convert_if_relative_url(url, link)
                link = util.remove_fragment(link)
                # Hardcoded filter; skip tags without a non-empty name attribute
                if link and link[-11:] == "Restaurants":
                    if tag.get("name", "") != '':
                        if link not in links:
                            links.append(link + "\n")
    return links

def calendar_scraper(url, limiting_domain):
    '''
    Extracts links from a given url.

    Inputs:
        url - (string) url from which to get links
        limiting_domain - (string) that links must match

    Outputs:
        cal, art - lists of absolute, non-repeated links (strings) that are
            ok to follow within the limiting domain
    '''
    # A. Extracting links
    req = util.get_request(url)
    url2 = util.get_request_url(req)
    soup = make_soup(req)
    if soup:
        cal = []
        cal_list = soup.find_all("div", class_="panel panel_default")
        for d in cal_list:
            d_tr = util.remove_fragment(d.get("href"))
            d_abs = util.convert_if_relative_url(url2, d_tr)
            if util.is_url_ok_to_follow(d_abs, limiting_domain):
                cal.append(d_abs)
        art = []
        art_list = soup.find_all("div", class_="panel panel_default")
        for d in art_list:
            d_tr = util.remove_fragment(d.get("href"))
            d_abs = util.convert_if_relative_url(url2, d_tr)
            if util.is_url_ok_to_follow(d_abs, limiting_domain):
                art.append(d_abs)
        return cal, art

def get_restaurant_links():
    '''
    Start from searching "Restaurant", "Chicago" on the yelp main page, and
    collect all restaurant links from 24 pages

    Input:
        None

    Output:
        links (list): a list of links
    '''
    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []
    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)
    links = []
    count = 0
    for url in url_list:
        count += 1
        print(count)
        request = util.get_request(url)
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")
        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            if link and link[-11:] == "Restaurants":
                if tag.get("name", "") != '':
                    if link not in links:
                        links.append(link + "\n")
                        print(link)
        # pause between requests to avoid hammering the server
        i = 5 + random.random() * 5
        time.sleep(i)
    return links

def scrape():
    '''
    performs entire scraping function

    outputs:
        index: dictionary of museum/exhibit information
    '''
    index = {}
    for museum_id in scrape_dict:
        limiter = scrape_dict[museum_id]['limiter']
        pages = scrape_dict[museum_id]['page']
        exhibit_urls = []
        for page in pages:
            r = requests.get(page)
            soup = bs4.BeautifulSoup(r.text, "html5lib")
            for link in soup.find_all('a', href=True):
                u = util.convert_if_relative_url(page, link['href'])
                u = util.remove_fragment(u)
                restr = scrape_dict[museum_id]['restr']
                crawl(limiter, exhibit_urls, u, restr)
        index[museum_id] = {}
        exhibit_id = museum_id + '01'
        for link in exhibit_urls:
            r = requests.get(link)
            soup = bs4.BeautifulSoup(r.text, "html5lib")
            print(link)
            try:
                index[museum_id][exhibit_id] = {}
                scrape_dict[museum_id]['info'](soup, index[museum_id],
                                               exhibit_id)
                index[museum_id][exhibit_id]['url'] = link
                exhibit_id = '00' + str(int(exhibit_id) + 1)
            except Exception:
                print('\t^^ Scraper Failed')

    with open('../csvs/musid_name.csv', 'w') as f:
        line = 'mus_id|name' + '\n'
        f.write(line)
        for mus_id in scrape_dict:
            line = '{}|{}\n'.format(str(mus_id), scrape_dict[mus_id]['name'])
            f.write(line)
    return index

def get_next_page(soup_object, current_url):
    """
    Takes a PLOS url page of articles and gets the next page

    Inputs:
        current_url (string): url for the Soup object
        soup_object (Soup Object)

    Returns:
        next page url (string)
    """
    next_page_url = soup_object.find_all("a", id="nextPageLink")
    if next_page_url:
        next_page_url = next_page_url[0]["href"]
        next_page_url = util.convert_if_relative_url(current_url,
                                                     next_page_url)
        return next_page_url
    else:
        return None

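# --- Hedged sketch (not part of the original code) ---
# Follows "next page" links with get_next_page until pagination runs out or
# a page cap is hit.  get_soup_object is assumed to be the helper used
# elsewhere in this module; first_url and max_pages are illustrative.
def example_walk_pages(first_url, max_pages=5):
    pages = []
    current = first_url
    while current and len(pages) < max_pages:
        pages.append(current)
        soup = get_soup_object(current)
        current = get_next_page(soup, current)
    return pages
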
def clean_url(url, limiting_domain, parent_url):
    '''
    Cleans the given url, if necessary.

    Inputs:
        url: (string) A url
        limiting_domain: (string) The limiting domain of the url.
        parent_url: (string) The parent url if the given url is incomplete.

    Outputs:
        The cleaned url if it is ok to follow, and None otherwise.
    '''
    c_url = util.remove_fragment(url)
    c_url = util.convert_if_relative_url(parent_url, c_url)
    if util.is_url_ok_to_follow(c_url, limiting_domain):
        return c_url
    return None

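# --- Hedged usage sketch (not part of the original code) ---
# Cleans every href found on a page with clean_url.  The request/soup steps
# mirror the other functions in this file; the limiting domain default is
# an illustrative assumption.
def example_clean_page_links(page_url, limiting_domain="example.edu"):
    request = util.get_request(page_url)
    if request is None:
        return []
    soup = bs4.BeautifulSoup(util.read_request(request), "html5lib")
    cleaned = []
    for tag in soup.find_all('a', href=True):
        c = clean_url(tag['href'], limiting_domain, page_url)
        if c is not None and c not in cleaned:
            cleaned.append(c)
    return cleaned
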
def get_category_urls(current_category_url):
    '''
    From URL with current categories, get url for each category

    Input:
        current_category_url: url for current categories

    Return:
        list of urls with each url containing awards information for one
        category
    '''
    html = requests.get(current_category_url).text
    soup = BeautifulSoup(html, "html5lib")
    links = soup.find('div', class_="div-col columns column-width").find_all('a')
    urls = []
    for link in links:
        category_relative_url = link['href']
        category_url = util.convert_if_relative_url(current_category_url,
                                                    category_relative_url)
        urls.append(category_url)
    return urls

def get_PLOS_subject_urls(starting_url):
    """
    Parses the PLOS starting url and extracts the urls for each subject area.

    Inputs:
        starting_url (string)

    Returns:
        list of urls for each subject area in PLOS
    """
    soup = get_soup_object(starting_url)
    soup_dropdown_menu_lst = soup.find_all("ul", typeof="v:Breadcrumb")
    soup_subject_urls_lst = soup_dropdown_menu_lst[0].find_all("a")
    subject_urls_lst = []
    for a_tag in soup_subject_urls_lst[1:]:
        url = util.convert_if_relative_url(starting_url, a_tag["href"])
        subject_urls_lst.append(url)
    return subject_urls_lst

def queue_urls(url, soup, queue, limiting_domain):
    '''
    Forms a queue of all the urls

    Inputs:
        url: the url to put into the queue
        soup: BeautifulSoup object
        queue: the existing queue
        limiting_domain: a domain to stay within when queuing

    Outputs:
        None
    '''
    for link in soup.find_all('a'):
        clean_url = util.convert_if_relative_url(
            url, util.remove_fragment(link.get('href')))
        if util.is_absolute_url(clean_url) and str(clean_url)[0] != 'b':
            if (util.is_url_ok_to_follow(clean_url, limiting_domain)
                    and clean_url not in queue.all_items):
                queue.enqueue(clean_url)

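# --- Hedged sketch (not part of the original code) ---
# queue_urls expects a queue object exposing `enqueue` and `all_items`,
# which is not Python's queue.Queue.  This minimal stand-in is purely
# illustrative; the real project presumably defines its own queue class.
class SimpleLinkQueue:
    def __init__(self):
        self.all_items = []

    def enqueue(self, item):
        # keep every queued url so the membership check in queue_urls works
        self.all_items.append(item)

    def dequeue(self):
        return self.all_items.pop(0)
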
def get_all_links(soup, url):
    '''
    Function takes a soup object and returns a list of all relevant
    urls/links for this assignment.

    Input:
        soup (bs4 object): a bs4 soup object of the web page
        url (str): the url from which the bs4 soup was constructed
            (required in order to convert relative urls into absolute urls)

    Output:
        all_links (list): a list of ready-to-go urls
    '''
    all_links = []
    mega_set = soup.find_all('a')
    for link in mega_set:
        if link.has_attr('href'):
            link = util.remove_fragment(link['href'])
            abs_link = util.convert_if_relative_url(url, link)
            if abs_link not in all_links:
                all_links.append(abs_link)
    return all_links

def go(num_articles_to_crawl, start_page, database_name):
    '''
    Crawl PLOS One and generate an SQL database. Automatically samples an
    even number of articles from each subject area based on
    num_articles_to_crawl.

    Inputs:
        num_articles_to_crawl (int): the approximate number of articles to
            process during the crawl
        start_page (int): page of PLOS One subject browsing to start
            crawling at
        database_name (string): name of database to add to
    '''
    urls_visited = set()
    starting_url = "https://journals.plos.org/plosone/browse"
    subject_urls_lst = get_PLOS_subject_urls(starting_url)
    num_articles_per_field = num_articles_to_crawl / 11
    num_pages_per_field = math.ceil(num_articles_per_field / 13)
    for subject_url in subject_urls_lst:
        field = get_field(subject_url)
        subject_url += '?page=' + str(start_page)
        subject_soup = get_soup_object(subject_url)
        current_url = subject_url
        urls_visited.add(current_url)
        for _ in range(num_pages_per_field):
            soup_article_lst = subject_soup.find_all("h2", class_="title")
            for soup_article in soup_article_lst:
                article_url = util.convert_if_relative_url(
                    current_url, soup_article.find_all("a")[0]["href"])
                if article_url not in urls_visited:
                    urls_visited.add(article_url)
                    process_article(article_url, field, database_name)
            current_url = get_next_page(subject_soup, subject_url)
            if not current_url:
                break
            subject_soup = get_soup_object(current_url)
            urls_visited.add(current_url)
    return

def get_clean_urls(url, limiting_domain):
    '''
    Given the url for a webpage, create and return a list of all 'a' tag
    urls in its soup that have been cleaned (absolute urls only) and are
    'ok' to follow

    Inputs:
        url - absolute url
        limiting_domain - domain name

    Outputs:
        list of absolute urls
    '''
    soup = get_soup_from_url(url)
    all_a_tags = soup.find_all("a")
    # get urls (i.e. if tag has 'href' attribute)
    clean_urls = []
    for tag in all_a_tags:
        if tag.has_attr('href'):
            absolute = util.convert_if_relative_url(url, tag['href'])
            if not util.is_url_ok_to_follow(absolute, limiting_domain):
                continue
            absolute = util.remove_fragment(absolute)
            # protocol field reversion
            temp_request = util.get_request(absolute)
            if temp_request is None:
                continue
            reverted_url = util.get_request_url(temp_request)
            # is url ok to follow based on specification in PA2
            if util.is_url_ok_to_follow(reverted_url, limiting_domain):
                clean_urls.append(reverted_url)
    # remove duplicates
    final_url_list = []
    for link in clean_urls:
        if link not in final_url_list:
            final_url_list.append(link)
    return final_url_list

def get_neighbors(node):
    print(" completed")
    neighbors = []
    soup = None
    response = util.get_request(node)
    if response is None:
        print('No response')
        neighbors = None
    else:
        text = util.read_request(response)
        if text == "":
            print("No text read")
            neighbors = None
        else:
            soup = bs4.BeautifulSoup(text, "html5lib")
            for link in soup.find_all("a"):
                url_raw = link.get("href")
                if url_raw is None:
                    continue
                url_rel = util.remove_fragment(url_raw)
                url = util.convert_if_relative_url(node, url_rel)
                print(url)
                if url is not None:
                    neighbors.append(url)
    return neighbors, response, soup

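# --- Hedged sketch (not part of the original code) ---
# A bounded breadth-first crawl built on get_neighbors; the seed URL and
# page limit are illustrative assumptions.
from collections import deque

def example_bfs(seed, max_pages=25):
    seen = {seed}
    frontier = deque([seed])
    while frontier and len(seen) < max_pages:
        node = frontier.popleft()
        neighbors, _, _ = get_neighbors(node)
        if not neighbors:
            continue
        for nbr in neighbors:
            if nbr not in seen:
                seen.add(nbr)
                frontier.append(nbr)
    return seen
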
def find_links(soup, url, post_url, pull_info_q, links_visited,
               limiting_domain):
    '''
    Adds links to be visited to the queue 'pull_info_q' and adds links
    visited to the list 'links_visited.'

    Inputs:
        soup: soup object from the text of the HTML document
        url: starting url to begin crawling with
        post_url: this is the processed absolute url
        pull_info_q: queue of urls that is being added to for each url crawled
        links_visited: list of visited links
        limiting_domain: domain name
    '''
    tag_list = soup.find_all("div", attrs={"class": "search-listing-content"})
    for tag in tag_list:
        href_tag = tag.findChild()
        possible_link = href_tag.get('href')
        actual_link = util.convert_if_relative_url(post_url, possible_link)
        if actual_link is not None and actual_link not in links_visited:
            if util.is_url_ok_to_follow(actual_link, limiting_domain):
                pull_info_q.put(actual_link)
    links_visited.append(url)
    if post_url != url:
        links_visited.append(post_url)

def movie_level_data(index_filename, i=0):
    '''
    Scrapes all movie level data using urls scraped from the all movies page
    '''
    s = 'SELECT title, movie_id, url FROM all_page'
    db = sqlite3.connect('sql_db_files/rotten_tomatoes.db')
    c = db.cursor()
    r = c.execute(s)
    urls = r.fetchall()[:i + 1]
    urls = urls[::-1]
    # Replace the above two lines with the below for forward scraping
    # urls = r.fetchall()[i:]
    db.close()
    current_url = 'https://www.rottentomatoes.com/'
    with open('movie_level_files/' + index_filename + '_' + str(i) + '.csv',
              'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='|')
        for title, movie_id, url in urls:
            url = util.convert_if_relative_url(current_url, url)
            print(i, title, movie_id)
            i -= 1
            # Replace the above for forward scraping
            # i += 1
            r = url_request(index_filename, i, url)
            html = r.read()
            soup = bs4.BeautifulSoup(html, features='html5lib')
            movie = data_collector(soup)
            if movie.get('Runtime:'):
                movie['Runtime:'] = movie.get('Runtime:').strip('minutes ')
            if movie.get('In Theaters:'):
                movie['In Theaters:'] = re.search(
                    '[a-zA-Z]{3} [0-9,]+ [0-9]{4}',
                    movie.get('In Theaters:')).group()
            if movie.get('Box Office:'):
                movie['Box Office:'] = movie.get('Box Office:').strip()
            writer.writerow([
                movie_id, title, movie.get('Directed By:'),
                movie.get('Genre:'), movie.get('In Theaters:'),
                movie.get('On Disc/Streaming:'), movie.get('Box Office:'),
                movie.get('Rating:'), movie.get('Runtime:'),
                movie.get('Studio:'), movie.get('Written By:'),
                movie.get('full_synop'), movie.get('all_reviewers_average'),
                movie.get('num_reviewers'), movie.get('all_fresh'),
                movie.get('all_rotten'), movie.get('top_reviewers_average'),
                movie.get('num_top_reviewers'), movie.get('top_fresh'),
                movie.get('top_rotten'), movie.get('user_rating'),
                movie.get('num_users')
            ])

def get_lyrics(cnx, cursor, url=STARTING_URL, visited_pages=set()):
    # note: the mutable default for visited_pages is shared across top-level
    # calls; pass a fresh set explicitly to start a new crawl
    print("\ntrying", url)
    visited_pages.add(url)
    print("visited_pages length: {}".format(len(visited_pages)))
    try:
        url, soup = open_page(url)
    except Exception:
        print("there was an exception - you done f****d up\n")
        return None
    if not url:
        print("no url")
        return None
    # base case: a .txt lyrics page, whose text is stored in the database
    if url[-4:] == '.txt':
        if soup.find('pre'):
            text = soup.find('pre').text
            print("adding {}\n".format(url))
        else:
            ps = soup.find_all('p')
            ps = list(filter(lambda x: 'Artist:' in x.text, ps))
            if len(ps) == 1:
                text = ps[0].text
            else:
                text = soup.text
        cursor.execute(ADD_RAW_TEXT, (url, text))
        cnx.commit()
    # recursive case: follow the links on the page
    else:
        if soup.find('div', id='leftmain'):
            tag = soup.find('div', id='leftmain')
        else:
            tag = soup
        new_links = tag.find_all('a', href=True)
        for link in new_links:
            clean_link = util.remove_fragment(link['href'])
            if not clean_link:
                print("no clean link")
                continue
            abs_link = util.convert_if_relative_url(url, clean_link)
            if (abs_link in visited_pages
                    or not util.is_url_ok_to_follow(abs_link, LIMITING_DOMAIN)
                    or 'update' in abs_link):
                print("child link shan't be followed: {}".format(abs_link))
                continue
            get_lyrics(cnx, cursor, abs_link, visited_pages)