def get_cities():
    '''
    Intersects the cities listed on rentcafe.com's Illinois average-rent page
    with the Cook County cities returned by get_loc_cook().
    '''
    city_state = get_loc_cook()
    cook_cities = []
    for ele in city_state:
        cook_cities.append(ele[0])

    url = 'https://www.rentcafe.com/sitemaps/us/il/average-rent-market-trends/'
    request = util.get_request(url)
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    tags = soup.find_all('a', href=True, target="", role="")

    cities = []
    for tag in tags:
        if "title" in tag.attrs:
            city = tag['title']
            if city[0:15] == "Average Rent in":
                city = city[16:]
                if city in cook_cities:
                    cities.append(city)
    return cities
def get_walk_score(zip_code):
    '''
    Gets the walk score for a single zip code.

    Input:
        zip_code (str or int): a US zip code

    Output:
        score (int): Walk Score for that zip code. Missing values get -1.
    '''
    url = "https://www.walkscore.com/score/" + str(zip_code)
    req = util.get_request(url)
    if req:
        text = util.read_request(req)
    else:
        score = -1
        text = None
    if text:
        soup = bs4.BeautifulSoup(text, features='lxml')
        span = soup.find('span', attrs={'id': 'score-description-sentence'})
        try:
            score_txt = span.text
            match = re.search(r"(Walk Score of)(\s)(\d+)(\s)", score_txt)
            score = int(match.group(3))
        except AttributeError:
            score = -1
    else:
        score = -1
    return score
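# Illustrative usage sketch (not part of the original code): calls
# get_walk_score() with an arbitrary example zip code and handles the -1
# sentinel used for missing values.
def _example_get_walk_score():
    score = get_walk_score("60637")
    if score == -1:
        print("Walk Score unavailable for this zip code")
    else:
        print("Walk Score:", score)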
def get_restaurant_links_chicago():
    # start from searching "Restaurants", "Chicago" from the Yelp main page
    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []
    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)

    # accumulate links across all result pages
    links = []
    for url in url_list:
        request = util.get_request(url)
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")
        # extract href links to restaurants
        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            # hardcoded filter on the url suffix
            if link and link[-11:] == "Restaurants":
                if tag["name"] != '':
                    if link not in links:
                        links.append(link)
    return links
def get_movie_links():
    '''
    Gets all movie links, short synopsis, runtime, three main actors, and id.
    '''
    movie_dict = {}
    start_url = ('https://www.rottentomatoes.com/api/private/v2.0/browse?'
                 'maxTomato=100&maxPopcorn=100&services=amazon;hbo_go;itunes;'
                 'netflix_iw;vudu;amazon_prime;fandango_now&certified&sortBy='
                 'release&type=dvd-streaming-all&page=')
    for i in range(312):
        r = util.get_request(start_url + str(i))
        result = r.json()
        count = result.get('counts').get('count')
        if count != 32:
            print("HEY, THIS ONE IS DIFFERENT:", count)
        for movie in result.get('results'):
            new_movie = {}
            print(i, movie.get('title'))
            movie_id = movie.get('id')
            new_movie['actors'] = movie.get('actors')
            new_movie['h_runtime'] = movie.get('runtime')
            new_movie['short_synopsis'] = movie.get('synopsis')
            new_movie['title'] = movie.get('title')
            new_movie['relative_url'] = movie.get('url')
            new_movie['poster_url'] = movie.get('posters').get('primary')
            movie_dict[movie_id] = new_movie
    return movie_dict
def test_get_channels(channel, game_id, token):
    URL = config.GET_CHANNEL_ROUTE.format(game_id)
    received_channels = util.get_request(URL, util.make_headers(token))
    if len(received_channels) != 1:
        raise ValueError('Wrong number of channels: {}'.format(
            len(received_channels)))
    if received_channels[0]['id'] != channel['id']:
        raise ValueError('Wrong channel returned: {}'.format(
            received_channels[0]['id']))
def crawler():
    # starting_url = "https://www.teenlife.com/search/?q=None&l=None&c=Summer%20Program&p=1"
    starting_url = "https://rusticpathways.com/students/programs?_=1584132668586&page=1"
    limiting_domain = "rusticpathways.com"
    numpages = 0
    links_visited = []
    index_list = []
    page_parser_q = queue.Queue()
    pull_info_q = queue.Queue()
    page_parser_q.put(starting_url)

    while not page_parser_q.empty():
        link = page_parser_q.get()
        mini_crawler(link, page_parser_q, pull_info_q, links_visited,
                     limiting_domain, index_list)
        numpages += 1

    while not pull_info_q.empty():
        page_link = pull_info_q.get()
        request = util.get_request(page_link)
        if request is not None:
            html = util.read_request(request)
            soup = bs4.BeautifulSoup(html, features="html5lib")
            make_index(soup, index_list, page_link)

    df = pd.DataFrame(index_list)
    return df
def mini_crawler(url, page_parser_q, pull_info_q, links_visited,
                 limiting_domain, index_list, parsing_default_domain):
    '''
    Crawls a search-results page, queues the pages it links to, and queues
    the next results page from the pagination bar.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q: queue of results pages in line to be crawled
        pull_info_q: queue of pages to pull information from
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: list that collects the indexed information
        parsing_default_domain: default domain used when building urls
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited,
               limiting_domain)

    # Queue the next results page from the pagination bar.
    tag_list = soup.find_all("ul", attrs={"class": "pagination"})
    current_page = tag_list[0].find_all("li", attrs={"class": "current"})
    next_page = current_page[0].next_sibling.next_sibling.findChild()
    next_page_href = next_page.get('href')
    next_page_href = util.convert_if_relative_url(post_url, next_page_href)
    page_parser_q.put(next_page_href)
def crawler():
    starting_url = "https://www.teenlife.com/search?q=&l=&c=Summer%20Program&p=1"
    limiting_domain = "www.teenlife.com"
    parsing_default_domain = "https://www.teenlife.com/search"
    numpages = 0
    links_visited = []
    index_list = []
    page_parser_q = queue.Queue()
    pull_info_q = queue.Queue()
    page_parser_q.put(starting_url)

    while not page_parser_q.empty():
        link = page_parser_q.get()
        mini_crawler(link, page_parser_q, pull_info_q, links_visited,
                     limiting_domain, index_list, parsing_default_domain)
        numpages += 1
        print(link, "link")

    while not pull_info_q.empty():
        page_link = pull_info_q.get()
        print(page_link, "page_link")
        request = util.get_request(page_link)
        if request is not None:
            html = util.read_request(request)
            soup = bs4.BeautifulSoup(html, features="html5lib")
            make_index(soup, index_list)

    df = pd.DataFrame(index_list)
    return df
def mini_crawler(url, page_parser_q, pull_info_q, links_visited,
                 limiting_domain, index_list):
    '''
    Crawls a search-results page, queues the pages it links to, and queues
    every page listed in the pagination bar.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q: queue of results pages in line to be crawled
        pull_info_q: queue of pages to pull information from
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: list of dictionaries, one per webpage
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited,
               limiting_domain)

    # Queue the remaining pages listed in the pagination bar.
    tag_list = soup.find_all("ul", attrs={"class": "Pagination"})
    pages = tag_list[0].find_all("li")
    pages = pages[1:]
    for page in pages:
        page_parser_q.put(page.findChild().get('href'))
def queue_children_sites(starting_url, queue):
    '''
    Given a url and a queue, adds all children urls of the start point to the
    queue.

    Inputs:
        starting_url -- string that corresponds to a url
        queue -- queue.Queue object

    Outputs:
        None; the queue is modified in place to contain all child urls.
    '''
    # turn http into https if it is not already
    if starting_url[4] == 's':
        pass
    else:
        starting_url = starting_url[:4] + 's' + starting_url[4:]
    request = util.get_request(starting_url)
    assert request is not None
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    URLs = soup.find_all("a")
    URLs = [URL["href"] for URL in URLs if URL.has_attr("href")]
    children = []
    for URL in URLs:
        if util.is_absolute_url(URL):
            children.append(URL)
        else:
            URL = util.convert_if_relative_url(starting_url, URL)
            children.append(URL)
    # limiting_domain is assumed to be defined at module level
    children = [
        child for child in children
        if util.is_url_ok_to_follow(child, limiting_domain)
    ]
    for child in children:
        queue.put(child)
def test_get_me(env):
    user_1 = env['directory']['users']['user-1']['user']
    token = env['directory']['users']['user-1']['token']
    resp = util.get_request(config.GET_USER_ROUTE, util.make_headers(token))
    if resp['id'] != user_1['id']:
        raise ValueError('Wrong user id returned from: {}'.format(
            config.GET_USER_ROUTE))
def get_game_urls(master_regular_game_dict, day_urls, urls_visited):
    game_id = 0
    parent_url = "www.baseball-reference.com"
    for day_url in day_urls:
        if day_url not in urls_visited:
            urls_visited.append(day_url)
            abs_day_url = url_check(day_url, parent_url)
            if abs_day_url:
                day_request = util.get_request(abs_day_url)
                if day_request:
                    day_text = day_request.text
                    day_soup = bs4.BeautifulSoup(day_text,
                                                 parse_only=bs4.SoupStrainer("pre"))
                    possible_links = day_soup.find_all("a")
                    # box-score links have no letters in their anchor text
                    game_urls = []
                    for link in possible_links:
                        if re.search('[a-zA-Z]', link.text) is None:
                            game_urls.append(link.get("href"))
                    num_games = len(game_urls)
                    for game_url in game_urls:
                        game_dict = get_game_info(game_url, parent_url,
                                                  urls_visited, num_games)
                        game_id += 1
                        master_regular_game_dict[game_id] = game_dict
    return master_regular_game_dict
def go(housing_links):
    '''
    Main function.

    Inputs:
        housing_links (list): a list of links obtained from inputting
            different zipcodes into the search bar of rentcafe.com

    Output:
        d (dict): a dictionary mapping each zipcode to a tuple
            (mean_price, income)
    '''
    # a dictionary with zipcodes as keys, (avg rent price, income) as values
    d = {}
    for link in housing_links:
        zip_code = str(link[-5:])
        d[zip_code] = []
        request = util.get_request(link)
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")

        # find the median income for this zipcode
        li_tags = soup.find_all('li', class_="medium")
        income = np.int64(re.findall(r'\d+(?:,\d+)?',
                                     li_tags[2].text)[0].replace(',', ''))

        # collect all subpages under this zipcode
        pages_to_crawl = []
        tags = soup.find('ul', class_="pagination")
        if tags is None:
            pages_to_crawl = [link]
        else:
            pages = tags.find_all('a', href=True)
            for a in pages:
                if a['href'] not in pages_to_crawl:
                    pages_to_crawl.append(a['href'])

        for url in pages_to_crawl:
            request = util.get_request(url)
            text = util.read_request(request)
            soup = bs4.BeautifulSoup(text, "html5lib")
            property_tags = soup.find_all('div', class_='item-information')
            for item in property_tags:
                d[zip_code].append(find_adj_price(item))

        d[zip_code] = (np.mean([x for x in d[zip_code] if x != 0]), income)
    return d
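# Illustrative usage sketch (not part of the original code): the housing link
# below is a placeholder that only demonstrates the "...<zipcode>" suffix that
# go() slices with link[-5:]; real links would come from the rentcafe search.
def _example_go():
    housing_links = [
        "https://www.rentcafe.com/apartments-for-rent/us/il/chicago/60637",
    ]
    rent_by_zip = go(housing_links)
    for zip_code, (mean_price, income) in rent_by_zip.items():
        print(zip_code, mean_price, income)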
def create_dictionary(num_pages_to_crawl, course_map_filename, starting_url,
                      limiting_domain):
    '''
    Creates the dictionary mapping course id numbers to the words in the
    course titles and descriptions.

    Inputs:
        num_pages_to_crawl: (int) The number of pages to process during the
            crawl.
        course_map_filename: (string) The name of the JSON file that contains
            the mapping of course codes to course ids.
        starting_url: (string) The url of the first page that the crawler
            visits.
        limiting_domain: (string) The limiting domain of the url.

    Outputs:
        The dictionary mapping course id numbers to the words in the course
        titles and descriptions.
    '''
    with open(course_map_filename) as json_file:
        coursemap = json.load(json_file)

    url_list = []
    url_queue = queue.Queue()
    num_pages = 0
    course_dict = {}
    process_dict = {}

    starting_url = clean_url(starting_url, limiting_domain, parent_url=None)
    if starting_url:
        url_queue.put(starting_url)

    while num_pages < num_pages_to_crawl and not url_queue.empty():
        num_pages += 1
        next_url = url_queue.get()
        if next_url and next_url not in url_list:
            request = util.get_request(next_url)
            if request:
                request_url = util.get_request_url(request)
                if request_url and request_url not in url_list:
                    url_list.append(next_url)
                    if request_url not in url_list:
                        url_list.append(request_url)
                    html_text = util.read_request(request)
                    soup = bs4.BeautifulSoup(html_text, "html5lib")
                    process_dict.update(find_course_info(soup, coursemap,
                                                         course_dict))
                    if process_dict:
                        course_dict.update(process_dict)
                    href_list = soup.find_all("a", href=True)
                    for h in href_list:
                        h_url = h['href']
                        h_url = clean_url(h_url, limiting_domain, request_url)
                        if h_url:
                            url_queue.put(h_url)
    return course_dict
def get_data(self, numPoints, url, useProxy):
    logging.info('Starting {} Requests to {}. Proxy? {}'.format(
        numPoints, url, useProxy))
    rtts = []
    errors = 0
    for i in range(numPoints):
        try:
            if useProxy:
                res = get_request(url, self.proxyDict)
            else:
                res = get_request(url)
            rtts.append(res.elapsed.total_seconds())
            if i % 10 == 0:
                time.sleep(5)  # avoid 429s
        except Exception as e:
            errors += 1
    logging.info('Finished {} Requests to {}. Got {} errors'.format(
        numPoints, url, errors))
    return (rtts, errors)
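# Illustrative usage sketch (not part of the original code): assumes `client`
# is an instance of the surrounding class with a populated proxyDict; the URL
# and sample size are arbitrary placeholders.
def _example_get_data(client):
    import statistics

    rtts, errors = client.get_data(20, "https://example.com", useProxy=False)
    if rtts:
        print("mean RTT:", statistics.mean(rtts), "errors:", errors)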
def worker(self, nReqs, url):
    errors = 0
    start = time.time()
    for i in range(nReqs):
        try:
            res = get_request(url, self.proxyDict)
        except Exception as e:
            errors += 1
    elapsed = time.time() - start
    self.workers.append(workerData(elapsed, nReqs, errors))
def get_alpha_player_urls(letter_url):
    abs_letter_url = letter_url
    letter_request = util.get_request(abs_letter_url)
    if letter_request:
        letter_text = letter_request.text
        letter_soup = bs4.BeautifulSoup(letter_text,
                                        parse_only=bs4.SoupStrainer("pre"))
        # make the list of player urls
        player_urls = [a.attrs.get("href") for a in letter_soup.select("a")]
        return player_urls
def test_get_users(env):
    ids = [user['user']['id'] for user in env['directory']['users'].values()]
    get_users_route = '{}?{}'.format(config.GET_USERS_ROUTE,
                                     util.create_query('userId', ids))
    token = env['directory']['users']['user-1']['token']
    users = util.get_request(get_users_route, util.make_headers(token))
    found_ids = set([user['id'] for user in users])
    for id in ids:
        if id not in found_ids:
            raise ValueError(f'Id: {id} not found')
def get_game_info(game_url, parent_url, urls_visited, num_games):
    game_dict = {"date": "", "stadium": "", "team1": "", "team2": "",
                 "team1_runs": "", "team2_runs": "", "team1_hits": "",
                 "team2_hits": "", "team1_hr": "", "team2_hr": "",
                 "winner": ""}
    if game_url not in urls_visited:
        urls_visited.append(game_url)
        abs_game_url = url_check(game_url, parent_url)
        if abs_game_url:
            print("abs_game_url", abs_game_url)
            game_request = util.get_request(abs_game_url)
            if game_request:
                game_text = game_request.text
                game_soup = bs4.BeautifulSoup(
                    game_text,
                    parse_only=bs4.SoupStrainer("div", id="page_content"))
                all_tables = game_soup.find_all("table", class_=False)
                game_table = all_tables[num_games]
                game_table_rows = game_table.find_all("tr")

                date_and_stadium_table = game_table_rows[0]
                date_and_stadium = date_and_stadium_table.find_all(
                    "div", class_="bold_text float_left")
                date = date_and_stadium[0].text
                stadium = date_and_stadium[1].text

                hits_runs_and_teams_table = game_table_rows[3]
                teams = hits_runs_and_teams_table.find_all("a", href=True)
                team1 = teams[0].text
                team2 = teams[1].text
                game_data = hits_runs_and_teams_table.find_all("strong")
                team1_runs = game_data[2].text[4]
                team2_runs = game_data[4].text[4]
                team1_hits = game_data[2].text[7]
                team2_hits = game_data[4].text[7]

                team1_hr_table = all_tables[num_games + 5]
                team1_hr_row = team1_hr_table.find("tfoot")
                team1_hr = team1_hr_row.find_all("td")[7]
                team2_hr_table = all_tables[num_games + 4]
                team2_hr_row = team2_hr_table.find("tfoot")
                team2_hr = team2_hr_row.find_all("td")[7]

                if int(team1_runs) > int(team2_runs):
                    winner = team1
                elif int(team2_runs) > int(team1_runs):
                    winner = team2
                else:
                    winner = "tie"

                game_dict["date"] = date
                game_dict["stadium"] = stadium[2:]
                game_dict["team1"] = team1
                game_dict["team2"] = team2
                game_dict["team1_runs"] = team1_runs
                game_dict["team2_runs"] = team2_runs
                game_dict["team1_hits"] = team1_hits
                game_dict["team2_hits"] = team2_hits
                game_dict["team1_hr"] = team1_hr
                game_dict["team2_hr"] = team2_hr
                game_dict["winner"] = winner
                print("game dict", game_dict)
    return game_dict
def open_page(url):
    r = util.get_request(url)
    if r:
        return r.url, BeautifulSoup(util.read_request(r))
    else:
        return None, None
def test_get_messages(channel, token):
    URL = config.MESSAGE_ROUTE.format(channel['id'])
    messages = util.get_request(URL, util.make_headers(token))
    if len(messages) != len(config.MELIAN_DIALOGE):
        raise ValueError('Wrong number of messages received: {}'.format(
            len(messages)))
    for i in range(len(messages)):
        expected = config.MELIAN_DIALOGE[i]
        actual = messages[i]
        log.debug('Message Id: {}'.format(actual['id']))
        log.debug(actual['text'])
        if actual['text'] != expected:
            raise ValueError('Wrong message text for message: {}'.format(
                actual['id']))
def build_search_engine(starting_url, limiting_domain,
                        max_num_pages_to_visit):
    urls = Queue.Queue()
    visited = []
    index = {}

    def search(word):
        rv = []
        matches = []
        words = re.findall(r"[a-zA-Z]\w*", word)
        if len(words) == 0:
            return []
        for url in index.keys():
            for title in index[url].keys():
                for word in words:
                    word = word.lower()
                    if word in title or word in index[url][title]:
                        matches.append((title, url))
        for pair in matches:
            if matches.count(pair) == len(words):
                if pair not in rv:
                    rv.append(pair)
        return rv

    if util.is_url_ok_to_follow(starting_url, limiting_domain):
        urls.put(starting_url)
        while not urls.empty() and len(visited) < max_num_pages_to_visit:
            top_queue = urls.get()
            if top_queue not in visited and util.is_url_ok_to_follow(
                    top_queue, limiting_domain):
                request = util.get_request(top_queue)
                if request is None:
                    visited.append(top_queue)
                    continue
                new_page = util.get_request_url(request)
                if new_page != top_queue:
                    if new_page not in visited:
                        visited.append(new_page)
                        top_queue = new_page
                data = bs4.BeautifulSoup(util.read_request(request))
                visited.append(top_queue)
                index = indexer(index, top_queue, data)
                for link in data.find_all('a'):
                    href = link.get('href')
                    if href is None:
                        continue
                    href = util.remove_fragment(href)
                    if not util.is_absolute_url(href):
                        url = util.convert_if_relative_url(top_queue, href)
                        urls.put(url)
    else:
        return None
    return search
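# Illustrative usage sketch (not part of the original code): the url, domain,
# and page budget below are placeholders. build_search_engine returns the
# nested search() closure (or None), which maps a query string to a list of
# (title, url) pairs.
def _example_build_search_engine():
    search = build_search_engine("https://example.edu/catalog/index.html",
                                 "example.edu", 100)
    if search is not None:
        for title, url in search("computer science"):
            print(title, url)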
def get_soup_from_url(url):
    '''
    Input:
        url - absolute url

    Returns: BeautifulSoup object corresponding to url
    '''
    request = util.get_request(url)
    if request is None:
        return None
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    return soup
def get_restaurant_links_cook():
    cities = get_cities()
    city_state = get_loc_cook()
    new_city_state = []
    for ele in city_state:
        if ele[0] in cities:
            new_city_state.append(ele)
    page_suffix = [i for i in range(0, 231, 10)]

    # build the search urls from the filtered (city, state) pairs
    url_list = []
    for city, state in new_city_state:
        html = ("https://www.yelp.com/search?find_desc=Restaurants&find_loc="
                + city.replace(" ", "") + "%2C%20" + state)
        for suffix in page_suffix:
            html_page = html + "&start=" + str(suffix)
            url_list.append(html_page)

    # debug override: restricts the crawl to a single Lyons results page
    url_list = [
        "https://www.yelp.com/search?find_desc=Restaurants&find_loc=Lyons%2C%20IL&start=190"
    ]

    links = []
    for url in url_list:
        request = util.get_request(url)
        if request:
            text = util.read_request(request)
            soup = bs4.BeautifulSoup(text, "html5lib")
            tags = soup.find_all('a', href=True, target="", role="")
            # extract href links to restaurants
            for tag in tags:
                link = tag['href']
                link = util.convert_if_relative_url(url, link)
                link = util.remove_fragment(link)
                # hardcoded filter on the url suffix
                if link and link[-11:] == "Restaurants":
                    if tag["name"] != '':
                        if link not in links:
                            links.append(link + "\n")
    return links
def cook_soup(url):
    '''
    Simple function: takes a url and converts it into a BeautifulSoup object.

    Input:
        url (str): a valid url/link

    Output:
        soup (bs4 object): a bs4 soup object using the 'html5lib' parser
    '''
    answer = None
    r_object = util.get_request(url)
    if r_object:
        soup = BeautifulSoup(r_object.content, "html5lib")
        answer = soup
    return answer
def create_players(master_player_dict, player_urls, parent_url):
    '''
    Loops through the list of player urls and passes each player page down to
    the get_player_info_* helpers so they can pull the team and year
    information.
    '''
    id_number = 0
    for player_url in player_urls:
        print("index player_url", player_urls.index(player_url))
        print("player_url:", player_url)
        player_dict = {"name": "", "positions": "", "years": "", "span": "",
                       "years_played": "", "teams": "", "WARs_nonpitcher": "",
                       "WARs_pitcher": "", "ERAs": "", "IPs": "", "GSs": "",
                       "FIPs": "", "E_Fs": ""}
        abs_player_url = parent_url + player_url
        if abs_player_url:
            player_request = util.get_request(abs_player_url)
            if player_request:
                player_text = player_request.text

                # gather everything that goes into the player dictionary
                batting = get_player_info_from_standard_batting(player_text)
                main_page = get_player_info_from_main_player_page(player_text)
                player_dict["years"] = batting[0]
                player_dict["teams"] = batting[1]
                player_dict["name"] = main_page[0]
                player_dict["positions"] = main_page[1]
                player_dict["WARs_nonpitcher"] = \
                    get_player_info_from_player_value_batters(player_text)

                if "Pitcher" in player_dict["positions"]:
                    pitching = get_player_info_from_standard_pitching(player_text)
                    player_dict["ERAs"] = pitching[0]
                    player_dict["IPs"] = pitching[1]
                    player_dict["GSs"] = pitching[2]
                    player_dict["FIPs"] = pitching[3]
                    player_dict["E_Fs"] = pitching[4]
                    player_dict["WARs_pitcher"] = \
                        get_player_info_from_player_value_pitchers(player_text)

                if player_dict["years"] != "":
                    years = player_dict["years"]
                    player_dict["span"] = "-".join([years[:4],
                                                    years[len(years) - 4:]])
                    player_dict["years_played"] = (int(years[len(years) - 4:])
                                                   - int(years[:4]))

        print("id number:", id_number)
        print("player dict:", player_dict)
        master_player_dict[id_number] = player_dict
        id_number += 1
    return master_player_dict
def make_soup(url):
    '''
    Makes a soup object from an html request object.

    Inputs:
        url: url to request

    Outputs:
        soup - Soup object if the request is valid, otherwise None.
    '''
    req = util.get_request(url)
    html = util.read_request(req)
    if html is not None and html != "":
        soup = bs4.BeautifulSoup(html, "html5lib")
        return soup
    return None
def get_soup(url):
    '''
    Returns the soup of the current_market_url.

    Inputs:
        url: str

    Output:
        BeautifulSoup object
    '''
    time.sleep(0.05)
    url_request = util.get_request(url)
    if not url_request:
        return False
    html = util.read_request(url_request)
    if not html:
        return False
    return bs4.BeautifulSoup(html, "html5lib")
def get_soup_object(url):
    """
    Takes a url, checks for possible redirection, returns soup object.

    Inputs:
        url (string)

    Returns:
        Soup object
    """
    request = util.get_request(url)
    html_text = util.read_request(request)
    soup = bs4.BeautifulSoup(html_text, 'html5lib')
    return soup
def get_restaurant_links():
    '''
    Start from searching "Restaurants", "Chicago" on the Yelp main page, and
    collect all restaurant links from 24 pages.

    Input:
        None

    Output:
        links (list): a list of links
    '''
    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []
    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)

    links = []
    count = 0
    for url in url_list:
        count += 1
        print(count)
        request = util.get_request(url)
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")
        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            if link and link[-11:] == "Restaurants":
                if tag["name"] != '':
                    if link not in links:
                        links.append(link + "\n")
                        print(link)
        # pause between pages to avoid hammering the site
        i = 5 + random.random() * 5
        time.sleep(i)
    return links
def get_referees(url):
    base_url = 'http://www.basketball-reference.com/boxscores/'
    comp_url = base_url + url + '.html'
    request = util.get_request(comp_url)
    if request is not None:
        html = util.read_request(request)
        if html is not None:
            soup = bs4.BeautifulSoup(html, "html5lib")
            div_tags = soup.find_all('div')
            good_tags = str(div_tags)
            string = re.findall(r'(?<=Officials:)(.*?)(?=\<br)', good_tags)
            rv = re.findall(r'(?<=.html\"\>)(.*?)(?=<\/a)', string[0])
            return rv
def get_day_urls(regular_season_year_urls, parent_url, master_regular_game_dict):
    parent_url = "http://www.baseball-reference.com/boxes/"
    for year_url in regular_season_year_urls:
        abs_year_url = parent_url + year_url
        print("year_url", year_url)
        print("abs_year_url", abs_year_url)
        year_request = util.get_request(abs_year_url)
        if year_request:
            year_text = year_request.text
            year_soup = bs4.BeautifulSoup(
                year_text,
                parse_only=bs4.SoupStrainer("table", class_="wide_container"))
            day_urls = [a.attrs.get("href") for a in year_soup.select("a")]
            print("day urls:", day_urls)
    return master_regular_game_dict
def get_ps_series_page(series_urls, parent_url, master_ps_game_dict):
    # game_id is initialized before the series loop so that games from later
    # series do not overwrite earlier entries in master_ps_game_dict
    game_id = 0
    for series_url in series_urls:
        abs_series_url = parent_url + series_url
        if abs_series_url:
            series_request = util.get_request(abs_series_url)
            if series_request:
                series_text = series_request.text
                series_soup = bs4.BeautifulSoup(series_text,
                                                parse_only=bs4.SoupStrainer("pre"))
                games = series_soup.find_all("pre")
                if len(games) != 0:
                    print("games", games)
                    for game in games:
                        print("game_id", game_id)
                        game_dict = get_ps_game_info(game)
                        master_ps_game_dict[game_id] = game_dict
                        game_id += 1
    return master_ps_game_dict
def get_paper_links(number_of_articles, fields):
    '''
    Crawls through Nature search pages and pulls article links from different
    fields.

    Input:
        number_of_articles, int: number of articles to find
        fields, list of field str: all major fields in nature.com/subjects

    Output:
        paper_links, list of paper urls (str)
    '''
    search_url = ('https://www.nature.com/search?article_type=protocols'
                  '%2Cresearch%2Creviews&subject=')
    suffix = '&page='
    search_urls = []
    paper_links = []

    # split the requested number of articles evenly across the 8 fields,
    # 50 results per search page
    num_articles_per_field = number_of_articles // 8
    num_pages_to_visit = int(np.ceil(num_articles_per_field / 50))
    num_on_last_page = num_articles_per_field % 50

    for field in fields:
        for i in range(num_pages_to_visit):
            new_url = search_url + field + suffix + str(i + 1)
            search_urls.append(new_url)

    for url in search_urls:
        num_to_search = 50
        if int(url[-1]) == num_pages_to_visit:
            num_to_search = num_on_last_page
        new_request = util.get_request(url)
        html = util.read_request(new_request)
        search_soup = bs4.BeautifulSoup(html, features='html.parser')
        article_links = search_soup.find_all('h2',
                                             class_='h3 extra-tight-line-height',
                                             itemprop='headline')
        article_links = article_links[:num_to_search]
        paper_links.extend([i.find('a')['href'] for i in article_links])
    return paper_links
def get_player_urls(letter_urls, parent_url):
    '''
    Loops through the list of letter urls and builds a list of player urls,
    which is passed down to create_players so that the info for each player
    can be made into a mini dictionary that can be added to the master
    dictionary.
    '''
    player_urls = []
    for letter_url in letter_urls:
        abs_letter_url = parent_url + letter_url
        print("abs_letter_url", abs_letter_url)
        letter_request = util.get_request(abs_letter_url)
        if letter_request:
            letter_text = letter_request.text
            letter_soup = bs4.BeautifulSoup(letter_text,
                                            parse_only=bs4.SoupStrainer("pre"))
            # accumulate the player urls found on this letter's page
            player_urls.extend(a.attrs.get("href")
                               for a in letter_soup.select("a"))
    return player_urls
def analyze_page(url, queue, limiting_domain, course_dict):
    '''
    Queues all urls, then makes a dictionary index of the course codes to a
    list of words in the course description.

    Inputs:
        url: the url of the page to analyze
        queue: the queue that holds the urls
        limiting_domain: the domain to stay within when queuing
        course_dict: the index dictionary

    Outputs:
        None
    '''
    request = util.get_request(url)
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    queue_urls(url, soup, queue, limiting_domain)
    find_courses(soup, course_dict)
def make_br_player_dict():
    '''
    Crawls through the players part of baseball-reference and makes a
    dictionary where the keys are the player_id for each player, which map to
    the teams each player played for and the corresponding year for each
    team.
    '''
    master_player_dict = {}
    letter_urls = []
    player_urls = []
    starting_url = "http://www.baseball-reference.com/players/"
    parent_url = "http://www.baseball-reference.com"
    request = util.get_request(starting_url)
    if request:
        text = request.text
        soup = bs4.BeautifulSoup(
            text,
            parse_only=bs4.SoupStrainer("td", class_="xx_large_text bold_text"))
        # make the list of letter urls
        letter_urls = [a.attrs.get("href") for a in soup.select("a")]
        player_urls = get_player_urls(letter_urls, parent_url)
        master_player_dict = create_players(master_player_dict, player_urls,
                                            parent_url)
    return master_player_dict
def make_br_games_dict():
    master_game_dict = {}
    master_regular_game_dict = {}
    master_ps_game_dict = {}
    urls_visited = []
    starting_url = "http://www.baseball-reference.com/boxes/"
    parent_url = "http://www.baseball-reference.com"
    if starting_url not in urls_visited:
        urls_visited.append(starting_url)
        request = util.get_request(starting_url)
        if request:
            text = request.text
            soup = bs4.BeautifulSoup(
                text,
                parse_only=bs4.SoupStrainer("table", class_="large_text"))
            # regular-season crawling is currently disabled:
            # regular_season_year_urls = get_regular_season_year_urls(soup)
            # day_urls = get_day_urls(regular_season_year_urls, parent_url)
            # master_regular_game_dict = get_game_urls(master_regular_game_dict, day_urls)
            series_urls = get_post_season_urls(parent_url)
            master_ps_game_dict = get_ps_series_page(series_urls, parent_url,
                                                     master_ps_game_dict)
    # only master_ps_game_dict is populated above; master_game_dict is
    # returned as-is
    return master_game_dict
def translate_general(code):
    '''
    Given an ICD code, scrapes www.icd10data.com and returns a meaningful
    translation as a string.

    Input:
        code (int): an ICD code

    Output:
        rv (string): translation of the ICD code
    '''
    url = BASE_GEN + str(code) + '&codebook=icd9volume1'
    ro = util.get_request(url)
    html = util.read_request(ro)
    soup = bs4.BeautifulSoup(html, "html5lib")
    rv = None
    search = soup.find('div').next_sibling.next_sibling.find(
        'div', class_='searchPadded')
    if search and search.text:
        rv = search.text
    return rv
def get_post_season_urls(parent_url):
    # Need to get the urls for all the World Series/postseason pages back to
    # 1903; data before that is too sparse to be useful.
    abs_ps_url = parent_url + "/postseason/"
    if abs_ps_url:
        parent_url = abs_ps_url
        ps_request = util.get_request(abs_ps_url)
        if ps_request:
            ps_text = ps_request.text
            ps_soup = bs4.BeautifulSoup(
                ps_text,
                parse_only=bs4.SoupStrainer("div", id="page_content"))
            series_urls = [a.attrs.get("href")
                           for a in ps_soup.select('a[href^="/postseason/"]')]
            return series_urls