Example #1
def get_cities():
    city_state = get_loc_cook()
    cook_cities = []
    for ele in city_state:
        cook_cities.append(ele[0])

    #print(cook_cities)
    #print(len(cook_cities))

    url = 'https://www.rentcafe.com/sitemaps/us/il/average-rent-market-trends/'
    request = util.get_request(url)
    text = util.read_request(request)

    soup = bs4.BeautifulSoup(text, "html5lib")
    tags = soup.find_all('a', href=True, target="", role="")

    cities = []
    count = 0
    for tag in tags:
        if "title" in tag.attrs:
            city = tag['title']
            if city[0:15] == "Average Rent in":
                #print(city)
                city = city[16:]
                #print(city)
                count += 1
                if city in cook_cities:
                    cities.append(city)

    #print(count)
    #print(len(cities))

    return cities
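Almost every example on this page depends on a small `util` helper module that is not shown. As a rough, assumption-based sketch (not the actual module), the two helpers used most often can be thought of as thin wrappers around the requests library:

import requests

def get_request(url):
    # hypothetical sketch: return a response object for url, or None if the request fails
    try:
        return requests.get(url, timeout=10)
    except requests.RequestException:
        return None

def read_request(request):
    # hypothetical sketch: return the body of the response as text ('' on failure)
    try:
        return request.text
    except AttributeError:
        return ''

The remaining helpers seen below (convert_if_relative_url, remove_fragment, get_request_url, is_url_ok_to_follow) do url bookkeeping for the crawlers and are likewise assumed, not shown.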
Example #2
def get_walk_score(zip_code):
    '''
    Gets walk score for single zip code

    Input:
    zip_code (str or int): a US zip code

    Output:
    score (int): Walk score for that zip code. Missing values get -1.
    '''

    url = "https://www.walkscore.com/score/" + str(zip_code)
    req = util.get_request(url)
    if req:
        text = util.read_request(req)
    else:
        score = -1
        text = None
    if text:
        soup = bs4.BeautifulSoup(text, features='lxml')
        span = soup.find('span', attrs={'id': 'score-description-sentence'})
        try:
            score_txt = span.text
            match = re.search(r"(Walk Score of)(\s)(\d+)(\s)", score_txt)
            score = int(match.group(3))
        except AttributeError:
            score = -1
    else:
        score = -1

    return score
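A minimal usage sketch for the function above; the zip code is only an illustration:

walk = get_walk_score("60637")
if walk == -1:
    print("no walk score available")
else:
    print("walk score:", walk)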
Example #3
def get_restaurant_links_chicago():
    # start from searching "Restaurant", "Chicago" from yelp main page
    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []

    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)

    links = []

    for url in url_list:

        request = util.get_request(url)
        text = util.read_request(request)

        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")

        # extract href links to restaurants; accumulate across pages instead of
        # resetting the list on every iteration
        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            # Hardcoded filter
            if link and link[-11:] == "Restaurants":
                if tag.get("name"):
                    if link not in links:
                        links.append(link)

    return links
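A usage sketch; the hardcoded "Restaurants" suffix filter above is fragile, so the number of links returned depends on Yelp's current markup:

chicago_links = get_restaurant_links_chicago()
print(len(chicago_links), "restaurant links collected")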
Example #4
def get_movie_links():
    '''
    Gets all movie links, short synopsis, runtime, three main actors, and id
    '''
    movie_dict = {}

    start_url = ('https://www.rottentomatoes.com/api/private/v2.0/browse?'
                 'maxTomato=100&maxPopcorn=100&services=amazon;hbo_go;itunes;'
                 'netflix_iw;vudu;amazon_prime;fandango_now&certified&sortBy='
                 'release&type=dvd-streaming-all&page=')

    for i in range(312):
        r = util.get_request(start_url + str(i))
        result = r.json()

        count = result.get('counts').get('count')
        if count != 32:
            print("HEY, THIS ONE IS DIFFERENT:", count)

        for movie in result.get('results'):
            new_movie = {}

            print(i, movie.get('title'))
            movie_id = movie.get('id')

            new_movie['actors'] = movie.get('actors')
            new_movie['h_runtime'] = movie.get('runtime')
            new_movie['short_synopsis'] = movie.get('synopsis')
            new_movie['title'] = movie.get('title')
            new_movie['relative_url'] = movie.get('url')
            new_movie['poster_url'] = movie.get('posters').get('primary')

            movie_dict[movie_id] = new_movie

    return movie_dict
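A usage sketch; note that the function above walks 312 pages of a private Rotten Tomatoes endpoint, so it is slow and may break if that endpoint changes:

movies = get_movie_links()
movie_id, info = next(iter(movies.items()))
print(movie_id, info['title'], info['h_runtime'])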
Example #5
def test_get_channels(channel, game_id, token):
    URL = config.GET_CHANNEL_ROUTE.format(game_id)
    received_channels = util.get_request(URL, util.make_headers(token))
    if len(received_channels) != 1:
        raise ValueError('Wrong number of channels: {}'.format(len(received_channels)))
    if received_channels[0]['id'] != channel['id']:
        raise ValueError('Wrong channel returned: {}'.format(received_channels[0]['id']))
Example #6
def crawler():
    # starting_url = "https://www.teenlife.com/search/?q=None&l=None&c=Summer%20Program&p=1"
    starting_url = "https://rusticpathways.com/students/programs?_=1584132668586&page=1"
    limiting_domain = "rusticpathways.com"

    numpages = 0
    links_visited = []
    index_list = []
    page_parser_q = queue.Queue()
    pull_info_q = queue.Queue()
    page_parser_q.put(starting_url)
    while not page_parser_q.empty():
        link = page_parser_q.get()
        mini_crawler(link, page_parser_q, pull_info_q, links_visited,
                     limiting_domain, index_list)
        numpages += 1

    while not pull_info_q.empty():
        page_link = pull_info_q.get()
        # print(page_link)
        request = util.get_request(page_link)
        if request is not None:
            html = util.read_request(request)
            soup = bs4.BeautifulSoup(html, features="html5lib")
            make_index(soup, index_list, page_link)
            # print(index_list)
    df = pd.DataFrame(index_list)
    return df
Example #7
def mini_crawler(url, page_parser_q, pull_info_q, links_visited, limiting_domain, index_list, parsing_default_domain):
    '''
    Crawls the college catalog and adds to an index dictionary that maps sets of
    words to associated course identifiers.

    Inputs:
        url: starting url to begin crawling with
        q: queue of urls in line to be crawled
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: dictionary that maps words to course identifiers
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited, limiting_domain)
    tag_list = soup.find_all("ul", attrs = {"class": "pagination"})
    current_page = tag_list[0].find_all("li", attrs = {"class": "current"})
    next_page = current_page[0].next_sibling.next_sibling.findChild()
    next_page_href = next_page.get('href')
    next_page_href = util.convert_if_relative_url(post_url, next_page_href)
    page_parser_q.put(next_page_href)
Example #8
def crawler():
    starting_url = "https://www.teenlife.com/search?q=&l=&c=Summer%20Program&p=1"
    limiting_domain = "www.teenlife.com"
    parsing_default_domain = "https://www.teenlife.com/search"

    numpages = 0
    links_visited = []
    index_list = []
    page_parser_q = queue.Queue()
    pull_info_q = queue.Queue()
    page_parser_q.put(starting_url)
    while not page_parser_q.empty():
        link = page_parser_q.get()
        mini_crawler(link, page_parser_q, pull_info_q, links_visited, limiting_domain, index_list, parsing_default_domain)
        numpages += 1
        print(link, "link")

    while not pull_info_q.empty():
        page_link = pull_info_q.get()
        print(page_link, "page_link")
        request = util.get_request(page_link)
        if request is not None:
            html = util.read_request(request)
            soup = bs4.BeautifulSoup(html, features="html5lib")
            make_index(soup, index_list)


    df = pd.DataFrame(index_list)

    return df
Example #9
def mini_crawler(url, page_parser_q, pull_info_q, links_visited,
                 limiting_domain, index_list):
    '''
    Crawls the college catalog and adds to an index list that maps sets of
    words to associated course identifiers.

    Inputs:
        url: starting url to begin crawling with
        q: queue of urls in line to be crawled
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: list of dictionaries for each webpage
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited,
               limiting_domain)
    tag_list = soup.find_all("ul", attrs={"class": "Pagination"})
    pages = tag_list[0].find_all("li")
    pages = pages[1:]
    for page in pages:
        page_parser_q.put(page.findChild().get('href'))
Example #10
def queue_children_sites(starting_url, queue):
    '''Given a url and a queue, adds all children urls
     of the start point to the queue

     Inputs: starting_url -- string that corresponds to a url
     queue -- queue.Queue object

     Outputs: None, queue is modified
     in place to contain all child urls'''

    # turn http into https if it is not already
    if starting_url[4] != 's':
        starting_url = starting_url[:4] + 's' + starting_url[4:]
    request = util.get_request(starting_url)
    assert request is not None
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    URLs = soup.find_all("a")
    URLs = [URL["href"] for URL in URLs if URL.has_attr("href")]
    children = []
    for URL in URLs:
        if util.is_absolute_url(URL):
            children.append(URL)
        else:
            URL = util.convert_if_relative_url(starting_url, URL)
            children.append(URL)

    children = [
        child for child in children
        if util.is_url_ok_to_follow(child, limiting_domain)
    ]
    for child in children:
        queue.put(child)
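A usage sketch, assuming `limiting_domain` is defined at module level (it is referenced inside the function but is not a parameter); the url below is a placeholder:

import queue

limiting_domain = "example.edu"   # assumed module-level name
child_q = queue.Queue()
queue_children_sites("http://example.edu/catalog/index.html", child_q)
while not child_q.empty():
    print(child_q.get())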
Example #11
def test_get_me(env):
    user_1 = env['directory']['users']['user-1']['user']
    token = env['directory']['users']['user-1']['token']
    resp = util.get_request(config.GET_USER_ROUTE, util.make_headers(token))
    if resp['id'] != user_1['id']:
        raise ValueError('Wrong user id returned from: {}'.format(
            config.GET_USER_ROUTE))
Example #12
def get_game_urls(master_regular_game_dict, day_urls, urls_visited):
    game_id = 0
    parent_url = "www.baseball-reference.com"
    for day_url in day_urls:
        if day_url not in urls_visited:
            urls_visited.append(day_url)
            abs_day_url = url_check(day_url, parent_url)
            if abs_day_url:
                #print("abs_day_url", abs_day_url)
                day_request = util.get_request(abs_day_url)
                if day_request:
                    day_text = day_request.text
                    day_soup = bs4.BeautifulSoup(day_text, parse_only=bs4.SoupStrainer("pre"))
                    possible_links = day_soup.find_all("a")
                    #print("possible_links", possible_links)
                    #game_url = possible_urls[0].get("href")
                    game_urls = []
                    for link in possible_links:
                        if re.search('[a-zA-Z]', link.text) is None:
                            game_urls.append(link.get("href"))
                    num_games = len(game_urls)
                    #print("game_urls", game_urls)
                    for game_url in game_urls:
                        game_dict = get_game_info(game_url, parent_url, urls_visited, num_games)
                        game_id += 1
                        master_regular_game_dict[game_id] = game_dict
    return master_regular_game_dict
Example #13
def go(housing_links):
    '''
    Main function
    Inputs:
        housing_links (list): a list of links obtained from inputting different
            zipcodes into the search bar of rentcafe.com
    Output:
        d (dict): a dictionary mapping each zipcode to a tuple (mean_price, income)
    '''
    # a dictionary with zipcode as keys, avg rent price as values
    d = {}

    # start from the first zip_code...
    for link in housing_links:
        zip_code = str(link[-5:])
        d[zip_code] = []
        request = util.get_request(link)
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")

        # find median income under this zipcode
        li_tags = soup.find_all('li', class_="medium")
        income = np.int64(re.findall(r'\d+(?:,\d+)?', li_tags[2].text)[0].replace(',',''))

        # collect all subpages under this zipcode
        pages_to_crawl = []
        tags = soup.find('ul', class_="pagination")
        if tags is None:
            pages_to_crawl = [link]
        else:
            pages = tags.find_all('a', href=True)
            for a in pages:
                if a['href'] not in pages_to_crawl:
                    pages_to_crawl.append(a['href'])

        for url in pages_to_crawl:
            request = util.get_request(url)
            text = util.read_request(request)
            soup = bs4.BeautifulSoup(text, "html5lib")
            property_tags = soup.find_all('div', class_='item-information')
    
            for item in property_tags:
                d[zip_code].append(find_adj_price(item))
            
        d[zip_code] = (np.mean([x for x in d[zip_code] if x != 0]), income)
        
    return d
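A usage sketch with hypothetical rentcafe links; the function above assumes the last five characters of each link are the zipcode:

housing_links = [
    "https://www.rentcafe.com/apartments-for-rent/us/il/chicago/60615",   # placeholder links
    "https://www.rentcafe.com/apartments-for-rent/us/il/chicago/60637",
]
rent_and_income = go(housing_links)
for zip_code, (mean_price, income) in rent_and_income.items():
    print(zip_code, mean_price, income)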
Example #14
def create_dictionary(num_pages_to_crawl, course_map_filename, starting_url,
                      limiting_domain):
    '''
    Creates the dictionary mapping course id numbers to the words in the
    course titles and descriptions.

    Inputs:
        num_pages_to_crawl: (int) The number of pages to process
                            during the crawl.
        course_map_filename: (string) The name of the JSON file that contains
                             the mapping of course codes to course ids.
        starting_url: (string) The url of the first page that the
                      crawler visits.
        limiting_domain: (string) The limiting domain of the url.

    Outputs:
        The dictionary mapping course id numbers to the words in the course 
        titles and descriptions.
    '''

    with open(course_map_filename) as json_file:
        coursemap = json.load(json_file)

    url_list = []
    url_queue = queue.Queue()
    num_pages = 0
    course_dict = {}
    process_dict = {}

    starting_url = clean_url(starting_url, limiting_domain, parent_url=None)

    if starting_url:
        url_queue.put(starting_url)

    while num_pages < num_pages_to_crawl and not url_queue.empty():
        num_pages += 1
        next_url = url_queue.get()
        if next_url and next_url not in url_list:
            request = util.get_request(next_url)
            if request:
                request_url = util.get_request_url(request)
                if request_url and request_url not in url_list:
                    url_list.append(next_url)
                    if request_url not in url_list:
                        url_list.append(request_url)
                    html_text = util.read_request(request)
                    soup = bs4.BeautifulSoup(html_text, "html5lib")
                    process_dict.update(
                        find_course_info(soup, coursemap, course_dict))
                    if process_dict:
                        course_dict.update(process_dict)
                    href_list = soup.find_all("a", href=True)
                    for h in href_list:
                        h_url = h['href']
                        h_url = clean_url(h_url, limiting_domain, request_url)
                        if h_url:
                            url_queue.put(h_url)

    return course_dict
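A usage sketch with placeholder arguments (the filename, url, and domain are illustrative, not the actual course site):

course_index = create_dictionary(
    num_pages_to_crawl=100,
    course_map_filename="course_map.json",                  # placeholder file
    starting_url="http://catalog.example.edu/index.html",   # placeholder url
    limiting_domain="catalog.example.edu")
print(len(course_index), "courses indexed")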
Example #15
    def get_data(self, numPoints, url, useProxy):

        logging.info('Starting {} Requests to {}. Proxy? {}'.format(
            numPoints, url, useProxy))
        rtts = []
        errors = 0
        for i in range(numPoints):
            try:
                if useProxy: res = get_request(url, self.proxyDict)
                else: res = get_request(url)
                rtts.append(res.elapsed.total_seconds())
                if i % 10 == 0: time.sleep(5)  # avoid 429s
            except Exception as e:
                errors += 1

        logging.info('Finished {} Requests to {}. Got {} errors'.format(
            numPoints, url, errors))
        return (rtts, errors)
Example #16
    def worker(self, nReqs, url):
        errors = 0
        start = time.time()
        for i in range(nReqs):
            try:
                res = get_request(url, self.proxyDict)
            except Exception as e:
                errors += 1
        elapsed = time.time() - start
        self.workers.append(workerData(elapsed, nReqs, errors))
Example #17
def get_alpha_player_urls(letter_url):
    abs_letter_url = letter_url
    player_urls = []
    letter_request = util.get_request(abs_letter_url)
    if letter_request:
        letter_text = letter_request.text
        letter_soup = bs4.BeautifulSoup(letter_text, parse_only=bs4.SoupStrainer("pre"))
        # build the list of player urls
        player_urls = [a.attrs.get("href") for a in letter_soup.select("a")]

    return player_urls
Example #18
def test_get_users(env):
    ids = [user['user']['id'] for user in env['directory']['users'].values()]
    get_users_route = '{}?{}'.format(config.GET_USERS_ROUTE,
                                     util.create_query('userId', ids))
    token = env['directory']['users']['user-1']['token']
    users = util.get_request(get_users_route, util.make_headers(token))
    found_ids = set([user['id'] for user in users])
    for id in ids:
        if id not in found_ids:
            raise ValueError(f'Id: {id} not found')
Example #19
def get_game_info(game_url, parent_url, urls_visited, num_games):
    game_dict = {"date": "", "stadium": "", "team1": "", "team2": "", "team1_runs": "", "team2_runs": "", "team1_hits": "", "team2_hits": "", "team1_hr": "", "team2_hr": "", "winner": ""}
    if game_url not in urls_visited:
        urls_visited.append(game_url)
        #print('parent_url', parent_url)
        #print('game url', game_url)
        abs_game_url = url_check(game_url, parent_url)
        if abs_game_url:
            print("abs_game_url", abs_game_url)
            game_request = util.get_request(abs_game_url)
            if game_request:
                game_text = game_request.text
                game_soup = bs4.BeautifulSoup(game_text, parse_only=bs4.SoupStrainer("div", id="page_content"))
                all_tables = game_soup.find_all("table", class_=False)
                game_table = all_tables[num_games]
                game_table_rows = game_table.find_all("tr")
                date_and_stadium_table = game_table_rows[0]
                date_and_stadium = date_and_stadium_table.find_all("div", class_="bold_text float_left")
                date = date_and_stadium[0].text
                stadium = date_and_stadium[1].text
                hits_runs_and_teams_table = game_table_rows[3]
                teams = hits_runs_and_teams_table.find_all("a", href=True)
                team1 = teams[0].text
                team2 = teams[1].text
                game_data = hits_runs_and_teams_table.find_all("strong")
                team1_runs = game_data[2].text[4]
                team2_runs = game_data[4].text[4]
                team1_hits = game_data[2].text[7]
                team2_hits = game_data[4].text[7]
                team1_hr_table = all_tables[num_games+5]
                team1_hr_row = team1_hr_table.find("tfoot")
                team1_hr = team1_hr_row.find_all("td")[7]
                team2_hr_table = all_tables[num_games+4]
                team2_hr_row = team2_hr_table.find("tfoot")
                team2_hr = team2_hr_row.find_all("td")[7]
                if int(team1_runs) > int(team2_runs):
                    winner = team1
                elif int(team2_runs) > int(team1_runs):
                    winner = team2
                else:
                    winner = "tie"
                game_dict["date"] = date
                game_dict["stadium"] = stadium[2:]
                game_dict["team1"] = team1
                game_dict["team2"] = team2
                game_dict["team1_runs"] = team1_runs
                game_dict["team2_runs"] = team2_runs
                game_dict["team1_hits"] = team1_hits
                game_dict["team2_hits"] = team2_hits
                game_dict["team1_hr"] = team1_hr
                game_dict["team2_hr"] = team2_hr
                game_dict["winner"] = winner
    print("game dict", game_dict)
    return game_dict
Example #20
def open_page(url):

	# print("opening...", url)
	r = util.get_request(url)

	# print("r:", r)

	if r:
		return r.url, BeautifulSoup(util.read_request(r))

	else:
		return None, None
Example #21
def test_get_messages(channel, token):
    URL = config.MESSAGE_ROUTE.format(channel['id'])
    messages = util.get_request(URL, util.make_headers(token))
    if len(messages) != len(config.MELIAN_DIALOGE):
        raise ValueError('Wrong number of messages received: {}'.format(len(messages)))
    for i in range(len(messages)):
        expected = config.MELIAN_DIALOGE[i]
        actual = messages[i]
        log.debug('Message Id: {}'.format(actual['id']))
        log.debug(actual['text'])
        if actual['text'] != expected:
            raise ValueError('Wrong message text for message: {}'.format(actual['id']))
Example #22
def build_search_engine(starting_url, limiting_domain, max_num_pages_to_visit):
    urls = Queue.Queue()
    visited = []
    index = {}

    def search(word):
        rv = []
        matches = []
        words = re.findall(r"[a-zA-Z]\w*", word)
        if len(words) == 0:
            return []
        for url in index.keys():
            for title in index[url].keys():
                for word in words:
                    word = word.lower()
                    if word in title or word in index[url][title]:
                        matches.append((title, url))
        for pair in matches:
            if matches.count(pair) == len(words):
                if pair not in rv:
                    rv.append(pair)
        return rv

    if util.is_url_ok_to_follow(starting_url, limiting_domain):
        urls.put(starting_url)
        while not urls.empty() and len(visited) < max_num_pages_to_visit:
            top_queue = urls.get()
            if top_queue not in visited and util.is_url_ok_to_follow(
                    top_queue, limiting_domain):
                request = util.get_request(top_queue)
                if request is None:
                    visited.append(top_queue)
                    continue
                new_page = util.get_request_url(request)
                if new_page != top_queue:
                    if new_page not in visited:
                        visited.append(new_page)
                        top_queue = new_page
                data = bs4.BeautifulSoup(util.read_request(request), "html5lib")
                visited.append(top_queue)
                index = indexer(index, top_queue, data)
                for link in data.find_all('a'):
                    href = link.get('href')
                    if href is None:
                        continue
                    href = util.remove_fragment(href)
                    # convert relative urls; absolute urls are queued as-is
                    if not util.is_absolute_url(href):
                        href = util.convert_if_relative_url(top_queue, href)
                    urls.put(href)
    else:
        return None
    return search
Example #23
def get_soup_from_url(url):
    '''
    Input:
        url - absolute url
    Returns:
        BeautifulSoup object corresponding to url
    '''
    request = util.get_request(url)
    if request is None:
        return None
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")

    return soup
Example #24
def get_restaurant_links_cook():
    cities = get_cities()

    city_state = get_loc_cook()
    new_city_state = []
    for ele in city_state:
        if ele[0] in cities:
            new_city_state.append(ele)

    page_suffix = [i for i in range(0, 231, 10)]
    #print(city_state)

    url_list = []
    for city, state in new_city_state:
        html = "https://www.yelp.com/search?find_desc=Restaurants&find_loc=" + city.replace(
            " ", "") + "%2C%20" + state
        for suffix in page_suffix:
            html_page = html + "&start=" + str(suffix)
            url_list.append(html_page)
    r'''
    with open(r"c:\Users\35653\Desktop\CS122\project\urls.txt", "w") as write_file:
        write_file.writelines(url_list)

        write_file.close()
    '''

    url_list = [
        "https://www.yelp.com/search?find_desc=Restaurants&find_loc=Lyons%2C%20IL&start=190"
    ]
    links = []
    for url in url_list:
        request = util.get_request(url)
        if request:

            text = util.read_request(request)

            soup = bs4.BeautifulSoup(text, "html5lib")
            tags = soup.find_all('a', href=True, target="", role="")

            # extract href links to restaurants
            links = []
            for tag in tags:
                link = tag['href']
                link = util.convert_if_relative_url(url, link)
                link = util.remove_fragment(link)
                # Hardcoded filter
                if link and link[-11:] == "Restaurants":
                    if tag.get("name"):
                        if link not in links:
                            links.append(link + "\n")
    return links
Example #25
def cook_soup(url):
    '''
    simple function, takes a url and converts it into a beautiful soup object
    Input:
        url (str): a valid url/link
    Output:
        soup (bs4 object): a bs4 soup object using 'html5lib' parser
    '''
    answer = None
    r_object = util.get_request(url)
    if r_object:
        soup = BeautifulSoup(r_object.content, "html5lib")
        answer = soup
    return answer
Example #26
def create_players(master_player_dict, player_urls, parent_url):
    '''
    Loops through the list of player urls and passes the rows of the standard batting table
    down to get_player info so that function can get the team and year information
    '''
    id_number = 0
    for player_url in player_urls:
        print("index player_url", player_urls.index(player_url))
        print("player_url:", player_url)
        player_dict = {"name": "", "positions": "", "years": "", "span": "", "years_played": "", "teams": "", "WARs_nonpitcher": "", "WARs_pitcher": "", "ERAs": "", "IPs": "", "GSs": "", "FIPs": "", "E_Fs": ""}
        abs_player_url = parent_url + player_url
        if abs_player_url:
            player_request = util.get_request(abs_player_url)
            if player_request:
                player_text = player_request.text
                #all the things to put in the player_employment_dict
                years = get_player_info_from_standard_batting(player_text)[0]
                teams = get_player_info_from_standard_batting(player_text)[1]
                player_name = get_player_info_from_main_player_page(player_text)[0]
                positions = get_player_info_from_main_player_page(player_text)[1]
                wars_nonpitcher = get_player_info_from_player_value_batters(player_text)
                #put them in the player_employment_dict
                player_dict["years"] = years
                player_dict["teams"] = teams
                player_dict["name"] = player_name
                player_dict["positions"] = positions
                player_dict["WARs_nonpitcher"] = wars_nonpitcher 
                if "Pitcher" in positions:
                    eras = get_player_info_from_standard_pitching(player_text)[0]
                    ips = get_player_info_from_standard_pitching(player_text)[1]
                    gss = get_player_info_from_standard_pitching(player_text)[2]
                    fips = get_player_info_from_standard_pitching(player_text)[3]
                    e_fs = get_player_info_from_standard_pitching(player_text)[4]
                    wars_pitcher = get_player_info_from_player_value_pitchers(player_text)
                    player_dict["ERAs"] = eras
                    player_dict["IPs"] = ips
                    player_dict["GSs"] = gss
                    player_dict["FIPs"] = fips
                    player_dict["E_Fs"] = e_fs
                    player_dict["WARs_pitcher"] = wars_pitcher 
               
            if player_dict["years"] != "":
                player_dict["span"] = "-".join([player_dict["years"][:4], player_dict["years"][len(player_dict["years"])-4:]])
                player_dict["years_played"] = int(player_dict["years"][len(player_dict["years"])-4:]) - int(player_dict["years"][:4])
            print("id number:", id_number)
            print("player dict:", player_dict)
            master_player_dict[id_number] = player_dict
            id_number += 1
    return master_player_dict
Example #27
def make_soup(url):
    '''
    Makes a soup object from the html at a given url

    Inputs:
        url: an absolute url (string)
    Outputs:
        soup: a BeautifulSoup object if the request succeeds, otherwise None
    '''
    req = util.get_request(url)
    if req is None:
        return None
    html = util.read_request(req)
    if html is not None and html != "":
        soup = bs4.BeautifulSoup(html, "html5lib")
        return soup
    return None
Example #28
def get_soup(url):
    '''
    Returns the soup for the given url.
    Inputs:
        url: str
    Output:
        BeautifulSoup object
    '''
    time.sleep(0.05)
    url_request = util.get_request(url)
    if not url_request:
        return False
    html = util.read_request(url_request)
    if not html:
        return False
    return bs4.BeautifulSoup(html, "html5lib")
Example #29
def get_soup_object(url):
    """
    Takes a url, checks for possible redirection,
    returns soup object.

    Inputs:
        url (string)
    
    Returns:
        Soup object
    """
    request = util.get_request(url)
    if request is None:
        return None
    html_text = util.read_request(request)
    soup = bs4.BeautifulSoup(html_text, 'html5lib')

    return soup
Example #30
def get_restaurant_links():
    '''
    Starts by searching "Restaurants" in "Chicago" on the yelp main page,
    and collects all restaurant links from 24 pages

    Input:
        None

    Output:
        links (list): a list of links
    '''

    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []

    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)

    links = []
    count = 0

    for url in url_list:
        count += 1
        print(count)

        request = util.get_request(url)
        text = util.read_request(request)

        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")

        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            if link and link[-11:] == "Restaurants":
                if tag.get("name"):
                    if link not in links:
                        links.append(link + "\n")
                        print(link)

        i = 5 + random.random() * 5
        time.sleep(i)

    return links
Example #31
def get_referees(url):

    base_url = 'http://www.basketball-reference.com/boxscores/'
    comp_url = base_url + url + '.html'

    request = util.get_request(comp_url)
    if request is None:
        return None
    html = util.read_request(request)
    if html is None:
        return None

    soup = bs4.BeautifulSoup(html, "html5lib")
    div_tags = soup.find_all('div')
    good_tags = str(div_tags)
    string = re.findall(r'(?<=Officials:)(.*?)(?=\<br)', good_tags)

    rv = re.findall(r'(?<=.html\"\>)(.*?)(?=<\/a)', string[0])
    return rv
Example #32
def get_day_urls(regular_season_year_urls, parent_url, master_regular_game_dict):
    parent_url = "http://www.baseball-reference.com/boxes/"
    for year_url in regular_season_year_urls:
        abs_year_url = parent_url + year_url
        print("year_url", year_url)
        print("if abs_year_url", abs_year_url)
        year_request = util.get_request(abs_year_url)
        if year_request:
            year_text = year_request.text
            year_soup = bs4.BeautifulSoup(year_text, parse_only=bs4.SoupStrainer("table", class_="wide_container"))
            #print("year soup", year_soup)
            day_urls = [a.attrs.get("href") for a in year_soup.select("a")]
            #day_links = year_soup.find("a")
            print("day urls:", day_urls)
            #print("day links", day_links)
            
    return master_regular_game_dict
Example #33
def get_ps_series_page(series_urls, parent_url, master_ps_game_dict):
    for series_url in series_urls:
        abs_series_url = parent_url + series_url
        if abs_series_url:
            series_request = util.get_request(abs_series_url)
            if series_request:
                series_text = series_request.text
                series_soup = bs4.BeautifulSoup(series_text, parse_only=bs4.SoupStrainer("pre"))
                games = series_soup.find_all("pre")
                if len(games) != 0:
                    print("games", games)
                    game_id = 0
                    for game in games: 
                        print("game_id", game_id)
                        game_dict = get_ps_game_info(game)
                        master_ps_game_dict[game_id] = game_dict
                        game_id += 1
    return master_ps_game_dict
Example #34
def get_paper_links(number_of_articles, fields):
    '''
    Crawls through Nature search pages, and pulls article links 
    from different fields

    Input:
    number of articles, int number of articles to find 
    fields, list of field str, all major fields in nature.com/subjects
    
    Output:
    paper_links, list of paper urls (str)
    '''
    search_url = ('https://www.nature.com/search?article_type=protocols'
                  '%2Cresearch%2Creviews&subject=')
    suffix = '&page='
    
    search_urls = []
    paper_links = []

    num_articles_per_field = number_of_articles // 8 
    num_pages_to_visit = int(np.ceil(num_articles_per_field / 50))
    num_on_last_page = num_articles_per_field % 50

    for field in fields:
        for i in range(num_pages_to_visit):
            new_url = search_url + field + suffix + str(i + 1)
            search_urls.append(new_url)
        
    for url in search_urls:
        num_to_search = 50
        if int(url[-1]) == num_pages_to_visit:
            num_to_search = num_on_last_page

        new_request = util.get_request(url)
        html = util.read_request(new_request)
        search_soup = bs4.BeautifulSoup(html, features = 'html.parser')
        article_links = search_soup.find_all('h2', 
                        class_ = 'h3 extra-tight-line-height', 
                        itemprop = 'headline')
        article_links = article_links[:num_to_search]
        paper_links.extend([i.find('a')['href'] for i in article_links])
    
    return paper_links
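A usage sketch with a hypothetical list of Nature subject slugs; note the function above splits the article budget across exactly 8 fields:

fields = ["physics", "chemistry", "biological-sciences", "health-sciences",      # hypothetical slugs
          "earth-and-environmental-sciences", "engineering",
          "humanities-and-social-sciences", "scientific-community-and-society"]
paper_links = get_paper_links(400, fields)
print(len(paper_links), "article links collected")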
Example #35
def get_player_urls(letter_urls, parent_url):
    '''
    Loops through the list of letter urls and builds a list of player urls, which is passed down to
    create_players so that the info for each player can be made into a mini dictionary that can be
    added to the master dictionary
    '''

    player_urls = []
    for letter_url in letter_urls:
        abs_letter_url = parent_url + letter_url
        print("abs_letter_url", abs_letter_url)
        letter_request = util.get_request(abs_letter_url)
        if letter_request:
            letter_text = letter_request.text
            letter_soup = bs4.BeautifulSoup(letter_text, parse_only=bs4.SoupStrainer("pre"))
            # accumulate the player urls found on every letter page
            player_urls.extend(a.attrs.get("href") for a in letter_soup.select("a"))
            #print("player_urls:", player_urls)

    return player_urls
Example #36
def analyze_page(url, queue, limiting_domain, course_dict):
    '''
    Queues all urls, then makes a dictionary index of the course codes to
    a list of words in the course description.

    Inputs:
        url: the url of the page to analyze
        queue: the queue that holds the urls
        limiting_domain: a domain with which to stay in when queuing
        course_dict: the index dictionary

    Outputs:
        None
    '''
    request = util.get_request(url)
    if request is None:
        return
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")

    queue_urls(url, soup, queue, limiting_domain)
    find_courses(soup, course_dict)
Example #37
def make_br_player_dict():
    '''
    Crawls through the players part of baseball-reference and makes a dictionary where the keys are 
    the player_id for each player, which map to the teams each player played for and the corresponding 
    year for each team
    '''
    master_player_dict = {}
    letter_urls = []
    player_urls = []
    starting_url = "http://www.baseball-reference.com/players/"
    parent_url = "http://www.baseball-reference.com"
    request = util.get_request(starting_url)
    if request:
        text = request.text
        soup = bs4.BeautifulSoup(text, parse_only=bs4.SoupStrainer("td", class_="xx_large_text bold_text"))
        letter_urls = [a.attrs.get("href") for a in soup.select("a")]#makes list of letter urls
        #print("letter urls:", letter_urls)
        player_urls = get_player_urls(letter_urls, parent_url)
        master_player_dict = create_players(master_player_dict, player_urls, parent_url)
        
    return master_player_dict
Example #38
def make_br_games_dict():
    master_game_dict = {}
    master_regular_game_dict = {}
    master_ps_game_dict = {}

    urls_visited = []
    starting_url = "http://www.baseball-reference.com/boxes/"
    parent_url = "http://www.baseball-reference.com"
    if starting_url not in urls_visited:
        urls_visited.append(starting_url)
        request = util.get_request(starting_url)
        #print("request:", request)
        if request:
            text = request.text
            soup = bs4.BeautifulSoup(text, parse_only=bs4.SoupStrainer("table", class_="large_text"))
            #regular_season_year_urls = get_regular_season_year_urls(soup)
            #day_urls = get_day_urls(regular_season_year_urls, parent_url)
            #master_regular_game_dict = get_game_urls(master_regular_game_dict, day_urls)
            
            series_urls = get_post_season_urls(parent_url)
            master_ps_game_dict = get_ps_series_page(series_urls, parent_url, master_ps_game_dict)
    return master_game_dict
Example #39
def translate_general(code):
    '''
    Given an ICD code, scrapes www.icd10data.com and returns a meaningful
    translation as a string

    Input:
        code (int): an ICD code
    Output:
        rv (string): translation of the ICD code
    '''
    url = BASE_GEN + str(code) + '&codebook=icd9volume1'
    ro = util.get_request(url)
    html = util.read_request(ro)
    soup = bs4.BeautifulSoup(html, "html5lib")
    rv = None
    search = soup.find('div').next_sibling.next_sibling.find('div',
        class_='searchPadded')

    if search and search.text:
        rv = search.text

    return rv
Example #40
def get_post_season_urls(parent_url):
    # need to get all urls for all the world series/postseason stuff until it gets to 1903,
    # when I should stop getting data because there is barely any data
    series_urls = []
    abs_ps_url = parent_url + "/postseason/"
    ps_request = util.get_request(abs_ps_url)
    if ps_request:
        ps_text = ps_request.text
        ps_soup = bs4.BeautifulSoup(ps_text, parse_only=bs4.SoupStrainer("div", id="page_content"))
        # keep only the links that point at a postseason series page
        series_urls = [a.attrs.get("href") for a in ps_soup.select('a[href^="/postseason/"]')]

    return series_urls