def parse_genre_esrb(df):
    """Visit every game's URL in *df* and fill in genre / ESRB data in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'url' column. For each row scraped successfully the
        columns 'Genre', 'ESRB_Rating' (when present on the page) and
        'status' (set to 1) are written in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with scraped rows updated. Rows that failed keep
        their 'status' unset so callers can retry them later.
    """
    headers = {
        'User-Agent': generate_user_agent(device_type='desktop',
                                          os=('mac', 'linux'))
    }
    proxies = None
    proxy = None
    if proxy_enabled:
        # API call returning a list of working proxies (rechecked upstream
        # every 15 minutes); cycle() lets us rotate forever on ProxyError.
        proxies = cycle(get_proxies(5))
        proxy = next(proxies)

    for index, row in df.iterrows():
        try:
            # Only send a proxies mapping when a proxy is actually in use.
            # BUG FIX: the original always passed {"http": {}, "https": {}}
            # (an empty dict as the proxy URL) when proxy_enabled was False.
            request_proxies = ({"http": proxy, "https": proxy}
                               if proxy else None)
            game_page = requests.get(df.at[index, 'url'],
                                     headers=headers,
                                     proxies=request_proxies,
                                     timeout=5)
            if game_page.status_code == 200:
                sub_soup = BeautifulSoup(game_page.text, "lxml")
                # The info box markup is inconsistent among games, so we
                # collect every h2 and look for the one labelled "Genre".
                gamebox = sub_soup.find("div", {"id": "gameGenInfoBox"})
                if gamebox is None:
                    # No info box at all -- leave the row unscraped
                    # (the original hit an AttributeError here and fell
                    # into the broad except).
                    continue
                genre_h2 = None
                for h2 in gamebox.find_all('h2'):
                    if h2.string == 'Genre':
                        genre_h2 = h2
                        break
                if genre_h2 is None:
                    # BUG FIX: the original seeded the search with the bs4
                    # Tag *class* and dereferenced it even when no "Genre"
                    # heading existed, crashing into the broad except.
                    # Preserve the net effect (row stays unscraped) but do
                    # it explicitly.
                    continue
                # The genre text is the sibling node right after the h2.
                df.loc[index, 'Genre'] = genre_h2.next_sibling.string

                # The ESRB rating is encoded in the rating image filename,
                # e.g. ".../esrb_T.png" -> "T".
                rating_img = gamebox.find('img')
                game_rating = rating_img.get('src') if rating_img else None
                if game_rating and 'esrb' in game_rating:
                    df.loc[index, 'ESRB_Rating'] = game_rating.split(
                        '_')[1].split('.')[0].upper()
                # We successfully got the genre (and rating when present).
                df.loc[index, 'status'] = 1
        except ProxyError:
            # Rotate to the next proxy -- but only when proxies were set
            # up. BUG FIX: the original unconditionally called
            # next(proxies), a NameError when proxy_enabled was False.
            if proxies is not None:
                proxy = next(proxies)
        except (ConnectionError, Timeout, ProtocolError, TimeoutError):
            # Transient network failure: leave 'status' unset so the row
            # can be retried on a later pass.
            pass
        except Exception:
            # Any other parse/connection error: skip this row silently,
            # matching the original best-effort behaviour.
            pass
        finally:
            # Throttle EVERY request (including failures) so we do not
            # abuse or get blocked by the server. BUG FIX: the original's
            # `continue` in the except branches skipped this delay, so
            # error bursts hit the server with no pause.
            time.sleep(1)
    return df
urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1' urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1' urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1' # get the number of pages vglink = requests.get('http://www.vgchartz.com/gamedb/').text x = fromstring(vglink).xpath("//th[@colspan='3']/text()")[0].split( '(', 1)[1].split(')')[0] pages = int(x.split(',')[0]) if not exists: page = 1 while True: if page > pages: break try: proxy = get_proxies(1)[0] headers = { 'User-Agent': generate_user_agent(device_type='desktop', os=('mac', 'linux')) } surl = urlhead + str(page) + urltail r = requests.get(surl, headers=headers, proxies={ 'http': proxy, 'https': proxy }, timeout=10) if r.status_code == 200: soup = BeautifulSoup(r.text, 'lxml') print("******Scraping page " + str(page) + "******'\n")