def Zomato_Scrape_info(URL, proxy):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        ip_text = requests.get('http://icanhazip.com', proxies=proxy, headers=headers)
        print('Browsing from - ', ip_text.content.decode('utf-8'))
        rest_info = get_restaurant_info(URL, proxy)
        # print('Inside Function: ', rest_info)
        return rest_info
    except ConnectionError:
        # If error, find another proxy and retry
        proxy = random_proxy()
        return Zomato_Scrape_info(URL, proxy)
def get_geocode(url, proxy):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        r = requests.get(url, proxies=proxy, headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        map_container = soup.find('div', {'class': 'ui segment map_container'})
        if map_container is None:
            return 'Not Available'
        geo_link = map_container.find('a')
        geocode = geo_link.get_attribute_list('href')[0].split('/')[-1]
        # print(geocode)
        return geocode
    except ConnectionError:
        print('ConnectionError inside get_geocode() method, changing IP')
        return get_geocode(url, random_proxy())
def get_img_urls_from(url, proxy):
    try:
        img_url_list = []
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        response = requests.get(url, headers=headers, proxies=proxy)
        soup = BeautifulSoup(response.content, 'html.parser')
        images_divs_container = soup.find(
            'div', {'class': 'photos_container_load_more inlineblock w100'})
        image_divs = images_divs_container.findChildren('div', recursive=False)
        if not image_divs:
            return 'Not Available'
        for div in image_divs[:10]:
            image_url_div = div.find('img')
            # print(image_url_div)
            image_url = image_url_div.get_attribute_list('data-original')[0]
            image_url = image_url.split('?')[0]
            # print(image_url)
            img_url_list.append(image_url)
        return ','.join(img_url_list)
    except ConnectionError:
        print('ConnectionError inside get_img_urls_from() method, changing IP')
        return get_img_urls_from(url, random_proxy())
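# Illustrative sketch (not part of the original project): random_proxy() is
# defined elsewhere in this repository. A minimal stand-in with the same
# interface might look like the function below; it returns a dict in the
# format that requests' `proxies=` argument expects and that the driver code
# reads via proxy['http']. The addresses in _PROXY_POOL are hypothetical
# placeholders.
import random

_PROXY_POOL = ['203.0.113.10:8080', '203.0.113.11:3128']  # placeholder "ip:port" values

def random_proxy_sketch():
    # Pick one proxy address at random and expose it for both schemes
    address = random.choice(_PROXY_POOL)
    return {'http': address, 'https': address}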
URLS_df = pd.read_csv(prop.Restaurant_master_file_path)
# os.chdir('E:\\DSA Internship\\Web Scrapping\\Zomato Restaurants Web Scrapping\\Cities\\Hyderabad\\Error Urls\\')
# Error_URLS = []
# with open('ErrorURLs.txt', 'r') as f:
#     for URL in f.read().splitlines():
#         Error_URLS.append(URL)
# rest_urls_list = ['https://www.zomato.com/hyderabad/kebab-e-bahar-taj-banjara-banjara-hills',
#                   'https://www.zomato.com/hyderabad/36-downtown-brew-pub-jubilee-hills',
#                   'https://www.zomato.com/hyderabad/huber-holly-banjara-hills',
#                   'https://www.zomato.com/hyderabad/behrouz-biryani-banjara-hills',
#                   'https://www.zomato.com/hyderabad/seasonal-tastes-the-westin-hitech-city',
#                   'https://www.zomato.com/rolling-stove-food-truck']  # list of Restaurant URLs

proxy = random_proxy()
start = datetime.now()
# for url in tqdm(URLS_df[30:50].values):
database = connect_to_my_mongo_db('Restaurants')
restaurant_mongo_master_collection = database.Restaurants_Info_Master

start_index = int(input('Input the Start Index of the URLs: '))
stop_index = int(input('Input the Stop Index of the URLs: '))
n = start_index
for url in URLS_df[start_index:stop_index].values:
    Not_properly_scrapped = True
    print('Scraping Restaurant Data of index ', n, ': ', url[0])
    while Not_properly_scrapped:
def scrape_reviews(start_index, stop_index):
    URLS_df = pd.read_csv(prop.Restaurant_master_file_path)
    # URLS = URLS_df['URLS'].tolist()
    ErrorURLs = pd.read_csv(prop.ErrorURLs_file_path)
    error_url_list = ErrorURLs['Error_URLS'].tolist()
    reviews_folder_path = prop.Reviews_folder_path
    start = datetime.now()
    proxy = random_proxy()
    total_reviews_count = 0

    # MongoDB connection
    db = connect_to_my_mongo_db('Restaurants')
    mongo_reviews = db['Reviews_Master']

    for row in URLS_df[start_index:stop_index].values:
        begin = datetime.now()
        url = row[0]  # row[0] for URL and row[1] for Rest_ID
        if url in error_url_list:
            print('The current URL had errors while scraping its restaurant info, hence skipping this URL: ', url)
            continue
        Mongo_ref = get_mongo_rest_ref_by_url(db, 'Restaurants_Info_Master', url)
        rest_review_url = url + '/reviews'
        not_scrapped = True
        while not_scrapped:  # loop repeats till a working proxy IP is caught
            chrome_browser = Create_Chrome_browser(use_proxy=True, proxy=proxy['http'])
            # print(url)
            reviews_df, success = scrape_reviews_from_url(chrome_browser, rest_review_url)
            if success:
                if len(reviews_df) == 0:
                    print('CSV file will not be generated...')
                    print('-' * 100, '\n')
                    not_scrapped = False
                    continue
                Rest_ID = str(row[1])
                file_name = url.split('/')[-1]
                # print(file_name)
                # https://stackoverflow.com/questions/29815129/pandas-dataframe-to-list-of-dictionaries
                reviews_dict_list = reviews_df.to_dict('records')
                for review in reviews_dict_list:
                    review['Restaurant_Ref'] = Mongo_ref
                    mongo_reviews.insert_one(review)
                csv_file_name = reviews_folder_path + Rest_ID + '-' + file_name + '.csv'
                # print(row[1])
                print('Restaurant ID:', Rest_ID)
                reviews_df['Restaurant_ID'] = Rest_ID  # Copying the string values of IDs
                reviews_df['Review_ID'] = reviews_df['Restaurant_ID'] + '000' + reviews_df['ID']
                reviews_df = reviews_df[[
                    'Review_ID', 'Restaurant_ID', 'ID', 'review_title',
                    'user_importance', 'user_name', 'user_rating', 'user_review'
                ]]
                reviews_df.to_csv(csv_file_name, index=False)
                # https://stackoverflow.com/questions/17530542/how-to-add-pandas-data-to-an-existing-csv-file
                reviews_df.to_csv(prop.Reviews_master_file_path, mode='a', header=False, index=False)
                chrome_browser.quit()
                not_scrapped = False
                print('-' * 100)
                print('{} has been created'.format(file_name + '.csv'))
                print('Time taken to scrape and create the above file: ', datetime.now() - begin)
                print('\n\n')
                total_reviews_count += len(reviews_df)
            else:
                proxy = random_proxy()
    # with open('review_master.csv', mode='a', headers=False) as master:
    print('Total reviews scraped: ', total_reviews_count)
    print('Time taken to scrape all the reviews: ', datetime.now() - start)
    chrome_browser.quit()
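# Illustrative usage (an assumption, mirroring the start/stop index prompts used
# by the restaurant-info driver above): scrape_reviews() takes the start and
# stop indices of the URL slice to process from the master CSV.
if __name__ == '__main__':
    review_start = int(input('Input the Start Index of the URLs: '))
    review_stop = int(input('Input the Stop Index of the URLs: '))
    scrape_reviews(review_start, review_stop)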