def run(self): base_url = "https://www.yelp.com/search?find_desc=" \ "Dry+Cleaners&find_loc=New+York%2C+NY&start=" for j in range(1, 1000, 10): try: url = base_url + str(j) print '[YelpScraper] :: fetching data from url: ', url r = requests.get(url, headers=get_request_headers()) if not r.status_code == 200: print "Failed to get content of url: %s" % url return html_doc = r.content soup = BeautifulSoup(html_doc, 'html.parser') li_class = "regular-search-result" # parsing html content to fet information about dry cleaners for li in soup.find_all('li', class_=li_class): self.scrap_row_yelp(li) # break # just use it for testing only sleep_scrapper('YelpScraper') except Exception as exp: print '[YelpScraper] :: run() :: Got exceptiion : %s ' % exp print(traceback.format_exc())
def run(self):
    try:
        base_url = 'https://www.flipkart.com/search?as=off&as-show=' \
                   'on&otracker=start&page='
        suffix = '&q=%s&viewType=list' % self.product
        for i in range(1, 100, 1):
            url = base_url + str(i) + suffix
            print '[FlipkartScraper] :: fetching data from url: ', url
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print '[FlipkartScraper] :: Failed to get the content ' \
                      'of url: %s' % url
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            # older listing layout used class 'col col-7-12'
            for div in soup.find_all('div', class_='_1-2Iqu row'):
                self.scrap_result_row(div)
            sleep_scrapper('FlipkartScraper')
    except Exception as exp:
        print '[FlipkartScraper] :: run() :: Got exception: %s' % exp
        print(traceback.format_exc())
def run(self):
    url = ''
    try:
        base_url = 'https://www.overstock.com/Home-Garden/%s/%s/' \
                   % (self.product_category, self.product_code)
        suffix = 'subcat.html?page='
        for j in range(1, 100, 1):
            url = base_url + suffix + str(j)
            print '[OverStockScraper] :: fetching data from url:', url
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print "[OverStockScraper] :: Failed to " \
                      "get content of url: %s" % url
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div', class_='product-tile'):
                self.scrap_result_row(div)
                # break
            sleep_scrapper('OverStockScraper')
    except Exception as exp:
        print '[OverStockScraper] :: run() :: Got exception: ' \
              '%s while fetching data from url: %s' % (exp, url)
        print(traceback.format_exc())
def run(self):
    try:
        url = 'https://www.bedbathandbeyond.com/store/category' \
              '/%s/%s/%s/%s/' \
              % (self.product_category, self.product_subcategory,
                 self.product_title, self.product_code)
        print '[BedBathAndBeyondScraper] :: fetching data from url: ', url
        r = requests.get(url, headers=get_request_headers())
        if not r.status_code == 200:
            print "[BedBathAndBeyondScraper] :: Failed to get " \
                  "content of url: %s" % url
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        for div in soup.find_all('div', class_='productContent ec_listing'):
            self.scrap_result_row(div)
        sleep_scrapper('BedBathAndBeyondScraper')
    except Exception as exp:
        print '[BedBathAndBeyondScraper] :: run() :: Got exception: %s' \
              % exp
        print(traceback.format_exc())
def run(self):
    base_url = 'https://www.homedepot.com/b/' \
               '%s/N-5yc1vZbm79?Nao=' % (self.product)
    suffix = '&Ns=None'
    for j in range(0, 1000, 12):
        url = ''
        try:
            url = base_url + str(j) + suffix
            print '[HomeDepot] :: fetching data from url: ', url
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print "[HomeDepot] :: Failed to get " \
                      "content of url: %s" % url
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div', class_='pod-inner'):
                self.scrap_result_row(div)
            sleep_scrapper('HomeDepot')
        except Exception as exp:
            print '[HomeDepot] :: run() :: Got exception: ' \
                  '%s while fetching data from url: %s' % (exp, url)
def run(self):
    base_url = 'https://www.indeed.co.in/jobs?q=' \
               '%s&l=%s&start=' % (self.post, self.location)
    for j in range(0, 1000, 10):
        url = ''
        try:
            url = base_url + str(j)
            print '[IndeedScraper] :: fetching data from url:', url
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print "[IndeedScraper] :: Failed to " \
                      "get content of url: %s" % url
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div'):
                # skip divs that have no class attribute
                if 'class' not in div.attrs:
                    continue
                cls = div.attrs['class']
                if 'row' in cls and 'result' in cls:
                    self.scrap_result_row(div)
                    # break
            sleep_scrapper('IndeedScraper')
        except Exception as exp:
            print '[IndeedScraper] :: run() :: Got exception: ' \
                  '%s while fetching data from url: %s' % (exp, url)
def scrap_android_developer(self):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    file_data = "%s/%s" % (dir_path, PROGRESS_ANDROID)

    # if progress file is not found,
    # create a new one with default value 10
    if not os.path.isfile(file_data):
        print("Error: %s file not found" % file_data)
        txt_write(PROGRESS_ANDROID, "10")

    print("file exists %s ..." % file_data)

    f = open(PROGRESS_ANDROID, "r")
    i = int(f.readline())

    base_url = "https://www.indeed.co.in/jobs?q=android+developer" \
               "&l=Mohali%2C+Punjab&start="
    for j in range(i, 100, 10):
        try:
            android_url = base_url + str(j)
            self.android.scrap_android_developer(android_url)

            # update scraping progress in Android progress file
            txt_write(PROGRESS_ANDROID, str(j))

            # sleep scrapper for a while
            sleep_scrapper("Android-Scrapper")
        except Exception as exp:
            print "scrap_android_developer() :: Got exception: %s" % exp
            print(traceback.format_exc())
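The progress-tracking scrapers below persist the last page index through txt_write(), which is defined elsewhere in the project. A minimal sketch of what that helper might look like is shown here; the write-relative-to-cwd behaviour mirrors how the scrapers re-open the file with a bare file name, but it is an assumption rather than the project's actual implementation.

# Hypothetical txt_write helper assumed by the progress-tracking scrapers.
# It simply overwrites the progress file with the latest page index so a
# restarted scraper can resume where it left off.
def txt_write(file_name, value):
    with open(file_name, "w") as f:
        f.write(value)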
def scrap_yellowpages_dry_cleaner(self):
    """ Scrap dry cleaners data from yellow pages """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    file_data = "%s/%s" % (dir_path, PROGRESS_YELLOWPAGES_FILE)

    # if progress file is not found,
    # create a new one with default value 1
    if not os.path.isfile(file_data):
        logging.error("Error: %s file not found" % file_data)
        print("Error: %s file not found" % file_data)
        txt_write(PROGRESS_YELLOWPAGES_FILE, "1")

    logging.info("file exists %s ..." % file_data)
    print("file exists %s ..." % file_data)

    f = open(PROGRESS_YELLOWPAGES_FILE, "r")
    i = int(f.readline())
    logging.info(
        "\n\nYellowPages-Dry-Cleaner-Scrapper started at i: %d \n\n" % i
    )
    print("\n\nYellowPages-Dry-Cleaner-Scrapper started at i: %d \n\n" % i)

    base_url = "https://www.yellowpages.com/" \
               "new-york-ny/dry-cleaners-laundries?page="
    for j in range(i, 86, 1):
        try:
            yellowpages_url = base_url + str(j)
            self.yellowpages.scrap_yellowpages(yellowpages_url,
                                               ST_DRY_CLEANERS)

            # progress % of yellowpages_dry_cleaner scraping
            percentage = float(j * 100) / 86
            logging.info(
                "\n\nYellowPages-Dry-Cleaner-Scrapper %f "
                "percent completed" % percentage
            )
            print(
                "\n\nYellowPages-Dry-Cleaner-Scrapper %f "
                "percent completed" % percentage
            )

            # update scraping progress in
            # yellowpages_dry_cleaner progress file
            txt_write(PROGRESS_YELLOWPAGES_FILE, str(j))

            # sleep scrapper for a while
            sleep_scrapper("YellowPages-Dry-Cleaner-Scrapper")
        except Exception as exp:
            logging.error(
                "scrap_yellowpages_dry_cleaner() :: "
                "Got Exception : %s" % exp
            )
            logging.error(traceback.format_exc())
def scrap_yelp_dry_cleaners(self):
    """ Scrap dry cleaners data from yelp """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    file_data = "%s/%s" % (dir_path, PROGRESS_YELP_FILE)

    # if progress file is not found,
    # create a new one with default value 10
    if not os.path.isfile(file_data):
        logging.error("Error: %s file not found" % file_data)
        print("Error: %s file not found" % file_data)
        txt_write(PROGRESS_YELP_FILE, "10")

    logging.info("file exists %s ..." % file_data)
    print("file exists %s ..." % file_data)

    f = open(PROGRESS_YELP_FILE, "r")
    i = int(f.readline())
    logging.info(
        "\n\nYelp-Dry-Cleaner-Scrapper started at i: %d \n\n" % i
    )
    print("\n\nYelp-Dry-Cleaner-Scrapper started at i: %d \n\n" % i)

    base_url = "https://www.yelp.com/search?find_desc" \
               "=Dry+Cleaners&find_loc=New+York%2C+NY&start="
    for j in range(i, 1000, 10):
        try:
            yelp_url = base_url + str(j)
            self.yelp.scrap_yelp(yelp_url, ST_DRY_CLEANERS)

            # progress % of yelp_dry_cleaner scraping
            percentage = float(j * 100) / 1000
            logging.info(
                "\n\nYelp-Dry-Cleaner-Scrapper %f "
                "percent completed" % percentage
            )
            print(
                "\n\nYelp-Dry-Cleaner-Scrapper %f "
                "percent completed" % percentage
            )

            # update scraping progress in yelp_dry_cleaner progress file
            txt_write(PROGRESS_YELP_FILE, str(j))

            # sleep scrapper for a while
            sleep_scrapper("Yelp-Dry-Cleaner-Scrapper")
        except Exception as exp:
            logging.error(
                "scrap_yelp_dry_cleaners() :: Got Exception : %s" % exp
            )
            logging.error(traceback.format_exc())
def scrap_yellowpages_shoe_repair(self):
    """ Scrap shoe repair data from yellow pages """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    file_data = "%s/%s" % (
        dir_path, PROGRESS_YELLOWPAGES_SHOE_REPAIR_FILE
    )

    # if progress file is not found,
    # create a new one with default value 1
    if not os.path.isfile(file_data):
        logging.error("Error: %s file not found" % file_data)
        txt_write(PROGRESS_YELLOWPAGES_SHOE_REPAIR_FILE, "1")

    logging.info("file exists %s ..." % file_data)

    f = open(PROGRESS_YELLOWPAGES_SHOE_REPAIR_FILE, "r")
    i = int(f.readline())
    logging.info(
        "\n\nYellowpages-Shoe-Repair-Scrapper started at i: %d \n\n" % i
    )

    base_url = "https://www.yellowpages.com/search?" \
               "search_terms=Shoe%20Repair" \
               "&geo_location_terms=New%20York%2C%20NY&page="
    for j in range(i, 30, 1):
        try:
            yellowpages_url = base_url + str(j)
            self.yellowpages.scrap_yellowpages(yellowpages_url,
                                               ST_SHOE_REPAIR)

            # progress % of yellowpages_shoe_repair scraping
            percentage = float(j * 100) / 30
            logging.info(
                "\n\nYellowPages-Shoe-Repair-Scrapper %f percent "
                "completed" % percentage
            )

            # update scraping progress in
            # yellowpages_shoe_repair progress file
            txt_write(PROGRESS_YELLOWPAGES_SHOE_REPAIR_FILE, str(j))

            # sleep scrapper for a while
            sleep_scrapper("YellowPages-Shoe-Repair-Scrapper")
        except Exception as exp:
            logging.error(
                "scrap_yellowpages_shoe_repair() :: "
                "Got Exception : %s" % exp
            )
            logging.error(traceback.format_exc())
def run(self):
    try:
        url = 'https://www.countryliving.com/food-drinks/'
        print ('[AnyWebsiteScraper] :: fetching data from url: ', url)
        r = requests.get(url, headers=get_request_headers())
        if not r.status_code == 200:
            print ("[AnyWebsiteScraper] :: Failed to get "
                   "content of url: %s" % url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        div11 = soup.find('div', class_='feed feed-grid')
        for div33 in div11.find_all('div', class_='simple-item'):
            self.scrap_result_row(div33)
        sleep_scrapper('AnyWebsiteScraper')

        # infinite scrolling logic: the site loads further items
        # through an AJAX endpoint, so fetch it page by page
        base_url = 'https://www.countryliving.com/ajax/infiniteload/' \
                   '?id=34aae02d-c035-47e5-95c5-b87ba30c1dd8' \
                   '&class=CoreModels%5Csections%5CSectionModel' \
                   '&viewset=section&cachebuster=&page='
        for i in range(2, 100, 1):
            url = base_url + str(i)
            print ('[AnyWebsiteScraper] :: fetching data from url: ', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print ("[AnyWebsiteScraper] :: Failed to get "
                       "content of url: %s" % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            div11 = soup.find('div', class_='feed feed-grid')
            for div33 in div11.find_all('div', class_='simple-item'):
                self.scrap_result_row(div33)
            sleep_scrapper('AnyWebsiteScraper')
        # infinite scrolling logic ends
    except Exception as exp:
        print ('[AnyWebsiteScraper] :: run() :: Got exception: %s' % exp)
        print(traceback.format_exc())
def scrap_cragilist_shoe_repair(self):
    """ Scrap shoe repair data from craigslist """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    file_data = "%s/%s" % (dir_path, PROGRESS_CRAIGLIST_SHOE_REPAIR_FILE)

    # if progress file is not found,
    # create a new one with default value 1
    if not os.path.isfile(file_data):
        logging.error("Error: %s file not found" % file_data)
        txt_write(PROGRESS_CRAIGLIST_SHOE_REPAIR_FILE, "1")

    logging.info("file exists %s ..." % file_data)

    f = open(PROGRESS_CRAIGLIST_SHOE_REPAIR_FILE, "r")
    i = int(f.readline())
    logging.info("\n\nCraiglist-Shoe-Repair started at i: %d \n\n" % i)

    base_url = "https://newyork.craigslist.org/search" \
               "/sss?query=shoe+repair&sort=rel"
    for j in range(i, 90, 1):
        try:
            craiglist_url = base_url + str(j)
            self.craiglist.scrap_craiglist(craiglist_url, ST_SHOE_REPAIR)

            # progress % of craiglist_shoe_repair scraping
            percentage = float(j * 100) / 90
            logging.info(
                "\n\nCraiglist-Shoe-Repair-Scrapper %f "
                "percent completed" % percentage
            )

            # update scraping progress in
            # craiglist_shoe_repair progress file
            txt_write(PROGRESS_CRAIGLIST_SHOE_REPAIR_FILE, str(j))

            # sleep scrapper for a while
            sleep_scrapper("Craiglist-Shoe-Repair-Scrapper")
        except Exception as exp:
            logging.error(
                "scrap_cragilist_shoe_repair() :: "
                "Got Exception : %s" % exp
            )
            logging.error(traceback.format_exc())
def scrap_groupon_shoe_repair(self):
    """ Scrap shoe repair data from groupon """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    file_data = "%s/%s" % (dir_path, PROGRESS_GROUPON_SHOE_REPAIR_FILE)

    # if progress file is not found,
    # create a new one with default value 1
    if not os.path.isfile(file_data):
        logging.error("Error: %s file not found" % file_data)
        txt_write(PROGRESS_GROUPON_SHOE_REPAIR_FILE, "1")

    logging.info("file exists %s ..." % file_data)

    f = open(PROGRESS_GROUPON_SHOE_REPAIR_FILE, "r")
    i = int(f.readline())
    logging.info(
        "\n\nGroupon-Shoe-Repair-Scrapper started at i: %d \n\n" % i
    )

    base_url = "https://www.groupon.com/browse/chicago?" \
               "lat=41.8795&lng=-87.6243&address=Chicago&query=" \
               "new+york+shoe+repair&locale=en_US&page="
    for j in range(i, 400, 1):
        try:
            groupon_url = base_url + str(j)
            self.groupon.scrap_groupon(groupon_url, ST_SHOE_REPAIR)

            # progress % of groupon_shoe_repair scraping
            percentage = float(j * 100) / 400
            logging.info(
                "\n\nGroupon-Shoe-Repair-Scrapper %f percent "
                "completed" % percentage
            )

            # update scraping progress in
            # groupon_shoe_repair progress file
            txt_write(PROGRESS_GROUPON_SHOE_REPAIR_FILE, str(j))

            # sleep scrapper for a while
            sleep_scrapper("Groupon-Shoe-Repair-Scrapper")
        except Exception as exp:
            logging.error(
                "scrap_groupon_shoe_repair() :: Got Exception : %s" % exp
            )
            logging.error(traceback.format_exc())
def srap_yelp_shoe_repair(self):
    """ Scrap shoe repair data from Yelp """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    file_data = "%s/%s" % (dir_path, PROGRESS_YELP_SHOE_REPAIR_FILE)

    # if progress file is not found,
    # create a new one with default value 1
    if not os.path.isfile(file_data):
        logging.error("Error: %s file not found" % file_data)
        txt_write(PROGRESS_YELP_SHOE_REPAIR_FILE, "1")

    logging.info("file exists %s ..." % file_data)

    f = open(PROGRESS_YELP_SHOE_REPAIR_FILE, "r")
    i = int(f.readline())
    logging.info(
        "\n\nYelp-Shoe-Repair-Scrapper started at i: %d \n\n" % i
    )

    base_url = "https://www.yelp.com/search?find_desc" \
               "=Shoe+repair&find_loc=New+York,+NY&start="
    for j in range(i, 820, 10):
        try:
            yelp_url = base_url + str(j)
            self.yelp.scrap_yelp(yelp_url, ST_SHOE_REPAIR)

            # progress % of yelp_shoe_repair scraping
            percentage = float(j * 100) / 820
            logging.info(
                "\n\nYelp-Shoe-Repair-Scrapper %f percent "
                "completed" % percentage
            )

            # update scraping progress in yelp_shoe_repair progress file
            txt_write(PROGRESS_YELP_SHOE_REPAIR_FILE, str(j))

            # sleep scrapper for a while
            sleep_scrapper("Yelp-Shoe-Repair-Scrapper")
        except Exception as exp:
            logging.error(
                "srap_yelp_shoe_repair() :: Got exception: %s" % exp
            )
            logging.error(traceback.format_exc())
def run(self):
    base_url = 'https://www.yellowpages.com/search?' \
               'search_terms=software+company' \
               '&geo_location_terms=New+York%2C+NY&page='
    for j in range(27, 100, 1):
        try:
            url = base_url + str(j)
            print ('[YellowPagesScraper] :: fetching data from url: ', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print ('[YellowPagesScraper] :: Failed to get the '
                       'content of url: %s' % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            for div in soup.find_all('div', class_='info'):
                self.scrap_result_row(div)
            sleep_scrapper('YellowPagesScraper')
        except Exception as exp:
            print ('[YellowPagesScraper] :: run() :: Got exception: %s'
                   % exp)
            print(traceback.format_exc())
def run(self):
    try:
        base_url = 'https://inc42.com/buzz/'
        r = requests.get(base_url, headers=get_request_headers())
        print('[Inc42Scrapper] :: fetching data from url: ', base_url)
        if not r.status_code == 200:
            print ("[Inc42Scrapper] :: Failed to get "
                   "content of url: %s" % base_url)
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        div1 = soup.find('div', class_="site-content")
        inc_news = div1.find_all(
            "div", {"class": "card-wrapper horizontal-card"})
        for news in inc_news:
            self.scrape_home(news)
        sleep_scrapper('Inc42scrapper')

        # next pages data
        for i in range(2, 100, 1):
            page = 'page'
            url = base_url + page + str(i)
            r = requests.get(url, headers=get_request_headers())
            print('[Inc42Scrapper] :: fetching data from url: ', url)
            if not r.status_code == 200:
                print ("[Inc42Scrapper] :: Failed to get "
                       "content of url: %s" % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            div1 = soup.find('div', class_="site-content")
            inc_news = div1.find_all(
                "div", {"class": "card-wrapper horizontal-card"})
            for news in inc_news:
                self.scrape_home(news)
            sleep_scrapper('Inc42scrapper')
    except Exception as exp:
        print ('[Inc42Scrapper] :: run() :: Got exception while '
               'fetching data from the Inc42 homepage: %s' % exp)
        print(traceback.format_exc())
def run(self): base_url = "https://www.yelp.com/search?find_desc=%s&find_loc=%s,+NY&start=" % ( self.product, self.location) for j in range(1, 1000, 10): try: url = base_url + str(j) print '[YelpScraper] :: fetching data from url: ', url r = requests.get(url, headers=get_request_headers()) if not r.status_code == 200: print '[YelpScraper] :: Failed to get content of url: %s' % url return html_doc = r.content soup = BeautifulSoup(html_doc, 'html.parser') for li in soup.find_all('li', class_='regular-search-result'): self.scrap_row_yelp(li) sleep_scrapper('YelpScraper') except Exception as exp: print '[YelpScraper] :: run() :: Got exceptiion : %s ' % exp print(traceback.format_exc())
def run(self):
    try:
        url = 'https://news.google.com/news/headlines/section/topic' \
              '/NATION.en_in/India?ned=in&hl=en-IN&gl=IN'
        print '[GoogleNewsScraper] :: fetching data from url: ', url
        r = requests.get(url, headers=get_request_headers())
        if not r.status_code == 200:
            print "[GoogleNewsScraper] :: Failed to get " \
                  "content of url: %s" % url
            return
        html_doc = r.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        for div in soup.find_all('div', class_='v4IxVd'):
            self.scrap_result_row(div)
        sleep_scrapper('GoogleNewsScraper')
    except Exception as exp:
        print '[GoogleNewsScraper] :: run() :: Got exception: %s' \
              % exp
        print(traceback.format_exc())
def run(self):
    try:
        options = Options()
        options.add_argument("window-size=1400,600")
        # randomise the User-Agent for each run
        ua = UserAgent()
        user_agent = ua.random
        options.add_argument(f'user-agent={user_agent}')
        # chromedriver path is machine-specific
        driver = webdriver.Chrome(
            "C:/Users/Dell/Downloads/chromedriver_win32/chromedriver.exe",
            options=options)

        for i in range(0, 60, 10):
            suffix = '&q=%s' % self.keyword
            url = ('https://www.google.com/search?client=firefox-b-d'
                   '&biw=1366&bih=654&sa=N'
                   '&ved=0ahUKEwjfy8ugnZHkAhVLro8KHXq1Ar0Q8tMDCJMC'
                   '&ei=zslbXd-sHcvcvgT66oroCw&start=' + str(i) + suffix)
            driver.get(url)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            for div in soup.find_all('div', class_='g'):
                self.scrap_result_row(div)
            time.sleep(15)
            sleep_scrapper('GoogleSearchListingsScraper')

        # close the browser once all result pages are processed
        driver.quit()
    except Exception as exp:
        print(
            '[GoogleSearchListingsScraper] :: run() :: Got exception: %s'
            % exp)
        print(traceback.format_exc())
def run(self):
    for j in range(0, 2, 1):
        try:
            url = 'https://www.samsclub.com/sams/coffee-tea-cocoa/' \
                  '1493.cp?xid=cat_sub&navAction=jump'
            print '[Samsclub] :: fetching data from url: ', url
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print "[Samsclub] :: Failed to get content of url: %s" % url
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            # parsing html content to get information about the products
            for div in soup.find_all('div', class_='products-card'):
                self.scrap_result_row(div)
            sleep_scrapper('Samsclub')
        except Exception as exp:
            print '[Samsclub] :: run() :: Got exception : %s' % exp
def scrap_craiglist_dry_cleaner(self):
    """ Scrap dry cleaners data from craigslist
    :return:
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    file_data = "%s/%s" % (dir_path, PROGRESS_CRAIGLIST_FILE)

    # if progress file is not found,
    # create a new one with default value 1
    if not os.path.isfile(file_data):
        logging.error("Error: %s file not found" % file_data)
        print("Error: %s file not found" % file_data)
        txt_write(PROGRESS_CRAIGLIST_FILE, "1")

    logging.info("file exists %s ..." % file_data)
    print("file exists %s ..." % file_data)

    f = open(PROGRESS_CRAIGLIST_FILE, "r")
    i = int(f.readline())
    logging.info(
        "\n\nCraiglist-Dry-Cleaner started at i: %d \n\n" % i
    )
    print(
        "\n\nCraiglist-Dry-Cleaner started at i: %d \n\n" % i
    )

    base_url = "https://newyork.craigslist.org/" \
               "search/sss?query=dry%20cleaner&sort=rel"
    for j in range(i, 89, 1):
        try:
            craiglist_url = base_url + str(j)
            self.craiglist.scrap_craiglist(craiglist_url, ST_DRY_CLEANERS)

            # progress % of craiglist_dry_cleaner scraping
            percentage = float(j * 100) / 89
            logging.info(
                "\n\nCraiglist-Dry-Cleaner-Scrapper %f "
                "percent completed" % percentage
            )
            print(
                "\n\nCraiglist-Dry-Cleaner-Scrapper %f "
                "percent completed" % percentage
            )

            # update scraping progress in
            # craiglist_dry_cleaner progress file
            txt_write(PROGRESS_CRAIGLIST_FILE, str(j))

            # sleep scrapper for a while
            sleep_scrapper("Craiglist-Dry-Cleaner-Scrapper")
        except Exception as exp:
            logging.error(
                "scrap_craiglist_dry_cleaner() :: "
                "Got Exception : %s" % exp
            )
            logging.error(traceback.format_exc())
def scrap_groupon_dry_cleaner(self):
    """ Scrap dry cleaners data from groupon """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    file_data = "%s/%s" % (dir_path, PROGRESS_GROUPON_FILE)

    # if progress file is not found,
    # create a new one with default value 1
    if not os.path.isfile(file_data):
        logging.error("Error: %s file not found" % file_data)
        print("Error: %s file not found" % file_data)
        txt_write(PROGRESS_GROUPON_FILE, "1")

    logging.info("file exists %s ..." % file_data)
    print("file exists %s ..." % file_data)

    f = open(PROGRESS_GROUPON_FILE, "r")
    i = int(f.readline())
    logging.info(
        "\n\nGroupon-Dry-Cleaner-Scrapper started at i: %d \n\n" % i
    )
    print(
        "\n\nGroupon-Dry-Cleaner-Scrapper started at i: %d \n\n" % i
    )

    base_url = "https://www.groupon.com/browse/chicago?" \
               "lat=41.8795&lng=-87.6243&address=Chicago&query=" \
               "dry+cleaners&locale=en_US&page="
    for j in range(i, 16, 1):
        try:
            groupon_url = base_url + str(j)
            self.groupon.scrap_groupon(groupon_url, ST_DRY_CLEANERS)

            # progress % of groupon_dry_cleaner scraping
            percentage = float(j * 100) / 15
            logging.info(
                "\n\nGroupon-Dry-Cleaner-Scrapper %f "
                "percent completed" % percentage
            )
            print(
                "\n\nGroupon-Dry-Cleaner-Scrapper %f "
                "percent completed" % percentage
            )

            # update scraping progress in
            # groupon_dry_cleaner progress file
            txt_write(PROGRESS_GROUPON_FILE, str(j))

            # sleep scrapper for a while
            sleep_scrapper("Groupon-Dry-Cleaner-Scrapper")
        except Exception as exp:
            logging.error(
                "scrap_groupon_dry_cleaner() :: Got Exception : %s" % exp
            )
            logging.error(traceback.format_exc())
def run(self):
    try:
        # Teams news scraping
        cricketTeams_URLs = [
            'http://www.espn.in/cricket/team/_/id/1/england',
            'http://www.espn.in/cricket/team/_/id/2/australia',
            'http://www.espn.in/cricket/team/_/id/3/south-africa',
            'http://www.espn.in/cricket/team/_/id/4/west-indies',
            'http://www.espn.in/cricket/team/_/id/5/new-zealand',
            'http://www.espn.in/cricket/team/_/id/6/india',
            'http://www.espn.in/cricket/team/_/id/7/pakistan',
            'http://www.espn.in/cricket/team/_/id/8/sri-lanka',
            'http://www.espn.in/cricket/team/_/id/9/zimbabwe',
            'http://www.espn.in/cricket/team/_/id/25/bangladesh/'
        ]
        for url in cricketTeams_URLs:
            print('[ESPNScrapper] :: fetching data from TEAMS url: ', url)
            r = requests.get(url, headers=get_request_headers())
            if not r.status_code == 200:
                print("[ESPNScrapper] :: Failed to get "
                      "content of url: %s" % url)
                return
            html_doc = r.content
            soup = BeautifulSoup(html_doc, 'html.parser')
            sect1 = soup.find('section', id='pane-main')
            sect11 = sect1.find('section', id='main-container')
            sect2 = sect11.find('section', id='news-feed')
            news_feed_list = sect2.find('div', class_='container')
            section_news = news_feed_list.find_all(
                "article",
                {"class": "news-feed-item news-feed-story-package"})
            Headlines = news_feed_list.find_all(
                "article",
                {"class": "news-feed-item news-feed-story-package "
                          "is-headline"})
            team_parsed_urls = []

            # headline stories
            for news in Headlines:
                news_ = news.find("a", {"class": "story-link"})
                if news_:
                    news_url = news_['data-popup-href']
                    team_parsed_urls.append(news_url)

            # regular stories
            for news in section_news:
                print('You are now scraping regular news from the webpage')
                news_ = news.find("a", {"class": "story-link"})
                if news_:
                    news_url = news_['data-popup-href']
                    print('[ESPNScrapper] :: section news URL: ', news_url)
                    team_parsed_urls.append(news_url)

            # infinite scrolling on the team page goes through the
            # minifeed endpoint, paged by an offset and team id
            infinite_scroll_url = (
                'https://secure.espn.com/core/minifeed?render=true'
                '&partial=newsfeed&limit=20&xhr=1&template=clubhouse'
                '&headlinestack=true&site=espn&lang=en&region=in'
                '&sport=cricket&pubkey=cricket-clubhouse&insider=false'
                '&device=desktop&country=in&lang=en&region=in&site=espn'
                '&edition-host=espn.in&site-type=full&userab=0&offset=')
            ur_l = url.split('/')
            team = '&team=' + ur_l[7]
            for i in range(0, 100, 25):
                scroll_url = infinite_scroll_url + str(i) + team
                print('[ESPNScrapper] :: fetching data from '
                      'infinite-url: ', scroll_url)
                try:
                    raw_json = requests.get(scroll_url).text
                    data = json.loads(raw_json)
                    for item in data['content']['html']['items']:
                        qw = item['html']
                        try:
                            qwe = json.dumps(qw)
                            soup = BeautifulSoup(qwe, 'html.parser')
                            section_ = soup.find("a")['data-popup-href']
                            sect = section_.replace('\\"', '')
                            if re.search('clip', sect):
                                print("NEWS only contains video, no text, "
                                      "no image, so skipping this news")
                            else:
                                team_parsed_urls.append(sect)
                                self.scrape_post_content(sect)
                        except Exception as exp:
                            print('[ESPNscrapper] :: run() :: Got exception '
                                  'while fetching data from TEAMS url: %s'
                                  % exp)
                            print(traceback.format_exc())
                except Exception as exp:
                    print('[ESPNscrapper] :: run() :: Got exception '
                          'while fetching data from TEAMS url: %s' % exp)
                    print(traceback.format_exc())
                sleep_scrapper('ESPNScrapper')
            sleep_scrapper('ESPNScrapper')

        # homepage content feed
        base_url = 'https://www.espn.in/cricket/'
        print('[ESPNScrapper] :: fetching data from BASE url: ', base_url)
        ree = requests.get(base_url, headers=get_request_headers())
        if not ree.status_code == 200:
            print("[ESPNScrapper] :: Failed to get "
                  "content of url: %s" % base_url)
            return
        html_doc = ree.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        col2_feed = soup.find('section', class_='col-two contentFeed')
        contentf = col2_feed.find_all("section", {"class": "contentItem"})
        for content in contentf:
            self.scrape_sports(content, cricketTeams_URLs)
        contentf = col2_feed.find_all("article", {"class": "contentItem"})
        for contentfeed in contentf:
            self.scrape_sports(contentfeed, cricketTeams_URLs)

        # only about 26 posts are on the starting webpage; the rest are
        # served by the oneFeed infinite-loading endpoint
        infinite_url = (
            'https://onefeed.fan.api.espn.com/apis/v3/cached/contentEngine'
            '/oneFeed/leagues/cricket?source=ESPN.com%2B-%2BFAM&showfc=true'
            '&region=in&limit=15&lang=en&authorizedNetworks=espn_free'
            '&editionKey=espnin-en&device=desktop'
            '&pubkey=espncricinfo-en-in-cricket-index&isPremium=true'
            '&locale=in&featureFlags=expandAthlete&featureFlags=mmaGB'
            '&offset=')
        for i in range(10, 100, 15):
            scroll_url = infinite_url + str(i)
            print('[ESPNScrapper] :: fetching data from '
                  'infinite-url: ', scroll_url)
            try:
                raw_json = requests.get(
                    scroll_url, headers=get_request_headers()).text
                dataa = json.loads(raw_json)
                for data in dataa['feed']:
                    qw = data['data']['now'][0]
                    try:
                        keys = sorted(qw.items())
                        result = [(key, value) for key, value in keys
                                  if key.startswith("links")]
                        result11 = result[0]
                        result33 = list(result11)
                        reo = result33[1]
                        scroll_news_url = reo['web']['href']
                        print('FETCHING post from scroll URL')
                        self.scrape_post_content(scroll_news_url)
                    except Exception as exp:
                        print('[ESPNscrapper] :: run() :: Got exception '
                              'while fetching scrolled data: %s' % exp)
                        print(traceback.format_exc())
            except Exception as exp:
                print('[ESPNscrapper] :: run() :: Got exception '
                      'while fetching scrolled data: %s' % exp)
                print(traceback.format_exc())
            sleep_scrapper('ESPNScrapper')
    except Exception as exp:
        print('[ESPNscrapper] :: run() :: Got exception while '
              'fetching ESPN data: %s' % exp)
        print(traceback.format_exc())