def reset_sessions(data_url):
    """Build an SgRequests session able to fetch *data_url* past Incapsula.

    Drives a headless Chrome (selenium-wire) through the site's Incapsula
    challenge, then replays each captured request's headers through a plain
    SgRequests session until one returns real HTML.

    Returns [session, driver, headers, response_text] on success, or None
    after quitting the browser when no captured header set works.
    Relies on the module-level ``base_url`` constant.
    """
    session = SgRequests()
    driver = SgChrome(is_headless=True).driver()
    driver.get(base_url)
    incap_str = "/_Incapsula_Resource?SWJIYLWA=719d34d31c8e3a6e6fffd425f7e032f3"
    incap_url = base_url + incap_str
    # Prime the Incapsula cookies on the plain session.
    session.get(incap_url)
    for request in driver.requests:
        headers = request.headers
        try:
            response_text = session.get(data_url, headers=headers).text
        except Exception:
            # Best-effort: a failing header set just means try the next one.
            continue
        # Heuristic: a challenge/blocked page contains no "div" markup.
        if len(response_text.split("div")) < 2:
            continue
        return [session, driver, headers, response_text]
    # Fix: close the browser on total failure so the Chrome process
    # does not leak (the original fell off the end, leaking the driver).
    driver.quit()
    return None
def fetchSinglePage(data_url, findRedirect=False):
    """Fetch *data_url* by replaying headers captured by the shared driver.

    Makes up to 10 passes over the module-level driver's captured requests.
    With ``findRedirect`` set, hunts for a ``window.location.replace`` target
    and returns [session, headers, redirect_url]; otherwise returns
    [session, headers, {"response", "hours_of_operation", "phone"}] once a
    page containing real HTML (a "div") comes back. Returns None implicitly
    if every attempt fails. Uses module globals: driver, website,
    getHoursOfOperation, getPhone.
    """
    session = SgRequests()
    driver.get(data_url)
    incap_str = "/_Incapsula_Resource?SWJIYLWA=719d34d31c8e3a6e6fffd425f7e032f3"
    incap_url = website + incap_str
    # Prime Incapsula cookies on the plain session.
    session.get(incap_url)
    for attempt in range(10):
        if findRedirect:
            print("find redirect")
        print("try: " + str(attempt))
        for captured in driver.requests:
            headers = captured.headers
            try:
                response = session.get(data_url, headers=headers)
                response_text = response.text
                test_html = response_text.split("div")
                if findRedirect and response_text.find("window.location.replace") > -1:
                    try:
                        redirect_target = response_text.split(
                            "window.location.replace('"
                        )[1].split("')")[0]
                        return [session, headers, redirect_target]
                    except Exception:
                        continue
                elif len(test_html) < 2:
                    # Challenge page, no real markup — try the next header set.
                    continue
                else:
                    payload = {
                        "response": response_text,
                        "hours_of_operation": getHoursOfOperation(),
                        "phone": getPhone(session, headers, response_text),
                    }
                    return [session, headers, payload]
            except Exception:
                # Broad catch is deliberate: any failure means "try next".
                continue
# Flush rows collected earlier in the file (writer/data defined above).
for row in data:
    writer.writerow(row)


@retry(stop=stop_after_attempt(10))
def get_result(url, headers):
    """GET *url* with the shared module session.

    On any request failure, replaces the shared session with a fresh
    SgRequests and re-raises so tenacity retries (up to 10 attempts).
    """
    global session
    try:
        return session.get(url, headers=headers)
    # Fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    # catch Exception so those still propagate immediately.
    except Exception:
        session = SgRequests()
        raise


url_sitemap_main = "https://www.autozone.com/locations/sitemap.xml"
r = session.get(url_sitemap_main, headers=headers, timeout=10)
datar = html.fromstring(bytes(r.text, encoding="utf8"))
url_sub_sitemap = datar.xpath("//sitemap/loc/text()")
logger.info(f"Sitemap URLs: {len(url_sub_sitemap)}")


def get_all_raw_store_urls():
    """Collect every store URL listed in each sub-sitemap.

    Iterates the module-level ``url_sub_sitemap`` list and returns the
    concatenated ``<url><loc>`` entries from every sub-sitemap document.
    """
    urls_part1_and_part2 = []
    for url_part in url_sub_sitemap:
        r0 = session.get(url_part, headers=headers, timeout=120)
        datar0 = html.fromstring(bytes(r0.text, encoding="utf8"))
        logger.info(f"Scraping All Store URLs from: {url_part} ")
        urls_part1_and_part2.extend(datar0.xpath("//url/loc/text()"))
    return urls_part1_and_part2
async def get_brand(brand_code, brand_name, url):
    """Fetch all hotels for one Radisson brand and scrape each hotel page.

    Hits the zimba destinations API for *brand_code*, then awaits
    ``fetch_data`` coroutines in chunks of 10, pairing each sub-page result
    with its API record. Bumps the global EXPECTED_TOTAL by the brand's
    hotel count.

    Returns a list of {"main": api_record, "sub": page_result,
    "@type": brand_name} dicts.
    """
    url = url + brand_code
    headers = {
        "authority": "www.radissonhotels.com",
        "method": "GET",
        "path": "/zimba-api/destinations/hotels?brand=" + brand_code,
        "scheme": "https",
        "accept": "application/json, text/plain, */*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-us",
        "referer": "https://www.radissonhotels.com/en-us/destination",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
    }
    session = SgRequests()
    # Fix: removed the leftover debug print() that dumped the entire JSON
    # API payload to stdout on every call.
    son = session.get(url, headers=headers).json()
    task_list = []
    results = []
    chunk_size = 10
    last_chunk = 0
    last_tick = time.monotonic()
    total_records = len(son["hotels"])
    global EXPECTED_TOTAL
    EXPECTED_TOTAL += total_records
    for index, record in enumerate(son["hotels"]):
        task_list.append(fetch_data(index, record["overviewPath"]))
        if index % chunk_size == 0 and last_chunk != index:
            last_tick = time.monotonic()
            last_chunk = index
            if len(task_list) > 0:
                z = await asyncio.gather(*task_list)
                for item in z:
                    results.append({
                        "main": son["hotels"][item["index"]],
                        "sub": item,
                        "@type": brand_name,
                    })
                logzilla.info(
                    f"Finished {last_chunk}/{total_records} for brand {brand_name}, last step took {round(time.monotonic()-last_tick,5)} seconds."
                )
                task_list = []
                last_tick = time.monotonic()
    # Drain whatever is left after the final partial chunk.
    if len(task_list) > 0:
        z = await asyncio.gather(*task_list)
        for item in z:
            results.append({
                "main": son["hotels"][item["index"]],
                "sub": item,
                "@type": brand_name,
            })
        logzilla.info(
            f"Finished {total_records}/{total_records} for brand {brand_name}, last step took {round(time.monotonic()-last_tick,5)} seconds."
        )
    return results
# Accumulators for the scraped record columns.
zips = []
country_codes = []
store_numbers = []
phones = []
location_types = []
latitudes = []
longitudes = []
hours_of_operations = []

session = SgRequests()
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
url = "https://www.potatocornerusa.com"
response = session.get(url, headers=headers).text
soup = bs(response, "html.parser")
all_script = soup.find_all("script")
goods = []
# Collect every <script> whose body parses as JSON (e.g. JSON-LD blocks).
for script in all_script:
    try:
        # Fix: json.loads needs the tag's text, not the bs4 Tag object —
        # passing the Tag raised TypeError and every script was skipped,
        # leaving `goods` permanently empty.
        stripped = script.text.strip()
        logger.info(stripped)
        data = json.loads(stripped)
        goods.append(data)
    except Exception as ex:
        # Non-JSON scripts are expected; log and move on.
        logger.info(stripped)
        logger.info(ex)
def fetch_data():
    """Scrape Pick 'n Save store details from the store-locator sitemap.

    Reads the sitemap, follows every store-detail URL, parses the page's
    JSON-LD block, and returns one row per store in the fixed column order:
    [domain, link, name, street, city, state, zip, country, store_number,
    phone, location_type, lat, lng, hours].
    """
    sitemap_url = "https://www.picknsave.com/storelocator-sitemap.xml"
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"
    }
    session = SgRequests()
    sitemap = BeautifulSoup(session.get(sitemap_url, headers=headers).text, "lxml")

    rows = []
    domain = "picknsave.com"
    for loc in sitemap.find_all("loc"):
        link = loc.text
        if "stores/details" not in link:
            continue
        page = BeautifulSoup(session.get(link, headers=headers).text, "lxml")
        # Store metadata lives in the page's JSON-LD script tag.
        raw = page.find("script", attrs={"type": "application/ld+json"})
        store = json.loads(raw.text.replace("\n", "").strip())
        address = store["address"]
        rows.append([
            domain,
            link,
            store["name"],
            address["streetAddress"],
            address["addressLocality"],
            address["addressRegion"],
            address["postalCode"],
            "US",
            link.split("/")[-1],  # store number is the URL's last segment
            store["telephone"],
            "<MISSING>",
            store["geo"]["latitude"],
            store["geo"]["longitude"],
            store["openingHours"][0],
        ])
    return rows
def fetch_data():
    """Collect every City BBQ location from order.citybbq.com.

    Walks the participating-states list, each state's store list, then each
    store page, returning one row per store in the fixed column order:
    [domain, url, name, street, city, state, postal, country, store_number,
    phone, location_type, lat, lng, hours].
    """
    locator_domain = "https://www.citybbq.com"
    locations_url = "https://order.citybbq.com/locations"

    def build_headers(referer=None):
        # Rebuilds the exact header dict (and key order) used originally;
        # only the state/store page requests carry a Referer.
        hdrs = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
            "Connection": "keep-alive",
        }
        if referer is not None:
            hdrs["Referer"] = referer
        hdrs["Upgrade-Insecure-Requests"] = "1"
        hdrs["Pragma"] = "no-cache"
        hdrs["Cache-Control"] = "no-cache"
        hdrs["TE"] = "Trailers"
        return hdrs

    def clean_text(node, query):
        # Join an xpath text result, dropping newlines and edge whitespace.
        return "".join(node.xpath(query)).replace("\n", "").strip()

    cookies = {
        "_gcl_au": "1.1.1275666536.1616147724",
        "_ga": "GA1.2.1565131436.1616147732",
        "_gid": "GA1.2.169092942.1616147732",
        "_fbp": "fb.1.1616147732783.1672002159",
        "__cfduid": "d51d0f4f8d1b467178bce7dd202af32771616149617",
    }

    out = []
    session = SgRequests()
    resp = session.get(locations_url, headers=build_headers())
    state_nodes = html.fromstring(resp.text).xpath('//ul[@id="ParticipatingStates"]/li')
    for state_node in state_nodes:
        state_href = "".join(state_node.xpath(".//a/@href"))
        state_url = f"https://order.citybbq.com{state_href}"
        # A fresh session per request mirrors the original request pattern.
        session = SgRequests()
        resp = session.get(
            state_url,
            headers=build_headers(referer=locations_url),
            cookies=cookies,
        )
        store_nodes = html.fromstring(resp.text).xpath("//h2")
        for store_node in store_nodes:
            page_url = "".join(store_node.xpath(".//a/@href"))
            session = SgRequests()
            resp = session.get(page_url, headers=build_headers(referer=locations_url))
            page = html.fromstring(resp.text)
            hours_parts = page.xpath('//dl[@id="available-business-hours-popover"]//text()')
            hours_of_operation = " ".join(
                filter(None, (part.strip() for part in hours_parts))
            )
            out.append([
                locator_domain,
                page_url,
                clean_text(page, "//h1/text()"),
                clean_text(page, '//span[@class="street-address"]/text()'),
                clean_text(page, '//span[@class="locality"]/text()'),
                clean_text(page, '//span[@class="region"]/text()'),
                clean_text(page, '//span[@class="postal-code"]/text()'),
                "US",
                "<MISSING>",
                clean_text(page, '//span[@class="tel"]/text()'),
                "<MISSING>",
                "".join(page.xpath('//span[@class="latitude"]/span/@title')),
                "".join(page.xpath('//span[@class="longitude"]/span/@title')),
                hours_of_operation,
            ])
    return out
# Query the Pick 'n Save store-search endpoint once per search code and
# extract the embedded JSON.parse(...) store payload from the page script.
# NOTE(review): this loop body may continue past this chunk — the final
# `data` string is computed but not consumed here; verify downstream use.
for code in search:
    url = "https://www.picknsave.com/stores/search?searchText=" + code
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate"
    }
    #data = json.loads(bs(session.get(url, headers=headers).text, "html.parser").find_all("script")[-3].text.strip().split("parse(")[1].split("\')")[0][1:].replace("\\", "\\\\").replace("\\\\\\\\\"", ""))
    response = session.get(url, headers=headers).text
    soup = bs(response, "html.parser")
    scripts = soup.find_all("script")
    #logger.info(scripts[0])
    #data = json.loads(soup.find_all("script")[-3].text.strip().split("parse(")[1].split("\')")[0][1:].replace("\\", "\\\\").replace("\\\\\\\\\"", ""))
    data = soup.find_all("script")
    logger.info(data)
    # The third-from-last <script> holds the serialized store payload.
    data = data[-3]
    data = data.text.strip()
    logger.info("")
    logger.info("")
    logger.info(data)
    # Pull the string argument out of the JSON.parse('...') call, then
    # re-escape backslashes so the text can later be json-loaded.
    data = data.split("parse(")[1]
    data = data.split("\')")[0][1:]
    data = data.replace("\\", "\\\\").replace("\\\\\\\\\"", "")
phones = [] location_types = [] latitudes = [] longitudes = [] hours_op = [] session = SgRequests() search = DynamicGeoSearch(country_codes=[SearchableCountries.USA]) base_url = "https://www.coffeebean.com/store-locator" # Country search locs = [] for x in range(101): params = {"field_country_value": "USA", "page": x} r = session.get(base_url, params=params).text soup = bs(r, "html.parser") view_store = soup.find_all("a", attrs={"class": "view-store"}) for item in view_store: locs.append(item["href"]) # Lat Lng Boundary search base_url = "https://www.coffeebean.com/store-locator?field_geo_location_boundary%5Blat_north_east%5D=47.56&field_geo_location_boundary%5Blng_north_east%5D=69.44&field_geo_location_boundary%5Blat_south_west%5D=16.11&field_geo_location_boundary%5Blng_south_west%5D=-178.85" for x in range(101): params = {"page": x} r = session.get(base_url, params=params).text soup = bs(r, "html.parser") view_store = soup.find_all("a", attrs={"class": "view-store"}) for item in view_store:
def fetch_data():
    """Scrape Dream Doors kitchen showrooms via the store-finder AJAX endpoint.

    Queries the Joomla store-finder module once per generated UK postcode,
    visits each showroom page exactly once (deduped by URL), and returns rows:
    [domain, url, name, street, city, state, zip, country, store_number,
    phone, location_type, lat, lng, hours].
    """
    session = SgRequests()
    items = []
    scraped_items = []  # showroom URLs already emitted (dedupe across postcodes)
    DOMAIN = "dreamdoors.co.uk"
    start_url = "https://www.dreamdoors.co.uk/kitchen-showrooms"
    all_codes = DynamicZipSearch(
        country_codes=[SearchableCountries.BRITAIN],
        max_radius_miles=10,
        max_search_results=None,
    )
    for code in all_codes:
        formdata = {
            "option": "com_ajax",
            "module": "dreamdoors_store_finder",
            "postcode": code,
            "format": "raw",
        }
        headers = {
            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }
        response = session.post(start_url, data=formdata, headers=headers)
        if response.status_code != 200:
            continue
        data = json.loads(response.text)
        for poi in data:
            # Fix: idiomatic isinstance() instead of `type(poi) == str`;
            # string entries are status messages, not store records.
            if isinstance(poi, str):
                continue
            store_url = poi["url"]
            if store_url in scraped_items:
                continue
            loc_response = session.get(store_url)
            loc_dom = etree.HTML(loc_response.text)
            location_name = poi["name"]
            location_name = location_name if location_name else "<MISSING>"
            raw_address = loc_dom.xpath('//div[@class="address"]//text()')
            raw_address = [elem.strip() for elem in raw_address if elem.strip()]
            addr = parse_address_intl(
                " ".join(raw_address).replace("Address", "")
            )
            if addr.street_address_2:
                street_address = f"{addr.street_address_2} {addr.street_address_1}"
            else:
                street_address = addr.street_address_1
            street_address = street_address if street_address else "<MISSING>"
            if "Coming Soon" in street_address:
                continue
            city = addr.city
            city = city if city else "<MISSING>"
            if "Tbc" in city:
                # Parser put the street into the city slot for "TBC" pages.
                street_address = city
                city = "<MISSING>"
            state = "<MISSING>"
            zip_code = addr.postcode
            zip_code = zip_code if zip_code else "<MISSING>"
            country_code = addr.country
            country_code = country_code if country_code else "<MISSING>"
            store_number = poi["id"]
            store_number = store_number if store_number else "<MISSING>"
            phone = loc_dom.xpath('//a[@id="showroom-phone"]/text()')
            phone = phone[0] if phone else "<MISSING>"
            location_type = "<MISSING>"
            hoo = loc_dom.xpath('//div[@class="opening_times"]//text()')
            hoo = [elem.strip() for elem in hoo if elem.strip()]
            hours_of_operation = (
                " ".join(hoo[2:]).split(" Call ")[0] if hoo else "<MISSING>"
            )
            # Coordinates are embedded in a map_initialize(...) JS call.
            geo = re.findall(
                r'.map_initialize\("map_canvas", ".+", (.+?)\)', loc_response.text
            )
            latitude = "<MISSING>"
            longitude = "<MISSING>"
            if geo:
                geo = geo[0].split(", ")
                latitude = geo[0]
                longitude = geo[1]
            else:
                # Fallback: render the page in a real browser and read the
                # lat/lng out of the generated Google-Maps link.
                with SgFirefox() as driver:
                    driver.get(store_url)
                    sleep(10)
                    loc_dom = etree.HTML(driver.page_source)
                geo = loc_dom.xpath('//a[contains(@href, "maps/@")]/@href')
                if geo:
                    geo = geo[0].split("maps/@")[-1].split(",")[:2]
                    latitude = geo[0]
                    longitude = geo[1]
            item = [
                DOMAIN,
                store_url,
                location_name,
                street_address,
                city,
                state,
                zip_code,
                country_code,
                store_number,
                phone,
                location_type,
                latitude,
                longitude,
                hours_of_operation,
            ]
            if store_url not in scraped_items:
                scraped_items.append(store_url)
                items.append(item)
    return items