def reset_sessions(data_url):
    # Rebuild the requests session and headless Chrome driver, hit the
    # Incapsula resource, then replay captured browser headers until the
    # target URL returns real HTML instead of the challenge page.
    s = SgRequests()
    driver = SgChrome(is_headless=True).driver()
    driver.get(base_url)
    incap_str = "/_Incapsula_Resource?SWJIYLWA=719d34d31c8e3a6e6fffd425f7e032f3"
    incap_url = base_url + incap_str
    s.get(incap_url)
    for request in driver.requests:
        headers = request.headers
        try:
            response = s.get(data_url, headers=headers)
            response_text = response.text
            test_html = response_text.split("div")
            if len(test_html) < 2:
                continue
            else:
                return [s, driver, headers, response_text]
        except Exception:
            continue
def get_result(url, headers):
    global session
    try:
        return session.get(url, headers=headers)
    except Exception:
        # Swap in a fresh session, then re-raise so the caller can retry.
        session = SgRequests()
        raise
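# --- Hedged usage sketch (not part of the original file) --------------------
# get_result() swaps in a fresh SgRequests session and re-raises, which only
# pays off if the caller retries. tenacity (already imported elsewhere in this
# repo) can provide that loop; the attempt count here is an assumption.
from tenacity import retry, stop_after_attempt


@retry(stop=stop_after_attempt(3), reraise=True)
def get_result_with_retry(url, headers):
    # Each failed attempt has already replaced the global session, so the
    # next attempt starts with a clean one.
    return get_result(url, headers)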
def fetchSinglePage(data_url, findRedirect=False):
    # Fetch one page through the Incapsula-protected site, replaying headers
    # captured by the Selenium driver until the response contains real HTML
    # (or, when findRedirect is set, a window.location.replace target).
    session = SgRequests()
    driver.get(data_url)
    incap_str = "/_Incapsula_Resource?SWJIYLWA=719d34d31c8e3a6e6fffd425f7e032f3"
    incap_url = website + incap_str
    session.get(incap_url)
    for x in range(10):
        if findRedirect:
            print("find redirect")
        print("try: " + str(x))
        for request in driver.requests:
            headers = request.headers
            try:
                response = session.get(data_url, headers=headers)
                response_text = response.text
                test_html = response_text.split("div")
                if findRedirect and response_text.find("window.location.replace") > -1:
                    try:
                        return [
                            session,
                            headers,
                            response_text.split("window.location.replace('")[1].split("')")[0],
                        ]
                    except Exception:
                        continue
                elif len(test_html) < 2:
                    continue
                else:
                    return [
                        session,
                        headers,
                        {
                            "response": response_text,
                            "hours_of_operation": getHoursOfOperation(),
                            "phone": getPhone(session, headers, response_text),
                        },
                    ]
            except Exception:
                continue
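# --- Hedged usage sketch (not part of the original file) --------------------
# fetchSinglePage() returns [session, headers, payload]; with findRedirect=True
# the payload is the redirect target URL, otherwise it is a dict holding the
# page HTML, hours and phone. page_url and the variable names are assumptions.
session, headers, redirect_url = fetchSinglePage(page_url, findRedirect=True)
session, headers, payload = fetchSinglePage(redirect_url)
html_text = payload["response"]
phone = payload["phone"]
hours_of_operation = payload["hours_of_operation"]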
import csv
from sgrequests import SgRequests
from bs4 import BeautifulSoup
import re
import json
import unicodedata

session = SgRequests()


def write_output(data):
    with open('data.csv', mode='w', newline='', encoding="utf-8") as output_file:
        writer = csv.writer(
            output_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL
        )
        # Header
        writer.writerow([
            "locator_domain", "location_name", "street_address", "city",
            "state", "zip", "country_code", "store_number", "phone",
            "location_type", "latitude", "longitude", "hours_of_operation",
            "page_url"
        ])
        # Body
        for row in data:
            writer.writerow(row)


def fetch_data():
    addressess = []
    headers = {
from sglogging import SgLogSetup
from sgrequests import SgRequests
from tenacity import retry
from tenacity import stop_after_attempt
import time

logger = SgLogSetup().get_logger(logger_name="autozone_com")

locator_domain_url = "https://www.autozone.com"
MISSING = "<MISSING>"
headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "max-age=0",
}
session = SgRequests()
FIELDS = [
    "locator_domain",
    "page_url",
    "location_name",
    "street_address",
    "city",
    "state",
    "zip",
    "country_code",
    "store_number",
    "phone",
    "location_type",
    "latitude",
    "longitude",
async def get_brand(brand_code, brand_name, url):
    url = url + brand_code
    headers = {}
    headers["authority"] = "www.radissonhotels.com"
    headers["method"] = "GET"
    headers["path"] = "/zimba-api/destinations/hotels?brand=" + brand_code
    headers["scheme"] = "https"
    headers["accept"] = "application/json, text/plain, */*"
    headers["accept-encoding"] = "gzip, deflate, br"
    headers["accept-language"] = "en-us"
    headers["referer"] = "https://www.radissonhotels.com/en-us/destination"
    headers["sec-fetch-dest"] = "empty"
    headers["sec-fetch-mode"] = "cors"
    headers["sec-fetch-site"] = "same-origin"
    headers[
        "user-agent"
    ] = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36"

    session = SgRequests()
    son = session.get(url, headers=headers)
    print(son.text)  # debug output of the raw brand listing
    son = son.json()

    task_list = []
    results = []
    chunk_size = 10
    last_chunk = 0
    last_tick = time.monotonic()
    total_records = len(son["hotels"])

    global EXPECTED_TOTAL
    EXPECTED_TOTAL += total_records

    for index, record in enumerate(son["hotels"]):
        task_list.append(fetch_data(index, record["overviewPath"]))
        if index % chunk_size == 0 and last_chunk != index:
            last_tick = time.monotonic()
            last_chunk = index
            if len(task_list) > 0:
                # Gather the current chunk of detail-page coroutines.
                z = await asyncio.gather(*task_list)
                for item in z:
                    results.append({
                        "main": son["hotels"][item["index"]],
                        "sub": item,
                        "@type": brand_name,
                    })
                logzilla.info(
                    f"Finished {last_chunk}/{total_records} for brand {brand_name}, last step took {round(time.monotonic()-last_tick,5)} seconds."
                )
                task_list = []
                last_tick = time.monotonic()

    if len(task_list) > 0:
        # Flush any coroutines left over after the loop.
        z = await asyncio.gather(*task_list)
        for item in z:
            results.append({
                "main": son["hotels"][item["index"]],
                "sub": item,
                "@type": brand_name,
            })
        logzilla.info(
            f"Finished {total_records}/{total_records} for brand {brand_name}, last step took {round(time.monotonic()-last_tick,5)} seconds."
        )
    return results
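# --- Hedged usage sketch (not part of the original file) --------------------
# get_brand() is a coroutine, so a caller needs an event loop. The base URL is
# reconstructed from the headers above; the brand codes/names are placeholders,
# not values taken from the site.
import asyncio


async def gather_all_brands():
    base = "https://www.radissonhotels.com/zimba-api/destinations/hotels?brand="
    results = []
    for code, name in [("rdb", "Radisson Blu"), ("pii", "Park Inn")]:  # assumed codes
        results.extend(await get_brand(code, name, base))
    return results


all_hotels = asyncio.run(gather_all_brands())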
import csv

from sgrequests import SgRequests
from sgzip.dynamic import SearchableCountries
from sgzip.static import static_zipcode_list

session = SgRequests(retry_behavior=False)
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36",
    "content-type": "application/json",
    "accept": "application/json, text/plain, */*",
}


def write_output(data):
    with open("data.csv", mode="w") as output_file:
        writer = csv.writer(
            output_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_ALL
        )
        # Header
        writer.writerow([
            "locator_domain",
            "page_url",
            "location_name",
            "street_address",
            "city",
            "state",
            "zip",
        except Exception:
            a_tags = div_tag.find_all("a")
            for a_tag in a_tags:
                try:
                    location_url = a_tag["href"]
                except Exception:
                    pass
                if location_url in location_urls:
                    pass
                else:
                    location_urls.append(location_url)
                    count = count + 1

    x = 0
    phone_session = SgRequests()
    for location_url in location_urls:
        print(x)
        print(location_url)
        response = s.get(location_url, headers=headers)
        response_text = response.text
        if len(response_text.split("div")) > 2:
            pass
        else:
            # Challenge page came back; rebuild the session/driver and retry.
            new_sess = reset_sessions(location_url)
            s = new_sess[0]
            driver = new_sess[1]
            headers = new_sess[2]
            response_text = new_sess[3]
locator_domains = []
page_urls = []
location_names = []
street_addresses = []
citys = []
states = []
zips = []
country_codes = []
store_numbers = []
phones = []
location_types = []
latitudes = []
longitudes = []
hours_of_operations = []

session = SgRequests()
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
url = "https://www.potatocornerusa.com"
response = session.get(url, headers=headers).text
soup = bs(response, "html.parser")
all_script = soup.find_all("script")
goods = []
for script in all_script:
    try:
        stripped = script
        logger.info(stripped)
        data = json.loads(stripped)
def fetch_data():
    base_link = "https://www.picknsave.com/storelocator-sitemap.xml"

    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"
    headers = {"User-Agent": user_agent}

    session = SgRequests()
    req = session.get(base_link, headers=headers)
    base = BeautifulSoup(req.text, "lxml")

    items = base.find_all("loc")

    data = []
    locator_domain = "picknsave.com"

    for item in items:
        link = item.text
        if "stores/details" in link:
            req = session.get(link, headers=headers)
            base = BeautifulSoup(req.text, "lxml")

            script = (
                base.find("script", attrs={"type": "application/ld+json"})
                .text.replace("\n", "")
                .strip()
            )
            store = json.loads(script)

            location_name = store["name"]
            street_address = store["address"]["streetAddress"]
            city = store["address"]["addressLocality"]
            state = store["address"]["addressRegion"]
            zip_code = store["address"]["postalCode"]
            country_code = "US"
            store_number = link.split("/")[-1]
            location_type = "<MISSING>"
            phone = store["telephone"]
            hours_of_operation = store["openingHours"][0]
            latitude = store["geo"]["latitude"]
            longitude = store["geo"]["longitude"]

            # Store data
            data.append([
                locator_domain,
                link,
                location_name,
                street_address,
                city,
                state,
                zip_code,
                country_code,
                store_number,
                phone,
                location_type,
                latitude,
                longitude,
                hours_of_operation,
            ])

    return data
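# --- Hedged helper sketch (not part of the original file) -------------------
# A guarded version of the ld+json extraction used above: it returns None when
# a detail page has no structured-data block instead of raising AttributeError.
def load_ld_json(page_html):
    base = BeautifulSoup(page_html, "lxml")
    tag = base.find("script", attrs={"type": "application/ld+json"})
    if tag is None:
        return None
    return json.loads(tag.text.replace("\n", "").strip())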
phones = []
location_types = []
latitudes = []
longitudes = []
hours_of_operations = []

headers = {
    "User-Agent": "PostmanRuntime/7.19.0",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
}

session = SgRequests(retry_behavior=None)
url = "https://www.picknsave.com/stores/api/graphql"
search = DynamicZipSearch(country_codes=[SearchableCountries.USA])
for postal in search:
    data = {
        "query": "\n query storeSearch($searchText: String!, $filters: [String]!) {\n storeSearch(searchText: $searchText, filters: $filters) {\n stores {\n ...storeSearchResult\n }\n fuel {\n ...storeSearchResult\n }\n shouldShowFuelMessage\n }\n }\n \n fragment storeSearchResult on Store {\n banner\n vanityName\n divisionNumber\n storeNumber\n phoneNumber\n showWeeklyAd\n showShopThisStoreAndPreferredStoreButtons\n storeType\n distance\n latitude\n longitude\n tz\n ungroupedFormattedHours {\n displayName\n displayHours\n isToday\n }\n address {\n addressLine1\n addressLine2\n city\n countryCode\n stateCode\n zip\n }\n pharmacy {\n phoneNumber\n }\n departments {\n code\n }\n fulfillmentMethods{\n hasPickup\n hasDelivery\n }\n }\n",
        "variables": {"searchText": postal, "filters": []},
        "operationName": "storeSearch",
    }
    response = session.post(url, json=data, headers=headers).json()
    print(response)
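    # --- Hedged parsing sketch (not part of the original file) --------------
    # Continues the loop above. The storeSearch query selects phoneNumber,
    # latitude, longitude, storeType and formatted hours; a typical GraphQL
    # response nests them under data.storeSearch.stores, but that envelope and
    # the .get() defaults are assumptions about this API's shape.
    store_search = (response.get("data") or {}).get("storeSearch") or {}
    for store in store_search.get("stores", []):
        phones.append(store.get("phoneNumber"))
        location_types.append(store.get("storeType"))
        latitudes.append(store.get("latitude"))
        longitudes.append(store.get("longitude"))
        hours_of_operations.append(store.get("ungroupedFormattedHours"))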
def fetch_data():
    out = []
    locator_domain = "https://www.citybbq.com"
    api_url = "https://order.citybbq.com/locations"
    session = SgRequests()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "TE": "Trailers",
    }
    r = session.get(api_url, headers=headers)
    tree = html.fromstring(r.text)
    block = tree.xpath('//ul[@id="ParticipatingStates"]/li')
    for i in block:
        url1 = "".join(i.xpath(".//a/@href"))
        url1 = f"https://order.citybbq.com{url1}"
        session = SgRequests()
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
            "Connection": "keep-alive",
            "Referer": "https://order.citybbq.com/locations",
            "Upgrade-Insecure-Requests": "1",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "TE": "Trailers",
        }
        cookies = {
            "_gcl_au": "1.1.1275666536.1616147724",
            "_ga": "GA1.2.1565131436.1616147732",
            "_gid": "GA1.2.169092942.1616147732",
            "_fbp": "fb.1.1616147732783.1672002159",
            "__cfduid": "d51d0f4f8d1b467178bce7dd202af32771616149617",
        }
        r = session.get(url1, headers=headers, cookies=cookies)
        trees = html.fromstring(r.text)
        block = trees.xpath("//h2")
        for n in block:
            page_url = "".join(n.xpath(".//a/@href"))
            session = SgRequests()
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
                "Connection": "keep-alive",
                "Referer": "https://order.citybbq.com/locations",
                "Upgrade-Insecure-Requests": "1",
                "Pragma": "no-cache",
                "Cache-Control": "no-cache",
                "TE": "Trailers",
            }
            r = session.get(page_url, headers=headers)
            tree = html.fromstring(r.text)

            location_name = "".join(tree.xpath("//h1/text()")).replace("\n", "").strip()
            street_address = (
                "".join(tree.xpath('//span[@class="street-address"]/text()'))
                .replace("\n", "")
                .strip()
            )
            phone = (
                "".join(tree.xpath('//span[@class="tel"]/text()'))
                .replace("\n", "")
                .strip()
            )
            city = (
                "".join(tree.xpath('//span[@class="locality"]/text()'))
                .replace("\n", "")
                .strip()
            )
            state = (
                "".join(tree.xpath('//span[@class="region"]/text()'))
                .replace("\n", "")
                .strip()
            )
            country_code = "US"
            store_number = "<MISSING>"
            latitude = "".join(tree.xpath('//span[@class="latitude"]/span/@title'))
            longitude = "".join(tree.xpath('//span[@class="longitude"]/span/@title'))
            location_type = "<MISSING>"
            hours_of_operation = tree.xpath(
                '//dl[@id="available-business-hours-popover"]//text()'
            )
            hours_of_operation = list(
                filter(None, [a.strip() for a in hours_of_operation])
            )
            hours_of_operation = " ".join(hours_of_operation)
            postal = (
                "".join(tree.xpath('//span[@class="postal-code"]/text()'))
                .replace("\n", "")
                .strip()
            )

            row = [
                locator_domain,
                page_url,
                location_name,
                street_address,
                city,
                state,
                postal,
                country_code,
                store_number,
                phone,
                location_type,
                latitude,
                longitude,
                hours_of_operation,
            ]
            out.append(row)
    return out
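# --- Hedged refactor sketch (not part of the original function) -------------
# fetch_data() above rebuilds an SgRequests session and header dict for every
# state, city and store page. A single shared session and a small helper would
# behave the same while reusing one connection pool; the names here are
# assumptions, not part of the original scraper.
SHARED_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Connection": "keep-alive",
}


def get_tree(session, url, **kwargs):
    # Fetch a page with the shared headers and parse it the same way as above.
    return html.fromstring(session.get(url, headers=SHARED_HEADERS, **kwargs).text)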
citys = []
states = []
zips = []
country_codes = []
store_numbers = []
phones = []
location_types = []
latitudes = []
longitudes = []
hours_of_operations = []

search = DynamicZipSearch(
    country_codes=[SearchableCountries.USA], max_search_results=100
)
store_types = {"Pharmacy": "C", "Marketplace": "M", "Healthcare Clinic": "LC"}
session = SgRequests()
x = 0
for code in search:
    url = "https://www.picknsave.com/stores/search?searchText=" + code
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
    }
websites = []
locations = []
names = []
addresses = []
citys = []
states = []
zips = []
countrys = []
stores = []
phones = []
location_types = []
latitudes = []
longitudes = []
hours_op = []

session = SgRequests()
search = DynamicGeoSearch(country_codes=[SearchableCountries.USA])
base_url = "https://www.coffeebean.com/store-locator"

# Country search
locs = []
for x in range(101):
    params = {"field_country_value": "USA", "page": x}
    r = session.get(base_url, params=params).text
    soup = bs(r, "html.parser")
    view_store = soup.find_all("a", attrs={"class": "view-store"})
    for item in view_store:
        locs.append(item["href"])

# Lat Lng Boundary search
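# --- Hedged sketch of the boundary search (not part of the original file) ---
# The comment above introduces a latitude/longitude sweep. Assuming the
# DynamicGeoSearch instance yields (lat, lng) pairs and that the locator page
# accepts proximity parameters (the parameter names below are guesses, not
# taken from the site), the loop would mirror the country search above.
for lat, lng in search:
    params = {
        "field_geofield_distance[origin][lat]": lat,  # assumed parameter name
        "field_geofield_distance[origin][lon]": lng,  # assumed parameter name
    }
    r = session.get(base_url, params=params).text
    soup = bs(r, "html.parser")
    for item in soup.find_all("a", attrs={"class": "view-store"}):
        if item["href"] not in locs:
            locs.append(item["href"])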
"city", "state", "zip_postal", "country_code", "store_number", "phone", "location_type", "latitude", "longitude", "locator_domain", "hours_of_operation", "brand_website", ] session = SgRequests().requests_retry_session() log = sglog.SgLogSetup().get_logger(logger_name=website) driver = SgChrome( is_headless=True, executable_path=ChromeDriverManager().install() ).driver() def fetchStores(): stores = [] response = session.get(website + "/sitemaps/profile.xml", headers=headers) root = ET.fromstring(response.text) for elem in root: for var in elem: if "loc" in var.tag: stores.append(var.text) return stores
def fetch_data():
    # Your scraper here
    session = SgRequests()
    items = []
    scraped_items = []

    DOMAIN = "dreamdoors.co.uk"
    start_url = "https://www.dreamdoors.co.uk/kitchen-showrooms"

    all_codes = DynamicZipSearch(
        country_codes=[SearchableCountries.BRITAIN],
        max_radius_miles=10,
        max_search_results=None,
    )
    for code in all_codes:
        formdata = {
            "option": "com_ajax",
            "module": "dreamdoors_store_finder",
            "postcode": code,
            "format": "raw",
        }
        headers = {
            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }
        response = session.post(start_url, data=formdata, headers=headers)
        if response.status_code != 200:
            continue
        data = json.loads(response.text)
        for poi in data:
            if isinstance(poi, str):
                continue
            store_url = poi["url"]
            if store_url in scraped_items:
                continue
            loc_response = session.get(store_url)
            loc_dom = etree.HTML(loc_response.text)

            location_name = poi["name"]
            location_name = location_name if location_name else "<MISSING>"
            raw_address = loc_dom.xpath('//div[@class="address"]//text()')
            raw_address = [elem.strip() for elem in raw_address if elem.strip()]
            addr = parse_address_intl(" ".join(raw_address).replace("Address", ""))
            if addr.street_address_2:
                street_address = f"{addr.street_address_2} {addr.street_address_1}"
            else:
                street_address = addr.street_address_1
            street_address = street_address if street_address else "<MISSING>"
            if "Coming Soon" in street_address:
                continue
            city = addr.city
            city = city if city else "<MISSING>"
            if "Tbc" in city:
                street_address = city
                city = "<MISSING>"
            state = "<MISSING>"
            zip_code = addr.postcode
            zip_code = zip_code if zip_code else "<MISSING>"
            country_code = addr.country
            country_code = country_code if country_code else "<MISSING>"
            store_number = poi["id"]
            store_number = store_number if store_number else "<MISSING>"
            phone = loc_dom.xpath('//a[@id="showroom-phone"]/text()')
            phone = phone[0] if phone else "<MISSING>"
            location_type = "<MISSING>"
            hoo = loc_dom.xpath('//div[@class="opening_times"]//text()')
            hoo = [elem.strip() for elem in hoo if elem.strip()]
            hours_of_operation = (
                " ".join(hoo[2:]).split(" Call ")[0] if hoo else "<MISSING>"
            )
            geo = re.findall(
                r'.map_initialize\("map_canvas", ".+", (.+?)\)', loc_response.text
            )
            latitude = "<MISSING>"
            longitude = "<MISSING>"
            if geo:
                geo = geo[0].split(", ")
                latitude = geo[0]
                longitude = geo[1]
            else:
                with SgFirefox() as driver:
                    driver.get(store_url)
                    sleep(10)
                    loc_dom = etree.HTML(driver.page_source)
                    geo = loc_dom.xpath('//a[contains(@href, "maps/@")]/@href')
                    if geo:
                        geo = geo[0].split("maps/@")[-1].split(",")[:2]
                        latitude = geo[0]
                        longitude = geo[1]

            item = [
                DOMAIN,
                store_url,
                location_name,
                street_address,
                city,
                state,
                zip_code,
                country_code,
                store_number,
                phone,
                location_type,
                latitude,
                longitude,
                hours_of_operation,
            ]
            if store_url not in scraped_items:
                scraped_items.append(store_url)
                items.append(item)

    return items