Example #1
from sgrequests import SgRequests
from sgselenium import SgChrome


def reset_sessions(base_url, data_url):
    # Warm up a requests session against an Incapsula-protected site:
    # load the page in a headless browser, hit the Incapsula resource URL,
    # then replay each captured browser request's headers until one of
    # them returns real HTML instead of a block page.
    s = SgRequests()

    driver = SgChrome(is_headless=True).driver()
    driver.get(base_url)

    incap_str = "/_Incapsula_Resource?SWJIYLWA=719d34d31c8e3a6e6fffd425f7e032f3"
    incap_url = base_url + incap_str
    s.get(incap_url)

    for request in driver.requests:
        headers = request.headers
        try:
            response = s.get(data_url, headers=headers)
            response_text = response.text

            # A blocked response contains no real markup.
            if len(response_text.split("div")) < 2:
                continue
            return [s, driver, headers, response_text]
        except Exception:
            continue
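A minimal usage sketch; the URLs are placeholders, not from the original scraper:

# Hypothetical invocation of the helper above.
result = reset_sessions(
    "https://www.example.com",
    "https://www.example.com/store-locator",
)
if result:
    session, driver, headers, page_html = result
    driver.quit()  # release the headless browser once usable headers are found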
Example #2
from sgrequests import SgRequests


def fetchSinglePage(data_url, findRedirect=False):
    # `driver`, `website`, `getHoursOfOperation`, and `getPhone` are
    # module-level names in the original scraper.
    session = SgRequests()
    driver.get(data_url)
    incap_str = "/_Incapsula_Resource?SWJIYLWA=719d34d31c8e3a6e6fffd425f7e032f3"
    incap_url = website + incap_str
    session.get(incap_url)

    for attempt in range(10):
        if findRedirect:
            print("find redirect")
        print("try: " + str(attempt))
        for request in driver.requests:
            headers = request.headers
            try:
                response = session.get(data_url, headers=headers)
                response_text = response.text

                if findRedirect and "window.location.replace" in response_text:
                    # Pull the redirect target out of the inline script.
                    try:
                        redirect_url = response_text.split(
                            "window.location.replace('"
                        )[1].split("')")[0]
                        return [session, headers, redirect_url]
                    except Exception:
                        continue
                elif len(response_text.split("div")) < 2:
                    # Block page: retry with the next captured headers.
                    continue
                else:
                    return [
                        session,
                        headers,
                        {
                            "response": response_text,
                            "hours_of_operation": getHoursOfOperation(),
                            "phone": getPhone(session, headers, response_text),
                        },
                    ]
            except Exception:
                continue
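A sketch of the redirect-following mode, assuming `driver` and `website` are initialised at module level as in the original; the URL is a placeholder:

# Hypothetical: resolve a JavaScript redirect first, then fetch the real page.
session, headers, redirect_url = fetchSinglePage(
    "https://www.example.com/location/1", findRedirect=True
)
session, headers, payload = fetchSinglePage(redirect_url)
print(payload["phone"], payload["hours_of_operation"])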
Example #3
        # Tail of the CSV-writing helper from the same module.
        for row in data:
            writer.writerow(row)


@retry(stop=stop_after_attempt(10))
def get_result(url, headers):
    # On failure, swap in a fresh session before re-raising so the
    # retry decorator runs again with a clean one (up to 10 attempts).
    global session
    try:
        return session.get(url, headers=headers)
    except Exception:
        session = SgRequests()
        raise


url_sitemap_main = "https://www.autozone.com/locations/sitemap.xml"
r = session.get(url_sitemap_main, headers=headers, timeout=10)
datar = html.fromstring(bytes(r.text, encoding="utf8"))
url_sub_sitemap = datar.xpath("//sitemap/loc/text()")
logger.info(f"Sitemap URLs: {len(url_sub_sitemap)}")


def get_all_raw_store_urls():
    # Walk every sub-sitemap and collect the individual store page URLs.
    urls_part1_and_part2 = []
    for url_part in url_sub_sitemap:
        r0 = session.get(url_part, headers=headers, timeout=120)
        datar0 = html.fromstring(bytes(r0.text, encoding="utf8"))
        logger.info(f"Scraping All Store URLs from: {url_part}")
        urls_0 = datar0.xpath("//url/loc/text()")
        urls_part1_and_part2.extend(urls_0)
    return urls_part1_and_part2
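The `@retry` decorator above is tenacity's; a minimal sketch of the assumed module-level setup for this snippet:

# Assumed imports and globals for the snippet above.
from tenacity import retry, stop_after_attempt
from lxml import html
from sgrequests import SgRequests

session = SgRequests()
headers = {"user-agent": "Mozilla/5.0"}  # placeholder UA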
Example #4
import asyncio
import time

from sgrequests import SgRequests


async def get_brand(brand_code, brand_name, url):
    # `fetch_data`, `logzilla`, and EXPECTED_TOTAL are module-level names
    # in the original scraper.
    url = url + brand_code

    headers = {
        "authority": "www.radissonhotels.com",
        "method": "GET",
        "path": "/zimba-api/destinations/hotels?brand=" + brand_code,
        "scheme": "https",
        "accept": "application/json, text/plain, */*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-us",
        "referer": "https://www.radissonhotels.com/en-us/destination",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
    }

    session = SgRequests()
    son = session.get(url, headers=headers).json()

    task_list = []
    results = []
    chunk_size = 10
    last_chunk = 0
    last_tick = time.monotonic()
    total_records = len(son["hotels"])
    global EXPECTED_TOTAL
    EXPECTED_TOTAL += total_records

    # Gather detail-page coroutines in chunks of `chunk_size` so progress
    # can be logged between batches.
    for index, record in enumerate(son["hotels"]):
        task_list.append(fetch_data(index, record["overviewPath"]))
        if index % chunk_size == 0 and last_chunk != index:
            last_tick = time.monotonic()
            last_chunk = index
            if task_list:
                z = await asyncio.gather(*task_list)
                for item in z:
                    results.append({
                        "main": son["hotels"][item["index"]],
                        "sub": item,
                        "@type": brand_name,
                    })
                logzilla.info(
                    f"Finished {last_chunk}/{total_records} for brand {brand_name}, "
                    f"last step took {round(time.monotonic() - last_tick, 5)} seconds."
                )
                task_list = []

    # Flush whatever is left after the final full chunk.
    last_tick = time.monotonic()
    if task_list:
        z = await asyncio.gather(*task_list)
        for item in z:
            results.append({
                "main": son["hotels"][item["index"]],
                "sub": item,
                "@type": brand_name,
            })
        logzilla.info(
            f"Finished {total_records}/{total_records} for brand {brand_name}, "
            f"last step took {round(time.monotonic() - last_tick, 5)} seconds."
        )
    return results
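A sketch of driving the coroutine; the brand code and name are hypothetical, and the endpoint prefix follows the `path` header above:

# Hypothetical brand arguments; fetch_data, logzilla, and EXPECTED_TOTAL
# are assumed to be defined at module level, as noted above.
EXPECTED_TOTAL = 0
results = asyncio.run(
    get_brand(
        "rdb",
        "Radisson Blu",
        "https://www.radissonhotels.com/zimba-api/destinations/hotels?brand=",
    )
)
print(len(results))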
Example #5
import json

from bs4 import BeautifulSoup as bs
from sgrequests import SgRequests

# `logger` is configured at module level in the original scraper.

zips = []
country_codes = []
store_numbers = []
phones = []
location_types = []
latitudes = []
longitudes = []
hours_of_operations = []

session = SgRequests()
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}

url = "https://www.potatocornerusa.com"
response = session.get(url, headers=headers).text
soup = bs(response, "html.parser")
all_script = soup.find_all("script")

# Try to parse each <script> body as JSON; failures are logged and skipped.
goods = []
for script in all_script:
    stripped = script.text.strip()
    try:
        data = json.loads(stripped)
        goods.append(data)
    except Exception as ex:
        logger.info(stripped)
        logger.info(ex)
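Since most `<script>` tags are not JSON, a tighter variant (an assumption about the page, not the original's approach) targets JSON-LD blocks only:

# Parse only <script type="application/ld+json"> blocks.
goods = []
for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
    try:
        goods.append(json.loads(script.string or ""))
    except Exception as ex:
        logger.info(ex)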
Example #6
import json

from bs4 import BeautifulSoup
from sgrequests import SgRequests


def fetch_data():
    base_link = "https://www.picknsave.com/storelocator-sitemap.xml"

    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"
    headers = {"User-Agent": user_agent}

    session = SgRequests()

    req = session.get(base_link, headers=headers)
    base = BeautifulSoup(req.text, "lxml")

    items = base.find_all("loc")

    data = []
    locator_domain = "picknsave.com"

    for item in items:
        link = item.text
        if "stores/details" not in link:
            continue

        req = session.get(link, headers=headers)
        base = BeautifulSoup(req.text, "lxml")

        # Each store page embeds its details as JSON-LD.
        script = base.find("script", attrs={"type": "application/ld+json"})
        store = json.loads(script.text.replace("\n", "").strip())

        location_name = store["name"]
        street_address = store["address"]["streetAddress"]
        city = store["address"]["addressLocality"]
        state = store["address"]["addressRegion"]
        zip_code = store["address"]["postalCode"]
        country_code = "US"
        store_number = link.split("/")[-1]
        location_type = "<MISSING>"
        phone = store["telephone"]
        hours_of_operation = store["openingHours"][0]
        latitude = store["geo"]["latitude"]
        longitude = store["geo"]["longitude"]

        data.append([
            locator_domain,
            link,
            location_name,
            street_address,
            city,
            state,
            zip_code,
            country_code,
            store_number,
            phone,
            location_type,
            latitude,
            longitude,
            hours_of_operation,
        ])

    return data
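The rows line up with the usual locator CSV columns; a sketch of writing them out with the standard library:

import csv

# Column order assumed to match the row layout built above.
fields = ["locator_domain", "page_url", "location_name", "street_address",
          "city", "state", "zip", "country_code", "store_number", "phone",
          "location_type", "latitude", "longitude", "hours_of_operation"]

with open("data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    writer.writerows(fetch_data())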
Example #7
from lxml import html
from sgrequests import SgRequests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
    "TE": "Trailers",
}
# The state and store pages use the same headers plus a Referer.
DETAIL_HEADERS = dict(HEADERS, Referer="https://order.citybbq.com/locations")
# Cookie values captured from a browser session in the original scraper.
COOKIES = {
    "_gcl_au": "1.1.1275666536.1616147724",
    "_ga": "GA1.2.1565131436.1616147732",
    "_gid": "GA1.2.169092942.1616147732",
    "_fbp": "fb.1.1616147732783.1672002159",
    "__cfduid": "d51d0f4f8d1b467178bce7dd202af32771616149617",
}


def fetch_data():
    out = []
    locator_domain = "https://www.citybbq.com"
    api_url = "https://order.citybbq.com/locations"
    session = SgRequests()

    r = session.get(api_url, headers=HEADERS)
    tree = html.fromstring(r.text)
    states = tree.xpath('//ul[@id="ParticipatingStates"]/li')
    for i in states:
        url1 = "".join(i.xpath(".//a/@href"))
        url1 = f"https://order.citybbq.com{url1}"
        session = SgRequests()  # fresh session per state page, as in the original
        r = session.get(url1, headers=DETAIL_HEADERS, cookies=COOKIES)
        trees = html.fromstring(r.text)
        stores = trees.xpath("//h2")
        for n in stores:
            page_url = "".join(n.xpath(".//a/@href"))
            session = SgRequests()  # and per store page
            r = session.get(page_url, headers=DETAIL_HEADERS)
            tree = html.fromstring(r.text)
            location_name = "".join(tree.xpath("//h1/text()")).replace("\n", "").strip()
            street_address = "".join(
                tree.xpath('//span[@class="street-address"]/text()')
            ).replace("\n", "").strip()
            phone = "".join(
                tree.xpath('//span[@class="tel"]/text()')
            ).replace("\n", "").strip()
            city = "".join(
                tree.xpath('//span[@class="locality"]/text()')
            ).replace("\n", "").strip()
            state = "".join(
                tree.xpath('//span[@class="region"]/text()')
            ).replace("\n", "").strip()
            country_code = "US"
            store_number = "<MISSING>"
            latitude = "".join(tree.xpath('//span[@class="latitude"]/span/@title'))
            longitude = "".join(tree.xpath('//span[@class="longitude"]/span/@title'))
            location_type = "<MISSING>"
            hours_of_operation = tree.xpath(
                '//dl[@id="available-business-hours-popover"]//text()'
            )
            hours_of_operation = " ".join(
                filter(None, [a.strip() for a in hours_of_operation])
            )
            postal = "".join(
                tree.xpath('//span[@class="postal-code"]/text()')
            ).replace("\n", "").strip()
            row = [
                locator_domain,
                page_url,
                location_name,
                street_address,
                city,
                state,
                postal,
                country_code,
                store_number,
                phone,
                location_type,
                latitude,
                longitude,
                hours_of_operation,
            ]
            out.append(row)

    return out
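The hours extraction above strips and rejoins the popover text; the same idiom as a single expression, for reference:

hours_of_operation = " ".join(
    a.strip()
    for a in tree.xpath('//dl[@id="available-business-hours-popover"]//text()')
    if a.strip()
)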
Example #8
# `session`, `search`, `bs`, and `logger` are defined earlier in the
# original scraper.
for code in search:
    url = "https://www.picknsave.com/stores/search?searchText=" + code
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
    }

    response = session.get(url, headers=headers).text
    soup = bs(response, "html.parser")

    # The store payload is embedded in the third-from-last <script> tag
    # inside a JSON.parse('...') call; unwrap it and fix the escaping.
    data = soup.find_all("script")[-3].text.strip()
    logger.info(data)
    data = data.split("parse(")[1]
    data = data.split("')")[0][1:]
    data = data.replace("\\", "\\\\").replace("\\\\\\\\\"", "")
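After the unescaping step the payload should be plain JSON; a hedged final step (the structure is page-specific, so inspect it before indexing into it):

# The unwrapped string should now parse; log the top-level shape first.
payload = json.loads(data)
logger.info(type(payload))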
Example #9
from bs4 import BeautifulSoup as bs
from sgrequests import SgRequests
from sgzip.dynamic import DynamicGeoSearch, SearchableCountries

phones = []
location_types = []
latitudes = []
longitudes = []
hours_op = []

session = SgRequests()
search = DynamicGeoSearch(country_codes=[SearchableCountries.USA])

base_url = "https://www.coffeebean.com/store-locator"

# Country search: page through the US listing and collect store links.
locs = []
for x in range(101):
    params = {"field_country_value": "USA", "page": x}
    r = session.get(base_url, params=params).text
    soup = bs(r, "html.parser")
    view_store = soup.find_all("a", attrs={"class": "view-store"})
    for item in view_store:
        locs.append(item["href"])

# Lat/lng boundary search: repeat the sweep with a fixed bounding box.
base_url = "https://www.coffeebean.com/store-locator?field_geo_location_boundary%5Blat_north_east%5D=47.56&field_geo_location_boundary%5Blng_north_east%5D=69.44&field_geo_location_boundary%5Blat_south_west%5D=16.11&field_geo_location_boundary%5Blng_south_west%5D=-178.85"
for x in range(101):
    params = {"page": x}

    r = session.get(base_url, params=params).text

    soup = bs(r, "html.parser")
    view_store = soup.find_all("a", attrs={"class": "view-store"})
    for item in view_store:
        locs.append(item["href"])
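Both passes can return the same store; a small dedupe before fetching details keeps order while dropping repeats:

# dict.fromkeys preserves insertion order (Python 3.7+) while deduplicating.
locs = list(dict.fromkeys(locs))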
Example #10
def fetch_data():
    # SgRequests, DynamicZipSearch, SearchableCountries, etree, json, re,
    # sleep, SgFirefox, and parse_address_intl are imported at module level
    # in the original scraper.
    session = SgRequests()

    items = []
    scraped_items = []

    DOMAIN = "dreamdoors.co.uk"
    start_url = "https://www.dreamdoors.co.uk/kitchen-showrooms"

    all_codes = DynamicZipSearch(
        country_codes=[SearchableCountries.BRITAIN],
        max_radius_miles=10,
        max_search_results=None,
    )
    for code in all_codes:
        # The store finder is a Joomla AJAX endpoint keyed on a postcode.
        formdata = {
            "option": "com_ajax",
            "module": "dreamdoors_store_finder",
            "postcode": code,
            "format": "raw",
        }
        headers = {
            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }
        response = session.post(start_url, data=formdata, headers=headers)
        if response.status_code != 200:
            continue
        data = json.loads(response.text)

        for poi in data:
            if isinstance(poi, str):
                continue
            store_url = poi["url"]
            if store_url in scraped_items:
                continue

            loc_response = session.get(store_url)
            loc_dom = etree.HTML(loc_response.text)
            location_name = poi["name"] if poi["name"] else "<MISSING>"
            raw_address = loc_dom.xpath('//div[@class="address"]//text()')
            raw_address = [elem.strip() for elem in raw_address if elem.strip()]
            addr = parse_address_intl(" ".join(raw_address).replace("Address", ""))
            if addr.street_address_2:
                street_address = f"{addr.street_address_2} {addr.street_address_1}"
            else:
                street_address = addr.street_address_1
            street_address = street_address if street_address else "<MISSING>"
            if "Coming Soon" in street_address:
                continue
            city = addr.city if addr.city else "<MISSING>"
            if "Tbc" in city:
                street_address = city
                city = "<MISSING>"
            state = "<MISSING>"
            zip_code = addr.postcode if addr.postcode else "<MISSING>"
            country_code = addr.country if addr.country else "<MISSING>"
            store_number = poi["id"] if poi["id"] else "<MISSING>"
            phone = loc_dom.xpath('//a[@id="showroom-phone"]/text()')
            phone = phone[0] if phone else "<MISSING>"
            location_type = "<MISSING>"
            hoo = loc_dom.xpath('//div[@class="opening_times"]//text()')
            hoo = [elem.strip() for elem in hoo if elem.strip()]
            hours_of_operation = " ".join(hoo[2:]).split(" Call ")[0] if hoo else "<MISSING>"

            # Coordinates usually appear in an inline map_initialize(...) call;
            # otherwise render the page and read the Google Maps link.
            geo = re.findall(
                r'.map_initialize\("map_canvas", ".+", (.+?)\)', loc_response.text
            )
            latitude = "<MISSING>"
            longitude = "<MISSING>"
            if geo:
                geo = geo[0].split(", ")
                latitude = geo[0]
                longitude = geo[1]
            else:
                with SgFirefox() as driver:
                    driver.get(store_url)
                    sleep(10)
                    loc_dom = etree.HTML(driver.page_source)
                    geo = loc_dom.xpath('//a[contains(@href, "maps/@")]/@href')
                    if geo:
                        geo = geo[0].split("maps/@")[-1].split(",")[:2]
                        latitude = geo[0]
                        longitude = geo[1]

            scraped_items.append(store_url)
            items.append([
                DOMAIN,
                store_url,
                location_name,
                street_address,
                city,
                state,
                zip_code,
                country_code,
                store_number,
                phone,
                location_type,
                latitude,
                longitude,
                hours_of_operation,
            ])

    return items
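For reference, the address splitting in isolation; this assumes `parse_address_intl` comes from sgscrape's sgpostal module, as in scrapers of this family, and the sample address is made up:

from sgscrape.sgpostal import parse_address_intl  # assumed import path

addr = parse_address_intl("12 High Street Exampletown AB1 2CD United Kingdom")
print(addr.street_address_1, addr.city, addr.postcode, addr.country)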