# -*- coding: utf-8 -*-
"""
Created on Tue Oct 27 17:10:33 2020
@author: Anastasia
"""
#Import all packages
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from daterangeparser import parse
import datetime
import requests
import random
import time
import re
import numpy as np


def get_url_list(search_list):
    """Builds a list of Etsy search URLs and a parallel list of category terms
    from a list of search-term strings. Spaces in each term are replaced with '+'.
    """
    url_list = []
    term_list = []
    for term in search_list:
        url_list.append('https://www.etsy.com/uk/search?q=' + re.sub(r"\s", "+", term))
        term_list.append(term)
    return url_list, term_list
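
# A minimal usage sketch for get_url_list (the search terms below are
# illustrative examples, not from the original project):
#
#   urls, terms = get_url_list(["ceramic mug", "wall art"])
#   # urls[0] == 'https://www.etsy.com/uk/search?q=ceramic+mug'
#   # terms   == ['ceramic mug', 'wall art']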


def close_popup(driver):
    """Closes the 'privacy settings' popup on the homepage of search results.
    Takes the Chrome webdriver as an input.
    """
    pop_up_xpath = "//*[@id='gdpr-single-choice-overlay']/div/div[2]/div[2]/button"
    try:
        driver.find_element_by_xpath(pop_up_xpath).click()
    except:
        pass


def open_page(driver, URL):
    """Opens a webpage given a driver and a URL. This function uses 'close_popup'
    to get past the privacy-settings popup. It also accounts for potential IP
    blocks by sleeping for a random amount of time between 10 and 15 seconds
    before each page load, and retries up to three times on a request error.
    """
    for i in range(3):
        try:
            random_sleep_link = random.uniform(10, 15)
            time.sleep(random_sleep_link)
            driver.get(URL)
            time.sleep(3)
            close_popup(driver)
        except requests.exceptions.RequestException:
            random_sleep_except = random.uniform(240, 360)
            print("I've encountered an error! I'll pause for " + str(random_sleep_except/60) + " minutes and try again \n")
            time.sleep(random_sleep_except)
            continue
        else:
            break
    else:
        raise Exception("Something really went wrong here... I'm sorry.")
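
# A minimal sketch of opening a search page. The ChromeDriver setup is an
# assumption for illustration; any Selenium webdriver instance works:
#
#   from selenium import webdriver
#   driver = webdriver.Chrome()
#   urls, terms = get_url_list(["ceramic mug"])
#   open_page(driver, urls[0])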


def get_links(driver):
    """This function finds all of the links to listings on an Etsy page and
    returns a list composed of the text of each href attribute. It takes your
    webdriver as an input.
    """
    link_list = []
    links = driver.find_elements_by_xpath('//a[starts-with(@href, "https://www.etsy.com/uk/listing/")]')
    for link in links:
        link_text = link.get_attribute("href")
        link_list.append(link_text)
    return link_list
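
# Usage sketch, continuing from the open_page() example above:
#
#   listing_links = get_links(driver)
#   # -> list of URLs beginning with 'https://www.etsy.com/uk/listing/'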


def scrape_link_details(driver, link):
    """Opens a link to a listing and scrapes all of the pertinent details. Returns
    1) the number of sales made by the shop, 2) the number of this item currently
    in people's baskets, 3) the description of the item, 4) the average number of
    days between today and when the item arrives, 5) the cost of delivery,
    6) whether returns are accepted, 7) the country where the item is dispatched
    from, and 8) how many images the listing has.
    """
    for i in range(3):
        try:
            random_sleep_link = random.uniform(5, 7)
            time.sleep(random_sleep_link)
            windows_before = driver.current_window_handle
            driver.execute_script("window.open('" + link + "');")
            print('opened window')
            windows_after = driver.window_handles
            new_window = [x for x in windows_after if x != windows_before][0]
            print('got new deets')
            driver.switch_to.window(new_window)
            loaded = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "gnav-search")))
            # Number of sales made by the shop
            try:
                sales = loaded.find_elements_by_xpath("//div[starts-with(@class, 'wt-display-inline-flex-xs wt-align-items-center')]/a/span[1]")
                s = sales[0].text
                num_sales = s.split(" ")[0]
            except:
                num_sales = 0
            # Number of this item currently in people's baskets
            try:
                basket = loaded.find_elements_by_xpath("//p[@class='wt-position-relative wt-text-caption']")
                x = basket[0].text
                y = [int(i) for i in x.split() if i.isdigit()]
                for i in y:
                    num_basket = i
            except:
                num_basket = 0
            # Item description (taken from the meta tag)
            try:
                description = loaded.find_element_by_xpath("//meta[@name='description']")
                descriptions = description.get_attribute("content")
            except:
                descriptions = np.nan
            # Average number of days until the item arrives
            try:
                arrival = loaded.find_element_by_xpath("//*[@id='shipping-variant-div']/div/div[2]/div[1]/div/div[1]/p")
                arrival_range = arrival.text
                start, end = parse(arrival_range)
                average = start + (end - start)/2
                today = datetime.date.today()
                diff = average.date() - today
                days_to_arrival = diff.days
            except:
                days_to_arrival = np.nan
            # Cost of delivery
            try:
                delivery = loaded.find_element_by_xpath("//*[contains(text(), 'Cost to deliver')]/following-sibling::p").text
                if delivery == 'Free':
                    cost_delivery = 0
                else:
                    match = re.search(r'\d{1,3}(?:[.,]\d{3})*(?:[.,]\d{2})', delivery).group(0)
                    cost_delivery = float(match)
            except:
                cost_delivery = np.nan
            # Whether returns are accepted
            try:
                loaded.find_element_by_xpath("//*[contains(text(), 'Accepted')]")
                returns_accepted = 1
            except:
                returns_accepted = 0
            # Country the item is dispatched from
            try:
                dispatch = loaded.find_element_by_xpath("//*[@id='shipping-variant-div']/div/div[2]/div[7]").text
                d_split = dispatch.split(" ")[2:]
                d_join = " ".join(d_split)
                dispatch_from = d_join
            except:
                dispatch_from = np.nan
            # Number of images in the listing
            try:
                images = loaded.find_element_by_xpath("//ul[starts-with(@class, 'wt-list-unstyled wt-display-flex-xs')]")
                i_list = images.find_elements_by_xpath("//li[@class='wt-mr-xs-1 wt-mb-xs-1 wt-bg-gray wt-flex-shrink-xs-0 wt-rounded carousel-pagination-item-v2']")
                count_images = len(i_list)
            except:
                count_images = 1
            driver.close()  # close the listing window
            driver.switch_to.window(windows_before)  # switch back to the parent window handle
            print('switched')
        except requests.exceptions.RequestException:  # if anything weird happens...
            random_sleep_except = random.uniform(240, 360)
            print("I've encountered an error! I'll pause for " + str(random_sleep_except/60) + " minutes and try again \n")
            time.sleep(random_sleep_except)  # sleep the script for a few minutes and...
            continue  # ...start the loop again from the beginning
        else:  # if the try-part works...
            print('broke out of the loop')
            break  # ...break out of the loop
    else:  # if three retries on the try-part don't work...
        raise Exception("Something really went wrong here... I'm sorry.")  # ...raise an exception and stop the script
    return num_sales, num_basket, descriptions, days_to_arrival, cost_delivery, returns_accepted, dispatch_from, count_images
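
# A minimal usage sketch for scrape_link_details, continuing from the
# get_links() example above:
#
#   (num_sales, num_basket, description, days_to_arrival, cost_delivery,
#    returns_accepted, dispatch_from, count_images) = scrape_link_details(driver, listing_links[0])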


def get_main_page(driver, result, term):
    """Scrapes the details of a single result on the search page. This function
    takes 3 arguments as inputs: 1) your webdriver, 2) the element for one
    result (listing card) on the Etsy search page, and 3) the search term you're
    currently scraping for. It returns each of the following: 1) the title of
    the listing, 2) whether the listing is an ad, 3) the name of the listing's
    shop, 4) the star rating of the shop, 5) the number of reviews the shop has,
    6) the price of the item, 7) whether the listing is a bestseller, and
    8) the category of the search (which will be the term you've input).
    """
    titles = result.find_element_by_css_selector("div > a[href]").get_attribute("title")
    shop_name = result.find_element_by_css_selector("p.screen-reader-only").text
    if shop_name[:2] == 'Ad':
        is_ad = 1
    else:
        is_ad = 0
    shop_names = shop_name.split(" ")[-1]
    try:
        star_rating = result.find_element_by_css_selector("span.screen-reader-only").text
        star_ratings = star_rating.split(" ")[0]
    except:
        star_ratings = np.nan
    try:
        num_review = result.find_element_by_css_selector('span.text-body-smaller.text-gray-lighter.display-inline-block.icon-b-1').text
        num_reviews = num_review.strip("()")
    except:
        num_reviews = 0
    prices = result.find_element_by_css_selector('span.currency-value').text
    try:
        result.find_element_by_xpath("//span[@class='wt-badge wt-badge--small wt-badge--status-03']/span[2]")
        bestseller = 1
    except:
        bestseller = np.nan
    category = term
    return titles, is_ad, shop_names, star_ratings, num_reviews, prices, bestseller, category
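
# A minimal usage sketch for get_main_page. The CSS selector used to collect
# the result cards below is a hypothetical placeholder, not part of the
# original project; the real selector depends on Etsy's current search-page markup:
#
#   results = driver.find_elements_by_css_selector("li.wt-list-unstyled")  # hypothetical selector
#   rows = [get_main_page(driver, result, terms[0]) for result in results]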


def next_page(driver, page_counter):
    """Navigates to the next page of search results. Takes the webdriver and the
    current page_counter as arguments to ensure the right next page is located.
    """
    try:
        page = driver.find_element_by_xpath('//a[contains(@data-page,"{}")]'.format(page_counter))
        next_page = page.get_attribute("href")
        for i in range(3):
            try:
                random_sleep_link = 4
                time.sleep(random_sleep_link)
                driver.get(next_page)
            except requests.exceptions.RequestException:
                random_sleep_except = random.uniform(240, 360)
                print("I've encountered an error! I'll pause for " + str(random_sleep_except/60) + " minutes and try again \n")
                time.sleep(random_sleep_except)
                continue
            else:
                break
        else:
            raise Exception("Something really went wrong here... I'm sorry.")
    except:
        pass
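

if __name__ == "__main__":
    # A minimal, hedged end-to-end sketch of how these functions fit together.
    # The webdriver setup, search term, listing limit, and page number below are
    # illustrative assumptions, not part of the original project.
    from selenium import webdriver

    driver = webdriver.Chrome()  # assumes chromedriver is available on PATH
    urls, terms = get_url_list(["ceramic mug"])
    open_page(driver, urls[0])
    listing_links = get_links(driver)
    for link in listing_links[:3]:  # scrape a handful of listings as a demo
        print(scrape_link_details(driver, link))
    next_page(driver, page_counter=2)  # move on to page 2 of the results
    driver.quit()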