# scrape.py — scrape white2tea product listings and save them to disk.
from catalog import find_year_urls, find_product_urls
from product import load_single_page, find_options, find_name, NoMaxQuantity
from storage import save
# Category index pages whose per-year listing pages must be discovered first.
category_urls = [
    "http://www.white2tea.com/tea-shop/product-category/raw-puer-tea/misc-raw-puer-tea/",
    "http://www.white2tea.com/tea-shop/product-category/white2tea-raw-puer-tea/"
]
# Listing pages that already link to products directly.
year_urls = [
    # Teas - Not under any category
    "http://www.white2tea.com/tea-shop/product-category/view-all-ripe-puer-teas/",
    "http://www.white2tea.com/tea-shop/product-category/black-tea/",
    "http://www.white2tea.com/tea-shop/product-category/oolong-tea/",
    "http://www.white2tea.com/tea-shop/product-category/tea-sample-sets/",
    "http://www.white2tea.com/tea-shop/product-category/white-tea/",
    # Accessories!
    "http://www.white2tea.com/tea-shop/product-category/teaware-and-tea-accessories/teacups-teaware-and-tea-accessories/",
    "http://www.white2tea.com/tea-shop/product-category/teaware-and-tea-accessories/teapots/"
]
output_filename = "output"
data = {"products": []}

# Expand each category index into its per-year listing pages.
year_urls += [url
              for category_url in category_urls
              for url in find_year_urls(category_url)]

# Gather every product link from every listing page.
product_urls = [url
                for listing_url in year_urls
                for url in find_product_urls(listing_url)]
print("done finding product urls")

# Scrape each product page; skip (and report) products with no max quantity.
for url in product_urls:
    try:
        page = load_single_page(url)
        data["products"].append({
            "name": find_name(page),
            "options": find_options(page)
        })
    except NoMaxQuantity as err:
        print(str(err).upper())

save(data, output_filename)