forked from sbourdelin/scrapyjiji
/
scrapyjiji.py
executable file
·213 lines (171 loc) · 7.58 KB
/
scrapyjiji.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
#
# Copyright (C) 2016 Sebastien Bourdelin <sebastien.bourdelin@gmail.com>
#
import scrapy
import re
import geocoder
import folium
import datetime
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
# Settings
MAX_POST_DAYS = 15
MAP_ZOOM = 13
MIN_PRICE = 0
MAX_PRICE = 700
GREEN_PRICE = 400
ORANGE_PRICE = 550
MAX_PAGE = 10
if False:
MAP_LATLNG = [46.810811, -71.215439] # your city latitude and longitude
START_URL = "http://www.kijiji.ca/b-appartement-condo/ville-de-quebec/c37l1700124?ad=offering&siteLocale=en_CA"
LINKS_NEXT_PAGE = "http://www.kijiji.ca/b-appartement-condo/ville-de-quebec"
LINKS_TO_FOLLOW = "http://www.kijiji.ca/v-appartement-condo-.*"
else:
MAP_LATLNG = [45.545046, -73.716665]# your city latitude and longitude
START_URL = "http://www.kijiji.ca/b-chambres-a-louer-colocataire/grand-montreal/c36l80002?ad=offering&siteLocale=en_CA"
LINKS_NEXT_PAGE = "http://www.kijiji.ca/b-chambres-a-louer-colocataire/grand-montreal"
LINKS_TO_FOLLOW = "http://www.kijiji.ca/v-chambres-a-louer-colocataire/.*"
#http://www.kijiji.ca/v-chambres-a-louer-colocataire/ville-de-montreal/transport-small-moving/1173170938?enableSearchNavigationFlag=true
MAPQUEST_API_KEY = ""
class Appartement(scrapy.Item):
"""Definition of items which went to retrieve"""
url = scrapy.Field()
address = scrapy.Field()
price = scrapy.Field()
title = scrapy.Field()
description = scrapy.Field()
date_listed = scrapy.Field()
num_bathrooms = scrapy.Field()
#num_bedrooms = scrapy.Field()
class Kijiji(CrawlSpider):
"""Definition of the kijiji spider crawler"""
m_map = folium.Map(location=MAP_LATLNG, zoom_start=MAP_ZOOM)
name = "kijiji"
allowed_domains = ["kijiji.ca"]
start_urls = ["%s&price=%d__%d" % (START_URL, MIN_PRICE, MAX_PRICE)]
#start_urls = START_URL
print(start_urls)
#Other way : http://stackoverflow.com/questions/32624033/scrapy-crawl-with-next-page
# rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@title="Next"]',)), callback="parse_item", follow= True),)
#print(rules.)
rules = [
# Extract link
Rule(
LinkExtractor(
allow=[LINKS_TO_FOLLOW]
),
callback='parse_item'),
Rule(
LinkExtractor(
allow=["%s/page-[0-%d]/.*" % (LINKS_NEXT_PAGE, MAX_PAGE)]
)
)
]
def __init__(self, *a, **kw):
"""Attach a callback to the spider_closed signal"""
super(Kijiji, self).__init__(*a, **kw)
dispatcher.connect(self.spider_closed, signals.spider_closed)
def spider_closed(self, spider):
"""Handle the spider_closed event to save the map"""
self.m_map.save('map.html')
def parse_item(self, response):
"""The item parser callback"""
appartement = Appartement()
appartement["url"] = response.url
appartement["address"] = self._extract_field(response, "Address")
appartement["price"] = self._extract_field(response, "Price")
appartement["title"] = self._extract_title(response)
# phone http://stackoverflow.com/questions/6682503/click-a-button-in-scrapy
# from source :: https://github.com/mfournierca/kijiji_scraper/blob/master/kijiji_scraper/spiders/aptmnt_spider.py
appartement["date_listed"] = self._extract_field(response, "Date Listed")
appartement["num_bathrooms"] = self._extract_field(response, "Bathrooms (#)")
appartement["description"] = self._extract_description(response)
#appartement["num_bedrooms"] = self._extract_bedrooms(response)
self.add_marker(appartement)
return appartement
def add_marker(self, appartement):
"""Helper function to add a marker on the map"""
# folium.Marker(self.geocode(appartement),
# popup=self.popup(appartement),
# icon=folium.Icon(color=self.color_price(appartement)),
# ).add_to(self.m_map)
print(appartement)
folium.CircleMarker(location=self.geocode(appartement),
popup=self.popup(appartement),
radius=200,
fill_color=self.color_price(appartement),
fill_opacity=self.delta_ratio_day(date_in=appartement["date_listed"],
denum=MAX_POST_DAYS)).add_to(self.m_map)
def popup(self, appartement):
"""Helper function to create a popup with appartement informations"""
html = """{price}<br>
<a href={url} target=_blank>{title}<a/>
<br>{description}""".format(url=appartement["url"],
title=appartement["title"],
price=appartement["price"],
description=appartement["description"])
iframe = folium.element.IFrame(html=html, width=500, height=100)
popup = folium.Popup(iframe, max_width=500)
return popup
def color_price(self, appartement):
"""Helper function to return a color for marker based on price"""
price = float(appartement["price"][1:])
print(price)
if price < GREEN_PRICE:
color = 'green'
elif price < ORANGE_PRICE:
color = 'orange'
else:
color = 'red'
return color
def geocode(self, appartement):
"""Retrieve the lat and long using different fallback services"""
g = geocoder.google(appartement["address"])
if not g.latlng:
g = geocoder.osm(appartement["address"])
if not g.latlng and MAPQUEST_API_KEY:
g = geocoder.mapquest(appartement["address"], key=MAPQUEST_API_KEY)
# if we can't geocode the address, we return the map center
if g.latlng:
return g.latlng
else:
return MAP_LATLNG
@staticmethod
def _extract_title(response):
"""Retrieve the title based on xpath"""
l = " ".join(response.xpath("//h1/text()").extract())
return l.strip() if l else None
@staticmethod
def _extract_field(response, fieldname):
"""Retrieve the html field based on xpath"""
l = response.xpath("//th[contains(text(), '{0}')]/following::td[1]//./text()".
format(fieldname)).extract()
return l[0].strip() if l else None
@staticmethod
def _clean_string(string):
for i in [",", "\n", "\r", ";", "\\"]:
string = string.replace(i, "")
return string.strip()
def _extract_description(self, response):
l = " ".join(response.xpath("//span[@itemprop='description']/text()").extract())
return self._clean_string(l)
# def _extract_bedrooms(self, response):
# r = re.search(r'kijiji.ca\/v-(\d)-bedroom-apartments-condos', response.url)
# return r.group(1).strip() if r else None
@staticmethod
def delta_ratio_day(date_in, denum):
now = datetime.date.today()
i = datetime.datetime.strptime(date_in, "%d-%b-%y").date()
return 1-min(abs((now - i).days/denum), 1.0) # [0,1]
# Entry point
process = CrawlerProcess({
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(Kijiji)
process.start()