/
FatBoyPipeline.py
59 lines (50 loc) · 2.02 KB
/
FatBoyPipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.contrib.exporter import CsvItemExporter
import os
import csv
from system_config import system_config
class FatBoyPipeline(object):
    """Scrapy item pipeline that appends Craigslist posts to a CSV file.

    Deduplicates by post URL across runs (by reloading URLs from the
    existing CSV) and within a run (by tracking exported URLs), and
    resolves each post's city from its craigslist subdomain.
    """

    def __init__(self):
        self.files = {}
        self.filename = '_data/craigslist.csv'
        # URLs already written to the CSV: seeded from disk on spider open,
        # extended as items are exported during the crawl.
        self.imported_posts = set()
        # BUG FIX: the original used map(lambda x: {x['url_part']: x['name']}, ...),
        # which produces a *list of one-entry dicts* — so the membership test and
        # lookup in decidePostCity could never succeed. Build one flat dict.
        self.city_dic = {
            source['url_part']: source['name']
            for source in system_config['cities_data_sources']
        }

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy factory hook: wire spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the CSV for appending; emit a header only for a new file."""
        include_header = not os.path.isfile(self.filename)
        if not include_header:
            # Existing file: remember its URLs so reruns don't duplicate rows.
            self.load_existing_posts(self.filename)
        # Renamed local (was `file`) to avoid shadowing the builtin.
        output = open(self.filename, 'a+b')
        self.files[spider] = output
        self.exporter = CsvItemExporter(output, include_headers_line=include_header)
        self.exporter.fields_to_export = ['title', 'post_date', 'price', 'city', 'url']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close this spider's CSV handle."""
        self.exporter.finish_exporting()
        output = self.files.pop(spider)
        output.close()

    def process_item(self, item, spider):
        """Export the item unless its URL was already seen; return the item.

        Raises DropItem for duplicate URLs.
        """
        url = item['url']
        if url in self.imported_posts:
            raise DropItem("Item with %s has been imported already." % url)
        # BUG FIX: record the URL so duplicates *within the same crawl* are
        # dropped too — previously only URLs loaded from disk were checked.
        self.imported_posts.add(url)
        item['city'] = self.decidePostCity(url)
        self.exporter.export_item(item)
        return item

    def load_existing_posts(self, path):
        """Seed imported_posts with the 'url' column of an existing CSV."""
        with open(path, 'rt') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                self.imported_posts.add(row['url'])

    def decidePostCity(self, url):
        """Map a post URL's craigslist subdomain to a city name.

        Falls back to the raw subdomain when it is not in city_dic, and to
        the whole URL when '.craigslist.org' is absent.
        """
        # BUG FIX: url[8:] hard-coded an 'https://' prefix, mis-slicing
        # 'http://' URLs. Locate the scheme separator instead.
        scheme_end = url.find('//')
        start = scheme_end + 2 if scheme_end != -1 else 0
        end = url.find('.craigslist.org', start)
        if end == -1:
            # Not a craigslist URL; previously url.index() raised ValueError.
            return url
        subdomain = url[start:end]
        return self.city_dic.get(subdomain, subdomain)