# -*- coding: utf-8 -*-
import os
import uuid

from bs4 import BeautifulSoup as bs

# BaseProcessor, Rule, LinkExtractor and Request are assumed to be provided by
# the surrounding crawler framework; adjust these imports to match your project.


class MezituProcessor(BaseProcessor):
    spider_id = 'mzitu'
    spider_name = 'mzitu'
    allowed_domains = ['mzitu.com', 'meizitu.net']
    start_requests = [Request(url='http://www.mzitu.com/xinggan/')]

    rules = (
        # Image URLs: highest priority, downloaded and saved to disk
        # (dots escaped so they match literally).
        Rule(LinkExtractor(
            regex_str=r"http://i\.meizitu\.net/\d{4}/\d{2}/[0-9a-z]+\.jpg"),
            callback="save", priority=3),
        # Gallery pages and their paginated sub-pages.
        Rule(LinkExtractor(regex_str=r"http://www\.mzitu\.com/\d+"), priority=1),
        Rule(LinkExtractor(regex_str=r"http://www\.mzitu\.com/\d+/\d+"), priority=2),
        # Category list pagination: lowest priority.
        Rule(LinkExtractor(regex_str=r"http://www\.mzitu\.com/xinggan/page/\d+"), priority=0),
    )

    def save(self, response):
        if response.m_response:
            if not os.path.exists("img"):
                os.mkdir("img")
            # Name each image with a UUID to avoid filename collisions.
            with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs:
                fs.write(response.m_response.content)
                print("download success!")
class FeProcessor(BaseProcessor):
    spider_id = 'fe'
    spider_name = 'fe'
    allowed_domains = ['58.com']
    start_requests = [Request(url='http://www.58.com/daikuan/changecity/')]

    rules = (
        # City subdomains for the loan channel (dots escaped to match literally).
        Rule(LinkExtractor(regex_str=r"http://[a-z]+\.58\.com/daikuan/"), priority=0),
        # Pagination within a city.
        Rule(LinkExtractor(regex_str=r"/daikuan/pn\d+/"), priority=1),
        # Listing detail links, selected by CSS; parse and print the page title.
        Rule(LinkExtractor(css_str="table.small-tbimg a.t"), priority=3, callback='save'),
    )

    def save(self, response):
        if response.m_response:
            print(bs(response.m_response.content, 'lxml').title.string)
class CityLocationProcessor(BaseProcessor):
    spider_id = 'city'
    spider_name = 'city'
    allowed_domains = ['supfree.net']
    start_requests = [Request(url='http://jingwei.supfree.net/')]

    rules = (
        Rule(LinkExtractor(regex_str=r"kongzi\.asp\?id=\d+"), priority=0),
        Rule(LinkExtractor(regex_str=r"mengzi\.asp\?id=\d+"), priority=1,
             only_first=True, callback='save'),
    )

    def save(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, 'lxml')
            # The first <p> holds "province city area"; fall back when a level
            # is missing (some entries list only two names, or one).
            name = soup.select("div.cdiv p")[0].string.strip().split(' ')
            if len(name) > 2:
                province, city, area = name[0], name[1], name[2]
            elif len(name) > 1:
                province, city, area = name[0], name[0], name[1]
            else:
                province = city = area = name[0]
            # The second <p> holds longitude and latitude in two <span>s.
            spans = soup.select("div.cdiv p")[1].select("span")
            lo = spans[0].string.strip()
            la = spans[1].string.strip()
            data = province + ',' + city + ',' + area + ',' + lo + ',' + la
            print(data)
            with open('city.txt', 'a+') as fs:
                fs.write(data + '\n')
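# A minimal runner sketch, for completeness. It assumes the surrounding
# framework exposes a SpiderCore that wraps a processor and schedules its
# start_requests through the rules above (as in Sasila-style crawlers); the
# import path and the SpiderCore/start() API are assumptions, not confirmed
# by this file -- adjust to whatever your framework actually provides.
if __name__ == '__main__':
    from sasila.system_normal.spider.spider_core import SpiderCore  # assumed path

    spider = SpiderCore(MezituProcessor())
    spider.start()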