class KijijiRideshareSpider(CrawlSpider):
    """
    *A scrapy crawler to extract rideshare information from kijiji
    *Crawls the kijiji web pages to find rideshare information
    *Currently only supports Ottawa
    """
    name = "kijiji_rideshare_spider"
    allowed_domains = ['kijiji.ca']
    start_urls = ["http://www.kijiji.ca/b-rideshare-carpool/ottawa/c5l1700185"]

    rules = [
        Rule(LinkExtractor(
            allow=['http://www.kijiji.ca/v-rideshare-carpool/ottawa/.+']),
            callback='parse_rideshare'),
        Rule(LinkExtractor(allow=[
            "http://www.kijiji.ca/b-rideshare-carpool/ottawa/page-[0-9]/.+"
        ])),
    ]

    def parse_item(self, response):
        """
        *An earlier version of the code that uses the hxs selector
        *Based on code from GitHub: mjhea0/Scrapy-Samples
        *Not used currently, but kept for debugging and as an initial reference
        """
        selection = scrapy.Selector(response)
        titles = selection.xpath("//td[@class='description']")
        result = []
        for title in titles:
            item = items.KijijiRideshareItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            result.append(item)
        return result

    def parse_rideshare(self, response):
        """
        Parses and stores the required rideshare information
        """
        rideshare_item = items.kijijiRideshareData()
        rideshare_item["url"] = response.url
        rideshare_item["title"] = self._extract_title(response)
        rideshare_item["date_listed"] = self._extract_field(response, "Date Listed")
        rideshare_item["address"] = self._extract_field(response, "Address")
        rideshare_item["phone_number"] = self._extract_phone_number(response)
        rideshare_item["full_text"] = self._extract_full_text(response)
        return rideshare_item

    def _extract_title(self, response):
        text = " ".join(response.xpath("//h1/text()").extract())
        return self._clean_string(text)

    def _extract_full_text(self, response):
        text = " ".join(
            response.xpath("//span[@itemprop='description']/text()").extract())
        return self._clean_string(text)

    def _extract_phone_number(self, response):
        return "613"

    def _extract_field(self, response, fieldname):
        values = response.xpath(
            "//th[contains(text(), '{0}')]/following::td[1]//./text()".format(
                fieldname)).extract()
        return values[0].strip() if values else None

    def _clean_string(self, string):
        for char in [",", "\n", "\r", ";", "\\"]:
            string = string.replace(char, "")
        return string.strip()
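
# The spider above references items.KijijiRideshareItem and items.kijijiRideshareData,
# which are not defined in this file. A minimal sketch of what such an items module
# could look like, assuming only the field names used in parse_item/parse_rideshare;
# the actual project may define these differently.
import scrapy


class KijijiRideshareItem(scrapy.Item):
    # fields used by the legacy parse_item callback
    title = scrapy.Field()
    link = scrapy.Field()


class kijijiRideshareData(scrapy.Item):
    # fields populated by parse_rideshare
    url = scrapy.Field()
    title = scrapy.Field()
    date_listed = scrapy.Field()
    address = scrapy.Field()
    phone_number = scrapy.Field()
    full_text = scrapy.Field()
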
class XhamsterSpider(CrawlSpider):
    name = "xhamster"
    allowed_domains = ["xhamster.com"]
    start_urls = ["http://xhamster.com/channels.php"]

    rules = (
        Rule(SgmlLinkExtractor(allow=['/movies/\d+/.*']),
             callback='parse_video'),
        Rule(SgmlLinkExtractor(deny=[
            '/webcam(.*)',
            '/cam(.*)',
            '/start(.*)',
            '/games(.*)',
            '/stories(.*)',
            '/dating(.*)',
            '/photos(.*)',
            '/information(.*)',
        ],
                               allow_domains=["xhamster.com"]),
             follow=True),
    )

    def parse_video(self, response):
        hxs = HtmlXPathSelector(response)
        video = VideoItem()
        video['masturbator'] = self.name

        url_parsed = urlparse(response.url)
        video['remote_url'] = "{0}://{1}{2}".format(url_parsed.scheme,
                                                    url_parsed.netloc,
                                                    url_parsed.path)
        try:
            url_re_result = url_re.search(video['remote_url'])
            video['remote_id'] = int(url_re_result.group(1))
        except (AttributeError, ValueError):
            # url_re did not match or the captured group is not numeric
            return None

        video['title'] = first_or_none(
            hxs.select(
                "//div[@id='playerBox']//h2[@class='gr']/text()").extract())
        if not video['title']:
            return None
        else:
            video['title'] = video['title'].strip()

        remote_date = first_or_none(
            hxs.select(
                "//td[@id='videoUser']//span[@class='hint']/@hint").extract())
        if remote_date:
            video['remote_date'] = datetime.datetime.strptime(
                remote_date, "%Y-%m-%d %H:%M:%S %Z")

        duration = first_or_none(
            hxs.select(
                "//td[@id='videoUser']//div[span[text()='Runtime:']]/text()"
            ).extract())
        if duration:
            duration = duration.strip().split(":")
            video['duration'] = int(duration[0]) * 60 + int(duration[1])

        video['tags'] = set()
        video['thumbs'] = set()
        video['stars'] = set()
        for tag in hxs.select("//td[@id='channels']//a/text()").extract():
            video['tags'].add(tag.lower().strip())

        id_str_part = str(video['remote_id'])[-3:]
        # note: this template is currently unused; the loop below builds the thumbnail URLs directly
        thumb_pattern_url = "http://et1.xhamster.com/t/{id_part}/{number}_{id}.jpg"
        for i in range(1, 11):
            video['thumbs'].add(
                "http://et0.xhamster.com/t/{0}/{1}_{2}.jpg".format(
                    id_str_part, i, video['remote_id']))
        return video
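
# parse_video above relies on a first_or_none() helper and a url_re pattern that are
# defined elsewhere in the project. A plausible minimal sketch, assuming the remote id
# is the numeric segment of /movies/<id>/... URLs; both names and the regex are assumptions.
import re

url_re = re.compile(r'/movies/(\d+)/')


def first_or_none(values):
    # Return the first element of an extract() result, or None when it is empty.
    return values[0] if values else None
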
class FunTVSpider(CrawlSpider):
    """
    Crawls variety-show listings from fun.tv (Funshion)
    """
    name = 'fun_variety'
    allowed_domains = ['fun.tv']
    start_urls = [
        'http://www.fun.tv/retrieve/c-e7bbbce889ba.n-e5bdb1e78987.o-mf.pg-1'
    ]

    rules = [
        Rule(sle(allow=('/retrieve/c-e7bbbce889ba.n-e5bdb1e78987.o-mf.pg-\d+$', )),
             follow=True,
             callback='parse1'),
    ]

    def parse1(self, response):
        sel = Selector(response)
        tv_list = sel.css(
            'body div.mod-list.page-wrap div div.mod-wrap-in.mod-vd-lay.fix div.mod-vd-i'
        )
        for tv in tv_list:
            tv_id = tv.css('div.info h3 a::attr(data-id)').extract()[0]
            if db_session.query(FunVideo).filter(FunVideo.id == tv_id).first():
                continue
            name = tv.css('div.info h3 a::attr(title)').extract()[0]
            image = tv.css('div.pic a img::attr(_lazysrc)').extract()[0]
            description = tv.css('div.info p::text').extract()[0]
            point = tv.css('div.info h3 b::text').extract()[0]

            request = Request('http://www.fun.tv{}'.format(
                tv.css('div.pic a::attr(href)').extract()[0]),
                              callback=self.parse2)
            fv = FunVideo(id=tv_id,
                          name=name,
                          name_pinyin=pinyin.get_initials(name, splitter=''),
                          image=image,
                          description=description,
                          point=point)
            request.meta['tv'] = fv
            yield request

    def parse2(self, response):
        tv = response.meta['tv']
        sel = Selector(response)
        tv.origin_url = response.url

        p_dirsort = sel.css('div#main-rt div.mod-datum p.dirsort')
        for p in p_dirsort:
            p_type = p.css('::text').extract()[0]
            if u'导演' in p_type:    # director
                tv.director = ''.join(p.css('span::text').extract())
            elif u'主演' in p_type:  # starring
                tv.starring = ''.join(p.css('span::text').extract())
            elif u'类型' in p_type:  # genre
                tv.category = ''.join(p.css('span::text').extract())

        tv.detail = sel.css(
            'div#main-rt div.mod-datum p.dirtext span:nth-child(2)::text'
        ).extract()[0]
        print tv.name, '------->', tv.origin_url

        # Mark as a variety show
        tv.type = 3
        db_session.add(tv)
        db_session.commit()
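
# FunTVSpider persists results through db_session and a FunVideo model that are imported
# from elsewhere. A minimal sketch of what such a SQLAlchemy setup could look like; the
# real schema, column types and connection URL are assumptions inferred from the
# attributes the spider assigns.
from sqlalchemy import Column, Integer, String, Text, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import scoped_session, sessionmaker

Base = declarative_base()
engine = create_engine('sqlite:///funtv.db')
db_session = scoped_session(sessionmaker(bind=engine))


class FunVideo(Base):
    __tablename__ = 'fun_video'

    id = Column(Integer, primary_key=True)
    name = Column(String(128))
    name_pinyin = Column(String(128))
    image = Column(String(256))
    description = Column(Text)
    point = Column(String(16))
    origin_url = Column(String(256))
    director = Column(String(128))
    starring = Column(String(256))
    category = Column(String(64))
    detail = Column(Text)
    type = Column(Integer)


Base.metadata.create_all(engine)
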
class ZZSpider(CrawlSpider):
    name = "zz_prilepin"
    allowed_domains = ["livejournal.com"]
    start_urls = [
        #"http://prilepin.livejournal.com/2007/03/"
        #"http://prilepin.livejournal.com/2014/10/"
        "http://prilepin.livejournal.com/2015/07/"
    ]

    rules = (
        Rule(LinkExtractor(
            allow=('prilepin.livejournal.com/\d\d\d\d/\d\d/', ),
            deny=('prilepin.livejournal.com/\d\d\d\d/\d\d/\d\d', 'tag', 'reply',
                  'thread', 'page'),
        ),
             callback='parse_overview',
             follow=True),
        Rule(LinkExtractor(
            allow=('http://prilepin.livejournal.com/\d+\.html', ),
            deny=('tag', 'reply', 'thread', 'page'),
        ),
             callback='parse_page',
             follow=True),
    )

    def parse_start_url(self, response):
        return list(self.parse_overview(response))

    def parse_overview(self, response):
        urls = response.xpath("//dd/a/@href").extract()
        for url in urls:
            yield Request(url, callback=self.parse_page)

    def parse_page(self, response):
        # use scrapy shell to find xpath
        #from scrapy.shell import inspect_response
        #inspect_response(response)
        item = ScraperItem()
        item["url"] = response.url
        item["date"] = response.xpath(
            "//p[@class='entry-footer']/text()").extract()[0]
        item["text"] = " ".join(
            response.xpath(
                "//div[@class='entry-body']/child::node()").extract())
        try:
            item["title"] = response.xpath(
                "//h3[@class='entry-header']/text()").extract()[0]
        except IndexError:
            item["title"] = ""
        try:
            item["comment_count"] = response.xpath(
                "//p[@class='entry-footer']/a[3]/text()").extract()[0]
        except IndexError:
            item["comment_count"] = "0"
        yield item
class Yy138Spider(CrawlSpider):
    name = 'yy138'
    allowed_domains = ['www.yy138.com']
    start_urls = [
        'http://www.yy138.com/android/youxi/',
        'http://www.yy138.com/android/ruanjian/',
        'http://www.yy138.com/wangyou/',
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=['/[a-zA-Z][a-zA-Z0-9]*/']),
             callback='parse_item', follow=False),
        Rule(SgmlLinkExtractor(allow=['/wangyou/']),
             callback='noapk', follow=True),
        Rule(SgmlLinkExtractor(allow=['/\d+/(\d+\.html)*']),
             callback='noapk', follow=True),
        Rule(SgmlLinkExtractor(allow=['/wangyou/zuixin/(\d+\.html)*']),
             callback='noapk', follow=True),
        Rule(SgmlLinkExtractor(allow=['/youxi/']),
             callback='noapk', follow=True),
        Rule(SgmlLinkExtractor(allow=['/youxi/zuixin/(\d+\.html)*']),
             callback='noapk', follow=True),
        Rule(SgmlLinkExtractor(allow=['/ruanjian/']),
             callback='noapk', follow=True),
        Rule(SgmlLinkExtractor(allow=['/ruanjian/zuixin/(\d+\.html)*']),
             callback='noapk', follow=True),
    )

    def noapk(self, response):
        print 'No apk: ', response.url

    def parse_item(self, response):
        print 'There is a new apk: ', response.url
        hxs = HtmlXPathSelector(response)
        i = BaidumarketItem()
        print 'begin:'
        try:
            i['app_name'] = ''.join(hxs.select(
                '//div[@class="column download"]/div[1]/h1[1]/text()').extract())
            i['app_keywords'] = ''.join(hxs.select(
                '//div[@class="intro"]/p[3]/a/text()').extract())
            i['app_url'] = response.url
            i['app_icon_url'] = ''.join(hxs.select(
                '//div[@class="icon"]/img/@src').extract())
            #i['icon_content'] =
            i['app_size'] = ''.join(hxs.select(
                '//*[@id="xiazai"]/div/div[2]/div[2]/div/div/div[1]/div/a/span/text()').extract())
            if i['app_size'] == "":
                i['app_size'] = ''.join(hxs.select(
                    '//*[@id="xiazai"]/div/div/div[2]/div/div/div[1]/div/a/span/text()').extract())
            i['app_version'] = ''.join(hxs.select(
                '//*[@id="xiazai"]/div/div[2]/div[2]/div/div/div[1]/p/text()[2]').extract())[5:]
            if i['app_version'] == "":
                i['app_version'] = ''.join(hxs.select(
                    '//*[@id="xiazai"]/div/div/div[2]/div/div/div[1]/p/text()[2]').extract())[5:]
            i['download_times'] = '0'
            i['download_url'] = ''.join(hxs.select(
                '//*[@id="xiazai"]/div/div[2]/div[2]/div/div/div[1]/div/a/@href').extract())
            if i['download_url'] == '':
                i['download_url'] = ''.join(hxs.select(
                    '//*[@id="xiazai"]/div/div/div[2]/div/div/div[1]/div/a/@href').extract())
            i['app_author'] = 'None'
            i['os_version'] = ''.join(hxs.select(
                '//*[@id="xiazai"]/div/div[2]/div[2]/div/div/div[1]/p/text()[3]').extract())[5:]
            i['app_description'] = ''.join(hxs.select(
                '//div[@class="column introduction"]/div[2]/p/text()').extract())
            if i['app_description'] == '':
                i['app_description'] = ''.join(hxs.select(
                    '//div[@class="column introduction"]/div[2]/text()').extract())
            i['last_update_date'] = '1990-01-01'
            i['app_class'] = ''.join(hxs.select(
                '//div[@class="intro"]/p[2]/a[2]/text()').extract())
            i['app_market'] = u'yy138.com'
            i['market_site'] = 'www.yy138.com'
            i['user_rate'] = ''.join(hxs.select(
                '//div[@class="intro"]/p[1]/span[1]/span/@class').extract())[4]
            i['comments_num'] = '0'
            print i
            return i
        except Exception, e:
            print e
class PlaystoreSpider(CrawlSpider):

    def gen_urls():
        for c in ('ARCADE', 'BRAIN', 'CARDS', 'CASUAL', 'GAME_WALLPAPER',
                  'RACING', 'SPORTS_GAMES', 'GAME_WIDGETS',
                  'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
                  'EDUCATION', 'ENTERTAINMENT', 'FINANCE', 'HEALTH',
                  'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'APP_WALLPAPER',
                  'MEDIA_AND_VIDEO', 'MEDICAL', 'MUSIC_AND_AUDIO',
                  'NEWS_AND_MAGAZINES', 'PERSONALIZATION', 'PHOTOGRAPHY',
                  'PRODUCTIVITY', 'SHOPPING', 'SOCIAL', 'SPORTS', 'TOOLS',
                  'TRANSPORTATION', 'TRAVEL_AND_LOCAL', 'WEATHER',
                  'APP_WIDGETS'):
            yield 'https://play.google.com/store/apps/category/%s/collection/topselling_paid' % c
            yield 'https://play.google.com/store/apps/category/%s/collection/topselling_free' % c

    name = 'playstore'
    allowed_domains = ['play.google.com']
    start_urls = gen_urls()
    reg_start = re.compile('start=([\d]+)')

    rules = (
        #Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
        Rule(SgmlLinkExtractor(allow=r'category/[A-Z_]+\?', deny=r'/accounts/'),
             follow=True, callback='parse_app'),  # categories
        # Rule(SgmlLinkExtractor(allow=r'start=[\d]+&num=[\d]+', deny=r'/accounts/'), follow=True),  # categories
        Rule(SgmlLinkExtractor(allow=r'/collection/', deny=r'editors_choice'),
             follow=True),  # categories
        #parse_app
    )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        m = PlaystoreSpider.reg_start.search(response.url)
        start = 0
        if m:
            start = int(m.group(1))

        artworks = hxs.select(
            '//div[@class="thumbnail-wrapper goog-inline-block"]/a/img/@src'
        ).extract()
        ids = hxs.select(
            '//li[@class="goog-inline-block"]/@data-docid').extract()
        ids += hxs.select(
            '//li[@class="goog-inline-block z-last-child"]/@data-docid'
        ).extract()  # scary!
        names = hxs.select(
            '//div[@class="details goog-inline-block"]/div/a/text()').extract()
        urls = hxs.select(
            '//div[@class="details goog-inline-block"]/div/a/@href').extract()
        reg_cat = re.compile('/category/([\w_]+)(/|\?|/)*')
        category = reg_cat.search(response.url).group(1).replace('_', ' ').title()
        sellers = hxs.select('//span[@class="attribution"]/div/a').extract()
        seller_links = hxs.select(
            '//span[@class="attribution"]/div/a/@href').extract()

        assert not "We're sorry" in response.body
        assert len(artworks) == len(ids) == len(names) == len(urls) == len(
            sellers) == len(seller_links), (len(artworks), len(ids),
                                            len(names), len(urls),
                                            len(sellers), len(seller_links))

        for artwork, id, name, url, seller, seller_link in zip(
                artworks, ids, names, urls, sellers, seller_links):
            i = AppStoreItem()
            i['store'] = 'play'
            i['id'] = id
            i['artwork'] = artwork
            i['category'] = category
            i['url'] = 'https://play.google.com' + url
            i['name'] = name
            i['last_update'] = datetime.date.today().isoformat()
            i['seller'] = seller
            i['seller_link'] = 'https://play.google.com' + seller_link
            yield i

        if start == 0:
            prefix = '?'
            if '?' in response.url:
                prefix = '&'
            for i in range(24, 480 + 1, 24):
                yield Request(response.url + prefix + 'start=%d&num=24' % i)
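
# PlaystoreSpider yields AppStoreItem objects whose definition is not shown here.
# A minimal sketch covering only the fields assigned in parse(); the real item class
# may carry additional fields or processors.
import scrapy


class AppStoreItem(scrapy.Item):
    store = scrapy.Field()
    id = scrapy.Field()
    artwork = scrapy.Field()
    category = scrapy.Field()
    url = scrapy.Field()
    name = scrapy.Field()
    last_update = scrapy.Field()
    seller = scrapy.Field()
    seller_link = scrapy.Field()
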
class ZhihuSpider(CrawlSpider):
    name = "zhihu"
    allowed_domains = ["zhihu.com"]
    start_urls = [
        # "http://www.zhihu.com/",
        # "https://www.zhihu.com/people/hu-shi-wei-63",
        "https://www.zhihu.com/people/hu-shi-wei-63/followees",
    ]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2',
        'Connection': 'keep-alive',
        'Host': 'www.zhihu.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36',
        'Referer': 'http://www.zhihu.com/',
    }

    rules = [
        # Rule(LinkExtractor(allow=("/people/[^/]+/followees$", )),
        #      callback='parse_followees'),
        # Rule(LinkExtractor(allow=("/people/[^/]+/followers$", )),
        #      callback='parse_followers'),
        Rule(LinkExtractor(allow=("/people/[^/]+$", )),
             callback='parse_people_with_rules',
             follow=True),
        Rule(LinkExtractor(allow=('/question/\d+#.*?', )),
             callback='parse_question',
             follow=True),
        Rule(LinkExtractor(allow=('/question/\d+', )),
             callback='parse_question',
             follow=True),
    ]

    # need dfs/bfs
    all_css_rules = {
        '.zm-profile-header': {
            '.zm-profile-header-main': {
                '__use': 'dump',
                'name': '.title-section .name::text',
                'sign': '.title-section .bio::text',
                'location': '.location.item::text',
                'business': '.business.item::text',
                'employment': '.employment.item::text',
                'position': '.position.item::text',
                'education': '.education.item::text',
                'education_extra': '.education-extra.item::text',
            },
            '.zm-profile-header-operation': {
                '__use': 'dump',
                'agree': '.zm-profile-header-user-agree strong::text',
                'thanks': '.zm-profile-header-user-thanks strong::text',
            },
            '.profile-navbar': {
                '__use': 'dump',
                'asks': 'a[href*=asks] .num::text',
                'answers': 'a[href*=answers] .num::text',
                'posts': 'a[href*=posts] .num::text',
                'collections': 'a[href*=collections] .num::text',
                'logs': 'a[href*=logs] .num::text',
            },
        },
        '.zm-profile-side-following': {
            '__use': 'dump',
            'followees': 'a.item[href*=followees] strong::text',
            'followers': 'a.item[href*=followers] strong::text',
        }
    }

    def start_requests(self):
        return [
            Request("https://www.zhihu.com/login/email",
                    meta={'cookiejar': 1},
                    callback=self.post_login)
        ]

    def get_captcha(self):
        s = requests.session()
        captcha_url = 'http://www.zhihu.com/captcha.gif'
        captcha = s.get(captcha_url, stream=True)
        print captcha
        f = open('captcha.gif', 'wb')
        for line in captcha.iter_content(10):
            f.write(line)
        f.close()
        return s

    # FormRequest ran into problems, so the login is submitted with requests instead
    def post_login(self, response):
        print 'Preparing login'
        # Extract the _xsrf field from the returned page; it is required for a successful form submit
        xsrf = Selector(response).xpath(
            '//input[@name="_xsrf"]/@value').extract()[0]
        s = self.get_captcha()
        captcha_str = raw_input('Input captcha:')
        logindata = {
            '_xsrf': xsrf,
            'email': '*****@*****.**',
            'password': '******',
            'rememberme': 'true',
            'captcha': captcha_str
        }
        res = s.post('https://www.zhihu.com/login/email',
                     headers=self.headers,
                     data=logindata)
        cookies = dict(res.cookies)
        for url in self.start_urls:
            yield Request(url, cookies=cookies)

    def traversal(self, sel, rules, item):
        if '__use' in rules:
            for nk, nv in rules.items():
                if nk == '__use':
                    continue
                if nk not in item:
                    item[nk] = []
                if sel.css(nv):
                    item[nk] += [i.extract() for i in sel.css(nv)]
                else:
                    item[nk] = []
        else:
            for nk, nv in rules.items():
                for i in sel.css(nk):
                    self.traversal(i, nv, item)

    def dfs(self, sel, rules, item_class):
        if sel is None:
            return []
        item = item_class()
        self.traversal(sel, rules, item)
        return item

    def parse_with_rules(self, response, rules, item_class):
        return self.dfs(Selector(response), rules, item_class)

    def parse_people_with_rules(self, response):
        info('Parsed ' + response.url)
        item = self.parse_with_rules(response, self.all_css_rules,
                                     ZhihuPeopleItem)
        item['id'] = urlparse(response.url).path.split('/')[-1]
        yield item

    def parse_followers(self, response):
        return self.parse_people_with_rules(response)

    def parse_followees(self, response):
        return self.parse_people_with_rules(response)

    def parse_question(self, response):
        problem = Selector(response)
        item = QuestionItem()
        item['url'] = response.url
        item['name'] = problem.xpath('//span[@class="name"]/text()').extract()
        item['title'] = problem.xpath(
            '//h2[@class="zm-item-title zm-editable-content"]/text()').extract()
        item['description'] = problem.xpath(
            '//div[@class="zm-editable-content"]/text()').extract()
        item['answer'] = problem.xpath(
            '//div[@class="zm-editable-content clearfix"]/text()').extract()
        return item
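
# The all_css_rules structure above drives a generic "CSS rules -> fields" dump: nested
# mappings are walked recursively, and any mapping marked with '__use': 'dump' has its
# remaining keys evaluated as CSS selectors against the current node. A standalone
# sketch of that idea on a tiny HTML snippet (a plain dict stands in for the scrapy Item):
from scrapy.selector import Selector

SAMPLE_HTML = """
<div class="zm-profile-header-main">
  <div class="title-section">
    <span class="name">Alice</span>
    <span class="bio">hello world</span>
  </div>
</div>
"""

SAMPLE_RULES = {
    '.zm-profile-header-main': {
        '__use': 'dump',
        'name': '.title-section .name::text',
        'sign': '.title-section .bio::text',
    },
}


def dump_by_css_rules(sel, rules, item):
    # Simplified version of ZhihuSpider.traversal for illustration only.
    if '__use' in rules:
        for key, css in rules.items():
            if key == '__use':
                continue
            item.setdefault(key, [])
            item[key] += [node.extract() for node in sel.css(css)]
    else:
        for key, sub_rules in rules.items():
            for node in sel.css(key):
                dump_by_css_rules(node, sub_rules, item)


result = {}
dump_by_css_rules(Selector(text=SAMPLE_HTML), SAMPLE_RULES, result)
print result  # name -> [u'Alice'], sign -> [u'hello world']
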
class MonitorSpider(RedisMixin, CrawlSpider):
    # class MonitorSpider(CrawlSpider):
    # class MonitorSpider(BaseSpider):
    # class MonitorSpider(RedisSpider):
    name = "monitorspider"
    redis_key = 'monitorspider:start_urls'
    allowed_domains = [
        "tmall.com", "taobao.com",   # tmall
        "jd.com", "3.cn",            # jd
        "feifei.com",                # feifei
        "yhd.com", "yihaodian.com",  # yihaodian
        "yixun.com",                 # yixun
        "amazon.cn",                 # amazon
    ]
    start_urls = []
    pipeline = ['MongoPipeline']

    rules = (
        Rule(SgmlLinkExtractor(
            allow=(r'detail.tmall.com'),
            restrict_xpaths=("//div[@id='J_ItemList']//p[@class='productTitle']"),
            unique=True),
             callback='parseTmall'),
        Rule(SgmlLinkExtractor(
            allow=(r'list.tmall.com'),
            restrict_xpaths=("//a[@class='ui-page-s-next']"),
            unique=True),
             follow=True),
    )

    def set_crawler(self, crawler):
        CrawlSpider.set_crawler(self, crawler)
        RedisMixin.setup_redis(self)

    def parse_start_url(self, response):
        """ Main parse function """
        url = response.url
        if url.find('detail.tmall.com') > -1:
            return self.parseTmall(response)
        elif url.find('jd.com') > -1:
            return self.parseJd(response)
        elif url.find('feifei.com') > -1:
            return self.parseFeifei(response)
        elif url.find('yhd.com') > -1:
            return self.parseYhd(response)
        elif url.find('yixun.com') > -1:
            return self.parseYixun(response)
        elif url.find('amazon.cn') > -1:
            return self.parseAmazon(response)

    def make_requests_from_url(self, url):
        if url.find('yhd.com') > -1:
            return Request(url, dont_filter=True, cookies={'provinceId': 20})
        elif url.find('yixun.com') > -1:
            return Request(url,
                           dont_filter=True,
                           cookies={
                               'loc': '6_1001_440000_440100_440106_0',
                               'wsid': '1001'
                           })
        else:
            return Request(url, dont_filter=True)

    ######
    #
    # Tmall parser
    #
    def parseTmall(self, response):
        """ Tmall parser """

        def _referer():
            referer = response.request.headers.get('Referer')
            if referer and referer.find('list.tmall.com') > -1:
                rto = 'http://list.tmall.com/search_product.htm?'
                resultC = re.compile('[\?&]cat=(\d+)').search(referer)
                if resultC:
                    rto += 'cat=%s' % resultC.group(1)
                resultQ = re.compile('[\?&]q=([^&]+)').search(referer)
                if resultQ:
                    if resultC:
                        rto += '&q=%s' % resultQ.group(1)
                    else:
                        rto += 'q=%s' % resultQ.group(1)
                if not 'http://list.tmall.com/search_product.htm?' == rto:
                    return rto
            elif not referer and response.url.find('detail.tmall.com') > -1:
                return response.url
            return ''

        sel = Selector(response)
        item = ProductItem()
        item['source'] = 'tmall'
        item['name'] = self.get_product_name(sel)
        item['start_url'] = _referer()

        store = ''.join(
            sel.xpath('//input[@name="seller_nickname"]/@value').extract())
        item['tm_store'] = '[%s] %s' % (store[-3:], store) if len(store) > 3 else store

        try:
            # Extract the TShop string and normalize it so it can be evaluated like JSON
            TShop_str = sel.re('TShop\.Setup\(((.|\n)+?)\);')[0]
            # Strip comments; currently only Tmall supermarket pages have them, starting with a comma
            regex = re.compile(',\s*\/\/[^\n]*')
            TShop_str = re.sub(regex, ',', TShop_str)
            TShop = eval(
                TShop_str,
                type('Dummy', (dict, ), dict(__getitem__=lambda s, n: n))())
        except SyntaxError:
            return

        item['itemId'] = TShop.get('itemDO').get('itemId', '')
        item['url'] = response.url

        initApi_url = TShop.get('initApi')
        yield Request(initApi_url,
                      headers={'Referer': 'http://www.google.com.hk/'},
                      meta={'item': item},
                      dont_filter=True,
                      callback=self.parse_initapi)

    def parse_initapi(self, response):
        """ Handle the initApi link """
        item = response.meta['item']
        try:
            initObj = eval(
                response.body.strip().decode('gbk'),
                type('Dummy', (dict, ), dict(__getitem__=lambda s, n: n))())
            priceInfo = initObj.get('defaultModel').get(
                'itemPriceResultDO').get('priceInfo')
            item['price'] = self.get_default_price(priceInfo)
            item['tm_moonSellCount'] = initObj.get('defaultModel').get(
                'sellCountDO').get('sellCount', 0)
        except:
            print response.body
        finally:
            yield Request(
                'http://dsr.rate.tmall.com/list_dsr_info.htm?itemId=' +
                item['itemId'],
                meta={'item': item},
                dont_filter=True,
                callback=self.parse_comment)

    def parse_comment(self, response):
        """ Handle the link that returns the comment count """
        item = response.meta['item']
        comment = re.findall('rateTotal\":(\d+)', response.body)[0]
        item['comment'] = int(comment) if comment.isdigit() else 0
        yield item

    def get_product_name(self, sel):
        """ Get the product name """
        name_node = sel.xpath('//div[@id="J_DetailMeta"]//h3')
        if len(name_node.xpath('./a')) > 0:
            return name_node.xpath('./a/text()').extract()[0]
        elif len(name_node.xpath('./a')) == 0:
            return name_node.xpath('./text()').extract()[0]
        else:
            return ''

    def get_default_price(self, priceInfo):
        """ Compute the default price of the product """
        def_obj = priceInfo.get('def', None)
        if def_obj:
            # the 'def' entry is present
            promotionList = def_obj.get('promotionList', None)
            if type(promotionList) == list and len(promotionList) > 0:
                # promotion info present: take the lowest promotion price
                min_price = sys.maxint
                for i in range(len(promotionList)):
                    if promotionList[i].get('price') and float(
                            promotionList[i].get('price')) < min_price:
                        min_price = float(promotionList[i].get('price'))
                return min_price
            else:
                # no promotion info
                return float(def_obj.get('price'))
        else:
            # no 'def' entry: fall back to the per-SKU entries
            for sku in priceInfo:
                promotionList = priceInfo[sku].get('promotionList', None)
                if type(promotionList) == list and len(promotionList) > 0:
                    # promotion info present
                    min_price = sys.maxint
                    for i in range(len(promotionList)):
                        if promotionList[i].get('price') and float(
                                promotionList[i].get('price')) < min_price:
                            min_price = float(promotionList[i].get('price'))
                    return min_price
                else:
                    # no promotion info
                    return float(priceInfo[sku].get('price'))

    ######
    #
    # Jd parser
    #
    def parseJd(self, response):
        """ Jd parser """
        sel = Selector(response)
        item = ProductItem()
        item['source'] = 'jd'
        item['name'] = sel.xpath("//div[@id='name']//h1/text()").extract()[0]
        item['url'] = response.url
        item['itemId'] = self.getSku(response.url)
        # return item
        yield Request('http://p.3.cn/prices/get?skuid=J_' + item['itemId'],
                      meta={'item': item},
                      dont_filter=True,
                      callback=self.parsePrice)

    def parsePrice(self, response):
        item = response.meta['item']
        rto = json.loads(response.body)[0]
        item['price'] = float(rto.get('p', 0))
        yield Request(
            'http://club.jd.com/ProductPageService.aspx?method=GetCommentSummaryBySkuId&referenceId='
            + item['itemId'] + '&callback=getCommentCount',
            meta={'item': item},
            dont_filter=True,
            callback=self.parseComment)

    def parseComment(self, response):
        item = response.meta['item']
        regex = re.compile('\{.*\}')
        result = regex.search(response.body)
        if result:
            rto = json.loads(result.group(0))
            item['comment'] = int(rto.get('CommentCount', 0))
        else:
            item['comment'] = 0
        return item

    def getSku(self, url):
        regex = re.compile('\/(\d+)\.htm')
        result = regex.search(url)
        return result.group(1) if result else ''

    ######
    #
    # Feifei parser
    #
    def parseFeifei(self, response):
        """ Feifei parser """
        sel = Selector(response)
        item = ProductItem()
        item['source'] = 'feifei'
        item['name'] = sel.xpath("//h2[@class='np-intro-title']/text()").extract()[0]
        item['url'] = response.url
        price = sel.xpath("//dd[@class='price-m']/text()").extract()[0]
        item['price'] = float(price[1:])
        item['category'] = '|'.join(
            sel.xpath("//ul[@class='np-crumbs']//a/text()").extract())
        return item

    ######
    #
    # Yhd parser
    #
    def parseYhd(self, response):
        """ Yihaodian parser """
        sel = Selector(response)
        item = ProductItem()
        item['source'] = 'yhd'
        item['name'] = sel.xpath("//font[@id='productMainName']/text()").extract()[0]
        item['url'] = response.url
        # get pmId
        regex = re.compile('item\/(\d+)')
        result = regex.search(response.url)
        pmId = result.group(1) if result else 0
        yield Request(
            'http://e.yhd.com/front-pe/queryNumsByPm.do?pmInfoId=%s' % pmId,
            meta={
                'item': item,
                'pmId': pmId
            },
            callback=self.parse_yhd_comment)

    def parse_yhd_comment(self, response):
        item = response.meta['item']
        pmId = response.meta['pmId']
        rto = json.loads(response.body)
        item['comment'] = rto.get('experienceNum', -1)
        yield Request(
            'http://busystock.i.yihaodian.com/restful/detail?mcsite=1&provinceId=20&pmId=%s'
            % pmId,
            meta={'item': item},
            callback=self.parse_yhd_price)

    def parse_yhd_price(self, response):
        item = response.meta['item']
        rto = json.loads(response.body)
        item['price'] = rto.get('currentPrice', -1)
        return item

    ######
    #
    # Yixun parser
    #
    def parseYixun(self, response):
        """ Yixun parser """
        sel = Selector(response)
        item = ProductItem()
        item['source'] = 'yixun'
        item['name'] = sel.xpath(
            "//div[@class='xbase']//h1[@class='xname']/text()").extract()[0]
        item['url'] = response.url
        price = ''.join(
            sel.xpath("//div[@class='xbase']//span[@itemprop='price']/text()").extract())
        lowPrice = ''.join(
            sel.xpath("//div[@class='xbase']//span[@itemprop='lowPrice']/text()").extract())
        item['price'] = price or lowPrice
        return item

    ######
    #
    # Amazon parser
    #
    def parseAmazon(self, response):
        """ Amazon parser """
        sel = Selector(response)
        item = ProductItem()
        item['source'] = 'amazon'
        item['name'] = ''.join(
            sel.xpath('//span[@id="btAsinTitle"]/span/text()').extract()).strip()
        item['url'] = response.url
        price = ''.join(sel.xpath('//b[@class="priceLarge"]/text()').extract())
        item['price'] = price[2:] if len(price) > 2 else ''
        return item
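
# get_default_price() above picks the lowest promotion price when a promotionList is
# present and otherwise falls back to the plain price, both for the 'def' entry and for
# per-SKU entries. A standalone sketch of the same selection rule against a hypothetical
# priceInfo payload (the structure is inferred from the code, not from Tmall's real API):
def default_price(price_info):
    def lowest(entry):
        promotions = entry.get('promotionList') or []
        prices = [float(p['price']) for p in promotions if p.get('price')]
        if prices:
            return min(prices)
        return float(entry.get('price'))

    if price_info.get('def'):
        return lowest(price_info['def'])
    for sku in price_info:
        return lowest(price_info[sku])


sample_price_info = {'def': {'price': '99.00',
                             'promotionList': [{'price': '89.00'},
                                               {'price': '79.00'}]}}
print default_price(sample_price_info)  # 79.0
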
class SpiderSpider(CrawlSpider):
    count = 0
    name = "pcconnection_camera"
    dic = set()
    allowed_domains = init_allowed_domains
    start_urls = init_start_urls

    rules = (
        # only extract links here
        Rule(SgmlLinkExtractor(allow=allowed_url), callback="parse"),
    )

    @property
    def sleep_time(self):
        return random.random() * MAX_SLEEP_TIME

    def parse(self, response):
        '''
        extract title content url
        '''
        print '>' * 50
        print 'response url: ', response.url
        hxs = HtmlXPathSelector(response)
        print '>>>> response.url: ', response.url

        # get urls
        content_urls = hxs.select(content_url_format).extract()
        list_urls = hxs.select(list_url_format).extract()
        list_urls = [up.urljoin(response.url, url) for url in list_urls]
        content_urls = [up.urljoin(response.url, url) for url in content_urls]
        print "@" * 60

        time.sleep(self.sleep_time)
        self.start_urls.extend(list_urls)
        for url in list_urls:
            yield Request(url, self.parse)

        # http://www.pcconnection.com/IPA/Shop/Product/Detail.htm?sku=16037879&cac=Result
        content_re = re.compile(
            r'http://www[.]pcconnection[.]com/.*cac=Result')
        for url in content_urls:
            if content_re.match(url):
                if len(self.dic) > 160:
                    self.start_urls = []
                    raise CloseSpider('reach pages limit, end the spider.')
                self.count += 1
                self.dic.add(hash(url))
                # extract data
                item = SpiderItem()
                item['url'] = url
                item['kind'] = self.name
                yield item
            else:
                print "!!!!!!! not match content url:"
                print url
class GmaSpider(CrawlSpider):
    """
    scrapy crawl inquirer_spider -o gmanews.json
    """
    name = 'inquirer_spider'
    allowed_domains = [
        'inquirer.net',
        'newsinfo.inquirer.net',
        'sports.inquirer.net',
        'lifestyle.inquirer.net',
        'entertainment.inquirer.net',
        'business.inquirer.net',
        'technology.inquirer.net',
        'globalnation.inquirer.net',
    ]
    start_urls = [
        'http://www.inquirer.net',
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=('', )),
             process_links="link_filter",
             callback="parse_items",
             follow=True),
    )

    def parse_items(self, response):
        title = response.xpath('//div[@class="al-headline"]/'
                               'div[@class="container"]/h1').extract()
        if len(title):
            item = None
            link = response.url
            title = strip_tags(title[0])

            # parse date
            created = response.xpath('//h4[@class="byline"]').extract()[0]
            created = created.split('>')[-2].strip()[:-4]
            ord_str = None
            if 'st,' in created:
                ord_str = 'st'
            elif 'nd,' in created:
                ord_str = 'nd'
            elif 'rd,' in created:
                ord_str = 'rd'
            elif 'th,' in created:
                ord_str = 'th'
            created_format = '%H:%M %p | %A, %B %d' + ord_str + ', %Y'
            created = time.strptime(created, created_format)

            #content = response.xpath('/html/body/div[6]/div[8]/div/div[2]/div[2]').extract()
            content = response.xpath('//div[@class="main-article"]').extract()
            #tags = response.xpath('//div[@class="story"]
            #    /div[@class="main"]/div[@class="tags"]
            #    /a[@class="tag"]/text()').extract()

            item = NewsItem()
            item['link'] = link
            item['title'] = title
            item['created'] = strftime('%Y-%m-%d', created)
            item['content'] = content
            #item['tags'] = list(set(tags))
            item.save()
            return item

    def link_filter(self, links):
        ret = []
        for link in links:
            parsed_url = urlparse(link.url)
            if not News.objects.filter(link=parsed_url).count():
                ret.append(link)
        return ret

    def process_title(response):
        pass
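
# parse_items() above strips an ordinal suffix ('st', 'nd', 'rd', 'th') out of the
# byline before handing it to time.strptime. A standalone sketch of that parsing step
# with a hypothetical byline value; the exact byline text on inquirer.net may differ.
import time

sample_byline = '10:30 AM | Monday, August 3rd, 2015'
for suffix in ('st,', 'nd,', 'rd,', 'th,'):
    if suffix in sample_byline:
        fmt = '%H:%M %p | %A, %B %d' + suffix[:-1] + ', %Y'
        parsed = time.strptime(sample_byline, fmt)
        print time.strftime('%Y-%m-%d', parsed)  # 2015-08-03
        break
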
class DoubanSpider(CrawlSpider):
    name = "doubanmovie"
    allowed_domains = ["movie.douban.com"]
    # start_urls = ["http://movie.douban.com/tag/2016?start=0&type=T"]
    # start_urls = ["http://movie.douban.com"]
    start_urls = [
        "http://movie.douban.com/tag/1994",
        "http://movie.douban.com/tag/1995",
        "http://movie.douban.com/tag/1996",
        "http://movie.douban.com/tag/1997"
    ]

    rules = [
        # All grep rules here
        # Parse Movie Information
        Rule(SgmlLinkExtractor(allow=(r'tag/\d{4}\?start=\d+', ))),
        Rule(SgmlLinkExtractor(allow=(
            r'https://movie\.douban\.com/subject/\d+/collections\?start=[2468]0$', )),
             callback='parse_comment',
             follow=True),
        Rule(SgmlLinkExtractor(
            allow=(r'https://movie\.douban\.com/subject/\d+/$', )),
             callback='parse_page',
             follow=True),
        # Parse Movie Comments
        Rule(SgmlLinkExtractor(allow=(
            r'https://movie\.douban\.com/subject/\b\d+\b/collections$', )),
             callback='parse_comment',
             follow=True)
    ]

    def parse_page(self, response):
        sel = Selector(response)
        item = MovieItem()
        # print 'Crawl ' + response.url + ' start...'
        item['name'] = sel.xpath(
            '//h1/span[@property="v:itemreviewed"]/text()').extract()
        item['year'] = sel.xpath('//h1/span[@class="year"]/text()').extract()
        item['director'] = sel.xpath(
            '//a[@rel="v:directedBy"]/text()').extract()
        item['date'] = sel.xpath(
            '//span[@property="v:initialReleaseDate"]/text()').extract()
        item['time'] = sel.xpath(
            '//span[@property="v:runtime"]/text()').extract()
        item['description'] = sel.xpath(
            '//span[@property="v:summary"]/text()').extract()
        item['value'] = sel.xpath(
            '//strong[@property="v:average"]/text()').extract()
        item['people'] = sel.xpath(
            '//span[@property="v:votes"]/text()').extract()
        item['image_url'] = sel.xpath(
            '//a[contains(@href, "photos")]/img/@src').extract()
        item['star5'] = sel.xpath(
            '//span[@class="stars5 starstop"]/following-sibling::*[2]/text()').extract()
        item['star4'] = sel.xpath(
            '//span[@class="stars4 starstop"]/following-sibling::*[2]/text()').extract()
        item['star3'] = sel.xpath(
            '//span[@class="stars3 starstop"]/following-sibling::*[2]/text()').extract()
        item['star2'] = sel.xpath(
            '//span[@class="stars2 starstop"]/following-sibling::*[2]/text()').extract()
        item['star1'] = sel.xpath(
            '//span[@class="stars1 starstop"]/following-sibling::*[2]/text()').extract()
        item['movietype'] = sel.xpath(
            '//span[@property="v:genre"]/text()').extract()
        item['actor'] = sel.xpath(
            '//span/span[@class="attrs"]/a[@rel="v:starring"]/text()').extract()
        item['writer'] = sel.xpath(
            u'//span/span[./text()="编剧"]/following-sibling::*/a/text()').extract()
        item['country'] = sel.xpath(
            u'//span[./text()="制片国家/地区:"]/following::text()[1]').extract()
        item['language'] = sel.xpath(
            u'//span[./text()="语言:"]/following::text()[1]').extract()
        item['othername'] = sel.xpath(
            u'//span[./text()="又名:"]/following::text()[1]').extract()
        item['movie_id'] = GetMovieOrUserID(response.url)
        item['movie_url'] = response.url
        # item['site'] = sel.xpath('//div[@id="info"]/span[contains(@href, "http")]/text()').extract()
        print 'Crawl ' + response.url + ' done...'
        # print item
        return item

    def parse_comment(self, response):
        items = []
        sel = Selector(response)
        print 'Crawl ' + response.url + ' start...'
        comments = sel.xpath('//table[@width="100%"]')
        for comment in comments:
            # print comment
            item = CommentItem()
            item['user_img'] = comment.xpath('.//img/@src').extract()
            item['user_name'] = comment.xpath('.//img/@alt').extract()
            item['user_city'] = comment.xpath(
                './/span[@style="font-size:12px;"]/text()').extract()
            item['user_value'] = comment.xpath(
                './/p[@class="pl"]/span/@class').extract()
            item['comment'] = comment.xpath(
                './/p[@class="pl"]/following::*[1]/text()').extract()
            item['comment_date'] = comment.xpath(
                './/p[@class="pl"]/text()').extract()
            url = comment.xpath(
                './/td[@width="80"]/a[contains(@href, "people")]/@href').extract()
            print '======================'
            print 'Get This URL ID'
            print url
            print '======================'
            item['user_url'] = url
            item['user_id'] = GetMovieOrUserID(url)
            item['movie_id'] = GetMovieOrUserID(response.url)
            items.append(item)
            print item
        return items

    def parse_try(self, response):
        pass
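
# Both parse_page and parse_comment above call GetMovieOrUserID(), which is defined
# elsewhere. A plausible minimal sketch, assuming the id is the path segment after
# /subject/ or /people/ in douban.com URLs; note parse_comment passes it a list of
# hrefs rather than a single string, so the sketch handles both. The real helper may differ.
import re


def GetMovieOrUserID(url):
    if isinstance(url, list):
        url = url[0] if url else ''
    match = re.search(r'/(?:subject|people)/([^/]+)/?', url)
    return match.group(1) if match else ''
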
class WikiSpider(CrawlSpider):
    pcounter = 0
    name = "wiki"
    allowed_domains = ["freeswitch.org"]

    # start_urls = [
    #     "http://wiki.freeswitch.org/"
    # ]
    start_urls = [
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=.1.3.6.1.4.1.27880&to=FS_weekly_2010_11_10",
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=FS_weekly_2010_11_17&to=Java_ESL_Client",
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=Javascript&to=Mod_managed",
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=Mod_memcache&to=Report_Issue_Checklist",
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=Reporting_Bugs&to=Variable_execute_on_tone_detect",
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=Variable_export_vars&to=Variable_stream_prebuffer",
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=Variable_suppress-cng&to=Zeroconf.conf.xml"
    ]

    # rules = (
    #     Rule(SgmlLinkExtractor(), callback='parse_item', follow=True),
    # )
    # <a href="/wiki/Release_Notes" title="Release Notes">
    # wiki/Special:
    # wiki/User_talk:
    # wiki/User:
    # wiki/Talk:
    rules = [
        # Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
        Rule(SgmlLinkExtractor(allow=[r'wiki/\w+']),
             callback='parse_item',
             follow=True),
        # Rule(SgmlLinkExtractor(allow=[r'wiki/\w+'], deny=[r'wiki/[Special\:|User_talk\:|User\:|Talk\:]\w+']), callback='parse_item', follow=True),
    ]

    # r'page/\d+' : regular expression for http://isbullsh.it/page/X URLs
    # rules = (
    #     # Extract links matching 'category.php' (but not matching 'subsection.php')
    #     # and follow links from them (since no callback means follow=True by default).
    #     Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
    #     # Extract links matching 'item.php' and parse them with the spider's method parse_item
    #     Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
    # )

    # def parse(self, response):
    #     sel = Selector(response)
    #     sites = sel.xpath('//ul/li')
    #     items = []
    #     for site in sites:
    #         item = DmozItem()
    #         item['title'] = site.xpath('a/text()').extract()
    #         item['link'] = site.xpath('a/@href').extract()
    #         item['desc'] = site.xpath('text()').extract()
    #         items.append(item)
    #     return items

    # Try this in a shell
    def parse_item(self, response):
        self.pcounter += 1
        self.log('Hi, this is an item page (%d)! %s' %
                 (self.pcounter, response.url))

        sel = Selector(response)
        item = WikiItem()
        item['title'] = sel.xpath('//title/text()').extract()
        item['pageurl'] = response.url
        # item['content'] = sel.xpath('//div[re:test(@id, "content")]').extract()

        list_links = []
        # for links in sel.xpath('//a/@href').extract():
        for links in sel.xpath(
                '//div[re:test(@id, "content")]//a/@href').extract():
            if links[:6] != '/wiki/':
                continue
            if (links.find('wiki/Special:') != -1
                    or links.find('wiki/User_talk:') != -1
                    or links.find('wiki/User:') != -1
                    or links.find('wiki/Talk:') != -1
                    or links.find('wiki/Category:') != -1):
                continue
            list_links.append(links)
        # item['links'] = list_links
        return item

    def parse_item_long(self, response):
        self.log('Hi, this is an item page! %s' % response.url)
        hxs = HtmlXPathSelector(response)
        item = IsBullshitItem()
        # Extract title
        item['title'] = hxs.select('//header/h1/text()').extract()[0]
        # Extract author
        item['author'] = hxs.select('//header/p/a/text()').extract()[0]
        # Extract tag(s)
        item['tag'] = hxs.select(
            "//header/div[@class='post-data']/p/a/text()").extract()
        # Extract date
        item['date'] = hxs.select(
            "//header/div[@class='post-data']/p[contains(text(), '20')]/text()"
        ).extract()[0]
        # Extract location
        item['location'] = hxs.select(
            "//header/div[@class='post-data']/p[contains(text(), 'From')]/text()"
        ).extract()[0].replace('From', '')
        # Extract article url
        urls = hxs.select(
            "//div[@class='breadcrumb-container']/ul[@class='breadcrumb']/li/a/@href"
        ).extract()
        item['url'] = urlparse.urljoin(urls[1], urls[2])
        # Extract article text, with html tags
        item['article_html'] = hxs.select(
            "//div[@role='main']/article").extract()[0]
        return item
class StackoverflowSpider(CrawlSpider):
    name = 'stackoverflow'
    allowed_domains = ['stackoverflow.com']
    start_urls = get_stack_urls()

    rules = (
        Rule(SgmlLinkExtractor(allow=r'(.*)\?tab=answers(.*)'),
             callback='parse_item',
             follow=True),
        Rule(SgmlLinkExtractor(allow=r'/questions/'),
             callback='parse_question',
             follow=False),
    )

    question_xpath = "//div[@id='question']"
    answers_xpath = "//div[@id='answers']//div[@data-answerid]"

    def parse_item(self, response):
        pass

    def parse_question(self, response):
        # print 'I am parsing question'
        hxs = HtmlXPathSelector(response)
        for question_selector in hxs.select(self.question_xpath):
            yield self.get_question(question_selector, response)
        for answer_selector in hxs.select(self.answers_xpath):
            yield self.get_answer(answer_selector, response)

    # label can be 'question' or 'answer'
    def get_user(self, selector, response, label):
        user_loader = XPathItemLoader(item=StackOverflowUser(),
                                      selector=selector)
        user_loader.add_xpath('user_name', ''.join([
            './/div[contains(@class, "user-details")]',
            '/a/text()'
        ]))
        user_loader.add_xpath('user_link', ''.join([
            './/div[contains(@class, "user-details")]',
            '/a/@href'
        ]))
        if user_loader.get_output_value('user_link'):
            user_id = user_loader.get_output_value('user_link')
            user_loader.add_value('user_id',
                                  user_loader.get_output_value('user_link'))
        return user_loader.load_item()

    def get_question(self, selector, response):
        hxs = HtmlXPathSelector(response)
        number_of_answers = hxs.select(''.join([
            '//div[@id="answers"]',
            '//div[contains(@class, "answers-subheader")]',
            '/h2/text()'
        ])).extract()

        question_loader = XPathItemLoader(item=StackOverflowQuestion(),
                                          selector=selector)
        question_loader.add_xpath('question_content', ''.join([
            ".//td[@class='postcell']",
            "//div[@class='post-text']/p/text()"
        ]))
        question_loader.add_xpath('question_tags', ''.join([
            ".//div[@class='post-taglist']",
            "//a[@class='post-tag']/text()"
        ]))
        question_loader.add_xpath('question_id', './@data-questionid')
        question_loader.add_xpath(
            'marks', ".//span[contains(@class, 'vote-count-post')]/text()")
        question_loader.add_value('asker',
                                  self.get_user(selector, response, 'question'))
        question_loader.add_value(
            'number_of_answers',
            int(number_of_answers[0].strip().split(' ')[0]))

        question_title = hxs.select(''.join([
            '//div[contains(@id, "question-header")]',
            '//a[contains(@class, "question-hyperlink")]/text()'
        ])).extract()
        question_loader.add_value('question_title', question_title)
        # print question_loader.get_output_value('question_title')
        return question_loader.load_item()

    def get_answer(self, selector, response):
        answer_loader = XPathItemLoader(item=StackOverflowAnswer(),
                                        selector=selector)
        answer_loader.add_xpath('answer_content', ''.join([
            ".//td[@class='answercell']/div[@class='post-text']",
            "/p/text()"
        ]))
        answer_loader.add_xpath('answer_id', './@data-answerid')
        answer_loader.add_xpath(
            'marks', ".//span[contains(@class, 'vote-count-post')]/text()")
        # is best answer?
        if selector.select('./@class').extract()[0].find('accepted-answer') != -1:
            answer_loader.add_value('is_best_answer', 1)
        else:
            answer_loader.add_value('is_best_answer', 0)
        # get user name
        answer_loader.add_value('answerer',
                                self.get_user(selector, response, 'answer'))
        return answer_loader.load_item()
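
# The loaders above populate StackOverflowQuestion, StackOverflowAnswer and
# StackOverflowUser items that are defined elsewhere. A minimal sketch containing just
# the fields referenced in this spider; the real definitions (and any input/output
# processors attached to the fields) are assumptions.
from scrapy.item import Item, Field


class StackOverflowUser(Item):
    user_id = Field()
    user_name = Field()
    user_link = Field()


class StackOverflowQuestion(Item):
    question_id = Field()
    question_title = Field()
    question_content = Field()
    question_tags = Field()
    marks = Field()
    asker = Field()
    number_of_answers = Field()


class StackOverflowAnswer(Item):
    answer_id = Field()
    answer_content = Field()
    marks = Field()
    is_best_answer = Field()
    answerer = Field()
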
class TestSpider(InitSpider):
    name = "test"
    allowed_domains = ["test.co.kr"]
    login_page = "http://local.test.co.kr/login"
    start_urls = "http://local.test.co.kr/"

    # Define how the crawled site is handled using Rule objects.
    rules = (
        #Rule(SgmlLinkExtractor(allow=r'-\w+.html$'), callback='parse_item', follow=True),
        Rule(SgmlLinkExtractor(allow=("local\.test\.co\.kr[^\s]*\/*$")),
             callback='parse_item',
             follow=True),
    )

    ## init_request is called first.
    def init_request(self):
        ## point at the login page and set the callback
        return Request(url=self.login_page, callback=self.login)

    ## Use FormRequest to submit the login form on that page.
    def login(self, response):
        return FormRequest.from_response(response,
                                         formdata={'id': '0000',
                                                   'password': '******'},
                                         callback=self.check_login_response)

    ## Parse the response HTML to decide whether the login succeeded.
    def check_login_response(self, response):
        # check login success
        if "/auth/logout" in response.body:
            ## If login succeeded, call initialized() to start parsing.
            return self.initialized()
        else:
            return self.error()

    def initialized(self):
        return Request(url=self.start_urls, callback=self.parse_item)

    def parse_item(self, response):
        ## Load already-collected URLs so duplicates can be skipped.
        if self.isFirstLoop:
            self.tempUrls = self.getUrlSet()
            self.isFirstLoop = 0

        site = "test"
        rank = "0"
        title = response.xpath('//title/text()').extract()
        req_url = response.request.url.replace('http://' + host, '', 1)
        res_url = response.url
        s = re.search("<(!\s*doctype\s*.*?)>", response.body, re.IGNORECASE)
        doctype = s.group(1) if s else ""
        css = response.xpath('//link/@href').extract()
        js = response.xpath('//script/@src').extract()
        layout = response.xpath('//div[@class="debug_layout"]/text()').extract()
        sidebar = response.xpath('//div[@class="debug_side_layout"]/text()').extract()
        emulate = response.xpath('//meta[contains(@content, "IE")]/@content').extract()
        embed_style_cnt = len(response.xpath('//style').extract())
        embed_script_cnt = (len(response.xpath('//script').extract()) -
                            len(response.xpath('//script/@src').extract()))

        # Strip the host part.
        ckurl = req_url.replace("http://local.test.co.kr", "")
        ckurl = req_url.replace("https://local.test.co.kr", "")
        if ckurl.find('?') > -1:
            ckurl = ckurl.split('?')[0]
        elif len(ckurl.split('/')) > 4:
            piece = ckurl.split('/')
            ckurl = piece[0] + '/' + piece[1] + '/' + piece[2] + '/' + piece[3] + '/' + piece[4]

        # Duplicate check.
        if ckurl in self.tempUrls:
            print ">>>>>>>>>>>>>>>[DropItem]:" + ckurl
            raise  #DropItem("Duplicate url found: %s" % ckurl)
        else:
            req_url = ckurl
            self.tempUrls.add(req_url)

        if len(layout) > 0:
            layout = layout[-1]
        else:
            layout = ",".join(layout)
        if len(sidebar) > 0:
            sidebar = sidebar[-1]
        else:
            sidebar = ",".join(sidebar)

        item = SaraminWebItem()
        item["site"] = site
        item["rank"] = rank
        item["title"] = ",".join(title)
        item["req_url"] = req_url
        item["res_url"] = res_url
        item["doctype"] = doctype
        item["css"] = ",".join(css)
        item["js"] = ",".join(js)
        item["layout"] = layout
        item["sidebar"] = sidebar
        item["emulate"] = ",".join(emulate)
        item["embed_style_cnt"] = embed_style_cnt
        item["embed_script_cnt"] = embed_script_cnt
        # print(item)
        yield item
class buildkSpiders(CrawlSpider):
    handle_httpstatus_list = [302]
    name = "bk"
    start_urls = [
        'http://buildkar.com/building-materials/blocks/',
        'http://buildkar.com/building-materials/boards-planks/',
        'http://buildkar.com/building-materials/boards-planks/page/2/',
        'http://buildkar.com/building-materials/bricks/'
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=(".*/building-materials/cement-rmc/.*", ),
                               deny=(".*add-to-cart=.*", ),
                               unique=True),
             callback='parse_item',
             follow=True),
        Rule(SgmlLinkExtractor(allow=(".*/page/.*", ), unique=True),
             callback='parse_item',
             follow=True),
    )

    def parse(self, response):
        #print ">>>>>", response.request.url
        sel = Selector(response)
        items = []
        item = BuildkarItem()
        # item['url'] = response.request.url
        # #import pdb;pdb.set_trace()
        # title = (map(unicode.strip, sel.xpath('//h3[@class="heading-title product-title"]//a/text()').extract()))
        # #print len(title)
        # item['title'] = title
        # category = (map(unicode.strip, sel.xpath('//div[@class="product-meta-wrapper"]/div[@class="wd_product_categories"]/a/text()').extract()))
        # item['category'] = category
        # price1 = (map(unicode.strip, sel.xpath('//span[@class="price"]/del/span[@class="amount"]/text()').extract()))
        # item['price1'] = price1
        # price = (map(unicode.strip, sel.xpath('//ins/span[@class="amount"]/text()').extract()))
        # item['price'] = price
        item['href'] = sel.xpath(
            '//h3[@class="heading-title product-title"]/a/@href').extract()
        # print len(item['href'])
        # description = []
        # for i in range(len(item['href'])):
        #     url = item['href'][i]
        #     html_doc = urllib2.urlopen(url)
        #     soup = BeautifulSoup(html_doc.read())
        #     raw_data = soup.find('div', {'id': "content_description"})
        #     p = raw_data.text
        #     print ">>>>>", p
        #     description.append(p)
        # for x1 in range(len(title)):
        #     print title[x1]
        #     item1 = BaseProducts()
        #     item1.source_url = item['url'][x1]
        #     item1.Sku = title[x1]
        #     item1.title = title[x1]
        #     item1.category_name = category[x1]
        #     item1.description = description[x1]
        #     item1.source_id = 5
        #     item1.save()
        #     item2 = Subscribe_Product()
        #     item3 = SPM()
        #     item2.bp = item1
        #     item2.source_id = 5
        #     item2.Sku = title[x1]
        #     item2.save()
        #     item3.sp = item2
        #     item3.Sku = title[x1]
        #     try:
        #         item3.price = price[x1].replace("[","").replace(",","").replace("u","").replace("]","").replace("'","").replace("\xa0","").replace("Rs.","")
        #     except: item3.price = 0
        #     try:
        #         item3.store_price = price1[x1].replace("[","").replace(",","").replace("u","").replace("]","").replace("'","").replace("\xa0","").replace("Rs.","")
        #     except: item3.store_price = 0
        #     item3.source_id = 5
        #     #item3.saller = item['Saller']
        #     item3.save()
        #     item4 = Category()
        #     item4.category_name = category[x1]
        #     item4.category_path = category[x1]
        #     item4.level = "3"
        #     item4.source_id = 5
        #     item4.save()
        # if item['title']:
        #     # print item
        return item
class MovieSpider(CrawlSpider):
    name = "movie"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["http://movie.douban.com"]

    rules = (
        Rule(LinkExtractor(allow=r"/subject/\d+/($|\?\w+)"),
             callback="parse_movie",
             follow=True),
    )

    def parse_movie(self, response):
        item = MovieItem()
        if self.parse_tv(response, item) is False:
            return None
        item["douban_id"] = response.url.split("/")[-2]
        self.get_name(response, item)
        self.get_name_other(response, item)
        self.get_directors(response, item)
        self.get_actors(response, item)
        self.get_countries(response, item)
        self.get_genres(response, item)
        self.get_languages(response, item)
        self.get_runtime(response, item)
        self.get_description(response, item)
        self.get_release_date(response, item)
        self.get_tags(response, item)
        self.get_image(response, item)
        self.get_douban_rating(response, item)
        #for i in item.keys():
        #    print(i + " " + str(item[i]))
        return item

    def get_name(self, response, item):
        name = response.xpath("//title/text()").extract()
        if name:
            item["name"] = name[0].replace(u" (豆瓣)", "").strip()

    def get_name_other(self, response, item):
        year = response.xpath("//*[@id='info']").re(NAMEOTHER_RE)
        if year:
            item["name_other"] = year[0]

    def get_directors(self, response, item):
        directors = response.xpath("//a[@rel='v:directedBy']/text()").extract()
        if directors:
            item["directors"] = '/'.join(directors)

    def get_actors(self, response, item):
        stars = response.xpath("//a[@rel='v:starring']/text()").extract()
        if stars:
            item["actors"] = '/'.join(stars)

    def get_genres(self, response, item):
        genres = response.xpath("//span[@property='v:genre']/text()").extract()
        if genres:
            item["genres"] = '/'.join(genres)

    def get_runtime(self, response, item):
        runtime = response.xpath("//span[@property='v:runtime']/text()").re(NUM_RE)
        if runtime:
            item["runtime"] = runtime[0]

    def get_douban_rating(self, response, item):
        average = response.xpath("//strong[@property='v:average']/text()").extract()
        if average and average[0] != "":
            item["douban_rating"] = average[0]

    def get_tags(self, response, item):
        T = []
        tags = response.xpath("//div[@class='tags-body']/a")
        for tag in tags:
            t = tag.xpath("text()").extract()
            if t:
                T.append(t[0])
        if T:
            item["tags"] = '/'.join(T)

    def get_languages(self, response, item):
        S = "".join(response.xpath("//div[@id='info']").extract())
        M = LANGUAGES_RE.search(S)
        if M is not None:
            item["languages"] = M.group(1)

    def get_countries(self, response, item):
        S = "".join(response.xpath("//div[@id='info']").extract())
        M = COUNTRIES_RE.search(S)
        if M is not None:
            item["countries"] = M.group(1)

    def get_description(self, response, item):
        summary = response.xpath("//span[@property='v:summary']/text()").extract()
        if summary:
            item["description"] = "<br/>".join(summary)

    def get_image(self, response, item):
        image = response.xpath("//*[@id='mainpic']/a/img").re(IMAGE_RE)
        if image:
            item["image"] = image[0]

    def get_release_date(self, response, item):
        comment = response.xpath(
            "//span[@property='v:initialReleaseDate']/text()").extract()
        if comment:
            item["release_date"] = '/'.join(comment)

    def parse_tv(self, response, item):
        S = "".join(response.xpath("//div[@id='info']//text()").extract())
        M = TV_RUNTIME_RE.search(S)
        if M is not None:
            return False
        return True
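
# MovieSpider above relies on several module-level regular expressions (NAMEOTHER_RE,
# NUM_RE, LANGUAGES_RE, COUNTRIES_RE, IMAGE_RE, TV_RUNTIME_RE) that are not shown in
# this file. Plausible minimal sketches inferred only from how each one is used; the
# real patterns in the project may well be different.
import re

NUM_RE = re.compile(r'(\d+)')                                # digits inside the runtime text
NAMEOTHER_RE = re.compile(u'又名:</span> (.+?)<br')           # "also known as" line in #info HTML
LANGUAGES_RE = re.compile(u'语言:</span> (.+?)<br')           # "language" line in #info HTML
COUNTRIES_RE = re.compile(u'制片国家/地区:</span> (.+?)<br')   # "country/region" line in #info HTML
IMAGE_RE = re.compile(r'src="([^"]+)"')                      # poster src attribute in #mainpic
TV_RUNTIME_RE = re.compile(u'单集片长')                        # per-episode runtime marker => TV series
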
class ls(CrawlSpider):
    name = "ls"
    download_delay = 2
    allowed_domains = ["xe.gr"]
    start_urls = [
        "http://www.xe.gr/property/search?Publication.age=1&System.item_type=re_land&Transaction.type_channel=117518&page=1&per_page=50"
    ]

    rules = (
        Rule(LxmlLinkExtractor(
            allow_domains=('xe.gr'),
            restrict_xpaths=("//a[@class='white_button right']")),
             callback='parse_start_url',
             follow=True),
    )

    def parse_start_url(self, response):
        return self.parse_items(response)

    def parse_items(self, response):
        for sel in response.xpath("//div[contains(@class,'r_desc')]/h2/a"):
            link = "http://www.xe.gr" + sel.xpath("@href").extract_first() + "?mode=spec"
            yield Request(link, callback=self.parse2)

    def parse2(self, response):
        # Creating an empty item object
        item = {}
        # Assigning values to its fields
        item['url'] = response.url

        region_string = response.xpath(
            u"//th[text()='Περιοχή:']/following-sibling::*/text()").extract_first()
        region_list = region_string.strip().split(' > ')
        item['regionA'] = region_list[0]
        try:
            item['regionB'] = region_list[1]
        except IndexError:
            item['regionB'] = None
        try:
            item['regionC'] = region_list[2]
        except IndexError:
            item['regionC'] = None
        try:
            item['regionD'] = region_list[3]
        except IndexError:
            item['regionD'] = None

        price_string = response.xpath(
            u"//td[@class='auto_price']/span/text()").extract_first()
        try:
            item['price'] = float(price_string.strip().replace(u" €", "").replace(
                ".", "").replace(",", "."))
        except:
            item['price'] = None

        item['location_name'] = response.xpath(
            u"//th[text()='Τοποθεσία:']/following-sibling::*/text()").extract_first()
        item['category'] = response.xpath(
            u"//th[text()='Είδος:']/following-sibling::*/text()").extract_first()

        area_string = response.xpath(
            u"//th[text()='Εμβαδόν:']/following-sibling::*/text()").extract_first()
        try:
            item['area'] = float(area_string.strip().replace(".", "").replace(",", "."))
        except:
            item['area'] = None

        item['city_plan'] = response.xpath(
            u"//th[text()='Σχέδιο Πόλης:']/following-sibling::*/text()").extract_first()
        item['structure_factor'] = response.xpath(
            u"//th[text()='Συντελεστής Δόμησης:']/following-sibling::*/text()").extract_first()
        item['coverage_factor'] = response.xpath(
            u"//th[text()='Συντελεστής Κάλυψης:']/following-sibling::*/text()").extract_first()

        facade_length_string = response.xpath(
            u"//th[text()='Πρόσοψη:']/following-sibling::*/text()").extract_first()
        try:
            item['facade_length'] = float(facade_length_string)
        except:
            item['facade_length'] = None
        try:
            item['facade_count'] = float(response.xpath(
                u"//th[text()='Αριθμός Όψεων:']/following-sibling::*/text()").extract_first())
        except:
            item['facade_count'] = None

        item['airy'] = response.xpath(
            u"//th[text()='Διαμπερές:']/following-sibling::*/text()").extract_first()
        item['slope'] = response.xpath(
            u"//th[text()='Κλίση:']/following-sibling::*/text()").extract_first()
        item['artio'] = response.xpath(
            u"//th[text()='Άρτιο:']/following-sibling::*/text()").extract_first()
        item['oikodomisimo'] = response.xpath(
            u"//th[text()='Οικοδομήσιμο:']/following-sibling::*/text()").extract_first()
        item['me_adia'] = response.xpath(
            u"//th[text()='Με άδεια οικοδομής:']/following-sibling::*/text()").extract_first()
        try:
            item['ktizei'] = float(response.xpath(
                u"//th[text()='Κτίζει:']/following-sibling::*/text()").extract_first())
        except:
            item['ktizei'] = None

        item['availability'] = response.xpath(
            u"//th[text()='Διαθεσιμότητα:']/following-sibling::*/text()").extract_first()
        item['availability_from'] = response.xpath(
            u"//th[text()='Διαθέσιμο από:']/following-sibling::*/text()").extract_first()
        item['antiparoxi'] = response.xpath(
            u"//th[text()='Και αντιπαροχή:']/following-sibling::*/text()").extract_first()
        # Not sure about this xpath
        item['view'] = response.xpath(
            u"//th[text()='Θέα:']/following-sibling::*/text()").extract_first()
        try:
            item['dist_from_sea'] = float(response.xpath(
                u"//th[text()='Απόσταση από Θάλασσα:']/following-sibling::*/text()").extract_first())
        except:
            item['dist_from_sea'] = None
        item['paling'] = response.xpath(
            u"//th[text()='Περίφραξη:']/following-sibling::*/text()").extract_first()
        item['supplies'] = response.xpath(
            u"//th[text()='Παροχές:']/following-sibling::*/text()").extract_first()
        item['drilling'] = response.xpath(
            u"//th[text()='Γεώτρηση:']/following-sibling::*/text()").extract_first()
        item['with_building'] = response.xpath(
            u"//th[text()='Κτίσμα:']/following-sibling::*/text()").extract_first()
        item['corner_plot'] = response.xpath(
            u"//th[text()='Γωνιακό:']/following-sibling::*/text()").extract_first()
        item['mesites'] = response.xpath(
            u"//th[text()='Μεσίτες δεκτοί:']/following-sibling::*/text()").extract_first()
        item['epaggelmatiki_xrisi'] = response.xpath(
            u"//th[text()='Επαγγελματική χρήση:']/following-sibling::*/text()").extract_first()
        item['dimensions'] = response.xpath(
            u"//th[text()='Διαστάσεις:']/following-sibling::*/text()").extract_first()
        item['contains'] = response.xpath(
            u"//th[text()='Περιέχει:']/following-sibling::*/text()").extract_first()

        # Now fetch the last-modification date as well.
        yield Request(response.url[:-10],
                      callback=self.parse3,
                      meta={'item': item})

    def parse3(self, response):
        # Retrieving the item
        item = response.meta['item']
        # Assigning more values to its fields
        x = response.xpath("//td[@class='headItem']/text()").extract_first()
        datelist = x.split(" ")
        months = [
            u'Ιανουαρίου', u'Φεβρουαρίου', u'Μαρτίου', u'Απριλίου', u'Μαΐου',
            u'Ιουνίου', u'Ιουλίου', u'Αυγούστου', u'Σεπτεμβρίου', u'Οκτωβρίου',
            u'Νοεμβρίου', u'Δεκεμβρίου'
        ]
        date = datetime.date(int(datelist[3]),
                             months.index(datelist[2]) + 1,
                             int(datelist[1]))
        item['date'] = date
        try:
            item['details'] = response.xpath(
                "//p[@class='dets']").xpath("text()").extract_first().strip()
        except:
            item['details'] = None
        yield item
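
# parse3() above converts a Greek date string into a datetime.date by looking the month
# name up in the months list. A standalone sketch of that conversion with a hypothetical
# header value (the real xe.gr header text may have a different prefix before the day):
import datetime

months = [u'Ιανουαρίου', u'Φεβρουαρίου', u'Μαρτίου', u'Απριλίου', u'Μαΐου',
          u'Ιουνίου', u'Ιουλίου', u'Αυγούστου', u'Σεπτεμβρίου', u'Οκτωβρίου',
          u'Νοεμβρίου', u'Δεκεμβρίου']

sample_header = u'Τετάρτη 15 Ιουλίου 2015'
parts = sample_header.split(u' ')
sample_date = datetime.date(int(parts[3]), months.index(parts[2]) + 1, int(parts[1]))
print sample_date  # 2015-07-15
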
class MySpider(CrawlSpider): name = "swimoutlet" allowed_domains = ["swimoutlet.com"] start_urls = [ ## "http://www.swimoutlet.com/womens-tan-thru-swimsuits-c9374/", ## "http://www.swimoutlet.com/shoes-accessories-c10211/", ## "http://www.swimoutlet.com/swim-caps-c9633/#cat=9633&clrc=481&sortby=Popularity" ## "http://www.swimoutlet.com/womens-swim-dresses-c9373/", ## "http://www.swimoutlet.com/shoes-accessories-c10211/", ## "http://www.swimoutlet.com/swimming-watches-c14082/", ## "http://www.swimoutlet.com/kickboards-c9661/" i.strip() for i in urllist ] rules = ( ## Rule (SgmlLinkExtractor(allow=(), ## restrict_xpaths=('//ul[@class="pagination"]',)) ## , follow= True), Rule(SgmlLinkExtractor( allow=(), restrict_xpaths= ('//ul[@class="pagination floatR"] | //nav[@id="blockcontentmnutop"]/span[last()]/a', )), callback="parse_category", follow=True), ) def parse_category(self, response): ## def parse(self,response): sel = Selector(response) hxs = HtmlXPathSelector(response) pageurl = response.url.strip() breadcrumb = sel.xpath( "//nav[@id='blockcontentmnutop']/span[last()]/a/text()").extract( )[0].strip() ## url = response.url ## for i in range(len(urllist)): ## if url == urllist[i]: ## row =(breadcrumb,priceidlist[i],url,cat1list[i],cat2list[i],cat3list[i]) ## mywriter.writerow(row) for i in range(len(urllist)): if breadcrumb == categnamelist[i]: producturls = sel.xpath( "//div[@class='pd-details']/a/@href").extract() for x in producturls: item = BigCItem() item['Category'] = cat1list[i] item['Category2'] = cat2list[i] item['Category3'] = cat3list[i] item['id1'] = priceidlist[i] request = Request(x, callback=self.parse_items) request.meta["item"] = item yield request ## def parse(self,response): ## item = BigCItem() ## item['Category'] = '' ## item ['id1'] = 'Apparel' def parse_items(self, response): item = response.meta['item'] sel = Selector(response) hxs = HtmlXPathSelector(response) pname = sel.xpath("//h1/text()").extract()[0] item["Product_Name"] = pname item["Option_Set"] = pname item[ "Product_Image_Description_1"] = "Buy " + pname + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" item[ "MetaDescription"] = "Get your hands on the " + pname + ". 
Buy it Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" item[ "TitleTag"] = "Buy the " + pname + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" item["Brand_Name"] = sel.xpath( "//div[@id='divhoverpopup']/h2/a/@title").extract()[0] pcode = sel.xpath( "//div[@class='pro-code']/i/text()").extract()[0].replace( "Product Code: ", "") item["Product_Code"] = pcode item["Product_Description"] = sel.xpath( "//div[@class='pro-description']/p |//div[@class='pro-description']/ul" ).extract() item["Product_Description"] = ''.join( item["Product_Description"]).encode('utf-8') mrp = sel.xpath( "//span[@id='ListPrice']/text()| //span[@id='ProductPrice']/text()" ).extract()[0].replace("$", "") sp = sel.xpath( "//span[@id='PriceRange']/text() | //span[@id='SalePrice']/text()" ).extract() item["Sale_Price"] = "" if item["Brand_Name"] in ("FINIS", "Arena", "Speedo", "Finis", "2XU", "Garmin", "HYDRO-FIT", "Nike", "TYR", "Yurbuds", "Timex"): sortorder = "-300" elif item["Brand_Name"] == "Sporti": sortorder = "-270" else: sortorder = "-270" item['Retail_Price'], item['Sale_Price'] = mycsv.pricing( mrp, sp, item['id1']) breadcrumb = sel.xpath( "//nav[@class='block-content mnu-top mnu-top-product-detail']/span[last()]/a/text()" ).extract()[0].strip() if breadcrumb == 'Swim.com Compatible': breadcrumb = sel.xpath( "//nav[@class='block-content mnu-top mnu-top-product-detail']/span[last()-1]/a/text()" ).extract()[0].strip() if item['Category'] and item['Category2'] and item['Category3']: Category = item['Category'] + '/' + breadcrumb + ';' + item[ 'Category2'] + '/' + breadcrumb + ';' + item[ 'Category3'] + '/' + breadcrumb elif item['Category'] and item['Category2']: Category = item['Category'] + '/' + breadcrumb + ';' + item[ 'Category2'] + '/' + breadcrumb + ';' else: Category = item['Category'] + '/' + breadcrumb size = response.xpath( '//*[@id="divChooseOption2"]/div[2]/script[1][contains(text(),"arraySize")]' ).extract() colorArray = response.xpath( '//script[@language="JavaScript"][contains(text(),"arrayColor")]' ).extract() if size or colorArray: trackinventory = "By Option" else: trackinventory = "By Product" item["Product_Image_File1"] = response.xpath( '//*[@id="divChooseOption2"]/img/@name |//div[@class="box-content block-content mnu-content pro-option"]/img/@name' ).extract() tup = ( "Product", item["Product_Name"] + "*", item["Brand_Name"], item["Retail_Price"], item["Retail_Price"], item["Sale_Price"], #price item["Product_Code"] + "SWMOTLT", Category, "SWIMOUTLET", item["Product_Description"], "100", item["Product_Name"], "15-21 Working days", "N", sortorder, item["MetaDescription"], item["TitleTag"], item["Product_Image_Description_1"], "Y", trackinventory, "1", "2", "3", "4", "5", "6", "7") obj = list(tup) c = 0 for i in item["Product_Image_File1"]: c = c + 1 imgurl = "http://www.swimoutlet.com/photos/" + i + ".jpg" if size or colorArray: imgurl = "http://www.swimoutlet.com/photos/options/" + i + ".jpg" obj.append(imgurl) if c == 7: break row = tuple(obj) if size: size = response.xpath( '//*[@id="divChooseOption2"]/div[2]/script[1][contains(text(),"arraySize")]' ).extract()[0].replace("arraySize[0] =", "") size = re.sub(r'<script(.*)>', '', size) size = size.replace("arraySize = new Array();", "") size = re.sub(r'arraySize(.*)=', ',', size) size = size.replace("[", '').replace("];", "").replace( "'", '"').replace('",', '":').replace("</script>", "") size = "[{" + size + "}]" item['size'] = {} item['size'] = json.loads(size)[0] else: 
item['size'] = "" if colorArray: colorArray = response.xpath( '///script[@language="JavaScript"][contains(text(),"arrayColor")]' ).extract()[0].replace("arrayColor[0] =", "") colorArray = re.sub(r'<script(.*)>', '', colorArray) colorArray = colorArray.replace("var arrayColor = new Array();", "") colorArray = re.sub(r'arrayColor(.*)=', ',', colorArray) colorArray = colorArray.replace("[", '').replace("];", "").replace( "'", '"').replace('",', '":').replace("</script>", "") #print colorArray colorArray = "[{" + colorArray + "}]" item['color'] = {} item['color'] = json.loads(colorArray)[0] item['variant'] = {} for colorcode, color in item['color'].iteritems(): if item['size'] == "": item['variant'][colorcode + "_" + colorcode] = "[S]Color= " + color elif len(item['size']) == 1: for sizecode, size in item['size'].iteritems(): item['variant'][ colorcode + "_" + sizecode] = "[S]Color= " + color + ",[RB]Size= " + size elif len(item['color']) == 1: for sizecode, size in item['size'].iteritems(): item['variant'][ colorcode + "_" + sizecode] = "[RB]Color= " + color + ",[S]Size= " + size else: for sizecode, size in item['size'].iteritems(): item['variant'][ colorcode + "_" + sizecode] = "[S]Color= " + color + ",[S]Size= " + size combosArray = sel.xpath( '//script[@language="JavaScript"][contains(text(),"var separator")]' ).extract()[0] combosArray = re.findall(r'id=.*name', combosArray) combosArray = [ w.replace("id='size_", "").replace("name", "").replace( '"', "").replace("'", "").replace(" ", "") for w in combosArray ] priceArray = sel.xpath( '//script[@language="JavaScript"][contains(text(),"var separator")]' ).extract()[0] priceArray = re.findall(r'value.*/', priceArray) priceArray = [ w.replace("/", "").replace("value=", "").replace("'", "") for w in priceArray ] item["Price"] = dict(zip(combosArray, priceArray)) #print pricedict notfound = 0 for key, price in item['Price'].iteritems(): if key not in item['variant']: notfound = 1 break if notfound == 0: mywriter.writerow(row) for key, price in item["Price"].iteritems(): row = ("Rule", item['variant'][key], "", "", "", "", pcode + key, "", "SWIMOUTLET", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "http://www.swimoutlet.com/photos/options/" + pcode + "-" + key.split("_")[0] + "-zoomin.jpg") mywriter.writerow(row) row1 = ("SKU", item['variant'][key], "", "", "", "", pcode + key, "", "SWIMOUTLET", "", "100", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "http://www.swimoutlet.com/photos/options/" + pcode + "-" + key.split("_")[0] + "-zoomin.jpg") mywriter.writerow(row1)
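# The size/color handling above rebuilds a JSON object out of the inline JavaScript with
# a long chain of str.replace calls, which is easy to break. A hedged alternative sketch:
# pull the ['code', 'label'] pairs straight out of the script with a regex and build the
# dict directly. The script shape (arraySize[0] = ['8', 'Size 8'];) is an assumption
# inferred from the replacements above, not confirmed against the live page.
import re

OPTION_PAIR_RE = re.compile(r"array(?:Size|Color)\[\d+\]\s*=\s*\['([^']*)'\s*,\s*'([^']*)'\]")

def parse_option_array(script_text):
    """Return {'8': 'Size 8', ...} from an inline arraySize/arrayColor script block."""
    return dict(OPTION_PAIR_RE.findall(script_text))

# Example:
# parse_option_array("arraySize = new Array(); arraySize[0] = ['8', 'Size 8'];")
# -> {'8': 'Size 8'}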
class TabelogSpider(CrawlSpider): name = 'tabebot' allowed_domains = ['tabelog.com'] download_delay = 1.0 prefectures = [ 'hokkaido', 'aomori', 'iwate', 'miyagi', 'akita', 'yamagata', 'fukushima', 'ibaraki', 'tochigi', 'gunma', 'saitama', 'chiba', 'tokyo', 'kanagawa', 'niigata', 'toyama', 'ishikawa', 'fukui', 'yamanashi', 'nagano', 'gifu', 'shizuoka', 'aichi', 'mie', 'shiga', 'kyoto', 'osaka', 'hyogo', 'nara', 'wakayama', 'tottori', 'shimane', 'okayama', 'hiroshima', 'yamaguchi', 'tokushima', 'kagawa', 'ehime', 'kochi', 'fukuoka', 'saga', 'nagasaki', 'kumamoto', 'oita', 'miyazaki', 'kagoshima', 'okinawa', ] categories = [ 'japanese', # 日本料理 'RC0102', # 寿司・魚介類 'RC0103', # 天ぷら・揚げ物 'RC0104', # そば・うどん・麺類 'RC0105', # うなぎ・どじょう 'RC0106', # 焼鳥・串焼・鳥料理 'RC0107', # すき焼き・しゃぶしゃぶ 'RC0108', # おでん 'RC0109', # お好み焼き・たこ焼き 'RC0110', # 郷土料理 'RC0111', # 丼もの 'RC0199', # 和食(その他) 'RC0201', # ステーキ・ハンバーグ 'RC0203', # 鉄板焼き 'RC0202', # パスタ・ピザ 'hamburger', # ハンバーガー 'RC0209', # 洋食・欧風料理 'french', # フレンチ 'italian', # イタリアン 'RC0219', # 西洋各国料理 'RC0301', # 中華料理 'RC0302', # 餃子・肉まん 'RC0303', # 中華粥 'RC0304', # 中華麺 'korea', # 韓国料理 'RC0402', # 東南アジア料理 'RC0403', # 南アジア料理 'RC0404', # 西アジア料理 'RC0411', # 中南米料理 'RC0412', # アフリカ料理 'RC0499', # アジア・エスニック(その他) 'RC1201', # カレーライス 'RC1202', # 欧風カレー 'RC1203', # インドカレー 'RC1204', # タイカレー 'RC1205', # スープカレー 'RC1299', # カレー(その他) 'RC1301', # 焼肉・ホルモン 'RC1302', # ジンギスカン 'nabe', # 鍋 'izakaya', # 居酒屋 'RC2102', # ダイニングバー 'RC2199', # 居酒屋・ダイニングバー(その他) 'RC9901', # 定食・食堂 'RC9902', # 創作料理・無国籍料理 'RC9903', # 自然食・薬膳 'RC9904', # 弁当・おにぎり 'RC9999', # レストラン(その他) 'ramen', # ラーメン 'MC11', # つけ麺 'SC0101', # パン 'SC0201', # 洋菓子 'SC0202', # 和菓子・甘味処 'SC0203', # 中華菓子 'SC0299', # スイーツ(その他) ] start_urls = [ 'http://tabelog.com/{0}/rstLst/{1}/?SrtT=rt&Srt=D'.format( prefecture, category) for prefecture in prefectures for category in categories ] rules = [ # Follow business list pagination Rule(LxmlLinkExtractor(allow=(r'[a-z]+/rstLst/RC\d+/\d+/\?.*', ), deny=(r's.tabelog.com')), follow=True), # Extract business Rule(LxmlLinkExtractor(allow=(r'[a-z]+/A\d{4}/A\d{6}/\d+/$', ), deny=(r's.tabelog.com')), callback='parse_business'), # Follow review list pagination (first page) Rule(LxmlLinkExtractor( allow=(r'[a-z]+/A\d{4}/A\d{6}/\d+/dtlrvwlst/$', ), deny=(r's.tabelog.com')), follow=True), # COND-0 すべての口コミ # COND-1 夜の口コミ # COND-2 昼の口コミ # smp0 簡易リスト # smp1 通常 # smp2 全文 # Follow review list pagination and extract reviews Rule(LxmlLinkExtractor( allow=(r'[a-z]+/A\d{4}/A\d{6}/\d+/dtlrvwlst/COND-0/smp2/\?.+', ), deny=(r'favorite_rvwr', r's.tabelog.com')), follow=True, callback='parse_reviews_and_users'), ] def is_tabelog(self, response): selector = Selector(response) return bool(selector.xpath("//img[@id='tabelogo']")) def parse_reviews_and_users(self, response): if not self.is_tabelog(response): return Request(url=response.url, dont_filter=True) dom = PyQuery(response.body) review_nodes = dom('div.rvw-item') business_id = int( re.findall(r'[a-z]+/A\d{4}/A\d{6}/(\d+)/dtlrvwlst/', response.url)[0]) reviews_and_users = [] for review_node in review_nodes: user_id = self._extract_user_id(review_node) review = self._generate_review(review_node, business_id, user_id) if review: reviews_and_users.append(review) user = self._generate_user(review_node, user_id) if user: reviews_and_users.append(user) return reviews_and_users def _extract_user_id(self, review_node): user_link = review_node.cssselect( '.rvw-item__rvwr-name > a:first-child') if user_link: url = user_link[0].attrib['href'] return re.findall(r'rvwr/(.+)/', url)[0] def 
_generate_review(self, review_node, business_id, user_id): review = ReviewItem() review['review_id'] = int(review_node.getchildren()[0].attrib['name']) review['business_id'] = business_id set_value_if_true(review, 'user_id', user_id) review['visit'] = review_node.cssselect( '.rvw-item__visit-month-num')[0].text review['text'] = [ sentence for sentence in review_node.cssselect( 'div.rvw-item__rvw-comment > p')[0].itertext() ] review['title'] = review_node.cssselect( 'p.rvw-item__rvw-title')[0].text_content().strip() for meal in ['dinner', 'lunch']: css = 'span.rvw-item__usedprice-icon--{0}'.format(meal) review['price_{0}'.format(meal)] = review_node.cssselect(css)[0] \ .getnext().text_content() set_value_if_true(review, 'stars_{0}'.format(meal), self._extract_stars(review_node, meal)) review['situations'] = self._extract_situations(review_node) return review def _extract_stars(self, review_node, meal): lis = review_node.cssselect( 'li.rvw-item__ratings-item--{0}'.format(meal)) if not lis: return stars = {} li = lis[0] stars['total'] = convert_to_float_if_float( li.cssselect('strong.rvw-item__ratings-total-score')[0].text) lis = li.cssselect('ul.rvw-item__ratings-dtlscore > li') for li, criterion in zip( lis, ['taste', 'service', 'ambience', 'cp', 'drink']): score = li.cssselect( 'strong.rvw-item__ratings-dtlscore-score')[0].text stars[criterion] = convert_to_float_if_float(score) return stars def _extract_situations(self, review_node): imgs = review_node.cssselect('p.rvw-item__situation > img') situations = [] for img, situation in zip( imgs, ['friends', 'date', 'settai', 'party', 'family', 'alone']): if not img.attrib['src'].endswith('_g.gif'): situations.append(situation) return situations def _generate_user(self, review_node, user_id): user = UserItem() user['user_id'] = user_id user['name'] = review_node.cssselect( '.rvw-item__rvwr-name > a > span')[0].text.strip() counts = review_node.cssselect('.rvw-item__rvwr-rvwcount') if counts: count = counts[0].text count_candidates = re.findall(r'\d+', count) if count_candidates: user['review_count'] = int(count_candidates[0]) profile = review_node.cssselect('.rvw-item__rvwr-profile') if profile: user['profile'] = profile[0].text_content().strip() user['verified'] = bool(review_node.cssselect('.mark-auth-mobile')) return user def parse_business(self, response): if not self.is_tabelog(response): return Request(url=response.url, dont_filter=True) selector = Selector(response) business = BusinessItem() business['business_id'] = int( re.findall(r'[a-z]+/A\d{4}/A\d{6}/(\d+)/', response.url)[0]) business['name'] = selector.xpath( "//span[@class='display-name']/text()")[0].extract().strip() business['categories'] = selector.xpath( "//span[@property='v:category']/text()").extract() stars = selector.xpath( "//span[@property='v:average']/text()")[0].extract().strip() business['stars'] = convert_to_float_if_float(stars) for meal in ['dinner', 'lunch']: price = selector.xpath( "//dt[@class='budget-{0}']/following-sibling::dd/em/a/text()". 
format(meal)).extract() if price: business['price_{0}'.format(meal)] = price[0] stars = selector.xpath( "//div[@class='score-s']/span[@class='{0}']/following-sibling::em/text()" .format(meal))[0].extract() business['stars_{0}'.format(meal)] = convert_to_float_if_float( stars) review_count = selector.xpath( "//em[@property='v:count']/text()")[0].extract() business['review_count'] = convert_to_int_if_int(review_count) business['prefecture'] = selector.xpath( "//p[@class='pref']/a/text()")[0].extract().strip() business['area'] = re.findall(r'[a-z]+/(A\d{4})/A\d{6}/\d+/', response.url)[0] business['subarea'] = re.findall(r'[a-z]+/A\d{4}/(A\d{6})/\d+/', response.url)[0] # business['menu_items'] = self._generate_menu_items(response) return business def _generate_menu_items(self, response): # TODO: implement me pass
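# TabelogSpider leans on a few small helpers (convert_to_float_if_float,
# convert_to_int_if_int, set_value_if_true) that are defined elsewhere in the project.
# A minimal sketch of what they presumably do, based only on how they are called above;
# the real implementations may differ.
def convert_to_float_if_float(value):
    try:
        return float(value)
    except (TypeError, ValueError):
        return value

def convert_to_int_if_int(value):
    try:
        return int(str(value).replace(',', ''))
    except (TypeError, ValueError):
        return value

def set_value_if_true(item, key, value):
    # Only populate the field when the value is truthy, so optional fields stay unset.
    if value:
        item[key] = value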
class FunTVSpider(CrawlSpider):
    """
    Funshion (fun.tv) TV-series crawler.
    """
    name = 'fun_tv'
    allowed_domains = ['fun.tv', ]
    start_urls = [
        'http://www.fun.tv/retrieve/c-e794b5e8a786e589a7.n-e5bdb1e78987.pg-1'
    ]
    rules = [
        Rule(sle(allow=('/retrieve/c-e794b5e8a786e589a7.n-e5bdb1e78987.pg-\d+$', )),
             follow=True,
             callback='parse1'),
    ]

    def parse1(self, response):
        sel = Selector(response)
        tv_list = sel.css('body div.mod-list.page-wrap div div.mod-wrap-in.mod-vd-lay.fix div.mod-vd-i')
        for tv in tv_list:
            tv_id = tv.css('div.info h3 a::attr(data-id)').extract()[0]
            # Skip entries that are already in the database
            if db_session.query(FunVideo).filter(FunVideo.id == tv_id).first():
                continue
            name = tv.css('div.info h3 a::attr(title)').extract()[0]
            image = tv.css('div.pic a img::attr(_lazysrc)').extract()[0]
            description = tv.css('div.info p::text').extract()[0]
            point = tv.css('div.info h3 b::text').extract()[0]
            request = Request('http://www.fun.tv{}'.format(tv.css('div.pic a::attr(href)').extract()[0]),
                              callback=self.parse2)
            fv = FunVideo(id=tv_id,
                          name=name,
                          name_pinyin=pinyin.get_initials(name, splitter=''),
                          image=image,
                          description=description,
                          point=point)
            request.meta['tv'] = fv
            yield request

    def parse2(self, response):
        tv = response.meta['tv']
        sel = Selector(response)
        tv.origin_url = response.url
        tv.director = ''.join(sel.css('div#main-rt div.mod-datum p:nth-child(2) span::text').extract())
        tv.starring = ''.join(sel.css('div#main-rt div.mod-datum p:nth-child(3) span::text').extract())
        tv.category = ''.join(sel.css('div#main-rt div.mod-datum p:nth-child(4) span::text').extract())
        tv.detail = sel.css('div#main-rt div.mod-datum p.dirtext span:nth-child(2)::text').extract()[0]
        print tv.name, '------->', tv.origin_url
        # type 1 marks this record as a TV series
        tv.type = 1
        db_session.add(tv)
        db_session.commit()
        sub_tv_list = sel.css('div#playCont div div div div.torrent-panel ul li')
        for st in sub_tv_list:
            try:
                st.css('a span').extract()[0]
            except IndexError:
                sub_tv_index = st.css('::attr(data-idx)').extract()[0]
            else:
                continue
            sub_tv = SubFunViedo(fv_id=tv.id, index=sub_tv_index)
            sub_tv.id = st.css('::attr(data-vid)').extract()[0]
            sub_tv.origin_url = 'http://www.fun.tv{}'.format(st.css('a::attr(href)').extract()[0])
            print sub_tv.index, '-------->', sub_tv.origin_url
            request1 = Request(sub_tv.origin_url, callback=self.parse3)
            request1.meta['sub_tv'] = sub_tv
            yield request1

    def parse3(self, response):
        print 'parse 3 ------->'
        sub_tv = response.meta['sub_tv']
        sel = Selector(response)
        play_count = sel.css('div.playInfo.crumbs div.rightBtn.fix a::text').extract()[0]
        sub_tv.play_count = ''.join(play_count[3:].split(','))
        db_session.add(sub_tv)
        db_session.commit()
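# The hand-off pattern used by parse1/parse2/parse3 above, in isolation: build a partial
# record in one callback, attach it to the follow-up Request via meta, and complete it in
# the next callback. This sketch uses newer Scrapy conveniences (response.urljoin, dict
# items); the field names and start URL are placeholders, not the real FunVideo columns.
import scrapy

class MetaHandoffSketch(scrapy.Spider):
    name = 'meta_handoff_sketch'
    start_urls = ['http://www.fun.tv/']

    def parse(self, response):
        for href in response.css('div.mod-vd-i div.pic a::attr(href)').extract():
            partial = {'list_url': response.url, 'detail_path': href}
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_detail,
                                 meta={'partial': partial})

    def parse_detail(self, response):
        partial = response.meta['partial']
        partial['detail_url'] = response.url
        yield partial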
class mensrunning(CrawlSpider): name = "roadrunner" allowed_domains = ["roadrunnersports.com", "roadrunnersports.scene7.com"] start_urls = ["http://www.roadrunnersports.com/rrs/products/BRK1078/mens-brooks-beast-12/" #"http://www.roadrunnersports.com/rrs/products/ASC1724/mens-asics-gelkayano-21/", #"http://www.roadrunnersports.com/rrs/products/ASC1726/" ## "http://www.roadrunnersports.com/rrs/mensshoes/?SearchHandle=QT1tZW5zc2hvZXMgbWVudXJyc35CPW1lbnNzaG9lcyBtZW51cnJzfkQ9MjR_RT0wXjFeMl5Qcmlvcml0eTJ_ST1Tb3J0VklQUHJpY2V_Sz00fkw9MX5NPTQ0fg&Action=2&AnswerID=2094&searchQuery=mensshoes%20menurrs", ## "http://www.roadrunnersports.com/rrs/mensshoes/?SearchHandle=QT1tZW5zc2hvZXMgbWVudXJyc35CPW1lbnNzaG9lcyBtZW51cnJzfkQ9MjR_RT0wXjFeMl5Qcmlvcml0eTJ_ST1Tb3J0VklQUHJpY2V_Sz00fkw9MX5NPTQ0fg&Action=2&AnswerID=1482&searchQuery=mensshoes%20menurrs", ## "http://www.roadrunnersports.com/rrs/mensshoes/?SearchHandle=QT1tZW5zc2hvZXMgbWVudXJyc35CPW1lbnNzaG9lcyBtZW51cnJzfkQ9MjR_RT0wXjFeMl5Qcmlvcml0eTJ_ST1Tb3J0VklQUHJpY2V_Sz00fkw9MX5NPTQ0fg&Action=2&AnswerID=2715&searchQuery=mensshoes%20menurrs", ## "http://www.roadrunnersports.com/rrs/c/track-spikes/", ## "http://www.roadrunnersports.com/rrs/mensshoes/?SearchHandle=QT1tZW5zc2hvZXMgbWVudXJyc35CPW1lbnNzaG9lcyBtZW51cnJzfkQ9MjR_RT0wXjFeMl5Qcmlvcml0eTJ_ST1Tb3J0VklQUHJpY2V_Sz00fkw9MX5NPTQ0fg&Action=2&AnswerID=128&searchQuery=mensshoes%20menurrs", ## "http://www.roadrunnersports.com/rrs/mensshoes/?SearchHandle=QT1tZW5zc2hvZXMgbWVudXJyc35CPW1lbnNzaG9lcyBtZW51cnJzfkQ9MjR_RT0wXjFeMl5Qcmlvcml0eTJ_ST1Tb3J0VklQUHJpY2V_Sz00fkw9MX5NPTQ0fg&Action=2&AnswerID=3555&searchQuery=mensshoes%20menurrs", ## "http://www.roadrunnersports.com/rrs/mensshoes/?SearchHandle=QT1tZW5zc2hvZXMgbWVudXJyc35CPW1lbnNzaG9lcyBtZW51cnJzfkQ9MjR_RT0wXjFeMl5Qcmlvcml0eTJ_ST1Tb3J0VklQUHJpY2V_Sz00fkw9MX5NPTQ0fg&Action=2&AnswerID=2547&searchQuery=mensshoes%20menurrs", ## "http://www.roadrunnersports.com/rrs/womensshoes/?SearchHandle=QT13b21lbnNzaG9lcyBtZW51cnJzfkI9d29tZW5zc2hvZXMgbWVudXJyc35EPTI0fkU9MF4xXjJeUHJpb3JpdHkyfkk9U29ydFZJUFByaWNlfks9NH5MPTF_TT00Nn4&Action=2&AnswerID=2094&searchQuery=womensshoes%20menurrs", ## "http://www.roadrunnersports.com/rrs/womensshoes/?SearchHandle=QT13b21lbnNzaG9lcyBtZW51cnJzfkI9d29tZW5zc2hvZXMgbWVudXJyc35EPTI0fkU9MF4xXjJeUHJpb3JpdHkyfkk9U29ydFZJUFByaWNlfks9NH5MPTF_TT00Nn4&Action=2&AnswerID=128&searchQuery=womensshoes%20menurrs", ## "http://www.roadrunnersports.com/rrs/womensshoes/?SearchHandle=QT13b21lbnNzaG9lcyBtZW51cnJzfkI9d29tZW5zc2hvZXMgbWVudXJyc35EPTI0fkU9MF4xXjJeUHJpb3JpdHkyfkk9U29ydFZJUFByaWNlfks9NH5MPTF_TT00Nn4&Action=2&AnswerID=1482&searchQuery=womensshoes%20menurrs", ## "http://www.roadrunnersports.com/rrs/womensshoes/?SearchHandle=QT13b21lbnNzaG9lcyBtZW51cnJzfkI9d29tZW5zc2hvZXMgbWVudXJyc35EPTI0fkU9MF4xXjJeUHJpb3JpdHkyfkk9U29ydFZJUFByaWNlfks9NH5MPTF_TT00Nn4&Action=2&AnswerID=2715&searchQuery=womensshoes%20menurrs", ## "http://www.roadrunnersports.com/rrs/womensshoes/?SearchHandle=QT13b21lbnNzaG9lcyBtZW51cnJzfkI9d29tZW5zc2hvZXMgbWVudXJyc35EPTI0fkU9MF4xXjJeUHJpb3JpdHkyfkk9U29ydFZJUFByaWNlfks9NH5MPTF_TT00Nn4&Action=2&AnswerID=2905&searchQuery=womensshoes%20menurrs", ## "http://www.roadrunnersports.com/rrs/womensshoes/?SearchHandle=QT13b21lbnNzaG9lcyBtZW51cnJzfkI9d29tZW5zc2hvZXMgbWVudXJyc35EPTI0fkU9MF4xXjJeUHJpb3JpdHkyfkk9U29ydFZJUFByaWNlfks9NH5MPTF_TT00Nn4&Action=2&AnswerID=2547&searchQuery=womensshoes%20menurrs" ] rules = (Rule (SgmlLinkExtractor(allow=(),restrict_xpaths=('//td[@id="paging_count"]',)), follow= True), Rule 
(SgmlLinkExtractor(restrict_xpaths=('//div[@class="product_colorways_image"]',)), callback="parse_item", follow= True),) csvfile = None printHeader = True def to_csv(self, item): start = 0 end=0 if self.printHeader: self.csvfile = open('RoadRunnerSportsRunningShoes.csv','w') if self.csvfile: strWrite = '' #headers if self.printHeader: strWrite +='Item Type,Product ID,Product Name,Brand Name,Price,Retail Price,Sale Price,Product Description,Product Code/SKU,Bin Picking Number,' strWrite +='Category,Option Set,Product Availability,Current Stock Level,Free Shipping,Sort Order, Meta Description,Page Title, Product Image Description - 1,Product Image Is Thumbnail - 1,' strWrite +='Track Inventory,Product Image Sort - 1,Product Image Sort - 2,Product Image Sort - 3,Product Image Sort - 4,Product Image Sort-5,Product Image Sort-6,Product Image Sort-7,' strWrite +='Product Image File - 1,Product Image File - 2,Product Image File - 3,Product Image File - 4,Product Image File - 5 ,Product Image File - 6,Product Image File - 7, \n' self.printHeader = False pfound = 0 #counter to find product from master sheet. If 0 after the loop, product is a NEW product and not uploaded previously productid = 0 #Storing the Product ID value for the product row for color,sizes in item['variant'][item['sku']].iteritems(): for i in range(len(namelist)): #Loop to go through all the Item Types in old file. if typelist[i] == "Product" and skulist[i] == (item['sku']+item['color'][color]): #if typelist[i] == "Product" and namelist[i] == (item['Product_Name']+" " + item['color'][color]+"*"): #Comparing Product Names from old sheet and new scrapped start = i # Counter to store index of found Product Name pfound = 1 productid = idlist[i] for r in range(i+1,len(namelist)): #Loop to start at the Counter and Look for next occurance of Item Type = "Product" if typelist[r] == "Product" : break #Loop breaks for next occurance of Item Type = "Product" else: end = end+1 #Counting the number of SKUS for each product from the OLD sheet print "#",pfound #not Found Products if pfound ==0: if item["Brand_Name"] not in("Nike","adidas","Reebok","Puma"): for color,sizes in item['variant'][item['sku']].iteritems(): # generate product row strWrite += 'Product,,'+item['Product_Name']+" " + item['color'][color]+"*"+','+item['Brand_Name']+','+item['Retail_Price']+','+item['Retail_Price']+','+item['Sale_Price']+',' strWrite += '.'.join(item["Product_Description"]).replace(',',';').replace("\n","").replace("\r","").replace('<.*?>)',"").replace("When choosing running shoes, find your perfect fit using this chart. Category types are based on factors like arch height, running habits and body frame.","").replace("This web exclusive item ships separately within the continental U.S. only. 
You can count on this item to ship in 3-5 business days!","") + "," strWrite += item['sku']+item['color'][color]+ ',' + "ROADRUNNER" +',' strWrite += item['Category'] + ',' + item['Product_Name']+item['color'][color] + ',' + item["Product_Availability"] +',' strWrite += item["Current_Stock"] + ',' + item["Free_Shipping"] + ',' + item["Sort_Order"] + "," + "Buy the " + item['Product_Name']+" " + item['color'][color] + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" strWrite += ',' + "Buy the " + item['Product_Name']+" " + item['color'][color]+ " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" + ',' strWrite += "Buy the " + item['Product_Name']+" " + item['color'][color]+ " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" + ',' + item["Product_Image_Is_Thumbnail_1"] + ',' + item["Track_Inventory"] + ',' strWrite += item["Product_Image_Sort_1"] + ',' + item["Product_Image_Sort_2"] + ',' + item["Product_Image_Sort_3"] + ',' strWrite += item["Product_Image_Sort_4"] + ',' + item["Product_Image_Sort_5"] + ',6,7,' strWrite += ','.join(item['Product_Image_File1'][color])+',\n' #strWrite += 'Product,'+item['productname']+','+item['sku']+','+item['color'][color]+',,,,'+','.join(item[''][color])+',\n' #only write availabe products to csv for width,sizeList in sizes.iteritems(): for size,sku in sizeList.iteritems(): strWrite += 'SKU,,[S]Size= US '+size+'.Width ='+width+',,,,,,'+sku+','+"ROADRUNNER,,,,100"+',\n' else: if item["Brand_Name"] not in("Nike","adidas","Reebok","Puma"): for color,sizes in item['variant'][item['sku']].iteritems(): print pfound # generate product row strWrite += 'Product,'+productid+","+item['Product_Name']+" " + item['color'][color]+"*"+','+item['Brand_Name']+','+item['Retail_Price']+','+item['Retail_Price']+','+item['Sale_Price']+',' strWrite += '.'.join(item["Product_Description"]).replace(',',';').replace("\n","").replace("\r","").replace('<.*?>)',"").replace("When choosing running shoes, find your perfect fit using this chart. Category types are based on factors like arch height, running habits and body frame.","").replace("This web exclusive item ships separately within the continental U.S. only. 
You can count on this item to ship in 3-5 business days!","") + "," strWrite += item['sku']+item['color'][color]+ ',' + "ROADRUNNER" +',' strWrite += item['Category'] + ',' + item['Product_Name']+item['color'][color] + ',' + item["Product_Availability"] +',' strWrite += item["Current_Stock"] + ',' + item["Free_Shipping"] + ',' + item["Sort_Order"] + "," + "Buy the " + item['Product_Name']+" " + item['color'][color] + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" strWrite += ',' + "Buy the " + item['Product_Name']+" " + item['color'][color]+ " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" + ',' strWrite += "Buy the " + item['Product_Name']+" " + item['color'][color]+ " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" + ',' + item["Product_Image_Is_Thumbnail_1"] + ',' + item["Track_Inventory"] + ',' strWrite += item["Product_Image_Sort_1"] + ',' + item["Product_Image_Sort_2"] + ',' + item["Product_Image_Sort_3"] + ',' strWrite += item["Product_Image_Sort_4"] + ',' + item["Product_Image_Sort_5"] + ',6,7,' strWrite += ','.join(item['Product_Image_File1'][color])+',\n' #VARIANT PRINTING SECTION old_dict = {} #Dictionary to contain old SKUs and Sizes oldlen = 0 for i in range(start+1,start+1+end): #Storing all list of SKUS in a new list. Will be used for comparing with the new list old_dict[0,oldlen]=skulist[i] old_dict[1,oldlen]= sizelist[i] old_dict[2,oldlen]= idlist[i] oldlen = oldlen+1 new_dict = {} #Dictionary to contain new SKUs and Sizes c=0 for width,sizeList in sizes.iteritems(): for size,sku in sizeList.iteritems(): new_dict[0,c] = sku new_dict[1,c] = 'SKU,,[S]Size= US '+size+'.Width ='+width c= c+1 diff_dict = {} #Dict which contains older skus r=0 for i in range(oldlen): found = 0 for x in range(c): if old_dict[0,i] == new_dict[0,x]: found = 1 break if found ==0: diff_dict[0,r] = old_dict[0,i] diff_dict[1,r] = old_dict[1,i] diff_dict[2,r] = old_dict[2,i] r=r+1 for width,sizeList in sizes.iteritems(): t=0 for size,sku in sizeList.iteritems(): if sku == old_dict[0,i]: strWrite += 'SKU,'+old_dict[2,i]+',[S]Size= US '+size+'.Width ='+width+',,,,,,'+sku+','+"ROADRUNNER,,,,100"+',\n' t = 1 if t==0:# For SKUS which are new and hence will not have a product ID strWrite += 'SKU,,[S]Size= US '+size+'.Width ='+width+',,,,,,'+sku+','+"ROADRUNNER,,,,100"+',\n' if diff_dict: for i in range (r): strWrite += 'SKU,'+diff_dict[2,i]+','+diff_dict[1,i] +',,,,,,'+diff_dict[0,i] + ',' +'ROADRUNNER,,,,0,,,,,,,,,,,\n' self.csvfile.write(strWrite.encode('utf8')) #def parse_item(self, response): def parse(self, response): sel = Selector(response) url = 'http://www.roadrunnersports.com/rrs/product-detail/build-selections.jsp' item = BigCItem() pname = response.xpath("//meta[@property='og:title']/@content").extract()[0] item ["Product_Name"] = response.xpath("//meta[@property='og:title']/@content").extract()[0] if "Trail" in pname : item ["Product_Name"] = response.xpath("//meta[@property='og:title']/@content").extract()[0] + " Running Shoe" mrp = float(sel.xpath("//span[@class='prod_detail_reg_price']/span/text()").extract()[0]) item ["Retail_Price"] = str((mrp*65 + mrp*30/100*70/100*65)*112.5/100 + mrp*65*15/100) item_sp = response.xpath("//span[@class='prod_detail_sale_price']/span/text()").extract() if item_sp: sp = float(sel.xpath("//span[@class='prod_detail_sale_price']/span/text()").extract()[0].split("-")[-1].replace("$","")) item ["Sale_Price"] = str((sp*65 + 30/100*70*65)*112.5/100 + sp*65*15/100) else: item 
["Sale_Price"] = '' #categorization cat = response.xpath("//div[@id='grp_1']/p/span[1]/text()") sex = response.xpath("//meta[@property='og:title']/@content").extract()[0] if sex in("Women's"): sex= "Women's" else: sex= "Men's" item["Product_Description"] = response.xpath("//div[@id='grp_1']/p").extract() + response.xpath("//div[@id='grp_1']/ul/li").extract() if cat: # item ["Category"] = "Run & Cycle/Running/Running Shoes;Shoes/"+ sex + " Running Shoes/" + sel.xpath("//div[@id='grp_1']/p/span[1]/text()").extract()[0].replace("+","") cat= ";Shoes/"+sex+" Running Shoes/"+response.xpath("//div[@id='grp_1']/p/span[1]/text()").extract()[0].replace("+","") +" Running Shoes" item ["Product_Name"] = response.xpath("//meta[@property='og:title']/@content").extract()[0] + " " + response.xpath("//div[@id='grp_1']/p/span[1]/text()").extract()[0] + " Running Shoe" else: cat= "" if any("hiking" in s for s in item["Product_Description"]) or any("Hiking" in s for s in item["Product_Description"]): item ["Category"] = "Run & Cycle/Running/Running Shoes;Shoes/"+ sex + " Shoes/Hiking Shoes" + cat elif any("trail" in s for s in item["Product_Description"]) or any("Trail" in s for s in item["Product_Description"]): item ["Category"] = "Run & Cycle/Running/Running Shoes;Shoes/"+ sex + " Running Shoes/Trail Running Shoes" + cat elif any("minimalist" in s for s in item["Product_Description"]) or any("barefoot" in s for s in item["Product_Description"]) or any("Barefoot" in s for s in item["Product_Description"]): item ["Category"] = "Run & Cycle/Running/Running Shoes;Shoes/"+ sex + " Running Shoes/Barefoot Running Shoes" + cat elif any("spike" in s for s in item["Product_Description"]): item ["Category"] = "Run & Cycle/Running/Running Shoes;Shoes/"+ sex + " Running Shoes/Racing Spikes" + cat elif any("cross-train" in s for s in item["Product_Description"])or any("trainer" in s for s in item["Product_Description"])or any("training shoe" in s for s in item["Product_Description"]) or any("gym" in s for s in item["Product_Description"]) or any("workout" in s for s in item["Product_Description"]): item ["Category"] = "Run & Cycle/Running/Running Shoes;Shoes/"+ sex + " Shoes/Cross Training Shoes" + cat else: if cat: item ["Category"] = "Run & Cycle/Running/Running Shoes"+ cat else: item ["Category"] = "NULL" item ["Brand_Name"] = response.xpath("//span[@itemprop='brand']/text()").extract()[0] if item["Brand_Name"] in ("Asics","Mizuno","Brooks","Saucony","New Balance"): item ["Sort_Order"] = str(-300-(20/100*mrp)) elif item["Brand_Name"] in ("Under Armour","Altra","Hoka One One","Inov8","Salomon","Vibram FiveFingers"): item ["Sort_Order"] = str(-270-(20/100*mrp)) else : item ["Sort_Order"] = str(-250-(20/100*mrp)) item["Product_Availability"] = "12-17 Working Days" item["Current_Stock"] = "100" item ["Free_Shipping"] = "N" item["Product_Image_Is_Thumbnail_1"] = "Y" item["Track_Inventory"] = "By Option" item["Product_Image_Sort_1"] = "1" item["Product_Image_Sort_2"] = "2" item["Product_Image_Sort_3"] = "3" item["Product_Image_Sort_4"] = "4" item["Product_Image_Sort_5"] = "5" item ["imageSetUrls"] = {} item ["imageSetUrls2"] = {} colors = response.xpath("//a[@class='ref2QIColor']/@name").extract() item ["Product_Image_File1"] = {} hrefs = response.xpath("//a[@class='ref2QIColor']/@href").extract() item ["color"] = {} for idx,href in enumerate(hrefs): #create links to image sets if colors[idx] not in item ["imageSetUrls"]: item ["imageSetUrls"][colors[idx]] = [] item 
["imageSetUrls"][colors[idx]].append("http://roadrunnersports.scene7.com/is/image/roadrunnersports/"+href.split('/')[-1].split('_')[0]+"-IS?req=set,json&scl=1") if colors[idx] not in item ["imageSetUrls2"]: item ["imageSetUrls2"][colors[idx]] = [] item ["imageSetUrls2"][colors[idx]].append("http://roadrunnersports.scene7.com/is/image/roadrunnersports/"+href.split('/')[-1].split('_')[0]+"-IS?req=set,json&scl=1") item ["color"][href.split('/')[-1].split('_')[0].split('-')[1]] = colors[idx] #request product info as json item ["sku"] = response.url.strip('/').split('/')[-2] payload = {'id':item ["sku"]} request = FormRequest(url,formdata=payload,callback=self.parseJsonProduct) request.meta['item'] = item return request #parse product info from json file def parseJsonProduct(self,response): item = response.meta['item'] #make a valid json file out of it and remove unneeded data prodResponse = response.body.split('$+$')[0].strip().replace("'",'"') prodDict = {} sizeWidthDict = {} jsonresponse = json.loads(prodResponse) for product,value in jsonresponse.iteritems(): if item["sku"] not in prodDict: prodDict[item["sku"]]={} if value['c'] not in prodDict[item["sku"]]: prodDict[item["sku"]][value['c']] ={} if value['w'] not in prodDict[item["sku"]][value['c']]: prodDict[item["sku"]][value['c']][value['w']]={} if value['s'] not in sizeWidthDict: sizeWidthDict[value['s']] = [] if value['w'] not in sizeWidthDict[value['s']]: sizeWidthDict[value['s']].append(value['w']) prodDict[item["sku"]][value['c']][value['w']][value['s']]=value['sku'] item['variant'] = prodDict item['size_width_list'] = sizeWidthDict #request first imageset if item["imageSetUrls"]: color,href = item["imageSetUrls"].popitem() if len(href)>1: item["imageSetUrls"][color] = href[1:] request = Request(href[0],callback=self.parseJsonImageSet) request.meta['item'] = item return request self.to_csv(item) return item def parseJsonImageSet(self,response): item = response.meta['item'] imageSetResponse = response.body #make a valid json file out of it, if only one image available it was a list => make a dict imageSetResponse = imageSetResponse.replace('/*jsonp*/s7jsonResponse(','') imageSetResponse = ','.join(imageSetResponse.split(',')[:-1]) imageSetResponse = imageSetResponse.replace('"item":[','"item":') imageSetResponse = imageSetResponse.replace('"item":','"item":[') imageSetResponse = imageSetResponse.replace('}]}}','}}}') imageSetResponse = imageSetResponse[::-1].replace('}}}','}}]}')[::-1] color = response.url.split('-')[1].split('?')[0] isImageSet = False if len(response.url.split('-'))>2: isImageSet = True item['Product_Image_File1'][color] = [] jsonresponse = json.loads(imageSetResponse) for index,imageItem in enumerate(jsonresponse['set']['item']): #check if there is a image set or only one image if 'isDefault' not in imageItem['i']: imageUrl = 'http://roadrunnersports.scene7.com/is/image/'+imageItem['i']['n']+'?iv='+imageItem['iv'] #response url is image set => image can be scaled if isImageSet: imageUrl += '&scl=1' item['Product_Image_File1'][color].append(imageUrl) else: # there is no image set append request for default image if item['color'][color] not in item["imageSetUrls"]: item ["imageSetUrls"][item['color'][color]] = [] if item['color'][color] not in item["imageSetUrls2"]: item ["imageSetUrls2"][item['color'][color]] = [] item["imageSetUrls"][item['color'][color]].append('http://roadrunnersports.scene7.com/is/image/roadrunnersports/'+item['sku']+'-'+color+'?req=set,json&scl=1') 
item["imageSetUrls2"][item['color'][color]].append('http://roadrunnersports.scene7.com/is/image/roadrunnersports/'+item['sku']+'-'+color+'?req=set,json&scl=1') if item["imageSetUrls"]: color,href = item["imageSetUrls"].popitem() if len(href)>1: item["imageSetUrls"][color] = href[1:] request = Request(href[0],callback=self.parseJsonImageSet) request.meta['item'] = item return request self.to_csv(item) return item
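# parseJsonImageSet above unwraps the Scene7 response with a series of string edits. A
# hedged sketch of the same idea in one place: strip the s7jsonResponse(...) JSONP
# wrapper, drop the trailing request-id argument, and normalize "item" to a list. The
# wrapper shape (/*jsonp*/s7jsonResponse(<json>,"<requestId>");) is an assumption taken
# from the replacements in the spider, not from Scene7 documentation.
import json
import re

def parse_scene7_set(body):
    """Return the list of image entries from a Scene7 req=set,json response body."""
    inner = re.sub(r'^\s*(/\*jsonp\*/)?\s*s7jsonResponse\(', '', body.strip())
    inner = inner.rsplit(',', 1)[0]          # drop the trailing ,"requestId");
    data = json.loads(inner)
    items = data.get('set', {}).get('item', [])
    if isinstance(items, dict):              # a single image comes back as a dict
        items = [items]
    return items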
class FBSpider(CrawlSpider): name = "fb_kudryasheva" allowed_domains = ["facebook.com"] start_urls = [ '/alya.khaitlina.html', ] rules = ( Rule( LinkExtractor( allow=("facebook.com/alya.khaitlina/posts", "facebook.com/photo"), #allow=("facebook.com/TatyanaTolstaya", "facebook.com/photo"), restrict_xpaths='//a[@class="_5pcq"]'), callback='parse_page', follow=True), ) def __init__(self, category=None, *args, **kwargs): super(FBSpider, self).__init__(*args, **kwargs) self.driver = webdriver.Firefox() dispatcher.connect(self.spider_closed, signals.spider_closed) def spider_closed(self, spider): self.driver.close() def parse_page(self, response): # scrape dynamically generated HTML self.driver.get(response.url) hxs = Selector(text=self.driver.page_source) item = ScraperItem() # use scrapy shell to find xpath # from scrapy.shell import inspect_response # inspect_response(response) try: divs = hxs.xpath( '//div[@id="contentArea"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/descendant-or-self::*/text()' ).extract() text = u" ".join(divs[1:]) no_text = len(divs) == 0 except IndexError: no_text = True if no_text: try: text = " ".join( hxs.xpath( '//span[@class="hasCaption"]/child::node()').extract()) except IndexError: text = "" item['url'] = response.url item['text'] = text item['title'] = hxs.xpath('//title/text()').extract() item['date'] = hxs.xpath( '//span[@class="timestampContent"]/text()').extract() comments = float(hxs.xpath('count(//abbr)').extract()[0]) - 1 try: likes = hxs.xpath( '//div[@class="UFILikeSentenceText"]/span/span/text()' ).extract()[0] if "likes" in likes: like_count = 1.0 else: try: like_count = len(likes.split(", ")) if "others" in likes: like_count += float( likes.split("and ")[1].split(" others")[0].replace( ",", "")) elif "and" in likes: like_count += 1.0 except IndexError: like_count = 2.0 except IndexError: like_count = 0.0 # print "like count: "+str(like_count) try: shares = hxs.xpath( '//a[@class="UFIShareLink"]/text()').extract()[0] share_count = float(shares.split(" share")[0].replace(",", "")) except IndexError: share_count = 0.0 print like_count, share_count, comments item['comment_count'] = [like_count, share_count, comments] yield item
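# FBSpider registers its Selenium shutdown through scrapy.xlib.pydispatch, which newer
# Scrapy releases deprecate. A sketch of the equivalent wiring through from_crawler;
# behaviour is the same, only the signal registration differs. The spider name, start URL
# and yielded fields are placeholders.
from scrapy import signals
from scrapy.spiders import Spider
from selenium import webdriver

class SeleniumSpiderSketch(Spider):
    name = 'selenium_sketch'
    start_urls = ['https://www.facebook.com/']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(SeleniumSpiderSketch, cls).from_crawler(crawler, *args, **kwargs)
        spider.driver = webdriver.Firefox()
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        # quit() tears down the whole browser session, not just the current window
        self.driver.quit()

    def parse(self, response):
        self.driver.get(response.url)
        yield {'url': response.url, 'title': self.driver.title}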
class StockSpider(HistorySpider): name = 'stocks' allowed_domains = ['tase.co.il'] start_urls = [ 'http://www.tase.co.il/eng/marketdata/stocks/marketdata/Pages/MarketData.aspx' ] rules = ( Rule(SgmlLinkExtractor(allow=('MarketData\.aspx', )), callback='parse_company_list'), Rule(SgmlLinkExtractor(allow=('companyMainData\.aspx', )), callback='parse_company'), Rule(SgmlLinkExtractor(allow=('companyhistorydata\.aspx', )), callback='get_history_data'), Rule(SgmlLinkExtractor(allow=('companyDetails\.htm', )), callback='parse_company_details'), ) header = (('date_', tase.common.get_date), ('adjusted_closing_price', tase.common.to_float), ('closing_price', tase.common.to_float), ('change_', tase.common.to_float), ('opening_price', tase.common.to_float), ('base_price', tase.common.to_float), ('high', tase.common.to_float), ('low', tase.common.to_float), ('capital_listed_for_trading', tase.common.to_long), ('market_cap', tase.common.to_long), ('turnover', tase.common.to_long), ('volume', tase.common.to_long), ('trans', tase.common.to_long)) header2 = { 'Total Assets': ('total_assets', tase.common.to_long), 'Current Assets': ('current_assets', tase.common.to_long), 'Non Current Assets': ('non_current_assets', tase.common.to_long), 'Shareholders Equity': ('shareholders_equity', tase.common.to_long), 'Of which Minority Interest': ('of_which_minority_interest', tase.common.to_long), 'Current Liabilities': ('current_liabilities', tase.common.to_long), 'Long-Term Liabilities': ('long_term_liabilities', tase.common.to_long), #'Profit & Loss Statement' : ('', tase.common.to_long), 'Revenues': ('revenues', tase.common.to_long), 'Gross Profit': ('gross_profit', tase.common.to_long), 'Operating Income': ('operating_income', tase.common.to_long), 'Income Before Tax': ('income_before_tax', tase.common.to_long), 'Net Income': ('net_income', tase.common.to_long), 'Income Attributable to Shareholders': ('income_atributable_to_shareholders', tase.common.to_long), 'Earnings per Share': ('earnings_per_share', tase.common.to_float), #'Additional Data' : ('', tase.common.to_long), 'Dividends': ('dividends', tase.common.to_long), 'Net cash flow generated by operating activities': ('net_cash_flow_generated_by_operating_activities', tase.common.to_long), #'Financial Ratios' : ('', tase.common.to_long), 'Market to book value': ('market_to_book_value', tase.common.to_float), 'Price-Earning Ratio': ('price_earning_ratio', tase.common.to_float), 'Equity-Assets Ratio': ('equity_assets_ratio', tase.common.to_float), 'Return on Equity': ('return_on_equity', tase.common.to_float) } details_url = "http://www.tase.co.il/Eng/General/Company/Pages/companyDetails.aspx?subDataType=0&companyID={companyID}&shareID={shareID}" history_url = "http://www.tase.co.il/Eng/General/Company/Pages/companyHistoryData.aspx?subDataType=0&companyID={companyID}&shareID={shareID}&intPeriod={period}&intFrequency1=0&IsYield=False&IsDollar=False" def get_control_id(self): return "g_301c6a3d_c058_41d6_8169_6d26c5d97050" # Main companies list, with paging def parse_company_list(self, response): sel = Selector(response) fd = dict() inputs = sel.xpath("//input[@type='hidden']") for inpt in inputs: name = tase.common.get_string(inpt.xpath("@name").extract()) value = tase.common.get_string(inpt.xpath("@value").extract()) fd[name] = value #print fd #req_digest = sel.xpath("//input[@id='__REQUESTDIGEST']/@value").extract() #ev_val = sel.xpath("//input[@id='__EVENTVALIDATION']/@value").extract() links = sel.xpath("//tr[@class='pagerText']/td/a") for link in links: m 
= re.search("javascript:__doPostBack\('(.*?)'", link.extract()) if m: url = urllib.unquote(m.group(1)) fd['__EVENTTARGET'] = url #yield FormRequest(self.start_urls[0], method='POST', formdata={'__EVENTTARGET': url, '__EVENTARGUMENT': '', '__REQUESTDIGEST': req_digest, '__EVENTVALIDATION': ev_val}) #print "url: " + self.start_urls[0] yield FormRequest(self.start_urls[0], method='POST', formdata=fd) def parse_company(self, response): sel = Selector(response) item = TaseItem() item['category'] = category_comp item['tase_url'] = response.url item['date_'] = '' query = parse_qs(urlparse(response.url)[4]) # query try: item['CompanyID'] = query['CompanyID'][0] except KeyError: item['CompanyID'] = query['FundID'][0] try: item['ShareID'] = query['ShareID'][0] except KeyError: item['ShareID'] = query['FundID'][0] try: item['name'] = sel.xpath( "//td[@class='BigBlue']/text()").extract()[0] except IndexError: item['name'] = "" try: base_url = get_base_url(response) relative_url = sel.xpath( "//td[@rowspan='4']/img/@src").extract()[0] item['image_url'] = urljoin(base_url, relative_url) except IndexError: item['image_url'] = "" lst = sel.xpath( "//td[contains(child::text(), 'Symbol:')]/following-sibling::td[1]/table/tr/td[1]/text()" ).extract() if len(lst) > 0: item['symbol'] = lst[0] else: try: item['symbol'] = sel.xpath( "//td[contains(., 'Symbol:')]/following-sibling::td[1]/text()" ).extract()[0] except IndexError: item['symbol'] = item['ShareID'] href = sel.xpath('//tr[1]/td[1]/a[@target="_blank"]/@href').extract() url = href[0] o = urlparse(url) if len(o.netloc) > 0: item['url'] = url else: item['url'] = '' try: href = sel.xpath("//tr/td[@class='subtitle']/text()").extract() item['sector'] = tase.common.unescape( urllib.unquote(href[4].strip())) item['subsector'] = tase.common.unescape( urllib.unquote(href[3].strip())) except IndexError: item['sector'] = "" item['subsector'] = "" item['sector_int'] = 0 item['subsector_int'] = 0 if PROCESS_FINANCIAL_STATEMENTS: yield self.get_company_details(item) yield self.process_history(item) def get_company_details(self, item): url = self.details_url.format(shareID=item['ShareID'], companyID=item['CompanyID']) return Request(url, callback=self.parse_company_details, meta={'item': item}) def parse_company_details(self, response): item = response.request.meta['item'] sel = Selector(response) item['financial_statements'] = [] for i in range(3): fs = self.process_company_statement(sel, i) if not fs is None: item['financial_statements'].append(fs) return self.process_history(item) def process_company_statement(self, sel, index): table = sel.xpath( '//table[@id="ctl00_SPWebPartManager1_g_8e3d9f18_75c6_43cc_bc21_c3e7170427ca_ctl00_gridFinanceReport_DataGrid1"]' ) rows = table.xpath('tr') #[@class != "gridHeader"]') fs = FinancialStatement() start = True for row in rows: if start: columns = row.xpath('td[@class="titleGridReg"]/text()') if index >= len(columns): return None fs['period'] = columns[index].extract() start = False else: name = row.xpath('td/text()')[0].extract().strip() values = row.xpath('td/div/text()') if len(values) > 0: value = values[index].extract().strip() if not self.header2.get(name) is None: key = self.header2[name][0] func = self.header2[name][1] val = func(value) #self.log('Value: %s' % val) fs[key] = val return fs
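# parse_company_list above copies every hidden <input> into the postback form by hand.
# FormRequest.from_response can do that copying automatically; only __EVENTTARGET needs to
# be set. A hedged sketch of the same paging step (assumes the page carries a single
# ASP.NET <form>, and keeps the Python 2 urllib.unquote used elsewhere in these spiders).
import re
import urllib
from scrapy.http import FormRequest

def follow_postback_pages(response, callback):
    """Yield one FormRequest per __doPostBack pager link on an ASP.NET page."""
    for link in response.xpath("//tr[@class='pagerText']/td/a").extract():
        m = re.search(r"javascript:__doPostBack\('(.*?)'", link)
        if m:
            target = urllib.unquote(m.group(1))
            yield FormRequest.from_response(
                response,
                formdata={'__EVENTTARGET': target, '__EVENTARGUMENT': ''},
                callback=callback,
                dont_click=True)

# Usage inside the spider: for req in follow_postback_pages(response, self.parse_company_list): yield req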
class NewTaipeiCitySpider(CrawlSpider): name = 'new_taipei_city' allowed_domains = ['61.60.124.185'] rules = [ Rule(SgmlLinkExtractor(allow=('InfoAllList.asp\?a=c.*')), follow=True, callback='parse_list'), ] county_name = u'新北市' def __init__(self): self.start_urls = ['http://61.60.124.185/tpctempdig/InfoAllList.asp'] super(CrawlSpider, self).__init__() self._compile_rules() def parse_list(self, response): self.log('crawl: %s' % response.url) hxs = HtmlXPathSelector(response) # Get data records = hxs.select("//div[@class='tabs_content']//tr") for r in records: fields = r.select('.//td/text()').extract() if not fields: continue cfg.logger.debug('fields: %s', fields) data_dict = self._process_data_dict(fields, _columns) item = NewTaipeiCityItem() item['county_name'] = self.county_name item['the_category'] = 'kaohsiung_dig_point' item['the_idx'] = fields[6] ts = re.findall('(\d+)', fields[7]) item['start_timestamp'] = ts[0] item['end_timestamp'] = ts[1] item['the_data'] = data_dict item['start_timestamp'] = util.tw_date_to_timestamp( item['start_timestamp']) item['end_timestamp'] = util.tw_date_to_timestamp( item['end_timestamp']) process_data(item['county_name'], item['the_category'], item['the_idx'], item['start_timestamp'], item['end_timestamp'], {}, item['the_data']) yield item # Traverse items = hxs.select("//div[@id='pagenate']/a/@href") for item in items: url = 'http://pipegis.kcg.gov.tw/' + re.findall( "(default[^']+)", item.extract())[0] yield Request(url, callback=self.parse_list, method='POST', errback=self.errback) def errback(self): self.log('Request failed') def _process_data_dict(self, fields, columns): result = { column: '' if idx >= len(fields) else fields[idx] for (idx, column) in enumerate(columns) } return result
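# The spider converts ROC (Minguo) calendar dates through util.tw_date_to_timestamp, which
# is defined elsewhere. A minimal sketch of what such a helper presumably does: ROC year
# plus 1911 gives the Gregorian year. The digits-only "YYYMMDD" input format is an
# assumption based on the re.findall('(\d+)', ...) extraction above; the real helper may
# accept a different shape.
import calendar
import datetime

def tw_date_to_timestamp(roc_digits):
    """'1040301' (ROC year 104, March 1) -> POSIX timestamp for 2015-03-01 UTC."""
    digits = str(roc_digits)
    year = int(digits[:-4]) + 1911
    month = int(digits[-4:-2])
    day = int(digits[-2:])
    return calendar.timegm(datetime.date(year, month, day).timetuple())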
class MySpider(CrawlSpider): name = "gplay" allowed_domains = ["play.google.com"] start_urls = ["https://play.google.com/store/apps/"] rules = (Rule(LinkExtractor(allow=('/store/apps', )), follow=True), Rule(LinkExtractor(allow=('/store/apps/details\?')), follow=True, callback='parse_link')) def abs_url(url, response): """Return absolute link""" base = response.xpath('//head/base/@href').extract() if base: base = base[0] else: base = response.url return urlparse.urljoin(base, url) def parse_link(self, response): hxs = HtmlXPathSelector(response) titles = hxs.select('/html') items = [] for titles in titles: item = GplaycrawlerItem() item["Link"] = titles.select('head/link[5]/@href').extract() item["Item_name"] = titles.select( '//*[@class="document-title"]/div/text()').extract() item["Updated"] = titles.select( '//*[@itemprop="datePublished"]/text()').extract() item["Author"] = titles.select( '//*[@itemprop="author"]/a/span/text()').extract() item["Filesize"] = titles.select( '//*[@itemprop="fileSize"]/text()').extract() item["Downloads"] = titles.select( '//*[@itemprop="numDownloads"]/text()').extract() item["Version"] = titles.select( '//*[@itemprop="softwareVersion"]/text()').extract() item["Compatibility"] = titles.select( '//*[@itemprop="softwareVersion"]/text()').extract() item["Content_rating"] = titles.select( '//*[@itemprop="contentRating"]/text()').extract() item["Author_link"] = titles.select( '//*[@class="dev-link"]/@href').extract() item["Author_link_test"] = titles.select( '//*[@class="content contains-text-link"]/a/@href').extract() item["Genre"] = titles.select( '//*[@itemprop="genre"]/text()').extract() item["Price"] = titles.select( '//*[@class="price buy id-track-click"]/span[2]/text()' ).extract() item["Rating_value"] = titles.select( '//*[@class="score"]/text()').extract() item["Review_number"] = titles.select( '//*[@class="reviews-num"]/text()').extract() item["Description"] = titles.select( '//*[@class="id-app-orig-desc"]//text()').extract() item["IAP"] = titles.select( '//*[@class="inapp-msg"]/text()').extract() item["Developer_badge"] = titles.select( '//*[@class="badge-title"]//text()').extract() item["Physical_address"] = titles.select( '//*[@class="content physical-address"]/text()').extract() item["Video_URL"] = titles.select( '//*[@class="play-action-container"]/@data-video-url').extract( ) item["Developer_ID"] = titles.select( '//*[@itemprop="author"]/a/@href').extract() items.append(item) return items
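# parse_link above repeats the select/extract pattern for every field. A hedged sketch of
# the same extraction with ItemLoader (newer Scrapy), which trims the boilerplate. The
# XPaths are copied from the spider; AppItemSketch is a stand-in for GplaycrawlerItem and
# declares only a few of its fields.
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

class AppItemSketch(scrapy.Item):
    Item_name = scrapy.Field()
    Updated = scrapy.Field()
    Author = scrapy.Field()
    Version = scrapy.Field()
    Genre = scrapy.Field()

class AppLoader(ItemLoader):
    default_item_class = AppItemSketch
    default_output_processor = TakeFirst()

def parse_link_with_loader(self, response):
    loader = AppLoader(response=response)
    loader.add_xpath('Item_name', '//*[@class="document-title"]/div/text()')
    loader.add_xpath('Updated', '//*[@itemprop="datePublished"]/text()')
    loader.add_xpath('Author', '//*[@itemprop="author"]/a/span/text()')
    loader.add_xpath('Version', '//*[@itemprop="softwareVersion"]/text()')
    loader.add_xpath('Genre', '//*[@itemprop="genre"]/text()')
    return loader.load_item()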
class BondSpider(HistorySpider): name = 'bonds' allowed_domains = ['tase.co.il'] start_urls = [ 'http://www.tase.co.il/eng/marketdata/t-bills/Pages/ShortTermLoan.aspx', 'http://www.tase.co.il/eng/marketdata/bonds/governmentbonds/Pages/BondsGov.aspx', 'http://www.tase.co.il/eng/marketdata/bonds/corporatebonds/Pages/BondsByCuts.aspx', ] rules = ( Rule(SgmlLinkExtractor(allow=(r'ErrorHandler.aspx',)), callback='process_error'), Rule(SgmlLinkExtractor(allow=('ShortTermLoan\.aspx',)), callback='parse_bond_list'), Rule(SgmlLinkExtractor(allow=('BondsGov\.aspx',)), callback='parse_bond_list'), Rule(SgmlLinkExtractor(allow=('BondsByCuts\.aspx',)), callback='parse_bond_list'), Rule(SgmlLinkExtractor(allow=('BondsMainData\.aspx',)), callback='parse_bond'), Rule(SgmlLinkExtractor(allow=('companyMainData\.aspx',)), callback='parse_bond'), ) header = ( ('date_', tase.common.get_date), ('adjusted_closing_price', tase.common.to_float), ('closing_price', tase.common.to_float), ('change_', tase.common.to_float), ('gross_yield_to_maturity', tase.common.to_float), ('opening_price', tase.common.to_float), ('base_price', tase.common.to_float), ('high', tase.common.to_float), ('low', tase.common.to_float), ('capital_listed_for_trading', tase.common.to_int), ('market_cap', tase.common.to_int), ('turnover', tase.common.to_int), ('volume', tase.common.to_int), ('trans', tase.common.to_int) ) history_url = "http://www.tase.co.il/TASEEng/General/BONDs/bondsHistoryData.htm?bondType=4&subDataType=5&companyID={companyID}&shareID={shareID}&intPeriod={period}&intFrequency1=0&IsYield=False&IsDollar=False" def get_control_id(self): return "g_ed8af170_7f0e_440a_85fe_19d9352a2a86" # Main companies list, with paging def parse_bond_list(self, response): sel = Selector(response) fd = dict() inputs = sel.xpath("//input[@type='hidden']") for inpt in inputs: name = tase.common.get_string(inpt.xpath("@name").extract()) value = tase.common.get_string(inpt.xpath("@value").extract()) fd[name] = value links = sel.xpath("//tr[@class='pagerText']/td/a") for link in links: m = re.search("javascript:__doPostBack\('(.*?)'", link.extract()) if m: url = urllib.unquote(m.group(1)) fd['__EVENTTARGET'] = url #print self.start_urls[2] #print fd #yield FormRequest(self.start_urls[2], method='POST', formdata={'__EVENTTARGET': url, '__EVENTARGUMENT': ''}) yield FormRequest(self.start_urls[2], method='POST', formdata=fd) # almost same as parse_company def parse_bond(self, response): sel = Selector(response) item = TaseItem() item['category'] = category_bond item['tase_url'] = response.url item['date_'] = '' query = parse_qs(urlparse(response.url)[4]) # query try: item['CompanyID'] = query['CompanyID'][0] except KeyError: item['CompanyID'] = query['FundID'][0] try: item['ShareID'] = query['ShareID'][0] except KeyError: item['ShareID'] = query['FundID'][0] try: item['name'] = sel.xpath("//td[@class='BigBlue']/text()").extract()[0] except IndexError: item['name'] = "" try: base_url = get_base_url(response) relative_url = sel.xpath("//td[@rowspan='4']/img/@src").extract()[0] item['image_url'] = urljoin(base_url, relative_url) except IndexError: item['image_url'] = "" lst = sel.xpath("//td[contains(child::text(), 'Symbol:')]/following-sibling::td[1]/table/tr/td[1]/text()").extract() if len(lst) > 0: item['symbol'] = lst[0] else: try: item['symbol'] = sel.xpath("//td[contains(., 'Symbol:')]/following-sibling::td[1]/text()").extract()[0] except IndexError: item['symbol'] = item['ShareID'] href = sel.xpath('//tr[1]/td[1]/a[@target="_blank"]/@href').extract() 
if len(href) > 0: url = href[0] o = urlparse(url) if len(o.netloc) > 0: item['url'] = url else: item['url'] = '' else: item['url'] = '' try: href = sel.xpath("//tr/td[@class='subtitle']/text()").extract() item['sector'] = tase.common.unescape(urllib.unquote(href[4].strip())) item['subsector'] = tase.common.unescape(urllib.unquote(href[3].strip())) except IndexError: item['sector'] = "" item['subsector'] = "" item['sector_int'] = 0 item['subsector_int'] = 0 #url = "http://archive.globes.co.il/searchgl/%s" % item['symbol'] url = "http://www.globes.co.il/serveen/globes/searchresults.asp?exact=%s" % item['symbol'] yield self.process_history(item)
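# Both HistorySpider subclasses map raw table cells through tase.common helpers (get_date,
# to_float, to_int, to_long) defined elsewhere in the project. A rough sketch of the
# behaviour those names and call sites suggest; the real helpers may well differ, e.g. in
# the date format (assumed DD/MM/YYYY here).
import datetime

def to_float(text):
    try:
        return float(str(text).replace(',', '').strip())
    except (TypeError, ValueError):
        return None

def to_int(text):
    try:
        return int(str(text).replace(',', '').strip())
    except (TypeError, ValueError):
        return None

def to_long(text):
    try:
        return long(str(text).replace(',', '').strip())  # Python 2 builtin, matching these spiders
    except (TypeError, ValueError):
        return None

def get_date(text):
    try:
        return datetime.datetime.strptime(text.strip(), '%d/%m/%Y').date()
    except (TypeError, ValueError, AttributeError):
        return None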
class PaipaidaiSpider(CrawlSpider):
    # Workaround for "'ascii' codec can't decode byte 0xe5 ..." errors under Python 2
    reload(sys)
    sys.setdefaultencoding('utf8')

    name = 'paipaidai4'
    allowed_domains = ['www.paipaidai.com']
    download_delay = 2  # seconds to wait between requests
    #['https://www.itouzi.com/dinvest/invest/detail?id=44335555475675434642733d']
    url1 = ['http://www.ppdai.com/lend/12_s0_p' + str(x) for x in range(7, 9)]  # hot listings
    url2 = ['http://www.ppdai.com/lend/13_s0_p' + str(x) for x in range(7, 9)]  # secured listings
    url3 = ['http://www.ppdai.com/lend/14_s0_p' + str(x) for x in range(7, 9)]  # compensation-on-overdue listings
    url4 = ['http://www.ppdai.com/lend/8_s0_p' + str(x) for x in range(7, 9)]   # online-merchant listings
    url5 = ['http://www.ppdai.com/lend/3_s0_p' + str(x) for x in range(7, 9)]   # repeat-borrower listings
    url6 = ['http://www.ppdai.com/lend/15_s0_p' + str(x) for x in range(7, 9)]  # partner-institution listings
    url7 = ['http://www.ppdai.com/lend/16_s0_p' + str(x) for x in range(7, 9)]  # newcomer listings
    url1.extend(url2)
    url1.extend(url3)
    url1.extend(url4)
    url1.extend(url5)
    url1.extend(url6)
    url1.extend(url7)
    start_urls = url1
    #print start_urls
    rules = (Rule(SgmlLinkExtractor(allow=('/list/.*', )),
                  callback='parse_page',
                  follow=True), )

    def parse_page(self, response):
        item = PaipaidaiItem()
        sel = Selector(response)
        item['name'] = sel.xpath('//span[@class=""]/text()').extract()[0]
        item['link'] = response.url
        item['amount'] = sel.xpath('//dd[@id="listRestMoney"]/text()').extract()[0].strip()
        item['min_amount'] = ''
        item['income_rate'] = sel.xpath('//div[@class="w528 clearfix"]/dl/dd/text()').extract()[1]
        term1 = sel.xpath('//dl[@class="nodbr"]/dd/text()').extract()[0]
        term2 = sel.xpath('//dl[@class="nodbr"]/dd/em/text()').extract()[0]
        item['term'] = term1 + term2
        item['area'] = ''
        item['transfer_claim'] = ''
        item['repay_type'] = sel.xpath('//div[@class="item item1"]/text()').extract()[2].strip()
        item['reward'] = ''
        item['protect_mode'] = ''
        item['description'] = sel.xpath('//div[@class="lendDetailTab_tabContent"]/p/text()').extract()[0]
        item['process'] = sel.xpath('//div[@class="item"]/text()').extract()[1].strip()
        yield item
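# The start_urls block above builds seven per-category lists and chains them with
# extend(); the same thing as a single comprehension (category ids copied from above, page
# range unchanged). The reload(sys)/setdefaultencoding workaround is a Python 2 crutch and
# is unnecessary on Python 3.
PPDAI_CATEGORY_IDS = [12, 13, 14, 8, 3, 15, 16]

start_urls = [
    'http://www.ppdai.com/lend/{0}_s0_p{1}'.format(cat, page)
    for cat in PPDAI_CATEGORY_IDS
    for page in range(7, 9)
]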
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from ..items import MovieItem  # project item class (import path assumed)

# NUM_RE, LANGUAGES_RE, COUNTRIES_RE and TV_RUNTIME_RE are module-level
# regular expressions defined elsewhere in the project; see the sketch
# after this class.


class MovieSpider(CrawlSpider):
    name = "movie"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["http://movie.douban.com"]
    rules = (
        Rule(LinkExtractor(allow=r"/subject/\d+/($|\?\w+)"),
             callback="parse_movie", follow=True),
    )

    def parse_movie(self, response):
        item = MovieItem()
        item["subject_id"] = int(response.url.split("/")[-2])
        self.get_name(response, item)
        self.get_year(response, item)
        self.get_directors(response, item)
        self.get_actors(response, item)
        self.get_genres(response, item)
        self.get_runtime(response, item)
        self.get_languages(response, item)
        self.get_countries(response, item)
        self.get_average(response, item)
        self.get_vote(response, item)
        self.get_tags(response, item)
        self.get_watched(response, item)
        self.get_wish(response, item)
        self.get_summary(response, item)
        self.get_stars(response, item)
        self.get_comment(response, item)
        self.get_question(response, item)
        self.get_review(response, item)
        self.get_discussion(response, item)
        self.get_image(response, item)
        return item

    def get_stars(self, response, item):
        # The star distribution is given as percentages; convert each share
        # into an absolute number of votes.
        if not item.get("vote", None):
            return
        texts = response.xpath(
            "//div[@class='rating_wrap clearbox']/text()").extract()
        stars = "".join(map(unicode.strip, texts)).split("%")[:-1]
        stars = [int(round(float("%.3f" % (float(star) / 100)) * item["vote"]))
                 for star in stars]
        item["stars"] = stars

    def get_name(self, response, item):
        name = response.xpath("//title/text()").extract()
        if name:
            item["name"] = name[0].replace(u" (豆瓣)", "").strip()

    def get_year(self, response, item):
        year = response.xpath("//span[@class='year']").re(NUM_RE)
        if year:
            item["year"] = int(year[0])

    def get_directors(self, response, item):
        directors = response.xpath("//a[@rel='v:directedBy']/text()").extract()
        if directors:
            item["directors"] = directors

    def get_actors(self, response, item):
        stars = response.xpath("//a[@rel='v:starring']/text()").extract()
        if stars:
            item["actors"] = stars

    def get_genres(self, response, item):
        genres = response.xpath("//span[@property='v:genre']/text()").extract()
        if genres:
            item["genres"] = genres

    def get_runtime(self, response, item):
        # TV shows carry a per-episode runtime in the info block; everything
        # else is treated as a movie.
        if not self.parse_tv(response, item):
            runtime = response.xpath(
                "//span[@property='v:runtime']/text()").re(NUM_RE)
            if runtime:
                item["channel"] = "mv"
                item["runtime"] = int(runtime[0])

    def get_average(self, response, item):
        average = response.xpath(
            "//strong[@property='v:average']/text()").extract()
        if average and average[0] != "":
            item["average"] = float(average[0])

    def get_vote(self, response, item):
        votes = response.xpath("//span[@property='v:votes']/text()").extract()
        if votes and votes[0] != "":
            item["vote"] = int(votes[0])

    def get_tags(self, response, item):
        tags = []
        for tag in response.xpath("//div[@class='tags-body']/a"):
            text = tag.xpath("text()").extract()
            if text:
                tags.append(text[0])
        if tags:
            item["tags"] = tags

    def get_watched(self, response, item):
        spec = ("//div[@class='subject-others-interests-ft']"
                "/a[re:test(@href, 'collections$')]/text()")
        collections = response.xpath(spec).re(NUM_RE)
        if collections:
            item["watched"] = int(collections[0])

    def get_wish(self, response, item):
        spec = ("//div[@class='subject-others-interests-ft']"
                "/a[re:test(@href, 'wishes$')]/text()")
        wishes = response.xpath(spec).re(NUM_RE)
        if wishes:
            item["wish"] = int(wishes[0])

    def get_languages(self, response, item):
        info = "".join(response.xpath("//div[@id='info']").extract())
        match = LANGUAGES_RE.search(info)
        if match is not None:
            item["languages"] = [lang.strip()
                                 for lang in match.group(1).split("/")]

    def get_countries(self, response, item):
        info = "".join(response.xpath("//div[@id='info']").extract())
        match = COUNTRIES_RE.search(info)
        if match is not None:
            item["countries"] = [country.strip()
                                 for country in match.group(1).split("/")]

    def get_summary(self, response, item):
        summary = response.xpath(
            "//span[@property='v:summary']/text()").extract()
        if summary:
            item["summary"] = "<br/>".join(summary)

    def get_image(self, response, item):
        image = response.xpath(
            "//a[re:test(@href, 'all_photos$')]/text()").re(NUM_RE)
        if image:
            item["image"] = int(image[0])

    def get_comment(self, response, item):
        comment = response.xpath(
            "//a[re:test(@href, '/comments$')]/text()").re(NUM_RE)
        if comment:
            item["comment"] = int(comment[0])

    def get_question(self, response, item):
        question = response.xpath(
            "//a[re:test(@href, '/questions/\?from=subject$')]/text()").re(NUM_RE)
        if question:
            item["question"] = int(question[0])

    def get_review(self, response, item):
        review = response.xpath(
            "//a[re:test(@href, '/reviews$')]/text()").re(NUM_RE)
        if review:
            item["review"] = int(review[0])

    def get_discussion(self, response, item):
        discussion = response.xpath(
            "//a[re:test(@href, 'discussion/')]/text()").re(NUM_RE)
        if discussion:
            item["discussion"] = int(discussion[0])

    def parse_tv(self, response, item):
        info = "".join(response.xpath("//div[@id='info']//text()").extract())
        match = TV_RUNTIME_RE.search(info)
        if match is not None:
            item["channel"] = "tv"
            item["runtime"] = int(match.group(1))
            return True
        return False
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from ..items import TraderaItem  # project item class (import path assumed)


class TraderaSpider(CrawlSpider):
    name = 'tradera'
    allowed_domains = ['tradera.com']
    start_urls = [
        'http://www.tradera.com/nintendo-gamecube-c3_3004',
        'http://www.tradera.com/playstation-psone-c3_3012',
        'http://www.tradera.com/playstation-2-ps2-c3_3013',
        'http://www.tradera.com/sega-dreamcast-c3_3001',
        'http://www.tradera.com/xbox-c3_3016',
    ]
    rules = (
        # Pagination links on category listings.
        Rule(SgmlLinkExtractor(allow=('.*', ),
                               restrict_xpaths='//a[@class="nextPageBtn"]'),
             callback='parse_item', follow=True),
        # Links to individual auction pages.
        Rule(SgmlLinkExtractor(allow=('.*', ),
                               restrict_xpaths=('//div[@class="ObjectHeadline"]', )),
             callback='parse_item2', follow=True),
    )

    def parse_item(self, response):
        """A callback for listing pages; currently extracts nothing."""
        hxs = HtmlXPathSelector(response)

    def getStringFromArray(self, array):
        """Join a list of extracted strings, stripping each part."""
        result = u""
        for item in array:
            result = result + u" " + item.strip()
        return result

    def getStringFromXPath(self, hxs, xPath):
        extractedText = hxs.select(xPath).extract()
        return self.getStringFromArray(extractedText)

    def parse_item2(self, response):
        """A callback function that produces TraderaItems from auction html."""
        hxs = HtmlXPathSelector(response)
        traderaItem = TraderaItem()
        traderaItem['itemHeading'] = self.getStringFromXPath(
            hxs, '//h1[@class="auction_headline"]/text()')
        traderaItem['leadingBid'] = self.getStringFromXPath(
            hxs, '//label[@id="leadingBidAmount"]/text()')
        traderaItem['bids'] = self.getStringFromXPath(
            hxs, '//h5[@id="numberOfBids"]/text()')
        traderaItem['remainingTime'] = self.getStringFromXPath(
            hxs, '//label[@id="timeLeftLabel"]/text()')
        traderaItem['itemText'] = self.getStringFromXPath(
            hxs, '//div[@class="description"]/p/text()')
        traderaItem['seller'] = self.getStringFromXPath(
            hxs, '//a[@class="blueLink"]/b/text()')
        traderaItem['sellerRating'] = self.getStringFromXPath(
            hxs, '//div[@class="rightSideInfoInBoxG-bottomLine"]/a[@class="DSRMedium"]/text()')
        info = hxs.select('//div[@class="objectInfoOnTop"]/text()')
        if len(info) == 3:
            traderaItem['publiced'] = info.extract()[1].strip()
            traderaItem['objectID'] = info.extract()[2].strip()
        return traderaItem
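# TraderaItem lives in the project's items module. A minimal sketch, assuming
# one scrapy Field per key that parse_item2 assigns; the class itself is an
# assumption, not the original definition. A crawl could then be exported via
# Scrapy's feed export, e.g. `scrapy crawl tradera -o auctions.json`.
import scrapy


class TraderaItem(scrapy.Item):
    itemHeading = scrapy.Field()
    leadingBid = scrapy.Field()
    bids = scrapy.Field()
    remainingTime = scrapy.Field()
    itemText = scrapy.Field()
    seller = scrapy.Field()
    sellerRating = scrapy.Field()
    publiced = scrapy.Field()
    objectID = scrapy.Field()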
# This method belongs to a configuration-driven spider; it expects json, the
# project's utils module, Rule, SgmlLinkExtractor and cssselect's
# HTMLTranslator (or an equivalent) to be importable at module level.
def load_config(self):
    self.pretty_conf = utils.load_cfg(self.config, pretty=True)
    conf_dump = json.dumps(self.pretty_conf)
    conf = json.loads(conf_dump)

    ### debug
    if self.debug is None:
        self.debug = conf.get('debug', False)

    ### site
    self.site = conf.get('site', u'未知站点')  # default label: "unknown site"
    self.macro = utils.MacroExpander({
        'SITE': self.site,
        'CONF': conf_dump,
    })

    ### allowed_domains
    self.allowed_domains = conf.get('domains', [])

    ### start_urls
    urls = conf.get('urls', [])
    self.start_urls = utils.generate_urls(urls, self.macro)
    if isinstance(urls, dict):
        self.start_method = urls.get('method', 'GET')
        self.make_headers(urls.get('headers', {}))
        if urls.get('parse'):
            self.parse_start_url = self.parse_page
    else:
        self.start_method = 'GET'
        self.make_headers({})

    ### rules
    self.tr = HTMLTranslator()
    self.rules = []
    self.page_extractor = None
    for k, v in conf.get('rules', {}).iteritems():
        follow = v.get('follow', True)
        # Rules that are not followed get the page-parsing callback;
        # an explicit null 'follow' means "parse and keep following".
        callback = None if follow else 'parse_page'
        follow = True if follow is None else follow
        match = self.macro.expand(v.get('match'))
        regex = self.macro.expand(v.get('regex'))
        css = self.macro.expand(v.get('css'))
        if css:
            xpath = self.tr.css_to_xpath(css)
        else:
            xpath = self.macro.expand(v.get('xpath'))
        pages = v.get('pages')
        sub = v.get('sub')
        vars = v.get('vars')
        rule = Rule(
            SgmlLinkExtractor(
                allow=regex,
                restrict_xpaths=xpath,
                process_value=utils.first_n_pages(regex, pages)),
            process_links=self.sub_links(sub),
            process_request=self.set_vars(k, vars),
            callback=callback,
            follow=follow)
        rule.match = match
        self.rules.append(rule)
    self._compile_rules()
    if not self.rules:
        self.parse_start_url = self.parse_page
        self.make_page_extractor(conf.get('urls', []))

    ### mappings (loop/fields)
    self.build_item(conf)

    ### settings
    self.load_settings(conf)

    return conf
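# For reference, a configuration that load_config() could consume, assuming a
# YAML/JSON file handled by utils.load_cfg. Only keys read above are shown;
# the concrete values (site name, URLs, selectors) and the EXAMPLE_CONF name
# are illustrative, and the 'loop'/'fields' mappings consumed by build_item
# are omitted because their shape is not visible here.
EXAMPLE_CONF = {
    'debug': False,
    'site': u'example site',
    'domains': ['example.com'],
    'urls': ['http://example.com/list_1.html',
             'http://example.com/list_2.html'],
    'rules': {
        'detail': {
            'regex': r'/item/\d+\.html',  # links to extract
            'css': 'div.list a',          # translated to XPath via css_to_xpath
            'follow': False,              # not followed => handled by parse_page
            'pages': 5,                   # passed to utils.first_n_pages
        },
        'next_page': {
            'xpath': '//a[@class="next"]',
            'follow': True,
        },
    },
    'settings': {'DOWNLOAD_DELAY': 1},
}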
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from ..loaders import WebsiteLoader  # project item loader (import path assumed)


class HideMyAssSpider(CrawlSpider):
    name = 'hidemyass'
    start_urls = ['http://hidemyass.com/proxy-list/']
    allowed_domains = ['hidemyass.com']
    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths="//a[@class='next']"),
             callback='parse', follow=True),
    )

    def parse(self, response):
        # Overriding parse() on a CrawlSpider is normally discouraged; it is
        # done on purpose here so every listing page (including the first) is
        # scraped and still passed on to the rule machinery via super().
        self.log('No item received for %s' % response.url)
        for elem in super(HideMyAssSpider, self).parse(response):
            yield elem

        hxs = HtmlXPathSelector(response)
        links = hxs.select('//tr[@class="altshade"]')
        for link in links:
            # HideMyAss obfuscates the IP column with inline CSS: some
            # spans/divs are display:none decoys, others display:inline.
            # Parse the row's <style> block to learn which class names are
            # hidden and which are visible.
            ipaddress_parts = link.select('td[2]/span')
            style_text = ipaddress_parts.select('style/text()').extract()
            style_text = style_text[0].split('\n')
            display_none = set(
                style[1:style.index('{')]
                for style in style_text if 'none' in style)
            display_inline = set(
                style[1:style.index('{')]
                for style in style_text if 'inline' in style)

            ipaddress = []
            for ipaddress_part in ipaddress_parts.select('span|div|text()'):
                tag_class = tag_style = tag_name = None
                # The try/excepts work around a bug in lxml.etree:
                # "Argument 'element' has incorrect type (expected
                # lxml.etree._Element, got _ElementStringResult)".
                try:
                    tag_class = ipaddress_part.select('@class').extract()
                except TypeError:
                    pass
                try:
                    tag_style = ipaddress_part.select('@style').extract()
                except TypeError:
                    pass
                try:
                    tag_name = ipaddress_part.select("name()")
                except TypeError:
                    pass
                if tag_name:
                    tag_text = ipaddress_part.select('text()').extract()
                else:
                    tag_text = ipaddress_part.extract()
                # Skip fragments hidden via inline style or a display:none class.
                if tag_style and 'none' in tag_style[0]:
                    continue
                if tag_class and tag_class[0] in display_none:
                    continue
                if isinstance(tag_text, list):
                    tag_text = ''.join(tag_text)
                # Keep only the numeric octets of the visible fragments.
                for fragment in tag_text.split('.'):
                    fragment = fragment.strip()
                    if fragment.isdigit():
                        ipaddress.append(fragment)
            ipaddress = '.'.join(ipaddress)

            loader = WebsiteLoader(selector=link)
            loader.add_value('ipaddress', ipaddress)
            loader.add_xpath('port', 'td[3]/text()')
            loader.add_xpath('country', 'td[4]/span/text()')
            loader.add_xpath('proxy_type', 'td[7]/text()')
            loader.add_xpath('anonimity', 'td[8]/text()')
            loader.add_value('url', response.url)
            yield loader.load_item()
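# WebsiteLoader is the project's item loader. A minimal sketch, assuming an
# old-style Scrapy XPathItemLoader (matching the HtmlXPathSelector usage
# above) over a hypothetical WebsiteItem covering the fields the spider
# fills; neither class is the original project's definition.
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import MapCompose, TakeFirst
from scrapy.item import Item, Field


class WebsiteItem(Item):
    ipaddress = Field()
    port = Field()
    country = Field()
    proxy_type = Field()
    anonimity = Field()  # spelling kept to match the spider's add_xpath call
    url = Field()


class WebsiteLoader(XPathItemLoader):
    default_item_class = WebsiteItem
    default_input_processor = MapCompose(lambda value: value.strip())
    default_output_processor = TakeFirst()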