# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='title']/h2",
    'price': "//p[@class='pridis']/strong",
    'category': "",
    'description': "//div[@class='detail']",
    'images': "//div[@class='sllist']/ul/li/a/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'necdeal.vn'
allowed_domains = ['necdeal.vn']
start_urls = ['http://necdeal.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/deal/[a-z0-9-]+/[a-z0-9-]+-[a-z0-9]+'], deny=['.*/$']), 'parse_item'),
    Rule(LinkExtractor(deny=['.*']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h2[@id='hTitleDeal']/span/p",
    'price': "//p[@class='pGiaTienID']/strong",
    'category': "",
    'description': "//div[@class='div_LBodyHL']",
    'images': "//div[@class='lof-main-outer']/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'windeal.vn'
allowed_domains = ['windeal.vn']
start_urls = ['http://windeal.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/xem-san-pham/']), 'parse_item'),
    Rule(LinkExtractor(allow=['\?type']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
class HddznetSpider(CrawlSpider):
    __BASE_DOMAIN = 'www.hddznet.com'
    name = 'hddznet'
    allowed_domains = [__BASE_DOMAIN]
    start_urls = ['http://www.hddznet.com']
    rules = (
        # Extract product-center (产品中心) pages
        Rule(LinkExtractor(allow=(r'product-.*.html$')), follow=True, callback='parse_product'),
        # Extract solutions & cases (方案与案例) pages
        Rule(LinkExtractor(allow=(r'program-.*.html$')), follow=True, callback='parse_program'),
        # Extract classic-case (经典案例) pages
        Rule(LinkExtractor(allow=(r'news/detail.*-jdal.html$')), follow=True, callback='parse_case'),
        # Extract news-center (新闻中心) pages
        Rule(LinkExtractor(allow=(r'news/detail.*-xwzx.html$')), follow=True, callback='parse_news'),
        # # Extract images .png .jpg .jpeg .bmp
        # Rule(LinkExtractor(allow=(r'www.hddznet.com'), deny_extensions=set(), tags=('img'),
        #      attrs=('src'), canonicalize=True, unique=True),
        #      follow=False, callback='parse_images'),
        # Follow all other links
        Rule(LinkExtractor(allow=(r'.*')), follow=True),
    )

    def __init__(self, *args, **kwargs):
        super(HddznetSpider, self).__init__(*args, **kwargs)
        pool = redis.ConnectionPool(host='128.1.6.45', port=6379, decode_responses=True)
        self.redis = redis.Redis(connection_pool=pool)

    def __parse(self, response, parse_name, title_class, content_class):
        cache_key = 'uri:url:{0}'.format(response.url)
        if self.redis.exists(cache_key):
            print('xx> SKIP', self.name, response.url, parse_name)
            return None
        title = self.__extract_title(response, '//div[@class="{0}"]/text()'.format(title_class))
        print('==>', self.name, response.url, parse_name, title)
        elements = response.xpath(
            '//div[@class="{0}"]//span|//div[@class="{0}"]//p|//div[@class="{0}"]//td'.format(content_class))
        contents = elements.xpath('text()').extract()
        content = clear_content(contents)
        item = ContentItem()
        item['company'] = self.name
        item['title'] = title
        item['url'] = response.url
        item['content'] = content
        return item

    def parse_product(self, response):
        yield self.__parse(response, '产品中心', 'current-menu', 'right')

    def parse_program(self, response):
        yield self.__parse(response, '方案与案例', 'current-menu', 'right')

    def parse_case(self, response):
        yield self.__parse(response, '经典案例', 'detail-title', 'detail-content')

    def parse_news(self, response):
        yield self.__parse(response, '新闻中心', 'detail-title', 'detail-content')

    # def parse_images(self, response):
    #     print('==>', self.name, '-Images-', response.url)

    def __extract_title(self, response, xpath=None):
        if xpath is None:
            return ''
        title = response.xpath(xpath).extract_first()
        return title.strip() if title is not None else ''
class XjSpider(SpiderRedis):
    name = 'xinjiangdj'
    website = u'新疆党建网'
    download_delay = 0.1
    allowed_domains = ['www.xjkunlun.cn']
    start_urls = ['http://www.xjkunlun.cn/']
    rules = [
        # Only crawl news from 2015 onwards; older pages cannot be matched reliably
        Rule(LinkExtractor(allow=r'/201[5-9]/[0-9]+.htm',
                           deny=('/iptv', '/wlsp', '/mobile', '/kxj', '/xzzx', '/sy.xjkunlun',
                                 '/ycjy', '/djkw', '/index')),
             callback='parse_item', follow=False),
        Rule(LinkExtractor(allow=('/xinwen', '/gzgz', '/dswx', '/ldjh', '/dkyj', '/lgbgz', '/wnfw'),
                           deny=('/iptv', '/wlsp', '/mobile', '/kxj', '/xzzx', '/sy.xjkunlun',
                                 '/ycjy', '/djkw')),
             follow=True),
    ]

    def parse_item(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        contents = ''
        try:
            title = response.xpath(r'//td[@class="STYLE1"]/div//text()').extract_first()
            content_list = response.xpath(
                r'//div[@class="container"]/div[2]/table/tbody/tr/td/p/text()').extract()
            if len(content_list) == 0:
                # Fall back to an alternative content XPath
                content_list = response.xpath(
                    r'//*[@id="00010"]/table[2]/tbody/tr[2]/td/p/text()').extract()
            for content in content_list:
                contents = contents + content
            # Two alternative XPath rules for the publish date
            date_text = response.xpath(
                r'//*[@id="00010"]/table[2]/tbody/tr[1]/td/p/text() | '
                r'//*[@id="00010"]/table[1]/tbody/tr[3]/td/text()').extract_first()
            match = re.search(r'(20[0-9]{2}-[0-9]{2}-[0-9]{2})', date_text)
            date = match.group(1)
            ### print info
            # try:
            #     print 'title, ', title.encode('GB18030')
            #     print 'url, ', response.url
            #     print "date, ", date
            #     print "content, ", contents.encode('GB18030')
            # except Exception as e:
            #     print " error : ", e
            loader.add_value('title', title)
            loader.add_value('date', date)
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            loader.add_value('date', '1970-01-01')
            loader.add_value('title', '')
        finally:
            # self.logger.info('crawling url: %s' % response.url)
            loader.add_value('url', response.url)
            loader.add_value('collection_name', self.name)
            loader.add_value('website', self.website)
            if contents == '':
                self.logger.warning(' url: %s msg: %s' % (response.url, ' content is None'))
            loader.add_value('content', contents)
            return loader.load_item()
class XpahaSpider(RedisCrawlSpider):
    name = 'xpaha'
    allowed_domains = ['zidian.xpcha.com']
    start_urls = ['http://zidian.xpcha.com/']
    redis_key = "zidianspider:start_urls"

    # pagelink = LinkExtractor(allow=r'href="/hans/\w+')
    contentlink = LinkExtractor(allow=r'\w.html')
    rules = (
        # Rule(pagelink),
        Rule(contentlink, callback='parse_item'),
    )

    def parse_item(self, response):
        item = ZidainItem()
        item['zi'] = self.get_zi(response)
        item['thumb'] = self.get_thumb(response)
        item['pinyin'] = self.get_pinyin(response)
        item['wuxing'] = self.get_wuxing(response)
        item['jiegou'] = self.get_jiegou(response)
        item['bushou'] = self.get_bushou(response)
        item['bihua'] = self.get_bihua(response)
        item['base'] = self.get_base(response)
        item['kangxi'] = self.get_kangxi(response)
        item['guhanyu'] = self.get_guhanyu(response)
        item['xiangxi'] = self.get_xiangxi(response)
        item['develop'] = self.get_develop(response)
        item['request'] = response.url
        yield item

    def get_zi(self, response):
        try:
            item = response.xpath('/html/body/div[5]/div[1]/dl/dt[6]/a/text()').extract()[0].split()[-1]
        except Exception:
            item = ''
        return item

    def get_thumb(self, response):
        try:
            item = response.xpath('/html/body/div[5]/div[1]/div[1]/img/@src').extract()[-1]
        except Exception:
            item = ''
        return item

    def get_pinyin(self, response):
        try:
            item = response.css('body > div.body_1000 > div.left_leirong > h1').extract()[-1]
        except Exception:
            item = ''
        return item

    def get_wuxing(self, response):
        try:
            item = response.xpath('/html/body/div[5]/div[1]/div[1]/dl/dd[3]/text()').extract()[-1].split(':')[1]
        except Exception:
            item = ''
        return item

    def get_jiegou(self, response):
        try:
            item = response.xpath('/html/body/div[5]/div[1]/div[1]/dl/dd[5]/text()').extract()[-1].split(':')[1]
        except Exception:
            item = ''
        return item

    def get_bushou(self, response):
        try:
            item = response.xpath('/html/body/div[5]/div[1]/div[1]/dl/dd[1]/text()').extract()[-1].split(':')[1]
        except Exception:
            item = ''
        return item

    def get_bihua(self, response):
        try:
            item = response.xpath('/html/body/div[5]/div[1]/div[1]/dl/dd[2]/text()').extract()[-1].split(':')[1]
        except Exception:
            item = ''
        return item

    def get_base(self, response):
        try:
            item = response.xpath('//*[@id="jbjs"]').extract()[-1]
        except Exception:
            item = ''
        return item

    def get_kangxi(self, response):
        try:
            item = response.css('.zidian_tab a::attr(href)').extract()[2]
        except Exception:
            item = ''
        return item

    def get_guhanyu(self, response):
        try:
            item = response.xpath('//*[@id="ghyzd"]/div').extract()[-1]
        except Exception:
            item = ''
        return item

    def get_xiangxi(self, response):
        try:
            item = response.xpath('//*[@id="xxjs"]/div').extract()[-1]
        except Exception:
            item = ''
        return item

    def get_develop(self, response):
        try:
            item = response.xpath('//*[@id="jbjs"]/dl').extract()[-1]
        except Exception:
            item = ''
        return item
class BbcspiderSpider(CrawlSpider):
    name = 'BBCspider'
    allowed_domains = ['www.bbc.com']
    start_urls = ['http://www.bbc.com/news',
                  'https://www.bbc.com/news/stories'
                  # 'https://www.bbc.com/news/world',
                  # 'https://www.bbc.com/news/world/africa',
                  # 'https://www.bbc.com/news/world/australia',
                  # 'https://www.bbc.com/news/world/europe',
                  # 'https://www.bbc.com/news/world/latin_america',
                  # 'https://www.bbc.com/news/world/middle_east',
                  # 'https://www.bbc.com/news/world/us_and_canada',
                  # 'https://www.bbc.com/news/world/asia',
                  # 'https://www.bbc.com/news/world/asia/china',
                  # 'https://www.bbc.com/news/world/asia/india',
                  # 'https://www.bbc.com/news/uk',
                  ]
    rules = [
        Rule(LinkExtractor(allow=r'news', deny=denyrule, unique=True),
             callback='parse_item', follow=True),
        Rule(LinkExtractor(allow=r'https://traffic.outbrain.com/network', deny=denyrule, unique=True),
             callback='parse_item', follow=True)
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse, dont_filter=True)

    def parse_item(self, response):
        item = BbcCrawlerItem()
        c_type = response.xpath('//meta[@property="og:type"]/@content').extract_first()
        if c_type == "article":
            item['headline'] = response.xpath('//meta[@property="og:title"]/@content').extract_first()
            ar_author = response.xpath('//meta[@property="article:author"]/@content').extract_first()
            author = response.xpath('//meta[@name="author"]/@content').extract_first()
            item["author"] = ar_author if ar_author else author
            item["keywords"] = response.xpath(
                '//div/ul[@class="tags-list"]/li[@class="tags-list__tags"]/a/text()').extract()
            # import pdb; pdb.set_trace()
            item["description"] = response.xpath('//meta[@name="description"]/@content').extract_first()
            body_sc = response.xpath("//div[@class='story-body__inner']")
            if len(body_sc) > 0:
                list_text = body_sc.xpath(delete_cotent).xpath("string(.)").extract()
                text = '\n'.join(list_text).strip()
            else:
                body_sc = response.xpath("//div[contains(@class,'main_article_text')]")
                list_text = body_sc.xpath(delete_cotent).xpath("string(.)").extract()
                text = '\n'.join(list_text).strip()
            item['text'] = text
            item["viewtime"] = datetime.utcnow()
            item["url"] = response.url
            sha1.update(item['text'].encode('utf-8'))
            item["sha1"] = str(sha1.hexdigest())
            # text.replace('\n', '')
            yield item
class ShudCrawler(scrapy.Spider):
    name = "amazon"
    config = configparser.ConfigParser()
    config.read('../shud.ini')
    sparkSession = SparkSession \
        .builder \
        .appName(config.get('spark', 'appname')) \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    sqlContext = SQLContext(sparkSession)

    # Spider name
    # self.config.get('crawling', 'spidername')
    # The domains that are allowed (links to other domains are skipped)
    allowed_domains = config.get('crawling', 'allowedDomain')
    # The URLs to start with
    start_urls = config.get('crawling', 'startUrl')

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_items method
    rules = [
        Rule(
            LinkExtractor(canonicalize=False, unique=True),
            follow=True,
            callback="parse_items"
        )
    ]

    def start_requests(self):
        print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
        # TODO
        # Walk the list of start URLs to crawl and keep it in memory with crawled=false
        initUrlList = [(self.config.get('crawling', 'startUrl'), "false")]
        df = self.sparkSession.createDataFrame(initUrlList, schema=["url", "crawled"])
        self.sqlContext.registerDataFrameAsTable(df, "WorkTable")
        indx = 0
        urlListe = self.sqlContext.sql("SELECT url from WorkTable where crawled = 'false'")
        while len(urlListe.rdd.collect()) > 0:
            print("####################### Current step = %s " % str(indx))
            for url in urlListe.rdd.collect():
                print("************************** Current url = %s " % str(url))
                # TODO
                # Check that the URL contains at least one of the allowed domains
                # if self.config.get('crawling', 'allowedDomain') in url[0]:
                a = url[0]
                try:
                    self.parse2(self, a, indx)
                except:
                    pass
                # yield scrapy.Request(url=str(a), callback=self.parse)
                print("************************** Current url = %s " % str(url))
            urlListe = self.sqlContext.sql("SELECT url from WorkTable where crawled = 'false'")
            print("************************** DEBUTs WorkTable ")
            Myliste = self.sqlContext.sql("SELECT * from WorkTable")
            print(Myliste.show())
            print("************************** FIN WorkTable")
            if indx > 4:
                break
            indx += 1

    def parse(self, response):
        print("%%%%%%% Current url = %s " % response.url)
        newUrls = []
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=False, unique=True).extract_links(response)
        # Now go through all the found links
        for link in links:
            # Check whether the domain of the URL of the link is allowed,
            # i.e. whether it is in one of the allowed domains
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            # If it is allowed, append the URL to the list
            if is_allowed:
                newUrls.append((link.url, "false"))
        # Get all URLs to synchronize and update
        df = self.sqlContext\
            .sql("SELECT url, crawled from WorkTable where url <>'%s'" % response.url)\
            .union(self.sparkSession.createDataFrame(newUrls))\
            .union(self.sparkSession.createDataFrame([(response.url, "true")]))\
            .dropDuplicates(['url'])
        self.sqlContext.dropTempTable("WorkTable")
        self.sqlContext.registerDataFrameAsTable(df, "WorkTable")
        # print(df.show())

        # TODO
        # Put the response body's content into RDDs
        # page = response.url.split("/")[-2]
        # page = response.url
        m = hashlib.md5(bytes(str(response.url), "ascii"))  # Python 3
        filename = str(self.name) + '_' + m.hexdigest() + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
        item = ShudScraperItem()
        item['url_from'] = response.url
        items.append(item)
        yield item
class BusinessInsiderSpider(CommonBaseSpider):
    name = "businessinsider"
    base_dir = "crawl"
    allowed_domains = ["businessinsider.in"]
    urls = [
        filter(None, item['subcategory'].values()) for item in categories
        if filter(None, item['subcategory'].values())
    ]
    # i.e. similar to [item for sublist in urls for subsublist in sublist for item in subsublist]
    urls = sum(sum(urls, []), [])
    start_urls = urls
    rules = (Rule(LinkExtractor(allow=(r'http\:\/\/www\.businessinsider\.in\/.+\.cms', )),
                  callback='parse_item', follow=False), )

    def parse_item(self, response):
        super(BusinessInsiderSpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)
        news_item = NewsItem()
        try:
            # title = tree.xpath('//*[@id="Content"]/div[3]/div[3]/div[1]/div/div[1]/div/article/div[1]/h1/text()')
            title = tree.xpath(
                '//*[@id="Content"]/div[3]/div[3]/div[1]/div/div[1]/div/article/div[1]/h1//text()')
            # details = tree.xpath('.//div[contains(@class,\'section1\')]//p//text()')
            details = tree.xpath(
                './/div[contains(@class,"hide_show_handler main_content")]//p//text()')
            if title and details:
                news_item['source'] = self.name
                news_item['source_url'] = response.url.split('?')[0]
                news_item['crawled_date'] = datetime.now()
                news_item['title'] = title[0].strip().encode('ascii', 'ignore')
                news_item['details'] = "\t".join([
                    item.strip().encode('ascii', 'ignore') for item in details
                ])
                img_urls = tree.xpath('.//div[contains(@class,\'MeetingImg blk\')]/img/@src')
                img_url_list = []
                if img_urls:
                    for img_url in img_urls:
                        img_url_list.append("http://www.businessinsider.in" + img_url)
                    news_item['img_urls'] = get_stripped_list(img_url_list)
                published_date = tree.xpath(
                    './/div[contains(@class,\'ByLine\')]//span[contains(@class,\'Date\')]//text()')
                if published_date:
                    news_item['published_date'] = datetime.strptime(
                        get_stripped_list(published_date)[0], '%b %d, %Y, %I.%M %p')
                author = tree.xpath('.//a[contains(@class,\'Name\')]/text()')
                if author:
                    news_item['author'] = get_stripped_list(author)
                tags = tree.xpath('.//span[contains(@class,\'anchorLink\')]/text()')
                more_tags = tree.xpath('.//div[contains(@id,\'commentHash\')]//a/text()')
                if tags:
                    news_item['tags'] = get_stripped_list(tags)
                if more_tags:
                    news_item['tags'] = get_stripped_list(more_tags)
                cover_image = tree.xpath('.//div[contains(@class,\'MeetingImg blk\')]/img/@src')
                if cover_image:
                    news_item['cover_image'] = img_url_list[0]  # get_stripped_list(cover_image)
                referer = response.request.headers['Referer']
                for item in categories:
                    if referer in sum(item['subcategory'].values(), []):
                        news_item['category'] = item['category']
                        key = next(key for key, value in item['subcategory'].items()
                                   if referer in value)
                        news_item['sub_categories'] = [key]
                return news_item
        except Exception as e:
            self.log('==Exception=================>>>>>>>>! %r' % e)
            return None
class VansCrawler(CrawlSpider, Mixin):
    name = Mixin.retailer + "-crawler"
    parser = VansParser()
    listings_css = [".topnav-main-item"]
    product_css = [".product-block-figure"]
    deny_re = [".html"]
    PAGE_SIZE = 48

    rules = (Rule(LinkExtractor(restrict_css=listings_css, deny=deny_re),
                  callback="parse_pagination"), )

    def category_zones(self, response):
        css = ".body-container div::attr(lmzone)"
        return response.css(css).extract()

    def site_id(self, response):
        script_re = "WCS_CONFIG.ATTRAQT = (.+?);"
        raw_site_id = json.loads(
            re.findall(script_re, response.body.decode("utf-8").replace("\n", ""))[0])
        return re.findall("zones/(.*).min", raw_site_id["MAINJS"])[0]

    def config_categorytree(self, response):
        return re.findall('categorytree : "(.*)"', response.body.decode("utf-8"))[0]

    def config_language(self, response):
        css = "meta[name='locale']::attr(content)"
        return response.css(css).extract_first()

    def parse_pagination(self, response):
        pages = self.page_count(response)
        cat_zones = self.category_zones(response)
        lang = self.config_language(response)
        parameters = {
            "zone0": cat_zones[0],
            "zone1": cat_zones[1],
            "mergehash": "true",
            "config_categorytree": self.config_categorytree(response),
            "siteId": self.site_id(response),
            "config_language": lang,
            "language": lang,
            "config_country": self.market
        }
        for page in range(0, pages + self.PAGE_SIZE, self.PAGE_SIZE):
            parameters["pageurl"] = f"{response.url}#esp_pg={page//self.PAGE_SIZE}"
            url = self.pagniation_req_url_t.format(urlencode(parameters))
            yield Request(url, callback=self.parse_raw_content, dont_filter=True)

    def parse_raw_content(self, response):
        script_re = "LM.buildZone\((.*)\)"
        raw_html = json.loads(re.findall(script_re, response.body.decode("utf-8"))[0])
        new_response = response.replace(body=raw_html["html"])
        return [
            Request(url, callback=self.parse_item)
            for url in self.product_urls(new_response)
        ]

    def parse_item(self, response):
        return self.parser.parse_product(response)

    def product_urls(self, response):
        css = ".product-block-pdp-url::attr(href)"
        urls = response.css(css).extract()
        return [f"{self.start_urls[0]}{url}" for url in urls]

    def page_count(self, response):
        css = ".header-result-counter ::text"
        return int(response.css(css).re_first("\d+") or '0')
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='Box_SP_detail']/div[@class='Box_SP_detail_right']/h2",
    'price': "//div[@class='Box_SP_detail']/div[@class='Box_SP_detail_right']/p/b/span",
    'category': "",
    'description': "//div[@class='Page_left_2']/div[@class='Box_content']/div[@id='content']",
    'images': "//div[@class='Box_SP_detail_left']/h3/a/@href",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'daotoan.com'
allowed_domains = ['daotoan.com']
start_urls = ['http://daotoan.com']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-_]+-id+\.aspx']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-_]+-in+\.aspx']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
class ShclearingSpider(CrawlSpider):
    name = 'shclearing'
    source = "上海清算所"
    allowed_domains = ["shclearing.com"]
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    yesterday = yesterday.strftime('%Y%m%d')
    reg = yesterday
    start_urls = ['http://www.shclearing.com/cpyyw/tzgg/']
    rules = (
        Rule(LinkExtractor(allow=reg), callback="parse_news", follow=True),
        # Rule(LinkExtractor(allow='_[0-9].html'))
    )

    def printcn(uni):
        for i in uni:
            print(uni.encode('utf-8'))

    def parse_news(self, response):
        item = GenericItem()
        self.get_id(response, item)
        self.get_url(response, item)
        self.get_source(response, item)
        self.get_title(response, item)
        self.get_date(response, item)
        self.get_body(response, item)
        # Remember to return the item after parsing
        if item['body']:
            return item

    def get_id(self, response, item):
        id = uuid.uuid4()
        if id:
            item['id'] = id

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            item['url'] = news_url

    def get_source(self, response, item):
        source = self.source
        if source:
            item['source'] = source

    def get_title(self, response, item):
        title = response.xpath('//h1[@id="title"]/text()').extract()
        if title:
            item['title'] = ''.join(title).strip()

    def get_date(self, response, item):
        item['date'] = self.yesterday + '000000'

    def get_body(self, response, item):
        paras = response.xpath('//div[@class="TRS_Editor"]/p')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    # print entry.encode('utf-8')
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = news_body.replace('_|__|_', '_|_')
class X163Spider(CrawlSpider):
    name = '163'
    source = "网易财经"
    allowed_domains = ["163.com"]
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    yesterday = yesterday.strftime('%Y/%m%d')[2:]
    reg = yesterday
    start_urls = [
        'http://money.163.com/special/00252G50/macro.html',
        'http://money.163.com/special/00252C1E/gjcj.html'
    ]
    rules = (Rule(LinkExtractor(allow=reg), callback="parse_news", follow=True),
             Rule(LinkExtractor(allow='macro_')),
             Rule(LinkExtractor(allow='gjcj_')))

    def printcn(uni):
        for i in uni:
            print(uni.encode('utf-8'))

    def parse_news(self, response):
        item = GenericItem()
        self.get_id(response, item)
        self.get_url(response, item)
        self.get_source(response, item)
        self.get_title(response, item)
        self.get_date(response, item)
        self.get_body(response, item)
        # Remember to return the item after parsing
        if item['body']:
            return item

    def get_id(self, response, item):
        id = uuid.uuid4()
        if id:
            item['id'] = id

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            item['url'] = news_url

    def get_source(self, response, item):
        source = self.source
        if source:
            item['source'] = source

    def get_title(self, response, item):
        title = response.xpath('//div[@id="epContentLeft"]/h1/text()').extract()
        if title:
            item['title'] = title

    def get_date(self, response, item):
        date = response.xpath('//div[@class="post_time_source"]/text()').extract()[0]
        if date:
            item['date'] = ''.join(date).replace(u'-', u'').replace(
                u':', u'').replace(u' ', u'').strip()[:14]

    def get_body(self, response, item):
        paras = response.xpath('//div[@id="endText"]/p')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    # print entry.encode('utf-8')
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = news_body.replace('_|__|_', '_|_')
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[2]/h1/a",
    'price': "//h3[@class='views-field views-field-display-price']/span[@class='field-content']",
    'category': "//div[@class='breadcrumb']/a",
    'description': "//div[@class='views-field views-field-body']/div[@class='field-content']",
    'images': "//div[@class='field-item even']/a/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'khoe24.vn'
allowed_domains = ['khoe24.vn']
start_urls = ['http://khoe24.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/thuoc/']), 'parse_item'),
    Rule(LinkExtractor(deny=['/']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='product-name']/h1",
    'price': "//div[@class='product-quick-view']/div[@class='left-info']/div[@class='price-box']/span/span",
    'category': "//div[@class='breadcrumbs']/ul/li/a",
    'description': "//div[@class='box-collateral box-description']",
    'images': "//img[@id='zoom_03']/@data-zoom-image",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'kiddymart.vn'
allowed_domains = ['kiddymart.vn']
start_urls = ['http://kiddymart.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(), 'parse_item'),
    Rule(LinkExtractor(), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='prod_content']/h1[@id='prod_title']",
    'price': "//div[@class='final_price']/span[@id='special_price_box']",
    'category': "//div[@class='bcr box breadcrumbs']/ul/li/a",
    'description': "//div[@class='prod_details']",
    'images': "//div[@id='prdMedia']/div[@id='img_large']/div/a/@href",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'anhchinh.vn'
allowed_domains = ['anhchinh.vn']
start_urls = ['http://www.anhchinh.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-_]+_id+\d+\.html']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-_]+_dm+\d+\.html'], deny=['\?']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
class UrbanDictSpider(CrawlSpider):
    name = "urbandict"
    allowed_domains = ["urbandictionary.com"]
    start_urls = [
        "http://www.urbandictionary.com/",
        "http://www.urbandictionary.com/popular.php?character=Q",
        "http://www.urbandictionary.com/popular.php?character=W",
        "http://www.urbandictionary.com/popular.php?character=E",
        "http://www.urbandictionary.com/popular.php?character=R",
        "http://www.urbandictionary.com/popular.php?character=T",
        "http://www.urbandictionary.com/popular.php?character=Y",
        "http://www.urbandictionary.com/popular.php?character=U",
        "http://www.urbandictionary.com/popular.php?character=I",
        "http://www.urbandictionary.com/popular.php?character=O",
        "http://www.urbandictionary.com/popular.php?character=P",
        "http://www.urbandictionary.com/popular.php?character=A",
        "http://www.urbandictionary.com/popular.php?character=S",
        "http://www.urbandictionary.com/popular.php?character=D",
        "http://www.urbandictionary.com/popular.php?character=F",
        "http://www.urbandictionary.com/popular.php?character=G",
        "http://www.urbandictionary.com/popular.php?character=H",
        "http://www.urbandictionary.com/popular.php?character=J",
        "http://www.urbandictionary.com/popular.php?character=K",
        "http://www.urbandictionary.com/popular.php?character=L",
        "http://www.urbandictionary.com/popular.php?character=Z",
        "http://www.urbandictionary.com/popular.php?character=X",
        "http://www.urbandictionary.com/popular.php?character=C",
        "http://www.urbandictionary.com/popular.php?character=V",
        "http://www.urbandictionary.com/popular.php?character=B",
        "http://www.urbandictionary.com/popular.php?character=N",
        "http://www.urbandictionary.com/popular.php?character=M",
        "http://www.urbandictionary.com/yesterday.php",
    ]
    rules = (Rule(LinkExtractor(allow=("browse", "popular", "yesterday", "define", "favorites")),
                  callback="parse_items", follow=True), )

    def parse_items(self, response):
        """ """
        sel = Selector(response)
        items = []
        """if "urbandictionary.com/define" not in response.url:
            return items"""
        sites = sel.xpath('//div[@id="content"]/div[@class="def-panel"][1]')
        for site in sites:
            item = DomainNameScraperItem()
            item['word'] = site.xpath('div[@class="def-header"]/a[@class="word"]/text()').extract()
            item['meaning'] = site.xpath('div[@class="meaning"]/text()').extract()
            item['example'] = site.xpath('div[@class="example"]/text()').extract()
            item['pos'] = site.xpath('.//a[@class="thumb up"]/span[@class="count"]/text()').extract()
            item['neg'] = site.xpath('.//a[@class="thumb down"]/span[@class="count"]/text()').extract()
            item["source"] = "urbandictonary"
            items.append(item)
        return items
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div/div/div/h1",
    'price': "//tr/td/font",
    'category': "//div[@id='yarnball']/ul[@class='yarnball']/li/a",
    'description': "//div[@id='centerid']/div[@id='content']/div[@id='contentid']/div[@class='viewpron']",
    'images': "//td[@id='proimg']/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'alobaby.com.vn'
allowed_domains = ['alobaby.com.vn']
start_urls = ['http://alobaby.com.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/\d+/\d+/']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+-\d+\.html']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h1",
    'price': "//div[@class='product-price-pomotion']/p[@class='price']",
    'category': "//div[@class='product-category-path']/ul[@class='list-item']/li/a",
    'description': "//div[@class='pr-l-snv-i active']",
    'images': "//div[@id='gallery']/a/@href",
    'canonical': "",
    'base_url': "",
    'brand': "",
    'in_stock': "",
    'guarantee': ""
}
name = 'haiphongtelecom.com'
allowed_domains = ['haiphongtelecom.com']
start_urls = ['http://haiphongtelecom.com/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = ['']
rules = [
    #Rule(LinkExtractor(), 'parse_item'),
    #Rule(LinkExtractor(), 'parse'),
    Rule(
        LinkExtractor(allow=[
            '/[a-zA-Z0-9-]+\.html($|\?page=\d+&sort=goods_id&order=DESC#goods_list$)'
        ]),
        'parse_item_and_links'),
]
class IndependentSpider(CrawlSpider):
    name = "independent"
    allowed_domains = ["independent.co.uk"]

    def __init__(self, yearmonth='', *args, **kwargs):
        super(IndependentSpider, self).__init__(*args, **kwargs)
        begin_date = pd.Timestamp(yearmonth + "-01")
        end_date = pd.Timestamp(begin_date) + pd.DateOffset(months=1) - pd.DateOffset(days=1)
        date_inds = [d.date().isoformat() for d in pd.date_range(begin_date, end_date)]
        self.start_urls = ["http://www.independent.co.uk/archive/%s" % d for d in date_inds]

    rules = (Rule(LinkExtractor(
        allow=(),
        restrict_xpaths=('//ol[@class="margin archive-news-list"]/li/a', )),
        callback="parse_items", follow=True), )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item["link"] = response.request.url
        item["lang"] = "en"
        item["source"] = "independent"
        title = hxs.xpath('//h1[@itemprop="headline"]/text()').extract()
        intro = hxs.xpath('//div[@class="intro"]/p/text()').extract()
        author = hxs.xpath('//span[@itemprop="name"]/a/text()').extract()
        category = hxs.xpath('//ol[@class="breadcrumbs clearfix"]//a//text()').extract()
        new_content = hxs.xpath('//div[@itemprop="articleBody"]/p//text()').extract()
        date_time = hxs.xpath(
            '//ul[@class="caption meta inline-pipes-list"]//time/@datetime').extract()

        # Processing outputs
        author = [re.sub('^By\s', '', a) for a in author]
        author = [re.sub('\sin\s.*', '', a) for a in author]
        new_content = [p for p in new_content if not re.search(u'\u2022', p)]
        new_content = [
            p for p in new_content
            if not re.search('font-family|background-color:', p)
        ]
        new_content = ' '.join(new_content)
        new_content = re.sub('\n', '', new_content)
        item["content"] = re.sub('\s{2,}', ' ', new_content)
        author = '|'.join(author)
        item["category"] = '|'.join(category)
        item["intro"] = ' '.join(intro)
        item["title"] = ' '.join(title)
        datte = re.findall('[0-9]+.[0-9]+.[0-9]+', date_time[0])[0]
        tme = re.findall('[0-9]+:[0-9]+', date_time[0])[0]
        datte = datte.split('/')
        item["date_time"] = datte[2] + '-' + datte[1] + '-' + datte[0] + 'T' + tme
        item["author"] = author
        return item
class MovieSpider(CrawlSpider):
    name = 'movie'
    allowed_domains = ['www.id97.com']
    start_urls = ['http://www.id97.com/movie/']

    # Custom settings for this spider (optional)
    # custom_settings = {
    #     'ITEM_PIPELINES': {
    #         'demo1.pipelines.MyMongoDbPipeline': 302,
    #     }
    # }

    # Extract all pagination links according to the rule
    page_link = LinkExtractor(allow=r'/movie/\?page=\d')
    detail_link = LinkExtractor(
        restrict_xpaths='//div[contains(@class,"col-xs-1-5")]/div/a')

    # follow: whether to keep following links from matched pages
    rules = (
        # Pagination pages need no parsing callback; just follow them
        Rule(page_link, follow=True),
        # Detail pages are parsed but not followed
        Rule(detail_link, callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # Create the item (original note: could be passed in via response.meta)
        item = Demo1Item()
        # Movie poster
        item['post'] = response.xpath('//a[@class="movie-post"]/img/@src').extract_first()
        # Movie title
        item['name'] = response.xpath('//h1').xpath('string(.)').extract_first()
        # Rating
        item['score'] = response.xpath(
            '//div[@class="col-xs-8"]/table/tbody/tr[last()]/td[2]').xpath(
            'string(.)').extract_first()
        # Genre
        item['_type'] = response.xpath(
            '//div[@class="col-xs-8"]/table/tbody/tr[3]/td[2]').xpath(
            'string(.)').extract_first()
        # Director
        item['director'] = response.xpath(
            '//div[@class="col-xs-8"]/table/tbody/tr[1]/td[2]/a/text()').extract_first()
        # Screenwriter
        item['editor'] = response.xpath(
            '//div[@class="col-xs-8"]/table/tbody/tr[2]/td[2]/a/text()').extract_first()
        # Cast, e.g. '张静初 / 龙品旭 / 黎兆丰 / 王同辉 / 张国强 / 叶婉娴 / 丽娜 / 吴海燕 / 吴若林 / 喻引娣 显示全部'
        item['actor'] = response.xpath(
            '//div[@class="col-xs-8"]/table/tbody/tr[3]/td[2]').xpath(
            'string(.)').extract_first().replace(' ', '').replace('显示全部', '')
        # Running time
        lala = response.xpath(
            '//div[@class="col-xs-8"]/table/tbody/tr[8]/td[2]/text()').extract_first()
        if lala and ('分钟' in lala):
            item['long_time'] = lala
        else:
            item['long_time'] = ''
        # Synopsis
        introduce = response.xpath(
            '//div[@class="col-xs-12 movie-introduce"]').xpath('string(.)').extract_first()
        if introduce is None:
            item['introduce'] = ''
        else:
            item['introduce'] = introduce.replace('\u3000', '').replace('展开全部', '')
        # Download link
        # item['download_url'] = response.xpath('')
        yield item
class Iosrpgcrawler(CrawlSpider):
    name = 'iosrpgcrawler'
    allowed_domains = ['apps.apple.com']
    start_urls = [
        'https://apps.apple.com/us/genre/ios-games-role-playing/id7014?letter=A'
    ]
    rules = (
        # paginate by letter
        Rule(LinkExtractor(
            allow='genre/ios-games-role-playing/id7014\?letter=(\D)')),
        # paginate to next page
        Rule(LinkExtractor(
            allow='genre/ios-games-role-playing/id7014\?letter=(\D)&page=(\d+)#page',
            restrict_xpaths='//a[@class="paginate-more"]')),
        # go to the actual app description page
        Rule(LinkExtractor(allow='app\/(.+)\/id(\d+)',
                           restrict_xpaths='//div[@id="selectedcontent"]'),
             callback='parse_game'),
    )

    def parse_game(self, response):
        il = GameItemLoader(item=Game(), response=response)
        # basic information
        il.add_xpath('title', '//h1[@class="product-header__title app-header__title"]/text()')
        il.add_xpath('subtitle', '//h2[@class="product-header__subtitle app-header__subtitle"]/text()')
        il.add_xpath('author', '//h2[@class="product-header__identity app-header__identity"]/a/text()')
        il.add_xpath('price', '//li[@class="inline-list__item inline-list__item--bulleted app-header__list__item--price"]/text()')
        il.add_xpath('iap', '//li[@class="inline-list__item inline-list__item--bulleted app-header__list__item--in-app-purchase"]/text()')
        il.add_xpath('age', '//span[@class="badge badge--product-title"]/text()')
        il.add_xpath('desc', '//div[@class="section__description"]//p/text()')
        # game popularity and reception
        il.add_xpath('list_rank', '//li[@class="inline-list__item"]/text()')
        il.add_xpath('score', '//span[@class="we-customer-ratings__averages__display"]/text()')
        il.add_xpath('nrating', '//div[@class="we-customer-ratings__count small-hide medium-show"]/text()')
        il.add_xpath('stars', '//div[@class="we-star-bar-graph__row"]/div/div/@style')
        # other details
        il.add_xpath('editor', '//div[@class="we-editor-notes lockup ember-view"]/div/h3/text()')
        il.add_xpath('seller', '//dl[@class="information-list information-list--app medium-columns"]/div[1]/dd[@class="information-list__item__definition l-column medium-9 large-6"]/text()')
        il.add_xpath('size', '//dl[@class="information-list information-list--app medium-columns"]/div[2]/dd[@class="information-list__item__definition l-column medium-9 large-6"]/text()')
        il.add_xpath('category', '//dl[@class="information-list information-list--app medium-columns"]/div[3]/dd/a/text()')
        il.add_xpath('compat', '//dl[@class="information-list information-list--app medium-columns"]//p/text()')
        il.add_xpath('lang', '//dl[@class="information-list information-list--app medium-columns"]//p/text()')
        il.add_xpath('age_copy', '//dl[@class="information-list information-list--app medium-columns"]/div//dd/text()')
        il.add_xpath('support', '//div[@class="supports-list__item__copy"]/h3[@dir="ltr"]/text()')
        return il.load_item()
class BaiduSpider(RedisCrawlSpider):
    task_queue = baidu_task_queue
    base_url = "https://baike.baidu.com"
    name = baidu_spider_name
    allowed_domains = ['baike.baidu.com']
    rules = (Rule(LinkExtractor(allow=('https://baike.baidu.com/item/', )),
                  callback='parse', follow=True), )

    # custom_settings = {
    #     'ITEM_PIPELINES': {
    #         'baikeSpider.pipelines.SpiderPipeline': 300,
    #         'baikeSpider.pipelines.SpiderRedisPipeline': 301,
    #         'baikeSpider.pipelines.WebCachePipeline': 302,
    #     },
    # }

    def parse(self, response):
        items = BaiduSpiderItem()
        selector = Selector(response)
        # print(response.status, response)
        items['url'] = unquote(response.url)
        items['html'] = response.text

        title = selector.xpath("/html/head/title/text()").extract()
        if title:
            items['title'] = title[0].strip().encode('utf-8', errors='ignore').decode('utf-8')
        else:
            items['title'] = ''

        summary = selector.xpath("//div[@class=\"lemma-summary\"]").xpath("string(.)").extract()
        if summary:
            tmps = summary[0].encode('utf-8', errors='ignore').decode('utf-8')
            items['summary'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}', '\n', tmps)
        else:
            items['summary'] = ''

        basic_info = selector.xpath("//div[@class=\"basic-info cmn-clearfix\"]").xpath("string(.)").extract()
        if basic_info:
            tmpb = basic_info[0].encode('utf-8', errors='ignore').decode('utf-8')
            items['basic_info'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}', '\n', tmpb)
        else:
            items['basic_info'] = ''

        catalog = selector.xpath("//div[@class=\"lemmaWgt-lemmaCatalog\"]").xpath("string(.)").extract()
        if catalog:
            tmpc = catalog[0].encode('utf-8', errors='ignore').decode('utf-8')
            items['catalog'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}', '\n', tmpc)
        else:
            items['catalog'] = ''

        # Item links to crawl iteratively
        urls = [
            unquote(item) for item in selector.xpath(
                "//div[@class=\"para\"]//a[@target=\"_blank\"]/@href").extract()
        ]
        items['keywords_url'] = list(set(filter(lambda x: 'item' in x, urls)))

        description = selector.xpath("//div[@class=\"content-wrapper\"]").xpath("string(.)").extract()
        if description:
            tmpd = description[0].encode('utf-8', errors='ignore').decode('utf-8')
            items['description'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}', '\n', tmpd)
        else:
            items['description'] = ''

        # Extract embedded images, JS and CSS
        items['embed_image_url'] = CacheTool.parse_img(items['html'])
        items['js'] = CacheTool.parse_js(items['html'])
        items['css'] = CacheTool.parse_css(items['html'])

        album_pic_url = selector.xpath(
            "//div[@class=\"album-list\"]//a[@class=\"more-link\"]/@href").extract()
        if album_pic_url:
            items['album_pic_url'] = self.base_url + unquote(album_pic_url[0])
        else:
            items['album_pic_url'] = ''

        update_time = selector.xpath("//span[@class = 'j-modified-time']").xpath("string(.)").extract()
        if update_time:
            tmpu = update_time[0].strip().encode('utf-8', errors='ignore').decode('utf-8')
            items['update_time'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}', '\n', tmpu)
        else:
            items['update_time'] = ''

        reference_material = selector.xpath(
            "//dl[@class ='lemma-reference collapse nslog-area log-set-param']").xpath("string(.)").extract()
        if reference_material:
            tmpr = reference_material[0].encode('utf-8', errors='ignore').decode('utf-8')
            items['reference_material'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}', '\n', tmpr)
            # print(items['reference_material'])
        else:
            items['reference_material'] = ''

        item_tag = selector.xpath("//dd[@id = \"open-tag-item\"]").xpath("string(.)").extract()
        if item_tag:
            tmpi = item_tag[0].encode('utf-8', errors='ignore').decode('utf-8')
            items['item_tag'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}', '\n', tmpi)
        else:
            items['item_tag'] = ''

        print('百度百科爬虫==》', items['title'])
        # Yield a deep copy: a shallow copy of the item can be mutated while it is still
        # passing through the pipelines, which mixes up data between pages.
        yield copy.deepcopy(items)
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='summary entry-summary']/h1[@class='product_title entry-title']",
    'price': "//p[@class='price']/span[@class='amount']",
    'category': "//div[@class='breadcrumb-trail']/a",
    'description': "//div[@class='woocommerce-tabs']/div[@id='tab-description']/div/div/p",
    'images': "//img[@class='attachment-shop_single wp-post-image']/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'phukieniphone.info'
allowed_domains = ['phukieniphone.info']
start_urls = ['http://phukieniphone.info/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/9/[a-z0-9-]']), 'parse_item'),
    Rule(LinkExtractor(deny=['/9/[a-z0-9-]', 'add_to_cart']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
class ShopindexSpider(CrawlSpider):
    name = 'shopindex'
    allowed_domains = ['nuomi.com', 'dianping.com', 'cq.meituan.com']
    start_urls = [
        # 'https://www.dianping.com/shop/24098260'
        # 'http://cq.meituan.com/shop/82458075'
        # ,'http://www.nuomi.com/deal/d3ccslof.html'
        # ,'https://www.dianping.com/shop/32463358'
    ]
    # settings = get_project_settings()
    downLoadUrlsFile = '../hlwdata/data/start_url.txt'
    startUrlsFile = '../hlwdata/data/downloaded_url.txt'
    lst = loadUrl(downLoadUrlsFile)
    rules = (
        Rule(FilterLinkExtractor(allow=r'http://www.nuomi.com/deal/[\w]+', download=lst),
             callback='parse_nuomi', follow=True),
        Rule(FilterLinkExtractor(allow=r'https://www.dianping.com/shop/[\d]+$', download=lst),
             callback='parse_dianping', follow=True),
        Rule(FilterLinkExtractor(allow=r'http://cq.meituan.com/shop/[\d]+\.*[\w]*$', download=lst),
             callback='parse_meituan', process_links='link_filtering', follow=True),
    )

    def link_filtering(self, links):
        for link in links:
            link.url = link.url.rstrip('.html')
        return links

    visitedShop = set()

    def start_requests(self):
        for url in loadUrl(self.startUrlsFile):
            yield self.make_requests_from_url(url)
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_nuomi(self, response):
        # Only crawl food (美食) listings
        prdType = response.xpath(
            '//div[@class="w-bread-crumb"]//a[@href="/326"]/text()').extract()
        prdType = "".join(prdType).strip('\n')
        if prdType != u'美食':
            return
        items = []
        sel = response.xpath('//div[@class="p-item-info"]')
        dealId = sel.xpath('@mon').extract_first().split('=')[1]
        shopUrl = 'http://www.nuomi.com/pcindex/main/shopchain?dealId=' + dealId
        html = requests.get(shopUrl, headers=headers)
        js = json.loads(html.text)
        # shopCity = js['data']['city']['900010000']['city_name']
        for shop in js['data']['shop']:
            shopId = shop['merchant_id']
            shopCity = shop['city_id']
            # Only keep Chongqing food listings
            # if shopId in self.visitedShop or shopCity != u'900010000':
            if shopId in self.visitedShop:
                continue
            else:
                self.visitedShop.add(shopId)
            shopName = shop['name']
            shopCity = js['data']['city'][shopCity]['city_name']
            shopAddr = shop['address']
            shopPhone = shop['phone']
            shopGlat = shop['baidu_latitude']
            shopGlng = shop['baidu_longitude']
            shopUrl = shop['link']
            shopPicSave = ''
            shopScrapWeb = 'nuomi'
            item = ShopIndexItem()
            item['shopId'] = shopId
            item['shopCity'] = shopCity
            item['shopName'] = shopName
            item['shopAddr'] = shopAddr
            item['shopPhone'] = shopPhone
            item['shopGlat'] = shopGlat
            item['shopGlng'] = shopGlng
            item['shopUrl'] = shopUrl
            item['shopPicSave'] = shopPicSave
            item['shopScrapWeb'] = shopScrapWeb
            items.append(item)
        return items

    def parse_dianping(self, response):
        sel = response.xpath('//div[@id="basic-info"]')
        # Only crawl food listings; the element above marks a food page
        if not sel:
            print('not meishi ' + response.url)
            return
        shopId = re.search(r'/shop/([\d]+)$', response.url).group(1)
        if shopId in self.visitedShop:
            return
        else:
            self.visitedShop.add(shopId)
        shopCity = response.xpath(
            '//*[@id="page-header"]//a[@class="city J-city"]/text()').extract_first()
        shopName = sel.xpath('h1[@class="shop-name"]/text()').extract_first()
        shopAddr = sel.xpath('.//span[@itemprop="street-address"]/text()').extract_first()
        shopPhone = sel.xpath('.//span[@itemprop="tel"]/text()').extract_first()
        # shopDataUrl = 'http://www.dianping.com/ajax/json/shop/wizard/BasicHideInfoAjaxFP?shopId=%s' % shopId
        # htmlshop = requests.get(shopDataUrl, headers=headers)
        # try:
        #     shopJson = json.loads(htmlshop.text)
        #     shopInfo = shopJson['msg']['shopInfo']
        #     shopGlat = str(shopInfo['glat'])
        #     shopGlng = str(shopInfo['glng'])
        # except (ValueError, KeyError, TypeError):
        #     print("JSON format error")
        shopInfo = ''
        lng = re.search(r'lng:([\d]+\.[\d]+)', response.body)
        lat = re.search(r'lat:([\d]+\.[\d]+)', response.body)
        shopGlat = ''
        shopGlng = ''
        if lng and lat:
            shopGlng = lng.group(1)
            shopGlat = lat.group(1)
        shopUrl = response.url
        shopPicSave = ''
        shopScrapWeb = 'dianping'
        item = ShopIndexItem()
        item['shopId'] = shopId
        item['shopCity'] = shopCity
        item['shopName'] = shopName.strip('\n').strip(' ').strip('\n')
        item['shopAddr'] = shopAddr.strip('\n').strip(' ').strip('\n')
        item['shopPhone'] = shopPhone
        item['shopGlat'] = shopGlat
        item['shopGlng'] = shopGlng
        item['shopUrl'] = shopUrl
        item['shopPicSave'] = shopPicSave
        item['shopScrapWeb'] = shopScrapWeb
        yield item

    def parse_meituan(self, response):
        sel = response.xpath('//div[@class="fs-section__left"]')
        # if not response.xpath('//div[@id="meishi-menu"]/h2[@class="content-title"]'):
        #     print('not meishi ' + response.url)
        #     return
        shopId = re.search(r'/shop/([\d]+)$', response.url).group(1)
        if shopId in self.visitedShop:
            return
        else:
            self.visitedShop.add(shopId)
        shopName = sel.xpath('.//h2/span[@class="title"]/text()').extract_first()
        shopAddr = sel.xpath('.//p/span[@class="geo"]/text()').extract_first()
        shopJson = json.loads(
            sel.xpath('.//p/span[@id="map-canvas"]/@data-params').extract_first())
        shopInfo = shopJson['shops'][shopId]
        shopPhone = shopInfo['phone']
        shopGlat = str(shopInfo['position'][0])
        shopGlng = str(shopInfo['position'][1])
        shopUrl = response.url
        shopPicSave = ''
        shopScrapWeb = 'meituan'
        item = ShopIndexItem()
        item['shopId'] = shopId
        item['shopCity'] = ''
        item['shopName'] = shopName.strip('\n').strip(' ').strip('\n')
        item['shopAddr'] = shopAddr.strip('\n').strip(' ').strip('\n')
        item['shopPhone'] = shopPhone
        item['shopGlat'] = shopGlat
        item['shopGlng'] = shopGlng
        item['shopUrl'] = shopUrl
        item['shopPicSave'] = shopPicSave
        item['shopScrapWeb'] = shopScrapWeb
        yield item
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='content_data']/div[@class='content_trangcon']/h1",
    'price': "//div[@class='noidung_sanpham']/div[@id='sanpham']/div[@class='sanpham_mota']/div[@class='giaban_ct']",
    'category': "",
    'description': "//div[@class='content_data']/div[@class='content_trangcon']/div[@class='noidung_sanpham']/div",
    'images': "//div[@class='img_noidung']/div[@id='load_IMG']/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'luxy.vn'
allowed_domains = ['luxy.vn']
start_urls = ['http://luxy.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+\.html']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+/+$']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
import psycopg2  # needed for the database connection below

import config.config as config, config.database as db
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
# mySpider (providing wscrappingSpider) is assumed to be imported from the surrounding project.

process = CrawlerProcess({
    'USER_AGENT': config.Config.USER_AGENT,
    'DOWNLOAD_DELAY': 2,
    'RETRY_ENABLED': False,
    'COOKIES_ENABLED': False,
    'REDIRECT_ENABLED': False,
    'AJAXCRAWL_ENABLED': True
})

conn_string = "host='" + db.DatabaseConfig.DB_HOST + "' "
conn_string += "dbname='" + db.DatabaseConfig.DB_NAME + "' "
conn_string += "user='******'"
conn_string += "password='******'"
conn = psycopg2.connect(conn_string)

rules = (Rule(LinkExtractor(), callback='parse_item', follow=True), )

process.crawl(mySpider.wscrappingSpider,
              db=conn,
              allowed_domains=config.Config.DOMAINS_TO_SCRAP,
              keywords=config.Config.KEYWORDS_FILTER,
              start_urls=config.Config.URLS_TO_SCRAP,
              rules=rules)
process.start()
conn.close()
class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']
    custom_settings = {
        "COOKIES_ENABLED": False,
        "DOWNLOAD_DELAY": 1,
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Cookie': 'user_trace_token=20171015132411-12af3b52-3a51-466f-bfae-a98fc96b4f90; LGUID=20171015132412-13eaf40f-b169-11e7-960b-525400f775ce; SEARCH_ID=070e82cdbbc04cc8b97710c2c0159ce1; ab_test_random_num=0; X_HTTP_TOKEN=d1cf855aacf760c3965ee017e0d3eb96; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DsXIrWUxpNGLE2g_bKzlUCXPTRJMHxfCs6L20RqgCpUq%26wd%3D%26eqid%3Dee53adaf00026e940000000559e354cc; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_hotjob; login=false; unick=""; _putrc=""; JSESSIONID=ABAAABAAAFCAAEG50060B788C4EED616EB9D1BF30380575; _gat=1; _ga=GA1.2.471681568.1508045060; LGSID=20171015203008-94e1afa5-b1a4-11e7-9788-525400f775ce; LGRID=20171015204552-c792b887-b1a6-11e7-9788-525400f775ce',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.7 Safari/537.36',
        }
    }

    rules = (
        Rule(LinkExtractor(allow=('zhaopin/.*')), follow=True),
        Rule(LinkExtractor(allow=('gongsi/j\d+.html.*')), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True),
    )

    # def parse_start_url(self, response):
    #     return []
    #
    # def process_results(self, response, results):
    #     return results

    def parse_job(self, response):
        # Parse a Lagou job posting
        item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath("job_city", "/html/body/div[3]/div/div[1]/dd/p[1]/span[2]/text()")
        item_loader.add_xpath("work_years", "/html/body/div[3]/div/div[1]/dd/p[1]/span[3]/text()")
        item_loader.add_xpath("degree_need", "/html/body/div[3]/div/div[1]/dd/p[1]/span[4]/text()")
        item_loader.add_xpath("job_type", "/html/body/div[3]/div/div[1]/dd/p[1]/span[5]/text()")
        item_loader.add_xpath("tags", "/html/body/div[3]/div/div[1]/dd/ul/li/text()")
        # missing on some pages
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", '.work_addr a::text')
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())
        job_item = item_loader.load_item()
        return job_item
import re
import datetime

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

# Assumed project import path: HooliItem is the item class this spider fills in.
from ..items import HooliItem


class FalmouthSpider(CrawlSpider):
    name = 'falmouth_gd'
    allowed_domains = ['www.falmouth.ac.uk']

    Lists = [
        'http://flexible.falmouth.ac.uk/courses/ma-advertising-strategy-planning.htm',
        'https://www.falmouth.ac.uk/communication-design-ma',
        'https://www.falmouth.ac.uk/creativeadvertising',
        'http://flexible.falmouth.ac.uk/courses/ma-creative-app-development.htm',
        'http://flexible.falmouth.ac.uk/courses/ma-creative-events-management.htm',
        'https://www.falmouth.ac.uk/film-television-ma',
        'https://www.falmouth.ac.uk/illustrationma',
        'https://www.falmouth.ac.uk/launchpad',
        'https://www.falmouth.ac.uk/leasing-asset-finance',
        'http://flexible.falmouth.ac.uk/courses/ma-photography.htm',
        'http://flexible.falmouth.ac.uk/courses/pgche.htm',
        'https://www.falmouth.ac.uk/professionalwriting',
        'http://flexible.falmouth.ac.uk/courses/ma-writing-for-script-screen.htm',
    ]
    # The course URLs above are already absolute, so they can be used as start_urls directly.
    start_urls = list(Lists)

    rules = (
        # Follow links inside the course listing grid, then hand every page to parse_item.
        Rule(LinkExtractor(allow=r'.*', restrict_xpaths='//li[@class="item isotope-item"]/a'), follow=True),
        Rule(LinkExtractor(allow=r'.*'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        print('==================================', response.url)
        item = HooliItem()

        url = response.url
        print(1, url)

        university = 'FALMOUTH UNIVERSITY'
        print(2, university)

        department = 'NULL'
        country = 'UK'
        city = 'NULL'
        website = 'https://www.falmouth.ac.uk'

        # programme = response.xpath('//div[@class="title"]/h1/text()').extract()
        programme = response.xpath('//div[@class="h1-box"]/h1/text()').extract()
        programme = ''.join(programme)
        print(3, programme)

        ucas_code = 'NULL'
        degree_level = '1'

        degree_type = response.xpath('//div[@class="h1-box"]/h1/text()').extract()
        degree_type = ''.join(degree_type)
        print(4, degree_type)

        # Start dates live inside the accordion block, between two fixed headings.
        start_date_lists = response.xpath('//div[@class="accordion"]//text()').extract()
        start_date_str = ''.join(start_date_lists)
        if "Start dates and application deadlines" in start_date_str:
            sdstart = start_date_str.find("Start dates and application deadlines")
            sdend = start_date_str.find("News and Events")
            start_date = start_date_str[sdstart:sdend]
        else:
            start_date = 'NULL'
        print(5, start_date)

        # overview = response.xpath('//div[@class="moduleWhite smallmargin"]//text()').extract()
        overview_list = response.xpath('//div[@class="content-block-wrapper"]//text()').extract()
        overview_str = ''.join(overview_list)
        if "Benefits" in overview_str:
            Ostart = overview_str.find("Benefits")
            Oend = overview_str.find("How the course is taught")
            overview = overview_str[Ostart:Oend]
        else:
            overview = overview_str
        print(6, overview)

        mode = response.xpath('//div[@class="content-block-wrapper"]//dl//text()').extract()
        mode = ''.join(mode)
        # An earlier variant sliced mode out of //div[@class="moduleWhite smallmargin"]
        # between "Mode of study:" and "Summary"; the <dl> selector above supersedes it.
        print(7, mode)

        types = ''  # collected but never stored on the item

        duration = response.xpath('//div[@class="content-block-wrapper"]//dl//text()').extract()
        duration = ''.join(duration)
        # An earlier variant sliced duration out of //div[@class="moduleWhite smallmargin"]
        # between "Mode of study:" and "Duration:"; the <dl> selector above supersedes it.
        print(8, duration)

        modules = response.xpath('//div[@class="accordion ui-accordion ui-widget ui-helper-reset"]//text()').extract()
        modules = ''.join(modules)
        # An earlier variant sliced modules out of //div[@class="accordion"]
        # between "Course content" and "Assessments".
        print(9, modules)

        teaching = 'NULL'

        assessment = response.xpath('//div[@class="accordion"]//text()').extract()
        assessment = ''.join(assessment)
        # An earlier variant sliced the assessment text out of the accordion
        # between "Assessments" and "How you study".
        print(10, assessment)

        career = response.xpath('//div[@class="field-career-opportunities"]//text()').extract()
        career = ''.join(career)
        print(11, career)

        application_date = 'NULL'

        deadline_lists = response.xpath('//div[@class="accordion"]//text()').extract()
        deadline_str = ''.join(deadline_lists)
        if "Start dates and application deadlines" in deadline_str:
            dlstart = deadline_str.find("Start dates and application deadlines")
            dlend = deadline_str.find("News and Events")
            deadline = deadline_str[dlstart:dlend]
        else:
            deadline = 'NULL'
        print(12, deadline)

        application_fee = 'NULL'
        tuition_fee = 'NULL'
        location = 'NULL'
        ATAS = 'NULL'
        GPA = 'NULL'
        average_score = 'NULL'
        accredited_university = 'NULL'
        Alevel = 'NULL'
        IB = 'NULL'

        IELTS_lists = response.xpath('//div[@class="accordion"]//text()').extract()
        IELTS_str = ''.join(IELTS_lists)
        # IELTS = re.findall('(IELTS:|IELTS)? (.*){0,5} \d?.\d? .{0,70}', IELTS)
        if "Entry Requirements" in IELTS_str:
            Istart = IELTS_str.find("Entry Requirements")
            Iend = IELTS_str.find("Financing your studies")
            IELTS = IELTS_str[Istart:Iend]
        else:
            IELTS = 'NULL'
        print(13, IELTS)

        IELTS_L = 'NULL'
        IELTS_S = 'NULL'
        IELTS_R = 'NULL'
        IELTS_W = 'NULL'
        TOEFL = 'NULL'
        TOEFL_L = 'NULL'
        TOEFL_S = 'NULL'
        TOEFL_R = 'NULL'
        TOEFL_W = 'NULL'
        GRE = 'NULL'
        GMAT = 'NULL'
        LSAT = 'NULL'
        MCAT = 'NULL'
        working_experience = 'NULL'

        interview = response.xpath('//div[@class="field-selection-process"]//text()').extract()
        interview = ''.join(interview)
        print(14, interview)

        portfolio = response.xpath('//div[@class="field-selection-process"]//text()').extract()
        portfolio = ''.join(portfolio)
        print(15, portfolio)

        application_documents = 'NULL'

        how_to_apply_lists = response.xpath('//div[@class="accordion"]//text()').extract()
        how_to_apply_str = ''.join(how_to_apply_lists)
        if "How to apply" in how_to_apply_str:
            hstart = how_to_apply_str.find("How to apply")
            hend = how_to_apply_str.find("Start dates and application deadlines")
            how_to_apply = how_to_apply_str[hstart:hend]
        else:
            how_to_apply = 'NULL'
        print(16, how_to_apply)

        entry_requirements = response.xpath('//*[@id="start-of-content"]/div[2]/div[2]/div[1]//text()').extract()
        entry_requirements = ''.join(entry_requirements)
        # An earlier variant sliced entry requirements out of the accordion
        # between "Entry Requirements" and "Financing your studies".
        print(17, entry_requirements)

        chinese_requirements = 'NULL'
        school_test = 'NULL'
        degree_description = 'NULL'
        SATI = 'NULL'
        SATII = 'NULL'
        SAT_code = 'NULL'
        ACT = 'NULL'
        ACT_code = 'NULL'
        other = 'NULL'

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(18, create_time)

        item["url"] = url
        item["university"] = university
        item["country"] = country
        item["city"] = city
        item["website"] = website
        item["department"] = department
        item["programme"] = programme
        item["ucas_code"] = ucas_code
        item["degree_level"] = degree_level
        item["degree_type"] = degree_type
        item["start_date"] = start_date
        item["degree_description"] = degree_description
        item["overview"] = overview
        item["mode"] = mode
        item["duration"] = duration
        item["modules"] = modules
        item["teaching"] = teaching
        item["assessment"] = assessment
        item["career"] = career
        item["application_date"] = application_date
        item["deadline"] = deadline
        item["application_fee"] = application_fee
        item["tuition_fee"] = tuition_fee
        item["location"] = location
        item["ATAS"] = ATAS
        item["GPA"] = GPA
        item["average_score"] = average_score
        item["accredited_university"] = accredited_university
        item["Alevel"] = Alevel
        item["IB"] = IB
        item["IELTS"] = IELTS
        item["IELTS_L"] = IELTS_L
        item["IELTS_S"] = IELTS_S
        item["IELTS_R"] = IELTS_R
        item["IELTS_W"] = IELTS_W
        item["TOEFL"] = TOEFL
        item["TOEFL_L"] = TOEFL_L
        item["TOEFL_S"] = TOEFL_S
        item["TOEFL_R"] = TOEFL_R
        item["TOEFL_W"] = TOEFL_W
        item["GRE"] = GRE
        item["GMAT"] = GMAT
        item["LSAT"] = LSAT
        item["MCAT"] = MCAT
        item["working_experience"] = working_experience
        item["interview"] = interview
        item["portfolio"] = portfolio
        item["application_documents"] = application_documents
        item["how_to_apply"] = how_to_apply
        item["entry_requirements"] = entry_requirements
        item["chinese_requirements"] = chinese_requirements
        item["school_test"] = school_test
        item["SATI"] = SATI
        item["SATII"] = SATII
        item["SAT_code"] = SAT_code
        item["ACT"] = ACT
        item["ACT_code"] = ACT_code
        item["other"] = other
        item["create_time"] = create_time

        yield item

    def getTuition_fee(self, tuition_fee):
        # Pull every "1,234"-style figure out of the fee text and return the largest one.
        allfee = re.findall(r'\d+,\d+', tuition_fee)
        for index in range(len(allfee)):
            allfee[index] = ''.join(allfee[index].split(","))
        maxfee = 0
        for fee in allfee:
            if int(fee) >= maxfee:
                maxfee = int(fee)
        return maxfee
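# parse_item above repeatedly joins the accordion text and slices it between two fixed
# headings ("How to apply" .. "Start dates and application deadlines", and so on).
# The sketch below is a minimal, hypothetical helper showing that pattern in one place;
# the name slice_between and its default value are assumptions, not part of the original spider.
def slice_between(text, start_marker, end_marker, default='NULL'):
    """Return the substring from start_marker up to end_marker, or default if absent."""
    if start_marker not in text:
        return default
    start = text.find(start_marker)
    end = text.find(end_marker, start)
    return text[start:end] if end != -1 else text[start:]

# Example usage (assumed input), mirroring the deadline extraction above:
#   accordion_text = ''.join(response.xpath('//div[@class="accordion"]//text()').extract())
#   deadline = slice_between(accordion_text, "Start dates and application deadlines", "News and Events")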
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h1[@class='product_title entry-title']",
    'price': "//div[@class='summary entry-summary']/div/p[@class='price']/span[@class='amount']",
    'category': "",
    'description': "//div/div[@class='summary entry-summary']/div",
    'images': "//div/div[@class='images']/a/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'dartchocolate.com'
allowed_domains = ['dartchocolate.com']
start_urls = ['http://dartchocolate.com']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/product/']), 'parse_item'),
    Rule(LinkExtractor(allow=['']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
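# The auto-generated modules above (necdeal.vn, windeal.vn, dartchocolate.com) only declare
# data: an XPATH map plus crawl settings. A generic spider presumably consumes them; the sketch
# below shows one plausible way to do that. The GenericProductSpider class and the way the
# XPATH dict is applied are illustrative assumptions, not code from this project.
from scrapy.spiders import CrawlSpider


class GenericProductSpider(CrawlSpider):
    # In the real project these would be injected from the generated module.
    xpath_map = XPATH
    name = name
    allowed_domains = allowed_domains
    start_urls = start_urls
    rules = rules

    def parse_item(self, response):
        # Apply every non-empty XPath from the generated map to the product page.
        product = {}
        for field, xpath in self.xpath_map.items():
            if xpath:
                product[field] = response.xpath(xpath).extract()
        product['url'] = response.url
        yield product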
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

# Assumed project import path: AliceItem is the item class this spider fills in.
from ..items import AliceItem


class LapolarSpider(CrawlSpider):
    name = "lapolar"
    allowed_domains = ['lapolar.cl']
    start_urls = ('http://www.lapolar.cl/internet/catalogo/', )

    rules = (
        # Product detail pages.
        Rule(LinkExtractor(allow="http://www.lapolar.cl/internet/catalogo/detalle"),
             callback='parse_product', follow=True),
        # Group and category pages: keep crawling.
        Rule(LinkExtractor(allow=[
            'http://www.lapolar.cl/internet/catalogo/grupo',
            'http://www.lapolar.cl/internet/catalogo/categoria'
        ]), callback='parse', follow=True),
        # Listing pages embed product routes in inline JavaScript.
        Rule(LinkExtractor(allow='http://www.lapolar.cl/internet/catalogo/listados'),
             callback='parse_links', follow=True),
        Rule(LinkExtractor(allow='http://www.lapolar.cl/internet/catalogo/todolistados'),
             callback='parse_links', follow=True),
    )

    def parse_links(self, response):
        # Listing pages carry a JSON-like blob in a <script> tag; pull every
        # "ruta" (route) value out of it and build the product detail URL.
        jsonlinks = response.xpath('//script[@language="javascript"]/text()').re('"ruta":"[a-z0-9/_]+')
        for link in jsonlinks:
            url = "http://www.lapolar.cl/internet/catalogo/detalles/" + link.replace('"ruta":"', '')
            yield scrapy.Request(url, callback=self.parse_product)

    def parse_product(self, response):
        item = AliceItem()
        item['url'] = response.url

        title = response.xpath('//*[@class="titulo1 descrip_jq"]/text()').extract_first() or ''
        item['title'] = title.encode('ascii', 'ignore')

        item['picture'] = response.xpath('/html/head/meta[3]/@content').extract_first() or ''

        # Prices come formatted like "1.234.567"; strip the dots before converting.
        price_text = response.xpath('//*[@class="precio precio_jq"]/text()').re_first(r'\d\S*')
        if price_text:
            item['price'] = int(price_text.replace('.', ''))
        else:
            item['price'] = 0
            self.logger.warning('no price found on %s', response.url)

        item['brand'] = ""
        item['store'] = "lapolar"
        item['id_store'] = 4

        # Breadcrumb-style category tags; pad missing ones with empty strings.
        tags = response.xpath('//tr[not(@id)]/td[@valign="top"]/div[@width]/a/text()').extract()
        for index in range(5):
            item['tag%d' % (index + 1)] = tags[index] if index < len(tags) else ""

        yield item
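# parse_links above relies on a regex over inline JavaScript rather than parsing JSON.
# The snippet below is a standalone illustration of that extraction on a made-up script
# payload; the sample_script content is an assumption about the page structure, not
# captured site data.
import re

sample_script = ('{"productos":[{"ruta":"vestuario_mujer/poleras/12345","precio":9990},'
                 '{"ruta":"hogar/cocina/67890","precio":19990}]}')

rutas = re.findall(r'"ruta":"([a-z0-9/_]+)', sample_script)
detail_urls = ["http://www.lapolar.cl/internet/catalogo/detalles/" + ruta for ruta in rutas]
# detail_urls == ['http://www.lapolar.cl/internet/catalogo/detalles/vestuario_mujer/poleras/12345',
#                 'http://www.lapolar.cl/internet/catalogo/detalles/hogar/cocina/67890']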