# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@id='vmMainPage']/table//tr/td/div/h2",
    'price': "//span[@class='productPrice']/b",
    'category': "",
    'description': "",
    'images': "//div[@class='flexible-zoom-additionalImages']/a/@href | //div[@id='ja-current-content']/div[@id='vmMainPage']//a/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'format.vn'
allowed_domains = ['format.vn']
start_urls = ['http://format.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+-\d+\.html']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[a-zA-Z-]+\.html']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
class WeiboRedisSpider(RedisCrawlSpider):
    name = 'weibo_redis'
    allowed_domains = ['weibo.com', 'sina.com.cn']
    redis_key = 'WeiboRedisSpider:start_urls'

    page_links = LinkExtractor(restrict_xpaths='//div[@class="m-page"]//ul[@class="s-scroll"]/li/a')
    rules = (Rule(page_links, callback='parse_subjects', follow=True), )

    # Custom per-spider settings
    custom_settings = {
        'ITEM_PIPELINES': {
            # 'weibo_spider.pipelines.MyMongoPipeline': 302,
            'scrapy_redis.pipelines.RedisPipeline': 400
        },
        'DOWNLOADER_MIDDLEWARES': {
            'weibo_spider.middlewares.CookiesMiddleware': 544,
            'weibo_spider.middlewares.ProxyMiddleware': 545
        },
        'REDIS_HOST': '127.0.0.1',
        'REDIS_PORT': 6379,
        'DUPEFILTER_CLASS': 'scrapy_redis.dupefilter.RFPDupeFilter',
        'SCHEDULER': 'scrapy_redis.scheduler.Scheduler',
        'SCHEDULER_PERSIST': True,
        # Use the cookie pool
        'COOKIES_URL': 'http://192.168.199.233:5000/weibo/random'
    }

    def __init__(self, subject=None, *args, **kwargs):
        super(WeiboRedisSpider, self).__init__(*args, **kwargs)
        self.subject = subject

    def parse_subjects(self, response):
        div_list = response.xpath('//div[@class="m-con-l"]/div/div[@class="card-wrap"]')
        for odiv in div_list:
            item = WeiboSpiderItem()
            item['title'] = odiv.xpath('./div[@class="card-top"]//a/text()').extract_first()
            if item['title']:
                item['title'] = item['title'].strip()
            item['avatar'] = odiv.xpath('./div[@class="card"]//div[@class="avator"]/a/img/@src').extract_first()
            item['nickname'] = odiv.xpath('./div[@class="card"]//div[@class="content"]/div[@class="info"]/div[2]/a[1]/text()').extract_first()
            item['icon'] = odiv.xpath('./div[@class="card"]//div[@class="content"]/div[@class="info"]/div[2]/a[2]/@title').extract_first()
            news = odiv.xpath('./div[@class="card"]//div[@class="content"]/p[@class="txt"]')
            if len(news) == 1:
                item['news'] = news[0].xpath('string(.)').extract_first().replace('\n', '').replace(' ', '').replace('\u200b', '').replace('收起全文d', '')
            else:
                item['news'] = news[1].xpath('string(.)').extract_first().replace('\n', '').replace(' ', '').replace('\u200b', '').replace('收起全文d', '')
            time = odiv.xpath('./div[@class="card"]//div[@class="content"]/p[@class="from"]/a[1]/text()').extract_first()
            now = datetime.now()
            # Normalize relative timestamps ("N seconds/minutes ago", "today HH:MM", "MM-DD HH:MM")
            if '秒' in time:
                time = datetime.strftime(now - timedelta(seconds=int(time.split('秒')[0])), '%Y-%m-%d %H:%M:%S')
            elif '分钟' in time:
                time = datetime.strftime(now - timedelta(minutes=int(time.split('分钟')[0])), '%Y-%m-%d %H:%M:%S')
            elif '今天' in time:
                today = re.findall(r'\d+', time)
                time = str(datetime(now.year, now.month, now.day, hour=int(today[0]), minute=int(today[1]), second=0))
            else:
                date = re.findall(r'\d+', time)
                time = str(datetime(now.year, month=int(date[0]), day=int(date[1]), hour=int(date[2]), minute=int(date[3]), second=0))
            item['time'] = time
            item['origin'] = odiv.xpath('./div[@class="card"]//div[@class="content"]/p[@class="from"]/a[2]/text()').extract_first()
            item['collect'] = card_act_int(odiv.xpath('./div[@class="card"]/div[@class="card-act"]/ul/li[1]/a/text()').extract_first().strip().split(' '))
            item['forward'] = card_act_int(odiv.xpath('./div[@class="card"]/div[@class="card-act"]/ul/li[2]/a/text()').extract_first().strip().split(' '))
            item['comment'] = card_act_int(odiv.xpath('./div[@class="card"]/div[@class="card-act"]/ul/li[3]/a/text()').extract_first().strip().split(' '))
            item['like'] = card_act_int(odiv.xpath('./div[@class="card"]/div[@class="card-act"]/ul/li[4]/a/em/text()').extract_first())
            yield item
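# NOTE: card_act_int(...) used above is not defined in this snippet. The sketch
# below is a hypothetical stand-in, assuming the helper turns the card-act text
# (either a list split on spaces such as ['转发', '12'] or a bare string) into an
# integer count; the real project's implementation may differ.
def card_act_int(value):
    """Hypothetical helper: extract an integer count from a Weibo card-act label."""
    if value is None:
        return 0
    if isinstance(value, (list, tuple)):
        value = value[-1] if value else ''
    digits = ''.join(ch for ch in str(value) if ch.isdigit())
    return int(digits) if digits else 0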
class TeamS01pider(CrawlSpider):
    name = 'Team01'
    allowed_domains = ["soccer.hupu.com"]
    start_urls = ['https://soccer.hupu.com/teams/']
    rules = (
        Rule(LinkExtractor(allow=(r'https://soccer.hupu.com/teams/121',)), callback='parse_team01'),
        #Rule(LinkExtractor(allow=(r'https://soccer.hupu.com/g/players/[a-z]-\d.html"',)),callback='parse_player'),
    )

    def parse_team01(self, response):
        tempItems = []
        item = items.TeamItem()
        # Collect the player detail URLs
        playerUrls = response.selector.xpath('//table[@class="team_player"]/tr/td/a/@href').extract()
        next_links = []
        for url in playerUrls:
            next_links.append(url)
        parent = response.selector.xpath('//div[@class="team_info left"]')[0]
        name = parent.xpath('h3/span/text()').extract()[0]
        if len(name) > 0:
            cnname = re.findall(r'[\u4e00-\u9fa5]+', name)
            enname = re.findall(r'[^\u4e00-\u9fa5]+', name)
            item['TeamENName'] = enname[0].strip() if len(enname) > 0 else ''
            item['TeamCNName'] = cnname[0].strip() if len(cnname) > 0 else ''
        Coach = parent.xpath('//dl[@class="clearfix"]/dd[1]/text()').extract()[0].split(':')
        item['CoachName'] = Coach[1].strip() if len(Coach) > 1 else ''
        City = parent.xpath('//dl[@class="clearfix"]/dd[3]/text()').extract()[0].split(':')
        item['City'] = City[1].strip() if len(City) > 1 else ''
        Court = parent.xpath('ul[2]/li[1]/text()').extract()[0].split(':')
        item['CourtName'] = Court[1].strip() if len(Court) > 1 else ''
        image = parent.xpath('ul[1]/li[@class=" left pic_logo"]/img/@src').extract()
        if len(image) > 0:
            item['ImageUrl'] = 'https:' + image[0]
        item['id'] = 0
        item['Remark'] = ''
        yield item
        for url in next_links:
            yield Request(url, callback=lambda response, teamitem=item: self.parse_player(response, teamitem))

    def parse_player(self, response, teamItem):
        item = items.PlayerItem()
        parent = response.selector.xpath('//ul[@class="player_detail"]')
        CNName = parent.xpath('li[@class="center"]/b[1]/text()').extract()
        item['CNName'] = self.checkFirstStr(CNName)
        ENName = parent.xpath('li[@class="center"]/b[2]/text()').extract()
        item['ENName'] = self.checkFirstStr(ENName)
        CountryName = parent.xpath('li[@class="center"]/span[1]/text()').extract()
        item['CountryName'] = self.checkFirstStr(CountryName)
        Birthday = parent.xpath('li[@class="center"]/span[2]/text()').extract()
        item['Birthday'] = self.checkFirstStr(Birthday, '\d{4}-\d{2}-\d{2}')
        BodyWeight = parent.xpath('li[@class="center"]/span[3]/text()').extract()
        item['BodyWeight'] = self.checkFirstStr(BodyWeight, '\d+')
        Height = parent.xpath('li[3]/span[1]/text()').extract()
        item['Height'] = self.checkFirstStr(Height, '\d+')
        TeamName = parent.xpath('li[3]/span[2]/a/text()').extract()
        item['TeamName'] = self.checkFirstStr(TeamName)
        Position = parent.xpath('li[3]/span[3]/text()').extract()
        item['Position'] = self.checkFirstStr(Position)
        Number = parent.xpath('li[3]/span[4]/text()').extract()
        item['Number'] = Number[0] if len(Number) > 0 else ''
        item['TeamId'] = teamItem['id']
        item['TeamName'] = teamItem['TeamCNName']
        image = parent.xpath('li[1]/img/@src').extract()
        if len(image) > 0:
            item['ImageUrl'] = 'https:' + image[0]
        yield item

    def checkFirstStr(self, list, reg=None):
        if len(list) <= 0:
            return ''
        if reg is None:
            return list[0] if len(list) > 0 else ''
        else:
            str = re.findall(reg, list[0])
            return str[0].strip() if len(str) > 0 else ''
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='product_info']/h1",
    'price': "//div[@class='price-box']/span[@class='regular-price']/span[@class='price']",
    'category': "//div[@class='breadcrumb']/a",
    'description': "//div[@class='tab-content']//div[@class='info']",
    'images': "//img[@id='image']/@src",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "",
    'brand': ""
}
name = 'cafe.net.vn'
allowed_domains = ['cafe.net.vn']
start_urls = ['http://www.cafe.net.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    #Rule(LinkExtractor(), 'parse_item'),
    #Rule(LinkExtractor(), 'parse'),
    Rule(LinkExtractor(allow=['/en/[a-zA-Z0-9-]+\.html']), 'parse_item_and_links'),
]
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='box_menu_top itemHeader']/h2[@class='itemTitle']",
    'price': "//div[@class='itemImageBlock']/form/span",
    'category': "",
    'description': "//div[@class='itemBody']/div[@class='itemFullText']/ul[2]",
    'images': "//div[@class='itemImageBlock']/span[@class='itemImage']/a/@href",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'goquymynghe.com'
allowed_domains = ['goquymynghe.com']
start_urls = ['http://goquymynghe.com']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/item/']), 'parse_item'),
    Rule(LinkExtractor(allow=['/.*']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
class DoctorlistSpider(RedisCrawlSpider):
    name = 'doctorList'
    allowed_domains = ['ask.39.net']
    # scrapy-redis
    redis_key = 'myspider:start_urls'
    # CrawlSpider rules
    rules = (
        Rule(LinkExtractor(allow=r'question/(\d+).html'), callback='parse_item', follow=True),
        # Rule(LinkExtractor(allow=r'http://my.39.net/wulinqing'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print('response.url===========', response.url)
        item = DoctorItem()
        div = response.xpath('//div[@class="sele_all marg_top"] | //div[@class="sele_all"]')
        for dd in div:
            # Doctor name
            name = dd.xpath('./div[1]/div[2]/p[@class="doc_xinx"]/span[1]/text()').extract()
            item['name'] = ''.join(name)
            # Doctor title
            level = dd.xpath('./div[1]/div[2]/p[@class="doc_xinx"]/span[2]/text()').extract()
            item['level'] = ''.join(level)
            # Workplace
            company = dd.xpath('./div[1]/div[2]/p[@class="doc_xinx"]/span[3]/text()').extract()
            # Areas of expertise
            good = dd.xpath('./div[1]/div[2]/p[@class="doc_sc"]/span/text()').extract()
            if len(div.xpath(".//span[@class='doc_yshi']/text()")) > 1:
                # Hospital
                item['company'] = ''.join(company)
            else:
                item['company'] = ''
            if len(div.xpath(".//p[@class='doc_sc']/span/text()")) > 0:
                # Specialty
                item['good'] = ''.join(good)
            else:
                item['good'] = ''
            # Answer text
            detail = dd.xpath('./p/text()').extract()
            item['detail'] = ''.join(detail)
            # Answer time
            time = dd.xpath('./div[@class="doc_t_strip"]/div[1]/p/text()').extract()
            item['time'] = ''.join(time[0])
            # Number of people helped
            pid = dd.xpath('.//div[@class="doctor_all"]/@mid').extract()[0]
            json_url = 'http://askdatafmy.39.net/home/askapi.ashx?callback=jQuery172033868943235912363_1539677691886&action=doctorTopicCount&pid=' + pid
            item['helpNum'] = json.loads(requests.get(json_url).text.split("()")[0].split("(")[1].split(")")[0])["data"]["all"]
            item['link'] = response.url
            yield item
class uyghurcongress(RedisCrawlSpider):
    name = 'uyghurcongress'
    start_urls = ['http://www.uyghurcongress.org/']
    # redis_key='middleway:urls'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    }
    rules = (
        Rule(LinkExtractor(allow='http\:\/\/www\.uyghurcongress\.org\/en\/\?p\=\d*'), callback='parse_content', follow=True),
        Rule(LinkExtractor(allow='http\:\/\/www\.uyghurcongress\.org\/.*'), follow=True),
    )

    def parse_content(self, response):
        print('in parseMore')

        def deal_publish_time(publish_time_list=[]):
            if not publish_time_list:
                print('time is None')
                return None
            mouthstr = publish_time_list[0]
            mouth_eng_to_num = {
                'January': '01', 'February': '02', 'March': '03', 'April': '04',
                'May': '05', 'June': '06', 'July': '07', 'August': '08',
                'September': '09', 'October': '10', 'November': '11', 'December': '12'
            }
            mouth_num = mouth_eng_to_num[str(mouthstr)]
            day_str_raw = str(publish_time_list[1])
            if len(day_str_raw) < 2:
                day_str_raw = '0' + day_str_raw
            publish_time = str(publish_time_list[2]) + '-' + mouth_num + '-' + day_str_raw + ' 00:00:00'
            return publish_time

        def deal_reply_nodes(reply_nodes=None):
            reply_nodes_list = []

            def deal_publishtime_inside(publishtime):
                # Normalize a Chinese date string like "2018年1月2日 下午3:04" into "2018-01-02 15:04:00"
                publish_time = publishtime.replace('年', '-').replace('月', '-').replace('日', '')
                time_split_2 = publish_time.split(' ')
                data_str = time_split_2[0]
                data_str_list = data_str.split('-')
                mounth = data_str_list[1]
                day = data_str_list[2]
                if len(mounth) < 2:
                    mounth = '0' + mounth
                if len(day) < 2:
                    day = '0' + day
                data_str = data_str_list[0] + '-' + mounth + '-' + day
                time_split_2_part2 = time_split_2[1]
                if '下午' in time_split_2_part2:  # PM
                    time_part2_h_m = time_split_2_part2.replace('下午', '').split(':')
                    time_split_2_h = int(time_part2_h_m[0])
                    time_split_2_m = time_part2_h_m[1]
                    time_split_2_h_add = 12 + time_split_2_h
                    time_pm_finally = str(time_split_2_h_add) + ':' + time_split_2_m + ':00'
                    return data_str + ' ' + time_pm_finally
                elif '上午' in time_split_2_part2:  # AM
                    time_part2_h_m = time_split_2_part2.replace('上午', '').split(':')
                    time_split_2_h = int(time_part2_h_m[0])
                    time_split_2_m = time_part2_h_m[1]
                    time_split_2_h_add = time_split_2_h
                    if time_split_2_h_add < 10:
                        time_split_2_h_add = '0' + str(time_split_2_h)
                    time_am_finally = str(time_split_2_h_add) + ':' + time_split_2_m + ':00'
                    return data_str + ' ' + time_am_finally

            if reply_nodes:
                reply_nodes_list_eval = eval(reply_nodes[0])
                for one_reply_nodes in reply_nodes_list_eval:
                    content = one_reply_nodes['body']
                    publish_time_raw = one_reply_nodes['displayTime']
                    publish_time = deal_publishtime_inside(publish_time_raw)
                    id = one_reply_nodes['id']
                    publish_user_photo = one_reply_nodes['author']['avatarUrl']
                    publish_user = one_reply_nodes['author']['name']
                    child_reply_node = {
                        'content': content,
                        'publish_time': publish_time,
                        'id': id,
                        'publish_user_href': 'http:' + publish_user_photo if 'http' not in publish_user_photo else publish_user_photo,
                        'publish_user': publish_user
                    }
                    reply_nodes_list.append(child_reply_node)
                return reply_nodes_list
            else:
                return None

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath('title', '//div[@class="post-content"]//h3[@class="post-title"]/text()', TakeFirst(), lambda x: x.strip())
        loader1.add_xpath('content', '//div[@class="post-content"]//p/text()|//div[@class="post-content"]//h1/text()|//div[@class="post-content"]//h2/text()', lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id', response.url.split('=')[1].split('.')[0].strip())
        loader1.add_xpath('img_urls', '//div[@class="post-content"]//img/@src')
        loader1.add_value('publish_time', response.xpath('//div[@class="post-content"]//p[@class="post-meta"]').re('(\S*) (\d{1,2})\, (\d{4})'), deal_publish_time)
        # loader1.add_value('publish_user','degewa')
        # loader1.add_value('reply_count',response.selector.xpath('//*[@id="comments"]/h4/text()').re(ur'(\d{1,2}).*条评论'),lambda x:x[0] if x else 0)
        # loader1.add_value('reply_nodes',response.selector.re(ur'var items \= (\[.*?\])\;'),deal_reply_nodes)
        item = loader1.load_item()
        print(item)
        return item
def parse(self, response):
    xlink = LinkExtractor()
    itemre = re.compile(self.itemurl_re)
    for link in xlink.extract_links(response):
        if itemre.search(link.url):
            yield Request(url=link.url, callback=self.parse_item)
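# NOTE: the parse() method above assumes the spider defines an `itemurl_re`
# attribute and a `parse_item` callback elsewhere. The class below is a
# hypothetical sketch of those pieces; the spider name, start URL and the
# product-page pattern are illustrative assumptions, not part of the original.
import scrapy

class ExampleItemSpider(scrapy.Spider):
    name = 'example_items'
    start_urls = ['http://example.com/catalog/']
    # Only links whose URL matches this pattern are treated as item pages.
    itemurl_re = r'/product/\d+\.html'

    def parse_item(self, response):
        # Placeholder item callback for the sketch.
        yield {'url': response.url}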
XPATH = {
    'name': "//h1[@class='mrpH']",
    'price': "//span[@class='mcrp']",
    'category': "//div[@class='mrbn']/a",
    'description': "//div[@class='des1 cl']",
    'images': "//div[@id='p_inner']/div/a/@href",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'giadinhmart.vn'
allowed_domains = ['giadinhmart.vn']
start_urls = ['http://giadinhmart.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(
        LinkExtractor(allow=['/giadinhmart.vn/[a-z0-9-]+-\d+.aspx'],
                      deny=[
                          '/huong-dan-mua-hang', '/phuong-thuc-thanh-toan',
                          '/chinh-sach-hau-mai', '/gioi-thieu-giadinhmartvn',
                          'danh-muc'
                      ]), 'parse_item'),
    Rule(LinkExtractor(allow=['giadinhmart.vn/ca/[a-z0-9-]+-\d+.aspx']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
class CarsSpider(CrawlSpider):
    name = "cars"
    allowed_domains = ["www.olx.ro"]
    start_urls = open('links.txt').readlines()
    rules = (Rule(LinkExtractor(allow=(), restrict_css=('.pageNextPrev', )), callback="parse_item", follow=True), )

    def parse_item(self, response):
        item_links = response.css('table .wrap .offer h3 a::attr(href)').extract()
        for a in item_links:
            yield scrapy.Request(a, callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        id = response.css('.offer-bottombar__item > strong::text').get()
        title = response.css('h1::text').get().strip()
        price = response.css('.pricelabel > strong::text').get()
        user = response.css('.offer-user__actions h4 a::attr(href)').get()
        location = response.css('address p::text').get()

        # Properties listed on the offer page; default to '' so missing fields
        # do not raise UnboundLocalError below
        owner_type = brand = model = color = fuel_type = gearbox_type = ''
        year = mileage = body_type = engine_displacement = condition = ''
        properties = response.css('.offer-details__name *::text').getall()
        values = response.css('.offer-details__value *::text').getall()
        for i, p in enumerate(properties):
            if p == 'Oferit de':
                owner_type = values[i]
            elif p == 'Marca':
                brand = values[i]
            elif p == 'Model':
                model = values[i]
            elif p == 'Culoare':
                color = values[i]
            elif p == 'Combustibil':
                fuel_type = values[i]
            elif p == 'Cutie de viteze':
                gearbox_type = values[i]
            elif p == 'An de fabricatie':
                year = values[i]
            elif p == 'Rulaj':
                mileage = values[i]
            elif p == 'Caroserie':
                body_type = values[i]
            elif p == 'Capacitate motor':
                engine_displacement = values[i]
            elif p == 'Stare':
                condition = values[i]

        description = response.xpath('.//div[@id="textContent"]/text()').getall()
        # description = [i.strip() + '\n' for i in list(response.xpath('.//div[@id="textContent"]/text()').getall())]
        post_date = response.css('em > strong::text').get()[3:]
        views = response.css('.offer-bottombar__counter > strong::text').get()

        item = OlxItem()
        item['id'] = id
        item['title'] = title
        item['price'] = price
        item['user'] = user
        item['url'] = response.url
        item['location'] = location
        item['description'] = description
        item['owner_type'] = owner_type
        item['brand'] = brand
        item['model'] = model
        item['color'] = color
        item['fuel_type'] = fuel_type
        item['gearbox_type'] = gearbox_type
        item['year'] = year
        item['mileage'] = mileage
        item['body_type'] = body_type
        item['engine_displacement'] = engine_displacement
        item['condition'] = condition
        item['post_date'] = post_date
        item['views'] = views
        yield item
class NySpider(CrawlSpider):
    name = 'NY'
    allowed_domains = ['www.in-en.com', 'www.china-nengyuan.com', 'www.chinaoilonline.com']
    start_urls = [
        # 'http://www.china-nengyuan.com/news/news_list_1.html',
        # 'https://www.in-en.com/article/news/intl/',
        # 'https://www.in-en.com/article/news/china/',
        'http://www.chinaoilonline.com/article.do?method=toArticleListByType2ByTypeidList&p1=15&p1=16&p1=17&p1=18&pageNo=1&pageSize=22&pid=2&subLanmuId=2&rightshow=&typeid=23&titleLength=30'
    ]
    custom_settings = {
        # Concurrent requests
        'CONCURRENT_REQUESTS': 10,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 1000000,
        'CONCURRENT_REQUESTS_PER_IP': 0,
        # Download delay
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            # Asynchronous database insert pipeline
            'HY_NEWS.pipelines.MysqlTwistedPipeline': 600,
            # Dedup logic
            # 'HY_NEWS.pipelines.DuplicatesPipeline': 200,
        },
        'DOWNLOADER_MIDDLEWARES': {
            # Enable these settings when using scrapy_splash
            # 'scrapy_splash.SplashCookiesMiddleware': 723,
            # 'scrapy_splash.SplashMiddleware': 725,
            # Default HTTP proxy middleware
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 700,
            # Proxy server middleware
            # 'HY_NEWS.util_custom.middleware.middlewares.ProxyMiddleWare': 100,
            # Disable Scrapy's built-in User-Agent middleware
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            # Custom random User-Agent middleware
            'HY_NEWS.util_custom.middleware.middlewares.MyUserAgentMiddleware': 120,
            # Disable the default retry middleware
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # Custom retry middleware
            'HY_NEWS.util_custom.middleware.middlewares.MyRetryMiddleware': 90,
        },
        # Enable these settings when using scrapy_splash
        # 'SPIDER_MIDDLEWARES': {
        #     'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        # },
        # Dedup filter / Splash endpoint
        # 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        # 'SPLASH_URL': "http://10.8.32.122:8050/"
        # 'SPLASH_URL': "http://127.0.0.1:8050/"
    }
    rules = (
        Rule(LinkExtractor(restrict_css='div.bd > ul:nth-child(1) > div > a:nth-child(14) '), follow=True),
        Rule(LinkExtractor(restrict_css='.imgBox a '), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_css='.member_tr_row a '), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_css='.zxwk_li a'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = HyNewsItem()
        resp = response.text
        extractor = GeneralNewsExtractor()
        result = extractor.extract(resp, with_body_html=False)
        title = result['title']
        txt = result['content']
        p_time = result['publish_time']
        lyurl = response.url
        lyname = '能源'
        content_css = ['#content', '.news_link', '#zxwk_left_1']
        for content in content_css:
            content = ''.join(response.css(content).extract())
            if content:
                break
        if not content:
            logging.warning(f'{response.url}: no matching content CSS selector, content not extracted')
        classify, codes, region = get_category(txt)
        item['title'] = title
        item['txt'] = txt
        item['p_time'] = get_times(p_time)
        item['content'] = content
        item['spider_name'] = 'NY'
        item['module_name'] = '行业新闻'
        item['cate'] = classify
        item['region'] = region
        item['code'] = codes
        item['link'] = lyurl
        item['website'] = lyname
        if content:
            yield item
def __init__(self, place='amsterdam'):
    self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page_number) for page_number in range(1, 2)]
    self.base_url = "http://www.funda.nl/koop/%s/" % place
    self.le1 = LinkExtractor(allow=r'%s+(huis|appartement)-\d{8}' % self.base_url)
class ChengduSpider(CrawlSpider):
    name = 'chengdu'
    allowed_domains = ['chengdu.gov.cn']
    start_urls = ['http://www.chengdu.gov.cn/chengdu/zfxx/zfxx.shtml']
    rules = (Rule(LinkExtractor(allow=r'.*chengdu.gov.cn/chengdu/c131029.*'), callback='parse_page', follow=False), )
    cont_dict = {}

    def parse_item(self, response):
        print("5. parse_item(): " + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + " -> " + response.url)
        title = response.xpath("//*[@id='bar']/div[1]/div[1]/table/tbody/tr[2]/td/text()").get()
        cont = response.xpath("//*[@class='text_content']").get()
        index_id = str("_NULL")
        pub_org = response.xpath("//*[@id='zc']/div[3]/table/tbody/tr[1]/td[2]/text()").get()
        pub_time = response.xpath("//*[@id='zc']/div[3]/table/tbody/tr[2]/td[1]/text()").get()
        doc_id = response.xpath("//*[@id='zc']/div[3]/table/tbody/tr[1]/td[1]/text()").get()
        region = str('成都')
        update_time = datetime.datetime.now().strftime("%Y-%m-%d 00:00:00")
        if not title:
            return
        print("\t>>> " + title)
        for key in keys:
            if key in title:
                self.dict_add_one(re.sub('[\s+]', ' ', title), response.url,
                                  re.sub('[\s+]', ' ', cont),
                                  re.sub('[\s+]', ' ', pub_time),
                                  re.sub('[\s+]', ' ', pub_org), index_id,
                                  doc_id, region, update_time, key)
        item = YqcChengduSpiderItem(cont_dict=self.cont_dict)
        yield item

    def dict_add_one(self, title, url, cont, pub_time, pub_org, index_id, doc_id, region, update_time, doc_key):
        time.sleep(0.3)
        if title in self.cont_dict:
            self.cont_dict[title]['key_cnt'] += 1
            self.cont_dict[title]['doc_key'] = self.cont_dict[title]['doc_key'] + ',' + doc_key
        else:
            cnt_dict = {
                'key_cnt': 1,
                'title': title,
                'url': url,
                'cont': cont,
                'pub_time': pub_time,
                'pub_org': pub_org,
                'index_id': index_id,
                'doc_id': doc_id,
                'region': region,
                'update_time': update_time,
                'doc_key': doc_key
            }
            self.cont_dict[title] = cnt_dict

    def parse_page(self, response):
        url = response.url
        print("4. parse_page(): " + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + " -> " + url)
        url_prefix = 'http://www.nanning.gov.cn/xxgk/xxgkml/jcxxgk/zcwj/zfwj'
        if str('REPORT_NDOC_006051') in url or str('REPORT_NDOC_006010') in url:
            print("\t>>> debug: " + url)
        if str('currentPage') in url:
            print('currentPage exist')
            tr_list = response.xpath("//*[@id='main']/div[1]/div/div[2]/table/tbody//tr")
            for tr in tr_list:
                # print(tr)
                url = tr.xpath("./td[1]/a/@href").get()
                full_url = url_prefix + url
                yield scrapy.Request(full_url, callback=self.parse_item)
        else:
            print('no currentPage exist')
            if str('REPORT_NDOC_006051') in url or str('REPORT_NDOC_006010') in url:
                print('\t>>> no currentPage')
            title = response.xpath("//*[@class='detai_title']/text()").get()
            cont = response.xpath("//*[@class='text_content']").get()
            index_id = str("_NULL")
            pub_org = response.xpath("//*[@id='zc']/div[3]/table/tbody/tr[1]/td[2]/text()").get()
            pub_time = response.xpath("//*[@id='zc']/div[3]/table/tbody/tr[2]/td[1]/text()").get()
            doc_id = response.xpath("//*[@id='zc']/div[3]/table/tbody/tr[1]/td[1]/text()").get()
            region = str('成都')
            update_time = datetime.datetime.now().strftime("%Y-%m-%d 00:00:00")
            if not title:
                return
            print("\t>>> " + title)
            for key in keys:
                if key in title:
                    # print("\t>>> included")
                    self.dict_add_one(re.sub('[\s+]', ' ', title), response.url,
                                      re.sub('[\s+]', ' ', cont),
                                      re.sub('[\s+]', ' ', pub_time), pub_org,
                                      index_id, doc_id, region, update_time, key)
            item = YqcChengduSpiderItem(cont_dict=self.cont_dict)
            print("6. parse_page(): " + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + " -> " + url)
            # print("\n")
            # print(item)
            yield item
class CompetitivecyclistSpider(CrawlSpider):
    name = u'competitivecyclist.com'
    allowed_domains = ['competitivecyclist.com']
    start_urls = [
        'http://www.competitivecyclist.com/Store/sitemaps/categoriesIndex.jsp',
    ]

    rules = (
        Rule(LinkExtractor(allow='page=')),
        Rule(LinkExtractor(), callback='parse_category')
    )

    def parse_category(self, response):
        base_url = get_base_url(response)
        products = response.xpath('//div[@id="products"]//a/@href').extract()
        for url in products:
            yield Request(urljoin_rfc(base_url, url), callback=self.parse_product)
        next_page = response.xpath('//li[@class="pag-next"]/a/@href').extract()
        if products:
            # This is to prevent some strange issues with the website where it shows a next page but there are no products
            for url in next_page:
                yield Request(urljoin_rfc(base_url, url), callback=self.parse_category)

    def parse_product(self, response):
        base_url = get_base_url(response)

        product_links = response.xpath('//div[@id="products"]//a[contains(@class,"qa-product-link")]/@href').extract()
        if product_links:
            for link in product_links:
                yield Request(url_query_cleaner(response.urljoin(link)), callback=self.parse_product)
            return

        product_name = response.xpath('//h1[@itemprop="name"]/text()').extract()
        if not product_name:
            return
        product_name = product_name[-1].strip()

        category = re.findall("name:'Category', value:'([^']+)'", response.body.replace("\\'", "&quot;"))
        if category:
            category = category.pop().replace("&quot;", "'")
        else:
            category = ""

        brand = response.xpath('//h1[@itemprop="name"]/span/text()').extract()
        brand = brand[0].strip() if brand else ''

        rrp_by_sku = {}
        sku_data = re.search(r'BC.product.skusCollection = \$.parseJSON\((.*)\);', response.body)
        if sku_data:
            sku_data = json.loads(demjson.decode(sku_data.group(1), encoding='utf8'))
            rrp_by_sku = {sku.upper(): str(opt['price']['high'])
                          for sku, opt in sku_data.iteritems()
                          if opt['price']['high'] > opt['price']['low']}

        options = response.xpath('//li[contains(@class,"qa-variant-item-")]')
        for option in options:
            product_loader = ProductLoader(item=Product(), selector=option)
            sku = option.xpath('./@sku-value').extract()
            sku = sku[0]
            product_loader.add_value('sku', sku)
            product_loader.add_value('identifier', sku)
            option_name = option.xpath('./@title').extract()[0].strip()
            option_name = option_name.replace('One Color, One Size', '').replace(', One Size', '').replace('One Color, ', '').strip()
            if option_name != '':
                product_loader.add_value('name', product_name + ', ' + option_name)
            else:
                product_loader.add_value('name', product_name)
            image_url = option.xpath('./@data-img-large').extract()
            if image_url:
                product_loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
            price = extract_price(option.xpath('./@data-price').extract()[0])
            product_loader.add_value('price', price)
            product_loader.add_value('url', response.url)
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', category)
            product = product_loader.load_item()
            metadata = CRCMeta()
            metadata['rrp'] = rrp_by_sku.get(sku.upper(), '')
            product['metadata'] = metadata
            yield product
class YahoomovieSpider(CrawlSpider):
    name = "yahoomovie"
    allowed_domains = ["yahoo.com.tw"]
    start_urls = ["https://movies.yahoo.com.tw/movie_intheaters.html?page=1"]
    IMAGE_DIR = MEDIA_ROOT
    # IMAGE_DIR = "D:\\Users\\Administrator\\gb5566\\yahoo_ptt\\media\\movie\\images\\yahoo"
    custom_settings = {
        "IMAGES_STORE": IMAGE_DIR,
        "DOWNLOAD_DELAY": 3,
        "ITEM_PIPELINES": {
            "crawlmovie.pipelines.CustomImagePipeline": 1,
            "crawlmovie.pipelines.YahooPipeline": 100,
            "crawlmovie.pipelines.DeleteNullTitlePipeline": 200,
            "crawlmovie.pipelines.DuplicatesTitlePipeline": 200,
            "crawlmovie.pipelines.CsvExportPipeline": 300,
        },
        "AUTOTHROTTLE_ENABLED": True,
        # The initial download delay
        "AUTOTHROTTLE_START_DELAY": 5,
        # The maximum download delay to be set in case of high latencies
        "AUTOTHROTTLE_MAX_DELAY": 60,
        # The average number of requests Scrapy should be sending in parallel to
        # each remote server
        "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0,
        # "CLOSESPIDER_ITEMCOUNT": 150,
    }
    rules = (
        Rule(
            LinkExtractor(restrict_xpaths="//div[@class='release_movie_name']/a"),
            callback="parse_item",
            follow=True,
        ),
        Rule(LinkExtractor(restrict_xpaths="//li[@class='nexttxt']/a")),
    )

    def parse_item(self, response):
        item = YahooCloudItem()
        title = response.xpath("normalize-space(//div[@class='movie_intro_info_r']/h1/text())").extract()
        item["title"] = "".join(title)
        critics_consensus = response.xpath("normalize-space(//span[@id='story']/text())").extract()
        item["critics_consensus"] = "".join([i.replace(u"\xa0", u"") for i in critics_consensus])
        item["release_date"] = response.xpath("(//div[@class='movie_intro_info_r']/span[1]/text())").extract()[0]
        duration = response.xpath("//div[@class='movie_intro_info_r']/span[2]/text()").extract()
        item["duration"] = "".join([i.replace(u"\\u3000\\", u"") for i in duration])
        item["genre"] = response.xpath("normalize-space((//div[@class='level_name'])[2]/a/text())").extract()
        # i['rating'] = response.css('.ratingValue ::text').extract()[1]
        item["rating"] = response.xpath("//div[@class='score_num count']/text()").extract()
        item["amount_reviews"] = response.xpath("//div[@class='circlenum']/div[@class='num']/span/text()").extract()
        url = response.xpath("//div[@class='movie_intro_foto']/img/@src").extract()
        link = "".join(url)
        item["images"] = {item["title"]: link}
        yield item
class TouzishijianSpider(CrawlSpider):
    name = 'huang114_all'
    allowed_domains = ['114chn.com']
    start_urls = ['http://www.114chn.com/']
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'upgrade-insecure-requests': "1",
            # 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
            'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            'referer': "http://search.114chn.com/searchresult.aspx?type=1&areaid=31&pattern=2&page=100",
            'accept-encoding': "gzip, deflate",
            'accept-language': "zh-CN,zh;q=0.8",
            # 'cookie': "bdshare_firstime=1509612560767; UM_distinctid=15f7bed65c32e0-01af79ead3a85f-31637e01-13c680-15f7bed65c4735; Hm_lvt_40b8d9bb56b7b9b3fee170b6b9b4bc8e=1509612561; Hm_lpvt_40b8d9bb56b7b9b3fee170b6b9b4bc8e=1509613182; CNZZDATA30067493=cnzz_eid%3D1102648662-1510104203-http%253A%252F%252Fsearch.114chn.com%252F%26ntime%3D1510104203",
            'cache-control': "no-cache",
            'postman-token': "b710e80f-5152-1b73-ed8c-b5342bd0c5a9"
        }
    }

    # def start_requests(self):
    #     burl = 'http://search.114chn.com/searchresult.aspx?type=1&key={k}&pattern=2&page=1'
    #     x = 0
    #     while True:
    #         comp_name = rc.spop('zhuce_names')
    #         if not comp_name:
    #             x += 1
    #             if x > 5:
    #                 raise CloseSpider('no datas')
    #             time.sleep(60)
    #             continue
    #         url = burl.format(k=comp_name)
    #         yield scrapy.Request(url, meta={'dont_redirect': True})
    #     start_url = "http://search.114chn.com/searchresult.aspx?type=1&areaid={area}&pattern=2&page=100"
    #     for i in range(100):
    #         yield scrapy.Request(start_url.format(area=str(i)), dont_filter=True)

    rules = (
        # Rule(LinkExtractor(allow=('searchresult',))),
        Rule(LinkExtractor(allow=('.*',), deny=('s\.114chn', 'Error\.htm'))),
        Rule(LinkExtractor(allow=('TradeDetail\.aspx', 'Free\.aspx')), callback='parse_item'),
    )

    def parse_item(self, response):
        item = huang114AllItem()
        if '很抱歉!页面在您访问时发生了错误' in response.text or '对不起 !' in response.text:
            return
        select = Selector(text=response.text)
        if 'freeindex' in response.url:
            comp_name = select.xpath('//*[@id="lblCompanyName"]//text()').extract_first()
            link_man = select.xpath('//*[@id="lblLinkMan"]//text()').extract_first()
            tel = select.xpath('//*[@id="lblTel"]/text()').extract_first()
            email = select.xpath('//*[@id="lblEmail"]/text()').extract_first()
            addr = select.xpath('//*[@id="lblAddress"]/text()').extract_first()
            t = select.xpath('//*[@id="lblContent"]//text()').extract()
            intro = ''.join(t) if t else ''
        else:
            comp_name = select.xpath('//div[@class="zitifree"]/text()').extract_first()
            link_man = select.xpath('//*[@class="lblLinkMan"]//text()').extract_first()
            tel = select.xpath('//*[@class="lbltel"]/text()').extract_first()
            email = select.xpath('//*[@class="lblemail"]/text()').extract_first()
            addr = select.xpath('//*[@class="lblweb"]/text()').extract_first()
            t = select.xpath('//div[@class="xinxi"]//text()').extract()
            intro = ''.join(t) if t else ''
        item['comp_url'] = response.url
        item['comp_name'] = comp_name.strip() if comp_name else ''
        item['link_man'] = link_man
        item['tel'] = tel
        item['email'] = email
        item['addr'] = addr
        item['intro'] = intro
        yield item
class VacanciesSpider(scrapy.spiders.CrawlSpider):
    """
    This class defines a crawler spider for the contents of the Airbus Job
    portal vacancies.
    """
    # Define module logger
    logger = logger_setup.setup_module_logger(__name__)

    # Spider properties
    name = 'vacancies'
    allowed_domains = ['airbus.com']
    start_urls = [
        'https://www.airbus.com/careers/search-and-apply/'
        'search-for-vacancies.html/?page=1'
    ]

    # Scraped data
    scraped_data = []

    @staticmethod
    def get_next_listing_page(this_page: str) -> str:
        """
        This static method generates the link for the next vacancies listing page.

        :param this_page: String with the url of the current page
        :return: String with the url of the next page to index
        """
        this_page_number = int(re.search('(?<=(\\?page\\=))(\\d+)', this_page).group())
        next_page = re.sub('(?<=(\\?page\\=))(\\d+)', str(this_page_number + 1), this_page)
        return next_page

    def parse_vacancies_links(self, response: scrapy.http.Response) -> scrapy.http.Request:
        """
        This method gets the links of the vacancies listed in the response,
        requests their pages and calls ``parse_vacancies_contents`` for each
        of them to parse its data.

        :param response: Scraped response of the listing page
        :return: Request for the contents of each listed vacancy
        """
        # self.logger.info('Processing listing page: %s', response.url)
        for href in response.xpath(
                "//section[@class='c-jobsearchpage__content']"
                "//div[@class='c-jobcarousel__slider--title']"
                "//a/@href").getall():
            yield scrapy.Request(response.urljoin(href), self.parse_vacancies_contents)

    def parse_vacancies_contents(self, response: scrapy.http.Response) -> None:
        """
        This method parses the contents of the vacancy from the scraped web
        page and stores them in the fields of a Scrapy Item.

        :param response:
        :return:
        """
        # Parse vacancy fields
        vacancy = vacancy_item.Vacancy()
        # Url
        vacancy['url'] = response.url
        # Title
        vacancy['title'] = response.xpath(
            "//div[@class='c-jobdetails']"
            "//div[@class='header-job']"
            "//h2[@class='c-banner__title col-xs-12 col-sm-12']/text()").get()
        # Posting Date
        vacancy['published_date'] = response.xpath(
            "//div[@class='c-jobdetails']"
            "//div[@class='header-job']"
            "//div[@class='c-banner__jobinfo-ligne'][1]"
            "/span[2]/text()").get()
        # Division
        vacancy['division'] = response.xpath(
            "//div[@class='c-jobdetails']"
            "//div[@class='header-job']"
            "//div[@class='c-banner__jobinfo-ligne'][2]"
            "/span[2]/text()").get()
        # Location
        vacancy['location'] = str(
            response.xpath("//div[@class='c-jobdetails']"
                           "//div[@class='header-job']"
                           "//div[@class='c-banner__jobinfo-ligne'][3]"
                           "/span[2]/text()").get()).strip()
        # Reference Code
        vacancy['reference_code'] = str(
            response.xpath("//div[@class='c-jobdetails']"
                           "//div[@class='c-banner__jobinfo-botton-ligne']"
                           "//span[text()='External code']"
                           "/parent::*/span[2]/text()").get()).strip()
        # Functional Area
        vacancy['job_family'] = str(
            response.xpath("//div[@class='c-jobdetails']"
                           "//div[@class='c-banner__jobinfo-botton-ligne']"
                           "//span[text()='Job family']"
                           "/parent::*/span[2]/text()").get()).strip()
        # Contract Type
        vacancy['contract_type'] = str(
            response.xpath("//div[@class='c-jobdetails']"
                           "//div[@class='c-banner__jobinfo-botton-ligne']"
                           "//span[text()='Contract type']"
                           "/parent::*/span[2]/text()").get()).strip()
        # Work Experience
        vacancy['work_experience'] = str(
            response.xpath("//div[@class='c-jobdetails']"
                           "//div[@class='c-banner__jobinfo-botton-ligne']"
                           "//span[text()='Experience level']"
                           "/parent::*/span[2]/text()").get()).strip()
        # Working Time
        vacancy['working_time'] = str(
            response.xpath("//div[@class='c-jobdetails']"
                           "//div[@class='c-banner__jobinfo-botton-ligne']"
                           "//span[text()='Working Time']"
                           "/parent::*/span[2]/text()").get()).strip()
        # Description
        vacancy['description'] = html2text.html2text(
            response.xpath("//div[@class='has-padding c-contentjob']"
                           "//h2[text()='Job Description']"
                           "/parent::*/div[2]").get())

        self.logger.info('Processing vacancy: %s --> %s', vacancy.get('title'), vacancy.get('url'))
        self.scraped_data.append(vacancy)

    def parse_start_url(self, response: scrapy.http.Response):
        """
        This dummy method is required so that the initial listing page is not
        skipped during indexing.

        :param response:
        :return:
        """
        return self.parse_vacancies_links(response)

    def closed(self, reason: str) -> None:
        """
        This method is called on spider closing and prints the parsed
        vacancies for debugging purposes. It will be removed in the future.

        :param reason: The reason of the spider closing
        :return:
        """
        print("Spider will close, reason: {}".format(reason))
        print("The scraped data is as follows:")
        print(self.scraped_data)

    rules = (scrapy.spiders.Rule(
        LinkExtractor(allow=(),
                      restrict_css=('a.c-pagination--item.link.'
                                    'c-jobsearchpage_searchlink.current', ),
                      tags=('a', ),
                      attrs=('href', ),
                      process_value=get_next_listing_page.__func__),
        callback="parse_vacancies_links",
        follow=True), )
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='product-description']/div[@class='pd-name']/h1",
    'price': "//div[@class='pctemp chudam']/div/span[@class='sub1 txt_20 chudam']",
    'category': "//div[@id='location_hunv']/div[1]/a",
    'description': "//div[@class='pcdetails']/div/p",
    'images': "//div[@class='framehb imgd11 pn_img']//a/@href | //div[@class='thumb-view']//a/@href",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "",
    'brand': ""
}
name = 'shopbevame.com'
allowed_domains = ['shopbevame.com']
start_urls = ['http://shopbevame.com/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(), 'parse_item'),
    Rule(LinkExtractor(), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
XPATH = {
    'name': "//div[@id='left_']/div[3]/h2",
    'price': "//div[@class='specifications']/div[5]/ul/li/form/label/p[2]/span",
    'category': "//div[@id='left_']/div[1]/a",
    'description': "//div[@class='tabcontentstyle']/div[@id='tab2']/div[@id='stcpDiv']",
    'images': "//div[@class='clearfix chitietsp']/img/@data-large",
    'canonical': "",
    'base_url': "",
    'brand': "",
    'in_stock': "",
    'guarantee': "",
    'promotion': ""
}
name = 'puritanpride.vn'
allowed_domains = ['puritanpride.vn']
start_urls = ['http://www.puritanpride.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = ['']
rules = [
    Rule(LinkExtractor(allow=['/detail-\d+-[a-zA-Z0-9-]+\.html$']), 'parse_item'),
    Rule(LinkExtractor(allow=['/catergories3-\d+-[a-zA-Z0-9-]+\.html']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
class FacultyPagesFilteredSpider(scrapy.Spider):
    name = 'faculty_pages_filtered'
    allowed_domains = [
        'cmu.edu', 'cornell.edu', 'washington.edu', 'gatech.edu',
        'princeton.edu', 'utexas.edu', 'illinois.edu', 'berkeley.edu',
        'mit.edu', 'stanford.edu'
    ]
    count = 0
    record = {}
    start_urls = [
        'https://www.cmu.edu/',
        'https://www.cornell.edu/',
        'https://www.washington.edu/',
        'https://www.gatech.edu/',
        'https://www.princeton.edu/',
        'https://www.utexas.edu/',
        'https://illinois.edu/',
        'https://www.berkeley.edu/',
        'https://www.mit.edu/',
        'https://www.stanford.edu/'
    ]
    exclude_words = [
        'news', 'events', 'publications', 'pub', 'gallery', 'category',
        'courses', 'students', 'references', 'reference', 'software',
        'softwares', 'tags', 'tutorials', 'workshop', 'festival',
        'admissions', 'exhibitions', 'alumni', 'lectures', 'undergraduate',
        'about', 'history', 'awards', 'ranking', 'enrollment', 'graduate',
        'archive', 'stories', 'post', 'pages', 'magazine', 'curriculum',
        '404', 'faqs', 'engage', 'campaign', 'career', 'resources',
        'services', 'network', 'security', 'donate', 'giving', 'finance',
        'forms', 'policies', 'policy', 'alphabetical', 'summer', 'winter',
        'spring', 'autumn', 'fall', 'health', 'facilities', 'facility', 'wp',
        'information', 'general', 'catalog', 'guides', 'library', 'publish',
        'blog', 'collection', 'share', 'search', 'periodicals', 'bookstore',
        'store', 'product', 'organisation', 'webstore', 'funding', 'pdf'
    ]
    rules = [Rule(LinkExtractor(unique=True), callback='parse', follow=True)]
    #count_limits = {"page_count": 200, "item_count": 200}

    def __init__(self):
        self.tree = Tree()
        self.tree.create_node("root", "root")
        self.tree.create_node("unknown", "unknown", parent="root")
        self.bio_identifier = BioIdentifier(model="bio-model")
        for dom in self.allowed_domains:
            domain = dom.split('.')[0]
            if not os.path.exists('Crawled_Data'):
                os.makedirs('Crawled_Data')
            folder_name = 'Crawled_Data/' + domain.capitalize() + '_University_Files'
            self.record[domain] = 0
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)

    def parse(self, response):
        matched_domain = [x for x in self.allowed_domains if x in response.url]
        if len(matched_domain) > 0:
            domain = matched_domain[0].split('.')[0]
            folder_name = 'Crawled_Data/' + domain.capitalize() + '_University_Files'
            self.record[domain] = self.record.get(domain, 0) + 1
            if self.record[domain] % 50 == 0:
                print('\n Crawled {} Bio-pages of {} University ...'.format(self.record[domain], domain.capitalize()))
                self.tree.save2file(folder_name + "/00__" + str(self.record[domain]) + "_tree.txt")
            isBio = self.bio_identifier.is_bio_html_content(response.xpath('//*').get())
            if isBio:
                text = BeautifulSoup(response.xpath('//*').get(), features="html.parser").get_text()
                tokens = nltk.word_tokenize(text)
                normalized_text = ' '.join([word for word in tokens if word.isalnum()])
                normalized_text += '\n' + response.url
                hash_text = hashlib.md5(response.url.encode())
                file_name = hash_text.hexdigest()
                with open(folder_name + "/" + file_name + ".txt", "w", encoding="utf-8") as file:
                    file.write(normalized_text)
            AllLinks = LinkExtractor(allow_domains=domain + '.edu', unique=True).extract_links(response)
            for n, link in enumerate(AllLinks):
                if not any([x in link.url for x in self.exclude_words]):
                    if self.tree.get_node(link.url) == None:
                        referer = response.request.headers.get('Referer', None)
                        if referer == None:
                            self.tree.create_node(link.url, link.url, parent='root')
                        else:
                            referer = referer.decode("utf-8")
                            if self.tree.contains(referer):
                                self.tree.create_node(link.url, link.url, parent=referer)
                            else:
                                self.tree.create_node(link.url, link.url, parent='unknown')
                        yield scrapy.Request(url=link.url, callback=self.parse)
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='infomation_detail infomation']/h1/strong/span",
    'price': "//td[@class='voucher_td']/p[@class='price_room']/span[@class='price_show']",
    'category': "//ul[@class='main_header_ul']/li/a/span",
    'description': "//div[@class='cont_toggle']/div[@id='detail_description']",
    'images': "//td[@id='td_show_img']/img/@src | //div[@class='sliderkit-nav-clip']/ul/li/a/img/@data-large",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "",
    'brand': ""
}
name = 'mytour.vn'
allowed_domains = ['mytour.vn']
start_urls = ['http://mytour.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/\d+-[a-zA-Z0-9-]+\.html']), 'parse_item'),
    Rule(LinkExtractor(allow=['/c+\d+/.+\.html($|\?page=\d+$)']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='content-info clearfix']//h1",
    'price': "//div[@class='bigprice']",
    'category': "//div[@class='dieuhuong fl']/a",
    'description': "//div[@class='sp-tab clearfix']/div[@class='content']",
    'images': "//div[@class='main-img']/a[@class='fancy']/img/@src",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "",
    'brand': "",
    'in_stock': "",
    'guarantee': "",
    'promotion': ""
}
name = 'dieuhoabonmua.vn'
allowed_domains = ['dieuhoabonmua.vn']
start_urls = ['http://dieuhoabonmua.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = ['']
rules = [
    Rule(LinkExtractor(allow=['/[\w-]+\.html$']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[\w-]+/($|page/\d+/$)']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
class GoogleSpider(CrawlSpider):
    name = "google"
    allowed_domains = ["play.google.com"]
    start_urls = [
        'https://play.google.com/store',
        #'https://play.google.com/store/apps/category/GAME/collection/topselling_free',
        'https://play.google.com/store/apps/details?id=com.viber.voip'
    ]

    rules = [
        Rule(LinkExtractor(allow=("https://play\.google\.com/store/apps/details", )),
             callback='parse_app',
             follow=True),
    ]

    # CrawlSpider follows the rules above and calls this callback for matching pages
    def parse_app(self, response):
        # Here we only collect the page URL and the download-related counts
        item = GoogleplayItem()
        item['url'] = response.url
        app_id = response.url.split('=')[-1]
        if app_id:
            item['app_id'] = app_id
        else:
            item['app_id'] = ''
        rate_count = response.xpath('//span[@class="rating-count"]/text()')
        if rate_count:
            rate_count = rate_count.extract()[0].strip().replace(',', '')
            item['rating_count'] = rate_count
        # app_name_div = response.xpath('//div[@class="id-app-title"]/text()')
        # if not app_name_div:
        #     logging.error(msg='not find the app name')
        #     return
        # item['app_name'] = app_name_div.extract()[0].strip()
        #
        # mail_a = response.xpath('//div[@class="content contains-text-link"]/a[2]/@href')
        # if not mail_a:
        #     return
        #
        # mail_text = mail_a.extract()[0]
        # if 'mailto:' in mail_text:
        #     mail_text = mail_text.replace('mailto:', '')
        #     item['mail'] = mail_text.strip()
        #
        # company_name_span = response.xpath('//span[@itemprop="name"]/text()')
        # if not company_name_span:
        #     return
        #
        # company_name = company_name_span.extract()[0].strip()
        # item['company_name'] = company_name
        #
        # download_count = response.xpath('//div[@itemprop="numDownloads"]/text()')
        # if download_count:
        #     item['download_count'] = download_count.extract()[0].strip()
        # else:
        #     item['download_count'] = '0'
        yield item
class NewZimukuCrawler(scrapy.Spider):
    name = "new_zimuku_crawler"
    allowed_domains = [
        "zimuku.la",
        "subku.net",
    ]

    # List to hold all search result pages
    url_list = [
        "https://www.zimuku.la/search?q=&p=1",
        "https://www.zimuku.la/search?q=&p=2",
    ]

    # Generate search result urls automatically
    counter = 3
    while counter <= 2369:
        url_list.append("https://www.zimuku.la/search?q=&p=" + str(counter))
        counter += 1
    print("[INFO]\tURL list generated.")

    start_urls = url_list

    rules = (Rule(LinkExtractor(allow=('\.htm'))), )

    def parse(self, response):
        # Find containers for download page link and file name
        containers = response.selector.xpath(
            '//div[contains(@class, "item prel")]/div[contains(@class, "title")]/div/table/tbody/tr/td[contains(@class, "first")]'
        )
        # Go through all containers
        for container in containers:
            # Get file name for that specific file
            file_name = container.xpath('a/@title')[0].extract()
            # Assign file name to new item
            item = NewZimukuCrawlerItem()
            item['file_name'] = file_name
            # Get link to download page
            href = container.xpath('a/@href')[0].extract()
            # Go to download page
            url = response.urljoin(href)
            request = scrapy.Request(url, callback=self.parse_detail)
            request.meta['item'] = item
            yield request

    # Download page for a specific subtitle
    def parse_detail(self, response):
        # Get link to provider selection page
        url = response.selector.xpath(
            '//li[contains(@class, "dlsub")]/div/a[contains(@id, "down1")]/@href'
        ).extract()[0]
        # Go to provider selection page
        request = scrapy.Request(url, callback=self.parse_download)
        request.meta['item'] = response.meta['item']
        yield request

    # Webpage that opens to select provider
    def parse_download(self, response):
        # Get url to actual file download
        url = response.selector.xpath(
            '//div[contains(@class, "down")]/ul/li/a/@href').extract()[4]
        # Download file
        request = scrapy.Request(url, callback=self.parse_file)
        request.meta['item'] = response.meta['item']
        yield request

    def parse_file(self, response):
        body = response.body
        item = response.meta['item']
        item['body'] = body
        return item
class CaijingSpider(CrawlSpider):
    name = 'caijing'
    source = "财经网"
    allowed_domains = ["caijing.com.cn"]
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    yesterday = yesterday.strftime('%Y%m%d')
    reg = yesterday
    start_urls = ['http://industry.caijing.com.cn/industrianews/']
    rules = (
        Rule(LinkExtractor(allow=reg), callback="parse_news", follow=True),
        Rule(LinkExtractor(allow='industrianews/[2-4].shtml')),
        # Rule(LinkExtractor(allow='industrianews/[0-9].shtml')),
    )

    def printcn(uni):
        for i in uni:
            print(uni.encode('utf-8'))

    def parse_news(self, response):
        item = GenericItem()
        self.get_id(response, item)
        self.get_url(response, item)
        self.get_source(response, item)
        self.get_title(response, item)
        self.get_date(response, item)
        self.get_body(response, item)
        # Remember to return the item after parsing
        if item['body']:
            return item

    def get_id(self, response, item):
        id = uuid.uuid4()
        if id:
            item['id'] = id

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            item['url'] = news_url

    def get_source(self, response, item):
        source = self.source
        if source:
            item['source'] = source

    def get_title(self, response, item):
        title = response.xpath('//*[@id="cont_title"]/text()').extract()
        if title:
            item['title'] = ''.join(title).strip()

    def get_date(self, response, item):
        date = response.xpath('//span[@id="pubtime_baidu"]/text()').extract()
        if date:
            item['date'] = ''.join(date).replace(u'-', u'').replace(u':', u'').replace(u' ', u'').strip()
        else:
            date = response.xpath('//span[@id="cont_riqi"]/text()').extract()
            if date:
                item['date'] = ''.join(''.join(date).replace(u'年', u'').replace(u'月', u'').replace(u'日', u'').replace(u':', u'').replace(u' ', u'').strip()[-12:]) + '00'

    def get_body(self, response, item):
        paras = response.xpath('//div[@id="the_content"]/p')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    # print entry.encode('utf-8')
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = news_body.replace('_|__|_', '_|_')
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h1",
    'price': "//ul[@class='list-unstyled']/li/h2",
    'category': "//ul[@class='breadcrumb']/li/a",
    'description': "//div[@class='tab-content']/div[@class='tab-pane active']",
    'images': "//ul[@class='thumbnails']/li/a/@href",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "//base/@href",
    'brand': ""
}
name = 'hethongtongdai.vn'
allowed_domains = ['hethongtongdai.vn']
start_urls = ['http://hethongtongdai.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/shop/[a-zA-Z0-9-/]+\.html$']), 'parse_item'),
    Rule(LinkExtractor(allow=['/shop/[a-zA-Z0-9-]+($|\?page=\d+$)']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
class P5wSpider(CrawlSpider):
    name = 'p5w'
    source = "全景网"  # site name used to tag items (p5w.net)
    allowed_domains = ["p5w.net"]
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    yesterday = yesterday.strftime('%Y%m%d')
    reg = yesterday

    # The original template listed every channel page explicitly; the list
    # below builds the same URL set (11 news channels x index pages 1-5).
    _channels = [
        'gncj', 'gjcj', 'cjxw', 'xwpl', 'biz', 'cjxw/fdcy',
        'cjxw/zzyjxsbyb', 'tech', 'travel', 'pgt', 'sjqx',
    ]
    start_urls = ['http://www.p5w.net/news/%s' % c for c in _channels]
    start_urls += ['http://www.p5w.net/news/%s/index_%d.htm' % (c, i)
                   for c in _channels for i in (2, 3, 4, 5)]

    rules = (
        # Only follow article links whose URL contains yesterday's date.
        Rule(LinkExtractor(allow=reg), callback="parse_news", follow=True),
    )

    @staticmethod
    def printcn(uni):
        # Debug helper: print a unicode string (UTF-8 output in Python 3).
        print(uni)

    def parse_news(self, response):
        item = GenericItem()
        self.get_id(response, item)
        self.get_url(response, item)
        self.get_source(response, item)
        self.get_title(response, item)
        self.get_date(response, item)
        self.get_body(response, item)
        # Remember to return the item after parsing.
        if item['body']:
            return item

    def get_id(self, response, item):
        id = uuid.uuid4()
        if id:
            item['id'] = id

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            item['url'] = news_url

    def get_source(self, response, item):
        source = self.source
        if source:
            item['source'] = source

    def get_title(self, response, item):
        title = response.xpath('//div[@class="newscontent_right2"]/h1/text()').extract()
        if title:
            item['title'] = ''.join(title).strip()

    def get_date(self, response, item):
        date = response.xpath('//div[@class="content_info clearfix"]/span[1]/time/text()').extract()
        if not date:
            date = response.xpath('//span[@id="dTime"]/text()').extract()
        if date:
            # Normalise "MM月DD日 HH:MM" into "YYYYMMDDHHMM" plus trailing "00" seconds.
            item['date'] = (datetime.date.today().strftime('%Y')
                            + ''.join(date).replace(u'月', u'').replace(u'日', u'')
                            .replace(u':', u'').replace(u' ', u'').strip()
                            + '00')

    def get_body(self, response, item):
        paras = response.xpath('//div[@class="article_content2"]/div/p')
        if not paras:
            paras = response.xpath('//div[@class="Custom_UnionStyle"]/p')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = news_body.replace('_|__|_', '_|_')
class lagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']
    agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/" \
            "537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"

    # Without these settings every request is redirected to the login page.
    # The Cookie value below is a captured session and will expire; refresh it
    # before running the spider.
    custom_settings = {
        "COOKIES_ENABLED": False,
        "DOWNLOAD_DELAY": 1,
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Cookie': 'JSESSIONID=ABAAABAAAFCAAEGBC99154D1A744BD8AD12BA0DEE80F320; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; _ga=GA1.2.1111395267.1516570248; _gid=GA1.2.1409769975.1516570248; user_trace_token=20180122053048-58e2991f-fef2-11e7-b2dc-525400f775ce; PRE_UTM=; LGUID=20180122053048-58e29cd9-fef2-11e7-b2dc-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=7e9c503b9a29e06e6d130f153c562827; _gat=1; LGSID=20180122055709-0762fae6-fef6-11e7-b2e0-525400f775ce; PRE_HOST=github.com; PRE_SITE=https%3A%2F%2Fgithub.com%2Fconghuaicai%2Fscrapy-spider-templetes; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2F4060662.html; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1516569758,1516570249,1516570359,1516571830; _putrc=88264D20130653A0; login=true; unick=%E7%94%B0%E5%B2%A9; gate_login_token=3426bce7c3aa91eec701c73101f84e2c7ca7b33483e39ba5; LGRID=20180122060053-8c9fb52e-fef6-11e7-a59f-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1516572053; TG-TRACK-CODE=index_navigation; SEARCH_ID=a39c9c98259643d085e917c740303cc7',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
    }

    rules = (
        # Rule(LinkExtractor(allow=('zhaopin/.*',)), follow=True),
        # Rule(LinkExtractor(allow=('gongsi/j\d+.html',)), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+\.html'), callback='parse_job', follow=True),
    )

    def parse_job(self, response):
        """Parse a job posting page on lagou.com."""
        item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary_min", ".job_request .salary::text")
        item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years_min", "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")
        item_loader.add_css("tags", '.position-label li::text')
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())

        job_item = item_loader.load_item()
        return job_item
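# --- Illustrative sketch ----------------------------------------------------
# parse_job above relies on LagouJobItem, LagouJobItemLoader and get_md5, none
# of which are defined in this file. A minimal sketch consistent with the
# fields the loader fills is shown here; it is an assumption, not the
# project's actual items module.
import hashlib
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst  # newer Scrapy: itemloaders.processors

def get_md5(url):
    # Stable identifier used to deduplicate job URLs.
    return hashlib.md5(url.encode('utf-8')).hexdigest()

class LagouJobItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary_min = scrapy.Field()
    job_city = scrapy.Field()
    work_years_min = scrapy.Field()
    degree_need = scrapy.Field()
    job_type = scrapy.Field()
    tags = scrapy.Field()
    publish_time = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field()
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    crawl_time = scrapy.Field()

class LagouJobItemLoader(ItemLoader):
    # Keep the first extracted value for every field instead of a list.
    default_output_processor = TakeFirst()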
class ScrutinSpider(BaseSpider):
    # BaseSpider is presumably a project-level CrawlSpider subclass that
    # provides the get_text() and make_url() helpers used below.
    name = "scrutinspider"

    rules = [
        Rule(LinkExtractor(allow=[r'/scrutins/liste/.*']),
             'parse_an_scrutins', follow=True),
        Rule(LinkExtractor(allow=[r'/scrutin-public/scr\d+.html']),
             'parse_senat_session', follow=True)
    ]

    start_urls = [
        'http://www2.assemblee-nationale.fr/scrutins/liste/',
        'http://www.senat.fr/seancepub.html'
    ]

    def parse_an_scrutins(self, response):
        for scrutin in response.xpath('//table[@class="scrutins"]/tbody/tr'):
            item = ScrutinItem()
            item['chambre'] = 'AN'
            item['numero'] = self.get_text(scrutin, 'td[1]').rstrip('*')
            item['objet'] = self.get_text(scrutin, 'td[3]').strip(' [').capitalize()
            item['url'] = self.make_url(response, scrutin.xpath(
                'td/a[contains(text(), "analyse")]/@href')[0].extract())

            matches = re.search(r'(\d{1,2})/(\d{1,2})/(\d{1,4})',
                                self.get_text(scrutin, 'td[2]'))
            item['date'] = '-'.join((matches.group(3), matches.group(2),
                                     matches.group(1)))

            try:
                item['dossier_url'] = self.make_url(response, scrutin.xpath(
                    'td/a[contains(text(), "dossier")]/@href')[0].extract())
            except IndexError:
                pass

            yield item

    def parse_senat_session(self, response):
        for bloc in response.xpath('//div[@class="blocscr"]'):
            href = bloc.xpath('span[@class="blocscrnr"]/a/@href')[0].extract()
            # Note: an XPath starting with '//' searches the whole page, not
            # just this bloc.
            dlink = bloc.xpath('//a[contains(@href, "/dossier-legislatif/")]/@href')

            req = Request(url=self.make_url(response, href),
                          callback=self.parse_senat_scrutin)
            if len(dlink):
                req.meta['dlink'] = dlink[0].extract()
            yield req

    def parse_senat_scrutin(self, response):
        item = ScrutinItem()
        item['chambre'] = 'SEN'

        titlediv = response.xpath('//div[@class="title"]')[0]
        title = self.get_text(titlediv, 'h1')

        matches = re.search(r'scrutin-public/(\d+)/scr.*\.html', response.url)
        session = matches.group(1)

        matches = re.search(r'^Scrutin n° (\d+) - séance du (.*)$', title)
        item['numero'] = '%s-%s' % (session, matches.group(1))

        objet = self.get_text(response, '//div[@id="wysiwyg"]/p/i')
        item['objet'] = objet
        item['url'] = response.url

        dmatches = re.search(r'^(\d+) (\D+) (\d+)$', matches.group(2))
        item['date'] = '%04d-%02d-%02d' % (int(dmatches.group(3)),
                                           _months[dmatches.group(2)],
                                           int(dmatches.group(1)))

        if 'dlink' in response.meta:
            item['dossier_url'] = response.meta['dlink']
        else:
            dlink = response.xpath('//a[contains(@href, "/dossier-legislatif/")]/@href')
            if len(dlink):
                item['dossier_url'] = self.make_url(response, dlink[0].extract())

        yield item
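# --- Illustrative sketch ----------------------------------------------------
# ScrutinSpider depends on a _months lookup and on the get_text()/make_url()
# helpers of its BaseSpider, none of which appear in this file. The sketch
# below shows plausible definitions consistent with how they are called above;
# it is an assumption, not the project's actual base class.
from urllib.parse import urljoin
from scrapy.spiders import CrawlSpider

# French month names as they appear in "séance du <jour> <mois> <année>".
_months = {
    'janvier': 1, 'février': 2, 'mars': 3, 'avril': 4,
    'mai': 5, 'juin': 6, 'juillet': 7, 'août': 8,
    'septembre': 9, 'octobre': 10, 'novembre': 11, 'décembre': 12,
}

class BaseSpider(CrawlSpider):
    """Hypothetical base class providing the helpers used by ScrutinSpider."""

    def get_text(self, selector, xpath):
        # Concatenated, stripped text content of the node(s) matching xpath.
        return ''.join(selector.xpath(xpath + '//text()').extract()).strip()

    def make_url(self, response, href):
        # Resolve a possibly relative href against the current page URL.
        return urljoin(response.url, href)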