def parse(self, response):
    PrintLog.print_start_flag(self.parse.__name__)
    sel = Selector(response)
    # pdb.set_trace()
    # print response.url
    # print response.body
    # Extract the friends list
    '''
    The regex below locates and extracts the ... part of the string
    'user_list : [...],':
    (?<=               # start of the prefix assertion (lookbehind)
      user_list : \[   # match the literal 'user_list : ['
    )                  # end of the prefix
    [\s\S]*            # match any text
    (?=                # start of the suffix assertion (lookahead)
      \],              # match the literal '],'
    )                  # end of the suffix
    '''
    friends = sel.re(r'(?<=user_list : \[)[\s\S]*(?=\],)')
    yield self.parse_friends_list(friends_list=friends)

    # Try to fetch the next page
    # pdb.set_trace()
    PrintLog.print_log("get next page")
    page_count_str_list = sel.re(r'pageCount :\s*(.*)')
    if page_count_str_list:
        m = re.findall(r"\d+", page_count_str_list[0])  # all digits of the page count
        self.total_page_count = int(m[0])
        # print "page_count_num=", self.total_page_count
        self.page_num += 1  # next page number
        if self.page_num < self.total_page_count:
            yield self.request_page(page_idx=self.page_num)
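As a standalone sanity check of the look-around pattern above, the same regex can be run with the re module directly; the sample string below is made up for illustration, not taken from a real page:

import re

sample = 'var opts = { user_list : [1001,1002,1003], pageCount : 5 };'
print(re.search(r'(?<=user_list : \[)[\s\S]*(?=\],)', sample).group(0))
# -> 1001,1002,1003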
def parse(self, response):
    sel = Selector(response)
    item = MarketScrapyItem()
    # item['name'] = 'test'
    item['name'] = sel.re('id=\"mdname\"(.*?)</span')
    item['bus'] = sel.re('id=\"inforBody\".*')
    return item
def parse_article(self, response):
    item = WechartAccount()
    sel = Selector(response)
    user_name = sel.re('<span class="profile_meta_value">([^<]*)</span>')
    nickname = sel.re('var nickname = "([^"]*)";')
    image_url = sel.re('hd_head_img : "([^"]*)"')
    item['nickname'] = ''.join(nickname)
    item['user_name'] = user_name[0]
    item['image_url'] = ''.join(image_url)
    return item
def parse_item(self, response):
    item = TupianItem()
    x = Selector(response)
    imgs = x.re('src="(http.*?\.jpg)".*?alt')
    imgname = x.re('src.*?alt="(.*?)"')
    for i in range(len(imgs)):
        item['tupianming'] = imgname[i]
        item['images_urls'] = [imgs[i]]
        item['leibie'] = x.xpath('//span/h1/text()').extract()
        yield item
def parse_item(self, response):
    item = TianqiItem()
    x = Selector(response)
    rq_list = x.re('(\d+年\d+月\d+日)')
    tqzk_list = x.re('([\u4e00-\u9fa5]+\s+/[\u4e00-\u9fa5]+)')[::2]
    qw_list = x.re('\d+℃\s+/\s+-?\d+℃')
    for i in range(len(tqzk_list)):
        item['rq'] = rq_list[i]
        item['tqzk'] = re.sub('\s+', '', tqzk_list[i])
        item['qw'] = re.sub('\s+', '', qw_list[i])
        yield item
def parse_product(self, response):
    self.response = response
    selector = Selector(response=response)
    ids = selector.re('"skuId":"(\w+)"')
    savings = selector.re('"SAVE":"(\d+)%"')
    options = selector.re('"Option":"(.+?)"')
    prices = selector.re(r'"RP":".(\d+\.\d{2})"')
    name = selector.re('productDisplayName="(.+?)"')
    items = self.load(prices, savings, ids, name[0], options)
    return items
def parse_news(self, response):
    sel = Selector(response)
    pattern = re.match(self.url_pattern, str(response.url))
    item = SinaItem()
    item['source'] = 'sina'  # pattern.group(1)
    item['date'] = ListCombiner(str(pattern.group(2)).split('-'))
    item['newsId'] = sel.re(r'comment_id:(\d-\d-\d+)')[0]
    item['cmtId'] = item['newsId']
    item['channelId'] = sel.re(r'comment_channel:(\w+);')[0]
    item['comments'] = {'link': str('http://comment5.news.sina.com.cn/comment/skin/default.html?channel='
                                    + item['channelId'] + '&newsid=' + item['cmtId'])}
    item['contents'] = {'link': str(response.url), 'title': u'', 'passage': u''}
    item['contents']['title'] = sel.xpath("//h1[@id='artibodyTitle']/text()").extract()[0]
    item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
    return item
def parse_news(self, response):
    sel = Selector(response)
    pattern = re.match(self.url_pattern, str(response.url))
    datenow = GetDate()
    item = SinaItem()
    item['source'] = 'sina'  # pattern.group(1)
    item['date'] = datenow
    item['newsId'] = sel.re(r'comment_id:(\d+-\d-\d+)')[0]
    item['cmtId'] = item['newsId']
    item['channelId'] = sel.re(r'comment_channel:(\w+);')[0]
    item['comments'] = {'link': str('http://comment5.news.sina.com.cn/page/info?format=json&channel='
                                    + item['channelId'] + '&newsid=' + item['cmtId']
                                    + '&group=0&compress=1&ie=gbk&oe=gbk&page=1&page_size=100&jsvar=requestId_24')}
    item['contents'] = {'link': str(response.url), 'title': u'', 'passage': u''}
    item['contents']['title'] = sel.xpath("//h1[@id='artibodyTitle']/text()").extract()[0]
    item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
    yield item
def parse_item(response):
    sel = Selector(response)
    url = response.request.url
    if re.match(r'.*?people.com.cn.*?/\d+/\d+/.*?', url) and 'BIG' not in url:
        content = response.xpath(
            '//*[@id="rwb_zw"]/p/text() | //*[@id="rwb_zw"]/p/strong/text()'
        ).extract()
        if content:
            item = NewsItem(
                domainname='http://people.com.cn',
                chinesename='人民网',
                url=sel.root.base,
                title=sel.css('div.text_title > h1::text').extract_first(),
                subtitle=sel.css('.sub::text').extract_first(),
                language='中文',
                encodingtype='utf-8',
                corpustype='网络',
                timeofpublish=sel.re(r'\d{4}年\d{2}月\d{2}日\d{2}:\d{2}')[0],
                content=''.join(content),
                source=sel.css('div.box01 > div.fl > a::text').extract_first(),
                author=sel.css('p.author::text').extract_first())
            item = judge_time_news(item)
            if item:
                yield item
def parse(self, response):
    selector = Selector(response)
    PageUrl = None
    # Grab the URL of the next page
    try:
        PageUrl = selector.re(
            "href=\"(http:.*\d)\" class=\"ui_page_item ui_page_next\"")[0]
    except IndexError:
        self.log('One done!' + response.url)
    # Grab the 15 itinerary summaries on this page
    divitems = selector.xpath("//div[@class='items']").extract()
    for item in divitems:
        result = QYURLItem()
        journey_name_pattern = re.compile(u'<dd>(.*)</dd>')
        url_pattern = re.compile('<a href=\"(//.*)\" class.*>')
        day_pattern = re.compile('<strong>(\d*)</strong>')
        line_pattern = re.compile('<p>(.*)</p>')
        date_pattern = re.compile(r'<dt>(20.*) 出发</dt>')
        lable_pattern = re.compile('<strong>(\W*)</strong>')
        result['journey_name'] = journey_name_pattern.findall(item)
        result['url'] = url_pattern.findall(item)
        result['day'] = day_pattern.findall(item)
        result['line'] = line_pattern.findall(item)
        result['date'] = date_pattern.findall(item)
        result['lable'] = lable_pattern.findall(item)
        yield result
    # self.log(PageUrl, level=log.DEBUG)
def parse_item(self, response):
    """ Main parse function """
    sel = Selector(response)
    item = ProductItem()
    item['source'] = 'tmall'
    item['name'] = self.get_product_name(sel)
    item['img'] = sel.xpath("//ul[@id='J_UlThumb']/li")[0].xpath(".//a/img/@src").extract()[0]
    item['category'] = self.get_category(response)
    try:
        # Grab the TShop string and normalise it towards valid JSON
        TShop_str = sel.re('TShop\.Setup\(((.|\n)+?)\);')[0]
        # Strip comments; currently only Tmall supermarket pages have them,
        # and they start with a comma
        regex = re.compile(',\s*\/\/[^\n]*')
        TShop_str = re.sub(regex, ',', TShop_str)
        TShop = eval(
            TShop_str,
            type('Dummy', (dict,), dict(__getitem__=lambda s, n: n))()
        )
    except SyntaxError:
        return
    item['itemId'] = TShop.get('itemDO').get('itemId', '')
    item['url'] = 'http://detail.tmall.com/item.htm?id=' + item['itemId']
    item['date'] = date.today().strftime('%Y-%m-%d')
    item['attr'], item['brand'] = self.get_attr_and_brand(sel)
    skuMap = self.get_sku_chinese_map(sel, TShop)
    initApi_url = TShop.get('initApi')
    yield Request(
        initApi_url,
        headers={'Referer': 'http://www.google.com.hk/'},
        meta={'item': item, 'skuMap': skuMap},
        callback=self.parse_initapi
    )
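The eval() call above is what turns the JavaScript object literal into a Python dict: the globals mapping passed to eval() is a dict subclass whose __getitem__ returns the looked-up name itself, so every bare identifier in the literal simply evaluates to its own name as a string. A minimal sketch of the trick on a made-up literal (not the real TShop payload):

js_literal = "{itemDO: {itemId: '12345'}, initApi: api_url}"
Dummy = type('Dummy', (dict,), dict(__getitem__=lambda s, n: n))
data = eval(js_literal, Dummy())
print(data['itemDO']['itemId'])  # -> 12345
print(data['initApi'])           # -> api_url (bare name resolved to its own string)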
def search(q):
    '''Search the library catalogue by book title.'''
    fdata = {
        'tag': 'search',
        'subtag': 'simsearch',
        'gcbook': 'yes',
        'viewtype': '',
        'flword': '',
        'q': q
    }
    resp = requests.post(search_url, data=fdata)
    # Get the number of matching records
    s_res = Selector(text=resp.content.decode('utf-8')).xpath('//p[@id="page"]/span/text()')
    # result_list is empty when nothing was found
    result_list = s_res.extract()
    if len(result_list) == 0:
        return "No records found"
    result_str = result_list[0]
    num = int(s_res.re('[\d]+')[0])
    if num > 3:
        note = ""
        if num > 10:
            note = "\nNote: only the first 10 results are shown; all results: " + search_url + "\n======"
        return result_str + "\n======" + note + getManyLinks(resp, num)
    else:
        return result_str + "\n======" + getdetail(resp, num)
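The snippet above feeds raw HTML fetched with requests into Scrapy's Selector through the text= argument, then chains .re() on the result of .xpath() to pull the record count out of the matched text. A tiny self-contained example of that pattern (the HTML fragment is invented for illustration):

from scrapy.selector import Selector

html = '<p id="page"><span>12 records found, page 1 of 2</span></p>'
s_res = Selector(text=html).xpath('//p[@id="page"]/span/text()')
print(s_res.extract()[0])           # -> 12 records found, page 1 of 2
print(int(s_res.re(r'[\d]+')[0]))   # -> 12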
def parse_product(self, response):
    sel = Selector(response)
    price = sel.re(re.compile('jsProductPrice = \'(.*)\';'))
    categories = sel.xpath('//div[@id="navBreadCrumb"]/a/text()')[1:].extract()
    brand = sel.xpath('//span[@class="product_manufacturer"]/text()').re('Manufactured by: (.*)')
    brand = brand[0].strip() if brand else ''
    sku = sel.xpath('//span[@class="product_model"]/text()').re('Ref: (.*)')
    sku = sku[0].strip() if sku else ''
    identifier = re.search('p-(.*)\.html', response.url).group(1)
    image_url = response.xpath('//div[@id="replace_image_zoom"]//img[@class="zoom_pic"]/@src').extract()
    if image_url:
        image_url = response.urljoin(image_url[0])
    name = sel.xpath('//h1[@class="productGeneral"]/text()').extract()
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', sku)
    loader.add_value('name', name)
    loader.add_value('price', price)
    price = loader.get_output_value('price')
    if price and Decimal(price) < Decimal('400.0'):
        loader.add_value('shipping_cost', Decimal('35.00'))
    loader.add_value('url', response.url)
    if image_url:
        loader.add_value('image_url', image_url)
    for category in categories:
        loader.add_value('category', category)
    loader.add_value('brand', brand)
    yield loader.load_item()
def myre(self, response, key_list):
    date = Selector(response)
    for key in key_list:
        # print key
        if date.re(key):
            return 1
    return 0
class ChainReactionReviews(ChainReaction):
    name = 'chain-reaction-reviews'
    response = None
    selector = None
    item = None
    loader = None

    def _register(self, response):
        self.response = response
        self.selector = Selector(response=response)
        self.item = response.meta['item'] if 'item' in response.meta.keys() else Review()
        self.loader = ChainReactionReviewLoader(self.item, response=self.response)

    def parse_product(self, response):
        self._register(response)
        self.loader.add_value('slug', self.selector.re('productDisplayName="(.+?)"'))
        self.loader.add_value('name', self.selector.re('productDisplayName="(.+?)"'))
        self.loader.add_value('retailer', RETAILER)
        self.loader.add_value('manufacturer', MANUFACTURER)
        request = Request(response.url + '/reviews.djs?format=embeddedhtml',
                          callback=self.parse_reviews)
        request.meta['item'] = self.loader.load_item()
        return request

    def parse_reviews(self, response):
        self._register(response)
        self.loader.add_value('review', 'review')
        self.loader.add_value('author', 'author')
        self.loader.add_value('date', 'date')
        return self.loader.load_item()
def parse_item(response):
    sel = Selector(response)
    url = response.request.url
    if re.match(r'.*?sina.com.*?/\d{4}-\d{2}-\d{2}/.*?', url):
        content = response.xpath('//*[@id="artibody"]//p//text()').extract()
        # Remove the editor credit
        editor = response.xpath('//*[@class="article-editor"]/text()').extract_first()
        if editor:
            content.remove(editor)
        publish_time = sel.re(r'\d{4}年\d{2}月\d{2}日.{0,1}\d{2}:\d{2}')[0]
        if ' ' in publish_time:
            publish_time = publish_time.replace(' ', '')
        if content:
            item = NewsItem(
                domainname='http://sina.com.cn',
                chinesename='新浪网',
                url=sel.root.base,
                title=sel.css('#artibodyTitle::text, #main_title::text').extract_first(),
                subtitle=sel.css('.sub::text').extract_first(),
                language='中文',
                encodingtype='utf-8',
                corpustype='网络',
                timeofpublish=publish_time,
                content=''.join(content),
                source=sel.xpath(
                    '//*[@data-sudaclick="media_name"]/text() | //*[@data-sudaclick="media_name"]/a/text()'
                ).extract_first(),
                author=None)
            item = judge_time_news(item)
            if item:
                yield item
def parse_item(response):
    sel = Selector(response)
    print(sel)
    url = response.request.url
    if re.match(r'.*?tibet.people.com.cn/.*?', url):
        print('---------------------')
        print(url)
        content = response.xpath('//html/body/div[2]/div[4]/div[1]/div[2]/div[2]/text()').extract()
        # '//*[@id="rwb_zw"]/p/text() | //*[@id="rwb_zw"]/p/strong/text()' | //*[@id="content"]/p[2]/span/span/text()
        print(content)
        if content:
            item = NewsItem(
                domainname='http://tibet.people.com.cn',
                chinesename='people',
                url=sel.root.base,
                title=sel.css('.gq_content > h1:nth-child(2)::text').extract_first(),
                subtitle=sel.css('.sub::text').extract_first(),
                language='tibet',
                encodingtype='utf-8',
                corpustype='网络',
                timeofpublish=sel.re(r'\d{4}.*?\d{2}.*?\d{2}.*?\d{2}:\d{2}')[0]
                    .replace('ལོའི་ཟླ་ ', '年').replace('ཚེས་', '月').replace('ཉིན། ', '日'),
                # timeofpublish=re.search(r'\d{4}.*?\d{2}.*?\d{2}', sel.css('.title_hd > p:nth-child(2)::text').extract_first()).group(0),
                content=''.join(content),
                # source=sel.css('.title_hd > p:nth-child(2)::text').extract_first(),
                # author=sel.css('.title_hd > p:nth-child(2)::text').extract_first()
            )
            print(item.get("title", None))
            print(item.get("timeofpublish", None))
            print(item.get("source", None))
            print(item.get("author", None))
            item = judge_time_news_people(item)
            if item:
                yield item
def parse_item(self, response):
    items = DoubanItem()
    x = Selector(response)
    lb = x.xpath('//div[@class="hd"]/h1/text()').extract()[0]
    zz_yz = x.re('作者</span.*?</a></span></span></p>')
    x_k = x.xpath('//div[@class="info"]')
    sm = x_k.xpath('./div[@class="title"]/a/text()').extract()
    jj = x_k.xpath('./div[@class="article-desc-brief"]/text()').extract()
    for i in range(len(sm)):
        items['lb'] = lb
        items['sm'] = sm[i]
        zz_k = re.findall('作者</span.*?</a></span></span>', zz_yz[i])
        items['zz'] = re.findall('([〕〔\u4e00-\u9fa5·\s]{2,})', zz_k[0])[1:]
        yz_k = re.findall('译者</span.*?</a></span></span>', zz_yz[i])
        if not yz_k:
            items['yz'] = None
        else:
            items['yz'] = re.findall('([〕〔\u4e00-\u9fa5·]+)', yz_k[0])[1:]
        pf = x_k[i].xpath('./div/span[@class="rating-average"]/text()').extract()
        if not pf:
            items['pf'] = None
        else:
            items['pf'] = pf[0]
        items['jj'] = jj[i]
        yield items
    # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
    # i['name'] = response.xpath('//div[@id="name"]').extract()
    # i['description'] = response.xpath('//div[@id="description"]').extract()
def parse_post(self, response):
    sel = Selector(response)
    post_has_been_removed = sel.xpath(self.post_parse_rules['invalid'])
    if len(post_has_been_removed) > 0:
        return None

    title = sel.xpath(self.post_parse_rules['title'])
    # postinfo = sel.xpath("//div[@class='postinginfos']")
    date_posted = sel.xpath(self.post_parse_rules['date_posted'])
    date_updated = sel.xpath(self.post_parse_rules['date_updated'])
    userbody = sel.xpath(self.post_parse_rules['body'])
    text = sel.xpath(self.post_parse_rules['text'])
    pictures = sel.re(self.post_parse_rules['picture'])

    match = re.search("(\d+)\.\w+$", response.url)
    filename, pid = (match.group(0), match.group(1))

    item = PostItem()
    item['pid'] = pid
    item['filename'] = filename
    item['region'] = self.region
    item['domain'] = self.domain
    item['url'] = response.url
    item['body'] = response.body
    item['pictures'] = list(set(pictures))  # Eliminate duplicates
    item['title'] = title.extract()[0]
    item['date_posted'] = date_posted.extract()[0]
    if len(date_updated) > 0:
        item['date_updated'] = date_updated.extract()[0]
    item['userbody'] = userbody.extract()[0]
    item['text'] = text.extract()[0]
    return item
def down_file(self, response):
    '''
    Download the software package.
    The request returns a snippet of JS; extract the redirect URL from it
    and download the file with curl.
    :param response:
    :return:
    '''
    try:
        item = response.meta['item']
        sel = Selector(response)
        # Extract the real download link of the package
        url = sel.re(r"window.location.href\s+=\s+\'([^\']+)")
        if url:
            # The download URL is percent-encoded; replace the stray '%2'
            # sequences and split out the file name
            otherurl = url[0].replace('%2', '/')
            # Parent directory for downloaded packages
            parent_file = os.path.sep + settings.PARENT_FILE_NAME
            if not os.path.exists(parent_file):
                os.mkdir(parent_file)
            # Path of the package file
            item['file_name'] = otherurl.split('/')[-1]
            filename = parent_file + os.path.sep + otherurl.split('/')[-1]
            # Build the curl command; -o stores the download in the given file
            commond = 'curl -i -o ' + filename + ' ' + url[0]
            recode = subprocess.call(commond, shell=True)
            print 'successful!'
            self.col.update({'vendor': self.vendor}, {'$set': {'state': 'crawling'}})
            yield item
    except Exception as e:
        print e.message
        self.col.update({'vendor': self.vendor}, {'$set': {'state': 'error'}})
def parseTmall(self, response):
    """ Tmall parser """
    sel = Selector(response)
    item = ProductItem()
    item['surl'] = response.url
    item['source'] = 'tmall'
    item['name'] = self.get_product_name(sel)
    try:
        # Grab the TShop string and normalise it towards valid JSON
        TShop_str = sel.re('TShop\.Setup\(((.|\n)+?)\);')[0]
        # Strip comments; currently only Tmall supermarket pages have them,
        # and they start with a comma
        regex = re.compile(',\s*\/\/[^\n]*')
        TShop_str = re.sub(regex, ',', TShop_str)
        TShop = eval(
            TShop_str,
            type('Dummy', (dict,), dict(__getitem__=lambda s, n: n))())
    except SyntaxError:
        return
    item['itemId'] = TShop.get('itemDO').get('itemId', '')
    item['url'] = 'http://detail.tmall.com/item.htm?id=' + item['itemId']
    initApi_url = TShop.get('initApi')
    yield Request(initApi_url,
                  headers={'Referer': 'http://www.google.com.hk/'},
                  meta={'item': item},
                  callback=self.parse_initapi)
def parse(self, response):
    """
    On the first call the response is the forum home page, which needs a
    different XPath rule; after the sub-forum URLs are collected from the
    home page, parse() is called again for each sub-forum URL.
    """
    sel = Selector(response)
    items = []
    if self.IndexFlag:
        self.IndexFlag = False
        # Because of how the BBS pages are built, XPath is not a great fit here
        urls = sel.re(r"(http://bbs2.99nets.me/forum.*html)")[1:-2]
        for url in urls:
            msg("------URL: %s -----" % url, level="DEBUG")
            yield Request(url, callback=self.parse)
        msg("-----Crawl END----", level="INFO")
    else:
        sites = sel.xpath("//form/table/tbody/tr/th/a[@class='s xst']")
        for site in sites:
            item = BbssecondItem()
            item['title'] = site.xpath("text()").extract()
            item['link'] = site.xpath("@href").extract()
            yield item
        url, text = response.url, sel.xpath("//title/text()").extract()
        msg("---CURRENT URL:%s \n ---TEXT:%s" % (url, str(text).encode("UTF8")), level='INFO')
        # Sub-forum pagination
        rule = "//div[@class='pg']/a[@class='nxt']/"
        if sel.xpath(rule + "text()").extract():
            yield Request(sel.xpath(rule + "@href").extract()[0], callback=self.parse)
def parse_item(response):
    sel = Selector(response)
    url = response.request.url
    if re.match(r'.*?sohu.com.*?/\d{4}\d{2}\d{2}/.*?', url):
        content = response.xpath('//*[@itemprop="articleBody"]//p//text()').extract()
        # Some paragraphs are not under a <p> tag, so fall back to a wider XPath
        if len(content) < 3:
            content = response.xpath(
                '//*[@itemprop="articleBody"]//p//text() | //*[@id="contentText"]//div/text()'
            ).extract()
        publish_time = sel.re(r'\d{4}-\d{2}-\d{2} {0,1}\d{2}:\d{2}:\d{2}')[0]
        if content:
            item = NewsItem(
                domainname='http://sohu.com',
                chinesename='搜狐网',
                url=sel.root.base,
                title=sel.xpath('//*[@itemprop="headline"]/text()').extract_first(),
                subtitle=sel.css('.sub::text').extract_first(),
                language='中文',
                encodingtype='utf-8',
                corpustype='网络',
                timeofpublish=publish_time,
                content=''.join(content),
                source=sel.xpath('//*[@id="media_span"]/span/text()').extract_first(),
                author=sel.xpath('//*[@id="author_baidu"]/text()').extract_first())
            item = judge_time_news(item)
            if item:
                yield item
def parse_page(self, response):
    print response.url
    sel = Selector(response)
    email = sel.re('(\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,6})')[0]
    print email
    if email not in email_in_file and email not in added_email:
        file.write(email + '\n')
        added_email.append(email)
def parse_item(self, response):
    sel = Selector(response)
    item = CommitsspiderItem()
    item['cve'] = sel.re(self.patt)
    if item['cve'] != []:
        item['url'] = response.url
        yield item
    return
def parse_page(self, response):
    sel = Selector(response)
    emails = sel.re('(\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,6})')
    emails = list(filter(lambda x: x != '*****@*****.**', emails))
    if bool(emails):
        for email in emails:
            if email not in email_in_file and email not in current_session_emails:
                file.write(email + '\n')
                current_session_emails.append(email)
def parse(self, response):
    sel = Selector(response)
    link = sel.re('((http).*?(\.mp4))')[0]
    name = sel.xpath('//title/text()').extract()[0]
    items = []
    item = MaizieduCourseItem()
    item['link'] = link
    item['name'] = name
    items.append(item)
    return items
def parse_item(self, response):
    response = Selector(response)
    itme = S80Item()
    itme['电影'] = response.xpath('//h1/text()').extract()
    itme['类型'] = response.xpath('//span[@class="span_block"][1]/a/text()').extract()
    itme['演员'] = response.re('a href="/actor/.*?>([\u4e00-\u9fa5·]+)<')
    itme['地区'] = response.xpath('//div[@class="clearfix"]/span[2]/a/text()')[0].extract()
    itme['语言'] = response.xpath('//div[@class="clearfix"]/span[3]/a/text()').extract()
    itme['导演'] = response.xpath('//div[@class="clearfix"]/span[4]/a/text()').extract()
    itme['片长'] = response.xpath('//div[@class="clearfix"]/span[6]/text()').extract()
    itme['上映时间'] = response.xpath('//div[@class="clearfix"]/span[5]/text()').extract()
    itme['下载链接'] = list(set(response.re('"(thunder://.*?)"')))
    return itme
def parse_page(self, response):
    sel = Selector(response)
    email = sel.re('(\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,6})')
    if bool(email):
        email = email[0]
        if email + "\n" not in email_in_file and email not in added_email:
            file.write(email + '\n')
            added_email.append(email)
            print "Spider: Mandy. Email {0} added to file".format(email)
        else:
            print "Spider: Mandy. Email {0} already in the file".format(email)
def get_asn(self, response):
    parent = response.meta['parent']
    base = parent.get('base')
    if base:
        sel = Selector(response)
        asn = ''.join(sel.re('([\d]+)\s+\<')) + ''.join(
            sel.xpath("//span[1]/text()").extract()).strip()
        print '______________________________'
        print asn
        base['asn'] = asn
    yield parent
def parse_item(self, response):
    selector = Selector(response)
    # Use regex to find valid emails
    emails = selector.re(self.EMAIL_REGEX)
    if emails:
        item = CompanyItem({'link': response.url})
        item['emails'] = emails
        return item
    return
def parse_news(self, response):
    sel = Selector(response)
    pattern = re.match(self.url_pattern, str(response.url))
    item = NewsItem()
    item['source'] = 'news.sina.com.cn'  # pattern.group(1)
    item['date'] = ListCombiner(str(pattern.group(2)).split('-'))
    item['newsId'] = sel.re(r'comment_id:(\d-\d-\d+)')[0]
    item['contents'] = {'link': str(response.url), 'title': u'', 'passage': u''}
    item['contents']['title'] = sel.xpath("//h1[@id='artibodyTitle']/text()").extract()[0]
    item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
    return item
def parse_get_user(self, response):
    # Part of this snippet is redacted in the source ("******"); the lines
    # marked "reconstructed" below are a best-effort guess around the redaction.
    log.msg("parse_get_user: " + response.url, level=log.INFO)  # reconstructed
    try:
        sel = Selector(response)                 # reconstructed
        users = sel.re(r'id=(.*?)\\"')           # regex tail as shown in the source
        for user in users:
            url = 'http://www.weibo.com/' + user
            yield Request(url=url, cookies=self.login_cookie,
                          callback=self.parse_user, meta={'url': url})
    except Exception, e:
        log.msg("Error for parse_get_user: " + response.url, level=log.ERROR)
        log.msg(str(e), level=log.ERROR)
def parse_page(self, response):
    sel = Selector(response)
    emails = sel.re('(\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,6})')
    emails = list(filter(lambda x: x != '*****@*****.**', emails))
    if bool(emails):
        for email in emails:
            if email + "\n" not in email_in_file and email not in current_session_emails:
                file.write(email + '\n')
                current_session_emails.append(email)
                print 'Spider: ProductionHub. Email {0} added to file'.format(email)
            else:
                print 'Spider: ProductionHub. Email {0} already in the file'.format(email)
def parseTmall(self, response):
    """ Tmall parser """

    def _referer():
        referer = response.request.headers.get('Referer')
        if referer and referer.find('list.tmall.com') > -1:
            rto = 'http://list.tmall.com/search_product.htm?'
            resultC = re.compile('[\?&]cat=(\d+)').search(referer)
            if resultC:
                rto += 'cat=%s' % resultC.group(1)
            resultQ = re.compile('[\?&]q=([^&]+)').search(referer)
            if resultQ:
                if resultC:
                    rto += '&q=%s' % resultQ.group(1)
                else:
                    rto += 'q=%s' % resultQ.group(1)
            if not 'http://list.tmall.com/search_product.htm?' == rto:
                return rto
        elif not referer and response.url.find('detail.tmall.com') > -1:
            return response.url
        return ''

    sel = Selector(response)
    item = ProductItem()
    item['source'] = 'tmall'
    item['name'] = self.get_product_name(sel)
    item['start_url'] = _referer()
    store = ''.join(
        sel.xpath('//input[@name="seller_nickname"]/@value').extract())
    item['tm_store'] = '[%s] %s' % (store[-3:], store) if len(store) > 3 else store
    try:
        # Grab the TShop string and normalise it towards valid JSON
        TShop_str = sel.re('TShop\.Setup\(((.|\n)+?)\);')[0]
        # Strip comments; currently only Tmall supermarket pages have them,
        # and they start with a comma
        regex = re.compile(',\s*\/\/[^\n]*')
        TShop_str = re.sub(regex, ',', TShop_str)
        TShop = eval(
            TShop_str,
            type('Dummy', (dict,), dict(__getitem__=lambda s, n: n))())
    except SyntaxError:
        return
    item['itemId'] = TShop.get('itemDO').get('itemId', '')
    item['url'] = response.url
    initApi_url = TShop.get('initApi')
    yield Request(initApi_url,
                  headers={'Referer': 'http://www.google.com.hk/'},
                  meta={'item': item},
                  dont_filter=True,
                  callback=self.parse_initapi)
def parse(self, response):
    sel = Selector(response)
    products = json.loads("{" + sel.re("\"products\":\[.*\]")[0] + "}")
    items = []
    for product in products["products"]:
        # print(product)
        item = SephoraItem()
        item['display_name'] = product["display_name"]
        item['list_price'] = product["derived_sku"]["list_price"]
        item['brand_name'] = product["brand_name"]
        item['rating'] = product["rating"]
        items.append(item)
    return items
def parse_news(self, response):
    sel = Selector(response)
    pattern = re.match(self.url_pattern, str(response.url))
    item = TencentItem()
    item['source'] = 'tencent'  # pattern.group(1)
    item['date'] = pattern.group(2)
    item['newsId'] = pattern.group(3)
    item['cmtId'] = (sel.re(r"cmt_id = (.*);"))[0]  # unicode string
    item['comments'] = {'link': str('http://coral.qq.com/') + item['cmtId']}
    item['contents'] = {'link': str(response.url), 'title': u'', 'passage': u''}
    item['contents']['title'] = sel.xpath('//h1/text()').extract()[0]
    item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
    return item
def parse(self, response):
    self.driver.get(response.url)
    self.driver.execute_script("window.scrollTo(0, 1600)")
    self.driver.find_element_by_xpath("//div[@class = 'index-showMoreText']").click()
    mount_root = Selector(text=self.driver.page_source)
    print(mount_root.re(r'url\("([^\")]+)'))
    row_keys = mount_root.xpath("//*[@class = 'index-rowKey']/text()").getall()
    row_values = mount_root.xpath("//*[@class = 'index-rowValue']/text()").getall()
    specifications = dict(zip(row_keys, row_values))
def parse4(self, response):
    print("PARSE4 GO!!!")
    selector = Selector(response=response)
    li = selector.re(r'<ul class="listContent">(.*?)</ul>')
    '''Iterate over all pages and check whether the <li> block exists'''
    if not li:
        # Get the detail-page URLs via XPath
        url_pattern = r'//div[@class="content"]/div[1]/ul[@class="listContent"]/li/a/@href'
        url_list = selector.xpath(url_pattern).extract()
        print("======Redis connection info: Host:{} Port:{}======".format(
            self.settings.get('REDIS_HOST'), self.settings.get('REDIS_PORT')))
        # Write the URLs into Redis
        print('PARSE1 starts writing')
        for u in url_list:
            print("About to write {}".format(u))
            self.r_link.rpush("Lianjia:detail_url", u)
            print("{} written successfully!".format(u))
        print('=' * 30, '\n', "Total URLs written: {}".format(len(url_list)), '\n', '=' * 30)
    elif li[0] != '':
        # Get the detail-page URLs via XPath
        url_pattern = r'//div[@class="content"]/div[1]/ul[@class="listContent"]/li/a/@href'
        url_list = selector.xpath(url_pattern).extract()
        # Start the Redis connection
        r_link = redis.Redis(port=self.settings.get('REDIS_PORT'),
                             host=self.settings.get('REDIS_HOST'),
                             decode_responses=True, db=1)
        print("======Redis connection info: Host:{} Port:{}======".format(
            self.settings.get('REDIS_HOST'), self.settings.get('REDIS_PORT')))
        # Write the URLs into Redis
        print('PARSE4 starts writing')
        for u in url_list:
            print("About to write {}".format(u))
            r_link.rpush("Lianjia:detail_url", u)
            print("{} written successfully!".format(u))
        print('=' * 30, '\n', "Total URLs written: {}".format(len(url_list)), '\n', '=' * 30)
    elif li[0] == '':
        pass
def parse_itme(self, response):
    print('1')
    x = Selector(response)
    names = x.xpath('//ul/li/b/text()').extract()
    leibie = x.re('小说分类:([\u4e00-\u9fa5]+)')
    links = LinkExtractor(allow=('/down/\d+.html'))
    link_list = links.extract_links(response)
    for link in link_list:
        yield Request(url=link.url, callback=self.parse_itme1,
                      meta={'name': names, 'leibie': leibie})
def parse_url(self, response):
    hxs = Selector(response)
    download_url = hxs.re('jjvod_url = \'(.+)\';')
    meta = response.request.meta
    imgs = meta.get('imgs')
    title = meta.get('title')
    url = meta.get('url')
    item = CCUrlItem()
    item['url'] = url
    item['download_url'] = download_url
    item['title'] = title
    item['image_urls'] = imgs
    yield item
def parse_news(self, response):
    sel = Selector(response)
    pattern = re.match(self.url_pattern, str(response.url))
    item = NeteaseItem()
    item['source'] = 'netease'  # pattern.group(1)
    item['date'] = '20' + pattern.group(2) + pattern.group(3)
    item['newsId'] = pattern.group(4)
    item['cmtId'] = item['newsId']
    item['boardId'] = sel.re(r"boardId = \"(.*)\"")[0]
    item['comments'] = {'link': str('http://comment.news.163.com/' + item['boardId']
                                    + '/' + item['cmtId'] + '.html')}
    item['contents'] = {'link': str(response.url), 'title': u'', 'passage': u''}
    item['contents']['title'] = sel.xpath("//h1[@id='h1title']/text()").extract()[0]
    item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
    return item
def parse(self, response):
    selector = Selector(response)
    p1 = re.compile('<ul class="clearfix" style="display:none;">(.*?)</ul>', re.S)
    # This regex finds seven <ul></ul> matches
    rp1 = selector.re(p1)
    p2 = re.compile(u'<a data-id="\d*" data-key="\D*" href="(http://.*)">(.*)</a>')
    r = Redis()
    for index, i in enumerate(rp1):
        rp2 = p2.findall(i)
        for j in rp2:
            con_item = ContryItem()
            con_item['name'] = j[1]
            con_item['url'] = j[0]
            con_item['con_id'] = index + 1  # continent ID foreign key
            Redis_utils.server.lpush('myspider:qyconpage_urls', j[0])
            yield con_item
def parse(self, response):
    selector = Selector(response=response)
    results = selector.re(self.WIF_REGEX)
    # self.root_logger.debug("REGEX: " + str(results))
    for wif in results:
        if self.balance_checker.is_valid(wif):
            info = self.balance_checker.get_key_info(wif)
            if info.balance > 0:
                self.root_logger.critical("CASH: " + str(info))
                self.root_logger.critical("URL: " + str(response.url))
                self.root_logger.critical("BODY: " + str(response.body))
    links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse)
def parse_stop_data(self, response):
    """Parse stop data from a response where the "from" and "to" stops were
    selected in the request."""
    sel = Selector(response)
    stop_name = sel.css('#confirm1_hlFrom').xpath('text()').extract()[0]
    stop_location = sel.css('#divFrom').xpath('p[1]/text()').extract()[0]
    latlong = sel.re(r'Location\(([\d\.\-,]+)\)')[0]
    (stop_lat, stop_long) = latlong.split(',')
    return StopItem(
        stop_name=stop_name,
        stop_location=self.sanitize_stop_location(stop_location),
        lat=stop_lat,
        long=stop_long
    )
def parse(self, response):
    sel = Selector(response)
    data = sel.re(r'\(([^)]+)\)')[0]
    try:
        data = json.loads(data)
    except Exception as e:
        print e
        self.log(e)
        print data
        self.log(data)
        print response.body

    url = urlparse.unquote(response.url)
    nums = re.search(
        r'"startPoint":{"latitude":(\d+.\d+),"longitude":(\d+.\d+)},"endPoint":{"latitude":(\d+.\d+),"longitude":(\d+.\d+)}',
        url).groups()
    slat, slong, elat, elong = [float(n) for n in nums]
    l = abs(slat - elat)
    w = abs(slong - elong)
    print 'EXPLORING:', (slat, slong), (elat, elong)
    self.log('EXPLORING: (%f, %f) (%f,%f)' % (slat, slong, elat, elong))
    quarters = [{'start': (slat, slong), 'wid': w / 2, 'len': l / 2},
                {'start': (slat, slong + w / 2), 'wid': w / 2, 'len': l / 2},
                {'start': (slat - l / 2, slong), 'wid': w / 2, 'len': l / 2},
                {'start': (slat - l / 2, slong + w / 2), 'wid': w / 2, 'len': l / 2}]
    if not data:
        self.log('Failed: No data')
        return
    for point in data['pointDataList']:
        coord = point['coordinate']
        if point['count'] > 1:
            for i, a in enumerate(quarters):
                if self.intersect(a, coord) and a['len'] >= self.minLen:
                    slat2, slong2 = a['start']
                    w2, l2 = a['wid'], a['len']
                    newUrl = self.getUrl((slat2, slong2), (slat2 - l2, slong2 + w2))
                    quarters.pop(i)
                    yield Request(newUrl)
                    break
        else:
            celldata = point['cellData']
            yield Cell(ID=celldata['cellId'],
                       networkID=celldata['mnc'],
                       latitude=coord['latitude'],
                       longitude=coord['longitude'])
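Each entry in quarters describes one quadrant of the current bounding box: 'start' is its north-west corner (latitude decreases going south, longitude increases going east), and 'wid'/'len' are its extent in degrees. The spider's intersect() helper is not shown in this snippet; a plausible containment test matching that layout might look like the sketch below (an assumption for illustration, not the original implementation):

def intersect(area, coord):
    # area: {'start': (lat, lng), 'wid': w, 'len': l}, with 'start' as the NW corner
    # coord: {'latitude': ..., 'longitude': ...}
    lat0, lng0 = area['start']
    lat, lng = coord['latitude'], coord['longitude']
    return (lat0 - area['len'] <= lat <= lat0 and
            lng0 <= lng <= lng0 + area['wid'])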
def parse(self, response):
    # log.msg(response.body, level=log.INFO)
    sel = Selector(response)
    # 1. all items in the current page
    urls = sel.re('<a href="(http://finance.sina.com.cn/stock/jsy/\\d+/\\d+.shtml)" target="_blank">')
    for url in urls:
        log.msg(url, level=log.INFO)
        yield Request(url, callback=self.parse_item)
    # 2. next page detection
    # pageBar = sel.css('#Main > div.listBlk > table:nth-child(1) > tbody > tr > td > div > span.pagebox_next')
    # pageBar = sel.xpath('//div[@id="Main"]/div[3]/table[1]/tbody/tr/td/div')
    # pageBar = response.xpath('//div[@id="Main"]/div[3]/table[1]/tbody/tr/td/div')
    pageBar = response.xpath('//span[@class="pagebox_next"]')
    if pageBar is not None and len(pageBar) > 0:
        pageTxt = pageBar.extract()[0]
        log.msg('matched txt:' + pageTxt, level=log.INFO)
        tail_url = self.reg_next_page.search(pageTxt)
        log.msg('NEXT PAGE: ' + tail_url.group(1), level=log.INFO)
        yield Request('http://roll.finance.sina.com.cn/finance/zq1/gsjsy/' + tail_url.group(1),
                      callback=self.parse)
def parse_article(self, response):
    article = response.meta["article"]
    # We'll be using a regex for pub_time, so we can't use the response's
    # selector shortcuts.
    sel = Selector(response=response)
    # TW adds a newline at the beginning of the title, so strip the string.
    article["title"] = sel.xpath("//span[@class='titlea']/text()").extract_first().strip()
    # Publication date and time are grouped in a span together with the
    # number of times the article was accessed, so use a regex to extract
    # just the date and time. Matches e.g.:
    # "04. July 2015. [17:16:00 UTC]"
    article["pub_time"] = sel.re(r"\d{2}\. \w+ \d{4}\. \[\d{2}:\d{2}:\d{2} UTC\]")[0]
    article["body"] = sel.xpath("//div[@class='articlebody']").extract_first()
    yield article
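Because the expected format is spelled out in the comment, the pub_time regex is easy to sanity-check on its own; in the snippet below the example timestamp from the comment is embedded in some invented surrounding text:

import re

sample = 'Published 04. July 2015. [17:16:00 UTC], read 1523 times'
print(re.search(r"\d{2}\. \w+ \d{4}\. \[\d{2}:\d{2}:\d{2} UTC\]", sample).group(0))
# -> 04. July 2015. [17:16:00 UTC]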
def parse_item(self, response):
    sel = Selector(response)
    items = sel.xpath(TABLE_TR_XPATH)
    for item in items:
        problem_id = item.xpath(PROBLEM_ID_XPATH).extract()[0].strip()
        submit_time = item.xpath(SUBMIT_TIME_XPATH).extract()[0].split(' ')[0]
        self.solved[problem_id] = submit_time
    if not sel.re(RULE_REGEX):
        yield AccountItem(**dict(
            origin_oj=self.origin_oj,
            username=self.username,
            solved=self.solved
        ))
        raise CloseSpider('Crawl finished')
    return