def parse(self, response):
    # requires: import re; from scrapy.selector import Selector
    proxy = response.meta['p']
    rootPath = proxy['root']
    for sel in Selector(response=response).xpath(rootPath):
        ipPath = proxy['ip']
        portPath = proxy['port']
        ipList = sel.xpath(ipPath).extract()
        portList = sel.xpath(portPath).extract()
        ip = Utility.listToStr(ipList)
        port = Utility.listToStr(portList)
        # match a dotted IPv4 address
        regex = r'\d{1,3}(\.\d{1,3}){3}'
        if re.match(regex, ip):
            print ip
            v = ValidateIp()
            protocol, anonymous, speed = v.validate(ip, port)
            if protocol != -1:  # -1 signals an unusable proxy
                ipItem = IpItem()
                ipItem['ip'] = ip
                ipItem['port'] = port
                print ipItem['ip'], ':', ipItem['port']
                yield ipItem
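# --- Hypothetical sketch (assumed, not shown in this repo): parse() above
# relies on a Utility.listToStr helper that joins an extract() list into one
# string, and on ValidateIp.validate(ip, port) returning a
# (protocol, anonymous, speed) tuple with protocol == -1 for a dead proxy.
# A minimal implementation consistent with that usage could be:
import time
import urllib2


class Utility(object):
    @staticmethod
    def listToStr(lst):
        # join the text fragments returned by extract() and trim whitespace
        return ''.join(lst).strip()


class ValidateIp(object):
    def validate(self, ip, port):
        # probe the proxy with a cheap HTTP request; -1 flags failure
        handler = urllib2.ProxyHandler({'http': 'http://%s:%s' % (ip, port)})
        opener = urllib2.build_opener(handler)
        try:
            start = time.time()
            opener.open('http://www.baidu.com', timeout=5)
            return 'http', 'unknown', time.time() - start
        except Exception:
            return -1, None, None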
def parseAccount(self, response):
    urls = []
    # page <title>, useful to confirm the right account page loaded
    title = response.xpath('/html/head/title//text()').extract()
    print Utility.listToStr(title)
    # account = response.xpath('//*[@class="profile_info"]//text()').extract()[0].strip()
    print '----------------------account-------------------------'
    for articlePath in Selector(response=response).xpath(
            '//*[@class="weui_media_box appmsg"]/div'):
        articleListItem = ArticleListItem()
        # title
        title = articlePath.xpath('./h4//text()').extract()[0].strip()
        articleListItem['title'] = title
        print articleListItem['title']
        # url: WeChat stores the article link in a custom "hrefs" attribute
        url = articlePath.xpath('./h4//@hrefs').extract()[0]
        url = "https://mp.weixin.qq.com" + url
        articleListItem['url'] = url
        print articleListItem['url']
        # date
        date = articlePath.xpath(
            './/*[@class="weui_media_extra_info"]//text()').extract()[0]
        articleListItem['date'] = date
        print articleListItem['date']
        # abstract
        abstract = articlePath.xpath(
            './/*[@class="weui_media_desc"]//text()').extract()
        articleListItem['abstract'] = Utility.listToStr(abstract)
        print articleListItem['abstract']
        urls.append(url)
    return urls
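# --- Hypothetical illustration: parseAccount() above expects the rendered
# account-history markup, one weui_media_box block per article, with the
# article link in the custom "hrefs" attribute. A minimal sample of that
# shape (all values invented) shows what the XPaths match:
from scrapy.http import HtmlResponse

sample_body = (
    '<html><head><title>Sample account</title></head><body>'
    '<div class="weui_media_box appmsg"><div>'
    '<h4 hrefs="/s?timeline=sample">Sample title</h4>'
    '<p class="weui_media_desc">Sample abstract</p>'
    '<p class="weui_media_extra_info">2016-01-01</p>'
    '</div></div></body></html>')
sample_response = HtmlResponse(url='https://mp.weixin.qq.com/profile',
                               body=sample_body)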
def parseArticle(self, response):
    articleItem = ArticleItem()
    title = response.xpath(
        '//*[@id="activity-name"]//text()').extract()[0].strip()
    date = response.xpath('//*[@id="post-date"]//text()').extract()[0]
    author = response.xpath(
        '//*[@id="img-content"]/div[1]/em[2]//text()').extract()
    account = response.xpath('//*[@id="post-user"]//text()').extract()[0]
    accountId = response.xpath(
        '//*[@id="js_profile_qrcode"]/div/p[1]/span//text()').extract()[0]
    content = response.xpath('//*[@id="js_content"]').extract()
    articleItem['title'] = title.encode('utf-8')
    articleItem['date'] = date.encode('utf-8')
    articleItem['account'] = account.encode('utf-8')
    articleItem['accountId'] = accountId.encode('utf-8')
    articleItem['content'] = Utility.listToStr(content).encode('utf-8')
    # the byline is optional, so fall back to an empty author
    if len(author) > 0:
        articleItem['author'] = author[0].encode('utf-8')
    else:
        articleItem['author'] = ''
    print articleItem['title']
    # Utility.writeArticleToFile(articleItem)
    yield articleItem
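# --- Hypothetical sketch: the item classes filled above are assumed to be
# plain scrapy.Item declarations along these lines (field names taken from
# the code; the project's real items.py may differ):
import scrapy


class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    date = scrapy.Field()
    author = scrapy.Field()
    account = scrapy.Field()
    accountId = scrapy.Field()
    content = scrapy.Field()


class ArticleListItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    date = scrapy.Field()
    abstract = scrapy.Field()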
def parse(self, response):
    for n in range(9):
        articleListItem = ArticleListItem()
        num = self.articleListCon
        # Sogou ids its search results sogou_vr_<num>_title_<n> / _summary_<n>
        rootPath = '//*[@id="sogou_vr_' + num
        titlePath = rootPath + '_title_' + str(n) + '"]'
        contentPath = rootPath + '_summary_' + str(n) + '"]'
        title = response.xpath(titlePath + '//text()').extract()
        url = response.xpath(titlePath + '/@href').extract()[0]
        content = response.xpath(contentPath + '//text()').extract()
        articleListItem['title'] = Utility.listToStr(title)
        articleListItem['url'] = url
        articleListItem['abstract'] = Utility.listToStr(content)
        print 'title==', articleListItem['title']
        print 'href==', articleListItem['url']
        print 'content=='
        print articleListItem['abstract']
        yield scrapy.Request(url, callback=self.parseArticle)
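# --- Illustration with assumed values: if articleListCon were '11002601',
# the concatenation above yields ids in Sogou's result naming scheme:
num, n = '11002601', 0
titlePath = '//*[@id="sogou_vr_' + num + '_title_' + str(n) + '"]'
contentPath = '//*[@id="sogou_vr_' + num + '_summary_' + str(n) + '"]'
assert titlePath == '//*[@id="sogou_vr_11002601_title_0"]'
assert contentPath == '//*[@id="sogou_vr_11002601_summary_0"]'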
def parse(self, response):
    for n in range(1):  # only the first account result is used
        accountListItem = AccountListItem()
        num = self.accountListCon
        rootPath = '//*[@id="sogou_vr_' + num + '_box_' + str(n) + '"]'
        accountIdPath = './div/div[2]/p[2]/label//text()'
        txtBox = response.xpath(rootPath)
        accountId = txtBox.xpath(accountIdPath).extract()
        accountName = txtBox.xpath('./div/div[2]/p[1]/a//text()').extract()
        url = txtBox.xpath('./div/div[2]/p[1]/a/@href').extract()[0]
        accountListItem['account'] = Utility.listToStr(accountName)
        accountListItem['url'] = url
        accountListItem['accountId'] = accountId[0].encode('utf-8')
        print accountListItem['account']
        print accountListItem['url']
        print accountListItem['accountId']
        # the account page is built by JavaScript, so render it with PhantomJS
        if url.startswith('http://'):
            url = 'https://' + url[len('http://'):]
        cmd = "phantomjs spider/getBody.js '%s'" % url
        print url
        print cmd
        stdout, stderr = subprocess.Popen(
            cmd, shell=True, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE).communicate()
        r = HtmlResponse(url=url, body=stdout)
        articleUrls = self.parseAccount(r)
        # reuse ArticleSpider's parser for the rendered article pages
        for url in articleUrls:
            yield scrapy.Request(url, callback=ArticleSpider().parseArticle)
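# --- Hypothetical sketch: spider/getBody.js is assumed to be a small
# PhantomJS script that loads the url given on its command line and prints
# page.content once rendering finishes. Passing an argument list instead of
# a shell string sidesteps quoting problems with unusual urls:
import subprocess


def render_with_phantomjs(url):
    p = subprocess.Popen(['phantomjs', 'spider/getBody.js', url],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, _ = p.communicate()
    return stdout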