def process_response(self, request, response, spider):
    """Retry a response whose HTTP status is listed in ``self.retry_http_codes``.

    Every retryable response is first recorded in ``self.status`` under the
    key returned by ``self.genKey()``.  When ``self.maxExceptionTime`` is set
    and the counter for that key has reached it, the call sleeps for
    ``self.suspendTime`` seconds before scheduling the retry.

    :param request: request that produced ``response``; its ``meta`` mapping
        may carry a ``dont_retry`` flag.
    :param response: response to inspect (``status`` is read).
    :param spider: spider passed through to ``self._retry``.
    :returns: ``response`` unchanged when retrying is disabled or the status
        is not retryable; otherwise whatever ``self._retry`` returns, falling
        back to ``response`` when that result is falsy.
    """
    # Look at the flag's value rather than mere key presence, so an explicit
    # ``dont_retry=False`` still lets the request be retried.
    if request.meta.get('dont_retry', False):
        return response
    if response.status not in self.retry_http_codes:
        return response

    # Record the failure, then suspend once the counter reaches the limit.
    # ``incAttr`` presumably increments ``self.status[key]`` -- defined
    # elsewhere, confirm.
    key = self.genKey()
    incAttr(self.status, key)
    if self.maxExceptionTime and self.status[key] >= self.maxExceptionTime:
        # NOTE(review): this is a blocking sleep; if it runs on the crawler's
        # event-loop thread it pauses all other work too -- confirm intended.
        time.sleep(self.suspendTime)

    reason = response_status_message(response.status)
    return self._retry(request, reason, spider) or response
def process_response(self, request, response, spider):
    """Retry a response whose HTTP status is listed in ``self.retry_http_codes``.

    Every retryable response is first recorded in ``self.status`` under the
    key returned by ``self.genKey()``.  When ``self.maxExceptionTime`` is set
    and the counter for that key has reached it, the call sleeps for
    ``self.suspendTime`` seconds before scheduling the retry.

    :param request: request that produced ``response``; its ``meta`` mapping
        may carry a ``dont_retry`` flag.
    :param response: response to inspect (``status`` is read).
    :param spider: spider passed through to ``self._retry``.
    :returns: ``response`` unchanged when retrying is disabled or the status
        is not retryable; otherwise whatever ``self._retry`` returns, falling
        back to ``response`` when that result is falsy.
    """
    # Look at the flag's value rather than mere key presence, so an explicit
    # ``dont_retry=False`` still lets the request be retried.
    if request.meta.get('dont_retry', False):
        return response
    if response.status not in self.retry_http_codes:
        return response

    # Record the failure, then suspend once the counter reaches the limit.
    # ``incAttr`` presumably increments ``self.status[key]`` -- defined
    # elsewhere, confirm.
    key = self.genKey()
    incAttr(self.status, key)
    if self.maxExceptionTime and self.status[key] >= self.maxExceptionTime:
        # NOTE(review): this is a blocking sleep; if it runs on the crawler's
        # event-loop thread it pauses all other work too -- confirm intended.
        time.sleep(self.suspendTime)

    reason = response_status_message(response.status)
    return self._retry(request, reason, spider) or response
def parse(self, response):
    """Yield a request for every board row whose latest activity is new.

    Walks each table row under the ``#bodyarea`` divs.  The first link in the
    row's second cell is the board URL; the last non-blank text node of the
    fourth cell is passed through ``self.timeFormat`` and checked with
    ``self.isNewTime``.  Rows without a link are skipped silently; rows
    without any time text are counted under ``ignoreBoardNum`` and logged.

    :param response: page object exposing ``xpath()``.
    :returns: generator of ``Request`` objects handled by ``self.filterPost``.
    """
    for mainboard in response.xpath('//*[@id="bodyarea"]/div'):
        for board in mainboard.xpath('./table/tr'):
            links = board.xpath('(./td)[2]//a/@href').extract()
            if not links:
                continue
            # Only the first link found in the cell is followed.
            url = links[0]

            # A list comprehension (instead of ``filter``) always yields a
            # real list, so the emptiness test and ``[-1]`` below behave the
            # same on Python 2 and Python 3.
            stamps = [
                text
                for text in board.xpath('(./td)[4]//text()').extract()
                if text.strip()
            ]
            if not stamps:
                incAttr(self.stats, 'ignoreBoardNum')
                log.msg('The board %s do not have time.' % url,
                        level=log.ERROR)
                continue

            last_active = self.timeFormat(stamps[-1].strip())
            if self.isNewTime(last_active):
                yield Request(url=url, callback=self.filterPost)
def filterPost(self, response):
    """Crawl one board page: child boards, topic print pages and the next page.

    1. If the page has a child-board section, every child-board row (the
       first row is skipped) with a link and a new last-activity time is
       requested with this same callback.
    2. The time text of the first topic row is read (from ``div[3]`` when a
       child-board section exists, ``div[2]`` otherwise).  If it is not new,
       nothing else is yielded.
    3. Otherwise every link matching the ``topic=<id>.0`` pattern is requested
       in its ``?action=printpage;`` form via ``self.extractPost``, and the
       next board page (numeric URL suffix + 40) is requested while the
       current suffix is below ``self.maxboardurl[<url prefix>]``.

    :param response: page object exposing ``xpath()`` and ``url``.
    :returns: generator of ``Request`` objects.
    """
    child_section = response.xpath(
        "//*[@id='bodyarea']/div[2][@style='margin-bottom: 3ex; ']")
    if child_section:
        rows = response.xpath('//*[@id="bodyarea"]/div[2]/table/tr')
        # The first row carries no board (original note: "list[0] is empty").
        for board in rows[1:]:
            links = board.xpath('(./td)[2]//a/@href').extract()
            if not links or not links[0]:
                continue
            url = links[0]

            # List comprehension instead of ``filter`` so the result is a
            # real list on Python 3 as well.
            stamps = [
                text
                for text in board.xpath('(./td)[4]//text()').extract()
                if text.strip()
            ]
            if not stamps:
                incAttr(self.stats, 'ignoreSubboardNum')
                log.msg('The board %s do not have time.' % url,
                        level=log.ERROR)
                continue

            if self.isNewTime(self.timeFormat(stamps[-1].strip())):
                yield Request(url=url, callback=self.filterPost)

        topic_time_xpath = (
            '//*[@id="bodyarea"]/div[3]/table/tr[2]/td[7]/span//text()')
    else:
        topic_time_xpath = (
            '//*[@id="bodyarea"]/div[2]/table/tr[2]/td[7]/span//text()')

    timelist = response.xpath(topic_time_xpath).extract()
    if not timelist:
        # Without this guard ``timelist[0]`` below raises IndexError on a
        # page whose first topic row has no time text.
        log.msg('The board %s do not have time.' % response.url,
                level=log.ERROR)
        return

    # When exactly six text nodes are present the time is the third one;
    # otherwise the first node is used.
    raw_time = timelist[2] if len(timelist) == 6 else timelist[0]
    if not self.isNewTime(self.timeFormat(raw_time.strip())):
        return

    # NOTE(review): the topic requests and the next-page request are both
    # assumed to depend on the freshness check above -- confirm.

    # Compiled once per call (not once per link); the raw string keeps the
    # pattern identical while avoiding invalid-escape warnings.
    topic_url = re.compile(
        r"https://bitcointalk\.org/index\.php\?topic=\d+\.0$")
    for url in response.xpath('//a/@href').extract():
        if topic_url.match(url):
            # Insert the print-page action right after the last '?'.
            printurl = '?action=printpage;'.join(url.rsplit('?', 1))
            yield Request(url=printurl, callback=self.extractPost)

    # Build the next board page URL from the numeric suffix after the last
    # dot.  The step of 40 is presumably the number of topics per page.
    prefix, offset = response.url.rsplit('.', 1)
    offset = int(offset)
    if prefix not in self.maxboardurl:
        # ``genmax`` is expected to fill ``self.maxboardurl[prefix]``.
        self.genmax(response)
    if offset < self.maxboardurl[prefix]:
        nexturl = ''.join([prefix, '.', str(offset + 40)])
        yield Request(url=nexturl, callback=self.filterPost)
def extractUser(self, response):
    """Build a ``User`` item from the rows of a profile table.

    Each ``<tr>`` of the table matched by the XPath below is reduced to its
    non-blank text nodes.  The first node is treated as the row label and the
    remaining nodes as the value.  The label is matched by substring against
    a fixed, ordered list; the first match decides which item field receives
    the value list (``None`` when the row has a label but no value).

    Rows with no text are counted under ``ignoreUserNum``; rows whose label
    matches nothing are counted under ``ignoreUserAttrNum``.  Both are logged.

    :param response: page object exposing ``xpath()``, ``body`` and ``url``.
    :returns: the populated ``User`` item, or ``None`` when no field could be
        extracted at all.
    """
    # (label substring, item field) pairs -- order matters, first match wins.
    label_fields = (
        ("Name", "name"),
        ("Posts", "posts"),
        ("Activity", "activity"),
        ("Position", "position"),
        ("Date Registered", "registerDate"),
        ("Last Active", "lastDate"),
        ("Email: ", "Email"),
        ("Gender", "gender"),
        ("Age", "age"),
        # The signature row's text is stored under ``bitcoinAddress``.
        ("Signature", "bitcoinAddress"),
    )

    user = User()
    rows = response.xpath("//table[@border = '0' and @cellpadding = '2']/tr")
    for row in rows:
        # Keep the original (unstripped) strings, dropping blank-only nodes.
        text = [
            node for node in row.xpath(".//text()").extract() if node.strip()
        ]
        if not text:
            # NOTE(review): a row without text is counted as an ignored
            # *user*, mirroring the original counters -- confirm intended.
            incAttr(self.stats, 'ignoreUserNum')
            log.msg('%s do not extract info in %s!'
                    % (response.body, response.url), level=log.ERROR)
            continue

        label, values = text[0], text[1:]
        for needle, field in label_fields:
            if needle in label:
                # A label with nothing after it is recorded as None.
                user[field] = values or None
                break
        else:
            incAttr(self.stats, 'ignoreUserAttrNum')
            log.msg('%s do not extract info in %s!'
                    % (response.body, response.url), level=log.ERROR)

    # Nothing extracted: either no field was set, or every field still holds
    # an empty ``{}`` placeholder (the state the original comparison against
    # a list of thirteen ``{}`` looked for).
    if all(value == {} for value in dict(user).values()):
        incAttr(self.stats, 'ignoreUserNum')
        log.msg('%s do not extract info in %s!'
                % (response.body, response.url), level=log.ERROR)
        return None
    return user
def extractUser(self, response):
    """Build a ``User`` item from the rows of a profile table.

    Each ``<tr>`` of the table matched by the XPath below is reduced to its
    non-blank text nodes.  The first node is treated as the row label and the
    remaining nodes as the value.  The label is matched by substring against
    a fixed, ordered list; the first match decides which item field receives
    the value list (``None`` when the row has a label but no value).

    Rows with no text are counted under ``ignoreUserNum``; rows whose label
    matches nothing are counted under ``ignoreUserAttrNum``.  Both are logged.

    :param response: page object exposing ``xpath()``, ``body`` and ``url``.
    :returns: the populated ``User`` item, or ``None`` when no field could be
        extracted at all.
    """
    # (label substring, item field) pairs -- order matters, first match wins.
    label_fields = (
        ("Name", "name"),
        ("Posts", "posts"),
        ("Activity", "activity"),
        ("Position", "position"),
        ("Date Registered", "registerDate"),
        ("Last Active", "lastDate"),
        ("Email: ", "Email"),
        ("Gender", "gender"),
        ("Age", "age"),
        # The signature row's text is stored under ``bitcoinAddress``.
        ("Signature", "bitcoinAddress"),
    )

    user = User()
    rows = response.xpath("//table[@border = '0' and @cellpadding = '2']/tr")
    for row in rows:
        # Keep the original (unstripped) strings, dropping blank-only nodes.
        text = [
            node for node in row.xpath(".//text()").extract() if node.strip()
        ]
        if not text:
            # NOTE(review): a row without text is counted as an ignored
            # *user*, mirroring the original counters -- confirm intended.
            incAttr(self.stats, 'ignoreUserNum')
            log.msg('%s do not extract info in %s!'
                    % (response.body, response.url), level=log.ERROR)
            continue

        label, values = text[0], text[1:]
        for needle, field in label_fields:
            if needle in label:
                # A label with nothing after it is recorded as None.
                user[field] = values or None
                break
        else:
            incAttr(self.stats, 'ignoreUserAttrNum')
            log.msg('%s do not extract info in %s!'
                    % (response.body, response.url), level=log.ERROR)

    # Nothing extracted: either no field was set, or every field still holds
    # an empty ``{}`` placeholder (the state the original comparison against
    # a list of thirteen ``{}`` looked for).
    if all(value == {} for value in dict(user).values()):
        incAttr(self.stats, 'ignoreUserNum')
        log.msg('%s do not extract info in %s!'
                % (response.body, response.url), level=log.ERROR)
        return None
    return user