def process_response(self, request, response, spider):
    """Retry responses whose status code is listed in ``retry_http_codes``.

    The response is handed back untouched when the request carries a
    ``dont_retry`` meta key or when its status is not a retryable one.
    Otherwise the counter stored under ``self.genKey()`` in ``self.status``
    is bumped, the call sleeps for ``self.suspendTime`` seconds once that
    counter has reached ``self.maxExceptionTime`` (a falsy limit disables
    the pause), and the request is passed to ``self._retry``.

    Returns whatever ``self._retry`` produced when that is truthy, else
    the original ``response``.
    """
    should_retry = ('dont_retry' not in request.meta
                    and response.status in self.retry_http_codes)
    if not should_retry:
        return response

    # Record this failure and suspend once the limit has been reached.
    counter_key = self.genKey()
    incAttr(self.status, counter_key)
    if self.maxExceptionTime and self.status[counter_key] >= self.maxExceptionTime:
        time.sleep(self.suspendTime)

    reason = response_status_message(response.status)
    retried = self._retry(request, reason, spider)
    return retried or response
 def process_response(self, request, response, spider):
     """Hand retryable HTTP statuses to ``self._retry``, pausing when needed.

     Requests flagged with ``dont_retry`` in their meta, and responses
     whose status is not in ``self.retry_http_codes``, are returned as-is.
     For the rest, the counter at ``self.status[self.genKey()]`` is
     incremented via ``incAttr``; once it is at or above a truthy
     ``self.maxExceptionTime`` the call blocks for ``self.suspendTime``
     seconds before retrying.

     Returns the truthy result of ``self._retry`` or, failing that, the
     original ``response``.
     """
     if 'dont_retry' in request.meta or response.status not in self.retry_http_codes:
         return response

     # Count the failure first so the threshold test sees the new value.
     status_key = self.genKey()
     incAttr(self.status, status_key)
     limit_reached = (self.maxExceptionTime
                      and self.status[status_key] >= self.maxExceptionTime)
     if limit_reached:
         time.sleep(self.suspendTime)

     outcome = self._retry(request,
                           response_status_message(response.status),
                           spider)
     if outcome:
         return outcome
     return response
Пример #3
0
 def parse(self, response):
     """Yield a ``filterPost`` request for every board with a new last-post time.

     For each table row under the ``bodyarea`` divs, the board link is
     read from the second cell and the last-post text from the fourth
     cell.  Rows without a link are skipped silently.  Rows with a link
     but no non-blank time text are counted under ``ignoreBoardNum`` in
     ``self.stats`` and logged.  The last non-blank text fragment is
     passed through ``self.timeFormat`` and the board is requested only
     when ``self.isNewTime`` accepts the result.
     """
     for mainboard in response.xpath('//*[@id="bodyarea"]/div'):
         for board in mainboard.xpath('./table/tr'):
             links = board.xpath('(./td)[2]//a/@href').extract()
             if not links:
                 continue
             # Only the first link of the cell is crawled.
             url = links[0]
             # Build a real list instead of using filter(): on Python 3
             # filter() is lazy, so the emptiness test would never match
             # and the [-1] lookup below would raise TypeError.
             fragments = [piece
                          for piece in board.xpath('(./td)[4]//text()').extract()
                          if piece.strip()]
             if not fragments:
                 incAttr(self.stats, 'ignoreBoardNum')
                 log.msg('The board %s do not have time.' % url, level=log.ERROR)
                 continue
             last_post = self.timeFormat(fragments[-1].strip())
             if self.isNewTime(last_post):
                 yield Request(url=url, callback=self.filterPost)
Пример #4
0
    def filterPost(self, response):
        """Crawl one board page: fresh sub-boards, topic print pages, next page.

        Steps, in order:

        1. If the page has a child-board section, every child-board row
           (the first row is skipped as empty) with a link and a new
           last-post time is requested with this same callback.  Rows
           with a link but no time text are counted under
           ``ignoreSubboardNum`` and logged.
        2. The time of the first topic row is read (from ``div[3]`` when a
           child-board section exists, else ``div[2]``).  If it is not new
           according to ``self.isNewTime`` nothing more is yielded.
        3. Otherwise every link matching the topic URL pattern is requested
           in its ``action=printpage`` form with ``self.extractPost``, and
           the next board page (offset + 40) is requested while the current
           offset is below ``self.maxboardurl[prefix]``.

        NOTE(review): ``response.url`` is assumed to end in ``.<offset>``
        and ``self.genmax`` is assumed to populate ``self.maxboardurl``
        for the prefix; neither is checked here.
        """
        if response.xpath("//*[@id='bodyarea']/div[2][@style='margin-bottom: 3ex; ']"):
            # The page has child boards; the first table row is empty.
            for board in response.xpath('//*[@id="bodyarea"]/div[2]/table/tr')[1:]:
                links = board.xpath('(./td)[2]//a/@href').extract()
                if not links or not links[0]:
                    continue
                url = links[0]
                # A real list rather than filter(): on Python 3 filter()
                # is lazy and cannot be indexed with [-1].
                fragments = [piece
                             for piece in board.xpath('(./td)[4]//text()').extract()
                             if piece.strip()]
                if not fragments:
                    incAttr(self.stats, 'ignoreSubboardNum')
                    log.msg('The board %s do not have time.' % url, level=log.ERROR)
                    continue
                board_time = self.timeFormat(fragments[-1].strip())
                if self.isNewTime(board_time):
                    yield Request(url=url, callback=self.filterPost)
            timelist = response.xpath('//*[@id="bodyarea"]/div[3]/table/tr[2]/td[7]/span//text()').extract()
        else:
            timelist = response.xpath('//*[@id="bodyarea"]/div[2]/table/tr[2]/td[7]/span//text()').extract()

        if not timelist:
            # No topic row / time cell on this page: report it instead of
            # failing with IndexError on timelist[0].
            log.msg('The board %s do not have time.' % response.url, level=log.ERROR)
            return

        # A six-fragment cell carries the time in its third fragment;
        # otherwise the first fragment is used.
        if len(timelist) == 6:
            page_time = self.timeFormat(timelist[2].strip())
        else:
            page_time = self.timeFormat(timelist[0].strip())
        if not self.isNewTime(page_time):
            return

        # Compiled once per page rather than once per link.
        topic_pattern = re.compile(r"https://bitcointalk\.org/index\.php\?topic=\d+\.0$")
        for url in response.xpath('//a/@href').extract():
            if topic_pattern.match(url):
                printurl = '?action=printpage;'.join(url.rsplit('?', 1))
                yield Request(url=printurl, callback=self.extractPost)

        # Generate the next board page URL: same prefix, offset + 40.
        prefix, offset = response.url.rsplit('.', 1)
        offset = int(offset)
        if prefix not in self.maxboardurl:
            self.genmax(response)
        max_offset = self.maxboardurl[prefix]
        nexturl = ''.join([prefix, '.', str(offset + 40)])
        if offset < max_offset:
            yield Request(url=nexturl, callback=self.filterPost)
Пример #5
0
 def extractUser(self, response):
     """Build a ``User`` item from the rows of a profile table.

     Each ``<tr>`` of the matching table is reduced to its non-blank text
     fragments.  The first fragment is treated as the row label and the
     remaining fragments as the value.  The label is matched, in order,
     against a fixed list of fragments and the value list is stored in
     the corresponding item field (``None`` when the row has a label but
     no value).

     Rows with no text are counted under ``ignoreUserNum``; rows whose
     label is not recognised are counted under ``ignoreUserAttrNum``.
     Both are logged.

     Returns the populated item, or ``None`` (after counting and logging)
     when no field ended up with a value.
     """
     # (label fragment, item field), checked in this order; the first
     # fragment contained in the row label wins.
     label_fields = (
         ("Name", "name"),
         ("Posts", "posts"),
         ("Activity", "activity"),
         ("Position", "position"),
         ("Date Registered", "registerDate"),
         ("Last Active", "lastDate"),
         ("Email: ", "Email"),
         ("Gender", "gender"),
         ("Age", "age"),
         ("Signature", "bitcoinAddress"),
     )

     def report(counter):
         # Both placeholders need a tuple; passing the values separately
         # raises "not enough arguments for format string".
         incAttr(self.stats, counter)
         log.msg('%s do not extract info in %s!' % (response.body, response.url),
                 level=log.ERROR)

     user = User()
     rows = response.xpath(
         "//table[@border = '0'  and @cellpadding = '2']/tr")
     for row in rows:
         # List comprehension instead of filter(unicode.strip, ...) so the
         # result is a list on both Python 2 and Python 3.
         text = [piece for piece in row.xpath(".//text()").extract()
                 if piece.strip()]
         if not text:
             report('ignoreUserNum')
             continue
         label, values = text[0], text[1:]
         for fragment, field in label_fields:
             if fragment in label:
                 # The original tested the builtin ``len`` instead of the
                 # fragment count; a label without a value stores None.
                 user[field] = values or None
                 break
         else:
             report('ignoreUserAttrNum')

     # The original compared the values against thirteen empty dicts,
     # which could never match.
     # NOTE(review): interpreted as "no field holds a value" -- confirm.
     if not any(dict(user).values()):
         report('ignoreUserNum')
         return
     return user
 def extractUser(self, response):
     """Build a ``User`` item from the rows of a profile table.

     Each ``<tr>`` of the matching table is reduced to its non-blank text
     fragments.  The first fragment is treated as the row label and the
     remaining fragments as the value.  The label is matched, in order,
     against a fixed list of fragments and the value list is stored in
     the corresponding item field (``None`` when the row has a label but
     no value).

     Rows with no text are counted under ``ignoreUserNum``; rows whose
     label is not recognised are counted under ``ignoreUserAttrNum``.
     Both are logged.

     Returns the populated item, or ``None`` (after counting and logging)
     when no field ended up with a value.
     """
     # (label fragment, item field), checked in this order; the first
     # fragment contained in the row label wins.
     label_fields = (
         ("Name", "name"),
         ("Posts", "posts"),
         ("Activity", "activity"),
         ("Position", "position"),
         ("Date Registered", "registerDate"),
         ("Last Active", "lastDate"),
         ("Email: ", "Email"),
         ("Gender", "gender"),
         ("Age", "age"),
         ("Signature", "bitcoinAddress"),
     )

     def report(counter):
         # Both placeholders need a tuple; passing the values separately
         # raises "not enough arguments for format string".
         incAttr(self.stats, counter)
         log.msg('%s do not extract info in %s!' % (response.body, response.url),
                 level=log.ERROR)

     user = User()
     rows = response.xpath("//table[@border = '0'  and @cellpadding = '2']/tr")
     for row in rows:
         # List comprehension instead of filter(unicode.strip, ...) so the
         # result is a list on both Python 2 and Python 3.
         text = [piece for piece in row.xpath(".//text()").extract()
                 if piece.strip()]
         if not text:
             report('ignoreUserNum')
             continue
         label, values = text[0], text[1:]
         for fragment, field in label_fields:
             if fragment in label:
                 # The original tested the builtin ``len`` instead of the
                 # fragment count; a label without a value stores None.
                 user[field] = values or None
                 break
         else:
             report('ignoreUserAttrNum')

     # The original compared the values against thirteen empty dicts,
     # which could never match.
     # NOTE(review): interpreted as "no field holds a value" -- confirm.
     if not any(dict(user).values()):
         report('ignoreUserNum')
         return
     return user