예제 #1
0
class rebot_obj:
    """Walk the top.chinaz.com site listing and collect (domain, title) pairs.

    State is kept on the instance so a caller can inspect progress:

    * ``url``        -- URL template; ``%d`` is replaced by the page number.
    * ``curUrl``     -- URL of the page currently being processed.
    * ``pageNum``    -- next page number to fetch (starts at 1).
    * ``curRequest`` -- the ``Requset`` object used for the current page.
    * ``urlList``    -- accumulated ``(domain, domainTitle)`` tuples.
    """

    # run() stops once pageNum reaches this value, i.e. pages
    # 1 .. MAX_PAGE - 1 are fetched.
    MAX_PAGE = 300

    def __init__(self):
        self.url = "http://top.chinaz.com/list.aspx?p=%d"
        self.curUrl = None
        self.pageNum = 1
        self.curRequest = None
        self.urlList = []

    def run(self):
        """Fetch every listing page from ``pageNum`` up to ``MAX_PAGE - 1``.

        For each ``<a>`` found under ``div.info/h3`` the link text and the
        rewritten ``href`` are turned into a ``(domain, title)`` tuple and
        appended to ``self.urlList``.  A failure on a single entry is
        printed and skipped so that one bad link does not stop the crawl.
        """
        while self.pageNum < self.MAX_PAGE:
            self.curUrl = self.url % self.pageNum
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(self.curUrl)
            # NOTE(review): the second argument is presumably a timeout or
            # retry count -- confirm against the Requset definition.
            self.curRequest = Requset(self.curUrl, 10)
            self.curRequest.run()
            doc = self.curRequest.get_doc()

            if doc is None:
                # NOTE(review): assumes get_doc() yields None when the fetch
                # or parse failed -- confirm against Requset.  Without this
                # guard a missing document raised AttributeError outside the
                # per-entry try block and aborted the whole crawl.
                print("parse_web.rebot_obj.run: no document for %s"
                      % self.curUrl)
                infoTag = []
            else:
                infoTag = doc.xpath("//div[@class='info']/h3/a")

            for info in infoTag:
                try:
                    domainTitle = info.text
                    url = info.get("href")
                    # Listing hrefs look like "/site_<host>.html"; rewrite
                    # them into "http://<host>/".
                    url = url.replace("/site_", "http://").replace(".html", "/")
                    # getRootDomain is read as an attribute, not called
                    # (presumably a property on url_object).
                    domain = url_object(url).getRootDomain
                    self.urlList.append((domain, domainTitle))
                except Exception as e:
                    # Best-effort: report the bad entry and keep going.
                    print("parse_web.rebot_obj.run: %s" % e)

            self.pageNum += 1
예제 #2
0
파일: parse_web.py 프로젝트: cash2one/dark
class rebot_obj():
    """Crawler for the top.chinaz.com ranking list.

    Each listing page is requested in turn and every site entry found on it
    is stored in ``self.urlList`` as a ``(domain, domainTitle)`` tuple.
    ``self.pageNum`` holds the next page to request, ``self.curUrl`` and
    ``self.curRequest`` describe the page currently in flight.
    """

    # Exclusive upper bound for pageNum; run() fetches pages below it.
    MAX_PAGE = 300

    def __init__(self):
        # '%d' is substituted with the page number in run().
        self.url = 'http://top.chinaz.com/list.aspx?p=%d'
        self.curUrl = None
        self.pageNum = 1
        self.curRequest = None
        self.urlList = []

    def run(self):
        """Request pages ``pageNum`` .. ``MAX_PAGE - 1`` and harvest entries.

        Entries are the ``<a>`` elements matched by
        ``//div[@class='info']/h3/a``.  Errors raised while handling one
        entry are printed and that entry is skipped.
        """
        while self.pageNum < self.MAX_PAGE:
            self.curUrl = self.url % self.pageNum
            # print(...) with one argument is valid on Python 2 and 3 alike.
            print(self.curUrl)
            # NOTE(review): meaning of the literal 10 is not visible here
            # (timeout? retries?) -- confirm against Requset.
            self.curRequest = Requset(self.curUrl, 10)
            self.curRequest.run()
            doc = self.curRequest.get_doc()

            if doc is None:
                # NOTE(review): assumes a failed request makes get_doc()
                # return None -- confirm.  Skip the page instead of letting
                # AttributeError escape and end the crawl.
                print('parse_web.rebot_obj.run: no document for %s'
                      % self.curUrl)
                infoTag = []
            else:
                infoTag = doc.xpath("//div[@class='info']/h3/a")

            for info in infoTag:
                try:
                    domainTitle = info.text
                    url = info.get('href')
                    # "/site_<host>.html" -> "http://<host>/"
                    url = url.replace("/site_",
                                      "http://").replace(".html", "/")
                    # Attribute access, not a call (presumably a property).
                    domain = url_object(url).getRootDomain
                    self.urlList.append((domain, domainTitle))
                except Exception as e:
                    # Keep crawling; just report the entry that failed.
                    print('parse_web.rebot_obj.run: %s' % e)

            self.pageNum += 1
예제 #3
0
 def get_script_content_in_js(self):
     """
     Description: fetch script content from js files.

     Every entry of ``self.get_script_tag_js_list`` is requested through
     ``Requset``; each text result that is not None is collected, in
     iteration order, into the returned list.
     """
     collected = []
     for js_url in self.get_script_tag_js_list:
         fetcher = Requset(js_url, 1)
         fetcher.run()
         body = fetcher.get_text()
         if body is None:
             # Nothing was retrieved for this file; leave it out.
             continue
         collected.append(body)
     return collected
예제 #4
0
 def get_style_content_in_css(self):
     """
     Description: fetch style content from css stylesheets.

     Every entry of ``self.get_link_tag_css_list`` is requested through
     ``Requset``; each text result that is not None is collected, in
     iteration order, into the returned list.
     """
     collected = []
     for css_url in self.get_link_tag_css_list:
         fetcher = Requset(css_url, 1)
         fetcher.run()
         body = fetcher.get_text()
         if body is None:
             # Nothing was retrieved for this stylesheet; leave it out.
             continue
         collected.append(body)
     return collected