Exemplo n.º 1
0
 def __init__(self, category=None, *args, **kwargs):
     self.sitekey = "sogou.com"
     self.temp_id = 0
     self.keysparser = PageKeysParser()
     self.itemsfounder = ItemsFounder()
     self.datamgr = DataMgr()
     self.start_urls = ["http://weixin.sogou.com"]
Exemplo n.º 2
0
 def __init__(self, category=None, *args, **kwargs):
     self.sitekey = "sogou.com"
     self.temp_id = 0
     self.keysparser = PageKeysParser();
     self.itemsfounder = ItemsFounder();
     self.datamgr = DataMgr();
     self.start_urls = ["http://weixin.sogou.com"]
Exemplo n.º 3
0
class SogouSpider(Spider):
    name = "sogou"
    allowed_domains = ["sogou.com"]

    def __init__(self, category=None, *args, **kwargs):
        self.sitekey = "sogou.com"
        self.temp_id = 0
        self.keysparser = PageKeysParser()
        self.itemsfounder = ItemsFounder()
        self.datamgr = DataMgr()
        self.start_urls = ["http://weixin.sogou.com"]

    def parse(self, response):
        print "连上:" + self.sitekey

        urlitems = []
        #1:爬内容页:
        urls = self.datamgr.geturls(sitekey=self.sitekey)
        for uitem in urls:
            url = uitem[2]
            urlitems.append(Request(url, callback=self.parse_siteurl))

        surlcount = len(urlitems)
        print "装载siteurl数量:", surlcount

        #2: 爬搜索页
        words = self.datamgr.getwords(8)
        for word in words:
            url = "http://weixin.sogou.com/weixin?query=" + word[1]
            urlitems.append(Request(url, callback=self.parse_rooturl))

        print "装载搜索页数量:", len(urlitems) - surlcount

        if (len(urlitems) == 0):
            print "没有任何下载url可爬"

        return urlitems

    def parse_rooturl(self, response):
        baseurl = response.url
        stemplate = self.datamgr.getsitetemplates(self.sitekey)
        nextsiteurls = []
        if (stemplate == None):
            found = self.keysparser.parse(response.body, baseurl)
            if (found):
                temp_id = self.datamgr.insertSiteTemplate(
                    self.sitekey, self.keysparser.itemskeys,
                    self.keysparser.pagingkeys)
                self.template = [
                    temp_id, self.keysparser.itemskeys,
                    self.keysparser.pagingkeys
                ]
                purls = self.datamgr.inserturls(self.keysparser.pagingurls,
                                                temp_id)
                nextsiteurls.extend(purls)
        else:
            self.template = [
                stemplate[0],
                eval(stemplate[2]),
                eval(stemplate[3])
            ]
            keys = [self.template[1], self.template[2]]
            found1, found2 = self.itemsfounder.find(response.body, baseurl,
                                                    keys, self.sitekey)
            if (found2):
                purls = self.datamgr.inserturls(self.itemsfounder.pagingurls,
                                                self.template[0])
                nextsiteurls.extend(purls)

        self.datamgr.updateurl(baseurl, temp_id=self.temp_id)
        itemurls = []
        for url in nextsiteurls:
            print url
            itemurls.append(Request(url, callback=self.parse_siteurl))
        return itemurls

    def parse_siteurl(self, response):
        baseurl = response.url
        self.datamgr.updateurl(baseurl, temp_id=self.template[0])
        keys = [self.template[1], self.template[2]]
        found1, found2 = self.itemsfounder.find(response.body, baseurl, keys,
                                                self.sitekey)
        items = []
        # if (found1):

        return items
Exemplo n.º 4
0
class SogouSpider(Spider):
    name = "sogou"
    allowed_domains = ["sogou.com"]

    def __init__(self, category=None, *args, **kwargs):
        self.sitekey = "sogou.com"
        self.temp_id = 0
        self.keysparser = PageKeysParser();
        self.itemsfounder = ItemsFounder();
        self.datamgr = DataMgr();
        self.start_urls = ["http://weixin.sogou.com"]

    def parse(self, response):
        print "连上:"+self.sitekey

        urlitems = []
        #1:爬内容页:
        urls = self.datamgr.geturls(sitekey=self.sitekey)
        for uitem in urls:
          url = uitem[2]
          urlitems.append(Request(url,callback=self.parse_siteurl))

        surlcount = len(urlitems)
        print "装载siteurl数量:",surlcount

        #2: 爬搜索页
        words = self.datamgr.getwords(8)
        for word in words:
            url = "http://weixin.sogou.com/weixin?query="+word[1]
            urlitems.append(Request(url,callback=self.parse_rooturl))

        print "装载搜索页数量:",len(urlitems)-surlcount

        if (len(urlitems)==0):
          print "没有任何下载url可爬"

        return urlitems

    def parse_rooturl(self, response):
      baseurl = response.url
      stemplate = self.datamgr.getsitetemplates(self.sitekey)
      nextsiteurls = []
      if (stemplate==None):
        found = self.keysparser.parse(response.body,baseurl)
        if (found):
          temp_id = self.datamgr.insertSiteTemplate(self.sitekey,self.keysparser.itemskeys,self.keysparser.pagingkeys)
          self.template = [temp_id,self.keysparser.itemskeys,self.keysparser.pagingkeys]
          purls = self.datamgr.inserturls(self.keysparser.pagingurls,temp_id)
          nextsiteurls.extend(purls)
      else:
        self.template = [stemplate[0],eval(stemplate[2]),eval(stemplate[3])]
        keys = [self.template[1],self.template[2]]
        found1,found2 = self.itemsfounder.find(response.body,baseurl,keys,self.sitekey)
        if (found2):
          purls = self.datamgr.inserturls(self.itemsfounder.pagingurls,self.template[0])
          nextsiteurls.extend(purls)

      self.datamgr.updateurl(baseurl,temp_id=self.temp_id)
      itemurls = []
      for url in nextsiteurls:
        print url
        itemurls.append(Request(url,callback=self.parse_siteurl))
      return itemurls

    def parse_siteurl(self, response):
      baseurl = response.url
      self.datamgr.updateurl(baseurl,temp_id=self.template[0])
      keys = [self.template[1],self.template[2]]
      found1,found2 = self.itemsfounder.find(response.body,baseurl,keys,self.sitekey)
      items = []
      # if (found1):

      return items