Example #1
import shelve
import subprocess
from scrapy.selector import HtmlXPathSelector

def catg():
    # "cat.db" is assumed to hold a dict of category ids under the key 'dict'
    # (a sketch of how it might be populated follows the function).
    sh = shelve.open("cat.db", writeback=False)['dict']
    urls = ['http://www.amazon.com/s/ref=sr_pg_2?rh=n%3A11091801%2Ck%3A%22&page=1&keywords=%22&ie=UTF8&qid=1375737123']
    # fil = open(fi[-1].split('=')[1], "w")  # `fi` is not defined in this snippet and `fil` is never used
    for i in sh:
        print i
        # 'mi' is (presumably) the category-id placeholder in the base URL
        urls.append(urls[0].replace('mi', str(i)))

    # print urls
    for ii in urls:
        # On Windows you can use requests instead (it works on both Windows and Linux,
        # but can sometimes be unreliable and slow):
        # a = requests.get(ii, headers={'User-Agent': 'Mozilla/5.0'}).text
        # Shelling out to `scrapy fetch` looks odd, but it has proven quite reliable.
        a = subprocess.check_output("scrapy fetch '%s'" % ii, shell=True)
        hxs = HtmlXPathSelector(text=a)
        for p in hxs.select('//span'):
            if 'class="pagnDisabled"' in p.extract():
                # the greyed-out "last page" button carries the total page count
                n = int(p.select('text()').extract()[0]) + 1
                if n > 1:
                    for ra in range(2, n):
                        pag = ii.replace('page=1', 'page=%s' % str(ra))
                        aa = subprocess.check_output("scrapy fetch '%s'" % pag, shell=True)
                        hxs_page = HtmlXPathSelector(text=aa)

                        li = []
                        for d in hxs_page.select('//a').extract():  # find the right class
                            if 'ilo2 ilc2' in d:
                                li.append(d)

                        lin = []
                        for dd in li:
                            tt = HtmlXPathSelector(text=dd)
                            lin.append(tt.select('//a/@href').extract()[0])

                        for it in lin:
                            aaa = subprocess.check_output("scrapy fetch '%s'" % it, shell=True)
                            ttt = HtmlXPathSelector(text=aaa)
                            rev = ttt.select('//a')
                            revi = ""
                            for ite in rev:
                                if "customer reviews" in ite.extract():
                                    revv = HtmlXPathSelector(text=ite.extract())
                                    revi = revv.select('//a/@href').extract()[0]
                                    break

                            if not revi:  # no "customer reviews" link on this product page
                                continue
                            revie = HtmlXPathSelector(text=subprocess.check_output("scrapy fetch '%s'" % revi, shell=True))
                            for renn in revie.select('//a/@href').extract():
                                if "profile" in renn:
                                    # scrap_prof() is the profile scraper, defined elsewhere
                                    scrap_prof("http://www.amazon.com/" + renn)
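
For reference, a minimal sketch of how the "cat.db" shelve used by catg() might be populated. The filename, the 'dict' key and the idea that it maps category ids are taken from the code above; the helper name save_categories and the example ids are my own (11091801 is the node id that appears in the example URL, the other is just a placeholder):

import shelve

def save_categories(category_ids, db_path="cat.db"):
    # catg() only iterates the keys of db['dict'], so the values are unused here
    db = shelve.open(db_path, writeback=False)
    try:
        db['dict'] = dict((cid, True) for cid in category_ids)
    finally:
        db.close()

save_categories([11091801, 123456789])  # 123456789 is a placeholder id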
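
And since the repeated subprocess.check_output("scrapy fetch ...") calls are the part most likely to fail intermittently, they could be wrapped in a small retry helper. This is only a sketch of the same approach; the name fetch and the retry/delay parameters are my own additions:

import subprocess
import time

def fetch(url, retries=3, delay=2):
    # Same `scrapy fetch` call as in catg(), retried a few times before giving up.
    for attempt in range(retries):
        try:
            return subprocess.check_output("scrapy fetch '%s'" % url, shell=True)
        except subprocess.CalledProcessError:
            time.sleep(delay)
    return ""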