def parse_item(self, response):
    self.log('Hi, this is an item page! %s' % response.url)
    # r = redis.Redis(host='localhost', port=6379, db=0)
    self.r.set(response.url, 1)
    sel = Selector(response)
    item = XidianinfoItem()
    urlParts = response.url.strip().split('/')
    item['newsType'] = urlParts[-2]
    item['newsId'] = urlParts[-1][:-4]
    item['newsTitle'] = sel.xpath('//td[@class="titlestyle1040"]/text()').extract_first()
    item['newsTime'] = sel.xpath('//span[@class="timestyle1040"]/text()').extract_first().strip()
    item['newsFrom'] = sel.xpath('//span[@class="authorstyle1040"]/text()').extract_first().strip()
    item['newsContent'] = sel.xpath('//div[@class="c1040_content"]//p').extract_first()
    image_urls = response.xpath('//div[@class="c1040_content"]/div/p/img/@src').extract()
    item['image_urls'] = []
    for image_url in image_urls:
        item['image_urls'].append(image_url.replace('../..', 'http://info.xidian.edu.cn'))
    # Debug output; plain str concatenation keeps this working on Python 3.
    print('============================')
    print(item['newsType'].strip() + '---------------------')
    print(item['newsId'].strip() + '---------------------')
    print(item['newsTitle'].strip() + '---------------------')
    print(item['newsTime'].strip() + '---------------------')
    print(item['newsFrom'].strip() + '---------------------')
    print(item['newsContent'].strip() + '---------------------')
    return item
def parse_img(self, response):
    sel = Selector(response)
    for divs in sel.xpath("//div[@class='pic-meinv']"):
        # Create a fresh item per div; yielding one shared, mutated item
        # would make every yielded result point at the last value.
        urlItem = MeituItem()
        img_url = divs.xpath("a/img[@class='pic-large']/@src").extract()
        urlItem['image_urls'] = img_url
        yield urlItem
def parse_person(self, response):
    person = response.meta["person"]
    person["source_url"] = response.url
    # Connect person to division
    division_role = DivisionRole()
    division_role["source_url"] = response.meta["division_url"]
    division_role["name"] = "Leiter"  # WARNING: hard-coded value
    division_role["person_url"] = response.url
    division_role["division_url"] = response.meta["division_url"]
    yield division_role
    # TODO Ask students for other fields to parse here
    yield person
    return  # Don't do publications while the parsing is broken

    # Parse publication list
    sel = Selector(response)
    publications_list = sel.css(".gs_publication > .gs_publication_list .gs_publication_list")
    current_publication_type = None
    # Remove the fragment (whether or not it exists) and add the fragment separator
    source_url_base = response.url.split("#")[0] + "#"
    for item in publications_list:
        current_publication_type = join(item.xpath("h3/text()").extract(), "")
        for pub_item in item.xpath("p"):
            publication = self.create_publication(pub_item, current_publication_type, source_url_base)
            # TODO remove person from publication["author_names"] and set publication["author_ids"] instead.
            if publication:
                yield publication
def parse(self, response):
    sel = Selector(response)
    title = sel.xpath('//h1/text()').extract()[0]
    title = polishTitle(title, self.name)
    print(title)
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    dd = sel.xpath('//dl/dd')
    id = 0
    for d in dd:
        id += 1
        # Reorder ids within each row of three (// keeps this an int on Python 3)
        nid = ((id - 1) // 3 + 1) * 3 - (id - 1) % 3
        a = d.xpath('a')
        if len(a) == 0:
            continue
        url = a.xpath('@href').extract()[0]
        url = response.urljoin(url.strip())
        subtitle = a.xpath('text()').extract()[0]
        subtitle = polishSubtitle(subtitle)
        print(url)
        print(subtitle)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = nid
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse_overview(self, response):
    """ Parse start page, branching out to each research subject """
    sel = Selector(response)
    for link in sel.css("#c12546 li a"):
        url = join(link.xpath("@href").extract(), "")
        url = self.fix_url(url, response.url)
        yield Request(url, callback=self.parse_research)
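# Several snippets above (parse_person, parse_overview) call a free-standing
# join(seq, sep) instead of sep.join(seq), in the style of Python 2's
# string.join. The snippets don't show where it comes from, so the helper
# below is only a minimal sketch of what is presumably meant (an assumption,
# not code from the original projects):
def join(parts, sep=""):
    # Concatenate the strings extracted by an XPath query; an empty
    # extract() result simply yields "".
    return sep.join(parts)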
def parse_disease(self, response):
    """
    target_url = response.meta['url']
    if(not target_url == response.url):
        request = Request(target_url, callback=self.parse_disease)
        request.meta['url'] = target_url
        yield request
    """
    # if not response.status == 200:
    #     1+1
    sel = Selector(response)
    disease_title = sel.xpath('//div[@id="printMe"]/h1/text()').extract()[0]
    # context = sel.xpath('//*[@id="div_nest"]').extract()[0]
    context = ''
    temp = ''.join([stuff + '\n' for stuff in sel.xpath('//div[@id="printMe"]//span[@id="info576"]/*[self::p or self::ol or self::ul]//text()').extract()])
    if temp:
        context = html2text(temp)
    drugs_and_stuff = ''.join([stuff + '\n' for stuff in sel.xpath('//a[@class="rest_data_list"]/text()').extract()])
    # time.sleep(5)
    yield DiseaseDescription(
        url=response.url,
        name=disease_title,
        description=context,
        drugs=drugs_and_stuff,
    )
def parse_drug(self, response):
    sel = Selector(response)
    drug_name = sel.xpath('//div[@id="printMe"]/h1/text()').extract()[0]
    context = sel.xpath('//div[@id="printMe"]').extract()[0]
    classification = ''
    all_subheads = sel.xpath('//div[@id="printMe"]/h2/text()').extract()
    description, usage, contra, side, overdose = '', '', '', '', ''
    for i, subhead in enumerate(all_subheads):
        n = i + 1
        if u"Лекарственная форма, состав, упаковка" in subhead:  # "Dosage form, composition, packaging"
            description = ''.join(self.p_between_id(n, sel))
        elif u"Режим дозирования" in subhead or u"Показания к применению" in subhead:  # "Dosage regimen" / "Indications"
            usage = ''.join(self.p_between_id(n, sel))
        elif u"Противопоказания" in subhead:  # "Contraindications"
            contra = ''.join(self.p_between_id(n, sel))
        elif u"Побочные действия" in subhead:  # "Side effects"
            side = ''.join(self.p_between_id(n, sel))
        elif u"Передозировка при приёме" in subhead:  # "Overdose"
            overdose = ''.join(self.p_between_id(n, sel))
    # time.sleep(5)
    yield DrugDescription(
        url=response.url,
        name=drug_name,
        classification=classification,
        description=description,
        usage=usage,
        contra=contra,
        side=side,
        overdose=overdose,
        info=html2text(context),
    )
def parse(self, response):
    items = []
    filename = 'pic_url.txt'
    pic_url = open(filename, 'w')  # text mode: the writes below pass str, not bytes
    pic_url.write("Start")
    sel = Selector(response)
    pics = sel.xpath('//*[@id="comments"]/ol/li')
    pic_url.write('Version: 0.2')
    for pic in pics:
        ooRate = pic.xpath('div[1]/div/div[2]/*[@class="vote"]/span[2]/text()').extract()
        if len(ooRate) > 0 and int(ooRate[0]) > 100:
            item = PatuItem()
            item['support_rate'] = ooRate
            pic_url.write(str(pic.xpath('div[1]/div/div[2]/p/img/@src').extract()) + '\n')
            if pic.xpath('div[1]/div/div[2]/p/img/@org_src'):
                item['image_urls'] = pic.xpath('div[1]/div/div[2]/p/img/@org_src').extract()
            else:
                item['image_urls'] = pic.xpath('div[1]/div/div[2]/p/img/@src').extract()
            item['images'] = ''
            # print(pic.xpath('div[1]/div/div[2]/p/img/@src').extract())
            items.append(item)
            yield item
    pic_url.write("End")
    pic_url.close()
def parse(self, response):
    sel = Selector(response)
    url_letters = sel.xpath('//div/ul[@class="alphaLinks"]//a/@href').extract()
    for url in url_letters:
        print(url)
        # time.sleep(5)
        yield Request(url, callback=self.parse_letter)
def run(self):
    site = urllib2.urlopen("http://www.nasdaq.com/markets/upcoming-splits.aspx")
    html = site.read()
    response = Selector(text=html, type="html")
    # headers
    headers = response.xpath('//table[@rules="all"]/tr/th/text()').extract()
    headers = self.f.mushList(headers)
    # data
    data = response.xpath('//table[@rules="all"]/tr/td/text()').extract()
    # companyNames
    companyNames = response.xpath('//table[@rules="all"]/tr/td/a/text()').extract()
    models = []
    d = 0
    for com in companyNames:
        m = InsertModel(self.table)
        splits = self.splitName(com)
        for index, h in enumerate(headers):
            if index == 0:
                m.insert("CompanyName", self.f.filterForSQL(splits[1]))
                m.insert("Symbol", splits[0])
                continue
            elif index == 1:
                m.insert(h, self.getRatio(data[d]))
            else:
                m.insert(self.f.headerFilter(h), self.f.convertDate(data[d]))
            d += 1
        models.append(m)
    return models
def parse(self, response):
    sel = Selector(response)
    result = []
    ad = DatesItem()
    ad['name'] = ""
    for p in sel.xpath("//div[@class='poziomd']//text()").extract():
        if re.match(r"^.*,", p):
            if p.startswith(","):
                ad['desc'] = p[2:]
            else:
                ad['desc'] = p[6:]
            ad['name'] = ad['name'].lstrip('1234567890() ').strip()
            if re.match(r'^.\s', ad['name']):
                ad['name'] = ad['name'][2:]
            ad['url'] = response.url
            if re.match(r".*urodzeni.*", response.url):  # "urodzeni" is Polish for "born"
                ad['isBirth'] = True
            else:
                ad['isBirth'] = False
            result.append(ad)
            ad = DatesItem()
            ad['name'] = ""
        elif re.match(r"^\s*[0-9]{1,4}", p) and 'date' not in ad:
            ad['date'] = re.match(r"^\s*[0-9]{1,4}", p).group()
        else:
            ad['name'] = ad['name'] + p
    return result
def parse_pathPage(self, response):
    # print("parse a path page!")
    sel = Selector(response)
    item = {}
    item["name"] = sel.xpath('//h1[@class= "zm-editable-content"]/text()').extract()
    # Collect the topic paths
    paths = []
    for path_selector in sel.xpath('//div[@class= "zm-topic-tree"][1]/ul'):
        # Extract one path and append it to paths
        one_path = path_selector.xpath('.//a/text()').extract()
        paths.append(one_path)
    item["paths"] = paths
    # Append the item to a local file
    with io.open("tag_paths_app.jsonl", "a", encoding="utf8") as outfile:  # "a" means appending mode
        row = json.dumps(item, ensure_ascii=False, sort_keys=True)
        print(row, file=outfile)
    # Update the spider's tag structure dictionary
    outside = self.d  # initially, the outside is the whole dictionary
    for path in paths:
        for i in path:
            try:
                inside = outside[i]
            except KeyError:
                inside = {}
                outside[i] = inside
            outside = inside
        outside = self.d  # reset the outside to the whole dictionary
    # Add this item's paths to the spider's p dictionary
    self.p.append(item)
def parse_item(self, response):
    sel_detail = Selector(response)
    item = response.meta['item']
    desc = sel_detail.xpath('//*[@id="mainArea"]/*').extract()
    item['desc'] = [d.encode('UTF-8') for d in desc]
    print("Done!")
    yield item
def parsePrice(self, response):
    sel = Selector(response)
    item = BitautoAllPriceItem()
    item['city'] = filt(sel.xpath('//div[@class="adress"]/text()').extract()[0], u'地址:', u'市')  # slice between "address:" and "city"
    item['dealer'] = sel.xpath('//div[@class="info"]/h1/text()').extract()[0]
    item['dealerid'] = filt(response.url, '.com/', '/')
    db = SimpleMysql(host='127.0.0.1:5029', db='wholenetwork', user='******', passwd='')
    trs = sel.xpath('//div[@class="car_list"]')
    for tr in trs:
        tmp = tr.xpath('div/div[@class="car_top"]/h3/a')
        item['brand'] = tmp.xpath('text()').extract()[0]
        item['brandid'] = filt(tmp.xpath('@href').extract()[0], 'cars_', '.html')
        prices = tr.xpath('div/div[@class="car_price"]/table/tbody/tr')
        for price in prices:
            if not price.xpath('td'):
                continue  # filter out th (header) rows
            item['model'] = price.xpath('td[1]/a/@title').extract()[0]
            item['modelid'] = filt(price.xpath('td[1]/a/@href').extract()[0], 'price_detail/', '.html')
            item['oprice'] = price.xpath('td[2]/text()').extract()[0].replace(u' ', '').replace('\r\n', '').replace(u'万', '')
            item['price'] = price.xpath('td[4]/a/text()').extract()[0].replace('\r\n', '').replace(u' ', '').replace(u'万', '')
            item['off'] = price.xpath('td[3]/em/text()').extract()[0].replace('\r\n', '').replace(u' ', '').replace(u'万', '').replace(u'↓', '')
            if ISSAVE:
                doSave(db, item)
            if ISPOST:
                doPost(API_ADDRESS, item)
    np = sel.xpath('//div[@id="pager"]/a')
    # `if`, not `while`: np never changes in the body, so a while loop would
    # yield the same "next page" (u'下一页') request forever.
    if np and (np[-1].xpath('text()').extract()[0] == u'下一页'):
        url = np[-1].xpath('@href').extract()[0]
        url = response.urljoin(url)
        yield Request(url, self.parsePrice)
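# The pagination tail in parsePrice above is the usual Scrapy "follow the next
# link" idiom. A minimal, generic sketch of that idiom; the spider name, the
# `a.next` selector, and reusing parse as the callback are illustrative
# assumptions, not taken from the original site:
import scrapy

class PagedSpider(scrapy.Spider):
    name = "paged_example"  # hypothetical spider for illustration

    def parse(self, response):
        # ... extract items from the current page here ...
        next_href = response.css('div#pager a.next::attr(href)').get()  # assumed selector
        if next_href:
            # response.follow resolves relative URLs against response.url
            # and schedules the same callback for the next page.
            yield response.follow(next_href, callback=self.parse)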
def parse_tagPage(self, response):
    sel = Selector(response)
    # The tag's name and link
    name = sel.xpath('//h1[@class= "zm-editable-content"]/text()').extract()
    relative_link = sel.xpath('//div[@class= "zm-topic-topbar"]//a/@href').extract()
    # The tag's parents
    parents = sel.xpath('//div[@id= "zh-topic-organize-parent-editor"]//a[@class= "zm-item-tag"]/text()').extract()
    parents = [s.replace("\n", "") for s in parents]
    # The tag's children
    children = sel.xpath('//div[@id= "zh-topic-organize-child-editor"]//a[@class= "zm-item-tag"]/text()').extract()
    children = [s.replace("\n", "") for s in children]
    # Keep the tag item for the final output
    item = {}
    item["name"] = name
    item["relative_link"] = relative_link
    item["parents"] = parents
    item["children"] = children
    self.l.append(item)
    # Append the item to a local file
    with io.open("tag_items.jsonl", "a", encoding="utf8") as outfile:  # "a" means appending mode
        row = json.dumps(item, ensure_ascii=False)
        print(row, file=outfile)
    # Mimic the return of CrawlSpider's default parse() so that the rules are applied
    # continuously instead of just once on the start_urls
    return self.parse(response)
def parsePage(self, response):
    # inspect_response(response, self)
    try:
        unicode(response.body.decode(response.encoding)).encode('utf-8')
    except exceptions.UnicodeDecodeError:
        print("exception error")
    sel = Selector(response)
    doclist = sel.xpath(
        '/html/body/div[@id="divprincipal"]' +
        '/div[@class="minwidth"]' +
        '/div[@id="idInternetBlocoEmpacotador"]' +
        '/div[@class="incenter_interno"]' +
        '/div[@id="idDivContainer"]' +
        '/div[@id="idAreaBlocoExterno"]' +
        '/div[@id="idArea"]' +
        '/div[@id="corpopaginajurisprudencia"]' +
        '/div[@id="listadocumentos"]' +
        '/div[@style="position: relative;"]')
    for doc in doclist:
        yield self.parseDoc(doc)
        self.fIndex = self.fIndex + 1
    nextPage = sel.xpath('//*[@id="navegacao"][1]/a[@class="iconeProximaPagina"]')
    if nextPage:
        yield Request(
            urlparse.urljoin('http://www.stj.jus.br/', nextPage.xpath('@href').extract()[0]),
            callback=self.parsePage)
    else:
        self.saveSearchInfo()
def parse_profile_frameset(self, response):
    sel = Selector(response)
    url = parse.get_extracted(sel.xpath('//frame[@name="mainFrame"]/@src'))
    url = urljoin(response.url, url)
    meta = response.request.meta
    return Request(url, callback=self.parse_profile, meta=meta)
def parse_brand(self, response):
    sel = Selector(response)
    items = []
    product_sites = sel.xpath('//div[@class="product_result_box"]/ul/li')
    for product_site in product_sites:
        img_src = self.check_list(product_site.xpath('a[@class="pro_item"]/img/@src').extract())
        if img_src == '':
            product_id = ''
        else:
            img_id = img_src.split('/')[-1]
            product_id = img_id.split('_')[0]
        product_name = self.check_list(product_site.xpath('div[@class="searchlist_tit"]/a/text()').extract())
        comment_href = self.check_list(product_site.xpath('a[@class="pro_item"]/@href').extract())
        comment_temp = comment_href.split('_')[-1]
        comment_id = comment_temp.split(".")[0]
        comment_url = "http://koubei.jumei.com/comment_list-" + comment_id + "-1.html"
        r = Request(comment_url,
                    meta={'product_id': product_id, 'product_name': product_name},
                    callback=self.parse_comment)
        items.append(r)
    next_brandpage = response.xpath('//div[@class="pageSplit"]/a[@class="next"]/@href').extract()
    if len(next_brandpage):
        next_brandpage_url = "http://koubei.jumei.com" + next_brandpage[0]
        r = Request(next_brandpage_url, callback=self.parse_brand)
        items.append(r)
    return items
def people_page(self, response):
    yield self.parse_item(response)
    sel = Selector(response)
    # Following and followers
    following = sel.xpath('//div[@class="zm-profile-side-following zg-clear"]')
    # TODO: recursively collect the following data of all valid users
    followings = following.xpath('.//a/@href').extract()
    for follow_link in followings:
        # yield self.cookiejar_addcookies(response, url=follow_link, callback=self.followees_page)
        # Calling it that way gets redirected; not solved yet.
        self.webdriver_addcookies(follow_link)
        browerHeight = self.driver.execute_script('return document.body.scrollHeight;')
        while True:
            # do the scrolling
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)  # wait for the data to finish loading
            scrollHeight = self.driver.execute_script('return document.body.scrollHeight;')
            if browerHeight == scrollHeight:
                break
            browerHeight = scrollHeight
        peoplelinks = self.driver.find_elements_by_xpath('//a[@class="zm-item-link-avatar"]')
        for link in peoplelinks:
            href = link.get_attribute('href')
            # Some users' links cannot be found here; to be investigated.
            yield self.cookiejar_addcookies(response, url=href, callback=self.people_page)
    # followees = followings[0]  # links the user follows
    # followers = followings[1]  # links of the user's followers
def search_parse(self, response):
    sel = Selector(response)
    lst = sel.css('ul.list')[0].xpath('./li')
    for cur in lst:
        # print "cur is %s" % (cur.extract())
        cura = cur.xpath('./a/@href')[0].extract()
        # print "cur is %s" % (cura)
        at = cur.xpath('./a/text()')
        if len(at) > 0:
            curn = cur.xpath('./a/text()')[0].extract()
        else:
            curn = cur.xpath('./a/font/text()')[0].extract()
        bt = curn.find(u'《')
        et = curn.find(u'》')
        if bt == -1 or et == -1:
            name = curn
        else:
            name = curn[(bt + 1):et]
        # print 'cur movie name is ' + name
        self.moviedb.insMovie_6vhao(cura, name)
    return []
def run(self):
    html = urllib2.urlopen("http://getsplithistory.com/AA").read()
    response = Selector(text=html, type="html")
    datesAndOther = response.xpath("//table/tbody/tr/td/text()").extract()
    ratios = response.xpath("//table/tbody/tr/td/span/text()").extract()
    objects = []
    rs = []
    for i, data in enumerate(datesAndOther[:-1]):
        if i % 4 == 0:
            objNum = len(objects)
            objects.append(dict())
            objects[objNum]["date"] = self.cleanDate(data)
        if (i - 1) % 4 == 0:
            objNum = len(objects) - 1
            objects[objNum]["denom"] = self.cleanDenom(data)
    for i, data in enumerate(ratios[:-1]):
        if i % 3 == 0:
            objects[i // 3]["num"] = self.cleanNum(data)  # integer division for list indexing
    for o in objects:
        o["factorial"] = float(o["num"]) / float(o["denom"])
    # now we insert the date, symbol name and the factorial into the DB
    for o in objects:
        IM = InsertModel("jdfkasdklfj")  # placeholder table name; was `tableName`
        IM.insert("e", o["date"])
        IM.insert("symbol", symbol)  # NOTE: `symbol` is not defined in this snippet
        IM.insert("Ratio", o["factorial"])
def parse_item(self, response):
    sel = Selector(response)
    i = CountryDataCrawlerItem()
    i['item_type'] = 'country'
    i['crawled_url'] = response.url
    print(response.url)
    print(i['item_type'])
    i['name'] = sel.xpath('//div[@class="b_title clrfix"]/div[@class="tit"]/text()').extract()[0].encode('UTF-8')
    print(i['name'])
    image_url_list = sel.xpath('//img/@src').extract()
    image_url_list = utils.image_url_filter(image_url_list, '720x400')
    i['image_url'] = utils.get_image_url_string(image_url_list)
    print(i['image_url'])
    # i['image_path'] = 'test'
    i['image_path'] = utils.save_image_to_oss(image_url_list)
    print('image_path')
    brief_info_list = sel.xpath('//div[@class="countbox"]//text()').extract()
    i['brief_description'] = utils.scrape_str(brief_info_list)
    print(i['brief_description'])
    # Fetch the detail page
    detail_info_source = urllib2.urlopen(response.url + '/zhinan').read()
    detail_info_page = etree.HTML(detail_info_source.lower())
    detail_info_list = detail_info_page.xpath("//div[@class='b_g_cont']//text()")
    i['detail_description'] = utils.scrape_str(detail_info_list)
    print(i['detail_description'])
    i['last_update_time'] = utils.get_current_time()
    print(i['last_update_time'])
    self.data_count += 1
    print("Crawled %d items so far." % self.data_count)
    return i
def parse_news(self, response):
    item = response.meta['item']
    sel = Selector(response)
    item['news_title'] = sel.xpath('//title/text()').extract()
    news_media = sel.xpath('//meta[@name="mediaid"]/@content').extract()
    if news_media:
        item['news_media'] = news_media[0]
    else:
        item['news_media'] = "NoMedia"
    timelist1 = sel.xpath('//span[@class="time-source"]/text()').re(r'\d+')[0:3]
    timelist2 = sel.xpath('//span[@id="pub_date"]/text()').re(r'\d+')[0:3]
    timelist3 = sel.xpath('//span[@class="time"]/text()').re(r'\d+')[0:3]
    timelist = timelist1 + timelist2 + timelist3
    # print timelist
    item['news_pubtime'] = ['-'.join(map(str, timelist))]
    news_content1 = sel.xpath('//div[@id="artibody"]').extract()
    news_content2 = sel.xpath('//div[@class="mainContent"]').extract()
    item['news_content'] = news_content1 + news_content2
    channel1 = sel.xpath('//script').re(r'channel:.*\'(.*)\'')
    channel2 = sel.xpath('//script').re(r'channel:.*\"(.*)\"')
    channel = channel1 + channel2
    newsid1 = sel.xpath('//script').re(r'newsid:.*\'(.*)\'')
    newsid2 = sel.xpath('//script').re(r'newsid:.*\"(.*)\"')
    newsid = newsid1 + newsid2
    item['news_id'] = newsid[0]
    cmturl = "http://comment5.news.sina.com.cn/page/info?format=json&channel=%s&newsid=%s&page_size=200" % (channel[0], newsid[0])
    item['news_commenturl'] = cmturl
    yield Request(url=cmturl, callback=self.parse_commentnum, meta={'item': item})
def parse_item(self, response): """ Main parse function """ sel = Selector(response) item = ProductItem() item['source'] = 'tmall' item['name'] = self.get_product_name( sel ) item['img'] = sel.xpath("//ul[@id='J_UlThumb']/li")[0].xpath(".//a/img/@src").extract()[0] item['category'] = self.get_category(response) try: # 获取TShop字符串,并对TShop字符串进行JSON标准化处理 TShop_str = sel.re('TShop\.Setup\(((.|\n)+?)\);')[0] # 移除注释,目前只有天猫超市有注释,以逗号开头 regex = re.compile(',\s*\/\/[^\n]*') TShop_str = re.sub(regex, ',', TShop_str) TShop = eval( TShop_str, type('Dummy', (dict,), dict(__getitem__=lambda s,n:n))() ) except SyntaxError: return item['itemId'] = TShop.get('itemDO').get('itemId', '') item['url'] = 'http://detail.tmall.com/item.htm?id=' + item['itemId'] item['date'] = date.today().strftime('%Y-%m-%d') item['attr'], item['brand'] = self.get_attr_and_brand( sel ) skuMap = self.get_sku_chinese_map( sel, TShop ) initApi_url = TShop.get('initApi') yield Request( initApi_url, headers={'Referer': 'http://www.google.com.hk/'}, meta={'item': item, 'skuMap': skuMap}, callback=self.parse_initapi )
def parse(self, response):
    location = response.url.lower().split("?")
    if location[0] in self.seen:
        # pass
        self.log('already seen %s' % response.url)
    else:
        self.log('parsing %s' % response.url)
        self.seen.add(location[0])
        hxs = Selector(response)
        if re.match('http://www.simplyrecipes.com/recipes/+', location[0]) and not re.search('(ingredient|course|season|type|cuisine)', location[0]):
            item = BaseItem()
            self.last = self.last + 1
            item['pk'] = self.last
            item['title'] = hxs.xpath('//title/text()').extract()
            item['id'] = response.url
            item['source'] = "simplyrecipes"
            item['url'] = location[0]
            item['text'] = response.body
            item['content'] = response.body_as_unicode()
            self.log("saving item " + response.url)
            yield item
        for url in hxs.xpath('//a/@href').extract():
            url = url.replace('http://www.simplyrecipes.com', '')
            if not url in self.seen and not re.search(r'.(pdf|zip|jar)$', url) and \
                    url.lower()[0:9] == '/recipes/' and "," not in url:
                # self.log("yielding request " + url)
                yield Request('http://www.simplyrecipes.com' + url, callback=self.parse)
def parse_celebrity(self, response):
    """Crawl a celebrity profile."""
    celebrity = CelebrityItem()
    sel = Selector(response)
    celebrity["id"] = self._parse_id(response.url)
    name = sel.css("div.per_header h2::text").extract()
    celebrity["name"] = name[0] if name else ""
    name_en = sel.css("div.per_header p.enname::text").extract()
    celebrity["name_en"] = name_en[0] if name_en else ""
    yield Request(
        url=urljoin(response.url, "details.html"),
        callback=self.parse_celebrity_detail,
        meta={"celebrity": celebrity.copy()}
    )
    yield Request(
        url=urljoin(response.url, "awards.html"),
        callback=self.parse_celebrity_awards,
        meta={"celebrity": celebrity.copy()}
    )
    yield celebrity
def parse_celebrity_detail(self, response):
    """Crawl the celebrity detail page."""
    celebrity = response.meta["celebrity"]
    sel = Selector(response)
    for dt in sel.css("div.per_info_l dt"):
        title = dt.css("::text").extract()[0]
        if title == "出生日期:":  # date of birth
            text = dt.css("::text").extract()[1].rstrip(")")
            if "(" in text:
                birthday, birthplace = text.split("(", 1)
            else:
                birthday, birthplace = text, ""
            celebrity["birthday"] = birthday
            celebrity["birthplace"] = birthplace
        elif title == "血型:":  # blood type
            celebrity["blood"] = dt.css("::text").extract()[1]
        elif title == "星座:":  # constellation
            celebrity["constellation"] = dt.css("::text").extract()[1]
        elif title == "身高:":  # height
            celebrity["height"] = int(dt.css("::text").extract()[1].rstrip("cm"))
        elif title == "体重:":  # weight; assigning to "height" here would overwrite the value parsed above
            celebrity["weight"] = int(dt.css("::text").extract()[1].rstrip("kg"))
    celebrity["intro"] = "\n".join(sel.css("div#lblAllGraphy p::text").extract())
    return celebrity
def parsePostsList(self, response):
    sel = Selector(response)
    posts = sel.xpath('//dl[@class="discussion clear i0 xg_lightborder"]')
    items = []
    topic = response.xpath('//h1/text()').extract_first()
    url = response.url
    item = PostItemsList()
    item['author'] = response.xpath('//div[@class="xg_module xg_module_with_dialog"]//ul[@class="navigation byline"]/li/a[contains(@href,"profile")]/text()').extract_first()
    item['author_link'] = response.xpath('//div[@class="xg_module xg_module_with_dialog"]//ul[@class="navigation byline"]/li/a[contains(@href,"profile")]/@href').extract_first()
    item['create_date'] = response.xpath('//div[@class="xg_module xg_module_with_dialog"]//ul[@class="navigation byline"]/li/a[@class="nolink"][2]/text()').extract_first().replace('on', '').replace('in', '').strip()
    item['post'] = re.sub(r'\s+', ' ', " ".join(response.xpath('//div[@class="xg_module xg_module_with_dialog"]//div[@class="xg_user_generated"]/p/text()').extract()).replace("\t", "").replace("\n", "").replace("\r", ""))
    item['tag'] = 'epilepsy'
    item['topic'] = topic
    item['url'] = url
    logging.info(item.__str__())  # the original passed the unbound method, logging nothing useful
    items.append(item)
    for post in posts:
        item = PostItemsList()
        item['author'] = post.xpath('./dt[@class="byline"]/a[contains(@href,"user")]/text()').extract_first()
        item['author_link'] = post.xpath('./dt[@class="byline"]/a[contains(@href,"user")]/@href').extract_first()
        item['create_date'] = post.xpath('./dt[@class="byline"]/span[@class="timestamp"]/text()').extract_first()
        item['post'] = re.sub(r'\s+', ' ', " ".join(post.xpath('.//div[@class="description"]/div[@class="xg_user_generated"]/p/text()').extract()).replace("\t", "").replace("\n", "").replace("\r", ""))
        item['tag'] = 'epilepsy'
        item['topic'] = topic
        item['url'] = url
        logging.info(item.__str__())
        items.append(item)
    return items
def parse(self, response):
    sel = Selector(response)
    title = sel.xpath('//h2/text()').extract()[0]
    title = "%s-%s" % (title, self.name)
    title = self.polishString(title)
    print(title)
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    dd = sel.xpath('//li/a')
    id = 0
    for d in dd:
        id += 1
        url = d.xpath('@href').extract()[0]
        url = response.urljoin(url)
        subtitle = d.xpath('text()').extract()[0]
        subtitle = self.polishString(subtitle)
        subtitle = '\n\n********* [%d] - %s *********\n\n' % (id, subtitle)
        print(url)
        print(subtitle)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = id
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse_category(self, response):
    self.visited_url.add(response.url)
    dom = Selector(response)
    subcategories = dom.xpath("//div[contains(@id,'mw-subcategories')]//a")
    for subcategory in subcategories:
        url = self.domain + subcategory.xpath("./@href").extract()[0]
        if url not in self.visited_url:
            self.visited_url.add(url)
            item = FoodbkItem()
            item["up"] = response.url.decode("utf-8")
            item["down"] = url
            yield item
            yield self.make_requests_from_url(url).replace(callback=self.parse_category)
    entities = dom.xpath("//div[contains(@id,'mw-pages')]//a")
    for e in entities:
        url = e.xpath("./@href").extract()[0]
        if self.template_url_pattern.match(url) is not None:
            continue
        url = self.domain + url
        item = FoodbkItem()
        item["up"] = response.url.decode("utf-8")
        item["down"] = url
        yield item
    indexes = dom.xpath("//a[contains(text(),'200')]")
    for index in indexes:
        if index.xpath("./text()").extract()[0] == u'后200条':  # "next 200 entries"
            url = self.domain + index.xpath("./@href").extract()[0]
            if url not in self.visited_url:
                self.visited_url.add(url)
                yield self.make_requests_from_url(url).replace(callback=self.parse_category)
def run(self):
    url = 'http://www.sto.cn/Home/Index'
    self.driver.get(url)
    css_seletor = 'li.order-search'
    self.driver.find_element_by_css_selector('li.order-search textarea').send_keys('3367154640058')
    self.driver.find_element_by_css_selector('li.order-search div.btn_order_search input').click()
    sleep(5)
    # div.layui-layer-content
    # driver.save_screenshot('申通.jpg')
    body = self.driver.page_source
    # print(body)
    bg_pic = Selector(text=body).css('img.yidun_bg-img::attr("src")').extract_first()
    slide_pic = Selector(text=body).css('img.yidun_jigsaw::attr("src")').extract_first()
    print('slider background:', bg_pic, ' jigsaw piece:', slide_pic)

    # Crack the slider captcha
    # Wait for the captcha to pop up
    # bg_pic = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "yidun_bg-img")))
    bg_pic = self.wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "layui-layer-page")))
    # In HTML the origin is the top-left corner: x grows rightwards, y grows downwards.
    # A positive x is the element's distance from the left edge of the screen,
    # a positive y its distance from the top, so the four crop bounds we need are:
    top, bottom, left, right = (bg_pic.location['y'],
                                bg_pic.location['y'] + bg_pic.size['height'],
                                bg_pic.location['x'],
                                bg_pic.location['x'] + bg_pic.size['width'])
    print('top: {0}, bottom: {1}, left: {2}, right: {3}'.format(top, bottom, left, right))
    sleep(1)
    cp1 = self.crop(left, top, right, bottom, '12.png')
    # Find the slider knob and click it once
    slide = self.wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "yidun_slider")))
    slide.click()
    sleep(3)  # wait 3s for the error message to disappear; TODO this could be improved
    cp2 = self.crop(left, top, right, bottom, '2.png')
    move = self.calc_move(cp1, cp2)
    result = self.path1(move)
    # result = self.path2(move)
    # Drag the slider
    ActionChains(self.driver).click_and_hold(slide).perform()
    for x in result:
        ActionChains(self.driver).move_by_offset(xoffset=x[0], yoffset=x[1]).perform()
        # ActionChains(driver).move_to_element_with_offset(to_element=slide, xoffset=x[0], yoffset=x[1]).perform()
        sleep(x[-1])  # sleeping is required when using path1
    sleep(0.5)
    ActionChains(self.driver).release(slide).perform()  # release the button
    sleep(0.8)
def parse(self, response):
    driver = response.meta['driver']
    for item in self.params:
        search_box = driver.find_element_by_xpath("//input[@placeholder='Search for your personal TLD']")
        search_box.send_keys(item)
        # time.sleep(2)
        search_box.send_keys(Keys.ENTER)
        time.sleep(3)
        html = driver.page_source
        response = Selector(text=html)
        # Defaults guard against a NameError when no branch below matches.
        status, text, days, blocks = "Unknown", None, None, None
        if response.xpath("//div[@class='desktop-bid-card']"):
            status = "Live"
            text = None
            days = response.xpath("//div[text()='Time left to bid (est.)']/preceding-sibling::div/text()").get()
            blocks = response.xpath("//div[text()='Blocks left to bid']/preceding-sibling::div/text()").get()
        elif response.xpath("//div[text()='Buy now']"):
            status = "Buy Now"
            text = response.xpath("//div[text()='HNS']/text()").get()
            days = None
            blocks = None
        elif response.xpath("//div[text()='Auction over']"):
            status = "Auction Over"
            text = response.xpath("//div[text()='Auction over']/following-sibling::div/text()").get()
            days = None
            blocks = None
        elif response.xpath("//div[text()='Already taken']"):
            status = "Already Taken"
            text = response.xpath("//div[text()='Already taken']/following-sibling::div/text()").get()
            days = None
            blocks = None
        elif response.xpath("//div[text()='Coming soon']"):
            status = "Coming Soon"
            text = response.xpath("//div[text()='Coming soon']/following-sibling::div/text()").get()
            days = response.xpath("//div[text()='Available in (est.)']/preceding-sibling::div/text()").get()
            blocks = response.xpath("//div[text()='Blocks until release']/preceding-sibling::div/text()").get()
        yield {
            'Word': item,
            'Status': status,
            'Text': text,
            'Blocks': blocks,
            'Days': days,
        }
        back_button = driver.find_element_by_xpath("//a[text()='Top-level domain']")
        back_button.click()
        time.sleep(3)
def detail_index(self, index_url):
    response = requests.get(url=index_url, headers=self.headers)
    selector = Selector(text=response.text)
    img_url = selector.css('.type_production img::attr(src)').extract()[1]
    zh_name = selector.css('.pro_tit .a1::text').extract()
    us_name = selector.css('.pro_tit .a2::text').extract()
    jianjie = selector.css('.produc_table .a2::text').extract()  # brief-intro fields
    cankaojiage = selector.css('.produc_table .a3::text').extract_first('')  # reference price
    hengliangbiaozhui = selector.css('.pingfen .sp2 em::attr(style)').extract()  # rating bars
    text = selector.css('.prod_slidebox ul li .text p::text').extract()
    # Rating-bar widths. The original looped over the list but assigned the
    # same fixed indices every iteration; plain assignments are equivalent.
    nianren = hengliangbiaozhui[0].replace('width:', '')       # clinginess
    xijiao = hengliangbiaozhui[1].replace('width:', '')        # barking
    diaomao = hengliangbiaozhui[2].replace('width:', '')       # shedding
    tiwei = hengliangbiaozhui[3].replace('width:', '')         # body odour
    meirong = hengliangbiaozhui[4].replace('width:', '')       # grooming
    youxian = hengliangbiaozhui[5].replace('width:', '')       # friendliness to children
    shengren = hengliangbiaozhui[6].replace('width:', '')      # attitude to strangers
    dongwu = hengliangbiaozhui[7].replace('width:', '')        # attitude to other animals
    yundongliang = hengliangbiaozhui[8].replace('width:', '')  # exercise needs
    kexunxing = hengliangbiaozhui[9].replace('width:', '')     # trainability
    koushui = hengliangbiaozhui[10].replace('width:', '')      # drooling
    naihan = hengliangbiaozhui[11].replace('width:', '')       # cold tolerance
    naire = hengliangbiaozhui[12].replace('width:', '')        # heat tolerance
    shiying = hengliangbiaozhui[13].replace('width:', '')      # adaptability
    # Brief-intro fields
    bieming = jianjie[0].replace('\xa0', '')      # alias
    fenbuquyu = jianjie[1].replace('\xa0', '')    # distribution area
    yuanchandi = jianjie[2].replace('\xa0', '')   # place of origin
    tixing = jianjie[3].replace('\xa0', '')       # body type
    gongneng = jianjie[4].replace('\xa0', '')     # function
    fenzu = jianjie[5].replace('\xa0', '')        # group
    shengao = jianjie[6].replace('\xa0', '')      # height
    tizhong = jianjie[7].replace('\xa0', '')      # weight
    souming = jianjie[8].replace('\xa0', '')      # lifespan
    tidian = jianjie[10].replace('\xa0', '')      # characteristics
    return {
        '封 面:': img_url,
        '中文名字 :': zh_name,
        '英文名字 :': us_name,
        '别 名:': bieming,
        '分布区域 :': fenbuquyu,
        '原 产 地:': yuanchandi,
        '体 型:': tixing,
        '功 能:': gongneng,
        '分 组:': fenzu,
        '身 高:': shengao,
        '体 重:': tizhong,
        '寿 命:': souming,
        '参考价格 :': cankaojiage,
        '特 点:': tidian,
        '粘人程度 :': nianren,
        '喜叫程度:': xijiao,
        '掉毛程度:': diaomao,
        '体味程度:': tiwei,
        '美容程度:': meirong,
        '对小孩友善程度:': youxian,
        '对生人程度:': shengren,
        '对动物程度:': dongwu,
        '运动量:': yundongliang,
        '可训练性:': kexunxing,
        '口水程度:': koushui,
        '耐寒程度:': naihan,
        '耐热程度:': naire,
        '城市适应度:': shiying,
        '简介: ': str(text)
    }
def parse_songdata(self, response):
    responseSelector = Selector(response)
    item = LyricsscraperItem()

    ## Non Translated data
    songLyricswithExtra = remove_tags(
        responseSelector.xpath(
            '//*[@id="genesis-content"]/article/*[@class="entry-content"]//pre')[0].extract())
    songLyrics = "".join([
        char for char in songLyricswithExtra
        if ((char not in string.digits) and
            (char not in string.ascii_letters) and
            (char not in removepunc))
    ]).strip()
    songLyrics = songLyrics.replace("∆", "")
    item["songLyrics"] = songLyrics
    songLyrics = songLyrics.replace("\n", "")
    songLyrics = songLyrics.replace("\t", "")
    songLyrics = "".join([
        char for char in songLyrics if (char not in string.punctuation)
    ]).strip()
    item["songLyricsSearchable"] = songLyrics

    string_viewcount_data = remove_tags(
        responseSelector.xpath('//*[@class="tptn_counter"]')[0].extract())
    string_viewcount = re.sub('[^0-9,]', "", string_viewcount_data).replace(',', '')
    viewcount = int(string_viewcount.replace(",", ""))
    item["views"] = viewcount

    shareobj = responseSelector.xpath('//*[@class="swp_count"]')
    if len(shareobj) == 0:
        shareobj = responseSelector.xpath('//*[@class="swp_count "]')
    if len(shareobj) > 0:
        string_sharecount_data = remove_tags(shareobj[0].extract())
        string_sharecount = re.sub('[^0-9,]', "", string_sharecount_data).replace(',', '')
        sharecount = int(string_sharecount)
        item["shares"] = sharecount

    titlestring = remove_tags(
        responseSelector.xpath(
            '//*[@id="genesis-content"]/article/*[@class="entry-content"]/h2')[0].extract())
    if "-" in titlestring:
        titles = [i.strip() for i in titlestring.split("-")]
        item["title"] = titles[1]
    elif "|" in titlestring:
        titles = [i.strip() for i in titlestring.split("|")]
        item["title"] = titles[1]
    elif "–" in titlestring:
        titles = [i.strip() for i in titlestring.split("–")]
        item["title"] = titles[1]
    else:
        item["title"] = titlestring.strip()

    musicInfoString = remove_tags(
        responseSelector.xpath(
            '//*[@id="genesis-content"]/article/*[@class="entry-content"]/h3')[0].extract())
    if "-" in musicInfoString:
        musicInfo = [i.strip() for i in musicInfoString.split("-")]
        item["key"] = musicInfo[0].replace("Key:", "").strip()
        item["beat"] = musicInfo[1].replace("Beat:", "").strip()
    elif "|" in musicInfoString:
        musicInfo = [i.strip() for i in musicInfoString.split("|")]
        item["key"] = musicInfo[0].replace("Key:", "").strip()
        item["beat"] = musicInfo[1].replace("Beat:", "").strip()

    item['url'] = response.url

    gotNamesfromElement = False
    artistInfoObject = responseSelector.xpath(
        '//*[@id="genesis-content"]/article//*[@class="artist-name"]')
    if len(artistInfoObject) > 0:
        aristInfoString = remove_tags(artistInfoObject[0].extract())
        aristInfoString = aristInfoString.replace("|", "/")
        artistNames = aristInfoString.split("/")
        isascii = lambda s: len(s) == len(s.encode())
        sinhalaArtistNamesArray = []
        for i in artistNames:
            if not isascii(i):
                sinhalaArtistNamesArray.append(i)
        item['artist'] = sinhalaArtistNamesArray
        if len(sinhalaArtistNamesArray) > 0:
            gotNamesfromElement = True

    ## Translated Data
    songInfo = responseSelector.xpath(
        '//*[@id="genesis-content"]/article/*[@class="entry-content"]/*[@class="su-row"]//ul/li')
    for i in range(0, len(songInfo)):
        headstring = remove_tags(songInfo[i].extract())
        if "Artist:" in headstring and not gotNamesfromElement:
            artiststring = headstring.replace("Artist:", "").strip()
            artists = [i.strip() for i in artiststring.split(",")]
            item['artist'] = translate_array(artists)
        elif "Genre:" in headstring:
            genrestring = headstring.replace("Genre:", "").strip()
            genre = [i.strip() for i in genrestring.split(",")]
            item['genre'] = translate_array(genre)
        elif "Lyrics:" in headstring:
            writerstring = headstring.replace("Lyrics:", "").strip()
            writers = [i.strip() for i in writerstring.split(",")]
            item['writer'] = translate_array(writers)
        elif "Music:" in headstring:
            composerstring = headstring.replace("Music:", "").strip()
            composers = [i.strip() for i in composerstring.split(",")]
            item['composer'] = translate_array(composers)
        elif "Movie:" in headstring:
            item['movie'] = translate_word(headstring.replace("Movie:", "").strip())
    return item
data = None
urls = []
new_list = []
ming = []
xie = []
hao = []
lei = []
number = []
print(u'\nCrawling data, please wait...')
# Start URL; note the headers required for the request
html = gethtml(
    r'http://search.jd.com/Search?keyword=%E4%BA%BA%E6%B0%91%E6%96%87%E5%AD%A6%E5%87%BA%E7%89%88%E7%A4%BE&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E4%BA%BA%E6%B0%91%E6%96%87%E5%AD%A6%E5%87%BA%E7%89%88%E7%A4%BE&sid=1000005720&ev=publishers_%E4%BA%BA%E6%B0%91%E6%96%87%E5%AD%A6%E5%87%BA%E7%89%88%E7%A4%BE%5E&psort=3&click=0'
)
ss = Selector(text=html).xpath("//ul[@class='gl-warp clearfix']")  # top-level tag
file = Selector(text=html).xpath("//a[@class='crumb-select-item']/em/text()").extract()
for s in ss:
    # print(s)
    label = s.xpath("//li/div[@class='gl-i-wrap']/div[1]/@class").extract()
    url = s.xpath("./li/div[@class='gl-i-wrap']/div[@class='p-name']/a/@href").extract()
    for i in range(1):
        if label[i] == 'gl-i-tab':
            url[i] = s.xpath(
                ".//div[@class='tab-content-item tab-cnt-i-selected']/div[@class='p-name']/a/@href"
            ).extract()
            urls.append(url[i][1])
def parse_details(self, response):
    metadata = response.meta['userdata']
    metadata['url'] = response.url
    sel = Selector(response)
    name = self.fetch_name(response)
    if name:
        metadata['name'] = name
    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        return
    description = self.fetch_description(response)
    if description:
        metadata['description'] = description
    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']
    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors
    # image_urls = sel.xpath('//div[@id="itemContent"]//img/@src').extract()
    # Collect the images
    hdr = None
    tail = None
    img0 = sel.xpath('//meta[@property="og:image" and @content]/@content').extract()
    if img0:
        img0 = img0[0]
        mt = re.search(r'(.+)_\d+_\w(\..+)$', img0)
        if mt:
            hdr = mt.group(1)
            tail = mt.group(2)
    idx = response.body.find('jsinit_item')
    img_item = None
    if idx != -1:
        tmp = response.body[idx:]
        idx = tmp.find('ALTERNATE')
        if idx != -1:
            try:
                img_item = json.loads(cm.extract_closure(tmp[idx:], r'\[', r'\]')[0])
            except ValueError:
                pass
    image_urls = []
    if hdr and tail and img_item:
        for item in img_item:
            mt = re.search(r'(\d+)_\w', item)
            if not mt:
                continue
            start_idx = int(mt.group(1))
            for idx in xrange(start_idx, 15):
                tmp = re.sub(r'\d+_(\w)', str.format(r'{0}_\1', idx), item)
                image_urls.append(str.format('{0}_{1}{2}', hdr, tmp, tail))
    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def selector(self):
    from scrapy.selector import Selector
    if self._cached_selector is None:
        self._cached_selector = Selector(self)
    return self._cached_selector
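# The method above mirrors the lazily cached `selector` property on Scrapy's
# TextResponse: the first access builds a Selector from the response and later
# accesses reuse it, which is what makes repeated response.xpath()/response.css()
# calls cheap. A minimal sketch of the same caching pattern on a plain class
# (FakeResponse and its `text` field are illustrative only):
from scrapy.selector import Selector

class FakeResponse(object):
    """Illustrative stand-in for an object carrying HTML text."""

    def __init__(self, text):
        self.text = text
        self._cached_selector = None

    @property
    def selector(self):
        # Build the Selector once, then reuse it on every later access.
        if self._cached_selector is None:
            self._cached_selector = Selector(text=self.text)
        return self._cached_selector

resp = FakeResponse('<html><body><span>good</span></body></html>')
print(resp.selector.xpath('//span/text()').extract_first())  # 'good'
print(resp.selector is resp.selector)  # True: the same cached instance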
def parse(self):
    while True:
        if self.index > 48:
            print('-' * 100 + 'One full crawl pass finished')
            print()
            print('-' * 100 + 'About to start crawling again....')
            ip_object = IpPools(type=self.ip_pool_type)
            self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # fetch a fresh proxy pool
            self.index = 1
        else:
            sleep(5)
        tmp_number = randint(1, 8)  # random number used to pick a random page range to crawl
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        tmp_index = 1
        for i in range(0, 49):  # loop over the categories
            bozhu = {}
            if self.index == 49:
                break
            tmp_type = self.species[self.index][1]
            number = self.species[self.index][0]
            domain = '102803_ctg1_{}_-_ctg1_{}'.format(str(number), str(number))
            id = domain
            tmp_pagebar_index = 0
            tmp_pre_page_index = 1
            tmp_page_index = 1
            # Pitfall (most hot pages won't scroll past ~30 pages): after working
            # out the pattern, it turned out that different hot pages stop
            # returning data once you scroll down to a certain page count.
            for count in self.page_range[tmp_number]:
                if tmp_index % 50 == 0:  # reconnect every 50 requests so a long-lived connection doesn't go stale and error out
                    print('Resetting and establishing a new database connection...')
                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    print('New database connection established...')
                if my_pipeline.is_connect_success:
                    print('============| Crawling the content of page %d ...... |' % (count + 1,))
                    # Work out pagebar:
                    #          5            11           17
                    # pagebar: 0 1 2 3 4 -  0 1 2 3 4 -  0 1 2 3 4 - ....
                    if tmp_pagebar_index > 5:  # keep it from exceeding 5
                        tmp_pagebar_index = 0
                    pagebar = str(self.pagebar[tmp_pagebar_index])
                    current_page = str(count + 1)
                    script_uri = r'/102803_ctg1_{}_-_ctg1_{}'.format(str(number), str(number))
                    domain_op = domain
                    # e.g. 1506471533330
                    __rnd = str(15064) + ''.join(str(randint(1, 9)) for _ in range(8))
                    # __rnd = str(1506471533330)
                    if count % 6 == 0:  # observed: pre_page increments when count is a multiple of 6
                        tmp_pre_page_index += 1
                    pre_page = str(tmp_pre_page_index)
                    if (count + 1) % 6 == 0:  # observed: page increments when count+1 is a multiple of 6
                        tmp_page_index += 1
                    page = str(tmp_page_index)
                    url = 'https://d.weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&from=faxian_hot&mod=fenlei&tab=home&pl_name=Pl_Core_NewMixFeed__3&feed_type=1&domain={}&pagebar={}&current_page={}&id={}&script_uri={}&domain_op={}&__rnd={}&pre_page={}&page={}' \
                        .format(domain, pagebar, current_page, id, script_uri, domain_op, __rnd, pre_page, page)
                    print(url)
                    sleep(2)  # wait a bit so Weibo doesn't redirect the page
                    # Observed: after crawling a number of pages the site redirects and
                    # stops responding for a long time, hence the intermittent sleeps.
                    # if count == 50 or count == 100 or count == 150 or count == 200 or count == 250:
                    #     print('============| >>>>>> The crawler is sleeping ...... <<<<<<')
                    #     time.sleep(100)
                    tmp_html = self.get_url_body(url=url)
                    if len(tmp_html) <= 100000:
                        print('==========| content["data"] came back empty; the crawler takes a short nap ....... |')
                        print('==========| Please wait, crawling is about to resume ------>>>>>')
                        sleep(2)
                        tmp_html = self.get_url_body(url=url)
                    # print(tmp_html)
                    for item in Selector(text=tmp_html).css('div.face a').extract():
                        tmp_nick_name = Selector(text=item).css('img::attr("title")').extract_first()
                        tmp_head_img_url = 'https:' + Selector(text=item).css('img::attr("src")').extract_first()
                        bozhu['nick_name'] = self.wash_nick_name(nick_name=tmp_nick_name)
                        bozhu['sina_type'] = tmp_type
                        bozhu['head_img_url'] = re.compile(r'\.50/').sub('.180/', tmp_head_img_url)
                        print('---->> ', [tmp_nick_name, tmp_type, tmp_head_img_url])
                        # yield bozhu
                        my_pipeline.insert_into_sina_weibo_table(item=bozhu)
                        gc.collect()
                    print('============| Finished crawling the content of page %d |' % (count + 1,))
                    tmp_pagebar_index += 1
                else:
                    print('Database connection failed!')
                tmp_index += 1
            self.index += 1  # move to the next category index
'''
@author = super_fazai
@File : demo.py
@Time : 2017/8/20 10:33
@connect : [email protected]
'''

"""
A Scrapy selector is an instance of the Selector class, constructed by
passing text or a TextResponse object. It automatically picks the best
parsing rules (XML vs HTML) based on the input type.
"""

from scrapy.selector import Selector
from scrapy.http import HtmlResponse

# Constructing from text
body = '<html><body><span>good</span></body></html>'
print(Selector(text=body).xpath('//span/text()').extract())

# Constructing from a response
response = HtmlResponse(url='https://sebastianraschka.com/blog/index.html',
                        body=body, encoding='utf-8')
print(
    Selector(response=response).xpath(
        '//*/h1[@class="post-title"]/text()').extract())
# The call above is equivalent to this one:
print(response.selector.xpath('//*/h1[@class="post-title"]/text()').extract())

response = r"""
<html>
 <head>
  <base href='http://example.com/' />
"www.lagou.com", "Referer": "https://www.lagou.com/", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36", } url = 'https://www.lagou.com/zhaopin/Python/?labelWords=label' # url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false' r = requests.get(url, headers=headers) # print(r.status_code) # print(r.text) from scrapy.selector import Selector s = Selector(text=r.text) all_links = s.css('.position_link::attr(href)').getall() print(all_links) for link in all_links: r = requests.get(link, headers=headers) print(r.status_code) print(r.url) s = Selector(text=r.text) data = {} title = s.css('.job-name::attr("title")').get() detail = s.css('.job-detail').get() data['title'] = title data['detail'] = detail print(data)
def qa_collect(self, asin, country='us'):
    try:
        product = Product.objects.filter(asin=asin, country=country)[0]
        page = get_url('ask/questions/asin/' + asin + '/ref=ask_ql_psf_ql_hza?sort=SUBMIT_DATE', country)
        tree = fromstring(page.content)
        print(tree.findtext('.//title'))
        if tree.findtext('.//title') == 'Robot Check' or tree.findtext('.//title') == 'Amazon CAPTCHA':
            info = {'to': 0}
            return info
        if Selector(text=page.content).xpath('.//*[@id="noResultsTitle"]'):
            info = {'to': 0}
            return info
        qa_collection = {}
        if Selector(text=page.content).xpath("//ul[@class='a-pagination']/li[@class='a-last']//a/@href"):
            page_num = 0
            while True:
                boxes = Selector(text=page.content).xpath(".//*[@class='a-section askTeaserQuestions']/div[@class='a-fixed-left-grid a-spacing-base']")
                for box in boxes:
                    # answer_quan is initialised alongside the others; otherwise a
                    # value could leak from the previous iteration (or raise NameError).
                    answer_url, answer, answer_user, qa_time, answer_quan = None, None, None, None, None
                    vote = int(box.xpath(".//ul[@class='vote voteAjax']/li[2]/span[1]/text()").extract()[0])
                    question = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-small']//a[@class='a-link-normal']/text()").extract()[0]
                    try:
                        qa_time = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[2]/text()").extract()[-1:][0]
                    except:
                        pass
                    try:
                        if box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract() and country != 'jp':
                            answer = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract()[0]
                        elif box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract() and country == 'jp':
                            answer = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract()[0]
                            if answer == "":
                                try:
                                    answer = " ".join(box.xpath(".//span[@class='askLongText']/text()").extract()).strip()
                                except:
                                    pass
                        else:
                            answer = " ".join(box.xpath(".//span[@class='askLongText']/text()").extract()).strip()
                    except:
                        pass
                    try:
                        answer_user = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[2]/text()").extract()[0]
                    except:
                        pass
                    try:
                        answer_quan = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-section a-spacing-none a-spacing-top-mini']/a/text()").extract()[0]
                        answer_quan = re.search(r'\d+', answer_quan).group(0)
                    except:
                        pass
                    try:
                        answer_url = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-section a-spacing-none a-spacing-top-mini']/a/@href").extract()[0]
                        answer_url = country_url(country)[:-1] + answer_url
                        # print("answer_url:", answer_url)
                    except:
                        pass
                    # print(answer_user, qa_time)
                    if answer_user is None:
                        pass
                    elif answer_user == qa_time:
                        if country in ['us', 'uk', 'ca', 'de']:
                            name_date = re.split(' on |By |Von | am ', answer_user)
                        elif country == 'it':
                            name_date = re.split(' in |Da ', answer_user)
                        elif country == 'fr':
                            name_date = re.split(' le |Par ', answer_user)
                        elif country == 'es':
                            name_date = re.split(' el |Por ', answer_user)
                        elif country == 'jp':
                            name_date = re.split('投稿者: |、投稿日: ', answer_user)
                        answer_user = name_date[1]
                        qa_time = name_date[2]
                    else:
                        answer_user = re.split(' on |By |Von | am ', answer_user)[-1:][0]
                        qa_time = re.split(' on |By |Von | am ', qa_time)[-1:][0]
                    if answer_url and answer_quan:
                        qa_collection[question] = {'vote': vote, 'question': question, 'qa_time': qa_time.strip(), 'answer': answer, 'answer_user': answer_user.strip(), 'answer_quan': answer_quan, 'answer_url': answer_url}
                    elif answer:
                        qa_collection[question] = {'vote': vote, 'question': question, 'qa_time': qa_time.strip(), 'answer': answer, 'answer_user': answer_user.strip()}
                print(len(qa_collection))
                if Selector(text=page.content).xpath("//ul[@class='a-pagination']/li[@class='a-last']//a/@href") and page_num < 200:
                    time.sleep(2 + random.random() * 5)
                    page = get_url((Selector(text=page.content).xpath("//ul[@class='a-pagination']/li[@class='a-last']//a/@href")).extract()[0], country=country)
                    page_num += 1
                else:
                    break
        else:
            boxes = Selector(text=page.content).xpath(".//*[@class='a-section askTeaserQuestions']/div[@class='a-fixed-left-grid a-spacing-base']")
            for box in boxes:
                answer_url, answer, answer_user, qa_time, answer_quan = None, None, None, None, None
                vote = int(box.xpath(".//ul[@class='vote voteAjax']/li[2]/span[1]/text()").extract()[0])
                question = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-small']//a[@class='a-link-normal']/text()").extract()[0]
                try:
                    qa_time = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[2]/text()").extract()[-1:][0]
                except:
                    pass
                try:
                    if box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract():
                        answer = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract()[0]
                    else:
                        answer = " ".join(box.xpath(".//span[@class='askLongText']/text()").extract()).strip()
                except:
                    pass
                try:
                    answer_user = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[2]/text()").extract()[0]
                except:
                    pass
                try:
                    answer_quan = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-section a-spacing-none a-spacing-top-mini']/a/text()").extract()[0]
                    answer_quan = re.search(r'\d+', answer_quan).group(0)
                except:
                    pass
                try:
                    answer_url = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-section a-spacing-none a-spacing-top-mini']/a/@href").extract()[0]
                    answer_url = country_url(country)[:-1] + answer_url
                except:
                    pass
                if answer_user is None:
                    pass
                elif answer_user == qa_time:
                    if country in ['us', 'uk', 'ca', 'de']:
                        name_date = re.split(' on |By |Von | am ', answer_user)
                    elif country == 'it':
                        name_date = re.split(' in |Da ', answer_user)
                    elif country == 'fr':
                        name_date = re.split(' le |Par ', answer_user)
                    elif country == 'es':
                        name_date = re.split(' el |Por ', answer_user)
                    elif country == 'jp':
                        name_date = re.split('投稿者: |、投稿日: ', answer_user)
                    answer_user = name_date[1]
                    qa_time = name_date[2]
                else:
                    answer_user = re.split(' on |By |Von | am ', answer_user)[-1:][0]
                    qa_time = re.split(' on |By |Von | am ', qa_time)[-1:][0]
                if answer_url and answer_quan:
                    qa_collection[question] = {'vote': vote, 'question': question, 'qa_time': qa_time, 'answer': answer, 'answer_user': answer_user, 'answer_quan': answer_quan, 'answer_url': answer_url}
                elif answer:
                    qa_collection[question] = {'vote': vote, 'question': question, 'qa_time': qa_time, 'answer': answer, 'answer_user': answer_user}
        for qa in qa_collection:
            try:
                num = qa_collection[qa]['answer_quan']
            except:
                num = "1"
            try:
                # if qa_collection[qa]['answer_url']:
                QA_detail.objects.get_or_create(product=product, vote=qa_collection[qa]['vote'], question=qa_collection[qa]['question'], qa_time=qa_collection[qa]['qa_time'],
                                                answer=qa_collection[qa]['answer'], answer_person=qa_collection[qa]['answer_user'], num=num, answer_url=qa_collection[qa]['answer_url'])
            except:
                QA_detail.objects.get_or_create(product=product, vote=qa_collection[qa]['vote'], question=qa_collection[qa]['question'], qa_time=qa_collection[qa]['qa_time'],
                                                answer=qa_collection[qa]['answer'], answer_person=qa_collection[qa]['answer_user'], num=num)
        # except:
        #     pass
        # report = GlucoseCsvReport(product)
        # report.email(product.user, 'subject', 'message')
    except Exception as e:
        dt = datetime.now(pytz.utc) + timedelta(seconds=40)
        self.retry(eta=dt, exc=e, max_retries=2)
def parse_detail(self, response):
    if response.meta:
        response = response.meta['res']
    totalpage = int(response.xpath('//label[@class="ui-label"]/text()').extract_first().split('/')[-1])
    for page in range(totalpage):
        # # Capacity
        # capacities = response.xpath('//span[@class="first"]')
        # # Colour
        # colors = response.xpath('//span[contains(string(.),"Color")]')
        # # Logistics
        # Logistics = response.xpath('//span[contains(string(.),"Logistics")]')
        infos = response.xpath('//div[@class="user-order-info"]')
        # Date and time
        datetimes = response.xpath('//dl[@class="buyer-review"]/dd[@class="r-time"]/text()').extract()
        # Country
        countries = response.xpath('//div[@class="user-country"]/b/text()').extract()
        # Images
        image_urls = response.xpath('//ul[@class="util-clearfix"]/li/img/@src').extract()
        for i in range(len(countries)):
            item = Item()
            # # Capacity
            # capacity = capacities[i].xpath('string(.)').extract_first()
            # capacity = capacity.replace('\\t', '').replace('\\n', '')
            # # capacity = re.search("\d+-\d+ml", capacity).group(0)
            # # Colour
            # color = colors[i].xpath('string(.)').extract_first()
            # color = color.split(":")[-1].strip()
            # # Logistics
            # logistics = Logistics[i].xpath('string(.)').extract_first()
            # logistics = logistics.split(":")[-1].strip()
            spans = infos[i].xpath('span')
            item[Item.INFOS] = {}
            for span in spans:
                key = span.xpath('strong/text()').extract_first().replace(':', '')
                value = span.xpath('string(.)').extract_first()
                value = value.split(":")[-1].replace('\t', '').replace('\n', '').replace(' ', '')
                item[Item.INFOS][key] = value
            # Date and time
            datetime = datetimes[i]
            # Country
            country = countries[i]
            # Images
            if i == 0:
                item[Item.IMAGE_URLS] = image_urls
            else:
                item[Item.IMAGE_URLS] = None
            # item[Item.CAPACITY] = capacity
            # item[Item.COLOR] = color
            # item[Item.LOGISTICS] = logistics
            item[Item.DATETIME] = datetime
            item[Item.COUNTRY] = country
            item[Item.PRODUCT_ID] = self.product_id
            yield item
        if response.xpath('//a[contains(text(),"Next")]'):
            time.sleep(3)
            self.driver.find_element_by_xpath(
                '//div[@class="ui-pagination ui-pagination-front ui-pagination-pager util-right"]/a[contains(text(),"Next")]'
            ).click()
            response = self.driver.page_source
            response = Selector(text=response)
        else:
            break
    self.driver.close()
def __init__(self):
    self.driver = webdriver.Firefox()
    self.report_selector = Selector(text="")
def parse(self, response):
    self.log('Hi, this is: %s' % response.url)
    hxs = Selector(response)
    dls = hxs.xpath('//div[@id = "a-bit-more-about"]/dl')
    item = FlickrProfileItem()
    # item['_id'] = self.get_username(response.url)
    _id = self.get_username(response.url)
    print(_id)
    item["_id"] = _id
    for dl in dls:
        if dl.xpath('dt/text()').extract()[0] == "Name:":
            given_name = ""
            family_name = ""
            try:
                given_name = dl.xpath('dd/span[@class="given-name"]/text()').extract()[0]
            except:
                pass
            else:
                print('given_name:', given_name)
                # item['given_name'] = given_name
            try:
                family_name = dl.xpath('dd/span[@class = "family-name"]/text()').extract()[0]
            except:
                pass
            else:
                print('family_name:', family_name)
                # item['family_name'] = family_name
            item["name"] = given_name + " " + family_name
        if dl.xpath('dt/text()').extract()[0] == "Joined:":
            joined = dl.xpath('dd/text()').extract()[0]
            print('joined time:', joined)
            item['joined'] = joined
        if dl.xpath('dt/text()').extract()[0] == "Hometown:":
            home = dl.xpath('dd/text()').extract()[0]
            print('hometown:', home)
            item['hometown'] = home
        if dl.xpath('dt/text()').extract()[0] == "Currently:":
            try:
                locality = dl.xpath('dd/span[@class = "adr"]/span[@class = "locality"]/text()').extract()[0]
            except:
                pass
            else:
                print('locality:', locality)
                item['location'] = locality
            try:
                country_name = dl.xpath('dd/span[@class = "adr"]/span[@class = "country-name"]/text()').extract()[0]
            except:
                pass
            else:
                print('country-name:', country_name)
                item['country'] = country_name
        if dl.xpath('dt/text()').extract()[0] == "I am:":
            gender = dl.xpath('dd/text()').extract()[0].strip()
            print('gender:', gender)
            item['gender'] = gender
        if dl.xpath('dt/text()').extract()[0] == "Occupation:":
            occupation = dl.xpath('dd/text()').extract()[0]
            print('occupation:', occupation)
            item['occupation'] = occupation
        if dl.xpath('dt/text()').extract()[0] == "Website:":
            websitename = dl.xpath('dd/a/text()').extract()[0]
            websiteurl = dl.xpath('dd/a/@href').extract()[0]
            print('website:', websitename, websiteurl)
            item['websitename'] = websitename
            item['websiteurl'] = websiteurl
    yield item
def parse_item(self, response):
    items = []
    sel = Selector(response)
    name_list = sel.xpath('//td[@class="zwmc"]/div/a').xpath(
        'string(.)').extract()
    link_list = sel.xpath('//td[@class="zwmc"]/div/a/@href').extract()
    firm_list = sel.xpath('//td[@class="gsmc"]/a').xpath(
        'string(.)').extract()
    salary_list = sel.xpath('//td[@class="zwyx"]').xpath(
        'string(.)').extract()
    workplace_list = sel.xpath('//td[@class="gzdd"]').xpath(
        'string(.)').extract()
    pubdate_list = sel.xpath('//td[@class="gxsj"]/span/text()').extract()
    firmsize_list = sel.xpath(
        '//li[@class="newlist_deatil_two"]/span[3]/text()').extract()
    workreq_list = sel.xpath(
        '//li[@class="newlist_deatil_two"]/span[4]/text()').extract()
    details_list = sel.xpath('//li[@class="newlist_deatil_last"]').xpath(
        'string(.)').extract()
    while name_list:
        item = ZhilianItem()
        try:
            item['job_name'] = name_list.pop()
            item['link'] = link_list.pop()
            item['firm_name'] = firm_list.pop()
            item['salary'] = salary_list.pop()
            item['working_place'] = workplace_list.pop()
            item['pub_date'] = pubdate_list.pop()
            item['firm_size'] = firmsize_list.pop()
            # the original referenced an undefined edureq_list here;
            # workreq_list is the list actually extracted above
            item['work_requirement'] = workreq_list.pop()
            item['job_describe'] = details_list.pop()
        except IndexError:
            pass
        items.append(item)
    return items
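Popping parallel lists like this misaligns every later field as soon as one XPath matches fewer nodes than the others. A safer sketch of the same extraction using zip(), shown here for four of the fields; zip() truncates at the shortest list, so a missing cell drops a row instead of shifting all later fields by one:

def parse_item_zipped(self, response):
    sel = Selector(response)
    rows = zip(
        sel.xpath('//td[@class="zwmc"]/div/a').xpath('string(.)').extract(),
        sel.xpath('//td[@class="zwmc"]/div/a/@href').extract(),
        sel.xpath('//td[@class="gsmc"]/a').xpath('string(.)').extract(),
        sel.xpath('//td[@class="zwyx"]').xpath('string(.)').extract(),
    )
    for job_name, link, firm_name, salary in rows:
        item = ZhilianItem()
        item['job_name'] = job_name
        item['link'] = link
        item['firm_name'] = firm_name
        item['salary'] = salary
        yield item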
class ReportSpider(scrapy.Spider):
    name = "reports"
    start_urls = ['https://h1.sintheticlabs.com/']

    def __init__(self):
        self.driver = webdriver.Firefox()
        self.report_selector = Selector(text="")

    def parse(self, response):
        report_urls = response.xpath('//tbody/tr/td[3]/a/@href').extract()
        item = HackeroneItem()
        for report_url in report_urls[1:]:
            self.driver.get(report_url)
            # Sleep for a few moments so that the page loads fully,
            # otherwise the content won't come up.
            time.sleep(2)
            self.report_selector = Selector(text=self.driver.page_source)
            item = self.parseReport()
            if item is not None:
                yield item
        # Everything is over and the browser should be quit.
        self.driver.quit()

    def parseReport(self):
        print("Report called")
        # Check whether the report is a duplicate.
        if self.report_selector.xpath(
                '//i[contains(@class, "duplicate")]').extract_first() is not None:
            self.log("Found a duplicate report")
            return None
        hid = self.get_hid()
        reward = self.get_reward()
        submission_date = self.get_submission_date()
        ending_date = self.get_end_date()
        vuln_type = self.get_vuln_type()
        severity = self.get_severity()
        item = HackeroneItem()
        item['hid'] = hid
        item['reward'] = reward
        item['submission_date'] = submission_date
        item['resolved_date'] = ending_date
        item['vuln_type'] = vuln_type
        item['severity'] = severity
        return item

    def get_hid(self):
        return self.report_selector.xpath(
            "//div[@class='report-status']/a/text()[2]").extract_first()

    def get_reward(self):
        reward = self.report_selector.xpath(
            "//tr[contains(@class, 'bounty-amount')]/td/text()").extract()
        if len(reward) == 0:
            reward = 0
        else:
            # Cast to float first so that decimal amounts are handled.
            reward = float(reward[0][1:].replace(',', ''))
            reward = int(reward)
        return reward

    def get_submission_date(self):
        return self.report_selector.xpath(
            "//span[contains(@class,'spec-timestamp')]/span/@title").extract_first()

    def get_end_date(self):
        ending_date = self.report_selector.xpath(
            "//div[contains(@data-activity, 'BugResolved')]/div[4]/div/span/@title"
        ).extract_first()
        if ending_date is None:
            ending_date = self.report_selector.xpath(
                "//div[contains(@data-activity, 'BugInformative')]/div[4]/div/span/@title"
            ).extract_first()
        # TODO Construct a better way to find reasons for a non-existent end date.
        return ending_date

    def get_vuln_type(self):
        vuln_type = self.report_selector.xpath(
            "//tr[contains(@class, 'vuln-types')]/td[2]/text()").extract()
        return ','.join(vuln_type)

    def get_severity(self):
        return self.report_selector.xpath(
            "//span[contains(@class, 'severity')]/text()").extract_first()
<li class="item-55"><a id='i55' href="link.html" class='ding'>first item</a></li>
<li class="item-66"><a id='i66' href="llink.html" class='ding'>first item</a></li>
<li class="item-77"><a href="llink2.html">second item<span>vv</span></a></li>
</ul>
</div>
</body>
<ul>
<li class="item-5"><a id='i5' href="link.html" class='ding'>first item</a></li>
<li class="item-6"><a id='i6' href="llink.html" class='ding'>first item</a></li>
<li class="item-7"><a href="llink2.html">second item<span>vv</span></a></li>
</ul>
</html>
"""

# build the response object
response = HtmlResponse(url='', body=html, encoding='utf-8')
selector = Selector(response=response)

# // searches from the document root
# grab every a tag
temp = selector.xpath('//a')
# match everything that fits div/div
temp = selector.xpath('//div/div')
# child tags
# matches nothing, because a is not a direct child of html
temp = selector.xpath('a')
# relative vs. absolute position
# take the first body tag, then look for ul tags under it
x = selector.xpath('body')[0]
# ./ul — the ul children relative to this tag
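Continuing with the x built just above, the query forms differ on a nested selector in ways that are easy to trip over; this side-by-side is standard Scrapy/parsel behaviour:

temp = x.xpath('ul')     # relative: direct ul children of body (none here)
temp = x.xpath('./ul')   # './' makes the relative scope explicit
temp = x.xpath('.//ul')  # './/' searches every descendant of body
temp = x.xpath('//ul')   # '//' restarts from the document root, so it also
                         # finds the stray ul that sits outside body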
def test_parsing(self):
    parser = EPCIZoneParser('', 2013, '', '')
    data = parser.parse(Selector(self.response))
    for key, val in self.data.items():
        self.assertAlmostEqual(data[key], val)
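A test like this needs a canned response and expected values. A minimal sketch of a setUp that supplies both from a local HTML fixture; the fixture path and the expected dictionary are assumptions for illustration, not values from the original suite:

import unittest
from scrapy.http import HtmlResponse


class EPCIZoneParserTest(unittest.TestCase):
    def setUp(self):
        # hypothetical fixture file saved from the target site
        with open('fixtures/epci_zone_2013.html', 'rb') as f:
            self.response = HtmlResponse(url='http://example.com',
                                         body=f.read())
        # illustrative expected values for assertAlmostEqual
        self.data = {'population': 12345.0}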
def _extract_img_requests(self, response, tag, counter):
    r = []
    siteList = []
    ObjectList = dict()
    externalSites = []
    # uniqueExternalSites = []
    if isinstance(response, HtmlResponse):
        tag = 'I'
        # imgcount = 0
        counterValueImg = counter
        sites = Selector(response).xpath("//img/@src").extract()
        # for site in sites:
        #     imgcount = imgcount + 1
        # logging.info('imgcount', imgcount)
        # logwr = csv.writer(logFile, delimiter=',', quotechar=' ', quoting=csv.QUOTE_MINIMAL)
        for item in sites:
            if isinstance(item, unicode):
                item = item.encode('utf-8')
                siteList.append(item)
            else:
                siteList.append(item)
        # wr.writerow(siteList)
        externalImageCount, InternalImageCount, uniqueExternalSites, externalSites, secondlevelurl = _extract_object_count(
            siteList)
        Imagecount = len(siteList)
        # lock.acquire()
        # ObjectList['url'] = response.url
        # ObjectList['counter'] = counterValueImg
        # ObjectList['Imagecount'] = Imagecount
        # ObjectList['InternalImageCount'] = InternalImageCount
        # ObjectList['ExternalImageCount'] = externalImageCount
        # logwr.writeheader()
        logwr.writerow({
            'url': response.url,
            'counter': counterValueImg,
            'InternalImageCount': InternalImageCount,
            'ExternalImageCount': externalImageCount,
            'UniqueExternalSites': uniqueExternalSites,
            'ExternalSites': externalSites,
            'secondlevelurl': secondlevelurl
        })
        # logwr.writerow([ObjectList])
        # lock.release()
        # wr.writerow([Imagecount])
        # logwr.writerow([imgcount])
        # Imagecount = str(len(siteList))
        # logwr.writerow([siteList])
        r.extend(
            Request(site,
                    callback=self.parse,
                    method='HEAD',
                    meta={'tagType': tag, 'counter': counterValueImg})
            for site in siteList
            if site.startswith("http://") or site.startswith("https://"))
    return r
def parse(self, response):
    sel = Selector(response)
    sites = sel.xpath(
        '//table[@bgcolor="#808080"]/table/tr/td[@width="33%"]/a/@href'
    ).extract()
    for site in sites:
        yield scrapy.Request(
            ''.join(["http://ts300.5156edu.com/sc300/", site]),
            callback=self.parse_dep2)
def b(url, xpath1, xpath2):
    # headers must be passed by keyword (the second positional argument of
    # requests.get is params), and Selector needs the body text, not the
    # Response object itself
    selector = Selector(text=requests.get(url, headers=headers, verify=False).text)
    # returning both extractions is an assumption; the original discarded them
    return selector.xpath(xpath1).extract(), selector.xpath(xpath2).extract()
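A call site for the corrected helper might look like the following; the URL, header set, and XPaths are placeholders, not values from the original script:

headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder headers used by b()
titles, links = b('https://example.com', '//h1/text()', '//a/@href')
print(titles, links)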
def _set_title(self, page, response):
    if isinstance(response, HtmlResponse):
        title = Selector(response).xpath("//title/text()").extract()
        if title:
            page['title'] = title[0]
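The extract-then-guard idiom above can be collapsed with extract_first(), which returns None (or a supplied default) when nothing matches; an equivalent sketch:

def _set_title(self, page, response):
    if isinstance(response, HtmlResponse):
        # extract_first() avoids indexing into a possibly empty list
        title = Selector(response).xpath("//title/text()").extract_first()
        if title is not None:
            page['title'] = title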
# from selenium.webdriver.common.keys import Keys
from selenium.webdriver import Chrome
from scrapy.selector import Selector
# from bs4 import BeautifulSoup
import csv
import time
# import re

browser = Chrome('/Users/Tim/PyCharmProjects/learning/chromedriver')
browser.get('https://maplelegends.com/ranking/monsterbook?page=1&search=')
with open('MonsterbookRanking.csv', 'w', newline='') as file:
    filewriter = csv.writer(file, quotechar='|', quoting=csv.QUOTE_MINIMAL)
    filewriter.writerow(['Rank', 'IGN', 'Fame', 'Level', 'Cards', 'Class'])
    for y in range(500):
        time.sleep(5)
        html = Selector(text=browser.execute_script(
            "return document.documentElement.outerHTML;"))
        # extract the cell and class-name lists once per page instead of
        # re-querying for every column
        cells = html.xpath('//tr/td/b/text()').extract()
        jobs = html.xpath(
            '//tr/comment()[contains(., "job")]/following-sibling::*[1]/text()'
        ).extract()
        for x in range(5):
            filewriter.writerow(cells[5 * x:5 * x + 5] + [jobs[x]])
        browser.find_element_by_xpath('//li/a[contains(.,"Next")]').click()
browser.quit()
def get_total_page_numbers(self, response):
    sel = Selector(response)
    response_page_list = sel.xpath(
        "//div[@class='pager']/table/tr/td/text()").extract()
    response_page = response_page_list[0].strip().split('/')[1][1:-1]
    return int(response_page)
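With the total in hand, a spider would typically fan out one request per page. A minimal sketch, assuming the site paginates via a page query parameter and a parse_page callback, both hypothetical here (and assuming import scrapy at module level):

def parse(self, response):
    total = self.get_total_page_numbers(response)
    for page in range(1, total + 1):
        # the 'page' parameter and parse_page callback are assumptions
        yield scrapy.Request('%s?page=%d' % (response.url, page),
                             callback=self.parse_page)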
def parse_salary(self, response):
    hxs = Selector(response)
    items = hxs.xpath("//table[@id='salaryDescTable']/tr[@data-url]")
    for item in items:
        salary = Salary()
        name = first_item(item.xpath('td/a/text()').extract())
        if name.endswith(')'):
            ix = name.rfind('(')
            if ix == -1:
                salary['job_name'] = name
                salary['job_count'] = 0
            else:
                salary['job_name'] = name[0:ix]
                salary['job_count'] = int(name[ix + 1:-2])
        else:
            salary['job_name'] = name
            salary['job_count'] = 0
        salary['average'] = first_item(
            item.xpath("td[@class='s-d-average']/text()").extract())
        salary['average'] = salary['average'].replace('¥', '').replace(',', '')
        salary['company_logo'] = first_item(
            hxs.xpath("//a[@ka='com-logo']/img/@src").extract())
        salary['src_url'] = self.create_url(
            first_item(item.xpath('td/a/@href').extract()))
        company_url = first_item(
            hxs.xpath("//a[@ka='com-logo']/@href").extract())
        if company_url is not None:
            salary['company_url'] = self.create_url(company_url)
            start = company_url.find('gso')
            end = company_url.find('.html')
            salary['company_code'] = company_url[start:end]
        else:
            salary['company_url'] = ''
            salary['company_code'] = ''
        co_info = hxs.xpath("//div[@class='co_info']")
        salary['company_name'] = first_item(
            co_info.xpath("p[@id='companyName']/@data-companyname").extract())
        salary['praise_rate'] = first_item(
            co_info.xpath("div[@class='msgs']/strong/text()").extract())
        other = co_info.xpath("p[@class='params grey_99 mt5']//text()").extract()
        salary['industry'] = ''
        salary['city_name'] = ''
        salary['company_type'] = ''
        salary['company_scale'] = ''
        if other is not None:
            other_str = ''
            for ix in other:
                other_str += ix
            other_array = other_str.split('|')
            if len(other_array) > 0:
                salary['industry'] = other_array[0]
            if len(other_array) > 1:
                salary['city_name'] = other_array[1]
            if len(other_array) > 2:
                salary['company_scale'] = other_array[2]
            if len(other_array) > 3:
                salary['company_type'] = other_array[3]
        id = first_item(item.xpath('@id').extract())
        if id != '':
            id += '_C'
            ul = hxs.xpath(
                "//table[@id='salaryDescTable']/tr[@id='%s']/td/div/ul" % id)
            if ul:
                salary['high'] = first_item(
                    ul.xpath("li[@class='s-d-low']/text()").extract())
                salary['low'] = first_item(
                    ul.xpath("li[@class='s-d-high']/text()").extract())
                salary['mark'] = first_item(
                    ul.xpath("li[@class='s-d-mark']/a/em/text()").extract())
                salary['high'] = salary['high'].replace('¥', '').replace(
                    ',', '').lstrip(' ')
                salary['low'] = salary['low'].replace('¥', '').replace(
                    ',', '').lstrip(' ')
        yield salary
    # handle the next page
    link = first_item(
        hxs.xpath("//div[@class='page_wrap']/div/a[@class='p_next']/@href"
                  ).extract())
    if link is not None:
        yield Request(url=self.create_url(link),
                      meta={'use_proxy': True},
                      dont_filter=True,
                      callback=self.parse_salary)
def fun(names, codes, urls):
    d = {}
    for name, code, url in zip(names, codes, urls):
        d[name] = {}
        d[name]['code'] = code
        d[name]['url'] = url
    return d


if __name__ == '__main__':
    req = requests.get(url, headers=headers, verify=False)
    s = Selector(text=req.text)
    # # organization type
    # organization_name = s.xpath('//div[@class="folder-body"]/div[1]/div/a/text()').extract()
    # organization_url = s.xpath('//div[@class="folder-body"]/div[1]/div/a/@href').extract()
    # organization_code = [re.search('companyType=(.*)', i).group(1) for i in organization_url]
    # organization = fun(organization_name, organization_code, organization_url)
    # config = {'organization': organization}
    # json.dump(config, open("config.json", "w"), ensure_ascii=False)
    # # province
    # Province = {}
    # Provinces_name = s.xpath('//div[@class="folder-body"]/div[2]/div/a/text()').extract()
    # Province_url = s.xpath('//div[@class="folder-body"]/div[2]/div/a/@href').extract()
    # Province_code = [re.search('base=(.*)', i).group(1) for i in Province_url]
    # Province = fun(Provinces_name, Province_code, Province_url)
    # config = {'organization': organization, 'Province': Province}
    # json.dump(config, open("config.json", "w"), ensure_ascii=False)
# browser.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[6]/a').click()

# OSChina blog: executing JavaScript from selenium
# browser.get("https://www.oschina.net/blog")
# import time
# time.sleep(5)
# for i in range(3):
#     browser.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;")
#     time.sleep(3)

# configure chromedriver not to load images
# chrome_opt = webdriver.ChromeOptions()
# prefs = {"profile.managed_default_content_settings.images": 2}
# chrome_opt.add_experimental_option("prefs", prefs)
# browser = webdriver.Chrome(executable_path="./chromedriver.exe", chrome_options=chrome_opt)
# browser.get("https://www.oschina.net/blog")

# phantomjs is a headless browser; its performance degrades badly when run
# with multiple processes
browser = webdriver.PhantomJS(
    executable_path="C:/spiderDriver/phantomjs-2.1.1-windows/bin/phantomjs.exe")
browser.get(
    "https://detail.tmall.com/item.htm?spm=a230r.1.14.3.yYBVG6&id=538286972599&cm_id=140105335569ed55e27b&abbucket=15&sku_properties=10004:709990523;5919063:6536025")
t_selector = Selector(text=browser.page_source)
print(t_selector.css(".tm-price::text").extract())
# print(browser.page_source)
browser.quit()
def parse_player(self, response):
    """Scrape a player's page."""
    player_name = response.xpath('//h1/text()').extract_first()
    position = response.xpath(
        '//div[@id="meta"]/div[2]/p[1]/text()').extract()[1].strip()
    #### BATTING STATS ####
    for row in response.xpath(
            '//table[@id="batting_standard"]/tbody/tr[@class="full"]'):
        year = row.xpath('./th[@data-stat="year_ID"]/text()').extract_first()
        age = row.xpath('./td[@data-stat="age"]/text()').extract_first()
        team = row.xpath('./td[@data-stat="team_ID"]/a/text()').extract_first()
        pa = row.xpath('./td[@data-stat="PA"]/text()').extract_first()
        hr = row.xpath('./td[@data-stat="HR"]/text()').extract_first()
        rbi = row.xpath('./td[@data-stat="RBI"]/text()').extract_first()
        avg = row.xpath('./td[@data-stat="batting_avg"]/text()').extract_first()
        obp = row.xpath('./td[@data-stat="onbase_perc"]/text()').extract_first()
        slg = row.xpath(
            './td[@data-stat="slugging_perc"]/text()').extract_first()
        ops = row.xpath(
            './td[@data-stat="onbase_plus_slugging"]/text()').extract_first()
        # Deal with that godawful commented-out HTML.
        commented_text = response.xpath('//comment()').re(regex)[16]
        new_selector = Selector(text=commented_text, type='html')
        year_row = new_selector.xpath('//tr[@id="batting_value.' + year + '"]')
        war = year_row.xpath('./td[@data-stat="WAR"]/text()').extract_first()
        salary = year_row.xpath(
            './td[@data-stat="Salary"]/text()').extract_first()
        stats = {
            'player_name': player_name,
            'position': position,
            'year': year,
            'age': age,
            'team': team,
            'pa': pa,
            'hr': hr,
            'rbi': rbi,
            'avg': avg,
            'obp': obp,
            'slg': slg,
            'ops': ops,
            'war': war,
            'salary': salary
        }
        yield stats
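The commented-out-HTML workaround above is worth isolating: some sites ship whole tables inside HTML comments, invisible to a normal XPath query, so the comment text is re-parsed with a second Selector. A self-contained sketch of the trick; the regex here is an assumption about what the module-level regex in the spider might capture:

from scrapy.selector import Selector

html = '<div><!-- <table><tr><td data-stat="WAR">2.5</td></tr></table> --></div>'
sel = Selector(text=html)
# pull the raw markup out of the comment node, then parse it as HTML
commented = sel.xpath('//comment()').re_first(r'<!--(.*)-->')
inner = Selector(text=commented, type='html')
print(inner.xpath('//td[@data-stat="WAR"]/text()').extract_first())  # '2.5'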
def category_parse(self, response):
    n_cat = response.meta['n_cat']
    categories = response.meta['categories']
    pag = 1
    parte = url_part(response.url)
    item_xpath = (
        '//*[@class= "vtex-flex-layout-0-x-flexColChild vtex-flex-layout-0-x-flexColChild--search-result-content pb0"]'
        '//*[@class="vtex-search-result-3-x-galleryItem vtex-search-result-3-x-galleryItem--normal vtex-search-result-3-x-galleryItem--grid-3 pa4"]')
    while True:
        print('\n', response.url + parte + str(pag), '\n')
        check_connection()
        driver.get(response.url + parte + str(pag))
        sleep(3)
        driver.execute_script('document.body.style.MozTransform = "scale(0.2)";')
        sleep(.5)
        driver.execute_script('document.body.style.MozTransformOrigin = "0 0";')
        sleep(1)
        button_test_trys = 0
        button_bool = False
        while button_test_trys <= 20:
            cat_page_sel = Selector(text=driver.page_source)
            button_test = cat_page_sel.xpath(
                '//*[@class="vtex-button bw1 ba fw5 v-mid relative pa0 lh-solid br2 min-h-small t-action--small bg-action-primary b--action-primary c-on-action-primary hover-bg-action-primary hover-b--action-primary hover-c-on-action-primary pointer "]')
            if button_test != []:
                button_bool = True
                break
            else:
                button_test_trys += 1
                sleep(2)
        if button_bool:
            ver_mas_prods = 0
            while ver_mas_prods < 10:
                cat_page_sel = Selector(text=driver.page_source)
                n_prods = len(cat_page_sel.xpath(item_xpath))
                if n_prods != 0 and n_prods >= 11:
                    break
                else:
                    ver_mas_prods += 1
                    driver.execute_script("window.scrollTo(0, window.scrollY + 3)")
                    sleep(1)
                    driver.execute_script("window.scrollTo(0, window.scrollY - 3)")
                    sleep(1)
            driver.execute_script('document.body.style.MozTransform = "scale(0.005)";')
            driver.execute_script('document.body.style.MozTransformOrigin = "0 0";')
            sleep(2)
            # import pdb; pdb.set_trace()
            cat_page_sel = Selector(text=driver.page_source)
            n_prods = len(cat_page_sel.xpath(item_xpath))
            prods = cat_page_sel.xpath(item_xpath)
            cat_name = cat_page_sel.xpath(
                './/*[@class= "vtex-search-result-3-x-galleryTitle--layout t-heading-1"]//text()'
            ).extract()[-1]
            for prod in prods:
                prod_name = prod.xpath(
                    './/*[@class= "vtex-product-summary-2-x-productBrand vtex-product-summary-2-x-brandName t-body"]//text()'
                ).extract_first()
                normal_price = prod.xpath(
                    './/*[@class= "vtex-product-price-1-x-listPriceValue vtex-product-price-1-x-listPriceValue--summary strike"]//text()'
                ).extract()
                if normal_price == []:
                    normal_price = prod.xpath(
                        './/*[@class= "vtex-product-price-1-x-sellingPriceValue vtex-product-price-1-x-sellingPriceValue--summary"]//text()'
                    ).extract()
                if normal_price != []:
                    normal_price = ' '.join(normal_price)
                disc_price = prod.xpath(
                    './/*[@class= "vtex-product-price-1-x-currencyContainer vtex-product-price-1-x-currencyContainer--summary"]//text()'
                ).extract()
                if disc_price != []:
                    disc_price = ' '.join(disc_price)
                image_url = prod.xpath(
                    './/*[@class= "vtex-product-summary-2-x-imageNormal vtex-product-summary-2-x-image"]/@src'
                ).extract_first()
                print('\n', '#' * 15, 'Product result', '#' * 15, '\n')
                print('Category: ', cat_name, '\n',
                      '\n\tProduct:\t', prod_name,
                      '\n\tNormal price:\t', normal_price,
                      '\n\tDiscounted price:\t', disc_price, '\n')
                yield {
                    'cat_name': cat_name,
                    'prod_name': prod_name,
                    'normal_price': normal_price,
                    'disc_price': disc_price,
                    'image_url': image_url
                }
            pag += 1
            continue
        else:
            break
    print('\n', 'Found a total of', n_prods)
    # import pdb; pdb.set_trace()
    if n_cat < len(categories) - 1:
        n_cat += 1
        check_connection()
        yield Request(url='http://olimpica.com/',
                      callback=self.parse,
                      meta={'n_cat': n_cat, 'categories': categories},
                      dont_filter=True)
    else:
        driver.quit()
def parse_category(self, response):
    sel = Selector(response)
    item = DicksItem()
    pname = sel.xpath("//div[@class='product_name']/text()").extract()[0]
    pname = pname.encode('utf-8')
    item['Brand_Name'] = "Nelco Sports"
    item["Product_Image_Description_1"] = "Buy " + pname + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
    item["MetaDescription"] = "Get your hands on the " + pname + ". Buy it Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
    item["TitleTag"] = "Buy the " + pname + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
    pcode = sel.xpath("//input[@type='checkbox']/@value").extract()[0] + "NELPRD"
    item["Product_Description"] = sel.xpath(
        "//td[@class='form_text_normal']/text()").extract()
    item["Product_Description"] = ''.join(
        item["Product_Description"]).encode('utf-8')
    sp = sel.xpath("//td[@class='guest']/text()").extract()
    sp = min(float(x) for x in sp)
    mrp = sel.xpath("//tr/td[@class='form_text'][last()-1]/text()").extract()
    mrp = min(float(x) for x in mrp)
    sortorder = -150
    trackinventory = 'By Option'
    image = ("http://www.nelcosport.com/" +
             sel.xpath('//div[@class="enlarge"]/a/@href').extract()[0])
    category = sel.xpath("//h1[@class='heading1']/a/text()").extract()[0]
    row = ("Product", "", pname, item["Brand_Name"], mrp, mrp,
           sp,  # price
           item["Product_Description"], pcode, "NELCOSPORTS", category, pname,
           "15-23 Working days", "100", "N", sortorder,
           item["MetaDescription"], item["TitleTag"],
           item["Product_Image_Description_1"], "Y", trackinventory, "1", image)
    mywriter.writerow(row)
    variants = {}
    variants['sku'] = sel.xpath("//input[@type='checkbox']/@value").extract()
    x = sel.xpath("//tr/td[2]/text()").extract()
    variants['weight'] = ''
    for w in x:
        if 'Weight' in w:
            variants['weight'] = sel.xpath(
                "//tr/td[@class='form_text'][2]/text()").extract()
            break
    variants['size'] = sel.xpath(
        '//tr/td[@bgcolor="#FFFFFF"][last()]/text()').extract()
    variants['price'] = sel.xpath('//td[@class="guest"]/text()').extract()
    count = 0
    for i in range(len(variants['size'])):
        variants['size'][i] = ''.join(variants['size'][i]).encode('utf-8')
        if variants['size'][0] == variants['size'][i]:
            count = count + 1
    if len(variants['price']) > 1:
        for i in range(len(variants['price'])):
            if variants['weight']:
                if variants['size'][i].strip() == '' or count == len(variants['price']):
                    row = ("Rule", '',
                           "[S]Weight =" + variants['weight'][i] + "KG", '',
                           '[FIXED]' + variants['price'][i],
                           '[FIXED]' + variants['price'][i],
                           '[FIXED]' + variants['price'][i], '',
                           variants['sku'][i], 'NELCOSPORTS', '', '', '', '100')
                else:
                    row = ("Rule", '',
                           "[S]Weight =" + variants['weight'][i] + "KG" +
                           ",[S]Size=" + variants['size'][i].strip('Size'), '',
                           '[FIXED]' + variants['price'][i],
                           '[FIXED]' + variants['price'][i],
                           '[FIXED]' + variants['price'][i], '',
                           variants['sku'][i], 'NELCOSPORTS', '', '', '', '100')
            else:
                row = ("Rule", '',
                       "[S]Size=" + variants['size'][i].strip('Size'), '',
                       '[FIXED]' + variants['price'][i],
                       '[FIXED]' + variants['price'][i],
                       '[FIXED]' + variants['price'][i], '',
                       variants['sku'][i], 'NELCOSPORTS', '', '', '', '100')
            mywriter.writerow(row)