Example #1
    def parse_user(self, response):
        item = MFWItem()

        item['uid'] = response.meta['uid']
        item['name'] = response.xpath(
            '//div[@class="MAvaName"]/text()').extract_first()
        item['level'] = int(response.xpath(
            '//span[@class="MAvaLevel flt1"]/a/@title').extract_first().split('.')[-1])
        if item['level'] <= 3:
            return
        item['tags'] = response.xpath(
            '//div[@class="its_tags"]//i[contains(@class, "on")]/../@title').extract()
        item['attention'] = [int(i) for i in response.xpath(
            '//div[@class="MAvaMore clearfix"]//a/text()').extract()]
        item['groups'] = response.xpath(
            '//div[@class="MGroupDetail"]//a[@class="name"]/text()').extract()
        item['dynamic'] = response.xpath(
            '//span[@class="time"]/text()').extract()
        item['download'] = []
        infos = response.xpath('//div[@class="common_block relative_info"]/p')
        for info in infos:
            if u'刚刚下载了' in ''.join(info.xpath('text()').extract()):

                item['download'].append({'time': info.xpath(
                    'span[@class="time"]/text()').extract_first(), 'name': info.xpath('a/text()').extract()[-1]})

        item['note'] = {}
        item['path'] = []
        item['review'] = []
        item['together'] = []
        note = response.xpath(u'//a[@title="TA的游记"]/@href').extract_first()
        req = Request(urljoin(response.url, note), callback=self.parse_note)
        req.meta['item'] = item
        yield req
Example #2
    def _parse_symptom_question(self, response):
        symptom_question_item = response.meta.get('symptom_questions')
        # print response.url
        if not symptom_question_item:
            symptom_question_item = SymptomQuestionItem()
            symptom_question_item['symptom_name'] = response.meta['symptom_item']['name']
            symptom_question_item['qids'] = []

        # parse
        urls = response.xpath('//div[@class="p_list_li"]/div[@class="p_list_cent"]/div[@class="p_list_centt"]/dl/dt/a/@href').extract()
        symptom_question_item['qids'] += [u.split('/')[-1].split('.')[0] for u in urls]

        # last_url = response.xpath('//div[@class="portldet-content"]/a/@href').extract()[-1]
        next_url = response.xpath('//div[@class="portlet-content"]/a[text()="下一页 >"]/@href').extract()
        if not next_url:
            # All pages have been processed
            print symptom_question_item
            yield symptom_question_item
        else:
            url = next_url[0]
            # print url
            # print symptom_question_item['qids']
            request = Request(url, dont_filter=True, callback=self._parse_symptom_question)
            request.meta['symptom_questions'] = symptom_question_item
            # print request
            yield request
Example #3
    def parse(self, response):

        tabs = []
        tab_selector = response.xpath('//div[@id="siteDirectory"]')
        ### loop for all tabs
        for tab in tab_selector.xpath('.//div[@class="popover-grouping"]'):
            tabNameSel = tab.xpath("h2/text()").extract()

            if tabNameSel:
                tabName = tabNameSel[0]

                fobj = open(tabName + ".txt", "a+")

            cat_selector = tab.xpath(".//ul")

            ### loop for all categories
            for category in cat_selector.xpath("li"):  #'.//div[contains(@class, "ht180")]
                catNameSel = category.xpath(
                    "a/text()"
                ).extract()  # //div[contains(@class, "top-menu unit")]/ul/li/div/div/div/ul/li[@class="heading"]
                # print category.extract()
                if catNameSel:
                    catName = catNameSel[0]
                catLinkSel = category.xpath("a/@href").extract()
                if catLinkSel:
                    catLink = "http://www.amazon.in" + catLinkSel[0]

                request = Request(catLink, callback=self.parse_subcatpage)
                request.meta["fobj"] = fobj
                request.meta["tabName"] = tabName
                request.meta["catName"] = catName
                yield request

        fobj.close()
Example #4
    def getItem(self, school):
        item = SchoolItem()
        logo = school.xpath('div/div[contains(@class,"school_m_img fl")]/a/img/@src').extract()
        item["logo"] = logo[0] if logo else ""

        # name province city area under school_m_main
        school_main = school.xpath('div/div[contains(@class,"school_m_main fl")]')
        name = school_main.xpath("li/h3/a/text()").extract()
        item["name"] = name[0] if name else ""
        item["province"] = ""
        item["city"] = ""
        item["area"] = ""
        tempLocation = school_main.xpath("li[2]/b/text()").extract()
        if tempLocation:
            location = tempLocation[0].split()
            item["province"] = location[0] if len(location) > 0 else ""
            item["city"] = location[1] if len(location) > 1 else ""
            item["area"] = location[2] if len(location) > 2 else ""

        catagery = school_main.xpath("li[3]/b/text()").extract()
        schoolType = school_main.xpath("li[4]/ol[1]/b/text()").extract()
        level = school_main.xpath("li[4]/ol[2]/b/text()").extract()
        item["level"] = level[0] if level else ""
        item["catagery"] = catagery[0] if catagery else ""
        item["schoolType"] = schoolType[0] if schoolType else ""

        # address and phone under school_m_lx
        addressAndPhone = school.xpath('ul[contains(@class,"school_m_lx")]')
        address = addressAndPhone.xpath("li[1]/b/text()").extract()
        item["address"] = address[0] if address else ""
        item["phone"] = addressAndPhone.xpath("li[2]/b/text()").extract()
        schoollUrl = school_main.xpath("li/h3/a/@href").extract()[0]
        request = Request(schoollUrl, callback=self.parse_schoolIntroUrl)
        request.meta["item"] = item
        return request
Example #5
    def parseJsonProduct(self, response):
        item = response.meta["item"]
        # make a valid json file out of it and remove unneeded data
        prodResponse = response.body.split("$+$")[0].strip().replace("'", '"')
        prodDict = {}
        sizeWidthDict = {}
        jsonresponse = json.loads(prodResponse)
        for product, value in jsonresponse.iteritems():
            if item["sku"] not in prodDict:
                prodDict[item["sku"]] = {}
            if value["c"] not in prodDict[item["sku"]]:
                prodDict[item["sku"]][value["c"]] = {}
            if value["w"] not in prodDict[item["sku"]][value["c"]]:
                prodDict[item["sku"]][value["c"]][value["w"]] = {}
            if value["s"] not in sizeWidthDict:
                sizeWidthDict[value["s"]] = []
            if value["w"] not in sizeWidthDict[value["s"]]:
                sizeWidthDict[value["s"]].append(value["w"])
            prodDict[item["sku"]][value["c"]][value["w"]][value["s"]] = value["sku"]
        item["variant"] = prodDict
        item["size_width_list"] = sizeWidthDict
        # request first imageset
        if item["imageSetUrls"]:
            color, href = item["imageSetUrls"].popitem()
            if len(href) > 1:
                item["imageSetUrls"][color] = href[1:]
            request = Request(href[0], callback=self.parseJsonImageSet)
            request.meta["item"] = item
            return request

        self.to_csv(item)
        return item
Example #6
	def parse_page(self, response):
		if response.meta.has_key('crawldepth'):
			depth = response.meta['crawldepth']
		else:
		#       Set search depth here
			depth = 1
		log.msg('Depth = %s' % str(depth), level=log.INFO)
		if not isinstance(response, HtmlResponse):
		    log.msg('Not an HTML file: %s' % response.url, level=log.WARNING)
		    return

		log.msg('Response from: %s' % response.url, level=log.INFO)
		url_bf.add(response.url)
	
		# TODO: Extract page title
	
		extractor = Extractor(extractor='ArticleExtractor', html=response.body_as_unicode())
		cleaned_text = extractor.getText()

		# Eliminate duplicates
		keywordset = set(keywordlist)

		found_list = []
		for keyword in keywordset: # TODO: Is there a more efficient way to do this?
			# Look at word boundaries to match entire words only
			if (re.search(r'\b' + re.escape(keyword) + r'\b', cleaned_text)):
				found_list.append(keyword)

		# Parse this page		
		item = BiffleItem()
		if (len(found_list) > 0):
			item['url'] = response.url
			item['body'] = cleaned_text
			item['keywords'] = ', '.join(found_list)
			item['process_date'] = datetime.today()
			log.msg("Keyword(s) found: %s" % ', '.join(found_list), level=log.INFO)
			self.map_keyword_count(found_list)
			yield item

		if (depth > 0):	
			# Find the next requests and yield those
			hxs = HtmlXPathSelector(response)
			links = hxs.select('//a/@href').extract()
			log.msg('Links on page: %s' % len(links), level=log.INFO)
			depth -= 1
			log.msg('Depth has been decremented, new value = %s' % str(depth), level=log.INFO)
			for l in links:
				l = urlparse.urljoin(response.url, l)
				if (l in url_bf):
					pass
					#log.msg('Duplicate URL found: %s' % l, level=log.INFO)
				else:
					url_bf.add(l)
					#log.msg('Found link: %s | From URL: %s' % (l, response.url), level=log.INFO)
					# Decrement depth for next layer of links
					#callback = lambda response, depth = depth: self.parse_page(response, depth)			
					callback = lambda response: self.parse_page(response)
					request = Request(l, callback=callback)
					request.meta['crawldepth'] = depth
					yield request
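The function above references a module-level Bloom filter `url_bf` and a `keywordlist` that are defined elsewhere in the project. A minimal sketch of how they might be set up, assuming the `pybloom_live` package and an illustrative keyword file (neither is confirmed by the original source):

from pybloom_live import BloomFilter

# De-duplicates URLs across requests; capacity and error rate are illustrative
url_bf = BloomFilter(capacity=1000000, error_rate=0.001)

# Keywords matched against the extracted article text; the file name is illustrative
with open('keywords.txt') as f:
    keywordlist = [line.strip() for line in f if line.strip()]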
Example #7
	def amazon_marketplace(self,response):
		
		sel = Selector(response)
		item = response.meta['item']
		try:
			sp = sel.xpath("//span[@style='text-decoration: inherit; white-space: nowrap;']/text()").extract()[0].replace(",","")
			shippingcost = sel.xpath("//span[@class='olpShippingPrice']/span/text()").extract()
			if shippingcost:
				sp = str(float(sp) + float(sel.xpath("//span[@class='olpShippingPrice']/span/text()").extract()[0].replace(",","")))	
			
			if sp>item['SP']:
				sp = item['SP']
		except:			
			try:
				flipkart_url = flipkart_urls[item['index']]
				request = Request(flipkart_url,callback = self.flipkart_scraper)
				request.meta['item'] = item
				# request.meta['proxy'] = "http://111.161.126.100:80"
				yield request
		
			except:				
				try:
					paytm_url = paytm_urls[item['index']]
					request = Request(paytm_url,callback = self.paytm_scraper)
					request.meta['item'] = item
					request.meta['proxy'] = "http://111.161.126.100:80"
					yield request
				except:
					self.to_csv(item)
Example #8
	def parse_monuments_en(self,response):
		sel=Selector(response)
		monument=sel.xpath('//div[@class="col-50 content-desc"]')
		title=monument.xpath("h2[@class='big sec-color']/text()").extract()
		summary=''.join(monument.xpath("div[@id='idContentScroll']/span/p//text()").extract())
		informationLink=monument.xpath("div[@id='idContentScroll']/span/a/@href").extract()
		item = response.meta['item']
		if len(informationLink)>0:
			item['informationLink_en']=informationLink.pop()
		else:
			item['informationLink_en']=response.url
		if len(title)>0:
			item['name_en']=title.pop()
		else:
			item['name_en']=''
		if len(summary)>0:
			item['description_en']=summary
		else:
			item['description_en']=''
		if len(informationLink)>0:
			item['informationLink']=informationLink.pop()
		else:
			item['informationLink']=response.url
		
		euLink=sel.xpath('//*[@id="eu"]/@href').extract()
		request=Request(self.BASE+str(euLink.pop()),callback=self.parse_monuments_eu)
		request.meta['item']=item
		yield request
Example #9
	def parse_restaurants_en(self,response):
		sel=Selector(response)
		item = response.meta['item']
		descriptionpath=sel.xpath("//*[@id='idContentScroll']")
		description=descriptionpath.xpath("span[@itemprop='description']/p//text()").extract()
		timetable=descriptionpath.xpath("span[@itemprop='description']/p[2]//text()").extract()
		timetable2=descriptionpath.xpath("span[@itemprop='description']/p[3]//text()").extract()
		categoryPath=sel.xpath("//*[@id='gastronomy-content']/section[2]/div/section[1]/section/div/ul/li[2]/p[2]")
		category=categoryPath.xpath("a/strong/text()").extract()
		if len(description)>0:
			item['description_en']=' '.join(description)
		else:
			item['description_en']=''
		if len(category)>0:
			item['category_en']=['Restaurant',category.pop()]	
		else:
			item['category_en']=['Restaurant','Others']
		if len(timetable)>0:
			if len(timetable2)>0:
				item['timetable_en']=' '.join([timetable.pop(),timetable2.pop()])
			else:
				item['timetable_en']=timetable.pop()
		else:
			item['timetable_en']=''
		link=response.url
		link=link.replace("/en/","/eu/")
		request=Request(link,callback=self.parse_restaurants_eu)
		request.meta['item']=item
		yield request		
Example #10
    def parse_disease(self, response):
        """解析【疾病】页面"""
        disease_item = DiseaseItem()
        disease_item['url'] = response.url

        _name = response.xpath('//div[@class="p_lbox1"]/div[@class="p_lboxti"]/h3')
        disease_item['name'] = _name.xpath('text()').extract()[0]
        _other_name = _name.xpath('var/text()').extract()
        if _other_name:
            begin = _other_name[0].find(':') + 1
            end = _other_name[0].rfind(')')
            disease_item['aliases'] = re.split(',|,', _other_name[0][begin:end])

        _related = response.xpath('//div[@id="yw4"]/div/div/div')
        disease_item['related_diseases'] = _related.xpath('ul/li/a[contains(@href, "/jibing/")]/@title').extract()
        disease_item['related_symptoms'] = _related.xpath('ul/li/a[contains(@href, "/zhengzhuang/")]/@title').extract()
        # print disease_item['related_diseases'], disease_item['related_symptoms']
        # print disease_item
        yield disease_item

        # Go on parsing details
        detail_urls = response.xpath('//div[@class="p_lbox1_ab"]/a/@href').extract()
        detail_urls += response.xpath('//ul[@class="p_sibox2ul clears"]/li/a/@href').extract()
        # print detail_urls
        for url in detail_urls:
            request = Request(url=url, dont_filter=True, callback=self._parse_disease_detail)
            request.meta['disease_item'] = disease_item
            yield request

        # Go on parsing questions
        question_url = response.xpath('//div[@class="p_lbox5"]/div[@class="p_lboxti"]/a/@href').extract()[0]
        request = Request(url=question_url, dont_filter=True, callback=self._parse_disease_question)
        request.meta['disease_item'] = disease_item
        # print request
        yield request
Example #11
    def parse_symptom(self, response):
        """解析【症状】页面"""
        symptom_item = SymptomItem()
        symptom_item['url'] = response.url
        symptom_item['name'] = response.xpath('//div[@id="m_1"]/div[@class="p_sibox1 p_siboxbor"]/div[@class="p_sititile"]/span/h1/text()').extract()[0]

        _related = response.xpath('//div[@id="yw3"]/div/div')
        symptom_item['related_diseases'] = _related.xpath('ul/li/a[contains(@href, "/jibing/")]/@title').extract()
        # symptom_item['related_symptoms'] = _related.xpath('ul/li/a[contains(@href, "/zhengzhuang/")]/@title').extract()
        # print symptom_item['related_diseases'], symptom_item['related_symptoms']
        # print symptom_item
        yield symptom_item

        # Go on parsing details
        detail_urls = response.xpath('//dl[@class="p_sibox1dl clears"]/dt/a/@href').extract()
        detail_urls += response.xpath('//ul[@class="p_sibox2ul clears"]/li/a[1]/@href').extract()
        # print detail_urls
        for url in detail_urls:
            request = Request(url=url, dont_filter=True, callback=self._parse_symptom_detail)
            request.meta['symptom_item'] = symptom_item
            yield request

        # Go on parsing questions
        question_url = response.xpath('//div[@class="p_sibox4 p_siboxbor"]/div[@class="p_sititile"]/a/@href').extract()[0]
        request = Request(url=question_url, dont_filter=True, callback=self._parse_symptom_question)
        request.meta['symptom_item'] = symptom_item
        # print request
        yield request
Example #12
    def parse(self, response):
        ''' Parse response from start urls (/channels)
            
            Channels are groups by category. So, this spider extracts the 
            category of each channel, and constructs a request with the meta 
            information of the category (that information would not be 
            available from the channel page otherwise)
        '''
        self.logger.debug("Parse url {}".format(response.url))        

        cat_container = response.xpath('/html/body/div[1]/div/article/div')
        
        # Channels are grouped by category in containers with class '.channel-category'
        for cat in cat_container.css('.channel-category'):
            # extract the title of the category
            cat_title = cat.xpath('h2/text()').extract_first()            
            # extract the link to the channel pages
            for channel in cat.css('ul.channel-grid li'):
                link = channel.xpath('a//@href').extract_first()
                full_link = loaders.contextualize(link, base_url=response.url)
                # Construct request               
                request = Request(full_link, callback=self.parse_channel)
                request.meta['category'] = cat_title
                
                yield request
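The docstring above explains that the category title is forwarded to the channel page through `request.meta`. The `parse_channel` callback is not included in this example; a minimal sketch of how it might read the value back (the yielded fields are assumptions):

    def parse_channel(self, response):
        # Recover the category title stashed into meta by parse()
        category = response.meta['category']
        yield {
            'url': response.url,
            'category': category,
            # Illustrative field; the real channel parsing is not shown
            'title': response.css('h1::text').extract_first(),
        }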
Example #13
 def parse_history(self,response):
     #Parse Price History Table
     house = response.meta['item']
     tax_url = house['tax_url']
     price_history = []
     pattern = r' { "html": "(.*)" }'
     html = re.search(pattern, response.body).group(1)
     html = re.sub(r'\\"', r'"', html)  # Correct escaped quotes
     html = re.sub(r'\\/', r'/', html)  # Correct escaped forward
     if (html != ""):
         soup = BeautifulSoup(html)
         table = soup.find('table')
         table_body = table.find('tbody')
         rows = table_body.find_all('tr')
         for row in rows:
             cols = row.find_all('td')
             cols = [ele for ele in cols]
             cols = cols[:3]
             if (cols[2].find('span') != None):
                 date = cols[0].get_text()
                 event = cols[1].get_text()
                 price = cols[2].find('span').get_text()
                 price_history.append([date, event, price])
         #Store history as JSON string    
         house['price_history'] = json.dumps(price_history)
     tax_request = Request(tax_url, 
                       callback=self.parse_taxes)
     tax_request.meta['item'] = house
     
     return tax_request
Example #14
    def parse(self, response):
        """First step of Mon/gr parsing."""
        try:
            # Connect to Beanstalkd server
            self.beanstalk = beanstalkc.Connection(host=self.host_beanstalkd, port=11301)

            # See all tubes:
            self.beanstalk.tubes()

            # Switch to the default (tube):
            self.beanstalk.use("default")

            # self.makedirResults()
            self.nodes = json.loads(response.body_as_unicode())

            for node in self.nodes:
                link_node = self.domain + self.nodes[node]
                request = Request(link_node, callback=self.parseDomain)
                # Pass metadata to the next wave of parsing
                request.meta["node"] = node
                yield request
        except:
            print "Please run the beanstalkc"

        return
Example #15
File: Dicks.py  Project: Diwahars/scrapers
	def parse(self, response):
		sel = Selector(response)
		item = DicksItem()		
		if "&page=" in response.url: # Extracting the Page Number and then using that to assign sort.
			pagenumber = float(response.url.split("&page=")[-1]) 
		else:
			pagenumber = 1		
		t = 0 + ((pagenumber-1)*48)
		item["Sort_Order"] = {}
		
		producturls= sel.xpath("//div[@class='prod-details']/h2/a/@href").extract()
		productnames = sel.xpath("//div[@class='prod-details']/h2/a/@title").extract()		
		
		for url,name in zip(producturls,productnames):
			item["Sort_Order"]["http://www.dickssportinggoods.com"+url] = t
			t=t+1
			
		for i in range(len(urllist)): #comparing the Category URL and assigning LYS Categorization
			if urllist[i] == response.url:
				item['Category'] = lyscat[i]
				item['id1'] = priceid[i]
				break
		
		for url,name in zip(producturls,productnames):       
			if "Fitbit" not in name:         
				request=Request("http://www.dickssportinggoods.com"+url, self.product_page)
				request.meta["item"] = item
				yield request
Example #16
    def parseJsonImageSet(self, response):
        item = response.meta["item"]
        imageSetResponse = response.body
        # make a valid json file out of it, if only one image available it was a list => make a dict
        imageSetResponse = imageSetResponse.replace("/*jsonp*/s7jsonResponse(", "")
        imageSetResponse = ",".join(imageSetResponse.split(",")[:-1])
        imageSetResponse = imageSetResponse.replace('"item":[', '"item":')
        imageSetResponse = imageSetResponse.replace('"item":', '"item":[')
        imageSetResponse = imageSetResponse.replace("}]}}", "}}}")
        imageSetResponse = imageSetResponse[::-1].replace("}}}", "}}]}")[::-1]

        color = response.url.split("-")[1].split("?")[0]
        isImageSet = False
        if len(response.url.split("-")) > 2:
            isImageSet = True
        item["Product_Image_File1"][color] = []

        jsonresponse = json.loads(imageSetResponse)
        for index, imageItem in enumerate(jsonresponse["set"]["item"]):
            # check if there is a image set or only one image
            if "isDefault" not in imageItem["i"]:
                imageUrl = (
                    "http://roadrunnersports.scene7.com/is/image/" + imageItem["i"]["n"] + "?iv=" + imageItem["iv"]
                )
                # response url is image set => image can be scaled
                if isImageSet:
                    imageUrl += "&scl=1"
                item["Product_Image_File1"][color].append(imageUrl)
            else:
                # there is no image set append request for default image
                if item["color"][color] not in item["imageSetUrls"]:
                    item["imageSetUrls"][item["color"][color]] = []
                if item["color"][color] not in item["imageSetUrls2"]:
                    item["imageSetUrls2"][item["color"][color]] = []
                item["imageSetUrls"][item["color"][color]].append(
                    "http://roadrunnersports.scene7.com/is/image/roadrunnersports/"
                    + item["sku"]
                    + "-"
                    + color
                    + "?req=set,json&scl=1"
                )
                item["imageSetUrls2"][item["color"][color]].append(
                    "http://roadrunnersports.scene7.com/is/image/roadrunnersports/"
                    + item["sku"]
                    + "-"
                    + color
                    + "?req=set,json&scl=1"
                )

        if item["imageSetUrls"]:
            color, href = item["imageSetUrls"].popitem()
            if len(href) > 1:
                item["imageSetUrls"][color] = href[1:]
            request = Request(href[0], callback=self.parseJsonImageSet)
            request.meta["item"] = item
            return request

        self.to_csv(item)
        return item
Example #17
 def parse(self, response):
     
     dirname = os.getcwd()
     tabs= []
     tab_selector = response.xpath('//div[contains(@id, "SMWrapr")]')
     ### loop for all tabs
     for tab in tab_selector.xpath('.//div[contains(@id, "Tab")]'):
        # tabItem = TabItem()
        
         tabNameSel = tab.xpath('div/span[2]/text()').extract()
                    
         if tabNameSel:
             tabName = tabNameSel[0]
             
             
         os.chdir(dirname)
         if not os.path.exists(currDir+"/"+tabName):
         	os.makedirs(currDir+"/"+tabName)         
         #os.chdir(tabName)
         fobj = open(currDir+"/"+tabName+".txt", 'w')
         cat_selector = tab.xpath('div[2]/div[contains(@class, "SMSubCat")]')
         ### loop for all categories
         for category in cat_selector.xpath('div'): #'.//div[contains(@class, "ht180")]
          #   catItem = CatItem()
             catNameSel = category.xpath('div/a/@title').extract()
             if catNameSel:
                 catName = catNameSel[0]
                 
             subcat_selector = category.xpath('.//ul')
             ### loop for all subcategories
             for subcat in subcat_selector.xpath('.//li'):
                 subcatNameSel = subcat.xpath('.//a/@title').extract()
                 if subcatNameSel:
                     subcatName = subcatNameSel[0]
                 subcatLinkSel = subcat.xpath('.//a/@href').extract()
                 if subcatLinkSel:
                     subcatLink = subcatLinkSel[0]+"?sort=plrty"
                     
                 request = Request(subcatLink,callback=self.parse_subcatpage)
                 request.meta['fobj'] = fobj
                 request.meta['tabName'] = tabName
                 request.meta['catName'] = catName
                 request.meta['subcatName'] = subcatName
                 yield request
         
                 
                     #(response,tabName,catName,subcatName)
               
                 #print subcatLink
                 #print tabName, ":", catName, ":", subcatName
           #  categories.append(catItem)
         #return categories
         #categories = [dict(categories)]
         #tabs.append(tabItem)
     #return tabs
     	
     	os.chdir(dirname)
     	
     fobj.close()
Example #18
 def parse(self, response):
     nums = Tag.remain_items()
     for i in nums:
         request_url = DOMAIN + '/view/' + str(i) + '.htm'
         request = Request(request_url, callback=self.parse_page)
         request.meta['view_num'] = str(i)
         yield request
         time.sleep(0.1)
Example #19
 def parse_list_detail(self, response):
     hxs = HtmlXPathSelector(response)
     shops = hxs.select('//li[@class="shopname"]/a/@href').extract()
     for shop in shops:
         url = base_url + shop
         request = Request(url, callback=self.parse_detail)
         request.meta["num"] = response.request.meta["num"]
         request.meta["need_js"] = True
         yield request
Example #20
	def parse_solutions(self,response):
		hxs = HtmlXPathSelector(response)
		x = hxs.select("//tr[@class='kol']//td[8]/ul/li/a/@href").extract()
		filename = response.meta['name']
		for i in range(10):
			request = Request('http://www.codechef.com/viewplaintext/'+x[i].split('/')[-1], callback=self.parse_ptsol)
			request.meta['name'] = filename
			request.meta['count'] = str(i)
			yield request
Example #21
 def make_amazon_request(self, response, asin, amazonprice=None):
     request = Request('https://www.amazon.co.uk/gp/offer-listing/%s/' % asin, callback=self.parse_offers)
     request.meta['ASIN'] = asin
     request.meta['Original_URL'] = response.url
     if amazonprice:
         try:
             request.meta["Amazon_Price"] = float(getprice.sub(r'', amazonprice))
         except:
             print "ERROR %s - %s" % (amazonprice, str(request))
     return request
Example #22
def test_request_response_converters():
    spider = TestSpider()
    rc = RequestConverter(spider)
    rsc = ResponseConverter(spider, rc)

    url = "http://test.com/test?param=123"
    request = ScrapyRequest(url=url, callback=spider.callback, errback=spider.errback,
                            body=REQUEST_BODY)
    request.meta[b'test_param'] = b'test_value'
    request.headers.appendlist(b"TestKey", b"test value")
    request.cookies[b'MyCookie'] = b'CookieContent'

    frontier_request = rc.to_frontier(request)
    assert frontier_request.meta[b'scrapy_callback'] == b'callback'
    assert frontier_request.meta[b'scrapy_errback'] == b'errback'
    assert frontier_request.body == to_bytes(REQUEST_BODY)
    assert frontier_request.url == url
    assert frontier_request.method == b'GET'
    assert frontier_request.headers[b'Testkey'] == b'test value'
    assert frontier_request.cookies[b'MyCookie'] == b'CookieContent'
    assert b'frontier_request' not in frontier_request.meta[b'scrapy_meta']

    request_converted = rc.from_frontier(frontier_request)
    assert request_converted.meta[b'test_param'] == b'test_value'
    assert request_converted.body == to_bytes(REQUEST_BODY)
    assert request_converted.url == url
    assert request_converted.method == 'GET'
    assert request_converted.cookies[b'MyCookie'] == b'CookieContent'
    assert request_converted.headers.get(b'Testkey') == b'test value'
    assert request_converted.callback == spider.callback
    assert request_converted.errback == spider.errback

    # Some middleware could change .meta contents
    request_converted.meta[b'middleware_stuff'] = b'appeared'

    response = ScrapyResponse(url=url, request=request_converted, body=RESPONSE_BODY,
                              headers={b'TestHeader': b'Test value'})

    frontier_response = rsc.to_frontier(response)
    assert frontier_response.body == RESPONSE_BODY
    assert frontier_response.meta[b'scrapy_meta'][b'test_param'] == b'test_value'
    assert frontier_response.meta[b'scrapy_meta'][b'middleware_stuff'] == b'appeared'
    assert frontier_response.status_code == 200
    assert b'frontier_request' not in frontier_response.meta[b'scrapy_meta']

    response_converted = rsc.from_frontier(frontier_response)
    assert response_converted.body == RESPONSE_BODY
    assert response_converted.meta[b'test_param'] == b'test_value'
    assert response_converted.url == url
    assert response_converted.status == 200
    assert response_converted.headers[b'TestHeader'] == b'Test value'

    frontier_request = FrontierRequest(url)
    request_converted = rc.from_frontier(frontier_request)
    assert frontier_request.url == url
Example #23
 def start_requests(self):
     with open('imageURLs.csv') as csvFile:
         reader = csv.DictReader(csvFile)
         for row in reader:
             item = GetimagesprojectItem()
             image_url = row['URL']
             item['image_urls'] = [row['URL'],]
             item['pid'] = row['ID']
             request = Request(image_url,callback = self.parse)
             request.meta['item'] = item
             yield request
Example #24
	def start_requests(self):
		
		for cate in cate_array :
			item_list_interface_url = u'http://api.youzibuy.com/brand_area_catalog/item_list?v=1.2.2&size=0&catalog_id=%s&group_id=%s'%(cate['catalog_id'],cate['group_id'])
	
			for i in xrange(1,load_page_count_per_api+1):
				request = Request('%s&page=%d'%(item_list_interface_url,i)) 
				request.meta['cate'] = cate

				# print '----------------------'+request.url
				yield request
Example #25
 def parseLineType(self, response):
    hxs = HtmlXPathSelector(response)
    lineType = hxs.select('//*[@id="pagebar"]/h1/text()').extract()[0].strip()
    self.log('Processing %s...' % (lineType), level=log.DEBUG)
    
    items = hxs.select('//*[@id="tbl_fahrplaene"]/tbody/tr/td[2]/a')
    for item in items:
       url = urljoin(response.url, item.select('./@href').extract()[0])
       req = Request(url, callback=self.parseFahrplan)
       req.meta['lineType'] = lineType
       req.meta['lineName'] = item.select('./text()').extract()[0].strip()
       self.log("Following URL %s" % (url), level=log.DEBUG)
       yield req
Example #26
    def parse_surgery(self, response):
        print response.url
        surgery_item = SurgeryItem()
        surgery_item['url'] = response.url
        surgery_item['name'] = response.xpath('//div[@class="w_n"]/h3/text()').extract()[0]
        surgery_item['summary'] = response.xpath('//dd[@class="w_d3"]/text()').extract()[0]

        # Go on parsing details
        _next = response.xpath('//div[@class="w_n"]/div[@class="w_na clears"]/a[@class="hover"]/following-sibling::a[not(@class="w_la")][1]/@href').extract()
        next_detail_url = urljoin(response.url, _next[0])
        request = Request(url=next_detail_url, dont_filter=True, callback=self._parse_surgery_detail)
        request.meta['surgery_item'] = surgery_item
        yield request
Example #27
    def parse_schoolIntroUrl(self, response):
        sel = Selector(response)
        item = response.meta["item"]

        schoolIntroUrl = sel.xpath('//div[@class="school_kz fr"]/a/@href').extract()
        link = self.start_urls[0]
        if schoolIntroUrl:
            link = schoolIntroUrl[0]
            request = Request(link, callback=self.parse_items)
            request.meta["item"] = item
            return request
        else:
            return item
Example #28
    def parseDomain(self, response):
        """Second step of Mon/rg parsing (Domains)."""
        node = response.meta["node"]
        self.domains = json.loads(response.body_as_unicode())

        for dom in self.domains:
            link_dom = self.domain + self.domains[dom]
            request = Request(link_dom, callback=self.parseStatements)
            # Pass metadata to the next wave of parsing
            request.meta["node"] = node
            request.meta["domain"] = dom
            yield request

        return
Example #29
	def snapdeal_scraper(self,response):
		item = response.meta['item']
		sel = Selector(response)
		item['Snapdeal_URL']= response.url
		try:
			if sel.xpath("//div[@class='notifyMe-soldout']"):
				ProductName = sel.xpath("//h1[@itemprop='name']/text()").extract()[0].replace(",","")
				item['Snapdeal__ProductName'] =ProductName
				item['Snapdeal_MRP']=item['Snapdeal_SP'] = ''
				item['Snapdeal_Stock'] = 'Out of Stock'
				
				
			else:
				mrp = sel.xpath("//span[@id='original-price-id']/text()").extract()
				if mrp:
					item['Snapdeal_SP'] = sel.xpath("//span[@id='selling-price-id']/text()").extract()[0]
					item['Snapdeal_MRP'] = sel.xpath("//span[@id='original-price-id']/text()").extract()[0]
				else:
					item['Snapdeal_MRP'] = sel.xpath("//span[@id='selling-price-id']/text()").extract()[0]
					item['Snapdeal_SP'] = ''
					
				item['Snapdeal__ProductName'] = sel.xpath("//h1[@itemprop='name']/text()").extract()[0].replace(",","")
				stock = sel.xpath("//div[@class='notifyMe-soldout']").extract()
				discntnd = sel.xpath("//div[@class='noLongerProduct']").extract()
				if stock or discntnd:
					item['Snapdeal_Stock'] = "Out Of Stock"
				else:
					item['Snapdeal_Stock'] = "In Stock"				
				
		except:			
			item['Snapdeal__ProductName'] =	item['Snapdeal_MRP'] = item['Snapdeal_SP'] = ''
			item['Snapdeal_Stock'] = 'Not Found'
		
		
		try:
			amazon_url = amazon_urls[item['index']]
			request = Request(amazon_url,
								headers={'Referer':'http://amazon.in'},
								callback = self.amazon_scraper)
			request.meta['item'] = item
			request.meta['proxy'] = "http://111.161.126.100:80"
			yield request
			
		except:				
			try:
				flipkart_url = flipkart_urls[item['index']]
				request = Request(flipkart_url,callback = self.flipkart_scraper)
				request.meta['item'] = item
				# request.meta['proxy'] = "http://111.161.126.100:80"
				yield request
		
			except:			
				try:
					paytm_url = paytm_urls[item['index']]
					request = Request(paytm_url,callback = self.paytm_scraper)
					request.meta['item'] = item
					request.meta['proxy'] = "http://111.161.126.100:80"
					yield request
				except:
					self.to_csv(item)
Example #30
    def parse_jsp(self, response):
        # extracting data from the jsp file
        country_data = []

        log.msg("Looking for all the countries flags", level=log.INFO)

        for data in re.findall('countryCodeArray\[\d{1,3}\]="(.+)\"', response.body):
            country_data.append(data.split("|"))

        log.msg("Found {} countries".format(len(country_data)))

        request = Request(self.start_urls[0], callback=self.parse_countries_page, dont_filter=True)
        request.meta['country_data'] = country_data
        yield request
Example #31
 def parse_api_data(self, response):
     # make a crawl loop
     # equal to continually scroll to the bottom
     # and load to new posts
     # the number of posts should be not more than max_posts_num to control the loop times
     # but the max loop need control
     loop_times = response.meta.get("loop_times", 1)
     max_posts_num = response.meta.get("max_posts_num", 100)
     range = response.meta.get("range", 60)
     fb_api_req_access_token = response.meta.get("fb_api_req_access_token", None)
     request_tail = response.meta.get("request_tail", None)
     data_header = response.meta.get("data_header", None)
     # the max number for scroll to the bottom
     max_loop_times = 10
     if loop_times <= max_loop_times and len(self.post_ids) <= max_posts_num:
         loop_times += 1
         api_code = response.body.replace('for (;;);', '')
         request_url, data_header = self.structure_api_request(api_code, request_tail = request_tail,
                                                               data_header = data_header)
         # no more request url
         if len(request_url) == 0:
             return self.structure_fbapi_request_url(fb_api_req_access_token)
         # request error
         elif request_url == "error":
             self.logger.info("request url: %s search post error." % response.url)
             return self.structure_fbapi_request_url(fb_api_req_access_token)
         # request urls are normal
         else:
             # post_ids contain (post_id, post_time, post_type)
             post_ids = self.parse_post(json.loads(api_code)["payload"], range)
             if post_ids:
                 # post_ids list, contains all the post_id need to crawl
                 # no suitable time range post, loop once
                 self.post_ids.extend(post_ids)
             return Request(
                     url = request_url,
                     callback = self.parse_api_data,
                     priority = 1,
                     dont_filter = False,
                     meta = {
                         "loop_times": loop_times,
                         "request_tail": request_tail,
                         "max_posts_num": max_posts_num,
                         "data_header": data_header,
                         "fb_api_req_access_token": fb_api_req_access_token,
                     }
             )
     else:
         return self.structure_fbapi_request_url(fb_api_req_access_token)
Example #32
    def parse(self, response):

        # Get the response body
        # print(response.body)
        # Extract SIGN
        sign = re.search("var SIGN = \'(.+)\';", response.text).group(1)
        # print(sign)
        items = response.xpath('//tbody/tr')
        # print(items)
        # If the current page has no items, move on to the next page
        if not len(items):
            # Check whether there is a next page
            nextpage = response.xpath(
                '//div[@class="vd_page"]/a[@class="vd_bt_v2 vd_page_btn"]/span["下一页"]/text()'
            )
            print('next page', nextpage, len(nextpage))
            if len(nextpage):
                yield self.my_process_next(response)

        for item in items:
            info = item.xpath('.//th/span/a[1]/@data-info').extract_first()
            print(info)
            info = json.loads(info)
            # If this entry is a directory
            if info['is_dir']:
                # By default, only crawl directories from the first page
                if self.page['/'] <= self.dirpage:
                    yield self.process_dir(info, response)
                continue

            href = self.get_down_info.format(link=info['copy_ref'],
                                             sign=sign,
                                             time=int(round(time.time() *
                                                            1000)))
            # print(href)
            yield Request(href,
                          meta={
                              'cookiejar': response.meta['cookiejar'],
                              'filepath': response.meta['filepath']
                          },
                          callback=self.next)

        # Check whether there is a next page
        nextpage = response.xpath(
            '//div[@class="vd_page"]/a[@class="vd_bt_v2 vd_page_btn"]/span["下一页"]/text()'
        )
        print('next page', nextpage, len(nextpage))
        if len(nextpage):
            yield self.my_process_next(response)
Example #33
    def follow(self,
               url,
               callback=None,
               method='GET',
               headers=None,
               body=None,
               cookies=None,
               meta=None,
               encoding='utf-8',
               priority=0,
               dont_filter=False,
               errback=None,
               cb_kwargs=None,
               flags=None):
        # type: (...) -> Request
        """
        Return a :class:`~.Request` instance to follow a link ``url``.
        It accepts the same arguments as ``Request.__init__`` method,
        but ``url`` can be a relative URL or a ``scrapy.link.Link`` object,
        not only an absolute URL.

        :class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
        method which supports selectors in addition to absolute/relative URLs
        and Link objects.

        .. versionadded:: 2.0
           The *flags* parameter.
        """
        if isinstance(url, Link):
            url = url.url
        elif url is None:
            raise ValueError("url can't be None")
        url = self.urljoin(url)

        return Request(
            url=url,
            callback=callback,
            method=method,
            headers=headers,
            body=body,
            cookies=cookies,
            meta=meta,
            encoding=encoding,
            priority=priority,
            dont_filter=dont_filter,
            errback=errback,
            cb_kwargs=cb_kwargs,
            flags=flags,
        )
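A short usage sketch of `follow()` from a spider callback: relative hrefs (or `Link` objects) are resolved against `response.url` before the `Request` is built (the CSS selector is illustrative):

    def parse(self, response):
        # follow() urljoins the relative href for us
        for href in response.css('a.next::attr(href)').extract():
            yield response.follow(href, callback=self.parse)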
Example #34
 def start_requests(self):
     base_url = 'http://yanbao.stock.hexun.com/xgq/%s.aspx'
     page_section_dict = {
         'gsyj': u"公司研究",
         'hyyj': u"行业研究",
         'yjyc': u"业绩预测",
         'qsch': u"券商晨会",
         'clbg': u"策略报告",
     }
     for section_short_name in page_section_dict:
         url = base_url % section_short_name
         yield Request(
             url=url,
             meta={'section': page_section_dict[section_short_name]},
             callback=self.parse_index_page_item)
Example #35
    def parse(self, response):
        self.logger.info("------------ response 4 start")

        yield Request(url='https://www.baidu.com/s?wd=2',
                      callback=self.parse_e,
                      meta={
                          "expire":
                          datetime.datetime.now() +
                          datetime.timedelta(seconds=2)
                      })
        yield Request(url='https://www.baidu.com/s?wd=3',
                      callback=self.parse_e,
                      meta={
                          "expire":
                          datetime.datetime.now() +
                          datetime.timedelta(seconds=2)
                      })
        yield Request(url='https://www.baidu.com/s?wd=4',
                      callback=self.parse_e,
                      meta={
                          "expire":
                          datetime.datetime.now() +
                          datetime.timedelta(seconds=2)
                      })
Example #36
 def deal_with_pager(self, item, pager, cur_pager, url):
     if pager is None or pager == cur_pager:
         return item
     else:
         url_pattern = None
         if cur_pager == 1:
             url_pattern = r'(.*)\.shtml'
         else:
             url_pattern = r'(.*)-[\d]+\.shtml'
         m = re.match(url_pattern, url)
         url = m.group(1) + '-%d.shtml' % cur_pager
         return Request(url,
                        meta={'item': item, 'is_page': True},
                        callback=self.parse_news_item)
     pass
Example #37
    def parse(self, response):
        self.page+=1
        articles = response.xpath('//*[@class="list"]/ul/li/div[@class="box"]')    # all articles

        # from scrapy.shell import inspect_response
        # inspect_response(response,self)

        for article in articles:
            url = article.xpath('.//div[@class="word"]/h3/a/@href').extract_first()
            title = ''.join(article.xpath('.//div[@class="word"]/h3/a/text()').re(r'\w'))
            summary = ''.join(article.xpath('.//div[@class="des"]/text()').re(r'\w'))
            author = ''.join(article.xpath('.//div[@class="msg clr"]/a/text()[2]').re(r'\w'))
            tag = article.xpath('.//div[@class="tags"]/a/text()').extract()
            title_img = article.xpath('.//div[@class="img"]/a[2]/img/@src').extract_first()
            
            leiphone_item= leiphoneItem(url = url, title = title, author = author, title_img = title_img, tag = tag, summary = summary)
            request = Request(url=url,callback=self.parse_body) # request the article body
            request.meta['item'] = leiphone_item # stash the item; meta is propagated, so the original meta value can be recovered even after a redirect or retry
            yield request
            
        ## pagination
        next_page = response.xpath('.//a[@class="next"]/@href').extract_first()   # next page
        if next_page:
            yield Request(url = next_page, callback = self.parse)  # the callback names the method that will parse the response to this Request
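The comment above notes that `meta` is propagated through redirects and retries, so the partially built `leiphone_item` can be recovered in the article callback. `parse_body` is not shown in this example; a minimal sketch of what it might look like, assuming the item has a `body` field and using an illustrative XPath:

    def parse_body(self, response):
        item = response.meta['item']  # the leiphoneItem stashed in parse()
        item['body'] = ''.join(response.xpath('//div[@class="article-content"]//text()').extract())
        yield item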
Example #38
    def parse(self, response):
        sel = Selector(response)
        sites = sel.css('td[class="txt-container"] strong')
        items = []

        for site in sites:
            '''item = Mamba()
            item["user"] = site.xpath("a/text()").extract()
            item["number"]=self.i
            items.append(item)
            self.i=self.i+1'''
            user_url = "http://m.mamba.ru" + str(
                site.xpath("a/@href").extract())
            Request(user_url, self.get_user_info)
        return items
Example #39
 def start_requests(self):
     source = '国研网'
     source_url = 'http://www.drcnet.com.cn/'
     for url in self.countrysUrls:
         country = url.split("=")[1]
         for (k, v) in self.subjects.items():
             search_url = url + "&uid=" + v
             meta = {
                 'source': source,
                 'source_url': source_url,
                 'search_url': search_url,
                 'subject': k,
                 'subject country': country
             }
             yield Request(search_url, self.parseUsefulUrl, meta=meta)
Example #40
 def start_requests(self):
     allowed_domains = ['www.huajiao.com/']
     urls = ['1', '2', '3', '5', '999', '1000', '1001']
     for url in urls:
         newUrl = 'http://www.huajiao.com/category/' + url
         request = Request(
             url=newUrl,
             callback=self.filterPages,
             headers={
                 'Referral':
                 'http://www.huajiao.com/',
                 'User-Agent':
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
             })
         yield request
Example #41
    def parse(self, response):
        detail_page_list = re.findall('document.write.*?a href="(.*?\.html)">',
                                      response.text)
        if detail_page_list and isinstance(detail_page_list, list):
            detail_page_list = list(set(detail_page_list))
            self.logger1.log_more('Current url: {}, detail length:{}'.format(
                response.request.url, len(detail_page_list)))
            for detail_url in detail_page_list:
                final_detail_url = response.urljoin(detail_url)
                yield Request(url=final_detail_url, callback=self.parse_detail)

        page_info_list = re.findall('setPage\(.*?(\d+),(\d+),(\d+)',
                                    response.text)
        if page_info_list:
            page_info_list = page_info_list[0]
            if len(page_info_list) >= 3:
                total_page = int(page_info_list[0])
                cur_page = int(page_info_list[2])
                if cur_page < total_page:
                    next_page_url = self.next_page_tpl.format(cur_page + 1)
                    next_page_url = response.urljoin(next_page_url)
                    yield Request(url=next_page_url,
                                  callback=self.parse,
                                  dont_filter=True)
Example #42
    def next_request(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_SET')

        if use_set:
            url = self.server.spop(self.redis_key)
        else:
            url = self.server.lpop(self.redis_key)

        if url:
            t =pickle.loads(url)
            #print t['cookies']
            print t['link_hash']
            print t['product_code']
            return Request(t['url'],cookies=eval(t['cookies']),meta={'product_code':t['product_code'], 'link_hash': t['link_hash']},dont_filter=True)
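`next_request()` expects a pickled dict holding `url`, `cookies` (stored as a string, since it is passed through `eval`), `product_code` and `link_hash` in a Redis list (or set when `REDIS_SET` is enabled). A hedged sketch of the matching producer side, assuming the `redis` package and an illustrative key name:

import pickle
import redis

server = redis.StrictRedis(host='localhost', port=6379)
payload = {
    'url': 'http://example.com/product/123',
    'cookies': "{'session': 'abc'}",  # kept as a string because next_request() eval()s it
    'product_code': 'P123',
    'link_hash': 'deadbeef',
}
# lpush pairs with the lpop above; use sadd/spop instead when REDIS_SET is on
server.lpush('spider:start_urls', pickle.dumps(payload))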
Example #43
    def start_requests(self):
        url = "https://kyfw.12306.cn/otn/queryTrainInfo/getTrainName?"

        t = (datetime.datetime.now() +
             datetime.timedelta(days=3)).strftime("%Y-%m-%d")
        params = {"date": t}

        s_url = url + urllib.parse.urlencode(params)
        self.logger.debug("start url " + s_url)
        yield Request(s_url,
                      callback=self.parse,
                      meta={
                          "t": t,
                          "turn": self.turn
                      })
Example #44
    def start_requests(self):
        # Debugging
        #meta = {"journal_url": "url"}
        #url = "http://www.satnt.ac.za/index.php/satnt/article/view/686"
        #yield Request(url, self.crawl_issue_info, meta = meta, dont_filter=True)
        #return

        with open(self.url_file, "rb") as f:
            for line in f:
                meta = {"journal_url": line.strip()}
                journal_issue_url = "%s/issue/archive/" % line.strip()
                yield Request(journal_issue_url,
                              self.crawl_homepage,
                              meta=meta,
                              dont_filter=True)
Example #45
    def parse(self, response):
        cookies = CookieJar()
        cookies.extract_cookies(response, response.request)
        self.cookie_dict = cookies._cookies

        yield Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={
                'Content-Type':
                "application/x-www-form-urlencoded; charset=UTF-8"
            },
            body='phone=8618279816872&password=18279816872&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login)
Example #46
 def get_request_object(self, params):
     """构造request对象"""
     formdata = params.get('formdata', {})
     if formdata:
         if isinstance(formdata, dict):
             return FormRequest(**params)
         else:
             s = json.dumps(formdata, ensure_ascii=False)
             log.warning("formdata:{}格式不对, 无法制造FormRequest对象".format(s))
             return None
     else:
         temp_params = copy.deepcopy(params)
         if 'formdata' in temp_params:
             del temp_params['formdata']
         return Request(**temp_params)
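A short usage sketch for `get_request_object`: with a non-empty `formdata` dict it builds a `FormRequest`, otherwise a plain `Request` (URL and fields here are illustrative):

    def start_requests(self):
        params = {
            'url': 'http://example.com/search',        # illustrative
            'formdata': {'q': 'scrapy', 'page': '1'},  # illustrative
            'callback': self.parse,
        }
        request = self.get_request_object(params)
        if request is not None:
            yield request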
Example #47
    def parse(self, response):

        #--- get the total number of the company
        max_page = ComSpider.count

        #--- for loop that crawl multiple pages that based different stock code
        for page in range(0, max_page + 1):
            #--- get the stock code
            code = ComSpider.Comdata[page]

            #--- call the scrape function iterally
            yield Request(url=add_or_replace_parameter(self.single_url,
                                                       'securityCode', code),
                          callback=self.scrape,
                          meta={'code': code})
Example #48
 def getUserId(self, response):
     bsObj = BeautifulSoup(response.text, 'html.parser')
     user = bsObj.find_all("a", href=re.compile("^(/user/[0-9]+)"))[0]
     if 'href' in user.attrs:
         newPage = 'http://www.huajiao.com' + user.attrs['href']
         request = Request(
             url=newPage,
             callback=self.parseUserStat,
             headers={
                 'Referral':
                 'http://www.huajiao.com/',
                 'User-Agent':
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
             })
         yield request
Example #49
 def after_login(self, response):
     # check login succeed before going on
     if (("ERROR: Invalid username" in response.body) or
             ("The username/password combination you haventered is invalid" in response.body) or
             (response.url == self.start_urls[0])):
         self.log("Login failed", level=log.ERROR)
         return
     # continue scraping with authenticated session...
     else:
         self.log("Login succeed!", level=log.DEBUG)
         print response.url
         print "response end!!"+ response.url
         return Request(url=response.url,
                        callback=self.parse1)
Example #50
 def parse(self, response):
     print(response)
     req = Request(
         url='https://dig.chouti.com/login',
         method='POST',
         headers={
             'Content-Type':
             'application/x-www-form-urlencoded; charset=UTF-8',
             'referer': 'https://dig.chouti.com/'
         },
         body='phone=************&password=************&oneMonth=1',
         meta={"cookiejar": True},
         callback=self.check_login,
     )
     yield req
Example #51
    def parse3(self, response):
        """点赞"""
        hxs = Selector(response)
        linkid_list = hxs.xpath('//div[@class="news-pic"]/img/@lang').extract()
        print(linkid_list)
        for link_id in linkid_list:
            # Take each id and upvote it
            base_url = "https://dig.chouti.com/link/vote?linksId={0}".format(
                link_id)
            yield Request(url=base_url,
                          method='POST',
                          cookies=self.cookie_jar,
                          callback=self.parse4)

        # hxs.xpath('//div[@id="dig_lcpage"]//a/@href')
        # Find all pagination pages
        page_list = hxs.xpath('//a[@class="ct_pagepa"] /@href').extract()
        """https://dig.chouti.com/all/hot/recent/2"""
        for page in page_list:
            page_url = "https://dig.chouti.com%s" % page
            yield Request(url=page_url,
                          method='GET',
                          cookies=self.cookie_jar,
                          callback=self.parse3)
Example #52
 def parse_comments(self, response):
     for comments in response.css('.comment-item'):
         username = comments.css(
             'span.comment-info > a::text').extract_first()
         comment = comments.css('span.short::text').extract_first()
         yield {
             'movie': response.meta['movie'],
             'username': username,
             'comment': comment
         }
     next_url = response.css('a.next::attr(href)').extract_first()
     if next_url:
         yield Request(url=response.url[:response.url.find('?')] + next_url,
                       callback=self.parse_comments,
                       meta=response.meta)
Example #53
 def parse_songs(self, response):
     songs_links = response.xpath(
         '//a[contains(@target,"blank")]/text()').extract()
     reggaetonLyricsScrapperItem = response.meta['item']
     if len(songs_links) > 0:
         for song in songs_links:
             reggaetonLyricsScrapperItem = response.meta['item']
             reggaetonLyricsScrapperItem = reggaetonLyricsScrapperItem.copy(
             )
             reggaetonLyricsScrapperItem['name'] = song
             yield Request(url=self.get_lyric_url(
                 reggaetonLyricsScrapperItem['author'], song),
                           meta={'item': reggaetonLyricsScrapperItem},
                           callback=self.parse_lyric)
     return
Example #54
    def parse(self, response):
        self.driver.get(response.url)
        time.sleep(2)

        inputs = self.driver.find_elements_by_xpath(
            "//div[@id='results_nav_by_year']/a")

        links = []
        for i in inputs:
            link = i.get_attribute('href')
            if (link != None):
                links.append(link)

        for link in links:
            yield Request(url=link, callback=self.parse_page)
Example #55
    def parse_subcategory(self, response):
        hxs = HtmlXPathSelector(response)
        for product in hxs.select(
                '//li[contains(@class, "prd_listing_prod")]'):
            product = self._parse_product_el(product, get_base_url(response))
            yield Request(product['url'],
                          callback=self.parse_product,
                          meta={'product': product})

        # go to page 2
        search_term = urlopen(
            'https://www.buyagift.co.uk/navigation/GetBNNumber?url=%s' %
            response.url).read()
        if not search_term:
            msg = "[BuyAGift] Error extracting search term from: %s" % response.url
            self.log(msg)
            #self.errors.append(msg)
            return
        search_term = 'BN-' + search_term

        page2_url = "http://www.buyagift.co.uk/navigation/GetPartialRecordsList?searchTerm=%(search_term)s&page=%(page_num)s&pageSize=24&sortTerm=SalesRank&"
        meta = {'search_term': search_term, 'page_num': 2}
        page2_url = page2_url % meta
        yield Request(page2_url, callback=self.parse_pages, meta=meta)
Example #56
 def turn_to_next_page(self, response):
     this_func_name = sys._getframe().f_code.co_name
     self.logger.debug("%s(): current page\t%s" %
                       (this_func_name, response.url))
     sel = Selector(response)
     next_page_list = sel.xpath(
         u'//div[@class="pagination"]/a[@class="next_page" and @rel="next" and text()="下一页 ›"]/@href'
     ).extract()
     if len(next_page_list) == 0:
         return
     link = self.base_url + next_page_list[0]
     self.logger.debug("%s(): next page\t\t%s" % (this_func_name, link))
     return Request(url=link,
                    meta=response.meta,
                    callback=self.parse_proxy_list)
Example #57
 def parse(self, response):
     exhibits = response.css(
         'main.site-main > div.row > #isotope-container > div')
     for exhibit in exhibits:
         url = exhibit.css('.mb-image > a::attr(href)').get()
         title = exhibit.css('.mb-image > a::attr(title)').get()
         date = exhibit.css('.date.details::text').get()
         image_link = exhibit.css('.mb-image > a > img::attr(src)').get()
         yield Request(url=url,
                       callback=self.parse_exhibit,
                       meta={
                           'title': title,
                           'date': date,
                           'image_link': image_link
                       })
Example #58
    def crawl_issue(self, response):
        issues = response.xpath(".//a[@class='green issueTitle']")
        for issue in issues:
            url = urlparse.urljoin(response.url,
                                   issue.xpath("./@href").extract_first())

            meta = {"journal_url": response.meta["journal_url"]}
            yield Request(url, self.crawl_issue_info, meta=meta)

        if response.meta["is_first"]:
            identifier = response.xpath(
                "//h1[@class='issue_title_identifier']/text()").extract_first(
                )
            total_issue_num = Utils.regex_extract(identifier,
                                                  ".*-(\d+) issues.*")
            total_issue_num = int(total_issue_num)
            total_page = total_issue_num / 12 + 1
            for i in range(1, total_page):
                next_page_url = "%s&p=%d" % (response.url, i)
                meta = {
                    "journal_url": response.meta["journal_url"],
                    "is_first": False
                }
                yield Request(next_page_url, self.crawl_issue, meta=meta)
Example #59
    def parse_item_requests_callback(self, response, item_xpath_selector=''):
        requests = []
        for job_item in response.xpath(item_xpath_selector):

            job_crawler_item = JobItem()
            self.populate_job_crawler_item(job_item, job_crawler_item)

            if self.should_load_details(job_crawler_item):
                requests.append(
                    Request(url=job_crawler_item.job_details_link,
                            callback=self.retrieve_job_details,
                            meta={'item': job_crawler_item},
                            dont_filter=True))

        return requests
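`retrieve_job_details` is referenced but not shown; a minimal sketch of how it might consume the `JobItem` passed through `meta`, assuming an illustrative detail field and selector:

    def retrieve_job_details(self, response):
        # The partially populated JobItem arrives via request.meta
        job_crawler_item = response.meta['item']
        job_crawler_item['job_details'] = ' '.join(
            response.xpath('//div[@class="job-description"]//text()').extract()).strip()
        yield job_crawler_item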
Example #60
 def parse(self, response):
     try:
         content = json.loads(response.body.decode())
         page_count = content.get('total', 1)
         for page in range(1, page_count + 1):
             url = self.page_url.format(page)
             yield Request(url,
                           callback=self.parse_link,
                           errback=self.error_parse,
                           dont_filter=True)
     except:
         err_msg = traceback.format_exc()
         self.logger1.warning(
             "Exception occurred on get the page counts[{url}], error:{err_msg}"
             .format(url=response.url, err_msg=err_msg))