Example #1
1
 def _construct_query(self, page_num, query):
     url = 'http://www.innojoy.com/client/interface.aspx'
     data = {"requestModule": "PatentSearch",
             "userId": "",
             "patentSearchConfig": {
                 "Query": query,
                 "TreeQuery": "",
                 "Database": "idpat,mypat,phpat,sgpat,itpat,inpat,inapp,chpat,frpat,gbpat,depat,jpapp,eppat,wopat,usapp,usdes,uspp,usre,uspat,fmsq,wgzl,syxx,fmzl",
                 "Action": "Search",
                 "Page": str(page_num),
                 "PageSize": self._page_size,
                 "GUID": "",
                 "Sortby": "",
                 "AddOnes": "",
                 "DelOnes": "",
                 "RemoveOnes": "",
                 "TrsField": "",
                 "SmartSearch": ""
             }
     }
     data_bin = json.dumps(data)
     headers = {
         'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
         'Referer': 'http://www.innojoy.com/SearchResult/default.shtml',
     }
     request = Request(url=url, method='post', headers=headers, body=data_bin)
     # noinspection PyUnresolvedReferences
     request.callback = self.query_callback
     return request
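A side note on the pattern above (a sketch, not the project's actual code): Scrapy's Request accepts the callback and the HTTP method directly in its constructor, so the post-hoc callback assignment and the linter suppression are unnecessary. The spider name, page size and payload fields below are illustrative assumptions only.

import json
from scrapy import Request, Spider


class QuerySketchSpider(Spider):
    name = 'query_sketch'   # hypothetical spider name
    _page_size = 20         # assumed page size

    def _construct_query(self, page_num, query):
        payload = {'Query': query, 'Page': str(page_num), 'PageSize': self._page_size}
        headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
        # the callback is passed straight to the constructor
        return Request(url='http://www.innojoy.com/client/interface.aspx',
                       method='POST', headers=headers, body=json.dumps(payload),
                       callback=self.query_callback)

    def query_callback(self, response):
        # placeholder callback
        self.logger.info('search page returned %d bytes', len(response.body))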
Example #2
0
    def parse(self, response):
        send_emails = True
        try:
            with open(self.links):
                pass
        except IOError:
            # If this is the first time the file is created, do not send emails
            open(self.links, 'a').close()
            send_emails = False
            
        hxs = HtmlXPathSelector(response)
        results = hxs.select('//*[@id="results-anchor"]/*/a')

        for result in results:
            title = result.select('text()').extract()[0].strip()
            link = 'http://www.supost.com' + result.select('@href').extract()[0].strip()

            exists = False

            # check to see if we have already looked at this page
            for line in open(self.links):
                if link in line:
                    exists = True
                    break

            # If we have not seen the page before, add it to the links list
            if not exists:
                request = Request(link, callback=self.get_description)
                request.meta['title'] = title
                request.meta['link'] = link
                request.meta['send_emails'] = send_emails
                yield request
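A possible refinement of Example #2 (a rough sketch under the same assumptions: self.links is a newline-separated file of already-seen URLs): load the file into a set once per parse() call instead of re-opening it for every result.

from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector   # same old-style API as the original


def parse(self, response):
    # sketch of a drop-in replacement for the parse() method above
    try:
        with open(self.links) as f:
            seen_links = {line.strip() for line in f}
        send_emails = True
    except IOError:
        # first run: create the file and do not send emails
        open(self.links, 'a').close()
        seen_links = set()
        send_emails = False

    hxs = HtmlXPathSelector(response)
    for result in hxs.select('//*[@id="results-anchor"]/*/a'):
        title = result.select('text()').extract()[0].strip()
        link = 'http://www.supost.com' + result.select('@href').extract()[0].strip()
        if link not in seen_links:
            request = Request(link, callback=self.get_description)
            request.meta['title'] = title
            request.meta['link'] = link
            request.meta['send_emails'] = send_emails
            yield request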
Example #3
 def test_request_cacheability(self):
     res0 = Response(self.request.url, status=200,
                     headers={'Expires': self.tomorrow})
     req0 = Request('http://example.com')
     req1 = req0.replace(headers={'Cache-Control': 'no-store'})
     req2 = req0.replace(headers={'Cache-Control': 'no-cache'})
     with self._middleware() as mw:
         # response for a request with no-store must not be cached
         res1 = self._process_requestresponse(mw, req1, res0)
         self.assertEqualResponse(res1, res0)
         assert mw.storage.retrieve_response(self.spider, req1) is None
         # Re-do request without no-store and expect it to be cached
         res2 = self._process_requestresponse(mw, req0, res0)
         assert 'cached' not in res2.flags
         res3 = mw.process_request(req0, self.spider)
         assert 'cached' in res3.flags
         self.assertEqualResponse(res2, res3)
         # request with no-cache directive must not return cached response
         # but it allows new response to be stored
         res0b = res0.replace(body=b'foo')
         res4 = self._process_requestresponse(mw, req2, res0b)
         self.assertEqualResponse(res4, res0b)
         assert 'cached' not in res4.flags
         res5 = self._process_requestresponse(mw, req0, None)
         self.assertEqualResponse(res5, res0b)
         assert 'cached' in res5.flags
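For context (not part of the test itself): the behaviour exercised above is driven by Scrapy's HTTP cache settings. A typical settings.py fragment that turns on the RFC 2616 policy looks roughly like this; in older Scrapy versions the class paths live under scrapy.contrib instead of scrapy.extensions.

# settings.py sketch: enable the RFC 2616 cache policy exercised by the test above
HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_DIR = 'httpcache'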
Example #4
0
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        for div in hxs.select('//div[@id="contem_boxes"]'):
            titulo = div.select('.//div[@id="contem_titulo"]/text()').extract()[0]

            if not titulo.endswith(u'mara dos Deputados/BR'):
                continue
            else:
                reg = re.compile(r'<a class="listapar" href="(?P<url>.*?)">(?P<name>[\w\s]*[\w]+)\s*\(<b>[\w\s]+</b>\)\s-\s(?P<party>.*?)\/(?P<state>.*?)</a><br>', flags=re.U)
                for r in reg.finditer(div.extract()):
                    dict_deputy = r.groupdict()
                    #if dict_deputy['state'] in settings['STATE_TO_FILTER']:
                    db_deputy = self.api.get_deputado_por_nome(dict_deputy['name'])
                    if not db_deputy:
                        dep = Deputado(dict_deputy['name'], dict_deputy['state'], dict_deputy['party'])
                        self.api.inserir_deputado(dep)
                    else:
                        dep = db_deputy[0]

                    id = urlparse.parse_qs(urlparse.urlparse(dict_deputy['url']).query).get('id', [0])[0]
                    if not id:
                        continue
                    request = Request(urljoin(self.base_url, '@presencas.php?id=%s' % id), callback=self.parse_deputy_assiduity)
                    request.meta['dep'] = dep
                    yield request
                    
                    request = Request(urljoin(self.base_url, '@uso_verbas_als.php?uf=16&id=%s' % id), callback=self.parse_deputy_costs)
                    request.meta['dep'] = dep
                    yield request
Example #5
0
def process_wsj_sitemap(spider, body):
    print "Enter processing sitemap for wsj"
    data = bs(body)
    urls = data.find_all('url')
    for url in urls:
        link = url.loc.text
        news = url.find('news:news')
        item = None
        if news is not None:
            item = SitemapItem()
            title = news.find('news:title')
            item['title'] = title.text
            #format: 2014-04-27T05:49:00-05:00
            date = news.find('news:publication_date')
            dt = parse(date.text)
            dt_utc = dt.astimezone(dateutil.tz.tzutc()).replace(tzinfo=None)
            item['update'] = dt_utc
        #need to save/get last crawled timestamp to decide whether we need to recrawl the link
        #pattern http://online.wsj.com/google_sitemap_Q1_1996.xml
        req = Request(link, callback = spider.process_page)
        if item is not None:
            req.meta['item'] = item
        yield req
Example #6
	def parse(self, response):
		i = 0


		for div in response.xpath('//li[@class="conference vevent"]'):
			item = AfeventItem()
			item['location'] = div.xpath('.//p[@class="location"]/a[3]/text()').extract_first()
			
			item['title'] = div.xpath('//h4/a/text()').extract()[i]
			item['date'] = div.xpath('//p[@class="date"]/abbr[1]/@title').extract()[i]
			item['host'] = ''
			item['time'] = ''
			item['description'] = ''

			
			follow_url = 'http://lanyrd.com' + div.xpath('//h4/a/@href').extract()[i]
			request = Request(follow_url, callback=self.parse_url)

			request.meta['item'] = item

			if i < len(response.xpath('//li[@class="conference vevent"]')):
				i = i + 1	
			yield request
Example #7
0
File: card_spider.py Project: 0--key/lib
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     magic_sets_full = hxs.select('//div[@class="left_block"]//ul[@class="left_menu"]//li/a/text()').extract()
     links_to_magic_sets_full = hxs.select(
         '//div[@class="left_block"]//ul[@class="left_menu"]//li/a/@href'
     ).extract()
     # let's cut the first category for debugging purposes:
     magic_sets = magic_sets_full[0]
     links_to_magic_sets = links_to_magic_sets_full[0]
     # self.log("This is first category and link to they: %s, %s, %s" % (type(magic_sets), magic_sets, links_to_magic_sets))
     # Now all magic sets are all together with the links to them:
     # uncomment this after debugging:
     # magic_sets_zip = dict(zip(magic_sets, links_to_magic_sets))
     magic_sets_zip = dict([[magic_sets, links_to_magic_sets]])
     date_prefix = time.strftime("%Y%m%d", time.localtime())
     try:
         os.mkdir("./archive/HTML/" + date_prefix)
     except OSError:
         self.log("The folder exists!")
     filename = "./archive/HTML/" + date_prefix + "/" + response.url.split("/")[-1] + ".htm"
     self.log("This is filename for index: %s" % (filename,))
     try:
         open(filename, "wb").write(response.body)
     except OSError:
         os.remove(filename)
         open(filename, "wb").write(response.body)
     # Continue to extract data:
     for magic_set, url in magic_sets_zip.iteritems():
         abs_url = urljoin("http://www.blackborder.com", url)
         self.log("This is magic set name and url to it: %s ---> %s" % (magic_set, abs_url))
         request = Request(abs_url, callback=self.parse_set_page)
         request.meta["magic_set"] = magic_set
         request.meta["date_prefix"] = date_prefix
         yield request
Example #8
    def start_requests(self):
        page = 1
        search_url = self.get_search_url(page)
        request = Request(search_url)
        request.meta['page'] = page

        yield request
Example #9
0
File: test_crawl.py Project: dvska/scrapy
 def test_referer_header(self):
     """Referer header is set by RefererMiddleware unless it is already set"""
     req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1)
     req1 = req0.replace()
     req2 = req0.replace(headers={'Referer': None})
     req3 = req0.replace(headers={'Referer': 'http://example.com'})
     req0.meta['next'] = req1
     req1.meta['next'] = req2
     req2.meta['next'] = req3
     spider = SingleRequestSpider(seed=req0)
     yield docrawl(spider)
     # basic asserts in case of weird communication errors
     self.assertIn('responses', spider.meta)
     self.assertNotIn('failures', spider.meta)
     # start requests doesn't set Referer header
     echo0 = json.loads(spider.meta['responses'][0].body)
     self.assertNotIn('Referer', echo0['headers'])
     # following request sets Referer to start request url
     echo1 = json.loads(spider.meta['responses'][1].body)
     self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
     # next request avoids Referer header
     echo2 = json.loads(spider.meta['responses'][2].body)
     self.assertNotIn('Referer', echo2['headers'])
     # last request explicitly sets a Referer header
     echo3 = json.loads(spider.meta['responses'][3].body)
     self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])
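As a side note, the RefererMiddleware exercised here is enabled by default; it can be switched off or tuned via settings. A minimal settings.py sketch (REFERRER_POLICY only exists in newer Scrapy versions):

# settings.py sketch: control the Referer behaviour tested above
REFERER_ENABLED = True           # default; set to False to drop the middleware
REFERRER_POLICY = 'same-origin'  # newer Scrapy versions accept W3C policy names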
Example #10
0
    def _test(method):
      url = 'http://www.example.com/301'
      url2 = 'http://www.example.com/redirected'
      req = Request(url, method=method)
      req.meta['origin_url'] = url
      resp = Response(url, headers={'Location': url2}, status=301)

      req2 = mw.process_response(req, resp, self.spider)
      assert isinstance(req2, Request)
      self.assertEqual(req2.url, url2)
      self.assertEqual(req2.method, method)

      del resp.headers['Location']
      assert mw.process_response(req, resp, self.spider) is resp

      bad_url1 = 'http://baidu.com/'
      bad_url2 = 'http://baidu.com/xx'
      resp2 = Response(url, headers={'Location': bad_url1}, status=301)
      resp3 = Response(url, headers={'Location': bad_url2}, status=301)
      req.meta['proxy'] = 'xx.xx.xx.xx:301'

      req2 = mw.process_response(req, resp2, self.spider)
      req3 = mw.process_response(req, resp3, self.spider)

      assert isinstance(req2, Request)
      assert isinstance(req3, Request)
      self.assertEqual(req2.url, url)
      self.assertEqual(req3.url, url)
Example #11
0
 def start_requests(self):
     with open(getattr(self, "file", "todo.csv"), "rU") as f:
         reader = csv.DictReader(f)
         for line in reader:
             request = Request(line.pop('url'))
             request.meta['fields'] = line
             yield request
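The CSV columns other than url end up in request.meta['fields']; a hypothetical companion callback (not part of the original) would typically fold them into the scraped item, for example:

 def parse(self, response):
     # hypothetical callback for the CSV-driven start_requests above
     item = dict(response.meta['fields'])   # extra columns from todo.csv
     item['url'] = response.url
     item['title'] = response.xpath('//title/text()').extract_first()   # assumed field
     yield item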
Example #12
0
	def getComments(self, response):
		Item = response.meta['item']

		res_text = response.body_as_unicode().encode('ascii', 'ignore')
		res_text = smart_str(self.parser.unescape(self.parser.unescape(res_text))).replace('\xc2\xa0','')
		res_text = res_text.replace('\n', ' ').replace('\t', ' ').replace('\r', '')
		res_text = re.subn('<script.*?</script>', '', res_text)[0]
		res_text = re.subn('<style.*?</style>', '', res_text)[0]
		hxs = HtmlXPathSelector(text=res_text)
		
		tmp = hxs.select('//div[@id="ds_div"]//text()').extract()
		comments = ''
		for val in tmp:
			val = val.strip()
			if val != '':
				comments += val + ' '
		Item['Comments'] = comments

		try:
			offers_url = 'http://offer.ebay.com/ws/eBayISAPI.dll?ViewBids&item=' + Item['eBay_Item_Number']
			if Item['eBay_Item_Number'] != 'NA' and Item['eBay_Item_Number'] != '':
				req = Request(offers_url, dont_filter=True, callback=self.getPostingDate)
				req.meta['item'] = Item
				return req
		except:
			pass

		return Item
Example #13
    def start_requests(self):
        page = 1
        search_url = SEARCH_URL.format(page=page)
        request = Request(search_url)
        request.meta['page'] = page

        yield request
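This start_requests (like the identical one in Example #8) only issues the first page; the usual continuation, sketched below with hypothetical names and selectors, bumps request.meta['page'] in the parse callback and keeps yielding pages while results come back.

from scrapy import Request, Spider


class PagedSearchSketch(Spider):
    """Hypothetical sketch of the page/meta pagination pattern above."""
    name = 'paged_search_sketch'
    SEARCH_URL = 'http://example.com/search?page={page}'   # assumed template

    def start_requests(self):
        page = 1
        request = Request(self.SEARCH_URL.format(page=page))
        request.meta['page'] = page
        yield request

    def parse(self, response):
        results = response.css('div.result')   # assumed result selector
        # ... extract and yield items from `results` here ...
        if results:                             # keep paging while the page is non-empty
            page = response.meta['page'] + 1
            request = Request(self.SEARCH_URL.format(page=page))
            request.meta['page'] = page
            yield request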
Example #14
0
    def start_requests(self):
        """
            default Scrapy method to send requests
        """

        # if spider already active
        if self.settings['active'] == 'T':
            log.msg('[OVERLAP] - at %s EST' % (datetime.now(timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")), level=log.INFO)
            # Close the spider
            raise exceptions.CloseSpider('Recon Spider already active')

        # Set spider is activating
        ReconSpiderSettings(self.site).write_active('T')

        log.msg('[START_ID] - %s at %s EST' % (str(self.settings['recon_startid']), datetime.now(timezone('US/Eastern'))
                .strftime("%Y-%m-%d %H:%M:%S")), level=log.INFO)
        log.msg('[CYCLES] - %s at %s EST' % (
            str(self.settings['cycles']), datetime.now(timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")), level=log.INFO)

        # A valid recon_startid is required; if it is missing, close the spider
        if self.settings['recon_startid'] == -1:
            # Close the spider and ask the user to provide an initial start_id
            raise exceptions.CloseSpider('Provide start_id value via start_id parameter for initializing')

        # Generate ids list for reconnoitering
        url_ids = generate_ids(self.site)
        
        # Send URL requests
        for id in url_ids:
            req = Request("".join((self.base_url, str(id))), dont_filter=True, callback=self.parse)
            # save url_id for calling back
            req.meta['url_id'] = id
            yield req
Example #15
0
    def parse(self, response):
        sel = Selector(response)

        if self.dont_crawl:
            request = Request(response.url, callback=self.parse_single_episode)
            item = PlayGrabberItem()
            item['show_url'] = response.url
            # Store the original show id (to be able to detect mixing of seasons)
            item['original_show_id'] = '00000'
            # Pass on the item for further populating
            request.meta['episode-item'] = item
            return request

        # If this is a show index page, we need to get the URL for a single episode
        # (anyone will do, let's take the latest)
        try:
            any_episode_base_url = sel.xpath('//a[@class="play_title-page-trailer__start-button"]/@href').extract()[0]
            any_episode_url = 'http://www.svtplay.se' + any_episode_base_url
        except:
            # Otherwise we assume this url is for a single episode and not an index
            # page, and use it directly
            any_episode_url = response.url

        # Call this page again and make sure we get all episodes
        all_season_tabs = sel.xpath("//a[@class='play_accordion__section-title']/@href").re('[^#]*')
        # Don't include the shorts
        check_season_tabs = [t for t in all_season_tabs if t != '?tab=klipp']
        requests = []
        for tab in check_season_tabs:
            all_episodes_url = any_episode_url.split('?')[0] + tab + '&sida=99'
            request = Request(all_episodes_url, callback=self.parse_all_episodes)
            requests.append(request)

        return requests
Example #16
0
    def parse_all_episodes(self, response):
        # Now extract all episodes and grab each of them
        sel = Selector(response)
        all_episode_urls = sel.xpath("//li/article//a/@href").extract()

        if not all_episode_urls:
            if response.url.endswith('sida=99'):
                # If the number of episodes fit on just one page, the "sida=99" barfs
                # and returns zero hits. Retry without it.
                self.log("Retrying for all episodes assuming a single page for %s" % response.url)
                all_episodes_url = response.url.split('?')[0] + '?tab=senast'
                return Request(all_episodes_url, callback=self.parse_all_episodes)
            else:
                self.log("No episodes available for show %s" % response.url)
        else:
            # Original show_id is not used anymore
            original_show_id = '00000'

            # Get the show url (only valid for top-level pages), but not really important
            show_url = sel.xpath("//meta[@property='og:url']/@content").extract()[0]
            content_type = sel.xpath("//meta[@property='og:type']/@content").extract()[0]
            if content_type != 'video.tv_show':
                self.log("WARNING: This is not a top-level page.")

            requests = []
            for url in all_episode_urls:
                request = Request('http://www.svtplay.se' + url, callback=self.parse_single_episode)
                item = PlayGrabberItem()
                item['show_url'] = show_url
                # Store the original show id (to be able to detect mixing of seasons)
                item['original_show_id'] = original_show_id
                # Pass on the item for further populating
                request.meta['episode-item'] = item
                requests.append(request)
            return requests
Example #17
0
    def parse_page (self, response):
        task = response.meta['task']
        county_id = response.meta['county_id']
        hxs = HtmlXPathSelector(response)

#        inspect_response (response);

        #get next page
        next = hxs.select("//a[contains(text(),'Next')]/@href")
#        if 0:
        if len(next) > 0:
            request = Request (urljoin(response.url, next[0].extract()), callback=self.parse_page, errback=self.error_callback, dont_filter=True)
            request.meta['task'] = task
            request.meta['county_id'] = county_id
            yield request
        else:
            yield self.form_request(task)

        rows = hxs.select ('/html/body/table[4]/tr')
        if (len(rows) == 0):
            self.send_alert ('No permit data found in search response')
            self.log('No permit data table present in response', log.ERROR)
        elif (len(rows) == 1):
            self.log('No incident reports found in response', log.WARNING)
        else:
            # Skip the first report record because this is the header row
            rows.pop (0)
            self.log('Retrieved {0} permits'.format(len(rows)), log.INFO)
            for row in rows:
                r = dict(zip(self.field_names, [f.strip() for f in row.select ('td/text()').extract_unquoted()]))
                r['county'] = self.counties[county_id]
                for item in self.process_row(r, task):
                    yield item
Example #18
0
File: foxy.py Project: claudioharu/MngX
    def parse_chapter_page(self, response):
        hxs = HtmlXPathSelector(response)
        chapter = response.meta['chapter']
        page_number = response.meta['page_number']

        image_url = hxs.select('id("image")/@src').extract()[0]

        chapter.pages[page_number] = image_url
        if len(chapter.pages) == chapter.pages_count:
            # Progress signal incremented
            # print 'All urls of chapter %s of volume %s retrieved. Starting download...' % (chapter.chapter_number, chapter.volume)
            # brave_10/brave_10_v01/brave_10_v01_c01/brave_10_v01_c001_p001.jpg
            chapter_dir_name = chapter.chapter_number
            #volume_dir_name = '%s_%s' % (self.title, chapter.volume)
            #chapter_dir_name = '%s_%s' % (volume_dir_name, chapter.chapter_number) #(volume_dir_name, chapter.chapter_number) 
            chapter_dir = os.path.join(self.title, chapter_dir_name) #((self.title, volume_dir_name, chapter_dir_name))

            chapter.storage_dir = chapter_dir
            chapter.filename_pattern = '%03d.jpg' #chapter_dir_name + '_p%03d.jpg'

            if os.path.exists(chapter_dir):
                shutil.rmtree(chapter_dir)
            os.makedirs(chapter_dir)

            reqs = []
            for page, image_url in chapter.pages.iteritems():
                page_image_request = Request(image_url, callback=self.process_page_image)
                page_image_request.meta['chapter'] = chapter
                page_image_request.meta['page_number'] = page
                reqs.append(page_image_request)

            self.count += 1
            # self.emit(QtCore.SIGNAL("progress(int)"), (self.count*100)/self.totalChapt)
            self.prog.setValue((self.count*100)/self.totalChapt)
            return reqs
Example #19
0
 def parse_hospital(self, response):
     hxs = Selector(response)
     department_urls = hxs.xpath("//table[@id='hosbra']/tr/td/a[@class='blue']/@href").extract()
     for department_url in department_urls:
         request = Request(department_url, callback=self.parse_doctors)
         request.meta['city'] = response.meta['city']
         yield request
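A hypothetical downstream callback (not shown in the original) reads the city back out of response.meta; the selector below is an assumption for illustration only.

 def parse_doctors(self, response):
     city = response.meta['city']
     for name in response.xpath("//a[@class='blue']/text()").extract():   # assumed selector
         yield {'city': city, 'doctor': name.strip()}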
Example #20
0
    def parse(self, response):
        
        sel = Selector(response)
        title = sel.xpath('//div/div[@id="gd2"]/h1[@id="gj"]/text()').extract()[0]
        # strip characters that are not filename-safe from the title
        p = re.compile(r'[!?\\/|]')
        title = re.sub(p, '', title)
        sites = sel.xpath('//div[@id="gdt"]/div[@class="gdtm"]')

        next_page = ''
        if sel.xpath('(//table/tr/td[@onclick="sp({0})"])/a/@href'.format(self.next_page_count)).extract() != []:
            next_page = sel.xpath('(//table/tr/td[@onclick="sp({0})"])/a/@href'.format(self.next_page_count)).extract()[0]
            self.next_page_count += 1
        
        for site in sites:
            item = EhendownItem()
            item['title'] = title
            item['page'] = '{0:0>3d}'.format(self.page_count)
            item['image_page'] = site.xpath('div/a/@href').extract()[0]
            request = Request(item['image_page'], callback = self.parse_image)
            request.meta['item'] = item
            self.page_count += 1
            yield request

        print('page: ' + next_page)
        print(self.page_count)
        if next_page:
            yield Request(next_page, callback=self.parse)
Example #21
0
def process_bbc_sitemap(spider, body):
    print "Enter processing sitemap for bbc"
    data = bs(body)
    urls = data.find_all('url')
    for url in urls:
        link = url.loc.text
        news = url.find('news:news')
        item = None
        if news is not None:
            item = SitemapItem()
            title = news.find('news:title')
            item['title'] = title.text
            #format: 2014-04-25T09:43:49Z
            date = news.find('news:publication_date')
            item['update'] = datetime.datetime.strptime(date.text.strip(), '%Y-%m-%dT%H:%M:%SZ')

        else:
            lastmod = url.find('lastmod')
            if lastmod:
                item = SitemapItem()
                #format: 2014-04-25T10:27:05Z
                item['update'] = datetime.datetime.strptime(lastmod.text.strip(), '%Y-%m-%dT%H:%M:%SZ')

        
        req = Request(link, callback = spider.process_page)
        if item is not None:
            req.meta['item'] = item
        yield req
Example #22
0
	def parse(self, response):
		""""""
		currentPage = response.xpath('//div[@class="page mb10"]/span/text()').extract()[0]
		print '---------------- Page: %s  ----------------' % str(currentPage)
		
		jbs = response.xpath('//i[@class="iDes"]')
		
		for jb in jbs:
			jburl = jb.xpath('em[@class="eName"]/span/a/@href').extract()[0] if jb.xpath('em[@class="eName"]/span/a/@href') else ''
			keshi = jb.xpath('em[@class="eSym"]/a/text()').extract() if jb.xpath('em[@class="eName"]/span/a/@href') else ''
			
			if jburl != '':
				request = Request(jburl, callback=self.parse_nav)
				request.meta['ks'] = keshi
				yield request
			
			#urlList = jb.xpath('a/@href').extract()
			#if len(urlList) > 0:
				#jb_url = urlList[0]
				#request = Request(jb_url, callback=self.parse_nav)
				#yield request               			
		
		# Loop: keep crawling the "next page" link
		nextpages = response.xpath('//div[@class="page mb10"]/a[@class="next"]/@href').extract()
		if len(nextpages) > 0:
			nextpage = nextpages[0]
			print '----- next page ------'
			print nextpage
			print '----- next page ------'
			
			req = Request(url=nextpage, callback=self.parse)
			yield req		
Example #23
0
    def parse_education_structure_page(self, response):
        """ This method is specific for the VO-schools, as these can have
        multiple educational structures (vmbo, havo, vwo, ...) """
        hxs = HtmlXPathSelector(response)
        structures = hxs.select('//li[@class="match"]/noscript/a')

        # The VOSchool item to be populated
        organisation = VOSchool()
        organisation['education_structures_to_scrape'] = set()

        # If we end up at the schools page directly, immediately yield request
        if not structures:
            request = Request(response.url, self.parse_organisation_detail_page)
            request.meta['item'] = organisation
            yield request

        organisation['name'] = hxs.select('//h1[@class="stitle"]/text()').extract()[0].strip()

        crawl_structures = {}
        for structure in structures:
            url = 'http://toezichtkaart.owinsp.nl/schoolwijzer/%s'\
                % structure.select('@href').extract()[0]
            url = self.open_blocks(url)
            crawl_structures[url] = structure.select('text()').extract()[0]
            organisation['education_structures_to_scrape'].add(url)

        for url, structure in crawl_structures.iteritems():
            request = Request(url, self.parse_organisation_detail_page)
            request.meta['item'] = organisation
            request.meta['structure'] = structure
            yield request
Example #24
0
 def parse_categories(self,response):
   #item = response.meta['item']
   sel = Selector(response)
   #item["Category"] = ""
   url = sel.xpath("//table[@class='PageNormalTextSmall']/tr/td[@align='center']/a/@href").extract()        
   brand = sel.xpath("//td[@colspan='3']/span/text()").extract()    
   category = []
   size = len(brand)
   for i in range(size):
     category.append(sel.xpath("//span[@class='PageHeaderText']/text()").extract()[0])
   
   for x,name,cat in zip(url,brand,category):
     item = BigCItem()
     item["Category"] = cat
     
     for i in range(len(starkenncat)):
       if cat == starkenncat[i]:
         item["Category"] = LYScat[i]
         break
       else:
         item["Category"] = "NA-"+cat
     item["Brand_Name"] = name
     request = Request("http://www.starkennbikes.com/"+x,callback=self.parse_items) #For Parsing Information if search keyword found
     request.meta["item"]  = item
     yield request    
Example #25
0
File: qbai.py Project: No7777/qiubai
 def parse(self, response):
     for href in response.xpath('//span[@class="stats-comments"]/a/@href').extract():
         detail_url = response.urljoin(href)
         req = Request(detail_url, self.parse_detail_page)
         item = QiubaiItem()
         req.meta['item'] = item
         yield req
Example #26
0
 def parse_items(self, response):
     hxs = Selector(response)
     print "came here"
     
     data = imdbItem()
     data["seriesRating"] = hxs.xpath('//span[@itemprop="ratingValue"]/text()').extract()
     print data["seriesRating"]
     seasonLink = hxs.xpath('//*[@id="title-episode-widget"]/div/div[3]/a/@href').extract()
     print seasonLink
     #Directly go to ratings page
     '''
     if not seasonLink==[]:
         #print data["link"]
         url = data["link"][0]+'epdate'
         request = Request(url,callback=self.parse_episode_ratings)
         request.meta['item'] = data
         yield request
     '''    
 
     #follow season links - can get more data as opposed to above method
     if not seasonLink==[]:
         for season in seasonLink:
             link = 'http://www.imdb.com/'+season
             request = Request(link,callback=self.parse_season_links)
             request.meta['item'] = data
             yield request
Example #27
0
 def get_next_page_request(self, response):
     sleep_time = self.crawler.settings.get('DOWNLOAD_DELAY',1)
     time.sleep(sleep_time)
     request_data = response.request.body
     data = json.loads(request_data)
     page_number = data.get('PageNumber',None)
     if not page_number:
         return
     page_number = int(page_number) + 1
     data['PageNumber'] = page_number
     referer = response.request.headers['Referer']
     url = 'http://searchtel.patentstar.com.cn/CPRS2010/cn/PatentGeneralList.aspx/GetXmlResult'
     # url = 'http://searchtel.patentstar.com.cn/CPRS2010/cn/PatentGeneralList.aspx/GetXmlResult'
     headers = {
         'Content-Type': 'application/json; charset=UTF-8',
         'Referer': referer,
         'Cookie': get_cookie(),
     }
     _response = requests.post(url, data=json.dumps(data),headers=headers)
     try:
         result = _response.json()
         if result['d'][0] is not None:
             immediate_response = response_requests2scrapy(_response)
             meta = {
                 'immediate_response': immediate_response
             }
             request = Request(url,headers=headers,method='POST',meta=meta)
             # noinspection PyUnresolvedReferences
             request.callback = self.query_callback
             return request
     except Exception as e:
         log.msg('spider turn page error:%s' % str(e), level=log.INFO)
         return None
Example #28
    def parse_themes(self, response):
        item = response.meta['item']
        colors_groups = response.meta['colors_groups']

        themes = response.xpath("//div[@class='sf_items sf_colors'][2]/ul/li/@title").extract()

        item["themes"] = ",".join(themes)

        if colors_groups.get('varies') is not None:
            colors_groups["varies_res"] = list(colors_groups["varies"])
            colors = []
            colors_x = response.xpath("//div[@class='sf_items sf_colors'][1]/ul/li")
            for color_x in colors_x:
                colors.append([color_x.xpath("@title").extract()[0], color_x.xpath("substring-after(a/@href, '=')").extract()[0]])


            color = colors.pop(0)
            request = Request(url="http://www.madeleine.de%s?cf=%s" % (item["path"], color[1]) , callback=self.find_colors)
            request.meta['item'] = item
            request.meta['colors_groups'] = colors_groups
            request.meta['cur_color'] = color[0]
            request.meta['colors'] = colors

            return request
        else:
            item["colors"] = self.make_colors(colors_groups)
            self.check_item(item)
            return item
Example #29
0
    def test_download_gzip_response(self):

        if twisted_version > (12, 3, 0):

            crawler = get_crawler(SingleRequestSpider)
            body = b"1" * 100  # PayloadResource requires body length to be 100
            request = Request("http://localhost:8998/payload", method="POST", body=body, meta={"download_maxsize": 50})
            yield crawler.crawl(seed=request)
            failure = crawler.spider.meta["failure"]
            # download_maxsize < 100, hence the CancelledError
            self.assertIsInstance(failure.value, defer.CancelledError)

            if six.PY2:
                request.headers.setdefault(b"Accept-Encoding", b"gzip,deflate")
                request = request.replace(url="http://localhost:8998/xpayload")
                yield crawler.crawl(seed=request)
                # download_maxsize = 50 is enough for the gzipped response
                failure = crawler.spider.meta.get("failure")
                self.assertIsNone(failure)
                reason = crawler.spider.meta["close_reason"]
                self.assertEqual(reason, "finished")
            else:
                # See issue https://twistedmatrix.com/trac/ticket/8175
                raise unittest.SkipTest("xpayload only enabled for PY2")
        else:
            raise unittest.SkipTest("xpayload and payload endpoint only enabled for twisted > 12.3.0")
Example #30
0
File: spider.py Project: dowson521/scrapy
    def parse_content(self, response):
        item = UyuItem()
        selector_content = Selector(response)
        req = []
        Article_Content = selector_content.xpath('//div[@class="content"]/article[@class="excerpt"]')
        for article in Article_Content:
            article_names = article.xpath(
                '//div[@class="content"]/article[@class="excerpt"]/header/h2/a/text()'
            ).extract()
            article_urls = article.xpath(
                '//div[@class="content"]/article[@class="excerpt"]/header/h2/a/@href'
            ).extract()

            # Article summary title
            for article_name in article_names:
                item["article_name"] = article_name

            for url in article_urls:
                r = Request(url, callback=self.parse_article)
                r.meta["item"] = item
                req.append(r)

            nextLink = article.xpath('//li[@class="next-page"]/a/@href').extract()
            if nextLink:
                nr = Request(nextLink[0], callback=self.parse_content)
                req.append(nr)

        return req
Example #31
0
File: msworld.py Project: asheteh/data
 def parse(self, response):
     links = response.xpath(
         "//div[@class='entry-content']//a/@href").extract()
     links = links[:-1]
     for i in links:
         yield Request(i, callback=self.parse_book, dont_filter=True)
Example #32
0
    def parse(self, response):
        base_url = get_base_url(response)

        categories = response.xpath('//ul[@class="main-nav"]/li/a/@href').extract()[1:]
        for url in categories:
            yield Request(urljoin_rfc(base_url, url),
                          cookies=self.additional_cookies)

        sub_categories = response.xpath('//div[@class="sidenav-title" and span/text()="Browse Categories"]'
                                        '/following-sibling::div[@class="inner"]//a/@href').extract()
        for url in sub_categories:
            yield Request(urljoin_rfc(base_url, url),
                          cookies=self.additional_cookies)

        per_page = set(response.xpath('//div[contains(@class, "showing-per-page")]//option/@value').extract())
        if per_page:
            per_page_param = url_query_parameter(response.url, 'productsPerPage')
            if per_page_param != '48':
                url = add_or_replace_parameter(response.url, 'productsPerPage', '48')
                url = add_or_replace_parameter(url, 'page', '0')
                yield Request(url, cookies=self.additional_cookies)
                return

            # Check for valid location
            is_valid, country_detected = self._is_valid_location(response)
            if not is_valid:
                reason = 'Wrong country detected: %s' % country_detected
                new_request = self._retry_request(response, self.parse, reason)
                if new_request:
                    yield new_request
                return

            # Parse products
            mde = MicrodataExtractor()
            data = mde.extract(response.body)
            if data:
                product_ids = response.xpath('//div[@itemtype="http://schema.org/Product"]/@data-id').extract()
                product_urls = map(lambda u: urljoin_rfc(base_url, u),
                    response.xpath('//div[@itemtype="http://schema.org/Product"]'
                                   '/div[@class="product-info"]/div[@class="title"]/a/@href').extract())
                product_imgs = map(lambda u: urljoin_rfc(base_url, u),
                    response.xpath('//div[@itemtype="http://schema.org/Product"]//a[@class="product-image"]'
                                   '//img[@class="product-image-file"]/@src').extract())
                rrp_prices = {}
                for product_id in product_ids:
                    rrp_price = response.xpath('//div[@data-id="%s"]//div/@data-tc-original-price' % product_id).extract()
                    if rrp_price:
                        rrp_prices[product_id] = rrp_price[0]

                products_extra_data = {}
                for product_id, product_url, product_img in zip(product_ids, product_urls, product_imgs):
                    products_extra_data[product_id] = {
                        'url': product_url,
                        'image_url': product_img,
                    }

                category = ''
                categories = filter(lambda item: item['type'] == 'http://data-vocabulary.org/Breadcrumb', data['items'])
                if categories:
                    category = categories[0]['properties']['title'][1]
                brands = set(response.xpath('//div[@class="filter-brand-wrapper"]'
                                            '//label[contains(@for, "product-listings__filter-top-brands-")]/a[@disabled]/text()')\
                             .re(r'(.*) \('))
                products = filter(lambda item: item.get('type', '') == 'http://schema.org/Product', data['items'])
                for product in products:
                    product_id = product['properties']['productId']
                    ajax_url = self.AJAX_URL % product_id
                    headers = {'X-Requested-With': 'XMLHttpRequest'}
                    req = Request(ajax_url,
                                  headers=headers,
                                  callback=self.parse_options,
                                  meta={'main_product': product['properties'],
                                        'category': category,
                                        'products_extra': products_extra_data,
                                        'brands': brands,
                                        'rrp_prices': rrp_prices,
                                        'proxy': response.meta.get('proxy'),
                                        'proxy_service_disabled': True},
                                  cookies=self.additional_cookies)
                    yield req

                # Check for next page and follow this if exists
                next_page = response.xpath('//li[@class="next"]/a/@href').extract()
                if next_page:
                    yield Request(urljoin_rfc(get_base_url(response), next_page[0]),
                                  cookies=self.additional_cookies)
Example #33
 def test_request_response(self):
     req = Request('http://example.com/index.html')
     resp = Response(req.url, status=200)
     ret = self._download(req, resp)
     self.assertTrue(isinstance(ret, Response), "Non-response returned")
Example #34
0
File: cglobbbank.py Project: Kuzyak/Bank
    def parse(self, response):
        try:
            chf = " — "
            cny = " — "
            eur = " — "
            gbp = " — "
            ron = " — "
            rub = " — "
            usd = " — "
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!    NachBank!")
            sel = Selector(response)
            result1 = sel.xpath('//table[@class="datatable"]/tbody/tr')
            for some1 in result1:
                name = some1.xpath('.//td//text()').extract()[0]
                #print (name)
                value = some1.xpath('.//td//text()').extract()[-1]
                if (name == "EUR"):
                    eur = value
                elif (name == "USD"):
                    usd = value
                elif (name == "CNY"):
                    cny = value
                elif (name == "RUB"):
                    rub = value
                elif (name == "RON"):
                    ron = value
                elif (name == "GBP"):
                    gbp = value
                elif (name == "CHF"):
                    chf = value
            yield ArticleItem(
                EUR=eur,
                USD=usd,
                CNY=cny,
                RUB=rub,
                RON=ron,
                GBP=gbp,
                CHF=chf,
                #description = des,
                url='https://www.mnb.hu/arfolyamok',
                title="NachBank")
        except:
            contact_message = """
            Bank NachBank:\n
            ERROR
            """
            send_mail("Bank fail", contact_message, self.from_email,
                      [self.to_email])
            print(
                "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!    NachBank!    ERROR--ERROR--ERROR--ERROR"
            )

        #KREDIT
        yield Request(
            "https://www.bankracio.hu/hitelkalkulator/lakashitel/2-lakasvasarlasi-hitel-uj-lakasra",
            callback=self.kredit)
        #BUDAPEST
        yield Request(
            "https://www.budapestbank.hu/info/arfolyamok/db_arfolyamok.php?sent=1&frm_arfolyam=CCR",
            callback=self.budapest)

        #CIB
        yield Request("http://www.cib.hu/maganszemelyek/arfolyamok/arfolyamok",
                      callback=self.cib)
        # ERSTE!
        try:
            resp4 = yield Request(
                "http://www.erstebank.hu/ekwa-web-web/includes/content/currency/exchangeRates.xhtml"
            )
            yield FormRequest.from_response(
                resp4,
                formxpath='//input[@id="exchangeRateForm:j_idt31"]',
                callback=self.erste)
        except:
            contact_message = """
            Bank ERSTE:\n
            ERROR
            """
            send_mail("Bank fail", contact_message, self.from_email,
                      [self.to_email])
            print(
                "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!    ERSTE!    ERROR--ERROR--ERROR--ERROR"
            )

        # Valuta History
        resp12 = yield Request("https://www.mnb.hu/en/arfolyam-lekerdezes")
        yield FormRequest.from_response(resp12,
                                        formxpath='//input[@id="geterates"]',
                                        callback=self.nach_history)
        #GRANIT
        yield Request("https://granitbank.hu/arfolyamok", callback=self.granit)

        #OTP
        yield Request(
            "https://www.otpbank.hu/apps/exchangerate/api/exchangerate/otp/{}".
            format(str(datetime.now()).split(" ")[0]),
            callback=self.otp)

        #RAIFFEISEN
        yield Request(
            "https://www.raiffeisen.hu/hasznos/arfolyamok/lakossagi/valutaarfolyamok",
            callback=self.raiffeisen)

        #K&H   k_and_h
        yield Request("https://www.kh.hu/valuta-deviza-arfolyam",
                      callback=self.k_and_h)

        #MKB
        yield Request("https://www.mkb.hu/apps/rates/rates?type=CAD",
                      callback=self.mkb,
                      method="GET")

        #UNICREDIT

        try:
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!   UNICREDIT!")
            d = datetime.today()
            date_now = str(d.year)
            if len(str(d.month)) == 1:
                date_now = date_now + "0" + str(d.month)
            else:
                date_now = date_now + str(d.month)
            if len(str(d.day)) == 1:
                date_now = date_now + "0" + str(d.day)
            else:
                date_now = date_now + str(d.day)
            date_now = date_now + "T"
            if len(str(d.hour)) == 1:
                date_now = date_now + "0" + str(d.hour)
            else:
                date_now = date_now + str(d.hour)
            date_now = date_now + "23:00:00.000+0300"
            payload = {
                'Currency': '*ALL',
                'DateFrom': date_now,
                'DateTo': date_now
            }
            headers = {
                'User-Agent':
                'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
                'Accept': '*/*',
                'Content-Type': 'application/json',
                'EntityCode': 'HU',
                'Language': 'HU',
                'SourceSystem': 'PWS',
                'Product': 'PWS'
            }
            respons0 = requests.post(
                'https://www.unicreditbank.hu/cwa/GetExchangeRates',
                headers=headers,
                data=json.dumps(payload))
            sel_11 = Selector(respons0)
            result = json.loads(sel_11.xpath('//p').extract()[0][3:-4])
            chf = " — "
            cny = " — "
            eur = " — "
            gbp = " — "
            ron = " — "
            rub = " — "
            usd = " — "
            for some in range(0, len(result)):
                if result[some]["CurrencyCode"] in [
                        'CHF', 'CNY', 'EUR', 'GBP', 'RON', 'RUB', 'USD'
                ]:
                    if result[some]["CurrencyCode"] == 'CHF':
                        chf = str(
                            "%.2f" % result[some]["PurchaseRate"]) + "/" + str(
                                "%.2f" % result[some]["SaleRate"])
                    elif result[some]["CurrencyCode"] == 'CNY':
                        cny = str(
                            "%.2f" %
                            result[some]["CashPurchaseRate"]) + "/" + str(
                                "%.2f" % result[some]["CashSaleRate"])
                    elif result[some]["CurrencyCode"] == 'EUR':
                        eur = str(
                            "%.2f" % result[some]["PurchaseRate"]) + "/" + str(
                                "%.2f" % result[some]["SaleRate"])
                    elif result[some]["CurrencyCode"] == 'GBP':
                        gbp = str(
                            "%.2f" % result[some]["PurchaseRate"]) + "/" + str(
                                "%.2f" % result[some]["SaleRate"])
                    elif result[some]["CurrencyCode"] == 'RON':
                        ron = str(
                            "%.2f" %
                            result[some]["CashPurchaseRate"]) + "/" + str(
                                "%.2f" % result[some]["CashSaleRate"])
                    elif result[some]["CurrencyCode"] == 'RUB':
                        rub = str(
                            "%.2f" %
                            result[some]["CashPurchaseRate"]) + "/" + str(
                                "%.2f" % result[some]["CashSaleRate"])
                    elif result[some]["CurrencyCode"] == 'USD':
                        usd = str(
                            "%.2f" % result[some]["PurchaseRate"]) + "/" + str(
                                "%.2f" % result[some]["SaleRate"])
            yield ArticleItem(
                EUR=eur,
                USD=usd,
                CNY=cny,
                RUB=rub,
                RON=ron,
                GBP=gbp,
                CHF=chf,
                #description = des,
                url=
                "https://www.unicreditbank.hu/hu/maganszemelyek/exchange_rate.html",
                title="UNICREDIT")
        except:
            contact_message = """
            Bank UNICREDIT:\n
            ERROR or Weekend
            """
            send_mail("Bank fail", contact_message, self.from_email,
                      [self.to_email])
            print(
                "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!   UNICREDIT!   ________   ERROR or Weekend"
            )

        try:
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!    SBERBANK!")
            d = datetime.today()
            date_now = str(d.year)
            if len(str(d.month)) == 1:
                date_now = date_now + ".0" + str(d.month)
            else:
                date_now = date_now + "." + str(d.month)
            if len(str(d.day)) == 1:
                date_now = date_now + ".0" + str(d.day)
            else:
                date_now = date_now + "." + str(d.day)
            payload = {
                'maxDays': "60",
                'language': "hu",
                'rateType': "valuta",
                'dateFrom': date_now,
                'allCurrency': "true"
            }
            headers = {
                'Content-Type':
                'application/x-www-form-urlencoded; charset=UTF-8'
            }
            respons5 = requests.post(
                'https://www.sberbank.hu/servlet/currencyRateServlet',
                headers=headers,
                data=payload)
            sel_10 = Selector(respons5)
            result = json.loads(sel_10.xpath('//p').extract()[0][3:-4])
            print(result["notFound"])
            if result["notFound"] == False:
                cny = " — "
                ron = " — "
                for some in range(
                        0,
                        len(result["currencyRatesByDay"][0]["currencyRates"])):
                    if result["currencyRatesByDay"][0]["currencyRates"][some][
                            "currency"] == 'CHF':
                        chf = str(
                            result["currencyRatesByDay"][0]["currencyRates"]
                            [some]["buyRate"]) + "/" + str(
                                result["currencyRatesByDay"][0]
                                ["currencyRates"][some]["sellRate"])
                    elif result["currencyRatesByDay"][0]["currencyRates"][
                            some]["currency"] == 'EUR':
                        eur = str(
                            result["currencyRatesByDay"][0]["currencyRates"]
                            [some]["buyRate"]) + "/" + str(
                                result["currencyRatesByDay"][0]
                                ["currencyRates"][some]["sellRate"])
                    elif result["currencyRatesByDay"][0]["currencyRates"][
                            some]["currency"] == 'GBP':
                        gbp = str(
                            result["currencyRatesByDay"][0]["currencyRates"]
                            [some]["buyRate"]) + "/" + str(
                                result["currencyRatesByDay"][0]
                                ["currencyRates"][some]["sellRate"])
                    elif result["currencyRatesByDay"][0]["currencyRates"][
                            some]["currency"] == 'RUB':
                        rub = str(
                            result["currencyRatesByDay"][0]["currencyRates"]
                            [some]["buyRate"]) + "/" + str(
                                result["currencyRatesByDay"][0]
                                ["currencyRates"][some]["sellRate"])
                    elif result["currencyRatesByDay"][0]["currencyRates"][
                            some]["currency"] == 'USD':
                        usd = str(
                            result["currencyRatesByDay"][0]["currencyRates"]
                            [some]["buyRate"]) + "/" + str(
                                result["currencyRatesByDay"][0]
                                ["currencyRates"][some]["sellRate"])
                yield ArticleItem(
                    EUR=eur,
                    USD=usd,
                    CNY=cny,
                    RUB=rub,
                    RON=ron,
                    GBP=gbp,
                    CHF=chf,
                    description=date_now,
                    url=
                    "http://www.sberbank.hu/hu/alkalmazasok/arfolyamok.html",
                    title="SBERBANK")
            else:
                contact_message = """
                Bank SBERBANK:\n
                ERROR or Weekend
                """
                send_mail("Bank fail", contact_message, self.from_email,
                          [self.to_email])
                print(
                    "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!   SBERBANK!   ________   ERROR or Weekend"
                )
        except:
            contact_message = """
            Bank SBERBANK:\n
            ERROR or Weekend
            """
            send_mail("Bank fail", contact_message, self.from_email,
                      [self.to_email])
            print(
                "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!   SBERBANK!   ________   ERROR or Weekend"
            )
Example #35
0
 def start_requests(self):
     for url in self.start_urls:
         yield Request(url=url, callback=self.parse)
     for url in self.ipod_urls:
         yield Request(url=url, callback=self.parse_ipods)
Example #36
0
 def get_media_requests(self, item, info):
     for image_url in item['image_urls']:
         yield Request(image_url)
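For context, get_media_requests usually lives on an ImagesPipeline subclass; a hedged sketch of the surrounding class is shown below (the import path is for recent Scrapy versions, IMAGES_STORE must be configured, and the image_paths field is assumed to exist on the item).

from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline   # scrapy.contrib.pipeline.images in old versions


class SketchImagesPipeline(ImagesPipeline):
    # requires IMAGES_STORE to be set in settings.py

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_failure) tuples
        image_paths = [data['path'] for ok, data in results if ok]
        if not image_paths:
            raise DropItem("Item contains no downloaded images")
        item['image_paths'] = image_paths   # assumed field on the item
        return item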
Example #37
0
 def start_requests(self):
     for url in self.products:
         yield Request(url)
Example #38
0
 def load_listpage(self, response):
     yield Request(url=self.list_url,
                   callback=self.process_list,
                   dont_filter=True)
Example #39
0
 def parse(self, response):
     self.logger.info("===========parse=============")
     responseUrl = response._url
     category = response.meta['category']
     newsTitle = response.meta['newsTitle']
     newsCover = response.meta['newsCover']
     # id =re.compile('\d+')
     # idResult = id.match(response._url)
     # commentId =re.findall("\d+",responseUrl)[0]
     # commentUrl = self.commentBaseUrl+commentId
     # self.allCommentUrlList.append({newsTitle:commentUrl})
     # commentDict = self.allCommentUrlList.pop(0)
     # print(commentDict)
     # for title,url in commentDict.items():
     #     yield Request(url, callback=self.comment_url_callback, meta={'title': title}, dont_filter=True)
     try:
         # commentId = re.findall("\d+", responseUrl)[0]
         # commentUrl = self.commentBaseUrl + commentId
         # self.allCommentUrlList.append({newsTitle: commentUrl})
         keywords = response.xpath(
             "//head/meta[@name='Keywords']/@content").extract_first()
         description = response.xpath(
             "//head/meta[@name='Description']/@content").extract_first()
         print('Keywords:' + keywords + ',' + 'Description:' + description)
         newsContents = response.xpath("//div[@class='newscontent']")
         # newsTitle = newsContents.xpath("./h1[@class='news_title']/text()").extract_first()
         newsAuthor = newsContents.xpath(
             "./div[@class='news_about']/p[1]/text()").extract_first()
         newsDate = newsContents.xpath(
             "./div[@class='news_about']/p[2]/text()").extract_first()
         # newsCover = newsContents.xpath(".//img/@src").extract_first()
         newsContentList = newsContents.xpath(
             "./div[@class='news_txt']/text()").extract()
         newsContent = "".join(newsContentList)
         print(newsContent)
         news_love = newsContents.xpath(
             "./div[@class='news_love']//a[@class='zan']/text()"
         ).extract_first()
         item = ThepaperspiderItem()
         item["title"] = newsTitle
         item["author"] = newsAuthor
         item["datetime"] = newsDate
         item["newsCover"] = newsCover
         item["newsContent"] = newsContent
         item["keywords"] = keywords
         item["description"] = description
         item["collectedCount"] = news_love
         item["category"] = category
         item["story_id"] = '1'
         item["comefrom"] = 'news'
         print(item)
         yield item
         imgItem = ImgItem()
         imgItem['image_urls'] = {newsTitle: newsCover}
         imgItem['comefrom'] = 'imgs'
         print(imgItem)
         yield (imgItem)
         commentId = re.findall("\d+", responseUrl)[0]
         commentUrl = self.commentBaseUrl + commentId
         self.allCommentUrlList.append({newsTitle: commentUrl})
         if len(self.allCommentUrlList) != 0:
             commentDict = self.allCommentUrlList.pop(0)
             print(commentDict)
             for title, url in commentDict.items():
                 yield Request(url,
                               callback=self.comment_url_callback,
                               meta={'title': title},
                               dont_filter=True)
     except:
         print("Oops, an exception occurred")
         pass
     if len(self.allNewsUrlList) != 0:
         urlDict = self.allNewsUrlList.pop(0)
         newsInfo = []
         for key in urlDict:
             newsInfo.append(key)
         category = newsInfo[0]
         newsTitle = newsInfo[1]
         newsUrl = urlDict[category]
         newsCover = urlDict[newsTitle]
         yield Request(newsUrl,
                       callback=self.parse,
                       meta={
                           'category': category,
                           'newsTitle': newsTitle,
                           'newsCover': newsCover
                       },
                       dont_filter=True)
예제 #40
0
    def parse(self, response):
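        # Generic crawl callback: validate the link, build a RawResponseItem from the
        # raw HTTP response, hand PDFs to the pdf parser, and follow extracted links
        # until the configured maxdepth is reached.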
        # Check url at start of parse to catch links that were potentially redirected.
        orig_domain = response.url
        if "orig_domain" in response.meta:
            orig_domain = response.meta["orig_domain"]
        else:
            response.meta["orig_domain"] = orig_domain
        if not self.validate_link(response.url, orig_domain):
            return

        self._logger.debug("starting parse on url {}".format(
            response.request.url))
        cur_depth = 0
        if 'curdepth' in response.meta:
            cur_depth = response.meta['curdepth']
        else:
            response.meta['curdepth'] = cur_depth
        self._logger.debug("Forming response object")
        # capture raw response
        item = RawResponseItem()
        # populated from response.meta
        item['appid'] = response.meta['appid']
        item['crawlid'] = response.meta['crawlid']
        item['attrs'] = response.meta['attrs']

        # populated from raw HTTP response
        item["url"] = response.request.url
        item["response_url"] = response.url
        item["status_code"] = response.status
        item["status_msg"] = "OK"
        item["response_headers"] = self.reconstruct_headers(response)
        item["request_headers"] = response.request.headers
        item["links"] = []
        item["curdepth"] = str(cur_depth)

        is_pdf = False
        url = response.url.lower()
        if url.endswith('.pdf') or '.pdf?' in url:
            is_pdf = True

        item["is_pdf"] = str(is_pdf)
        if is_pdf:
            self._logger.debug("Handling pdf file")
            self.download_file(response.url)
            item["body"] = self.pdfparser("temp_document.pdf")
        else:
            item["body"] = self.gather_text(response.body)
            self._logger.debug("Current depth: " + str(cur_depth))
            # determine whether to continue spidering
            if cur_depth >= response.meta['maxdepth']:
                self._logger.debug("Not spidering links in '{}' because" \
                    " cur_depth={} >= maxdepth={}".format(
                                                          response.url,
                                                          cur_depth,
                                                          response.meta['maxdepth']))
            else:
                # we are spidering -- yield Request for each discovered link
                link_extractor = LinkExtractor(
                    allow_domains=response.meta['allowed_domains'],
                    allow=response.meta['allow_regex'],
                    deny=response.meta['deny_regex'],
                    deny_extensions=response.meta['deny_extensions'])

                for link in link_extractor.extract_links(response):
                    # link that was discovered
                    the_url = link.url
                    the_url = the_url.replace('\n', '')
                    if not self.validate_link(the_url, orig_domain):
                        continue
                    item["links"].append(
                        str({
                            "url": the_url,
                            "text": link.text,
                        }))
                    req = Request(the_url, callback=self.parse)

                    req.meta['priority'] = response.meta['priority'] - 10
                    req.meta['curdepth'] = response.meta['curdepth'] + 1

                    if 'useragent' in response.meta and \
                            response.meta['useragent'] is not None:
                        req.headers['User-Agent'] = response.meta['useragent']

                    self._logger.debug("Trying to follow link '{}'".format(
                        req.url))
                    yield req

        # raw response has been processed, yield to item pipeline
        yield item
예제 #41
0
    def parse_cat_0(self, response):
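        # Category landing page: jump straight to the mini-bag listing if present;
        # otherwise find the current collection in the menu, clone the user metadata
        # per department/category, and return Requests for every listing page.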
        metadata = response.meta['userdata']
        sel = Selector(response)

        # MINI-BAG
        temp = sel.xpath(
            '//article[contains(@class,"sliding-backgrounds")]//a[@href and contains(@class,"background")]'
        )
        if temp:
            return Request(url=self.process_href(temp[0]._root.attrib['href'],
                                                 response.url),
                           callback=self.parse_list,
                           meta={'userdata': metadata},
                           errback=self.onerr)

        node = None
        temp = sel.xpath(
            '//div[@class="menu"]/ul[@class="collections"]/li[contains(@class,"collection")]/'
            'div[contains(@class,"name")]/a[@href]')
        if temp:
            for temp1 in temp:
                if self.process_href(temp1._root.attrib['href'],
                                     response.url) == response.url:
                    node = temp1
                    break
        if not node:
            return None

        ret = []
        for node1 in node.xpath(
                '../../ul[contains(@class,"departments")]/li[contains(@class,"department")]/div/a[@href]'
        ):
            m1 = copy.deepcopy(metadata)
            href = node1._root.attrib['href']
            mt = re.search('/([^/]+)$', href)
            if mt:
                tag_name = unicodify(mt.group(1)).lower()
                tag_text = unicodify(
                    node1._root.text).lower() if node1._root.text else tag_name
                m1['tags_mapping']['category-1'] = [{
                    'name': tag_name,
                    'title': tag_text
                }]

            # check whether this department has a further sub-category level
            for node2 in node1.xpath(
                    '../../ul[contains(@class,"categories")]/li[contains(@class,"category")]//a[@href]'
            ):
                m2 = copy.deepcopy(m1)
                href = node2._root.attrib['href']
                mt = re.search('/([^/]+)$', href)
                if mt:
                    tag_name = unicodify(mt.group(1))
                    tag_text = unicodify(
                        node2._root.text) if node2._root.text else tag_name
                    m2['tags_mapping']['category-2'] = [{
                        'name': tag_name,
                        'title': tag_text
                    }]
                ret.append(
                    Request(url=self.process_href(href, response.url),
                            meta={'userdata': m2},
                            callback=self.parse_list,
                            errback=self.onerr))

        return ret
예제 #42
0
    def parse(self, response):
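        # Distribute crawling across processes via ZooKeeper: the node that manages to
        # create /task/taobao acts as master and splits the category URLs into per-node
        # task queues; every node then polls its own queue and yields Requests whose
        # callback depends on the task level.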

        zk.start()
        zode_path = zk.create("/pid/taobao/node-",
                              ephemeral=True,
                              sequence=True)
        myid = zode_path[-10:]
        mytask_dir = task_dir + "node-" + myid
        try:
            zk.create('/task/taobao')
            Master = True
        except:
            Master = False

        if Master == True:
            zk.create(mytask_dir)
            sleep(3)
            themes = response.xpath(
                '//ul[@class="service-bd"]/li/span/a/@href').extract()
            nodes = len(zk.get_children("/pid/taobao"))
            real_nodes = zk.get_children("/task/taobao")
            print "realnodes" + str(real_nodes)
            while nodes != len(real_nodes):
                real_nodes = zk.get_children("/task/taobao")
                nodes = len(zk.get_children("/pid/taobao"))
                sleep(0.01)

            peer_tasks = len(themes) / nodes
            print "master is " + str(os.getpid())
            i = 0
            while i < nodes:
                j = 0
                while j < peer_tasks:
                    msg = '[{ "url":"' + str(
                        themes[i * peer_tasks +
                               j]) + '", "level":"2", "content":"0"}]'
                    zk.create(task_dir + real_nodes[i] + "/task-",
                              value=msg,
                              sequence=True)
                    j += 1
                i += 1
        else:
            zk.create(mytask_dir)

        print "sleep"

        while True:
            global work_co

            try:
                tasks = zk.get_children(mytask_dir)
            except Exception, e:
                print "get_children %s" % e

            while len(tasks) == 0:
                sleep(1)
                tasks = zk.get_children(mytask_dir)

            obj_tasks = mytask_dir + '/' + tasks[random.randint(
                0,
                len(tasks) - 1)]

            working_set.add(obj_tasks)
            mytask_data, mytask_stat = zk.get(obj_tasks)

            task = json.loads(mytask_data)

            if task[0]['level'] == '2':
                temp = task[0]['url'].split(':')
                work_co += 1
                yield Request(url='http:' + temp[len(temp) - 1],
                              meta={
                                  "task": obj_tasks,
                                  "task_dir": mytask_dir
                              },
                              callback=self.classification)

            if task[0]['level'] == '3':
                temp = task[0]['url']
                work_co += 2
                yield Request(url=temp,
                              meta={
                                  "task": obj_tasks,
                                  "task_dir": mytask_dir
                              },
                              callback=self.pageturning)

            if task[0]['level'] == '4':
                temp = task[0]['url']
                work_co += 4
                yield Request(url=temp,
                              meta={
                                  "task": obj_tasks,
                                  "task_dir": mytask_dir
                              },
                              callback=self.goods)
예제 #43
0
 def parse(self, response):
     models = response.xpath('.//a[@class="view-all"]/@href').extract()
     print(len(models))
     for model in models:
         yield Request(model, callback=self.parse_model, dont_filter=True)
예제 #44
0
 def start_requests(self):
     yield Request(urls.start, method='POST')
예제 #45
0
 def _extract_requests(self, response):
     r = []
     if isinstance(response, HtmlResponse):
         links = self.link_extractor.extract_links(response)
         r.extend(Request(x.url, callback=self.parse) for x in links)
     return r
예제 #46
0
 def parse_cars(self, response):
     cars = response.xpath('//a[@class="readmore"]/@href').extract()
     for car in cars:
         yield Request(car, callback=self.parse_data, dont_filter=True)
예제 #47
0
 def check_login(self, response):
     print(response.text)
     yield Request(url="http://dig.chouti.com/", callback=self.good)
예제 #48
0
 def start_requests(self):
     return [Request(self.url, callback=self.parse, dont_filter=True)]
예제 #49
0
    def parse_detail_item(self, response, **kwargs):
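        # Housing detail page: fill an ErShouFangSourceItem (resale) or
        # RentingHouseSourceItem (rental) from the page; resale items additionally
        # trigger a request for the village page before being yielded.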
        info_type = kwargs.get('type')
        if info_type == 'ershoufang':
            item = ErShouFangSourceItem()
            item['city'] = kwargs.get('city')
            item['area'] = kwargs.get('area')
            item['business_circle'] = kwargs.get('bankuai_name')
            item['village_name'] = response.css(
                '.communityName a.info::text').get(default='')

            lis = response.css('.base .content ul li')
            for li in lis:
                span_text = li.css('span::text').get(default='')
                if span_text == '房屋户型':
                    item['residence_room'] = li.css('::text').getall()[1]
                elif span_text == '所在楼层':
                    item['floor'] = li.css('::text').getall()[1]
                elif span_text == '建筑面积':
                    item['area1'] = li.css('::text').getall()[1].replace(
                        '㎡', '')
                elif span_text == '套内面积':
                    item['area2'] = li.css('::text').getall()[1].replace(
                        '㎡', '')
                elif span_text == '房屋朝向':
                    item['orientation'] = li.css('::text').getall()[1]
                elif span_text == '挂牌时间':
                    item['listing_time'] = li.css(
                        '::text').getall()[1].replace('\n', '')
            check_field_list = [
                'residence_room', 'area1', 'area2', 'floor', 'orientation',
                'listing_time'
            ]
            for check_field in check_field_list:
                if check_field not in item.keys():
                    item[check_field] = ''
            all_price = float(
                response.css('.price .total::text').get(default=0))
            single_price = float(
                response.css('.price .unitPriceValue::text').get(default=0))
            item['all_price'] = all_price
            item['single_price'] = single_price
            item['build_time'] = response.css('.area .subInfo::text').get(
                default='')
            item['link'] = response.url
            yield Request(url=kwargs.get('village_url'),
                          callback=self.parse_village_info,
                          cb_kwargs={
                              'item': item,
                              'village_url': kwargs.get('village_url')
                          },
                          dont_filter=True)
        elif info_type == 'zufang':
            item = RentingHouseSourceItem()
            item['city'] = kwargs.get('city')
            item['area'] = kwargs.get('area')
            item['business_circle'] = kwargs.get('bankuai_name')
            item['village_name'], item['residence_room'], item[
                'orientation'] = response.css('.content__title::text').get(
                    default='').split(' ')
            # item['orientation'] = response.css('div.content__article__info:nth-child(2) > ul:nth-child(2) > li:nth-child(3)::text').get(default='').replace('朝向:', '')
            item['floor'] = response.css(
                'div.content__article__info:nth-child(2) > ul:nth-child(2) > li:nth-child(8)::text'
            ).get(default='').replace('楼层:', '')
            item['house_area'] = response.css(
                'div.content__article__info:nth-child(2) > ul:nth-child(2) > li:nth-child(2)::text'
            ).get(default='').replace('面积:', '').replace('㎡', '')

            item['price'] = response.css(
                'div.content__aside--title span::text').get(default='')
            item['renting_time'] = response.css(
                'div.content__article__info:nth-child(2) > ul:nth-child(3) > li:nth-child(2)::text'
            ).get(default='').replace('租期:', '')
            item['link'] = response.url
            yield item
예제 #50
0
File: files.py  Project: wwjiang007/scrapy
 def get_media_requests(self, item, info):
     urls = ItemAdapter(item).get(self.files_urls_field, [])
     return [Request(u) for u in urls]
예제 #51
0
def _prepare_request_object(item_url):
    return Request(
        item_url,
        meta={'response': Response(item_url, status=200, body=b'data')})
예제 #52
0
    def parse(self, response):

        '''
        zk.start()
        zode_path =  zk.create("/pid/huanqiunews/node-" , ephemeral = True, sequence = True)
        myid = zode_path[-10 : ]
        mytask_dir = task_dir + "node-" + myid
        print "hello"

        if zk.exists("/task/huanqiunews") == None:
            zk.create('/task/huanqiunews')
            zk.create(mytask_dir)
            sleep(3)
            nodes = len(zk.get_children("/pid/huanqiunews"))

            themes = response.xpath('//a[@class="cate_menu_lk"]/@href').extract()
            real_nodes = zk.get_children("/task/huanqiunews")
            while nodes != len(real_nodes):
                real_nodes = zk.get_children("/task/huanqiunews")
                sleep(0.01)

      
            peer_tasks = len(themes) / nodes  # TODO: how to balance tasks when they do not divide evenly across nodes?

            i = 0
            while i < nodes:
                j = 0
                while j < peer_tasks:
                    try:
                        url = "http:" + theme[i*peer_tasks + j]
                        msg = '[{"motian":"0", "url":"' + url+ '", "level":"2", "content":"0"}]'
                        zk.create("/task/huanqiunews/" + real_nodes[i] + "/task-", value = msg, sequence = True)
                    except Exception,e:
                        print "%s" % e
                    j += 1
                i += 1
        else:
            zk.create(mytask_dir)

        work_co = 0
        while True:
            if work_co > 10:
                sleep(10)
            try:
                tasks = zk.get_children(mytask_dir)
            except Exception,e:
                print "get_children %s" % e 
            while len(tasks) == 0:
                sleep(1)
                tasks = zk.get_children(mytask_dir)
            obj_tasks = mytask_dir + '/' + tasks[random.randint(0, len(tasks) - 1)]
  
            mytask_data, mytask_stat = zk.get(obj_tasks)
            
      
            task = json.loads(mytask_data)

            if task[0]['level'] == '2':
                url = task[0]['url']
                print "url-->" + url
                yield Request(url=url,callback=self.classification)
                work_co += 1
            '''

        themes = response.xpath('//a[@class="cate_menu_lk"]/@href').extract()
        #for theme in themes:
            #url = "http:" + theme
            #yield Request(url=url, callback=self.classification)
        yield Request(url='http:'+themes[0], callback=self.classification)
예제 #53
0
 def parse(self, response):
     # build the page url for each section
     url = self.start_urls[0]
     yield Request(url, self.parse_news, meta={'url': self.url})
예제 #54
0
 def parse_village_trend(self, response: Response, **kwargs):
     url = re.search(r"analysis.init\('(.*?)'\)", response.text).group(1)
     yield Request(url='https://bj.ke.com' + url,
                   callback=self.yield_city_item,
                   cb_kwargs=kwargs)
예제 #55
0
 def parse_application(self, response):
     app_url = response.xpath(
         '//*[@class="glyphicon glyphicon-inbox btn-lg"]/following-sibling::a/@href').extract_first()
     yield Request(response.urljoin(app_url), callback=self.parse_form)
예제 #56
0
 def parse_news(self, response):
     # collect every news url on the page
     news_urls = response.xpath('//*[@id="leftList"]/div[2]/dl/dd/ul/li/a/@href').extract()
     for i in range(len(news_urls)):
         news_url = response.meta['url'] + news_urls[i]
         yield Request(news_url, self.parse_content, meta={"url": news_url})
예제 #57
0
    def parse(self, response):
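        # Search-result page: build a PostItem per hit, keep only items newer than the
        # last crawl (via sqldb), follow each article to parse_content, then request
        # the next result page.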
        index = response.meta['index']
        url_ = response.meta['url_']
        self.Maxpage_List[index] += 1  # increment the number of pages crawled

        soup = BeautifulSoup(response.body_as_unicode(), "lxml")  # parse the document the way a browser would
        founds = soup.find('div', class_='searchResultArea').find_all('li')
        item_list = []
        print '------------url---------------'
        for found in founds:
            item = items.PostItem()
            title = found.find('h3').get_text().strip()
            url = found.find('h3').find('a').get('href')
            # print title
            print url
            print index
            m = md5.new()
            m.update(url)
            md_str = m.hexdigest()  # MD5-hash the url to obtain the record ID

            post_time = found.find_all('p')[1].find_all(
                'span')[3].get_text().strip()
            post_time = re.findall(self.tt_pa, post_time)[0]  # extract the date with a regex
            post_time = post_time[6:] + '-' + post_time[3:5] + '-' + post_time[
                0:2] + ' ' + '00:00:00'
            print post_time

            item['url'] = url
            item['id'] = md_str
            item['post_time'] = post_time
            item['data_type'] = settings.DATA_TYPE  # government-type sites are all 1
            item['site_id'] = settings.SITE_ID[self.name]
            item['topic_id'] = index
            item['scratch_time'] = time.strftime(
                '%Y-%m-%d %H:%M:%S',
                time.localtime())  # time.strftime() formats the current time as a string
            item['title'] = title
            item['poster_name'] = ''
            item['poster_id'] = ''
            item['poster_url'] = ''

            item_list.append(item)

        print '-----------------------------------'
        res_items = self.sqldb.get_newest_time(
            item_list)  # keep only links that are newer than the previous crawl

        # call parse_content to parse each article's content
        for item in res_items:
            if '.htm' in item['url']:
                self.sum += 1
                print self.sum
                print '-----------------add new urls to Request-------------------'
                yield Request(item['url'],
                              callback=self.parse_content,
                              meta={'item': item})
            # else:
            # url = item['url']
            # if '.xls' in url or '.xlsx' in url or '.doc' in url or '.docx' in url or '.pdf' in url or '.txt' in url:
            # name = url.split('/')
            # filename = name[len(name)-1]
            # dir = 'D:\\Workspace\\Python\\Scrapy\\file\\' + self.name
            # if os.path.exists(dir):
            # print 'dir is existing...'
            # else:
            # os.makedirs(dir)
            # dir = dir + '\\' + item['id'] + '_' + str(item['topic_id'])
            # if os.path.exists(dir):
            # print 'filepath is existing...'
            # else:
            # os.makedirs(dir)
            # filepath = os.path.join(dir, filename)
            # if os.path.exists(filepath):
            # print 'already down...'
            # else:
            # print "-----------------downloading with requests-------------------"
            # r = requests.get(url)
            # with open(filepath, "wb") as code:
            # code.write(r.content)
            # item['content'] = filepath
            # yield Request(item['url'], callback = self.parse_fileurl, meta = {'item':item})

        next_pages = soup.find('div', class_='advancedIndex').find_all('span')
        try:
            next_page = next_pages[len(next_pages) - 1].a['href']
            next_page = 'http://lp.search.gov.hk/search.html' + next_page
            yield scrapy.Request(next_page,
                                 meta={
                                     'index': index,
                                     'url_': next_page
                                 })
        except:
            print 'last page-----'
예제 #58
0
 def parse(self, response):
     for i in range(2, 57):
         url = 'https://veromoda.tmall.com/category.htm?&pageNo={0}'.format(
             i)
         # iteration / generation
         yield Request(url=url, callback=self.parse_detail)  # note: pass the callback itself, do not call it
예제 #59
0
 def start_requests(self):
     price_urls = mongoservice.get_dealerprice_url()
     for url in price_urls:
         yield Request(self.api_url % url, callback=self.get_url)
예제 #60
0
 def test_proxy_already_seted(self):
     os.environ['http_proxy'] = http_proxy = 'https://proxy.for.http:3128'
     mw = HttpProxyMiddleware()
     req = Request('http://noproxy.com', meta={'proxy': None})
     assert mw.process_request(req, spider) is None
     assert 'proxy' in req.meta and req.meta['proxy'] is None