def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    for row in rows:
        loader = ItemLoader(GoalSTItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath('td[2]/a/@href').extract()
        sNum = num[0][-7:]
        loader.add_value('nhl_num', sNum)
        # add season data
        loader.add_value('season', str(self.year))
        # collect additional stats
        loader.add_xpath('es_shots_against', './/td[6]/text()')
        loader.add_xpath('es_goals_against', './/td[7]/text()')
        loader.add_xpath('es_saves', './/td[8]/text()')
        loader.add_xpath('es_save_pct', './/td[9]/text()')
        loader.add_xpath('pp_shots_against', './/td[10]/text()')
        loader.add_xpath('pp_goals_against', './/td[11]/text()')
        loader.add_xpath('pp_saves', './/td[12]/text()')
        loader.add_xpath('pp_save_pct', './/td[13]/text()')
        loader.add_xpath('sh_shots_against', './/td[14]/text()')
        loader.add_xpath('sh_goals_against', './/td[15]/text()')
        loader.add_xpath('sh_saves', './/td[16]/text()')
        loader.add_xpath('sh_save_pct', './/td[17]/text()')
        # feed item to pipeline
        yield loader.load_item()
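# A note on the processor defaults used throughout these spiders: MapCompose()
# with no functions passes each extracted value through unchanged, and Join()
# concatenates the resulting list with a single space, so a one-element list
# ends up as a plain string on the item. A minimal illustration (Scrapy 1.x
# import path):
from scrapy.loader.processors import MapCompose, Join
MapCompose()([u'12', u'34'])  # -> [u'12', u'34']
Join()([u'12', u'34'])        # -> u'12 34'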
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatSOItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # add season data
        loader.add_value("season", str(self.year))
        # collect stats
        loader.add_xpath("so_shots", ".//td[13]/text()")
        loader.add_xpath("so_goals", ".//td[14]/text()")
        loader.add_xpath("so_pct", ".//td[15]/text()")
        loader.add_xpath("game_deciding_goals", ".//td[16]/text()")
        # feed item to pipeline
        yield loader.load_item()
def parse_course_item(self, response):
    url_obj = urlparse(response.url)
    l = ItemLoader(item=CourseItem(), response=response)
    l.default_input_processor = MapCompose(unicode.strip)
    l.default_output_processor = TakeFirst()
    l.add_xpath('code', "/html/head/meta[@name='DC.Subject.ProgramCode']/@content")
    l.add_xpath('name', "/html/head/meta[@name='DC.Subject.Description.Short']/@content")
    l.add_xpath('career', "/html/head/meta[@name='DC.Subject.Level']/@content")
    l.year_in = Identity()
    l.add_value('year', ppath.basename(ppath.dirname(url_obj.path)))
    l.add_value('src_url', unicode(response.url))
    l.add_xpath('uoc', "/html/head/meta[@name='DC.Subject.UOC']/@content")
    l.gened_in = MapCompose(unicode.strip, lambda s: s == 'Y')
    l.add_xpath('gened', "/html/head/meta[@name='DC.Subject.GenED']/@content")
    l.add_xpath('faculty', "/html/head/meta[@name='DC.Subject.Faculty']/@content")
    l.add_xpath('school', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'School')]]]/a/text()"))
    l.add_xpath('campus', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'Campus')]]]/text()"))
    l.add_xpath('prereqs_str', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[text()[contains(.,'Prerequisite:')]]/text()"),
        re=r'Prerequisite:\s(.+)')
    l.add_xpath('eftsl', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'EFTSL')]]]/text()"))
    l.add_xpath('description_markup', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/h2[text()='Description']/following-sibling::div"))
    course_item = l.load_item()
    yield course_item
    yield Request(
        url=response.xpath(
            "//div[@class='column content-col']/div[@class='internalContentWrapper']"
            "/div[@class='summary']//a[text()[contains(.,'Timetable')]]/@href"
        ).extract()[0],
        callback=self.parse_class_item,
        meta=dict(course_identifier={k: course_item.get(k, None)
                                     for k in ('code', 'career', 'year')}))
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatTOIItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # add season data
        loader.add_value("season", str(self.year))
        # collect TOI stats after converting from m,mmm:ss to seconds;
        # the values sit in the even-numbered cells td[6] through td[12]
        CATEG = ["es_toi", "sh_toi", "pp_toi", "toi"]
        for i in xrange(6, 13, 2):
            temp = row.xpath("td[%d]/text()" % i).extract()[0]
            sTemp = temp.split(":")
            sTemp[0] = sTemp[0].replace(",", "")  # drop the thousands separator
            loader.add_value(CATEG[(i - 6) // 2],
                             str(60 * int(sTemp[0]) + int(sTemp[1])))
        # feed item to pipeline
        yield loader.load_item()
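# A minimal sketch of the time-on-ice conversion above, factored into a
# standalone helper; "toi_to_seconds" is a hypothetical name, not part of the
# original spider, and it assumes input shaped like "1,234:56".
def toi_to_seconds(toi):
    minutes, seconds = toi.split(":")
    minutes = minutes.replace(",", "")  # "1,234" -> "1234"
    return 60 * int(minutes) + int(seconds)

assert toi_to_seconds("1,234:56") == 74096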
def parse_content(self, response):
    '''Parse content pages.'''
    loader = ItemLoader(item=Rede(), response=response)
    # Usually, we are only interested in the first item, e.g. for title, place, etc.
    loader.default_output_processor = TakeFirst()
    # Add fields
    loader.add_value('link', response.url)
    loader.add_css('title', '.text h1', extract_text)
    # Test if the text has an abstract
    abstract = response.css('.abstract')
    if abstract:
        loader.add_css('abstract', '.abstract', extract_text)
        loader.add_css('text', '.abstract ~ p:not(.picture)', extract_text, Join('\n'))
    else:
        loader.add_css('text', '.text p:not(.picture)', extract_text, Join('\n'))
    # Metadata are in dt/dd pairs.
    keys = response.css('dl dt::text').extract()
    values = response.css('dl dd::text').extract()
    for key, value in zip(keys, values):
        if key == 'Datum:':
            match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{2,4})', value)
            if match:
                # '22.03.2011' format
                value = match.group(1)
                dt = datetime.strptime(value.encode(ENC), '%d.%m.%Y')
            else:
                # '22. März 2011' format
                dt = datetime.strptime(value.encode(ENC), '%d. %B %Y')
            loader.add_value('date', dt.date())
        elif key == 'Ort:':
            loader.add_value('place', value)
    return loader.load_item()
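# The '%d. %B %Y' branch above only matches German month names ("22. März 2011")
# if the process locale has been switched to German first -- a sketch of that
# assumption, using the standard locale module (the exact locale name is
# platform-dependent, and ENC is assumed to be the matching codec):
import locale
locale.setlocale(locale.LC_TIME, 'de_DE.UTF-8')  # assumed to be installed on the host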
def parse(self, response):
    sel = Selector(response)
    # collect xpaths of each team (row in table)
    rows = sel.xpath('/html//div[@class="contentBlock"]/table/tbody/tr')
    # loop through teams
    for row in rows:
        loader = ItemLoader(StandingsItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get team identifier
        team = row.xpath('td[2]/a[1]/@rel').extract()
        loader.add_value('team', team)
        # collect several other data points
        loader.add_xpath('division', './/td[3]/text()')
        loader.add_xpath('games_played', './/td[4]/text()')
        loader.add_xpath('wins', './/td[5]/text()')
        loader.add_xpath('losses', './/td[6]/text()')
        loader.add_xpath('ot_losses', './/td[7]/text()')
        loader.add_xpath('points', './/td[8]/text()')
        loader.add_xpath('row', './/td[9]/text()')
        # feed item to pipeline
        yield loader.load_item()
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # prepare to adjust for shootout stats if necessary
    shootout = 0
    if self.year > 2005:
        shootout = 1
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatEngItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # add season data
        loader.add_value("season", str(self.year))
        # collect stats
        if shootout:
            loader.add_xpath("en_goals", ".//td[20]/text()")
            loader.add_xpath("ps_goals", ".//td[21]/text()")
        else:
            loader.add_xpath("en_goals", ".//td[21]/text()")
            loader.add_xpath("ps_goals", ".//td[22]/text()")
        # feed item to pipeline
        yield loader.load_item()
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatRTSItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # add season data
        loader.add_value("season", str(self.year))
        # collect stats
        loader.add_xpath("hits", ".//td[6]/text()")
        loader.add_xpath("blocked_shots", ".//td[7]/text()")
        loader.add_xpath("missed_shots", ".//td[8]/text()")
        loader.add_xpath("giveaways", ".//td[9]/text()")
        loader.add_xpath("takeaways", ".//td[10]/text()")
        loader.add_xpath("faceoff_wins", ".//td[11]/text()")
        loader.add_xpath("faceoff_losses", ".//td[12]/text()")
        # feed item to pipeline
        yield loader.load_item()
def parse_template(self, response):
    """
    Callback used by Scrapy to process downloaded responses

    //*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[4]/table/tbody/tr[10]/td[2]
    """
    response_body = response.body_as_unicode()
    # Check if coffee beans are present in the source, since that shifts the divs down
    coffee = 'cups of coffee' in response_body
    prop_xpath = ('//div[@class="info_wrapper"]//tr[td[@class="key"]/strong/text() = "{}:"]'
                  '/td[@class="value"]/text()')
    substr_xpath = 'substring-after(normalize-space({}), "{}")'
    item_fields = {
        'item_hash': '//*[@id="offer_sku"]/text()',
        'title': '//*[@id="thing_name"]/text()',
        'thumbnail': '//*[@id="thing_image"]/@src',
        'description': '//*[@id="description"]',
        'creator': '//*[@id="product_manufacturer"]/text()',
        'when': prop_xpath.format('Released'),
        'bootstrap_version': substr_xpath.format(prop_xpath.format('Bootstrap'),
                                                 'Compatible with '),
        'cost_single': substr_xpath.format(
            '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[1]//span/text()'
            .format(3 if coffee else 2), '$'),
        'cost_multiple': substr_xpath.format(
            '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[2]//span/text()'
            .format(3 if coffee else 2), '$'),
        'cost_extended': substr_xpath.format(
            '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[3]//span/text()'
            .format(3 if coffee else 2), '$'),
        'purchases': '//div[@class="purchases"]/span[@class="count"]/text()',
    }
    selector = Selector(response)
    loader = ItemLoader(WrapBootstrapTemplate(), selector=selector)
    # define processors
    loader.default_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    # iterate over fields and add xpaths to the loader
    for field, xpath in item_fields.iteritems():
        loader.add_xpath(field, xpath)
    yield loader.load_item()
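# How the substr_xpath template above composes: it wraps a node-returning XPath
# in XPath 1.0 string functions, so the expression itself does the trimming.
# A sketch with a hypothetical price cell:
#   substr_xpath.format('//span[@class="price"]/text()', '$')
# expands to:
#   substring-after(normalize-space(//span[@class="price"]/text()), "$")
# which turns '  $19  ' into '19' before the value ever reaches the item loader.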
def parse_item(self, response):
    l = ItemLoader(item=ItemloadItem(), response=response)
    l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
    l.add_xpath('name', '//*[@id="sched-page-me-name"]/text()')
    l.add_xpath('image_url', '//*[@id="myavatar"]/@src')
    l.add_xpath('friends', '//*[@id="sched-page-me-connections"]/ul/li/a/@title')
    l.add_xpath('title_company_location', '//*[@id="sched-page-me-profile-data"]/text()')
    l.add_xpath('links', '//*[@class="sched-network-link"]/a/@href')
    l.add_xpath('about', '//*[@id="sched-page-me-profile-about"]/text()')
    return l.load_item()
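# Caveat on the loader above: MapCompose used as an *output* processor returns a
# list (one cleaned string per extracted value), so multi-valued fields such as
# 'friends' and 'links' stay lists on the item rather than being joined into a
# single string.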
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # instantiate parsing variables
    MONTHS = {
        "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
        "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
        "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
    }
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatBioItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # parse the name
        name = row.xpath("td[2]/a/text()").extract()
        sName = name[0].split(" ", 1)
        loader.add_value("first_name", sName[0])
        loader.add_value("last_name", sName[1])
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # collect birth year (the two-digit year is assumed to be 19xx)
        bDate = row.xpath("td[5]/text()").extract()[0]
        bYear = "19" + bDate[-2:]
        bMonth = MONTHS[bDate[:3]]
        bDay = bDate[4:6]
        loader.add_value("birthday", "%s-%s-%s" % (bYear, bMonth, bDay))
        # collect other data points
        loader.add_xpath("position", ".//td[4]/text()")
        loader.add_xpath("draft_year", ".//td[12]/text()")
        loader.add_xpath("draft_position", ".//td[14]/text()")
        # feed item to pipeline
        yield loader.load_item()
def parse(self, response):
    direction = response.xpath('//li[@class="btn-schedules-active"][1]/text()').extract()
    day = response.xpath('//li[@class="btn-schedules-active"][2]/text()').extract()
    for sel in response.xpath('//tr'):
        loader = ItemLoader(item=RtdItem(), selector=sel)
        loader.default_output_processor = TakeFirst()
        loader.add_value('day', day)
        loader.add_value('direction', direction)
        loader.add_xpath('route', 'th/a/text()')
        loader.add_xpath('depart_time', 'td[1]/text()')
        loader.add_xpath('arrive_time', 'td[2]/text()')
        yield loader.load_item()
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # instantiate parsing variables
    MONTHS = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
              'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
              'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
    # loop through players
    for row in rows:
        loader = ItemLoader(GoalBioItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath('td[2]/a/@href').extract()
        sNum = num[0][-7:]
        loader.add_value('nhl_num', sNum)
        # parse the name
        name = row.xpath('td[2]/a/text()').extract()
        sName = name[0].split(' ', 1)
        loader.add_value('first_name', sName[0])
        loader.add_value('last_name', sName[1])
        # collect birth year (the two-digit year is assumed to be 19xx)
        bDate = row.xpath('td[4]/text()').extract()[0]
        bYear = "19" + bDate[-2:]
        bMonth = MONTHS[bDate[:3]]
        bDay = bDate[4:6]
        loader.add_value('birthday', "%s-%s-%s" % (bYear, bMonth, bDay))
        # add other data points
        loader.add_value('position', 'G')
        loader.add_xpath('draft_year', './/td[12]/text()')
        loader.add_xpath('draft_position', './/td[14]/text()')
        # feed item to pipeline
        yield loader.load_item()
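# A sketch of the birthday layout the slicing above assumes -- something like
# "Jun 15 '84": characters 0-2 are the month abbreviation, 4-5 the day, and the
# last two the (19xx) year. The sample string is illustrative, not taken from
# the site:
bDate = "Jun 15 '84"
assert bDate[:3] == "Jun" and bDate[4:6] == "15" and bDate[-2:] == "84"
# -> "1984-06-15" after the MONTHS lookup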
def parse(self, response):
    items = []
    for everyday in response.xpath('//ul/li/strong/a'):
        loader = ItemLoader(ProductItem(), everyday)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        loader.add_xpath('name', 'text()')
        loader.add_xpath('price', '@href')
        loader.add_xpath('stock', '@mon')
        loader.add_value('last_updated', 'today')  # you can also use literal values
        item = self.to_utf8(loader.load_item(),
                            *['name', 'price', 'stock', 'last_updated'])
        self.log(item['name'], log.INFO)
        items.append(item)
    return items
def parse(self, response):
    selector = Selector(response)
    # iterate over titles
    for page in selector.xpath(self.view):
        loader = ItemLoader(AmazonItem(), page)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse_question(self, response):
    question = ItemLoader(item=ArabiaQuestionItem(), response=response)
    question.default_output_processor = TakeFirst()
    question.add_xpath('id', '//*[@id="question_id"]/@value', MapCompose(int))
    question.add_xpath('asker_username', '//*[@class="question_meta"]/a/text()')
    question.add_xpath('answerer_username', '//*[@class="inblock username"]/text()')
    question.add_xpath('title', '//*[@class="question_title"]/h2/text()')
    question.add_xpath('date', '//*[@class="question_date"]/text()')
    question.add_xpath('content', '//*[@id="question_answer"]/*', Join('\n'))
    question.add_value('url', response.url)
    question.add_value('item', 'question')
    yield question.load_item()
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://odds.500.com/index_jczq_2014-08-29.shtml
    """
    selector = Selector(response)
    # iterate over matches
    for match in selector.xpath(self.match_list_xpath):
        loader = ItemLoader(Match(), selector=match)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.match_fields.iteritems():
            loader.add_xpath(field, xpath)
        match_item = loader.load_item()
        match_item["game_date"] = self.game_date
        match_item["season_id"] = match_item["season_id"].split('-')[-1]
        match_item["teama_id"] = match_item["teama_id"].split('-')[-1]
        match_item["teamb_id"] = match_item["teamb_id"].split('-')[-1]
        if "score" in match_item:
            sa, sb = match_item["score"].split(':')
            match_item["score_a"] = sa
            match_item["score_b"] = sb
            # compare numerically; comparing the raw strings would rank '10' below '9'
            ia, ib = int(sa), int(sb)
            match_item["result"] = "win" if ia > ib else "draw" if ia == ib else "lost"
        else:
            match_item["score_a"] = match_item["score_b"] = -1
            match_item["result"] = "none"
        yield match_item
        # scrape asia odds
        # id=454359&ctype=1&start=60&r=1&style=0&guojia=0
        for i in xrange(3):
            url = self.asia_odds_url % (match_item["match_id"], i * 30)
            request = scrapy.Request(url, callback=self.parse_asia_odds)
            request.meta['match_item'] = match_item
            yield request
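# The pagination loop above assumes self.asia_odds_url is a %-template with two
# placeholders (a match id and a start offset), e.g. a hypothetical value like
#   asia_odds_url = 'http://odds.500.com/fenxi/yazhi.php?id=%s&start=%d'
# so three requests are issued per match, for offsets 0, 30 and 60.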
def parse_class_item(self, response):
    course_identifier = response.meta.get('course_identifier')
    for sem in response.xpath(
            "//table[tr[td[@class='classSearchSectionHeading']"
            "[text()[contains(.,'Detail')]]]]/following-sibling::table[1]"):
        for class_detail in sem.xpath("tr/td[@class='formBody']/table"):
            l = ItemLoader(item=ClassItem(), selector=class_detail)
            l.default_input_processor = MapCompose(unicode.strip)
            l.default_output_processor = TakeFirst()
            l.add_xpath('class_nbr', "tr/td[@class='label'][text()='Class Nbr']"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('activity', "tr/td[@class='label'][text()='Activity']"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('section', "tr/td[@class='label'][text()='Section']"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('teaching', "tr/td[@class='label'][a[text()='Teaching Period']]"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('status', "tr/td[@class='label'][text()='Status']"
                        "/following-sibling::td[@class='data'][1]/font/text()")
            l.add_xpath('enrolments', "tr/td[@class='label'][text()='Enrols/Capacity']"
                        "/following-sibling::td[@class='data'][1]/text()",
                        re=r'(\d+)/\d+')
            l.add_xpath('capacity', "tr/td[@class='label'][text()='Enrols/Capacity']"
                        "/following-sibling::td[@class='data'][1]/text()",
                        re=r'\d+/(\d+)')
            l.offering_start_in = l.offering_end_in = l.updated_in = l.census_date_in = \
                MapCompose(date_parser.parse)
            l.add_xpath('offering_start', "tr/td[@class='label'][text()='Offering Period']"
                        "/following-sibling::td[@class='data'][1]/text()",
                        re=r'([\d/]*)\s-\s[\d/]*')
            l.add_xpath('offering_end', "tr/td[@class='label'][text()='Offering Period']"
                        "/following-sibling::td[@class='data'][1]/text()",
                        re=r'[\d/]*\s-\s([\d/]*)')
            l.add_xpath('census_date', "tr/td[@class='label'][a[text()='Census Date']]"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('consent', "tr/td[@class='label'][text()='Consent']"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('mode', "tr/td[@class='label'][text()='Instruction Mode']"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_value('src_url', unicode(response.url))
            l.add_xpath('updated', "//td[@class='note']"
                        "[text()[contains(., 'Data is correct as at')]]/text()",
                        re=r'Data is correct as at ([\w\s\-:,]*)')
            l.course_identifier_in = Identity()
            l.add_value('course_identifier', course_identifier)
            class_item = l.load_item()
            yield class_item
            for meeting in class_detail.xpath(
                    "tr/td[@class='formBody']/table"
                    "/tr[@class='rowHighlight' or @class='rowLowlight']"):
                m = MeetingItem()
                m['class_identifier'] = {k: class_item.get(k, None) for k in ('class_nbr',)}
                d = dict(zip(('day', 'time', 'location', 'weeks', 'instructor'),
                             meeting.xpath("td[@class='data']/text()").extract()))
                time = d.pop('time')
                d['time_start'], d['time_end'] = time.split(' - ')
                m.update(d)
                yield m
def parse_items(self, response):
    """
    This function parses a sample job page.

    @url https://www.linkedin.com/jobs2/view/66769906?trk=jserp_job_details_text
    @returns items 1
    @scrapes company_logo company_name job_title job_date
    @scrapes job_location job_experience job_function employment_type
    @scrapes industry job_description apply_link company_description
    @scrapes company_youtube_video
    """
    l = ItemLoader(item=LinkedinCrawlerItem(), response=response)
    l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
    #l.add_value('page_url', response.url)
    l.add_xpath('company_logo', '//*[@class="logo-container"]/a/img/@src')
    l.add_xpath('company_name', ".//*[@id='top-card']/div[1]/div[2]/h2/a/span/text()")
    l.add_xpath('job_title', '//h1/text()')
    l.add_xpath('job_date', ".//*[@id='top-card']/div[1]/div[2]/div[1]/text()")
    l.add_xpath('job_location',
                ".//*[@id='top-card']/div[1]/div[2]/h2/span/span[1]/text()")
    l.add_xpath('job_experience',
                ".//*[@id='top-card']/div[3]/div[1]/ul[1]/li[1]/div[2]/text()")
    l.add_xpath('job_function',
                ".//*[@id='top-card']/div[3]/div[1]/ul[1]/li[2]/div[2]/text()")
    l.add_xpath('employment_type',
                ".//*[@id='top-card']/div[3]/div[1]/ul[1]/li[3]/div[2]/text()")
    l.add_xpath('industry',
                ".//*[@id='top-card']/div[3]/div[1]/ul[2]/li[1]/div[2]/text()")
    l.add_xpath('job_description',
                '//*[@class="description-module container"]/div/div/div/text()'
                ' | //*[@class="description-module container"]//strong/text()'
                ' | //*[@class="description-module container"]//li/text()'
                ' | //*[@class="description-module container"]/div/div/div//ul/li/text()'
                ' | //*[@class="description-module container"]/div/div/div/strong/span/text()')
    apply_link_selector = response.xpath(".//*[@id='offsite-apply-button']/@href").extract()[0]
    parsed = urlparse(apply_link_selector)
    url_of_job = parsed.query[39:]
    url_of_job = urllib.unquote(url_of_job)
    l.add_value('apply_link', url_of_job)
    l.add_xpath('company_description',
                './/*[@id="company-module"]/div/div[1]/text()'
                ' | .//*[@id="company-module"]/div/div[1]//strong/text()')
    l.add_xpath('company_youtube_video',
                ".//*[@id='company-module']/div/div[2]/object/param[2]/@value")
    return l.load_item()
def parse_community(self, response):
    community = ItemLoader(item=ArabiaCommunityItem(), response=response)
    community.default_output_processor = TakeFirst()
    community.add_xpath('id', '//*[@id="nav_title"]/a/@href', re=r'/([a-zA-Z0-9-_]+)$')
    community.add_xpath(
        'logo', '//*[@class="category_logo"]/@src',
        MapCompose(lambda relative_url: urljoin(response.url, relative_url)))
    community.add_xpath('title', '//*[@id="nav_title"]/a/text()')
    community.add_xpath('description', '//*[@class="category_description"]/text()')
    community.add_xpath('followers', '//*[@id="category_follow"]/h3/text()',
                        MapCompose(int), re=r'(\d+)')
    community.add_value('url', response.url)
    community.add_value('item', 'community')
    yield community.load_item()
def parse_asia_odds(self, response):
    match_item = response.meta['match_item']
    selector = Selector(response)
    # iterate over odds
    for odds in selector.xpath(self.asia_odds__xpath):
        loader = ItemLoader(AsiaOdds(), selector=odds)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.asia_odds_fields.iteritems():
            loader.add_xpath(field, xpath)
        odds_item = loader.load_item()
        # http://odds.500.com/yazhi.php?cid=515
        odds_item["match_id"] = match_item["match_id"]
        odds_item["company_id"] = odds_item["company_id"].split('=')[-1]
        odds_item["water_a"] = odds_item["water_a"].replace(self.UP_CHAR, '').replace(self.DOWN_CHAR, '')
        odds_item["water_b"] = odds_item["water_b"].replace(self.UP_CHAR, '').replace(self.DOWN_CHAR, '')
        yield odds_item
def parse2(self, response):
    hxs = Selector(response)
    items = hxs.xpath(self.deals_list_xpath)
    for item in items:
        loader = ItemLoader(faculty_contact(), selector=item)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        for field, xpath in self.item_fields.iteritems():
            if field == 'email':
                # the email hides behind a per-person profile page, so fetch
                # that page and pull the mailto: link out of it
                link = item.xpath(self.item_fields['email'])
                r = httplib.HTTPConnection('dir.aucegypt.edu')
                try:
                    r.request('GET', '/' + link.extract()[0])
                    res = r.getresponse()
                    data = res.read()
                    email_selection = Selector(text=data)
                    email = email_selection.xpath('//@href')
                    loader.add_value('email', unicode(
                        urllib.unquote(email.extract()[0]).replace('mailto:', '')))
                except IndexError:
                    loader.add_value('email', u'')
            else:
                loader.add_xpath(field, xpath)
        yield loader.load_item()
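# Note: the httplib round-trip above blocks Scrapy's event loop for every row.
# A non-blocking sketch of the same lookup, chaining a second Request instead
# (parse_email and the meta key are hypothetical names, not from the original):
#
#   request = Request('http://dir.aucegypt.edu/' + link.extract()[0],
#                     callback=self.parse_email)
#   request.meta['loader'] = loader
#   yield request
#
#   def parse_email(self, response):
#       loader = response.meta['loader']
#       email = response.xpath('//@href').extract()[0]
#       loader.add_value('email', unicode(
#           urllib.unquote(email).replace('mailto:', '')))
#       yield loader.load_item()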
def parse_items(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    selector = Selector(response)
    # iterate over articles
    for article in selector.xpath(self.main_article_xpath):
        loader = ItemLoader(WwfArticle(), selector=article)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse(self, response):
    sel = Selector(response)
    # collect xpaths of each team (row in table)
    rows = sel.xpath('/html//div[@class="contentBlock"]/table/tbody/tr')
    # define collection of teams that play in games we care about
    # (some teams have moved, but we still want the old games)
    # We'll grab the first four letters of the team to distinguish,
    # then assign the NHL's standard 3-letter code.
    TEAMS = {'Anah': 'ANA', 'Ariz': 'ARI', 'Calg': 'CGY', 'Edmo': 'EDM',
             'Los ': 'LAK', 'San ': 'SJS', 'Vanc': 'VAN', 'Chic': 'CHI',
             'Colo': 'COL', 'Dall': 'DAL', 'Minn': 'MIN', 'Nash': 'NAS',
             'St. ': 'STL', 'Winn': 'WPG', 'Bost': 'BOS', 'Buff': 'BUF',
             'Detr': 'DET', 'Flor': 'FLA', 'Mont': 'MTL', 'Otta': 'OTT',
             'Tamp': 'TBL', 'Toro': 'TOR', 'Caro': 'CAR', 'Colu': 'CBJ',
             'New ': 'NJD', 'NY I': 'NYI', 'NY R': 'NYR', 'Phil': 'PHI',
             'Pitt': 'PIT', 'Wash': 'WAS', 'Phoe': 'ARI', 'Atla': 'WPG'}
    # loop through teams
    for row in rows:
        if row.xpath('td[1]/@colspan').extract()[0] == '1':
            loader = ItemLoader(PlayoffsItem(), selector=row)
            loader.default_input_processor = MapCompose()
            loader.default_output_processor = Join()
            # add season and date
            loader.add_value('season', str(self.year))
            date = datetime.strptime(
                row.xpath('td[1]/div[1]/text()').extract()[0][4:],
                '%b %d, %Y').date()
            loader.add_value('date', str(date))
            # get team identifiers
            away = ''
            if row.xpath('td[2]/a[1]/@rel').extract():
                away = row.xpath('td[2]/a[1]/@rel').extract()[0]
            elif row.xpath('td[2]/div/text()').extract()[0][:4] in TEAMS:
                away = TEAMS[row.xpath('td[2]/div/text()').extract()[0][:4]]
            if away:
                if row.xpath('td[3]/a[1]/@rel').extract():
                    home = row.xpath('td[3]/a[1]/@rel').extract()[0]
                else:
                    home = TEAMS[row.xpath('td[3]/div/text()').extract()[0][:4]]
                loader.add_value('away', away)
                loader.add_value('home', home)
                # collect and parse results
                away_score = row.xpath('td[5]/span[1]/text()').extract()[0].replace('\n', '').strip()
                match = re.search(r'\(.*?\)', away_score)
                loader.add_value('away_score', match.group(0)[1:-1])
                home_score = row.xpath('td[5]/span[2]/text()').extract()[0].replace('\n', '').strip()
                match = re.search(r'\(.*?\)', home_score)
                loader.add_value('home_score', match.group(0)[1:-1])
                match = re.search(r'\).*', home_score)
                result = match.group(0)[1:]
                if result == 'S/O':
                    output = 'SO'
                elif result == 'OT':
                    output = 'OT'
                else:
                    output = 'REG'
                loader.add_value('result', output)
                # feed item to pipeline
                yield loader.load_item()
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # prepare to adjust for shootout stats if necessary
    shootout = 0
    if self.year > 2005:
        shootout = 1
    # loop through players
    for row in rows:
        loader = ItemLoader(GoalSumItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath('td[2]/a/@href').extract()
        sNum = num[0][-7:]
        loader.add_value('nhl_num', sNum)
        # add season data
        loader.add_value('season', str(self.year))
        # players on one (extant) team all season have link to team page
        if row.xpath('td[3]/a/text()').extract():
            loader.add_xpath('team', './/td[3]/a/text()')
            loader.add_value('team2', None)
            loader.add_value('team3', None)
        else:
            temp = row.xpath('td[3]/text()').extract()[0]
            teams = temp.split(', ')
            loader.add_value('team', teams[0])
            if len(teams) > 2:
                loader.add_value('team2', teams[1])
                loader.add_value('team3', teams[2])
            elif len(teams) == 2:
                loader.add_value('team2', teams[1])
                loader.add_value('team3', None)
            else:
                loader.add_value('team2', None)
                loader.add_value('team3', None)
        # collect several other stats
        loader.add_xpath('games_played', './/td[4]/text()')
        loader.add_xpath('games_started', './/td[5]/text()')
        loader.add_xpath('wins', './/td[6]/text()')
        loader.add_xpath('losses', './/td[7]/text()')
        if shootout:
            loader.add_value('ties', '0')
        else:
            loader.add_xpath('ties', './/td[8]/text()')
        loader.add_xpath('overtime_losses', './/td[%d]/text()' % (9 - shootout,))
        loader.add_xpath('shots_against', './/td[%d]/text()' % (10 - shootout,))
        loader.add_xpath('goals_against', './/td[%d]/text()' % (11 - shootout,))
        loader.add_xpath('gaa', './/td[%d]/text()' % (12 - shootout,))
        loader.add_xpath('saves_', './/td[%d]/text()' % (13 - shootout,))
        loader.add_xpath('save_pct', './/td[%d]/text()' % (14 - shootout,))
        loader.add_xpath('shutouts', './/td[%d]/text()' % (15 - shootout,))
        loader.add_xpath('goals', './/td[%d]/text()' % (16 - shootout,))
        loader.add_xpath('assists', './/td[%d]/text()' % (17 - shootout,))
        loader.add_xpath('penalty_minutes', './/td[%d]/text()' % (18 - shootout,))
        # convert time on ice to seconds and add
        location = 'td[%d]/text()' % (19 - shootout,)
        temp = row.xpath(location).extract()[0]
        sTemp = temp.split(':')
        sTemp[0] = sTemp[0].replace(',', '')
        loader.add_value('toi', str(60 * int(sTemp[0]) + int(sTemp[1])))
        # feed item to pipeline
        yield loader.load_item()
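# Column bookkeeping for the goalie summary above: from the 2005-06 season the
# NHL dropped the ties column, so every stat from overtime_losses onward shifts
# one cell to the left; subtracting `shootout` (1 for post-2005 seasons) from
# each index keeps the same field-to-column mapping across both table layouts.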
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # prepare to adjust for shootout stats if necessary
    shootout = 0
    if self.year > 2005:
        shootout = 1
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatSumItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # add season data
        loader.add_value("season", str(self.year))
        # players on one (extant) team all season have link to team page
        if row.xpath("td[3]/a/text()").extract():
            loader.add_xpath("team", ".//td[3]/a/text()")
            loader.add_value("team2", None)
            loader.add_value("team3", None)
        else:
            temp = row.xpath("td[3]/text()").extract()[0]
            teams = temp.split(", ")
            loader.add_value("team", teams[0])
            if len(teams) > 2:
                loader.add_value("team2", teams[1])
                loader.add_value("team3", teams[2])
            elif len(teams) == 2:
                loader.add_value("team2", teams[1])
                loader.add_value("team3", None)
            else:
                loader.add_value("team2", None)
                loader.add_value("team3", None)
        # collect several other data points
        loader.add_xpath("games_played", ".//td[5]/text()")
        loader.add_xpath("goals", ".//td[6]/text()")
        loader.add_xpath("assists", ".//td[7]/text()")
        loader.add_xpath("points", ".//td[8]/text()")
        loader.add_xpath("plus_minus", ".//td[9]/text()")
        loader.add_xpath("penalty_minutes", ".//td[10]/text()")
        loader.add_xpath("pp_goals", ".//td[11]/text()")
        loader.add_xpath("pp_points", ".//td[12]/text()")
        loader.add_xpath("sh_goals", ".//td[13]/text()")
        loader.add_xpath("sh_points", ".//td[14]/text()")
        loader.add_xpath("gw_goals", ".//td[15]/text()")
        # NHL stopped tracking tying goals in 2005, forcing an adjustment
        if shootout:
            loader.add_xpath("ot_goals", ".//td[16]/text()")
            loader.add_xpath("shots", ".//td[17]/text()")
            loader.add_xpath("shot_pct", ".//td[18]/text()")
        else:
            loader.add_xpath("ot_goals", ".//td[17]/text()")
            loader.add_xpath("shots", ".//td[18]/text()")
            loader.add_xpath("shot_pct", ".//td[19]/text()")
        # feed item to pipeline
        yield loader.load_item()
def parse_post(self, response):
    post = ItemLoader(item=ArabiaPostItem(), response=response)
    post.default_output_processor = TakeFirst()
    #post.add_xpath('id', '//*[@class="post_content replace_urls"]/@id', MapCompose(int), re=r'(\d+)')
    post.add_xpath('id', '//*[@class="short_url inputtext"]/@value',
                   MapCompose(int), re=r'(\d+)')
    post.add_xpath('title', '//*[@id="nav_title"]/a/text()')
    post.add_xpath('up_votes', '//*[@class="s_upvotes"]/text()',
                   MapCompose(int), re=r'(\d+)')
    post.add_xpath('down_votes', '//*[@class="s_downvotes"]/text()',
                   MapCompose(int), re=r'(\d+)')
    post.add_xpath('points', '//*[@class="post_points ltr"]/text()', MapCompose(int))
    post.add_xpath('author_username', '//*[@class="block username"]/text()')
    post.add_xpath('author_fullname', '//*[@class="block full_name"]/text()',
                   MapCompose(lambda value: value.replace(u'\xa0', u'')))
    post.add_xpath('date', '//*[@class="icon-time"]/../text()')
    post.add_xpath('community', '//*[@class="icon-reorder"]/../a[1]/text()')
    post.add_xpath('topics', '//*[@class="topic"]/text()', MapCompose(string.strip))
    post.add_xpath('url', '//*[@class="short_url inputtext"]/@value')
    post.add_value(
        'type',
        'link' if post.get_xpath('//*[@id="nav_title"]/a/@rel', TakeFirst()) == 'nofollow'
        else 'text')
    if post.get_output_value('type') == 'link':
        post.add_xpath('link', '//*[@id="nav_title"]/a/@href')
        post.add_xpath('domain', '//*[@class="post_domain"]/text()', re=r'\((.+?)\)')
    post.add_xpath('content', '//*[@class="post_content replace_urls"]/*', Join('\n'))
    post.add_value('item', 'post')
    yield post.load_item()

    comments = []
    for row in response.selector.xpath('//*[contains(@class, "post_comment")]'):
        comment = ItemLoader(item=ArabiaCommentItem(), selector=row, response=response)
        comment.default_output_processor = TakeFirst()
        comment.add_xpath('id', './@id', re=r'(\d+)')
        comment.add_xpath('index', './@class', MapCompose(int), re=r'index(\d+)')
        comment.add_value('post_id', post.get_output_value('id'))
        #comment.add_value('parent_id', '')
        comment.add_xpath('author_username', './/*[@class="comment_user"]/a/text()')
        comment.add_xpath('date', './/*[@class="comment_date"]/text()')
        comment.add_xpath('points', './/*[@class="comment_points ltr"]/text()')
        comment.add_xpath('content',
                          './/*[@class="post_content comment_content replace_urls"]/*',
                          Join('\n'))
        #comment.add_xpath('url', './/*[@class="comment_short_url"]/a/@href')
        comment.add_value('url', 'https://arabia.io/go/{0}/{1}'.format(
            post.get_output_value('id'), comment.get_output_value('id')))
        comment.add_value('item', 'comment')
        comments.append(comment)
    for (index, comment) in enumerate(comments):
        if comment.get_output_value('index') == 0:
            comment.add_value('parent_id', 0)
            continue
        for comment_cursor in comments[:index][::-1]:
            if comment_cursor.get_output_value('index') == comment.get_output_value('index') - 1:
                comment.add_value('parent_id', comment_cursor.get_output_value('id'))
                break
    for comment in comments:
        yield comment.load_item()
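# The two passes over `comments` above rebuild the thread tree from the flat
# page order: each comment carries an indentation "index" scraped from its CSS
# class, a root comment (index 0) gets parent_id 0, and any other comment's
# parent is the nearest *preceding* comment whose index is exactly one less --
# the standard way to recover parent links from a depth-annotated list.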
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    # fetch all regions URLs
    if response.url == 'http://www.cehq.gouv.qc.ca/suivihydro/default.asp':
        regions_urls = self.get_regions_urls(hxs)
        for url in regions_urls:
            yield Request(url, callback=self.parse)
    # to fetch all stations URLs
    if 'ListeStation.asp' in response.url:
        stations_urls = self.get_stations_urls(hxs)
        for url in stations_urls:
            yield Request(url, callback=self.parse)
    # to fetch their file information,
    if 'graphique.asp' in response.url:
        (station_id, name, description, municipality, region,
         lake_or_river_name, hydrographic_region, drainage_basin,
         flow_regime, federal_station_number) = self.get_station_items(hxs)[:10]
        # update our items,
        l = ItemLoader(item=StationHydrique())
        l.default_output_processor = processor.TakeFirst()
        l.add_value('entry_type', 'station')
        l.add_value('station_id', station_id)
        l.add_value('hack', 'station' + station_id)
        l.add_value('name', name)
        l.add_value('description', description)
        l.add_value('municipality', municipality)
        l.add_value('region', region)
        l.add_value('lake_or_river_name', lake_or_river_name)
        l.add_value('hydrographic_region', hydrographic_region)
        l.add_value('drainage_basin', drainage_basin)
        l.add_value('flow_regime', flow_regime)
        l.add_value('federal_station_number', federal_station_number)
        yield l.load_item()
        # and fetch any data table URL available
        data_table_url = self.get_data_table_url(hxs)
        if data_table_url:
            yield Request(data_table_url, callback=self.parse)
    # to store all of it...
    if 'tableau.asp' in response.url:
        station_id = response.url.split('NoStation=')[1].split('&')[0]
        stats = self.get_data_table_statistics(hxs)
        for stat in stats:
            l = ItemLoader(item=HistoricalWaterFlow())
            l.default_output_processor = processor.TakeFirst()
            l.add_value('entry_type', 'historical')
            l.add_value('station_id', station_id)
            l.add_value('date', stat[0])
            l.add_value('time', stat[1])
            l.add_value('hack', station_id + stat[0] + stat[1])
            l.add_value('water_flow', stat[2])
            yield l.load_item()