def parse_item(self, response):
    """Parse empty-net / penalty-shot goal stats for skaters.

    Yields one SkatEngItem per table row (player).
    """
    rows = Selector(response).xpath(
        '/html//div[@class="table-container"]/table/tbody/tr')
    # Seasons after 2005 include shootout columns, which changes where the
    # EN/PS goal columns sit in the table.
    first_col = 20 if self.year > 2005 else 21
    for player_row in rows:
        loader = ItemLoader(SkatEngItem(), selector=player_row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # the last seven characters of the player's profile URL are the
        # unique NHL ID
        profile_url = player_row.xpath("td[2]/a/@href").extract()[0]
        loader.add_value("nhl_num", profile_url[-7:])
        loader.add_value("season", str(self.year))
        loader.add_xpath("en_goals", ".//td[%d]/text()" % first_col)
        loader.add_xpath("ps_goals", ".//td[%d]/text()" % (first_col + 1,))
        # feed item to pipeline
        yield loader.load_item()
def parse_item(self, response):
    """Parse real-time stats (hits, blocks, faceoffs, ...) for skaters.

    Yields one SkatRTSItem per table row (player).
    """
    # stat fields and the table column each one lives in
    FIELD_COLUMNS = (
        ("hits", 6),
        ("blocked_shots", 7),
        ("missed_shots", 8),
        ("giveaways", 9),
        ("takeaways", 10),
        ("faceoff_wins", 11),
        ("faceoff_losses", 12),
    )
    for player_row in Selector(response).xpath(
            '/html//div[@class="table-container"]/table/tbody/tr'):
        loader = ItemLoader(SkatRTSItem(), selector=player_row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # unique NHL ID number from the player's page URL
        profile_url = player_row.xpath("td[2]/a/@href").extract()[0]
        loader.add_value("nhl_num", profile_url[-7:])
        loader.add_value("season", str(self.year))
        for field, column in FIELD_COLUMNS:
            loader.add_xpath(field, ".//td[%d]/text()" % column)
        # feed item to pipeline
        yield loader.load_item()
def parse_item(self, response):
    """Parse time-on-ice stats for skaters, converting each TOI value
    from "m,mmm:ss" to a total number of seconds.

    Yields one SkatTOIItem per table row (player).
    """
    sel = Selector(response)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # TOI categories occupy every other column starting at td[6]
    CATEG = ["es_toi", "sh_toi", "pp_toi", "toi"]
    for row in rows:
        loader = ItemLoader(SkatTOIItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        loader.add_value("nhl_num", num[0][-7:])
        # add season data
        loader.add_value("season", str(self.year))
        # BUG FIX: the original indexed CATEG with (i - 6) / 2, which is
        # float (true) division under Python 3 and raises TypeError; iterate
        # the categories directly and derive the column instead.
        for idx, field in enumerate(CATEG):
            col = 6 + 2 * idx  # td[6], td[8], td[10], td[12]
            raw = row.xpath("td[%d]/text()" % col).extract()[0]
            minutes, seconds = raw.split(":")
            minutes = minutes.replace(",", "")  # drop thousands separator
            loader.add_value(field, str(60 * int(minutes) + int(seconds)))
        # feed item to pipeline
        yield loader.load_item()
def parse_item(self, response):
    """Parse situational save stats (ES/PP/SH splits) for goalies.

    Yields one GoalSTItem per table row (goalie).
    """
    stat_fields = (
        'es_shots_against', 'es_goals_against', 'es_saves', 'es_save_pct',
        'pp_shots_against', 'pp_goals_against', 'pp_saves', 'pp_save_pct',
        'sh_shots_against', 'sh_goals_against', 'sh_saves', 'sh_save_pct',
    )
    for goalie_row in Selector(response).xpath(
            '/html//div[@class="table-container"]/table/tbody/tr'):
        loader = ItemLoader(GoalSTItem(), selector=goalie_row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # unique NHL ID number from the goalie's page URL
        profile_url = goalie_row.xpath('td[2]/a/@href').extract()[0]
        loader.add_value('nhl_num', profile_url[-7:])
        loader.add_value('season', str(self.year))
        # the twelve stats occupy consecutive columns starting at td[6]
        for offset, field in enumerate(stat_fields):
            loader.add_xpath(field, './/td[%d]/text()' % (6 + offset,))
        # feed item to pipeline
        yield loader.load_item()
def parse_course_item(self, response):
    """Scrape one course page into a CourseItem, then follow the page's
    Timetable link with the course identity attached in request meta.
    """
    url_obj = urlparse(response.url)
    l = ItemLoader(item=CourseItem(), response=response)
    # strip whitespace on input; keep only the first extracted value
    l.default_input_processor = MapCompose(unicode.strip)
    l.default_output_processor = TakeFirst()
    # core identity comes from the DC.* meta tags in the page head
    l.add_xpath('code', "/html/head/meta[@name='DC.Subject.ProgramCode']/@content")
    l.add_xpath('name', "/html/head/meta[@name='DC.Subject.Description.Short']/@content")
    l.add_xpath('career', "/html/head/meta[@name='DC.Subject.Level']/@content")
    # year is the parent directory name in the URL path; pass it through
    # unmodified (unicode.strip would otherwise apply)
    l.year_in = Identity()
    l.add_value('year', ppath.basename(ppath.dirname(url_obj.path)))
    l.add_value('src_url', unicode(response.url))
    l.add_xpath('uoc', "/html/head/meta[@name='DC.Subject.UOC']/@content")
    # GenED flag arrives as a literal 'Y'/'N' -- convert to bool on input
    l.gened_in = MapCompose(unicode.strip, lambda s: s == 'Y')
    l.add_xpath('gened', "/html/head/meta[@name='DC.Subject.GenED']/@content")
    l.add_xpath('faculty', "/html/head/meta[@name='DC.Subject.Faculty']/@content")
    # remaining fields live in the summary block of the content column
    l.add_xpath('school', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'School')]]]/a/text()"))
    l.add_xpath('campus', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'Campus')]]]/text()"))
    l.add_xpath('prereqs_str', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[text()[contains(.,'Prerequisite:')]]/text()"),
        re=r'Prerequisite:\s(.+)')
    l.add_xpath('eftsl', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'EFTSL')]]]/text()"))
    l.add_xpath('description_markup', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/h2[text()='Description']/following-sibling::div"))
    course_item = l.load_item()
    yield course_item
    # follow the Timetable link; the class parser needs the course identity
    # (code/career/year) to associate the classes it finds with this course
    yield Request(url=response.xpath((
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']//a[text()[contains(.,'Timetable')]]/@href")).extract()[0],
        callback=self.parse_class_item,
        meta=dict(course_identifier={k: course_item.get(k, None) for k in ('code', 'career', 'year', )}))
def parse_item(self, response):
    """Parse shootout stats for skaters; yields one SkatSOItem per row."""
    table_rows = Selector(response).xpath(
        '/html//div[@class="table-container"]/table/tbody/tr')
    for player_row in table_rows:
        loader = ItemLoader(SkatSOItem(), selector=player_row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # unique NHL ID number from the player's page URL
        profile_url = player_row.xpath("td[2]/a/@href").extract()[0]
        loader.add_value("nhl_num", profile_url[-7:])
        loader.add_value("season", str(self.year))
        # shootout stats occupy columns 13 through 16
        for field, col in (("so_shots", 13), ("so_goals", 14),
                           ("so_pct", 15), ("game_deciding_goals", 16)):
            loader.add_xpath(field, ".//td[%d]/text()" % col)
        # feed item to pipeline
        yield loader.load_item()
def parse(self, response):
    """Parse league standings; yields one StandingsItem per team row."""
    team_rows = Selector(response).xpath(
        '/html//div[@class="contentBlock"]/table/tbody/tr')
    stat_columns = (('division', 3), ('games_played', 4), ('wins', 5),
                    ('losses', 6), ('ot_losses', 7), ('points', 8),
                    ('row', 9))
    for team_row in team_rows:
        loader = ItemLoader(StandingsItem(), selector=team_row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # team identifier from the rel attribute of the first team link
        loader.add_value('team', team_row.xpath('td[2]/a[1]/@rel').extract())
        for field, col in stat_columns:
            loader.add_xpath(field, './/td[%d]/text()' % col)
        # feed item to pipeline
        yield loader.load_item()
def parse_template(self, response):
    """Scrape one template listing page into a WrapBootstrapTemplate item."""
    body = response.body_as_unicode()
    # A "cups of coffee" widget, when present, shifts the pricing divs down
    # by one slot, so the div index must be adjusted.
    price_div = 3 if 'cups of coffee' in body else 2
    prop_xpath = '//div[@class="info_wrapper"]//tr[td[@class="key"]/strong/text() = "{}:"]/td[@class="value"]/text()'
    substr_xpath = 'substring-after(normalize-space({}), "{}")'

    def cost_xpath(position):
        # price span inside the Nth pricing sub-div, with the '$' stripped
        return substr_xpath.format(
            '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[{}]//span/text()'
            .format(price_div, position), '$')

    item_fields = {
        'item_hash': '//*[@id="offer_sku"]/text()',
        'title': '//*[@id="thing_name"]/text()',
        'thumbnail': '//*[@id="thing_image"]/@src',
        'description': '//*[@id="description"]',
        'creator': '//*[@id="product_manufacturer"]/text()',
        'when': prop_xpath.format('Released'),
        'bootstrap_version': substr_xpath.format(
            prop_xpath.format('Bootstrap'), 'Compatible with '),
        'cost_single': cost_xpath(1),
        'cost_multiple': cost_xpath(2),
        'cost_extended': cost_xpath(3),
        'purchases': '//div[@class="purchases"]/span[@class="count"]/text()',
    }
    loader = ItemLoader(WrapBootstrapTemplate(), selector=Selector(response))
    # strip whitespace on input, join fragments on output
    loader.default_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    for field, xpath in item_fields.iteritems():
        loader.add_xpath(field, xpath)
    yield loader.load_item()
def parse_item(self, response):
    """Parse skater bio rows: name, NHL ID, birthday, position, draft info."""
    MONTHS = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
              "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
              "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12"}
    rows = Selector(response).xpath(
        '/html//div[@class="table-container"]/table/tbody/tr')
    for player_row in rows:
        loader = ItemLoader(SkatBioItem(), selector=player_row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # name is "First Rest-of-name" -- split on the first space only
        full_name = player_row.xpath("td[2]/a/text()").extract()[0]
        first, last = full_name.split(" ", 1)
        loader.add_value("first_name", first)
        loader.add_value("last_name", last)
        # unique NHL ID number from the player's page URL
        profile_url = player_row.xpath("td[2]/a/@href").extract()[0]
        loader.add_value("nhl_num", profile_url[-7:])
        # birth date cell: month abbreviation, day, then a two-digit year
        # NOTE(review): the century is hard-coded to "19"; a player born in
        # or after 2000 would get the wrong year -- confirm source data range.
        born = player_row.xpath("td[5]/text()").extract()[0]
        loader.add_value("birthday", "%s-%s-%s" % ("19" + born[-2:],
                                                   MONTHS[born[:3]],
                                                   born[4:6]))
        # other data points
        loader.add_xpath("position", ".//td[4]/text()")
        loader.add_xpath("draft_year", ".//td[12]/text()")
        loader.add_xpath("draft_position", ".//td[14]/text()")
        # feed item to pipeline
        yield loader.load_item()
def parse_item(self, response):
    """Parse goalie bio rows: NHL ID, name, birthday, draft info."""
    month_num = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
                 'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
                 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
    for goalie_row in Selector(response).xpath(
            '/html//div[@class="table-container"]/table/tbody/tr'):
        loader = ItemLoader(GoalBioItem(), selector=goalie_row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # unique NHL ID number from the goalie's page URL
        href = goalie_row.xpath('td[2]/a/@href').extract()[0]
        loader.add_value('nhl_num', href[-7:])
        # name is "First Rest-of-name" -- split on the first space only
        first, last = goalie_row.xpath('td[2]/a/text()').extract()[0].split(' ', 1)
        loader.add_value('first_name', first)
        loader.add_value('last_name', last)
        # NOTE(review): the century is hard-coded to "19"; wrong for any
        # post-1999 birth date -- confirm source data range.
        born = goalie_row.xpath('td[4]/text()').extract()[0]
        loader.add_value('birthday', "%s-%s-%s" % ("19" + born[-2:],
                                                   month_num[born[:3]],
                                                   born[4:6]))
        # this page lists goalies only
        loader.add_value('position', 'G')
        loader.add_xpath('draft_year', './/td[12]/text()')
        loader.add_xpath('draft_position', './/td[14]/text()')
        # feed item to pipeline
        yield loader.load_item()
def parse(self, response):
    """Build a ProductItem from each anchor under //ul/li/strong."""
    collected = []
    for anchor in response.xpath('//ul/li/strong/a'):
        loader = ItemLoader(ProductItem(), anchor)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        loader.add_xpath('name', 'text()')
        loader.add_xpath('price', '@href')
        loader.add_xpath('stock', '@mon')
        loader.add_value('last_updated', 'today')  # you can also use literal values
        product = self.to_utf8(loader.load_item(),
                               *['name', 'price', 'stock', 'last_updated'])
        self.log(product['name'], log.INFO)
        collected.append(product)
    return collected
def parse(self, response):
    """Parse each title node matched by self.view into an AmazonItem.

    BUG FIX: the original read a bare name ``view`` (NameError at runtime)
    and applied the xpath twice (once via ``response.selector.xpath(view)``,
    again via the deprecated ``selector.select(self.view)``); it also
    ``return``-ed from inside the loop, emitting only the first item.
    """
    # iterate over titles
    for page in response.selector.xpath(self.view):
        loader = ItemLoader(AmazonItem(), page)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        # NOTE(review): the attribute is spelled `iem_fields` in the
        # original -- likely a typo for `item_fields`; confirm against the
        # spider's class definition before renaming.
        for field, xpath in self.iem_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse(self, response):
    """Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://odds.500.com/index_jczq_2014-08-29.shtml
    """
    selector = Selector(response)
    # iterate over matchs
    for match in selector.select(self.match_list_xpath):
        loader = ItemLoader(Match(), selector=match)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.match_fields.iteritems():
            loader.add_xpath(field, xpath)
        match_item = loader.load_item()
        match_item["game_date"] = self.game_date
        # ids arrive as "...-<id>" slugs; keep only the trailing id
        match_item["season_id"] = match_item["season_id"].split('-')[-1]
        match_item["teama_id"] = match_item["teama_id"].split('-')[-1]
        match_item["teamb_id"] = match_item["teamb_id"].split('-')[-1]
        if "score" in match_item:
            sa, sb = match_item["score"].split(':')
            match_item["score_a"] = sa
            match_item["score_b"] = sb
            # BUG FIX: compare goals numerically; the original compared the
            # raw strings, so e.g. "10" sorted below "9" lexicographically.
            ia, ib = int(sa), int(sb)
            match_item["result"] = "win" if ia > ib else "draw" if ia == ib else "lost"
        else:
            match_item["score_a"] = match_item["score_b"] = -1
            match_item["result"] = "none"
        yield match_item
        # scrap asia odds, paged 30 records at a time over three pages
        # id=454359&ctype=1&start=60&r=1&style=0&guojia=0
        for i in xrange(3):
            url = self.asia_odds_url % (match_item["match_id"], i * 30)
            request = scrapy.Request(url, callback=self.parse_asia_odds)
            request.meta['match_item'] = match_item
            yield request
def parse_class_item(self, response):
    """Parse every class (section) detail table on a timetable page.

    For each class: yields a ClassItem tagged with the course identity
    passed via response.meta, then a MeetingItem for every meeting row
    found in the class's nested table.
    """
    course_identifier = response.meta.get('course_identifier')
    for sem in response.xpath((
            "//table[tr[td[@class='classSearchSectionHeading']"
            "[text()[contains(.,'Detail')]]]]/following-sibling::table[1]")):
        for class_detail in sem.xpath("tr/td[@class='formBody']/table"):
            l = ItemLoader(item=ClassItem(), selector=class_detail)
            l.default_input_processor = MapCompose(unicode.strip)
            l.default_output_processor = TakeFirst()
            # simple label/data cell pairs
            l.add_xpath('class_nbr',
                        "tr/td[@class='label'][text()='Class Nbr']/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('activity',
                        "tr/td[@class='label'][text()='Activity']/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('section',
                        "tr/td[@class='label'][text()='Section']/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('teaching',
                        "tr/td[@class='label'][a[text()='Teaching Period']]/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('status',
                        "tr/td[@class='label'][text()='Status']/following-sibling::td[@class='data'][1]/font/text()")
            # "Enrols/Capacity" is a single "N/M" cell, split by regex
            l.add_xpath('enrolments',
                        "tr/td[@class='label'][text()='Enrols/Capacity']/following-sibling::td[@class='data'][1]/text()",
                        re=r'(\d+)/\d+')
            l.add_xpath('capacity',
                        "tr/td[@class='label'][text()='Enrols/Capacity']/following-sibling::td[@class='data'][1]/text()",
                        re=r'\d+/(\d+)')
            # parse all date-like fields into datetime objects on input
            l.offering_start_in = l.offering_end_in = l.updated_in = l.census_date_in = MapCompose(date_parser.parse)
            l.add_xpath('offering_start',
                        "tr/td[@class='label'][text()='Offering Period']/following-sibling::td[@class='data'][1]/text()",
                        re=r'([\d/]*)\s-\s[\d/]*')
            # BUG FIX: the end-date group was written ([\d/])* -- the star
            # sat outside the group, so at most one character of the date
            # was captured; the repetition must be inside the group.
            l.add_xpath('offering_end',
                        "tr/td[@class='label'][text()='Offering Period']/following-sibling::td[@class='data'][1]/text()",
                        re=r'[\d/]*\s-\s([\d/]*)')
            l.add_xpath('census_date',
                        "tr/td[@class='label'][a[text()='Census Date']]/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('consent',
                        "tr/td[@class='label'][text()='Consent']/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('mode',
                        "tr/td[@class='label'][text()='Instruction Mode']/following-sibling::td[@class='data'][1]/text()")
            l.add_value('src_url', unicode(response.url))
            l.add_xpath('updated',
                        "//td[@class='note'][text()[contains(., 'Data is correct as at')]]/text()",
                        re=r'Data is correct as at ([\w\s\-:,]*)')
            # course_identifier is already a dict; bypass default processors
            l.course_identifier_in = Identity()
            l.add_value('course_identifier', course_identifier)
            class_item = l.load_item()
            yield class_item
            # meeting rows carry day / time / location / weeks / instructor
            for meeting in class_detail.xpath("tr/td[@class='formBody']/table/tr[@class='rowHighlight' or @class='rowLowlight']"):
                m = MeetingItem()
                m['class_identifier'] = {k: class_item.get(k, None) for k in ('class_nbr', )}
                d = dict(
                    zip(
                        ('day', 'time', 'location', 'weeks', 'instructor'),
                        meeting.xpath("td[@class='data']/text()").extract()
                    )
                )
                # split "HH:MM - HH:MM" into start and end
                time = d.pop('time')
                d['time_start'], d['time_end'] = time.split(' - ')
                m.update(d)
                yield m
def parse_template(self, response):
    """Extract a single template listing into a WrapBootstrapTemplate item."""
    page_source = response.body_as_unicode()
    # The coffee widget, when present in the page, pushes the pricing divs
    # down by one slot.
    has_coffee = 'cups of coffee' in page_source
    base = 3 if has_coffee else 2
    prop_xpath = '//div[@class="info_wrapper"]//tr[td[@class="key"]/strong/text() = "{}:"]/td[@class="value"]/text()'
    substr_xpath = 'substring-after(normalize-space({}), "{}")'
    price_tpl = '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[{}]//span/text()'

    loader = ItemLoader(WrapBootstrapTemplate(), selector=Selector(response))
    # strip whitespace on input, join fragments on output
    loader.default_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    loader.add_xpath('item_hash', '//*[@id="offer_sku"]/text()')
    loader.add_xpath('title', '//*[@id="thing_name"]/text()')
    loader.add_xpath('thumbnail', '//*[@id="thing_image"]/@src')
    loader.add_xpath('description', '//*[@id="description"]')
    loader.add_xpath('creator', '//*[@id="product_manufacturer"]/text()')
    loader.add_xpath('when', prop_xpath.format('Released'))
    loader.add_xpath('bootstrap_version',
                     substr_xpath.format(prop_xpath.format('Bootstrap'),
                                         'Compatible with '))
    # the three price tiers share one template; '$' is stripped via XPath
    loader.add_xpath('cost_single',
                     substr_xpath.format(price_tpl.format(base, 1), '$'))
    loader.add_xpath('cost_multiple',
                     substr_xpath.format(price_tpl.format(base, 2), '$'))
    loader.add_xpath('cost_extended',
                     substr_xpath.format(price_tpl.format(base, 3), '$'))
    loader.add_xpath('purchases',
                     '//div[@class="purchases"]/span[@class="count"]/text()')
    yield loader.load_item()
def parse_asia_odds(self, response):
    """Parse one page of Asian-handicap odds for the match carried in
    response.meta['match_item']."""
    match_item = response.meta['match_item']
    for odds_row in Selector(response).select(self.asia_odds__xpath):
        loader = ItemLoader(AsiaOdds(), selector=odds_row)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        for field, xpath in self.asia_odds_fields.iteritems():
            loader.add_xpath(field, xpath)
        odds_item = loader.load_item()
        # company page link looks like http://odds.500.com/yazhi.php?cid=515
        odds_item["match_id"] = match_item["match_id"]
        odds_item["company_id"] = odds_item["company_id"].split('=')[-1]
        # strip the up/down trend arrows from the water values
        for key in ("water_a", "water_b"):
            odds_item[key] = odds_item[key].replace(self.UP_CHAR, '').replace(self.DOWN_CHAR, '')
        yield odds_item
def parse2(self, response):
    """Parse faculty contact rows, resolving each contact's e-mail address
    with a follow-up HTTP request to the directory host.
    """
    hxs = Selector(response)
    items = hxs.xpath(self.deals_list_xpath)
    for item in items:
        loader = ItemLoader(faculty_contact(), selector=item)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        for field, xpath in self.item_fields.iteritems():
            if field == 'email':
                # the e-mail is behind a per-contact page; fetch it and pull
                # the mailto: href out of that page
                link = item.xpath(self.item_fields['email'])
                r = httplib.HTTPConnection('dir.aucegypt.edu')
                try:
                    r.request('GET', '/'+link.extract()[0])
                    res = r.getresponse()
                    data = res.read()
                    email_selection = Selector(text=data)
                    email = email_selection.xpath('//@href')
                    loader.add_value('email', unicode(urllib.unquote(email.extract()[0]).replace('mailto:', '')))
                except IndexError:
                    # no link or no mailto found -> store an empty address
                    loader.add_value('email', u'')
                finally:
                    # BUG FIX: the original never closed the connection,
                    # leaking one socket per contact row
                    r.close()
            else:
                loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse_items(self, response):
    """Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    page = Selector(response)
    # one item per article node
    for article_sel in page.xpath(self.main_article_xpath):
        loader = ItemLoader(WwfArticle(), selector=article_sel)
        # strip whitespace on input, join fragments on output
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        for field_name, field_xpath in self.item_fields.iteritems():
            loader.add_xpath(field_name, field_xpath)
        yield loader.load_item()
def parse(self, response):
    """Parse playoff game rows into PlayoffsItems: season, date, the two
    team codes, both scores, and whether the game ended in REG/OT/SO.
    """
    sel = Selector(response)
    # collect xpaths of each team (row in table)
    rows = sel.xpath('/html//div[@class="contentBlock"]/table/tbody/tr')
    # define collection of teams that play in games we care about
    # (some teams have moved, but we still want the old games)
    # We'll grab the first four letters of the team to distinguish,
    # then assign the NHL's standard 3-letter code.
    TEAMS = {'Anah': 'ANA', 'Ariz': 'ARI', 'Calg': 'CGY', 'Edmo': 'EDM',
             'Los ': 'LAK', 'San ': 'SJS', 'Vanc': 'VAN', 'Chic': 'CHI',
             'Colo': 'COL', 'Dall': 'DAL', 'Minn': 'MIN', 'Nash': 'NAS',
             'St. ': 'STL', 'Winn': 'WPG', 'Bost': 'BOS', 'Buff': 'BUF',
             'Detr': 'DET', 'Flor': 'FLA', 'Mont': 'MTL', 'Otta': 'OTT',
             'Tamp': 'TBL', 'Toro': 'TOR', 'Caro': 'CAR', 'Colu': 'CBJ',
             'New ': 'NJD', 'NY I': 'NYI', 'NY R': 'NYR', 'Phil': 'PHI',
             'Pitt': 'PIT', 'Wash': 'WAS', 'Phoe': 'ARI', 'Atla': 'WPG'}
    # loop through teams
    for row in rows:
        # colspan == '1' distinguishes real game rows from spanning
        # header/separator rows
        if row.xpath('td[1]/@colspan').extract()[0] == '1':
            loader = ItemLoader(PlayoffsItem(), selector=row)
            loader.default_input_processor = MapCompose()
            loader.default_output_processor = Join()
            # add season and date
            loader.add_value('season', str(self.year))
            # date cell carries a weekday prefix (e.g. "Wed Apr 16, 2014");
            # slicing [4:] drops it before strptime
            date = datetime.strptime(row.xpath('td[1]/div[1]/text()').extract()[0][4:], '%b %d, %Y').date()
            loader.add_value('date', str(date))
            # get team identifiers: prefer the rel attribute of the team
            # link; fall back to the TEAMS prefix table for plain-text cells
            away = ''
            if row.xpath('td[2]/a[1]/@rel').extract():
                away = row.xpath('td[2]/a[1]/@rel').extract()[0]
            elif row.xpath('td[2]/div/text()').extract()[0][:4] in TEAMS:
                away = TEAMS[row.xpath('td[2]/div/text()').extract()[0][:4]]
            # only emit an item when the away team was recognized
            if away:
                if row.xpath('td[3]/a[1]/@rel').extract():
                    home = row.xpath('td[3]/a[1]/@rel').extract()[0]
                else:
                    home = TEAMS[row.xpath('td[3]/div/text()').extract()[0][:4]]
                loader.add_value('away', away)
                loader.add_value('home', home)
                # collect and parse results: goal totals sit inside
                # parentheses in each score span
                away_score = row.xpath('td[5]/span[1]/text()').extract()[0].replace('\n', '').strip()
                match = re.search('\(.*?\)', away_score)
                loader.add_value('away_score', match.group(0)[1:-1])
                home_score = row.xpath('td[5]/span[2]/text()').extract()[0].replace('\n', '').strip()
                match = re.search('\(.*?\)', home_score)
                loader.add_value('home_score', match.group(0)[1:-1])
                # text after the closing paren says how the game ended:
                # 'S/O' (shootout), 'OT', or nothing for regulation
                match = re.search('\).*', home_score)
                result = match.group(0)[1:]
                if result == 'S/O':
                    output = 'SO'
                elif result == 'OT':
                    output = 'OT'
                else:
                    output = 'REG'
                loader.add_value('result', output)
                # feed item to pipeline
                yield loader.load_item()
        else:
            pass
def parse_item(self, response):
    """Parse season-summary stats for goalies; yields one GoalSumItem
    per table row.
    """
    table_rows = Selector(response).xpath(
        '/html//div[@class="table-container"]/table/tbody/tr')
    # seasons after 2005 include shootout columns, shifting the later
    # stat columns left by one
    shootout = 1 if self.year > 2005 else 0
    for row in table_rows:
        loader = ItemLoader(GoalSumItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # unique NHL ID number from the goalie's page URL
        profile_url = row.xpath('td[2]/a/@href').extract()[0]
        loader.add_value('nhl_num', profile_url[-7:])
        loader.add_value('season', str(self.year))
        # a goalie who spent the whole season with one extant team has a
        # link to the team page; otherwise td[3] is comma-separated text
        if row.xpath('td[3]/a/text()').extract():
            loader.add_xpath('team', './/td[3]/a/text()')
            extra_teams = [None, None]
        else:
            clubs = row.xpath('td[3]/text()').extract()[0].split(', ')
            loader.add_value('team', clubs[0])
            # pad so team2/team3 are None when absent
            extra_teams = (clubs[1:3] + [None, None])[:2]
        loader.add_value('team2', extra_teams[0])
        loader.add_value('team3', extra_teams[1])
        loader.add_xpath('games_played', './/td[4]/text()')
        loader.add_xpath('games_started', './/td[5]/text()')
        loader.add_xpath('wins', './/td[6]/text()')
        loader.add_xpath('losses', './/td[7]/text()')
        # ties were no longer tracked once the shootout era began
        if shootout:
            loader.add_value('ties', '0')
        else:
            loader.add_xpath('ties', './/td[8]/text()')
        # remaining stats shift left by one column in shootout-era tables
        shifted_columns = (('overtime_losses', 9), ('shots_against', 10),
                           ('goals_against', 11), ('gaa', 12),
                           ('saves_', 13), ('save_pct', 14),
                           ('shutouts', 15), ('goals', 16),
                           ('assists', 17), ('penalty_minutes', 18))
        for field, base_col in shifted_columns:
            loader.add_xpath(field, './/td[%d]/text()' % (base_col - shootout,))
        # convert time on ice from "m,mmm:ss" to seconds
        raw_toi = row.xpath('td[%d]/text()' % (19 - shootout,)).extract()[0]
        minutes, seconds = raw_toi.split(':')
        minutes = minutes.replace(',', '')
        loader.add_value('toi', str(60 * int(minutes) + int(seconds)))
        # feed item to pipeline
        yield loader.load_item()
def parse_item(self, response):
    """Parse season-summary stats for skaters; yields one SkatSumItem
    per table row.
    """
    table_rows = Selector(response).xpath(
        '/html//div[@class="table-container"]/table/tbody/tr')
    # NHL stopped tracking tying goals in 2005, which shifts the final
    # columns in post-2005 (shootout-era) tables
    has_shootout = self.year > 2005
    for row in table_rows:
        loader = ItemLoader(SkatSumItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # unique NHL ID number from the player's page URL
        profile_url = row.xpath("td[2]/a/@href").extract()[0]
        loader.add_value("nhl_num", profile_url[-7:])
        loader.add_value("season", str(self.year))
        # single-team (extant franchise) players link to the team page;
        # otherwise td[3] holds comma-separated team names
        if row.xpath("td[3]/a/text()").extract():
            loader.add_xpath("team", ".//td[3]/a/text()")
            extras = [None, None]
        else:
            clubs = row.xpath("td[3]/text()").extract()[0].split(", ")
            loader.add_value("team", clubs[0])
            # pad so team2/team3 are None when absent
            extras = (clubs[1:3] + [None, None])[:2]
        loader.add_value("team2", extras[0])
        loader.add_value("team3", extras[1])
        # stats whose columns are the same in every era
        fixed_columns = (("games_played", 5), ("goals", 6), ("assists", 7),
                         ("points", 8), ("plus_minus", 9),
                         ("penalty_minutes", 10), ("pp_goals", 11),
                         ("pp_points", 12), ("sh_goals", 13),
                         ("sh_points", 14), ("gw_goals", 15))
        for field, col in fixed_columns:
            loader.add_xpath(field, ".//td[%d]/text()" % col)
        # trailing stats start one column earlier once tying goals vanished
        first_tail_col = 16 if has_shootout else 17
        for offset, field in enumerate(("ot_goals", "shots", "shot_pct")):
            loader.add_xpath(field, ".//td[%d]/text()" % (first_tail_col + offset,))
        # feed item to pipeline
        yield loader.load_item()