def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    for row in rows:
        loader = ItemLoader(GoalSTItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath('td[2]/a/@href').extract()
        sNum = num[0][-7:]
        loader.add_value('nhl_num', sNum)
        # add season data
        loader.add_value('season', str(self.year))
        # collect additional stats
        loader.add_xpath('es_shots_against', './/td[6]/text()')
        loader.add_xpath('es_goals_against', './/td[7]/text()')
        loader.add_xpath('es_saves', './/td[8]/text()')
        loader.add_xpath('es_save_pct', './/td[9]/text()')
        loader.add_xpath('pp_shots_against', './/td[10]/text()')
        loader.add_xpath('pp_goals_against', './/td[11]/text()')
        loader.add_xpath('pp_saves', './/td[12]/text()')
        loader.add_xpath('pp_save_pct', './/td[13]/text()')
        loader.add_xpath('sh_shots_against', './/td[14]/text()')
        loader.add_xpath('sh_goals_against', './/td[15]/text()')
        loader.add_xpath('sh_saves', './/td[16]/text()')
        loader.add_xpath('sh_save_pct', './/td[17]/text()')
        # feed item to pipeline
        yield loader.load_item()
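# A note on the processor defaults used throughout these spiders: MapCompose()
# with no functions passes each extracted value through unchanged, and Join()
# concatenates the resulting list with a single space, so a one-element list
# ends up as a plain string on the item. A minimal illustration (Scrapy 1.x
# import path):
from scrapy.loader.processors import MapCompose, Join
MapCompose()([u'12', u'34'])  # -> [u'12', u'34']
Join()([u'12', u'34'])        # -> u'12 34'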
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatSOItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # add season data
        loader.add_value("season", str(self.year))
        # collect stats
        loader.add_xpath("so_shots", ".//td[13]/text()")
        loader.add_xpath("so_goals", ".//td[14]/text()")
        loader.add_xpath("so_pct", ".//td[15]/text()")
        loader.add_xpath("game_deciding_goals", ".//td[16]/text()")
        # feed item to pipeline
        yield loader.load_item()
def parse_course_item(self, response):
    url_obj = urlparse(response.url)
    l = ItemLoader(item=CourseItem(), response=response)
    l.default_input_processor = MapCompose(unicode.strip)
    l.default_output_processor = TakeFirst()
    l.add_xpath('code', "/html/head/meta[@name='DC.Subject.ProgramCode']/@content")
    l.add_xpath('name', "/html/head/meta[@name='DC.Subject.Description.Short']/@content")
    l.add_xpath('career', "/html/head/meta[@name='DC.Subject.Level']/@content")
    l.year_in = Identity()
    l.add_value('year', ppath.basename(ppath.dirname(url_obj.path)))
    l.add_value('src_url', unicode(response.url))
    l.add_xpath('uoc', "/html/head/meta[@name='DC.Subject.UOC']/@content")
    l.gened_in = MapCompose(unicode.strip, lambda s: s == 'Y')
    l.add_xpath('gened', "/html/head/meta[@name='DC.Subject.GenED']/@content")
    l.add_xpath('faculty', "/html/head/meta[@name='DC.Subject.Faculty']/@content")
    l.add_xpath('school', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'School')]]]/a/text()"))
    l.add_xpath('campus', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'Campus')]]]/text()"))
    l.add_xpath('prereqs_str', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[text()[contains(.,'Prerequisite:')]]/text()"),
        re=r'Prerequisite:\s(.+)')
    l.add_xpath('eftsl', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'EFTSL')]]]/text()"))
    l.add_xpath('description_markup', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/h2[text()='Description']/following-sibling::div"))
    course_item = l.load_item()
    yield course_item
    yield Request(
        url=response.xpath(
            "//div[@class='column content-col']/div[@class='internalContentWrapper']"
            "/div[@class='summary']//a[text()[contains(.,'Timetable')]]/@href"
        ).extract()[0],
        callback=self.parse_class_item,
        meta=dict(course_identifier={k: course_item.get(k, None)
                                     for k in ('code', 'career', 'year')}))
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatTOIItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # add season data
        loader.add_value("season", str(self.year))
        # collect TOI stats after converting from m,mmm:ss to seconds;
        # the values sit in the even-numbered cells td[6] through td[12]
        CATEG = ["es_toi", "sh_toi", "pp_toi", "toi"]
        for i in xrange(6, 13, 2):
            temp = row.xpath("td[%d]/text()" % i).extract()[0]
            sTemp = temp.split(":")
            sTemp[0] = sTemp[0].replace(",", "")  # drop the thousands separator
            loader.add_value(CATEG[(i - 6) // 2],
                             str(60 * int(sTemp[0]) + int(sTemp[1])))
        # feed item to pipeline
        yield loader.load_item()
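# A minimal sketch of the time-on-ice conversion above, factored into a
# standalone helper; "toi_to_seconds" is a hypothetical name, not part of the
# original spider, and it assumes input shaped like "1,234:56".
def toi_to_seconds(toi):
    minutes, seconds = toi.split(":")
    minutes = minutes.replace(",", "")  # "1,234" -> "1234"
    return 60 * int(minutes) + int(seconds)

assert toi_to_seconds("1,234:56") == 74096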
def parse_content(self, response):
    '''Parse content pages.'''
    loader = ItemLoader(item=Rede(), response=response)
    # Usually, we are only interested in the first item, e.g. for title, place, etc.
    loader.default_output_processor = TakeFirst()
    # Add fields
    loader.add_value('link', response.url)
    loader.add_css('title', '.text h1', extract_text)
    # Test if the text has an abstract
    abstract = response.css('.abstract')
    if abstract:
        loader.add_css('abstract', '.abstract', extract_text)
        loader.add_css('text', '.abstract ~ p:not(.picture)', extract_text, Join('\n'))
    else:
        loader.add_css('text', '.text p:not(.picture)', extract_text, Join('\n'))
    # Metadata are in dt/dd pairs.
    keys = response.css('dl dt::text').extract()
    values = response.css('dl dd::text').extract()
    for key, value in zip(keys, values):
        if key == 'Datum:':
            match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{2,4})', value)
            if match:
                # '22.03.2011' format
                value = match.group(1)
                dt = datetime.strptime(value.encode(ENC), '%d.%m.%Y')
            else:
                # '22. März 2011' format
                dt = datetime.strptime(value.encode(ENC), '%d. %B %Y')
            loader.add_value('date', dt.date())
        elif key == 'Ort:':
            loader.add_value('place', value)
    return loader.load_item()
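# The '%d. %B %Y' branch above only matches German month names ("22. März 2011")
# if the process locale has been switched to German first -- a sketch of that
# assumption, using the standard locale module (the exact locale name is
# platform-dependent, and ENC is assumed to be the matching codec):
import locale
locale.setlocale(locale.LC_TIME, 'de_DE.UTF-8')  # assumed to be installed on the host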
def parse(self, response):
    sel = Selector(response)
    # collect xpaths of each team (row in table)
    rows = sel.xpath('/html//div[@class="contentBlock"]/table/tbody/tr')
    # loop through teams
    for row in rows:
        loader = ItemLoader(StandingsItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get team identifier
        team = row.xpath('td[2]/a[1]/@rel').extract()
        loader.add_value('team', team)
        # collect several other data points
        loader.add_xpath('division', './/td[3]/text()')
        loader.add_xpath('games_played', './/td[4]/text()')
        loader.add_xpath('wins', './/td[5]/text()')
        loader.add_xpath('losses', './/td[6]/text()')
        loader.add_xpath('ot_losses', './/td[7]/text()')
        loader.add_xpath('points', './/td[8]/text()')
        loader.add_xpath('row', './/td[9]/text()')
        # feed item to pipeline
        yield loader.load_item()
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # prepare to adjust for shootout stats if necessary
    shootout = 0
    if self.year > 2005:
        shootout = 1
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatEngItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # add season data
        loader.add_value("season", str(self.year))
        # collect stats
        if shootout:
            loader.add_xpath("en_goals", ".//td[20]/text()")
            loader.add_xpath("ps_goals", ".//td[21]/text()")
        else:
            loader.add_xpath("en_goals", ".//td[21]/text()")
            loader.add_xpath("ps_goals", ".//td[22]/text()")
        # feed item to pipeline
        yield loader.load_item()
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatRTSItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # add season data
        loader.add_value("season", str(self.year))
        # collect stats
        loader.add_xpath("hits", ".//td[6]/text()")
        loader.add_xpath("blocked_shots", ".//td[7]/text()")
        loader.add_xpath("missed_shots", ".//td[8]/text()")
        loader.add_xpath("giveaways", ".//td[9]/text()")
        loader.add_xpath("takeaways", ".//td[10]/text()")
        loader.add_xpath("faceoff_wins", ".//td[11]/text()")
        loader.add_xpath("faceoff_losses", ".//td[12]/text()")
        # feed item to pipeline
        yield loader.load_item()
def parse_template(self, response):
    """
    Callback used by Scrapy to process downloaded responses

    //*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[4]/table/tbody/tr[10]/td[2]
    """
    response_body = response.body_as_unicode()
    # Check if coffee beans are present in the source, since that shifts the divs down
    coffee = 'cups of coffee' in response_body
    prop_xpath = ('//div[@class="info_wrapper"]//tr[td[@class="key"]/strong/text() = "{}:"]'
                  '/td[@class="value"]/text()')
    substr_xpath = 'substring-after(normalize-space({}), "{}")'
    item_fields = {
        'item_hash': '//*[@id="offer_sku"]/text()',
        'title': '//*[@id="thing_name"]/text()',
        'thumbnail': '//*[@id="thing_image"]/@src',
        'description': '//*[@id="description"]',
        'creator': '//*[@id="product_manufacturer"]/text()',
        'when': prop_xpath.format('Released'),
        'bootstrap_version': substr_xpath.format(prop_xpath.format('Bootstrap'),
                                                 'Compatible with '),
        'cost_single': substr_xpath.format(
            '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[1]//span/text()'
            .format(3 if coffee else 2), '$'),
        'cost_multiple': substr_xpath.format(
            '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[2]//span/text()'
            .format(3 if coffee else 2), '$'),
        'cost_extended': substr_xpath.format(
            '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[3]//span/text()'
            .format(3 if coffee else 2), '$'),
        'purchases': '//div[@class="purchases"]/span[@class="count"]/text()',
    }
    selector = Selector(response)
    loader = ItemLoader(WrapBootstrapTemplate(), selector=selector)
    # define processors
    loader.default_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    # iterate over fields and add xpaths to the loader
    for field, xpath in item_fields.iteritems():
        loader.add_xpath(field, xpath)
    yield loader.load_item()
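# How the substr_xpath template above composes: it wraps a node-returning XPath
# in XPath 1.0 string functions, so the expression itself does the trimming.
# A sketch with a hypothetical price cell:
#   substr_xpath.format('//span[@class="price"]/text()', '$')
# expands to:
#   substring-after(normalize-space(//span[@class="price"]/text()), "$")
# which turns '  $19  ' into '19' before the value ever reaches the item loader.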
def parse_item(self, response):
    l = ItemLoader(item=ItemloadItem(), response=response)
    l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
    l.add_xpath('name', '//*[@id="sched-page-me-name"]/text()')
    l.add_xpath('image_url', '//*[@id="myavatar"]/@src')
    l.add_xpath('friends', '//*[@id="sched-page-me-connections"]/ul/li/a/@title')
    l.add_xpath('title_company_location', '//*[@id="sched-page-me-profile-data"]/text()')
    l.add_xpath('links', '//*[@class="sched-network-link"]/a/@href')
    l.add_xpath('about', '//*[@id="sched-page-me-profile-about"]/text()')
    return l.load_item()
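# Caveat on the loader above: MapCompose used as an *output* processor returns a
# list (one cleaned string per extracted value), so multi-valued fields such as
# 'friends' and 'links' stay lists on the item rather than being joined into a
# single string.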
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # instantiate parsing variables
    MONTHS = {
        "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
        "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
        "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
    }
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatBioItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # parse the name
        name = row.xpath("td[2]/a/text()").extract()
        sName = name[0].split(" ", 1)
        loader.add_value("first_name", sName[0])
        loader.add_value("last_name", sName[1])
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # collect birth year (the two-digit year is assumed to be 19xx)
        bDate = row.xpath("td[5]/text()").extract()[0]
        bYear = "19" + bDate[-2:]
        bMonth = MONTHS[bDate[:3]]
        bDay = bDate[4:6]
        loader.add_value("birthday", "%s-%s-%s" % (bYear, bMonth, bDay))
        # collect other data points
        loader.add_xpath("position", ".//td[4]/text()")
        loader.add_xpath("draft_year", ".//td[12]/text()")
        loader.add_xpath("draft_position", ".//td[14]/text()")
        # feed item to pipeline
        yield loader.load_item()
def parse(self, response):
    direction = response.xpath('//li[@class="btn-schedules-active"][1]/text()').extract()
    day = response.xpath('//li[@class="btn-schedules-active"][2]/text()').extract()
    for sel in response.xpath('//tr'):
        loader = ItemLoader(item=RtdItem(), selector=sel)
        loader.default_output_processor = TakeFirst()
        loader.add_value('day', day)
        loader.add_value('direction', direction)
        loader.add_xpath('route', 'th/a/text()')
        loader.add_xpath('depart_time', 'td[1]/text()')
        loader.add_xpath('arrive_time', 'td[2]/text()')
        yield loader.load_item()
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # instantiate parsing variables
    MONTHS = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
              'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
              'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
    # loop through players
    for row in rows:
        loader = ItemLoader(GoalBioItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath('td[2]/a/@href').extract()
        sNum = num[0][-7:]
        loader.add_value('nhl_num', sNum)
        # parse the name
        name = row.xpath('td[2]/a/text()').extract()
        sName = name[0].split(' ', 1)
        loader.add_value('first_name', sName[0])
        loader.add_value('last_name', sName[1])
        # collect birth year (the two-digit year is assumed to be 19xx)
        bDate = row.xpath('td[4]/text()').extract()[0]
        bYear = "19" + bDate[-2:]
        bMonth = MONTHS[bDate[:3]]
        bDay = bDate[4:6]
        loader.add_value('birthday', "%s-%s-%s" % (bYear, bMonth, bDay))
        # add other data points
        loader.add_value('position', 'G')
        loader.add_xpath('draft_year', './/td[12]/text()')
        loader.add_xpath('draft_position', './/td[14]/text()')
        # feed item to pipeline
        yield loader.load_item()
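# A sketch of the birthday layout the slicing above assumes -- something like
# "Jun 15 '84": characters 0-2 are the month abbreviation, 4-5 the day, and the
# last two the (19xx) year. The sample string is illustrative, not taken from
# the site:
bDate = "Jun 15 '84"
assert bDate[:3] == "Jun" and bDate[4:6] == "15" and bDate[-2:] == "84"
# -> "1984-06-15" after the MONTHS lookup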
def parse(self, response):
    items = []
    for everyday in response.xpath('//ul/li/strong/a'):
        loader = ItemLoader(ProductItem(), everyday)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        loader.add_xpath('name', 'text()')
        loader.add_xpath('price', '@href')
        loader.add_xpath('stock', '@mon')
        loader.add_value('last_updated', 'today')  # you can also use literal values
        item = self.to_utf8(loader.load_item(),
                            *['name', 'price', 'stock', 'last_updated'])
        self.log(item['name'], log.INFO)
        items.append(item)
    return items
def parse(self, response):
    selector = Selector(response)
    # iterate over titles
    for page in selector.xpath(self.view):
        loader = ItemLoader(AmazonItem(), page)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse_question(self, response):
    question = ItemLoader(item=ArabiaQuestionItem(), response=response)
    question.default_output_processor = TakeFirst()
    question.add_xpath('id', '//*[@id="question_id"]/@value', MapCompose(int))
    question.add_xpath('asker_username', '//*[@class="question_meta"]/a/text()')
    question.add_xpath('answerer_username', '//*[@class="inblock username"]/text()')
    question.add_xpath('title', '//*[@class="question_title"]/h2/text()')
    question.add_xpath('date', '//*[@class="question_date"]/text()')
    question.add_xpath('content', '//*[@id="question_answer"]/*', Join('\n'))
    question.add_value('url', response.url)
    question.add_value('item', 'question')
    yield question.load_item()
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://odds.500.com/index_jczq_2014-08-29.shtml
    """
    selector = Selector(response)
    # iterate over matches
    for match in selector.xpath(self.match_list_xpath):
        loader = ItemLoader(Match(), selector=match)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.match_fields.iteritems():
            loader.add_xpath(field, xpath)
        match_item = loader.load_item()
        match_item["game_date"] = self.game_date
        match_item["season_id"] = match_item["season_id"].split('-')[-1]
        match_item["teama_id"] = match_item["teama_id"].split('-')[-1]
        match_item["teamb_id"] = match_item["teamb_id"].split('-')[-1]
        if "score" in match_item:
            sa, sb = match_item["score"].split(':')
            match_item["score_a"] = sa
            match_item["score_b"] = sb
            # compare numerically; comparing the raw strings would rank '10' below '9'
            ia, ib = int(sa), int(sb)
            match_item["result"] = "win" if ia > ib else "draw" if ia == ib else "lost"
        else:
            match_item["score_a"] = match_item["score_b"] = -1
            match_item["result"] = "none"
        yield match_item
        # scrape asia odds
        # id=454359&ctype=1&start=60&r=1&style=0&guojia=0
        for i in xrange(3):
            url = self.asia_odds_url % (match_item["match_id"], i * 30)
            request = scrapy.Request(url, callback=self.parse_asia_odds)
            request.meta['match_item'] = match_item
            yield request
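# The pagination loop above assumes self.asia_odds_url is a %-template with two
# placeholders (a match id and a start offset), e.g. a hypothetical value like
#   asia_odds_url = 'http://odds.500.com/fenxi/yazhi.php?id=%s&start=%d'
# so three requests are issued per match, for offsets 0, 30 and 60.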
def parse_class_item(self, response):
    course_identifier = response.meta.get('course_identifier')
    for sem in response.xpath(
            "//table[tr[td[@class='classSearchSectionHeading']"
            "[text()[contains(.,'Detail')]]]]/following-sibling::table[1]"):
        for class_detail in sem.xpath("tr/td[@class='formBody']/table"):
            l = ItemLoader(item=ClassItem(), selector=class_detail)
            l.default_input_processor = MapCompose(unicode.strip)
            l.default_output_processor = TakeFirst()
            l.add_xpath('class_nbr', "tr/td[@class='label'][text()='Class Nbr']"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('activity', "tr/td[@class='label'][text()='Activity']"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('section', "tr/td[@class='label'][text()='Section']"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('teaching', "tr/td[@class='label'][a[text()='Teaching Period']]"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('status', "tr/td[@class='label'][text()='Status']"
                        "/following-sibling::td[@class='data'][1]/font/text()")
            l.add_xpath('enrolments', "tr/td[@class='label'][text()='Enrols/Capacity']"
                        "/following-sibling::td[@class='data'][1]/text()",
                        re=r'(\d+)/\d+')
            l.add_xpath('capacity', "tr/td[@class='label'][text()='Enrols/Capacity']"
                        "/following-sibling::td[@class='data'][1]/text()",
                        re=r'\d+/(\d+)')
            l.offering_start_in = l.offering_end_in = l.updated_in = l.census_date_in = \
                MapCompose(date_parser.parse)
            l.add_xpath('offering_start', "tr/td[@class='label'][text()='Offering Period']"
                        "/following-sibling::td[@class='data'][1]/text()",
                        re=r'([\d/]*)\s-\s[\d/]*')
            l.add_xpath('offering_end', "tr/td[@class='label'][text()='Offering Period']"
                        "/following-sibling::td[@class='data'][1]/text()",
                        re=r'[\d/]*\s-\s([\d/]*)')
            l.add_xpath('census_date', "tr/td[@class='label'][a[text()='Census Date']]"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('consent', "tr/td[@class='label'][text()='Consent']"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_xpath('mode', "tr/td[@class='label'][text()='Instruction Mode']"
                        "/following-sibling::td[@class='data'][1]/text()")
            l.add_value('src_url', unicode(response.url))
            l.add_xpath('updated', "//td[@class='note']"
                        "[text()[contains(., 'Data is correct as at')]]/text()",
                        re=r'Data is correct as at ([\w\s\-:,]*)')
            l.course_identifier_in = Identity()
            l.add_value('course_identifier', course_identifier)
            class_item = l.load_item()
            yield class_item
            for meeting in class_detail.xpath(
                    "tr/td[@class='formBody']/table"
                    "/tr[@class='rowHighlight' or @class='rowLowlight']"):
                m = MeetingItem()
                m['class_identifier'] = {k: class_item.get(k, None) for k in ('class_nbr',)}
                d = dict(zip(('day', 'time', 'location', 'weeks', 'instructor'),
                             meeting.xpath("td[@class='data']/text()").extract()))
                time = d.pop('time')
                d['time_start'], d['time_end'] = time.split(' - ')
                m.update(d)
                yield m
def parse_items(self, response):
    """
    This function parses a sample job page.

    @url https://www.linkedin.com/jobs2/view/66769906?trk=jserp_job_details_text
    @returns items 1
    @scrapes company_logo company_name job_title job_date
    @scrapes job_location job_experience job_function employment_type
    @scrapes industry job_description apply_link company_description
    @scrapes company_youtube_video
    """
    l = ItemLoader(item=LinkedinCrawlerItem(), response=response)
    l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
    #l.add_value('page_url', response.url)
    l.add_xpath('company_logo', '//*[@class="logo-container"]/a/img/@src')
    l.add_xpath('company_name', ".//*[@id='top-card']/div[1]/div[2]/h2/a/span/text()")
    l.add_xpath('job_title', '//h1/text()')
    l.add_xpath('job_date', ".//*[@id='top-card']/div[1]/div[2]/div[1]/text()")
    l.add_xpath('job_location',
                ".//*[@id='top-card']/div[1]/div[2]/h2/span/span[1]/text()")
    l.add_xpath('job_experience',
                ".//*[@id='top-card']/div[3]/div[1]/ul[1]/li[1]/div[2]/text()")
    l.add_xpath('job_function',
                ".//*[@id='top-card']/div[3]/div[1]/ul[1]/li[2]/div[2]/text()")
    l.add_xpath('employment_type',
                ".//*[@id='top-card']/div[3]/div[1]/ul[1]/li[3]/div[2]/text()")
    l.add_xpath('industry',
                ".//*[@id='top-card']/div[3]/div[1]/ul[2]/li[1]/div[2]/text()")
    l.add_xpath('job_description',
                '//*[@class="description-module container"]/div/div/div/text()'
                ' | //*[@class="description-module container"]//strong/text()'
                ' | //*[@class="description-module container"]//li/text()'
                ' | //*[@class="description-module container"]/div/div/div//ul/li/text()'
                ' | //*[@class="description-module container"]/div/div/div/strong/span/text()')
    apply_link_selector = response.xpath(".//*[@id='offsite-apply-button']/@href").extract()[0]
    parsed = urlparse(apply_link_selector)
    url_of_job = parsed.query[39:]
    url_of_job = urllib.unquote(url_of_job)
    l.add_value('apply_link', url_of_job)
    l.add_xpath('company_description',
                './/*[@id="company-module"]/div/div[1]/text()'
                ' | .//*[@id="company-module"]/div/div[1]//strong/text()')
    l.add_xpath('company_youtube_video',
                ".//*[@id='company-module']/div/div[2]/object/param[2]/@value")
    return l.load_item()
def parse_community(self, response):
    community = ItemLoader(item=ArabiaCommunityItem(), response=response)
    community.default_output_processor = TakeFirst()
    community.add_xpath('id', '//*[@id="nav_title"]/a/@href', re=r'/([a-zA-Z0-9-_]+)$')
    community.add_xpath(
        'logo', '//*[@class="category_logo"]/@src',
        MapCompose(lambda relative_url: urljoin(response.url, relative_url)))
    community.add_xpath('title', '//*[@id="nav_title"]/a/text()')
    community.add_xpath('description', '//*[@class="category_description"]/text()')
    community.add_xpath('followers', '//*[@id="category_follow"]/h3/text()',
                        MapCompose(int), re=r'(\d+)')
    community.add_value('url', response.url)
    community.add_value('item', 'community')
    yield community.load_item()
def parse_asia_odds(self, response):
    match_item = response.meta['match_item']
    selector = Selector(response)
    # iterate over odds
    for odds in selector.xpath(self.asia_odds__xpath):
        loader = ItemLoader(AsiaOdds(), selector=odds)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.asia_odds_fields.iteritems():
            loader.add_xpath(field, xpath)
        odds_item = loader.load_item()
        # http://odds.500.com/yazhi.php?cid=515
        odds_item["match_id"] = match_item["match_id"]
        odds_item["company_id"] = odds_item["company_id"].split('=')[-1]
        odds_item["water_a"] = odds_item["water_a"].replace(self.UP_CHAR, '').replace(self.DOWN_CHAR, '')
        odds_item["water_b"] = odds_item["water_b"].replace(self.UP_CHAR, '').replace(self.DOWN_CHAR, '')
        yield odds_item
def parse2(self, response):
    hxs = Selector(response)
    items = hxs.xpath(self.deals_list_xpath)
    for item in items:
        loader = ItemLoader(faculty_contact(), selector=item)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        for field, xpath in self.item_fields.iteritems():
            if field == 'email':
                # the email hides behind a per-person profile page, so fetch
                # that page and pull the mailto: link out of it
                link = item.xpath(self.item_fields['email'])
                r = httplib.HTTPConnection('dir.aucegypt.edu')
                try:
                    r.request('GET', '/' + link.extract()[0])
                    res = r.getresponse()
                    data = res.read()
                    email_selection = Selector(text=data)
                    email = email_selection.xpath('//@href')
                    loader.add_value('email', unicode(
                        urllib.unquote(email.extract()[0]).replace('mailto:', '')))
                except IndexError:
                    loader.add_value('email', u'')
            else:
                loader.add_xpath(field, xpath)
        yield loader.load_item()
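# Note: the httplib round-trip above blocks Scrapy's event loop for every row.
# A non-blocking sketch of the same lookup, chaining a second Request instead
# (parse_email and the meta key are hypothetical names, not from the original):
#
#   request = Request('http://dir.aucegypt.edu/' + link.extract()[0],
#                     callback=self.parse_email)
#   request.meta['loader'] = loader
#   yield request
#
#   def parse_email(self, response):
#       loader = response.meta['loader']
#       email = response.xpath('//@href').extract()[0]
#       loader.add_value('email', unicode(
#           urllib.unquote(email).replace('mailto:', '')))
#       yield loader.load_item()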
def parse_items(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    selector = Selector(response)
    # iterate over articles
    for article in selector.xpath(self.main_article_xpath):
        loader = ItemLoader(WwfArticle(), selector=article)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse(self, response):
    sel = Selector(response)
    # collect xpaths of each team (row in table)
    rows = sel.xpath('/html//div[@class="contentBlock"]/table/tbody/tr')
    # define collection of teams that play in games we care about
    # (some teams have moved, but we still want the old games)
    # We'll grab the first four letters of the team to distinguish,
    # then assign the NHL's standard 3-letter code.
    TEAMS = {'Anah': 'ANA', 'Ariz': 'ARI', 'Calg': 'CGY', 'Edmo': 'EDM',
             'Los ': 'LAK', 'San ': 'SJS', 'Vanc': 'VAN', 'Chic': 'CHI',
             'Colo': 'COL', 'Dall': 'DAL', 'Minn': 'MIN', 'Nash': 'NAS',
             'St. ': 'STL', 'Winn': 'WPG', 'Bost': 'BOS', 'Buff': 'BUF',
             'Detr': 'DET', 'Flor': 'FLA', 'Mont': 'MTL', 'Otta': 'OTT',
             'Tamp': 'TBL', 'Toro': 'TOR', 'Caro': 'CAR', 'Colu': 'CBJ',
             'New ': 'NJD', 'NY I': 'NYI', 'NY R': 'NYR', 'Phil': 'PHI',
             'Pitt': 'PIT', 'Wash': 'WAS', 'Phoe': 'ARI', 'Atla': 'WPG'}
    # loop through teams
    for row in rows:
        if row.xpath('td[1]/@colspan').extract()[0] == '1':
            loader = ItemLoader(PlayoffsItem(), selector=row)
            loader.default_input_processor = MapCompose()
            loader.default_output_processor = Join()
            # add season and date
            loader.add_value('season', str(self.year))
            date = datetime.strptime(
                row.xpath('td[1]/div[1]/text()').extract()[0][4:],
                '%b %d, %Y').date()
            loader.add_value('date', str(date))
            # get team identifiers
            away = ''
            if row.xpath('td[2]/a[1]/@rel').extract():
                away = row.xpath('td[2]/a[1]/@rel').extract()[0]
            elif row.xpath('td[2]/div/text()').extract()[0][:4] in TEAMS:
                away = TEAMS[row.xpath('td[2]/div/text()').extract()[0][:4]]
            if away:
                if row.xpath('td[3]/a[1]/@rel').extract():
                    home = row.xpath('td[3]/a[1]/@rel').extract()[0]
                else:
                    home = TEAMS[row.xpath('td[3]/div/text()').extract()[0][:4]]
                loader.add_value('away', away)
                loader.add_value('home', home)
                # collect and parse results
                away_score = row.xpath('td[5]/span[1]/text()').extract()[0].replace('\n', '').strip()
                match = re.search(r'\(.*?\)', away_score)
                loader.add_value('away_score', match.group(0)[1:-1])
                home_score = row.xpath('td[5]/span[2]/text()').extract()[0].replace('\n', '').strip()
                match = re.search(r'\(.*?\)', home_score)
                loader.add_value('home_score', match.group(0)[1:-1])
                match = re.search(r'\).*', home_score)
                result = match.group(0)[1:]
                if result == 'S/O':
                    output = 'SO'
                elif result == 'OT':
                    output = 'OT'
                else:
                    output = 'REG'
                loader.add_value('result', output)
                # feed item to pipeline
                yield loader.load_item()
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # prepare to adjust for shootout stats if necessary
    shootout = 0
    if self.year > 2005:
        shootout = 1
    # loop through players
    for row in rows:
        loader = ItemLoader(GoalSumItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath('td[2]/a/@href').extract()
        sNum = num[0][-7:]
        loader.add_value('nhl_num', sNum)
        # add season data
        loader.add_value('season', str(self.year))
        # players on one (extant) team all season have link to team page
        if row.xpath('td[3]/a/text()').extract():
            loader.add_xpath('team', './/td[3]/a/text()')
            loader.add_value('team2', None)
            loader.add_value('team3', None)
        else:
            temp = row.xpath('td[3]/text()').extract()[0]
            teams = temp.split(', ')
            loader.add_value('team', teams[0])
            if len(teams) > 2:
                loader.add_value('team2', teams[1])
                loader.add_value('team3', teams[2])
            elif len(teams) == 2:
                loader.add_value('team2', teams[1])
                loader.add_value('team3', None)
            else:
                loader.add_value('team2', None)
                loader.add_value('team3', None)
        # collect several other stats
        loader.add_xpath('games_played', './/td[4]/text()')
        loader.add_xpath('games_started', './/td[5]/text()')
        loader.add_xpath('wins', './/td[6]/text()')
        loader.add_xpath('losses', './/td[7]/text()')
        if shootout:
            loader.add_value('ties', '0')
        else:
            loader.add_xpath('ties', './/td[8]/text()')
        loader.add_xpath('overtime_losses', './/td[%d]/text()' % (9 - shootout,))
        loader.add_xpath('shots_against', './/td[%d]/text()' % (10 - shootout,))
        loader.add_xpath('goals_against', './/td[%d]/text()' % (11 - shootout,))
        loader.add_xpath('gaa', './/td[%d]/text()' % (12 - shootout,))
        loader.add_xpath('saves_', './/td[%d]/text()' % (13 - shootout,))
        loader.add_xpath('save_pct', './/td[%d]/text()' % (14 - shootout,))
        loader.add_xpath('shutouts', './/td[%d]/text()' % (15 - shootout,))
        loader.add_xpath('goals', './/td[%d]/text()' % (16 - shootout,))
        loader.add_xpath('assists', './/td[%d]/text()' % (17 - shootout,))
        loader.add_xpath('penalty_minutes', './/td[%d]/text()' % (18 - shootout,))
        # convert time on ice to seconds and add
        location = 'td[%d]/text()' % (19 - shootout,)
        temp = row.xpath(location).extract()[0]
        sTemp = temp.split(':')
        sTemp[0] = sTemp[0].replace(',', '')
        loader.add_value('toi', str(60 * int(sTemp[0]) + int(sTemp[1])))
        # feed item to pipeline
        yield loader.load_item()
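# Column bookkeeping for the goalie summary above: from the 2005-06 season the
# NHL dropped the ties column, so every stat from overtime_losses onward shifts
# one cell to the left; subtracting `shootout` (1 for post-2005 seasons) from
# each index keeps the same field-to-column mapping across both table layouts.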
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # prepare to adjust for shootout stats if necessary
    shootout = 0
    if self.year > 2005:
        shootout = 1
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatSumItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # add season data
        loader.add_value("season", str(self.year))
        # players on one (extant) team all season have link to team page
        if row.xpath("td[3]/a/text()").extract():
            loader.add_xpath("team", ".//td[3]/a/text()")
            loader.add_value("team2", None)
            loader.add_value("team3", None)
        else:
            temp = row.xpath("td[3]/text()").extract()[0]
            teams = temp.split(", ")
            loader.add_value("team", teams[0])
            if len(teams) > 2:
                loader.add_value("team2", teams[1])
                loader.add_value("team3", teams[2])
            elif len(teams) == 2:
                loader.add_value("team2", teams[1])
                loader.add_value("team3", None)
            else:
                loader.add_value("team2", None)
                loader.add_value("team3", None)
        # collect several other data points
        loader.add_xpath("games_played", ".//td[5]/text()")
        loader.add_xpath("goals", ".//td[6]/text()")
        loader.add_xpath("assists", ".//td[7]/text()")
        loader.add_xpath("points", ".//td[8]/text()")
        loader.add_xpath("plus_minus", ".//td[9]/text()")
        loader.add_xpath("penalty_minutes", ".//td[10]/text()")
        loader.add_xpath("pp_goals", ".//td[11]/text()")
        loader.add_xpath("pp_points", ".//td[12]/text()")
        loader.add_xpath("sh_goals", ".//td[13]/text()")
        loader.add_xpath("sh_points", ".//td[14]/text()")
        loader.add_xpath("gw_goals", ".//td[15]/text()")
        # NHL stopped tracking tying goals in 2005, forcing an adjustment
        if shootout:
            loader.add_xpath("ot_goals", ".//td[16]/text()")
            loader.add_xpath("shots", ".//td[17]/text()")
            loader.add_xpath("shot_pct", ".//td[18]/text()")
        else:
            loader.add_xpath("ot_goals", ".//td[17]/text()")
            loader.add_xpath("shots", ".//td[18]/text()")
            loader.add_xpath("shot_pct", ".//td[19]/text()")
        # feed item to pipeline
        yield loader.load_item()
def parse_post(self, response):
    post = ItemLoader(item=ArabiaPostItem(), response=response)
    post.default_output_processor = TakeFirst()
    #post.add_xpath('id', '//*[@class="post_content replace_urls"]/@id', MapCompose(int), re=r'(\d+)')
    post.add_xpath('id', '//*[@class="short_url inputtext"]/@value',
                   MapCompose(int), re=r'(\d+)')
    post.add_xpath('title', '//*[@id="nav_title"]/a/text()')
    post.add_xpath('up_votes', '//*[@class="s_upvotes"]/text()',
                   MapCompose(int), re=r'(\d+)')
    post.add_xpath('down_votes', '//*[@class="s_downvotes"]/text()',
                   MapCompose(int), re=r'(\d+)')
    post.add_xpath('points', '//*[@class="post_points ltr"]/text()', MapCompose(int))
    post.add_xpath('author_username', '//*[@class="block username"]/text()')
    post.add_xpath('author_fullname', '//*[@class="block full_name"]/text()',
                   MapCompose(lambda value: value.replace(u'\xa0', u'')))
    post.add_xpath('date', '//*[@class="icon-time"]/../text()')
    post.add_xpath('community', '//*[@class="icon-reorder"]/../a[1]/text()')
    post.add_xpath('topics', '//*[@class="topic"]/text()', MapCompose(string.strip))
    post.add_xpath('url', '//*[@class="short_url inputtext"]/@value')
    post.add_value(
        'type',
        'link' if post.get_xpath('//*[@id="nav_title"]/a/@rel', TakeFirst()) == 'nofollow'
        else 'text')
    if post.get_output_value('type') == 'link':
        post.add_xpath('link', '//*[@id="nav_title"]/a/@href')
        post.add_xpath('domain', '//*[@class="post_domain"]/text()', re=r'\((.+?)\)')
    post.add_xpath('content', '//*[@class="post_content replace_urls"]/*', Join('\n'))
    post.add_value('item', 'post')
    yield post.load_item()

    comments = []
    for row in response.selector.xpath('//*[contains(@class, "post_comment")]'):
        comment = ItemLoader(item=ArabiaCommentItem(), selector=row, response=response)
        comment.default_output_processor = TakeFirst()
        comment.add_xpath('id', './@id', re=r'(\d+)')
        comment.add_xpath('index', './@class', MapCompose(int), re=r'index(\d+)')
        comment.add_value('post_id', post.get_output_value('id'))
        #comment.add_value('parent_id', '')
        comment.add_xpath('author_username', './/*[@class="comment_user"]/a/text()')
        comment.add_xpath('date', './/*[@class="comment_date"]/text()')
        comment.add_xpath('points', './/*[@class="comment_points ltr"]/text()')
        comment.add_xpath('content',
                          './/*[@class="post_content comment_content replace_urls"]/*',
                          Join('\n'))
        #comment.add_xpath('url', './/*[@class="comment_short_url"]/a/@href')
        comment.add_value('url', 'https://arabia.io/go/{0}/{1}'.format(
            post.get_output_value('id'), comment.get_output_value('id')))
        comment.add_value('item', 'comment')
        comments.append(comment)
    for (index, comment) in enumerate(comments):
        if comment.get_output_value('index') == 0:
            comment.add_value('parent_id', 0)
            continue
        for comment_cursor in comments[:index][::-1]:
            if comment_cursor.get_output_value('index') == comment.get_output_value('index') - 1:
                comment.add_value('parent_id', comment_cursor.get_output_value('id'))
                break
    for comment in comments:
        yield comment.load_item()
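# The two passes over `comments` above rebuild the thread tree from the flat
# page order: each comment carries an indentation "index" scraped from its CSS
# class, a root comment (index 0) gets parent_id 0, and any other comment's
# parent is the nearest *preceding* comment whose index is exactly one less --
# the standard way to recover parent links from a depth-annotated list.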
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    # fetch all regions URLs
    if response.url == 'http://www.cehq.gouv.qc.ca/suivihydro/default.asp':
        regions_urls = self.get_regions_urls(hxs)
        for url in regions_urls:
            yield Request(url, callback=self.parse)
    # to fetch all stations URLs
    if 'ListeStation.asp' in response.url:
        stations_urls = self.get_stations_urls(hxs)
        for url in stations_urls:
            yield Request(url, callback=self.parse)
    # to fetch their file information,
    if 'graphique.asp' in response.url:
        (station_id, name, description, municipality, region,
         lake_or_river_name, hydrographic_region, drainage_basin,
         flow_regime, federal_station_number) = self.get_station_items(hxs)[:10]
        # update our items,
        l = ItemLoader(item=StationHydrique())
        l.default_output_processor = processor.TakeFirst()
        l.add_value('entry_type', 'station')
        l.add_value('station_id', station_id)
        l.add_value('hack', 'station' + station_id)
        l.add_value('name', name)
        l.add_value('description', description)
        l.add_value('municipality', municipality)
        l.add_value('region', region)
        l.add_value('lake_or_river_name', lake_or_river_name)
        l.add_value('hydrographic_region', hydrographic_region)
        l.add_value('drainage_basin', drainage_basin)
        l.add_value('flow_regime', flow_regime)
        l.add_value('federal_station_number', federal_station_number)
        yield l.load_item()
        # and fetch any data table URL available
        data_table_url = self.get_data_table_url(hxs)
        if data_table_url:
            yield Request(data_table_url, callback=self.parse)
    # to store all of it...
    if 'tableau.asp' in response.url:
        station_id = response.url.split('NoStation=')[1].split('&')[0]
        stats = self.get_data_table_statistics(hxs)
        for stat in stats:
            l = ItemLoader(item=HistoricalWaterFlow())
            l.default_output_processor = processor.TakeFirst()
            l.add_value('entry_type', 'historical')
            l.add_value('station_id', station_id)
            l.add_value('date', stat[0])
            l.add_value('time', stat[1])
            l.add_value('hack', station_id + stat[0] + stat[1])
            l.add_value('water_flow', stat[2])
            yield l.load_item()