def parse_estation(self, response):
    """Parse a single station page and yield a loaded ``Estacion`` item.

    Expects ``response.meta['clave_linea']`` to carry the line key set by
    the requesting callback.
    """
    estacion_item_loader = ItemLoader(item=Estacion(), response=response)

    # Informacion de estacion:
    # linea, url_page, nombre, latitud_dms, longitud_dms,
    # latitud_dec, longitud_dec
    linea = response.meta['clave_linea']
    url_page = urlparse(response.url).path
    nombre = response.xpath(
        '//*[@id="mw-content-text"]/div/table[1]/tr[1]/th/text()').extract()

    estacion_item_loader.add_value('linea', linea)
    estacion_item_loader.add_value('nombre', nombre)
    estacion_item_loader.add_value('url_page', url_page)

    # Coordinates in both DMS and decimal notation.
    estacion_item_loader.add_css('latitud_dms', '.geo-dms .latitude::text')
    estacion_item_loader.add_css('longitud_dms', '.geo-dms .longitude::text')
    estacion_item_loader.add_css('latitud_dec', '.geo-dec .latitude::text')
    estacion_item_loader.add_css('longitud_dec', '.geo-dec .longitude::text')

    # Informacion de conexiones.
    # NOTE(review): in the original code these values were extracted but
    # never added to the loader (the bare ``.extract()`` calls were no-ops
    # whose results were discarded).  They are kept here, guarded against a
    # missing infobox, so a follow-up can map them to item fields.
    # TODO: add image / direccion_down / direccion_up to the item.
    conexiones = response.css('.infobox table')
    if conexiones:
        conexion = conexiones[0]
        cells = conexion.css('tr')[0].css('td')
        image = cells[3].css('a.image').extract_first()
        direccion_down = cells[1].css('b a').extract()
        direccion_up = cells[5].css('b a').extract()

    yield estacion_item_loader.load_item()
def parse_learn_more(self, response):
    """Build a ``CPCCandidate`` item from a candidate's "learn more" page."""
    loader = ItemLoader(item=ei.CPCCandidate(), selector=response)
    loader.add_value("riding", response.meta.get("riding"))

    # Header block: name, nomination date and (optional) cabinet position.
    header = loader.nested_xpath("//div[@class='cell text-center']")
    header.add_xpath("name", "./h1/text()")
    header.add_xpath("nomination_dt", ".//p[@class='nomination_date']/text()")
    header.add_xpath("cabinet_position", "./p[not(@class)]/text()")

    # Social-share section: every link carries a data-type attribute whose
    # value matches the item field name, so fill them in a loop.
    share = loader.nested_xpath(
        "//section[@class='section section--social-share']")
    for field in ("donate", "website", "facebook", "twitter", "instagram"):
        share.add_xpath(field, ".//a[@data-type='%s']/@href" % field)

    loader.add_xpath("photo", "//img[@class='team-bio-image']/@src")
    loader.add_xpath("bio",
                     "//section[@class='section section--text-block']//text()")
    yield loader.load_item()
def fetchLatestRadio(self, response):
    """Extract the latest Saavn radio entry and return the loaded item."""
    self.logger.debug(
        "[%s]=====================Saavn Radio list ========================================"
        % self.loggerName)

    # Scope the extraction to the album-details container via a nested loader.
    item_loader = ItemLoader(item=Radio(), response=response)
    details_loader = item_loader.nested_xpath(
        '//div[contains(@class, "album-details")]')
    details_loader.add_xpath('name', 'p/text()')

    radio = details_loader.load_item()
    self.showRadioDetails(radio)
    self.logger.debug(
        "[%s]============================================================="
        % self.loggerName)
    return radio
def parse_product(self, response):
    """Fill the product item carried in ``response.meta['item']``.

    Pulls detail fields, SKU price/size options, the accordion text blocks
    and the customer reviews, then returns the loaded item.
    """
    selector = response.selector.xpath('//section[@class="row"]')
    loader = ItemLoader(item=response.meta["item"], selector=selector)

    loader.add_xpath('detail_name', './/h1[@itemprop="name"]/text()')
    loader.add_xpath('brand', './/h5[@itemprop="brand"]/text()')
    loader.add_xpath(
        'description',
        './/div[@class="col-xs-12 col-sm-12 col-md-12 col-lg-12"]/p/text()'
    )

    # Price and size live on the SKU <option> elements.
    _loader = loader.nested_xpath('//select[@id="__sku"]/option')
    _loader.add_xpath('price', './/@data-priceformat')
    _loader.add_xpath('size_format', './/text()')

    # Re-point the loader at the accordion panels for the long-form text.
    loader.selector = response.selector.xpath(
        '//div[@id="accordion"]/div[@class="panel panel-default"]')
    loader.add_xpath(
        'detail_description',
        './/div[@id="collapseOne"]/div/descendant-or-self::*/text()')
    loader.add_xpath(
        'detail_ingredients',
        './/div[@id="collapseTwo"]/div/descendant-or-self::*/text()')
    loader.add_xpath(
        'nutritional_facts',
        './/div[@id="collapseThree"]/div/descendant-or-self::*/text()')
    loader.add_xpath('nutritional_facts_img_url',
                     './/*[@id="collapseThree"]/div/p/img/@src')

    # Reviews: one selector per review block.
    loader.selector = response.selector.xpath(
        '//*[@id="review"]/div/div/div')
    loader.add_xpath('customer_review_header',
                     './/h3[@class="panel-title"]/text()')
    # Each review's star rating is spread over several <label> elements;
    # join them per review (comprehension replaces the old append loop).
    ratings = [''.join(review.xpath('.//label/text()').getall())
               for review in loader.selector]
    loader.add_value('customer_review_rating', ratings)
    loader.add_xpath(
        'customer_review',
        './/blockquote[@class="blockquote-reverse"]/p/text()')

    self.log(f'finished parsing product page {response.url}')
    return loader.load_item()
def parse_person(self, response):
    """Scrape a person profile page into a ``Person`` item."""
    loader = ItemLoader(item=Person(), response=response)
    loader.default_input_processor = processors.MapCompose(
        w3lib.html.remove_tags)
    loader.default_output_processor = processors.TakeFirst()

    loader.add_xpath('name', '//*[@id="profile_header_heading"]/a/text()')
    loader.add_value('url', response.url)
    loader.add_xpath(
        'primary_role',
        '//*[@id="info-card-overview-content"]/div/dl/div/dd')

    # The overview card is a <dl> of "Label:" / value pairs; map each label
    # onto the item field with the same lower-cased name.
    # Fields expected: born, gender, location, website
    overview = response.xpath(
        '//*[@id="info-card-overview-content"]/div/dl/dt/text()')
    overview_loader = loader.nested_xpath(
        '//*[@id="info-card-overview-content"]/div/dl')
    # enumerate from 1: XPath positional predicates are 1-based.
    for position, dt in enumerate(overview, start=1):
        key = dt.extract()
        key = key[:key.find(':')].lower()
        try:
            overview_loader.add_xpath(key, 'dd[{}]/text()'.format(position))
        except KeyError:
            # Label has no matching field on the Item; skip it.
            pass

    loader.add_xpath('facebook',
                     '(//a[contains(@class,"facebook")])[1]/@href')
    loader.add_xpath('twitter',
                     '(//a[contains(@class,"twitter")])[1]/@href')
    loader.add_xpath('linkedin',
                     '(//a[contains(@class,"linkedin")])[1]/@href')
    loader.add_xpath('description', '//*[@id="description"]/span/div')
    loader.add_css('current_jobs', '.current_job')
    loader.add_css('past_jobs', '.past_job')
    loader.nested_css('.advisory_roles').add_xpath('board_advisors',
                                                   './/ul/li')
    loader.nested_css('table.investors').add_xpath(
        'investments', './/tr[not(@class="thead")]')
    loader.nested_css('.education').add_xpath('education', './/ul/li')
    return loader.load_item()
def parse(self, response):
    """Parse a club team page into a ``DetailedTeamStatItem``.

    XPaths are grouped by page section.  ``team_spacing_loader`` scopes the
    label-relative lookups to the ``team`` container div.
    """
    loader = ItemLoader(DetailedTeamStatItem(), response=response)
    team_spacing_loader = loader.nested_xpath(
        ".//div[contains(@class, 'team')]")
    loader.add_value('last_modified', datetime.utcnow())

    # GENERAL CLUB INFORMATION
    loader.add_xpath('id', ".//div[@class='info']/h1/text()")
    loader.add_xpath('club_name', ".//div[@class='info']/h1/text()")
    loader.add_xpath('division',
                     ".//div[contains(@class, 'meta')]//a[last()]/text()")
    loader.add_xpath('club_logo',
                     ".//div[contains(@class, 'card')]/img/@data-src")
    loader.add_xpath('flag',
                     ".//div[contains(@class, 'meta')]//img/@data-src")

    # GENERAL TEAM STATS
    loader.add_xpath(
        'overall',
        ".//div[contains(@class, 'stats')]/div/div[1]/span/text()")
    loader.add_xpath(
        'attack',
        ".//div[contains(@class, 'stats')]/div/div[2]/span/text()")
    loader.add_xpath(
        'midfield',
        ".//div[contains(@class, 'stats')]/div/div[3]/span/text()")
    loader.add_xpath(
        'defence',
        ".//div[contains(@class, 'stats')]/div/div[4]/span/text()")

    # DETAILED TEAM STATS
    # Note: this stat seams to be missing as of 06/17/2019
    team_spacing_loader.add_xpath(
        'home_stadium',
        "./ul/li/following::label[contains(., 'Home Stadium')]"
        "/following::text()[1]"
    )
    team_spacing_loader.add_xpath(
        'rival_team',
        "./ul/li/following::label[contains(., 'Rival Team')]"
        "/following::a[1]/@href"
    )
    team_spacing_loader.add_xpath(
        'international_prestige',
        "./ul/li/following::label[contains(., 'International Prestige')]"
        "/following::span[1]/text()"
    )
    team_spacing_loader.add_xpath(
        'domestic_prestige',
        "./ul/li/following::label[contains(., 'Domestic Prestige')]"
        "/following::span[1]/text()"
    )
    # Anchors past 'Domestic Prestige' first so the right 'Transfer Budget'
    # label is picked up.
    team_spacing_loader.add_xpath(
        'transfer_budget',
        "./ul/li/following::label[contains(., 'Domestic Prestige')]"
        "/following::label[contains(., 'Transfer Budget')]"
        "/following::text()[1]"
    )
    team_spacing_loader.add_xpath(
        'starting_xi_average_age',
        "./ul/li/following::label[contains(., 'Starting XI Average Age')]"
        "/following::text()[1]"
    )
    team_spacing_loader.add_xpath(
        'whole_team_average_age',
        "./ul/li/following::label[contains(., 'Whole Team Average Age')]"
        "/following::text()[1]"
    )
    team_spacing_loader.add_xpath(
        'captain',
        "./ul/li/following::label[contains(., 'Captain')]"
        "/following::a[1]/@href"
    )
    # Set-piece takers: these four use the page-level loader with an
    # explicit "(...)[1]" to de-duplicate matches.
    loader.add_xpath(
        'short_free_kick',
        "(.//div[contains(@class, 'team')]/ul/li"
        "/following::label[contains(., 'Short Free Kick')]"
        "/following::a[1])[1]/@href"
    )
    loader.add_xpath(
        'long_free_kick',
        "(.//div[contains(@class, 'team')]/ul/li"
        "/following::label[contains(., 'Long Free Kick')]"
        "/following::a[1])[1]/@href"
    )
    loader.add_xpath(
        'left_short_free_kick',
        "(.//div[contains(@class, 'team')]/ul/li"
        "/following::label[contains(., 'Left Short Free Kick')]"
        "/following::a[1])[1]/@href"
    )
    loader.add_xpath(
        'right_short_free_kick',
        "(.//div[contains(@class, 'team')]/ul/li"
        "/following::label[contains(., 'Right Short Free Kick')]"
        "/following::a[1])[1]/@href"
    )
    team_spacing_loader.add_xpath(
        'penalties',
        "./ul/li/following::label[contains(., 'Penalties')]"
        "/following::a[1]/@href"
    )
    team_spacing_loader.add_xpath(
        'left_corner',
        "./ul/li/following::label[contains(., 'Left Corner')]"
        "/following::a[1]/@href"
    )
    team_spacing_loader.add_xpath(
        'right_corner',
        "./ul/li/following::label[contains(., 'Right Corner')]"
        "/following::a[1]/@href"
    )
    team_spacing_loader.add_xpath(
        'starting_xi', ".//div[contains(@class, 'lineup')]/div/a/@href")

    # TACTICS
    loader.add_xpath(
        'defence_defensive_style',
        ".//dl//span/preceding::dd[text()='Defensive Style']/span/span/"
        "text()"
    )
    loader.add_xpath(
        'defence_team_width',
        "(.//dl//span/preceding::span[text()='Team Width']"
        "/following::span[1]/span/text())[1]"
    )
    loader.add_xpath(
        'defence_depth',
        ".//dl//span/preceding::span[text()='Depth']/following::span[1]"
        "/span/text()"
    )
    loader.add_xpath(
        'offense_offensive_style',
        ".//dl//span/preceding::dd[text()='Offensive Style']/span/span/"
        "text()"
    )
    loader.add_xpath(
        'offense_width',
        ".//dl//span/preceding::span[text()='Width']/following::span[1]"
        "/span/text()"
    )
    loader.add_xpath(
        'offense_players_in_box',
        ".//dl//span/preceding::span[text()='Players in box']"
        "/following::span[1]/span/text()"
    )
    loader.add_xpath(
        'offense_corners',
        ".//dl//span/preceding::span[text()='Corners']/following::span[1]"
        "/span/text()"
    )
    loader.add_xpath(
        'offense_free_kicks',
        ".//dl//span/preceding::span[text()='Free Kicks']"
        "/following::span[1]/span/text()"
    )
    loader.add_xpath(
        'build_up_play_speed',
        ".//dl//span/preceding::span[text()='Speed']/following::span[1]"
        "/span/text()"
    )
    loader.add_xpath(
        'build_up_play_dribbling',
        ".//dl//span/preceding::dd[text()='Dribbling']/span/span/text()")
    loader.add_xpath(
        'build_up_play_passing',
        "(.//dl//span/preceding::span[text()='Passing']"
        "/following::span[1]/span/text())[1]"
    )
    loader.add_xpath(
        'build_up_play_positioning',
        "(.//dl//span/preceding::span[text()='Positioning'])[1]"
        "/following::span[1]/text()"
    )
    loader.add_xpath(
        'chance_creation_passing',
        "(.//dl//span/preceding::span[text()='Passing']"
        "/following::span[1]/span/text())[2]"
    )
    loader.add_xpath(
        'chance_creation_crossing',
        ".//dl//span/preceding::span[text()='Crossing']"
        "/following::span[1]/span/text()"
    )
    loader.add_xpath(
        'chance_creation_shooting',
        ".//dl//span/preceding::span[text()='Shooting']"
        "/following::span[1]/span/text()"
    )
    loader.add_xpath(
        'chance_creation_positioning',
        "(.//dl//span/preceding::span[text()='Positioning'])[2]"
        "/following::span[1]/text()"
    )
    loader.add_xpath(
        'defence_extra_pressure',
        ".//dl//span/preceding::span[text()='Pressure']"
        "/following::span[1]/span/text()"
    )
    loader.add_xpath(
        'defence_extra_aggression',
        ".//dl//span/preceding::span[text()='Aggression']"
        "/following::span[1]/span/text()"
    )
    loader.add_xpath(
        'defence_extra_team_width',
        "(.//dl//span/preceding::span[text()='Team Width']"
        "/following::span[1]/span/text())[2]"
    )
    loader.add_xpath(
        'defence_extra_defender_line',
        ".//span[text()='Defender Line']/following::span/text()")

    # PLAYERS
    loader.add_xpath(
        'squad',
        "(.//table)[1]/tbody/tr//a[contains(@href, '/player/')]/@href")
    loader.add_xpath(
        'on_loan',
        "(.//table)[2]/tbody/tr//a[contains(@href, '/player/')]/@href")

    # MEDIA
    loader.add_xpath(
        'kits', ".//div[@class='column col-sm-5 text-center']//img/@src")

    # COMMUNITY
    loader.add_xpath(
        'likes',
        "(//div[contains(@class, 'operation spacing')]/a/span[2]/span"
        "/text())[1]"
    )
    loader.add_xpath(
        'dislikes',
        "(//div[contains(@class, 'operation spacing')]/a/span[2]/span"
        "/text())[2]"
    )

    # Stray debug print replaced with proper (lazy-%) logger call.
    self.logger.debug('User-Agent: %s',
                      response.request.headers['User-Agent'])
    self.logger.info(f'Parse function called on {response.url}')
    yield loader.load_item()
def parse(self, response):
    """Parse a player page into a ``SofifaItem``.

    Repeated xpath patterns (script-embedded summary stats, the
    span-labelled detail stats and the positional ratings grid) are filled
    via data-driven loops; the generated xpath strings are identical to the
    originals.
    """
    self.crawler.stats.set_value('pages_to_visit', len(self.urls))
    loader = ItemLoader(item=SofifaItem(), response=response)
    col_4_loader = loader.nested_xpath(
        ".//div[@class='column col-4 text-center']")
    loader.add_value('last_modified', datetime.utcnow())

    # GENERAL PLAYER INFORMATION
    loader.add_xpath('id', ".//div[@class='info']/h1/text()")
    loader.add_xpath('name', ".//div[@class='info']/h1/text()")
    # full_name/age/dob/height/weight all come from the same meta text node
    # and are separated downstream by the field processors.
    for field in ('full_name', 'age', 'dob', 'height', 'weight'):
        loader.add_xpath(field, ".//div[contains(@class, 'meta')]/text()[1]")
    loader.add_xpath('nationality',
                     ".//div[contains(@class, 'meta')]/a/@title")

    # GENERAL PLAYER STATS
    loader.add_xpath(
        'preferred_foot',
        "(.//label[text()='Preferred Foot']/following::text())[1]")
    loader.add_xpath(
        'international_reputation',
        "(.//label[text()='International Reputation']"
        "/following::text())[1]"
    )
    loader.add_xpath(
        'weak_foot', "(.//label[text()='Weak Foot']/following::text())[1]")
    loader.add_xpath(
        'skill_moves',
        "(.//label[text()='Skill Moves']/following::text())[1]")
    loader.add_xpath(
        'work_rate',
        "(.//label[text()='Work Rate']/following::span/text())[1]")
    loader.add_xpath(
        'body_type',
        "(.//label[text()='Body Type']/following::span/text())[1]")
    loader.add_xpath(
        'real_face',
        "(.//label[text()='Real Face']/following::span/text())[1]")

    # CLUB/TEAM INFORMATION
    # NOTE(review): these two nested xpaths start with "/" — verify they
    # resolve relative to the col-4 container as intended.
    col_4_loader.add_xpath(
        'value',
        "/following::text()[contains(., 'Value')]"
        "/following::span[1]/text()"
    )
    col_4_loader.add_xpath(
        'wage',
        "/following::text()[contains(., 'Wage')]/following::span[1]/text()"
    )
    loader.add_xpath(
        'release_clause',
        "(.//label[text()='Release Clause']/following::span/text())[1]")
    loader.add_xpath('club_name',
                     "(.//ul[contains(@class, 'pl')]//a/text())[1]")
    loader.add_xpath('club_url',
                     "(.//ul[contains(@class, 'pl')]//a/@href)[1]")
    loader.add_xpath(
        'club_rating',
        ".//div[contains(@class, 'column col-5')][1]//li[2]/span[1]/text()"
    )
    loader.add_xpath(
        'club_position',
        "(.//label[text()='Position']/following::text()[1])[1]")
    loader.add_xpath(
        'club_jersey_number',
        "(.//label[text()='Jersey Number']/following::text()[1])[1]")
    loader.add_xpath('club_join_date',
                     ".//label[text()='Joined']/following::text()[1]")
    loader.add_xpath(
        'loaned_from',
        ".//label[text()='Loaned From']/following::a[1]/text()")
    loader.add_xpath(
        'club_contract_end_date',
        ".//label[text()='Contract Valid Until']/following::text()[1]")
    loader.add_xpath('team_name',
                     "(.//ul[contains(@class, 'pl')]//a/text())[last()]")
    loader.add_xpath(
        'team_rating',
        ".//div[contains(@class, 'column col-5')][last()]//li[2]/span[1]"
        "/text()"
    )
    loader.add_xpath(
        'team_position',
        "(.//label[text()='Position']/following::text()[1])[last()]")
    loader.add_xpath(
        'team_jersey_number',
        "(.//label[text()='Jersey Number']/following::text()[1])[last()]")

    # PLAYER GAME STATS
    loader.add_xpath(
        'overall_rating',
        "//div[@class='column col-4 text-center'][1]/span/text()")
    col_4_loader.add_xpath(
        'potential_rating',
        "//div[@class='column col-4 text-center'][2]/span/text()")
    loader.add_xpath('positions',
                     ".//div[contains(@class, 'meta')]/span/text()")
    loader.add_xpath('unique_attributes',
                     ".//div[contains(@class, 'mt-2')]/a/text()")

    # The six summary stats are embedded in the first inline <script>; the
    # same xpath feeds every field, and the field processors pick the
    # individual values out of the script text.  GK pages use goalkeeper
    # stat names, outfield pages use the standard six.
    script_xpath = ("(.//div[@class='wrapper']"
                    "//script)[1][contains(text(), 'var')]/text()")
    is_gk = 'GK' in response.xpath(".//div[contains(@class, 'meta')]"
                                   "/span/text()").getall()
    summary_fields = (('DIV', 'HAN', 'KIC', 'REF', 'SPD', 'POS') if is_gk
                      else ('PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY'))
    for field in summary_fields:
        loader.add_xpath(field, script_xpath)

    # PLAYER DETAILED STATS — field name -> on-page label.
    span_stats = {
        'crossing': 'Crossing', 'finishing': 'Finishing',
        'heading_accuracy': 'Heading Accuracy',
        'short_passing': 'Short Passing', 'volleys': 'Volleys',
        'aggression': 'Aggression', 'interceptions': 'Interceptions',
        'positioning': 'Positioning', 'vision': 'Vision',
        'penalties': 'Penalties', 'dribbling': 'Dribbling',
        'curve': 'Curve', 'fk_accuracy': 'FK Accuracy',
        'long_passing': 'Long Passing', 'ball_control': 'Ball Control',
        'marking': 'Marking', 'standing_tackle': 'Standing Tackle',
        'sliding_tackle': 'Sliding Tackle', 'acceleration': 'Acceleration',
        'sprint_speed': 'Sprint Speed', 'agility': 'Agility',
        'reactions': 'Reactions', 'balance': 'Balance',
        'shot_power': 'Shot Power', 'jumping': 'Jumping',
        'stamina': 'Stamina', 'strength': 'Strength',
        'long_shots': 'Long Shots',
    }
    for field, label in span_stats.items():
        loader.add_xpath(field, "(.//span[../span='%s']/text())[1]" % label)
    # Composure and the GK stats sit in <li> elements, not label/span pairs.
    li_stats = {
        'composure': 'Composure', 'gk_diving': 'GK Diving',
        'gk_handling': 'GK Handling', 'gk_kicking': 'GK Kicking',
        'gk_positioning': 'GK Positioning', 'gk_reflexes': 'GK Reflexes',
    }
    for field, label in li_stats.items():
        loader.add_xpath(field,
                         ".//li[contains(text(), '%s')]/span/text()" % label)
    loader.add_xpath(
        'traits',
        ".//h5[text()='Traits']/following-sibling::ul/li/span/text()")

    # PLAYER REAL OVERALL RATING (POSITIONAL STATS) — field name == label.
    for pos in ('LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
                'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM',
                'LWB', 'LDM', 'CDM', 'RDM', 'RWB',
                'LB', 'LCB', 'CB', 'RCB', 'RB'):
        loader.add_xpath(pos,
                         "(.//div[../div='%s']/following::text())[1]" % pos)

    # COMMUNITY INFORMATION
    loader.add_xpath(
        'followers',
        "(.//div[contains(@class, 'operation spacing')]/a/span[2]/span"
        "/text())[3]"
    )
    loader.add_xpath(
        'likes',
        "(.//div[contains(@class, 'operation spacing')]/a/span[2]/span"
        "/text())[1]"
    )
    loader.add_xpath(
        'dislikes',
        "(.//div[contains(@class, 'operation spacing')]/a/span[2]/span"
        "/text())[2]"
    )

    # MEDIA
    loader.add_xpath('face_img', ".//div/div/article/div/img//@data-src")
    loader.add_xpath('flag_img',
                     ".//div[contains(@class, 'meta')]/a/img/@data-src")
    loader.add_xpath('club_logo_img',
                     "(.//div/ul/li/figure/img/@data-src)[1]")
    loader.add_xpath('team_logo_img',
                     "(.//div/ul/li/figure/img/@data-src)[last()]")

    self.logger.info(f'Parse function called on {response.url}')
    self.logger.info(
        f"Currently on page "
        f"{self.crawler.stats.get_value('page_counter')} out of "
        f"{self.crawler.stats.get_value('pages_to_visit')}"
    )
    # TODO: enable continued logging of page_counter after a pause/resume.
    self.crawler.stats.inc_value(key='page_counter', count=1, start=0)
    # Stray debug prints replaced with proper logger calls.
    self.logger.debug('User-Agent: %s',
                      response.request.headers['User-Agent'])
    self.logger.debug(
        "%s out of %s",
        self.crawler.stats.get_value('page_counter'),
        self.crawler.stats.get_value('pages_to_visit'))
    yield loader.load_item()
<footer> <a class="social" href="http://facebook.com/whatever">Like Us</a> <a class="social" href="http://twitter.com/whatever">Follow Us</a> <a class="email" href="mailto:[email protected]">Email Us</a> </footer> ''' loader = ItemLoader(item=Item()) # load the footer data using absolute xpaths loader.add_xpath('social', '//footer/a[@class = "social"]/@href') loader.add_xpath('email', '//footer/a[@class = "email"]/@href') loader.load_item() loader = ItemLoader(item=Item()) # load the footer data using a nested loader with relative xpaths footer_loader = loader.nested_xpath('//footer') footer_loader.add_xpath('social', 'a[@class = "social"]/@href') footer_loader.add_xpath('email', 'a[@class = "email"]/@href') # no need to call footer_loader.load_item() loader.load_item() 6. Reusing and extending item loaders from scrapy.loader.processors import MapCompose from myproject.ItemLoaders import ProductLoader def strip_dashes(x): return x.strip('-') class SiteSpecificLoader(ProductLoader): name_in = MapCompose(strip_dashes, ProductLoader.name_in) from scrapy.loader.processors import MapCompose
def parse(self, response):
    """Parse a national team page into a ``NationalTeamDetailedStats`` item.

    ``col_6_loader`` scopes label-relative lookups to the col-6 column;
    ``mt_2_loader`` scopes the community like/dislike counters.
    """
    loader = ItemLoader(NationalTeamDetailedStats(), response=response)
    mt_2_loader = loader.nested_xpath(".//div[@class='operation mt-2']/a")
    col_6_loader = loader.nested_xpath(".//div[@class='column col-6']")

    # GENERAL CLUB INFORMATION
    loader.add_xpath('id', ".//div[@class='info']/h1/text()")
    loader.add_xpath('team_name', ".//div[@class='info']/h1/text()")
    loader.add_xpath(
        'team_logo',
        ".//div[@class='card card-border player fixed-width']/img/@data-src")
    loader.add_xpath('flag',
                     ".//div[@class='meta']//a[last()-1]//img/@data-src")

    # GENERAL TEAM STATS
    # NOTE(review): 'defence' anchors with following:: while the other
    # three use preceding:: — confirm this asymmetry is intentional.
    loader.add_xpath(
        'overall',
        "(.//div[@class='column col-4 text-center']"
        "/preceding::text()[contains(.,'Overall')])[2]"
        "/following::span[1]/text()")
    loader.add_xpath(
        'attack',
        "(.//div[@class='column col-4 text-center']"
        "/preceding::text()[contains(.,'Attack')])[2]"
        "/following::span[1]/text()")
    loader.add_xpath(
        'midfield',
        "(.//div[@class='column col-4 text-center']"
        "/preceding::text()[contains(.,'Midfield')])[2]"
        "/following::span[1]/text()")
    loader.add_xpath(
        'defence',
        "(.//div[@class='column col-4 text-center']"
        "/following::text()[contains(.,'Defence')])[1]"
        "/following::span[1]/text()")

    # DETAILED TEAM STATS
    col_6_loader.add_xpath(
        'home_stadium',
        ".//following::label[contains(., 'Home Stadium')]"
        "/following::text()[1]")
    col_6_loader.add_xpath(
        'rival_team',
        ".//following::label[contains(., 'Rival Team')]"
        "/following::a[1]/text()")
    col_6_loader.add_xpath(
        'international_prestige',
        ".//following::label[contains(., 'International Prestige')]"
        "/following::span[1]/text()")
    col_6_loader.add_xpath(
        'starting_xi_average_age',
        ".//following::label[contains(., 'Starting XI Average Age')]"
        "/following::text()[1]")
    col_6_loader.add_xpath(
        'whole_team_average_age',
        ".//following::label[contains(., 'Whole Team Average Age')]"
        "/following::text()[1]")
    col_6_loader.add_xpath(
        'captain',
        ".//following::label[contains(., 'Captain')]/following::a[1]/@href")
    col_6_loader.add_xpath(
        'short_free_kick',
        ".//following::label[text()='Short Free Kick']"
        "/following::a[1]/@href")
    col_6_loader.add_xpath(
        'long_free_kick',
        ".//following::label[text()='Long Free Kick']"
        "/following::a[1]/@href")
    col_6_loader.add_xpath(
        'left_short_free_kick',
        ".//following::label[text()='Left Short Free Kick']"
        "/following::a[1]/@href")
    col_6_loader.add_xpath(
        'right_short_free_kick',
        ".//following::label[text()='Right Short Free Kick']"
        "/following::a[1]/@href")
    col_6_loader.add_xpath(
        'penalties',
        ".//following::label[text()='Penalties']/following::a[1]/@href")
    col_6_loader.add_xpath(
        'left_corner',
        ".//following::label[text()='Left Corner']/following::a[1]/@href")
    col_6_loader.add_xpath(
        'right_corner',
        ".//following::label[text()='Right Corner']/following::a[1]/@href")
    loader.add_xpath('starting_xi', ".//div[@class='field-player']/a/@href")

    # TACTICS
    loader.add_xpath(
        'defence_defensive_style',
        ".//dl//span/preceding::dd[text()='Defensive Style']"
        "/span/span/text()")
    loader.add_xpath(
        'defence_team_width',
        "(.//dl//span/preceding::span[text()='Team Width']"
        "/following::div/meter)[1]/@value")
    loader.add_xpath(
        'defence_depth',
        "(.//dl//span/preceding::span[text()='Depth']"
        "/following::div/meter)[1]/@value")
    loader.add_xpath(
        'offense_offensive_style',
        ".//dl//span/preceding::dd[text()='Offensive Style']"
        "/span/span/text()")
    loader.add_xpath(
        'offense_width',
        "(.//dl//span/preceding::span[text()='Width']/following::div/meter)[1]"
        "/@value")
    loader.add_xpath(
        'offense_players_in_box',
        "(.//dl//span/preceding::span[text()='Players in box']"
        "/following::div/meter)[1]/@value")
    loader.add_xpath(
        'offense_corners',
        "(.//dl//span/preceding::span[text()='Corners']"
        "/following::div/meter)[1]/@value")
    loader.add_xpath(
        'offense_free_kicks',
        "(.//dl//span/preceding::span[text()='Free Kicks']"
        "/following::div/meter)[1]/@value")
    loader.add_xpath(
        'build_up_play_speed',
        ".//dl//span/preceding::span[text()='Speed']/following::span/text()")
    loader.add_xpath(
        'build_up_play_dribbling',
        "(.//dl//span/preceding::dd[text()='Dribbling']//span)[1]"
        "/span/text()")
    loader.add_xpath(
        'build_up_play_passing',
        "(.//dl//span/preceding::span[text()='Passing']/following::span)[1]"
        "/span/text()")
    loader.add_xpath(
        'build_up_play_positioning',
        "(.//dl//span/preceding::span[text()='Positioning'])[1]"
        "/following::span[1]/text()")
    # NOTE(review): this xpath anchors on 'Shooting' (same as
    # chance_creation_shooting below) — looks like a copy-paste slip;
    # confirm whether it should anchor on 'Passing'.
    loader.add_xpath(
        'chance_creation_passing',
        "(.//dl//span/preceding::span[text()='Shooting']"
        "/following::span)[1]/span/text()")
    loader.add_xpath(
        'chance_creation_crossing',
        "(.//dl//span/preceding::span[text()='Crossing']"
        "/following::span)[1]/span/text()")
    loader.add_xpath(
        'chance_creation_shooting',
        "(.//dl//span/preceding::span[text()='Shooting']"
        "/following::span)[1]/span/text()")
    loader.add_xpath(
        'chance_creation_positioning',
        "(.//dl//span/preceding::span[text()='Positioning'])[2]"
        "/following::span[1]/text()")
    loader.add_xpath(
        'defence_extra_pressure',
        "(.//dl//span/preceding::span[text()='Pressure']"
        "/following::span)[1]/span/text()")
    loader.add_xpath(
        'defence_extra_aggression',
        "(.//dl//span/preceding::span[text()='Aggression']"
        "/following::span)[1]/span/text()")
    loader.add_xpath(
        'defence_extra_team_width',
        "(.//span[text()='Team Width'])[2]/following::span[1]/span/text()")
    loader.add_xpath(
        'defence_extra_defender_line',
        ".//span[text()='Defender Line']/following::span/text()")

    # PLAYERS
    loader.add_xpath(
        'squad',
        "(.//table)[1]/tbody/tr//a[contains(@href, '/player/')]/@href")
    loader.add_xpath(
        'on_loan',
        "(.//table)[2]/tbody/tr//a[contains(@href, '/player/')]/@href")

    # MEDIA
    loader.add_xpath(
        'kits', ".//div[@class='column col-sm-5 text-center']//img/@src")

    # COMMUNITY
    mt_2_loader.add_xpath(
        'likes', "text()[contains(.,'Like')]/following::span[1]/text()")
    mt_2_loader.add_xpath(
        'dislikes', "text()[contains(.,'Dislike')]/following::span[1]/text()")

    # Stray debug print replaced with proper (lazy-%) logger call.
    self.logger.debug('User-Agent: %s',
                      response.request.headers['User-Agent'])
    self.logger.info(f'Parse function called on {response.url}')
    yield loader.load_item()
def parse_property_page(self, response):
    """Parse a single Trulia property-detail page into a ``TruliaItem``.

    Scrapes the overview summary, local information, price history,
    tax assessment, comparable sales, price trends, "what locals say"
    reviews, similar/new listings and new-construction builder tables,
    then assembles everything onto one item and returns it.
    """
    # --- overview -----------------------------------------------------
    il = ItemLoader(item=overview_item(), response=response)
    il.add_value('url', response.url)
    overview_node = il.nested_xpath(
        '//div[@data-testid="home-details-summary-container"]')
    overview_node.add_xpath(
        'address',
        './/span[@data-testid="home-details-summary-headline"]/text()')
    overview_node.add_xpath(
        'city_state',
        './/span[@data-testid="home-details-summary-city-state"]/text()')
    overview_node.add_xpath(
        'price',
        './/*[@data-testid="on-market-price-details"]//text()',
        re=r'\$([\d,]+)')
    overview_node.add_xpath('area', xpath='.//li//text()',
                            re=r'^([\d,]+)\s?sqft$')
    overview_node.add_xpath('bedrooms', xpath='.//li//text()',
                            re=r'(\d+\.?\d?) (?:Beds|Bed|beds|bed)$')
    overview_node.add_xpath('bathrooms', xpath='.//li//text()',
                            re=r'(\d+\.?\d?) (?:Baths|Bath|baths|bath)$')
    details = il.nested_xpath('//div[@data-testid="features-container"]')
    # fix: raw string -- '\d' is an invalid escape in a plain literal
    details.add_xpath('year_built', xpath='.//li//text()',
                      re=r'Built in (\d+)')
    details.add_xpath('lot_size', xpath='.//li//text()',
                      re=r'Lot Size: ([\d,.]+) (?:acres|sqft)$')
    details.add_xpath('lot_size_units', xpath='.//li//text()',
                      re=r'Lot Size: [\d,.]+ (acres|sqft)$')
    details.add_xpath('price_per_square_foot', xpath='.//li//text()',
                      re=r'\$([\d,.]+)/sqft$')
    details.add_xpath('days_on_Trulia', xpath='.//li//text()',
                      re=r'([\d,]+)\+? Days on Trulia$')
    overview_dict = il.load_item()

    # --- local info ---------------------------------------------------
    local_info_list = response.xpath(
        '(//*[div="Local Information"]/parent::div)[2]'
        '/following-sibling::div/div/div//text()').extract()
    local_dict_values = '\n'.join(local_info_list)

    # --- price history ------------------------------------------------
    il = ItemLoader(item=price_item(), response=response)
    table_xpath = ('//div[contains(text(), "Price History for")]'
                   '/../../following-sibling::table')
    il.add_xpath('dates', table_xpath + '//tr[1]/td[1]//text()')
    il.add_xpath('prices', table_xpath + '//tr[1]/td[2]//text()')
    il.add_xpath('events', table_xpath + '//tr[1]/td[3]//text()')
    price_dict = il.load_item()

    # --- tax info -----------------------------------------------------
    il = ItemLoader(item=taxes_item(), response=response)
    table_xpath = ('//*[div="Property Taxes and Assessment"]/parent::div'
                   '/following-sibling::table')
    il.add_xpath('property_tax_assessment_year',
                 table_xpath + '//tr[1]/td[1]//text()')
    il.add_xpath('property_tax', table_xpath + '//tr[2]/td[1]//text()')
    il.add_xpath('property_tax_assessment_land',
                 table_xpath + '//tr[4]/td[1]//text()')
    il.add_xpath('property_tax_assessment_improvements',
                 table_xpath + '//tr[5]/td[1]//text()')
    il.add_xpath('property_tax_assessment_total',
                 table_xpath + '//tr[6]/td[1]//text()')
    tax_dict = il.load_item()

    # --- comparable sales (the section may be absent on some pages) ---
    comparable_path = ('//div[contains(text(), "Comparable Sales")]'
                       '/../../following-sibling::div[3]')
    header = response.xpath(comparable_path + '//th//text()').extract()
    header.append('url')
    num_tr = len(response.xpath(comparable_path + '//tbody/tr'))
    rows = []
    for i in range(1, num_tr + 1):
        rows.append(
            response.xpath(
                (comparable_path + '//tbody/tr[{:d}]//text()').format(i)
            ).extract())
    urls = response.xpath(comparable_path + '//tbody//a/@href').extract()
    urls = [get_rel_url(response.url, url) for url in urls]
    # fix: plain loop instead of a side-effect-only list comprehension
    for row, url in zip(rows, urls):
        row.append(url)
    comparable_list = [list(zip(header, row)) for row in rows]

    # --- price trends -------------------------------------------------
    il = ItemLoader(item=price_trends_item(), response=response)
    price_trend_node = il.nested_xpath(
        '//*[div="Price Trends"]/parent::div/following-sibling::div[1]')
    price_trend_node.add_xpath('item1', './*[3]//text()')
    price_trend_node.add_xpath('item2', './*[4]//text()')
    price_trend_node.add_xpath('item3', './*[5]//text()')
    price_trends_dict = il.load_item()
    price_trends = '\n'.join(list(price_trends_dict.values()))

    # --- "what locals say" reviews ------------------------------------
    total_reviews = []
    reviews = []
    review_count = response.xpath(
        'count(//div[@data-testid="wls-responisve-slider"]/div/div/child::node())'
    ).extract()[0]
    review_count = int(float(review_count))  # count() returns a float string
    for i in range(1, 1 + review_count):
        reviews.append(' '.join(
            response.xpath(
                '//div[@data-testid="wls-responisve-slider"]/div/div/*[{:d}]//text()'
                .format(i)).extract()))
    reviews = '\n'.join(reviews)
    common_count = response.xpath(
        'count(//div[@data-testid="what-locals-say"]/child::node())'
    ).extract()[0]
    common_count = int(float(common_count))
    # NOTE(review): this range stops one child short of common_count,
    # unlike the review loop above -- presumably the last child is not a
    # review; confirm against the live page before "fixing".
    for i in range(1, common_count):
        total_reviews.append(' '.join(
            response.xpath(
                '//div[@data-testid="what-locals-say"]/*[{:d}]//text()'
                .format(i)).extract()))
    total_reviews.append(reviews)

    # --- similar houses -----------------------------------------------
    base_xpath = ('//*[div="Similar Homes You May Like"]/parent::div'
                  '/following-sibling::div[1]/div/div')
    similar_house = self.get_similar_new_part(base_xpath, response)

    # --- new listings nearby ------------------------------------------
    base_xpath = ('//div[contains(text(), "New Listings near")]'
                  '/../../following-sibling::div[1]/div/div')
    new_link_house = self.get_similar_new_part(base_xpath, response)

    # --- all new homes (builder tables) -------------------------------
    builder_tr_count = response.xpath(
        'count(//table[@data-testid="quick-movein-builder-homes-table"]//tr)'
    ).extract()[0]
    builder_tr_count = int(float(builder_tr_count))
    builder_tables = []
    for i in range(1, 1 + builder_tr_count):
        builder_tables.append(
            response.xpath(
                '//table[@data-testid="quick-movein-builder-homes-table"]//tr[{:d}]/td//text()'
                .format(i)).extract())
    # NOTE(review): the planned-builder loop reuses the quick-movein row
    # count; verify the two tables always have the same number of rows.
    builder_plans = []
    for i in range(1, 1 + builder_tr_count):
        builder_plans.append(
            response.xpath(
                '//table[@data-testid="planned-builder-homes-table"]//tr[{:d}]/td//text()'
                .format(i)).extract())
    new_homes = {}
    if builder_tables:
        new_homes['quick-movein-builder'] = builder_tables
    if builder_plans:
        new_homes['planned-builder'] = builder_plans

    il = ItemLoader(item=TruliaItem(), response=response)
    # home detail
    il.add_xpath(
        'home_detail',
        '//div[contains(text(), "Home Details for")]/../../following-sibling::ul/li//text()'
    )
    # description
    il.add_xpath(
        'description',
        '(//*[div="Description"]/parent::div)[2]/following-sibling::div//text()'
    )
    il.add_xpath(
        'community_description',
        '//div[@data-testid="community-description-text-description-text"]//text()'
    )
    il.add_xpath('office_hours',
                 '//div[@data-testid="office-hours-container"]//text()')
    il.add_xpath('open_house',
                 '//div[@data-testid="open-house-container"]//text()')
    item = il.load_item()

    # price_history may not exist; fix: narrow the bare except to the
    # failures this guard is actually for (missing keys, bad date/price)
    try:
        dates = [
            datetime.datetime.strptime(date, '%m/%d/%Y')
            for date in price_dict['dates']
        ]
        prices = [
            int(price.lstrip('$').replace(',', ''))
            for price in price_dict['prices']
        ]
        item['price_history'] = sorted(
            zip(dates, prices, price_dict['events']), key=lambda x: x[0])
    except (KeyError, ValueError):
        item['price_history'] = []

    item['overview'] = overview_dict
    item['property_taxes'] = tax_dict  # may be empty when section absent
    item['local_information'] = local_dict_values
    item['price_trends'] = price_trends
    item['comparable_sales'] = comparable_list
    item['local_commons'] = total_reviews
    item['similar_homes'] = similar_house
    item['new_listing'] = new_link_house
    item['new_homes'] = new_homes
    return item
<footer>
    <a class="social" href="http://facebook.com/whatever">Like Us</a>
    <a class="social" href="http://twitter.com/whatever">Follow Us</a>
    <a class="email" href="mailto:[email protected]">Email Us</a>
</footer>

Without nesting:

loader = ItemLoader(item=Item())
# load stuff not in the footer
loader.add_xpath('social', '//footer/a[@class = "social"]/@href')
loader.add_xpath('email', '//footer/a[@class = "email"]/@href')
loader.load_item()

With nesting:

loader = ItemLoader(item=Item())
# load stuff not in the footer
footer_loader = loader.nested_xpath('//footer')
footer_loader.add_xpath('social', 'a[@class = "social"]/@href')
footer_loader.add_xpath('email', 'a[@class = "email"]/@href')
# no need to call footer_loader.load_item()
loader.load_item()

9. Reusing and extending Item Loaders
e.g. strip the three dashes from ---Plasma TV---:

from scrapy.loader.processors import MapCompose
from myproject.ItemLoaders import ProductLoader

def strip_dashes(x):
    return x.strip('-')

class SiteSpecificLoader(ProductLoader):
    name_in = MapCompose(strip_dashes, ProductLoader.name_in)

10. Available built-in processors

class scrapy.loader.processors.Identity
    The simplest processor, which doesn't do anything.
def parse_restaurant(self, response):
    """Parse a Yelp restaurant page into a ``RestaurantItem``.

    Yields the loaded item directly, or -- when the page links a menu
    preview -- a follow-up request for the menu page with the partially
    built item carried in ``meta['item']``.
    """
    # fix: urllib.unquote / urlparse.urlparse are Python 2 only and
    # raise AttributeError/NameError on Python 3 (this file already uses
    # Python-3-only f-strings elsewhere)
    from urllib.parse import unquote, urlparse

    loader = ItemLoader(item=RestaurantItem(source=self.name,
                                            language='en',
                                            last_update=int(time.time())),
                        response=response)
    # normalise every scraped string: strip whitespace, drop empties
    loader.default_input_processor = Compose(
        MapCompose(lambda x: x.strip() or None))
    loader.default_output_processor = TakeFirst()

    url = url_query_cleaner(response.url)
    loader.add_value('url', url)
    # last path segment is the restaurant slug; renamed so the builtin
    # `id` is not shadowed
    restaurant_id = unquote(urlparse(url).path.split('/')[-1])
    loader.add_value('id', restaurant_id)

    loader.add_xpath(
        'name',
        '//div[contains(@class, "biz-page-header")]//h1[contains(@class, "biz-page-title")]/text()'
    )
    loader.address_out = Join(' - ')
    loader.add_xpath('address',
                     "//div[contains(@class, 'map-box-address')]//text()")
    loader.add_xpath('geolocation',
                     "//div[@class='mapbox-map']//img/@src",
                     MapCompose(lambda url: parse_qs(url).get('center')))
    loader.add_xpath(
        'phone_number',
        "//div[@class='mapbox-text']//span[@class='biz-phone']/text()")

    # opening hours: one row per day, grouped in threes
    hours_loader = loader.nested_xpath(
        "//div[contains(@class, 'biz-hours')]//tr/th[@scope]/..")
    hours_loader.opening_hours_in = Compose(group_items(3))
    hours_loader.opening_hours_out = Identity()
    hours_loader.add_xpath(
        'opening_hours',
        './th/text() | ./td/span[@class="nowrap"]/text()')

    loader.add_xpath(
        'rating',
        '//div[contains(@class, "biz-page-header")]//div[contains(@class, "biz-rating")]/div[contains(@class, "i-stars")]/@title',
        re=r'(?:\D*)(\d+(?:\.\d+)?)')
    loader.number_of_reviews_in = MapCompose(int)
    loader.add_xpath(
        'number_of_reviews',
        '//div[contains(@class, "biz-page-header")]//span[contains(@class, "review-count")]/text()',
        re=r'^\D*(\d+)')

    # sidebar key/value attributes, grouped in pairs
    info_loader = loader.nested_xpath(
        '//div[contains(@class, "sidebar")]//div[@class="ywidget"]/ul[@class="ylist"]/li/div[contains(@class, "short-def-list")]/dl'
    )
    # fix: `unicode` does not exist on Python 3 -- str.strip is equivalent
    info_loader.info_in = Compose(MapCompose(str.strip), group_items(2))
    info_loader.info_out = Identity()
    info_loader.add_xpath(
        'info', './dt[@class="attribute-key"]/text() | ./dd/text()')

    item = loader.load_item()
    menu_url = TakeFirst()(response.xpath(
        '//h3[@class="menu-preview-heading"]/a/@href').extract())
    if menu_url:
        yield scrapy.Request(response.urljoin(menu_url),
                             callback=self.parse_menu,
                             meta={'item': item})
    else:
        yield item