Exemplo n.º 1
0
class SofifaTeamUrlsSpider(CrawlSpider):
    """
    Collects all team urls found on sofifa.com to be later scraped by
    the SofifaClubPagesSpider. URLs are stored inside the team_urls
    collection at mongodb://mongo_server:27017/sofifa

    The crawl starts at the national-teams listing and follows the
    "Next" pagination links. Every listing page reached through a rule
    is handed to parse_start_url, which yields one TeamStatItem per
    table row.
    """

    name = 'team_pages'

    # Values produced by project helpers and forwarded to
    # sofifa_settings below.
    proxies = gen_proxy_list()
    user_agent = gen_useragent_list()

    custom_settings = sofifa_settings(name=name,
                                      database='sofifa',
                                      collection='team_urls',
                                      proxies=proxies,
                                      user_agent=user_agent,
                                      validator='TeamItem')

    allowed_domains = ['sofifa.com']

    start_urls = ['https://sofifa.com/teams?type=national']

    # Both rules use parse_start_url as their callback. The callback
    # previously named here ('parse_item') is not defined on this class,
    # so followed pages were downloaded without any item being extracted.
    rules = (
        # NOTE(review): the allow pattern is treated as a regular
        # expression, so its unescaped '?' is a quantifier rather than a
        # literal question mark, and the deny list rejects every URL
        # containing '?'. This rule therefore looks like it can never
        # match a real listing URL - confirm the intended patterns.
        Rule(LinkExtractor(
            deny=([
                r'\?', r'/[0-9]+', r'/forgot', r'/shortlist', r'/authorize',
                r'/leagues', r'/squad', r'/help', r'/compare', r'/players',
                r'/player', r'/changeLog', r'/live', r'/calculator'
            ]),
            allow=(['https://sofifa.com/teams?type=national/']),
        ),
             callback='parse_start_url',
             follow=True),
        # Pagination: follow the anchor whose text is exactly "Next".
        Rule(LinkExtractor(restrict_xpaths="//a[text()='Next']"),
             callback='parse_start_url',
             follow=True),
    )

    def parse_start_url(self, response):
        """
        Build one TeamStatItem for every row of the teams table.

        Used for the start URL and as the callback of both crawl rules.

        :param response: the downloaded listing page.
        :return: a generator of loaded TeamStatItem objects, one per
            table row found on the page.
        """

        for row in response.xpath(
                "//table[@class='table table-hover persist-area']/tbody/tr"):

            # All XPath expressions below are relative to the current row.
            loader = ItemLoader(item=TeamStatItem(),
                                selector=row,
                                response=response)

            loader.add_value('last_modified', datetime.utcnow())
            loader.add_xpath('id', ".//a[contains(@href, 'team/')]/@href")
            loader.add_xpath('nationality', ".//div/a[1]/@title")
            loader.add_xpath('region', ".//td/a[1]/text()")
            loader.add_xpath('num_players', ".//td[@data-col='ps']/text()")
            # The comment cell holds two text nodes: the first is loaded
            # as 'hits', the second as 'comments'.
            loader.add_xpath('hits', ".//td[@class='col-comment']/text()[1]")
            loader.add_xpath('comments',
                             ".//td[@class='col-comment']/text()[2]")
            loader.add_xpath('team_page', ".//td[2]/div/a[2]/@href")

            yield loader.load_item()
Exemplo n.º 2
0
class SofifaClubPagesSpider(CrawlSpider):
    """
    Visits the urls collected by SofifaClubUrlsSpider and scrapes data
    from those urls. Data is stored inside the club_details collection
    at mongodb://mongo_server:27017/sofifa

    The URLs to visit are read from the ``club_urls`` collection of the
    ``sofifa`` database; each downloaded page is turned into one
    DetailedTeamStatItem by :meth:`parse`.
    """

    name = 'club_details'

    # Values produced by project helpers and forwarded to
    # sofifa_settings below.
    proxies = gen_proxy_list()
    user_agent = gen_useragent_list()

    custom_settings = sofifa_settings(name=name,
                                      database='sofifa',
                                      collection='club_details',
                                      proxies=proxies,
                                      user_agent=user_agent,
                                      validator='ClubItem')

    def start_requests(self):
        """
        Yield one request per stored club page URL.

        Every document of ``sofifa.club_urls`` that has a ``club_page``
        field contributes one request. The URL list is fully read before
        the first request is yielded, so the MongoDB client can be closed
        right away instead of staying open for the whole crawl.

        :return: a generator of scrapy.Request objects whose callback is
            :meth:`parse`.
        """

        client = MongoClient(host, port)
        try:
            collection = client.sofifa.club_urls
            # $exists expects a boolean; the string 'true' only worked
            # because any non-empty string is treated as truthy.
            urls = [
                document["club_page"]
                for document in collection.find(
                    {'club_page': {'$exists': True}})
            ]
        finally:
            client.close()

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """
        Extract a DetailedTeamStatItem from a club page.

        :param response: the downloaded club page.
        :return: a generator yielding the single loaded item.
        """

        loader = ItemLoader(DetailedTeamStatItem(), response=response)

        # Nested loader whose relative XPaths start at the div(s) whose
        # class contains 'team'; values it collects end up in the same
        # item as those of ``loader``.
        team_spacing_loader = loader.nested_xpath(
            ".//div[contains(@class, 'team')]")

        loader.add_value('last_modified', datetime.utcnow())

        # GENERAL CLUB INFORMATION

        # 'id' and 'club_name' are loaded from the same heading text.
        # NOTE(review): presumably the item's input processors split the
        # two values apart - confirm in the item definition.
        loader.add_xpath('id', ".//div[@class='info']/h1/text()")
        loader.add_xpath('club_name', ".//div[@class='info']/h1/text()")
        loader.add_xpath('division',
                         ".//div[contains(@class, 'meta')]//a[last()]/text()")
        loader.add_xpath('club_logo',
                         ".//div[contains(@class, 'card')]/img/@data-src")
        loader.add_xpath('flag',
                         ".//div[contains(@class, 'meta')]//img/@data-src")

        # GENERAL TEAM STATS

        loader.add_xpath(
            'overall',
            ".//div[contains(@class, 'stats')]/div/div[1]/span/text()")
        loader.add_xpath(
            'attack',
            ".//div[contains(@class, 'stats')]/div/div[2]/span/text()")
        loader.add_xpath(
            'midfield',
            ".//div[contains(@class, 'stats')]/div/div[3]/span/text()")
        loader.add_xpath(
            'defence',
            ".//div[contains(@class, 'stats')]/div/div[4]/span/text()")

        # DETAILED TEAM STATS
        #
        # Each value is located through its <label>: the expression finds
        # the label containing the given caption and takes the first text
        # node, link or span that follows it.

        # Note: this stat seems to be missing as of 06/17/2019
        team_spacing_loader.add_xpath(
            'home_stadium',
            "./ul/li/following::label[contains(., 'Home Stadium')]"
            "/following::text()[1]"
        )
        team_spacing_loader.add_xpath(
            'rival_team',
            "./ul/li/following::label[contains(., 'Rival Team')]"
            "/following::a[1]/@href"
        )
        team_spacing_loader.add_xpath(
            'international_prestige',
            "./ul/li/following::label[contains(., 'International Prestige')]"
            "/following::span[1]/text()"
        )
        team_spacing_loader.add_xpath(
            'domestic_prestige',
            "./ul/li/following::label[contains(., 'Domestic Prestige')]"
            "/following::span[1]/text()"
        )
        # The transfer budget label is searched for only after the
        # 'Domestic Prestige' label.
        team_spacing_loader.add_xpath(
            'transfer_budget',
            "./ul/li/following::label[contains(., 'Domestic Prestige')]"
            "/following::label[contains(., 'Transfer Budget')]"
            "/following::text()[1]"
        )
        team_spacing_loader.add_xpath(
            'starting_xi_average_age',
            "./ul/li/following::label[contains(., 'Starting XI Average Age')]"
            "/following::text()[1]"
        )
        team_spacing_loader.add_xpath(
            'whole_team_average_age',
            "./ul/li/following::label[contains(., 'Whole Team Average Age')]"
            "/following::text()[1]"
        )
        team_spacing_loader.add_xpath(
            'captain',
            "./ul/li/following::label[contains(., 'Captain')]"
            "/following::a[1]/@href"
        )
        # The free-kick expressions are wrapped in (...)[1] so only the
        # first match in document order is kept; contains() on
        # 'Short Free Kick' would also match labels such as
        # 'Left Short Free Kick'.
        loader.add_xpath(
            'short_free_kick',
            "(.//div[contains(@class, 'team')]/ul/li"
            "/following::label[contains(., 'Short Free Kick')]"
            "/following::a[1])[1]/@href"
        )
        loader.add_xpath(
            'long_free_kick',
            "(.//div[contains(@class, 'team')]/ul/li"
            "/following::label[contains(., 'Long Free Kick')]"
            "/following::a[1])[1]/@href"
        )
        loader.add_xpath(
            'left_short_free_kick',
            "(.//div[contains(@class, 'team')]/ul/li"
            "/following::label[contains(., 'Left Short Free Kick')]"
            "/following::a[1])[1]/@href"
        )
        loader.add_xpath(
            'right_short_free_kick',
            "(.//div[contains(@class, 'team')]/ul/li"
            "/following::label[contains(., 'Right Short Free Kick')]"
            "/following::a[1])[1]/@href"
        )
        team_spacing_loader.add_xpath(
            'penalties',
            "./ul/li/following::label[contains(., 'Penalties')]"
            "/following::a[1]/@href"
        )
        team_spacing_loader.add_xpath(
            'left_corner',
            "./ul/li/following::label[contains(., 'Left Corner')]"
            "/following::a[1]/@href"
        )
        team_spacing_loader.add_xpath(
            'right_corner',
            "./ul/li/following::label[contains(., 'Right Corner')]"
            "/following::a[1]/@href"
        )
        team_spacing_loader.add_xpath(
            'starting_xi', ".//div[contains(@class, 'lineup')]/div/a/@href")

        # TACTICS
        #
        # Captions that occur more than once ('Team Width', 'Passing',
        # 'Positioning') are disambiguated by position: [1] is loaded into
        # the first field, [2] into the second.

        loader.add_xpath(
            'defence_defensive_style',
            ".//dl//span/preceding::dd[text()='Defensive Style']/span/span/"
            "text()"
        )
        loader.add_xpath(
            'defence_team_width',
            "(.//dl//span/preceding::span[text()='Team Width']"
            "/following::span[1]/span/text())[1]"
        )
        loader.add_xpath(
            'defence_depth',
            ".//dl//span/preceding::span[text()='Depth']/following::span[1]"
            "/span/text()"
        )
        loader.add_xpath(
            'offense_offensive_style',
            ".//dl//span/preceding::dd[text()='Offensive Style']/span/span/"
            "text()"
        )
        loader.add_xpath(
            'offense_width',
            ".//dl//span/preceding::span[text()='Width']/following::span[1]"
            "/span/text()"
        )
        loader.add_xpath(
            'offense_players_in_box',
            ".//dl//span/preceding::span[text()='Players in box']"
            "/following::span[1]/span/text()"
        )
        loader.add_xpath(
            'offense_corners',
            ".//dl//span/preceding::span[text()='Corners']/following::span[1]"
            "/span/text()"
        )
        loader.add_xpath(
            'offense_free_kicks',
            ".//dl//span/preceding::span[text()='Free Kicks']"
            "/following::span[1]/span/text()"
        )
        loader.add_xpath(
            'build_up_play_speed',
            ".//dl//span/preceding::span[text()='Speed']/following::span[1]"
            "/span/text()"
        )
        loader.add_xpath(
            'build_up_play_dribbling',
            ".//dl//span/preceding::dd[text()='Dribbling']/span/span/text()")
        loader.add_xpath(
            'build_up_play_passing',
            "(.//dl//span/preceding::span[text()='Passing']"
            "/following::span[1]/span/text())[1]"
        )
        loader.add_xpath(
            'build_up_play_positioning',
            "(.//dl//span/preceding::span[text()='Positioning'])[1]"
            "/following::span[1]/text()"
        )
        loader.add_xpath(
            'chance_creation_passing',
            "(.//dl//span/preceding::span[text()='Passing']"
            "/following::span[1]/span/text())[2]"
        )
        loader.add_xpath(
            'chance_creation_crossing',
            ".//dl//span/preceding::span[text()='Crossing']"
            "/following::span[1]/span/text()"
        )
        loader.add_xpath(
            'chance_creation_shooting',
            ".//dl//span/preceding::span[text()='Shooting']"
            "/following::span[1]/span/text()"
        )
        loader.add_xpath(
            'chance_creation_positioning',
            "(.//dl//span/preceding::span[text()='Positioning'])[2]"
            "/following::span[1]/text()"
        )
        loader.add_xpath(
            'defence_extra_pressure',
            ".//dl//span/preceding::span[text()='Pressure']"
            "/following::span[1]/span/text()"
        )
        loader.add_xpath(
            'defence_extra_aggression',
            ".//dl//span/preceding::span[text()='Aggression']"
            "/following::span[1]/span/text()"
        )
        loader.add_xpath(
            'defence_extra_team_width',
            "(.//dl//span/preceding::span[text()='Team Width']"
            "/following::span[1]/span/text())[2]"
        )
        loader.add_xpath(
            'defence_extra_defender_line',
            ".//span[text()='Defender Line']/following::span/text()")

        # PLAYERS

        # Player links of the first table are loaded as the squad, those
        # of the second table as players on loan.
        loader.add_xpath(
            'squad',
            "(.//table)[1]/tbody/tr//a[contains(@href, '/player/')]/@href")
        loader.add_xpath(
            'on_loan',
            "(.//table)[2]/tbody/tr//a[contains(@href, '/player/')]/@href")

        # MEDIA

        loader.add_xpath(
            'kits', ".//div[@class='column col-sm-5 text-center']//img/@src")

        # COMMUNITY

        # Same expression for both counters; the first match is loaded as
        # likes, the second as dislikes.
        loader.add_xpath(
            'likes',
            "(//div[contains(@class, 'operation spacing')]/a/span[2]/span"
            "/text())[1]"
        )
        loader.add_xpath(
            'dislikes',
            "(//div[contains(@class, 'operation spacing')]/a/span[2]/span"
            "/text())[2]"
        )

        # Show which User-Agent header the request was sent with.
        print(response.request.headers['User-Agent'])

        self.logger.info(f'Parse function called on {response.url}')

        yield loader.load_item()
Exemplo n.º 3
0
class SofifaPlayerPagesSpider(scrapy.Spider):
    """
    Visits the urls collected by SofifaPlayerUrlsSpider and scrapes
    data from those urls. Data is stored inside the player_details
    collection at mongodb://mongo_server:27017/sofifa

    The URLs to visit are read from the ``player_urls`` collection of
    the ``sofifa`` database; each downloaded page is turned into one
    SofifaItem by :meth:`parse`.
    """

    name = 'player_details'

    # Values produced by project helpers and forwarded to
    # sofifa_settings below.
    proxies = gen_proxy_list()
    user_agent = gen_useragent_list()

    custom_settings = sofifa_settings(name=name,
                                      database='sofifa',
                                      collection='player_details',
                                      proxies=proxies,
                                      user_agent=user_agent,
                                      validator='PlayerItem')

    # URLs scheduled by start_requests. The class-level default keeps
    # parse() usable even when it is invoked without start_requests
    # having run first.
    urls = ()

    def start_requests(self):
        """
        Yield one request per stored player page URL.

        Every document of ``sofifa.player_urls`` that has a
        ``player_page`` field contributes one request. The URL list is
        kept on ``self.urls`` because :meth:`parse` reports its length as
        the ``pages_to_visit`` stat. It is fully read before the first
        request is yielded, so the MongoDB client can be closed right away.

        :return: a generator of scrapy.Request objects whose callback is
            :meth:`parse`.
        """

        client = MongoClient(host, port)
        try:
            collection = client.sofifa.player_urls
            # $exists expects a boolean; the string 'true' only worked
            # because any non-empty string is treated as truthy.
            self.urls = [
                document["player_page"]
                for document in collection.find(
                    {'player_page': {'$exists': True}})
            ]
        finally:
            client.close()

        for url in self.urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """
        Extract a SofifaItem from a player page and update crawl stats.

        Besides loading the item, the method maintains two crawler
        stats: ``pages_to_visit`` (number of scheduled URLs) and
        ``page_counter`` (incremented once per parsed page).

        :param response: the downloaded player page.
        :return: a generator yielding the single loaded item.
        """

        self.crawler.stats.set_value('pages_to_visit', len(self.urls))

        loader = ItemLoader(item=SofifaItem(), response=response)

        # Nested loader anchored at the 'column col-4 text-center' divs;
        # values it collects end up in the same item as ``loader``'s.
        col_4_loader = loader.nested_xpath(
            ".//div[@class='column col-4 text-center']")

        loader.add_value('last_modified', datetime.utcnow())

        # GENERAL PLAYER INFORMATION

        # Several fields are loaded from the same node ('id'/'name' from
        # the heading, 'full_name'..'weight' from the first meta text).
        # NOTE(review): presumably the item's input processors extract
        # the individual values - confirm in the item definition.
        loader.add_xpath('id', ".//div[@class='info']/h1/text()")
        loader.add_xpath('name', ".//div[@class='info']/h1/text()")
        loader.add_xpath('full_name',
                         ".//div[contains(@class, 'meta')]/text()[1]")
        loader.add_xpath('age', ".//div[contains(@class, 'meta')]/text()[1]")
        loader.add_xpath('dob', ".//div[contains(@class, 'meta')]/text()[1]")
        loader.add_xpath('height',
                         ".//div[contains(@class, 'meta')]/text()[1]")
        loader.add_xpath('weight',
                         ".//div[contains(@class, 'meta')]/text()[1]")
        loader.add_xpath('nationality',
                         ".//div[contains(@class, 'meta')]/a/@title")

        # GENERAL PLAYER STATS

        loader.add_xpath(
            'preferred_foot',
            "(.//label[text()='Preferred Foot']/following::text())[1]")
        loader.add_xpath(
            'international_reputation',
            "(.//label[text()='International Reputation']"
            "/following::text())[1]"
        )
        loader.add_xpath(
            'weak_foot', "(.//label[text()='Weak Foot']/following::text())[1]")
        loader.add_xpath(
            'skill_moves',
            "(.//label[text()='Skill Moves']/following::text())[1]")
        loader.add_xpath(
            'work_rate',
            "(.//label[text()='Work Rate']/following::span/text())[1]")
        loader.add_xpath(
            'body_type',
            "(.//label[text()='Body Type']/following::span/text())[1]")
        loader.add_xpath(
            'real_face',
            "(.//label[text()='Real Face']/following::span/text())[1]")

        # CLUB/TEAM INFORMATION

        # NOTE(review): the 'value' and 'wage' expressions start with
        # '/', i.e. they are absolute paths evaluated from the document
        # root rather than relative to the nested selector - confirm
        # that they still produce values.
        col_4_loader.add_xpath(
            'value',
            "/following::text()[contains(., 'Value')]"
            "/following::span[1]/text()"
        )

        col_4_loader.add_xpath(
            'wage',
            "/following::text()[contains(., 'Wage')]/following::span[1]/text()"
        )
        loader.add_xpath(
            'release_clause',
            "(.//label[text()='Release Clause']/following::span/text())[1]")
        # Club values use the first match of an expression, the
        # corresponding team_* values further down use the last match.
        loader.add_xpath('club_name',
                         "(.//ul[contains(@class, 'pl')]//a/text())[1]")
        loader.add_xpath('club_url',
                         "(.//ul[contains(@class, 'pl')]//a/@href)[1]")
        loader.add_xpath(
            'club_rating',
            ".//div[contains(@class, 'column col-5')][1]//li[2]/span[1]/text()"
        )
        loader.add_xpath(
            'club_position',
            "(.//label[text()='Position']/following::text()[1])[1]")
        loader.add_xpath(
            'club_jersey_number',
            "(.//label[text()='Jersey Number']/following::text()[1])[1]")
        loader.add_xpath('club_join_date',
                         ".//label[text()='Joined']/following::text()[1]")
        loader.add_xpath(
            'loaned_from',
            ".//label[text()='Loaned From']/following::a[1]/text()")
        loader.add_xpath(
            'club_contract_end_date',
            ".//label[text()='Contract Valid Until']/following::text()[1]")
        loader.add_xpath('team_name',
                         "(.//ul[contains(@class, 'pl')]//a/text())[last()]")
        loader.add_xpath(
            'team_rating',
            ".//div[contains(@class, 'column col-5')][last()]//li[2]/span[1]"
            "/text()"
        )
        loader.add_xpath(
            'team_position',
            "(.//label[text()='Position']/following::text()[1])[last()]")
        loader.add_xpath(
            'team_jersey_number',
            "(.//label[text()='Jersey Number']/following::text()[1])[last()]")

        # PLAYER GAME STATS

        loader.add_xpath(
            'overall_rating',
            "//div[@class='column col-4 text-center'][1]/span/text()")
        col_4_loader.add_xpath(
            'potential_rating',
            "//div[@class='column col-4 text-center'][2]/span/text()")
        loader.add_xpath('positions',
                         ".//div[contains(@class, 'meta')]/span/text()")
        loader.add_xpath('unique_attributes',
                         ".//div[contains(@class, 'mt-2')]/a/text()")

        # Goalkeepers and outfield players carry different summary stats;
        # which set is loaded depends on whether 'GK' is one of the
        # listed positions.
        if 'GK' in response.xpath(".//div[contains(@class, 'meta')]"
                                  "/span/text()").getall():
            summary_fields = ('DIV', 'HAN', 'KIC', 'REF', 'SPD', 'POS')
        else:
            summary_fields = ('PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY')

        # Every summary field is fed the text of the same inline script.
        # NOTE(review): presumably each field's input processor picks its
        # own value out of that script - confirm in the item definition.
        for summary_field in summary_fields:
            loader.add_xpath(
                summary_field,
                "(.//div[@class='wrapper']"
                "//script)[1][contains(text(), 'var')]/text()"
            )

        # PLAYER DETAILED STATS

        loader.add_xpath('crossing', "(.//span[../span='Crossing']/text())[1]")
        loader.add_xpath('finishing',
                         "(.//span[../span='Finishing']/text())[1]")
        loader.add_xpath('heading_accuracy',
                         "(.//span[../span='Heading Accuracy']/text())[1]")
        loader.add_xpath('short_passing',
                         "(.//span[../span='Short Passing']/text())[1]")
        loader.add_xpath('volleys', "(.//span[../span='Volleys']/text())[1]")
        loader.add_xpath('aggression',
                         "(.//span[../span='Aggression']/text())[1]")
        loader.add_xpath('interceptions',
                         "(.//span[../span='Interceptions']/text())[1]")
        loader.add_xpath('positioning',
                         "(.//span[../span='Positioning']/text())[1]")
        loader.add_xpath('vision', "(.//span[../span='Vision']/text())[1]")
        loader.add_xpath('penalties',
                         "(.//span[../span='Penalties']/text())[1]")
        loader.add_xpath('composure',
                         ".//li[contains(text(), 'Composure')]/span/text()")
        loader.add_xpath('dribbling',
                         "(.//span[../span='Dribbling']/text())[1]")
        loader.add_xpath('curve', "(.//span[../span='Curve']/text())[1]")
        loader.add_xpath('fk_accuracy',
                         "(.//span[../span='FK Accuracy']/text())[1]")
        loader.add_xpath('long_passing',
                         "(.//span[../span='Long Passing']/text())[1]")
        loader.add_xpath('ball_control',
                         "(.//span[../span='Ball Control']/text())[1]")
        loader.add_xpath('marking', "(.//span[../span='Marking']/text())[1]")
        loader.add_xpath('standing_tackle',
                         "(.//span[../span='Standing Tackle']/text())[1]")
        loader.add_xpath('sliding_tackle',
                         "(.//span[../span='Sliding Tackle']/text())[1]")
        loader.add_xpath('acceleration',
                         "(.//span[../span='Acceleration']/text())[1]")
        loader.add_xpath('sprint_speed',
                         "(.//span[../span='Sprint Speed']/text())[1]")
        loader.add_xpath('agility', "(.//span[../span='Agility']/text())[1]")
        loader.add_xpath('reactions',
                         "(.//span[../span='Reactions']/text())[1]")
        loader.add_xpath('balance', "(.//span[../span='Balance']/text())[1]")
        loader.add_xpath('gk_diving',
                         ".//li[contains(text(), 'GK Diving')]/span/text()")
        loader.add_xpath('gk_handling',
                         ".//li[contains(text(), 'GK Handling')]/span/text()")
        loader.add_xpath('gk_kicking',
                         ".//li[contains(text(), 'GK Kicking')]/span/text()")
        loader.add_xpath(
            'gk_positioning',
            ".//li[contains(text(), 'GK Positioning')]/span/text()")
        loader.add_xpath('gk_reflexes',
                         ".//li[contains(text(), 'GK Reflexes')]/span/text()")
        loader.add_xpath('shot_power',
                         "(.//span[../span='Shot Power']/text())[1]")
        loader.add_xpath('jumping', "(.//span[../span='Jumping']/text())[1]")
        loader.add_xpath('stamina', "(.//span[../span='Stamina']/text())[1]")
        loader.add_xpath('strength', "(.//span[../span='Strength']/text())[1]")
        loader.add_xpath('long_shots',
                         "(.//span[../span='Long Shots']/text())[1]")
        loader.add_xpath(
            'traits',
            ".//h5[text()='Traits']/following-sibling::ul/li/span/text()")

        # PLAYER REAL OVERALL RATING (POSITIONAL STATS)

        # The item field name and the caption searched for on the page
        # are the same string for every position.
        for position in ('LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
                         'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM',
                         'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB',
                         'LCB', 'CB', 'RCB', 'RB'):
            loader.add_xpath(
                position,
                f"(.//div[../div='{position}']/following::text())[1]")

        # COMMUNITY INFORMATION

        # Same expression for all three counters; only the match index
        # differs (1 = likes, 2 = dislikes, 3 = followers).
        loader.add_xpath(
            'followers',
            "(.//div[contains(@class, 'operation spacing')]/a/span[2]/span"
            "/text())[3]"
        )
        loader.add_xpath(
            'likes',
            "(.//div[contains(@class, 'operation spacing')]/a/span[2]/span"
            "/text())[1]"
        )
        loader.add_xpath(
            'dislikes',
            "(.//div[contains(@class, 'operation spacing')]/a/span[2]/span"
            "/text())[2]"
        )

        # MEDIA

        loader.add_xpath('face_img', ".//div/div/article/div/img//@data-src")
        loader.add_xpath('flag_img',
                         ".//div[contains(@class, 'meta')]/a/img/@data-src")
        loader.add_xpath('club_logo_img',
                         "(.//div/ul/li/figure/img/@data-src)[1]")
        loader.add_xpath('team_logo_img',
                         "(.//div/ul/li/figure/img/@data-src)[last()]")

        self.logger.info(f'Parse function called on {response.url}')

        # Logged before the increment below, so this shows the counter
        # value prior to counting the current page.
        self.logger.info(
            f"Currently on page "
            f"{self.crawler.stats.get_value('page_counter')} out of "
            f"{self.crawler.stats.get_value('pages_to_visit')}"
        )

        # TODO: enable continued logging of page_counter after a pause/resume.

        self.crawler.stats.inc_value(key='page_counter', count=1, start=0)

        # Show which User-Agent header the request was sent with.
        print(response.request.headers['User-Agent'])

        print(f"{self.crawler.stats.get_value('page_counter')} "
              f"out of {self.crawler.stats.get_value('pages_to_visit')}")

        yield loader.load_item()
Exemplo n.º 4
0
class SofifaPlayerURLsSpider(scrapy.Spider):
    """
    Gathers the player urls listed on sofifa.com so that the
    SofifaPlayerPagesSpider can scrape them afterwards. The urls are
    written to the player_urls collection at
    mongodb://mongo_server:27017/sofifa
    """

    name = 'player_pages'

    # Values produced by project helpers and forwarded to
    # sofifa_settings below.
    proxies = gen_proxy_list()
    user_agent = gen_useragent_list()

    custom_settings = sofifa_settings(
        name=name,
        database='sofifa',
        collection='player_urls',
        proxies=proxies,
        user_agent=user_agent,
        validator='PlayerItem',
    )

    allowed_domains = ['sofifa.com']
    start_urls = ['https://sofifa.com/players/']

    # NOTE(review): this class derives from scrapy.Spider, not
    # CrawlSpider, and neither rule's 'parse_item' callback is defined
    # here - confirm whether these rules are meant to take effect.
    rules = (
        Rule(
            LinkExtractor(deny=([
                r'\?', r'[0-9]+/[0-9]+/', r'/changeLog', r'/live',
                r'/squads', r'/calculator/', r'/team/', r'[0-9]+',
                r'/[a-zA-Z0-9]+$'
            ])),
            callback='parse_item',
            follow=True,
        ),
        Rule(
            LinkExtractor(
                restrict_xpaths="//a[contains(@class, 'button pjax')]/@href"
            ),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse(self, response):
        """
        Yield one MainPageItem per row of the players table.

        The lines below are Scrapy contracts and are kept verbatim.

        @url http://sofifa.com/players/
        @returns items 1 61
        @returns requests 0 0
        @scrapes id_player_main total_stats hits comments player_page
        """

        # Record which listing page is being processed.
        self.crawler.stats.set_value('page_counter',
                                     page_counter(response.url))

        # The player link feeds both the 'id' and 'player_page' fields;
        # the comment cell feeds 'hits' and 'comments'.
        player_link = ".//a[contains(@href, 'player/')]/@href"
        comment_cell = ".//td[contains(@class, 'comment')]"

        table_rows = response.xpath(
            "//table[@class='table table-hover persist-area']/tbody/tr")

        for table_row in table_rows:
            # XPath expressions below are relative to the current row.
            loader = ItemLoader(
                item=MainPageItem(),
                selector=table_row,
                response=response,
            )

            loader.add_value('last_modified', datetime.utcnow())
            loader.add_xpath('id', player_link)
            loader.add_xpath(
                'total_stats',
                ".//span[contains(@class, 'primary')]/text()",
            )
            loader.add_xpath('hits', f"{comment_cell}/text()[1]")
            loader.add_xpath('comments', f"{comment_cell}/text()[2]")
            loader.add_xpath('player_page', player_link)

            # Emitted once per row, exactly like the item itself.
            print(response.request.headers['User-Agent'])

            self.logger.info(f'Currently on page {current_page(response.url)}')

            yield loader.load_item()