示例#1
0
class FixtureCrawler(object):
    """
    Crawls match reports linked from monthly fixture pages

    Starting from the page at ``url``, every match-report link is opened in a
    new tab, the match result and the team height stats are collected, and the
    crawler then steps back month by month until the page shows "Aug 2015" or
    the batch is exhausted.
    """

    # CSS selectors that are used in more than one place
    _HEADER = "div[id='header']"
    _REPORT_LINKS = "a[class='match-link match-report rc']"
    _PREVIOUS_MONTH = "span.ui-icon.ui-icon-triangle-1-w"

    def __init__(self, url, skip, batch):
        """
        Open ``url`` in a new browser and set up the crawl state

        :param url: address of the fixtures page to start from
        :param skip: number of fixtures to skip
        :param batch: number of fixtures to crawl in one go

        :raises ForbiddenAccessError: if the page header reads "Server Error"
        """
        self.match_reports = {'reports': []}
        self.timeout = 300  # Wait for 300s for elements to load on the page
        self.skip = skip  # Number of fixtures to skip
        self.batch_size = batch  # Number of fixtures to crawl in one go
        self._browser_closed = False

        self.browser = WebBrowser()
        try:
            self.browser.get(url)
            self._check_forbidden_access()
        except Exception:
            # Do not leave a browser running behind a failed construction
            self._quit_browser()
            raise

    def _quit_browser(self):
        """
        Quit the browser at most once; repeated calls do nothing
        """
        # getattr keeps this safe for instances that never ran __init__
        if getattr(self, '_browser_closed', False):
            return
        self._browser_closed = True
        self.browser.quit()

    def _check_forbidden_access(self):
        """
        Quit the browser and raise if the page header reads "Server Error"

        :raises ForbiddenAccessError: when the header text is "Server Error"
        """
        header = self.browser.find_element_by_css_selector(self._HEADER)
        if normalize(header.text) == "Server Error":
            self._quit_browser()
            raise ForbiddenAccessError

    def _browse_listed_reports(self, settle_seconds=0):
        """
        Browse the match reports linked from the page currently shown

        :param settle_seconds: seconds to sleep after the links are reported
            as loaded and before they are collected
        """
        # Wait till links to match reports are active
        self.browser.wait_till_element_is_loaded(self._REPORT_LINKS,
                                                 self.timeout)
        if settle_seconds:
            time.sleep(settle_seconds)

        # Find all links to match reports, cull them and browse them
        elements = self.browser.find_elements_by_css_selector(
            self._REPORT_LINKS)
        elements.reverse()
        self.browse_match_reports(self.skip_elements(elements))

    def skip_elements(self, elements):
        """
        Cull the elements list based on skip attribute

        The first ``self.skip`` elements are dropped and ``self.skip`` is
        lowered by the number of elements dropped, so whatever could not be
        skipped here is carried over to the next call.

        :param elements: list of elements to be culled
        :type elements: list

        :return: the elements that remain after skipping
        :rtype: list
        """
        skip = self.skip
        self.skip = max(skip - len(elements), 0)
        return elements[skip:]

    def browse_monthly_fixtures(self):
        """
        Browses monthly fixture pages one by one

        The browser is always quit before this method returns or raises.

        :return: dict with the collected reports under the 'reports' key
        :rtype: dict
        """
        try:
            try:
                self._browse_listed_reports()
            except TimeoutException:
                # No match-report links showed up on this page; carry on
                # with the previous months.
                pass

            # Only reached after a normal run or a timeout. Any other error
            # (e.g. forbidden access) aborts the crawl instead of driving a
            # browser that may already have been quit.
            self.browse_previous_fixtures()
        finally:
            self._quit_browser()

        return self.match_reports

    def browse_previous_fixtures(self):
        """
        Browses fixtures from previous months, one month at a time

        Stops when the batch is exhausted or when the month shown on the
        page is "Aug 2015".
        """
        # If batch size is zero that means all fixtures have been browsed
        while self.batch_size != 0:
            # Wait till links to previous months are active
            self.browser.wait_till_element_is_loaded(self._PREVIOUS_MONTH,
                                                     self.timeout)

            # Navigate to previous month
            elem = self.browser.find_element_by_css_selector(
                self._PREVIOUS_MONTH)
            self.browser.click_element(elem)

            # Give the page 5s to settle before collecting the links
            self._browse_listed_reports(settle_seconds=5)

            # Check if month is August, if not browse previous month
            month = normalize(
                self.browser.find_element_by_css_selector(
                    "a[id='date-config-toggle-button']").text)
            if month == "Aug 2015":
                break

    def browse_match_reports(self, elements):
        """
        Browses match reports one by one

        Each report is opened in a new tab, analyzed and closed again. The
        loop ends early once ``self.batch_size`` reaches zero.

        :param elements: list of elements which are links to match reports
        :type elements: list

        :raises ForbiddenAccessError: if a report page header reads
            "Server Error"
        """
        # Get control key based on system platform
        control_key = get_control_key()

        for elem in elements:
            # If batch size is zero that means all fixtures have been browsed
            if self.batch_size == 0:
                break

            # Skip if required
            if self.skip != 0:
                self.skip -= 1
                continue

            # Save the window opener (the window, not the tab)
            main_window = self.browser.current_window_handle()

            # Open match report in new tab and give it 5s to load
            self.browser.open_link_in_new_tab(elem)
            time.sleep(5)

            # Switch to the new tab, assumed to be the next one on the right
            self.browser.find_element_by_tag_name('body').send_keys(
                Keys.CONTROL + Keys.TAB)

            # Focusing the window puts focus on its currently visible tab
            self.browser.switch_to_window(main_window)

            # Check for forbidden access
            self._check_forbidden_access()

            # Analyze the match report
            self.analyze_match_report()

            # Close current tab
            self.browser.find_element_by_tag_name('body').send_keys(
                control_key + 'w')

            # Put focus back on the window opener
            self.browser.switch_to_window(main_window)

            # Decrement batch size
            self.batch_size -= 1

    def get_match_result(self):
        """
        Returns a dict containing information on match result

        :return: dict with keys 'home_team', 'away_team', 'home_goals',
            'away_goals', 'kickoff' and 'date'
        :rtype: dict
        """
        # Wait till match header is loaded
        self.browser.wait_till_element_is_loaded("div[id='match-header']",
                                                 self.timeout)

        # Extract match result from html source
        match_header_elem = self.browser.find_element_by_css_selector(
            "div[id='match-header']")
        team_elements = match_header_elem.find_elements_by_css_selector(
            "td[class='team']")
        home_team = normalize(team_elements[0].text)
        away_team = normalize(team_elements[1].text)

        # The result text is split on ':' into home and away goals
        result_elem = match_header_elem.find_element_by_css_selector(
            "td[class='result']")
        home_goals, away_goals = map(int,
                                     normalize(result_elem.text).split(':'))

        # The last <dd> is used as the date, the one before it as the kickoff
        kickoff_elements = match_header_elem.find_elements_by_css_selector(
            "dd")
        date = normalize(kickoff_elements[-1].text)
        kickoff = normalize(kickoff_elements[-2].text)

        return {
            'home_team': home_team,
            'away_team': away_team,
            'home_goals': home_goals,
            'away_goals': away_goals,
            'kickoff': kickoff,
            'date': date
        }

    def go_to_match_preview(self):
        """
        Navigates to match preview
        """
        # Wait till navigation menu is active on page
        self.browser.wait_till_element_is_loaded("div[id='sub-navigation']",
                                                 self.timeout)

        # The preview link is the <a> inside the first <li> of the menu
        div_elem = self.browser.find_element_by_css_selector(
            "div[id='sub-navigation']")
        li_elem = div_elem.find_element_by_css_selector("li")
        preview_elem = li_elem.find_element_by_css_selector("a")

        # Click on preview element
        self.browser.click_element(preview_elem)

        # Sleep for 3s
        time.sleep(3)

    def get_height_stats(self):
        """
        Returns height info for both teams

        :return: dict with keys 'home_team_height' and 'away_team_height'
        :rtype: dict
        """
        # Wait till element is active
        self.browser.wait_till_element_is_loaded("div[class='stat-group']",
                                                 self.timeout)

        # Height is read from the last stat of the second stat group; its
        # first value is taken as the home team's, its last as the away team's
        stat_group_elements = self.browser.find_elements_by_css_selector(
            "div[class='stat-group']")
        stat_group_elem = stat_group_elements[1]
        stat_elements = stat_group_elem.find_elements_by_css_selector(
            "div[class='stat']")
        height_elem = stat_elements[-1]
        height_val_elements = height_elem.find_elements_by_css_selector(
            "span[class='stat-value']")
        home_team_height = float(normalize(height_val_elements[0].text))
        away_team_height = float(normalize(height_val_elements[-1].text))

        return {
            'home_team_height': home_team_height,
            'away_team_height': away_team_height
        }

    def analyze_match_report(self):
        """
        Collects result and height stats of the match shown in the browser

        Navigates to the match preview, merges the match result and the
        height stats into one dict and appends it to
        ``self.match_reports['reports']``.
        """
        self.go_to_match_preview()

        match_report = dict()
        match_report.update(self.get_match_result())
        match_report.update(self.get_height_stats())

        self.match_reports['reports'].append(match_report)
示例#2
0
class FixtureCrawler(object):
    """
    Crawls match reports linked from monthly fixture pages

    Starting from the page at ``url``, every match-report link is opened in a
    new tab, the match result and the team height stats are collected, and the
    crawler then steps back month by month until the page shows "Aug 2015" or
    the batch is exhausted.
    """

    # CSS selectors that are used in more than one place
    _HEADER = "div[id='header']"
    _REPORT_LINKS = "a[class='match-link match-report rc']"
    _PREVIOUS_MONTH = "span.ui-icon.ui-icon-triangle-1-w"

    def __init__(self, url, skip, batch):
        """
        Open ``url`` in a new browser and set up the crawl state

        :param url: address of the fixtures page to start from
        :param skip: number of fixtures to skip
        :param batch: number of fixtures to crawl in one go

        :raises ForbiddenAccessError: if the page header reads "Server Error"
        """
        self.match_reports = {'reports': []}
        self.timeout = 300  # Wait for 300s for elements to load on the page
        self.skip = skip  # Number of fixtures to skip
        self.batch_size = batch  # Number of fixtures to crawl in one go
        self._browser_closed = False

        self.browser = WebBrowser()
        try:
            self.browser.get(url)
            self._check_forbidden_access()
        except Exception:
            # Do not leave a browser running behind a failed construction
            self._quit_browser()
            raise

    def _quit_browser(self):
        """
        Quit the browser at most once; repeated calls do nothing
        """
        # getattr keeps this safe for instances that never ran __init__
        if getattr(self, '_browser_closed', False):
            return
        self._browser_closed = True
        self.browser.quit()

    def _check_forbidden_access(self):
        """
        Quit the browser and raise if the page header reads "Server Error"

        :raises ForbiddenAccessError: when the header text is "Server Error"
        """
        header = self.browser.find_element_by_css_selector(self._HEADER)
        if normalize(header.text) == "Server Error":
            self._quit_browser()
            raise ForbiddenAccessError

    def _browse_listed_reports(self, settle_seconds=0):
        """
        Browse the match reports linked from the page currently shown

        :param settle_seconds: seconds to sleep after the links are reported
            as loaded and before they are collected
        """
        # Wait till links to match reports are active
        self.browser.wait_till_element_is_loaded(self._REPORT_LINKS, self.timeout)
        if settle_seconds:
            time.sleep(settle_seconds)

        # Find all links to match reports, cull them and browse them
        elements = self.browser.find_elements_by_css_selector(self._REPORT_LINKS)
        elements.reverse()
        self.browse_match_reports(self.skip_elements(elements))

    def skip_elements(self, elements):
        """
        Cull the elements list based on skip attribute

        The first ``self.skip`` elements are dropped and ``self.skip`` is
        lowered by the number of elements dropped, so whatever could not be
        skipped here is carried over to the next call.

        :param elements: list of elements to be culled
        :type elements: list

        :return: the elements that remain after skipping
        :rtype: list
        """
        skip = self.skip
        self.skip = max(skip - len(elements), 0)
        return elements[skip:]

    def browse_monthly_fixtures(self):
        """
        Browses monthly fixture pages one by one

        The browser is always quit before this method returns or raises.

        :return: dict with the collected reports under the 'reports' key
        :rtype: dict
        """
        try:
            try:
                self._browse_listed_reports()
            except TimeoutException:
                # No match-report links showed up on this page; carry on
                # with the previous months.
                pass

            # Only reached after a normal run or a timeout. Any other error
            # (e.g. forbidden access) aborts the crawl instead of driving a
            # browser that may already have been quit.
            self.browse_previous_fixtures()
        finally:
            self._quit_browser()

        return self.match_reports

    def browse_previous_fixtures(self):
        """
        Browses fixtures from previous months, one month at a time

        Stops when the batch is exhausted or when the month shown on the
        page is "Aug 2015".
        """
        # If batch size is zero that means all fixtures have been browsed
        while self.batch_size != 0:
            # Wait till links to previous months are active
            self.browser.wait_till_element_is_loaded(self._PREVIOUS_MONTH, self.timeout)

            # Navigate to previous month
            elem = self.browser.find_element_by_css_selector(self._PREVIOUS_MONTH)
            self.browser.click_element(elem)

            # Give the page 5s to settle before collecting the links
            self._browse_listed_reports(settle_seconds=5)

            # Check if month is August, if not browse previous month
            month = normalize(self.browser.find_element_by_css_selector("a[id='date-config-toggle-button']").text)
            if month == "Aug 2015":
                break

    def browse_match_reports(self, elements):
        """
        Browses match reports one by one

        Each report is opened in a new tab, analyzed and closed again. The
        loop ends early once ``self.batch_size`` reaches zero.

        :param elements: list of elements which are links to match reports
        :type elements: list

        :raises ForbiddenAccessError: if a report page header reads "Server Error"
        """
        # Get control key based on system platform
        control_key = get_control_key()

        for elem in elements:
            # If batch size is zero that means all fixtures have been browsed
            if self.batch_size == 0:
                break

            # Skip if required
            if self.skip != 0:
                self.skip -= 1
                continue

            # Save the window opener (the window, not the tab)
            main_window = self.browser.current_window_handle()

            # Open match report in new tab and give it 5s to load
            self.browser.open_link_in_new_tab(elem)
            time.sleep(5)

            # Switch to the new tab, assumed to be the next one on the right
            self.browser.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.TAB)

            # Focusing the window puts focus on its currently visible tab
            self.browser.switch_to_window(main_window)

            # Check for forbidden access
            self._check_forbidden_access()

            # Analyze the match report
            self.analyze_match_report()

            # Close current tab
            self.browser.find_element_by_tag_name('body').send_keys(control_key + 'w')

            # Put focus back on the window opener
            self.browser.switch_to_window(main_window)

            # Decrement batch size
            self.batch_size -= 1

    def get_match_result(self):
        """
        Returns a dict containing information on match result

        :return: dict with keys 'home_team', 'away_team', 'home_goals',
            'away_goals', 'kickoff' and 'date'
        :rtype: dict
        """
        # Wait till match header is loaded
        self.browser.wait_till_element_is_loaded("div[id='match-header']", self.timeout)

        # Extract match result from html source
        match_header_elem = self.browser.find_element_by_css_selector("div[id='match-header']")
        team_elements = match_header_elem.find_elements_by_css_selector("td[class='team']")
        home_team = normalize(team_elements[0].text)
        away_team = normalize(team_elements[1].text)

        # The result text is split on ':' into home and away goals
        result_elem = match_header_elem.find_element_by_css_selector("td[class='result']")
        home_goals, away_goals = map(int, normalize(result_elem.text).split(':'))

        # The last <dd> is used as the date, the one before it as the kickoff
        kickoff_elements = match_header_elem.find_elements_by_css_selector("dd")
        date = normalize(kickoff_elements[-1].text)
        kickoff = normalize(kickoff_elements[-2].text)

        return {'home_team': home_team, 'away_team': away_team, 'home_goals': home_goals,
                'away_goals': away_goals, 'kickoff': kickoff, 'date': date}

    def go_to_match_preview(self):
        """
        Navigates to match preview
        """
        # Wait till navigation menu is active on page
        self.browser.wait_till_element_is_loaded("div[id='sub-navigation']", self.timeout)

        # The preview link is the <a> inside the first <li> of the menu
        div_elem = self.browser.find_element_by_css_selector("div[id='sub-navigation']")
        li_elem = div_elem.find_element_by_css_selector("li")
        preview_elem = li_elem.find_element_by_css_selector("a")

        # Click on preview element
        self.browser.click_element(preview_elem)

        # Sleep for 3s
        time.sleep(3)

    def get_height_stats(self):
        """
        Returns height info for both teams

        :return: dict with keys 'home_team_height' and 'away_team_height'
        :rtype: dict
        """
        # Wait till element is active
        self.browser.wait_till_element_is_loaded("div[class='stat-group']", self.timeout)

        # Height is read from the last stat of the second stat group; its
        # first value is taken as the home team's, its last as the away team's
        stat_group_elements = self.browser.find_elements_by_css_selector("div[class='stat-group']")
        stat_group_elem = stat_group_elements[1]
        stat_elements = stat_group_elem.find_elements_by_css_selector("div[class='stat']")
        height_elem = stat_elements[-1]
        height_val_elements = height_elem.find_elements_by_css_selector("span[class='stat-value']")
        home_team_height = float(normalize(height_val_elements[0].text))
        away_team_height = float(normalize(height_val_elements[-1].text))

        return {'home_team_height': home_team_height,
                'away_team_height': away_team_height}

    def analyze_match_report(self):
        """
        Collects result and height stats of the match shown in the browser

        Navigates to the match preview, merges the match result and the
        height stats into one dict and appends it to
        ``self.match_reports['reports']``.
        """
        self.go_to_match_preview()

        match_report = dict()
        match_report.update(self.get_match_result())
        match_report.update(self.get_height_stats())

        self.match_reports['reports'].append(match_report)
class LeagueTableCrawler(object):
    """
    Scrapes the league standings table of a page into a dict keyed by team
    """

    # Keys for the eight integer cells that follow the position and team
    # cells, in column order
    _STAT_KEYS = ('matches', 'wins', 'draws', 'losses', 'gf', 'ga', 'gd', 'points')

    def __init__(self, url):
        """
        Open ``url`` in a new browser

        :param url: address of the page that holds the standings table

        :raises ForbiddenAccessError: if the page header reads "Server Error"
        """
        self.table = {}
        self.timeout = 300

        self.browser = WebBrowser()
        try:
            self.browser.get(url)

            # Check for forbidden access
            header = self.browser.find_element_by_css_selector("div[id='header']")
            forbidden = normalize(header.text) == "Server Error"
        except Exception:
            # Do not leave a browser running behind a failed construction
            self.browser.quit()
            raise

        if forbidden:
            self.browser.quit()
            raise ForbiddenAccessError

    @classmethod
    def _parse_row(cls, texts):
        """
        Convert the cell texts of one standings row into a table entry

        :param texts: normalized cell texts of the row, in column order
        :type texts: list

        :return: ``(team, stats)``, or ``None`` if the row has fewer than
            eleven cells
        :raises ValueError: if a numeric cell does not hold an integer
        """
        # A row is only recorded when an eleventh cell exists; shorter rows
        # are skipped without being parsed. Cells after the tenth are not read.
        # NOTE(review): the eleventh cell's content is never used - confirm
        # against the page layout whether ten-cell rows should be kept too.
        if len(texts) < 11:
            return None

        stats = {'pos': int(texts[0])}
        for key, text in zip(cls._STAT_KEYS, texts[2:10]):
            stats[key] = int(text)
        return texts[1], stats

    def create_league_table(self):
        """
        Fill ``self.table`` from the standings on the page, then quit

        The browser is quit even when waiting for or parsing the table fails.
        """
        try:
            self.browser.wait_till_element_is_loaded("tbody[class='standings']", self.timeout)
            table = self.browser.find_element_by_css_selector("tbody[class='standings']")

            for row in table.find_elements_by_tag_name("tr"):
                cells = row.find_elements_by_tag_name("td")
                entry = self._parse_row([normalize(cell.text) for cell in cells])
                if entry is not None:
                    team, stats = entry
                    self.table[team] = stats
        finally:
            self.quit()

    def persist_table(self):
        """
        Dump ``self.table`` as JSON to 'league_table.json'
        """
        dump_as_json(self.table, 'league_table.json', 'w')

    def quit(self):
        """
        Quit the browser
        """
        self.browser.quit()
class LeagueTableCrawler(object):
    """
    Scrapes the league standings table of a page into a dict keyed by team
    """

    # Keys for the eight integer cells that follow the position and team
    # cells, in column order
    _STAT_KEYS = ('matches', 'wins', 'draws', 'losses', 'gf', 'ga', 'gd',
                  'points')

    def __init__(self, url):
        """
        Open ``url`` in a new browser

        :param url: address of the page that holds the standings table

        :raises ForbiddenAccessError: if the page header reads "Server Error"
        """
        self.table = {}
        self.timeout = 300

        self.browser = WebBrowser()
        try:
            self.browser.get(url)

            # Check for forbidden access
            header = self.browser.find_element_by_css_selector(
                "div[id='header']")
            forbidden = normalize(header.text) == "Server Error"
        except Exception:
            # Do not leave a browser running behind a failed construction
            self.browser.quit()
            raise

        if forbidden:
            self.browser.quit()
            raise ForbiddenAccessError

    @classmethod
    def _parse_row(cls, texts):
        """
        Convert the cell texts of one standings row into a table entry

        :param texts: normalized cell texts of the row, in column order
        :type texts: list

        :return: ``(team, stats)``, or ``None`` if the row has fewer than
            eleven cells
        :raises ValueError: if a numeric cell does not hold an integer
        """
        # A row is only recorded when an eleventh cell exists; shorter rows
        # are skipped without being parsed. Cells after the tenth are not read.
        # NOTE(review): the eleventh cell's content is never used - confirm
        # against the page layout whether ten-cell rows should be kept too.
        if len(texts) < 11:
            return None

        stats = {'pos': int(texts[0])}
        for key, text in zip(cls._STAT_KEYS, texts[2:10]):
            stats[key] = int(text)
        return texts[1], stats

    def create_league_table(self):
        """
        Fill ``self.table`` from the standings on the page, then quit

        The browser is quit even when waiting for or parsing the table fails.
        """
        try:
            self.browser.wait_till_element_is_loaded(
                "tbody[class='standings']", self.timeout)
            table = self.browser.find_element_by_css_selector(
                "tbody[class='standings']")

            for row in table.find_elements_by_tag_name("tr"):
                cells = row.find_elements_by_tag_name("td")
                entry = self._parse_row(
                    [normalize(cell.text) for cell in cells])
                if entry is not None:
                    team, stats = entry
                    self.table[team] = stats
        finally:
            self.quit()

    def persist_table(self):
        """
        Dump ``self.table`` as JSON to 'league_table.json'
        """
        dump_as_json(self.table, 'league_table.json', 'w')

    def quit(self):
        """
        Quit the browser
        """
        self.browser.quit()