Пример #1
0
    def rushing(self, player_link, year, **kwargs):
        """Build one player's per-game rushing log for a single season.

        The game-log page is downloaded, its first HTML table is normalised by
        the ``RushHash`` column parser registered for that table's column
        layout, passed through ``self.common`` and finally decorated with the
        player's general information.

        Args:
            player_link: URL of the player page. Its last four characters
                (presumably the ``.htm`` extension) are dropped before the
                game-log suffix is appended.
            year: Season to fetch; interpolated into the URL and forwarded to
                ``self.common``.
            **kwargs: Accepted for call compatibility; unused.

        Returns:
            pandas.DataFrame: Player info columns followed by the ``RushHash``
            ``base[1:]``, ``PF``/``PA``, ``receiving``, ``rushing``,
            ``kick_rt``, ``punt_rt``, ``scoring2p`` and ``scoring`` columns.

        Raises:
            KeyError: If the table's column layout has no registered parser.
        """
        # Local import so the method does not depend on a module-level import.
        from io import StringIO

        # Point the player url at the game log for the requested season
        log_url = player_link[:-4] + '/gamelog/%s/' % year

        html = Loader.Loader().load_page(log_url).content.decode()

        # General player info, merged into the frame further down
        gen = PlayerParser.PlayerParser().parse_general_info(html)

        # read_html is given a file-like object: recent pandas releases
        # deprecate passing literal markup as a string.
        df = pd.read_html(StringIO(html))[0]

        # The md5 of the top-level column labels identifies the table layout
        which_cols = hashlib.md5(json.dumps(list(df.columns.levels[0])).encode()).hexdigest()

        # Layouts that have a matching ``md5<hash>`` method on RushHash
        known_layouts = (
            'c3695be2dd2fa9307301dccf047b4e86',
            '7f97f3885d50fcf9b92797810856a89f',
            'aa321161d6f3f5230259dbc4ae67299a',
            '9c11c15180efbf7aec4300fc190cd3a5',
            'ad9a12e06546e3019128fec57cdc9d0e',
            '00f83a7c4b3e891e3c448db700cc9ada',
            '5980508dab2f61013bd07809c5ca0e41',
            'c35b37a5f0f696bfd1576753faffe81c',
            'aed81e3e77b9842532b5efa73458a259',
            '7d21a9a4ab9adde626d633fbd62db5c0',
            '91138c3c08c339b71b8323e2bac3aac7',
            'ddcb0610869ff21799f008209ac6d229',
        )
        if which_cols not in known_layouts:
            raise KeyError('no rushing column parser for layout %s (%s)'
                           % (which_cols, log_url))

        df = getattr(Rushhash.RushHash(), 'md5' + which_cols)(df)

        # send df to the common parser
        df = self.common(df, year)

        df.loc[:, 'Name'] = gen['name']
        df.loc[:, 'Pos'] = gen['position']

        for column, key in (('Throws', 'throws'), ('Height', 'height'),
                            ('Weight', 'weight'), ('DOB_mo', 'bday_mo'),
                            ('DOB_day', 'bday_day'), ('DOB_yr', 'bday_yr'),
                            ('College', 'college')):
            df[column] = gen[key]

        # A fresh instance supplies the column groups so they cannot be
        # affected by whatever the layout parser did to its own instance.
        groups = Rushhash.RushHash()
        df = df[['Name', 'Pos', 'Height', 'Weight', 'DOB_mo', 'DOB_day', 'DOB_yr', 'College'] +
                groups.base[1:] + ['PF', 'PA'] + groups.receiving + groups.rushing +
                groups.kick_rt + groups.punt_rt + groups.scoring2p + groups.scoring]

        return df
Пример #2
0
    def player_links(self, start, *args):
        """Collect player page URLs from the yearly listing pages.

        Args:
            start: First (or only) year to scrape; substituted into
                ``self.url``.
            *args: Optional end year as the first extra argument. When it is
                greater than ``start``, every year from ``start`` through the
                end year (inclusive) is scraped; otherwise only ``start`` is.

        Returns:
            list[str]: Absolute player URLs in first-seen order, with
            duplicates removed.
        """
        base_url = 'https://www.pro-football-reference.com'

        # Compiled once; captures the relative link to each player's page
        link_pattern = re.compile(
            r'data-append-csv=".*?" data-stat="player" csk=".*?" ><a href="(/players/[a-zA-Z]/.*?\.htm)"'
        )

        # A range is only used when an end year greater than start was given
        if args and int(start) < int(args[0]):
            years = range(int(start), int(args[0]) + 1)
        else:
            years = [start]

        links = []
        for year in years:
            # load the positions page for the given year
            content = Loader.Loader().load_page(self.url % year).text

            # The captured links are relative, so prefix the site's base url
            links.extend(base_url + path for path in link_pattern.findall(content))

        # The same player can appear on several pages; keep first occurrences
        return list(dict.fromkeys(links))
Пример #3
0
    def defense(self, player_link, year, **kwargs):
        """Build one player's per-game defensive log for a single season.

        The game-log page is downloaded, its first HTML table is normalised by
        the ``DefHash`` column parser registered for that table's column
        layout, passed through ``self.common`` and finally decorated with the
        player's general information.

        Args:
            player_link: URL of the player page. Its last four characters
                (presumably the ``.htm`` extension) are dropped before the
                game-log suffix is appended.
            year: Season to fetch; interpolated into the URL and forwarded to
                ``self.common``.
            **kwargs: Accepted for call compatibility; unused.

        Returns:
            pandas.DataFrame: Player info columns followed by the ``DefHash``
            ``base[1:]``, ``PF``/``PA``, ``punt_rt``, ``kick_rt``,
            ``scoring``, ``rush_sk`` and ``def_int`` columns.

        Raises:
            KeyError: If the table's column layout has no registered parser.
        """
        # Local import so the method does not depend on a module-level import.
        from io import StringIO

        # Point the player url at the game log for the requested season
        log_url = player_link[:-4] + '/gamelog/%s/' % year

        html = Loader.Loader().load_page(log_url).content.decode()

        # General player info, merged into the frame further down
        gen = PlayerParser.PlayerParser().parse_general_info(html)

        # read_html is given a file-like object: recent pandas releases
        # deprecate passing literal markup as a string.
        df = pd.read_html(StringIO(html))[0]

        # The md5 of the top-level column labels identifies the table layout
        which_cols = hashlib.md5(json.dumps(list(df.columns.levels[0])).encode()).hexdigest()

        # Layouts that have a matching ``md5<hash>`` method on DefHash
        known_layouts = (
            '0c329a15f241e5c132d0d5c7612032c0',
            '58ffdd172c2358c5e5ab2e0a1994252a',
            '141f3f6945aa9495c6580650649f4b8f',
            '109394668745222b0ccbd92bfd0ac4c1',
            '60dfaf4e946c4ae3d47c6d8b430c92a4',
            'fa476dd5c907f86452c016e54b3fe0f8',
        )
        if which_cols not in known_layouts:
            raise KeyError('no defense column parser for layout %s (%s)'
                           % (which_cols, log_url))

        df = getattr(Defhash.DefHash(), 'md5' + which_cols)(df)

        # send df to the common parser
        df = self.common(df, year)

        df.loc[:, 'Name'] = gen['name']
        df.loc[:, 'Pos'] = gen['position']

        for column, key in (('Throws', 'throws'), ('Height', 'height'),
                            ('Weight', 'weight'), ('DOB_mo', 'bday_mo'),
                            ('DOB_day', 'bday_day'), ('DOB_yr', 'bday_yr'),
                            ('College', 'college')):
            df[column] = gen[key]

        # A fresh instance supplies the column groups so they cannot be
        # affected by whatever the layout parser did to its own instance.
        groups = Defhash.DefHash()
        df = df[['Name', 'Pos', 'Height', 'Weight', 'DOB_mo', 'DOB_day', 'DOB_yr', 'College'] +
                groups.base[1:] + ['PF', 'PA'] + groups.punt_rt + groups.kick_rt +
                groups.scoring + groups.rush_sk + groups.def_int]

        return df
Пример #4
0
    def kicking(self, player_link, year, **kwargs):
        """Build one player's per-game kicking log for a single season.

        The game-log page is downloaded, its first HTML table is normalised by
        the ``KickHash`` column parser registered for that table's column
        layout, passed through ``self.common`` and finally decorated with the
        player's general information.

        Args:
            player_link: URL of the player page. Its last four characters
                (presumably the ``.htm`` extension) are dropped before the
                game-log suffix is appended.
            year: Season to fetch; interpolated into the URL and forwarded to
                ``self.common``.
            **kwargs: Accepted for call compatibility; unused.

        Returns:
            pandas.DataFrame: Player info columns followed by the ``KickHash``
            ``base[1:]``, ``PF``/``PA`` and ``scoring`` columns.

        Raises:
            KeyError: If the table's column layout has no registered parser.
        """
        # Local import so the method does not depend on a module-level import.
        from io import StringIO

        # Point the player url at the game log for the requested season
        log_url = player_link[:-4] + '/gamelog/%s/' % year

        html = Loader.Loader().load_page(log_url).content.decode()

        # General player info, merged into the frame further down
        gen = PlayerParser.PlayerParser().parse_general_info(html)

        # read_html is given a file-like object: recent pandas releases
        # deprecate passing literal markup as a string.
        df = pd.read_html(StringIO(html))[0]

        # The md5 of the top-level column labels identifies the table layout
        which_cols = hashlib.md5(json.dumps(list(df.columns.levels[0])).encode()).hexdigest()

        # Layouts that have a matching ``md5<hash>`` method on KickHash
        known_layouts = (
            '080683052961d92b5efd07588e614700',
            'c0fe30e42184e7a59c00c04dc917bb87',
            '7ad30bf95e287937864b02dca25801bf',
        )
        if which_cols not in known_layouts:
            raise KeyError('no kicking column parser for layout %s (%s)'
                           % (which_cols, log_url))

        df = getattr(Kickhash.KickHash(), 'md5' + which_cols)(df)

        # send df to the common parser
        df = self.common(df, year)

        df.loc[:, 'Name'] = gen['name']
        df.loc[:, 'Pos'] = gen['position']

        for column, key in (('Throws', 'throws'), ('Height', 'height'),
                            ('Weight', 'weight'), ('DOB_mo', 'bday_mo'),
                            ('DOB_day', 'bday_day'), ('DOB_yr', 'bday_yr'),
                            ('College', 'college')):
            df[column] = gen[key]

        # A fresh instance supplies the column groups so they cannot be
        # affected by whatever the layout parser did to its own instance.
        groups = Kickhash.KickHash()
        df = df[['Name', 'Pos', 'Height', 'Weight', 'DOB_mo', 'DOB_day', 'DOB_yr', 'College'] +
                groups.base[1:] + ['PF', 'PA'] + groups.scoring]

        return df
Пример #5
0
    def passing(self, player_link, year, **kwargs):
        """Return one player's per-game passing log for ``year``.

        The game-log page is downloaded and its first HTML table is flattened
        to a single header row, stripped of ``Unnamed`` columns, run through
        ``self.common`` and extended with the player's general information.

        Args:
            player_link: URL of the player page. Its last four characters
                (presumably the ``.htm`` extension) are dropped before the
                game-log suffix is appended.
            year: Season to fetch; interpolated into the URL and forwarded to
                ``self.common``.
            **kwargs: Accepted for call compatibility; unused.

        Returns:
            pandas.DataFrame: The cleaned game log with ``Name``, ``Pos``,
            ``Throws``, ``Height``, ``Weight``, ``DOB_mo``, ``DOB_day``,
            ``DOB_yr`` and ``College`` columns added.
        """
        # Game-log url = player url without its last four chars + season path
        log_url = player_link[:-4] + '/gamelog/%s/' % year

        page = Loader.Loader().load_page(log_url)
        html = page.content.decode()

        # General player info, attached to the frame at the end
        gen = PlayerParser.PlayerParser().parse_general_info(html)

        tables = pd.read_html(html)
        df = tables[0]

        # Keep only the lower header row
        df.columns = df.columns.droplevel()

        # The sixth column is relabelled "Home"
        sixth_label = df.columns[5]
        df = df.rename(columns={sixth_label: "Home"})

        # Discard the blank columns pandas labelled "Unnamed..."
        unnamed = df.columns.str.contains('^Unnamed')
        df = df.loc[:, ~unnamed]

        df = self.common(df, year)

        df.loc[:, 'Name'] = gen['name']
        df.loc[:, 'Pos'] = gen['position']

        # Remaining player details, added in this fixed column order
        extra_info = (
            ('Throws', 'throws'),
            ('Height', 'height'),
            ('Weight', 'weight'),
            ('DOB_mo', 'bday_mo'),
            ('DOB_day', 'bday_day'),
            ('DOB_yr', 'bday_yr'),
            ('College', 'college'),
        )
        for column, key in extra_info:
            df[column] = gen[key]

        return df
Пример #6
0
    def receiving(self, player_link, year, **kwargs):
        """Build one player's per-game receiving log for a single season.

        The game-log page is downloaded, its first HTML table is normalised by
        the ``RecHash`` column parser registered for that table's column
        layout, passed through ``self.common`` and finally decorated with the
        player's general information.

        Args:
            player_link: URL of the player page. Its last four characters
                (presumably the ``.htm`` extension) are dropped before the
                game-log suffix is appended.
            year: Season to fetch; interpolated into the URL and forwarded to
                ``self.common``.
            **kwargs: Accepted for call compatibility; unused.

        Returns:
            pandas.DataFrame: Player info columns followed by the ``RecHash``
            ``base[1:]``, ``PF``/``PA``, ``receiving``, ``rushing``,
            ``kick_rt``, ``punt_rt``, ``scoring2p`` and ``scoring`` columns.

        Raises:
            KeyError: If the table's column layout has no registered parser.
        """
        # Local import so the method does not depend on a module-level import.
        from io import StringIO

        # Point the player url at the game log for the requested season
        log_url = player_link[:-4] + '/gamelog/%s/' % year

        html = Loader.Loader().load_page(log_url).content.decode()

        # General player info, merged into the frame further down
        gen = PlayerParser.PlayerParser().parse_general_info(html)

        # read_html is given a file-like object: recent pandas releases
        # deprecate passing literal markup as a string.
        df = pd.read_html(StringIO(html))[0]

        # The md5 of the top-level column labels identifies the table layout
        which_cols = hashlib.md5(json.dumps(list(df.columns.levels[0])).encode()).hexdigest()

        # Layouts that have a matching ``md5<hash>`` method on RecHash
        known_layouts = (
            'b3c4237d9a10de8cfaad61852cb552c4',
            'bcb96297b50fb2120f475e8e05fbabcd',
            '4560c290b45e942c16cc6d7811345fce',
            '4c82a489ec5b2c943e78c9018dcbbca1',
            'e8ffc7202223bb253e92da83b76e9944',
            '50fcceaa170b1a1e501e3f40548e403d',
            'e160e714b29305ecfecf513cbf84b80f',
            '111e8480632f73642d7e20acbdbe6b16',
            'adc05c5af0f88775d3605d02c831c0ed',
            'bfbf86ae0485a0a70692ae04124449b9',
            '6b4698269dd34a823cf6b233c6165614',
            '7f97f3885d50fcf9b92797810856a89f',
            'aa321161d6f3f5230259dbc4ae67299a',
            '1193d47266d4acdcf1b6fca165121100',
            '52589e869a13d76c6d0dbf066cab536f',
            'd522b9357244c20714a3b21f8f404918',
        )
        if which_cols not in known_layouts:
            raise KeyError('no receiving column parser for layout %s (%s)'
                           % (which_cols, log_url))

        df = getattr(Rechash.RecHash(), 'md5' + which_cols)(df)

        # send df to the common parser
        df = self.common(df, year)

        df.loc[:, 'Name'] = gen['name']
        df.loc[:, 'Pos'] = gen['position']

        for column, key in (('Throws', 'throws'), ('Height', 'height'),
                            ('Weight', 'weight'), ('DOB_mo', 'bday_mo'),
                            ('DOB_day', 'bday_day'), ('DOB_yr', 'bday_yr'),
                            ('College', 'college')):
            df[column] = gen[key]

        # A fresh instance supplies the column groups so they cannot be
        # affected by whatever the layout parser did to its own instance.
        groups = Rechash.RecHash()
        df = df[['Name', 'Pos', 'Height', 'Weight', 'DOB_mo', 'DOB_day', 'DOB_yr', 'College'] +
                groups.base[1:] + ['PF', 'PA'] + groups.receiving + groups.rushing +
                groups.kick_rt + groups.punt_rt + groups.scoring2p + groups.scoring]

        return df
Пример #7
0
    def receiving(self, url=None, **kwargs):
        """Parse a receiver's season-by-season table into a DataFrame.

        Args:
            url: Player page to download. When falsy, the page source is taken
                from the ``html`` keyword argument instead (useful for tests).
            **kwargs: ``html`` - page source used when ``url`` is not given.

        Returns:
            pandas.DataFrame: One row per season for TE/WR players. For RB/FB
            players the result of ``self.rushing`` is returned instead. An
            empty DataFrame is returned for any other position or when the
            table does not have the expected 27 columns.

        Raises:
            ValueError: If neither ``url`` nor ``html`` is supplied.
        """
        # Local import so the method does not depend on a module-level import.
        from io import StringIO

        if url:
            html = Loader.Loader().load_page(url).text
        else:
            html = kwargs.get('html')
            if html is None:
                raise ValueError("either 'url' or an 'html' keyword argument is required")

        # Scrape general stats
        general_stats = self.parse_general_info(html)

        # Receivers are parsed here; backs share the table structure and are
        # handed to the rushing parser; everything else is skipped.
        if not any(x in general_stats['position'] for x in ['TE', 'WR']):
            if any(x in general_stats['position'] for x in ['RB', 'FB']):
                print(url, " is a ", general_stats['position'],
                      " calling rushing method instead")
                # Forward kwargs so html passed in for testing is not lost.
                # NOTE(review): assumes rushing shares the (url=None, **kwargs)
                # signature of the sibling parsers - confirm.
                return self.rushing(url, **kwargs)
            print(
                url,
                " is not a receiver we can parse so we're skipping this player"
            )
            return pd.DataFrame()

        # read_html is given a file-like object when the markup is a str:
        # recent pandas releases deprecate literal markup strings.
        df = pd.read_html(StringIO(html) if isinstance(html, str) else html)[0]

        # Copy so the column assignments below do not write into a slice
        df = df.iloc[:, :27].copy()

        # rename columns from original multirow columns
        cols = [
            'Year', 'Age', 'Tm', 'Pos', 'No', 'G', 'GS', 'Tgt', 'Rec',
            'Rec_Yds', 'Y/R', 'Rec_TD', 'Rec_Lng', 'R/G', 'Rec_Y/G',
            'Ctch%', 'Rush', 'Rush_Yds', 'Rush_TD', 'Rush_Lng', 'Y/A',
            'Rush_Y/G', 'A/G', 'YScm', 'RRTD', 'Fmb', 'AV'
        ]
        try:
            df.columns = cols
        except ValueError:
            print('Column mismatch, check url: ', url,
                  'skipping and returning blank DF')
            return pd.DataFrame()

        # remove the career totals row (its Age is not numeric)
        df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
        df = df[~df['Age'].isna()].copy()

        # Stringify first so .str works even if pandas parsed Year as numbers,
        # then strip the '+' / '*' award markers as literal characters.
        year = df['Year'].astype(str)
        year = year.str.replace('+', '', regex=False).str.replace('*', '', regex=False)
        df['Year'] = year

        # rows without a year (stringified to 'nan') are dropped
        df = df[df['Year'] != 'nan'].copy()
        df['Year'] = pd.to_numeric(df['Year'])

        # strip the % sign and convert to float; blanks become NaN
        catch_pct = df['Ctch%'].astype(str).str.replace('%', '', regex=False)
        df['Ctch%'] = pd.to_numeric(catch_pct, errors='coerce')

        # uppercase some qualitatives
        df['Tm'] = df['Tm'].str.upper()

        # Insert general scraped info from player page
        for column, key in (('Name', 'name'), ('Throws', 'throws'),
                            ('Height', 'height'), ('Weight', 'weight'),
                            ('DOB_mo', 'bday_mo'), ('DOB_day', 'bday_day'),
                            ('DOB_yr', 'bday_yr'), ('College', 'college')):
            df[column] = general_stats[key]

        # This is hacky but position info isn't always contained in every row
        if df['Pos'].isnull().values.any():
            df['Pos'] = general_stats['position']
        df['Pos'] = df['Pos'].str.upper()

        # Player info first, then the table columns from 'Tm' onwards
        df = df[[
            'Name', 'Year', 'Age', 'Throws', 'Height', 'Weight', 'DOB_mo',
            'DOB_day', 'DOB_yr', 'College'
        ] + cols[2:]]

        return df
Пример #8
0
    def kicking(self, url=None, **kwargs):
        """Parse a kicker's or punter's season-by-season table into a DataFrame.

        Args:
            url: Player page to download. When falsy, the page source is taken
                from the ``html`` keyword argument instead (useful for tests).
            **kwargs: ``html`` - page source used when ``url`` is not given.

        Returns:
            pandas.DataFrame: One row per season. The label ``Lng`` appears
            twice (once after ``scr_FGM`` and once after ``Yds``), as in the
            original column list. An empty DataFrame is returned when the
            position contains neither ``K`` nor ``P`` or when the table does
            not have the expected 30 columns.

        Raises:
            ValueError: If neither ``url`` nor ``html`` is supplied.
        """
        # Local import so the method does not depend on a module-level import.
        from io import StringIO

        if url:
            html = Loader.Loader().load_page(url).text
        else:
            html = kwargs.get('html')
            if html is None:
                raise ValueError("either 'url' or an 'html' keyword argument is required")

        # Scrape general stats
        general_stats = self.parse_general_info(html)

        # Ensure we're only parsing kickers and punters
        if not any(x in general_stats['position'] for x in ['K', 'P']):
            print(
                url,
                " is not a kicker we can parse so we're skipping this player")
            return pd.DataFrame()

        # read_html is given a file-like object when the markup is a str:
        # recent pandas releases deprecate literal markup strings.
        df = pd.read_html(StringIO(html) if isinstance(html, str) else html)[0]

        # sometimes there's unneeded cols; copy so later assignments are safe
        df = df.iloc[:, :30].copy()

        # The two "Lng" columns get distinct working names: with duplicated
        # labels the final label-based selection would return each of them
        # twice. The public "Lng" label is restored before returning.
        cols = [
            'Year', 'Age', 'Tm', 'Pos', 'No.', 'G', 'GS', '0-19FGA',
            '0-19FGM', '20-29FGA', '20-29FGM', '30-39FGA', '30-39FGM',
            '40-49FGA', '40-49FGM', '50+FGA', '50+FGM', 'scr_FGA',
            'scr_FGM', 'FG_Lng', 'scr_FG%', 'scr_XPA', 'scr_XPM', 'scr_XP%',
            'Pnt', 'Yds', 'Pnt_Lng', 'Blck', 'Y/P', 'AV'
        ]

        try:
            df.columns = cols
        except ValueError:
            # Without the expected labels nothing below can work, so bail out
            # the same way the sibling parsers do.
            print('Column mismatch, check url: ', url,
                  'skipping and returning blank DF')
            return pd.DataFrame()

        # remove the career totals row (its Age is not numeric)
        df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
        df = df[~df['Age'].isna()].copy()

        # strip the '+' / '*' award markers as literal characters
        year = df['Year'].astype(str)
        year = year.str.replace('+', '', regex=False).str.replace('*', '', regex=False)
        df['Year'] = year

        # rows without a year (stringified to 'nan') are dropped
        df = df[df['Year'] != 'nan'].copy()
        df['Year'] = pd.to_numeric(df['Year'])

        # uppercase some qualitatives
        df['Tm'] = df['Tm'].str.upper()

        # Insert general scraped info from player page
        for column, key in (('Name', 'name'), ('Throws', 'throws'),
                            ('Height', 'height'), ('Weight', 'weight'),
                            ('DOB_mo', 'bday_mo'), ('DOB_day', 'bday_day'),
                            ('DOB_yr', 'bday_yr'), ('College', 'college')):
            df[column] = general_stats[key]

        # This is hacky but position info isn't always contained in every row
        if df['Pos'].isnull().values.any():
            df['Pos'] = general_stats['position']

        # Player info first, then the table columns from 'Tm' onwards
        df = df[[
            'Name', 'Year', 'Age', 'Throws', 'Height', 'Weight', 'DOB_mo',
            'DOB_day', 'DOB_yr', 'College'
        ] + cols[2:]]

        # Restore the original (duplicated) public label for both columns
        df = df.rename(columns={'FG_Lng': 'Lng', 'Pnt_Lng': 'Lng'})

        return df
Пример #9
0
    def defense(self, url=None, **kwargs):
        """Parse a defender's season-by-season table into a DataFrame.

        Args:
            url: Player page to download. When falsy, the page source is taken
                from the ``html`` keyword argument instead (useful for tests).
            **kwargs: ``html`` - page source used when ``url`` is not given.

        Returns:
            pandas.DataFrame: One row per season, or an empty DataFrame when
            the table does not have the expected 22 columns.

        Raises:
            ValueError: If neither ``url`` nor ``html`` is supplied.
        """
        # Local import so the method does not depend on a module-level import.
        from io import StringIO

        if url:
            html = Loader.Loader().load_page(url).text
        else:
            html = kwargs.get('html')
            if html is None:
                raise ValueError("either 'url' or an 'html' keyword argument is required")

        # Scrape general stats
        general_stats = self.parse_general_info(html)

        # read_html is given a file-like object when the markup is a str:
        # recent pandas releases deprecate literal markup strings.
        df = pd.read_html(StringIO(html) if isinstance(html, str) else html)[0]

        # Copy so the column assignments below do not write into a slice
        df = df.iloc[:, :22].copy()

        cols = [
            'Year', 'Age', 'Tm', 'Pos', 'No.', 'G', 'GS', 'Int', 'Yds', 'TD',
            'Lng', 'PD', 'FF', 'Fmb', 'FR', 'Fmb_Yds', 'Fmb_TD', 'Sk', 'Tkl',
            'Ast', 'Sfty', 'AV'
        ]

        try:
            df.columns = cols
        except ValueError:
            print('Column mismatch, check url: ', url,
                  'skipping and returning blank DF')
            return pd.DataFrame()

        # remove the career totals row (its Age is not numeric)
        df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
        df = df[~df['Age'].isna()].copy()

        # Stringify first: the 'nan' comparison below only matches once the
        # missing years are strings, and .str needs string values.
        year = df['Year'].astype(str)
        # strip the '+' / '*' award markers as literal characters
        year = year.str.replace('+', '', regex=False).str.replace('*', '', regex=False)
        df['Year'] = year

        # rows without a year (stringified to 'nan') are dropped
        df = df[df['Year'] != 'nan'].copy()
        df['Year'] = pd.to_numeric(df['Year'])

        # uppercase some qualitatives
        df['Tm'] = df['Tm'].str.upper()

        # Insert general scraped info from player page
        for column, key in (('Name', 'name'), ('Throws', 'throws'),
                            ('Height', 'height'), ('Weight', 'weight'),
                            ('DOB_mo', 'bday_mo'), ('DOB_day', 'bday_day'),
                            ('DOB_yr', 'bday_yr'), ('College', 'college')):
            df[column] = general_stats[key]

        # This is hacky but position info isn't always contained in every row
        if df['Pos'].isnull().values.any():
            df['Pos'] = general_stats['position']
        df['Pos'] = df['Pos'].str.upper()

        # Player info first, then the table columns from 'Tm' onwards
        df = df[[
            'Name', 'Year', 'Age', 'Throws', 'Height', 'Weight', 'DOB_mo',
            'DOB_day', 'DOB_yr', 'College'
        ] + cols[2:]]

        return df
Пример #10
0
    def passing(self, url=None, **kwargs):
        """Parse a quarterback's passing table, joined with rushing/receiving.

        Args:
            url: Player page to download. When falsy, the page source is taken
                from the ``html`` keyword argument instead (useful for tests).
            **kwargs: ``html`` - page source used when ``url`` is not given.

        Returns:
            pandas.DataFrame: Passing rows left-joined with the rushing and
            receiving rows on ``Year``, ``Age``, ``Tm``, ``Pos``, ``No.``,
            ``G`` and ``GS``. An empty DataFrame is returned when the position
            does not contain ``QB``.

        Raises:
            ValueError: If neither ``url`` nor ``html`` is supplied.
        """
        # Local import so the method does not depend on a module-level import.
        from io import StringIO

        def _clean_seasons(frame):
            # Drop the career-total and year-less rows; make Age/Year numeric.
            frame['Age'] = pd.to_numeric(frame['Age'], errors='coerce')
            frame = frame[~frame['Age'].isna()].copy()

            # Stringify first: the 'nan' comparison only matches once missing
            # years are strings. '+' and '*' are stripped as literal chars.
            year = frame['Year'].astype(str)
            year = year.str.replace('+', '', regex=False).str.replace('*', '', regex=False)
            frame['Year'] = year

            frame = frame[frame['Year'] != 'nan'].copy()
            frame['Year'] = pd.to_numeric(frame['Year'])
            return frame

        if url:
            html = Loader.Loader().load_page(url).text
        else:
            html = kwargs.get('html')
            if html is None:
                raise ValueError("either 'url' or an 'html' keyword argument is required")

        # Scrape general stats
        general_stats = self.parse_general_info(html)

        # Ensure we're only parsing QB's
        if not any(x in general_stats['position'] for x in ['QB']):
            print(
                url,
                " is not a quarterback we can parse so we're skipping this player"
            )
            return pd.DataFrame()

        # read_html is given a file-like object when the markup is a str:
        # recent pandas releases deprecate literal markup strings.
        df = pd.read_html(StringIO(html) if isinstance(html, str) else html)[0]
        df = _clean_seasons(df)

        df['GS'] = pd.to_numeric(df['GS'])

        # Insert general scraped info from player page
        for column, key in (('Pos', 'position'), ('Name', 'name'),
                            ('Throws', 'throws'), ('Height', 'height'),
                            ('Weight', 'weight'), ('DOB_mo', 'bday_mo'),
                            ('DOB_day', 'bday_day'), ('DOB_yr', 'bday_yr'),
                            ('College', 'college')):
            df[column] = general_stats[key]

        # uppercase some qualitatives
        df['Tm'] = df['Tm'].str.upper()
        df['Pos'] = df['Pos'].str.upper()

        # Parse out rushing and receiving information and append to the passing info
        soup = BeautifulSoup(html, 'lxml')

        rush_cols = [
            'Year', 'Age', 'Tm', 'Pos', 'No.', 'G', 'GS', 'Rush',
            'Rush_Yds', 'Rush_TD', 'Rush_Lng', 'Rush_Y/A', 'Rush_Y/G',
            'A/G', 'Tgt', 'Rec', 'Rec_Yds', 'Y/R', 'Rec_TD', 'Rec_Lng',
            'R/G', 'Rec_Y/G', 'Ctch%', 'YScm', 'RRTD', 'Fmb'
        ]

        # Stays None unless a usable rushing table is found
        rush_df = None

        # The rushing table sits inside an HTML comment, so search the comments
        for comment in soup.findAll(
                text=lambda text: isinstance(text, Comment)):
            if 'id="div_rushing_and_receiving">' not in comment:
                continue

            candidate = pd.read_html(StringIO(str(comment)))[0]
            candidate = candidate.iloc[:, :26].copy()

            try:
                candidate.columns = rush_cols
            except ValueError:
                # An unexpected layout cannot be cleaned below; skip it so the
                # passing rows are still returned.
                print('Column mismatch, check url: ', url)
                continue

            candidate = _clean_seasons(candidate)

            # This is hacky but position info isn't always contained in every row
            candidate['Pos'] = general_stats['position']

            # uppercase some qualitatives
            candidate['Tm'] = candidate['Tm'].str.upper()
            candidate['Pos'] = candidate['Pos'].str.upper()

            rush_df = candidate

        # if we didn't get any rushing info, create an empty df
        if rush_df is None:
            rush_df = pd.DataFrame(columns=rush_cols)

        # merge the two DataFrames on overlapping columns and return
        combined_df = pd.merge(
            df,
            rush_df,
            on=['Year', 'Age', 'Tm', 'Pos', 'No.', 'G', 'GS'],
            how='left')

        return combined_df
Пример #11
0
def defense_req():
    """Print a notice, then load ``urls['defense']`` and return the loader's result."""
    print('loaded defense html')
    loader = Loader.Loader()
    target = urls['defense']
    return loader.load_page(target)
Пример #12
0
def kicking_req():
    """Print a notice, then load ``urls['kicking']`` and return the loader's result."""
    print('loaded kicking html')
    loader = Loader.Loader()
    target = urls['kicking']
    return loader.load_page(target)
Пример #13
0
def rushing_req():
    """Print a notice, then load ``urls['rushing']`` and return the loader's result."""
    print('loaded rushing html')
    loader = Loader.Loader()
    target = urls['rushing']
    return loader.load_page(target)
Пример #14
0
def passing_req():
    """Print a notice, then load ``urls['passing']`` and return the loader's result."""
    print('loaded passing html')
    loader = Loader.Loader()
    target = urls['passing']
    return loader.load_page(target)