Example #1
def load_stats_tables_from_history_page(url):
    """Load all the prediction tables from a Numberfire history page"""
    soup = getSoupFromURL(url)
    #salary = load_player_salary_table(soup)
    projection_months = [
        '%s-schedule' % month for month in [
            'March', 'April', 'May', 'June', 'July', 'August', 'September',
            'October'
        ]
    ]
    month_tables = []
    for month in projection_months:
        month_schedule = soup.find('div', attrs={'id': month})
        month_table = load_player_history_table(month_schedule)
        if month_table is not None:
            month_tables.append(month_table)
    if month_tables:
        all_predictions = pandas.concat(month_tables)
        all_predictions.sort_index(inplace=True)
        if all_predictions.index.duplicated().any():
            print 'Duplicate games scraped!'
            import IPython
            IPython.embed()
    else:
        all_predictions = None
    return all_predictions
Example #2
def load_overview_pages(players):
  """
  Hit the overview page and load gamelog_url_list for each of the players in the player dict.
  Maybe this should be in the webio submodule? I am leaving it here since it controls scraping program flow.
  :param players: player dict
  :return dict: player dict
  """
  # Helper function to guess which position a player plays from the overview table of stats.
  # Just grab the position from the most recent year in which it was defined, and return that.
  def quick_position_guess(overview_table):
    return overview_table.dropna(subset=['Pos'])['Pos'].iloc[-1]
  pbar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()])
  print 'Accessing and parsing overview pages...'
  for i, (bref_id, player_dict) in pbar(list(enumerate(players.items()))):
    overview_soup = getSoupFromURL(players[bref_id]['overview_url'])
    players[bref_id]['overview_url_content'] = overview_soup.text
    # the links to each year's game logs are in <li> tags, and the text contains 'Game Logs'
    # so we can use those to pull out our urls.
    for li in overview_soup.find_all('li'):
      if 'Game Logs' in li.getText():
        game_log_links = li.findAll('a')
        for game_log_link in game_log_links:
          players[bref_id]['gamelog_url_list'].append('http://www.basketball-reference.com' + game_log_link.get('href'))
    player_name = overview_soup.find('h1').text
    players[bref_id]['info']['name'] = player_name
    # Read (guess?) player's position
    overview_table = dfFromOverviewPage(overview_soup)
    if len(overview_table.dropna(subset=['Pos'])) > 0:
      players[bref_id]['info']['pos'] = quick_position_guess(overview_table)
    else:
      players[bref_id]['info']['pos'] = '?'  # only happens for players with no listed position; setting a default avoids exceptions later
  return players
Example #3
def load_positions_for_day(sport, game_date, game='FanDuel'):
    ''' Get salaries and positions for eligible players for the given day / fantasy site.
    :param basestring sport: 'nba' or 'mlb'
    :param datetime.datetime game_date: day to load
    :param basestring game: fantasy site name, e.g. 'FanDuel'
    :return: salary/position data for the day
    '''
    month, day = game_date.month, game_date.day
    url_template = nba_url_template if sport == 'nba' else mlb_url_template
    day_part = '%02d' % day
    url = url_template.format(month=month,
                              day=day_part,
                              game_code=game_code_dict[game])
    soup = getSoupFromURL(url)
    if sport == 'nba':
        all_rows = soup.findAll('tr')
        player_rows = filter(lambda r: is_actual_player_row('nba', r),
                             all_rows)
        parsed_rows = map(parse_player_row, player_rows)
        day_salaries = pandas.DataFrame.from_records(
            parsed_rows, columns=['Player', 'Position', 'Salary'])
        day_salaries["Salary"] = day_salaries["Salary"].apply(int)
        day_salaries["Player"] = day_salaries["Player"].apply(
            lambda x: x.strip())
        day_salaries["Position"] = day_salaries["Position"].apply(
            lambda x: x.strip())
        day_salaries.set_index("Player", inplace=True)
    else:
        day_salaries = parse_mlb_csv_from_soup(soup)
    return day_salaries
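
A hedged usage sketch for the NBA branch above (nba_url_template, mlb_url_template and game_code_dict are module-level values defined elsewhere in the project; the column layout follows from the code):

import datetime

game_date = datetime.datetime(2015, 3, 14)
salaries = load_positions_for_day('nba', game_date, game='FanDuel')
# `salaries` is a pandas DataFrame indexed by 'Player', with stripped string
# 'Position' values and integer 'Salary' values
print salaries.head()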
Example #4
def load_odds_for_day(game_day):
    day_part = '%02d' % game_day.day
    month_part = '%02d' % game_day.month
    url = nba_template.format(year=game_day.year,
                              month=month_part,
                              day=day_part)
    soup = getSoupFromURL(url)
    odds_tables = soup.findAll('table', {'class': 'tbl-odds'})
    if len(odds_tables) < 1:
        print 'Hit some weird (intermittent?) bug with no odds tables being found. Needs more investigation!'
        IPython.embed()
    if odds_tables[0].text == u'No games to display':
        return None
    try:
        odds = list(
            itertools.chain.from_iterable(
                filter(lambda x: x is not None,
                       [parse_odds_table(ot) for ot in odds_tables])))
    except TypeError:
        IPython.embed()
    df = pd.DataFrame(
        odds,
        columns=['Team', 'spread', 'vig', 'scoreline', 'rl_over', 'rl_under'])
    df.set_index('Team', drop=True, inplace=True)
    return df
Example #5
def get_nbc_starting_pitchers(game_day):
  # TODO(jgershen): we should handle doubleheaders more gracefully here
  soup = getSoupFromURL(url)  # `url` is a module-level constant here (presumably the NBC MLB probables page)
  main_div = soup.find('div', id='shsMLBprobables')
  main_div_children = list(main_div.children)
  game_day_header_re = game_day.strftime('%b\. %-d, %Y')
  for i, child in enumerate(main_div_children):
    if hasattr(child, 'text') and re.search(game_day_header_re, child.text):
      break
  else:
    print "Couldn't find stats for " + game_day.strftime('%b. %-d, %Y')
    import IPython
    IPython.embed()
    return None

  actual_div = main_div_children[i+1]
  game_rows = actual_div.findAll('tr', {'class': "shsRow0Row"}) + actual_div.findAll('tr', {'class': "shsRow1Row"})
  teams = []
  for row in game_rows:
    teams += parse_game_row(row)
  team_df = pd.DataFrame(teams, columns=['Tm', 'starter', 'HomeAway', 'Opp'])

  # Scraped it successfully. Now, map the names/teams of the scraped pitchers to bref IDs
  team_df['starter_bref_id'] = team_df.apply(lambda team_row: name2brefid(team_row['starter'], team_row['Tm']),
                                             axis=1)
  team_df.set_index(['Tm'], inplace=True)
  return team_df
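
A small note on the date matching above: '%-d' (day without a leading zero) is a glibc/BSD strftime extension, and the backslash in '%b\. %-d, %Y' survives into the result, so the formatted string is used directly as a regex that matches the dotted month abbreviation presumably printed on the NBC page. A quick illustration:

import datetime
import re

game_day = datetime.datetime(2015, 6, 5)
pattern = game_day.strftime('%b\. %-d, %Y')  # -> 'Jun\. 5, 2015'; the \. matches a literal dot
print re.search(pattern, 'Probable pitchers for Jun. 5, 2015') is not None  # True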
Example #6
def dfFromGameLogURL(url):
    """Takes a url of a player's game log for a given year, returns a DataFrame"""
    glsoup = getSoupFromURL(url)
    reg_season_table = glsoup.findAll('table', attrs={'id': 'pgl_basic'})  # regular season table
    playoff_table = glsoup.findAll('table', attrs={'id': 'pgl_basic_playoffs'})  # playoff table
    # parse the table header.  we'll use this for the creation of the DataFrame
    header = []
    # use the playoff table to get the header if the guy never played in the regular season this year.
    header_table = reg_season_table if len(reg_season_table) else playoff_table
    for th in header_table[0].findAll('th'):
        if not th.getText() in header:
            header.append(th.getText())
    # add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly
    header[5] = u'HomeAway'
    header.insert(7, u'WinLoss')
    reg = soupTableToDF(reg_season_table, header)
    playoff = soupTableToDF(playoff_table, header)

    if reg is None:
        return playoff
    elif playoff is None:
        return reg
    else:
        return pandas.concat([reg, playoff])
Example #7
def dfFromGameLogURL(url):
    """Takes a url of a player's game log for a given year, returns a DataFrame"""
    glsoup = getSoupFromURL(url)
    if not glsoup:  #Hmm, this really shouldn't happen?
        logging.warning("No soup parsed from %s", url)
        return None
    stats_table = glsoup.findAll('table', attrs={'class': 'stats_table'})  # game log tables all carry this class
    # parse the table header.  we'll use this for the creation of the DataFrame
    header = []
    if not stats_table:
        return None
    for th in stats_table[0].find("thead").findAll('th'):
        if not th.getText() in header:
            header.append(th.getText())
    # add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly
    header[5] = u'HomeAway'
    year = url[-4:] if re.search('(?P<year>\d+)$', url) else datetime.datetime.today().year
    date_column = header.index("Date")
    # turn soup of the table into a list o' lists
    stats_table = soupTableToTable(stats_table)
    # Run cleanup for MLB tables on baseball-reference.com -- turn dates into actual dates.
    for row_ix in range(len(stats_table)):
        raw_date = stats_table[row_ix][date_column]
        # Remove non-ASCII characters from the date str and replace with single spaces
        # (sometimes the space between month and day is a whacky unicode char; thx baseball-reference.)
        raw_date = re.sub(r'[^\x00-\x7F]+', ' ', raw_date)
        # Ignore if the game was suspended and resumed later
        raw_date = re.sub(r'susp', '', raw_date)
        if '(' not in raw_date and len(raw_date):
            stats_table[row_ix][date_column] = parser.parse(raw_date + ' ' +
                                                            str(year))
        elif raw_date:
            # This is a doubleheader! Assign doubleheaders to "hours".
            # This doesn't do anything smart, except keep the data indexed by separate values so that
            # it could conceivably be retrieved later.
            dateparts = re.match(
                "(?P<month>\w+) (?P<day>\d+) ?\((?P<gameno>\d+)\)", raw_date)
            assembled_date = parser.parse(
                dateparts.group("month") + " " + dateparts.group("day") + " " +
                dateparts.group("gameno") + ":00" + " " + str(year))
            stats_table[row_ix][date_column] = assembled_date
        else:
            # There's not a date here -- it's probably the EOY summary row.
            # It could also be a trade notification? Either way, ignore it.
            continue
    # Discard EOY summary row
    stats_table = stats_table[:-1]
    # Remove any rows which contain "Player went from" -- trade notifications sneaking in there
    stats_table = filter(
        lambda row: not any(
            isinstance(cell, basestring) and cell.startswith(
                'Player went from') for cell in row), stats_table)
    # Use common function to turn our cleaned-up stats table into a dataframe
    parsed_df = parsedTableToDF(stats_table, header, date_index=date_column)
    return parsed_df
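
The doubleheader branch above leans on dateutil reading the game number as an hour, so the two games of a doubleheader get distinct index timestamps. A minimal illustration (assuming python-dateutil is installed):

from dateutil import parser

# An ordinary game row, e.g. 'Jul 4', plus the year:
print parser.parse('Jul 4 2015')       # 2015-07-04 00:00:00
# Game 2 of a doubleheader, 'Jul 4 (2)', is assembled as 'Jul 4 2:00 2015':
print parser.parse('Jul 4 2:00 2015')  # 2015-07-04 02:00:00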
Example #8
def get_active_players():
  letters = list('abcdefghijklmnopqrstuvwxyz')
  player_and_url_list = []
  print 'Checking currently active players on baseball-reference.com...'
  pbar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()])
  for letter in pbar(letters):
    letter_page = getSoupFromURL('http://www.baseball-reference.com/players/%s/' % (letter))
    # we don't just need active players (<b> tags), we need anyone who played in 2015!
    prefix_sections = letter_page.findAll('pre')
    for section in prefix_sections:
      player_and_url_list += list(_parse_bsbr_prefix_section(section))
  bref_id_dict = dict(player_and_url_list)
  return bref_id_dict
Example #9
def dfFromGameLogURL(url):
  """Takes a url of a player's game log for a given year, returns a DataFrame"""
  glsoup = getSoupFromURL(url)
  if not glsoup: #Hmm, this really shouldn't happen?
    logging.warning("No soup parsed from %s", url)
    return None
  stats_table = glsoup.findAll('table', attrs={'class': 'stats_table'})  # game log tables all carry this class
  # parse the table header.  we'll use this for the creation of the DataFrame
  header = []
  if not stats_table:
    return None
  for th in stats_table[0].find("thead").findAll('th'):
    if not th.getText() in header:
      header.append(th.getText())
  # add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly
  header[5] = u'HomeAway'
  year = url[-4:] if re.search('(?P<year>\d+)$', url) else datetime.datetime.today().year
  date_column = header.index("Date")
  # turn soup of the table into a list o' lists
  stats_table = soupTableToTable(stats_table)
  # Run cleanup for MLB tables on baseball-reference.com -- turn dates into actual dates.
  for row_ix in range(len(stats_table)):
    raw_date = stats_table[row_ix][date_column]
    # Remove non-ASCII characters from the date str and replace with single spaces
    # (sometimes the space between month and day is a whacky unicode char; thx baseball-reference.)
    raw_date = re.sub(r'[^\x00-\x7F]+',' ', raw_date)
    # Ignore if the game was suspended and resumed later
    raw_date = re.sub(r'susp','',raw_date)
    if '(' not in raw_date and len(raw_date):
      stats_table[row_ix][date_column] = parser.parse(raw_date + ' ' + str(year))
    elif raw_date:
      # This is a doubleheader! Assign doubleheaders to "hours".
      # This doesn't do anything smart, except keep the data indexed by separate values so that
      # it could conceivably be retrieved later.
      dateparts = re.match("(?P<month>\w+) (?P<day>\d+) ?\((?P<gameno>\d+)\)", raw_date)
      assembled_date = parser.parse(dateparts.group("month") + " " + dateparts.group("day") + " " +
                                    dateparts.group("gameno") + ":00" + " " + str(year))
      stats_table[row_ix][date_column] = assembled_date
    else:
      # There's not a date here -- it's probably the EOY summary row.
      # It could also be a trade notification? Either way, ignore it.
      continue
  # Discard EOY summary row
  stats_table = stats_table[:-1]
  # Remove any rows which contain "Player went from" -- trade notifications sneaking in there
  stats_table = filter(lambda row: not any(isinstance(cell, basestring) and cell.startswith('Player went from') for cell in row), stats_table)
  # Use common function to turn our cleaned-up stats table into a dataframe
  parsed_df = parsedTableToDF(stats_table, header, date_index=date_column)
  return parsed_df
Example #10
def get_active_players(letters=list('abcdefghijklmnopqrstuvwxyz')):
  players = []
  print 'Loading currently active players from basketball-reference.com...'
  pbar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()])
  for letter in pbar(letters):
    letter_page = getSoupFromURL('http://www.basketball-reference.com/players/%s/' % (letter))
    # we know that all the currently active players have <strong> tags, so we'll limit our names to those
    current_names = letter_page.findAll('strong')
    for n in current_names:
      name_data = n.children.next()
      full_url = 'http://www.basketball-reference.com' + name_data.attrs['href']
      bref_id = bbr_id_regex.match(full_url).group('pid')
      players.append((bref_id, full_url))
  players = dict(players)
  return players
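
bbr_id_regex is defined elsewhere in the module; this snippet only needs it to expose a 'pid' group. A hypothetical definition consistent with basketball-reference player URLs (an assumption for illustration, not the project's actual regex) might be:

import re

# e.g. http://www.basketball-reference.com/players/j/jamesle01.html -> pid = 'jamesle01'
bbr_id_regex = re.compile(r'http://www\.basketball-reference\.com/players/[a-z]/(?P<pid>\w+)\.html')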
Example #11
def load_overview_pages(players):
  pbar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()])
  print 'Accessing and parsing overview pages...'
  for i, (bref_id, player_dict) in pbar(list(enumerate(players.items()))):
    if players[bref_id]['overview_url_content'] is None:
      overview_soup = getSoupFromURL(players[bref_id]['overview_url'])
      players[bref_id]['overview_url_content'] = overview_soup.text
      # the links to each year's game logs are in <li> tags, and the text contains 'Game Logs'
      # so we can use those to pull out our urls.
      game_log_links = []
      for li in overview_soup.find_all('li'):
        if 'Game Logs' in li.getText():
          game_log_links = li.findAll('a')
      for game_log_link in game_log_links:
        players[bref_id]['gamelog_url_list'].append('http://www.baseball-reference.com' + game_log_link.get('href'))
  return players
Example #12
def load_odds_for_day(game_day):
  # TODO(jgershen): we should handle doubleheaders more gracefully here
  day_part = '%02d' % game_day.day
  month_part = '%02d' % game_day.month
  url = mlb_template.format(year=game_day.year, month=month_part, day=day_part)
  soup = getSoupFromURL(url)
  odds_tables = soup.findAll('table', {'class': 'tbl-odds'})
  if odds_tables[0].text == u'No games to display':
    return None
  odds = list(itertools.chain.from_iterable([parse_odds_table(ot) for ot in odds_tables]))
  df = pd.DataFrame(odds, columns=['Team', 'moneyline', 'runline', 'rl_over', 'rl_under'])
  df['odds'] = df['moneyline'].apply(moneyline_to_implied_odds)
  df['rl_over_odds'] = df['rl_over'].apply(moneyline_to_implied_odds)
  df['rl_under_odds'] = df['rl_under'].apply(moneyline_to_implied_odds)
  df.set_index('Team', drop=True, inplace=True)
  return df
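
moneyline_to_implied_odds is another module-level helper. The standard conversion from an American moneyline to an implied win probability looks roughly like this (a sketch of the usual formula, not necessarily the project's exact implementation):

def moneyline_to_implied_odds(moneyline):
  # Favorites are quoted negative (-150 -> 0.60), underdogs positive (+130 -> ~0.435)
  ml = float(moneyline)
  if ml < 0:
    return -ml / (-ml + 100.0)
  return 100.0 / (ml + 100.0)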
Example #13
def load_stats_tables_from_history_page(url):
  """Load all the prediction tables from a Numberfire history page"""
  soup = getSoupFromURL(url)
  salary = load_player_salary_table(soup)
  projection_months = ['%s-schedule' % month for month in
                       ['October', 'November', 'December', 'January', 'February', 'March', 'April']]
  month_tables = []
  for month in projection_months:
    month_schedule = soup.find('div', attrs={'id': month})
    month_table = load_player_history_table(month_schedule)
    if month_table is not None:
      month_tables.append(month_table)
  if month_tables:
    all_predictions = pandas.concat(month_tables)
  else:
    all_predictions = None
  return all_predictions, salary
Example #14
def _load_parkeffect_from_url(team):
    soup = getSoupFromURL(url.format(team=team))  # `url` is a module-level template with a {team} placeholder
    stats_table = soup.findAll(id='franchise_years')
    header = []
    if not stats_table:
        print 'stats table not loaded for this team'
        import IPython
        IPython.embed()
        return None
    # Only parse the table once we know it was actually found
    parsed_table = soupTableToTable(stats_table)
    for th in stats_table[0].find("thead").findAll('th'):
        if not th.getText() in header:
            header.append(th.getText())
    df = parsedTableToDF(
        parsed_table, header, date_index=0
    )  # Use "Rk" as index so we can talk about most recent year
    return df.loc[1, 'BPF'], df.loc[1, 'PPF']
Example #15
def get_active_players():
    letters = list('abcdefghijklmnopqrstuvwxyz')
    player_and_url_list = []
    print 'Checking currently active players on baseball-reference.com...'
    pbar = progressbar.ProgressBar(widgets=[
        progressbar.Percentage(), ' ',
        progressbar.Bar(), ' ',
        progressbar.ETA()
    ])
    for letter in pbar(letters):
        letter_page = getSoupFromURL(
            'http://www.baseball-reference.com/players/%s/' % (letter))
        # we don't just need active players (<b> tags), we need anyone who played in 2015!
        prefix_sections = letter_page.findAll('pre')
        for section in prefix_sections:
            player_and_url_list += list(_parse_bsbr_prefix_section(section))
    bref_id_dict = dict(player_and_url_list)
    return bref_id_dict
Example #16
def load_overview_pages(players):
    """
  Hit the overview page and load gamelog_url_list for each of the players in the player dict.
  Maybe this should be in the webio submodule? I am leaving it here since it controls scraping program flow.
  :param players: player dict
  :return dict: player dict
  """

    # Helper function to guess which position a player plays from the overview table of stats.
    # Just grab the position from the most recent year in which it was defined, and return that.
    def quick_position_guess(overview_table):
        return overview_table.dropna(subset=['Pos'])['Pos'].iloc[-1]

    pbar = progressbar.ProgressBar(widgets=[
        progressbar.Percentage(), ' ',
        progressbar.Bar(), ' ',
        progressbar.ETA()
    ])
    print 'Accessing and parsing overview pages...'
    for i, (bref_id, player_dict) in pbar(list(enumerate(players.items()))):
        overview_soup = getSoupFromURL(players[bref_id]['overview_url'])
        players[bref_id]['overview_url_content'] = overview_soup.text
        # the links to each year's game logs are in <li> tags, and the text contains 'Game Logs'
        # so we can use those to pull out our urls.
        for li in overview_soup.find_all('li'):
            if 'Game Logs' in li.getText():
                game_log_links = li.findAll('a')
                for game_log_link in game_log_links:
                    players[bref_id]['gamelog_url_list'].append(
                        'http://www.basketball-reference.com' +
                        game_log_link.get('href'))
        player_name = overview_soup.find('h1').text
        players[bref_id]['info']['name'] = player_name
        # Read (guess?) player's position
        overview_table = dfFromOverviewPage(overview_soup)
        if len(overview_table.dropna(subset=['Pos'])) > 0:
            players[bref_id]['info']['pos'] = quick_position_guess(
                overview_table)
        else:
            players[bref_id]['info']['pos'] = '?'  # only happens for players with no listed position; setting a default avoids exceptions later
    return players
Example #17
def load_stats_tables_from_history_page(url):
    """Load all the prediction tables from a Numberfire history page"""
    soup = getSoupFromURL(url)
    salary = load_player_salary_table(soup)
    projection_months = [
        '%s-schedule' % month for month in [
            'October', 'November', 'December', 'January', 'February', 'March',
            'April'
        ]
    ]
    month_tables = []
    for month in projection_months:
        month_schedule = soup.find('div', attrs={'id': month})
        month_table = load_player_history_table(month_schedule)
        if month_table is not None:
            month_tables.append(month_table)
    if month_tables:
        all_predictions = pandas.concat(month_tables)
    else:
        all_predictions = None
    return all_predictions, salary
Example #18
def get_active_players(letters=list('abcdefghijklmnopqrstuvwxyz')):
    players = []
    print 'Loading currently active players from basketball-reference.com...'
    pbar = progressbar.ProgressBar(widgets=[
        progressbar.Percentage(), ' ',
        progressbar.Bar(), ' ',
        progressbar.ETA()
    ])
    for letter in pbar(letters):
        letter_page = getSoupFromURL(
            'http://www.basketball-reference.com/players/%s/' % (letter))
        # we know that all the currently active players have <strong> tags, so we'll limit our names to those
        current_names = letter_page.findAll('strong')
        for n in current_names:
            name_data = n.children.next()
            full_url = 'http://www.basketball-reference.com' + name_data.attrs[
                'href']
            bref_id = bbr_id_regex.match(full_url).group('pid')
            players.append((bref_id, full_url))
    players = dict(players)
    return players
Example #19
def load_overview_pages(players):
    pbar = progressbar.ProgressBar(widgets=[
        progressbar.Percentage(), ' ',
        progressbar.Bar(), ' ',
        progressbar.ETA()
    ])
    print 'Accessing and parsing overview pages...'
    for i, (bref_id, player_dict) in pbar(list(enumerate(players.items()))):
        if players[bref_id]['overview_url_content'] is None:
            overview_soup = getSoupFromURL(players[bref_id]['overview_url'])
            players[bref_id]['overview_url_content'] = overview_soup.text
            # the links to each year's game logs are in <li> tags, and the text contains 'Game Logs'
            # so we can use those to pull out our urls.
            game_log_links = []
            for li in overview_soup.find_all('li'):
                if 'Game Logs' in li.getText():
                    game_log_links = li.findAll('a')
            for game_log_link in game_log_links:
                players[bref_id]['gamelog_url_list'].append(
                    'http://www.baseball-reference.com' +
                    game_log_link.get('href'))
    return players
Example #20
def load_stats_tables_from_history_page(url):
  """Load all the prediction tables from a Numberfire history page"""
  soup = getSoupFromURL(url)
  #salary = load_player_salary_table(soup)
  projection_months = ['%s-schedule' % month for month in
                       ['March', 'April', 'May', 'June', 'July', 'August', 'September', 'October']]
  month_tables = []
  for month in projection_months:
    month_schedule = soup.find('div', attrs={'id': month})
    month_table = load_player_history_table(month_schedule)
    if month_table is not None:
      month_tables.append(month_table)
  if month_tables:
    all_predictions = pandas.concat(month_tables)
    all_predictions.sort_index(inplace=True)
    if all_predictions.index.duplicated().any():
      print 'Duplicate games scraped!'
      import IPython
      IPython.embed()
  else:
    all_predictions = None
  return all_predictions
Example #21
def load_positions_for_day(sport, game_date, game='FanDuel'):
  ''' Get salaries and positions for eligible players for the given day / fantasy site.
  :param basestring sport: 'nba' or 'mlb'
  :param datetime.datetime game_date: day to load
  :param basestring game: fantasy site name, e.g. 'FanDuel'
  :return: salary/position data for the day
  '''
  month, day = game_date.month, game_date.day
  url_template = nba_url_template if sport == 'nba' else mlb_url_template
  day_part = '%02d' % day
  url = url_template.format(month=month, day=day_part, game_code=game_code_dict[game])
  soup = getSoupFromURL(url)
  if sport == 'nba':
    all_rows = soup.findAll('tr')
    player_rows = filter(lambda r: is_actual_player_row('nba', r), all_rows)
    parsed_rows = map(parse_player_row, player_rows)
    day_salaries = pandas.DataFrame.from_records(parsed_rows, columns=['Player', 'Position', 'Salary'])
    day_salaries["Salary"] = day_salaries["Salary"].apply(int)
    day_salaries["Player"] = day_salaries["Player"].apply(lambda x: x.strip())
    day_salaries["Position"] = day_salaries["Position"].apply(lambda x: x.strip())
    day_salaries.set_index("Player", inplace=True)
  else:
    day_salaries = parse_mlb_csv_from_soup(soup)
  return day_salaries
Example #22
def dfFromGameLogURL(url):
  """Takes a url of a player's game log for a given year, returns a DataFrame"""
  glsoup = getSoupFromURL(url)
  reg_season_table = glsoup.findAll('table', attrs={'id': 'pgl_basic'})  # id for reg season table
  playoff_table = glsoup.findAll('table', attrs={'id': 'pgl_basic_playoffs'}) # id for playoff table
  # parse the table header.  we'll use this for the creation of the DataFrame
  header = []
  # use the playoff table to get the header if the guy never played in the regular season this year.
  header_table = reg_season_table if len(reg_season_table) else playoff_table
  for th in header_table[0].findAll('th'):
    if not th.getText() in header:
      header.append(th.getText())
  # add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly
  header[5] = u'HomeAway'
  header.insert(7, u'WinLoss')
  reg = soupTableToDF(reg_season_table, header)
  playoff = soupTableToDF(playoff_table, header)

  if reg is None:
    return playoff
  elif playoff is None:
    return reg
  else:
    return pandas.concat([reg, playoff])