def purge_database():
    """
        Purge old documents from the database, then end the process.

        Opens a `mongo.Mongo` context, calls `purge_old_documents()` on it
        and finally exits the interpreter with status code 0.
    """
    with mongo.Mongo() as db:
        db.purge_old_documents()

    # Reached only when the purge returned without raising.
    sys.exit(0)
示例#2
0
def get_reputation_events_for_source(addr, source, start_date):
    """
        Retrieve the events (raw data included) recorded for an ip,
        restricted to a single source.

        Whether payloads are base64 encoded is decided from the first
        non-empty `data` field only; when it is encoded, every non-empty
        `data` field is decoded in place.

        :param str addr: Ip the reputation must be computed with
        :param str source: Source short name to get events of
        :param int start_date: Timestamp the events must be retrieved from
        :rtype: array
        :return: Array of events
    """
    with mongo.Mongo() as db:
        all_events = db.find_all_event_data_for_ip(addr, start_date, True)

    # Keep only the events whose source matches the requested short name.
    result = []
    for item in all_events:
        if item['source'] == _map_source_from_shortname(source):
            result.append(item)

    # Use the first non-empty payload as the sample for encoding detection.
    sample = next((item['data'] for item in result if item['data']), None)
    is_encoded = utils.is_base64_encoded(sample) if sample else False

    # Nothing to decode: hand the events back untouched.
    if not is_encoded:
        return result

    for item in result:
        if item['data']:
            item['data'] = base64.b64decode(item['data']).decode()

    return result
示例#3
0
def player_box_score(game_id):
    """
    Scrape all player stats from a specific game and store in Mongo

    Players from the first matching box score table are stored under
    ``aplayers`` and players from every following table under ``hplayers``.
    Rows whose parsed stats contain a ``reason`` key are skipped.

    :param game_id: MongoDB and Basketball Reference game id
    :raises requests.HTTPError: if the box score page answers with an HTTP
        error status; nothing is written to Mongo in that case
    """

    # HTML Content
    response = requests.get('https://www.basketball-reference.com/boxscores/' +
                            game_id + '.html')

    # Fail before touching the database: an error page contains no box score
    # tables, which would otherwise overwrite both player lists with [].
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    # MongoDB Collection
    mongo_wrapper = mongo.Mongo()

    # The ids of the tables have team names in them
    table_id = re.compile('^box_[a-z]{3}_basic$')

    home_players = []
    away_players = []

    # The first table is treated as the away team, later ones as home.
    home = False

    for table in soup.find_all(id=table_id):
        sub_table = table.find('tbody')

        # Only rows without a class attribute are player rows
        for player in sub_table.find_all('tr', {'class': None}):

            # Player ID is carried by the row header cell
            player_stats = {'player': player.find('th')['data-append-csv']}

            # Loop through each stat
            for stat in player.find_all('td'):
                stat_name = stat['data-stat']
                player_stats[stat_name] = scrape_utils.stat_parse(
                    stat_name, stat.string)

            # If this key exists it means the player did not play
            if 'reason' in player_stats:
                continue

            if home:
                home_players.append(player_stats)
            else:
                away_players.append(player_stats)

        home = True

    # Insert into database
    mongo_wrapper.update(
        'game_log', {'_id': game_id},
        {'$set': {
            'hplayers': home_players,
            'aplayers': away_players
        }})
示例#4
0
    def send_reports(self):
        """
            Public entry point of the email sending process.

            For every entry returned by `find_highest_scores()`, builds the
            subject, the raw data and the body from the entry's `_id` and
            `value`, then sends the mail.
        """
        with mongo.Mongo() as db:
            for record in db.find_highest_scores():
                key, score = record['_id'], record['value']

                subject = self._prepare_subject(key, score)
                raw = self._prepare_raw(db, key)
                body = self._prepare_body(key, score, raw)

                self._send_mail(subject, body)
示例#5
0
def betting_df(season=None, sportsbooks=None):
    """
    Build a Pandas DataFrame of betting odds, one row per game/sportsbook.

    Args:
        season: A single season (int) or a list of NBA seasons; None
            applies no season filter
        sportsbooks: A single sportsbook name (str) or a list of names;
            None applies no sportsbook filter

    Returns:
        A Pandas DataFrame containing odds information
    """

    db = mongo.Mongo()

    # Season filter (a lone int is wrapped into a list)
    if season is None:
        season_filter = {}
    else:
        seasons = [season] if isinstance(season, int) else season
        season_filter = {'season': {'$in': seasons}}

    # Sportsbook filter (a lone name is wrapped into a list)
    if sportsbooks is None:
        sportsbook_filter = {}
    else:
        names = [sportsbooks] if isinstance(sportsbooks, str) else sportsbooks
        sportsbook_filter = {'sportsbook': {'$in': names}}

    # Mongo Aggregation: one document per entry of odds.sportsbooks
    stages = [
        {'$match': season_filter},
        {'$project': {'odd': '$odds.sportsbooks'}},
        {'$unwind': '$odd'},
        {'$project': {
            'sportsbook': '$odd.sportsbook',
            'home_odds': '$odd.home_odds',
            'away_odds': '$odd.away_odds'
        }},
        {'$match': sportsbook_filter},
    ]

    rows = db.aggregate(db.GAME_LOG, stages)

    return pd.DataFrame(list(rows))
示例#6
0
def team_abilities(decay, att_constraint, def_constraint, day_span):
    """
    Load stored team abilities for one set of model parameters.

    Args:
        decay: Time decay parameter (matched against the 'mw' field)
        att_constraint: Mean Attack Constraint of the model
        def_constraint: Mean Defence Constraint of the model
        day_span: Value matched against the 'day_span' field

    Returns:
        Pandas DataFrame with one row per date/team and the columns
        'attack', 'defence' and 'home_adv'
    """

    db = mongo.Mongo()
    cursor = db.find(
        db.DIXON_TEAM,
        {
            'mw': decay,
            'att_constraint': att_constraint,
            'def_constraint': def_constraint,
            'day_span': day_span
        },
        {
            '_id': 0,
            'model': 0,
            'mw': 0,
            'att_constraint': 0,
            'def_constraint': 0
        })

    raw = pd.DataFrame(list(cursor))

    def _by_team(column, value_name):
        # Expand the per-row values into one column per key (presumably one
        # per team), then melt so that every row is a single date/team pair.
        wide = pd.DataFrame(column.values.tolist())
        wide['date'] = raw['date']
        return wide.melt('date', var_name='team', value_name=value_name)

    attack = _by_team(raw.att, 'attack')
    defence = _by_team(raw['def'], 'defence')
    home_adv = _by_team(raw['home_adv'], 'home_adv')

    # merge() without keys joins on the shared columns
    return attack.merge(defence).merge(home_adv)
示例#7
0
def player_per_game(player):
    """
    Scrape a player's yearly per game stats and insert them into Mongo.

    One document is inserted into 'player_season'. Its '_id' is the file
    name of the player's page without extension, and 'seasons' holds one
    dict of parsed stats per season row.

    :param player: Mapping with a 'url' entry (path appended to the site
        root) and a 'name' entry
    :raises requests.HTTPError: if the player page answers with an HTTP
        error status
    """

    # Mongo
    mongo_wrapper = mongo.Mongo()

    # Request
    url = "http://www.basketball-reference.com" + player['url']
    response = requests.get(url)

    # Surface HTTP failures directly instead of failing later with an
    # AttributeError when the stats table is missing from an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    # Player's statistics
    per_game = soup.find(id="per_game").find('tbody')

    # When there are missing years, there is no id or data-stat for the year,
    # so matching any value selects only elements that carry the attribute
    has_attribute = re.compile('.*')

    # Player dictionary
    player_stats = {
        '_id': url.rsplit('/', 1)[-1].rsplit('.', 1)[0],
        'name': player['name'],
        'seasons': []
    }

    # Iterate through the years
    for year in per_game.find_all('tr', {'id': has_attribute}):

        # Characters 9-12 of the row id hold the season year
        # (presumably ids look like "per_game.2019" - TODO confirm)
        season = {'season': int(year['id'][9:13])}

        # Each stat in a season (Per Game)
        for stat in year.find_all('td', {'data-stat': has_attribute}):
            stat_name = stat['data-stat']
            season[stat_name] = scrape_utils.stat_parse(stat_name,
                                                        stat.string)

        player_stats['seasons'].append(season)

    mongo_wrapper.insert('player_season', player_stats)
示例#8
0
def aggregate_reputation_per_source(addr, start_date):
    """
        Compute, for every source, the aggregated weight of the events
        attached to an ip.

        Sources listed in `parsers` that have no event get a result of 0.

        :param str addr: Ip the reputation must be computed with
        :param int start_date: Timestamp the events must be retrieved from
        :rtype: list
        :return: one dict per source with the keys `short_name`,
            `full_name` and `result` (the aggregated weight)
    """
    with mongo.Mongo() as db:
        ip_events = db.find_all_events_for_ip(addr, start_date, True)

    # Reduce by source
    totals = _compute_score_by_source(ip_events)

    # Sources without any attached event still appear, with a zero score
    for parser_name in parsers:
        totals.setdefault(parser_name, 0)

    # Format final dto; the full name doubles as short name when no
    # shortened form is registered
    return [
        {
            'short_name': shortened_names.get(full_name, full_name),
            'full_name': full_name,
            'result': total,
        }
        for full_name, total in totals.items()
    ]
示例#9
0
def player_abilities(decay, day_span):
    """
    Load stored player abilities for one decay / day span combination.

    Each value of the 'player' column is expanded into its own columns
    (presumably the values are dicts - TODO confirm), and a 'mean' column is
    added from `beta.mean` of the 'a' and 'b' columns.

    Args:
        decay: Time decay parameter (matched against the 'mw' field)
        day_span: Value matched against the 'day_span' field

    Returns:
        Pandas DataFrame of player parameters including the 'mean' column
    """

    db = mongo.Mongo()
    rows = db.find(
        db.PLAYERS_BETA,
        {'mw': decay, 'day_span': day_span},
        {'_id': 0, 'mw': 0, 'day_span': 0},
    )

    raw = pd.DataFrame(list(rows))

    # Replace the nested 'player' column by its expanded fields
    without_player = raw.drop(['player'], axis=1)
    player_fields = raw['player'].apply(pd.Series)
    df = pd.concat([without_player, player_fields], axis=1)

    df['mean'] = beta.mean(df.a, df.b)

    return df
示例#10
0
def init_source_ips():
    """
    Download the APNIC data, generate the source ips and index them.

    Steps, in order: obtain a handle on the source collection, run the
    downloader for APNIC_URL, call `_gen_source_ip` with that handle, then
    call `init_index` for the collection on a fresh `mongo.Mongo` instance.
    """
    source_collection = MONGO_COLLECTION_SOURCE

    connection = mongo.Mongo().get_conn(source_collection)

    downloader = Downloader(APNIC_URL)
    downloader.download()

    _gen_source_ip(connection)

    index_client = mongo.Mongo()
    index_client.init_index(source_collection)
示例#11
0
def get_starting_lineups(team, year):
    """
    Scrape a team's starting lineups for a season and store them per game.

    For every game row of the starting lineups table, the matching
    'game_log' document (same date, home team and away team) gets its
    'starters.home' or 'starters.away' field set to the list of player ids.

    :param team: NBA Team (Team abbreviation)
    :param year: NBA Season
    """

    # MongoDB
    db = mongo.Mongo()

    # Rename team if relocated
    team = scrape_utils.rename_team(team, year)

    # Starting Lineup URL
    url = "http://www.basketball-reference.com/teams/%s/%s_start.html" % (
        team, year)

    page = requests.get(url)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")

    # Second rename, this time without the year
    team = scrape_utils.rename_team(team)

    # Line up table: only rows without a class attribute are games
    table_body = soup.find(id='starting_lineups').find('tbody')
    game_rows = table_body.find_all('tr', {'class': None})

    for row in game_rows:

        # Information to query mongodb to update collection
        date_text = row.find('td', {'data-stat': 'date_game'}).text
        played_on = datetime.strptime(date_text, '%a, %b %d, %Y')

        # Characters 7-9 of the link hold the opponent abbreviation
        opponent_cell = row.find('td', {'data-stat': 'opp_name'})
        opponent = opponent_cell.find('a')['href'][7:10]

        # '@' marks a game played at the opponent's venue
        location = row.find('td', {'data-stat': 'game_location'}).text
        if location == '@':
            home, away, key = opponent, team, 'starters.away'
        else:
            home, away, key = team, opponent, 'starters.home'

        # Player id = file name of the player's link without extension
        starters_cell = row.find('td', {'data-stat': 'game_starters'})
        lineup = [
            link['href'].rsplit('/', 1)[-1].rsplit('.', 1)[0]
            for link in starters_cell.find_all('a')
        ]

        # Update document
        db.update(
            'game_log',
            {'date': played_on, 'home.team': home, 'away.team': away},
            {'$set': {key: lineup}})
示例#12
0
 def __init__(self):
     """Store the scan settings and a handle on the source collection."""
     # Plain settings copied from module-level constants
     self.source_ips_path = SOURCE_IPS_PATH
     self.port_str = PORT_STR
     self.scan_options = "-sV --host-timeout {}".format(SCAN_TIMEOUT)

     # Handle returned by `get_conn` for the source collection
     source_collection = mongo.Mongo().get_conn(MONGO_COLLECTION_SOURCE)
     self.collection = source_collection
示例#13
0
def game_results(season=None, teams=None, date=None):
    """
    Build a Pandas DataFrame of game results.

    Args:
        season: A single season (int) or a list of season numbers; None
            applies no season filter
        teams: Team Names, if it's not None the team columns contain each
            team's position in this list instead of its name
        date: If not None, only games with a date strictly before it are kept

    Returns:
        A Pandas DataFrame containing a historical NBA results.

    """

    db = mongo.Mongo()

    # Season filter (a lone int is wrapped into a list)
    season_match = {}
    if season is not None:
        wanted = [season] if isinstance(season, int) else season
        season_match['season'] = {'$in': wanted}

    # Date filter
    date_match = {} if date is None else {'date': {'$lt': date}}

    pipeline = [
        {'$match': season_match},
        {'$project': {
            'home_team': '$home.team',
            'away_team': '$away.team',
            'home_pts': '$home.pts',
            'away_pts': '$away.pts',
            'season': 1,
            'date': 1
        }},
        {'$match': date_match},
    ]

    games_df = pd.DataFrame(list(db.aggregate(db.GAME_LOG, pipeline)))

    # Without team names there is nothing left to translate
    if teams is None:
        return games_df

    n_games = len(games_df)
    home_index = np.zeros(n_games, dtype=int)
    away_index = np.zeros(n_games, dtype=int)

    # Look up the position of both teams for each game
    for game in games_df.itertuples():
        home_index[game.Index] = teams.index(game.home_team)
        away_index[game.Index] = teams.index(game.away_team)

    games_df['home_team'] = pd.Series(home_index, index=games_df.index)
    games_df['away_team'] = pd.Series(away_index, index=games_df.index)

    return games_df
示例#14
0
def player_results(season=None, date=None):
    """
    Build a Pandas DataFrame of points scored per player and game.

    The aggregation runs once for the home side and once for the away side.
    Both the player and the points arrays are unwound and only the
    combinations with equal array positions are kept, which pairs each
    player with the points at the same position.

    Args:
        season: A single season (int) or a list of season numbers; None
            applies no season filter
        date: If not None, only games with a date strictly before it are kept

    Returns:
        The home rows followed by the away rows, in one Pandas DataFrame
    """

    # MongoDB
    db = mongo.Mongo()

    # Season filter (a lone int is wrapped into a list)
    season_match = {}
    if season is not None:
        wanted = [season] if isinstance(season, int) else season
        season_match['season'] = {'$in': wanted}

    # Date filter
    date_match = {} if date is None else {'date': {'$lt': date}}

    # (player ids, player points, team, team points) for each side
    sides = (
        ('$hplayers.player', '$hplayers.pts', '$home.team', '$home.pts'),
        ('$aplayers.player', '$aplayers.pts', '$away.team', '$away.pts'),
    )

    frames = []

    for player_path, pts_path, team_path, team_pts_path in sides:
        pipeline = [
            {'$match': season_match},
            {'$match': date_match},
            {'$project': {
                'player': player_path,
                'pts': pts_path,
                'team': team_path,
                'team_pts': team_pts_path,
                'date': 1,
                'season': 1
            }},
            {'$unwind': {
                'path': '$player',
                'includeArrayIndex': 'player_index'
            }},
            {'$unwind': {
                'path': '$pts',
                'includeArrayIndex': 'pts_index'
            }},
            {'$project': {
                'date': 1,
                'team': 1,
                'season': 1,
                'player': 1,
                'pts': 1,
                'team_pts': 1,
                'compare': {
                    '$cmp': ['$player_index', '$pts_index']
                }
            }},
            # Keep only the rows where both positions are the same
            {'$match': {'compare': 0}},
        ]

        side_rows = db.aggregate('game_log', pipeline)
        frames.append(pd.DataFrame(list(side_rows)))

    return pd.concat(frames)
示例#15
0
 def __init__(self):
     """Store the check URLs and handles on the three collections."""
     # URLs copied from module-level constants
     self.http_check_url = HTTP_CHECK_URL
     self.https_check_url = HTTPS_CHECK_URL

     # One handle per collection, each obtained from its own Mongo instance
     source = mongo.Mongo().get_conn(MONGO_COLLECTION_SOURCE)
     http = mongo.Mongo().get_conn(MONGO_COLLECTION_HTTP)
     https = mongo.Mongo().get_conn(MONGO_COLLECTION_HTTPS)

     self.collection_source = source
     self.collection_http = http
     self.collection_https = https
示例#16
0
def get_conn():
    """Return the Mongo wrapper stored on `g`, creating it on first use."""
    try:
        return g.mongodb
    except AttributeError:
        # Nothing stored on `g` yet: create the wrapper and keep it there.
        g.mongodb = mongo.Mongo()
        return g.mongodb
示例#17
0
    def __init__(self, mw, att_constraint, def_constraint, day_span=7):
        """
        Store the model parameters and make sure abilities exist up to today.

        Team abilities: when none are stored for these parameters,
        everything is trained. When only today's entry is missing, the 2019
        game logs are scraped and the missing dates plus today are trained.
        Player abilities follow the same scheme with a fixed decay of 0.044,
        scraping the missing player box scores instead. Finally both
        ability tables are loaded into DataFrames.

        Args:
            mw: Time decay parameter of the team model
            att_constraint: Mean Attack Constraint of the model
            def_constraint: Mean Defence Constraint of the model
            day_span: Value stored and queried as 'day_span' (default 7)
        """

        # Team Information
        self.nteams = 30
        self.teams = process_utils.name_teams(False, 30)

        # MongoDB
        self.mongo = mongo.Mongo()

        # Model parameters
        self.mw = mw
        self.att_constraint = att_constraint
        self.def_constraint = def_constraint
        self.day_span = day_span

        # Today at midnight
        now = datetime.datetime.now()
        self.today = pd.Timestamp(
            now.replace(hour=0, minute=0, second=0, microsecond=0))

        # Decay used by every player ability query below
        player_mw = 0.044

        def team_filter(with_today=False):
            # Query for the team abilities of this parameter set
            criteria = {
                'mw': self.mw,
                'att_constraint': self.att_constraint,
                'def_constraint': self.def_constraint,
                'day_span': self.day_span
            }
            if with_today:
                criteria['date'] = self.today
            return criteria

        def player_filter(with_today=False):
            # Query for the player abilities of this day span
            criteria = {'mw': player_mw, 'day_span': self.day_span}
            if with_today:
                criteria['date'] = self.today
            return criteria

        # Train new abilities if they don't exist in the database
        if self.mongo.count(self.mongo.DIXON_TEAM, team_filter()) == 0:
            print('Training Team Abilities')
            self.train_all(teams=True, players=False)
        # Otherwise train the missing days when today is not stored yet
        elif self.mongo.count(self.mongo.DIXON_TEAM,
                              team_filter(with_today=True)) == 0:

            print('Scraping Missing Games')
            for team in scrape_utils.team_names():
                team_scraper.season_game_logs(team, 2019)

            print('Training Missing Days (Including Today)')
            ab = datasets.team_abilities(mw, att_constraint, def_constraint,
                                         day_span)
            games = datasets.game_results([2017, 2018, 2019])

            # Right join: games without abilities end up with a null team
            missing_ab = ab.merge(games, on='date', how='right')

            # Train for the missing dates
            untrained = missing_ab.loc[missing_ab.team.isnull(),
                                       'date'].unique()
            for date in untrained:
                self.train(pd.Timestamp(date))

            # Need to add today as this won't include that
            self.train(self.today)

        # Train new abilities if they don't exist in the database
        if self.mongo.count(self.mongo.PLAYERS_BETA, player_filter()) == 0:
            print('Training Player Abilities')
            self.train_all(teams=False, players=True)
        # Otherwise train the missing days when today is not stored yet
        elif self.mongo.count(self.mongo.PLAYERS_BETA,
                              player_filter(with_today=True)) == 0:

            ab = datasets.player_abilities(player_mw, day_span)
            games = datasets.game_results([2017, 2018, 2019])

            # Determine which games need to be scraped
            missing_ab = ab.merge(games, on='date', how='right')
            without_mean = missing_ab[missing_ab['mean'].isnull()]
            missing_ids = without_mean['_id'].unique()

            # Scrape the missing game logs
            # NOTE(review): "Scpres" is a typo, kept so the output is unchanged
            print('Scraping Player Box Scpres')
            for game_id in missing_ids:
                player_scraper.player_box_score(game_id)

            # Train for the missing dates
            # NOTE(review): relies on the merged frame exposing a `team`
            # column - confirm the player abilities actually provide one
            print('Train Missing Days')
            untrained = missing_ab.loc[missing_ab.team.isnull(),
                                       'date'].unique()
            for date in untrained:
                self.train_players(pd.Timestamp(date))

            # Need to add today as this won't include that
            self.train_players(self.today)

        # Get all abilities in DF
        self.abilities = datasets.team_abilities(mw, att_constraint,
                                                 def_constraint, day_span)
        self.player_abilities = datasets.player_abilities(player_mw, day_span)