Example no. 1
from sqlalchemy.orm.exc import NoResultFound

def get_player_url(player_url):
    try:
        return db.query(Player).filter(Player.url == player_url).one()
    except NoResultFound:
        # stored URLs mix http and https, so retry with the other scheme
        if "https" in player_url:
            player_url = player_url.replace("https", "http")
        else:
            player_url = player_url.replace("http", "https")
        return db.query(Player).filter(Player.url == player_url).one()
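For instance, with db and Player coming from the surrounding module as in the snippet, and a hypothetical profile URL:

# hypothetical URL; the lookup falls back to the other scheme transparently
player = get_player_url("https://bwfbadminton.com/player/12345")
print(player.name)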
Example no. 2
def make_player(player_url):
    try:
        return db.query(Player).filter(Player.url == player_url).one()
    except NoResultFound:
        # not stored yet, so really make it below
        pass

    if not ("http://" in player_url or "https://" in player_url):
        return None

    # normalize the URL protocol to https
    player_url = player_url.replace("http://", "https://")

    print "new player", player_url

    # find more information
    r = requests.get(player_url)
    soup = BeautifulSoup(r.content, "html.parser")


    cat = soup.find("div", {"class":"player-wins"}).findChildren()[2].contents[0].lower().split()

    if not "ms" in cat:
        print "> not men's singles"
        return None

    player = Player()

    try:
        ages = soup.find("div", {"class":"player-age"}).findChildren()[-1].contents[0].strip().split("/")

        player.birthyear = ages[2]
        player.birthmonth = ages[1]
        player.birthday = ages[0]
    except (AttributeError, IndexError):
        # no birth information on the profile page
        pass

    hand = soup.find("div", {"class":"player-handed"}).findChildren()[-1].contents[0].strip().lower()

    if hand != "n/a":
        player.handedness = "right" if "right" in hand else "left"

    info = soup.find("div", {"class":"player-profile-country-wrap"})

    # create player
    player.name = info.findChildren()[1].contents[0].strip().lower()
    player.country = info.findChildren()[0].attrs['title'].lower()
    player.gender = "male"
    player.playertype = "singles"
    player.url = player_url


    print(player.name, player.country, player.gender, player.birth(), player.handedness)

    db.add(player)
    db.commit()

    return player
Example no. 3
def delete_tournament(name):
    r = db.query(Tournament).filter(Tournament.name == name).one()
    # join Match to Tournament explicitly; filtering a Match query on
    # Tournament.url alone would produce an implicit cross join
    matches = db.query(Match).join(Tournament).filter(Tournament.url == r.url).all()

    for match in matches:
        ms1 = match.stat1
        ms2 = match.stat2

        sets = match.sets

        for s in sets:
            db.delete(s)

        db.delete(ms1)
        db.delete(ms2)
        
        db.delete(match)

    db.commit()
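The snippet deletes children first (sets, then both stat rows, then the match) because no cascades are configured on the models. A minimal sketch of the alternative, assuming a models layout like the one below (ours, not the source's): with a delete-orphan cascade, db.delete(match) removes the match's sets in one call.

from sqlalchemy import Column, ForeignKey, Integer
from sqlalchemy.orm import declarative_base, relationship

Base = declarative_base()

class Match(Base):
    __tablename__ = "matches"
    id = Column(Integer, primary_key=True)
    # deleting a Match now deletes its Set rows automatically
    sets = relationship("Set", cascade="all, delete-orphan")

class Set(Base):
    __tablename__ = "sets"
    id = Column(Integer, primary_key=True)
    match_id = Column(Integer, ForeignKey("matches.id"))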
Example no. 4
def make_scrape_page(year, week, page):
    url = make_url(year, week, page)
    
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")

    rows = soup.findAll("tr")
    del rows[::2] # remove every second item

    for row in rows:
        tds = row.findAll("td")

        rank = int(tds[0].contents[0].strip())
        country = tds[1].findChildren()[0].findChildren()[0].contents[0].strip().upper()
        name = tds[2].findChildren()[0].findChildren()[0].findChildren()[0].contents[0].strip().lower()

        winslosses = tds[4].contents[0].strip().split(" - ")
        wins = int(winslosses[0])
        losses = int(winslosses[1])
        
        try:
            money = float(tds[5].contents[0].strip().replace(',','').replace('$',''))
        except ValueError:
            # N/A
            money = 0.0

        pointstournaments = tds[6].findChildren()[0].contents[0].strip().replace(',','').split(" / ")
        points = int(pointstournaments[0])
        tournaments = int(pointstournaments[1])

        player_url = tds[2].find("a").attrs['href']

        try:
            player = get_player_url(player_url)
        except NoResultFound:  # from sqlalchemy.orm.exc, as in Example no. 1
            player = make_player(player_url)

        if player is None:
            # player is no longer active player
            continue

        print ">", player.name

        # if already scraped this week
        hasrank = db.query(PlayerRanking).filter(
            PlayerRanking.player == player,
            PlayerRanking.week == week,
            PlayerRanking.year == year,
        ).count() > 0

        if hasrank:
            print("has rank", year, ":", week)
            continue

        ranking = PlayerRanking()
        ranking.player = player
        ranking.week = week
        ranking.year = year
        ranking.rank = rank
        ranking.wins = wins
        ranking.losses = losses
        ranking.points = points
        ranking.tournaments = tournaments
        ranking.prizemoney = money

        db.add(ranking)
        db.commit()

        print "> stored rank", year, ":", week
Example no. 5
import requests
from lxml import html
from bs4 import BeautifulSoup

from db.db_handler import db
from db.models import Player, PlayerRanking

from methods import make_player

years = range(2010, 2019)
weeks = range(1, 53)
players = db.query(Player).all()

player_urls = {p.url for p in players}


def make_url(page):
    page = str(page)
    return "http://bwfbadminton.com/players/?char=all&country=&page_size=1000&page_no=" + page


def make_scrape_page(page):
    url = make_url(page)

    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")

    players = soup.findAll("div", {"class": "player"})

    for player_soup in players:
        # the listing truncates the snippet here; a plausible body, given
        # the imports above, pulls each profile link and creates players
        # not seen before (the "a" selector is an assumption)
        player_url = player_soup.find("a").attrs['href']
        if player_url in player_urls:
            continue
        make_player(player_url)
Example no. 6
from db.db_handler import db
from db.models import *

import numpy as np
import pickle

import datetime
import random


def now():
    return datetime.datetime.now()


MIN_PRICE = 1
MIN_YEAR = 2010
matches = db.query(Match).all()

matches = [
    m for m in matches
    if m.tournament.prizemoney > MIN_PRICE and m.tournament.year >= MIN_YEAR
]

# shuffle: recent matches are much faster to compute, so mixing them
# in keeps per-match running time roughly uniform
np.random.shuffle(matches)

Xs = []
ys = []

time_last = now()
time_lefts = []
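Xs/ys and the time_last/time_lefts pair set up a feature-building loop whose body is cut off in this snippet; a sketch of the usual pattern (the loop body and the ETA arithmetic are our illustration, not the source's):

for i, m in enumerate(matches):
    # ... append the feature vector and label for match m to Xs / ys ...

    # rough ETA: seconds the last match took, scaled by matches remaining
    elapsed = (now() - time_last).total_seconds()
    time_lefts.append(elapsed * (len(matches) - i - 1))
    time_last = now()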
Example no. 7
from db.db_handler import db
from db.models import *
import datetime

now = datetime.datetime.now()
thisyear = now.year
thisweek = now.isocalendar()[1]

# remove future rankings
rankings = db.query(PlayerRanking).filter(PlayerRanking.year == thisyear,
                                          PlayerRanking.week > thisweek).all()
for r in rankings:
    db.delete(r)
print("removed", len(rankings), "rankings")
db.commit()
Example no. 8
def get_player_name(player_name):
    return db.query(Player).filter(Player.name == player_name).all()[0]
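Note that .all()[0] raises IndexError when no player matches; SQLAlchemy's .first() returns None instead, which is often easier to handle (the helper name below is ours, not the source's):

def find_player_by_name(player_name):
    # .first() yields None on no match instead of raising
    return db.query(Player).filter(Player.name == player_name).first()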
Example no. 9
def make_scrape_page(year):
    url = make_url(year)

    r = requests.get(url, verify=False)
    soup = BeautifulSoup(r.content, "html.parser")

    rows = soup.findAll("tr")

    for tr in rows[1:]:
        tds = tr.findAll("td")

        week = tds[0].contents[0].strip()

        try:
            country = tds[1].findChildren()[0].findChildren()[0].contents[0].strip()
        except IndexError:
            continue

        dates = tds[2].contents[0].strip()
        name = tds[3].findChildren()[0].findChildren()[0].contents[0].strip()
        url = tds[3].findChildren()[0].findChildren()[0].attrs['href']
        money = tds[4].findChildren()[0].contents[0].strip()

        if not "bwfbadminton.com" in url:
            continue

        if money == "-":
            prizemoney = 0
        else:
            prizemoney = int(re.sub(r'[^\d.]', '', money))

        category = tds[5].findChildren()[0].findChildren()[0].contents[0].strip()
        city = tds[6].findChildren()[0].contents[0].strip()

        tours = db.query(Tournament).filter(Tournament.name == name,
                                            Tournament.year == year).all()
        has_tournament = len(tours) > 0

        if has_tournament:
            continue

        t = Tournament()
        t.week = week
        t.start_date = dates.split("-")[0]
        t.end_date = dates.split("-")[1]
        t.name = name
        t.url = url
        t.country = country
        t.prizemoney = prizemoney
        t.category = category
        t.city = city
        t.year = year

        print "new tournament", t.name
        print t.url

        def go(t):

            try:
                make_scrape_tournament(t)
            except requests.exceptions.SSLError:
                print("bwfbadminton.com is down 1")
                return False
            except requests.exceptions.ConnectionError:
                # tournament page does not exist
                print("bad connection")
                return False
            except Exception:
                # e.g., a timeout; note this recursion retries without bound
                print("<<<<TRY AGAIN>>>>>")
                traceback.print_exc()

                # try again
                return go(t)

            return True

        success = go(t)
        if success:
            db.add(t)
            db.commit()
        else:
            db.rollback()
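Since the recursive go() retries unexpected errors without bound, here is a bounded variant under the same assumptions (make_scrape_tournament and the exception set come from the snippet; the retry cap is ours):

def go_bounded(t, retries=5):
    # same handling as go(), but give up after a fixed number of attempts
    for _ in range(retries):
        try:
            make_scrape_tournament(t)
            return True
        except (requests.exceptions.SSLError,
                requests.exceptions.ConnectionError):
            # site down or tournament page missing
            return False
        except Exception:
            traceback.print_exc()  # e.g. a timeout; loop and try again
    return False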