示例#1
0
def extract(gender, year, div, org, game, path):
    """Append one summary row for ``org``'s game stored under ``path`` to ``output``.

    The row is built from two CSV files in ``path``:

    * ``Box Score - All (Parsed).csv`` -- the last row whose column 3 equals
      ``org`` supplies the statistics; the opponent and the home/away status
      are derived from the first and last data rows of the same table.
    * ``Info.csv`` -- its first two rows are read as parallel label/value
      lists, from which "Location:" and "Attendance:" are picked.

    Args:
        gender, year, div: Copied verbatim into the first three output columns.
        org: Team name to look up in column 3 of the box score table.
        game: Not used by this function; kept for interface compatibility.
        path: Directory holding the CSV files of a single game.

    Returns:
        None. On success a row is appended to the module-level ``output``
        list. A missing box score file or any processing error is reported
        on stderr and the game is skipped.
    """
    box_path = join(path, 'Box Score - All (Parsed).csv')
    if not exists(box_path):
        print("Missing: {}".format(path), file=sys.stderr)
        return

    try:
        box = loadTable(box_path)
        # Last matching row for this team; raises IndexError if there is none.
        row = [r for r in box[1:] if r[3] == org][-1]

        # The team named in the last table row is treated as the home side.
        isHome = row[3] == box[-1][3]
        home = "Home" if isHome else "Away"
        # The opponent is whichever end of the table is not ours.
        opponent = box[1 if isHome else -1][3]

        info = loadTable(join(path, 'Info.csv'))
        location, attendance = "", ""
        for (c, v) in zip(info[0], info[1]):
            if c == "Location:":
                location = v
            elif c == "Attendance:":
                attendance = v

        row = [gender, year, div] + row[:2] + [org, opponent, home] + row[6:] + [location, attendance]

        output.append(row)
    except Exception as e:
        # Best-effort: skip the broken game but say why. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        print("Exception: {} ({}: {})".format(path, type(e).__name__, e), file=sys.stderr)
示例#2
0
    def loadDateTime():
        """Return ``[date, time]`` split from cell [1][0] of the info table.

        The cell is split at the first run of spaces: everything before it is
        the first element, everything after it the second. Each element is
        passed through ``withDefault(..., default="")``.
        """
        stamp = loadTable(infofile)[1][0]
        found = re.match(r"^([^ ]+)(?: +(.+))?$", stamp)
        return [withDefault(found.group(n), default="") for n in (1, 2)]
示例#3
0
    def loadPlayers(team):
        """Return ``(name, flag)`` pairs for ``team`` plus a final ``("Team", None)``.

        The period-specific box score file next to ``datafile`` is preferred;
        when it does not exist the whole-game "Box Score" file is used. The
        header row and the last two rows of the table are skipped. ``flag`` is
        True when column 2 of the player's row is non-empty.
        """
        folder = os.path.split(datafile)[0]
        candidate = os.path.join(folder, period + " - {}.csv".format(team))
        if os.path.exists(candidate):
            source = candidate
        else:
            source = os.path.join(folder, "Box Score - {}.csv".format(team))

        roster = []
        for entry in loadTable(source)[1:-2]:
            roster.append((entry[0], entry[2] != ""))
        roster.append(("Team", None))
        return roster
示例#4
0
def parse(datafile, period, infofile):
    """Build an annotated play-by-play table for one period of a game.

    The raw play-by-play CSV (``datafile``) is turned into a list of rows with
    the columns Date, Time, Period, Time Left, Score, Team, Player, Status,
    Action, Shot Clock, Lineup and Lineup Time.

    Args:
        datafile: Path of the play-by-play CSV for this period. Box score
            files are looked up in the same directory.
        period: Period label; stored in every row and used to locate the
            period-specific box score files.
        infofile: Path of the game info CSV; cell [1][0] supplies date/time.

    Returns:
        A list of lists; the first element is the header row.

    Relies on names defined outside this function: ``loadTable``,
    ``withDefault``, ``best``, ``attacks`` and ``defends``.
    """
    def computeSeconds(t):
        """Convert an "M:SS" clock string into a number of seconds."""
        m, s = tuple(int(v) for v in t.split(":"))
        return m * 60 + s

    def computeTime(t):
        """Format a number of seconds as "M:SS" (inverse of computeSeconds)."""
        (x, y) = divmod(t, 60)
        return "{}:{:02d}".format(x, y)

    pbpTable = loadTable(datafile)

    # Splits an event description into three optional parts:
    #   group 1 - a number or an upper-case name list (the player),
    #   group 2 - "made" / "missed",
    #   group 3 - the action words.
    # NOTE(review): inside the character class, `'-?` is a range from the
    # apostrophe to the question mark, so it also admits digits, comma, colon,
    # etc. -- confirm this is intended rather than a literal hyphen.
    # NOTE(review): in group 3 the alternation `(?: +[A-Z][a-z]*|[0-9]+)` lets a
    # digit run follow without a separating space -- confirm.
    rePBP = re.compile(
        r"^([0-9]+|[A-Z.`'-?\xbf]*(?:(?:, *| +)[A-Z.`'-?\xbf]*)*)" +
        r"(?:(?:^| +)(made|missed))?" +
        r"(?:(?:^| +)((?:[A-Z][a-z]*|[0-9]+)(?: +[A-Z][a-z]*|[0-9]+)*))$",
        re.U
    )

    # Header cells 1 and 3 hold the two team names; the same columns carry
    # each team's event text in the data rows (see buildTable).
    teams = [pbpTable[0][1], pbpTable[0][3]]

    def loadDateTime():
        """Return [date, time] split from cell [1][0] of the info table.

        The cell is split at the first run of spaces; a missing part is
        replaced through ``withDefault(..., default="")``.
        NOTE(review): a cell that is empty or starts with a space does not
        match the pattern, so ``m`` is None and ``m.group`` raises.
        """
        reDateTime = re.compile(r"^([^ ]+)(?: +(.+))?$")

        infoTable = loadTable(infofile)

        m = reDateTime.match(infoTable[1][0])
        return [withDefault(m.group(1), default=""), withDefault(m.group(2), default="")]

    def loadPlayers(team):
        """Return (name, flag) pairs for ``team`` plus a final ("Team", None).

        Prefers the period-specific box score file and falls back to the
        whole-game "Box Score" file. ``flag`` is True when column 2 of the
        player's row is non-empty; addLineup uses it to seed the lineup.
        """
        filename = os.path.join(
            os.path.split(datafile)[0],
            period + " - {}.csv".format(team)
        )
        if not os.path.exists(filename):
            filename = os.path.join(
                os.path.split(datafile)[0],
                "Box Score - {}.csv".format(team)
            )

        boxScoreTable = loadTable(filename)
        # Skip the header row and the last two rows of the box score table.
        return [(r[0], r[2] != "") for r in boxScoreTable[1:-2]] + [("Team", None)]

    dateTime = loadDateTime()
    players = [loadPlayers(team) for team in teams]

    def buildTable():
        """Create the base table: header plus one row per recognised event."""
        table = [["Date", "Time", "Period", "Time Left", "Score", "Team", "Player", "Status", "Action"]]
        for r in pbpTable[1:]:
            # The first row with fewer than four cells ends the event list.
            if len(r) < 4:
                break

            # A non-empty column 1 means the event belongs to the first team;
            # otherwise the text is taken from column 3 (second team).
            flag = r[1] != ""

            m = rePBP.match(r[1 if flag else 3])
            if m is None:
                # Descriptions the pattern cannot split are silently dropped.
                continue

            # Row layout: date, time, period, time left (r[0]), score (r[2]),
            # team, player name resolved by `best` against the team roster
            # (helper defined elsewhere), status and action.
            table.append(
                dateTime + \
                [period] + \
                [
                    r[0],
                    r[2],
                    teams[0 if flag else 1],
                    best(
                        [p[0] for p in players[0 if flag else 1]],
                        withDefault(m.group(1), "").replace(",", ", ")
                    ),
                    withDefault(m.group(2), ""),
                    withDefault(m.group(3), "")
                ]
            )

        return table

    table = buildTable()

    def getPeriodLength(table):
        """Guess the period length from the "Time Left" of the first event.

        Returns "20:00", "10:00" or "5:00" depending on the minutes value.
        NOTE(review): when the minutes value is 0 or above 20 no branch
        matches and None is returned, which computeSeconds cannot handle;
        a table without data rows raises IndexError on table[1].
        """
        maxTime = table[1][3]

        n = int(maxTime.split(':')[0])
        if 10 < n <= 20:
            return "20:00"
        if 5 < n <= 10:
            return "10:00"
        elif 0 < n <= 5:
            return "5:00"

    periodLength = getPeriodLength(table)

    def addTimer(table):
        """Append a "Shot Clock" column to every row (in place)."""
        table[0].append("Shot Clock")

        # Clock reading at which the current 30-second window started.
        lastTime = periodLength
        for r in table[1:]:
            time = r[3]

            # r[-1] is the Action column here (nothing appended to the row yet).
            if r[-1] == "Turnover":
                lastTime = time

            # `attacks` / `defends` are collections of action names defined
            # outside this function.
            if r[-1] in attacks or r[-1] in defends:
                # Seconds elapsed since the window started, capped at 30.
                diff = min(30, computeSeconds(lastTime) - computeSeconds(time))
                # Remaining shot clock; left blank when no time has elapsed.
                r.append(str(30 - diff) if diff > 0 else "")
                lastTime = time
            else:
                r.append("")

    addTimer(table)

    def addLineup(table):
        """Append "Lineup" and "Lineup Time" columns to every row (in place).

        Must run after addTimer: the Action column is addressed as r[-2],
        i.e. the cell just before the appended "Shot Clock".
        """
        # One set per team, seeded with the players whose box score flag is
        # True (see loadPlayers).
        initPlayers = (
            set(p[0] for p in players[0] if p[1]),
            set(p[0] for p in players[1] if p[1])
        )

        # Backward pass over the events: a player who enters is removed from
        # the set, a player who leaves is added. The resulting sets are the
        # starting state of the forward pass below.
        for r in reversed(table[1:]):
            player = r[6]
            i = teams.index(r[5])
            if player == "Team":
                continue

            if r[-2] == "Enters Game":
                if player in initPlayers[i]:
                    initPlayers[i].remove(player)
            elif r[-2] == "Leaves Game":
                initPlayers[i].add(player)

        table[0].extend(["Lineup", "Lineup Time"])
        # Per team: lineup string -> accumulated seconds.
        lineupTimes = [{}, {}]
        lastLineup, lastLineupTime = [None, None], [periodLength, periodLength]
        for r in table[1:]:
            player = r[6]
            i = teams.index(r[5])

            if player != "Team":
                if r[-2] == "Leaves Game":
                    if player in initPlayers[i]:
                        initPlayers[i].remove(player)
                else:
                    # Any other action (not only "Enters Game") puts the
                    # acting player into the lineup.
                    initPlayers[i].add(player)

            lineup = "; ".join(sorted(initPlayers[i]))
            r.append(lineup)

            lineupTime = r[3]
            if lastLineup[i] == None:
                lastLineup[i] = lineup

            # Credit the time elapsed since this team's previous event to the
            # lineup that was recorded at that previous event.
            lineupTimes[i][lastLineup[i]] = lineupTimes[i].setdefault(lastLineup[i], 0) + computeSeconds(lastLineupTime[i]) - computeSeconds(lineupTime)

            # Accumulated time of the current lineup so far, as "M:SS".
            r.append(computeTime(lineupTimes[i].setdefault(lineup, 0)))

            lastLineup[i], lastLineupTime[i] = lineup, lineupTime

    addLineup(table)

    return table
示例#5
0
def crawl(sport, year, division, org, game, url, neutral=False):
    """Download one game's tables to CSV files and then parse them.

    Works in two stages, each guarded by a flag file inside the game
    directory so that finished work is not repeated:

    1. ".done"   -- fetch the game page and every box score / play-by-play
       page linked from its header menu, dumping the tables as CSV.
    2. ".parsed" -- combine the per-team box scores and the per-period
       play-by-play files into "... - All (Parsed)" files.

    Errors in either stage are printed and swallowed; the stage's flag is
    then not written, so the stage is retried on the next call.

    Args:
        sport, year, division: Substituted into the module-level ``data``
            path template; also used for logging.
        org: Team directory name below the data directory.
        game: Game label; "/" is replaced by "." to form the directory name.
        url: Game page URL, joined onto the module-level ``domain``.
        neutral: When true, a ".neutral" flag file is created for the game.

    Relies on names defined outside this function, among them ``data``,
    ``domain``, ``log``, ``print2``, ``parseURL``, ``parseTable``,
    ``dumpTable``, ``rawDumpTable``, ``transposeTable``, ``loadTable``,
    ``parseBoxScore`` and ``parsePlayByPlay``.
    """
    # NOTE(review): this rebinds the module-level `data`. If `data` starts out
    # as a template with placeholders, a later call formats the already
    # formatted string -- confirm callers reset it between calls.
    global data
    data = data.format(sport, year, division)

    gamename = game.replace('/', '.')

    def readFlag(flag):
        """Return whether ``flag`` exists; creates the game directory first.

        NOTE(review): os.mkdir requires the parent (org) directory to exist.
        """
        if not os.path.exists(os.path.join(data, org, gamename)):
            os.mkdir(os.path.join(data, org, gamename))

        return os.path.exists(os.path.join(data, org, gamename, flag))

    def setFlag(flag):
        """Create (or truncate) an empty flag file in the game directory."""
        with open(os.path.join(data, org, gamename, flag), 'w') as f:
            pass

    if neutral and not readFlag(".neutral"):
        setFlag(".neutral")

    # Template for every CSV of this game; "{}" is filled with the table name.
    filename = os.path.join(data, org, gamename, "{}.csv")

    # ---- Stage 1: download ------------------------------------------------
    if not readFlag(".done"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))

            gs = parseURL(gamelink)

            # Pause between requests.
            sleep(2)

            gamescore = None
            gameinfo = None

            periods = []
            teams = []
            # Index into `periods` for the next per-period box score link.
            nextPeriod = 0
            for table in gs.select("div.header_menu a"):
                # Only box score and play-by-play links are followed.
                if (
                        table["href"] == "#" or
                        not (
                            table["href"].startswith("/game/box_score") or
                            table["href"].startswith("/game/play_by_play")
                        )
                    ):
                    continue

                tablelink = urljoin(domain, table["href"])
                print2("{} \033[4m{}\033[0m".format(table.text.strip(), tablelink))

                ts = parseURL(tablelink)

                # Score and info tables are saved once, from the first page
                # that is fetched.
                if gamescore is None:
                    gamescore = parseTable(ts.select("table:nth-of-type(1)")[0])
                    dumpTable(
                        gamescore,
                        filename.format("Score")
                    )

                if gameinfo is None:
                    gameinfo = transposeTable(
                        parseTable(ts.select("table:nth-of-type(3)")[0]) +
                        parseTable(ts.select("table:nth-of-type(4)")[0])
                    )
                    dumpTable(
                        gameinfo,
                        filename.format("Info")
                    )

                # Score table layout: row 0 holds the period labels (after the
                # first cell), rows 1 and 2 start with the team names.
                teams = [gamescore[1][0].text.strip(), gamescore[2][0].text.strip()]
                periods = [v.text.strip() for v in gamescore[0][1:]]

                if table["href"].startswith("/game/box_score"):
                    # The link labelled "Box Score" is the whole-game table;
                    # any other box score link is assigned the next period
                    # label in order.
                    if table.text.strip() == "Box Score":
                        sfilename = filename.format("Box Score - {}")
                    else:
                        sfilename = filename.format(periods[nextPeriod] + " - {}")
                        nextPeriod += 1

                    # One table per team.
                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(5)")[0], header=1),
                        sfilename.format(teams[0])
                    )
                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(6)")[0], header=1),
                        sfilename.format(teams[1])
                    )
                elif table["href"].startswith("/game/play_by_play"):
                    sfilename = filename.format("Play by Play - {}")

                    # The last score column is skipped (presumably the total
                    # -- TODO confirm); period i is read from the selector
                    # with index 6 + 2 * i.
                    for (i, period) in enumerate(periods[:-1]):
                        dumpTable(
                            parseTable(ts.select("table:nth-of-type({})".format(6 + 2 * i))[0], header=0),
                            sfilename.format(period)
                        )

                sleep(2)

            # Chained comparison: true when both values are still None, i.e.
            # no box score / play-by-play link was followed.
            if gamescore == gameinfo == None:
                raise Exception("Not a game.")

            setFlag(".done")

            sleep(2)
        except Exception as e:
            # Best-effort: report and continue; ".done" stays unset.
            print2(colored("Error: ", "red"), e)
        finally:
            print2()

    # ---- Stage 2: parse ---------------------------------------------------
    # Runs whenever ".parsed" is missing, even if stage 1 just failed; in that
    # case any exception raised below is caught and reported.
    if not readFlag(".parsed"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))
            print2("Parsing...")

            gamescore = loadTable(filename.format("Score"))

            sfilename = filename.format("Box Score - {}")
            teams = [gamescore[1][0], gamescore[2][0]]
            with open(filename.format("Box Score - All (Parsed)"), "w") as af:
                for team in teams:
                    boxScore = parseBoxScore(
                        sfilename.format(team),
                        filename.format("Info"),
                        team,
                        "All"
                    )

                    # Keep the header row only for the first team.
                    rawDumpTable(boxScore[(0 if team == teams[0] else 1):], af)

            sfilename = filename.format("Play by Play - {}")
            periods = gamescore[0][1:]
            with open(filename.format("Play by Play - All (Parsed)"), "w") as af:
                # As in stage 1, the last score column is not a period file.
                for period in periods[:-1]:
                    playByPlay = parsePlayByPlay(
                        sfilename.format(period),
                        period,
                        filename.format("Info")
                    )

                    # Keep the header row only for the first period.
                    rawDumpTable(playByPlay[(0 if period == periods[0] else 1):], af)

            setFlag(".parsed")
        except Exception as e:
            # Best-effort: report and continue; ".parsed" stays unset.
            print2(colored("Error: ", "red"), e)
        finally:
            print2()