Пример #1
0
def main(argv):
    """Scrape the ESPN college-football schedule into per-page JSON/CSV files.

    One URL is built per schedule page: a ``seasontype/3`` page, weeks 1-16
    of ``seasontype/2``, and a final ``seasontype/3`` page.  Every page is
    downloaded and parsed, and each page that yields at least one game is
    written to ``sched<N>.json`` and ``sched<N>.csv`` (``N`` is the 1-based
    page number) using the path prefix built from ``settings.predict_root``,
    the module-level ``year`` and ``settings.predict_sched``.  Existing
    ``sched*.*`` files in that directory are deleted first.  Finally every
    directory and file under ``settings.predict_root`` is chmod-ed so all
    users can read and write it.

    Output columns: ``Index``, ``Year``, ``Date``, ``TeamA``, ``Home``
    (``"Neutral"`` or the name of ``TeamB``), ``TeamB`` and ``Score``.

    Args:
        argv: Command-line arguments; not read by this function.
    """
    starturl = "http://www.espn.com/college-football/schedule"
    path = "{0}{1}/{2}".format(settings.predict_root, year,
                               settings.predict_sched)

    print("Scrape Schedule Tool")
    print("**************************")
    print("data is from {0}".format(starturl))
    # A bare ``print`` is a no-op expression in Python 3; call it so the
    # intended blank line is actually emitted.
    print()
    print("Year is: {0}".format(year))
    print("Directory location: {0}".format(path))
    print("**************************")

    Path(path).mkdir(parents=True, exist_ok=True)
    for p in Path(path).glob("sched*.*"):
        p.unlink()

    url = []
    url.append("{0}/_/week/1/year/{1}/seasontype/3".format(starturl, year))
    if year == int(now.year):
        # The current season is addressed without an explicit year segment.
        for week in range(1, 17):
            url.append("{0}/_/week/{1}/seasontype/2".format(starturl, week))
        url.append("{0}/_/week/1/seasontype/3".format(starturl))
    else:
        for week in range(1, 17):
            url.append("{0}/_/week/{1}/year/{2}/seasontype/2".format(
                starturl, week, year))
        url.append("{0}/_/week/1/year/{1}/seasontype/3".format(starturl, year))

    pages = []
    for item in url:
        req = Request(
            url=item,
            headers={
                'User-Agent':
                ' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'
            })
        try:
            # Read inside ``with`` so the HTTP response is always closed.
            with urlopen(req) as response:
                page = response.read()
        except HTTPError as e:
            # An error response still carries a body; parse that instead.
            page = e.read()
        pages.append(BeautifulSoup(page, "html5lib"))

    for loop, page in enumerate(pages, start=1):
        # The final page is laid out differently: its score cell repeats
        # every 7 <td> cells instead of 6, and every game is marked neutral.
        is_last_page = loop == len(pages)
        dates = page.findAll("h2", {"class": "table-caption"})
        tables = page.findAll('table', {"class": "schedule"})
        index = 0
        indexes = []
        years = []
        game_dates = []
        teams_a = []
        home_col = []
        teams_b = []
        score_cells = []
        scores = []
        # Tables and date captions are paired by position.
        for dateidx, table in enumerate(tables):
            teams = table.findAll('abbr')
            cell_texts = []
            for cell in table.findAll('td'):
                data = cell.find(text=True)
                # Keep only cells that look like a result: a cancel/postpone
                # notice, or text with a comma that num_there() accepts.
                if data is not None and (
                        "Canceled" in data or "Postponed" in data or
                        (',' in data and num_there(data))):
                    cell_texts.append(data)
                else:
                    cell_texts.append("?")
            step = 7 if is_last_page else 6
            score_cells.extend(cell_texts[2::step])

            neutral = table.findAll('tr', {'class': ['odd', 'even']})
            if dateidx < len(dates):
                theDate = dates[dateidx].find(text=True)
            else:
                theDate = "?"
            count = 0
            # <abbr> tags alternate: even positions start a new game (TeamA),
            # odd positions supply that game's TeamB.
            for line, team in enumerate(teams):
                if line % 2 == 0:
                    game_dates.append(theDate)
                    # Games dated in January belong to the next calendar year.
                    if "January" not in theDate:
                        years.append(year)
                    else:
                        years.append(year + 1)
                    teams_a.append(pyBlitz.CleanString(team['title']))
                    if not is_last_page:
                        try:
                            if (neutral[count]['data-is-neutral-site'] ==
                                    'true'):
                                home_col.append("Neutral")
                            else:
                                # Placeholder; replaced by TeamB below.
                                home_col.append("?")
                        except KeyError:
                            # Rows without the attribute count as neutral.
                            home_col.append("Neutral")
                    else:
                        home_col.append("Neutral")
                    if index < len(score_cells):
                        scores.append(score_cells[index])
                    else:
                        scores.append("?")
                    count += 1
                    index += 1
                    indexes.append(index)
                else:
                    teams_b.append(pyBlitz.CleanString(team['title']))
                    if home_col[-1] == '?':
                        home_col[-1] = teams_b[-1]

        df = pd.DataFrame(indexes, columns=['Index'])
        df['Year'] = years
        df['Date'] = game_dates
        df['TeamA'] = teams_a
        df['Home'] = home_col
        df['TeamB'] = teams_b
        df['Score'] = scores
        if not df.empty:
            filename = "{0}sched{1}.json".format(path, loop)
            with open(filename, 'w') as f:
                f.write(df.to_json(orient='index'))

            # Re-read the JSON so the CSV rows follow the same record order.
            with open(filename) as sched_json:
                dict_sched = json.load(sched_json,
                                       object_pairs_hook=OrderedDict)

            filename = "{0}sched{1}.csv".format(path, loop)
            # ``with`` guarantees the CSV is flushed and closed even if a
            # write raises.
            with open(filename, 'w', newline='') as sched_sheet:
                csvwriter = csv.writer(sched_sheet)
                for count, row in enumerate(dict_sched.values()):
                    if count == 0:
                        csvwriter.writerow(row.keys())
                    csvwriter.writerow(row.values())

    # NOTE(review): this makes the whole output tree world-writable; confirm
    # that is intended for the deployment environment.
    for root, dirs, files in os.walk(settings.predict_root):
        for d in dirs:
            os.chmod(os.path.join(root, d),
                     stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
        for f in files:
            os.chmod(
                os.path.join(root, f), stat.S_IRUSR | stat.S_IWUSR
                | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH)
    print("done.")
Пример #2
0
# Column lists filled in parallel with A, B, C and IDX, which are created
# earlier in the script (outside this excerpt).
D = []
E = []
F = []
index = 0
# Seed one entry per BPI row: A gets the school name, F its class, B-E start
# as "?" placeholders, and IDX gets the 1-based row number as a string.
for row in dict_bpi.values():  # BPI is the main key: every school gets a row
    A.append(row["School"])
    B.append("?")
    C.append("?")
    D.append("?")
    E.append("?")
    F.append(row["Class"])
    index += 1
    IDX.append(str(index))

# For each stats-merge entry, fill column B with the teamrankings name of the
# matching school.
for item in dict_stats_merge:
    teamrankings = pyBlitz.CleanString(item['teamrankings'])
    team = pyBlitz.CleanString(item['BPI'])
    # A non-blank "corrected BPI" value overrides the matched BPI name.
    if (item['corrected BPI'].strip() != ""):
        team = pyBlitz.CleanString(item['corrected BPI'])
    # NOTE(review): GetIndex is defined elsewhere; the check below suggests it
    # returns -1 when the team is not found - confirm.
    index = GetIndex(A, team, F)
    # Only record the name if it exists in the teamrankings data
    # (compared case-insensitively with surrounding whitespace stripped).
    for row in dict_teamrankings.values():
        if (row['Team'].lower().strip() == teamrankings.lower().strip()):
            if (index > -1):
                B[index] = teamrankings
                break
for item in dict_abbr_merge:
    abbr_team = pyBlitz.CleanString(item['abbr team'])
    stats = pyBlitz.CleanString(item["stats team"].lower().strip())
    if (item["corrected stats team"].lower().strip()):
        stats = pyBlitz.CleanString(
Пример #3
0
# Sorted, de-duplicated team names used as the fuzzy-match candidates.
team_set = set(AllTeams)
stats_teams = sorted(team_set)

file = "{0}merge_schedule.csv".format(settings.data_path)
# Column-oriented copy of the merge table; its keys double as the CSV header.
dict_merge = OrderedDict()
dict_merge["scheduled team"] = []
dict_merge["match ratio"] = []
dict_merge["stats team"] = []
dict_merge["corrected stats team"] = []
values = []
for item in sched_teams:
    # extractOne returns the best candidate; key[0] is the matched name and
    # key[1] its match ratio.
    key = process.extractOne(item, stats_teams, scorer=fuzz.QRatio)
    dict_merge["scheduled team"].append(pyBlitz.CleanString(item))
    dict_merge["match ratio"].append(key[1])
    dict_merge["stats team"].append(pyBlitz.CleanString(key[0]))
    ovr = GetOverride(item, list_overrides)
    dict_merge["corrected stats team"].append(ovr)
    values.append([item, key[1], key[0], ovr])

# Open the CSV in a ``with`` block so it is closed even if a write fails.
with open(file, 'w', newline='') as merge_sheet:
    csvwriter = csv.writer(merge_sheet)
    csvwriter.writerow(dict_merge.keys())
    csvwriter.writerows(values)
print("done.")
Пример #4
0
# POST the data6 payload and keep every <table> element of the response.
r = requests.post(url, data=data6, headers=headers)
soup = BeautifulSoup(r.content, "html5lib")
table6 = soup.findAll("table")

# Parallel column lists: IDX is a running 1-based row number, A the cleaned
# text of the first cell, B and C the raw text of the second and third cells.
IDX=[]
A=[]
B=[]
C=[]
index=0
# table1..table3 come from earlier requests (outside this excerpt).  Each is
# walked the same way: rows without <td> cells and the "School" header row
# are skipped, every other row is appended to the column lists.
for row in table1[0].findAll("tr"):
    col=row.findAll('td')
    if len(col)>0 and col[0].find(text=True)!="School":
        index+=1
        IDX.append(index)
        A.append(pyBlitz.CleanString(col[0].find(text=True)))
        B.append(col[1].find(text=True))
        C.append(col[2].find(text=True))
for row in table2[0].findAll("tr"):
    col=row.findAll('td')
    if len(col)>0 and col[0].find(text=True)!="School":
        index+=1
        IDX.append(index)
        A.append(pyBlitz.CleanString(col[0].find(text=True)))
        B.append(col[1].find(text=True))
        C.append(col[2].find(text=True))
# NOTE(review): this excerpt ends part-way through the table3 loop; only the
# index bookkeeping is visible here.
for row in table3[0].findAll("tr"):
    col=row.findAll('td')
    if len(col)>0 and col[0].find(text=True)!="School":
        index+=1
        IDX.append(index)
Пример #5
0
# Sorted, de-duplicated teamrankings team names to match against the BPI list.
AllTeams = [item["Team"] for item in dict_teamrankings.values()]
team_set = set(AllTeams)
teamrankings = sorted(team_set)

file = "{0}merge_stats.csv".format(settings.data_path)
# Column-oriented copy of the merge table; its keys double as the CSV header.
dict_merge = OrderedDict()
dict_merge["teamrankings"] = []
dict_merge["match ratio"] = []
dict_merge["BPI"] = []
dict_merge["corrected BPI"] = []
values = []
for item in teamrankings:
    # extractOne returns the best candidate; key[0] is the matched name and
    # key[1] its match ratio.
    key = process.extractOne(item, bpi, scorer=fuzz.QRatio)
    dict_merge["teamrankings"].append(pyBlitz.CleanString(item))
    dict_merge["match ratio"].append(key[1])
    dict_merge["BPI"].append(pyBlitz.CleanString(key[0]))
    ovr = GetOverride(item, list_overrides)
    dict_merge["corrected BPI"].append(ovr)
    values.append([item, key[1], key[0], ovr])

# Open the CSV in a ``with`` block so it is closed even if a write fails.
with open(file, 'w', newline='') as merge_sheet:
    csvwriter = csv.writer(merge_sheet)
    csvwriter.writerow(dict_merge.keys())
    csvwriter.writerows(values)
print("done.")
def main(argv):
    """Scrape team names and abbreviations from the ESPN schedule pages.

    Downloads the same set of schedule pages as the schedule scraper (a
    ``seasontype/3`` page, weeks 1-16 of ``seasontype/2`` and a final
    ``seasontype/3`` page), collects every ``<abbr>`` tag found inside the
    ``schedule`` tables and writes the unique teams to ``abbreviation.json``
    and ``abbreviation.csv`` under ``settings.data_path``.  Afterwards every
    directory and file under ``settings.data_path`` is chmod-ed so all users
    can read and write it.

    Output columns: ``Index`` (1-based), ``Team`` and ``Abbreviation``.

    Args:
        argv: Command-line arguments; not read by this function.
    """
    starturl = "http://www.espn.com/college-football/schedule"

    print("Scrape abbreviations Tool")
    print("**************************")
    print("data is from {0}".format(starturl))
    # A bare ``print`` is a no-op expression in Python 3; call it so the
    # intended blank line is actually emitted.
    print()
    print("Year is: {0}".format(year))
    print("Directory location: {0}".format(settings.data_path))
    print("**************************")
    Path(settings.data_path).mkdir(parents=True, exist_ok=True)

    url = []
    url.append("{0}/_/week/1/year/{1}/seasontype/3".format(starturl, year))
    if year == int(now.year):
        # The current season is addressed without an explicit year segment.
        for week in range(1, 17):
            url.append("{0}/_/week/{1}/seasontype/2".format(starturl, week))
        url.append("{0}/_/week/1/seasontype/3".format(starturl))
    else:
        for week in range(1, 17):
            url.append("{0}/_/week/{1}/year/{2}/seasontype/2".format(
                starturl, week, year))
        url.append("{0}/_/week/1/year/{1}/seasontype/3".format(starturl, year))

    pages = []
    for item in url:
        req = Request(
            url=item,
            headers={
                'User-Agent':
                ' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'
            })
        try:
            # Read inside ``with`` so the HTTP response is always closed.
            with urlopen(req) as response:
                page = response.read()
        except HTTPError as e:
            # An error response still carries a body; parse that instead.
            page = e.read()
        pages.append(BeautifulSoup(page, "html5lib"))

    # Map each team name to the first abbreviation seen for it.  Keeping the
    # pair together (instead of de-duplicating names and abbreviations
    # separately) keeps both columns the same length and correctly aligned
    # even when two teams share an abbreviation or one team shows up with
    # more than one.
    abbr_by_team = OrderedDict()
    for page in pages:
        for table in page.findAll('table', {"class": "schedule"}):
            for team in table.findAll('abbr'):
                abbr_by_team.setdefault(pyBlitz.CleanString(team['title']),
                                        team.text)

    df = pd.DataFrame(list(range(1, len(abbr_by_team) + 1)),
                      columns=['Index'])
    df['Team'] = list(abbr_by_team.keys())
    df['Abbreviation'] = list(abbr_by_team.values())

    # The CSV is always (re)created, even when nothing was scraped; ``with``
    # guarantees it is closed if a write raises.
    with open(settings.data_path + 'abbreviation.csv', 'w',
              newline='') as stats_sheet:
        csvwriter = csv.writer(stats_sheet)
        if not df.empty:
            with open(settings.data_path + 'abbreviation.json', 'w') as f:
                f.write(df.to_json(orient='index'))

            # Re-read the JSON so the CSV rows follow the same record order.
            with open(settings.data_path + "abbreviation.json") as stats_json:
                dict_stats = json.load(stats_json,
                                       object_pairs_hook=OrderedDict)

            for count, row in enumerate(dict_stats.values()):
                if count == 0:
                    csvwriter.writerow(row.keys())
                csvwriter.writerow(row.values())

    # NOTE(review): this makes the whole data tree world-writable; confirm
    # that is intended for the deployment environment.
    for root, dirs, files in os.walk(settings.data_path):
        for d in dirs:
            os.chmod(os.path.join(root, d),
                     stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
        for f in files:
            os.chmod(
                os.path.join(root, f), stat.S_IRUSR | stat.S_IWUSR
                | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH)
    print("done.")
Пример #7
0
# Add any Missing Teams Here
# NOTE(review): AddSchool is defined outside this excerpt; presumably it
# appends the (team, abbreviation) pair to the column lists - confirm.
AddSchool("ALABAMA-BIRMINGHAM", "UAB")
AddSchool("ALABAMA A&M", "AAMU")
AddSchool("ALBANY-NY", "ALBY")
AddSchool("WESTERN KENTUCKY", "WKU")
# Add any Missing Teams Here
# Walk the first scraped table and collect one (team, abbreviation) pair per
# qualifying row.
for row in tables[0].findAll("tr"):
    col = row.findAll('td')
    if len(col) > 0:
        # tag: first text node of the first cell ("None" when there is none).
        tag = str(col[0].find(text=True)).strip()
        # tag2: lower-cased markup of the first element in that cell that
        # carries an href attribute.
        tag2 = str(col[0].find(href=True)).lower().strip()
        if (tag != "None"):
            # Keep only rows whose link markup contains "#f".
            if ("#f" in tag2):
                index += 1
                IDX.append(index)
                A.append(pyBlitz.CleanString(tag))
                B.append(col[1].find(text=True))

df = pd.DataFrame(IDX, columns=['Index'])
df['Team'] = A
df['Abbreviation'] = B

# Write the table as JSON, then read it back so the CSV export that follows
# can iterate the records as ordered dicts.
Path(settings.data_path).mkdir(parents=True, exist_ok=True)
with open(settings.data_path + 'abbreviation.json', 'w') as f:
    f.write(df.to_json(orient='index'))

with open(settings.data_path + "abbreviation.json") as stats_json:
    dict_stats = json.load(stats_json, object_pairs_hook=OrderedDict)
# NOTE(review): stats_sheet is opened without a context manager and is not
# closed within this excerpt; make sure the code that follows closes it.
stats_sheet = open(settings.data_path + 'abbreviation.csv', 'w', newline='')
csvwriter = csv.writer(stats_sheet)
count = 0
Пример #8
0
    print ("teamrankings file is missing, run the scrape_teamrankings tool to create")
    exit()
# Load the teamrankings records, keeping the key order found in the file.
with open(file) as stats_file:
    dict_teamrankings = json.load(stats_file, object_pairs_hook=OrderedDict)
# Parallel output columns; IDX holds the 1-based row number as a string.
# Only A and B are filled within this excerpt.
IDX=[]
A=[]
B=[]
C=[]
D=[]
E=[]
F=[]
G=[]
H=[]
index = 0
# For every merge entry, pair its teamrankings record with the BPI school of
# the same name.
for item in dict_merge.values():
    teamrankings = pyBlitz.CleanString(item['teamrankings'])
    team = pyBlitz.CleanString(item['BPI'])
    
    # Find the teamrankings record for this team (case-insensitive, stripped);
    # row_team stays an empty list when nothing matches.
    row_team = []
    for row in dict_teamrankings.values():
        if(row['Team'].lower().strip()==teamrankings.lower().strip()):
            row_team = row  
            break

    # Emit a row for each BPI entry with this school name whose class is
    # exactly "DIVISION 1  FBS" (two spaces, as spelled in the data).
    for row in dict_bpi.values():
        if(row['School'].lower().strip()==team.lower().strip() and row['Class'].upper().strip()=="DIVISION 1  FBS"):
            index+=1
            IDX.append(str(index))
            A.append(team)
            B.append(teamrankings)
            if (row_team):