/
scrapers.py
158 lines (130 loc) · 5.24 KB
/
scrapers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from bs4 import BeautifulSoup
import re
import requests
from hmc_urllib import getHTML
from urllib.request import http
import scraping_functions as sf
def scrape_tournament(filename, url_list):
    """Write the matches of every bracket URL in url_list to a text file.

    filename: a string; the output file handed to the site-specific writer.
    url_list: an iterable of bracket URL strings.

    Each URL is printed, then passed to the first writer whose site marker
    occurs in the URL. URLs matching no marker are printed and skipped.
    """
    # Order matters: the first marker found in the URL decides the writer.
    site_writers = (
        ("challonge", write_txt_from_challonge),
        ("teamliquid", write_txt_from_liquipedia),
        ("smash.gg", write_txt_from_smashgg),
    )
    for link in url_list:
        print(link)
        for marker, writer in site_writers:
            if marker in link:
                writer(link, filename)
                break
def _is_negative_score(text):
    """Return True only when text parses as an integer below zero."""
    try:
        return int(text) < 0
    except ValueError:
        # Non-numeric score text is not treated as negative.
        return False


def write_txt_from_challonge(url, file):
    """Append the parsed matches of a Challonge bracket page to a text file.

    url: a string; the URL for a Challonge bracket.
    file: a string; the name of the file to be written. It is passed through
        sf.add_txt before being opened (presumably to ensure a .txt
        extension -- confirm against scraping_functions).

    Example:
        write_txt_from_challonge('http://apex2015melee.challonge.com/singles', 'Apex 2015')

    Raises:
        IndexError: if a match chunk has fewer lines than the fixed offsets
            (2, 7, 24, 27) read below.
    """
    out_path = sf.add_txt(file)

    # Drop these UI label runs, then cut the page into one chunk per match.
    # Presumably the labels are removed so the line offsets used below are the
    # same for every chunk -- confirm against a live page.
    page_text = getHTML(url)[0]
    page_text = page_text.replace('Mark as In Progress\n\n\nUnmark as In Progress\n\n\n\n', '')
    page_text = page_text.replace('\n\n\n\n\n\nEdit\n\n\nReopen', '')
    chunks = page_text.split('\n\n\n\nMatch Details\n\n\n\n\n\n\n')[1:]

    parsed_lines = []
    for chunk in chunks:
        fields = chunk.splitlines()
        # Offsets 2 and 7 feed the name slots of the match line, 24 and 27 the
        # score slots (inferred from how the line is assembled below).
        if fields[2] == "" or fields[7] == "":
            continue
        if _is_negative_score(fields[24]) or _is_negative_score(fields[27]):
            continue
        raw_match = fields[2] + "," + fields[24] + "," + fields[7] + "," + fields[27]
        stripped = sf.strip_match(raw_match)
        if stripped is None:
            continue
        parsed = sf.parse_match(stripped)
        if parsed != "":
            parsed_lines.append(parsed + "\n")

    # Same encoding as the Liquipedia and smash.gg writers, so the result does
    # not depend on the platform's default encoding.
    with open(out_path, 'a', encoding="utf8") as out_file:
        out_file.write("".join(parsed_lines))
def format_liquipedia_url(url):
    """Convert a Liquipedia bracket URL to its page-source (edit) URL, if necessary.

    url: a string; a Liquipedia page URL.

    Returns the URL unchanged when it already contains "&action=edit" or does
    not contain "wiki.teamliquid.net/smash/". Otherwise everything after that
    prefix is moved into an "index.php?title=...&action=edit" query.
    """
    if "&action=edit" not in url:
        # Raw strings keep the regex escapes (\.) out of Python's own string
        # escape processing.
        url = re.sub(
            r"(wiki\.teamliquid\.net/smash/)(.*)",
            r"\1index.php?title=\2&action=edit",
            url,
        )
    return url
def write_txt_from_liquipedia(url, filename):
    """Append the parsed matches of a Liquipedia bracket page to a text file.

    url: a string; a Liquipedia bracket URL (converted to its edit URL first).
    filename: a string; the file the parsed matches are appended to.

    The page's first <textarea> is scanned for lines that start with a bracket
    slot token such as "|r1m1" or "|l2m3". Consecutive lines sharing the same
    token are joined with spaces into one match line, which is then handed to
    the scraping_functions helpers for filtering and parsing.
    """
    url = format_liquipedia_url(url)
    try:
        soup = BeautifulSoup(requests.get(url).content)
    except http.client.IncompleteRead as e:
        # Fall back to whatever part of the response was received.
        soup = BeautifulSoup(e.partial)
    match_data = str(soup.find("textarea"))

    slot_pattern = re.compile(r'\|[rl]\d+m\d+')
    # The leading empty entry is deliberate: the helpers below have always
    # been called once with an empty line first, and that is kept as-is.
    grouped_lines = [""]
    previous_slot = None
    for line in match_data.split("\n"):
        found = slot_pattern.match(line)
        if found is None:
            continue
        slot = found.group(0)
        # Compare whole slot tokens; a prefix test would wrongly merge
        # "|r1m10..." into a preceding "|r1m1..." group.
        if slot == previous_slot:
            grouped_lines[-1] += " " + line
        else:
            grouped_lines.append(line)
        previous_slot = slot

    parsed_lines = []
    for line in grouped_lines:
        stripped_line = sf.strip_match(line)
        if stripped_line is not None and sf.match_played(url, line):
            parsed_match = sf.parse_match(stripped_line)
            if parsed_match != "":
                parsed_lines.append(parsed_match + "\n")

    with open(filename, 'a', encoding="utf8") as out_file:
        out_file.write("".join(parsed_lines))
def format_smashgg_url(url):
    """Convert a smash.gg bracket URL to its API URL, if necessary.

    url: a string; either a bracket URL whose last path segment is the phase
        group id, or an api.smash.gg URL.

    Returns an api.smash.gg phase_group URL carrying the
    "?expand[0]=sets&expand[1]=entrants" query. A URL that already has both
    parts is returned unchanged.
    """
    if "api.smash.gg" not in url:
        # Strip trailing slashes first; otherwise the last segment is empty
        # and the phase group id is lost.
        phase_group_id = url.rstrip("/").split("/")[-1]
        url = "http://api.smash.gg/phase_group/" + phase_group_id
    api_string = "?expand[0]=sets&expand[1]=entrants"
    if api_string not in url:
        url += api_string
    return url
def parse_smashgg_set(set, entrant_dict):
    """Return "winner,loser" for a decided smash.gg set, or None.

    set: a dict describing one set; the keys "winnerId", "entrant1Id",
        "entrant1Score", "entrant2Id" and "entrant2Score" are read.
    entrant_dict: a dict mapping entrant id to entrant name.

    Returns the two names (each passed through sf.normalize_name) joined by a
    comma, winner first. Returns None when the set is skipped:
      * either entrant id is missing/falsy,
      * winnerId matches neither entrant (e.g. the set has no winner yet),
      * both scores are ints and either one is negative (presumably a
        disqualification marker -- confirm against the API).
    """
    winner_id = set["winnerId"]
    first_id = set["entrant1Id"]
    first_score = set["entrant1Score"]
    second_id = set["entrant2Id"]
    second_score = set["entrant2Score"]

    if not (first_id and second_id):
        return None
    # Without this guard a set whose winner is neither entrant would be
    # reported as a win for entrant 2.
    if winner_id not in (first_id, second_id):
        return None
    # Scores are only checked when both are ints; sets with no recorded
    # scores are still reported.
    if type(first_score) is int and type(second_score) is int:
        if first_score < 0 or second_score < 0:
            return None

    first_name = sf.normalize_name(entrant_dict[first_id])
    second_name = sf.normalize_name(entrant_dict[second_id])
    if first_id == winner_id:
        return first_name + "," + second_name
    return second_name + "," + first_name
def write_txt_from_smashgg(url, filename):
    """Append the sets of a smash.gg bracket to a file, grand finals last.

    url: a string; a smash.gg bracket or API URL.
    filename: a string; the file the parsed sets are appended to.
    """
    api_url = format_smashgg_url(url)
    payload = requests.get(api_url).json()

    names_by_id = {
        entry["id"]: entry["name"] for entry in payload["entities"]["entrants"]
    }

    regular_lines = []
    grand_final_lines = []
    for bracket_set in payload["entities"]["sets"]:
        outcome = parse_smashgg_set(bracket_set, names_by_id)
        if not outcome:
            continue
        # Grand-finals sets are held back so they land after all other sets.
        bucket = grand_final_lines if bracket_set["isGF"] else regular_lines
        bucket.append(outcome + "\n")

    with open(filename, 'a', encoding="utf8") as out_file:
        out_file.write("".join(regular_lines + grand_final_lines))
if __name__ == "__main__":
    # Script mode: scrape every smash.gg URL listed (one per line) in the
    # URL file and append the results to a single results file.
    results_path = "MeleeResults/Tipped-Off-11.txt"
    with open("MeleeUrls/Tipped-Off-11.txt", "r") as url_file:
        for raw_url in url_file:
            write_txt_from_smashgg(raw_url.strip(), results_path)