class WomensProSoccer(object):
    """Download schedule from official WPS website and process information."""

    def __init__(self):
        self.schedule = GetSchedule()
        self.url = "http://www.womensprosoccer.com/Home/schedule/2011-wps-schedule"

    def crawl(self):
        """Crawl the page with BeautifulSoup for applicable information."""
        schedule = []
        soup = self.schedule.load_page(self.url)
        print 'Locating scheduling section'
        tbody = soup.find("tbody").findAll('tr')
        del tbody[0]  # Remove table header
        for row in tbody:
            stat = row.findAll("div", {"align": "center"})
            stat = [self.cleaner(info, 1) for info in stat]  # Remove <div>
            # If the row consists of one column it contains the date. This is
            # quicker than searching for <th> with BeautifulSoup.
            if len(stat) == 1:
                date = self.date(stat[0])
            else:
                match = {}  # Dictionary to hold match information
                match['date'] = date  # Assign the date of the last header row
                match['team1'] = stat[0][0]
                match['team2'] = stat[1][0]
                match['venue'] = stat[2][0]
                match.update(self.score(stat[3]))
                # Fourth element is a link to the match report
                print stat[5]
                match['attendance'] = self.attendance(stat[5])
                print match
                # Add match dictionary to schedule list
                schedule.append(match)
        return schedule

    def cleaner(self, html, repeat):
        """Strip enclosing tags up to `repeat` levels deep, returning a list."""
        count = 0
        html = unicode(html)
        soup = BeautifulSoup(html)
        for element in soup.findAll(True):
            if count == repeat:
                break
            else:
                content = element.contents
                count += 1
        return content

    def date(self, html):
        """Find the date in a table row, allowing for the extra formatting of
        playoff matches."""
        row = self.cleaner(html, 1)  # Remove <strong>
        if len(row) == 1:
            date = row[0]
        else:
            # Process ugly formatting of playoff games
            row = [element for element in row
                   if isinstance(element, NavigableString)]
            date = row[-1]
            # TODO(pamolloy): Store game title
        return date

    def score(self, section):
        """Process the score into number of (penalty) goals for each team."""
        match = {}
        if section[0] == 'Postponed':
            pass
        elif section[0] == '\n':  # Penalty goals
            section = section[1]
            goals = re.findall('\d', unicode(section))
            match['goals1'] = int(goals[0])
            match['goals2'] = int(goals[1])
            match['pens1'] = int(goals[2])
            match['pens2'] = int(goals[3])
        else:
            section = section[0]
            match['goals1'] = int(section[0])
            match['goals2'] = int(section[4])
        return match

    def attendance(self, html):
        """Find the attendance within the list."""
        print html
        if len(html) == 1:
            attendance = html[0]
        elif len(html) == 2:
            print html
            attendance = [element for element in html
                          if isinstance(element, NavigableString)]
            print attendance
            attendance = attendance[0]
        elif len(html) == 0:  # Postponed game
            attendance = int()
        return attendance
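# GetSchedule is defined elsewhere in the project; the crawlers here only
# assume it exposes a load_page() method that fetches a URL and returns a
# parsed BeautifulSoup tree. A minimal sketch of that assumed interface
# (illustrative only, not the project's actual implementation):
#
#   import urllib2
#   from BeautifulSoup import BeautifulSoup
#
#   class GetSchedule(object):
#       def load_page(self, url):
#           """Fetch a page and return it as a BeautifulSoup object."""
#           page = urllib2.urlopen(url).read()
#           return BeautifulSoup(page)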
class USSoccer(object):
    """Download schedule from official USSF website and process information."""

    def __init__(self):
        self.schedule = GetSchedule()
        self.url = "http://www.ussoccer.com/Schedule-Tickets/Schedule.aspx"
        self.att = {"class": "genericTable"}

    def crawl(self):
        """Crawl the page with BeautifulSoup for applicable information."""
        schedule = []
        soup = self.schedule.load_page(self.url)
        print 'Locating scheduling section: {}'.format(self.att.values()[0])
        table = soup.find("table", self.att).findAll('tr')
        del table[0]  # Remove table header
        for row in table:
            #stat = row.findAll("div", {"align": "center"})
            stats = row.contents
            stats = [element for element in stats if element != '\n']
            stats = [element.contents for element in stats]  # Remove <td>
            match = {}  # Dictionary to hold match information
            match['date'] = self.cleaner(stats[0], 1)[0]
            match['time'] = self.cleaner(stats[2], 1)[0]
            stadium = self.cleaner(stats[3][0], 1)[0]
            city = stats[3][2]
            match['venue'] = '{}, {}'.format(stadium, city)
            teams = stats[1][0]
            teams = teams.split(' vs. ')
            match['team1'] = teams[0]
            match['team2'] = teams[1]
            channels = stats[4][0]
            channels = channels.strip()
            if channels:  # Skip rows with no TV listing
                channels = channels.split(', ')
                count = 0
                for station in channels:
                    match['tv{}'.format(count)] = station
                    count += 1
            # Fifth element is "Info Center"
            print match
            # Add match dictionary to schedule list
            schedule.append(match)
        return schedule

    def cleaner(self, html, repeat):
        """Strip enclosing tags up to `repeat` levels deep, returning a list."""
        count = 0
        html = unicode(html)
        soup = BeautifulSoup(html)
        for element in soup.findAll(True):
            if count == repeat:
                break
            else:
                content = element.contents
                count += 1
        return content
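# The cleaner() helper shared by the crawlers peels off one enclosing tag per
# count, so a typical call looks roughly like this (illustrative values,
# assuming BeautifulSoup 3 parsing):
#
#   >>> USSoccer().cleaner(u'<div align="center"><strong>7:00 PM ET</strong></div>', 1)
#   [<strong>7:00 PM ET</strong>]
#
# Passing repeat=2 would descend one level further and return [u'7:00 PM ET'].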
class MLSSoccer(object):
    """Download schedule from official MLS website and process information."""

    def __init__(self):
        self.schedule = GetSchedule()
        self.url = "http://www.mlssoccer.com/schedule?month=all&year=2011"
        self.att = "schedule-page"

    def crawl(self):
        """Crawl the page with BeautifulSoup for applicable information."""
        schedule = []
        soup = self.schedule.load_page(self.url)
        print 'Locating scheduling section: {}'.format(self.att)
        section = soup.find("div", {"class": self.att})
        for table in section.findAll("table"):
            table_body = table.find('tbody')
            date = self.date(table)
            table_rows = table_body.findAll('tr')
            for row in table_rows:
                match = {}
                # NOT easily processed information
                match['date'] = date
                details = self.details(row)
                match.update(details)
                goals = self.score(row)
                match.update(goals)
                # Easily processed information
                match['venue'] = self.generic(row, "views-field venue")
                match['team1'] = self.generic(row, "views-field home-team")
                match['team2'] = self.generic(row, "views-field away-team")
                # Add match dictionary to schedule list
                schedule.append(match)
        return schedule

    def date(self, section):
        """Find the date of each match based on the last preceding <h3> tag."""
        date = section.findPreviousSibling("h3")
        date = BeautifulSoup(unicode(date))
        date = date.h3.contents[0]  # Remove tags
        return date

    def details(self, section):
        """Process the kickoff time and TV channels from the details cell."""
        match = {}
        html = section.find("td", {"class": "views-field start-time"})
        details = html.contents
        # If the game has passed, ignore "Final"
        if details[0] == u'Final':
            return match
        else:
            match['hour'] = details[0]
        count = 0
        channels = html.findAll('strong')
        for station in channels:
            station = BeautifulSoup('{}'.format(station))
            match['tv{}'.format(count)] = station.strong.contents[0]
            count += 1
        return match

    def score(self, section):
        """Process the score into number of (penalty) goals for each team."""
        match = {}
        score = section.find("td", {"class": "views-field score"}).contents
        # Ignore score for upcoming games, which return an empty list
        if score == []:
            return match
        else:
            score = score[0]  # Select first string from list
        # Store penalties
        if re.search('(\(|\))', score):
            match['goals1'] = int(score[0])
            match['goals2'] = int(score[8])
            match['pens1'] = int(score[3])
            match['pens2'] = int(score[11])
        elif re.search('[0-9]', score):
            match['goals1'] = int(score[0])
            match['goals2'] = int(score[4])
        else:
            pass
        return match

    def generic(self, section, attribute):
        """Find and return the contents of a table cell by its class attribute."""
        info = section.find("td", {"class": attribute}).contents[0]
        return info
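# A minimal driver showing how the three crawlers might be run together
# (illustrative only; the real calling code and any persistence of the
# results live elsewhere in the project):
if __name__ == '__main__':
    combined = []
    for league in (WomensProSoccer(), USSoccer(), MLSSoccer()):
        combined.extend(league.crawl())
    print '{} matches collected'.format(len(combined))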