def __init__(self, row, gunners=False):
    """Populate the fixture from a positional ``row`` and build its URLs.

    ``row`` is read by index: 1 is the kick-off timestamp in
    ``YYYY-MM-DD HH:MM:SS`` form, 2 the home team, 3 the away team,
    4 the venue and 6 the value stored as ``played``.  ``gunners`` is
    stored on the instance unchanged.
    """
    kick_off = datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S')
    self.kick_off = kick_off
    self.home_team = row[2]
    self.away_team = row[3]
    self.venue = row[4]
    self.played = row[6]
    self.gunners = gunners

    # Team names as used in the URLs, after applying the module-level
    # CHANGES substitutions via list_replace.
    self.home_url = list_replace([self.home_team], CHANGES)[0]
    self.away_url = list_replace([self.away_team], CHANGES)[0]

    # Both URLs share the same five path components:
    # year / lower-cased month abbreviation / zero-padded day / home-v-away.
    url_parts = (
        kick_off.year,
        kick_off.strftime('%b').lower(),
        '%02d' % kick_off.day,
        self.home_url,
        self.away_url,
    )

    stats_url_template = 'http://www.guardian.co.uk/football/match/{0}/{1}/{2}/{3}-v-{4}'
    events_url_template = 'http://www.guardian.co.uk/football/match-popup/{0}/{1}/{2}/{3}-v-{4}'
    self.stats_url = stats_url_template.format(*url_parts)
    self.events_url = events_url_template.format(*url_parts)
def scrape_events(self):
    """Scrape the match events listed at ``self.events_url``.

    Returns a named tuple ``(minute, event_type, event)`` holding three
    parallel lists with one entry per ``<tr class="event">`` row on the
    page.  A cell that cannot be read from a row is recorded as ``''`` so
    the three lists always stay the same length.  If the page cannot be
    fetched, the error is printed and the named tuple is returned with
    three empty lists.
    """
    url = self.events_url
    # Single-argument print(...) parses as a statement on Python 2 and as a
    # function call on Python 3; the emitted text is unchanged.
    print('Scraping events form  %s' % (url,))

    # Build a real instance at the end instead of assigning onto the
    # namedtuple class, which would overwrite its field descriptors.
    Output = namedtuple('output', 'minute event_type event')
    minutes, event_types, events = [], [], []

    try:
        page = urlopen(url).read()
    except (HTTPError, URLError) as e:
        print("Can't find events page %s" % (e,))
        return Output(minutes, event_types, events)

    soup = BeautifulSoup(page)
    # List of table rows with css class event
    table = soup.findAll('tr', {'class': 'event'})

    # Defines what's to be changed for /r/soccer's custom match thread
    # icons
    changes = [('SUB', '[](//#sub) Sub'),
               ('RED CARD', '[](//#red) Red'),
               ('YELLOW CARD', '[](//#yellow) Yellow'),
               ('GOAL', '[](//#ball) **Goal**')]

    # A missing tag surfaces as AttributeError, a row with too few children
    # as IndexError; either way record a blank instead of aborting the scrape.
    missing = (AttributeError, IndexError)

    for row in table:
        try:
            minutes.append(row.td.contents[0])
        except missing:
            minutes.append('')

        try:
            event_types.append(
                row.td.next_sibling.next_sibling.contents[1].contents[0])
        except missing:
            event_types.append('')

        try:
            events.append(row.td.next_sibling.next_sibling.contents[2])
        except missing:
            events.append('')

    # Replace the event types with custom event types defined above
    event_types = list_replace(event_types, changes)

    return Output(minutes, event_types, events)