예제 #1
0
파일: wps.py 프로젝트: pamolloy/USSA
 def __init__(self):
     """Store a GetSchedule instance and the 2011 WPS schedule page URL."""
     # NOTE(review): GetSchedule is defined elsewhere; presumably it is the
     # page-loading helper -- confirm against its definition.
     self.schedule = GetSchedule()
     # Season-specific address of the schedule page.
     self.url = "http://www.womensprosoccer.com/Home/schedule/2011-wps-schedule"
예제 #2
0
파일: wps.py 프로젝트: pamolloy/USSA
class WomensProSoccer(object):
    """Download schedule from official WPS website and process information.

    ``crawl`` returns one dictionary per match with the keys ``date``,
    ``team1``, ``team2``, ``venue`` and ``attendance`` plus, when a result is
    listed, ``goals1``/``goals2`` and (after a shoot-out) ``pens1``/``pens2``.
    """

    def __init__(self):
        """Create the page loader and remember the 2011 schedule URL."""
        self.schedule = GetSchedule()
        self.url = "http://www.womensprosoccer.com/Home/schedule/2011-wps-schedule"

    def crawl(self):
        """Crawl the page with BeautifulSoup for applicable information.

        Returns:
            A list of match dictionaries in the order they appear on the page.
        """
        schedule = []

        soup = self.schedule.load_page(self.url)
        print('Locating scheduling section')
        # Skip the first row, which is the table header.
        rows = soup.find("tbody").findAll('tr')[1:]

        # A date row applies to every match row that follows it. Start with
        # None so a match listed before any date row does not raise NameError.
        date = None

        for row in rows:
            stat = row.findAll("div", {"align": "center"})
            stat = [self.cleaner(info, 1) for info in stat]  # Remove <div>

            # If the row consists of one column it contains the date. This is
            # quicker than searching for <th> with BeautifulSoup
            if len(stat) == 1:
                date = self.date(stat[0])
                continue

            match = {
                'date': date,  # Date of the last header row
                'team1': stat[0][0],
                'team2': stat[1][0],
                'venue': stat[2][0],
            }
            match.update(self.score(stat[3]))
            # stat[4] is a link to the match report and is not stored
            match['attendance'] = self.attendance(stat[5])
            print(match)

            schedule.append(match)

        return schedule

    def cleaner(self, html, repeat):
        """Recursively remove tags for a certain count, returning a list.

        Args:
            html: Markup to unwrap; it is converted with ``unicode`` and
                re-parsed with BeautifulSoup.
            repeat: How many tags, in document order, to step through.

        Returns:
            The ``contents`` list of the ``repeat``-th tag found (or of the
            last tag when there are fewer). When nothing is unwrapped -- the
            markup has no tags or ``repeat`` is 0 -- the parsed top-level
            nodes are returned.
        """
        soup = BeautifulSoup(unicode(html))

        # Fall back to the top-level nodes so tag-free input yields a list
        # instead of an unbound local.
        content = soup.contents
        for count, element in enumerate(soup.findAll(True)):
            if count == repeat:
                break
            content = element.contents

        return content

    def date(self, html):
        """Find the date in a table row. If playoff match create element.

        Args:
            html: The unwrapped contents of a single-column (date) row.

        Returns:
            The date node of the row.
        """
        row = self.cleaner(html, 1)  # Remove <strong>

        if len(row) == 1:
            return row[0]

        # Playoff rows have ugly formatting: keep only the plain strings and
        # take the last one as the date.
        strings = [node for node in row if isinstance(node, NavigableString)]
        return strings[-1]  #TODO(pamolloy): Store game title

    def score(self, section):
        """Process the score into number of (penalty) goals for each team.

        Args:
            section: Contents list of the score cell. A first item of
                ``'Postponed'`` means no result; a first item of ``'\\n'``
                means the second item holds a result with penalties;
                otherwise the first item is the score text, e.g. ``'2 - 1'``.

        Returns:
            A dictionary with ``goals1``/``goals2`` and, for shoot-outs,
            ``pens1``/``pens2``. It is empty when the cell is empty, the
            match is postponed, or the expected numbers are not present.
        """
        match = {}

        if not section or section[0] == 'Postponed':
            return match

        # Whole numbers are extracted with a regex rather than read from fixed
        # character positions, so double-digit values are parsed correctly.
        if section[0] == '\n':  # Penalty goals
            keys = ('goals1', 'goals2', 'pens1', 'pens2')
            numbers = re.findall(r'\d+', unicode(section[1]))
        else:
            keys = ('goals1', 'goals2')
            numbers = re.findall(r'\d+', section[0])

        if len(numbers) < len(keys):
            # Unrecognised text in the score cell: report no result.
            return match

        for key, number in zip(keys, numbers):
            match[key] = int(number)

        return match

    def attendance(self, html):
        """Find the attendance within list.

        Args:
            html: Contents list of the attendance cell.

        Returns:
            The attendance node as found on the page (not converted to a
            number), or ``0`` when the cell is empty (postponed game) or
            holds no plain string.
        """
        if not html:  # Postponed game
            return 0
        if len(html) == 1:
            return html[0]

        # Several nodes: the attendance is the first plain string among them.
        strings = [node for node in html if isinstance(node, NavigableString)]
        return strings[0] if strings else 0
예제 #3
0
파일: ussf.py 프로젝트: pamolloy/USSA
 def __init__(self):
     """Store a GetSchedule instance, the USSF schedule URL and table filter."""
     # NOTE(review): GetSchedule is defined elsewhere; presumably it is the
     # page-loading helper -- confirm against its definition.
     self.schedule = GetSchedule()
     self.url = "http://www.ussoccer.com/Schedule-Tickets/Schedule.aspx"
     # Attribute dictionary; presumably used to locate the schedule <table>
     # by its CSS class -- verify against the code that reads self.att.
     self.att = {"class": "genericTable"}
예제 #4
0
파일: ussf.py 프로젝트: pamolloy/USSA
class USSoccer(object):
    """Download schedule from official USSF website and process information.

    ``crawl`` returns one dictionary per match with the keys ``date``,
    ``time``, ``venue``, ``team1``, ``team2`` and, when broadcasters are
    listed, ``tv0``, ``tv1``, ...
    """

    def __init__(self):
        """Create the page loader and remember where the schedule lives."""
        self.schedule = GetSchedule()
        self.url = "http://www.ussoccer.com/Schedule-Tickets/Schedule.aspx"
        # Attributes identifying the schedule <table> on the page.
        self.att = {"class": "genericTable"}

    def crawl(self):
        """Crawl the page with BeautifulSoup for applicable information.

        Returns:
            A list of match dictionaries in the order they appear on the page.
        """
        schedule = []

        soup = self.schedule.load_page(self.url)
        # Look the class up by key instead of relying on dict ordering.
        print('Locating scheduling section: {}'.format(self.att["class"]))
        # Skip the first row, which is the table header.
        rows = soup.find("table", self.att).findAll('tr')[1:]

        for row in rows:
            # Drop the newline strings between cells, then unwrap each <td>.
            stats = [cell.contents for cell in row.contents if cell != '\n']

            match = {}  # Dictionary to hold match information

            match['date'] = self.cleaner(stats[0], 1)[0]
            match['time'] = self.cleaner(stats[2], 1)[0]

            stadium = self.cleaner(stats[3][0], 1)[0]
            city = stats[3][2]
            # Unicode template: a byte-string template would try to encode
            # non-ASCII venue names as ASCII and fail under Python 2.
            match['venue'] = u'{}, {}'.format(stadium, city)

            teams = stats[1][0].split(' vs. ')
            match['team1'] = teams[0]
            match['team2'] = teams[1]

            channels = stats[4][0].strip()
            # A lone non-breaking space means no broadcaster is listed.
            if channels != '&nbsp;':
                for count, station in enumerate(channels.split(', ')):
                    match['tv{}'.format(count)] = station

            # The remaining cell ("Info Center") is not stored.
            print(match)

            # Add match dictionary to schedule list
            schedule.append(match)

        return schedule

    def cleaner(self, html, repeat):
        """Recursively remove tags for a certain count, returning a list.

        Args:
            html: Markup to unwrap; it is converted with ``unicode`` and
                re-parsed with BeautifulSoup.
            repeat: How many tags, in document order, to step through.

        Returns:
            The ``contents`` list of the ``repeat``-th tag found (or of the
            last tag when there are fewer). When nothing is unwrapped -- the
            markup has no tags or ``repeat`` is 0 -- the parsed top-level
            nodes are returned.
        """
        soup = BeautifulSoup(unicode(html))

        # Fall back to the top-level nodes so tag-free input yields a list
        # instead of an unbound local.
        content = soup.contents
        for count, element in enumerate(soup.findAll(True)):
            if count == repeat:
                break
            content = element.contents

        return content
예제 #5
0
파일: mls.py 프로젝트: pamolloy/USSA
 def __init__(self):
     """Store a GetSchedule instance, the 2011 MLS schedule URL and a class name."""
     # NOTE(review): GetSchedule is defined elsewhere; presumably it is the
     # page-loading helper -- confirm against its definition.
     self.schedule = GetSchedule()
     # The query string requests all months of the 2011 season.
     self.url = "http://www.mlssoccer.com/schedule?month=all&year=2011"
     # Presumably the CSS class of the element wrapping the schedule --
     # verify against the code that reads self.att.
     self.att = "schedule-page"
예제 #6
0
파일: mls.py 프로젝트: pamolloy/USSA
class MLSSoccer(object):
    """Download schedule from official MLS website and process information.

    ``crawl`` returns one dictionary per match with the keys ``date``,
    ``venue``, ``team1`` and ``team2``; upcoming matches also carry ``hour``
    and ``tv0``, ``tv1``, ... while played matches carry ``goals1``/``goals2``
    and (after a shoot-out) ``pens1``/``pens2``.
    """

    def __init__(self):
        """Create the page loader and remember where the schedule lives."""
        self.schedule = GetSchedule()
        self.url = "http://www.mlssoccer.com/schedule?month=all&year=2011"
        # CSS class of the <div> that wraps all schedule tables.
        self.att = "schedule-page"

    def crawl(self):
        """Crawl the page with BeautifulSoup for applicable information.

        Returns:
            A list of match dictionaries in the order they appear on the page.
        """
        schedule = []

        soup = self.schedule.load_page(self.url)
        print('Locating scheduling section: {}'.format(self.att))
        section = soup.find("div", {"class": self.att})

        for table in section.findAll("table"):
            # Every table is preceded by an <h3> holding its date.
            date = self.date(table)

            for row in table.find('tbody').findAll('tr'):
                # NOT easily processed information
                match = {'date': date}
                match.update(self.details(row))
                match.update(self.score(row))

                # Easily processed information
                match['venue'] = self.generic(row, "views-field venue")
                match['team1'] = self.generic(row, "views-field home-team")
                match['team2'] = self.generic(row, "views-field away-team")

                # Add match dictionary to schedule list
                schedule.append(match)

        return schedule

    def date(self, section):
        """Find the date of each match based on the last preceding <h3> tag.

        Args:
            section: The schedule <table> element.

        Returns:
            The first child node of the nearest preceding <h3> sibling.
        """
        heading = section.findPreviousSibling("h3")
        heading = BeautifulSoup(unicode(heading))

        return heading.h3.contents[0]  # Remove tags

    def details(self, section):
        """Process the kick-off time and channels from the details section.

        Args:
            section: A match row (<tr>).

        Returns:
            An empty dictionary for finished matches (cell starts with
            "Final"); otherwise ``hour`` plus one ``tvN`` entry for every
            <strong> element in the cell.
        """
        match = {}

        html = section.find("td", {"class": "views-field start-time"})
        details = html.contents

        # If the game has passed, ignore "Final"
        if details[0] == u'Final':
            return match

        match['hour'] = details[0]
        for count, station in enumerate(html.findAll('strong')):
            station = BeautifulSoup('{}'.format(station))
            match['tv{}'.format(count)] = station.strong.contents[0]

        return match

    def score(self, section):
        """Process the score into number of (penalty) goals for each team.

        Args:
            section: A match row (<tr>).

        Returns:
            An empty dictionary for upcoming matches or unrecognised text;
            ``goals1``/``goals2`` for a plain result such as ``'2 - 1'``; and
            additionally ``pens1``/``pens2`` for a shoot-out result such as
            ``'1 (4) - 1 (3)'``.
        """
        match = {}
        score = section.find("td", {"class": "views-field score"}).contents
        # Ignore score for upcoming games, which return empty list
        if not score:
            return match

        score = score[0]  # Select first string from list
        # Extract whole numbers instead of reading fixed character offsets so
        # that multi-digit goals or penalties are parsed correctly.
        numbers = [int(number) for number in re.findall(r'\d+', score)]

        if re.search(r'[()]', score) and len(numbers) >= 4:
            # Shoot-out: each team's penalties follow its goals in brackets.
            (match['goals1'], match['pens1'],
             match['goals2'], match['pens2']) = numbers[:4]
        elif len(numbers) >= 2:
            match['goals1'], match['goals2'] = numbers[:2]

        return match

    def generic(self, section, attribute):
        """Return the first child of the row's <td> with the given class.

        Args:
            section: A match row (<tr>).
            attribute: Value of the cell's ``class`` attribute, e.g.
                ``"views-field venue"``.

        Returns:
            The first node inside the matching cell.
        """
        return section.find("td", {"class": attribute}).contents[0]