class MatchDetailsScraper:
    """Scrapes the home and away lineups from a single match-detail page."""

    def __init__(self, url):
        self.url = url
        self.logger = Logger(2)

    def extract_lineups(self):
        """Download the match page and return ``{'home': [...], 'away': [...]}``.

        Both values are empty strings when the lineup section is missing
        or incomplete (the error is logged, not raised).
        """
        sender = Sender()
        sender.set_debug_level(2)
        response = sender.get(self.url, {})
        self.logger.debug('Processing lineups of ' + self.url)
        html = BeautifulSoup(response, 'html.parser')
        result = {'home': '', 'away': ''}
        teams_container_main = html.find('section', {'class': 'laliga-fantasy columna2'})
        # fixed: "not x is None" -> "x is not None"
        if teams_container_main is not None:
            teams_container = teams_container_main.find_all('section')[:2]
            # guard: the original indexed [0]/[1] unconditionally and crashed
            # with IndexError on pages with a truncated lineup section
            if len(teams_container) >= 2:
                result = {
                    'home': self._extract_players(teams_container[0]),
                    'away': self._extract_players(teams_container[1]),
                }
            else:
                self.logger.error(500, 'No lineup found in ' + self.url)
        else:
            self.logger.error(500, 'No lineup found in ' + self.url)
        return result

    def _extract_players(self, team_container):
        """Return the stripped player names found in *team_container*'s <li> items."""
        result = [player.getText().strip() for player in team_container.find_all('li')]
        # A starting lineup should always have exactly 11 players.
        if len(result) != 11:
            self.logger.error(100, 'Team with ' + str(len(result)) + ' players')
        return result
class SeasonScraper:
    """Scrapes all match results of a historical season from laliga.es
    and persists them through a mongo writer."""

    def __init__(self):
        self.sender = scrape_request.Sender()
        self.sender.set_delay(2)
        self.sender.set_debug_level(2)
        self.logger = Logger(2)

    def scrape_page(self, season):
        """Scrape every match day of *season* (format ``'league/years'``,
        e.g. ``'primera/2016-17'``) and write the collected results.

        Also triggers a PopUpScraper for each match's detail popup.
        """
        league = season.split('/')[0]
        page_content = self.do_request(season)
        writer = utils.create_mongo_writer(league)
        if not page_content:
            return
        html = BeautifulSoup(page_content, 'html.parser')
        matches_results = []
        for day in html.find_all('div', {'class': 'jornada-calendario-historico'}):
            table_title = day.find('div', {'class': 'nombre_jornada'})
            day_str = self.extract_day(table_title.contents[0])
            day_num = self.extract_daynum(table_title.contents[0])
            tables = day.find_all(
                'table', {'class': 'tabla_jornada_calendario_historico'})
            for table in tables:
                rows = table.find_all(
                    'tr', {'onclick': re.compile(r'^abrir_partido')})
                for row in rows:
                    js_params = self.extract_popup_win_js_params(row['onclick'])
                    # fixed: "!= False" comparison; extract_popup_win_js_params
                    # returns False on a malformed onclick attribute
                    if js_params is False:
                        continue
                    match_id = '_'.join([js_params['temporada'],
                                         js_params['jornada'],
                                         js_params['equipo'],
                                         js_params['competicion']])
                    cell = row.find('td')
                    txt = self.extract_result(str(cell.contents[0]))
                    matches_results.append({
                        'season': season,
                        'day': day_str,
                        'day_num': day_num,
                        'home': txt.group(1).strip(),
                        'away': txt.group(3).strip(),
                        'score_home': txt.group(2),
                        'score_away': txt.group(4),
                        'match_id': match_id
                    })
                    popup_scraper = PopUpScraper.PopUpScraper(match_id, writer)
                    popup_scraper.scrape_popup(js_params)
        writer.write_dictionaries_list('results', matches_results)

    def do_request(self, path):
        """GET the calendar page for *path* and return the raw body."""
        url = 'http://www.laliga.es/estadisticas-historicas/calendario/' + path + '/'
        return self.sender.get(url, {})

    def extract_result(self, content_to_process):
        """Parse one result cell.

        :param content_to_process: markup such as
            ``'<span>RCD Mallorca: <b>1</b><br/>Real Madrid: <b>2</b></span>'``
        :return: an ``re.Match`` with 4 groups (home name, home goals,
            away name, away goals), or ``None`` if the cell does not match.
        """
        # Colons are stripped first so team names parse cleanly.
        content_to_process = content_to_process.replace(":", "")
        cell_pattern_str = r'<span>(.+?)<b>(.+?)</b><br/>(.+?)<b>(.+?)</b></span>'
        return re.search(cell_pattern_str, content_to_process)

    def extract_daynum(self, content_to_process):
        """Extract the match-day number.

        :param content_to_process: text such as ``'Jornada: 02 - 26/08/2016'``
        :return: the day number as a string, e.g. ``'02'``
            (the original docstring wrongly said it returned a date).
        """
        parsed = re.search(r'(\d+)', content_to_process)
        return parsed.group(1).strip()

    def extract_day(self, content_to_process):
        """Extract the match date as ``dd-mm-yyyy`` from text such as
        ``'Jornada: 02 - 26/08/2016'``."""
        parsed = re.search(r'(\d+/\d+/\d+)', content_to_process)
        return parsed.group(1).replace("/", "-").strip()

    def extract_popup_win_js_params(self, function_call_str):
        """Extract the arguments of the JS call that opens a match popup.

        e.g. ``'abrir_partido(115,37,"barcelona",1)'`` yields
        ``{'temporada': '115', 'jornada': '37', 'equipo': 'barcelona',
        'competicion': '1'}``.

        :return: the parameter dict, or ``False`` when the string does not
            match (the error is logged).
        """
        pattern = r'abrir_partido\((.+?),(.+?),"(.+?)",(.+?)\)'
        parsed = re.search(pattern, function_call_str)
        if parsed is None:
            self.logger.error(
                400, 'Error in extract_popup_win_js_params ' + function_call_str)
            return False
        return {
            'temporada': parsed.group(1),
            'jornada': parsed.group(2),
            'equipo': parsed.group(3),
            'competicion': parsed.group(4)
        }
class ResultsCurrentYearScraper:
    """Scrapes the current-season calendar (results plus lineups) from
    marca.com and stores one document per match day in mongo."""

    def __init__(self):
        self.sender = Sender()
        self.logger = Logger(2)
        self.url = 'http://www.marca.com/futbol/primera-division/calendario.html'
        self.raw_content = ''
        self.writer = PrefixedMongoWrapper('marca')
        self.collection_name = 'current_season_results'

    def _getPage(self):
        """Download the calendar page into ``self.raw_content``."""
        self.raw_content = self.sender.get(self.url, {})
        if self.raw_content == '':
            self.logger.error(500, 'Empty page')
            # NOTE(review): exit() in library code kills the whole process;
            # consider raising an exception instead — kept for compatibility.
            exit()

    def scrape(self):
        """Entry point: download the page, drop the old collection and
        write one document per match day."""
        self.logger.debug('Downloading marca web data')
        self._getPage()
        self.writer.drop_collection(self.collection_name)
        html = BeautifulSoup(self.raw_content, 'html.parser')
        for day_table in html.find_all('li', {'id': 'contenedorCalendarioInt'}):
            day_info = self.extract_day(day_table)
            self.logger.debug('Processing "' + day_info['num_day'] + ', '
                              + day_info['date'] + '"')
            results = self.process_results(day_table)
            dictionary_to_insert = {'day': day_info, 'results': results}
            self.writer.write_dictionary(self.collection_name, dictionary_to_insert)
        self.logger.debug('Done')

    def extract_day(self, day_table):
        """Return ``{'num_day': ..., 'date': ...}`` parsed from a day header."""
        header = day_table.find('span')
        num_day = header.find('h2').getText()
        date = header.contents[2].strip()
        return {'num_day': num_day, 'date': date}

    def process_results(self, day_table):
        """Extract the result rows of one match day.

        Each result dict gets ``home``/``away``/``result`` keys and, when the
        row links a match page, ``home_lineup``/``away_lineup`` as well.
        """
        results = []
        colmap = ('home', 'away', 'result')
        for row in day_table.find('ul', {'class': 'partidos-jornada'}).find_all('a'):
            result = {}
            # Cells come in repeating (home, away, result) triples;
            # enumerate replaces the original manual counter.
            for i, cell in enumerate(row.find_all('span')):
                result[colmap[i % 3]] = cell.getText()
            if row.has_attr('href'):
                lineups = self._get_lineups(row['href'])
                result['home_lineup'] = lineups['home']
                result['away_lineup'] = lineups['away']
            results.append(result)
        self.logger.debug('Inserted ' + str(len(results)) + ' items')
        return results

    def _get_lineups(self, url):
        """Delegate lineup extraction for *url* to MatchDetailsScraper."""
        scraper = MatchDetailsScraper(url)
        return scraper.extract_lineups()
class PlayerNormalizer:
    """Maps player names coming from different sources (e.g. marca) onto a
    canonical 'master' name list, persisted as a CSV mapping file."""

    def __init__(self):
        self.logger = Logger(2)
        self.default_csv_filename = './players_mapping.csv'
        self.loaded = False

    def find_player_id(self, source, player):
        """Return the master index mapped to *player* in column *source*.

        Returns ``''`` (and logs) when no mapping exists; logs when more
        than one candidate matches but still returns the first one.
        """
        self._init_data()
        results_indexes = self.data['master'].index[self.data[source] == player]
        if len(results_indexes) > 1:
            self.logger.error(
                300, 'More than a candidate (' + str(len(results_indexes)) + '): ' + player)
        # Original used "for result in ...: return result" — i.e. first hit.
        if len(results_indexes) > 0:
            return results_indexes[0]
        self.logger.error(100, 'Cannot find map for ' + source + ': ' + player)
        return ''

    def _init_data(self):
        """Lazily load the mapping CSV, creating it first if missing."""
        if not self.loaded:
            self.logger.debug('Loading map file')
            if not os.path.isfile(self.default_csv_filename):
                self.init_map_file()
            self.data = pd.read_csv(self.default_csv_filename)
            self.loaded = True

    def init_map_file(self):
        """Build the initial master list from mongo and save it as CSV."""
        self.logger.debug('Generating master')
        mongo_wrapper = PrefixedMongoWrapper('laliga_web')
        # Data taken from the "squads" ('plantillas') section; the 1928-29
        # season is missing there, so we merge in the primera and segunda
        # match-stats collections as well.
        result = mongo_wrapper.get_collection('players').distinct('player')
        result += mongo_wrapper.get_collection(
            'primera_popups_matches_stats').distinct('player')
        result += mongo_wrapper.get_collection(
            'segunda_popups_matches_stats').distinct('player')
        self.logger.debug('Done')
        data = {'master': list(set(result))}
        self.save_csv(data)

    def _get_marca_list(self):
        """Return the deduplicated list of player names seen in marca lineups."""
        result = []
        mongo_wrapper = PrefixedMongoWrapper('marca_current_season')
        query = {"results.home_lineup": {"$exists": True}}
        for day in mongo_wrapper.get_collection('results').find(query):
            for match in day['results']:
                result += match['home_lineup']
                result += match['away_lineup']
        return list(set(result))

    def normalize(self):
        """Entry point: match marca player names against the master list."""
        self._init_data()
        self.logger.debug('Normalizing data...')
        return self._normalize_one('marca', self._get_marca_list())

    def save_csv(self, result):
        """Persist *result* (dict of equal-length lists) to the mapping CSV.

        The index is shifted to start at 1 so row ids match the original file.
        """
        self.logger.debug('Creating ' + self.default_csv_filename)
        repo = pd.DataFrame(result)
        repo.index += 1
        repo.to_csv(self.default_csv_filename)

    def get_valid_players(self):
        """Return the deduplicated set of players eligible for matching."""
        mongo_wrapper = PrefixedMongoWrapper('laliga_web')
        result = mongo_wrapper.get_collection(
            'primera_popups_matches_stats').distinct('player')
        result += mongo_wrapper.get_collection('players').find({
            'season': 'primera/2016-17'
        }).distinct('player')
        result += mongo_wrapper.get_collection('players').find({
            'season': 'segunda/2016-17'
        }).distinct('player')
        return list(set(result))

    def _normalize_one(self, source, players):
        """Fuzzy-match *players* from *source* against the master list.

        A master player is matched to the most similar unmatched candidate
        when similarity > 0.95 and the runner-up stayed below 0.60.
        Returns ``{'master': [...], source: [...]}`` with parallel lists.
        """
        result = {
            'master': [],
            source: [],
        }
        num_matched = 0
        valid_players = self.get_valid_players()
        already_got = []
        for master_player in self.data['master']:
            best_similarity = 0
            second_best_similarity = 0
            matched = ''
            if master_player in valid_players:
                for player in players:
                    matcher = SequenceMatcher(
                        None,
                        self.preprocess_name(master_player),
                        self.preprocess_name(player))
                    similarity = matcher.ratio()
                    if (similarity > best_similarity) and \
                            (similarity > 0.95) and \
                            (second_best_similarity < 0.60) and \
                            (player not in already_got):
                        second_best_similarity = best_similarity
                        best_similarity = similarity
                        matched = player
            if matched:
                self.logger.debug('Matched "' + matched + '" with "'
                                  + master_player + '" ' + str(best_similarity))
                already_got.append(matched)
                num_matched += 1
            result['master'].append(master_player)
            result[source].append(matched)
        self.logger.debug(
            str(len(players)) + ' players, ' + str(num_matched) + ' matched')
        return result

    def preprocess_name(self, name):
        """Lowercase *name* and drop commas before fuzzy comparison."""
        return name.lower().replace(',', '')