def _get_fighter_name_and_link(self, ) -> Dict[str, List[str]]:
    """Scrape every fighter's full name and profile link.

    Walks each alphabetical listing page in ``self.fighter_group_urls``.
    On those pages the matching anchors come in groups of three — the
    first two carry name parts that are joined with a space, and the
    third carries the profile ``href``.

    Returns:
        Mapping of fighter name -> profile link.
    """
    name_to_link = {}
    pending_name = ""
    total = len(self.fighter_group_urls)
    print("Scraping all fighter names and links: ")
    print_progress(0, total, prefix="Progress:", suffix="Complete")
    for page_idx, group_url in enumerate(self.fighter_group_urls):
        page = make_soup(group_url)
        table_body = page.find("tbody")
        anchors = table_body.findAll(
            "a", {"class": "b-link b-link_style_black"}, href=True)
        for pos, anchor in enumerate(anchors):
            if (pos + 1) % 3 == 0:
                # Third anchor of each triple holds the profile link;
                # store the accumulated name and reset for the next one.
                name_to_link[pending_name] = anchor["href"]
                pending_name = ""
            elif pending_name:
                pending_name = pending_name + " " + anchor.text
            else:
                pending_name = anchor.text
        print_progress(page_idx + 1, total, prefix="Progress:", suffix="Complete")
    return name_to_link
def _get_total_fight_stats(
        cls, event_and_fight_links: Dict[str, List[str]]) -> str:
    """Scrape stats for every fight of every event, in parallel per event.

    For each event, fetches the event page once for shared event info,
    then scrapes each fight on the card concurrently via
    ``FightDataScraper._get_fight_stats_task``. Empty results (failed
    scrapes) are dropped.

    Args:
        event_and_fight_links: Mapping of event URL -> list of fight URLs.

    Returns:
        Newline-separated semicolon-delimited rows of fight stats. Row
        order within an event follows task completion order (as before).
    """
    all_rows = []
    total = len(event_and_fight_links)
    print(f'Scraping data for {total} fights: ')
    print_progress(0, total, prefix="Progress:", suffix="Complete")
    for index, (event, fights) in enumerate(event_and_fight_links.items()):
        event_soup = make_soup(event)
        event_info = FightDataScraper._get_event_info(event_soup)

        # Get data for each fight in the event in parallel.
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            futures = [
                executor.submit(FightDataScraper._get_fight_stats_task,
                                self=cls,
                                fight=fight,
                                event_info=event_info)
                for fight in fights
            ]
            for future in concurrent.futures.as_completed(futures):
                fighter_stats = future.result()
                if fighter_stats != "":
                    all_rows.append(fighter_stats)
        print_progress(index + 1, total, prefix="Progress:", suffix="Complete")
    # Join once at the end: O(n) instead of the original quadratic
    # repeated string concatenation.
    return "\n".join(all_rows)
def _get_fighter_data_task(self, fighter_name, fighter_url):
    """Fetch one fighter's profile page and return (name, cleaned fields).

    Each scraped list cell is stripped of whitespace and of its field
    label, in the same order the original replace chain applied them.
    """
    # Labels removed from each cell after whitespace is stripped.
    labels = (
        "Height:", "Weight:", "Reach:", "STANCE:", "DOB:", "SLpM:",
        "Str. Acc.:", "SApM:", "Str. Def:", "TD Avg.:", "TD Acc.:",
        "TD Def.:", "Sub. Avg.:",
    )
    profile_soup = make_soup(fighter_url)
    cells = profile_soup.findAll(
        "li",
        {"class": "b-list__box-list-item b-list__box-list-item_type_block"},
    )
    data = []
    for cell_idx, cell in enumerate(cells):
        if cell_idx == 9:
            # An empty string is scraped at this position; skip it.
            continue
        text = cell.text.replace(" ", "").replace("\n", "")
        for label in labels:
            text = text.replace(label, "")
        data.append(text)
    return fighter_name, data
def get_fight_links(event_links: List[str]) -> Dict[str, List[str]]:
    """Collect, for every event page, the links of all fights on its card.

    Args:
        event_links: URLs of event pages to scrape.

    Returns:
        Mapping of event URL -> list of that event's fight links
        (taken from each clickable table row's ``data-link`` attribute).
    """
    links_by_event = {}
    total = len(event_links)
    print("Scraping event and fight links: ")
    print_progress(0, total, prefix="Progress:", suffix="Complete")
    for idx, event_link in enumerate(event_links):
        event_page = make_soup(event_link)
        rows = event_page.findAll(
            "tr",
            {
                "class": "b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click"
            },
        )
        links_by_event[event_link] = [row.get("data-link") for row in rows]
        print_progress(idx + 1, total, prefix="Progress:", suffix="Complete")
    return links_by_event
def _get_updated_event_links(self) -> Tuple[List[str], List[str]]:
    """Scrape all event links and split out the ones not seen before.

    Compares the freshly scraped links against the pickled baseline at
    ``self.PAST_EVENT_LINKS_PICKLE_PATH`` (when it exists), then
    overwrites that baseline with the current full list.

    Returns:
        Tuple of (new event links, all event links). The "new" list is
        derived from a set difference, so its order is unspecified.
    """
    soup = make_soup(self.all_events_url)
    all_event_links = [
        anchor.get("href")
        for cell in soup.findAll("td", {"class": "b-statistics__table-col"})
        for anchor in cell.findAll("a")
    ]
    if self.PAST_EVENT_LINKS_PICKLE_PATH.exists():
        # Compare against the links recorded last run to isolate new events.
        with open(self.PAST_EVENT_LINKS_PICKLE_PATH.as_posix(),
                  "rb") as pickle_in:
            past_event_links = pickle.load(pickle_in)
        new_event_links = list(set(all_event_links) - set(past_event_links))
    else:
        # No baseline recorded yet, so nothing counts as "new".
        new_event_links = []
    # Persist the full list as the baseline for the next run.
    with open(self.PAST_EVENT_LINKS_PICKLE_PATH.as_posix(), "wb") as f:
        pickle.dump(all_event_links, f)
    return new_event_links, all_event_links
def _get_fighter_name_and_details(
        self, fighter_name_and_link: Dict[str, List[str]]) -> None:
    """Scrape detail stats for every fighter and pickle the result.

    For each (name, url) pair, fetches the profile page, strips
    whitespace and field labels from each stat cell, drops fighters
    whose field count doesn't match ``self.HEADER``, and — when any
    usable data remains — sets ``self.new_fighters_exists`` and dumps
    the dict to ``self.SCRAPED_FIGHTER_DATA_DICT_PICKLE_PATH``.

    Args:
        fighter_name_and_link: Mapping of fighter name -> profile URL.
    """
    # Labels removed from each cell after whitespace is stripped,
    # applied in the same order as the original replace chain.
    labels = (
        "Height:", "Weight:", "Reach:", "STANCE:", "DOB:", "SLpM:",
        "Str. Acc.:", "SApM:", "Str. Def:", "TD Avg.:", "TD Acc.:",
        "TD Def.:", "Sub. Avg.:",
    )
    fighter_name_and_details = {}
    total = len(fighter_name_and_link)
    print("Scraping all fighter data: ")
    print_progress(0, total, prefix="Progress:", suffix="Complete")
    for index, (fighter_name,
                fighter_url) in enumerate(fighter_name_and_link.items()):
        another_soup = make_soup(fighter_url)
        divs = another_soup.findAll(
            "li",
            {"class": "b-list__box-list-item b-list__box-list-item_type_block"},
        )
        data = []
        for i, div in enumerate(divs):
            if i == 9:
                # An empty string is scraped here, let's not append that
                continue
            text = div.text.replace(" ", "").replace("\n", "")
            for label in labels:
                text = text.replace(label, "")
            data.append(text)
        fighter_name_and_details[fighter_name] = data
        print_progress(index + 1, total, prefix="Progress:", suffix="Complete")
    # Drop fighters whose page didn't yield a full set of fields.
    fighters_with_no_data = [
        name for name, details in fighter_name_and_details.items()
        if len(details) != len(self.HEADER)
    ]
    # Plain loop (not a side-effect list comprehension) for the removal.
    for name in fighters_with_no_data:
        fighter_name_and_details.pop(name)
    if not fighter_name_and_details:
        print("No new fighter data to scrape at the moment!")
        return
    self.new_fighters_exists = True
    # dump fighter_name_and_details as scraped_fighter_data_dict
    with open(self.SCRAPED_FIGHTER_DATA_DICT_PICKLE_PATH.as_posix(),
              "wb") as f:
        pickle.dump(fighter_name_and_details, f)
def _get_total_fight_stats(cls, event_and_fight_links: Dict[str, List[str]]) -> str:
    """Sequentially scrape stats for every fight of every event.

    For each event, fetches the event page once for shared event info,
    then scrapes each fight page in turn.

    Args:
        event_and_fight_links: Mapping of event URL -> list of fight URLs.

    Returns:
        Newline-separated rows of "stats;details;event_info;result".
        Fights whose pages fail to scrape are skipped (best effort).
    """
    rows = []
    total = len(event_and_fight_links)
    print("Scraping all fight data: ")
    print_progress(0, total, prefix="Progress:", suffix="Complete")
    for index, (event, fights) in enumerate(event_and_fight_links.items()):
        event_soup = make_soup(event)
        event_info = FightDataScraper._get_event_info(event_soup)

        for fight in fights:
            try:
                fight_soup = make_soup(fight)
                fight_stats = FightDataScraper._get_fight_stats(fight_soup)
                fight_details = FightDataScraper._get_fight_details(fight_soup)
                result_data = FightDataScraper._get_fight_result_data(fight_soup)
            except Exception:
                # Best effort: one broken fight page shouldn't abort the
                # whole run, so skip it and move on.
                continue
            rows.append(";".join(
                (fight_stats, fight_details, event_info, result_data)))
        print_progress(index + 1, total, prefix="Progress:", suffix="Complete")
    # Join once at the end: O(n) instead of the original quadratic
    # repeated string concatenation.
    return "\n".join(rows)
def _get_fight_stats_task(self, fight, event_info):
    """Scrape a single fight page and assemble its stats row.

    Designed to run inside a thread pool: any scraping failure is
    swallowed so one bad page cannot abort the batch.

    Args:
        fight: URL of the fight page to scrape.
        event_info: Pre-scraped event info string shared by all fights
            of the event.

    Returns:
        "stats;details;event_info;result" on success, or "" on any
        failure — callers treat the empty string as "skip this fight".
    """
    total_fight_stats = ""
    try:
        fight_soup = make_soup(fight)
        fight_stats = FightDataScraper._get_fight_stats(fight_soup)
        fight_details = FightDataScraper._get_fight_details(fight_soup)
        result_data = FightDataScraper._get_fight_result_data(fight_soup)
        total_fight_stats = ";".join(
            (fight_stats, fight_details, event_info, result_data))
    except Exception:
        # Deliberate best-effort: return "" rather than propagate, so the
        # threaded batch in the caller keeps going.
        pass
    return total_fight_stats