def _get_fighter_name_and_link(self, ) -> Dict[str, List[str]]:
    """Scrape every fighter's full name and profile link.

    Walks each alphabetical listing page in ``self.fighter_group_urls``.
    On those pages the matching anchors come in groups of three — the
    first two carry name parts that are joined with a space, and the
    third carries the profile ``href``.

    Returns:
        Mapping of fighter name -> profile link.
    """
    name_to_link = {}
    pending_name = ""
    total = len(self.fighter_group_urls)
    print("Scraping all fighter names and links: ")
    print_progress(0, total, prefix="Progress:", suffix="Complete")
    for page_idx, group_url in enumerate(self.fighter_group_urls):
        page = make_soup(group_url)
        table_body = page.find("tbody")
        anchors = table_body.findAll(
            "a", {"class": "b-link b-link_style_black"}, href=True)
        for pos, anchor in enumerate(anchors):
            if (pos + 1) % 3 == 0:
                # Third anchor of each triple holds the profile link;
                # store the accumulated name and reset for the next one.
                name_to_link[pending_name] = anchor["href"]
                pending_name = ""
            elif pending_name:
                pending_name = pending_name + " " + anchor.text
            else:
                pending_name = anchor.text
        print_progress(page_idx + 1, total, prefix="Progress:", suffix="Complete")
    return name_to_link
def _get_total_fight_stats(
        cls, event_and_fight_links: Dict[str, List[str]]) -> str:
    """Scrape stats for every fight of every event, in parallel per event.

    For each event, fetches the event page once for shared event info,
    then scrapes each fight on the card concurrently via
    ``FightDataScraper._get_fight_stats_task``. Empty results (failed
    scrapes) are dropped.

    Args:
        event_and_fight_links: Mapping of event URL -> list of fight URLs.

    Returns:
        Newline-separated semicolon-delimited rows of fight stats. Row
        order within an event follows task completion order (as before).
    """
    all_rows = []
    total = len(event_and_fight_links)
    print(f'Scraping data for {total} fights: ')
    print_progress(0, total, prefix="Progress:", suffix="Complete")
    for index, (event, fights) in enumerate(event_and_fight_links.items()):
        event_soup = make_soup(event)
        event_info = FightDataScraper._get_event_info(event_soup)

        # Get data for each fight in the event in parallel.
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            futures = [
                executor.submit(FightDataScraper._get_fight_stats_task,
                                self=cls,
                                fight=fight,
                                event_info=event_info)
                for fight in fights
            ]
            for future in concurrent.futures.as_completed(futures):
                fighter_stats = future.result()
                if fighter_stats != "":
                    all_rows.append(fighter_stats)
        print_progress(index + 1, total, prefix="Progress:", suffix="Complete")
    # Join once at the end: O(n) instead of the original quadratic
    # repeated string concatenation.
    return "\n".join(all_rows)
def _get_fighter_data_task(self, fighter_name, fighter_url):
    """Fetch one fighter's profile page and return (name, cleaned fields).

    Each scraped list cell is stripped of whitespace and of its field
    label, in the same order the original replace chain applied them.
    """
    # Labels removed from each cell after whitespace is stripped.
    labels = (
        "Height:", "Weight:", "Reach:", "STANCE:", "DOB:", "SLpM:",
        "Str. Acc.:", "SApM:", "Str. Def:", "TD Avg.:", "TD Acc.:",
        "TD Def.:", "Sub. Avg.:",
    )
    profile_soup = make_soup(fighter_url)
    cells = profile_soup.findAll(
        "li",
        {"class": "b-list__box-list-item b-list__box-list-item_type_block"},
    )
    data = []
    for cell_idx, cell in enumerate(cells):
        if cell_idx == 9:
            # An empty string is scraped at this position; skip it.
            continue
        text = cell.text.replace(" ", "").replace("\n", "")
        for label in labels:
            text = text.replace(label, "")
        data.append(text)
    return fighter_name, data
def get_fight_links(event_links: List[str]) -> Dict[str, List[str]]:
    """Collect, for every event page, the links of all fights on its card.

    Args:
        event_links: URLs of event pages to scrape.

    Returns:
        Mapping of event URL -> list of that event's fight links
        (taken from each clickable table row's ``data-link`` attribute).
    """
    links_by_event = {}
    total = len(event_links)
    print("Scraping event and fight links: ")
    print_progress(0, total, prefix="Progress:", suffix="Complete")
    for idx, event_link in enumerate(event_links):
        event_page = make_soup(event_link)
        rows = event_page.findAll(
            "tr",
            {
                "class": "b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click"
            },
        )
        links_by_event[event_link] = [row.get("data-link") for row in rows]
        print_progress(idx + 1, total, prefix="Progress:", suffix="Complete")
    return links_by_event
def _get_updated_event_links(self) -> Tuple[List[str], List[str]]:
    """Scrape all event links and split out the ones not seen before.

    Compares the freshly scraped links against the pickled baseline at
    ``self.PAST_EVENT_LINKS_PICKLE_PATH`` (when it exists), then
    overwrites that baseline with the current full list.

    Returns:
        Tuple of (new event links, all event links). The "new" list is
        derived from a set difference, so its order is unspecified.
    """
    soup = make_soup(self.all_events_url)
    all_event_links = [
        anchor.get("href")
        for cell in soup.findAll("td", {"class": "b-statistics__table-col"})
        for anchor in cell.findAll("a")
    ]
    if self.PAST_EVENT_LINKS_PICKLE_PATH.exists():
        # Compare against the links recorded last run to isolate new events.
        with open(self.PAST_EVENT_LINKS_PICKLE_PATH.as_posix(),
                  "rb") as pickle_in:
            past_event_links = pickle.load(pickle_in)
        new_event_links = list(set(all_event_links) - set(past_event_links))
    else:
        # No baseline recorded yet, so nothing counts as "new".
        new_event_links = []
    # Persist the full list as the baseline for the next run.
    with open(self.PAST_EVENT_LINKS_PICKLE_PATH.as_posix(), "wb") as f:
        pickle.dump(all_event_links, f)
    return new_event_links, all_event_links
def _get_fighter_name_and_details(
        self, fighter_name_and_link: Dict[str, List[str]]) -> None:
    """Scrape detail stats for every fighter and pickle the result.

    For each (name, url) pair, fetches the profile page, strips
    whitespace and field labels from each stat cell, drops fighters
    whose field count doesn't match ``self.HEADER``, and — when any
    usable data remains — sets ``self.new_fighters_exists`` and dumps
    the dict to ``self.SCRAPED_FIGHTER_DATA_DICT_PICKLE_PATH``.

    Args:
        fighter_name_and_link: Mapping of fighter name -> profile URL.
    """
    # Labels removed from each cell after whitespace is stripped,
    # applied in the same order as the original replace chain.
    labels = (
        "Height:", "Weight:", "Reach:", "STANCE:", "DOB:", "SLpM:",
        "Str. Acc.:", "SApM:", "Str. Def:", "TD Avg.:", "TD Acc.:",
        "TD Def.:", "Sub. Avg.:",
    )
    fighter_name_and_details = {}
    total = len(fighter_name_and_link)
    print("Scraping all fighter data: ")
    print_progress(0, total, prefix="Progress:", suffix="Complete")
    for index, (fighter_name,
                fighter_url) in enumerate(fighter_name_and_link.items()):
        another_soup = make_soup(fighter_url)
        divs = another_soup.findAll(
            "li",
            {"class": "b-list__box-list-item b-list__box-list-item_type_block"},
        )
        data = []
        for i, div in enumerate(divs):
            if i == 9:
                # An empty string is scraped here, let's not append that
                continue
            text = div.text.replace(" ", "").replace("\n", "")
            for label in labels:
                text = text.replace(label, "")
            data.append(text)
        fighter_name_and_details[fighter_name] = data
        print_progress(index + 1, total, prefix="Progress:", suffix="Complete")
    # Drop fighters whose page didn't yield a full set of fields.
    fighters_with_no_data = [
        name for name, details in fighter_name_and_details.items()
        if len(details) != len(self.HEADER)
    ]
    # Plain loop (not a side-effect list comprehension) for the removal.
    for name in fighters_with_no_data:
        fighter_name_and_details.pop(name)
    if not fighter_name_and_details:
        print("No new fighter data to scrape at the moment!")
        return
    self.new_fighters_exists = True
    # dump fighter_name_and_details as scraped_fighter_data_dict
    with open(self.SCRAPED_FIGHTER_DATA_DICT_PICKLE_PATH.as_posix(),
              "wb") as f:
        pickle.dump(fighter_name_and_details, f)
def _get_total_fight_stats(cls, event_and_fight_links: Dict[str, List[str]]) -> str:
    """Sequentially scrape stats for every fight of every event.

    For each event, fetches the event page once for shared event info,
    then scrapes each fight page in turn.

    Args:
        event_and_fight_links: Mapping of event URL -> list of fight URLs.

    Returns:
        Newline-separated rows of "stats;details;event_info;result".
        Fights whose pages fail to scrape are skipped (best effort).
    """
    rows = []
    total = len(event_and_fight_links)
    print("Scraping all fight data: ")
    print_progress(0, total, prefix="Progress:", suffix="Complete")
    for index, (event, fights) in enumerate(event_and_fight_links.items()):
        event_soup = make_soup(event)
        event_info = FightDataScraper._get_event_info(event_soup)

        for fight in fights:
            try:
                fight_soup = make_soup(fight)
                fight_stats = FightDataScraper._get_fight_stats(fight_soup)
                fight_details = FightDataScraper._get_fight_details(fight_soup)
                result_data = FightDataScraper._get_fight_result_data(fight_soup)
            except Exception:
                # Best effort: one broken fight page shouldn't abort the
                # whole run, so skip it and move on.
                continue
            rows.append(";".join(
                (fight_stats, fight_details, event_info, result_data)))
        print_progress(index + 1, total, prefix="Progress:", suffix="Complete")
    # Join once at the end: O(n) instead of the original quadratic
    # repeated string concatenation.
    return "\n".join(rows)
def _get_fight_stats_task(self, fight, event_info):
    """Scrape a single fight page and assemble its stats row.

    Designed to run inside a thread pool: any scraping failure is
    swallowed so one bad page cannot abort the batch.

    Args:
        fight: URL of the fight page to scrape.
        event_info: Pre-scraped event info string shared by all fights
            of the event.

    Returns:
        "stats;details;event_info;result" on success, or "" on any
        failure — callers treat the empty string as "skip this fight".
    """
    total_fight_stats = ""
    try:
        fight_soup = make_soup(fight)
        fight_stats = FightDataScraper._get_fight_stats(fight_soup)
        fight_details = FightDataScraper._get_fight_details(fight_soup)
        result_data = FightDataScraper._get_fight_result_data(fight_soup)
        total_fight_stats = ";".join(
            (fight_stats, fight_details, event_info, result_data))
    except Exception:
        # Deliberate best-effort: return "" rather than propagate, so the
        # threaded batch in the caller keeps going.
        pass
    return total_fight_stats