예제 #1
0
    def scrape_data(self):
        """Scrape positions, height, weight, and game-log URLs from the
        player's overview page on basketball-reference.com.

        Populates ``self.overview_url_content``, ``self.positions``,
        ``self.height``, ``self.weight`` and appends to
        ``self.gamelog_url_list``.

        Raises:
            Exception: if this player's page was already scraped
                (``overview_url_content`` is not None).
        """
        # Ported to Python 3: print() function, no bytes .encode() before
        # str.split(), and no reliance on filter() returning a list.
        print(self.name, self.overview_url)
        if self.overview_url_content is not None:
            raise Exception("Can't populate this!")

        overview_soup = getSoupFromURL(self.overview_url)
        self.overview_url_content = overview_soup.text

        try:
            # The "Position:" paragraph holds the positions, joined by " and ".
            pos = next(p.text for p in overview_soup.findAll('p')
                       if 'Position:' in p.text).strip().replace('\n', '')
            self.positions = re.findall(self.POSN_PATTERN, pos)[0].strip().split(" and ")
            self.height = overview_soup.find('span', {'itemprop': 'height'}).text
            # [:-2] drops the trailing unit suffix from the weight string.
            self.weight = overview_soup.find('span', {'itemprop': 'weight'}).text[:-2]

        except Exception as ex:
            # BaseException.message was removed in Python 3; log the exception itself.
            logging.error(ex)
            self.positions = []
            self.height = None
            self.weight = None

        # The links to each year's game logs are in <li> tags whose text
        # contains 'Game Logs', so we can use those to pull out our urls.
        for li in overview_soup.find_all('li'):
            if 'Game Logs' not in li.getText():
                continue
            for game_log_link in li.findAll('a'):
                href = game_log_link.get('href')
                if 'gamelog' in href:
                    self.gamelog_url_list.append(
                        'http://www.basketball-reference.com' + href)
예제 #2
0
파일: player.py 프로젝트: zhuyeqing/nnnba
    def scrape_data(self):
        """Scrape positions, height, weight, salaries, age, and game-log URLs
        from the player's overview page on basketball-reference.com.

        Populates ``self.overview_url_content``, ``self.positions``,
        ``self.height``, ``self.weight``, ``self.salaries``, ``self.age``
        and appends to ``self.gamelog_url_list``.

        Raises:
            Exception: if this player's page was already scraped
                (``overview_url_content`` is not None).
        """
        # Ported to Python 3: print() function and no .encode("utf8")
        # (which would produce bytes, not str, on Python 3).
        print(self.name, self.overview_url)
        if self.overview_url_content is not None:
            raise Exception("Can't populate this!")

        overview_soup = getSoupFromURL(self.overview_url)
        self.overview_url_content = overview_soup.text

        try:
            # Grab the first text node matching each biographical pattern.
            player_position_text = overview_soup.findAll(
                text=re.compile(u'(Point Guard|Center|Power Forward|Shooting Guard|Small Forward)'))[0]
            player_height_text = overview_soup.findAll(text=re.compile(self.HEIGHT_PATTERN))[0]
            player_weight_text = overview_soup.findAll(text=re.compile(self.WEIGHT_PATTERN))[0]
            self.height = re.findall(self.HEIGHT_PATTERN, player_height_text)[0].strip()
            self.weight = re.findall(self.WEIGHT_PATTERN, player_weight_text)[0].strip()
            temp_positions = re.findall(self.POSN_PATTERN, player_position_text)
            self.positions = [position.strip() for position in temp_positions]
            self.salaries = self.findSalaries(overview_soup)
            self.age = self.findAge(overview_soup)

        except Exception as ex:
            # BaseException.message was removed in Python 3; log the exception itself.
            logging.error(ex)
            self.positions = []
            self.height = None
            self.weight = None

        # The links to each year's game logs are in <li> tags whose text
        # contains 'Game Logs', so we can use those to pull out our urls.
        for li in overview_soup.find_all('li'):
            if 'Game Logs' not in li.getText():
                continue
            for game_log_link in li.findAll('a'):
                self.gamelog_url_list.append(
                    'http://www.basketball-reference.com' + game_log_link.get('href'))
예제 #3
0
    def scrape_data(self):
        """Scrape positions, height, weight, and game-log URLs from the
        player's overview page on basketball-reference.com.

        Populates ``self.overview_url_content``, ``self.positions``,
        ``self.height``, ``self.weight`` and appends to
        ``self.gamelog_url_list``.

        Raises:
            Exception: if this player's page was already scraped
                (``overview_url_content`` is not None).
        """
        # Ported to Python 3: print() function and no .encode("utf8")
        # (which would produce bytes and break str.split on Python 3).
        print(self.name, self.overview_url)
        if self.overview_url_content is not None:
            raise Exception("Can't populate this!")

        overview_soup = getSoupFromURL(self.overview_url)
        self.overview_url_content = overview_soup.text

        try:
            # The first line of the info paragraph carries position, height
            # and weight; extract each with the class-level patterns.
            player_infotext = overview_soup.findAll(
                'p', attrs={'class': 'padding_bottom_half'})[0].text.split('\n')[0]

            self.positions = re.findall(
                self.POSN_PATTERN, player_infotext)[0].strip().split(" and ")
            self.height = re.findall(self.HEIGHT_PATTERN, player_infotext)[0].strip()
            self.weight = re.findall(self.WEIGHT_PATTERN, player_infotext)[0].strip()

        except Exception as ex:
            # BaseException.message was removed in Python 3; log the exception itself.
            logging.error(ex)
            self.positions = []
            self.height = None
            self.weight = None

        # The links to each year's game logs are in <li> tags whose text
        # contains 'Game Logs', so we can use those to pull out our urls.
        for li in overview_soup.find_all('li'):
            if 'Game Logs' not in li.getText():
                continue
            for game_log_link in li.findAll('a'):
                self.gamelog_url_list.append(
                    'http://www.basketball-reference.com' + game_log_link.get('href'))
예제 #4
0
    def scrape_data(self):
        """Scrape positions, height, weight, nicknames, teams, and game-log
        URLs from the player's overview page on basketball-reference.com.

        Populates ``self.overview_url_content``, ``self.positions``,
        ``self.nicknames``, ``self.height``, ``self.weight`` and fills
        ``self.gamelog_url_list`` / ``self.gamelog_url_dict``.

        Raises:
            Exception: if this player's page was already scraped
                (``overview_url_content`` is not None).
        """
        print(self.name, self.overview_url)
        if self.overview_url_content is not None:
            raise Exception("Can't populate this!")

        overview_soup = getSoupFromURL(self.overview_url)
        type_name = type(overview_soup).__name__

        # BUG FIX: type(x).__name__ is the bare class name ('BeautifulSoup'),
        # never the module-qualified 'bs4.BeautifulSoup' — the old comparison
        # could not match, so overview_url_content was never populated and the
        # already-scraped guard above never fired. Also dropped the leftover
        # debug print of type(overview_soup).
        if type_name == 'BeautifulSoup':
            self.overview_url_content = overview_soup.get_text()

        try:
            # POSN/HEIGHT/WEIGHT_PATTERN are precompiled patterns; find_all
            # with a pattern matches text nodes directly.
            player_position_text = overview_soup.find_all(
                text=self.POSN_PATTERN)[0]
            player_height_text = overview_soup.find_all(
                text=self.HEIGHT_PATTERN)[0]
            player_weight_text = overview_soup.find_all(
                text=self.WEIGHT_PATTERN)[0]
            self.height = self.HEIGHT_PATTERN.findall(
                player_height_text)[0].strip()
            self.weight = self.WEIGHT_PATTERN.findall(
                player_weight_text)[0].strip()
            tempPositions = self.POSN_PATTERN.findall(player_position_text)
            self.positions = [position.strip() for position in tempPositions]
            self.scrape_player_nicknames(overview_soup)
            self.scrape_teams(overview_soup)

        except Exception as ex:
            logging.error(ex)
            self.positions = []
            self.nicknames = []
            self.height = None
            self.weight = None

        # The links to each year's game logs are in <li> tags, and the text
        # contains 'Game Logs' so we can use those to pull out our urls.
        link_prefix = "https://www.basketball-reference.com"
        for li in overview_soup.find_all('li'):
            if 'Game Logs' in li.getText():
                all_links = li.findAll('a')
                for link in all_links:
                    link_suffix = link.get('href')
                    if "/gamelog/" in link_suffix:
                        full_link = link_prefix + link_suffix
                        season = link.get_text().strip()
                        self.gamelog_url_list.append(full_link)
                        self.gamelog_url_dict[season] = full_link
                # Stop after the first <li> that yielded game-log links; the
                # page repeats the same menu elsewhere.
                if len(self.gamelog_url_list) > 0:
                    break
예제 #5
0
    def scrape_data(self):
        """Scrape team history from the player's overview page.

        Stores the raw page text in ``self.overview_url_content`` and
        delegates team extraction to ``self.scrape_teams``.

        Raises:
            Exception: if this player's page was already scraped
                (``overview_url_content`` is not None).
        """
        print(self.name, self.overview_url)
        if self.overview_url_content is not None:
            raise Exception("Can't populate this!")

        overview_soup = getSoupFromURL(self.overview_url)
        self.overview_url_content = overview_soup.text

        try:
            self.scrape_teams(overview_soup)

        except Exception as ex:
            # BUG FIX: ex.message does not exist in Python 3 — referencing it
            # inside the handler raised AttributeError. Log the exception itself.
            logging.error(ex)
            self.teams = {}
예제 #6
0
    def scrape_data(self):
        """Scrape location and former names from the player/team overview page.

        Reads the bio block (``div#meta``), keeps only the <p> lines that
        carry a <strong> label, and delegates parsing to
        ``self.scrape_location`` and ``self.scrape_former_names``.

        Raises:
            Exception: if this page was already scraped
                (``overview_url_content`` is not None).
        """
        print(self.name, self.overview_url)
        if self.overview_url_content is not None:
            raise Exception("Can't populate this!")

        overview_soup = getSoupFromURL(self.overview_url)
        self.overview_url_content = overview_soup.get_text()

        try:
            bio_soup = overview_soup.find('div', attrs={"id": "meta"})
            bio_lines = bio_soup.find_all('p')
            # Labelled bio lines (e.g. "Location:") all carry a <strong> tag.
            bio_text_lines = [line for line in bio_lines if line.find("strong") is not None]
            self.scrape_location(bio_text_lines)
            self.scrape_former_names(bio_text_lines)

        except Exception as ex:
            # BUG FIX: ex.message does not exist in Python 3 — referencing it
            # inside the handler raised AttributeError. Log the exception itself.
            logging.error(ex)
            self.location = {}
            self.former_names = []
예제 #7
0
    def scrape_data(self):
        """Scrape positions, height, weight, and game-log URLs from the
        player's overview page on basketball-reference.com.

        Populates ``self.overview_url_content``, ``self.positions``,
        ``self.height``, ``self.weight`` and appends to
        ``self.gamelog_url_list``.

        Raises:
            Exception: if this player's page was already scraped
                (``overview_url_content`` is not None).
        """
        # Ported to Python 3: print() function and no .encode("utf8")
        # (which would produce bytes and break str.split on Python 3).
        print(self.name, self.overview_url)
        if self.overview_url_content is not None:
            raise Exception("Can't populate this!")

        overview_soup = getSoupFromURL(self.overview_url)
        self.overview_url_content = overview_soup.text

        try:
            # The first line of the info paragraph carries position, height
            # and weight; extract each with the class-level patterns.
            player_infotext = overview_soup.findAll(
                'p', attrs={'class':
                            'padding_bottom_half'})[0].text.split('\n')[0]

            self.positions = re.findall(
                self.POSN_PATTERN, player_infotext)[0].strip().split(" and ")
            self.height = re.findall(self.HEIGHT_PATTERN,
                                     player_infotext)[0].strip()
            self.weight = re.findall(self.WEIGHT_PATTERN,
                                     player_infotext)[0].strip()

        except Exception as ex:
            # BaseException.message was removed in Python 3; log the exception itself.
            logging.error(ex)
            self.positions = []
            self.height = None
            self.weight = None

        # The links to each year's game logs are in <li> tags whose text
        # contains 'Game Logs', so we can use those to pull out our urls.
        for li in overview_soup.find_all('li'):
            if 'Game Logs' not in li.getText():
                continue
            for game_log_link in li.findAll('a'):
                self.gamelog_url_list.append(
                    'http://www.basketball-reference.com' +
                    game_log_link.get('href'))