Пример #1
0
 async def _get_images(self,
                       data: element,
                       title: str,
                       max_images: Optional[int] = None) -> list:
     """Build file objects for the ``<img>`` tags found in *data*.

     Tags whose ``alt`` text is listed in ``self._BANNED_ALT`` are skipped,
     as are tags without a usable ``src`` and URLs that were already seen
     (URLs are compared after their query string has been stripped).
     URLs without a network location are resolved against ``self._DOMAIN``.

     :param data: Parsed markup node exposing ``find_all``.
     :param title: Forwarded to ``_get_file_value_object`` as ``pretty_name``.
     :param max_images: When truthy, only the first ``max_images`` ``<img>``
         tags are considered. The cut is made before filtering, so fewer
         images than ``max_images`` may be returned.
     :return: List of the objects returned by ``_get_file_value_object``,
         in the order the tags were found.
     """
     img_tags = data.find_all('img')
     if max_images:
         img_tags = img_tags[:max_images]

     images = []
     seen_urls = set()
     for img_tag in img_tags:
         attrs = img_tag.attrs
         if 'alt' in attrs and attrs['alt'] in self._BANNED_ALT:
             continue

         # An <img> without ``src`` (or with an empty one) has nothing to
         # download; skipping it avoids a KeyError and avoids resolving an
         # empty path to the bare domain below.
         img_url = attrs.get('src', '').split('?')[0]
         if not img_url or img_url in seen_urls:
             continue
         seen_urls.add(img_url)

         # Relative paths carry no netloc and must be made absolute.
         if not urllib.parse.urlparse(img_url).netloc:
             img_url = urllib.parse.urljoin(self._DOMAIN, img_url)

         image = await self._get_file_value_object(
             url=img_url,
             pretty_name=title,
             filename_unique=self._FILENAME_UNIQUE,
             public_url=self._PUBLIC_URL)
         images.append(image)
     return images
Пример #2
0
def get_cells_from_row(row: bs4.element) -> list:
    """
    Yield the whitespace-stripped text of every ``td`` cell in a table row.

    Note that this is a generator: wrap the call in ``list(...)`` when an
    actual list is needed.
    :param row: BS4 table row.
    """
    yield from (cell.text.strip() for cell in row.find_all('td'))
Пример #3
0
    def validate_and_extract(self, node: bs4.element):
        """Return an ``ETweet`` for an embedded-tweet node, otherwise ``None``.

        A node qualifies when it is a ``Tag`` whose ``class`` attribute
        contains ``twitter-tweet`` or ``twitter-tweet-rendered`` and whose
        last ``<a>`` descendant has an ``href``. The tweet id is the final
        ``/``-separated piece of that href with any ``?...`` suffix removed.
        """
        if not isinstance(node, bs4.element.Tag) or not node.has_attr('class'):
            return None

        css_classes = node['class']
        if ('twitter-tweet' not in css_classes
                and 'twitter-tweet-rendered' not in css_classes):
            return None

        anchors = node.find_all('a')
        if not anchors:
            return None

        # Only the last link inside the embed is inspected.
        last_anchor = anchors[-1]
        if not last_anchor.has_attr('href'):
            return None

        last_segment = last_anchor['href'].split('/')[-1]
        return ETweet(last_segment.split('?')[0])
Пример #4
0
def _parse_one_oakland_chart(chart: bs4_element):
    """Parse one of the charts on the Oakland page.

    :param chart: Chart node; expected to contain a ``div`` with class
        ``chart-vertical-title`` and ``li`` entries whose ``title`` attribute
        is an abbreviated month name (``%b``), each holding a ``span`` with
        class ``number``.
    :return: Tuple ``(title, months, teus)`` where ``months`` is a list of
        month numbers and ``teus`` the matching list of floats, with
        ``np.nan`` for entries whose number text is empty.
    """
    title = chart.find('div', attrs={'class': 'chart-vertical-title'}).text

    months = []
    teus = []
    for el in chart.find_all('li'):
        # Only <li> elements carrying a month in ``title`` are data points.
        if 'title' not in el.attrs:
            continue

        month = pd.to_datetime(el.attrs['title'], format='%b').month
        num_text = el.find('span', attrs={'class': 'number'}).text.strip()
        # Built-in float replaces the ``np.float`` alias, which was removed
        # in NumPy 1.24; thousands separators are dropped before parsing.
        num = float(num_text.replace(',', '')) if num_text else np.nan

        months.append(month)
        teus.append(num)

    return title, months, teus
Пример #5
0
def get_entries_from_table(table: bs4.element) -> list:
    """
    Walk a BS4 tournament entries table and yield one ``Partnership`` per usable row.

    Each ``Partnership`` is built from the text of the first column (the school) and a tuple of the
    whitespace-separated words of the third column with ``&`` removed (the team members' names).
    Rows with fewer than three cells, or whose third cell reads ``Names TBA``, are skipped.
    :param table: Table of entries.
    :return: Generator of ``Partnership`` objects.
    """
    for table_row in table.find_all('tr'):
        cells = list(get_cells_from_row(table_row))

        # Rows with fewer than three columns carry no partnership info.
        if len(cells) < 3:
            continue

        school, member_text = cells[0], cells[2]

        # Entries whose names are still to be announced are not reported.
        if member_text == 'Names TBA':
            continue

        yield Partnership(school, tuple(member_text.replace('&', '').split()))
Пример #6
0
    def __parse_row(self, row: element) -> None:
        """Populate this object's fields from the ``td`` cells of *row*.

        Cell 0 is the code, cell 1 holds (inside a ``span``) either a single
        ``dd/mm/YYYY`` date or two dates joined by ``' a '``, cells 2-3 are
        integer amounts, cells 4-5 are prices written with a decimal comma,
        and cell 7 is the position. Cell 6 is not read.
        """
        cells = row.find_all('td')

        def cell_text(index):
            return cells[index].text.strip()

        def decimal_comma(index):
            return float(cell_text(index).replace(',', '.'))

        def to_date(raw):
            return datetime.strptime(raw, "%d/%m/%Y").date()

        self.code = cell_text(0)

        period = cells[1].span.text.strip()
        # A lone date means there is a buy date but no sell date.
        buy, sell = period.split(' a ') if ' a ' in period else (period, None)

        self.buy_date = to_date(buy)
        self.sell_date = None if sell is None else to_date(sell)

        self.buy_amount = int(cell_text(2))
        self.sell_amount = int(cell_text(3))
        self.buy_price = decimal_comma(4)
        self.sell_price = decimal_comma(5)

        self.position = cell_text(7)
        self.__initialized = True
Пример #7
0
 async def _remove_non_text_tags(data: element) -> element:
     """Call ``decompose()`` on every ``<script>`` tag found in *data*.

     The node is modified in place and the same object is returned.
     """
     script_tags = data.find_all('script')
     for tag in script_tags:
         tag.decompose()
     return data