async def _get_images(self, data: element, title: str, max_images: Optional[int] = None) -> list:
    """Collect image file value objects from the parsed page, skipping banned and duplicate images."""
    images = []
    img_urls = []
    img_tags = data.find_all('img')
    if max_images:
        img_tags = img_tags[:max_images]
    for img_tag in img_tags:
        # Skip images whose alt text marks them as banned/decorative.
        if 'alt' in img_tag.attrs:
            if img_tag.attrs['alt'] in self._BANNED_ALT:
                continue
        # Drop the query string and deduplicate by URL.
        img_url = img_tag.attrs['src'].split('?')[0]
        if img_url in img_urls:
            continue
        img_urls.append(img_url)
        # Resolve relative URLs against the scraper's domain.
        if not urllib.parse.urlparse(img_url).netloc:
            img_url = urllib.parse.urljoin(self._DOMAIN, img_url)
        image = await self._get_file_value_object(url=img_url,
                                                  pretty_name=title,
                                                  filename_unique=self._FILENAME_UNIQUE,
                                                  public_url=self._PUBLIC_URL)
        images.append(image)
    return images
def get_cells_from_row(row: bs4.element) -> list:
    """
    Get text from cells in the given BS4 table row.

    :param row: BS4 table row.
    """
    for column in row.find_all('td'):
        yield column.text.strip()
def validate_and_extract(self, node: bs4.element):
    """Return an ETweet if the node is an embedded tweet, otherwise None."""
    if isinstance(node, bs4.element.Tag) \
            and node.has_attr('class') \
            and ('twitter-tweet' in node['class'] or 'twitter-tweet-rendered' in node['class']):
        # The last <a> tag in an embedded tweet links to the tweet itself;
        # the tweet id is the final path segment of that URL.
        tweet_a_tag = node.find_all('a')
        if tweet_a_tag and tweet_a_tag[-1].has_attr('href'):
            tweet_url = tweet_a_tag[-1]['href']
            tweet_id = tweet_url.split('/')[-1].split('?')[0]
            return ETweet(tweet_id)
    return None
def _parse_one_oakland_chart(chart: bs4_element):
    """Parse one of the charts on the Oakland page."""
    title_el = chart.find('div', attrs={'class': 'chart-vertical-title'})
    title = title_el.text
    # Each data point is an <li> whose title attribute holds the month abbreviation.
    data_els = [el for el in chart.find_all('li') if 'title' in el.attrs]
    months = []
    teus = []
    for el in data_els:
        month = pd.to_datetime(el.attrs['title'], format='%b').month
        num_el = el.find('span', attrs={'class': 'number'})
        # np.float was removed in NumPy 1.24; the builtin float is equivalent here.
        num = np.nan if len(num_el.text) == 0 else float(num_el.text.replace(',', ''))
        months.append(month)
        teus.append(num)
    return title, months, teus
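# Hypothetical usage sketch (not part of the original source): the sample markup below only
# assumes the selectors the parser looks for (div.chart-vertical-title, <li title="...">,
# span.number); the real Oakland page layout may differ. It also assumes the module-level
# pandas/numpy aliases (pd, np) used by the parser above.
import bs4
import numpy as np
import pandas as pd

_SAMPLE_CHART = """
<div class="chart">
  <div class="chart-vertical-title">Total TEUs</div>
  <ul>
    <li title="Jan"><span class="number">85,000</span></li>
    <li title="Feb"><span class="number"></span></li>
  </ul>
</div>
"""

_chart = bs4.BeautifulSoup(_SAMPLE_CHART, 'html.parser').find('div', attrs={'class': 'chart'})
print(_parse_one_oakland_chart(_chart))  # ('Total TEUs', [1, 2], [85000.0, nan])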
def get_entries_from_table(table: bs4.element) -> list:
    """
    Take a BS4 representation of a tournament entries table and return a list
    of lists, each internal list holding the last names of the team's members.

    :param table: Table of entries.
    :return: List of name lists.
    """
    for row in table.find_all('tr'):
        # Skip rows with fewer than three columns - they won't have partnership info.
        if len(row.find_all('td')) < 3:
            continue
        columns = list(get_cells_from_row(row))
        # Don't return TBA entries.
        if columns[2] == 'Names TBA':
            continue
        names = columns[2].replace('&', '').split()
        school = columns[0]
        yield Partnership(school, tuple(names))
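# Hypothetical usage sketch (not part of the original source): Partnership is assumed here to
# be a simple (school, names) record, and the sample table is invented solely to show how
# get_cells_from_row and get_entries_from_table are driven together.
from collections import namedtuple

import bs4

Partnership = namedtuple('Partnership', ['school', 'names'])  # assumed shape

_SAMPLE_TABLE = """
<table>
  <tr><td>Example School</td><td>A</td><td>Doe &amp; Smith</td></tr>
  <tr><td>Other School</td><td>B</td><td>Names TBA</td></tr>
</table>
"""

_table = bs4.BeautifulSoup(_SAMPLE_TABLE, 'html.parser').find('table')
for _entry in get_entries_from_table(_table):
    print(_entry.school, _entry.names)  # Example School ('Doe', 'Smith')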
def __parse_row(self, row: element) -> None:
    """Populate this object's fields from one table row."""
    data_list = row.find_all('td')
    self.code = data_list[0].text.strip()
    # The period cell holds either a single date or a "buy a sell" date range
    # ("a" meaning "to").
    period = data_list[1].span.text.strip()
    if ' a ' in period:
        buy, sell = period.split(' a ')
    else:
        buy = period
        sell = None
    self.buy_date = datetime.strptime(buy, "%d/%m/%Y").date()
    if sell is not None:
        self.sell_date = datetime.strptime(sell, "%d/%m/%Y").date()
    else:
        self.sell_date = None
    self.buy_amount = int(data_list[2].text.strip())
    self.sell_amount = int(data_list[3].text.strip())
    # Prices use a comma as the decimal separator.
    self.buy_price = float(data_list[4].text.strip().replace(',', '.'))
    self.sell_price = float(data_list[5].text.strip().replace(',', '.'))
    self.position = data_list[7].text.strip()
    self.__initialized = True
async def _remove_non_text_tags(data: element) -> element:
    """Remove <script> tags from the parsed document so only human-readable text remains."""
    for script in data.find_all('script'):
        script.decompose()
    return data
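# Hypothetical usage sketch (not part of the original source): drives the coroutine above with
# asyncio and an invented HTML fragment to show that <script> content disappears from the
# extracted text.
import asyncio

import bs4


async def _demo_remove_non_text_tags():
    soup = bs4.BeautifulSoup(
        "<div><p>Visible text</p><script>console.log('hidden');</script></div>",
        'html.parser')
    cleaned = await _remove_non_text_tags(soup)
    print(cleaned.get_text(strip=True))  # Visible text


asyncio.run(_demo_remove_non_text_tags())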