Example #1
def get_lineups():
    for league in LEAGUES:
        if league in settings.ROTOWIRE_LEAGUE_URLS:
            url = settings.ROTOWIRE_BASE_URL + settings.ROTOWIRE_LEAGUE_URLS[
                league]
        else:
            continue
        rotowire_html = make_request(url, settings.REQUEST_HEADERS['rotowire'],
                                     'text')
        lineup_html = BeautifulSoup(rotowire_html['data'],
                                    parse_only=SoupStrainer(class_="lineups"),
                                    features="html.parser")

        # ONLY GET NEW LINEUP IF IT CHANGED
        current_html = RotowireRequestLog.objects.get_html(league)
        if current_html:
            current_html = current_html.html
        else:
            current_html = ''
        if current_html != str(lineup_html):
            rr = RotowireRequestLog()
            rr.html = str(lineup_html)
            rr.league = League.objects.get(code=league)
            rr.save()
    return True
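The strainer is doing the real work here: parse_only=SoupStrainer(class_="lineups") means only the lineup block is ever built into a tree, which keeps the stored HTML small and stable for the change check. A minimal standalone sketch of that straining step, with a made-up HTML string standing in for rotowire_html['data']:

from bs4 import BeautifulSoup, SoupStrainer

# Hypothetical markup standing in for the Rotowire response body.
html = '<div class="nav">menu</div><div class="lineups"><span>PG: A. Player</span></div>'
lineup_html = BeautifulSoup(html,
                            parse_only=SoupStrainer(class_="lineups"),
                            features="html.parser")
print(str(lineup_html))
# expected: <div class="lineups"><span>PG: A. Player</span></div>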
Example #2
 def test_parseOnlyThese_renamed_to_parse_only(self):
     with warnings.catch_warnings(record=True) as w:
         soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
     msg = str(w[0].message)
     self.assertTrue("parseOnlyThese" in msg)
     self.assertTrue("parse_only" in msg)
     self.assertEqual(b"<b></b>", soup.encode())
Example #3
    def test_find_all_by_attribute_soupstrainer(self):
        tree = self.soup("""
                         <a id="first">Match.</a>
                         <a id="second">Non-match.</a>""")

        strainer = SoupStrainer(attrs={'id' : 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])
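Outside the test harness, the same idea (passing a SoupStrainer to find_all as the match criterion rather than to the constructor) looks roughly like this; the 'html.parser' choice is an assumption, since the suite's self.soup helper picks its own builder:

from bs4 import BeautifulSoup, SoupStrainer

tree = BeautifulSoup(
    '<a id="first">Match.</a><a id="second">Non-match.</a>', 'html.parser')
strainer = SoupStrainer(attrs={'id': 'first'})
print([tag.get_text() for tag in tree.find_all(strainer)])  # ['Match.']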
Example #4
async def feed_sniffer(url: str, html: AnyStr) -> Optional[str]:
    if url in FeedSnifferCache:
        return FeedSnifferCache[url]
    # if len(html) < 69:  # len of `<html><head></head><body></body></html>` + `<link rel="alternate" href="">`
    #     return None  # too short to sniff

    soup = await run_async_on_demand(BeautifulSoup,
                                     html,
                                     'lxml',
                                     parse_only=SoupStrainer(
                                         name=('a', 'link'),
                                         attrs={'href': True}),
                                     prefer_pool='thread',
                                     condition=len(html) > 64 * 1024)
    links = (soup.find_all(name='link',
                           attrs={
                               'rel': 'alternate',
                               'type': FeedLinkTypeMatcher
                           }) or soup.find_all(name='link',
                                               attrs={
                                                   'rel': 'alternate',
                                                   'href': FeedLinkHrefMatcher
                                               })
             or soup.find_all(name='a', attrs={'class': FeedATextMatcher})
             or soup.find_all(name='a', attrs={'title': FeedATextMatcher})
             or soup.find_all(name='a', attrs={'href': FeedAHrefMatcher})
             or soup.find_all(name='a', string=FeedATextMatcher))
    if links:
        feed_url = urljoin(url, links[0]['href'])
        FeedSnifferCache[url] = feed_url
        return feed_url
    return None
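The strainer in this coroutine restricts parsing to <a> and <link> tags that actually carry an href, which is what makes sniffing large pages cheap. A sketch of just that straining step, using plain BeautifulSoup since run_async_on_demand and the Feed*Matcher objects are project-specific (assumes lxml is installed):

from bs4 import BeautifulSoup, SoupStrainer

html = ('<link rel="alternate" type="application/rss+xml" href="/feed.xml">'
        '<a href="/about">About</a><span>ignored</span>')
soup = BeautifulSoup(html, 'lxml',
                     parse_only=SoupStrainer(name=('a', 'link'),
                                             attrs={'href': True}))
print([tag.name for tag in soup.find_all(True)])  # ['link', 'a']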
Example #5
 def test_soupstrainer(self):
     # The html5lib tree builder does not support SoupStrainers.
     strainer = SoupStrainer("b")
     markup = "<p>A <b>bold</b> statement.</p>"
     soup = self.soup(markup,
                      parse_only=strainer)
     self.assertEqual(
         soup.decode(), self.document_for(markup))
Example #6
 def extract_links(self, result):
     soup = BeautifulSoup(result['res'],
                          parse_only=SoupStrainer("a"),
                          features="html.parser")
     return [
         link["href"]
         for link in soup.find_all(attrs={"class": "storylink"})
         if link.has_attr("href")
     ]
Example #7
def get_dates_html():
    page_source = r.get(f'{FIFA_URL}/{FIRST_DATE}/')
    page_source.raise_for_status()
    dates = BeautifulSoup(
        page_source.text,
        'html.parser',
        parse_only=SoupStrainer(
            'li', attrs={'class': 'fi-ranking-schedule__nav__item'}))
    return dates
Example #8
    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
        markup = ('<head><meta content="text/html; charset=x-sjis" '
                  'http-equiv="Content-type"/></head><pre>foo</pre>')

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = self.soup(markup, parse_only=strainer)
        assert soup.contents[0].name == 'pre'
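The point of the test is that a <meta> charset declaration filtered out by the strainer must not be rewritten during output encoding. A rough standalone equivalent ('html.parser' is an assumption; the test runs against whichever builder the suite is parameterized with):

from bs4 import BeautifulSoup, SoupStrainer

markup = ('<head><meta content="text/html; charset=x-sjis" '
          'http-equiv="Content-type"/></head><pre>foo</pre>')
soup = BeautifulSoup(markup, 'html.parser', parse_only=SoupStrainer('pre'))
print(soup.contents[0].name)  # 'pre': the strained-out <meta> was never parsed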
Example #9
    def test_soupstrainer(self):
        # The html5lib tree builder does not support SoupStrainers.
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.decode(), self.document_for(markup))

        self.assertTrue("the html5lib tree builder doesn't support parse_only"
                        in str(w[0].message))
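Reproducing that warning outside the test suite looks roughly like the following sketch (requires the html5lib package; the exact warning wording can vary between Beautiful Soup releases):

import warnings

from bs4 import BeautifulSoup, SoupStrainer

with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    soup = BeautifulSoup("<p>A <b>bold</b> statement.</p>", "html5lib",
                         parse_only=SoupStrainer("b"))

# html5lib ignores parse_only, so the whole document is parsed and a warning is emitted.
print(soup.b.decode())  # <b>bold</b>
print(any("parse_only" in str(warning.message) for warning in w))  # True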
Example #10
    def __init__(self, page):
        """
        Construct an EventHub object
        :param page: the response to the get request
        :type: str
        """

        # Only fetch tags that are used by scrapers.
        strainer = SoupStrainer(name=[tag.value for tag in Tags])

        self.page = page
        self.html = BeautifulSoup(self.page.content,
                                  "html.parser",
                                  parse_only=strainer)
        self.events = list()
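A detail worth noting is that SoupStrainer accepts a list of tag names, so only the tags the scrapers care about are ever built. A rough sketch of that construction, with a literal list standing in for [tag.value for tag in Tags]:

from bs4 import BeautifulSoup, SoupStrainer

wanted = ['h1', 'p', 'a']  # hypothetical stand-in for the Tags enum values
html = '<h1>Title</h1><div>skipped</div><p>Body <a href="/x">link</a></p>'
soup = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer(name=wanted))
print([tag.name for tag in soup.find_all(True)])  # ['h1', 'p', 'a']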
Example #11
 def collect_page_urls(self, url: str):
     response = requests.get(url)
     if "text/html" in response.headers["Content-Type"]:
         hrefs = list(
             filter(
                 lambda doctype: doctype.has_attr("href"),
                 BeautifulSoup(
                     response.text,
                     features="html.parser",
                     parse_only=SoupStrainer("a"),
                 ),
             ),
         )
         urls = [f"{element['href']}" for element in hrefs]
         relative_urls = list(filter(self.validate_url, urls))
         page_urls = [self.clean_url(url) for url in relative_urls]
     else:
         page_urls = []
     return page_urls
Example #12
    def _fetch_company_data(self, ticker: str, year: int, txt_url: str) -> None:
        """Fetch the data_assets for the specified company and year from sec
        Args:
            ticker str: The ticker name
            year int: The year
            txt_url str: The url to fetch from the data_assets
        Returns:
            None
        """
        if not txt_url:
            return

        to_get_html_site = f'{self.SEC_ARCHIVE_URL}/{txt_url}'
        data = requests.get(to_get_html_site).content

        xbrl_doc = SoupStrainer("xbrl")
        soup = BeautifulSoup(data, 'lxml', parse_only=xbrl_doc)

        if soup:
            self.get_financial_data(soup, ticker, year)
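Because SEC filings can be very large, straining to the <xbrl> element keeps the parser from building a tree for the rest of the document. A minimal sketch of just that step, with a tiny hypothetical snippet in place of the downloaded filing (assumes lxml is installed):

from bs4 import BeautifulSoup, SoupStrainer

data = b'<html><body><xbrl><context id="c1"></context></xbrl></body></html>'  # hypothetical
soup = BeautifulSoup(data, 'lxml', parse_only=SoupStrainer('xbrl'))
print(soup.find('context')['id'])  # c1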
Example #13
def scrapy_rank_table(page, date):
    rows = BeautifulSoup(page, 'html.parser',
                         parse_only=SoupStrainer('tbody')).find_all('tr')
    table = []
    for row in rows:
        table.append({
            'id': int(row['data-team-id']),
            'country_full': row.find('span', {'class': 'fi-t__nText'}).text,
            'country_abrv': row.find('span', {'class': 'fi-t__nTri'}).text,
            'rank': int(row.find('td', {'class': 'fi-table__rank'}).text),
            'total_points': int(
                row.find('td', {'class': 'fi-table__points'}).text),
            'previous_points': int(
                row.find('td', {'class': 'fi-table__prevpoints'}).text or 0),
            'rank_change': int(
                row.find('td', {'class': 'fi-table__rankingmovement'})
                .text.replace('-', '0')),
            'confederation': row.find(
                'td', {'class': 'fi-table__confederation'}).text.strip('#'),
            'rank_date': date
        })
    return table
Example #14
def get_book_list(str_url):
    soup = parser(str_url)
    plist = SoupStrainer(id="plist")
    content = soup.find_all(plist)
    items = soup.select('.item')
    cur_page_data_list = []
    for item in items:
        book_info_list = []
        index = smart_str(item.select('.index')[0].contents[0].string)  # book ranking

        p_name = item.select('.p-name')[0].contents[0]
        book_url = smart_str(p_name['href'])  # book detail URL
        book_name = smart_str(p_name.text)  # book title

        book_info = item.select('.p-info')
        book_publisher_auther = book_info[0]

        book_p_a_len = len(list(book_publisher_auther.select('a')))
        book_auther = ''  # book author
        book_trans_auther = ''  # book translator
        book_publisher = ''  # book publisher
        if book_p_a_len == 2:
            book_auther = smart_str(book_publisher_auther.select('a')[0].text)
            book_publisher = smart_str(
                book_publisher_auther.select('a')[1].text)
        elif book_p_a_len == 1:
            book_auther = ''
            book_publisher = smart_str(
                book_publisher_auther.select('a')[0].text)
        elif book_p_a_len == 3:
            book_auther = smart_str(book_publisher_auther.select('a')[0].text)
            book_trans_auther = smart_str(
                book_publisher_auther.select('a')[1].text)
            book_publisher = smart_str(
                book_publisher_auther.select('a')[2].text)

        book_img = smart_str(item.select('.p-img.bookimg')[0].img['src'])

        book_prices = book_info[1]
        del_price = (smart_str(book_prices.select('del')[0].text)).replace(
            '¥', '')  # list price
        jd_price = (smart_str(book_prices.select('span')[0].text)).replace(
            '¥', '')  # JD price

        #print index,book_name,book_url
        #print book_auther,book_trans_auther,book_publisher
        #print del_price,jd_price
        #print book_img
        #print '|'.join(book_info_list)

        book_info_list.append(index)
        book_info_list.append(book_name)
        book_info_list.append(book_img)
        book_info_list.append(book_url)
        book_info_list.append(book_auther)
        book_info_list.append(book_trans_auther)
        book_info_list.append(book_publisher)
        book_info_list.append(del_price)
        book_info_list.append(jd_price)
        cur_page_data_list.append(tuple(book_info_list))
    return cur_page_data_list
Example #15
 def test_soupstrainer(self):
     """Parsers should be able to work with SoupStrainers."""
     strainer = SoupStrainer("b")
     soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                      parse_only=strainer)
     self.assertEqual(soup.decode(), "<b>bold</b>")
Example #16
 def test_soupstrainer(self):
     strainer = SoupStrainer("b")
     soup = self.soup("A <b>bold</b> <i>statement</i>",
                      parse_only=strainer)
     self.assertEqual(soup.decode(), "<b>bold</b>")
Example #17
class DomRiaFlatParser(DomRiaEstateParser):
    """
    Final parsers' progress, specialized on dom.ria.com flat offers' processing.

    Class properties:
        __area_pattern: regex to extract flat's area shapes
        during the page processing
    """
    _offer_strainer = SoupStrainer('section')
    _details = json('resources/dom_ria_flat_reaper/details.json')
    __area_pattern = compile(r'Площа (\S+)/(\S+)/(\S+)')

    def _parse_page(self, soup: BeautifulSoup) -> List[Dict[str, Any]]:
        return list(
            filter(
                lambda o: o is not None,
                map(self.__parse_section,
                    soup.find_all('section', 'ticket-clear'))))

    def __parse_section(self, tag: Tag) -> Dict[str, Any]:
        """
        Finds offer's specific data during the page processing.

        :param tag: DOM's node
        :return: "raw offer"
        """
        url = tag.find('a', 'blue')['href']
        if url.startswith('/uk/'):
            avatar = tag.find('span', 'load-photo').find('img')
            areas = self.__area_pattern.search(
                next((t.text for t in tag.find_all('li', 'mt-5 i-block')
                      if t.get('title', '').startswith('Площа:')))).groups()
            return {
                'url': f'https://dom.ria.com{url}',
                'avatar': avatar.get('src', avatar.get('data-src')),
                'area': self._float(areas[0]),
                'living_area': self._float(areas[1]),
                'kitchen_area': self._float(areas[2])
            }

    def _parse_offer(self, url: str, soup: BeautifulSoup,
                     **kwargs: Any) -> Flat:
        pairs = self._parse_pairs(soup)
        params = self.__parse_parameters(pairs)
        return Flat(url=url,
                    avatar=self._parse_avatar(soup, kwargs['avatar']),
                    published=self._parse_published(soup),
                    geolocation=self._parse_geolocation(soup),
                    price=self._parse_price(soup),
                    area=kwargs['area'],
                    living_area=kwargs['living_area'],
                    kitchen_area=kwargs['kitchen_area'],
                    rooms=params['rooms'],
                    floor=params['floor'],
                    total_floor=params['total_floor'],
                    ceiling_height=params['ceiling_height'],
                    details=self._parse_details(pairs))

    def _parse_pairs(self, soup: BeautifulSoup) -> Dict[str, str]:
        pairs = super()._parse_pairs(soup)
        pairs['Тип'] = next(
            filter(
                lambda t: t.endswith(' житло'),
                map(lambda t: t.text.strip(),
                    soup.find_all('li', 'labelHot'))), None)
        return pairs

    def __parse_parameters(
            self, pairs: Dict[str, str]) -> Dict[str, Union[int, float]]:
        """
        Extracts basic offer's numeric parameters (floor, rooms, etc).

        :param pairs: offer's tabular data
        :return: offer's numeric parameters
        """
        return {
            'rooms': self._int(pairs.get('Кімнат')),
            'floor': self._int(pairs.get('Поверх')),
            'total_floor': self._int(pairs.get('Поверховість')),
            'ceiling_height': self._ceiling_height(pairs.get('висота стелі'))
        }

    def _ceiling_height(self, string: str) -> Optional[float]:
        """
        Calculates flat's ceiling height (dependently on the digits' amount).

        :param string: number's char array
        :return: ceiling height in meters or None
        """
        ceiling_height = self._float(string)
        if ceiling_height is not None:
            return ceiling_height * 10**ceil(-log10(ceiling_height))
Example #18
# Imports Needed for The Web Scraping
from bs4 import BeautifulSoup
import requests
import lxml
from bs4.element import SoupStrainer
import urllib.request



# Set up the URL string and make a request to it
URL = "https://www.worldometers.info/coronavirus/"
r = requests.get(URL) 
links = SoupStrainer("div")

# URL checker: returns True if the HEAD request comes back with status 200, else False
def url_check(URL):
    r = requests.head(URL)
    return r.status_code == 200

# Prints the coronavirus statistics by scraping the website URL
def printStats():
    # Have Beautiful Soup parse the response content with lxml (faster parsing than html5lib)
    soup = BeautifulSoup(r.content, 'lxml', parse_only=links)

    # Store relevant number categories
    Data = []

    # Total, deaths, recovered segment from the URL; store into a list
Example #19
class OlxEstateParser(EstateParser):
    """
    Deep parser's evolution, specialized on www.olx.ua offers' processing.

    Class properties:
        _url_pattern: truncates the url string removing trailing hash
        _published_pattern: publication date regex pattern
        _months: matches between literal and numeric months
        _shapes_pattern: regex pattern with offer's price and currency
    """
    _offer_strainer = SoupStrainer('a')
    _url_pattern = compile(r'^(\S+\.html)')
    _published_pattern = compile(r'(\d{1,2}) (\w+) (\d{4})')
    _months = {
        'января': 1,
        'февраля': 2,
        'марта': 3,
        'апреля': 4,
        'мая': 5,
        'июня': 6,
        'июля': 7,
        'августа': 8,
        'сентября': 9,
        'октября': 10,
        'ноября': 11,
        'декабря': 12
    }
    _shapes_pattern = compile(r'^([ .\d]+) (\D+)$')

    def _parse_stop(self, soup: BeautifulSoup) -> Optional[int]:
        tags = soup.find_all('a', 'brc8')
        if len(tags) > 0:
            return int(tags[-1].findChild().text)

    def _parse_page(self, soup: BeautifulSoup) -> List[Dict[str, Any]]:
        return [{
            'url': self._url_pattern.search(t['href']).groups()[0]
        } for t in soup.find_all('a', 'thumb')]

    def _check_offer(self, soup: BeautifulSoup) -> bool:
        return (self.__find_junk(soup) is None
                and self.__find_published(soup) is not None
                and self.__find_point(soup) is not None
                and self.__find_price(soup) is not None)

    @staticmethod
    def __find_junk(soup: BeautifulSoup) -> Optional[Tag]:
        """
        Detects the tag which indicates the offer's obsolescence.

        :param soup: DOM tags' tree
        :return: target tag
        """
        return soup.find('h3', 'lheight20 large cfff')

    @staticmethod
    def __find_published(soup: BeautifulSoup) -> Optional[Tag]:
        """
        Detects the tag which contains offer's publication date.

        :param soup: DOM tags' tree
        :return: tag with publication date
        """
        return soup.find('span', 'pdingleft10')

    @staticmethod
    def __find_point(soup: BeautifulSoup) -> Optional[Tag]:
        """
        Finds tag with estate's longitude & latitude.

        :param soup: DOM tags' tree
        :return: tag with estate's geoposition
        """
        return soup.find(id='mapcontainer')

    @staticmethod
    def __find_price(soup: BeautifulSoup) -> Optional[Tag]:
        """
        Finds tag with offer's price and currency.

        :param soup: DOM tags' tree
        :return: tag with offer's shapes
        """
        tag = soup.find('strong', 'xxxx-large')
        return (tag if tag is not None else soup.find_all(
            'strong', 'xx-large')[1])

    @staticmethod
    def _parse_avatar(soup: BeautifulSoup) -> Optional[str]:
        """
        Extracts offer's avatar source if available.

        :param soup: DOM tags' tree
        :return: offer avatar's source or None
        """
        tag = soup.find('img', 'vtop')
        if tag is not None:
            return tag['src']

    def _parse_published(self, soup: BeautifulSoup) -> date:
        """
        Fetches offer's publication date.

        :param soup: DOM tags' tree
        :return: offer's publication date
        """
        published = self._published_pattern.search(
            self.__find_published(soup).text).groups()
        return date(int(published[2]), self._months.get(published[1], 1),
                    int(published[0]))

    def _parse_geolocation(
            self, soup: BeautifulSoup) -> Dict[str, Tuple[float, float]]:
        """
        Finds offer's geoposition and transforms them into point.

        :param soup: DOM tags' tree
        :return: offer's positional data
        """
        tag = self.__find_point(soup)
        return {'point': (float(tag['data-lon']), float(tag['data-lat']))}

    def _parse_shapes(self,
                      soup: BeautifulSoup) -> Dict[str, Union[Decimal, str]]:
        """
        Fetches offer's price and currency.

        :param soup: DOM tags' tree
        :return: offer's financial values
        """
        shapes = self._shapes_pattern.match(
            self.__find_price(soup).text).groups()
        return {
            'price': decimalize(shapes[0].replace(' ', '')),
            'currency': shapes[1]
        }

    @staticmethod
    def _parse_pairs(soup: BeautifulSoup) -> Dict[str, str]:
        """
        Extracts offer's tabular data (area, floor, details, etc).

        :param soup: DOM tags' tree
        :return: offer's numeric data & details
        """
        return {
            t.find('th').text: t.find('strong').text.strip('\t\n')
            for t in soup.find_all('table', 'item')
        }

    def _parse_junk(self, url: str, soup: BeautifulSoup) -> Optional[str]:
        if self.__find_junk(soup) is not None:
            return url
Example #20
import httplib2
import urllib.parse
from bs4 import BeautifulSoup
from bs4.element import SoupStrainer

http = httplib2.Http()
status, response = http.request("https://billwurtz.com/songs.html")

songLinks = BeautifulSoup(response, parse_only=SoupStrainer("a"))

for link in songLinks:
    if link.has_attr("href") and link["href"].endswith(".mp3"):
        songResponse, songData = http.request("https://billwurtz.com/" +
                                              link["href"])
        songPath = "songs/" + urllib.parse.unquote(link["href"])
        with open(songPath, 'wb') as f:
            f.write(songData)
        print("wrote: " + songPath)
Example #21
 def test_find_all_by_tag_strainer(self):
     self.assertSelects(
         self.tree.find_all(SoupStrainer('a')),
         ['First tag.', 'Nested tag.'])
Example #22
 def test_soupstrainer_constructor_string(self):
     with warnings.catch_warnings(record=True) as w:
         strainer = SoupStrainer(text="text")
         assert strainer.text == 'text'
         msg = str(w[0].message)
         assert msg == "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead."
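In current Beautiful Soup releases the non-deprecated spelling is the string keyword, so new code would build the strainer like this (a minimal sketch):

from bs4 import SoupStrainer

# Same strainer, constructed without triggering the deprecation warning.
strainer = SoupStrainer(string="text")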
Example #23
 def test_parse_with_soupstrainer(self):
     markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
     strainer = SoupStrainer("b")
     soup = self.soup(markup, parse_only=strainer)
     self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
Example #24
 def test_soupstrainer(self):
     strainer = SoupStrainer("b")
     soup = self.soup("A <b>bold</b> <meta /> <i>statement</i>",
                      parse_only=strainer)
     self.assertEqual(soup.decode(), "<b>bold</b>")
Example #25
def soupify_links(url, file_extension=None):
    """
    Returns a String list containing urls that match the specified file_extension
    Only works on link tags
    
    Args:
        url (String): the target URL
    
    Returns:
        [String]: A list of string URLs representing all links to content from <a> and <img> tags
    """

    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
    }

    with requests.session() as session:

        try:
            # run a GET request on the supplied URL
            r = session.get(url, headers=headers, stream=True, timeout=1)
            r.raise_for_status()
        except HTTPError as http_err:
            click.secho(f"\nHTTP error occurred: {http_err}\n",
                        fg="red",
                        bold=False)
            return False
        except TimeoutError as timeout_err:
            click.secho(f"\nRequest timed out: {timeout_err}\n",
                        fg="red",
                        bold=False)
            return False
        except Exception as err:
            click.secho(f"\nOther error occurred: {err}\n",
                        fg="red",
                        bold=False)
            return False
        else:
            # no errors... continue
            # parse just the <img> and <a> tags
            soup_a = BeautifulSoup(r.content,
                                   "lxml",
                                   parse_only=SoupStrainer("a"))
            soup_img = BeautifulSoup(r.content,
                                     "lxml",
                                     parse_only=SoupStrainer("img"))

    # build the list of hrefs
    hrefs = []

    if file_extension is not None:
        print(f"Getting links for {file_extension} files...")
        # Looking for a specific file_extension
        for img_link in soup_img(src=regex.compile(f".{file_extension}")):
            if img_link.get("src") is not None:
                hrefs.append(conv_rel_abs_addr(url, img_link.get("src")))
        for a_link in soup_a(href=regex.compile(f".{file_extension}")):
            if a_link.get("href") is not None:
                hrefs.append(conv_rel_abs_addr(url, a_link.get("href")))
    else:
        print("Getting links...")
        for img_link in soup_img.find_all("img"):
            if img_link.get("src") is not None:
                hrefs.append(conv_rel_abs_addr(url, img_link.get("src")))
        for a_link in soup_a.find_all("a"):
            if a_link.get("href") is not None:
                hrefs.append(conv_rel_abs_addr(url, a_link.get("href")))

    return hrefs