def state_data(s): data = [{"state": _} for _ in s] territories = ["american samoa", "district of columbia", "guam", "marshall islands", "micronesia", "northern mariana islands", "palau", "puerto rico", "u.s. virgin islands"] unitedstates = s + territories page3 = requests.get("https://www.cdc.gov/coronavirus/2019-ncov/vaccines/index.html") soup3 = bs(page3.content, 'html.parser') soup3.find_all('a', attrs={"class":"dropdown-item noLinking"}) links = [] for link in soup3.find_all('a', attrs={"class":"dropdown-item noLinking"}, href=True): links.append(link['href']) statelinks = dict(zip(unitedstates, links)) for territory in territories: del statelinks[territory] for state in s: url = f"https://www.nytimes.com/interactive/2020/us/{state.replace(' ', '-').lower()}-coronavirus-cases.html" req = requests.Session() response = req.get(url) strainer = ss("td", attrs={"class": "num yesterday svelte-fin3s2"}) soup = bs(response.content, features="html.parser", parse_only=strainer) counts = soup.find_all("span", attrs={"class": "svelte-fin3s2"}) strainer = ss("tr", attrs={"class": "svelte-fin3s2"}) soup = bs(response.content, features="html.parser", parse_only=strainer) yesterday = str(soup.find_all("th", attrs={"class": "header yesterday svelte-fin3s2"})[0].text).split("On ")[1] data[s.index(state)]["link"] = statelinks[state] data[s.index(state)]["cases"] = counts[0].text data[s.index(state)]["deaths"] = counts[1].text data[s.index(state)]["hospitalized"] = counts[2].text return data, yesterday
def scrapeFun1(lemonde_url, from_lang, to_lang):
    lemond_worldnews = SimpleUrl(lemonde_url)
    page = lemond_worldnews()
    filtertag = ss("article")
    titletag = ss("title")
    souptitle = bs(page, "html.parser", parse_only=titletag)
    subj = souptitle.get_text()
    soup = bs(page, "html.parser", parse_only=filtertag)
    soupstr = soup.get_text()
    markup = marksoup(soupstr, from_lang, to_lang)
    sendnews(subj, markup)
def find_price_nasdaq(t):
    from bs4 import BeautifulSoup as bs
    from bs4 import SoupStrainer as ss
    from selenium import webdriver
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    import time
    from fake_useragent import UserAgent

    ua = UserAgent()
    user_agent = ua.random
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('user-agent={0}'.format(user_agent))
    capa = DesiredCapabilities.CHROME
    capa["pageLoadStrategy"] = "none"
    driver = webdriver.Chrome(options=options, desired_capabilities=capa)
    driver.set_window_size(1440, 900)
    driver.get('https://www.nasdaq.com/market-activity/stocks/{}'.format(t))
    time.sleep(4)
    plain_text = driver.page_source
    driver.quit()
    only_class = ss(class_='symbol-page-header__pricing-price')
    soup = bs(plain_text, 'html.parser', parse_only=only_class)
    prices_found = []
    for result in soup:
        if result.text != '':
            price = result.text
            prices_found.append(price)
    price = float(max(prices_found).replace('$', ''))
    return {'price': price}
def __parse_data(self: object, start_date: str, end_date: str) -> None:
    data = self.__get_html(start_date, end_date)
    print('Your request is in progress, please wait...')
    for x in data:
        link = f'https://www1.arun.gov.uk/aplanning/OcellaWeb/{x.get("href")}'
        item = {}
        r = requests_session.get(link)
        innerstrainter = ss('table')
        innerhtml = bs(r.text, 'lxml', parse_only=innerstrainter)
        innerinfoblocks = innerhtml.findAll('tr')
        # Parse data from the html document
        item['Url'] = link
        item['Address'] = innerinfoblocks[4].findAll('td')[1].text
        item['ReferenceNumber'] = innerinfoblocks[1].findAll('td')[1].text
        item['Validateddate'] = innerinfoblocks[8].findAll('td')[1].text
        item['Status'] = innerinfoblocks[2].findAll('td')[1].text
        item['Proposal'] = innerinfoblocks[3].findAll('td')[1].text
        # Write data to the csv file
        writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
        writer.writerow(item)
def bbHeatMapEval(date, size, puzzlesEvaluated):
    puzzleLink = "https://www.xwordinfo.com/Crossword?date=" + date
    puzzle = requests.get(puzzleLink)
    # populates bbHeatMapDict for the evaluated puzzle
    tablesoup = bs(puzzle.content, 'html.parser', parse_only=ss(id="PuzTable"))
    rowsoup = tablesoup.find_all('tr')
    num_rows = len(rowsoup)
    cellsoup = tablesoup.find_all('td')
    num_cols = len(cellsoup) // len(rowsoup)
    if num_rows != size or num_cols != size:
        print("Crossword puzzle board for", date, "is nonstandard size!")
        return puzzlesEvaluated
    bbNum = 0
    for x in range(len(rowsoup)):
        cellsoup = rowsoup[x].find_all('td')
        cellList = str(cellsoup).split(',')
        for y in range(len(cellList)):
            if 'class="black"' in cellList[y]:
                bbNum += 1
                bbHeatMapDict[x][y] += 1
    bbNumList.append(bbNum)
    datesEvaluated.append(date)
    puzzlesEvaluated += 1
    if puzzlesEvaluated % 5 == 0:
        print(puzzlesEvaluated, "puzzles evaluated so far")
    return puzzlesEvaluated
def find_price_tradingview(t):
    from bs4 import BeautifulSoup as bs
    from bs4 import SoupStrainer as ss
    from selenium import webdriver
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    import time
    from fake_useragent import UserAgent

    ua = UserAgent()
    user_agent = ua.random
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('user-agent={0}'.format(user_agent))
    capa = DesiredCapabilities.CHROME
    capa["pageLoadStrategy"] = "none"
    driver = webdriver.Chrome(options=options, desired_capabilities=capa)
    driver.set_window_size(1440, 900)
    driver.get('https://www.tradingview.com/symbols/{}/'.format(t))
    time.sleep(4)
    plain_text = driver.page_source
    driver.quit()
    only_class = ss(class_='tv-symbol-price-quote__value js-symbol-last')
    price = float(bs(plain_text, 'html.parser', parse_only=only_class).text)
    return {'price': price}
def __get_html(self: object, fromm: str, to: str) -> list:
    r = requests_session.get(f'https://www1.arun.gov.uk/aplanning/OcellaWeb/planningSearch?action=Search&showall=showall&reference=&location=&OcellaPlanningSearch.postcode=&area=&applicant=&agent=&undecided=&receivedFrom={fromm}&receivedTo={to}&decidedFrom=&decidedTo=')
    strainter = ss('table')
    html = bs(r.text, 'lxml', parse_only=strainter)
    # Catch the link for every item
    infoblocks = html.findAll('a', href=True)
    return infoblocks
def marketwatch_other_data(t):
    import requests
    from bs4 import BeautifulSoup as bs
    from bs4 import SoupStrainer as ss
    import time
    from fake_useragent import UserAgent

    ua = UserAgent()
    user_agent = ua.random
    # get market cap and 52-week range from the summary table
    header = {'user-agent': user_agent}
    url = 'https://www.marketwatch.com/investing/stock/{}?mod=over_search'
    f = requests.get(url.format(t), headers=header)
    time.sleep(4)
    only_class = ss(class_='list list--kv list--col50')
    soup = bs(f.content, 'html.parser', parse_only=only_class)
    mlist = soup.find_all(class_='primary')
    fifty_two_week_range = mlist[2].text.split(' - ')
    fifty_two_week_low = float(fifty_two_week_range[0])
    fifty_two_week_high = float(fifty_two_week_range[1])
    market_cap_string = mlist[3].text.replace('$', '')
    if market_cap_string[-1] == 'T':
        split = market_cap_string.split('.')
        first = int(split[0]) * (10**12)
        second = int(split[1].replace('T', '')) * (10**9)
        market_cap = first + second
    elif market_cap_string[-1] == 'B':
        split = market_cap_string.split('.')
        first = int(split[0]) * (10**9)
        second = int(split[1].replace('B', '')) * (10**6)
        market_cap = first + second
    elif market_cap_string[-1] == 'M':
        split = market_cap_string.split('.')
        first = int(split[0]) * (10**6)
        second = int(split[1].replace('M', '')) * (10**3)
        market_cap = first + second
    data = {
        'market_cap': market_cap,
        'fifty_two_week_low': fifty_two_week_low,
        'fifty_two_week_high': fifty_two_week_high
    }
    return data
def nasdaq_other_data(t):
    from bs4 import BeautifulSoup as bs
    from bs4 import SoupStrainer as ss
    from selenium import webdriver
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    import time
    from fake_useragent import UserAgent

    ua = UserAgent()
    user_agent = ua.random
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('user-agent={0}'.format(user_agent))
    capa = DesiredCapabilities.CHROME
    capa["pageLoadStrategy"] = "none"
    driver = webdriver.Chrome(options=options, desired_capabilities=capa)
    driver.set_window_size(1440, 900)
    driver.get('https://www.nasdaq.com/market-activity/stocks/{}'.format(t))
    time.sleep(2)
    driver.execute_script("window.scrollTo(0, 800)")
    time.sleep(4)
    plain_text = driver.page_source
    driver.quit()
    # find table
    only_table_class = ss(class_="summary-data__cell")
    table = bs(plain_text, 'html.parser', parse_only=only_table_class)
    cell_list = []
    for x in table:
        cell_list.append(x.text)
    fifty_two_week_high_low = cell_list[8].split('/')
    # find 52-week low
    fifty_two_week_low = float(fifty_two_week_high_low[1].replace('$', ''))
    # find 52-week high
    fifty_two_week_high = float(fifty_two_week_high_low[0].replace('$', ''))
    # market cap
    market_cap = int(cell_list[9].replace(',', ''))
    return {
        'fifty_two_week_low': fifty_two_week_low,
        'fifty_two_week_high': fifty_two_week_high,
        'market_cap': market_cap
    }
def check_url(self, url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36')
    try:
        html = urllib.request.urlopen(req).read()
    except urllib.error.HTTPError as e:
        html = e.read()
    soup = bs(html, 'html.parser', parse_only=ss('a'))
    link = soup.find_all('a', attrs={'href': re.compile(self.domain)})
    if len(link) > 0:
        # a link from the domain was found
        for i in link:
            if i.has_attr('rel'):
                return 'NOFOLLOWED'
            else:
                return 'EXISTS'
    else:
        return 'REMOVED'
def find_price_marketwatch(t):
    import requests
    from bs4 import BeautifulSoup as bs
    from bs4 import SoupStrainer as ss
    import time
    from fake_useragent import UserAgent

    ua = UserAgent()
    user_agent = ua.random
    header = {'user-agent': user_agent}
    url = 'https://www.marketwatch.com/investing/stock/{}?mod=over_search'
    f = requests.get(url.format(t), headers=header)
    time.sleep(2)
    only_class = ss(class_='intraday__data')
    soup = bs(f.content, 'html.parser', parse_only=only_class)
    price = float(soup.find(class_='value').text)
    return {'price': price}
def fromMonthPage(url):
    """
    return: [{"title": item_title, "url": url_of_next_page}, more...]
    """
    rlt = []
    request = urllib2.Request(url, headers=Common.HEADER)
    try:
        html = urllib2.urlopen(request).read()
    except:
        if Common.DEBUG:
            print "Error: Reading MonthPage From %s" % url
        return rlt
    bx = bs(html, "html.parser", parse_only=ss("div", id="content"))
    aas = bx.find_all("a")
    for a in aas:
        href = a.get("href")
        if re.search(r'(\d\d-){5}\d\d\.html', href):
            rlt.append({"title": a.text, "url": urlparse.urljoin(url, href)})
    return rlt
def find_price_barrons(t):
    import requests
    from bs4 import BeautifulSoup as bs
    from bs4 import SoupStrainer as ss
    import time
    from fake_useragent import UserAgent

    ua = UserAgent()
    user_agent = ua.random
    header = {'user-agent': user_agent}
    url = 'https://www.barrons.com/quote/stock/{}'
    f = requests.get(url.format(t), headers=header)
    time.sleep(2)
    html_test = f.content
    price_class = ss(class_='market__price bgLast')
    price = float(bs(html_test, 'html.parser', parse_only=price_class).text)
    return {'price': price}
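# Hedged usage sketch (not part of the original sources): the find_price_* scrapers above all
# return {'price': float}, so one way to use them together is to query several and keep the
# median. find_price_consensus is a hypothetical helper name; the selenium-backed scrapers
# (nasdaq, tradingview) additionally need a local Chrome driver, and any scraper may fail when
# its site markup changes, hence the per-source try/except.
def find_price_consensus(t):
    import statistics
    prices = []
    for scraper in (find_price_marketwatch, find_price_barrons,
                    find_price_nasdaq, find_price_tradingview):
        try:
            prices.append(scraper(t)['price'])
        except Exception:
            # a scraper breaks whenever the page layout changes; skip it
            continue
    return {'price': statistics.median(prices)} if prices else None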
def fromItemPage(url):
    """
    return: [{"tor": url_of_torrent_page, "img": [url_of_one_img, url_of_other_img], "purl": parent_url}, {more...}]
    """
    request = urllib2.Request(url, headers=Common.HEADER)
    rlt = []
    try:
        html = urllib2.urlopen(request).read()
    except:
        if Common.DEBUG:
            print "Error: Reading %s" % url
        return {url: rlt}
    bx = bs(html, "html.parser", parse_only=ss("div", id="content"))
    nexxt = bx.find(["a", "img"])
    while nexxt:
        if nexxt.name == "a":
            try:
                # fails when the link is a picture instead of a link to a torrent download page
                nexxt.img.get("src")
            except:
                # to get a torrent page's link
                if len(rlt) != 0:
                    href = nexxt.get("href").strip()
                    if not rlt[-1].get("tor", None) and re.search(r'([A-Z0-9]{6,10}\.html$)|([a-z0-9]{16}\.html$)', href):
                        # get the first tor; the page link looks like XJKJDL.htm or eab34dfa8ab.html
                        rlt[-1]["tor"] = href
        elif nexxt.name == "img":
            src = nexxt.get("src").strip()
            if re.search(r'jpg$', src, re.I):
                if len(rlt) == 0 or rlt[-1].get("tor", None):
                    rlt.append({"img": [src]})
                else:
                    rlt[-1]["img"].append(src)
        nexxt = nexxt.find_next(["a", "img"])
    return {url: rlt}
from bs4 import BeautifulSoup as bs
from bs4 import SoupStrainer as ss
import requests
import sys

# request page, then load only the div tags with 'menu-item' class attribute into memory
url = requests.get('http://www.teiteirobata.com/new-page-2/')
items = bs(url.text, 'html.parser', parse_only=ss('div', class_='menu-item')).contents

# create menu dictionary, then fill it with menu items and their descriptions
menu = {}
for x in range(len(items)):
    # checks to see if a description is available for the menu item
    # note: items with '3' as their length do not have descriptions
    if len(items[x]) == 3:
        item = str(items[x].contents[1].contents[0]).strip()
        desc = ''
    else:
        item = str(items[x].contents[1].contents[0]).strip()
        desc = str(items[x].contents[3].contents[0]).strip()
    menu[item] = desc


# function to perform a search on the menu dictionary
# note: default search is for carpaccio, because it is awesome.
def find_in_menu(sub='carpaccio'):
    try:
        search = [s for s in menu if sub in s]
        result_item, result_desc = str(search[0]), menu[str(search[0])]
        print(result_item.title() + '\n' + result_desc)
    except:
        print('Sorry, "' + str(sub) + '" is not on the menu today.')
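# Hedged example calls for find_in_menu above (not in the original script): 'salmon' is an
# arbitrary search term and may or may not be on the scraped menu; if it is missing, the
# function falls back to its "not on the menu today" message.
find_in_menu()          # default search: carpaccio
find_in_menu('salmon')  # any substring of a menu item name works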
import bleach
import re
import csv
import urllib as ul
import urllib.request
from bs4 import BeautifulSoup as bs
from bs4 import SoupStrainer as ss

# open csv and write the header row
with open('<csv file goes here>', 'w') as fout:
    w = csv.writer(fout)
    w.writerow([
        'URL', 'Keyterm Density', 'Keyterm Score', 'PageSpeed Score',
        'Backlink Score', 'Content Score', 'Total Score'
    ])

# open sitemap
with ul.request.urlopen('<domain/sitemap url goes here>') as response:
    html = response.read()

# extract sitemap links within the domain
articles = ss('a')
linklist = []
soup = bs(html, "lxml", parse_only=articles)
for link in soup.find_all('a', href=True):
    linklist.append(link['href'])

clearflags = ['/', '/rss', '#weatherinline', '/happenings/', '/sitemap']
clearlist = [x for x in linklist if '.' not in x]
clearlinks = [y for y in clearlist if y not in clearflags]

finallinks = []
a = 0
while a < len(clearlinks):
    fulllink = '<domain url goes here>' + clearlinks[a]
    finallinks.append(fulllink)
    a = a + 1
def tradingview_other_data(t):
    from bs4 import BeautifulSoup as bs
    from bs4 import SoupStrainer as ss
    from selenium import webdriver
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    import time
    from fake_useragent import UserAgent

    ua = UserAgent()
    user_agent = ua.random
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('user-agent={0}'.format(user_agent))
    capa = DesiredCapabilities.CHROME
    capa["pageLoadStrategy"] = "none"
    driver = webdriver.Chrome(options=options, desired_capabilities=capa)
    driver.set_window_size(1440, 900)
    driver.get('https://www.tradingview.com/symbols/{}/'.format(t))
    time.sleep(4)
    plain_text = driver.page_source
    driver.quit()
    only_class = ss(class_='tv-widget-fundamentals__value apply-overflow-tooltip')
    found = bs(plain_text, 'html.parser', parse_only=only_class)
    found_list = []
    for x in found:
        found_list.append(x.text)
    # market cap
    market_cap_string = found_list[0].strip()
    # find 52-week low
    fifty_two_week_low = float(found_list[23].strip())
    # find 52-week high
    fifty_two_week_high = float(found_list[22].strip())
    split = market_cap_string.split('.')
    if len(market_cap_string) > 1:
        if market_cap_string[-1] == 'T':
            first = int(split[0]) * (10**12)
            second = int(split[1].replace('T', '')) * (10**9)
            market_cap = first + second
        elif market_cap_string[-1] == 'B':
            first = int(split[0]) * (10**9)
            second = int(split[1].replace('B', '')) * (10**6)
            market_cap = first + second
        elif market_cap_string[-1] == 'M':
            first = int(split[0]) * (10**6)
            second = int(split[1].replace('M', '')) * (10**3)
            market_cap = first + second
        elif market_cap_string[-1] == 'K':
            first = int(split[0]) * (10**3)
            second = int(split[1].replace('K', '')) * (10**0)
            market_cap = first + second
        data = {
            'market_cap': market_cap,
            'fifty_two_week_low': fifty_two_week_low,
            'fifty_two_week_high': fifty_two_week_high
        }
    else:
        data = {
            'fifty_two_week_low': fifty_two_week_low,
            'fifty_two_week_high': fifty_two_week_high
        }
    return data
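# Hedged sketch (not in the original code): both marketwatch_other_data and
# tradingview_other_data above turn a suffixed market-cap string such as "1.23B" into an
# integer. parse_market_cap is a hypothetical helper name; it scales the decimal part by
# its actual number of digits instead of the fixed 10^(n-3) factor used in the originals.
def parse_market_cap(cap_string):
    multipliers = {'T': 10**12, 'B': 10**9, 'M': 10**6, 'K': 10**3}
    if not cap_string or cap_string[-1] not in multipliers:
        return None
    suffix = cap_string[-1]
    whole, _, frac = cap_string[:-1].partition('.')
    value = int(whole) * multipliers[suffix]
    if frac:
        # e.g. "1.23B" -> 1 * 10^9 + 23 * 10^7
        value += int(frac) * (multipliers[suffix] // 10**len(frac))
    return value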
from bs4 import BeautifulSoup as bs
from bs4 import SoupStrainer as ss
import urllib as ul
import urllib.request
import bleach
import re

with ul.request.urlopen('http://www.premier-mountain-properties.net/sitemap/') as response:
    html = response.read()

articles = ss('a')
linklist = []
soup = bs(html, "lxml", parse_only=articles)
for link in soup.find_all('a', href=True):
    linklist.append(link['href'])
print(linklist)

clearflags = ['/', '/rss', '#weatherinline', '/happenings/', '/sitemap']
clearlist = [x for x in linklist if '.' not in x]
clearlinks = [y for y in clearlist if y not in clearflags]
print(clearlinks)

finallinks = []
a = 0
while a < len(clearlinks):
    fulllink = 'http://www.premier-mountain-properties.net' + clearlinks[a]
    finallinks.append(fulllink)
    a = a + 1
print(finallinks)
print(SC[b] + ": " + str(sctotal)) seccontent = seccontent + sctotal b = b+1 maincontent = maincontent*10 seccontent = seccontent*5 tags = html.count(UN[0]) uncontent = tags - maincontent - seccontent print("Total Unrelated Tags: " + str(uncontent)) #Calculate % of MC and SC. mcper = round((maincontent / uncontent)*100, 2) scper = round((seccontent / uncontent)*100, 2) print("Main: " + str(mcper)) print("Secondary: " + str(scper)) #pull just the article from the html articles = ss('article') #clean all tags from the article soup = bs(html, "lxml", parse_only=articles) article = soup.find_all('article') thing = bleach.clean(str(article), strip=True) thing = re.sub('\W+',' ', thing) thing = re.sub( r"([A-Z])", r" \1", thing) #remove extraneous words from the text text = thing text = text.lower() text = text.split() cleanlist = ["a", "an", "the", "for", "of", "it", "but", "nor", "so", "and", "but", "or", "yet", "is", "to", "at", "i", "if", "as", "in", "by", "on", "li", "ul", "p"] cleantext = [word for word in text if word.lower() not in cleanlist] #count keyterms in the text izer = len(keyterm) x = 0
async def _dict(self, ctx, *, term: str = None):
    """(∩`-´)⊃━☆゚.*・。゚ Search definitions in English using Oxford English Dictionary database

    Usage: {prefix}dict <word> [synonyms|proverbs|examples]"""
    if term is None:
        # Simple usage return for invoking an empty cmd
        sample = random.choice([
            'lecture', 'fantasy', 'gathering', 'gradually', 'international', 'desire'
        ])
        v = f'{ctx.prefix}{ctx.invoked_with} {sample}'
        usage = f'**Usage:** basic results\n{v}\n\n' \
                f'**Advanced Usage:** add any parameter\n{v} `examples` `synonyms` `proverbs` `sourcecode`'
        return await ctx.send(usage)

    await ctx.channel.trigger_typing()
    # We only want to search the first term, the rest is for extra results
    query = ''.join(term.split(' ')[0])
    # we lower it so it works as part of the search link
    url = f"{self.query_url}{query.lower()}"
    # requests code, use the headers to appear like a normal browser
    page = requests.get(url, headers=_HEADERS)
    # This command is EMBED-only, it doesn't work without embed perms
    e = discord.Embed(color=self.user_color)
    x = "https://media.discordapp.net/attachments/541059392951418880/557660549073207296/oxford_favicon.png"
    try:
        e.set_author(
            name=f'Definition of {query.title()} in English by Oxford Dictionaries',
            url=url,
            icon_url=x)
        # SoupStrainer is required to load 1/3 of the page, discarding unnecessary content
        # "gramb" contains the definition, "etym" contains pronunciation and origin
        _section_content = ss(
            "section",
            attrs={"class": ["gramb", "etymology etym", "pronSection etym"]})
        # Then we parse the resulting web page with Beautiful Soup 4
        soup = bs(page.content, "html.parser", parse_only=_section_content,
                  from_encoding="utf-8")

        # ================= Send HTML5 code as a message into chat ====================
        if ctx.message.content.endswith('sourcecode') and query != 'sourcecode':
            # This is mostly for debugging purposes: if the cmd doesn't give a result, check that the code works;
            # if `code` returns empty, the command couldn't find a valid page for {query}
            defs = soup.find('section', attrs={"class": "gramb"})
            # sends page parsed as HTML5
            if defs is not None:
                block = await ctx.send(
                    f'```html\n{defs.prettify()[:1970]}``` Chars: `{len(defs.text)}`')
                await block.add_reaction('\N{WHITE HEAVY CHECK MARK}')

        # ============= Word and its classification and pronunciation ================
        # noun, verb, adjective, adverb, etc...
        classification = soup.find('span', attrs={"class": "pos"})
        if classification is not None:
            cl = f"*`[{classification.text}]`* " or "\u200b"
            e.title = cl  # f"{cl}{query.title()}{pr.replace('/', '')}"

        # =============================================================================
        # first description
        definition = soup.find('span', attrs={"class": "ind"})
        if definition is not None:  # BUG-HUNTER, 1ˢᵗ 2ⁿᵈ 3ʳᵈ, 4ᵗʰ
            # Checks for a definition; if not found, it defaults to the fail-safe description below
            e.description = f"1. {definition.text[:500]}"
            # await ctx.send(first.text[:500])  # BUG-HUNTER

        # ===================== if cmd *args == 'examples' ============================
        if 'examples' in ctx.message.content and query != 'examples':
            example_1 = soup.find('div', attrs={"class": "exg"})  # first example
            if example_1 is not None:
                ex_1 = f'*{example_1.text[1:]}*' or "\u200b"
                try:
                    example_2 = soup.find_all('div', attrs={"class": "exg"})[1]
                    list_1 = example_2.text[1:].replace("’ ‘", "’*\n*‘")
                    ex_2 = f'\n*{list_1}'
                except IndexError:  # ResultSet object has no attribute '.text'
                    ex_2 = "\u200b"
                result = f"{ex_1}{ex_2}"
                # This is merely aesthetic so that it ends with ... or not
                if result[:800].endswith("’"):
                    # We expect it to end well
                    complete = f'{result[:800]}*'
                else:
                    # if it doesn't, then we format it properly here
                    complete = f'{result[:800]}...*'
                e.add_field(name='Examples', value=complete, inline=False)  # BUG-HUNTER

        # ======================= First Synonyms in result =============================
        try:
            # Synonyms for search  # .find_all('strong')
            synonyms_1 = soup.find('div', attrs={"class": "synonyms"})
            if synonyms_1 is not None:
                results = synonyms_1.text
                syns = results.replace('Synonyms', '').replace('View synonyms', '') or "#z"
                if 'synonyms' in ctx.message.content and query != 'synonyms':
                    e.add_field(name='Synonyms', value=f'```bf\n{syns[:460]}```',
                                inline=False)  # BUG-HUNTER
                else:
                    synonyms_2 = soup.find('div', attrs={"class": "exs"})
                    res = synonyms_2.find_all('strong').text
                    e.add_field(name='Synonyms', value=f'```bf\n{res}```',
                                inline=False)  # BUG-HUNTER
                    # await ctx.send(phrases.text[:270])  # BUG-HUNTER
        except AttributeError:  # ResultSet object has no attribute '.text'
            pass

        # ======================= Output proverbs and samples ==========================
        proverb = soup.find('div', attrs={"class": "trg"})
        if proverb is not None:
            try:
                # Proverb, {query} used in sentences
                proverb.find('div', attrs={"span": "sense-registers"})
                x = proverb.text.replace("’ ‘", "’\n‘").replace(". ‘", ".\n\n‘")
                if 'proverbs' in ctx.message.content and query != 'proverbs':
                    # split x and output after 'More example sentences...'
                    z = '’'.join(x.split("’")[3:-4])
                    e.add_field(name='Proverb', value=f"*{z[1:][:960]}...*", inline=False)
                else:
                    z = '’'.join(x.split("’")[3:-2])
                    e.add_field(name='Proverb', value=f"*{z[1:][:240]}...*", inline=False)
                    # return await ctx.send(z[:1600])  # BUG-HUNTER
            except TypeError:  # TypeError: unhashable type: 'slice' in [:260]
                pass

        # =================== Word Origin ETYMOLOGY [working] =========================
        try:
            # etymology & pronunciation
            pronunciation_2 = soup.find('span', attrs={"class": "phoneticspelling"})
            if pronunciation_2 is not None:
                try:
                    classification_2 = soup.find_all(
                        'section', attrs={"class": "etymology etym"})[1].find('p').text
                    msg = f'\n**Origin:** *{classification_2}*'
                except IndexError:  # ResultSet object has no attribute '.text'
                    msg = ""
                pro = f"**Pronunciation:** `({pronunciation_2.text})`" or "N/A"
                e.add_field(name=f'Etymology of {query.title()}',
                            value=f"{pro.replace('/', '')}{msg[:750]}", inline=False)
                # await ctx.send(msg[:750])  # BUG-HUNTER
        except IndexError:  # ResultSet object has no attribute '.text'
            pass

        # ================== copyright acknowledgments ================================
        e.set_footer(
            text=f'Oxford University Press © 2020 | Duration: {self.bot.ws.latency * 1000:.2f} ms')

        # ================== Fail-safe for words without a definition =================
        if not definition:
            e.description = f"Whoopsie! I couldn't find a definition for *{query}*.\n" \
                            f"Check spelling, or look for a variation of {query} as verb, noun, etc."

        try:
            return await ctx.send(embed=e)
        except Exception as e:
            tb = traceback.format_exc()
            return await ctx.send(
                f'```css\n[DAFUQ]\n{e}```\n```py\n、ヽ`、ヽ`个o(・・。)`ヽ、`ヽ、\n\n{tb}```')
        # await ctx.message.add_reaction('thankful:389969145019498502')
    except Exception as e:
        tb = traceback.format_exc()
        return await ctx.send(
            f'```css\n[OOPS, I DID IT AGAIN]\n{e}```\n```py\nヾ(゚∀゚○)ツ三ヾ(●゚д゚)ノ\n\n{tb}```')
from bs4 import BeautifulSoup as bs, SoupStrainer as ss
from lxml.cssselect import CSSSelector
import const
import lxml.html as lh
import string

BOX_SELECTOR = CSSSelector(".center a ")
UNFORMATTED_URL = "https://www.basketball-reference.com{}"
basic_stats_strainer = ss("table", id=lambda x: x and x.endswith('-game-basic'))


def extract_box_scores_from_raw_box_score_pages(authorizer, season):
    '''
    takes raw html and returns a list of string matrices
    '''
    steve_cur = authorizer.conn.cursor()
    steve_cur.execute("""
        select url,raw_html
        from raw_box_score_pages
        where (season,type) = (%s,%s)
        """, (season, const.BOX_SCORE_PAGE_TYPE))
    box_scores = []
    for url, raw_html in steve_cur:
        game_soup = bs(raw_html, 'lxml', parse_only=basic_stats_strainer)
        date = url[47:55]
        game_stats = []
        scores = []
        teams = []
        for table in game_soup.find_all('table'):
            team = table.get('id')
            team = team[4:7]
            teams.append(team)