Example #1
    def crawl(self, pages, depth=2):
        # Breadth-first crawl: index each page, collect its outgoing links,
        # and use them as the frontier for the next iteration.
        # Assumes: import urllib.request; from urllib.parse import urljoin;
        # from bs4 import BeautifulSoup as bs4
        for i in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = urllib.request.urlopen(page)
                except Exception:
                    print("Could not open %s" % page)
                    continue
                soup = bs4(c.read(), 'html.parser')
                self.addtoindex(page, soup)

                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText = self.gettextonly(link)
                        self.addlinkref(page, url, linkText)

                self.dbcommit()

            pages = newpages
Example #2
def tmp():  # needed so the IDE's "Optimize Imports" does not remove these imports
    time()
    urllib()
    bs4()
    Category()
    Deepl()
    FindDigits()
    Html()
    LoadDictFromFile()
    Parsing()
    Product()
    SaveDictToFile()
    Sw()
    WorkWithJSON()
    print()
    datetime()
    quote()
    urljoin()
def searchYoutubeAlternative(songName):
    # YouTube will block you if you query too many songs using this search
    # (see the throttling sketch after this function).
    textToSearch = songName
    query = urllib.parse.quote(textToSearch)
    url = "https://www.youtube.com/results?search_query=" + query
    response = urllib.request.urlopen(url)
    html = response.read()
    soup = bs4(html, 'html.parser')
    for vid in soup.findAll(attrs={'class': 'yt-uix-tile-link'}):
        print('https://www.youtube.com' + vid['href'])
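Given the warning above about YouTube blocking heavy use, one simple mitigation is to space out repeated searches. A minimal sketch; search_many and the delay value are illustrative, not anything YouTube documents:

import time

def search_many(song_names, delay_seconds=5.0):
    # Pause between queries so repeated searches are less likely to be blocked.
    for name in song_names:
        searchYoutubeAlternative(name)
        time.sleep(delay_seconds)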
Example #4
def get_historical_data(name, number_of_days):
    data = []
    url = "https://finance.yahoo.com/quote/" + name + "/history/"
    rows = bs4(urllib.request.urlopen(url).read(),
               'html.parser').findAll('table')[0].tbody.findAll('tr')

    for each_row in rows:
        divs = each_row.findAll('td')
        if divs[1].span.text != 'Dividend':  # ignore dividend rows in the table
            # Only the 'Open' price is collected here; the other values live in
            # divs[1] through divs[5] (see the sketch after this function).
            data.append({
                'Date': divs[0].span.text,
                'Open': float(divs[1].span.text.replace(',', ''))
            })

    return data[:number_of_days]
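Following the note about divs[1] through divs[5], here is a sketch that keeps several price columns instead of only 'Open'. The column order (1=Open, 2=High, 3=Low, 4=Close) is an assumption about Yahoo's history table layout, not something the snippet above confirms:

def get_historical_ohlc(name, number_of_days):
    # Same scraping approach as get_historical_data above, but keeping more of
    # the price columns. Assumed positions: 1=Open, 2=High, 3=Low, 4=Close.
    url = "https://finance.yahoo.com/quote/" + name + "/history/"
    rows = bs4(urllib.request.urlopen(url).read(),
               'html.parser').findAll('table')[0].tbody.findAll('tr')
    data = []
    for each_row in rows:
        divs = each_row.findAll('td')
        if len(divs) > 4 and divs[1].span.text != 'Dividend':
            data.append({
                'Date': divs[0].span.text,
                'Open': float(divs[1].span.text.replace(',', '')),
                'High': float(divs[2].span.text.replace(',', '')),
                'Low': float(divs[3].span.text.replace(',', '')),
                'Close': float(divs[4].span.text.replace(',', ''))
            })
    return data[:number_of_days]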
Example #5
'''
https://www.codewars.com/kata/get-a-users-honor/train/python

If you want/don't want your username to be in the tests, ask me in the discourse area. There can't be too many though because the server may time out.

Example:
>>> get_honor('dpleshkov')
4418
>>> get_honor('jhoffner')
21949
F# examples
>>> GetHonor('dpleshkov')
4418
>>> GetHonor('jhoffner')
21949
¹ Honor may or may not be current to the user

Libraries/Recommendations:

Fsharp:
open System.Net: use this namespace for opening a webpage (it's just a suggestion).
open System.Text.RegularExpressions: this namespace will give you access to Regex.

Python:
urllib.request.urlopen: Opens up a webpage.
re: The RegEx library for Python.
bs4(BeautifulSoup): A tool for scraping HTML and XML.

Notes:
While this kata is in beta, please vote on it and give your rank assessment.
'''
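The Python recommendations above (urllib.request.urlopen plus re or bs4) suggest one possible approach. A minimal sketch, where the profile URL pattern and the honor-matching regex are assumptions about the Codewars page layout and may need adjusting:

import re
import urllib.request

def get_honor(username):
    # Fetch the (assumed) public profile page and pull the first number that
    # follows the word "Honor"; both the URL and the pattern are guesses.
    url = "https://www.codewars.com/users/" + username
    html = urllib.request.urlopen(url).read().decode("utf-8")
    match = re.search(r"Honor:?\D*([\d,]+)", html)
    return int(match.group(1).replace(",", ""))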
Example #6
from bs4 import BeautifulSoup as bs4

html = request("Webseite")  # placeholder: fetch the raw HTML of the page here
soup = bs4(html, "html.parser")

match_title = soup.title.text
match_div = soup.div  # first div
match_div2 = soup.find('div', class_="Sonst")  # the div with class "Sonst"

# Searching the document
for article in soup.find_all('div', class_='article'):
    headline = article.p.text  # the text inside the p tag

for link in soup.find_all('a', href=True):  # find all links
    print(link['href'])
import requests, bs4

res = requests.get('http://nostarch.com')
res.raise_for_status()
noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')

type(noStarchSoup)
Example #8
    def wikipedia_scraping(self, url):
        # Need a way to choose a random Wikipedia entry: list all pages, pick a
        # random index, then fetch that entry? (see the sketch after this method)
        r = requests.get(url)
        html = r.text  # decoded response body
        self.soups[url] = bs4(html, 'html.parser')
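For the question in the comment about choosing a random entry, Wikipedia's Special:Random page redirects to a random article, which avoids listing every page first. A minimal sketch; the usage line with a scraper instance is hypothetical:

import requests

def random_wikipedia_url():
    # Special:Random redirects to a random article; requests follows the
    # redirect, so the final URL identifies the chosen entry.
    r = requests.get("https://en.wikipedia.org/wiki/Special:Random")
    return r.url

# Hypothetical usage with the scraper above:
# scraper.wikipedia_scraping(random_wikipedia_url())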