import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup as bs4

def crawl(self, pages, depth=2):
    # Breadth-first crawl: index each page, collect its outbound links,
    # and use those links as the next round of pages.
    for i in range(depth):
        newpages = set()
        for page in pages:
            try:
                c = urllib.request.urlopen(page)
            except Exception:
                print("Could not open %s" % page)
                continue
            soup = bs4(c.read(), 'html.parser')
            self.addtoindex(page, soup)
            links = soup('a')
            for link in links:
                if 'href' in dict(link.attrs):
                    url = urljoin(page, link['href'])
                    if url.find("'") != -1:
                        continue
                    url = url.split('#')[0]  # strip the location fragment
                    if url[0:4] == 'http' and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)
            self.dbcommit()
        pages = newpages
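# A hypothetical driver for the method above, assuming a 'crawler' class
# that supplies the addtoindex, isindexed, gettextonly, addlinkref, and
# dbcommit helpers (none of which are shown here); the constructor
# argument and seed URL are illustrative, not from the original.
c = crawler('searchindex.db')
c.crawl(['https://en.wikipedia.org/wiki/Web_crawler'], depth=2)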
def tmp():  # dummy references so the IDE's "Optimize Imports" keeps these imports
    time()
    urllib()
    bs4()
    Category()
    Deepl()
    FindDigits()
    Html()
    LoadDictFromFile()
    Parsing()
    Product()
    SaveDictToFile()
    Sw()
    WorkWithJSON()
    print()
    datetime()
    quote()
    urljoin()
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup as bs4

def searchYoutubeAlternative(songName):
    # YouTube will block you if you query too many songs through this search.
    query = urllib.parse.quote(songName)
    url = "https://www.youtube.com/results?search_query=" + query
    response = urllib.request.urlopen(url)
    html = response.read()
    soup = bs4(html, 'html.parser')
    # Note: 'yt-uix-tile-link' comes from YouTube's legacy server-rendered
    # markup; current result pages are built client-side, so this selector
    # may return nothing.
    for vid in soup.findAll(attrs={'class': 'yt-uix-tile-link'}):
        print('https://www.youtube.com' + vid['href'])
import urllib.request
from bs4 import BeautifulSoup as bs4

def get_historical_data(name, number_of_days):
    data = []
    url = "https://finance.yahoo.com/quote/" + name + "/history/"
    rows = bs4(urllib.request.urlopen(url).read(),
               'html.parser').findAll('table')[0].tbody.findAll('tr')
    for each_row in rows:
        divs = each_row.findAll('td')
        if divs[1].span.text != 'Dividend':  # ignore dividend rows in the table
            # Only the 'Open' price is collected here; for the other values,
            # look at divs[1] through divs[5].
            data.append({'Date': divs[0].span.text,
                         'Open': float(divs[1].span.text.replace(',', ''))})
    return data[:number_of_days]
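# Usage sketch for the function above: print the 'Open' price for the five
# most recent trading days. 'AAPL' is an arbitrary example ticker, and this
# assumes Yahoo still serves the history table as static HTML.
for day in get_historical_data('AAPL', 5):
    print(day['Date'], day['Open'])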
'''
https://www.codewars.com/kata/get-a-users-honor/train/python

If you want/don't want your username to be in the tests, ask me in the
discourse area. There can't be too many, though, because the server may
time out.

Example:
>>> get_honor('dpleshkov')
4418
>>> get_honor('jhoffner')
21949

F# examples:
>>> GetHonor('dpleshkov')
4418
>>> GetHonor('jhoffner')
21949

¹ Honor may or may not be current to the user.

Libraries/Recommendations:

F#:
    open System.Net: use this namespace for opening a webpage (it's just a
    suggestion).
    open System.Text.RegularExpressions: this namespace will give you access
    to Regex.

Python:
    urllib.request.urlopen: opens up a webpage.
    re: the regex library for Python.
    bs4 (BeautifulSoup): a tool for scraping HTML and XML.

Notes:
While this kata is in beta, please vote on it and give your rank assessment.
'''
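# A minimal sketch of a solution under the kata's Python recommendations.
# Assumptions (not given by the kata): the profile lives at
# https://www.codewars.com/users/<name>, and the page renders the stat as
# "Honor:" followed by a comma-separated number.
import re
import urllib.request

def get_honor(name):
    url = 'https://www.codewars.com/users/' + name
    html = urllib.request.urlopen(url).read().decode('utf-8')
    # Assumed markup: capture the first comma-separated number after "Honor:".
    match = re.search(r'Honor:.*?([\d,]+)', html)
    return int(match.group(1).replace(',', ''))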
import requests
from bs4 import BeautifulSoup as bs4

html = requests.get("https://example.com").text  # fetch the page
soup = bs4(html, "html.parser")

match_title = soup.title.text
match_div = soup.div  # first div
match_div2 = soup.find('div', class_="Sonst")  # the div with class "Sonst"

# Searching the tree
for article in soup.find_all('div', class_='article'):
    headline = article.p.text  # whatever is inside the <p>

for link in soup.find_all('a', href=True):  # find all links
    print(link['href'])
import requests, bs4

res = requests.get('http://nostarch.com')
res.raise_for_status()  # raise an exception if the download failed
noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')
type(noStarchSoup)  # bs4.BeautifulSoup
import requests
from bs4 import BeautifulSoup as bs4

def wikipedia_scraping(self, url):
    # TODO: need a way to choose a random Wikipedia entry. List all pages,
    # pick a random index, and then fetch that entry?
    r = requests.get(url)
    self.soups[url] = bs4(r.text, 'html.parser')
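# One answer to the TODO above: Wikipedia's Special:Random page redirects to
# a random article, and requests follows the redirect, so r.url ends up
# holding the resolved article URL. A minimal sketch:
import requests

def random_wikipedia_url():
    r = requests.get('https://en.wikipedia.org/wiki/Special:Random')
    return r.url  # e.g. https://en.wikipedia.org/wiki/<random article>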