def getNews():
    client = mc(host)
    collection = client['mydb']['nba_news']
    lines = open('links.tmp', 'r').readlines()
    toggle, title, link = True, None, None
    for l in lines:
        if toggle:
            title = l.strip()
        else:
            link = l.strip()
            req = requests.get('{}/{}'.format(head, link))
            page = soup(req.text, 'html.parser')
            section = page.find('section')
            section = '<html><body>{}</body></html>'.format(str(section))
            article = soup(section, 'html.parser').find_all('p')
            content = ''.join([p.text for p in article])
            print title, link, content
            doc = {
                "title": title,
                "link": '{}/{}'.format(head, link),
                "content": content.replace("\"", "\'")
            }
            collection.insert_one(doc)
        toggle = not toggle
    print collection.count()
def nextborrower(url, urls):
    # There's no centralized page that lists all past loans on Zidisha,
    # so we need to do some crawling to find the next loan page.
    maxtries = 30
    borrowurl = ""
    html = urlopen(url)
    bsobj = soup(html.read(), 'html.parser')
    mydivs = bsobj.findAll("div", {"class": "lender-thumbnail"})  # get all the lenders who contributed
    otherborrowers = []
    tries = 0
    # Keep trying until we find a lender with at least one other borrower listed on their page.
    # There should be a more systematic way to do this to avoid repeats and reduce runtime.
    while (len(otherborrowers) == 0) and (tries < maxtries):
        choice = mydivs[randint(0, len(mydivs) - 1)]
        lendurl = choice.a.get('href')
        html = urlopen(lendurl)
        bsobj = soup(html.read(), 'html.parser')
        mydivs2 = bsobj.findAll("div", {"class": "lender-thumbnail"})  # all the borrowers that lender has given to
        if len(mydivs2) > 1:
            otherborrowers = mydivs2
            choice = mydivs2[randint(0, len(mydivs2) - 1)]
            borrowurl = choice.a.get('href')
        tries += 1
    if borrowurl in urls:
        # If this borrower has already been used, recursively go back to the beginning. A bit kludgy.
        return nextborrower(url, urls)
    html = urlopen(borrowurl)
    bsobj = soup(html.read(), 'html.parser')
    col = bsobj('div', {'class': 'col-sm-6'})[2].get_text()
    if "Date Disbursed" not in col:
        # If the loan hasn't been disbursed yet, don't use it for training or validation.
        return nextborrower(url, urls)
    assert tries < maxtries
    return borrowurl
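# The comment above notes that the random re-crawling can repeat lenders and that the
# recursion is kludgy. A minimal alternative sketch, assuming the same urlopen/soup/randint
# imports and the same Zidisha page structure (the disbursement check from the original is
# omitted for brevity): track visited lender pages in a set and loop instead of recursing,
# so each candidate page is fetched at most once.
def nextborrower_iterative(url, urls, maxtries=30):
    bsobj = soup(urlopen(url).read(), 'html.parser')
    lenders = bsobj.findAll("div", {"class": "lender-thumbnail"})
    visited = set()
    for _ in range(maxtries):
        lendurl = lenders[randint(0, len(lenders) - 1)].a.get('href')
        if lendurl in visited:
            continue
        visited.add(lendurl)
        lender_page = soup(urlopen(lendurl).read(), 'html.parser')
        borrowers = lender_page.findAll("div", {"class": "lender-thumbnail"})
        if len(borrowers) > 1:
            borrowurl = borrowers[randint(0, len(borrowers) - 1)].a.get('href')
            if borrowurl not in urls:
                return borrowurl
    return ""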
def frontpage(n):
    # Generates scores for the first n loans listed on Zidisha's main page
    # and writes a csv file of them.
    url = "https://www.zidisha.org/lend"
    html = urlopen(url)
    bsobj = soup(html.read(), 'html.parser')
    mydivs = bsobj.findAll("div", {"class": "profile-image-container"})
    fpfile = open('frontpage.csv', 'w')
    fpwriter = csv.writer(fpfile)
    fpwriter.writerow(['url', 'amount', 'cost', 'ratio', 'duration', 'city', 'country',
                       'ontime', 'notontime', 'history', 'posvote', 'negvote', 'fees',
                       'feeratio', 'title', 'description', 'pastscore'])
    links = [prof.a.get('href') for prof in mydivs]
    titles = []
    for i in range(n):
        beforescore, afterscore, ontime = getscore(links[i])
        fpwriter.writerow(profile(links[i]) + [beforescore])
        html = urlopen(links[i])
        bsobj = soup(html.read(), 'html.parser')
        hits = bsobj.findAll('p', {'class': 'alpha'})
        titles.append(hits[0].get_text().replace(' ', '').replace('\n', ''))
    fpfile.close()
    h2o.init()
    from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glme
    fpdf = h2o.import_file(path=abspath('./frontpage.csv'))
    result = evalmodel(fpdf)
    resultfile = open('results.csv', 'w')
    resultwriter = csv.writer(resultfile)
    resultwriter.writerow(['project', 'url', 'score'])
    for i in range(n):
        resultwriter.writerow([titles[i], links[i], result[i]])
def crawl(href, count):
    print(get_time() + ", Parsing Link: " + href)
    req = Request(href, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = uReq(req)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    heading = page_soup.find('center')
    content_container = page_soup.find('table', attrs={'style': "background:transparent; text-align:justify;"}).prettify()
    table = soup(content_container, "html.parser")
    para = table.find_all('p')
    # name = str(count) + ".html"
    with io.open("para_hn.html", "a", encoding="utf-8") as fout:
        # fout.write("\n\n" + heading.text + "\n\n")
        # for i in para:
        #     print(para[i])
        fout.write(str(para))
    link = page_soup.find('img', attrs={'alt': 'Next.png'})
    next_link = link.findPrevious('a')['href']
    complete_link = "http://hi.krishnakosh.org" + quote(next_link, safe='%,/')
    return complete_link
def imdb():
    res = r.get("http://www.imdb.com/chart/top")
    s = soup(res.text)
    x = []
    for i in s.findAll("a"):
        x.append(i.get("href"))
    l = []
    for i in x:
        if i.startswith("/title"):
            l.append("http://imdb.com" + i)
    l = list(set(l))
    d = {}
    for k in l:
        k = k.split("?")
        k = k[0]
        res = r.get(k)
        s = soup(res.text)
        for j in s.findAll("span", {"class": "itemprop"}):
            q = s.title.text.split("-")
            q = q[0].strip()
            if d.get(j.text):
                d[j.text].append(q)
            else:
                d.setdefault(j.text, []).append(q)
    return d
    # <a href="fullcredits?ref_=tt_cl_sm#cast">See full cast</a>
def appendFacebookInfo(stars):
    for key in stars:
        if stars[key]["url"]["facebook"] != "":
            if stars[key]["url"]["facebook"].endswith('/'):
                stars[key]["url"]["facebook"] = stars[key]["url"]["facebook"][:-1]
            try:
                url = stars[key]["url"]["facebook"] + "/info/?tab=page_info"
                try:
                    web_soup = soup(urllib2.urlopen(url), 'lxml')
                    infoString = web_soup.find(name="div", attrs={'data-id': 'page_info'})
                    emails = get_emails(str(infoString).lower())
                    for email in emails:
                        # print email
                        stars[key]["email"].append(email)
                except urllib2.HTTPError:
                    # print "Invalid Facebook URL Format :("
                    pass
                except:
                    web_soup = soup(urllib2.urlopen(url), 'lxml')
            except KeyError:
                print "key error"
                pass
    return stars
def getLinks():
    cache = open('links.tmp', 'a')
    navigation = '{}/2016/news/archive/{}/index.html'
    for week in xrange(10, 20):
        req = requests.get(navigation.format(head, week))
        page = soup(req.text, 'html.parser')
        div = page.find('div', class_='nbaNAContent')
        div = '<html><body>{}</body></html>'.format(str(div))
        links = soup(div, 'html.parser').find_all('a')
        for l in links:
            cache.write(l.get_text() + '\n')
            cache.write(l.get('href') + '\n')
def test_menu_item(self):
    output = soup(menu_item({}, link='/', name='bob', always_display=True)).find('li')
    self.assertEqual('bob', output.text)
    self.assertTrue('menu-bob' in output.attrs['class'])
    self.assertEqual('/', output.find('a').attrs['href'])

    output = soup(menu_item({}, link='/', name='bob', id='bobby', current=True,
                            unavailable=True, always_display=True)).find('li')
    self.assertEqual('bob', output.text)
    self.assertTrue('menu-bobby' in output.attrs['class'])
    self.assertTrue('current' in output.attrs['class'])
    self.assertTrue('unavailable' in output.attrs['class'])
    self.assertEqual('/', output.find('a').attrs['href'])
def getscore(url): #does sentiment analysis on the comment thread for a given loan html = urlopen(url + '/discussion') bsobj = soup(html.read(), 'html.parser') html2 = urlopen(url) bsobj2 = soup(html2.read(), 'html.parser') col = bsobj2('div', {'class' : 'col-sm-6'})[2] if "Date Disbursed" in col.get_text(): cutoff = datetime.strptime(col('strong')[1].get_text(), '%b %d, %Y').date() if len(col('strong', text = re.compile(r'On Time'))) > 0: ontime = 1 else: ontime = 0 else: cutoff = datetime.now().date() ontime = 1 mydivs = bsobj.findAll("div", {"class" : "media-body"}) comments = [div.p.get_text() for div in mydivs] spans = bsobj('span', {'class' : 'comment-actions'}) dates = [datetime.strptime(span.get_text(), '%b %d, %Y').date() for span in spans] beforecomments = [comments[i] for i in range(len(comments)) if dates[i] < cutoff] aftercomments = [comments[i] for i in range(len(comments)) if dates[i] >= cutoff] if len(beforecomments) > 0: comment = " ".join(beforecomments) comment = comment.replace(" ", "").replace("&","and").replace("#","") #there is often a lot of extra whitespace. get rid of that. Also, ampersands and pound signs seem to cause a problem, so toss 'em. chunks = re.findall(re.compile(r'.{1,1000}', re.DOTALL),comment) #chunks of text larger than 1-2k characters often don't seem to get processed properly. this is really kludgy, though. chunks = [''.join(s for s in chunk if ord(s)>31 and ord(s)<126) for chunk in chunks] #get rid of special and non-ascii characters #print(chunks) scores = [] for chunk in chunks: analysis = client.get_request({"text" : chunk}, HODApps.ANALYZE_SENTIMENT, async=False) #sentiment analysis of each chunk scores.append(analysis["aggregate"]["score"]) beforescore = mean(scores) else: beforescore = 0. if len(aftercomments) > 0: comment = " ".join(aftercomments) comment = comment.replace(" ", "") #there is often a lot of extra whitespace. get rid of that. chunks = re.findall(re.compile(r'.{1,1000}', re.DOTALL),comment) #chunks of text larger than 1-2k characters often don't seem to get processed properly. this is really kludgy, though. chunks = [''.join(s for s in chunk if ord(s)>31 and ord(s)<126) for chunk in chunks] #get rid of special and non-ascii characters #print(chunks) scores = [] for chunk in chunks: analysis = client.get_request({"text" : chunk}, HODApps.ANALYZE_SENTIMENT, async=False) #sentiment analysis of each chunk scores.append(analysis["aggregate"]["score"]) afterscore = mean(scores) else: afterscore = 0. return beforescore, afterscore, ontime
def get_title(self, url):
    """
    Get the title of the specified url. If there are any redirects, they
    will first be followed before pulling the title. Image and pdf links
    will be ignored.

    @param url - url to pull title for.
    @return - title if found.
    """
    while True:
        try:
            html = requests.get(url, verify=False)
            html.raise_for_status()
        except requests.exceptions.RequestException as e:
            log.err(str(e))
            return
        if html.headers['content-type'].startswith('image'):
            return
        elif html.headers['content-type'].startswith('application/pdf'):
            return
        else:
            parsed = soup(html.text, 'html.parser')
            if parsed.title is None:
                redirect = self._meta_redirect(parsed)
                if not redirect:
                    log.err("Couldn't parse content from %s" % (url,))
                    return
                else:
                    url = redirect
            else:
                break
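# The _meta_redirect helper used above is not shown in this snippet. A minimal sketch of
# what it might look like on the same class, assuming it extracts the target URL from a
# <meta http-equiv="refresh" content="0; url=..."> tag — the name and behaviour here are
# assumptions, not the original implementation:
import re

def _meta_redirect(self, parsed):
    tag = parsed.find('meta', attrs={'http-equiv': re.compile('^refresh$', re.I)})
    if not tag or not tag.get('content'):
        return None
    match = re.search(r'url\s*=\s*(\S+)', tag['content'], re.I)
    return match.group(1) if match else None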
def gcoos_describe_sensor(r_a, urn):
    """
    Notes: We get all settings from the .cfg file and build the_url.
    Different RAs are running different versions of SOS so the XML parsing
    might need some tweaking. This code is known to work with the GCOOS-RA
    SOS server.
    """
    the_url = CONFIG.get('ra_servers', r_a)
    the_url = the_url + CONFIG.get('base_urls', 'describe_sensor')
    the_url = the_url.replace('[anyURI]', urn)
    if DEBUG:
        print "gcoos_describe_sensor(%s, %s)..." % (r_a, urn)
    the_soup = soup(urllib2.urlopen(the_url).read(), 'html.parser')
    # get position
    the_pos = the_soup.find('gml:pos').contents[0]
    latitude = float(the_pos.split(' ')[0])
    longitude = float(the_pos.split(' ')[1])
    # slurp up the rest of the tasty bits...
    the_org = the_soup.find('sml:organizationname').contents[0]
    the_description = the_soup.find('gml:description').contents[0]
    sensor_list = []
    for sensor in set(the_soup.find_all('sml:output')):
        sensor_list.append(sensor['name'])
    # Get GeoJSON with it...
    my_feature = Feature(geometry=Point(([longitude, latitude])))
    my_feature.header = {'Organization': the_org,
                         'Station': urn,
                         'Description': the_description,
                         'Sensors': sensor_list}
    return my_feature
def scrape_news(input_from_click):
    # Build a Google News RSS query for "<country> happiness"
    country_name = input_from_click.replace(" ", "%20")
    search_term = country_name + '%20happiness'
    news_url = f"https://news.google.com/rss/search?q={search_term}"
    Client = urlopen(news_url)
    xml_page = Client.read()
    Client.close()
    soup_page = soup(xml_page, "xml")
    news_list = soup_page.findAll("item")
    news_articles = ""
    # Collect news title, url and publish date
    for news in news_list[:25]:
        news_articles += news.title.text + '</br>'
        news_articles += news.link.text + '</br>'
        news_articles += news.pubDate.text + '</br>'
        news_articles += "-" * 60 + '</br>'
    # print(f"Number of articles: {len(news_list)}")
    return news_articles
def inbound_sms():
    response = twiml.Response()
    # we get the SMS message from the request. we could also get the
    # "To" and the "From" phone number as well
    inbound_message = request.form.get("Body")
    # we can now use the incoming message text in our Python application
    chromedriver = '/Users/Pranavtadepalli/Downloads/chromedriver'
    browser = webdriver.Chrome(chromedriver)
    browser.get('https://lgusd.powerschool.com/public/home.html')
    username = browser.find_element_by_name("account")
    password = browser.find_element_by_name("pw")
    username.send_keys("prantadepa")
    password.send_keys("17802249")
    browser.find_element_by_id("btn-enter").click()
    raw = soup(browser.page_source)
    browser.quit()
    simp = '-'.join([elem.text for elem in raw.findAll('td') if elem.text != '\xa0']).split('-.-.-.-.-')
    teachers = []
    for grade in simp[1:]:
        current = []
        current.append(grade.split('\xa0')[0])
        current.append(' '.join(grade.split(',')[1].split('\xa0Email')))
        current.append(grade.split(':')[1].split('(')[0].split('-')[-4])
        teachers.append(current)
    if inbound_message != '':
        response.message(str(teachers))
    else:
        response.message("Hi! Not quite sure what you meant, but okay.")
    # we return back the mimetype because Twilio needs an XML response
    return Response(str(response), mimetype="application/xml"), 200
def get_leetcode_url():
    urls = []
    # Fetch the page content
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/43.0.2357.130 Safari/537.36'
    }
    cur_url = 'https://leetcode.com/problemset/algorithms/'
    s = requests.session()
    r = s.get(cur_url, headers=headers)
    r.encoding = 'utf-8'
    html = soup(r.text)
    problem_list = html.findAll('table', {"id": "problemList"})
    tbody = problem_list[0].find('tbody')
    common_url = 'https://leetcode.com'
    trs = tbody.findAll('tr')
    for tr in trs:
        tds = tr.findAll('td')
        prob_id = tds[1].text
        prob_name = tds[2].text
        url = tds[2].find('a')['href']
        # print prob_id + ' ' + prob_name
        # print common_url + url
        urls.append(prob_id + '||' + prob_name + '||' + common_url + url)
    return urls
def serializar_ocorrencias(lista_urls, palavra):
    # Will hold the list of occurrence counts for every link
    ocorrencias = []
    # Run this block for each link in the list
    for link in lista_urls:
        # Fetch the page and parse the html
        pag_url = str(uReq(link).read())
        pag_soup = soup(pag_url, "html.parser")
        # Number of <p> paragraphs in the html containing the word
        ct = paragrafos(pag_soup, palavra)
        # Build a dictionary with the link and the number of occurrences
        ocorrencia = {"url": link, "qtd_de_ocorrencias": str(ct)}
        # Add the dictionary to the list of occurrences
        ocorrencias.append(ocorrencia)
        # Print a separator to the console when switching urls
        print("*************************************************************************")
    # Return the list of dictionaries (could be serialized to json with json.dumps(ocorrencias, indent=4))
    return ocorrencias
def request_unique(session, num):
    unique = str(num)
    url = registrar_url + "/" + fall_2016_str + "/" + str(num)
    r = session.get(url)
    if r.status_code != 200:
        raise HTTPException("Got status code: " + str(r.status_code) + " when requesting unique range.")
    r = check_for_lares(r)
    results = soup(r.text).findAll("table", {"id": "details_table"})
    if len(results) < 1:
        return []
    elif len(results) > 1:
        raise HTTPException("Too many tables?")
    classes = []
    table = results[0]
    rows = table.findAll("tr")
    for tr in rows:
        unique = get_unique(tr)
        status = get_status(tr)
        days = get_days(tr)
        hours = get_hours(tr)
        if unique is not None:
            classes.append((unique, status, days, hours))
    sleep(0.02)
    return classes
def get_auth_token(self):
    url = self.endpoint + '/' + self.authEndpoint
    res = self.session.get(url)
    dom = soup(res.text)
    token = dom.find('input', {'name': 'token'}).get('value')
    self.token = token
    return token
def intitle(self, pattern, url):
    html = self.ResList[url].HTML
    try:
        title = soup(html).title
    except:
        return False
    return re.search(pattern, title.string) is not None
def content_seen_test(page):
    '''
    This function performs a document-seen test: it checks whether the
    document has already been downloaded, by comparing sha224 digests.

    >>> content_seen_test(page)
    True
    '''
    try:
        from pymongo import MongoClient
        from hashlib import sha224
        connection = MongoClient()
        db = connection.jobsdbs
        assert db.connection == connection
        if page == soup('', 'lxml'):
            return False
        if db.crawler_page_info.find_one({'doc_digest': sha224(page.body.encode('utf-8')).hexdigest()}):
            return True
        else:
            return False
    except:
        import sys
        db.crawler_error_log.insert({'error_type': str(sys.exc_info()), 'from_module': str(__file__)})
        print 'problem with document finger printing algorithm'
        return False
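# For the lookup above to ever return True, the crawler has to store the same sha224
# digest when it saves a page. A minimal sketch of that insert, assuming the same
# jobsdbs.crawler_page_info collection and that `page` is a parsed BeautifulSoup object
# (the function name and the extra 'url' field are illustrative, not from the original):
from hashlib import sha224
from pymongo import MongoClient

def record_page(page, url):
    db = MongoClient().jobsdbs
    digest = sha224(page.body.encode('utf-8')).hexdigest()
    db.crawler_page_info.insert_one({'url': url, 'doc_digest': digest})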
def crawl_full(): page = 1 while True: params = {'page': page} response = requests.get(url + '/home/index.html', headers=headers, params=params, timeout=45) timestamp = time.time() with open('tmp', 'w') as fw: fw.write(response.text) s = soup(response.text, 'lxml') matches = s.find('div', {'class': 'items'}).findAll('a') pg = s.find('li', {'class': 'page active'}) for match in matches: if not 'dota2-icon' in match.find('i').get('class'): continue series = match.find('span', {'class': 'spinach-league'}).text.strip().split('【')[0] try: matchtime_rlt = convert_time(match.find('div', {'class': 'pull-right spinach-league-right'}).text.strip()) except: print('***ERROR***', match.find('div', {'class': 'pull-right spinach-league-right'})) try: notes = match.find('span', {'class': 'spinach-league'}).text.strip().split('【')[1][:-1] except IndexError: notes = None href = url + match.get('href') yield crawl_details(href, series, matchtime_rlt, notes) if not pg.findNextSibling() or 'Market' in pg.findNextSibling().find('a').get('href'): break page += 1
def meta_scrape_table(self, url):
    from bs4 import BeautifulSoup as soup
    import requests

    type_map = {
        'Char': 'varchar',
        'Character': 'varchar',
        'Charter': 'varchar',  # A spelling error in the source page.
        'Decimal': 'real',
        'Date': 'datetime'}

    with self.session:
        self.database.create()
        r = requests.get(url)
        for tr in soup(r.content).find('table').find_all('tr')[1:]:
            cells = [td.text for td in tr.find_all('td')]
            yield {
                'field': cells[1].strip(),
                'datatype': type_map[cells[2].strip()],
                'size': int(cells[3].strip()),
                'description': cells[4].strip()
            }
def login(username, password):
    # Login to QRZ - Must have access to XML API
    login_url = ('{0}?username={1};password={2};agent=qrzpy1.0'
                 .format(api_root, username, password))
    # Send request
    try:
        res = requests.get(login_url)
    except requests.exceptions.Timeout:
        _error('Login request to QRZ.com timed out', True)
    # Check response code
    if res.status_code != 200:
        _error('Invalid server response from QRZ.com', True)
    # Parse response and grab session key
    data = soup(res.content)
    if data.session.key:
        session_key = data.session.key.text
    else:
        if data.session.error:
            err = data.session.error.text
            _error('Could not login to QRZ.com - {0}'.format(err), True)
        else:
            _error('Unspecified error logging into QRZ.com', True)
    return session_key
def lookup_callsign(callsign, session_key):
    # Check for no callsign
    if not callsign:
        return
    search_url = ('{0}?s={1};callsign={2}'
                  .format(api_root, session_key, callsign))
    # Send request
    try:
        res = requests.get(search_url)
    except requests.exceptions.Timeout:
        _error('Lookup request to QRZ.com timed out', True)
    # Check response code
    if res.status_code != 200:
        _error('Invalid server response from QRZ.com')
        return
    # Parse response and grab operator info
    data = soup(res.content)
    if not data.callsign:
        print 'No data found on {0}'.format(callsign)
    else:
        display_callsign_info(data.callsign)
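# A short usage sketch tying the two QRZ helpers above together, assuming api_root and the
# account credentials are configured elsewhere in the script (the callsign and credentials
# below are placeholders, not real values):
if __name__ == '__main__':
    session_key = login('my_username', 'my_password')
    lookup_callsign('W1AW', session_key)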
def request_unique_range(session, start, finish):
    assert start < finish
    url = registrar_url + "/" + fall_2016_str + "/results/?search_type_main=UNIQUE&ccyys="
    url += fall_2016_str + "&start_unique=" + str(start) + "&end_unique=" + str(finish)
    r = session.get(url)
    if r.status_code != 200:
        raise HTTPException("Got status code: " + str(r.status_code) + " when requesting unique range.")
    r = check_for_lares(r)
    results = soup(r.text).findAll("table", {"class": "results"})
    if len(results) < 1:
        return []
    elif len(results) > 1:
        raise HTTPException("Too many tables?")
    classes = []
    table = results[0]
    rows = table.findAll("tr")
    for tr in rows:
        unique = get_unique(tr)
        status = get_status(tr)
        days = get_days(tr)
        hours = get_hours(tr)
        if unique is not None:
            classes.append((unique, status, days, hours))
    sleep(0.02)
    return classes
def get_github_url():
    urls = []
    # Fetch the page content
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/43.0.2357.130 Safari/537.36'
    }
    cur_url = 'https://github.com/wzqwsrf/Leetcode/tree/master/Java'
    s = requests.session()
    r = s.get(cur_url, headers=headers)
    # Set the encoding here mainly to handle special characters such as '<',
    # which would otherwise cause problems
    r.encoding = 'utf-8'
    html = soup(r.text)
    problem_list = html.findAll('table', {"class": "files"})
    tbody = problem_list[0].find('tbody')
    common_url = 'https://github.com'
    spans = tbody.findAll('span', {"class": "css-truncate css-truncate-target"})
    k = 0
    slen = len(spans)
    for x in xrange(0, slen, 3):
        span = spans[x]
        print span
        a = span.find('a')
        title = a['title']
        url = a['href']
        urls.append(title)
    return urls
def findKonjugation(string):
    req = Request(
        f"{url}{quote(string)}",
        data=None,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        })
    req.add_header('Referer', 'https://www.verbformen.de/konjugation/')
    try:
        s = soup(urlopen(req).read(), "html.parser")
    except urllib.error.HTTPError as e:
        print("Error while searching: %s" % e)
        sys.exit(1)
    result = {}
    for tense in filtered_tenses:
        result[tense] = []
    for table in s.findAll('div', attrs={'class': 'rAufZu'}):
        for column in table.findAll('div', attrs={'class': 'vTbl'}):
            tense = column.find('h3').text
            if (tense in filtered_tenses) and not result[tense]:
                for tr in column.findAll('tr'):
                    result[tense].append(tr.text.strip())
    if result["Präsens"]:
        for tense in filtered_tenses:
            print(f"\033[1m{tense}\033[0m")
            print(" / ".join(result[tense]))
            print("")
        print(f"Quelle: {url}{quote(string)}\n")
    else:
        print("No result found.")
def send(self, request, **kwargs):
    response = super(HTTPAdapter, self).send(request, **kwargs)
    # ignore redirects
    if 'location' in response.headers:
        pass
    # look for the login page url
    elif response.url.startswith(auth_url + u'authenticate.html'):
        form = soup(response.text, 'lxml').find('form')
        if not form:
            raise ParseError("Could not parse login form", response)
        # build the login form param dict
        data = {
            i['name']: i.attrs.get('value')
            for i in form.find_all('input')
            if i['type'] not in ('checkbox', 'radio', 'submit')
        }
        data['userid'] = self.userid
        data['pwd'] = self.pwd
        # make a second request
        auth_request = requests.Request('POST', auth_url + 'authenticate2.html', data=data)
        auth_request = auth_request.prepare()
        response = self.send(auth_request)
    # look for the failed-login page url
    elif response.url.startswith(auth_url + u'authenticate2.html'):
        raise CredentialError("Invalid credentials")
    return response
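# A short usage sketch for the adapter method above, assuming it lives on a requests
# HTTPAdapter subclass (called AuthAdapter here purely for illustration) whose __init__
# stores userid and pwd; auth_url is the same prefix used in the method:
session = requests.Session()
session.mount(auth_url, AuthAdapter(userid='someone', pwd='secret'))
resp = session.get(auth_url + 'some/protected/page')  # logs in transparently if redirected to the login form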
def get_soup_from_url(self, url_in):
    """
    Return data loaded from an URL, as a BeautifulSoup object.

    Wrapper helper function around self.get_data_from_url().
    """
    return soup(self.get_data_from_url(url_in), 'html.parser')
def standards (user, year): standards = [] #Get the standards given a user from the nzqa.login function s = soup(requests.get(nzqa_standards_url, cookies=user).text, "html.parser") for tbody in s.find_all("tbody", class_='results-{}'.format(year)): #Filter the results to the year chosen by the user for tr in tbody.find_all("tr")[0:]: #Find all 'tr' tags within the 'tbody' tags of the page tds = tr.find_all("td") #Find all 'td' tags within the 'tr' tags of the page if tds[0].span and tds[1].span and tds[2] and tds[3]: standard = { #Filtering the scraped data contain only assessment standard data "Standard": tds[0].span.contents[0].strip(), #The standard number for this result "Assessment type": tds[2].contents[0].strip(), #The Assessment type: Internal or External "Description": re.sub('\s\s+',"",tds[3].span.contents[0]), #Assessment description: What is the assessment? "Level": tds[4].contents[0].strip(), #Assessment level: What year of NCEA is this? "Result": tds[10].span.contents[0].strip() #What the student has achieved: A, M or E }
def solveIXL(problem):
    print(problem)
    browser = webdriver.Chrome(chromedriver)
    browser.get('https://www.wolframalpha.com/input/?i=' + problem.replace('+', '%2B'))
    browser.find_element_by_name('equal').click()
    time.sleep(3)
    source = browser.page_source
    source = soup(source, 'lxml')
    browser.quit()
    # print(source)
    try:
        stuff = source.findAll('wa-no-query-link', class_="ng-scope ng-isolate-scope")
        # print(stuff)
        for elem in stuff:
            pie = elem
        return str(pie).split('title')[-1].split('"')[1].strip('.')
    except:
        stuff = source.findAll('a', class_="ng-scope ng-isolate-scope")
        # print(stuff)
        for elem in stuff:
            pie = elem
        try:
            return str(pie).split('title')[-1].split('"')[1].strip('.')
        except:
            return "don't know"
def latest_ca_scrape(): res = requests.get('https://www.bseindia.com/corporates/corporate_act.aspx') res.raise_for_status() page_soup = soup(res.content,features='lxml') no_of_pages_tab=page_soup.find('tr',{'class':'pgr'}) no_of_pages=len(no_of_pages_tab.find_all('a'))+1 options = webdriver.ChromeOptions() options.add_argument('--ignore-certificate-errors') options.add_argument('--incognito') options.add_argument('--headless') #This prevents the browser from opening up driver = webdriver.Chrome("/Users/pratikbaid/Developer/chromedriver", chrome_options=options) pageSource=res.content dataList=[] page_soup = soup(pageSource,features='lxml') dataRows=page_soup.find_all('tr',{"class":"TTRow"}) for dataRow in dataRows: dataColumns=dataRow.find_all('td') data=[] for dataColumn in dataColumns: data.append(dataColumn.text) dataList.append(data) if(no_of_pages>1): print('Entered first if') for i in range (2,no_of_pages+1): print("Entered ",i) xpath=f'//*[@id="ContentPlaceHolder1_gvData"]/tbody/tr[1]/td/table/tbody/tr/td[{i}]/a' print(xpath) driver.get('https://www.bseindia.com/corporates/corporate_act.aspx') driver.find_element_by_xpath(xpath).click() pageSource=driver.page_source page_soup = soup(pageSource,features='lxml') dataRows=page_soup.find_all('tr',{"class":"TTRow"}) for dataRow in dataRows: dataColumns=dataRow.find_all('td') data=[] for dataColumn in dataColumns: data.append(dataColumn.text) dataList.append(data) ca_array=[] for data in dataList: corporate_action={ 'secuarity_code':data[0], 'secuarity_name':data[1], 'ex_date':data[2], 'purpose':data[3], 'record_date':data[4], 'bc_start_date':data[5], 'bc_end_date':data[6], 'nd_start_date':data[7], 'nd_end_date':data[8], 'actual_payment_date':data[9] } ca_array.append(corporate_action) latest_ca_json={ 'Latest_CA':ca_array } json_data=json.dumps(latest_ca_json) return(json_data) '''//*[@id="ContentPlaceHolder1_gvData"]/tbody/tr[1]/td/table/tbody/tr/td[2]/a''' '''//*[@id="ContentPlaceHolder1_gvData"]/tbody/tr[1]/td/table/tbody/tr/td[3]/a'''
import random

for x in range(1):
    x = random.randint(1, 101)
    print(x)
    myUrl = 'https://www4.bing.com/search?q=beans=' + str(x)
    print(myUrl)

    # Opening connection
    uClient = uReq(myUrl)
    pageHtml = uClient.read()
    uClient.close()

    # Parsing part
    pageSoup = soup(pageHtml, "html.parser")

    # Grab results
    boop = pageSoup.findAll("li", {"class": "b_algo"})

    filename = "Fact Spreadsheet.csv"
    f = open(filename, "w")
    headers = "title, desc, link\n"
    f.write(headers)

    # Get data
    beep = boop[0]
    for beep in boop:
def wiley(input, name): filename = "Wiley_" + name + ".xlsx" filepath = "wiley/csv/" + filename now = datetime.datetime.now() workbook = xlsxwriter.Workbook(filepath) f = workbook.add_worksheet() f.write('A1', 'Keyword : ') f.write('B1', input) f.write('A2', 'Database : ') f.write('B2', 'https://onlinelibrary.wiley.com/') f.write('A3', 'Date : ') f.write('B3', str(now.isoformat())) count = 1 n = 4 f.write('A' + str(n), 'S.No') f.write('B' + str(n), 'Website') f.write('C' + str(n), 'Title') f.write('D' + str(n), 'Journal name') f.write('E' + str(n), 'Volume') f.write('F' + str(n), 'Date') f.write('G' + str(n), 'Doi number') f.write('H' + str(n), 'Author name') f.write('I' + str(n), 'E-mail by method1') f.write('J' + str(n), 'E-mail by method2') f.write('K' + str(n), 'Affiliation') f.write('L' + str(n), 'Country') n += 1 for i in range(0, 999999): print("Page : " + str(i)) stop = True try: headers = { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' } a = 'https://onlinelibrary.wiley.com/action/doSearch?AllField=' + input.replace( " ", "+") + '&startPage=&PubType=journal' b = 'https://onlinelibrary.wiley.com/action/doSearch?AllField=' + input.replace( " ", "%20") + '&startPage=' + str(i) + '&PubType=journal' my_url = "" if (i == 0): my_url = a else: my_url = b response = requests.get(my_url, headers=headers) page = soup(response.content, "html5lib") body = page.findAll("div", {"class": "item__body"}) for each in body: link = each.h2.span.a['href'] title = each.h2.text info = each.find("div", {"class": "meta__info"}) date = info.find("span", {"class": "meta__epubDate"}).text doi = each.h2.span.a['href'] #-------------------Initialization-------------------------------------------------------- print("link : " + link) f.write('A' + str(n), str(count)) f.write('B' + str(n), 'https://onlinelibrary.wiley.com' + link) #--------------Title---------------------------------------------- print("Title : " + title) f.write('C' + str(n), title) #--------------Journal---------------------------------------------- journal = info.find("a", {"class": "meta__serial"}).text print("Journal : " + journal) f.write('D' + str(n), journal) try: vol = info.find("a", {"class": "meta__volume"}).text print("Volume : " + vol) f.write('E' + str(n), vol) except Exception as e: print("Exception volume : " + str(e)) f.write('E' + str(n), 'Cannot get volume') #--------------Date---------------------------------------------- try: print("Date : " + date) f.write('F' + str(n), date) except Exception as e: print("Exception date : " + str(e)) f.write('F' + str(n), 'Cannot get date') #--------------Doi---------------------------------------------- try: print("Doi : https://nph.onlinelibrary.wiley.com" + doi) f.write('G' + str(n), 'https://nph.onlinelibrary.wiley.com' + doi) except Exception as e: print("Exception doi : " + str(e)) f.write('G' + str(n), 'Cannot get doi') #--------------Authors and email---------------------------------------------- parse = "https://nph.onlinelibrary.wiley.com" + doi n = contact(parse, f, n) print("-------------------------------------------") count += 1 n += 1 stop = False if (stop): break except Exception as e: print("Exception big : " + str(e)) print("Page : " + str(i)) break print("Jimmy") workbook.close()
def contact(input, f, n): print("enter contact") headers = { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' } response = requests.get(input, headers=headers) page = soup(response.content, "html5lib") body = page.findAll( "div", {"class": "accordion-tabbed__tab-mobile accordion__closed"}) print(len(body)) for i in range(len(body) // 2): email = [] country = [] affiliation = [] #--------------Authors---------------------------------------------- print("Author : " + body[i].a.span.text) f.write('H' + str(n), body[i].a.span.text) try: add = body[i].find( "div", {"class": "author-info accordion-tabbed__content"}) try: allP = add.findAll("p") for each in allP: print("Address : " + each.text) affiliation.append(each.text) match = re.search( "(( )*[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+.[a-zA-Z0-9-.]+)", each.text) if (match): email.append(match.group(0)) print("Found email in author : " + match.group(0)) except Exception as e: print("Exception address1 : " + str(e)) f.write('K' + str(n), "Cannot get affiliation") except Exception as e: print("Exception address2 : " + str(e)) f.write('K' + str(n), 'Cannot get affiliation') #--------------email 1---------------------------------------------- print("Len email : " + str(len(email))) try: info = body[i].find("div", {"class": "bottom-info"}) match = re.search( "(( )*[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", info.text) if match: print("Email : " + match.group(0)) email.append(match.group(0)) else: print("Email not match :" + info.text) print("Email not match") if (len(email) == 0): print("Enter if len(email)") email.append("Cannot get email") except Exception as e: print("Exception email : " + str(e)) if (len(email) == 0): print("Enter if len(email)") email.append("Cannot get email") if (len(email) == 0): f.write('I' + str(n), 'Cannot get email') else: f.write('I' + str(n), email[0]) #--------------email 2---------------------------------------------- try: text = page.find("div", {"class": "article-header__correspondence-to"}) match = re.search(body[i].a.span.text, text.text) if (match): match = re.search( "(( )*[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", text.text) if (match): f.write('J' + str(n), match.group(0)) else: f.write('J' + str(n), 'Cannot get email') else: f.write('J' + str(n), 'Cannot get email') except Exception as e: print("Exception email2 : " + str(e)) f.write('J' + str(n), 'Cannot get email') print("-----------------------------------------") #--------------Country and affiliation---------------------------------------------- for each in affiliation: checkCountry(each, country) try: for i in range(0, len(affiliation)): f.write('K' + str(n), affiliation[i]) f.write('L' + str(n), country[i]) print("Affiliation : " + affiliation[i]) print("Country : " + country[i]) n += 1 except Exception as e: print("Exception country : " + str(e)) return n
import csv  # csv output file
from urllib.request import urlopen as uReq  # web client
from bs4 import BeautifulSoup as soup  # data structure

# url to web scrape from www.startech.com.bd
my_url = 'https://www.startech.com.bd/component/graphics-card'

# opening up the connection, grabbing the page
uClient = uReq(my_url)

# parses html into a soup data structure to traverse html
# as if it were a json data type.
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

# grab each product store page
containers = page_soup.findAll('div', {'class': 'row main-content'})

# loops over each product and grabs attributes about it
container = containers[0]
# print(container.div.img['alt'])

# finds each product name from the store page
product_container = container.findAll('h4', {'class': 'product-name'})
# print(product_container[0].text)

# finds each price from the store page
price_container = container.findAll('div', {'class': 'price'})
# print(price_container[0].text)

# name the output file to write to local disk
category = 'landscape'
url = 'https://unsplash.com/s/photos/' + category
foldername = category
quality = 7  # 1-7
regex = r'(?<=' + str(quality) + '00w,\s)https:\/\/images.unsplash.com\/photo.*w=' + str(quality + 1) + '00&q=60(?=\s' + str(quality + 1) + '00w)'

try:
    request = req(url)
    pageHtml = request.read()
    print('loaded successfully')
except Exception as exception:
    print('Error :( ', exception)

links = []
sFile = soup(pageHtml, "html.parser")

try:
    imagesUncut = sFile.select('a > div > img')
    amount = len(imagesUncut)
    print('There are ' + str(amount) + ' images')
except Exception as exception:
    print('Error while parsing ...\n', exception)

images = []
for img in imagesUncut:
    picUrl = img.get('srcset')
    images.append(picUrl)
def getPageSoup(self, url):
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    page = requests.get(url, verify=False)
    page_html = page.text
    page_soup = soup(page_html, "html.parser")
    return page_soup
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/p/pl?d=graphic+card&cm_sp=KeywordRelated-_-graphics%20cards-_-graphic%20card-_-INFOCARD'

uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, 'html.parser')
print(page_soup.body)
def atualizaFipe(): carsTree = None carrosList = [] carrosMap = {} fipeMap = dicionario.HashMap() fipeURL = "http://fipeapi.appspot.com/api/1/carros/marcas.json" tknzr = WhitespaceTokenizer() chromeOptions = webdriver.ChromeOptions( ) # seta configs pra nao carregar imagens (aumenta velocidade do crawler) prefs = {"profile.managed_default_content_settings.images": 2} chromeOptions.add_experimental_option("prefs", prefs) chromeOptions.add_argument("--incognito") browser = webdriver.Chrome(chrome_options=chromeOptions) browser.get(fipeURL) page_json = soup(browser.page_source, 'html5lib').find("body").find("pre").text filename = 'fipe.csv' f = open(filename, "w") jsonMarcas = json.loads(page_json) mapMarcas = dicionario.HashMap() mapVeiculos = dicionario.HashMap() for marca in jsonMarcas: browser.get('http://fipeapi.appspot.com/api/1/carros/veiculos/' + str(marca['id']) + '.json') time.sleep(1) modelos = soup(browser.page_source, 'html5lib').find("body").find("pre").text modelos = json.loads(modelos) mapMarcas.put(marca["fipe_name"], modelos) for modelo in modelos: print(modelo) modeloNome = tknzr.tokenize(modelo['name']) if modeloNome[0].upper() == "GRAND" or ( len(modeloNome) > 1 and modeloNome[1].upper() == "LOUNGE") or modeloNome[0].upper() == "XC": modeloNome = str(modeloNome[0] + modeloNome[1]) elif modeloNome[0].upper() == "SANTA": modeloNome = str(modeloNome[0] + modeloNome[1][:2]) else: modeloNome = modeloNome[0] modeloNome = modeloNome.upper() modeloNome = modeloNome.replace("-", "") modeloNome = modeloNome.replace("!", "") if modelo['fipe_marca'].upper() == 'VW - VOLKSWAGEN': modelo['fipe_marca'] = 'VOLKSWAGEN' elif modelo['fipe_marca'].upper() == 'GM - CHEVROLET': modelo['fipe_marca'] = 'CHEVROLET' elif modelo['fipe_marca'] == 'Citro\u00ebn': modelo['fipe_marca'] = 'CITROEN' elif modelo['fipe_marca'].upper() == 'KIA MOTORS': modelo['fipe_marca'] = 'KIA' f.write(modelo["fipe_marca"].upper() + "," + str(marca['id']) + "," + modeloNome + "," + modelo["id"] + "\n") with open('fipe.csv', 'r') as f: reader = csv.reader(f) for row in reader: fipeMap.put( str(row[0]) + str(row[2]), str(row[1]) + " " + str(row[3])) #pair(fipeMarca+fipeNome,marcaID+nomeID) with open('carros.csv', 'r') as f: reader = csv.reader(f) for row in reader: chave = str(row[1]) + str(row[2]) + str( row[3]) #fabricante + modelo + ano objId = fipeMap.getObj( str(row[1]) + str(row[2]) ) #guarda objeto com os varios codigos fip para aquele veículo. 
(vamos precisar iterar ele depois quando formos acessar os dados via selenium) car1 = { "chave": chave, "fabricante": row[1], "modelo": row[2], "ano": row[3], "objID": objId } #tal iteracao é para driblar falta de informacao dos veículos das revendedoras (fipe usa nome completo com especificacoes e as revendedoras nao) if carrosList.count(car1) == 0: carrosList.append(car1) print(len(carrosList)) print(carrosList) i = 0 filename = "carrosEFipe.csv" #abe arquivo para escrita fw = open(filename, "w") for car1 in carrosList: i += 1 print(i) for j in range(1, 4, 1): for ID in car1['objID'].listValues: ID = tknzr.tokenize(str(ID)) marcaID = ID[0] fipeID = ID[1] print("\t" + fipeID) year = str(car1["ano"]) + "-" + str(j) print("\t" + year) time.sleep( 0.5 ) # servidor da api tem limite de requisicoes por minuto browser.get( "http://fipeapi.appspot.com/api/1/carros/veiculo/" + marcaID + "/" + fipeID + "/" + year + ".json") elem = soup(browser.page_source, "html5lib").find("body").find("h1") if elem is not None and elem.text == '500 Internal Server Error': continue else: break if elem is not None and elem.text == '500 Internal Server Error': continue else: carroFipeInfo = soup(browser.page_source, 'html5lib').find("body").find("pre").text carroFipeInfo = json.loads(carroFipeInfo) preco = tknzr.tokenize(carroFipeInfo["preco"]) preco = preco[1] preco = preco[:len(preco) - 3].replace(".", "") print(car1["chave"]) fw.write(car1["chave"] + "," + preco + "\n") break
from selenium import webdriver from selenium.webdriver.firefox.options import Options from bs4 import BeautifulSoup as soup import sys import time import smtplib #set options for headless browser & getting the text i need options = Options() options.headless = True driver = webdriver.Firefox(firefox_options=options) driver.get("https://www.ubnt.com/download/edgemax/edgerouter-x") time.sleep(5) # wait 5 sec to make sure page loads. Could be done with the driver.wait but this is easier gethtml = driver.page_source # gets the source html = soup(gethtml, 'html.parser') # parse it to bs4 htmlFind = html.find('td', class_='downloadResults__name') # find the strings i want text = htmlFind.get_text() test = "EdgeRouter ER-X/ER-X-SFP/EP-R6: Firmware v1.10.8" if text == test: driver.quit() sys.exit() else: driver.quit() #Make's connection to mail server of google. user = '' password = '' sendFrom = user to = "" msg = "\n Nieuwe update beschikbaar voor unifi router"
for page in pages: my_url = 'https://www.arrow.com/en/products/search?page=' + page + '&prodline=Telecom%20Transformers&selectedType=plNames&perPage=100' user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0)' headers = {'User-Agent': user_agent} #opening up connection' grabbing the page #uClient = uReq.(my_url, headers=headers) uClient = requests.get(my_url, headers=headers) page_html = uClient.content uClient.close() sleep(randint(30, 50)) #html parsing page_soup = soup(page_html, "html.parser") table_body = page_soup.find("tbody") containers = table_body.find_all("tr") f.write(header) for container in containers: mfr_prt = container.find_all("span", {"class": "SearchResults-productName"}) try: manufacturer_part = mfr_prt[0].text.strip() except IndexError: manufacturer_part = 'null' mfr = container.find_all(
def getStockData(): data = [[], []] for i in range(2): temp_data = list() geturl = finance_url + stock_url[i] driver = webdriver.Chrome(executable_path=path_to_chromedriver, chrome_options=options) driver.get(geturl) time.sleep(5) for j in range(1000): t = str(10000 * (j + 1)) driver.execute_script("window.scrollTo(0, " + t + ")") res = driver.execute_script( "return document.documentElement.outerHTML") driver.quit() page_soup = soup(res, "lxml") containers = page_soup.findAll("tr", {"class": "Whs(nw)"}) # status = containers.findAll("small", {"class":"intraday__status"}) x = 0 for obj in containers: val_obj = obj.findAll("td", {"class": "Py(10px)"}) x = x + 1 # print(val_obj[0].text, val_obj[1].text, val_obj[2].text, val_obj[3].text, val_obj[4].text, val_obj[5].text) # print(stockNames[i], val_obj) Date = "" Open = "" High = "" Low = "" Close = "" AdjClose = "" Volume = "" # if(len(val_obj) != 7): # temp_data.append(val_obj) if (len(val_obj) == 2): Date = val_obj[0].text Open = val_obj[1].text High = "" Low = "" Close = "" AdjClose = "" Volume = "" else: Date = val_obj[0].text Open = val_obj[1].text High = val_obj[2].text Low = val_obj[3].text Close = val_obj[4].text AdjClose = val_obj[5].text Volume = val_obj[6].text temp = [ stockNames[i], Date, Open, High, Low, Close, AdjClose, Volume ] temp_data.append(temp) data[i] = temp_data time.sleep(3) return data
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd

# url for future undergrad faq
# url = 'https://itsc.ontariotechu.ca/faqs/faqs-students.php'
# faculty itsc link
url = 'https://itsc.ontariotechu.ca/faqs/faqs-faculty-staff.php'

# open connection to webpage and read the html
client = uReq(url)
pagehtml = client.read()
client.close()

# parse the html
html = soup(pagehtml, "html.parser")

# loop through all the html data and remove break tags
for i in html.findAll('br'):
    i.extract()

data = html.find_all('a', {'class': 'accordion-title'})
string_data = []
for i in data:
    string_data.append(str(i.text))

df = pd.DataFrame(string_data)
df.insert(1, '1', 'itcs_faculty-Staff_faq')
from bs4 import BeautifulSoup as soup from urllib.request import urlopen as uReq import os import csv import pandas as pd from datetime import datetime import calendar import re from io import StringIO my_url = "https://www.mohfw.gov.in/" uClient = uReq(my_url) pageHTML = uClient.read() uClient.close() pageSoup = soup(pageHTML, "html.parser") containers = pageSoup.findAll("tbody") raw_data = containers[-1].text.strip() #print(raw_data) data = "" for c in raw_data: if c == '\n': data += ',' else: data += c #print(data) data_array = data.split(',') #print(data_array) column_count = 6 mydata = "" i = 0 for c in data_array:
"Best places to visit around the world", "Adventurous places to visit", "Best luxuries of the world", "Top webseries to watch" ] searchFor = random.choice(topics) webbrowser.open('https://www.google.co.in/search?q=' + "+".join(searchFor.split())) talk("If you did not find this helpful try watching youtube.") elif 'news for today' in Input: try: news_url = "https://news.google.com/news/rss" Client = urlopen(news_url) xml_page = Client.read() Client.close() soup_page = soup(xml_page, "xml") news_list = soup_page.findAll("item") talk("Okk sir tell me how may top news you wanna know ?") n = int(input("Enter the number -> ")) talk("Here is what you need to know.") for news in news_list[:n]: newsList = news.title.text.split("-") talk("According to " + newsList[1]) talk(newsList[0]) print("\n") talk("That's about updates...") except Exception as e: print(e) else:
def scrape(baseURL, county): global cancelButtonFlag startPage = 1 url = baseURL + "1" print("URL IS: " + url) try: request = Request(url, headers = {'User-Agent' :\ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36"}) uClient = urlopen(request) page_html = uClient.read() uClient.close() pageSoup = soup(page_html, "html.parser") pageSoup = pageSoup.body # find out how many pages of results there are and obtain that number pagination = pageSoup.find("nav", {"class": "pagination"}) pageList = str(pagination) try: pageList = pageList.split("\n", 7)[-2] except: messagebox.showerror( "Form Error", "Make sure you spelled everything correctly in the forms and try agian." ) result = re.search("/page-(.*)<", str(pageList)) almostThere = result.group(1) pageTotal = "" for char in almostThere: if char.isdigit(): pageTotal += char continue else: break pageTotal = int(pageTotal) + 1 workingPage = 1 for page in range(1, pageTotal + 1): if page == 0: continue else: url = baseURL + str(workingPage) print("DEBUG: I'm opening result url " + url) print("DEBUG") request = Request(url, headers = {'User-Agent' :\ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36"}) print("1") uClient = urlopen(request) page_html = uClient.read() uClient.close() print("2") # find list of results on result page pageTag = soup(page_html, "html.parser") pageTag = pageTag.body.tbody # for each result in the result page, go to that result and pull data for i in pageTag: print("in pagetag for loop 3") if cancelButtonFlag: print( "in cancelButtonFlag condition: should only be here if cancelButtonFlag == True 4" ) scrapeCanceled() sys.exit() print( "after cancelButtonFlag condition 5" ) i = i.a i = str(i) i = re.search("href=\"(.*)\">", i) i = i.group(1) url = "https://okcountyrecords.com" + i print("DEBUG: I'm opening page url" + url) print("DEBUG") # Open next result from result page request = Request(url, headers = {'User-Agent' :\ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36"}) uClient = urlopen(request) page_html = uClient.read() uClient.close() # Program has reached the destination page for desired data finalPage = soup(page_html, "html.parser") print("DEBUG: I'm looking in tables for data") print("DEBUG") # find all data fields in the table that contains the desired data tables = finalPage.find_all('table') for tbl in tables: if tbl == tables[0]: tds = tbl.findChildren('td') else: tds += tbl.findChildren('td') # TODO: Add better handling here. could result in shifted CSV rows if any of these data are missing. 
book = re.search(">(.*)</td>", str(tds[0])) book = book.group(1) page = re.search(">(.*)</td>", str(tds[1])) page = page.group(1) instrument = re.search("heavy\">(.*)</td>", str(tds[2])) instrument = instrument.group(1) documentStamps = re.search("<td>(.*)</td>", str(tds[6])) documentStamps = documentStamps.group(1) recordedOn = re.search("<td>(.*)</td>", str(tds[7])) recordedOn = recordedOn.group(1) if len(tds) > 8: instrumentDate = re.search("<td>(.*)</td>", str(tds[8])) instrumentDate = instrumentDate.group(1) else: instrumentDate = "" # write the data to CSV writeCSV(county, book, page, instrument, documentStamps, recordedOn, instrumentDate, url) # delay so we don't overwhelm the web servers and get blocked or something sleep(5) # increment page number to go to next page workingPage += 1 except HTTPError: messagebox.showerror( "URL/HTTP Error", "Could not access " + url + " Check your internet connection and try again") except URLError: messagebox.showerror( "URL/HTTP Error", "Could not access " + url + " Check your internet connection and try again")
from bs4 import BeautifulSoup as soup  # parsing
from urllib.request import urlopen as uReq  # requesting the url

my_url = 'https://store.steampowered.com/app/1091500/Cyberpunk_2077/'  # paste any link you want

# request the url and read the page html into a variable
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# html parser (parsing html, not xml, json and so on)
page_soup = soup(page_html, "html.parser")

# looking for every div with the price class on the page
page_soup__directItem = page_soup.findAll("div", {"class": "game_purchase_price price"})

# getting the exact value of the first matching container
directItem__content = page_soup__directItem[0].text.strip()

print("Actual product price:", directItem__content)
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup

url = 'https://www.fool.ca/recent-headlines/'
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
page_soup = soup(webpage, "html.parser")

title = page_soup.find("title")
print(title)

containers = page_soup.findAll("p", "promo")
for container in containers:
    print(container)
def scrape(urlOriginal, data_list): i = 0 #7 for value in range(1, 7): url = "" url = urlOriginal + format(value) print(url) try: uClient = uReq(url) except: pass page_html = uClient.read() uClient.close() #Parsing page_soup = soup(page_html, "html.parser") #article = page_soup.findAll('ul',class_='search-main-content__events-list') article_1 = page_soup.findAll('div', class_='search-event-card-wrapper') # fetching each details for container in article_1: title = container.findAll( 'div', class_='eds-event-card__formatted-name--is-clamped' )[0].text try: Date_time = container.findAll( 'div', class_= 'eds-text-color--primary-brand eds-l-pad-bot-1 eds-text-weight--heavy eds-text-bs' )[0].text except: Date_time = 'None' # try: # Location = container.findAll('div',class_='card-text--truncated__one')[0].text # except: # Location='None' try: Price = container.findAll( 'div', class_= 'eds-media-card-content__sub eds-text-bm eds-text-color--grey-600 eds-l-mar-top-1 eds-media-card-content__sub--cropped' )[1].text except: Price = 'None' a_tags = container.findAll('a') try: image = a_tags[0].img['src'] except: image = 'None' read_more = a_tags[0]['href'] print(read_more) category = 'EDUCATION, BUSINESS & TECHNOLOGY' if category == 'EDUCATION, BUSINESS & TECHNOLOGY' and image == 'None': image = 'https://uindia.net/assets/img/MediaTechnology.jpg' # description descurl = read_more #Opening connection , grabbing the page try: uClient = uReq(descurl) except: pass desc_html = uClient.read() uClient.close() #Parsing desc_soup = soup(desc_html, "html.parser") #print(desc_soup) desc = desc_soup.findAll( 'div', class_='js-xd-read-more-contents l-mar-top-3' ) or desc_soup.findAll( 'div', class_= 'structured-content-rich-text structured-content__module l-align-left l-mar-vert-6 l-sm-mar-vert-4 text-body-medium' ) if len(desc) > 0: try: p_tags = desc[0].findAll('p') except: continue descrip = [] for i in range(len(p_tags)): descript = p_tags[i].text descrip.append(descript) description = ''.join(str(e) for e in descrip) else: description = 'None' # date fetching and formatting time = desc_soup.findAll('time', class_='clrfix') if len(time) > 0: time_tags = time[0].findAll('p') date_check = time_tags[0].text if date_check == 'Multiple Dates' or date_check == 'Multiple Dates GMT': Final_Date = date_check else: Date_time = date_check.split(',') #print(Date_time) #print(len(Date_time)) if (len(Date_time)) == 2: Final_Date = Date_time[1].strip(' ') else: Mon_Date = Date_time[1].split(' ') if len(Mon_Date) == 3: Date = Mon_Date[2] month = Mon_Date[1] if len(month) <= 3: Month = datetime.datetime.strptime( month, '%b').strftime('%B') else: Month = month year = Date_time[2] Final_Date = Date + (' ') + Month + year elif len(Mon_Date) == 4: Date = Mon_Date[1] month = Mon_Date[2] Month = datetime.datetime.strptime( month, '%b').strftime('%B') year = Mon_Date[3] Final_Date = Date + (' ') + Month + ( ' ') + year else: Final_Date = 'None' #location fetching location_div = desc_soup.findAll('div', class_='event-details__data') if len(location_div) > 0: location_tags = location_div[1].findAll('p') locat = location_tags[0].text location = locat + (' ') + "Dublin" else: location = 'Dublin' print(location) try: if location == 'Dublin': ordinates[2] = "The Spire,North City,Dublin" ordinates[0] = 53.3498091 ordinates[1] = -6.2602548 else: ordinates = getOrdinates(location) except: continue try: d1 = datetime.datetime(int(year), int(month_string_to_number(Month)), int(Date)) except: continue d2 = datetime.datetime.now() if d1 > d2: 
data = EventData() data.id = uuid.uuid1().__str__() data.title = title data.time = '' data.location = location data.summary = description data.img = image data.category = category data.startdate = Final_Date data.read_more = read_more data.address = ordinates[2] data.latitude = ordinates[0] data.longitude = ordinates[1] data.enddate = '' data.price = Price data_list.append(data) i = i + 1 # print(len(data)) print(len(data_list)) return data_list
def getpics(url):
    request = requests.get(url)
    page = request.text
    doc = soup(page, 'html.parser')
    imglink = [element.get('src') for element in doc.find_all('img')]
    return imglink
params = { '_from': 'R40', '_sacat': 0, 'LH_Complete': 1, 'LH_Sold': 1, 'LH_ItemCondition': 1000, '_nkw': shoe_search, '_dcat': 15709, "US%20Shoe%20Size%20%28Men%27s%29": size, 'rt': 'nc', } r = requests.get(my_url, params=params) # html parsing page_soup = soup(r.text, "html.parser") #class nllclt is only there when there are 0 results if bool(page_soup.find("span", {"class": "nllclt"})) == True: continue #find the first of this only because Ebay sometimes adds suggested results that don't match right away matches = page_soup.find("ul", {"class": "gv-ic"}) # grabs each sale containers = matches.findAll("li", {"class": "sresult"}) # Create table, comment out after making it the first time create_table() for container in containers:
bar.update(i) i += 1 f.close() return # <a href="http://webmusic.cc/hindi_music.php?id=5012">Hate Story IV</a> # expression for movie names expr = re.compile( r'(http://webmusic.cc/hindi_music.php\?id=\d+)">([\w\s.]+)</a>') #home page for the latest bollywood entries url = 'http://webmusic.cc/music/mobile/hindi/latest.php' response = urlopen(url) bs_obj = soup(response.read(), "html.parser") data = str(bs_obj) response.close #get the current directory and then create a new folder names 'latest-bollywood' new_path = os.path.dirname(os.path.realpath(__file__)) + '\latest-bollywood' if not os.path.exists(new_path): os.makedirs(new_path) subprocess.Popen('explorer ".\latest-bollywood"') result = re.findall(expr, data) #print(len(result)," results found :---->") print("TOTAL", len(result), " files to download")
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import pandas as pd

url = 'https://etherscan.io/token/generic-tokentxns2?m=normal&contractAddress=0xd6a55c63865affd67e2fb9f284f87b7a9e5ff3bd&a=0xd071f6e384cf271282fc37eb40456332307bb8af'

# Build the request with an explicit User-Agent; a plain urlopen(url) did not work here
# (etherscan appears to block requests that look like scrapers).
req = Request(
    url,
    headers={
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    })
response = urlopen(req, timeout=20).read()

page_soup = soup(response, "html.parser")
Transfers_info_table_1 = page_soup.find(
    "table", {"class": "table table-md-text-normal table-hover mb-4"})
df = pd.read_html(str(Transfers_info_table_1))[0]
df.to_csv("TransferTable.csv", index=False)
def get_links(url):
    result = requests.get(url)
    page = result.text
    doc = soup(page, 'html.parser')
    links = [element.get('href') for element in doc.find_all('a')]
    return links
link = link2 elif l == 3: link = link3 elif l == 4: link = link4 elif l == 5: link = link5 else: print("error in links \n\n\n") # scrape data into array here while iterating through # weekly links client = request(link) page_html = client.read() client.close() page_content = soup(page_html, "html.parser") # get all rows player_row = page_content.findAll('table')[0].find_all('tr') for i in range(len(player_row)): # grab relevant data from each row row_data = player_row[i].find_all('td') if (len(row_data) != 0): # make sure we are getting stats for QBs only if (row_data[1].text == 'QB'): # fill matrix row w data cmp_percentage = row_data[14].text
import pandas as pd # Set the executable path and initialize the chrome browser in splinter executable_path = {'executable_path': 'chromedriver'} browser = Browser('chrome', **executable_path) # Visit the mars nasa news site - assign the url and instruct the browser to visit it url = 'https://mars.nasa.gov/news/' browser.visit(url) # Optional delay for loading the page browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1) # Set up the HTML parser html = browser.html news_soup = soup(html, 'html.parser') slide_elem = news_soup.select_one('ul.item_list li.slide') # Find the first article title slide_elem.find("div", class_='content_title') # Use the parent element to find the first `a` tag and save it as `news_title` news_title = slide_elem.find("div", class_='content_title').get_text() news_title # Use the parent element to find the paragraph text news_p = slide_elem.find('div', class_="article_teaser_body").get_text() news_p # Get the featured image from nasa.gov - takes us to URL # Visit URL
import html5lib import requests from bs4 import BeautifulSoup as soup import io import logging import datetime import copy import argparse from arcgis.gis import GIS from arcgis.features import FeatureLayer import pprint import re src = r'https://www.dhhs.vic.gov.au/coronavirus-update-victoria-2-july-2020' r = requests.get(src) page_soup = soup(r.content, 'html5lib') items = page_soup.findAll( 'div', { 'class': 'field field--name-field-dhhs-rich-text-text field--type-text-long field--label-hidden field--item' }) for item in items: txt = str(item.text) x = re.search( "Victoria is [0-9]+ with [0-9]+ new cases reported yesterday", txt) print(x.group()) d = re.findall('[0-9]+', x.group()) print(d) """paras = item.findAll('p') for para in paras: txt = para.text
from requests import get
from bs4 import BeautifulSoup as soup

url = "http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1"
response = get(url)
html_soup = soup(response.text, "html.parser")
movie_container = html_soup.find_all("div", {"class": "lister-item mode-advanced"})

headers = "Movie_Name,Movie_Year,Movie_Rating,Movie_MetaScore,Movie_Votes\n"
whandle = open("Movie_List.csv", "w")
whandle.write(headers)

for movie in movie_container:
    movie_metascore = "Not Available"
    movie_name_container = movie.find("h3", {"class": "lister-item-header"})
    movie_name = movie_name_container.a.text.replace(",", " |")
    movie_year_container = movie.find(
        "span", {"class": "lister-item-year text-muted unbold"})
    movie_year = movie_year_container.text
    movie_rating_container = movie.find(
        "div", {"class": "inline-block ratings-imdb-rating"})
    movie_rating = movie_rating_container["data-value"]
    movie_metascore_container = movie.find(
        "div", {"class": "inline-block ratings-metascore"})
    if movie_metascore_container != None:
        movie_metascore = movie_metascore_container.span.text.strip()
async def get_img_urls(link: str) -> AsyncIterator[str]:
    plain = await get_plain_text(link)
    s = soup(plain, "html.parser")
    # use a separate loop variable so the `link` parameter is not shadowed
    for img in s.find_all("img", class_="imagecontent"):
        yield img.get("src")
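# A minimal usage sketch for the async generator above, assuming get_plain_text() is the
# page-fetching coroutine defined elsewhere in the same module (the example URL is
# illustrative only):
import asyncio

async def main():
    async for src in get_img_urls("https://example.com/gallery"):
        print(src)

asyncio.run(main())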
# Scrape with BeautifulSoup - basic parsing example
# -----------------------------------------------------------------------------
# If you already have the HTML data from a website and just want to extract
# from it, BeautifulSoup is a good choice. HTML parsing is a headache because
# much of the HTML on public web pages is technically invalid: unclosed tags,
# incorrect nesting, and other complications.

# $ pip3 install beautifulsoup4

import requests
from bs4 import BeautifulSoup as soup

r = requests.get('http://pythonhow.com/example.html')
c = r.content                                # the entire source code
s = soup(c, 'html.parser')                   # feed the content to the bs4 parser

a = s.find('div', {'class': 'cities'})       # returns the first div
a = s.find_all('div', {'class': 'cities'})   # returns a list of all divs

# print(s.prettify)

print(type(r))     # <class 'requests.models.Response'>
print(type(c))     # <class 'bytes'>
print(type(s))     # <class 'bs4.BeautifulSoup'>
print(type(a))     # <class 'bs4.element.ResultSet'>
print(type(a[2]))  # <class 'bs4.element.Tag'>

print(a)                             # prints all the divs and their contents
print(a[2])                          # prints the 3rd div and its contents
print(a[0].find_all('h2'))           # list of all the h2 tags from the 1st div
print(a[0].find_all('h2')[0])        # just the 1st h2
print(a[0].find_all('h2')[0].text)   # just the text inside the h2 tag