import urllib.request

from bs4 import BeautifulSoup


def get_response_2():
    raw_html = urllib.request.urlopen('https://mutualfund.wishfin.com/reliance-mutual-fund').read()
    html = BeautifulSoup(raw_html, 'html.parser')
    response = []
    for a in html.select('li'):
        data = dict()
        link = a.select("a")
        if len(link) > 0:
            link = link[0]
            try:
                if 'reliance' in link['href']:
                    data['url'] = link['href']
                    data['fund_name'] = link.text
            except Exception:
                pass
        # the nested <li> holds the returns; the second <span> carries the 1-year figure
        percent = a.select("li")
        if len(percent) > 1:
            percent = percent[1]
            span = percent.select("span")
            if len(span) > 1:
                data["1Y"] = float(span[1].text.split(" ")[0])
                data["3Y"] = 0
            else:
                print("NA")
        if len(data) > 0:
            response.append(data)
    # drop the last collected entry
    response.pop(-1)
    return response
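# Minimal usage sketch (assumes network access to wishfin.com):
if __name__ == '__main__':
    for fund in get_response_2():
        print(fund.get('fund_name'), fund.get('1Y'))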
import mechanize as mc  # the mc.Browser annotation below suggests mechanize
from bs4 import BeautifulSoup as bs4


def searchGenders(browser: mc.Browser, gender: str, page=1) -> None:
    url = "https://www.goodreads.com/shelf/show/" + str(gender) + "?page=" + str(page)
    res = browser.open(url)
    html = res.read()
    html = bs4(html, "html.parser")
    # read the last page number out of the pagination block
    pageCount = html.select("div[max_num_pages]")
    pageCount = pageCount[0].select(":not(:last-child)")
    maxPage = int(pageCount[len(pageCount) - 1].get_text())
    linkdata = open("links.html", "a")
    for i in range(maxPage + 1):
        if i >= 1:
            print("===> getting page {}\n".format(i))
            url = "https://www.goodreads.com/shelf/show/" + str(gender) + "?page=" + str(i)
            res = browser.open(url)
            html = res.read()
            html = bs4(html, "html.parser")
            booklinks = html.find_all('a', {'class': 'bookTitle'})
            for link in booklinks:
                linkdata.write("<a href='" + str(url) + str(link['href']) + "' ></a>\n")
    linkdata.close()
    print("Ready!")
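# Usage sketch (assumes mechanize as imported above; the shelf name "fantasy" is only an example):
if __name__ == "__main__":
    br = mc.Browser()
    br.set_handle_robots(False)
    searchGenders(br, "fantasy")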
import re

from bs4 import BeautifulSoup
from selenium import webdriver


def get_sectors():
    # set up browser with image loading disabled to speed up the page
    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    browser = webdriver.Chrome(options=chrome_options)
    # browser = webdriver.Chrome()

    # open web page and wait for it to load
    browser.implicitly_wait(10)
    url = 'https://www.thestar.com.my/business/marketwatch/'
    browser.get(url)
    r = browser.page_source
    html = BeautifulSoup(r, 'html.parser')
    browser.close()

    sector_elements = []
    sector_list = []
    sector_name_list = []

    # sector elements: the id of every anchor inside the "stocks" block
    htmlPart = html.find(class_=re.compile("stocks"))
    linkPart = [x.get_attribute_list('id') for x in htmlPart.find_all('a', {"id": True})]
    for i in range(len(linkPart)):
        sector_elements.extend(linkPart[i])

    # sector_list: headings, with the trailing colon stripped
    sector = html.find_all('strong')
    for i in sector:
        sector_list.append(i.text.strip(':'))

    # sector_name_list: link text inside div.text
    sector_n = html.select('div.text a')
    for i in sector_n:
        sector_name_list.append(i.text)

    return sector_elements, sector_list, sector_name_list
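# Example call (assumes chromedriver is installed and on PATH):
if __name__ == '__main__':
    elements, sectors, names = get_sectors()
    print(len(elements), 'sector links found')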
from string import punctuation


def parseByTag(self, html, tag):
    '''Parse HTML by tag.

    Keyword arguments:
    html -- BeautifulSoup object
    tag -- string
    '''
    total_words = 0
    words = []
    for t in html.select(tag):
        for word in t.text.split(' '):
            for w in word.split('\n'):
                w = w.rstrip(punctuation)
                w = w.rstrip(' ')
                # keep words between 2 and 254 characters long
                if 1 < len(w) < 255:
                    total_words += 1
                    words.append(w)
    return total_words, words
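# Quick standalone check (parseByTag never uses self, so None is passed here;
# the sample markup is made up for illustration):
if __name__ == '__main__':
    from bs4 import BeautifulSoup
    sample = BeautifulSoup('<p>alpha beta</p><p>gamma</p>', 'html.parser')
    print(parseByTag(None, sample, 'p'))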
import time

from bs4 import BeautifulSoup

# CSS selectors of ad elements to strip (populate as needed)
blist = []


def response(flow):
    content_type = flow.response.headers.get('Content-Type', '')
    if content_type.startswith('text/html'):
        charset = 'utf8'
        if '=' in content_type:
            charset = content_type.split('=')[1]
        print("delete ads:" + content_type + "->" + charset + ": " + flow.request.url)
        t1 = time.time()
        html = BeautifulSoup(flow.response.content, "html.parser", from_encoding=charset)
        print("  BeautifulSoup:" + str(round(time.time() - t1, 1)) + "sec")
        if html.head:
            delcount = 0
            t1 = time.time()
            # remove every element matching a blocklist selector
            for bkey in blist:
                for col in html.select(bkey):
                    col.extract()
                    delcount += 1
            flow.response.content = str(html).encode(charset)
            print("  " + str(round(time.time() - t1, 1)) + "sec: delcount=" + str(delcount))
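# To run this as a mitmproxy addon, save the file and point mitmdump at it, e.g.:
#   mitmdump -s delete_ads.py
# (delete_ads.py is only an example filename)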
import requests
import lxml.html
from bs4 import BeautifulSoup as BS
import xml.etree.ElementTree as ET
from xml.dom import minidom

r = requests.get('https://www.citrus.ua/smartfony/')
html = BS(r.content, 'html.parser')
k = -1
tree = lxml.html.fromstring(r.text)
content = tree.xpath('//meta[@itemprop="sku"]/@content')

for el in html.select('.product-card__overview'):
    k += 1
    ti = el.select('.product-card__name > a')
    pr = el.select('.prices__price')
    hr = ti[0]['href']

    # build an XML record for this product
    new = ET.Element('all_data')
    phone = ET.SubElement(new, 'phone')
    phone.set('id', content[k])
    name = ET.SubElement(phone, 'name')
    name.text = ti[0]['title']
    price = ET.SubElement(phone, 'price')
    price.text = pr[0].text
    characteristics = ET.SubElement(phone, 'characteristics')

    # fetch the product page for its characteristics
    r2 = requests.get('https://www.citrus.ua' + hr)
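# The minidom import above suggests the records were later pretty-printed;
# a possible sketch of that step (the filename is an assumption, not from the source):
#     pretty = minidom.parseString(ET.tostring(new)).toprettyxml(indent='  ')
#     with open('phones.xml', 'a', encoding='utf-8') as f:
#         f.write(pretty)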
import requests
import lxml.html
from bs4 import BeautifulSoup as BS
import xml.etree.ElementTree as ET
from xml.dom import minidom

r = requests.get(
    'https://www.ebay.com/b/Football-Clothing-Shoes-Accessories/159113/bn_1941036')
html = BS(r.content, 'html.parser')
z = -1
tree = lxml.html.fromstring(r.text)
shipping = tree.xpath('//span[@class="s-item__shipping s-item__logisticsCost"]/text()')

for vd in html.select('.s-item__wrapper.clearfix'):
    z += 1
    n = vd.select('.s-item__title > h3')
    p = vd.select('.s-item__price')
    b = vd.select('.s-item__dynamic.s-item__dynamicAttributes1')
    hr = n[0]['href']

    # build an XML record for this listing
    elem = ET.Element('all_data')
    product = ET.SubElement(elem, 'product')
    product.set('id', shipping[z])
    name = ET.SubElement(product, 'name')
    name.text = n[0].text
    price = ET.SubElement(product, 'price')
    price.text = p[0].text
import csv

from bs4 import BeautifulSoup
from tqdm import tqdm


def scapper_tsv(n_links, path_html, path_tsv):
    # loading bar
    with tqdm(total=n_links) as pbar:
        for article in range(n_links):  # range of files
            try:
                html = BeautifulSoup(
                    open(path_html + "/article_{}.html".format(article)),
                    'html.parser')
            except Exception as e:  # if the article doesn't exist
                print(article, e)
                continue

            title = html.select("h1")[0].text

            # initialize tmp as intro
            tmp = 'intro'
            sections = {'intro': '', 'plot': ''}
            # walk the paragraphs section by section and keep only intro and plot
            for section in html.select('div.mw-parser-output > *'):  # only first-level nodes
                if section.name == 'p' and tmp == 'intro':
                    sections['intro'] += section.text.strip()
                # change tmp on section names
                if section.name in ['h2', 'h3']:
                    tmp = section.span['id']
                # keep only the sections we are interested in
                if section.name == 'p' and tmp in ['Plot', 'Plot_summary', 'Premise']:
                    # check the different names used for plot sections
                    sections['plot'] += section.text.strip()

            # skip pages without a Plot section
            if sections['plot'] == '':
                print(article, 'No Plot')
                continue

            # dictionary for the infobox
            d = {
                'film_name': title,
                'Directed by': 'NA',
                'Produced by': 'NA',
                'Written by': 'NA',
                'Starring': 'NA',
                'Music by': 'NA',
                'Release date': 'NA',
                'Running time': 'NA',
                'Country': 'NA',
                'Language': 'NA',
                'Budget': 'NA'
            }
            # take each field from the infobox
            info_box = html.findAll(['th', 'td'])
            for elem in info_box:
                info = elem.text.strip('\n')  # take the text from the table cell
                if info in d:
                    d[info] = info_box[info_box.index(elem) + 1].text.strip('\n')

            # order the values as a list to save in the .tsv
            ld = list(d.values())
            columns = [
                'title', 'intro', 'plot', 'film_name', 'Directed by', 'Produced by',
                'Written by', 'Starring', 'Music by', 'Release date', 'Running time',
                'Country', 'Language', 'Budget'
            ]
            data = [title, sections['intro'], sections['plot']] + ld[0:]

            # create and save a tsv per article
            with open(path_tsv + '/article_{}.tsv'.format(article), 'w',
                      newline='', encoding='utf-8') as f_output:
                tsv_output = csv.writer(f_output, delimiter='\t')
                tsv_output.writerow(columns)
                tsv_output.writerow(data)
            pbar.update(1)
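# Example invocation (the count and directory names are illustrative, not from the source):
if __name__ == '__main__':
    scapper_tsv(n_links=100, path_html='html_articles', path_tsv='tsv_articles')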