from urllib.request import urlopen
from bs4 import BeautifulSoup as bts


def check_url(url):
    # Return False when the page title marks the item as withheld.
    html = urlopen(url).read().decode('utf-8')
    content = bts(html, features='lxml')
    result = True
    if "Item Withheld" in content.title.get_text():
        result = False
    return result

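# A minimal usage sketch for check_url (not from the original source);
# the URL below is a hypothetical placeholder record page.
if __name__ == '__main__':
    available = check_url('http://hub.hku.hk/handle/10722/12345')  # hypothetical URL
    print('available' if available else 'withheld')
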
def getComment(aid):
    # Fetch the danmaku (bullet comment) XML for a video and parse it;
    # the first field of the 'p' attribute is the appearance time in seconds.
    cid = getCid(aid)
    if cid < 0:
        return -1
    url = "https://comment.bilibili.com/" + str(cid) + ".xml"
    page = getData(url)
    soup = bts(page, "html.parser")
    comment = {}
    for c in soup.find_all('d'):
        time = float(c.attrs['p'].split(',')[0])
        comment[time] = c.string
    comment = sorted(comment.items(), key=lambda x: x[0])
    ret = [
        define.Comment(_time=x[0], _vtime=calcTime(x[0]), _content=x[1])
        for x in comment
    ]
    return ret

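# A hedged usage sketch for getComment; the video id is a made-up
# placeholder, and getCid/getData/calcTime/define.Comment are the external
# helpers this function already assumes.
comments = getComment(170001)  # hypothetical aid
if comments != -1:
    for c in comments[:5]:
        print(c)
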
def crawler(x):
    import urllib.parse
    import urllib.request
    from bs4 import BeautifulSoup as bts
    from text_preprocessing import clean_text

    client_id = "IFvtovuLLdeQi6K6jywv"
    client_secret = "_51j6auaOC"
    encText = urllib.parse.quote(x)
    start = 1
    str_big = []
    while start < 1000:
        # Request one page of Naver news search results (XML).
        url = ("https://openapi.naver.com/v1/search/news.xml?query=" + encText +
               "&display=30" + "&sort=date" + "&start=" + str(start))
        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", client_id)
        request.add_header("X-Naver-Client-Secret", client_secret)
        response = urllib.request.urlopen(request)
        rescode = response.getcode()
        if rescode == 200:
            response_body = response.read()
            a = response_body.decode('utf-8')
        else:
            print("Error Code:" + str(rescode))
            break  # stop paging on an API error
        html = bts(a, "html.parser")
        news_titles = html.find_all("title")
        for title in news_titles:
            title_str = str(title.string)
            str_big.append(title_str.strip())
        start += 30
    # Clean each title and count word frequencies.
    data = {}
    try:
        pre_str_big = [clean_text(i) for i in str_big]
        for row in pre_str_big:
            for text in row.split():
                data[text] = data.get(text, 0) + 1
    except Exception as e:
        print("Exception occurred:", e)
    return data

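# A minimal sketch of calling crawler; the query below is an arbitrary
# example term, not taken from the original code.
word_counts = crawler('keyword')
top10 = sorted(word_counts.items(), key=lambda kv: kv[1], reverse=True)[:10]
for word, n in top10:
    print(word, n)
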
# Test Beautiful Soup
from bs4 import BeautifulSoup as bts

# parse the html with lxml and beautifulsoup
# get the desired data from the html
with open('simple.html') as htmlFile:
    soup = bts(htmlFile, 'lxml')

# get the title
match = soup.title.text

# get the prettified html
prettifiedSoup = soup.prettify()

with open('simple.html') as htmlFile:
    soup = bts(htmlFile, 'lxml')

# find the first div with class 'article'
article = soup.find('div', class_='article')

# print the headline
headline = article.h2.a.text
print(headline)

# print the summary
summary = article.p.text
print(summary)

# print the headline and summary of every article class
for i in soup.find_all("div", class_='article'):
    headline = i.h2.a.text
    summary = i.p.text
    print(headline)
    print(summary)

from urllib.request import urlopen
from urllib.error import HTTPError
import re
from bs4 import BeautifulSoup as bts

try:
    html = urlopen("https://baike.baidu.com/item/gooogle")
except HTTPError as e:
    print(e)
else:
    if html is None:
        print('None')
    else:
        bsobj = bts(html.read(), "html.parser")
        title = bsobj.h1.get_text()
        dtlist = bsobj.find_all("dt", {"class": "basicInfo-item name"})
        ddlist = bsobj.find_all("dd", {"class": "basicInfo-item value"})
        print(title + ':')
        for name, value in zip(dtlist, ddlist):
            # Keep only the CJK characters of each field name.
            pattern = re.compile(r'[^\u4e00-\u9fa5]')
            name = re.sub(pattern, '', str(name))
            print('\t' + name + ' : ' + value.get_text().strip())

def __init__(self, SoupPage):
    self.serp = bts(SoupPage, 'lxml')

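# A hedged sketch of how this __init__ is typically used; the class name
# SerpParser and the sample markup are hypothetical stand-ins.
from bs4 import BeautifulSoup as bts


class SerpParser:
    def __init__(self, SoupPage):
        self.serp = bts(SoupPage, 'lxml')


page_html = '<html><head><title>results</title></head><body></body></html>'
parser = SerpParser(page_html)
print(parser.serp.title.string)  # -> results
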
from bs4 import BeautifulSoup as bts
import re
import csv
from urllib.request import urlopen
import xlrd

count = 0
all_href = []
all_href_excel = []

# PLOS ONE
# url = "http://hub.hku.hk/simple-search?query=&location=publication&filter_field_1=journal&filter_type_1=equals&filter_value_1=plos+one&filter_field_2=dateIssued&filter_type_2=equals&filter_value_2=%5B2016+TO+2018%5D&sort_by=score&order=desc&rpp=25&etal=0&start=0"

# Scientific Reports
url = "http://hub.hku.hk/simple-search?query=&location=publication&rpp=25&sort_by=score&order=desc&filter_field_1=journal&filter_type_1=equals&filter_value_1=scientific+reports&filter_field_2=dateIssued&filter_type_2=equals&filter_value_2=%5B2016+TO+2018%5D"

while url is not None:
    html = urlopen(url).read().decode('utf-8')
    content = bts(html, features='lxml')
    pages = content.find('ul', 'pagination pull-right')
    # print(pages)
    # The current page's <li> carries a class attribute; the <li> right
    # after it holds the link to the next results page.
    next_url = None
    flag = False
    pages_refs = pages.find_all('li')
    for page_ref in pages_refs:
        if page_ref.has_attr('class'):
            flag = True
            continue
        if flag:
            next_url = page_ref.find('a')['href']
            break
    # print(next_url)
    if next_url is None:
        break
    # assuming the pagination href is site-relative
    url = "http://hub.hku.hk" + next_url

import requests
from bs4 import BeautifulSoup as bts

html = requests.get("https://m.stock.naver.com/marketindex/index.nhn").text
soup = bts(html, 'html.parser')
for key in soup.select('.exchg_on'):
    name = key.select_one('.stock_dn').text
    # doll = key.select_one('.stock_price').text
    # gap = key.select_one('.gap_wrp').text
    print(f'{name}')

from bs4 import BeautifulSoup as bts
from selenium import webdriver

driver = webdriver.Firefox()
url = "https://www.lazada.vn/dien-thoai-di-dong/?page=1"
driver.get(url)
root = driver.find_element_by_id('root')
# print(root.get_attribute('innerHTML'))
lzd = root.get_attribute('innerHTML')
soup = bts(lzd, 'lxml')
# print(soup.title.string)
products = soup.find_all(class_='c2prKC')
# print(products)
# print(lzd.text)
count = 0
for product in products:
    # if product.has_attr('class'):
    count += 1
print(count)
driver.close()

from urllib.request import urlopen
from bs4 import BeautifulSoup as bts
import pandas as pd
import datetime

# Wikipedia page for the CSI 300 index constituents
url = 'https://zh.wikipedia.org/wiki/%E6%B2%AA%E6%B7%B1300'
target_html = urlopen(url).read().decode('utf-8')
soup = bts(target_html, 'lxml')
target_table = soup.find('table')
CSI300_trs = target_table.find_all('tr')[1:]
target_data = []
for i in CSI300_trs:
    CSI300_tds = i.find_all('td')
    share_code = CSI300_tds[0].get_text()
    share_name = CSI300_tds[1].get_text()
    exchange = CSI300_tds[3].get_text()
    target_data.append([share_code, share_name, exchange])
CSI300_data = pd.DataFrame(target_data, columns=['share_code', 'share_name', 'Exchange'])
date = datetime.date.today().strftime('%Y%m%d')
CSI300_data.to_csv(f'CSI300_{date}', encoding='utf_8_sig')