def telekom_tv(scraper_url: str):
    # Scrapes Telekom's TV offers (channel counts, included receiver); the
    # sibling telekom_internet below handles the internet packages.
    soup = get_soup(scraper_url)
    oferte = soup.select('.oferte-container div.oferta')
    packages = []
    for oferta in oferte:
        price = oferta.select_one('.bottomContent h3.magenta').text
        name = oferta.select_one('h2').text
        detail_url = oferta.select_one('.bottomContent a')['href']
        id_hint = url_last_path(detail_url, scraper_url)
        characteristics = {}
        for feature in oferta.select('ul li'):
            feature_text = feature.text.lower()
            channels = parse_metric(feature_text, ['canale'])
            if channels and 'online' not in feature_text:
                characteristics['tv_nchan'] = format_units(channels)
            elif 'inclus' in feature_text:
                characteristics['tv_rcvr'] = [True]
        package = {
            'name': name.strip(),
            'price': format_units(price),
            'scraper_id_hint': id_hint.strip(),
            'characteristics': characteristics,
        }
        packages.append(package)
    return json.dumps({"packages": packages})
def telekom_internet(scraper_url: str):
    soup = get_soup(scraper_url)
    oferte = soup.select('.oferte-container div.oferta')
    packages = []
    for oferta in oferte:
        price = oferta.select_one('.bottomContent h3.magenta').text
        name = oferta.select_one('h2').text
        detail_url = oferta.select_one('.bottomContent a')['href']
        id_hint = url_last_path(detail_url, scraper_url)
        characteristics = {}
        for feature in oferta.select('ul li'):
            feature_text = feature.text.lower()
            # feature_text has been lower-cased, so match the units in lower
            # case as well.
            speed = parse_metric(feature_text, ['mbps', 'gbps'])
            if speed:
                characteristics['inet_down'] = characteristics['inet_up'] = format_units(speed)
            elif 'inclus' in feature_text:
                characteristics['inet_router'] = True
        package = {
            'name': name.strip(),
            'price': format_units(price),
            'scraper_id_hint': id_hint.strip(),
            'characteristics': characteristics,
        }
        packages.append(package)
    return json.dumps({"packages": packages})
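# The two Telekom functions above lean on repo helpers that are not defined in
# this file: get_soup, parse_metric, format_units, and url_last_path (plus the
# json and OrderedDict imports the scrapers assume at file top). The sketch
# below is an assumption about their behavior -- parse_metric returning the
# matched "number unit" span and format_units splitting a string into a
# value/unit pair -- not the repo's actual implementation.
import json
import re
from collections import OrderedDict
from urllib.parse import urljoin, urlparse

import bs4
import requests


def get_soup(url: str) -> bs4.BeautifulSoup:
    """Fetch a page and return parsed markup (assumed thin wrapper)."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return bs4.BeautifulSoup(response.text, 'html5lib')


def parse_metric(text: str, units: list) -> str:
    """Return the first 'number unit' span in text for any given unit, or ''.

    The real helper is likely more tolerant of filler words between the
    number and the unit.
    """
    for unit in units:
        match = re.search(r'(\d+(?:[.,]\d+)?)\s*' + re.escape(unit), text)
        if match:
            return f'{match.group(1)} {unit}'
    return ''


def format_units(value: str) -> dict:
    """Split e.g. '100 mbps' or '50 lei' into a value/unit dict (assumed schema)."""
    match = re.search(r'(\d+(?:[.,]\d+)?)\s*(\S+)?', value)
    if not match:
        return {'value': value.strip()}
    unit = (match.group(2) or '').lower()
    return {'value': float(match.group(1).replace(',', '.')), 'unit': unit}


def url_last_path(url: str, base: str) -> str:
    """Last path segment of url, resolved against base when relative."""
    path = urlparse(urljoin(base, url)).path
    return path.rstrip('/').rsplit('/', 1)[-1]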
def get_mi_speaker():
    # Scrapes the Xiaomi AI speaker product page; renamed so it no longer
    # collides with get_dy2018 below.
    url = 'https://item.mi.com/product/6334.html'
    print('Starting to scrape the AI speaker...')
    soup = get_soup(get_html(url, 'utf-8'))
    # print(soup.prettify())
    buy_buttons = soup.select('#J_buyBtnBox a')
    print(buy_buttons)
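# get_html is another helper that is not shown. This function and get_dy2018
# below call it with an explicit encoding and pass the resulting markup (not a
# URL) to get_soup, so these files presumably carry their own variants of both
# helpers. A plausible sketch, not the repo's actual code:
import requests


def get_html(url: str, encoding: str) -> str:
    """Fetch raw HTML, forcing the declared encoding (dy2018.com serves gb2312)."""
    response = requests.get(url, timeout=30)
    response.encoding = encoding
    return response.text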
def get_initial_urls():
    initial_url_lists = [
        'https://www.tapology.com/rankings/current-top-ten-lightweight-mma-fighters-155-pounds',
        'https://www.tapology.com/rankings/current-top-ten-featherweight-mma-fighters-145-pounds',
        'https://www.tapology.com/rankings/current-top-ten-bantamweight-mma-fighters-135-pounds',
        'https://www.tapology.com/rankings/35-top-flyweight-mma-fighters',
        'https://www.tapology.com/rankings/1261-top-women-bantamweight-fighters',
        'https://www.tapology.com/rankings/1262-top-women-flyweight-fighters',
        'https://www.tapology.com/rankings/1263-top-women-strawweight-fighters',
        'https://www.tapology.com/rankings/1264-top-women-atomweight-fighters',
        'https://www.tapology.com/rankings/1265-top-women-featherweight-fighters',
        'https://www.tapology.com/rankings/current-top-ten-best-pound-for-pound-mma-and-ufc-fighters',
        'https://www.tapology.com/rankings/top-ten-fan-favorite-mma-and-ufc-fighters',
        'https://www.tapology.com/rankings/33-current-best-pound-for-pound-female-mma-fighters',
        'https://www.tapology.com/rankings/current-top-ten-heavyweight-mma-fighters-265-pounds',
        'https://www.tapology.com/rankings/current-top-ten-light-heavyweight-mma-fighters-205-pounds',
        'https://www.tapology.com/rankings/current-top-ten-middleweight-mma-fighters-185-pounds',
        'https://www.tapology.com/rankings/current-top-ten-welterweight-mma-fighters-170-pounds',
    ]
    output_urls = set()
    for ranking_page_url in initial_url_lists:
        soup = get_soup(ranking_page_url)
        name_soups = soup.find_all('div', {'class': 'rankingItemsItemRow name'})
        # urljoin needs the href string, not the <a> tag itself.
        next_batch_of_urls = {parse.urljoin(base_url, i.find('a')['href'])
                              for i in name_soups if i.find('a')}
        output_urls.update(next_batch_of_urls)
    return output_urls
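# get_initial_urls relies on urllib's parse module and a module-level
# base_url; the assumed preamble for this file:
from urllib import parse

base_url = 'https://www.tapology.com'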
def kathmandu_post_extractor():
    url = 'https://kathmandupost.ekantipur.com'
    soup = get_soup(url)
    more_news_section = soup.find('div', class_='block--morenews')
    news_list = more_news_section.find_all('article', class_='article-image')
    main_list = []
    for news in news_list:
        post_link = news.a['href']
        full_link = 'https://kathmandupost.ekantipur.com' + post_link
        title = news.contents[1].h3.text
        image_div = news.find('div', class_='image')
        try:
            image_link = image_div.figure.a.img['data-src']
        except (AttributeError, KeyError, TypeError):
            image_link = 'img not available'
        date = get_date(post_link)
        summary = news.p.text
        news_dict = {
            'image_link': image_link,
            'title': title,
            'nep_date': date,
            'source': 'ekantipur',
            'news_link': full_link,
            'summary': summary,
        }
        main_list.append(news_dict)
    return featured_news(soup) + main_list
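# get_date and featured_news are not defined in this snippet. Assuming
# Kathmandu Post article paths embed the date as
# /<section>/<yyyy>/<mm>/<dd>/<slug>, a plausible reconstruction of get_date:
def get_date(article_path: str) -> str:
    """Extract 'yyyy/mm/dd' from an article path (assumed URL shape)."""
    parts = [p for p in article_path.split('/') if p]
    for i in range(len(parts) - 2):
        if len(parts[i]) == 4 and parts[i].isdigit():
            return '/'.join(parts[i:i + 3])
    return ''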
def orange_abon(scraper_url: str):
    soup = get_soup(scraper_url)
    # Keyword phrases are matched against diacritic-stripped Romanian text,
    # hence 'naiona' rather than 'naționa'. Most specific phrases come first.
    feature_kw = OrderedDict([
        ('mobil_min_nat mobil_sms_nat', 'minute sms naiona'),
        ('mobil_min_nat', 'minute naiona'),
        ('mobil_sms_nat', 'sms naiona'),
        ('mobil_min_internat', 'minute interna'),
        ('mobil_date', 'trafic internet'),
    ])
    abonamente = soup.select('.carusel-abo .item')
    packages = []
    for abon in abonamente:
        name = abon.select_one('.minbenbox.romico h3').text
        price = abon.select_one('.secbenbox .lefttext').text
        features = abon.select('.minbenbox.romico .minben')
        features = [(f.select_one('.descben').text.strip().lower(),
                     str(f.contents[0]).strip()) for f in features]
        characteristics = extract_features(features, feature_kw, name)
        packages.append({
            'name': name.strip(),
            'price': format_units(price),
            'characteristics': characteristics,
        })
    return json.dumps({"packages": packages})
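# extract_features is another repo helper not shown here. Sketched under the
# assumption that it substring-matches each keyword phrase against the feature
# description and that a space-separated key such as
# 'mobil_min_nat mobil_sms_nat' assigns one value to several aliases:
import re


def extract_features(features, feature_kw, package_name):
    """Map (description, value) pairs to characteristic aliases (assumed logic).

    package_name is kept for parity with the call site; the real helper may
    use it for error reporting.
    """
    characteristics = {}
    for description, value in features:
        # Drop non-letters so keywords survive stripped diacritics.
        words = re.sub('[^a-z ]', '', description)
        for aliases, keywords in feature_kw.items():
            if all(kw in words for kw in keywords.split()):
                for alias in aliases.split():
                    characteristics.setdefault(alias, format_units(value))
                break
    return characteristics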
def kantipur_daily_extractor():
    news_list = []
    url = 'https://www.kantipurdaily.com/news'
    soup = get_soup(url)
    for article in soup.find_all('article', class_='normal'):
        title = article.h2.a.text
        # author = article.find('div', class_='author').text
        summary = article.find('p').text
        image = article.find('div', class_='image').figure.a.img['data-src']
        # Strip the low-quality markers and request the 1000px rendition.
        img = image.replace('-lowquality', '')
        small_img = img.replace('lowquality', '')
        big_img = small_img.replace('300x0', '1000x0')
        date_ore = article.h2.a['href']
        # hrefs look like /<section>/<yyyy>/<mm>/<dd>/<slug>; keep the date part.
        date = '/'.join(date_ore.split('/')[2:5])
        link = 'https://kantipurdaily.com' + date_ore
        news_dict = {
            'title': title,
            'nep_date': date,
            'source': 'ekantipur',
            'summary': summary,
            'news_link': link,
            'image_link': big_img,
        }
        news_list.append(news_dict)
    return news_list
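# The chained replace calls above upgrade the lazy-loaded thumbnail to a
# full-size asset. With a made-up URL, purely for illustration:
image = 'https://example.com/images/photo-lowquality-300x0.jpg'  # hypothetical
big = image.replace('-lowquality', '').replace('lowquality', '').replace('300x0', '1000x0')
assert big == 'https://example.com/images/photo-1000x0.jpg'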
def scrape_url(url):
    soup = get_soup(url)
    fighter_info_soup = soup.find('div', {'class': 'details details_two_columns'})
    fighter_info_soup = fighter_info_soup.find('ul', recursive=False)
    fighter_info_soups = fighter_info_soup.find_all('li', recursive=False)
    fighter_info_dict = dict()
    for i in fighter_info_soups:
        label = i.find('strong').getText()
        value = i.find('span').getText()
        fighter_info_dict[label] = value
    return fighter_info_dict
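# A driver tying scrape_url to get_initial_urls above; the repo's real entry
# point is not shown, so this is only a sketch:
def scrape_all_fighters():
    """Scrape the info table of every fighter collected from the rankings."""
    return [scrape_url(fighter_url) for fighter_url in get_initial_urls()]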
def extractor(category=None, section='gen'):
    if not category:
        category = [27] if section == 'intl' else [21, 22, 24, 25, 26, 31]
    else:
        _resolve_category(category)
    all_news = []
    for category_no in category:
        url = f'http://nagariknews.nagariknetwork.com/category/{category_no}'
        print("Resolved url", url)
        soup = get_soup(url)
        news_list = _soup_extractor(soup)
        all_news += news_list
    print(f"Length of all {len(all_news)}")
    return all_news
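# _resolve_category and _soup_extractor are private helpers not shown here.
# extractor calls _resolve_category without using its return value, which
# suggests pure validation; a sketch under that assumption:
VALID_CATEGORIES = {21, 22, 24, 25, 26, 27, 31}  # inferred from the defaults above


def _resolve_category(category):
    """Raise early if any requested Nagarik category id is unknown (assumed)."""
    unknown = set(category) - VALID_CATEGORIES
    if unknown:
        raise ValueError(f'Unknown category ids: {sorted(unknown)}')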
import json
import os
import re
from collections import OrderedDict
from urllib.parse import urljoin, urlparse

import bs4


def telekom_mobil_abonamente(scraper_url: str):
    response_body = get_json(scraper_url)
    products_html = response_body['productsHtml']
    soup = bs4.BeautifulSoup(products_html, 'html5lib')
    feature_kw = OrderedDict([
        ('mobil_min_internat', 'min internationale'),
        ('mobil_min_nat', 'min nationale'),
        ('mobil_date', 'trafic date'),
        ('mobil_sms_nat', 'sms nationale'),
    ])
    packages = []
    abonamente = soup.select('div.abonamenteTabWrapper')
    for abon in abonamente:
        detail_url = abon.select_one('div.abonamenteButtons a')['href']
        if not urlparse(detail_url).netloc:
            detail_url = urljoin(scraper_url, detail_url)
        abon_name = abon.select_one('div.abonamenteTabTitle strong').text
        abon_price = abon.select_one('.abonamentePrice strong').text
        abon_id = urlparse(detail_url).path
        abon_id = os.path.basename(os.path.normpath(abon_id))
        abon_details = get_soup(detail_url)
        features = abon_details.select('#tab-1 .tarrifs-table tbody tr')
        characteristics = {}
        for feature in features:
            feature_name, feature_value = map(_extract_p_data, feature.select('td'))
            # Keep only letters so keyword matching survives diacritics and spacing.
            feature_words = re.sub('[^a-z]', '', feature_name.lower())
            for alias, kws in feature_kw.items():
                if all(kw in feature_words for kw in kws.split(' ')):
                    characteristics[alias] = format_units(feature_value)
                    break
        if not all(alias in characteristics for alias in feature_kw.keys()):
            missing = set(feature_kw.keys()) - set(characteristics.keys())
            # raise ScraperError(f"{abon_name} missing values for [{', '.join(missing)}]")
            print(f"{abon_name} missing values for [{', '.join(missing)}]")
        package = {
            'name': abon_name.strip(),
            'price': format_units(abon_price),
            'scraper_id_hint': abon_id.strip(),
            'characteristics': characteristics,
        }
        packages.append(package)
    return json.dumps({"packages": packages})
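# get_json and _extract_p_data are assumed helpers; plausible sketches given
# how they are called above (a JSON endpoint and tariff-table <td> cells):
import requests


def get_json(url: str):
    """Fetch and decode a JSON endpoint (assumed thin wrapper)."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.json()


def _extract_p_data(td) -> str:
    """Text of the <p> inside a table cell, falling back to the cell's own text."""
    p = td.find('p')
    return (p.text if p else td.text).strip()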
def get_dy2018():
    url = 'https://www.dy2018.com/'
    print('Starting to scrape dy2018...')
    soup = get_soup(get_html(url, 'gb2312'))
    # print(soup.prettify())
    items = soup.select('.co_content222 li')  # avoid shadowing the list builtin
    print('Found ' + str(len(items)) + ' records')
    for i in items:
        a = i.select('a')[0]
        a_href = a['href']
        a_title = a['title']
        print(base_url + a_href)
        print(a_title)
        print("")
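# base_url is assumed to be defined at this file's top level (the printed
# links are joined against it); presumably:
base_url = 'https://www.dy2018.com'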
from datetime import datetime

import requests
from bs4 import BeautifulSoup as BS

from common import get_soup