def scraper2():
    c = 0
    try:
        r2 = requests.get("https://www.elwatan.com/edition/actualite")
        unicode_str2 = r2.content.decode('utf8')
        encoded_str2 = unicode_str2.encode("utf-8")
    except:
        print("request failed try again")
        return
    if r2.status_code == 200:
        array = []
        soup = bf(encoded_str2, "html.parser")
        title = soup.find_all('h3', {'class': 'title-14'})
        img = soup.find_all('article', {'class': 'post post-tp-24'})
        for i in img:
            im = bf(str(i), "html.parser")
            try:
                im = im.find_all('img')
                im = im[0]["src"]
                array.append(im)
            except:
                array.append("https://i.ibb.co/8cfP6ZD/elwatan.png")
        for i in title:
            link = bf(str(i), "html.parser")
            link = link.find_all('a')
            link = link[0]["href"]
            articles(title=i.get_text().encode("utf-8"),
                     link=link,
                     img=array[c],
                     category=category(str(i.get_text().encode("utf-8")), 0)).save()
            c = c + 1
def scraper4():
    # try to send a get request to the website through a working proxy
    try:
        url = "https://tsa-algerie.com"
        working_proxy = check_proxies()
        if working_proxy != 0:
            scraper = cfscrape.create_scraper()
            proxies = {"http": working_proxy, "https": working_proxy}
            r4 = scraper.get(url, proxies=proxies, allow_redirects=True, timeout=(10, 20))
            unicode_str4 = r4.content.decode('utf8')
            encoded_str4 = unicode_str4.encode("utf-8")
            articles = []
            if r4.status_code == 200:
                soup = bf(encoded_str4, "html.parser")
                title = soup.find_all('h2', {'class': 'ntdga__title transition'})
                for i in range(0, 10):
                    link = bf(str(title[i].encode("utf-8")), "html.parser")
                    link = link.find_all('a')
                    link = link[0]["href"]
                    articles.append(str(title[i].get_text().encode("utf-8") + ": " + str(link)).replace(",", ""))
                list_to_csv(articles, "tsa")
                readcsv("tsa.csv")
                dataframe_to_csv("result", tsport, thealth, tscience)
                time.sleep(tsa_freq)
        else:
            print("no working proxy found")
    # if the request failed, retry
    except:
        print("request failed try again")
        scraper4()
def scraper3():
    try:
        r3 = requests.get("http://www.aps.dz")
        unicode_str3 = r3.content.decode('utf8')
        encoded_str3 = unicode_str3.encode("utf-8")
    except:
        print("request failed try again")
        return
    if r3.status_code == 200:
        soup = bf(encoded_str3, "html.parser")
        title = soup.find_all("h3", {"class": "allmode-title"})
        img = soup.find_all('div', {'class': 'allmode-img-top'})
        img = img + soup.find_all('div', {'class': 'allmode-img'})
        array = []
        c = 0
        for i in img:
            im = bf(str(i), "html.parser")
            try:
                im = im.find_all('img')
                im = im[0]["src"]
                array.append(im)
            except:
                array.append("https://i.ibb.co/1z2d99g/aps.jpg")
        for i in title:
            link = bf(str(i), "html.parser")
            link = link.find_all('a')
            link = link[0]['href']
            articles(title=i.get_text().encode("utf-8"),
                     link="http://www.aps.dz" + link,
                     img="http://www.aps.dz" + array[c],
                     category=category(str(i.get_text().encode("utf-8")), 0)).save()
            c = c + 1
def scraper3():
    try:
        r3 = requests.get("http://www.aps.dz")
        unicode_str3 = r3.content.decode('utf8')
        encoded_str3 = unicode_str3.encode("utf-8")
    except:
        print("request failed try again")
        return
    articles = []
    if r3.status_code == 200:
        soup = bf(encoded_str3, "html.parser")
        title = soup.find_all("h3", {"class": "allmode-title"})
        for i in title:
            link = bf(str(i), "html.parser")
            link = link.find_all('a')
            link = link[0]['href']
            articles.append(str(i.get_text().encode("utf-8") + ": " + "http://www.aps.dz" + str(link)).replace(",", ""))
        list_to_csv(articles, "aps")
        readcsv("aps.csv")
        dataframe_to_csv("result", tsport, thealth, tscience)
        time.sleep(aps_freq)
        scraper3()
def scraper4():
    # try to send a get request to the website through a working proxy
    try:
        url = "https://tsa-algerie.com"
        working_proxy = check_proxies()
        if working_proxy != 0:
            scraper = cfscrape.create_scraper()
            proxies = {"http": working_proxy, "https": working_proxy}
            r4 = scraper.get(url, proxies=proxies, allow_redirects=True, timeout=(10, 30))
            unicode_str4 = r4.content.decode('utf8')
            encoded_str4 = unicode_str4.encode("utf-8")
            if r4.status_code == 200:
                soup = bf(encoded_str4, "html.parser")
                title = soup.find_all('h2', {'class': 'ntdga__title transition'})
                for i in range(0, 10):
                    link = bf(str(title[i].encode("utf-8")), "html.parser")
                    link = link.find_all('a')
                    link = link[0]["href"]
                    articles1.append(str(title[i].get_text().encode("utf-8") + ": " + str(link)).replace(",", ""))
                    articles(title=title[i].get_text().encode("utf-8"),
                             link=link,
                             img="https://i.ibb.co/QMZ7VBg/tsa.jpg",
                             category=category(str(title[i].get_text().encode("utf-8")), 0)).save()
        else:
            print("no working proxy found")
    # if the request failed
    except:
        print("request failed try again")
def checkLink():
    global p
    url = """https://github.com/search?l=JavaScript&o=desc&p=%(page)s&q=%(location)s&ref=advsearch&type=Users"""
    url2 = """https://github.com/"""
    data = {"page": p, "location": "location%3AIndia"}  # location can be set here instead of India; any desired location could be entered.
    site = (url % data)
    print site
    print "\n\n"
    try:
        data = urllib2.urlopen(site, data=None, timeout=60)
        tag = bf(data.read())
        user = tag.findAll('div', {'class': 'user-list-info'})
        mydb = MySQLdb.connect(host='localhost', user='******', passwd='password', db='gitcrawler')  # change the database information
        cursor = mydb.cursor()
        for r in user:
            username = r.find('a').string
            locate = r.find('li')
            location = locate.text
            date = r.findAll('li')[2:3]
            fullname = r.contents[2:3]
            name = str(fullname)[9:-8]
            site2 = (url2 + username)
            userpage = urllib2.urlopen(site2, data=None, timeout=60)
            userinfo = bf(userpage.read())
            information = userinfo.findAll('a', {'class': 'url'}, text=True)
            link = None
            for x in information:
                link = ''.join(x.findAll(text=True))
                print link
            joining_date = "Not Available"
            for n in date:
                joindate = n.findAll('span')[1:]
                for s in joindate:
                    joining_date = ''.join(s.findAll(text=True))
            email = r.findAll('a')[1:2]
            mail = None
            for q in email:
                mail = q.get('data-email')
                mail = urllib.unquote(mail).decode('utf8')
            print "---------------------------------------------------------------------------"
            print '|%s | %s | %s | %s | %s | %s |' % (location, username, joining_date, mail, name, link)
            try:
                len(mail)
                # change the name of the table in the insert query as per language.
                cursor.execute('''INSERT INTO javascript(username, language, currentlocation, email, joining_date, name, link)
                                  VALUES(%s, %s, %s, %s, %s, %s, %s)''',
                               (username, lang, location, mail, joining_date, name, link))
            except TypeError:
                cursor.execute('''INSERT INTO javascript(username, language, currentlocation, email, joining_date, name, link)
                                  VALUES(%s, %s, %s, %s, %s, %s, %s)''',
                               (username, lang, location, mail, joining_date, name, link))
            mydb.commit()
        cursor.close()
        p += 1
    except urllib2.HTTPError, e:
        print "Http error: too many requests, sleeping for 10 seconds..."
        print e
        sleep(10)
def main(url):
    # entertainment section
    domain = 'https://www.setn.com'
    three = requests.get(url).text
    three = bf(three, 'html.parser')
    title = three.find('h3', {'class': 'view-li-title'}).find('a', {'class': 'gt'}).text
    text = three.find('h3', {'class': 'view-li-title'}).find('a', {'class': 'gt'})['href']
    text_1 = requests.get(domain + text)
    text_1 = bf(text_1.text, 'html.parser')
    text_1 = text_1.find('div', {'class': 'Content2'}).text
    text_1 = re.sub('(圖/\w*)', '', text_1)
    title_content = (title, text_1)
    return list(title_content)
def get_comment(self, html):
    bs4_comment = bf(html, 'lxml')
    comment = bs4_comment.find(name='div', class_='mouth-main')
    comment = str(comment)
    res1 = re.compile('<style.*?>.*?</style>', re.S)
    content = res1.sub('', comment)
    res2 = re.compile('<script.*?>.*?</script>', re.S)
    content = res2.sub('', content)
    res3 = re.compile('\n', re.S)
    content = res3.sub('', content)
    # comment = bs4_comment.select('div .mouth-main')
    bs4_comment = bf(content, 'lxml')
    print(bs4_comment.get_text())
def getSmallCounty(countyURL):
    get_r = requests.get(countyURL)
    get_soup = bf(get_r.text, "html.parser")
    smallCounty = get_soup.select("area")
    if len(smallCounty) != 0:
        for i in smallCounty:
            smallCountyLink.append("http://cmdweb.pcc.gov.tw/pccms/owa/" + i["href"].rstrip())
            # open each township's metadata page and grab the township name
            get_r2 = requests.get("http://cmdweb.pcc.gov.tw/pccms/owa/" + i["href"].rstrip())
            get_soup2 = bf(get_r2.text, "html.parser")
            smallCountyNameList.append(get_soup2.find("font", color="red").text)
def scraper1():
    # send request and encode results
    try:
        r1 = requests.get("https://www.liberte-algerie.com/actualite")
        unicode_str1 = r1.content.decode('utf8')
        encoded_str1 = unicode_str1.encode("utf-8")
    except:
        print("no working proxy found")
        return
    if r1.status_code == 200:
        # create beautiful soup object
        soup = bf(encoded_str1, "html.parser")
        # get article titles
        title = soup.find_all('a', {'class': 'title'})
        mg = soup.find_all('div', {'class': 'span-8'})
        img = mg[0].find_all('li')
        array = []
        co = 0
        for i in img:
            im = bf(str(i), "html.parser")
            try:
                im = im.find_all('img')
                im = im[0]["src"]
                array.append(im)
            except:
                array.append("https://i.ibb.co/fDDLYQc/libre.jpg")
        c = 0
        # save an article record for each title
        for i in title:
            # get link and article title as text
            link = title[c]['href']
            articles(title=i.get_text().encode("utf-8").strip(),
                     link="https://www.liberte-algerie.com" + link,
                     img=array[c],
                     category=category(str(i.get_text().encode("utf-8")), 0)).save()
            c = c + 1
def smallCounty(URL):
    r = requests.get(URL)
    soup = bf(r.text, "html.parser")
    mainLink = soup.select("frameset frame")
    mainLink = "http://cmdweb.pcc.gov.tw" + mainLink[0]["src"]
    # find the iframe URL
    r2 = requests.get(mainLink)
    soup2 = bf(r2.text, "html.parser")
    county = soup2.find("map").select("area")
    # county/city links
    for i in county:
        countyLink.append("http://cmdweb.pcc.gov.tw/pccms/owa/" + i["href"].rstrip())
    # cross-district links and county names
    for i in countyLink:
        r3 = requests.get(i)
        soup3 = bf(r3.text, "html.parser")
        acrossNameList.append(soup3.select("font")[1].text[:-6])
        countyNameList.append(soup3.select("font")[1].text[1:3])
        acrossLink.append("http://cmdweb.pcc.gov.tw/pccms/owa/" + soup3.find("a")["href"].rstrip())

    # township links and township names
    def getSmallCounty(countyURL):
        get_r = requests.get(countyURL)
        get_soup = bf(get_r.text, "html.parser")
        smallCounty = get_soup.select("area")
        if len(smallCounty) != 0:
            for i in smallCounty:
                smallCountyLink.append("http://cmdweb.pcc.gov.tw/pccms/owa/" + i["href"].rstrip())
                # open each township's metadata page and grab the township name
                get_r2 = requests.get("http://cmdweb.pcc.gov.tw/pccms/owa/" + i["href"].rstrip())
                get_soup2 = bf(get_r2.text, "html.parser")
                smallCountyNameList.append(get_soup2.find("font", color="red").text)

    for i in countyLink:
        getSmallCounty(i)
    # build a DataFrame from the township links
    smallCountyNameList.extend(countyNameList)
    smallCountyLink.extend(acrossLink)
    dic_smallCounty = {"鄉鎮": smallCountyNameList, "table連結": smallCountyLink}
    df_smallCounty = pd.DataFrame(dic_smallCounty)
    return df_smallCounty
    # df_smallCounty.to_csv("smallCounty.csv", encoding="utf_8_sig")
def job_seek():
    target_url = 'https://www.104.com.tw/jobbank/custjob/index.php?r=cust&j=503a4224565c3e2430683b1d1d1d1d5f2443a363189j48&jobsource=joblist_b_relevance#info06'
    print('Start parsing appleNews....')
    rs = requests.session()
    res = rs.get(target_url)
    res.encoding = 'utf-8'
    soup = bf(res.text, 'html.parser')
    content = ""
    temp = []
    reback = []
    for date in soup.select('.joblist_cont .date'):
        if date.text == '':
            temp.append('緊急!!重點職務')
        else:
            temp.append(date.text)
    for v, data in enumerate(soup.select('.joblist_cont .jobname a'), 0):
        link = data['href']
        title = data['title']
        content += '發布時間->{}\n工作名稱->{}\n連結網址->{}\n'.format(temp[v], title, 'https://www.104.com.tw' + link)
        if v % 5 == 0:
            if v == 0:
                continue
            reback.append(TextSendMessage(text=content))
            content = ''
    return reback
def get_all_url():
    rsp = requests.get(main_url, timeout=10)
    rsp_html = rsp.text.encode('ISO-8859-1').decode('gbk').encode('utf-8').decode('utf-8')
    # print(rsp_html)
    soup = bf(rsp_html, 'html.parser')
    # print(soup)
    # lists = soup.select_one('#list')
    # print(lists)
    lists = soup.select('div > dl > dd > a')
    print(lists)
    # pat = re.compile(r'第二章:千梅(1')
    # print(pat.findall(rsp_html))
    print(len(lists))
    url_dict_list = []
    for url in lists:
        url_dict = {}
        url_dict[url.text] = url.get('href')
        url_dict_list.append(url_dict)
    with open('url.json', 'w') as f:
        json.dump(url_dict_list, f)
    print(url_dict_list)


# get_all_url()
async def main(url_passed):
    list_of_links = scrap_content(url_passed)
    count = 0
    for links in list_of_links:
        site_files = requests.get(f'{url}/{links}').text
        soup_content = bf(site_files, 'lxml')
        page_title = soup_content.find('h1')
        content = soup_content.find('article', class_='content')
        file_name = links.replace('.shtml', '/').split('/')
        file_name = file_name[len(file_name) - 2].replace('-', '_')
        count += 1
        with open(os.path.join(path, f'{file_name}.html'), 'w', encoding="utf-8") as fs:
            fs.write(str(page_title).replace("\n", "").replace(" ", ""))
            fs.write(str(content))
        print(count)
    print("All the pages have been crawled successfully!")
def symbol_statics_parser(ticker):
    print("start " + ticker)
    staticmap = {}
    url = "https://finance.yahoo.com/quote/%s/key-statistics?p=%s" % (ticker, ticker)
    response = requests.get(url, verify=False)
    sp = bf(response.text, features="lxml")
    tables = sp.find_all("table")
    for tb in tables:
        rows = tb.findChildren(['th', 'tr'])
        for row in rows:
            cells = row.findChildren('td')
            for cell in cells:
                for s in cell.strings:
                    if s is not None and "Forward Annual Dividend Yield" in s:
                        staticmap["Forward Annual Dividend Yield"] = cells[len(cells) - 1].string
                    if s is not None and "Payout Ratio" in s:
                        staticmap["Payout Ratio"] = cells[len(cells) - 1].string
                    if s is not None and "52 Week High" in s:
                        staticmap["52 Week High"] = cells[len(cells) - 1].string
                    if s is not None and "52 Week Low" in s:
                        staticmap["52 Week Low"] = cells[len(cells) - 1].string
                    if s is not None and "Diluted EPS (ttm)" in s:
                        staticmap["Diluted EPS (ttm)"] = cells[len(cells) - 1].string
                    if s is not None and "Ex-Dividend Date" in s:
                        staticmap["Ex-Dividend Date"] = cells[len(cells) - 1].string
    return staticmap
def WeiBo_page(index=2, *args):
    browser = Browser_Driver()
    # args decides whether to scroll the feed or run a keyword search
    if not args:
        for i in range(index):
            try:
                time.sleep(3)
                el = browser.find_element_by_xpath('//*[contains(text(),"正在加载中,请稍候...")]')
                time.sleep(2)
                browser.execute_script("arguments[0].scrollIntoView();", el)
                time.sleep(2)
            except Exception as e:
                time.sleep(3)
                el = browser.find_element_by_xpath('//*[contains(text(),"正在加载中,请稍候...")]')
                time.sleep(2)
                browser.execute_script("arguments[0].scrollIntoView();", el)
                time.sleep(2)
    else:
        key = args[0]
        browser.find_element_by_xpath('//*[@id="weibo_top_public"]/div/div/div[2]/input').send_keys(key)
        browser.find_element_by_xpath('//*[@id="weibo_top_public"]/div/div/div[2]/a').click()
    weibo_pages = bf(browser.page_source, 'lxml')
    browser.quit()
    return weibo_pages
def Get_Article_PO(PO_Article_Url):
    PO_Aricle = urlopen(PO_Article_Url)
    PO_Aricle_Obj = bf(PO_Aricle.read().decode('GB2312', 'ignore'), 'html.parser')
    print('<h1 class=\"print\">', PO_Aricle_Obj.find_all('h1')[1].string, '</h1>', file=fp)
    print('<hr>', file=fp)
    print("<div class=\"author no-print\">", PO_Aricle_Obj.find('div', class_='author cf').string, "</div>", "\n\n", file=fp)
    print("<a class=\"no-print\" href=\"", PO_Article_Url, "\">", "原文链接", "</a>", "\n\n", file=fp)
    for i in PO_Aricle_Obj.find_all('p'):
        if (i.string):
            print(i, file=fp)
            print('<br>', file=fp)
def __init__(self, url):
    self.url = url
    self.reponse = requests.get(url)
    self.soup_page = bf(self.reponse.text, 'lxml')
    self.collection_equipe = CollectionEquipe()
    self.lst_compet = ['France : Ligue 1', 'France : Ligue 2', 'Angleterre : Premier League',
                       'Angleterre : League Championship', 'Espagne : Liga BBVA', 'Italie : Serie A',
                       'Allemagne : Bundesliga', 'Portugal : Liga Sagres', 'Belgique : Pro League',
                       'Pays-Bas : Eredivisie']
    self.lst_index = []
def crawler(self):
    for i in range(82900, 83300):
        r = requests.get(self.input_url + str(i) + ".htm")
        soup = bf(r.text, "lxml")
        info = soup.find("div", id="shop_info").findAll("div")
        if len(info) == 1:
            continue
        data = {}
        print(i)
        try:
            data["店名"] = soup.find("div", id="shop_name").find("h3").text.replace('\r', ';').replace('\n', ';').replace('|', ';')
            data["電話"] = soup.find("div", id="shop_tel").text.replace('\r', ';').replace('\n', ';').replace('|', ';')
            data["地址"] = soup.find("div", id="shop_add").text.replace('\r', ';').replace('\n', ';').replace('|', ';')
            data["營業時間"] = soup.find("div", id="opening").find("span").text.replace('\r', ';').replace('\n', ';').replace('|', ';')
        except:
            print("go")
        self.output_pois.append(data)
def crwal():
    req = request.Request(url, headers=headers)
    handler = request.BaseHandler()
    opener = request.build_opener(handler)
    response = opener.open(req)
    # logger.info(response.read().decode('utf-8'))
    page = response.read().decode('utf-8')
    soup = bf(page, 'lxml')
    itemlist = soup.select('#J_goodsList > ul > li > div')
    totalprice = 0.0
    pricelist = []
    for item in itemlist:
        priceStr = item.find(class_='p-price').strong.get_text().replace('¥', '')
        price = float(priceStr)
        pricelist.append(price)
        # print(price)
        # print(type(price))
        totalprice = totalprice + price
    pricelist.sort(reverse=True)
    averageprice = totalprice / len(itemlist)
    print(averageprice)
    # priceString = unicode(nString)
    r = RedisUtil.getredis()
    param = {
        'date': time.strftime('%Y-%m-%d', time.localtime()),
        'price': averageprice,
        'itemlist': pricelist
    }
    r.lpush('JDPrice', param)
def add_hw(title, due, time, the_class):
    session = requests.session()
    url = 'https://myhomeworkapp.com/login'
    # Add user and pass for the myHomework app
    data = {'username': '******', 'password': '******'}
    hw_login = session.get(url)
    tree = html.fromstring(hw_login.text)
    data['csrfmiddlewaretoken'] = list(set(tree.xpath("//input[@name = 'csrfmiddlewaretoken']/@value")))[0]
    hw_login_response = session.post(url, data=data, headers=dict(referer=url))
    hw_add = session.get('https://myhomeworkapp.com/homework/add')
    soup = bf(hw_add.text, 'html.parser')
    req_class = ''
    for i in soup.find_all('option'):
        if i.text == the_class:
            req_class = i['value']
    hw_data = {
        'title': title,
        'cls': req_class,
        'type': 1,
        'due_date': due,
        'due_time': time,
        'repeats': 0,
        'save': 'Save'
    }
    tree2 = html.fromstring(hw_add.text)
    hw_data['csrfmiddlewaretoken'] = list(set(tree2.xpath("//input[@name = 'csrfmiddlewaretoken']/@value")))[0]
    submit_url = 'https://myhomeworkapp.com/homework/add'
    hw_submit = session.post(submit_url, data=hw_data, headers=dict(referer=submit_url))
    print(hw_submit)
    return 0
def info_scraper(imdb_id):
    title_url = 'https://www.imdb.com/title/'
    res = requests.get(url=title_url + imdb_id).text
    soup = bf(res, 'html5lib')
    rating = float(soup.find(name='span', attrs={"itemprop": "ratingValue"}).text)
    x = soup.find(name='h1', attrs={'class': ''}).text
    title = x.split("\xa0")[0] + ' ' + x.split("\xa0")[1].strip()
    info = soup.find(name='div', class_='subtext').text
    duration = info.split("|\n")[0].strip()
    genres = info.split("|\n")[1].split("\n")[0] + info.split("|\n")[1].split("\n")[1]
    yor = info.split("\n")[-2]
    img_url = soup.find(name='div', class_='poster').find(name='img').attrs['src']
    hd = 'UX650_CR1,0,680,1000_AL__QL50.jpg'
    img_url = img_url[:-27] + hd  # for heroku '27', for local '32'
    try:
        text = soup.find(name='div', attrs={'class': 'inline canwrap'}).find('p').text
        storyline = ''
        for i in text.strip().split("\n"):
            storyline = storyline + " " + i
    except:
        storyline = 'No storyline found for this movie.'
    return [rating, title, duration, genres, yor, img_url, storyline]
def review_scrapper(imdb_id):
    reviews_df = pd.DataFrame(columns=['rating', 'title', 'username', 'review_date', 'review_text'])
    title_url = 'https://www.imdb.com/title/'
    res = requests.get(url=title_url + imdb_id + '/reviews').text
    soup = bf(res, 'html5lib')
    review_boxs = soup.find_all(name='div', attrs={'class': 'review-container'})
    for i in review_boxs:
        try:
            rate = int(i.find_all(name='span')[1].text)
        except:
            rate = 'nan'
        title = i.find(name='a', class_='title').text.strip()
        username = i.find(name='div', class_='display-name-date').find('a').text
        review_date = i.find(name='div', class_='display-name-date').find(class_='review-date').text
        review_text = i.find(name='div', class_='text show-more__control').text
        temp = {
            'rating': rate,
            'title': title,
            'username': username,
            'review_date': review_date,
            'review_text': review_text
        }
        reviews_df = reviews_df.append(temp, ignore_index=True)
    return reviews_df
def get_target(web):
    """Get the target tags that contain the covid information of each country and region, then return the tags.

    Note: param <web> has to be text or bytes, not a TextIOWrapper."""
    soup = bf(web, 'lxml')
    tbody = soup.find('tbody')
    trs = tbody.find_all(name='tr', class_=False)
    return trs
def get_html_from_avito(params):
    http = urllib3.PoolManager()
    answer = ''
    avito_url = get_avito_url(params)
    r = http.request('GET', avito_url)
    soup = bf(r.data, 'html.parser')
    return soup
def get_html_from_avito():
    http = urllib3.PoolManager()
    answer = ''
    r = http.request(
        'GET',
        'https://www.avito.ru/rossiya/kvartiry/prodam?pmax=5000000&pmin=0&s_trg=4&f=549_5696-5697-5698-5699.59_13990b.497_5185b'
    )
    soup = bf(r.data, 'html.parser')
    return soup
def get_page_source(driver, url):
    try:
        driver.get(url)
    except Exception as e:
        print e
        return get_page_source(driver, url)
    else:
        pagesource = driver.page_source
        soup = bf(pagesource, 'html.parser')
        return soup
def get_twitter():
    # parse the rendered page HTML
    page = bf(driver.page_source, 'html5lib')
    # locate the target tweet blocks
    mu_mes = page.find('div', {'style': re.compile('position: relative; min-height: ')})
    mes_links = mu_mes.find_all('div', {'lang': "en"})
    # collect the text of each tweet
    for mes in mes_links:
        word.append(mes.span.get_text())
def multi_process(self, result):
    WIDTH = 320
    HEIGHT = 640
    PIXEL_RATIO = 3.0
    UA = random.choice(user_agent_phone)
    mobileEmulation = {
        "deviceMetrics": {
            "width": WIDTH,
            "height": HEIGHT,
            "pixelRatio": PIXEL_RATIO
        },
        "userAgent": UA
    }
    options = webdriver.ChromeOptions()
    options.add_experimental_option('mobileEmulation', mobileEmulation)
    # the next two lines let Chrome run headless, without opening a browser window
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=options)
    # driver.get('https://mitangzhicheng.m.tmall.com/shop/shop_auction_search.htm?suid=2095767659&sort=default')
    # this could be turned into a dict mapping each shop name to its link
    # shop_link_list=['http://shop114249705.m.taobao.com','http://shop103991552.m.taobao.com','https://esey.m.tmall.com/','https://mitangzhicheng.m.tmall.com/','https://msuya.m.tmall.com/?shop_id=66459281']
    shopWangwang, shopId, wx_shop_url, shop_url = result
    if not self.check_repeat(shopId, self.get_date):
        myFormat('店铺【%s】在【%s】 已经保存过了,跳过' % (shopWangwang, get_date))
        return pd.DataFrame({})
    driver.get(wx_shop_url)
    myFormat('正在处理店铺【%s】' % shopWangwang, symbol='.', fillMode='right')
    # print(driver.page_source)
    try:
        soup = bf(driver.page_source)
        fans_counter = soup.select('.collect-counter')[0].text.strip()
        temp = round(float(re.search(r'(\d+\.*\d*)', fans_counter).group(1)), 3)
        fans_counter_num = int(temp * 10000) if '万' in fans_counter else int(temp)
    except:
        # print(driver.page_source)
        # fans_counter_num=soup.select('div[data-role="shop_head"] span')
        # print(fans_counter_num)
        fans_counter = -1
        fans_counter_num = -1
    print(shopWangwang, fans_counter)  # e.g. 33.3万
    data_list = [[
        self.get_date, shopWangwang, fans_counter, fans_counter_num, shop_url,
        shopId, OPERATOR_NAME
    ]]
    data_dict = dict(zip(range(len(data_list)), data_list))
    df = pd.DataFrame(data_dict)
    driver.close()
    return df.T
def obj_url(url):
    obj = requests.get(
        url=url,
        # proxies={'http',''},
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        })
    obj.encoding = 'gb2312'
    soup = bf(obj.text, 'lxml')
    return soup
def scrapeBook(url):
    url = f"http://books.toscrape.com/catalogue/{url}"
    html = requests.get(url)
    control = bf(html.content, 'html.parser')
    title = control.select(".product_main h1")[0].get_text()
    price = control.select(".product_main p.price_color")[0].get_text()
    stock = control.select(".product_main .instock")[0].get_text().strip().split(' ')[2].split('(')[1]
    dic = {'title': title, 'price': price, 'stock': stock}
    return dic
def index():
    if request.method == "POST":
        if 'username' in session:
            url = request.form["url"]
            links = db.links
            if not validators.url(url):
                url = "http://" + url
            if not validators.url(url):
                return render('form.html', error='URL is incorrect')
            else:
                existing_url = links.find_one({'url': url})
                if not existing_url:
                    current_time = str(datetime.now())
                    print current_time
                    print url
                    cur_user = db.users.find_one({'name': session['username']})
                    html = None
                    try:
                        html = urllib2.urlopen(url)
                        html = html.read()
                        soup = bf(html)
                        title = url
                        try:
                            title = soup.find('title').text
                        except Exception:
                            pass
                        db.links.insert({
                            'url': url,
                            'title': title,
                            'author': cur_user['name'],
                            'author_id': cur_user['_id'],
                            'current_time': current_time,
                            'votes': 1
                        })
                        return render('form.html', error="New item is added")
                    except Exception:
                        return render('form.html', error="URL is incorrect")
                else:
                    return render('form.html', error="URL already exists")
        else:
            flash('Please log in')
            redirect(url_for('login'))
    return render('form.html')
start_time = time.time()
if (len(sys.argv) == 1):
    sys.exit(0)
else:
    if (sys.argv[1] == 'crawl'):
        url = "https://www.ptt.cc/bbs/beauty/index.html"
        base = "https://www.ptt.cc"
        data = []
        key = [False, False, False]
        while (1):
            # use url to fetch a new page
            r = requests.get(url)
            # time.sleep(0.01)
            soup = bf(r.text.encode('utf-8'), "html.parser")
            # what follows is the data that needs to be saved
            # first we get the next page url
            info = soup.find_all(class_="btn-group btn-group-paging")[0].find_all(class_="btn wide")
            url = base + info[1]['href']
            # print(url)
            # end the loop if we are on the first page
            if (len(info) == 3 and '下頁' in info[1].text):
                break
            # second we parse the data
            word = soup.find_all(class_="r-ent")
            word.reverse()