def get_article_url(page_list, url_list, title_list, pop_list, index_list):
    url_all = []
    title_all = []
    pop_all = []
    rs = requests.session()
    res = rs.post(ask, verify=False, data=payload)
    res = rs.get(url, verify=False)
    s = bsp(res.text, "html.parser")
    page_handle(s, url_all, title_all, pop_all)
    url_all.reverse()
    url_list.extend(url_all)
    title_all.reverse()
    title_list.extend(title_all)
    pop_all.reverse()
    pop_list.extend(pop_all)
    link = get_prev(s)
    page_list.append(link)
    for i in page_list:
        url_all = []
        title_all = []
        pop_all = []
        rs = requests.session()
        res = rs.post(ask, verify=False, data=payload)
        res = rs.get(i, verify=False)
        s = bsp(res.text, "html.parser")
        page_handle(s, url_all, title_all, pop_all)
        url_all.reverse()
        url_list.extend(url_all)
        title_all.reverse()
        title_list.extend(title_all)
        pop_all.reverse()
        pop_list.extend(pop_all)
        print('=============', i, '==============')
        for j in s.select('.btn-group-paging'):
            page_link_result = j.findAll('a', class_='btn wide')
            page_link = page_link_result[1]
            page_link = page_link['href']
            link = 'https://www.ptt.cc' + page_link
        time.sleep(0.3)
        print('Fetching ... ')
        if check_today(s) == 1:
            page_list.append(link)
    index_url(url_list, index_list)
    select_article(url_list, title_list, pop_list, index_list)
    # print_list(pop_list, title_list, url_list, index_list)

def ReadAsin():
    # AsinList = csv.DictReader(open(os.path.join(os.path.dirname(__file__), "Asinfeed.csv")))
    driver = webdriver.Chrome(executable_path=r'g:/chromedriver.exe')
    my_url = "https://www.amazon.com/s?k=pants&ref=nb_sb_noss_2"
    driver.get(my_url)
    html = driver.page_source
    page = bsp(html, "html.parser")
    AsinList = []
    containers = page.findAll(class_="sg-col-4-of-24 sg-col-4-of-12 sg-col-4-of-36 s-result-item sg-col-4-of-28 sg-col-4-of-16 sg-col sg-col-4-of-20 sg-col-4-of-32")
    for i in containers:
        if i["data-asin"]:
            AsinList.append(i["data-asin"])
    driver.quit()
    extracted_data = []
    extracted_data1 = []
    for i in AsinList:
        url = "http://www.amazon.com/dp/" + i
        print("Processing: " + url)
        extracted_data.append(AmzonParser(url, i))
        sleep(1)
    f = open('data2.json', 'w')
    json.dump(extracted_data, f, indent=4)
    for i in AsinList:
        url = "http://www.amazon.com/dp/" + i
        print("Processing: " + url)
        extracted_data1.append(AmzonParser1(url, i, extracted_data1))
        sleep(1)
    f = open('data.json', 'w')
    json.dump(extracted_data1, f, indent=4)

def Quora(self, r_url):
    html_ = urllib2.urlopen(r_url)
    soup = bsp(html_)
    question['title'] = soup.title.string
    question['url'] = r_url
    details = soup.find_all('div', class_='question_details_text')
    for detail in details:
        question['details'] = detail.text
    topics = soup.find_all('div', class_='topic_list_item')
    for topic in topics:
        question['topics'] = [topic.text]
    ans_count = soup.find('div', class_='answer_header_text').text.split()
    count = int(ans_count[0])
    question['answer_count'] = count
    answers = soup.find_all('div', class_='pagedlist_item')
    if count < 6:
        count = len(answers) - 1
    else:
        count = 6
    for i in range(count):
        if answers[i].find('div', class_='answer_content'):
            self.response.write(answers[i].find('div', class_='answer_content').text)
            self.response.write('-----------------------------------------------------------------')

async def extract_abstract(ix):
    url = 'https://www.cw.com.tw/article/articleLogin.action?id=%s' % ix
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
        async with session.get(url, headers=headers) as res:
            try:
                text = await res.text()
            except:
                print('%s failed' % ix)
                with open('failed.txt', 'a') as f:
                    f.write('%s\n' % ix)
                return
        soup = bsp(text, "lxml")
        try:
            img_url = soup.select('.main article .st_email_large')[0]['st_image']
            if len(img_url) == 0:
                return
            img_url = re.sub(r'\:\d+', '', img_url)
            print(img_url)
            print('--------')
            async with session.get(img_url, headers=headers) as img_res:
                img = await img_res.read()
                with open('%s/%s.jpg' % (output_dir, ix), 'wb') as out_file:
                    # shutil.copyfileobj(img, out_file)
                    out_file.write(img)
        except IndexError:
            return

def fetch_images(query):
    # addr = "https://www.google.com/search?tbm=isch&q=" + query
    addr = "https://www.google.co.in/search?q={}&source=lnms&tbm=isch".format(query)
    s = rq.session()
    r = s.get(addr)
    soup = bsp(r.text, 'html.parser')
    # print(soup)
    imgs = []
    for im in soup.find_all('a', href=True):
        if im.find('img'):
            imgs.append(im)
    # pp = pprint.PrettyPrinter(indent=4)
    # pp.pprint(imgs)
    try:
        os.mkdir('images')
    except OSError:
        shutil.rmtree('images')
        os.mkdir('images')
    count = 0
    prs = Presentation()
    blank_slide_layout = prs.slide_layouts[6]
    left = top = Inches(1)
    for im in imgs:
        for child in im.children:
            filename = wget.download(child['src'], out='images/image_{}'.format(count))
            count += 1
    for i in range(count):
        slide = prs.slides.add_slide(blank_slide_layout)
        pic = slide.shapes.add_picture('images/image_{}'.format(i), left, top)
    title = '{}_image.pptx'.format(query)
    prs.save(title)
    return title

def gujianew(code, riqi):
    url = 'http://www.aigaogao.com/tools/history.html?s=' + code
    detail_url = requests.get(url)
    soup = bsp(detail_url.content, 'lxml')
    detail_content = soup.find_all('table')
    # print(detail_content)
    table = detail_content[-1]
    # print(table)
    trs = table.find_all('tr')
    # print(trs)
    trs_list = []
    for i in trs:
        # print(i)
        tds = i.find_all('td')
        # print(tds)
        tr_list = []
        for j in tds:
            tr_list.append(j.text)
        trs_list.append(tr_list)
    # print(trs_list[0])
    result = DataFrame(trs_list[1:-1], columns=trs_list[0])
    # print(result)
    result['日期'] = pd.to_datetime(result['日期'])
    # print(riqi)
    # print(result['日期'])
    finally_result = result[result['日期'] > riqi]
    # finally_result = [[time_to_timestr(j) for j in i] for i in finally_result.values]
    print(finally_result, type(finally_result))
    # the original printed and iterated an undefined `s`; assume the filtered rows were meant
    a = finally_result.values
    print(a)
    fw = open('./002401.txt', 'a', encoding='utf-8')
    for l in a[0]:
        fw.write(str(l) + '\t')

def addCounties(self, cities):
    'Add county-level cities'
    starttime = time.time()
    insert = 0
    mycursor = self.__mydb.cursor()
    try:
        for city in cities:
            citynumber, cityname = tuple(city.split(':'))
            rqtApi = self.rrhighApi + citynumber + '.html'
            try:
                htmlhandle = urllib.request.urlopen(rqtApi)
            except Exception as e:
                self.log.write(time.asctime() + u'请求文档错误:' + str(e) + '\n')
            else:
                print('---下载%s数据成功---' % (cityname))
                htmldoc = htmlhandle.read().decode('utf-8')
                htmlhandle.close()
                btsp = bsp(htmldoc, 'html.parser')
                counties = btsp.find_all('a', href="#highschool_anchor")
                mycursor.execute(self.queryCityIdSql, (cityname,))
                cityid = mycursor.fetchone()['id']
                for county in counties:
                    mycursor.execute(self.queryCountySql, (county.string, cityid))
                    if mycursor.fetchone()['num'] == 0:
                        insert += 1
                        print('插入%s-->%s' % (cityname, county.string))
                        mycursor.execute(self.insertCountySql, (county.string, cityid))
                        self.__mydb.commit()
    except Exception as e:
        self.log.write(time.asctime() + str(e) + '\n')
    mycursor.execute(self.countCountySql)
    countnum = mycursor.fetchone()['countnum']
    endtime = time.time()
    self.printExeResult(insert, endtime - starttime, countnum, '县区')

def tencent(self):
    result = requests.get(self.url).content
    result = bsp(result, 'html5lib')
    self.title = result.head.title.text.split("-")[0]
    result = result.find(attrs={"class": "mod_episode"})
    text = result.text.replace('\t', '').splitlines()
    while '' in text:
        text.remove('')
    contents = result.contents
    # The original compared type(i) against the string 'bs4.element.Tag', which never matches;
    # assumed intent: keep only Tag children, iterating over a copy so removal is safe.
    for i in list(contents):
        if type(i).__name__ != 'Tag':
            contents.remove(i)
    num = len(text)
    # The original reversed the lists and searched from the front; using text[-1] and
    # range(...).__reversed__() avoids the cost of reversing.
    # text.reverse()
    # contents.reverse()
    if text[-1].find('展开更多') >= 0:
        self.tencent2()
        return
    else:
        for i in range(num).__reversed__():
            if text[i].isdigit():
                self.ep = text[i]
                self.link = contents[i].find('a').get('href')
                return
    self.tencent2()

def getRankData(partition, rankTime):
    # url = 'https://www.bilibili.com/ranking?spm_id_from=333.334.banner_link.1'
    url = urlConstructor('全站', partition, rankTime)
    # parse the response into a BeautifulSoup tree
    result = bsp(session.get(url, headers=HostReferer).content, 'lxml')
    items = result.find_all('div', class_='info')
    rankData = ''
    count = 1
    for item in items:
        detail = item.find_all('span')
        pts = item.find_all('div', class_='pts')
        if detail:
            # link of this video
            videoHref = item.a['href']
            # fetch view and danmaku counts via the link
            data = getVideoView('http:' + videoHref)
            rankData += str(count) + u'\t标题: ' + item.a.text + '\n'
            rankData += (u'\t播放量: ' + str(data['view']) + u'\t弹幕数: ' + str(data['danmaku'])
                         + '\t' + u'UP主: ' + detail[1].text + '\t' + u'综合得分: ' + pts[0].div.text + '\n\n')
            count += 1
        # break  # for debugging: run only one iteration
    return rankData

def get_zj(url):
    res = rqs.get(url)
    ques = []       # test input
    str_ques = ""
    ans = []        # test output
    str_ans = ""
    tmp = []
    html = bsp(res.text, "html.parser")
    data = html.findAll("div", {"class": "problembox"})
    data2 = html.findAll("pre")
    for i in data2:
        if i.text != "\r" and i.text != "\r\n":
            tmp.append(i.text + "\n")
    for i in range(0, len(tmp), 2):
        ques.append(tmp[i])
        ans.append(tmp[i + 1])
    for i in ques:
        str_ques += i
    for i in ans:
        str_ans += i
    str_ques = str_ques.replace("\n\n", "\n")
    str_ans = str_ans.replace("\n\n", "\n")
    str_ques = str_ques.replace("\r", "")
    str_ans = str_ans.replace("\r", "")
    return str_ques, str_ans

def info(self):
    self.idget = requests.get(d.url + d.loc[5] + d.loc2[0], cookies=self.kk).text
    # the commented-out block parsed the page with bs4, which was overkill here
    # soup = bsp(idget, "html5lib")
    # self.id = soup.legend.contents[0].split(d.seg[0])[1].strip()
    # print(legend)
    # info = soup('script')
    # info = info[len(info) - 1].string
    # self.id = idget.split(d.seg[0])[1].split(d.seg[1])[0].strip()
    info = json.loads(self.idget.split(d.seg[2])[1].split(d.seg[3])[0])
    # print(info)
    self.n = info['nick']
    self.s = info['school']
    self.c = info['class']
    # gender 0:♂ 1:♀
    self.g = info['gender']
    self.e = info['email']
    self.b = info['birthday']
    userinfoPage = requests.get(d.url + d.loc[8], params={'user': self.id}).text
    userinfoPage = bsp(userinfoPage, "html5lib")
    tag = userinfoPage.find('div', id="yijiejue")
    # it = tag.text
    # print(it)
    # print(type(it))
    # print(re.findall(re.compile(r'\d\d\d\d'), it))
    self.headled = re.findall(re.compile(r'\d\d\d\d'), tag.text)

def get_message():
    url = "http://www.freebuf.com"
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0"}
    res = requests.get(url, headers=header)
    soup = bsp(res.text, "html.parser")
    a = soup.find_all('div', class_="news-img")
    for i in a:
        url1.append(i.a['href'])
        title.append(i.img['title'])
    for i in range(len(title)):
        dic = {title[i]: url1[i]}
        dict.update(dic)
    # word = re.compile(u'\u62db\u8058|\u62db|\u62db\u4eba|\u8df3\u69fd|\u5b9e\u4e60|\u6316\u4eba|\u8bda\u62db')
    word = re.compile(u'招聘|诚聘|招人|招|聘|实习生|挖人')
    for key in dict:
        match = word.search(key)
        if match:
            pass
        else:
            dic = {key: dict[key]}
            result.update(dic)
    for key in result:
        file.write('<a href=')
        file.write('"')
        file.write(result[key])
        file.write('">')
        file.write(key)
        file.write('</a>')
        file.write('<br>')
    file.close()

def get_page(artist_name):
    artist_name = artist_name.replace(' ', '%20')
    base_url = 'http://www.lyrics.com'
    full_address = base_url + '/artist/' + artist_name
    page = rq.get(full_address)
    soup = bsp(page.content, 'html.parser')
    return soup

def scrapTodayYest():
    import requests
    from bs4 import BeautifulSoup as bsp
    a2m = lambda ahi: {
        'game': ahi.parent.h2.text,
        'time': ahi.previousSibling.previousSibling.text,
        'url': ahi['href'],
        'tvalue': ahi.parent.parent.h3.text
    }
    rh = requests.get('https://satta-king-fast.com/')
    sp = bsp(rh.text, 'html.parser')
    ahrefs = filter(lambda a: a.text == "Record Chart", sp.findAll('a'))
    temps = [[d['url'], d['game'], d['time'], d['tvalue']] for d in map(a2m, ahrefs)]
    rt = dict()
    for url, game, time, tv in temps:
        rt[game] = rt.get(game, {'url': urlpath(url), 'dt-val-ts': []})
        rt[game]['dt-val-ts'].append([tv, time])
    dates = [
        t.findAll('h1')[0].text.split()[-2].replace(',', '')
        for t in sp.findAll('table')[:2]
    ]
    for game in rt:
        rt[game]['dt-val-ts'] = list(
            map(lambda xy: [xy[0]] + xy[1], zip(dates, rt[game]['dt-val-ts'])))
    return rt

def parse_html(html):
    soup = bsp(html)
    position_soup = soup.find('ul', class_='item_con_list')
    for position_li in position_soup.find_all('li'):
        position_attr = position_li.attrs
        print position_attr.get('data-salary')
        print position_attr.get('data-company')

def organization_details():
    response = requests.get(myconstants.orgs_url)
    html = response.content
    soup = bsp(html, "html.parser")
    get_organizations = soup.findAll("li", {'class': 'organization-card__container'})
    final_result = fetch_details(get_organizations)
    print(final_result)
    return json.dumps(final_result)

def get_article(cls, page_url):
    print 'self.get_article()'
    response = requests.get(page_url, headers=cls.headers)
    soup = bsp(response.text, 'html.parser')
    info = soup.findAll('a', class_='question_link')
    article_names = [i.text.strip() for i in info]
    article_links = [cls.base_url + i['href'] for i in info]
    return zip(article_names, article_links)

def scraper(url):
    try:
        head = {'User-Agent': 'Mozilla/5.0'}
        req = Request(url, headers=head)
        html = urlopen(req)
        bsObj = bsp(html, "html.parser")
        # scrape raw tags first
        imgs = bsObj.findAll('img', {'class': 'th'})
        headlines = bsObj.findAll('h3', {'class': 'r'})
        links = bsObj.findAll('span', {'class': 'f'})
        dates = bsObj.findAll('div', {'class': 'slp'})
        stories = bsObj.findAll('div', {'class': 'st'})
        # print(len(imgs))
        # print(len(headlines))
        # print(len(links))
        # print(len(dates))
        # print(len(stories))
        # extract from scraped data;
        # each data array may not be the same length;
        # try to match domains to align arrays
        thumbs = [t['src'] for t in imgs]
        # if s is a hashed news headline link: s.split("=")[1].split("&")[0]
        hlinks = [h.a['href'].split("=")[1].split("&")[0] for h in headlines]
        datesources = [re.sub('\u200e', '', str(d.contents[0].contents[0])) for d in dates]
        hlines = [tagCleaner(h.a.contents) for h in headlines]
        for i in range(len(thumbs)):
            print(thumbs[i])
            print(hlinks[i])
            print(hlines[i])
            print(datesources[i])
        # for h in headlines:
        #     print(h.a['href'])
        #     print(tagCleaner(h.a.contents))
        #     print(type(h.a.contents[1]) == bs4.NavigableString)
        #     print(str(h.a.contents[1]))
        # f = open(fname, 'w')
        # for s in stories:
        #     f.write(str(s) + "\n")
        # f.close()
    except Exception as e:
        print(e)
    finally:
        print("*" * 25)

def findUrls(href, driver):
    driver.get(href)
    time.sleep(1)
    soup = bsp(driver.page_source, features='lxml')
    list_urls = soup.find_all('a', {"class": "bookTitle"}, itemprop='url')
    urls = []
    for url in list_urls:
        urls.append(url.get("href"))
    return urls

def get_sling_networks():
    data = "https://www.cnet.com/news/sling-tv-everything-you-need-to-know/"
    response = requests.get(data, timeout=5)
    content = bsp(response.content, "html.parser")
    data_container = content.find("div", class_="chartWrapper")
    target = data_container.findAll("th")
    for i in target:
        print i.getText().strip()

def scrapeHomePage():
    rh = requests.get('https://sattakingdarbar.com/')
    sp = bsp(rh.text, 'html.parser')
    ahrefs = filter(lambda a: a.text == "Record Chart", sp.findAll('a'))
    temps = {
        d['url']: [d['game'], d['time'], d['tvalue']]
        for d in map(a2m, ahrefs)
    }
    return temps

def cleanText(txt):
    text = ''
    txt = unicode(str(txt), errors='ignore')
    try:
        soup = bsp(txt, "html.parser")
        for e in soup.findAll(['script', 'style', 'form', 'meta', 'head']):
            e.extract()
        soup = bsp(str(soup), "html.parser")
        text = soup.get_text()
    except:
        print sys.exc_info()[0]
    text = re.sub(' +', ' ', text)
    text = re.sub('\t\t+', '\t', text)
    text = re.sub('\n\n+', '\n', text)
    return text

def get_answer_url(userid):
    base = "https://www.zhihu.com/people/"
    url = base + userid
    try:
        data = get_homepage_url_content(url)
    except:
        return []
    beautiful = bsp(data, "html.parser")
    url1 = beautiful.find_all("a", {"class": "question_link", "target": "_blank"})
    url2 = beautiful.find_all("div", {"class": "zm-profile-section-item zm-item clearfix",
                                      "data-type-detail": "member_voteup_answer"})
    try:
        last_datatime = url2[-1].get("data-time")
    except:
        return []
    # print url1
    url3 = beautiful.find_all("div", {"class": "zh-profile-account-status"})
    if url3 == []:
        isbanned = False
    else:
        isbanned = True
    max_len = 100
    ans_url = []
    while url2 != [] and len(ans_url) <= max_len:
        for content in url1:
            if content.get("href").find("answer") > 0:
                ans_url = ans_url + [content.get("href")]
            else:
                continue
        try:
            data = get_all_profile(userid, last_datatime)
        except:
            return ans_url
        beautiful = bsp(data, "html.parser")
        url1 = beautiful.find_all("a", {"class": "question_link", "target": "_blank"})
        url2 = beautiful.find_all("div", {"class": "zm-profile-section-item zm-item clearfix",
                                          "data-type-detail": "member_voteup_answer"})
        try:
            last_datatime = url2[-1].get("data-time")
        except:
            return ans_url
    return ans_url

def get_votersprofile_id(url):
    try:
        url1 = get_homepage_url_content(url)
        url2 = bsp(url1, "html.parser").find_all("a", {"class": "zg-anchor-hidden"})
    except:
        url2 = []
    if url2 == []:
        return 10
    voter_id = url2[0].get("name")[7:]
    return voter_id

def scraper(url):
    out = {}
    try:
        head = {'User-Agent': 'Mozilla/5.0'}
        req = Request(url, headers=head)
        html = urlopen(req)
        bsObj = bsp(html, "html.parser")
        # scrape raw tags first
        imgs = bsObj.findAll('img', {'class': 'th'})
        headlines = bsObj.findAll('h3', {'class': 'r'})
        links = bsObj.findAll('span', {'class': 'f'})
        dates = bsObj.findAll('div', {'class': 'slp'})
        stories = bsObj.findAll('div', {'class': 'st'})
        # print(len(imgs))
        # print(len(headlines))
        # print(len(links))
        # print(len(dates))
        # print(len(stories))
        # extract from scraped data;
        # each data array may not be the same length;
        # try to match domains to align arrays
        thumbs = [t['src'] for t in imgs]
        # if s is a hashed news headline link: s.split("=")[1].split("&")[0]
        hlinks = [h.a['href'].split("=")[1].split("&")[0] for h in headlines]
        datesources = [
            re.sub('\u200e', '', str(d.contents[0].contents[0])) for d in dates
        ]
        hlines = [tagCleaner(h.a.contents) for h in headlines]
        # for i in range(len(thumbs)):
        #     print(thumbs[i])
        #     print(hlinks[i])
        #     print(hlines[i])
        #     print(datesources[i])
        out = {
            'thumbs': thumbs,
            "hlinks": hlinks,
            "hlines": hlines,
            "dates": datesources
        }
    except Exception as e:
        print(e)
        out = {"Exception": e}
    finally:
        print("*" * 25)
    return out

def get_lyrics(url_list):
    list_lyrics = []
    for url in url_list:
        page = rq.get(url)
        soup = bsp(page.content, 'html.parser')
        container = soup.find('div', {"class": "lyric clearfix"})
        lyric_tags = container.findAll('pre', attrs={'id': 'lyric-body-text'})
        for tag in lyric_tags:
            lyric = tag.text
            list_lyrics.append(lyric)
    return list_lyrics

def save_all_img_src(url):
    # the argument is the URL of the first page
    try:
        html = requests.get(url=url)
        html.encoding = 'utf-8'
        sp = bsp(html.text, 'html.parser')
        title = sp.find('h1', {'class': 'title2'}).text
        with open(cur_path + 'list/' + title + '.txt', 'w') as src_file:
            while True:
                save_onepage_img_src(sp, src_file)
                # if there is a next page, update the url and continue
                if next_page_url(sp):
                    url = next_page_url(sp)
                    html = requests.get(url=url)
                    html.encoding = 'utf-8'
                    sp = bsp(html.text, 'html.parser')
                else:
                    break
    except BaseException:
        log_file.write(url + '\t' + '下一页获取失败' + '\n')

def main():
    root_url = 'http://www.asahi.com'
    url = 'http://www.asahi.com/politics/list/'
    res = requests.get(url)
    soup = bsp(res.text, 'lxml')
    urls = get_articles_urls(soup, (
        ('ul', 'List'),
        ('li', ''),
    ), link_class="SW")
    article_datas = []
    for u in urls:
        time.sleep(2)  # avoid hammering the server
        # the original checked startswith('http') and startswith('https'), which also
        # prefixed absolute http URLs; assumed intent: only prefix relative links
        if not u.startswith('http'):
            u = root_url + u
        res = requests.get(u)
        soup = bsp(res.text, 'lxml')
        article_data = ArticleData()
        article_data.title = get_title(soup, (
            ('div', 'ArticleTitle'),
            ('div', 'Title'),
            ('h1', ''),
        ))
        article_data.description = get_description(soup, (
            ('div', 'ArticleText'),
            ('p', ''),
        ))
        article_datas.append(article_data)
        print('fetched from {}'.format(u))
    for a in article_datas:
        print(a.title)
        print(a.description)
        print('\n-+-+-+-+-+-+-+-\n')

def get_max_page(cls):
    # print 'self.get_max_page()'
    try:
        response = requests.get(cls.oa_url, headers=cls.headers)
        soup = bsp(response.text, 'html.parser')
        # cast the page count to int
        max_page = int(soup.find_all('div', class_='w4_5')[-1].span.find_all('a')[-1].text)
        cls.max_page = max_page
        # print type(max_page)
        return max_page
    except:
        return -1

def get_html(url):
    tag = ("section", "comments")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    page_html = bsp(r.text, 'html.parser')
    html = page_html.find(tag[0], id=tag[1])
    return (html, url)

def scpeImage():
    browser_image = init_browser()
    url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser_image.visit(url_image)
    html_image = browser_image.html
    soup_image = bsp(html_image, 'html.parser')
    image = (soup_image.find_all(
        'div', class_='carousel_items')[0].a.get('data-fancybox-href'))
    images = 'https://www.jpl.nasa.gov' + image
    data_web['featImage'] = images
    browser_image.quit()
    return data_web

def get_urls(self):
    # collect the URLs of all sub-pages from the home page
    html = get_conn(self.base_url)
    soup = bsp(html, 'lxml')
    slider = soup.find('div', {'class': 'slider'})
    new_home_box = soup.find_all('div', {'class': 'new-home-box'})
    urls_obj = []
    for item in [slider] + new_home_box:
        urls_obj.extend(item.find_all('a'))
    permerfor_urls = map(lambda item: item.get('href'), urls_obj)
    return permerfor_urls

def get_xc_item2(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/55.0.2883.95 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    r.encoding = 'gb2312'
    # question-bank list HTML
    content = bsp(r.text, 'html.parser')
    return content

def get_soup(url=BASE_URL):
    # raw content; the original set done = True in a finally block, so it never retried
    # and could return with content undefined. Assumed intent: retry until the request succeeds.
    done = False
    while not done:
        try:
            content = requests.get(url).content
            done = True
        except:
            sleep(1)
    # soup
    return bsp(content, "lxml")

def get_text(url):
    html = requests.get(url).text
    print(html)
    soup = bsp(html, 'lxml')
    title = soup.find('h1', id='articleTitle').get_text()
    autor = soup.find('div', class_="content-th-info").find('a').get_text()
    article_content = soup.find('div', class_="document").find_all('p')
    all_p = [
        i.get_text() for i in article_content
        if not i.find('img') and not i.find('a')
    ]  # drop paragraphs that contain images or links
    article = '\n'.join(all_p)
    yield {"title": title, "autor": autor, "article": article}

def test():
    proxy = urllib2.ProxyHandler({'http': '177.43.212.44'})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
    response = urllib2.urlopen('https://en.wikipedia.org/wiki/apple')
    if response is not None:
        html = response.read()
        print html
    else:
        print 'None'
    # soup = bsp(open("wiki_files/123.html"))
    # the original only had the commented-out local-file line above, leaving soup undefined;
    # assumed intent: parse the page that was just fetched
    soup = bsp(html)
    print soup.title
    print soup.find_all('h2')

def getTitle(self, title, content):
    soup = bsp(content)
    hs = soup.findAll("h1")
    hs += soup.findAll("h2")
    lines = []
    for h in hs:
        lines.append(h.string)
    if len(lines) != 0:
        ret = self.getTitleFromLinesInHTags(title, lines)
        if ret and ret != "":
            return ret
    content, number = re.subn("<[\s\S]*?>", "", content)
    lines = content.split("\n")
    return self.getTitleFromLinesInAllContext(title, lines)

def get_totalpage(typeparam):
    bspurl = 'http://number.sungoin.cn/number/template/agents.do?param=findXuanHao' + typeparam
    s = requests.Session()
    s.mount("http://number.sungoin.cn", HTTPAdapter(
        max_retries=Retry(total=10, status_forcelist=[500, 503])))
    response = s.get(bspurl)
    soup = bsp(response.content, "html.parser")
    s = soup.find_all('script')
    patt = r'val\("[0-9]+"\)'
    re_pat = re.compile(patt)
    ll = s[2].text.replace('\n', "")
    search_ret = re_pat.search(ll)
    if search_ret:
        return int(search_ret.group().replace('val("', '').replace('")', ''))

def parse_voter_A(html):
    try:
        list = ""
        s = bsp(html, 'html.parser')
        list += s.a['href'][8:]
        list += ',' + s.a['title']
        list += ',' + s.span.text
        all = s.find_all('li')
        list += ',' + all[0].span.text.split(' ')[0]
        list += ',' + all[1].span.text.split(' ')[0]
        list += ',' + all[2].a.text.split(' ')[0]
        list += ',' + all[3].a.text.split(' ')[0]
    except:
        # print(html.encode('utf-8'))
        pass
    return list.split(",")

def addTechnicalSchools(self, cities):
    'Insert technical-school data'
    starttime = time.time()
    insert = 0
    mycursor = self.__mydb.cursor()
    try:
        for city in cities:
            citynumber, cityname = tuple(city.split(':'))
            rqtApi = self.rrtechnicalApi + citynumber + '.html'
            try:
                htmlhandle = urllib.request.urlopen(rqtApi)
            except Exception as e:
                self.log.write(time.asctime() + u'请求技校文档错误:' + str(e) + '\n')
            else:
                print('---下载%s数据成功---' % (cityname))
                htmldoc = htmlhandle.read().decode('utf-8')
                htmlhandle.close()
                btsp = bsp(htmldoc, 'html.parser')
                countieshtml = btsp.find_all('a', href="#highschool_anchor")
                counties = []
                for countyhtml in countieshtml:
                    counties.append([countyhtml.string.strip(),
                                     re.search(r'[0-9]{4,}', countyhtml['onclick']).group()])
                mycursor.execute(self.queryCityIdSql, (cityname,))
                cityid = mycursor.fetchone()['id']
                for county in counties:
                    mycursor.execute(self.queryCountyIdSql, (county[0], cityid))
                    try:
                        countyid = mycursor.fetchone()['id']
                    except Exception as e:
                        self.log.write('没有找到%s-->%s的id\n' % (cityname, county[0]))
                    else:
                        techshtml = btsp.select('ul[id$=' + county[1] + ']')
                        techshtml = techshtml[0].find_all('a') if isinstance(techshtml, list) and len(techshtml) else []
                        for tech in techshtml:
                            if isinstance(tech.string, str) and tech.string.strip():
                                # the original passed a bare string here; a one-element tuple is needed
                                mycursor.execute(self.queryTechnicalSql, (tech.string.strip(),))
                                if mycursor.fetchone()['num'] == 0:
                                    insert += 1
                                    print('插入技校%s--%s--%s' % (cityname, county[0], tech.string))
                                    mycursor.execute(self.insertTechnicalSql, (tech.string.strip(), countyid))
                                    self.__mydb.commit()
    except Exception as e:
        self.log.write(time.asctime() + str(e) + '\n')
    mycursor.execute(self.countTechnicalSql)
    countnum = mycursor.fetchone()['countnum']
    endtime = time.time()
    self.printExeResult(insert, endtime - starttime, countnum, '技校')

def get_answer_url(userid):
    base = "https://www.zhihu.com/people/"
    url = base + userid
    try:
        data = get_homepage_url_content(url)
    except:
        return []
    beautiful = bsp(data, "html.parser")
    url1 = beautiful.find_all("a", {"class": "question_link", "target": "_blank"})
    ans_url = []
    for content in url1:
        if content.get("href").find("answer") > 0:
            ans_url = ans_url + [content.get("href")]
        else:
            continue
    return ans_url

def hanle_request(typeparam, pageNo):
    arr = []
    bspurl = ('http://number.sungoin.cn/number/template/agents.do?param=findXuanHao&type='
              + typeparam + "&pageNo=" + str(pageNo) + "&pageSize=80")
    # bspurl = 'http://number.sungoin.cn/number/template/agents.do?param=findXuanHao&type=无规则&pageNo=' + str(pageNo) + '&pageSize=80'
    print bspurl
    s = requests.Session()
    # s.mount(bspurl, HTTPAdapter(max_retries=10))
    s.mount("http://number.sungoin.cn", HTTPAdapter(
        max_retries=Retry(total=10, status_forcelist=[500, 503])))
    response = s.get(bspurl)
    # response = s.get(bspurl)
    soup = bsp(response.content, "html.parser")
    tds = soup.find_all('td', attrs={"width": "110px"})
    for j in range(0, len(tds)):
        td = tds[j]
        s = typeparam + "," + td.text.strip() + "," + td.find_next_sibling().text.strip()
        # arr.append(s)
        levelparam = get_level(td.find_next_sibling().text.strip(), td.text.strip(), typeparam)
        (level, selfrule) = levelparam.split(":")
        update_sql_lite(td.text.strip(), level, selfrule, typeparam)

def filter_user(userid):
    base = "https://www.zhihu.com/people/"
    url = base + userid
    asks = "/people/" + userid + "/asks"
    answers = "/people/" + userid + "/answers"
    posts = "/people/" + userid + "/posts"
    collections = "/people/" + userid + "/collections"
    logs = "/people/" + userid + "/logs"
    try:
        data = get_homepage_url_content(url)
    except:
        # the original message was partially masked; assumed it reported the user id
        print "user:" + userid + " need to reconnect."
    beautiful = bsp(data, "html.parser")
    url1 = beautiful.find_all("a", {"class": "item", "href": asks})[0].span.string
    url2 = beautiful.find_all("a", {"class": "item", "href": answers})[0].span.string
    url3 = beautiful.find_all("a", {"class": "item", "href": posts})[0].span.string
    url4 = beautiful.find_all("a", {"class": "item", "href": collections})[0].span.string
    url5 = beautiful.find_all("a", {"class": "item", "href": logs})[0].span.string
    return [int(url1), int(url2), int(url3), int(url4), int(url5)]

def extractInfo(Link, domain):
    """Gets the link of the page, and returns Page title and Description of the Link
    works for -
        khan academy
        EdX
        udacity
        Udemy
    """
    LinkInfo = fs.Links()
    LinkInfo.link = Link
    try:
        response = urlopen(Link)
        html = response.read()
        sp = bsp(html)
        if domain != "mitocw":
            desc = sp.findAll(attrs={"name": "description"})
        else:
            desc = sp.findAll(attrs={"name": "Description"})
        LinkInfo.pagetitle = sp.title.text.decode("utf-8")
        LinkInfo.desc = desc[0]["content"].decode("utf-8")
    except Exception:
        LinkInfo.pagetitle = "pagetitle"
        LinkInfo.desc = "desc[:498]"
    return LinkInfo

# (fragment) tail of a URL filter list; the loop below crawls each URL and collects mailto: links
             'wiki', '.ps', '.exe', '.txt', '.pps', 'drupal', 'lab401']
url_list.append(url)
for i in url_list:
    print(i)
    print('Find', len(mail_list), 'results')
    if len(mail_list) >= 1000:
        break
    try:
        req = urllib.request.urlopen(i)
        # print(i)
        valid_url.append(i)
        s = bsp(req, "html.parser")
        # find <a href> tags in the page
        link = s.findAll('a', href=True)
        for tmp in link:
            # print(tmp['href'])
            buf = tmp['href']
            # mail
            if 'mailto:' in buf:
                if buf not in mail_list:
                    if len(buf) < 40 and '\n' not in buf:
                        mail_list.append(buf)
                        # print(buf)

def get_soup(url=BASE_URL):
    # raw content
    content = requests.get(url).content
    # soup
    return bsp(content, "lxml")

sqlitecu.execute("update four_hunder_number set is_sync = 1") sqliteconn.commit(); mysqlconn.commit() #关闭游标连接,释放资源 mysqlcurs.close() #关闭连接 mysqlconn.close() sqlitecu.close() sqliteconn.close() arr = []; types= []; bspurl = 'http://number.sungoin.cn/number/template/agents.do?param=findXuanHao'; response = requests.get(bspurl); soup = bsp(response.content,"html.parser"); numertypes = soup.find_all('div',attrs={"class":"guize_tit"}); # print len(numertypes) for m in range(0,len(numertypes)) : numbertype=numertypes[m]; aLinks=numbertype.find_next_sibling().find_all('a'); for h in range(0,len(aLinks)) : link = aLinks[h]; typeparam = link['href'].replace("javascript:sub('","").replace("')",""); if not typeparam.startswith('end'): continue; types.append(typeparam); total_page=get_totalpage("&type="+typeparam) for p in range(1,total_page): hanle_request(typeparam,p);