def get_info(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > a')
    times = soup.select('div.pc_temp_songlist > ul > li > span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        # Titles look like "singer - song"; entries without a dash get an empty song field.
        if title.get_text().find('-') > 0:
            data = {
                'rank': rank.get_text().strip(),
                'singer': title.get_text().split('-')[0].strip(),
                'song': title.get_text().split('-')[1].strip(),
                'time': time.get_text().strip(),
            }
        else:
            data = {
                'rank': rank.get_text().strip(),
                'singer': title.get_text().split('-')[0].strip(),
                'song': '',
                'time': time.get_text().strip(),
            }
        print(data)

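# The snippets in this section assume module-level requests/BeautifulSoup
# imports and a `headers` dict with a browser User-Agent. A minimal sketch of
# that shared setup plus a driver for the KuGou TOP500 scraper above; the
# 23-page URL pattern is an assumption about the site's rank pages.
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0 Safari/537.36'
}

if __name__ == '__main__':
    # 500 songs at ~22 per page gives 23 pages (assumed pattern).
    urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i)
            for i in range(1, 24)]
    for url in urls:
        get_info(url)
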
def get_movie_data(soup):
    for child in soup.find_all('div', class_="showtime_box"):
        # Title link inside each showtime box.
        print(child.div.a.get_text(strip=True))
        for length in child.find_all('div', class_="showtime_poster"):
            print(length.get_text(strip=True))
        for time in child.find_all('li'):
            print(time.get_text(strip=True))
        print()

def get_items_info(item_link):
    wb_data = requests.get(item_link, headers=headers)
    if wb_data.status_code == 200:  # check the page exists and the IP has not been blocked
        soup = BeautifulSoup(wb_data.text, 'lxml')
        titles = soup.select('.title-name')
        times = soup.select('.pr-5')
        types = soup.select('ul.det-infor > li:nth-of-type(1) > span > a')
        prices = soup.select('i.f22')
        adrs = soup.select('ul.det-infor > li:nth-of-type(3)')
        cates = soup.select('div.h-crumbs')
        qualities = soup.select('div.leftBox > div:nth-of-type(4) > div.det-summary > div > div')
        # zip order must match the loop variables: adrs, cates, qualities.
        for title, time, type, price, adr, cate, quality in zip(
                titles, times, types, prices, adrs, cates, qualities):
            items_data = {
                'title': title.get_text(),
                'times': time.get_text().split(),
                'type': type.get_text(),
                'price': price.get_text(),
                'adr': list(adr.stripped_strings),
                'qualities': list(quality.stripped_strings),
                'cate': cate.get_text()
            }
            items_info9.insert_one(items_data)
            print(items_data)

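# `items_info9` above is assumed to be a pymongo collection created at module
# level; a minimal sketch of that setup (database and collection names are
# guesses):
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
ganji = client['ganji']              # assumed database name
items_info9 = ganji['items_info9']   # assumed collection name
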
def get_message(url):
    wb_data = requests.get(url)
    id = (url.split('/')[-1])[:14]  # slice the item ID (first 14 characters) out of the URL
    views = get_totols(id)          # look up the view count for this item ID
    soup = BeautifulSoup(wb_data.text, 'lxml')
    categories = soup.select('#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')  # category
    titles = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.mainTitle > h1')  # title
    times = soup.select('#index_show > ul.mtit_con_left.fl > li.time')  # publish time
    prices = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul >'
                         ' li:nth-of-type(1) > div.su_con > span')  # price
    olds = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul >'
                       ' li:nth-of-type(2) > div.su_con > span')  # condition
    areas1 = soup.select('#content > div.person_add_top.no_ident_top >'
                         ' div.per_ad_left > div.col_sub.sumary > ul >'
                         ' li:nth-of-type(3) > div.su_con > span >'
                         ' a:nth-of-type(1)')  # area, part 1
    areas2 = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left >'
                         ' div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span > a:nth-of-type(2)')  # area, part 2
    for category, title, time, price, old, area1, area2 in zip(categories, titles, times, prices, olds, areas1, areas2):
        data = {
            '类目': category.get_text(),
            '标题': title.get_text(),
            '发布时间': time.get_text(),
            '价格': price.get_text(),
            '成色': old.get_text().strip(),
            '区域': area1.get_text() + '-' + area2.get_text(),
            '浏览量': views
        }
        print(data)
    return None

def kg_spider(url):
    '''
    Fetch the KuGou Music TOP500 and save it to MongoDB.
    :param url: request URL
    :return:
    '''
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    ranks = soup.select('.pc_temp_num')
    titles = soup.select('.pc_temp_songlist > ul > li > a')
    times = soup.select('.pc_temp_time')
    for rank, title, time in zip(ranks, titles, times):
        rank = rank.get_text().strip()
        song = title.get_text().split(' - ')[-1]
        singer = title.get_text().split(' - ')[0]
        song_time = time.get_text().strip()
        print(rank, song, singer, song_time)
        data = {
            'rank': rank,
            'song': song,
            'singer': singer,
            'song_time': song_time
        }
        storage_mongdb(data)
        print("---" * 20)

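# `storage_mongdb` is called above but not defined here; a minimal sketch of
# what it might look like with pymongo (client, database, and collection
# names are assumptions):
from pymongo import MongoClient

mongo_client = MongoClient('localhost', 27017)
top500 = mongo_client['kugou']['top500']  # assumed names

def storage_mongdb(data):
    # Insert one song record into the collection.
    top500.insert_one(data)
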
def check_results():
    URL = 'https://footballdatabase.com/ranking/'
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'
    }
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    linksPaginacao = soup.find(attrs={"pagination pagination-sm"})
    for link in linksPaginacao.findAll("a"):
        URL = 'https://footballdatabase.com' + link["href"]
        print(URL)
        page = requests.get(URL, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        tabelaDeDados = soup.find(attrs={"table-responsive"})
        for linha in tabelaDeDados.findAll("tr"):
            rank = linha.findAll(attrs={"rank"})
            time = linha.find(attrs={"limittext"})
            pais = linha.find(attrs={"sm_logo-name"})
            if time is not None:
                print(rank[0].get_text() + " - " + time.get_text() +
                      " (" + pais.get_text() + ")" + " ~ " + rank[1].get_text())

def kg_spider(url):
    rsp = requests.get(url, headers=headers)
    soup = BeautifulSoup(rsp.text, 'lxml')
    nums = soup.select('.pc_temp_num')
    titles = soup.select('.pc_temp_songlist > ul > li > a')
    times = soup.select('.pc_temp_time')
    for num, title, time in zip(nums, titles, times):
        data = {
            'num': num.get_text().strip(),                     # song rank number
            'song': title.get_text().split('-')[-1].strip(),   # song name
            'singer': title.get_text().split('-')[0].strip(),  # singer
            'time': time.get_text().strip()                    # song duration
        }
        songs_id = songs.insert(data)
        print(songs_id)

def get_zhaopin(page_num):
    url = "http://sou.zhaopin.com/jobs/searchresult.ashx"
    querystring = {
        "jl": "%e5%8c%97%e4%ba%ac",  # city: Beijing (URL-encoded)
        "kw": "python",
        "sm": "0",
        "sg": "2d8d7bd1731e4c06a4fbbb6aa50d7eb6",
        "p": page_num
    }
    print("Page {}".format(page_num))
    response = requests.request("GET", url, headers=headers, params=querystring).content
    soup = BeautifulSoup(response, 'lxml')
    job_name = soup.select("div#newlist_list_content_table td.zwmc a")
    salarys = soup.select("div#newlist_list_content_table td.zwyx")
    locations = soup.select("div#newlist_list_content_table td.gzdd")
    times = soup.select("div#newlist_list_content_table td.gxsj span")
    for name, salary, location, time in zip(job_name, salarys, locations, times):
        data = {
            '职位名称': name.get_text(),
            '薪资范围': salary.get_text(),
            '工作地点': location.get_text(),
            '更新时间': time.get_text(),
        }
        print(data)
        with open("Python招聘职位信息.json", "a", encoding="utf-8") as f:
            f.write(json.dumps(data, ensure_ascii=False) + "\n")  # one JSON object per line

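# get_zhaopin relies on the module-level `headers` sketched earlier plus a
# `json` import; a minimal driver, with the 10-page range an arbitrary choice:
import json

for page_num in range(1, 11):
    get_zhaopin(page_num)
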
def forbes_fetch_links(string):
    url = "http://www.forbes.com"
    pages = []
    to_return = []
    pages.append(url + "/search/?q=" + '"' + string.replace(' ', '+') + '"')
    soup = crawl(pages[0])
    pages.pop()
    for page in soup.find_all('li', attrs={'class': 'page'}):
        page = page.findChild('a')
        try:
            pages.append(url + page.get('href'))
        except Exception:
            print()
    pages.reverse()
    while pages:
        soup = crawl(pages.pop())
        links = soup.find_all('h2')
        times = soup.find_all('time', attrs={'class': 'date'})
        for link, time in zip(links, times):
            link = link.findChild('a')
            try:
                # Keep only articles dated 2008 or earlier.
                if int(time.get_text()[-4:]) <= 2008:
                    to_return.append(link.get('href'))
            except Exception:
                print()
    return to_return

def get_attractions(url):
    import time
    wb_data = requests.get(url)
    time.sleep(2)  # the loop variable below shadows the time module, so sleep here
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select("body > div.main.auto-width > ul > li > a > p.g-name")
    persons = soup.select("body > div.main.auto-width > ul > li > a > p.performerName")
    prices = soup.select("body > div.main.auto-width > ul > li > a > p.g-price > b")
    times = soup.select("body > div.main.auto-width > ul > li > a > p.g-time")
    places = soup.select("body > div.main.auto-width > ul > li > a > p.g-place.a-link")
    images = soup.select("body > div.main.auto-width > ul > li > a > div > img")
    print(images)
    for title, person, price, time, place, image in zip(titles, persons, prices, times, places, images):
        data = {
            'title': title.get_text().replace("\t", "").replace("\n", "").replace("独家", "").replace("【秀动呈献】", ""),
            'person': person.get_text().replace("\t", "").replace("\n", ""),
            'price': price.get_text(),
            'time': time.get_text().replace("\t", "").replace("\n", ""),
            'place': place.get_text(),
            'image': image.get('original')
        }
        print(data)

def get_info(url):
    wb_data = requests.get(url, headers=headers)
    print(wb_data)
    wb_data.encoding = wb_data.apparent_encoding
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Comparing copied selectors such as
    # "#rankWrap > div.pc_temp_songlist > ul > li:nth-child(1) > span.pc_temp_num"
    # shows the leading parts are identical, so a short suffix is enough.
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.text.strip(),  # strip() removes the surrounding newlines
            'singer': title.text.split('-')[0],  # split() breaks the string on a separator and returns a list
            'song': title.text.split('-')[1],
            'time': time.get_text().strip(),
        }
        print(data)

def kugou_spider(url):
    '''Fetch the KuGou Music TOP500 and save it to MongoDB.'''
    rsp = requests.get(url, headers=headers)
    soup = BeautifulSoup(rsp.text, 'lxml')
    ranks = soup.select('.pc_temp_num')  # rank numbers
    titles = soup.select('.pc_temp_songlist > ul > li > a')
    times = soup.select('.pc_temp_time')
    for rank, title, time in zip(ranks, titles, times):
        rank = rank.get_text().strip()                   # song rank
        song = title.get_text().split('-')[-1].strip()   # song name
        singer = title.get_text().split('-')[0].strip()  # singer
        song_time = time.get_text().strip()              # song duration
        data = {
            'rank': rank,
            'song': song,
            'singer': singer,
            'song_time': song_time
        }
        songs_id = songs.insert(data)
        print(songs_id)

def get_position_results(url):
    ab_urls = 'https://www.zhipin.com'
    headers = {
        'Cookie': 'lastCity=101210100; JSESSIONID=""; __c=1533020202; __g=-; __l=l=%2Fwww.zhipin.com%2F&r=https%3A%2F%2Fwww.google.com%2F; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1532073970,1532323266,1532326685,1533020202; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1533021779; __a=15341501.1528789420.1532323266.1533020202.44.5.24.44; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fc101210100%2Fh_101210100%2F%3Fquery%3D%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598%26page%3D2',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    job_title = soup.select('div[class="job-title"]')
    job_salary = soup.select('span[class="red"]')
    job_info = soup.select('div.info-primary > p')
    company = soup.select('div.company-text > h3.name > a')
    company_info = soup.select('div.company-text > p')
    publish_time = soup.select('div.info-publis > p')
    details = soup.select('div.info-primary > h3.name > a')
    position_results = []
    for title, salary, info, comp, comp_info, time, detail in zip(
            job_title, job_salary, job_info, company, company_info, publish_time, details):
        title = title.get_text()
        salary = salary.get_text()
        job_info = info.get_text()
        company = comp.get_text()
        company_info = comp_info.get_text()
        publish_time = time.get_text()
        job_url = ab_urls + detail.get('href')
        lst = [title, salary, job_info, company, company_info, publish_time, job_url]
        position_results.append(lst)
    return position_results

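# A possible way to persist get_position_results; the file name and column
# labels here are illustrative choices, not part of the original:
import csv

def save_position_results(url, path='positions.csv'):
    rows = get_position_results(url)
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'salary', 'job_info', 'company',
                         'company_info', 'publish_time', 'job_url'])
        writer.writerows(rows)
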
def get_zhaopin_1(page):
    url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=深圳&kw=python&p={0}&kt=3'.format(page)
    print("Page {0}".format(page))
    wbdata = requests.get(url).content
    soup = BeautifulSoup(wbdata, 'lxml')
    job_name = soup.select("table.newlist > tr > td.zwmc > div > a")
    salarys = soup.select("table.newlist > tr > td.zwyx")
    locations = soup.select("table.newlist > tr > td.gzdd")
    times = soup.select("table.newlist > tr > td.gxsj > span")
    for name, salary, location, time in zip(job_name, salarys, locations, times):
        url = name['href']
        wbdata = requests.get(url).content
        soup = BeautifulSoup(wbdata, 'lxml')
        gsmc = soup.select('div.inner-left > h2')  # company name on the detail page
        if len(gsmc) > 0:
            company = gsmc[0].get_text()
        else:
            company = ''
        data = {
            'name': name.get_text(),
            'company': company,
            'salary': salary.get_text(),
            'location': location.get_text(),
            'time': time.get_text(),
            'url': name['href']
        }
        print(data)

def get_info(url):
    db = pymysql.connect(host='127.0.0.1', user='******', password='******',
                         port=3306, db='spiders')
    cursor = db.cursor()
    sql = ('create table if not exists kugoulist('
           'ranks VARCHAR(255) NOT NULL, '
           'singer VARCHAR(255) NOT NULL, '
           'song VARCHAR(255) NOT NULL, '
           'time VARCHAR(255) NOT NULL)')
    cursor.execute(sql)
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'ranks': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': time.get_text().strip()
        }
        # Build a parameterized INSERT from the dict keys.
        table = 'kugoulist'
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(
            table=table, keys=keys, values=values)
        try:
            if cursor.execute(sql, tuple(data.values())):
                print('Successful')
                db.commit()
        except Exception:
            print('Failed')
            db.rollback()
    db.close()

def get_page(html):
    soup = BeautifulSoup(html, 'lxml')
    for content, time in zip(soup.find_all('span', class_="ctt"),
                             soup.find_all('span', class_="ct")):
        print(content.get_text(), time.get_text())
        # Parameterized INSERT instead of string concatenation, which breaks
        # on quotes in the post text and is open to SQL injection.
        query = 'insert into content(微博内容, 发布时间) values (%s, %s);'
        cur.execute(query, (content.get_text(), time.get_text()))
        conn.commit()

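# `cur` and `conn` above are assumed to be module-level pymysql handles; a
# minimal sketch of that setup (credentials and database name are
# placeholders):
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='******',
                       port=3306, db='weibo', charset='utf8mb4')
cur = conn.cursor()
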
def kg_spider(url): """ 获取酷狗音乐top500, 保存至mongodb :param url: 请求地址 :return: """ data = {} res = requests.get(url, headers=headers) # print(res.text) soup = BeautifulSoup(res.text, "lxml") ranks = soup.select(".pc_temp_num") # print(ranks) titles = soup.select(".pc_temp_songlist > ul > li > a") # print(titles) times = soup.select(".pc_temp_time") # print(times) for rank, title, time in zip(ranks, titles, times): rank = rank.get_text().strip() # print(rank) # 歌曲名称 song = title.get_text().split("-")[-1].strip() # 歌手 singer = title.get_text().split("-")[0].strip() href = title["href"] req = requests.get(href, headers=headers) soup = BeautifulSoup(req.text, "lxml") mp3s = soup.select(".mainPage") print(mp3s) mp3 = mp3s["src"] print(mp3) song_time = time.get_text().strip() # print(rank, song, singer, song_time) data = { "rank": rank, "singer": singer, "song": song, "time": song_time, "href": href, } # data = { # "rank": rank.get_text().strip(), # "singer": title.get_text().split("-")[0].strip(), # "song": title.get_text().split("-")[-1].strip(), # "time": time.get_text().strip(), # "href": title["href"], # } # songs_id = songs.insert(data) return data
def get_info(url):
    # Fetch and parse the page.
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': time.get_text().strip()
        }
        print(data)

def get_info(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'title': title.get_text().strip(),  # full "singer - song" string
            'time': time.get_text().strip()
        }
        print(data)

def get_info(url):
    # headers must be passed as a keyword; the second positional argument of
    # requests.get is params, not headers.
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],  # split the title into singer and song
            'time': time.get_text().strip()
        }
        print(data)  # print each record as a dict

def get_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'html.parser')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': time.get_text().strip()
        }
        print(data)

def get_info(url):
    wb_data = requests.get(url, headers=comm.headers)
    soup = BeautifulSoup(wb_data.text, "lxml")
    ranks = soup.select("span.pc_temp_num")
    titles = soup.select("div.pc_temp_songlist > ul > li > a")
    times = soup.select("span.pc_temp_tips_r > span")
    for rank, title, time in zip(ranks, titles, times):
        data = {
            "rank": rank.get_text().strip(),
            "singer": get_song(title.get_text().split('-'), 0),
            "song": get_song(title.get_text().split('-'), 1),
            "time": time.get_text().strip()
        }
        print("result:", data)

def get_info(url): wb_data = requests.get(url, headers=headers) soup = BeautifulSoup(wb_data.text, 'lxml') ranks = soup.select("span.pc_temp_num") titles = soup.select("#rankWrap > div.pc_temp_songlist > ul > li") times = soup.select("span.pc_temp_tips_r > span") for rank, title, time in zip(ranks, titles, times): data = { 'rank': rank.get_text().strip(), 'singer': title.get("title").split("-")[0], 'song': title.get("title").split("-")[1], 'time': time.get_text().strip() } print(data)
def get_info(url, counts):
    """Extract the target elements from the page and write them to the output file."""
    try:
        wb_data = requests.get(url, headers=headers)  # assumes a module-level headers dict
        econding_type = wb_data.encoding
        # Page encodings are inconsistent, so re-decode ISO-8859-1 responses.
        if econding_type == 'ISO-8859-1':
            r = wb_data.text.encode('ISO-8859-1').decode(
                requests.utils.get_encodings_from_content(wb_data.text)[0])
            soup = BeautifulSoup(r, 'lxml')
        else:
            soup = BeautifulSoup(wb_data.text, 'lxml')  # UTF-8
        # Grab the target elements.
        titles = soup.select('body > section > div > div > div > header > h1 > a')
        genre_films = soup.select('body > section > div > div > div > header > ul > li > a')
        times = soup.select('body > section > div > div > div > header > ul > li:nth-child(2)')
        imgs = soup.select('body > section > div > div > div > article > div.video_box > div.video_img > img')
        url_paths = soup.select('body > section > div > div > div > header > h1 > a')
        video_infos = soup.select('body > section > div > div > div > article > div.video_box > div.video_info')
        for title, genre_film, time, img, url_path, video_info in zip(
                titles, genre_films, times, imgs, url_paths, video_infos):
            data = [
                title.get_text().strip(),
                genre_film.get_text().strip(),
                time.get_text(),
                img.get("src"),
                url_path.get("href"),
                video_info.get_text().replace('\n', ',').replace(' / ', '/')
            ]
            content = "{}{}".format(counts, str(data))  # convert to str so it can be written to the text file
            out_put_file(counts, content)  # file-writing helper defined elsewhere
    except Exception:
        print('Error,pass,get_info')

def get_info(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    print(titles)
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': time.get_text().strip()
        }
        print(data)

def get_info(links):
    for link in links:
        r = requests.get(link, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        titles = soup.select('div.pc_temp_songlist > ul > li > a')
        ranks = soup.select('span.pc_temp_num')
        times = soup.select('span.pc_temp_time')
        for rank, title, time in zip(ranks, titles, times):
            data = {
                'rank': rank.get_text().strip(),
                'singer': title.get_text().strip().split('-')[0],
                'name': title.get_text().strip().split('-')[1],
                'time': time.get_text().strip()
            }
            print(data)

def get_info(url):
    wb_data = requests.get(url, headers=headers)  # fetch the page
    soup = BeautifulSoup(wb_data.text, 'lxml')    # parse it so elements can be filtered/extracted later
    ranks = soup.select('span.pc_temp_num')       # locate the song rank elements
    titles = soup.select('div.pc_temp_songlist > ul > li > a')  # locate the song title elements
    times = soup.select('span.pc_temp_tips_r > span')           # locate the song duration elements
    for rank, title, time in zip(ranks, titles, times):
        data = {
            "rank": rank.get_text().strip(),
            "singer": title.get_text().split('-')[0],
            "song": title.get_text().split('-')[1],
            "time": time.get_text().strip()
        }
        print(data)

def get_rank_info(self, url):
    res = requests.get(url, headers=self.headers)
    res.encoding = self.encoding  # keep the read encoding consistent with the write encoding
    soup = BeautifulSoup(res.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('a.pc_temp_songname')
    times = soup.select('span.pc_temp_time')
    ids = [song['href'].split('/')[-1].split('.')[0]
           for song in soup.find_all('a', 'pc_temp_songname')]
    song_list = [(rank.get_text().strip(),
                  title.get_text().strip().split('-')[0].strip(),
                  title.get_text().strip().split('-')[1].strip(),
                  time.get_text().strip(),
                  song_id)
                 for rank, title, time, song_id in zip(ranks, titles, times, ids)]
    return song_list

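# get_rank_info is written as a method (it takes self and reads self.headers
# and self.encoding); a minimal sketch of a host class, with the attribute
# values below being assumptions:
class KugouSpider:
    encoding = 'utf-8'
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Bind the module-level function above as a method of this class.
    get_rank_info = get_rank_info
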
def top(url):
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    nos = soup.select('.pc_temp_num')
    titles = soup.select('.pc_temp_songname')
    hrefs = soup.select('.pc_temp_songname')
    times = soup.select('.pc_temp_time')
    for no, title, time, href in zip(nos, titles, times, hrefs):
        data = {
            'NO': no.get_text().strip(),
            'titles': title.get_text(),
            'time': time.get_text().strip(),
            'href': href.get('href')
        }
        print(data)

def get_info(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    # Iterate the three result lists together so title and time are defined
    # alongside each rank.
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': time.get_text().strip()
        }
        print(data)

def get_info(url, file):
    res = requests.get(url, headers=headers)
    res.encoding = file.encoding  # keep the read encoding consistent with the write encoding
    soup = BeautifulSoup(res.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('a.pc_temp_songname')
    times = soup.select('span.pc_temp_time')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'title': title.get_text().strip(),
            'time': time.get_text().strip()
        }
        # Fixed-width columns for aligned output.
        string = "{: <10}{: <30}{: <10}\n".format(data['rank'], data['title'], data['time'])
        file.write(string)

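# Possible usage for the file-writing variant above; the output file name and
# the single rank-page URL are illustrative:
with open('kugou_top500.txt', 'w', encoding='utf-8') as f:
    get_info('https://www.kugou.com/yy/rank/home/1-8888.html', f)
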
def get_info(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('div.pc_temp_songlist > ul > li > span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    # This tag sits under the same parent as the one above, so only the
    # trailing part of the selector path is needed.
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': time.get_text().strip()
        }
        print(data)
        writer.writerow((data['rank'], data['singer'], data['song'], data['time']))

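# `writer` above is assumed to be a module-level csv.writer; a minimal sketch
# of that setup (the file name and header row are placeholders):
import csv

fp = open('kugou_top500.csv', 'w', newline='', encoding='utf-8')
writer = csv.writer(fp)
writer.writerow(('rank', 'singer', 'song', 'time'))
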
def get(str, HEADERS, csv_write, out, proxy_info):
    row = []
    result_req = requests.get(str, headers=HEADERS, proxies=proxy_info)
    resultsoup = BeautifulSoup(result_req.text, features='lxml')
    title = resultsoup.find('h2', {"class": "citation__title"})
    name_find = resultsoup.find('div', {'class': 'accordion-tabbed'})
    abstract_a = resultsoup.find('div', {"class": "article-section__content en main"})
    time = resultsoup.find('span', {"class": "epub-date"})
    if title is None:
        row.append('null')
    else:
        row.append(title.get_text())
    if name_find is None:
        row.append('null')
    else:
        # Collect author names, skipping the e-mail spans.
        author_list = ''
        for i in name_find.find_all('span'):
            name = i.get_text()
            if re.search('E-mail', name) is None:
                if author_list == '':
                    author_list = name
                else:
                    author_list = author_list + ';' + name
        row.append(author_list)
    if time is None:
        row.append('null')
    else:
        row.append(time.get_text())
    if abstract_a is None:
        row.append('null')
    else:
        abstract_b = abstract_a.find('p')
        row.append(abstract_b.get_text())
    print(row)
    csv_write.writerow(row)
    out.flush()

def get(str, HEADERS, csv_write, out, proxy_info):
    row = []
    result_req = requests.get(str, headers=HEADERS, proxies=proxy_info)
    resultsoup = BeautifulSoup(result_req.text, features='lxml')
    title = resultsoup.find('header', {"class": "publicationContentTitle"})
    name_find = resultsoup.find_all('span', {"class": "contrib-author"})
    abstract_a = resultsoup.find('div', {"class": "abstractSection abstractInFull"})
    time = resultsoup.find('div', {"class": "publicationContentEpubDate dates"})
    if title is None:
        row.append('null')
    else:
        title_a = title.find('h3')
        row.append(title_a.get_text())
    if not name_find:  # find_all returns an empty list, not None, when nothing matches
        row.append('null')
    else:
        author_list = ''
        for i in name_find:
            for j in i.find_all('a'):
                if j.get_text():
                    author_list = author_list + ' ' + j.get_text()
        row.append(author_list)
    if time is None:
        row.append('null')
    else:
        row.append(time.get_text())
    if abstract_a is None:
        row.append('null')
    else:
        abstract_b = abstract_a.find('div')
        row.append(abstract_b.get_text())
    print(row)
    csv_write.writerow(row)
    out.flush()

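# Both `get` variants above expect a CSV writer, an open file handle, and a
# requests-style proxies dict; a minimal sketch of that shared caller setup
# (the proxy address, file name, and article URL are placeholders):
import csv

HEADERS = {'User-Agent': 'Mozilla/5.0'}
proxy_info = {'http': 'http://127.0.0.1:8080',
              'https': 'http://127.0.0.1:8080'}
out = open('articles.csv', 'a', newline='', encoding='utf-8')
csv_write = csv.writer(out)
get('https://example.com/article', HEADERS, csv_write, out, proxy_info)
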
def sniffingThread(self, button, devstore, filter, count, time):
    modele = devstore.get_model()
    est_actif = devstore.get_active()
    dev = ""
    # get_active() returns -1 when no device is selected.
    if est_actif >= 0:
        dev = modele[est_actif][0]
    self.log.info(_("Launching sniff process on dev {0} with : count={1}, timeout={2}, filter=\"{3}\"").format(
        dev, count.get_text(), time.get_text(), filter.get_text()))
    sniffer = pcapy.open_live(dev, 1024, False, int(time.get_text()))
    try:
        sniffer.setfilter(filter.get_text())
    except Exception:
        self.log.warn(_("The provided filter is not valid (it should respect the BPF format)"))
        button.set_sensitive(True)
        return
    sniffer.loop(int(count.get_text()), self.packetHandler)
    button.set_sensitive(True)

movies = soup.find_all("div", attrs={"class": "single-item single-film"})
for movie in movies:
    movieTitle = movie.find("a", attrs={"class": "filmInfoLink"}).get_text()
    try:
        movieRuntime = movie.find("span", attrs={"class": "runtime"}).get_text().strip()
    except AttributeError:
        movieRuntime = ""
    movieLength = movie.find("span", attrs={"class": "length"}).get_text().strip()
    movieGenre = movie.find("span", attrs={"class": "genre"}).get_text().strip()
    print("Movie: " + movieTitle)
    print("Runtime: " + movieRuntime)
    print("Length: " + movieLength)
    print("Genre: " + movieGenre)
    # Times
    runningTimesTable = movie.find("table", attrs={"class": "times times-single-day"})
    runningTimes = runningTimesTable.find_all("tr")
    print("Running Times:")
    print("****************************")
    for row in runningTimes:
        movie2d3d = row.find("th").get_text().strip()
        print(movie2d3d)
        for time in row.find_all("a", attrs={"class": "btn-runningtime"}):
            movieTime = time.get_text().strip()
            print(movieTime)
        print("++++++++++++++++++++++++++++")
    print("============================\n")