# Django management command that scrapes every fact page on mima.co.il and
# populates the Artist/Song/Fact tables. The imports and the Command wrapper
# are assumed to follow the standard django-admin command layout.
import requests
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand

from facts.models import Artist, Song, Fact


class Command(BaseCommand):
    help = 'Scrapes mima.co.il fact pages into the Artist/Song/Fact tables'

    def handle(self, *args, **options):
        for num in range(1, 1451):
            url = 'https://www.mima.co.il/fact_page.php?song_id={}'.format(num)
            html_content = requests.get(url).text
            soup = BeautifulSoup(html_content, "html.parser")
            # The song title and artist name are the only <font> tags on the
            # page with these size attributes.
            infoSongArt = soup.find_all('font', {'size': ['+5', '+2']})
            artist, _ = Artist.objects.get_or_create(name=infoSongArt[1].text)
            song, _ = Song.objects.get_or_create(name=infoSongArt[0].text,
                                                 artist=artist)
            print(Artist.objects.all())
            print(Song.objects.all())
            # Fact rows alternate between these two background colors.
            facts = soup.find_all("tr", {'bgcolor': ['#CCFFCC', '#EDF3FE']})
            print(facts)
            for fact_row in facts:
                # Split on the Hebrew credit line ('written by'); facts
                # without one fall back to the 'anonymous' author.
                split_txt = fact_row.text.strip().split('נכתב ע"י')
                fact_data = split_txt[0]
                try:
                    author_data = split_txt[1]
                except IndexError:
                    author_data = 'אנונימי'
                Fact.objects.get_or_create(message=fact_data,
                                           author=author_data,
                                           song=song)
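# Usage sketch: Django derives the command name from whatever this file is
# called under <app>/management/commands/ (assumed here to be scrape_facts.py):
#
#   python manage.py scrape_facts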
def search(quarter, dept):
    global soup
    global cookie
    # Replay ASP.NET's hidden form fields (__VIEWSTATE etc.) plus the two
    # dropdown selections and the image-button coordinates GOLD expects.
    form_hidden = [
        [x['name'], x['value']]
        for x in soup.find_all('input') if x.get('type') == 'hidden'
    ] + [['ctl00%24pageContent%24quarterDropDown', str(quarter)],
         ['ctl00%24pageContent%24subjectAreaDropDown', str(dept)],
         ['ctl00%24pageContent%24searchButton.x', '0'],
         ['ctl00%24pageContent%24searchButton.y', '0']]
    head = {'Content-Type': 'application/x-www-form-urlencoded'}
    # URL-encode each value and assemble the body by hand so the already
    # percent-encoded field names are not encoded a second time.
    form = [
        x[0] + '=' + requests.utils.quote(x[1], safe='') for x in form_hidden
    ]
    form_body = '&'.join(form)
    course_url = 'https://my.sa.ucsb.edu/gold/BasicFindCourses.aspx'
    # Return an unsent grequests request so callers can batch searches.
    greq = grequests.post(course_url, headers=head, data=form_body,
                          cookies=cookie)
    return greq
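# Minimal usage sketch. It assumes the globals search() relies on are seeded
# from an initial GET of the same page; the quarter and department codes
# below are placeholders, not taken from the original:
import grequests
import requests
from bs4 import BeautifulSoup

first = requests.get('https://my.sa.ucsb.edu/gold/BasicFindCourses.aspx')
soup = BeautifulSoup(first.text, 'html.parser')
cookie = first.cookies

# Batch several department searches for one quarter and send them concurrently.
reqs = [search('20224', dept) for dept in ('CMPSC', 'MATH', 'PHYS')]
responses = grequests.map(reqs)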
def getLinks(source):
    links_list = []
    # Read the raw page bytes; BeautifulSoup detects the encoding itself.
    content = urllib.request.urlopen(source)
    page = content.read()
    content.close()
    soup = BeautifulSoup(page, features="html.parser")
    # Keep only anchors whose markup mentions the stories section.
    for link in soup.find_all('a'):
        if "premchand-stories" in str(link):
            links_list.append(str(link.get('href')))
    return links_list
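# Usage sketch; the index URL is an assumption, not taken from the original:
import urllib.request
from bs4 import BeautifulSoup

for href in getLinks('https://premchand.co.in/stories'):
    print(href)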
UA = {
    'User-Agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 '
    '(KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
}

# Walk the open-issue list pages (range(1) only covers the first page; widen
# the range to crawl more).
for page in range(1):
    url = ('https://github.com/microsoft/vscode/issues'
           '?page=%s&q=is%%3Aissue+is%%3Aopen' % page)
    # Fetch the page source with a browser User-Agent header so GitHub
    # serves the full HTML.
    req = requests.get(url, headers=UA)
    soup = BeautifulSoup(req.text, 'html.parser')
    # Every issue title on the list page is an <a> with these classes.
    xml = soup.find_all(
        'a',
        class_='link-gray-dark v-align-middle no-underline h4 js-navigation-open')
    # Follow each issue link and parse its detail page.
    for i in range(len(xml)):
        t3 = xml[i].get('href')
        url1 = 'https://github.com' + str(t3)
        print(url1)
        req1 = requests.get(url1, headers=UA)
        soup1 = BeautifulSoup(req1.text, 'html.parser')
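# Sketch: the anchors matched above already carry the issue titles, so the
# list page alone is enough to print title/URL pairs without the per-issue
# requests:
for a_tag in xml:
    print(a_tag.text.strip(), 'https://github.com' + a_tag.get('href'))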
# Image crawling
body = driver.find_element_by_tag_name('body')
# Click the area that lets you sort by popularity / newest:
# driver.find_element_by_xpath('//paper-button[@class="dropdown-trigger style-scope yt-dropdown-menu"]').click()
# Click the "sort by popularity" entry:
# driver.find_element_by_xpath('//paper-listbox[@class="dropdown-content style-scope yt-dropdown-menu"]/a[1]').click()
page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')
# comments = soup.find_all('yt-formatted-string', attrs={'class': 'style-scope ytd-comment-renderer'})
cmmt_box = soup.find_all(attrs={'id': 'wrap'})
# real = soup.find('video')
# real = real.get('src')
# print(real)
# Scratch XPaths noted while exploring the ranking table:
# //*[@id="container"]/div/div/div[3]/div[1]/table/tbody/tr[1]/td[2]/dl/dt/a/text()
# //*[@id="container"]/div/div/div[3]/div[1]/table/tbody/tr[1]/td[3]
# //*[@id="container"]/div/div/div[3]/div[1]/table/tbody/tr[2]/td[2]/dl/dt/a/text()

from collections import OrderedDict
import json

data = OrderedDict()
dojangRank = []
reboot1DojangRank = []
reboot2DojangRank = []
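# Hedged sketch: the body handle grabbed above is commonly used to scroll so
# lazy-loaded comments render; this loop would run before page_source is read.
# The scroll count and delay are assumptions, not taken from the original.
from selenium.webdriver.common.keys import Keys
import time

for _ in range(5):
    body.send_keys(Keys.END)   # jump to the bottom to trigger more loading
    time.sleep(1)              # give the page time to fetch the next batch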
import requests
from bs4 import BeautifulSoup

from facts.models import Artist, Song, Fact

# One-off exploration script against a single fact page.
url = 'https://www.mima.co.il/fact_page.php?song_id=969'
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "html.parser")
infoSongArt = soup.find_all('font', {'size': ['+5', '+2']})
artist_name = Artist(name=infoSongArt[0].text)
artist_name.save()
song_name = Song(name=infoSongArt[1].text, artist=artist_name)
song_name.save()
print(Artist.objects.all())
print(Song.objects.all())
# Fact rows alternate between these two background colors.
infoFactAut = [
    element.text
    for element in soup.find_all('tr', {'bgcolor': ['#CCFFCC', '#EDF3FE']})
]
print(infoFactAut)
# Split each row into (fact text, author) on the Hebrew credit line.
factim = []
for f in infoFactAut:
    factim.append(f.strip().replace('\r\n', '').split('נכתב ע"י'))
print(factim)
print(soup.prettify())
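# Sketch: persisting the split pairs, mirroring the anonymous-author fallback
# used by the management command above:
for pair in factim:
    author = pair[1] if len(pair) > 1 else 'אנונימי'
    Fact.objects.get_or_create(message=pair[0], author=author, song=song_name)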
import os
import re
import time
from io import BytesIO

import requests
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
from PIL import Image

# basePath and getHtml() are assumed to be defined elsewhere in the script.


def getImg(html, path):
    # Retry transient connection failures instead of dying mid-crawl.
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=10))
    s.mount('https://', HTTPAdapter(max_retries=10))
    # Collect image URLs referenced from inline CSS url(...) values.
    imgre = re.compile(r'url\((.*?)\)')
    imglist = imgre.findall(html)
    # ...plus every <img src="...">.
    soup = BeautifulSoup(html, 'html.parser')
    for img in soup.find_all('img'):
        imglist.append(img['src'])
    # Follow each linked stylesheet and harvest its url(...) references too.
    patterncss = '<link.*?href="(.*?)"'
    for href in re.compile(patterncss, re.S).findall(html):
        if href.find('http') < 0:
            href = basePath + href
        tmpHtml = getHtml(href)
        imglist.extend(re.compile(r'url\((.*?\.(?:png|jpg))\)').findall(tmpHtml))
    if not os.path.isdir(path):
        os.makedirs(path)
    for imgurl in imglist:
        if imgurl.find('.svg') > 0:
            continue
        imgurl = imgurl.replace('(', '').replace(')', '')
        # Keep only the file name for saving.
        name = imgurl.split('/')[-1]
        url = basePath + imgurl
        print(url)
        content = ''
        try:
            response = requests.get(url, timeout=3)
            content = response.content
        except Exception as e:
            print(e)
            time.sleep(3)
            response = requests.get(url, timeout=3)
            content = response.content
        try:
            # Round-trip through Pillow to validate the bytes before saving.
            image = Image.open(BytesIO(content))
            savePath = os.path.join(path, name)
            print(savePath)
            image.save(savePath)
        except Exception as e:
            print(e)
    return imglist
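# Usage sketch (names and URL are assumptions). getImg() expects a
# module-level basePath and a getHtml() helper, so minimal versions are
# provided here:
basePath = 'https://example.com/'


def getHtml(url):
    return requests.get(url, timeout=3).text


html = getHtml(basePath)
saved = getImg(html, 'downloads')
print(len(saved), 'image URLs collected')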
import requests
from bs4 import BeautifulSoup
import csv

response = requests.get("https://coursehunter.net/frontend/javascript")
soup = BeautifulSoup(response.text, "html.parser")

# Other selector forms tried while exploring the markup:
# articles = soup.find_all("article")
# d = soup.find(class_="course-figure")
# d = soup.select(".course-figure")

# Each course card is a <picture class="course-figure" data-link="...">;
# print the course link from every card.
pictures = soup.find_all("picture", {"class": "course-figure", "data-link": True})
for pic in pictures:
    print(pic["data-link"])
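# Equivalent form using a CSS attribute selector, shown as an alternative
# sketch:
for pic in soup.select('picture.course-figure[data-link]'):
    print(pic['data-link'])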