Example No. 1
import requests
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand
from facts.models import Artist, Song, Fact


class Command(BaseCommand):
    help = 'Scrapes mima.co.il fact pages into Artist, Song and Fact records'

    def handle(self, *args, **options):
        for num in range(1, 1451):
            url = 'https://www.mima.co.il/fact_page.php?song_id={}'.format(num)
            html_content = requests.get(url).text
            soup = BeautifulSoup(html_content, "html.parser")
            # The song title and artist name are rendered in <font size> tags:
            # index 0 is the song name, index 1 is the artist.
            infoSongArt = soup.find_all('font', {'size': ['+5', '+2']})
            # get_or_create already saves new rows; no explicit save() needed.
            artist, created = Artist.objects.get_or_create(name=infoSongArt[1].text)
            song, created = Song.objects.get_or_create(name=infoSongArt[0].text, artist=artist)
            print(Artist.objects.all())
            print(Song.objects.all())
            facts = soup.find_all("tr", {'bgcolor': ['#CCFFCC', '#EDF3FE']})
            for row in facts:
                # Each row reads '<fact> נכתב ע"י <author>' ("written by");
                # facts without an author are credited to 'אנונימי' (anonymous).
                splited_txt = row.text.strip().split('נכתב ע"י')
                factData = splited_txt[0]
                try:
                    authorData = splited_txt[1]
                except IndexError:
                    authorData = 'אנונימי'
                Fact.objects.get_or_create(message=factData, author=authorData, song=song)
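
A minimal way to invoke the command from code, assuming the file lives at facts/management/commands/scrape_facts.py (the command name is an assumption based on that hypothetical path):

from django.core.management import call_command

call_command('scrape_facts')  # hypothetical command name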
Example No. 2
import grequests  # grequests monkey-patches via gevent; import it early
import requests


def search(quarter, dept):
    """Build an unsent grequests POST that searches UCSB GOLD for courses.

    Relies on the module-level `soup` (the parsed search page) and `cookie`
    (session cookies) being populated beforehand.
    """
    global soup
    global cookie

    # Collect the ASP.NET hidden fields (e.g. __VIEWSTATE) from the search
    # page, then append the dropdown selections and search-button coordinates.
    # The field names are pre-encoded ('%24' is '$').
    form_hidden = [
        [x['name'], x['value']]
        for x in soup.find_all('input') if x.get('type') == 'hidden'
    ] + [['ctl00%24pageContent%24quarterDropDown', str(quarter)],
         ['ctl00%24pageContent%24subjectAreaDropDown', str(dept)],
         ['ctl00%24pageContent%24searchButton.x', '0'],
         ['ctl00%24pageContent%24searchButton.y', '0']]

    head = {'Content-Type': 'application/x-www-form-urlencoded'}
    # Percent-encode each value and join the pairs into a form body.
    form = [
        x[0] + '=' + requests.utils.quote(x[1], safe='') for x in form_hidden
    ]
    form_body = '&'.join(form)
    course_url = 'https://my.sa.ucsb.edu/gold/BasicFindCourses.aspx'
    greq = grequests.post(course_url,
                          headers=head,
                          data=form_body,
                          cookies=cookie)

    return greq
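
The returned request is unsent; a minimal sketch of dispatching several searches at once with grequests.map (the quarter code and department names are hypothetical):

pending = [search('20231', dept) for dept in ['CMPSC', 'MATH']]  # hypothetical values
for resp in grequests.map(pending):
    if resp is not None:  # map() yields None for failed requests
        print(resp.status_code)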
Example No. 3

import urllib.request
from bs4 import BeautifulSoup


def getLinks(source):
    # Return the href of every link on the page that points at a premchand story.
    links_list = []
    content = urllib.request.urlopen(source)
    page = content.read()
    content.close()
    soup = BeautifulSoup(page, features="html.parser")
    for link in soup.find_all('a'):
        if "premchand-stories" in str(link):
            link_url = str(link.get('href'))
            links_list.append(link_url)
    return links_list
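
A minimal call, with a hypothetical index URL (the page being scraped is not shown in the snippet):

for url in getLinks('https://example.com/premchand-stories/'):  # hypothetical URL
    print(url)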
Example No. 4
import requests
from bs4 import BeautifulSoup

for page in range(1):
    value = page
    url = 'https://github.com/microsoft/vscode/issues?page=%s&q=is%%3Aissue+is%%3Aopen' % str(
        value)
    # Fetch the page source. The User-Agent must go in the `headers` keyword;
    # a bare second positional argument would be sent as query parameters.
    req = requests.get(
        url,
        headers={
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
        })
    # Build a BeautifulSoup object for the lookups below.
    soup = BeautifulSoup(req.text, 'html.parser')
    # Collect every issue-title <a> tag into a list.
    issue_links = soup.find_all(
        'a',
        class_=
        'link-gray-dark v-align-middle no-underline h4 js-navigation-open')

    # Print each issue's URL and fetch the issue page itself.
    for link in issue_links:
        t3 = link.get('href')
        url1 = 'https://github.com' + str(t3)
        print(url1)
        req1 = requests.get(
            url1,
            headers={
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
            })
        soup1 = BeautifulSoup(req1.text, 'html.parser')
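
soup1 is parsed but never used; one possible continuation, assuming GitHub still renders the issue title in a span with class "js-issue-title" (a selector assumption, not verified):

# hypothetical: extract the issue title from the fetched issue page
title = soup1.find('span', class_='js-issue-title')
if title is not None:
    print(title.text.strip())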
Example No. 5
# Image crawling

from bs4 import BeautifulSoup

# `driver` is assumed to be an already-initialized Selenium WebDriver
# (find_element_by_* is the Selenium 3 API).
body = driver.find_element_by_tag_name('body')

# Click the dropdown that toggles between sort-by-popularity and sort-by-date
# driver.find_element_by_xpath('//paper-button[@class="dropdown-trigger style-scope yt-dropdown-menu"]').click()
# Click the sort-by-popularity option
# driver.find_element_by_xpath('//paper-listbox[@class="dropdown-content style-scope yt-dropdown-menu"]/a[1]').click()

page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')

# comments=soup.find_all('yt-formatted-string',attrs={'class':'style-scope ytd-comment-renderer'})

cmmt_box = soup.find_all(attrs={'id': 'wrap'})
# real=soup.find('video')
# real=real.get('src')

# print(real)
# //*[@id="container"]/div/div/div[3]/div[1]/table/tbody/tr[1]/td[2]/dl/dt/a/text()
# //*[@id="container"]/div/div/div[3]/div[1]/table/tbody/tr[1]/td[3]

# //*[@id="container"]/div/div/div[3]/div[1]/table/tbody/tr[2]/td[2]/dl/dt/a/text()
from collections import OrderedDict
import json

data = OrderedDict()
dojangRank = []
reboot1DojangRank = []
reboot2DojangRank = []
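
The snippet ends before the rank lists are filled; a minimal sketch of serializing `data` once they are (the key names and output file name are assumptions):

data['dojangRank'] = dojangRank
data['reboot1DojangRank'] = reboot1DojangRank
data['reboot2DojangRank'] = reboot2DojangRank
with open('dojang_rank.json', 'w', encoding='utf-8') as f:  # hypothetical file name
    json.dump(data, f, ensure_ascii=False, indent=2)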
Example No. 6
import requests
from bs4 import BeautifulSoup
from facts.models import Artist, Song, Fact
from mymima import settings

url = 'https://www.mima.co.il/fact_page.php?song_id=969'
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "html.parser")
infoSongArt = soup.find_all('font', {'size': ['+5', '+2']})
# Store the tag text, not the Tag objects themselves.
artist_name = Artist(name=infoSongArt[0].text)
artist_name.save()
# The foreign key field takes the saved Artist instance.
song_name = Song(name=infoSongArt[1].text, artist=artist_name)
song_name.save()
print(Artist.objects.all())
print(Song.objects.all())
infoFactAut = [
    element.text
    for element in soup.find_all('tr', {'bgcolor': ['#CCFFCC', '#EDF3FE']})
]
factim = []
print(infoFactAut)
for f in infoFactAut:
    temp = f.strip().replace('\r\n', '').split('נכתב ע"י')
    factim.append(temp)
print(factim)
#print(type(text))
print(soup.prettify())
Example No. 7
import os
import re
import time
from io import BytesIO

import requests
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
from PIL import Image


def getImg(html, path):
    """Collect image URLs from a page (<img> tags, inline CSS, and linked
    stylesheets) and save each image under `path`. Relies on the module-level
    `basePath` prefix and `getHtml()` helper defined elsewhere in the script.
    """
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=10))
    s.mount('https://', HTTPAdapter(max_retries=10))

    # Inline CSS references: capture the path inside url(...).
    imgre = re.compile(r'url\((.*?)\);')
    imglist = imgre.findall(html)

    soup = BeautifulSoup(html, 'html.parser')
    for img in soup.find_all('img'):
        imglist.append(img['src'])

    # Follow every <link href="..."> and scan the stylesheet for .png/.jpg URLs.
    patterncss = '<link.*?href="(.*?)"'
    hrefList = re.compile(patterncss, re.S).findall(html)
    for href in hrefList:
        if href.find('http') < 0:
            href = basePath + href
        tmpHtml = getHtml(href)
        imgreTmp = re.compile(r'url\((.*?\.(?:png|jpg))\)')
        imglist.extend(imgreTmp.findall(tmpHtml))

    if not os.path.isdir(path):
        os.makedirs(path)

    for imgurl in imglist:
        if imgurl.find('.svg') > 0:
            continue
        imgurl = imgurl.replace('(', '').replace(')', '')
        # The file name is the last path segment.
        name = imgurl.split('/')[-1]
        url = basePath + imgurl
        print(url)
        content = ''
        try:
            response = requests.get(url, timeout=3)
            content = response.content
        except Exception as e:
            # On a network error, wait and retry once.
            print(e)
            time.sleep(3)
            response = requests.get(url, timeout=3)
            content = response.content
        try:
            image = Image.open(BytesIO(content))
            savePath = os.path.join(path, name)
            print(savePath)
            image.save(savePath)
        except Exception as e:
            print(e)
    return imglist
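
A minimal driver for the function; `basePath` and `getHtml()` are referenced but not defined in the snippet, so both are assumptions here:

basePath = 'https://example.com/'  # hypothetical site root
html = getHtml(basePath)           # getHtml() is assumed to return the page text
saved = getImg(html, 'images/')
print(len(saved), 'image URLs collected')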
Example No. 8
import requests
from bs4 import BeautifulSoup
import csv

response = requests.get("https://coursehunter.net/frontend/javascript")
soup = BeautifulSoup(response.text, "html.parser")

# Each course card is a <picture class="course-figure"> whose data-link
# attribute carries the course URL. Equivalent lookups include
# soup.select(".course-figure") and soup.find_all(attrs={"class": "course-figure"}).
pictures = soup.find_all("picture", {"class": "course-figure", "data-link": True})
for picture in pictures:
    print(picture["data-link"])
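
csv is imported but never used; a minimal sketch of writing the collected links to disk with it (the file and column names are assumptions):

with open('courses.csv', 'w', newline='') as f:  # hypothetical output file
    writer = csv.writer(f)
    writer.writerow(['data_link'])
    for picture in pictures:
        writer.writerow([picture['data-link']])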