Example #1
from bs4 import BeautifulSoup

def webScraper(html):
    # collect every <p> tag and concatenate them into one string
    html_data = BeautifulSoup(html, 'html.parser')
    req_data = html_data.find_all('p')
    res = ""
    for i in req_data:
        res = res + str(i)
    return res
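A minimal usage sketch (the URL below is a placeholder, not part of the original example), assuming the page is fetched with requests:

import requests

page = requests.get("https://example.com")  # placeholder URL
print(webScraper(page.text))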
Example #2
    def comics_parse(self, response):
        content = response.body
        if not content:
            self.log('parse comics body error.')
            return

        soup = BeautifulSoup(content, "html5lib")

        page_list_tag = soup.find('ul', class_='pagelist')

        current_li = page_list_tag.find('li', class_='thisclass')
        page_num = current_li.string
        self.log('current page = ' + page_num)

        # the <img> tag for the current page
        li_tag = soup.find('li', id='imgshow')
        img_tag = li_tag.find('img')

        # current img url
        img_url = img_tag['src']
        self.log('img url: ' + img_url)

        # comics title
        title = img_tag['alt']

        # save img to local
        self.save_img(page_num, title, img_url)

        # next page url; an href of '#' marks the last page
        a_tag_list = page_list_tag.find_all('a')
        next_page = a_tag_list[-1]['href']
        if next_page == '#':
            self.log('parse comics: ' + title + ' finished.')
        else:
            next_page = SITE_NAME + '/' + KEY_WORD + '/' + next_page
            yield scrapy.Request(next_page, callback=self.comics_parse)
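The save_img helper called above is not shown in this snippet. Purely as a hedged sketch of what such a method might do (the directory layout and the use of requests here are assumptions, not the original implementation):

    def save_img(self, page_num, title, img_url):
        # hypothetical sketch: download the image and store it under a per-comic folder
        import os
        import requests

        folder = os.path.join('comics', title)
        os.makedirs(folder, exist_ok=True)

        resp = requests.get(img_url)
        ext = os.path.splitext(img_url)[1] or '.jpg'
        with open(os.path.join(folder, str(page_num) + ext), 'wb') as f:
            f.write(resp.content)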
Example #4
    def parse(self, response):
        content = response.body
        if not content:
            self.log('parse body error.')
            return

        # use BeautifulSoup instead of lxml
        soup = BeautifulSoup(content, "html5lib")

        # get the tag that contains the comics list
        listcon_tag = soup.find('ul', class_='listcon')
        if listcon_tag is None:
            self.log('extract comics list error.')
            return

        # get the <a> tags inside the list
        com_a_list = listcon_tag.find_all('a', attrs={ 'href': True })
        if len(com_a_list) < 1:
            self.log('cannot find <a> tags with an href attribute.')
            return

        # append each comic's url to the list
        comics_url_list = []
        for tag_a in com_a_list:
            url = SITE_NAME + tag_a['href']
            comics_url_list.append(url)


        print('\n>>>>>>>>>>>>>>>>>>> current page comics list <<<<<<<<<<<<<<<<<<<<')
        print(comics_url_list)

        # handle each page comic
        for url in comics_url_list:
            print('>>>>>>>>  parse comics:' + url)
            yield scrapy.Request(url=url, callback=self.comics_parse)

        # to crawl only the current page, uncomment the following return
        #return

        # get the full page list and iterate over it
        page_tag = soup.find('ul', class_="pagelist")
        if page_tag is None:
            self.log('extract page list error.')
            return

        # get next page url
        page_a_list = page_tag.find_all('a', attrs={ 'href': True })
        if len(page_a_list) < 2:
            self.log('extract page tag a error.')
            return

        # check whether the current page is the last one via the <select> control
        select_tag = soup.find('select', attrs={ 'name': 'sldd' })
        option_list = select_tag.find_all('option')

        # the last page is identified by the selected attribute on the <option> tags
        last_option = option_list[-1]
        current_option = select_tag.find('option', attrs={ 'selected': True })

        # check whether this is the last page
        is_last = (last_option.string == current_option.string)
        if not is_last:
            next_page = SITE_NAME + '/' + KEY_WORD + '/' + page_a_list[-2]['href']
            print('\n------ parse next page --------')
            print(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
        else:
            print('========= Last page ==========')
Example #5
from bs4 import BeautifulSoup

def get_soup(html):
    # parse the raw HTML with the lxml parser
    soup = BeautifulSoup(html, 'lxml')
    return soup
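A quick usage sketch for get_soup (the URL below is a placeholder, not from the original example), assuming the HTML is fetched with requests:

import requests

html = requests.get("https://example.com").text  # placeholder URL
soup = get_soup(html)
print(soup.title)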
Example #6
from bs4 import BeautifulSoup
import requests

# open() cannot read a remote URL, so fetch the page with requests instead
response = requests.get(
    "https://musbonrealestate.com/Musbon/Index.aspx?gclid=CjwKCAjw6fCCBhBNEiwAem5SO-Wemf1LQRCWoRKmi84e-ZyZGt6T9w0Hn4tBp_oCShnirLwJhKEJUhoCzP8QAvD_BwE"
)

soup = BeautifulSoup(response.text, 'lxml')

print(soup)
Example #7

import requests
from bs4 import BeautifulSoup

result = requests.get("https://www.google.pl/")

#print(result.status_code)  # check whether the page is reachable
#print(result.headers)  # inspect the HTTP headers

src = result.content  # assign the page content to a variable
#print(src)

# create a BeautifulSoup object from the source variable src
soup = BeautifulSoup(src, 'lxml')
# find all <a> tags; the result is a list
links = soup.find_all('a')
print(links)

for link in links:
    if "About" in link.text:
        print(link)
        print(link.attrs['href'])
Example #9
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# disable SSL certificate verification
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')

# retrieve all anchor tags and print each href attribute
tags = soup('a')
for tag in tags:
    print(tag.get('href', None))
Example #10
import json
import requests
from bs4 import BeautifulSoup

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
url_path = 'https://www.pexels.com/search/'
word = input('Enter the keyword of the images to download: ')
# call the howtospeak translation API to translate the Chinese input into English
url_tra = 'http://howtospeak.org:443/api/e2c?user_key=dfcacb6404295f9ed9e430f67b641a8e&notrans=0&text=' + word
english_data = requests.get(url_tra)
# parse the response as JSON
js_data = json.loads(english_data.text)
# pull the translated text out of the JSON
content = js_data['english']
# combine the translated English word with the url format to build the url used below
url = url_path + content + '/'
# use requests to fetch the page that contains the images
wb_data = requests.get(url, headers=headers)
soup = BeautifulSoup(wb_data.text, 'lxml')
# collect the image urls on this page
imgs = soup.select('article > a > img')
img_list = []
# store the collected urls in a list
for img in imgs:
    photo = img.get('src')
    img_list.append(photo)
# directory where the downloaded images will be saved
path = ''
# fetch each image url in turn
for item in img_list:
    data = requests.get(item, headers=headers)
    # derive a file name from the url and write the image bytes
    with open(path + item.split('?')[0][-10:], 'wb') as fp:
        fp.write(data.content)
Example #11
from bs4 import BeautifulSoup
import requests
import html5lib
# import os

url = "https://www.timeanddate.com/weather/netherlands/delft"

response = requests.get(url)

result = BeautifulSoup(response.content, "html5lib")

# the 13th <td> cell on the page holds the temperature reading
temperature_raw_data = result.find_all('td')[12]

temperature = temperature_raw_data.get_text()

print('Temperature of Delft city :', temperature)
Example #12
    def parser(self):
        # fetch the page and return it as a BeautifulSoup object
        response = requests.get(url=self.url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'lxml')
        return soup
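For context, a minimal sketch of the kind of class this parser method could belong to (the class name, attributes, and default headers are assumptions for illustration, not the original code):

import requests
from bs4 import BeautifulSoup


class PageFetcher:  # hypothetical wrapper class
    def __init__(self, url, headers=None):
        self.url = url
        self.headers = headers or {'user-agent': 'Mozilla/5.0'}

    def parser(self):
        # fetch the page and return it as a BeautifulSoup object
        response = requests.get(url=self.url, headers=self.headers)
        return BeautifulSoup(response.text, 'lxml')


# usage: soup = PageFetcher('https://example.com').parser()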
Example #13
# Writing a CSV from HTML - scraping data

import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://cricclubs.com/NTCA/teamSchedule.do?teamId=1636&clubId=343")
bsobj = BeautifulSoup(html, "lxml")
table = bsobj.find_all("table", {"class": "sortable table"})[0]
rows = table.find_all("tr")
csvFile = open("ntca.csv", 'wt', newline='')
writer = csv.writer(csvFile)
try:
    for row in rows:
        
        csvRow = []
        for cell in row.find_all(['td', 'th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()
Example #14
import requests
from bs4 import Beautifulsoup

# a small sample HTML string so the snippet is self-contained
# (the original left sample_obj empty, which would break the calls below)
sample_obj = "<html><body><b id='x'>bold text</b><p>a paragraph</p></body></html>"

# This writes a new file with the sample_obj content
with open('index.html', 'w') as f:
    f.write(sample_obj)

soup = BeautifulSoup(sample_obj, 'html.parser')

# prettify() renders a cleaner output, e.g. for *.html files
#print(soup.prettify())

# Tag access
soup.b  # returns the first <b> tag in the document
soup.p  # does the same with the <p> tag
print(soup.find('b'))      # finds the first <b> tag
print(soup.find_all('b'))  # finds all <b> tags
print(soup.b.name)         # the name of the <b> tag

#changing tag names
#tag = soup.b
#tag.name = "blockquote"

#print(tag['id'])
#print(tag['any_attribute'])

#print(tag.attrs) # returns a dictionary of the attributes
#del tag['id']
Example #15
from bs4 import BeautifulSoup
import pandas as pd

with open("hu.html") as f:
    page = BeautifulSoup(f, "lxml")

# .get_text() only takes the text, without the h1 / h2 / etc. tag
title = page.select("title")[0].get_text()
print(title)  # .upper() / .lower()

headings = page.select("h2")
len(headings)

for heading in headings:
    print(heading.get_text().strip())

flyer_title = page.select(".flyer_content_title")

flyer_title[0].get_text()

heading_title = flyer_title[0].get_text()
print(heading_title)


# PANDAS

# `text` is assumed to be defined earlier (it is not created in this snippet)
df = pd.DataFrame([{"title": heading_title, "text": text}])

df
Example #16
File: bs2.py  Project: joe210920/reptile
from bs4 import BeautifulSoup

html = '''
<html>
    <head><meta charset = 'UTF8'>
          <title>我是網頁標題</title>
    </head>
    <body>
        <p id = "p1">我是段落一</p>
        <p id = "p2" class="red">我是段落二</p>
    </body>

</html>
'''
sp = BeautifulSoup(html, "lxml")
print(sp.find('p'))
print(sp.find_all('p'))
print(sp.find('p', {'id': 'p2', 'class': 'red'}))
print(sp.find('p', id='p2', class_='red'))

datas = sp.select('title')
Example #17
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
URL = "https://www.indeed.com/jobs?q=Python&l=Chennai&start=2"
page = requests.get(URL)
soup = BeautifulSoup(page.text, "html.parser")
print(soup.prettify())
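As a possible follow-up (generic BeautifulSoup usage, not tied to Indeed's actual page structure, which is not shown in the source), the links on the fetched page could be listed like this:

# collect every href on the page; selectors for specific job cards would
# depend on the site's markup and are not assumed here
for a in soup.find_all('a', href=True):
    print(a['href'])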