Python BSp示例，bs4.BSp Python示例

示例#1

0

显示文件

def work(url, city=None, language=None, timeout=10):
    """Collect job postings from a work.ua listing page.

    The module-level ``jobs`` and ``errors`` lists are cleared first and then
    refilled, so each call discards the results of the previous one.

    Args:
        url: Listing page to fetch; a falsy value skips the request.
        city: Stored under ``'city_id'`` in every job record.
        language: Stored under ``'language_id'`` in every job record.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The module-level ``(jobs, errors)`` lists.
    """
    jobs.clear()
    errors.clear()
    domain = 'https://www.work.ua'
    if not url:
        return jobs, errors

    try:
        # Without a timeout a stalled server would block the scraper forever.
        resp = requests.get(url, headers=headers[randint(0, 2)],
                            timeout=timeout)
    except requests.RequestException:
        # Network failures are recorded like any other unreachable page
        # instead of aborting the caller.
        errors.append({'url': url, 'title': "Page do not response"})
        return jobs, errors

    if resp.status_code != 200:
        errors.append({'url': url, 'title': "Page do not response"})
        return jobs, errors

    soup = BSp(resp.content, 'html.parser')
    main_div = soup.find('div', id='pjax-job-list')
    if not main_div:
        errors.append({'url': url, 'title': "Div does not exists"})
        return jobs, errors

    for div in main_div.find_all('div', attrs={'class': 'job-link'}):
        title = div.find('h2')
        link = title.a if title else None
        href = link.get('href') if link else None
        if not href:
            # A card without a linked heading cannot produce a usable
            # record, so it is skipped rather than crashing the whole page.
            continue
        paragraph = div.p
        logo = div.find('img')
        jobs.append({
            'title': title.text,
            'url': domain + href,
            'description': paragraph.text if paragraph else '',
            # The company name is read from the logo's alt text.
            'company': logo.get('alt', 'No name') if logo else 'No name',
            'city_id': city,
            'language_id': language,
        })

    return jobs, errors

示例#2

0

显示文件

def dou(url, city=None, language=None, timeout=10):
    """Collect job postings from a listing page with a ``vacancyListId`` div.

    The module-level ``jobs`` and ``errors`` lists are cleared first and then
    refilled, so each call discards the results of the previous one.

    Args:
        url: Listing page to fetch; a falsy value skips the request.
        city: Stored under ``'city_id'`` in every job record.
        language: Stored under ``'language_id'`` in every job record.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The module-level ``(jobs, errors)`` lists.
    """
    jobs.clear()
    errors.clear()
    if not url:
        return jobs, errors

    try:
        # Without a timeout a stalled server would block the scraper forever.
        resp = requests.get(url, headers=headers[randint(0, 2)],
                            timeout=timeout)
    except requests.RequestException:
        # Network failures are recorded like any other unreachable page
        # instead of aborting the caller.
        errors.append({'url': url, 'title': "Page do not response"})
        return jobs, errors

    if resp.status_code != 200:
        errors.append({'url': url, 'title': "Page do not response"})
        return jobs, errors

    soup = BSp(resp.content, 'html.parser')
    main_div = soup.find('div', id='vacancyListId')
    if not main_div:
        errors.append({'url': url, 'title': "Div does not exists"})
        return jobs, errors

    for li in main_div.find_all('li', attrs={'class': 'l-vacancy'}):
        title = li.find('div', attrs={'class': 'title'})
        link = title.a if title else None
        href = link.get('href') if link else None
        if not href:
            # An item without a linked title cannot produce a usable record,
            # so it is skipped rather than crashing the whole page.
            continue
        info = li.find('div', attrs={'class': 'sh-info'})
        company_link = title.find('a', attrs={'class': 'company'})
        jobs.append({
            'title': title.text,
            # Unlike the other scrapers, the href is stored as found,
            # with no domain prefix.
            'url': href,
            'description': info.text if info else '',
            'company': company_link.text if company_link else 'No name',
            'city_id': city,
            'language_id': language,
        })

    return jobs, errors

示例#3

0

显示文件

文件： books_thread.py 项目： avine1003/spider

 def get_soup(self, url, timeout=10):
     """Fetch *url* and return its parsed document, or None on failure.

     Args:
         url: Address of the page to download.
         timeout: Seconds to wait for the server before giving up.

     Returns:
         The ``BSp`` document built with the ``lxml`` parser, or ``None``
         when the request itself fails.
     """
     try:
         # Without a timeout a stalled server would block this call forever.
         response = requests.get(url, timeout=timeout)
     except requests.RequestException:
         # Only network-level failures are treated as "no page"; a bare
         # ``except`` would also hide KeyboardInterrupt and programming bugs.
         return None
     return BSp(response.text, 'lxml')

示例#4

0

显示文件

文件： books_thread.py 项目： avine1003/spider

def get_url_list(timeout=10):
    """Return the URLs of every catalogue page on books.toscrape.com.

    The first catalogue page is downloaded and the last whitespace-separated
    token of its ``.current`` element is taken as the total page count.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of page URLs, numbered from 1 up to the page count.
    """
    base_url = 'http://books.toscrape.com/catalogue/page-1.html'
    # Without a timeout a stalled server would block this call forever.
    r = requests.get(base_url, timeout=timeout)
    soup = BSp(r.text, 'lxml')
    pager = soup.select_one('.current')
    # A page without a pager element is treated as a single-page catalogue
    # instead of failing with IndexError.
    pages = int(pager.text.split()[-1]) if pager else 1
    return [
        'http://books.toscrape.com/catalogue/page-' + str(page) + '.html'
        for page in range(1, pages + 1)
    ]

示例#5

0

显示文件

def rabota(url, city=None, language=None, timeout=10):
    """Collect job postings from a rabota.ua listing page.

    The module-level ``jobs`` and ``errors`` lists are cleared first and then
    refilled, so each call discards the results of the previous one.

    Args:
        url: Listing page to fetch; a falsy value skips the request.
        city: Stored under ``'city_id'`` in every job record.
        language: Stored under ``'language_id'`` in every job record.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The module-level ``(jobs, errors)`` lists.
    """
    jobs.clear()
    errors.clear()
    domain = 'https://rabota.ua'
    if not url:
        return jobs, errors

    try:
        # Without a timeout a stalled server would block the scraper forever.
        resp = requests.get(url, headers=headers[randint(0, 2)],
                            timeout=timeout)
    except requests.RequestException:
        # Network failures are recorded like any other unreachable page
        # instead of aborting the caller.
        errors.append({'url': url, 'title': "Page do not response"})
        return jobs, errors

    if resp.status_code != 200:
        errors.append({'url': url, 'title': "Page do not response"})
        return jobs, errors

    soup = BSp(resp.content, 'html.parser')
    # The presence of this div is reported as an empty result page.
    not_found = soup.find('div', attrs={'class': 'f-vacancylist-newnotfound'})
    if not_found:
        errors.append({'url': url, 'title': "Page is empty"})
        return jobs, errors

    table = soup.find('table', id='ctl00_content_vacancyList_gridList')
    if not table:
        errors.append({'url': url, 'title': "Table does not exists"})
        return jobs, errors

    # Only rows that carry an ``id`` attribute are considered.
    for tr in table.find_all('tr', attrs={'id': True}):
        div = tr.find('div', attrs={'class': 'card-body'})
        if not div:
            continue
        title = div.find('p', attrs={'class': 'card-title'})
        link = title.a if title else None
        href = link.get('href') if link else None
        if not href:
            # A card without a linked title cannot produce a usable record,
            # so it is skipped rather than crashing the whole page.
            continue
        paragraph = div.p
        company = 'No name'
        company_p = div.find('p', attrs={'class': 'company-name'})
        if company_p and company_p.a:
            company = company_p.a.text
        jobs.append({
            'title': title.text,
            'url': domain + href,
            'description': paragraph.text if paragraph else '',
            'company': company,
            'city_id': city,
            'language_id': language,
        })

    return jobs, errors

示例#6

0

显示文件

def djinni(url, city=None, language=None, timeout=10):
    """Collect job postings from a djinni.co listing page.

    The module-level ``jobs`` and ``errors`` lists are cleared first and then
    refilled, so each call discards the results of the previous one.

    Args:
        url: Listing page to fetch; a falsy value skips the request.
        city: Stored under ``'city_id'`` in every job record.
        language: Stored under ``'language_id'`` in every job record.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The module-level ``(jobs, errors)`` lists.
    """
    jobs.clear()
    errors.clear()
    domain = 'https://djinni.co'
    if not url:
        return jobs, errors

    try:
        # Without a timeout a stalled server would block the scraper forever.
        resp = requests.get(url, headers=headers[randint(0, 2)],
                            timeout=timeout)
    except requests.RequestException:
        # Network failures are recorded like any other unreachable page
        # instead of aborting the caller.
        errors.append({'url': url, 'title': "Page do not response"})
        return jobs, errors

    if resp.status_code != 200:
        errors.append({'url': url, 'title': "Page do not response"})
        return jobs, errors

    soup = BSp(resp.content, 'html.parser')
    main_ul = soup.find('ul', attrs={'class': 'list-jobs'})
    if not main_ul:
        errors.append({'url': url, 'title': "Div does not exists"})
        return jobs, errors

    for li in main_ul.find_all('li', attrs={'class': 'list-jobs__item'}):
        title = li.find('div', attrs={'class': 'list-jobs__title'})
        link = title.a if title else None
        href = link.get('href') if link else None
        if not href:
            # An item without a linked title cannot produce a usable record,
            # so it is skipped rather than crashing the whole page.
            continue
        description = li.find('div',
                              attrs={'class': 'list-jobs__description'})
        details = li.find('div', attrs={'class': 'list-jobs__details__info'})
        jobs.append({
            'title': title.text,
            'url': domain + href,
            'description': description.text if description else '',
            'company': details.text if details else 'No name',
            'city_id': city,
            'language_id': language,
        })

    return jobs, errors