import requests
from random import randint
from bs4 import BeautifulSoup as BSp

# Module-level accumulators shared by the job scrapers below.
jobs = []
errors = []
# `headers` is assumed to be a list of three request-header dicts
# (rotating User-Agent strings); the placeholder values here only
# illustrate the shape expected by headers[randint(0, 2)].
headers = [{'User-Agent': 'scraper-demo-{}'.format(i)} for i in range(3)]


def work(url, city=None, language=None):
    """Scrape job postings from work.ua."""
    jobs.clear()
    errors.clear()
    domain = 'https://www.work.ua'
    if url:
        resp = requests.get(url, headers=headers[randint(0, 2)])
        if resp.status_code == 200:
            soup = BSp(resp.content, 'html.parser')
            main_div = soup.find('div', id='pjax-job-list')
            if main_div:
                div_lst = main_div.find_all('div', attrs={'class': 'job-link'})
                for div in div_lst:
                    title = div.find('h2')
                    href = title.a['href']
                    content = div.p.text
                    company = 'No name'
                    logo = div.find('img')
                    if logo:
                        company = logo['alt']
                    jobs.append({
                        'title': title.text,
                        'url': domain + href,
                        'description': content,
                        'company': company,
                        'city_id': city,
                        'language_id': language,
                    })
            else:
                errors.append({'url': url, 'title': 'Div does not exist'})
        else:
            errors.append({'url': url, 'title': 'Page did not respond'})
    return jobs, errors
def dou(url, city=None, language=None):
    """Scrape job postings from jobs.dou.ua (hrefs are already absolute)."""
    jobs.clear()
    errors.clear()
    if url:
        resp = requests.get(url, headers=headers[randint(0, 2)])
        if resp.status_code == 200:
            soup = BSp(resp.content, 'html.parser')
            main_div = soup.find('div', id='vacancyListId')
            if main_div:
                li_lst = main_div.find_all('li', attrs={'class': 'l-vacancy'})
                for li in li_lst:
                    # Optional filter for promoted listings:
                    # if '__hot' not in li['class']:
                    title = li.find('div', attrs={'class': 'title'})
                    href = title.a['href']
                    cont = li.find('div', attrs={'class': 'sh-info'})
                    content = cont.text
                    company = 'No name'
                    a = title.find('a', attrs={'class': 'company'})
                    if a:
                        company = a.text
                    jobs.append({
                        'title': title.text,
                        'url': href,
                        'description': content,
                        'company': company,
                        'city_id': city,
                        'language_id': language,
                    })
            else:
                errors.append({'url': url, 'title': 'Div does not exist'})
        else:
            errors.append({'url': url, 'title': 'Page did not respond'})
    return jobs, errors
def get_soup(self, url):
    """Fetch a URL and return its parsed BeautifulSoup tree, or None if the request fails."""
    try:
        response = requests.get(url)
        html_doc = response.text
        return BSp(html_doc, 'lxml')
    except requests.RequestException:
        return None
def get_url_list():
    """Build the list of catalogue page URLs for books.toscrape.com.

    The total page count is read from the pager text on the first page
    (e.g. "Page 1 of 50").
    """
    base_url = 'http://books.toscrape.com/catalogue/page-1.html'
    r = requests.get(base_url)
    soup = BSp(r.text, 'lxml')
    content = soup.select('.current')[0].text
    pages = int(content.split()[-1])  # last word of "Page 1 of 50"
    url_list = []
    for page in range(1, pages + 1):
        url = 'http://books.toscrape.com/catalogue/page-' + str(page) + '.html'
        url_list.append(url)
    return url_list
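# Usage sketch (not part of the original code): one assumed way to combine
# the two book-scraping helpers above, fetching every catalogue page and
# printing the book titles. The '.product_pod h3 a' selector is an
# illustrative guess at the books.toscrape.com listing markup.
def print_book_titles():
    for page_url in get_url_list():
        page_soup = BSp(requests.get(page_url).text, 'lxml')
        for link in page_soup.select('.product_pod h3 a'):
            print(link.get('title'))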
def rabota(url, city=None, language=None):
    """Scrape job postings from rabota.ua."""
    jobs.clear()
    errors.clear()
    domain = 'https://rabota.ua'
    if url:
        resp = requests.get(url, headers=headers[randint(0, 2)])
        if resp.status_code == 200:
            soup = BSp(resp.content, 'html.parser')
            # This div is the "nothing found" banner shown for empty searches.
            not_found = soup.find('div', attrs={'class': 'f-vacancylist-newnotfound'})
            if not not_found:
                table = soup.find('table', id='ctl00_content_vacancyList_gridList')
                if table:
                    tr_lst = table.find_all('tr', attrs={'id': True})
                    for tr in tr_lst:
                        div = tr.find('div', attrs={'class': 'card-body'})
                        if div:
                            title = div.find('p', attrs={'class': 'card-title'})
                            href = title.a['href']
                            content = div.p.text
                            company = 'No name'
                            p = div.find('p', attrs={'class': 'company-name'})
                            if p:
                                company = p.a.text
                            jobs.append({
                                'title': title.text,
                                'url': domain + href,
                                'description': content,
                                'company': company,
                                'city_id': city,
                                'language_id': language,
                            })
                else:
                    errors.append({'url': url, 'title': 'Table does not exist'})
            else:
                errors.append({'url': url, 'title': 'Page is empty'})
        else:
            errors.append({'url': url, 'title': 'Page did not respond'})
    return jobs, errors
def djinni(url, city=None, language=None):
    """Scrape job postings from djinni.co."""
    jobs.clear()
    errors.clear()
    domain = 'https://djinni.co'
    if url:
        resp = requests.get(url, headers=headers[randint(0, 2)])
        if resp.status_code == 200:
            soup = BSp(resp.content, 'html.parser')
            main_ul = soup.find('ul', attrs={'class': 'list-jobs'})
            if main_ul:
                li_lst = main_ul.find_all('li', attrs={'class': 'list-jobs__item'})
                for li in li_lst:
                    title = li.find('div', attrs={'class': 'list-jobs__title'})
                    href = title.a['href']
                    cont = li.find('div', attrs={'class': 'list-jobs__description'})
                    content = cont.text
                    company = 'No name'
                    comp = li.find('div', attrs={'class': 'list-jobs__details__info'})
                    if comp:
                        company = comp.text
                    jobs.append({
                        'title': title.text,
                        'url': domain + href,
                        'description': content,
                        'company': company,
                        'city_id': city,
                        'language_id': language,
                    })
            else:
                errors.append({'url': url, 'title': 'Div does not exist'})
        else:
            errors.append({'url': url, 'title': 'Page did not respond'})
    return jobs, errors
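# Usage sketch, assuming the scrapers are driven by search URLs supplied
# elsewhere; the URL and the city/language ids below are illustrative
# placeholders, not taken from the original code. Each parser returns the
# shared module-level (jobs, errors) lists, which are cleared on the next
# call, so copy the results out before running another scraper.
def run_work_example():
    url = 'https://www.work.ua/jobs-kyiv-python/'   # assumed search URL
    found_jobs, found_errors = work(url, city=1, language=1)
    for job in list(found_jobs):
        print(job['title'], job['url'])
    for err in list(found_errors):
        print('Error:', err['url'], err['title'])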