from scraping.models import Vacancy, City, Language, Error

# Run all four job-board parsers for the Kyiv/Python search and persist results.
parsers = (
    (work, 'https://www.work.ua/ru/jobs-kyiv-python/'),
    (rabota, 'https://rabota.ua/zapros/python/%D0%BA%D0%B8%D0%B5%D0%B2'),
    (dou, 'https://jobs.dou.ua/vacancies/?category=Python&search=%D0%9A%D0%B8%D0%B5%D0%B2'),
    (djinni, 'https://djinni.co/jobs/keyword-python/kyiv/'),
)

city = City.objects.filter(slug='kiev').first()
language = Language.objects.filter(slug='python').first()

# Gather parsed vacancies and any scraping errors from every source.
jobs, errors = [], []
for parse, target_url in parsers:
    parsed_jobs, parse_errors = parse(target_url)
    jobs.extend(parsed_jobs)
    errors.extend(parse_errors)

for job in jobs:
    vacancy = Vacancy(**job, city=city, language=language)
    try:
        vacancy.save()
    except DatabaseError:
        # Best-effort insert: duplicates / constraint violations are skipped.
        pass

if errors:
    Error(data=errors).save()
# Fan the parser work out over the event loop, persist the results, then
# prune stale vacancies.
loop = asyncio.get_event_loop()

# One (parser, url, city, language) work item per saved search setting.
tmp_tasks = [
    (func, data['url_data'][key], data['city'], data['language'])
    for data in url_list
    for func, key in parsers
]

if tmp_tasks:
    tasks = asyncio.wait([loop.create_task(main(f)) for f in tmp_tasks])
    loop.run_until_complete(tasks)
    loop.close()

for job in jobs:
    vacancy = Vacancy(**job)
    try:
        vacancy.save()
    except DatabaseError:
        # Best-effort insert: duplicates are expected and skipped.
        pass

if errors:
    Error(data=errors).save()

# Keep the table fresh: drop vacancies older than ten days.
ten_days_ago = dt.date.today() - dt.timedelta(10)
Vacancy.objects.filter(timestamp__lte=ten_days_ago).delete()
# Scrape the four supported job boards for Kyiv/Python and store what we find.
parsers = (
    (work, 'https://www.work.ua/jobs-kyiv-python/'),
    (rabota, 'https://rabota.ua/zapros/python/%D1%83%D0%BA%D1%80%D0%B0%D0%B8%D0%BD%D0%B0'),
    (dou, 'https://jobs.dou.ua/vacancies/?category=Python&search=%D0%9A%D0%B8%D0%B5%D0%B2'),
    (djinni, 'https://djinni.co/jobs/keyword-python/kyiv/'),
)

city = City.objects.filter(slug='kyiv').first()
language = Language.objects.filter(slug='python').first()

# Collect vacancies and scraping errors from every parser.
jobs, errors = [], []
for parse, target_url in parsers:
    found, failed = parse(target_url)
    jobs.extend(found)
    errors.extend(failed)

for job in jobs:
    vacancy = Vacancy(city=city, language=language, **job)
    try:
        vacancy.save()
    except DatabaseError:
        # Best-effort insert: skip duplicates / constraint violations.
        pass

if errors:
    Error(data=errors).save()
# Build one task per (parser, url) pair for every saved user search setting,
# run them concurrently, then persist vacancies and roll today's errors into
# a single Error record.
tmp_tasks = [
    (func, data['url_data'][key], data['city'], data['language'])
    for data in url_list
    for func, key in parsers
]

# asyncio.wait() raises ValueError on an empty task set, so guard it.
if tmp_tasks:
    tasks = asyncio.wait([loop.create_task(main(f)) for f in tmp_tasks])
    loop.run_until_complete(tasks)
    loop.close()

for job in jobs:
    vac = Vacancy(**job)
    try:
        vac.save()
    except DatabaseError:
        # Most commonly a unique-constraint hit on an already-stored vacancy.
        pass

if errors:
    # Uniqueness policy: one Error record per day.
    qs = Error.objects.filter(timestamp=dt.date.today())
    if qs.exists():
        # Merge today's new errors into the existing record.
        err = qs.first()
        err.data.update({'errors': errors})
        err.save()
    else:
        # BUG FIX: store a dict, not a formatted string — the branch above
        # calls .data.update(), which fails with AttributeError if data was
        # originally saved as a str.
        Error(data={'errors': errors}).save()
# (Non-async variant removed — superseded by the event-loop tasks above.)
loop.run_until_complete(tasks)
loop.close()
# print(time.time() - start)  # rough timing of the whole parse run

for job in jobs:
    v = Vacancy(**job)  # unpack the parsed dict into model fields
    try:
        v.save()  # write the vacancy to the database
    except DatabaseError:
        # Best-effort insert: duplicates / constraint violations are skipped.
        pass

if errors:
    # Uniqueness policy: one Error record per day holds that day's errors.
    qs = Error.objects.filter(timestamp=dt.date.today())
    if qs.exists():
        # Someone already wrote today's record — merge the new errors in.
        err = qs.first()
        err.data.update({'errors': errors})
        err.save()
    else:
        # BUG FIX: persist a dict so the .update() branch above keeps working
        # on subsequent runs the same day (a str has no .update()).
        Error(data={'errors': errors}).save()
j, e = func(url, city=data['city'], language=data['language']) jobs.extend(j) errors.extend(e) else: print(data['url_data'], type(data['url_data'])) # ===================================================================== Неасинхронный способ выполнения # print((time.time() - start) / 10) # print(*jobs,len(jobs), sep='\n') # print(errors) for vacancy in jobs: v = Vacancy(**vacancy) try: v.save() except DatabaseError: pass errors_exp = [{ 'url': 'https://www.amalgama-lab.com/songs/m/misfits/mars_attacks.html', 'cause': 'HZ', 'status_code': 200, }] """ for production server choose section 1 ------- for local server choose section 2 ======== """ # if errors_exp:
# Resolve the per-user search settings into URLs, fan the parser calls out on
# the event loop, then persist vacancies and roll today's errors into one record.
url_list = get_urls(settings)

loop = asyncio.get_event_loop()
# One (parser, url, city, language) work item per saved search setting.
tmp_tasks = [
    (func, data['url_data'][key], data['city'], data['language'])
    for data in url_list
    for func, key in parsers
]
# FIX: asyncio.wait() raises ValueError on an empty task set, so guard it
# (consistent with the other runner variants in this project).
if tmp_tasks:
    tasks = asyncio.wait([loop.create_task(main(f)) for f in tmp_tasks])
    loop.run_until_complete(tasks)
loop.close()

for job in jobs:
    v = Vacancy(**job)  # unpack the parsed dict into model fields
    try:
        v.save()
    except DatabaseError:
        # Best-effort insert: duplicate vacancies are expected and skipped.
        pass

if errors:
    # Uniqueness policy: one Error record per day.
    qs = Error.objects.filter(timestamp=dt.date.today())
    if qs.exists():
        er = qs.first()
        er.data.update({'errors': errors})
        er.save()
    else:
        # BUG FIX: store a dict, not an f-string — the branch above calls
        # .data.update(), which would raise AttributeError on a str value.
        Error(data={'errors': errors}).save()

ten_days_ago = dt.date.today() - dt.timedelta(10)
from scraping.parser import *
from scraping.models import Vacancy, City, Language, Errors

# Parser callables paired with the search-results URL each one scrapes.
parsers = (
    (hh, 'https://hh.ru/search/vacancy?st=searchVacancy&L_profession_id=29.8&area=2760&no_magic=true&text=%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%81%D1%82+Python'),
    (rabota, 'https://rabota.ua/zapros/python/%d0%ba%d0%b8%d0%b5%d0%b2'),
)

# Collect vacancies and scraping errors from every source.
jobs, errors = [], []
for func, url in parsers:
    j, e = func(url)
    jobs += j
    errors += e

city = City.objects.filter(slug='ny').first()
language = Language.objects.filter(slug='python').first()

for job in jobs:
    # BUG FIX: the keyword was 'Language' (capitalised); the model field is
    # 'language' (as in the sibling scripts), so every save raised TypeError.
    obj = Vacancy(**job, city=city, language=language)
    try:
        obj.save()
    except DatabaseError:
        # Best-effort insert: skip duplicates / constraint violations.
        pass

if errors:
    Errors(data=errors).save()
django.setup()

from django.db import DatabaseError
from scraping.parsers import *
from scraping.models import Vacancy, City, Programming_language

# Minsk/Python search pages for the two supported job boards.
parsers = (
    (tut_by, 'https://jobs.tut.by/search/vacancy?area=1002&fromSearchLine=true&st=searchVacancy&text=Python'),
    (trudbox, 'http://trudbox.by/minsk?whatQuery=Python+'),
)

city = City.objects.filter(slug='minsk').first()
programming_language = Programming_language.objects.filter(slug='python').first()

# Collect vacancies and scraping errors from every parser.
jobs, errors = [], []
for func, url in parsers:
    j, e = func(url)
    jobs += j
    errors += e

for job in jobs:
    v = Vacancy(**job, city=city, programming_language=programming_language)
    try:
        v.save()
    except DatabaseError:
        # Best-effort insert: duplicates / constraint violations are skipped.
        pass

# FIX: use a context manager so the debug dump file is closed even if the
# write raises (the original open/write/close leaked the handle on error).
with codecs.open('work.txt', 'w', 'utf-8') as h:
    h.write(str(jobs))
def start():
    """Collect user search settings, run every parser concurrently, and
    persist the scraped vacancies plus a single per-day error record.

    Fixes applied:
    * ``async for job in jobs`` inside this synchronous function was a
      SyntaxError (and ``jobs`` is a plain list anyway) — replaced with a
      regular ``for``.
    * ``Error.data`` is created as a dict so the same-day ``.update()``
      branch keeps working on repeated runs.
    * Removed the unused local ``data = err.data``.
    """
    print('Running')
    start = time.time()  # NOTE: shadows the function name locally; timing only
    User = get_user_model()
    parsers = ((tut_pars, 'tut_pars'), (bel_pars, 'bel_pars'))
    jobs, errors = [], []

    def get_settings():
        """Unique (city_id, language_id) pairs of users subscribed to email."""
        qs = User.objects.filter(send_email=True).values()
        print(qs)
        settings_lst = set((q['city_id'], q['language_id']) for q in qs)
        return settings_lst

    def get_urls(_settings):
        """URL records for each requested (city, language) pair."""
        qs = Url.objects.all().values()
        url_dct = {(q['city_id'], q['language_id']): q['url_data'] for q in qs}
        urls = []
        for pair in _settings:
            if pair in url_dct:
                urls.append({
                    'city': pair[0],
                    'language': pair[1],
                    'url_data': url_dct[pair],
                })
        return urls

    async def main(value):
        """Run one parser in the default executor and collect its output."""
        func, url, city, language = value
        job, err = await loop.run_in_executor(None, func, url, city, language)
        errors.extend(err)
        jobs.extend(job)

    settings = get_settings()
    url_list = get_urls(settings)

    loop = asyncio.get_event_loop()
    # One (parser, url, city, language) work item per saved search setting.
    tmp_tasks = [(func, data['url_data'][key], data['city'], data['language'])
                 for data in url_list
                 for func, key in parsers]
    if tmp_tasks:
        tasks = asyncio.wait([loop.create_task(main(f)) for f in tmp_tasks])
        loop.run_until_complete(tasks)
    # The loop is deliberately left open so repeated start() calls can reuse it.

    # BUG FIX: was ``async for`` in a sync function — a plain for loop.
    for job in jobs:
        v = Vacancy(**job)
        try:
            v.save()
        except DatabaseError:
            # Best-effort insert: duplicates are expected and skipped.
            pass

    if errors:
        # Uniqueness policy: one Error record per day.
        qs = Error.objects.filter(timestamp=dt.date.today())
        if qs.exists():
            err = qs.first()
            err.data.update({'errors': errors})
            err.save()
        else:
            # BUG FIX: store a dict so .data.update() works later the same day.
            Error(data={'errors': errors}).save()
async def execute(task):
    """Asynchronously run one parser and collect its vacancies and errors.

    ``task`` is ``(function, url, location_id, specialty_id)``; the parser
    runs in the default thread-pool executor so its blocking I/O does not
    stall the event loop.

    FIX: the locals were previously misnamed (the URL was bound to
    ``location`` and the specialty id to ``url``); the positional order
    passed to the parser is unchanged.
    """
    function, url, location, specialty = task
    vacancy_log, error_log = await loop.run_in_executor(
        None, function, url, location, specialty)
    data_for_recording_vacancies.extend(vacancy_log)
    data_for_recording_errors.extend(error_log)


loop = asyncio.get_event_loop()
# One (parser, url, location, specialty) work item per saved search setting.
instruction = [(function, data['url_json'][parser], data['location_id'],
                data['specialty_id'])
               for data in url_list
               for function, parser in parsers]
# FIX: asyncio.wait() raises ValueError on an empty task set, so guard it
# (consistent with the other runner variants in this project).
if instruction:
    tasks = asyncio.wait(
        [loop.create_task(execute(task)) for task in instruction])
    loop.run_until_complete(tasks)
loop.close()

for record in data_for_recording_vacancies:
    to_record = Vacancy(**record)
    try:
        to_record.save()
    except DatabaseError:
        # Best-effort insert: duplicate vacancies are skipped.
        pass

if data_for_recording_errors:
    error_dump = Error(error_json=data_for_recording_errors)
    error_dump.save()
from scraping.models import Vacancy, City, Language, Error
from scraping.parsers import headhunter

# Single-source run: scrape the hh.ru Python search results for Moscow.
parsers = (
    (headhunter,
     'https://hh.ru/search/vacancy?area=1&st=searchVacancy&fromSearch=true&text=Python&from=suggest_post'),
)

city = City.objects.filter(slug='moskva').first()
language = Language.objects.filter(slug='python').first()

# Collect vacancies and scraping errors from every parser.
works, errors = [], []
for parse, target_url in parsers:
    found, failed = parse(target_url)
    works.extend(found)
    errors.extend(failed)

for work in works:
    vacancy = Vacancy(city=city, language=language, **work)
    try:
        vacancy.save()
    except DatabaseError:
        # Best-effort insert: skip duplicates / constraint violations.
        pass

if errors:
    Error(data=errors).save()
django.setup()

from scraping.parsers import *
from scraping.models import Vacancy, City, LanguageProgramming

# Search pages scraped for Moscow/Python vacancies.
parsers = (
    (habr_vacancy, 'https://career.habr.com/vacancies?q=Python&type=suitable'),
    (head_hunter,
     'https://ufa.hh.ru/search/vacancy?area=1&clusters=true&enable_snippets=true&text=python&experience=noExperience&from=cluster_experience&showClusters=false'),
)

city = City.objects.filter(slug='moskva').first()
language = LanguageProgramming.objects.filter(slug='python').first()

# Collect vacancies and scraping errors from every parser.
jobs, errors = [], []
for func, url in parsers:
    j, e = func(url)
    jobs += j
    errors += e
# FIX: removed the leftover debug statement ``a = 1``.
# NOTE(review): ``errors`` is collected but never persisted or logged in this
# variant — consider saving it the way the vacancy loop below saves jobs.

for item in jobs:
    v = Vacancy(**item, city=city, language=language)
    try:
        v.save()
    except DatabaseError:
        # Best-effort insert: duplicates / constraint violations are skipped.
        pass