예제 #1
0
# One work item per (url_list entry, parser) combination: the parser callable,
# the URL stored under that parser's key, and the entry's city and language.
tmp_tasks = [(func, data['url_data'][key], data['city'], data['language'])
             for data in url_list
             for func, key in parsers]

# Sequential version that the task list above replaces (kept for reference):
# for data in url_list:
#
#     for func, key in parsers:
#         url = data['url_data'][key]
#         j, e = func(url, city=data['city'], language=data['language'])
#         jobs += j
#         errors += e
if tmp_tasks:
    # asyncio.wait() must not be handed an empty collection, hence the guard.
    tasks = asyncio.wait([loop.create_task(main(f)) for f in tmp_tasks])
    loop.run_until_complete(tasks)
    loop.close()

for job in jobs:
    print(job)
    vac = Vacancy(**job)
    try:
        vac.save()
    except DatabaseError:
        # Best effort: a vacancy that cannot be stored is skipped so the rest
        # are still saved (presumably duplicates of existing rows - TODO confirm).
        pass
if errors:
    qs = Error.objects.filter(timestamp=dt.date.today())
    if qs.exists():
        err = qs.first()
        err.data.update({'errors': errors})
        err.save()
    else:
        # Store a mapping, not a formatted string: the branch above calls
        # err.data.update(...), which fails if today's record holds a str.
        Error(data={'errors': errors}).save()
예제 #2
0
loop = asyncio.get_event_loop()

# Work list: one (parser, url, city, language) tuple for every combination of
# a url_list entry and a (parser, key) pair.
tmp_tasks = [
    (parser, entry['url_data'][key], entry['city'], entry['language'])
    for entry in url_list
    for parser, key in parsers
]

# Sequential equivalent, kept for reference:
# for data in url_list:
#
#     for func, key in parsers:
#         url = data['url_data'][key]
#         j, e = func(url, city=data['city'], language=data['language'])
#         jobs += j
#         errors += e
if tmp_tasks:
    scheduled = [loop.create_task(main(item)) for item in tmp_tasks]
    tasks = asyncio.wait(scheduled)
    loop.run_until_complete(tasks)
    loop.close()

# Persist whatever the parsers collected; rows the database rejects are skipped.
for job in jobs:
    v = Vacancy(**job)
    try:
        v.save()
    except DatabaseError:
        continue

if errors:
    error_record = Error(data=errors)
    er = error_record.save()

# Debug dump of the scraped jobs (disabled):
#h = codecs.open('hh.txt', 'w', 'utf-8')
#h.write(str(jobs))
#h.close()

# Drop vacancies whose timestamp is ten or more days old.
ten_days_ago = dt.date.today() - dt.timedelta(days=10)
stale_vacancies = Vacancy.objects.filter(timestamp__lte=ten_days_ago)
stale_vacancies.delete()
예제 #3
0
    for pair in _settings:
        tmp = {}
        tmp['city'] = pair[0]
        tmp['language'] = pair[1]
        tmp['url_data'] = url_dict[pair]
        urls.append(tmp)
    return urls

# Every vacancy saved below is attached to this single city/language pair.
city = City.objects.filter(slug='kyiv').first()
language = Language.objects.filter(slug='python').first()

# Run each parser on its URL and pool the (jobs, errors) it returns.
jobs = []
errors = []
for func, url in parsers:
    job, error = func(url)
    jobs.extend(job)
    errors.extend(error)

# Store the scraped vacancies; rows the database rejects are skipped.
for job in jobs:
    vacancy = Vacancy(city=city, language=language, **job)
    try:
        vacancy.save()
    except DatabaseError:
        continue

if errors:
    error_record = Error(data=errors)
    error = error_record.save()

# h = codecs.open('work.txt', 'w', 'utf-8')
# h.write(str(jobs))
# h.close
예제 #4
0
#
#     for func, key in parsers:
#         # print(func, url)
#         url = data['url_data'][key]
#         j, e = func(url, city=data['city'], language=data['language'])
#         jobs += j
#         errors += e

# Block until the scheduled parser tasks have finished, then release the loop.
loop.run_until_complete(tasks)
loop.close()
# print(time.time() - start)  # step 3: measure how long the requests took

for job in jobs:
    v = Vacancy(**job)  # expand the scraped dict into model fields
    try:
        v.save()  # write it to the database
    except DatabaseError:
        # Best effort: a vacancy that cannot be stored is skipped so the rest
        # are still saved (presumably duplicates of existing rows - TODO confirm).
        pass

if errors:
    qs = Error.objects.filter(timestamp=dt.date.today())
    if qs.exists():  # a record was already written earlier today
        err = qs.first()  # one record per day
        err.data.update({'errors': errors})
        err.save()
    else:
        # Store a mapping, not a formatted string: the branch above calls
        # err.data.update(...), which fails if today's record holds a str.
        Error(data={'errors': errors}).save()
# Lesson 52: the debug dump below was commented out.
# h = codecs.open('work.txt', 'w', 'utf-8')  # open for writing with 'utf-8' encoding
# h.write(str(jobs))  # dump everything that was scraped
예제 #5
0

from scraping.parser import *
from scraping.models import Vacancy, City, Language, Errors

# (parser callable, search URL) pairs to scrape.
parsers = (
    (hh, 'https://hh.ru/search/vacancy?st=searchVacancy&L_profession_id=29.8&area=2760&no_magic=true&text=%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%81%D1%82+Python'),
    (rabota, 'https://rabota.ua/zapros/python/%d0%ba%d0%b8%d0%b5%d0%b2'),
)

# Run every parser and pool the (jobs, errors) pair each one returns.
jobs = []
errors = []
for func, url in parsers:
    j, e = func(url)
    jobs.extend(j)
    errors.extend(e)

# Every vacancy saved below is attached to this single city/language pair.
city = City.objects.filter(slug='ny').first()
language = Language.objects.filter(slug='python').first()

for job in jobs:
    # NOTE(review): the keyword is capitalised (`Language`), unlike `city`;
    # confirm the Vacancy model really names its field that way.
    obj = Vacancy(city=city, Language=language, **job)
    try:
        obj.save()
    except DatabaseError:
        # Rows the database rejects are skipped.
        continue

if errors:
    error_record = Errors(data=errors)
    er = error_record.save()

예제 #6
0
def start():
    """Scrape vacancies for every subscribed city/language pair and store them.

    Collects the distinct (city_id, language_id) pairs of users that have
    ``send_email=True``, looks up the stored ``url_data`` for each pair, runs
    every parser on its URL concurrently in the default executor, saves the
    resulting vacancies and records any parser errors under today's date.
    """
    print('Running')

    User = get_user_model()

    # (parser callable, key of that parser's URL inside url_data)
    parsers = ((tut_pars, 'tut_pars'), (bel_pars, 'bel_pars'))
    jobs, errors = [], []

    def get_settings():
        """Return the set of (city_id, language_id) pairs of subscribed users."""
        # Fetch only the two id columns; the full user rows are not needed
        # here and must not be echoed to stdout.
        pairs = User.objects.filter(send_email=True).values_list(
            'city_id', 'language_id')
        return set(pairs)

    def get_urls(_settings):
        """Return a dict with city, language and url_data per pair that has a Url row."""
        qs = Url.objects.all().values()
        url_dct = {(q['city_id'], q['language_id']): q['url_data'] for q in qs}
        # Pairs without a stored Url row are silently left out.
        return [
            {'city': pair[0], 'language': pair[1], 'url_data': url_dct[pair]}
            for pair in _settings
            if pair in url_dct
        ]

    async def main(value):
        """Run one parser in the default executor and collect its results."""
        func, url, city, language = value
        # `loop` is the enclosing-scope variable assigned further down; it is
        # bound before any of these coroutines actually runs.
        job, err = await loop.run_in_executor(None, func, url, city, language)
        errors.extend(err)
        jobs.extend(job)

    settings = get_settings()
    url_list = get_urls(settings)

    loop = asyncio.get_event_loop()
    tmp_tasks = [(func, data['url_data'][key], data['city'], data['language'])
                 for data in url_list for func, key in parsers]
    if tmp_tasks:
        # asyncio.wait() must not be handed an empty collection, hence the guard.
        tasks = asyncio.wait([loop.create_task(main(f)) for f in tmp_tasks])
        # Returns once every task has finished. The loop is deliberately left
        # open (presumably so start() can be called again - TODO confirm).
        loop.run_until_complete(tasks)

    # Author's timings: about 33 s sequentially versus about 20 s with asyncio.
    for job in jobs:
        v = Vacancy(**job)
        try:
            v.save()
        except DatabaseError:
            # Best effort: a vacancy that cannot be stored is skipped so the
            # rest are still saved (presumably duplicates - TODO confirm).
            pass

    if errors:
        err = Error.objects.filter(timestamp=dt.date.today()).first()
        if err is not None:
            err.data.update({'errors': errors})
            err.save()
        else:
            # Store a mapping, not a formatted string: the branch above calls
            # err.data.update(...), which fails if today's record holds a str.
            Error(data={'errors': errors}).save()
예제 #7
0
async def execute(task):
    """
    Run one parser call in the default executor and collect its results.

    ``task`` is a 4-tuple: the parser callable followed by the three values
    handed to it positionally, in the same order.  The script that builds the
    tuples places the URL first, then the location id, then the specialty id.
    The callable's ``(vacancies, errors)`` result is appended to the
    module-level ``data_for_recording_vacancies`` and
    ``data_for_recording_errors`` lists.
    """
    parser, url, location, specialty = task
    outcome = await loop.run_in_executor(None, parser, url, location, specialty)
    vacancy_log, error_log = outcome
    data_for_recording_vacancies.extend(vacancy_log)
    data_for_recording_errors.extend(error_log)


loop = asyncio.get_event_loop()

# One work item per (url_list entry, parser) combination: the parser callable,
# the URL stored under that parser's key, then the location and specialty ids.
instruction = [(function, data['url_json'][parser], data['location_id'],
                data['specialty_id']) for data in url_list
               for function, parser in parsers]
if instruction:
    # asyncio.wait() raises ValueError on an empty collection, so only
    # schedule and run when there is at least one work item.
    tasks = asyncio.wait(
        [loop.create_task(execute(task)) for task in instruction])
    loop.run_until_complete(tasks)
loop.close()

for record in data_for_recording_vacancies:
    to_record = Vacancy(**record)
    try:
        to_record.save()
    except DatabaseError:
        # Best effort: a vacancy that cannot be stored is skipped so the rest
        # are still saved (presumably duplicates of existing rows - TODO confirm).
        pass

if data_for_recording_errors:
    error_dump = Error(error_json=data_for_recording_errors)
    error_dump.save()