예제 #1
0

from scraping.models import Vacancy, City, Language, Error

# Parser callables paired with the search-results URL each one is called with.
parsers = (
    (work, 'https://www.work.ua/ru/jobs-kyiv-python/'),
    (rabota, 'https://rabota.ua/zapros/python/%D0%BA%D0%B8%D0%B5%D0%B2'),
    (dou, 'https://jobs.dou.ua/vacancies/?category=Python&search=%D0%9A%D0%B8%D0%B5%D0%B2'),
    (djinni, 'https://djinni.co/jobs/keyword-python/kyiv/'),
)

# The City / Language rows attached to every vacancy saved below
# (``first()`` yields None when no row has the given slug).
city = City.objects.filter(slug='kiev').first()
language = Language.objects.filter(slug='python').first()

# Every parser returns a (vacancies, errors) pair; pool both across all sites.
jobs = []
errors = []
for parse, page_url in parsers:
    found, failed = parse(page_url)
    jobs.extend(found)
    errors.extend(failed)

for fields in jobs:
    vacancy = Vacancy(**fields, city=city, language=language)
    try:
        vacancy.save()
    except DatabaseError:
        # A DatabaseError on save is ignored so the remaining items
        # are still processed.
        continue

# Store the collected parser errors only when there are any.
if errors:
    er = Error(data=errors).save()
    
# with codecs.open('FROM_4_SITES.json', 'w', 'utf-8') as file:
    # json.dump(jobs, file, ensure_ascii=False, indent=4)
예제 #2
0
loop = asyncio.get_event_loop()

# One work item per (url_list entry, parser): the parser callable, the URL
# stored under that parser's key, and the entry's city and language values.
tmp_tasks = [
    (func, data['url_data'][key], data['city'], data['language'])
    for data in url_list
    for func, key in parsers
]

# Sequential equivalent, kept for reference:
# for data in url_list:
#
#     for func, key in parsers:
#         url = data['url_data'][key]
#         j, e = func(url, city=data['city'], language=data['language'])
#         jobs += j
#         errors += e
if tmp_tasks:
    # Schedule every work item on the loop, then block until all are done.
    scheduled = [loop.create_task(main(item)) for item in tmp_tasks]
    tasks = asyncio.wait(scheduled)
    loop.run_until_complete(tasks)
    loop.close()

for fields in jobs:
    vacancy = Vacancy(**fields)
    try:
        vacancy.save()
    except DatabaseError:
        # A DatabaseError on save is ignored so the remaining items
        # are still processed.
        continue

# Store the collected parser errors only when there are any.
if errors:
    er = Error(data=errors).save()

#h = codecs.open('hh.txt', 'w', 'utf-8')
#h.write(str(jobs))
#h.close()

# Delete vacancies whose timestamp is on or before the date ten days ago.
ten_days_ago = dt.date.today() - dt.timedelta(days=10)
Vacancy.objects.filter(timestamp__lte=ten_days_ago).delete()
예제 #3
0
# Parser callables paired with the search-results URL each one is called with.
parsers = (
    (work, 'https://www.work.ua/jobs-kyiv-python/'),
    (rabota,
     'https://rabota.ua/zapros/python/%D1%83%D0%BA%D1%80%D0%B0%D0%B8%D0%BD%D0%B0'),
    (dou,
     'https://jobs.dou.ua/vacancies/?category=Python&search=%D0%9A%D0%B8%D0%B5%D0%B2'),
    (djinni, 'https://djinni.co/jobs/keyword-python/kyiv/'),
)

# The City / Language rows attached to every vacancy saved below
# (``first()`` yields None when no row has the given slug).
city = City.objects.filter(slug='kyiv').first()
language = Language.objects.filter(slug='python').first()

# Every parser returns a (vacancies, errors) pair; pool both across all sites.
jobs = []
errors = []
for parser, link in parsers:
    result = parser(link)
    vacancies, failures = result
    jobs.extend(vacancies)
    errors.extend(failures)

for item in jobs:
    row = Vacancy(city=city, language=language, **item)
    try:
        row.save()
    except DatabaseError:
        # A DatabaseError on save is ignored so the remaining items
        # are still processed.
        continue

# Store the collected parser errors only when there are any.
if errors:
    er = Error(data=errors).save()

# with open('vacancies.txt', 'w', encoding='utf-8') as f:
#     f.write(str(jobs))
예제 #4
0
# One work item per (url_list entry, parser): the parser callable, the URL
# stored under that parser's key, and the entry's city and language values.
tmp_tasks = [(func, data['url_data'][key], data['city'], data['language'])
             for data in url_list
             for func, key in parsers]

# Sequential equivalent, kept for reference:
# for data in url_list:
#
#     for func, key in parsers:
#         url = data['url_data'][key]
#         j, e = func(url, city=data['city'], language=data['language'])
#         jobs += j
#         errors += e
if tmp_tasks:
    # Schedule every work item on the loop, then block until all are done.
    tasks = asyncio.wait([loop.create_task(main(f)) for f in tmp_tasks])
    loop.run_until_complete(tasks)
    loop.close()

for job in jobs:
    print(job)
    vac = Vacancy(**job)
    try:
        vac.save()
    except DatabaseError:
        # A DatabaseError on save is ignored so the remaining items
        # are still processed.
        pass

if errors:
    # Reuse today's Error row when one exists, otherwise create it.
    err = Error.objects.filter(timestamp=dt.date.today()).first()
    if err is not None:
        # Replaces whatever was stored under 'errors' earlier today.
        err.data.update({'errors': errors})
        err.save()
    else:
        # Store a dict, not a formatted string: the branch above calls
        # ``err.data.update(...)`` on this value on the next run of the day,
        # which would raise AttributeError on a str.
        Error(data={'errors': errors}).save()
예제 #5
0
#
# for data in url_list:
#
#     for func, key in parsers:
#         # print(func, url)
#         url = data['url_data'][key]
#         j, e = func(url, city=data['city'], language=data['language'])
#         jobs += j
#         errors += e

# Block until every scheduled parser task has finished, then release the loop.
loop.run_until_complete(tasks)
loop.close()
# print(time.time() - start)  # step 3: measure how long the requests took

for job in jobs:
    v = Vacancy(**job)  # unpack the scraped dict into model fields
    try:
        v.save()  # write the vacancy to the database
    except DatabaseError:
        # A DatabaseError on save is ignored so the remaining items
        # are still processed.
        pass

if errors:
    # Reuse today's Error row when one exists (presumably there is at most
    # one per day -- confirm against the model), otherwise create it.
    err = Error.objects.filter(timestamp=dt.date.today()).first()
    if err is not None:  # errors were already recorded earlier today
        # Replaces whatever was stored under 'errors' earlier today.
        err.data.update({'errors': errors})
        err.save()
    else:
        # Store a dict, not a formatted string: the branch above calls
        # ``err.data.update(...)`` on this value on the next run of the day,
        # which would raise AttributeError on a str.
        Error(data={'errors': errors}).save()
# Lesson 52: commented out
                    j, e = func(url,
                                city=data['city'],
                                language=data['language'])
                    jobs.extend(j)
                    errors.extend(e)
            else:
                print(data['url_data'], type(data['url_data']))
# =====================================================================    Non-asynchronous (sequential) way of running

# print((time.time() - start) / 10)
# print(*jobs,len(jobs), sep='\n')
# print(errors)

# Persist each scraped item; a DatabaseError on save skips just that item.
for fields in jobs:
    record = Vacancy(**fields)
    try:
        record.save()
    except DatabaseError:
        continue

# A single hard-coded error record with url, cause and status_code keys.
errors_exp = [
    dict(
        url='https://www.amalgama-lab.com/songs/m/misfits/mars_attacks.html',
        cause='HZ',
        status_code=200,
    ),
]
"""
for production server choose section 1 ------- 
for local server choose section 2 ========
"""
# if errors_exp:
예제 #7
0
url_list = get_urls(settings)

loop = asyncio.get_event_loop()
# Work items for the asynchronous run, one per (url_list entry, parser):
# the parser callable, the URL stored under that parser's key, and the
# entry's city and language values.
tmp_tasks = [
    (func, data['url_data'][key], data['city'], data['language'])
    for data in url_list
    for func, key in parsers
]
if tmp_tasks:
    # asyncio.wait() raises ValueError when given an empty collection, so
    # only schedule and wait when there is at least one work item.
    tasks = asyncio.wait([loop.create_task(main(f)) for f in tmp_tasks])
    loop.run_until_complete(tasks)
loop.close()

for job in jobs:
    v = Vacancy(**job)  # unpack the scraped dict into model fields
    try:
        v.save()
    except DatabaseError:
        # A DatabaseError on save is ignored so the remaining items
        # are still processed.
        pass

if errors:
    # Reuse today's Error row when one exists, otherwise create it.
    er = Error.objects.filter(timestamp=dt.date.today()).first()
    if er is not None:
        # Replaces whatever was stored under 'errors' earlier today.
        er.data.update({'errors': errors})
        er.save()
    else:
        # Store a dict, not a formatted string: the branch above calls
        # ``er.data.update(...)`` on this value on the next run of the day,
        # which would raise AttributeError on a str.
        Error(data={'errors': errors}).save()

ten_days_ago = dt.date.today() - dt.timedelta(10)
예제 #8
0

from scraping.parser import *
from scraping.models import Vacancy, City, Language, Errors

# Parser callables paired with the search-results URL each one is called with.
parsers = (
    (hh, 'https://hh.ru/search/vacancy?st=searchVacancy&L_profession_id=29.8&area=2760&no_magic=true&text=%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%81%D1%82+Python'),
    (rabota, 'https://rabota.ua/zapros/python/%d0%ba%d0%b8%d0%b5%d0%b2'),
)

# Every parser returns a (vacancies, errors) pair; pool both across all sites.
jobs = []
errors = []
for parse, page_url in parsers:
    found, failed = parse(page_url)
    jobs.extend(found)
    errors.extend(failed)

# The City / Language rows attached to every vacancy saved below
# (``first()`` yields None when no row has the given slug).
city = City.objects.filter(slug='ny').first()
language = Language.objects.filter(slug='python').first()

for fields in jobs:
    # NOTE(review): the keyword below is spelled ``Language`` with a capital
    # L -- confirm it matches the field name declared on Vacancy; a model
    # constructor raises TypeError for an unknown keyword.
    entry = Vacancy(**fields, city=city, Language=language)
    try:
        entry.save()
    except DatabaseError:
        # A DatabaseError on save is ignored so the remaining items
        # are still processed.
        continue

# Store the collected parser errors only when there are any.
if errors:
    er = Errors(data=errors).save()

예제 #9
0
django.setup()

from django.db import DatabaseError
from scraping.parsers import *
from scraping.models import Vacancy, City, Programming_language

# Parser callables paired with the search-results URL each one is called with.
parsers = ((
    tut_by,
    'https://jobs.tut.by/search/vacancy?area=1002&fromSearchLine=true&st=searchVacancy&text=Python'
), (trudbox, 'http://trudbox.by/minsk?whatQuery=Python+'))

# The City / Programming_language rows attached to every vacancy saved below
# (``first()`` yields None when no row has the given slug).
city = City.objects.filter(slug='minsk').first()
programming_language = Programming_language.objects.filter(
    slug='python').first()

# Every parser returns a (vacancies, errors) pair; pool both across all sites.
# NOTE(review): ``errors`` is collected but not stored anywhere in this
# snippet -- confirm whether that is intended.
jobs, errors = [], []
for func, url in parsers:
    j, e = func(url)
    jobs += j
    errors += e

for job in jobs:
    v = Vacancy(**job, city=city, programming_language=programming_language)
    try:
        v.save()
    except DatabaseError:
        # A DatabaseError on save is ignored so the remaining items
        # are still processed.
        pass

# Dump the raw scraped data to a UTF-8 text file.  The ``with`` block closes
# the handle even when the write raises.
with codecs.open('work.txt', 'w', 'utf-8') as h:
    h.write(str(jobs))
예제 #10
0
def start():
    """Scrape vacancies for every subscribed (city, language) pair and store them.

    Steps performed:

    1. Collect the distinct ``(city_id, language_id)`` pairs of users whose
       ``send_email`` flag is set.
    2. Look up the ``url_data`` stored in ``Url`` for each of those pairs.
    3. Call every parser for every pair through the event loop's default
       executor and pool the returned jobs and errors.
    4. Save each job as a ``Vacancy``; a ``DatabaseError`` on save is ignored.
    5. If any errors were collected, store them on today's ``Error`` row,
       creating the row when it does not exist yet.

    Returns:
        None.
    """
    print('Running')

    User = get_user_model()

    # Parser callables paired with the key their URL is stored under in
    # each ``url_data`` mapping.
    parsers = ((tut_pars, 'tut_pars'), (bel_pars, 'bel_pars'))
    jobs, errors = [], []

    def get_settings():
        """Return the set of (city_id, language_id) pairs of subscribed users."""
        qs = User.objects.filter(send_email=True).values()
        print(qs)
        return {(q['city_id'], q['language_id']) for q in qs}

    def get_urls(_settings):
        """Return city/language/url_data dicts for the pairs that have a Url row."""
        qs = Url.objects.all().values()
        url_dct = {(q['city_id'], q['language_id']): q['url_data'] for q in qs}
        return [
            {'city': pair[0], 'language': pair[1], 'url_data': url_dct[pair]}
            for pair in _settings
            if pair in url_dct
        ]

    async def main(value):
        """Run one parser call in the default executor and pool its results."""
        func, url, city, language = value
        job, err = await loop.run_in_executor(None, func, url, city, language)
        errors.extend(err)
        jobs.extend(job)

    settings = get_settings()
    url_list = get_urls(settings)

    # The original author's timing notes put a plain sequential loop over the
    # parsers at about 33 s and this executor-based run at about 20 s.
    loop = asyncio.get_event_loop()
    tmp_tasks = [(func, data['url_data'][key], data['city'], data['language'])
                 for data in url_list for func, key in parsers]
    if tmp_tasks:
        tasks = asyncio.wait([loop.create_task(main(f)) for f in tmp_tasks])
        loop.run_until_complete(tasks)
        # loop.close() is not called here (it was commented out in the
        # original) -- presumably so start() can be invoked again; confirm.

    for job in jobs:
        v = Vacancy(**job)
        try:
            v.save()
        except DatabaseError:
            # A DatabaseError on save is ignored so the remaining items
            # are still processed.
            pass

    if errors:
        # Reuse today's Error row when one exists, otherwise create it.
        err = Error.objects.filter(timestamp=dt.date.today()).first()
        if err is not None:
            # Replaces whatever was stored under 'errors' earlier today.
            err.data.update({'errors': errors})
            err.save()
        else:
            # Store a dict, not a formatted string: the branch above calls
            # ``err.data.update(...)`` on this value on the next run of the
            # day, which would raise AttributeError on a str.
            Error(data={'errors': errors}).save()
예제 #11
0
async def execute(task):
    """
    Run one parser call in the loop's default executor and collect its output.

    ``task`` is a 4-item sequence: a callable followed by the three positional
    arguments it is called with.  The callable must return a
    ``(vacancies, errors)`` pair of iterables, which are appended to the
    module-level ``data_for_recording_vacancies`` and
    ``data_for_recording_errors`` lists respectively.
    """
    # NOTE(review): the locals follow the order in which the ``instruction``
    # list in this script builds its tuples (function, url, location id,
    # specialty id).  Arguments are forwarded positionally in tuple order, so
    # the call itself does not depend on these names -- confirm the parsers
    # expect that order.
    function, url, location, specialty = task
    pending = loop.run_in_executor(None, function, url, location, specialty)
    vacancy_log, error_log = await pending
    data_for_recording_vacancies.extend(vacancy_log)
    data_for_recording_errors.extend(error_log)


loop = asyncio.get_event_loop()

# One work item per (url_list entry, parser): the parser callable, the URL
# stored under that parser's key, and the entry's location and specialty ids.
instruction = [(function, data['url_json'][parser], data['location_id'],
                data['specialty_id']) for data in url_list
               for function, parser in parsers]
if instruction:
    # asyncio.wait() raises ValueError when given an empty collection, so
    # only schedule and wait when there is at least one work item.
    tasks = asyncio.wait(
        [loop.create_task(execute(task)) for task in instruction])
    loop.run_until_complete(tasks)
loop.close()

for record in data_for_recording_vacancies:
    to_record = Vacancy(**record)
    try:
        to_record.save()
    except DatabaseError:
        # A DatabaseError on save is ignored so the remaining items
        # are still processed.
        pass

# Store the collected parser errors only when there are any.
if data_for_recording_errors:
    error_dump = Error(error_json=data_for_recording_errors)
    error_dump.save()
예제 #12
0
from scraping.models import Vacancy, City, Language, Error
from scraping.parsers import headhunter

# Parser callables paired with the search-results URL each one is called with.
parsers = (
    (
        headhunter,
        'https://hh.ru/search/vacancy?area=1&st=searchVacancy&fromSearch=true&text=Python&from=suggest_post',
    ),
)

# The City / Language rows attached to every vacancy saved below
# (``first()`` yields None when no row has the given slug).
city = City.objects.filter(slug='moskva').first()
language = Language.objects.filter(slug='python').first()

# Every parser returns a (vacancies, errors) pair; pool both across all sites.
works = []
errors = []
for parse, page_url in parsers:
    found, failed = parse(page_url)
    works.extend(found)
    errors.extend(failed)

for fields in works:
    vacancy = Vacancy(**fields, city=city, language=language)
    try:
        vacancy.save()
    except DatabaseError:
        # A DatabaseError on save is ignored so the remaining items
        # are still processed.
        continue

# Store the collected parser errors only when there are any.
if errors:
    er = Error(data=errors).save()

# h = codecs.open('headhunter.txt', 'w', 'utf-8')
# h.write(str(works))
# h.close()
예제 #13
0
django.setup()

from scraping.parsers import *

from scraping.models import Vacancy, City, LanguageProgramming

# Parser callables paired with the search-results URL each one is called with.
parsers = ((
    habr_vacancy, 'https://career.habr.com/vacancies?q=Python&type=suitable'
), (head_hunter,
    'https://ufa.hh.ru/search/vacancy?area=1&clusters=true&enable_snippets=true&text=python&experience=noExperience&from=cluster_experience&showClusters=false'
    ))

# The City / LanguageProgramming rows attached to every vacancy saved below
# (``first()`` yields None when no row has the given slug).
city = City.objects.filter(slug='moskva').first()
language = LanguageProgramming.objects.filter(slug='python').first()

# Every parser returns a (vacancies, errors) pair; pool both across all sites.
# NOTE(review): ``errors`` is collected but not stored anywhere in this
# snippet -- confirm whether that is intended.
jobs, errors = [], []
for func, url in parsers:
    j, e = func(url)
    jobs += j
    errors += e

for item in jobs:
    v = Vacancy(**item, city=city, language=language)
    try:
        v.save()
    except DatabaseError:
        # A DatabaseError on save is ignored so the remaining items
        # are still processed.
        pass

# h = codecs.open('../work.html', 'w', 'utf-8')
# h.write(str(jobs))
# h.close()