Exemplo n.º 1
0
def crawl_lagou_company_data_task(city_id, finance_stage_id, industry_id):
    """Crawl Lagou company data for one filter combination and persist it.

    Iterates over the pages exposed by the companies pagination. Every company
    record is cleaned and converted, its city is created on demand, and the
    company row is updated when a row with the same ``lagou_company_id``
    already exists or inserted otherwise (industry links are only created on
    insert). Finally the job-crawling task is dispatched for every company.

    :param city_id: city id forwarded to the Lagou crawler calls
    :param finance_stage_id: finance stage id forwarded to the crawler calls
    :param industry_id: industry id forwarded to the crawler calls
    """
    companies_pagination = crawlers.get_companies_pagination_from_lagou(city_id=city_id,
                                                                        finance_stage_id=finance_stage_id,
                                                                        industry_id=industry_id)
    for page_no in companies_pagination.iter_pages:
        company_dicts = crawlers.get_companies_from_lagou(city_id=city_id,
                                                          finance_stage_id=finance_stage_id,
                                                          industry_id=industry_id,
                                                          page_no=page_no)
        if not company_dicts:
            # An empty page stops the crawl.
            break
        for company_dict in company_dicts:
            crawlers.clean_lagou_company_data(company_dict)
            utils.convert.convert_dict_field_to_constants(company_dict)

            # These keys are removed before the dict is used as row values.
            industries = company_dict.pop('industries')
            city_name = company_dict.pop('city_name')

            city_ctl.insert_city_if_not_exist(city_name)
            company_dict['city_id'] = city_ctl.get_city_id_by_name(city_name)

            # NOTE(review): company_dict is used both as a mapping and via
            # attribute access, so it is presumably an attribute-style dict.
            company = CompanyModel.get_one(filter_by={'lagou_company_id': company_dict.lagou_company_id})
            if company:
                CompanyModel.update_by_pk(pk=company.id, values=company_dict)
            else:
                company_id = CompanyModel.add(**company_dict)

                for industry in industries:
                    industry_ctl.insert_industry_if_not_exist(name=industry)
                    # Use a dedicated local name: rebinding ``industry_id``
                    # here would clobber the function argument and make the
                    # following page requests use the wrong industry.
                    company_industry_id = industry_ctl.get_industry_id_by_name(name=industry)
                    CompanyIndustryModel.add(industry_id=company_industry_id, company_id=company_id)

            # Dispatched via ``.delay`` (presumably an async task queue call).
            crawl_lagou_job_data_task.delay(company_dict.lagou_company_id)
Exemplo n.º 2
0
def crawl_lagou_job_data_task(lagou_company_id):
    """Crawl the technology jobs of one Lagou company and persist them.

    Each job record is cleaned and converted, its city is created on demand,
    and the job row is updated when a row with the same ``lagou_job_id``
    exists or inserted otherwise (keyword links are only created on insert).

    :param lagou_company_id: Lagou-side id of the company whose jobs are crawled
    """
    # Skip companies whose jobs have already been crawled in this round:
    # a falsy ``setnx`` result is treated as "already handled".
    crawled_key = constants.CRAWLED_COMPANY_JOBS_REDIS_KEY.format(lagou_company_id=lagou_company_id)
    if not redis_instance.setnx(crawled_key, 1):
        return
    jobs_pagination = crawlers.get_jobs_pagination_from_lagou(lagou_company_id=lagou_company_id,
                                                              job_type=constants.LagouJobType.technology)
    # The company is the same for every job, so its id is looked up once (on
    # the first job, exactly where the per-job lookup used to happen) and
    # reused afterwards instead of querying the database for each job.
    company_id = None
    for page_no in jobs_pagination.iter_pages:
        job_dicts = crawlers.get_jobs_from_lagou(lagou_company_id=lagou_company_id,
                                                 job_type=constants.LagouJobType.technology,
                                                 page_no=page_no)
        if not job_dicts:
            # An empty page stops the crawl.
            break
        for job_dict in job_dicts:
            crawlers.clean_lagou_job_data(job_dict)
            utils.convert.convert_dict_field_to_constants(job_dict)

            # These keys are removed before the dict is used as row values.
            keywords = job_dict.pop('keywords')
            city_name = job_dict.pop('city_name')

            city_ctl.insert_city_if_not_exist(city_name)
            job_dict['city_id'] = city_ctl.get_city_id_by_name(city_name)
            if company_id is None:
                company = CompanyModel.get_one(filter_by={'lagou_company_id': lagou_company_id})
                # As before, a missing company surfaces as an AttributeError.
                company_id = company.id
            job_dict['company_id'] = company_id

            job = JobModel.get_one(filter_by={'lagou_job_id': job_dict.lagou_job_id})
            if job:
                JobModel.update_by_pk(pk=job.id, values=job_dict)
            else:
                job_id = JobModel.add(**job_dict)

                for keyword in keywords:
                    keyword_ctl.insert_keyword_if_not_exist(name=keyword)
                    keyword_id = keyword_ctl.get_keyword_id_by_name(name=keyword)
                    JobKeywordModel.add(keyword_id=keyword_id, job_id=job_id)
Exemplo n.º 3
0
def crawl_lagou_job_data_suites(lagou_company_id):
    """Crawl the technology jobs of one Lagou company and store the new ones.

    Job records whose ``is_exist`` flag is falsy are cleaned, converted and
    inserted together with their extra data (advantage, description) and
    keyword links. Records flagged as existing are skipped.

    :param lagou_company_id: Lagou-side id of the company whose jobs are crawled
    """
    jobs_pagination = lagou_jobs_scripts.crawl_lagou_jobs_pagination(
        lagou_company_id=lagou_company_id,
        job_type=constants.LagouJobType.technology)
    # The company is the same for every job, so its id is looked up once (on
    # the first new job, exactly where the per-job lookup used to happen) and
    # reused afterwards instead of querying the database for each job.
    company_id = None
    for page_no in jobs_pagination.iter_pages:
        job_dicts = lagou_jobs_scripts.crawl_lagou_jobs(
            lagou_company_id=lagou_company_id,
            job_type=constants.LagouJobType.technology,
            page_no=page_no)
        if not job_dicts:
            # An empty page stops the crawl.
            break
        for job_dict in job_dicts:
            if not job_dict.is_exist:
                lagou_jobs_scripts.clean_lagou_job_data(job_dict)
                lagou_jobs_scripts.convert_lagou_job_data(job_dict)

                if company_id is None:
                    company = CompanyModel.get_one(
                        filter_by={'lagou_company_id': lagou_company_id})
                    # As before, a missing company surfaces as an
                    # AttributeError.
                    company_id = company.id
                job_dict['company_id'] = company_id
                # These keys are removed before the dict is used as row
                # values; advantage/description go to the extra table.
                keywords = job_dict.pop('keywords')
                advantage = job_dict.pop('advantage')
                description = job_dict.pop('description')
                job_dict.pop('city')

                job_id = JobModel.add(**job_dict)
                JobExtraModel.add(advantage=advantage,
                                  description=description,
                                  job_id=job_id)

                for keyword in keywords:
                    keyword_ctl.insert_keyword_if_not_exist(name=keyword)
                    keyword_id = keyword_ctl.get_keyword_id_by_name(
                        name=keyword)
                    JobKeywordModel.add(keyword_id=keyword_id, job_id=job_id)
Exemplo n.º 4
0
def get_company(company_id=None, lagou_company_id=None):
    """Fetch one company using whichever identifiers were supplied.

    Every truthy argument becomes an entry of the ``filter_by`` mapping that
    is handed to ``CompanyModel.get_one``.

    :param company_id: value stored under the ``company_id`` filter key
    :param lagou_company_id: value stored under the ``lagou_company_id`` filter key
    :return: the result of ``CompanyModel.get_one``
    :raises ValueError: when both arguments are falsy
    """
    if not (company_id or lagou_company_id):
        raise ValueError('必须指定过滤条件')

    candidates = (
        ('company_id', company_id),
        ('lagou_company_id', lagou_company_id),
    )
    # Only identifiers that were actually provided take part in the lookup.
    conditions = {key: value for key, value in candidates if value}
    return CompanyModel.get_one(filter_by=conditions)
Exemplo n.º 5
0
def get_finance_stage_statistic(jobs):
    """
    Build finance-stage statistics for the companies referenced by ``jobs``.

    The companies are loaded by the ``company_id`` of each job and their
    ``finance_stage`` values are passed to
    ``utils.common.get_field_statistics``.

    :param jobs: webspider.models.JobModel instances list
    :return: the value returned by ``utils.common.get_field_statistics``
        (documented by the original author as a collections.Counter)
    """
    referenced_ids = [job.company_id for job in jobs]
    id_filter = CompanyModel.id.in_(referenced_ids)
    matched_companies = CompanyModel.list(filter=id_filter)
    stages = [company.finance_stage for company in matched_companies]

    return utils.common.get_field_statistics(
        values=stages,
        constants_dict=constants.FINANCE_STAGE_DICT)
Exemplo n.º 6
0
def crawl_lagou_company_data_suites(city_id, finance_stage_id, industry_id):
    """Crawl Lagou companies for one filter combination and store new ones.

    Company records whose ``is_exist`` flag is falsy are cleaned, converted
    and inserted together with their extra data (introduce, advantage) and
    industry links. The job crawl is then run for every company on the page,
    whether or not it was newly inserted.

    :param city_id: city id forwarded to the Lagou crawler calls
    :param finance_stage_id: finance stage id forwarded to the crawler calls
    :param industry_id: industry id forwarded to the crawler calls
    """
    companies_pagination = lagou_companies_scripts.crawl_lagou_companies_pagination(
        city_id=city_id,
        finance_stage_id=finance_stage_id,
        industry_id=industry_id)
    for page_no in companies_pagination.iter_pages:
        company_dicts = lagou_companies_scripts.crawl_lagou_companies(
            city_id=city_id,
            finance_stage_id=finance_stage_id,
            industry_id=industry_id,
            page_no=page_no)
        if not company_dicts:
            # An empty page stops the crawl.
            break
        for company_dict in company_dicts:
            if not company_dict.is_exist:
                lagou_companies_scripts.clean_lagou_company_data(company_dict)
                lagou_companies_scripts.convert_lagou_company_data(
                    company_dict)

                # These keys are removed before the dict is used as row
                # values; introduce/advantage go to the extra table.
                industries = company_dict.pop('industries')
                advantage = company_dict.pop('advantage')
                introduce = company_dict.pop('introduce')
                company_dict.pop('city')

                company_id = CompanyModel.add(**company_dict)
                CompanyExtraModel.add(introduce=introduce,
                                      company_id=company_id,
                                      advantage=advantage)

                for industry in industries:
                    industry_ctl.insert_industry_if_not_exist(name=industry)
                    # Use a dedicated local name: rebinding ``industry_id``
                    # here would clobber the function argument and make the
                    # following page requests use the wrong industry.
                    company_industry_id = industry_ctl.get_industry_id_by_name(
                        name=industry)
                    CompanyIndustryModel.add(industry_id=company_industry_id,
                                             company_id=company_id)
            crawl_lagou_job_data_suites(company_dict.lagou_company_id)
Exemplo n.º 7
0
def add_company(values):
    """Insert a company built from ``values``.

    :param values: mapping unpacked as keyword arguments for ``CompanyModel.add``
    :return: whatever ``CompanyModel.add`` returns
    """
    created = CompanyModel.add(**values)
    return created
Exemplo n.º 8
0
def update_company(company_id, values):
    """Update the company selected by ``company_id`` with ``values``.

    :param company_id: value stored under the ``company_id`` filter key
    :param values: new values forwarded to ``CompanyModel.update``
    :return: whatever ``CompanyModel.update`` returns
    """
    criteria = {'company_id': company_id}
    return CompanyModel.update(filter_by=criteria, values=values)