def crawl_lagou_company_data_task(city_id, finance_stage_id, industry_id):
    """Crawl Lagou company data for one city / finance stage / industry filter.

    Iterates over the pages reported by the companies pagination, cleans and
    converts every company record, then either updates the stored company
    (looked up by ``lagou_company_id``) or inserts it together with its
    industry links. ``crawl_lagou_job_data_task.delay`` is invoked for every
    company record that is processed.

    :param city_id: city filter forwarded to the Lagou crawlers
    :param finance_stage_id: finance stage filter forwarded to the crawlers
    :param industry_id: industry filter forwarded to the crawlers
    """
    companies_pagination = crawlers.get_companies_pagination_from_lagou(
        city_id=city_id,
        finance_stage_id=finance_stage_id,
        industry_id=industry_id)
    for page_no in companies_pagination.iter_pages:
        company_dicts = crawlers.get_companies_from_lagou(
            city_id=city_id,
            finance_stage_id=finance_stage_id,
            industry_id=industry_id,
            page_no=page_no)
        if not company_dicts:
            # Stop paging as soon as a page comes back empty.
            break
        for company_dict in company_dicts:
            crawlers.clean_lagou_company_data(company_dict)
            utils.convert.convert_dict_field_to_constants(company_dict)

            # These two fields are handled separately and must not be passed
            # to CompanyModel along with the remaining values.
            industries = company_dict.pop('industries')
            city_name = company_dict.pop('city_name')

            city_ctl.insert_city_if_not_exist(city_name)
            company_dict['city_id'] = city_ctl.get_city_id_by_name(city_name)

            company = CompanyModel.get_one(
                filter_by={'lagou_company_id': company_dict.lagou_company_id})
            if company:
                CompanyModel.update_by_pk(pk=company.id, values=company_dict)
            else:
                company_id = CompanyModel.add(**company_dict)
                for industry in industries:
                    industry_ctl.insert_industry_if_not_exist(name=industry)
                    # Use a dedicated local name: rebinding ``industry_id``
                    # here would overwrite the crawl filter that is still
                    # needed to request the following pages.
                    linked_industry_id = industry_ctl.get_industry_id_by_name(
                        name=industry)
                    CompanyIndustryModel.add(industry_id=linked_industry_id,
                                             company_id=company_id)

            crawl_lagou_job_data_task.delay(company_dict.lagou_company_id)
def crawl_lagou_job_data_task(lagou_company_id):
    """Crawl the technology jobs of one Lagou company and store them.

    Each job record is cleaned and converted, tagged with its city and
    company ids, and then either written over the stored job (looked up by
    ``lagou_job_id``) or inserted together with its keyword links.

    :param lagou_company_id: Lagou-side identifier of the company to crawl
    """
    # Skip companies whose jobs have already been crawled in this round:
    # the function returns when ``setnx`` reports a falsy result.
    crawled_key = constants.CRAWLED_COMPANY_JOBS_REDIS_KEY.format(
        lagou_company_id=lagou_company_id)
    if not redis_instance.setnx(crawled_key, 1):
        return

    pagination = crawlers.get_jobs_pagination_from_lagou(
        lagou_company_id=lagou_company_id,
        job_type=constants.LagouJobType.technology)
    for page_no in pagination.iter_pages:
        page_jobs = crawlers.get_jobs_from_lagou(
            lagou_company_id=lagou_company_id,
            job_type=constants.LagouJobType.technology,
            page_no=page_no)
        if not page_jobs:
            # Stop paging as soon as a page comes back empty.
            break

        for job_data in page_jobs:
            crawlers.clean_lagou_job_data(job_data)
            utils.convert.convert_dict_field_to_constants(job_data)

            keywords = job_data.pop('keywords')
            city_name = job_data.pop('city_name')

            city_ctl.insert_city_if_not_exist(city_name)
            job_data['city_id'] = city_ctl.get_city_id_by_name(city_name)

            # NOTE(review): no handling for a missing company here; the
            # ``.id`` access assumes the lookup finds one — confirm.
            owner = CompanyModel.get_one(
                filter_by={'lagou_company_id': lagou_company_id})
            job_data['company_id'] = owner.id

            stored_job = JobModel.get_one(
                filter_by={'lagou_job_id': job_data.lagou_job_id})
            if stored_job:
                # Known job: refresh its values; keyword links are untouched.
                JobModel.update_by_pk(pk=stored_job.id, values=job_data)
                continue

            job_id = JobModel.add(**job_data)
            for keyword in keywords:
                keyword_ctl.insert_keyword_if_not_exist(name=keyword)
                keyword_id = keyword_ctl.get_keyword_id_by_name(name=keyword)
                JobKeywordModel.add(keyword_id=keyword_id, job_id=job_id)
def crawl_lagou_job_data_suites(lagou_company_id):
    """Crawl the technology jobs of one Lagou company and store the new ones.

    Job records whose ``is_exist`` attribute is truthy are skipped. Every
    other record is cleaned, converted and inserted, along with a
    ``JobExtraModel`` row (advantage, description) and its keyword links.

    :param lagou_company_id: Lagou-side identifier of the company to crawl
    """
    pagination = lagou_jobs_scripts.crawl_lagou_jobs_pagination(
        lagou_company_id=lagou_company_id,
        job_type=constants.LagouJobType.technology)
    for page_no in pagination.iter_pages:
        page_jobs = lagou_jobs_scripts.crawl_lagou_jobs(
            lagou_company_id=lagou_company_id,
            job_type=constants.LagouJobType.technology,
            page_no=page_no)
        if not page_jobs:
            # Stop paging as soon as a page comes back empty.
            break

        for job_data in page_jobs:
            if job_data.is_exist:
                continue

            lagou_jobs_scripts.clean_lagou_job_data(job_data)
            lagou_jobs_scripts.convert_lagou_job_data(job_data)

            # NOTE(review): assumes the company lookup finds a row — confirm.
            job_data['company_id'] = CompanyModel.get_one(
                filter_by={'lagou_company_id': lagou_company_id}).id

            # Fields stored outside of JobModel are removed before the insert.
            keywords = job_data.pop('keywords')
            advantage = job_data.pop('advantage')
            description = job_data.pop('description')
            job_data.pop('city')

            job_id = JobModel.add(**job_data)
            JobExtraModel.add(advantage=advantage,
                              description=description,
                              job_id=job_id)
            for keyword in keywords:
                keyword_ctl.insert_keyword_if_not_exist(name=keyword)
                keyword_id = keyword_ctl.get_keyword_id_by_name(name=keyword)
                JobKeywordModel.add(keyword_id=keyword_id, job_id=job_id)
def get_company(company_id=None, lagou_company_id=None):
    """Fetch one company matching the given identifiers.

    Only truthy arguments are used as filter conditions.

    :param company_id: value for the ``company_id`` filter key
    :param lagou_company_id: value for the ``lagou_company_id`` filter key
    :return: the result of ``CompanyModel.get_one``
    :raises ValueError: when neither argument is truthy
    """
    if not (company_id or lagou_company_id):
        raise ValueError('必须指定过滤条件')

    # NOTE(review): other code in this file addresses companies through
    # ``CompanyModel.id``; confirm ``company_id`` is a valid filter key.
    candidates = (
        ('company_id', company_id),
        ('lagou_company_id', lagou_company_id),
    )
    filter_by = {key: value for key, value in candidates if value}
    return CompanyModel.get_one(filter_by=filter_by)
def get_finance_stage_statistic(jobs):
    """Collect finance-stage statistics for the companies behind ``jobs``.

    :param jobs: webspider.models.JobModel instances list
    :return: the result of ``utils.common.get_field_statistics``
        (documented by the original author as a ``collections.Counter``)
    """
    owner_ids = [job.company_id for job in jobs]
    owners = CompanyModel.list(filter=CompanyModel.id.in_(owner_ids))
    return utils.common.get_field_statistics(
        values=[owner.finance_stage for owner in owners],
        constants_dict=constants.FINANCE_STAGE_DICT)
def crawl_lagou_company_data_suites(city_id, finance_stage_id, industry_id):
    """Crawl Lagou companies for one filter combination and store new ones.

    Company records whose ``is_exist`` attribute is falsy are cleaned,
    converted and inserted, along with a ``CompanyExtraModel`` row
    (introduce, advantage) and their industry links. The job crawl
    ``crawl_lagou_job_data_suites`` is then run for every company record.

    :param city_id: city filter forwarded to the Lagou crawl scripts
    :param finance_stage_id: finance stage filter forwarded to the scripts
    :param industry_id: industry filter forwarded to the scripts
    """
    companies_pagination = lagou_companies_scripts.crawl_lagou_companies_pagination(
        city_id=city_id,
        finance_stage_id=finance_stage_id,
        industry_id=industry_id)
    for page_no in companies_pagination.iter_pages:
        company_dicts = lagou_companies_scripts.crawl_lagou_companies(
            city_id=city_id,
            finance_stage_id=finance_stage_id,
            industry_id=industry_id,
            page_no=page_no)
        if not company_dicts:
            # Stop paging as soon as a page comes back empty.
            break
        for company_dict in company_dicts:
            if not company_dict.is_exist:
                lagou_companies_scripts.clean_lagou_company_data(company_dict)
                lagou_companies_scripts.convert_lagou_company_data(
                    company_dict)

                # Fields stored outside of CompanyModel are removed before
                # the insert.
                industries = company_dict.pop('industries')
                advantage = company_dict.pop('advantage')
                introduce = company_dict.pop('introduce')
                company_dict.pop('city')

                company_id = CompanyModel.add(**company_dict)
                CompanyExtraModel.add(introduce=introduce,
                                      company_id=company_id,
                                      advantage=advantage)
                for industry in industries:
                    industry_ctl.insert_industry_if_not_exist(name=industry)
                    # Use a dedicated local name: rebinding ``industry_id``
                    # here would overwrite the crawl filter that is still
                    # needed to request the following pages.
                    linked_industry_id = industry_ctl.get_industry_id_by_name(
                        name=industry)
                    CompanyIndustryModel.add(industry_id=linked_industry_id,
                                             company_id=company_id)

            crawl_lagou_job_data_suites(company_dict.lagou_company_id)
def add_company(values):
    """Insert a company built from ``values``.

    :param values: mapping expanded into keyword arguments of
        ``CompanyModel.add``
    :return: the result of ``CompanyModel.add`` (other code in this file
        treats it as the new company's id — confirm)
    """
    created = CompanyModel.add(**values)
    return created
def update_company(company_id, values):
    """Update the company selected by ``company_id`` with ``values``.

    :param company_id: value matched against the ``company_id`` filter key
    :param values: field values passed through to ``CompanyModel.update``
    :return: the result of ``CompanyModel.update``
    """
    # NOTE(review): other code in this file addresses companies through
    # ``CompanyModel.id`` / ``update_by_pk``; confirm ``company_id`` is a
    # valid filter key for CompanyModel.
    selector = {'company_id': company_id}
    return CompanyModel.update(filter_by=selector, values=values)