# Example #1
# 0
def file2job(file, zhinengleibie, province):
    """Parse one saved job-posting HTML page into a ``Job``.

    Args:
        file: Path to the saved page; the file name minus ``.html`` becomes
            ``job.job_id``. The file is read as GBK text.
        zhinengleibie: Job-function category supplied by the caller; stored
            on the job and also used as ``job.career``.
        province: Province label supplied by the caller; '深圳' is recorded
            as '广东'.

    Returns:
        The populated ``Job``, or ``None`` when the page cannot be decoded,
        a required element is missing, ``get_salary`` leaves
        ``monthly_salary`` at -1, or the title/company is blacklisted.
    """
    job = Job()
    job.zhinengleibie = zhinengleibie
    job.province = '广东' if province == '深圳' else province
    job.job_id = path.split(file)[-1].replace(".html", "")

    try:
        # The context manager closes the file; no explicit close() is needed.
        with open(file, mode='r', encoding='gbk') as f:
            content = f.read()
    except UnicodeDecodeError:
        print("UnicodeDecodeError")
        return None

    soup = BeautifulSoup(content, "html.parser")
    title_tag = soup.select_one('title')
    if not title_tag:
        return None
    job.page_title = title_tag.text
    # Postings flagged as recruiting for another location get a fixed
    # province/city label instead of the one parsed from the summary line.
    is_remote_posting = '异地招聘' in job.page_title
    if is_remote_posting:
        job.province = '异地招聘'
        job.city = '异地招聘'

    # The career is taken directly from the caller-supplied category.
    job.career = zhinengleibie

    # Salary text examples: 20-40万/年, 1-1.5万/月, 10万以上/月, 100万以上/年,
    # 3-4.5千/月, 17元/小时
    salary_tag = soup.select_one('.cn strong')
    if not salary_tag:
        return None
    job.get_salary(salary_tag.text)
    # -1 marks a salary that should not be counted (the original comment
    # mentions temporary/hourly work).
    if job.monthly_salary == -1:
        return None

    h1_tag = soup.find("h1")
    if h1_tag is None:
        return None
    job.title = h1_tag.text.strip()

    if any(key in job.title for key in title_key_blacklist):
        return None

    # Summary line example: '深圳-福田区|5-7年经验|本科|招1人|04-01发布'
    msg_tag = soup.select_one(".msg")
    if msg_tag is None:
        return None
    job.job_summary = msg_tag.text.replace('\xa0', '').replace(' ',
                                                               '').strip()
    infos = job.job_summary.split('|')
    # The first field is the location ('city-district'). Do not let it
    # overwrite the '异地招聘' marker set above.
    if not is_remote_posting:
        job.city = infos[0].split('-')[0]

    for info in infos[1:]:
        if '经验' in info and not job.check_working_experience():
            job.get_working_experience(info)

        # Education: each field is offered to get_edu() until check_edu()
        # reports success.
        if not job.check_edu():
            job.get_edu(info)

        # Head count, e.g. '招1人'; '若干' ("several") is counted as 5.
        if '招' in info and '人' in info:
            headcount_string = info.replace('招', '').replace('人', '')
            if headcount_string == '若干':
                job.headcount = 5
            else:
                job.headcount = int(headcount_string)

        if info.endswith('发布'):
            # The page only carries month-day; the year is hard-coded.
            date_string = "2020-" + info.replace("发布", '')
            job.publish_date = datetime.strptime(date_string, '%Y-%m-%d')
            # weekday() is 5/6 for Saturday/Sunday.
            job.published_on_weekend = job.publish_date.weekday() > 4

        # Language requirements mentioned in the summary line.
        if '英语' in info or '英文' in info:
            job.english = True
        if '日语' in info or '日文' in info:
            job.japanese = True

    # Benefit/feature tags shown on the page.
    tags = [tag.text for tag in soup.select('.sp4')]
    job.job_tags = ','.join(tags)
    job.get_tags(tags)

    h2_span = soup.select_one('h2 span')
    description_div = (h2_span.parent.find_next('div')
                       if h2_span is not None else None)
    if description_div is None:
        return None
    job.job_description = description_div.text.strip()
    # Lower-case the title together with the description so the keyword
    # checks below also match mixed-case titles.
    job_description_lower = (job.title + " " + job.job_description).lower()

    job.get_programming_languages(job_description_lower) \
        .get_databases(job_description_lower) \
        .get_big_data_stats(job_description_lower)

    # English / Japanese requirements mentioned in the description.
    if '英语' in job_description_lower or '英文' in job_description_lower:
        job.english = True
    # A posting written in English is taken to require English.
    if is_article_english(job_description_lower):
        job.english = True
    if '日语' in job_description_lower or '日文' in job_description_lower:
        job.japanese = True

    # Mobile developers are not a separate career; they are flagged through
    # the phone_* attributes. The keyword is 'ios' (the original 'iso' was a
    # typo); the attribute name phone_iso is kept because Job defines it.
    if 'ios' in job_description_lower or 'iphone' in job_description_lower:
        job.phone_iso = True
        job.phone_app = True
    if 'android' in job_description_lower:
        job.phone_android = True
        job.phone_app = True

    company_title_tag = soup.select_one('.com_name')
    if not company_title_tag:
        company_title_tag = soup.select_one('.catn')
    if company_title_tag is None:
        return None
    company_title = company_title_tag.text.strip()
    # Blacklisted companies are dropped.
    if company_title in company_blacklist:
        return None

    company_link = company_title_tag.attrs.get('href')
    if company_link is None:
        return None
    # The company id is the 'co<digits>' part of the company page URL.
    company_id_match = re.match(r'.*(co\d*)\.html', company_link)
    if company_id_match is None:
        return None
    job.company_link = company_link
    job.company_id = company_id_match.group(1)

    # 996 detection. Positive work-life phrases: 9-to-5, 9-to-6, two-day
    # weekends, no overtime.
    if '朝九晚五' in job.job_description \
        or '朝九晚六' in job.job_description \
        or '双休' in job.job_description \
        or '不加班' in job.job_description:
        job._996_no = True
    if '朝九晚九' in job.job_description:
        job._996_yes = True
    if job.tag_rest_two_days:
        job._996_no = True
    # A posting published on a weekend is counted as a 996 signal.
    if job.published_on_weekend:
        job._996_yes = True

    return job
# Example #2
# 0
def file2job(file, zhinengleibie, province):
    """Parse one saved job-posting HTML page into a ``Job``.

    Args:
        file: Path to the saved page; the file name minus ``.html`` becomes
            ``job.job_id``. The file is read as GBK text.
        zhinengleibie: Job-function category supplied by the caller; stored
            on the job.
        province: Province label supplied by the caller; '深圳' is recorded
            as '广东'.

    Returns:
        The populated ``Job``, or ``None`` when the posting is on the
        hard-coded skip list, cannot be decoded, is an '异地招聘' posting,
        lacks a required element, has no usable salary or company type, or
        is filtered by the title/company blacklists.
    """
    job = Job()
    job.zhinengleibie = zhinengleibie
    job.province = '广东' if province == '深圳' else province
    job.job_id = path.split(file)[-1].replace(".html", "")

    # Hard-coded postings that are always skipped.
    if job.job_id in ['110455749', '77612262', '107681687']:
        return None

    try:
        # The context manager closes the file; no explicit close() is needed.
        with open(file, mode='r', encoding='gbk') as f:
            content = f.read()
    except UnicodeDecodeError:
        print("UnicodeDecodeError")
        return None

    soup = BeautifulSoup(content, "html.parser")
    title_tag = soup.select_one('title')
    if not title_tag:
        return None
    job.page_title = title_tag.text
    if '异地招聘' in job.page_title:
        return None

    # The page must carry a job-function label; only its presence is checked.
    if not soup.find('span', {'class': 'label'}, text='职能类别:'):
        return None

    # Salary text examples: 20-40万/年, 1-1.5万/月, 10万以上/月, 100万以上/年,
    # 3-4.5千/月, 17元/小时
    salary_tag = soup.select_one('.cn strong')
    if not salary_tag:
        return None
    job.get_salary(salary_tag.text)
    # -1 marks a salary that should not be counted (the original comment
    # mentions temporary/hourly work).
    if job.monthly_salary == -1:
        return None

    h1_tag = soup.find("h1")
    if h1_tag is None:
        return None
    job.title = h1_tag.text.strip()

    if any(key in job.title for key in title_key_blacklist):
        return None

    # Summary line example: '深圳-福田区|5-7年经验|本科|招1人|04-01发布'
    msg_tag = soup.select_one(".msg")
    if msg_tag is None:
        return None
    job.job_summary = msg_tag.text.replace('\xa0', '').replace(' ',
                                                               '').strip()
    infos = job.job_summary.split('|')
    # The first field is the location ('city-district').
    job.city = infos[0].split('-')[0]

    for info in infos[1:]:
        if '经验' in info and not job.check_working_experience():
            job.get_working_experience(info)

        # Education: each field is offered to get_edu() until check_edu()
        # reports success.
        if not job.check_edu():
            job.get_edu(info)

        # Head count, e.g. '招1人'; '若干' ("several") is counted as 5.
        if '招' in info and '人' in info:
            headcount_string = info.replace('招', '').replace('人', '')
            if headcount_string == '若干':
                headcount_string = '5'
            job.headcount = int(headcount_string)

        if info.endswith('发布'):
            # The page only carries month-day; the year is hard-coded.
            date_string = "2020-" + info.replace("发布", '')
            job.publish_date = datetime.strptime(date_string, '%Y-%m-%d')
            # weekday() is 5/6 for Saturday/Sunday.
            job.published_on_weekend = job.publish_date.weekday() > 4

        # Language requirements mentioned in the summary line.
        if '英语' in info or '英文' in info:
            job.english = True
        if '日语' in info or '日文' in info:
            job.japanese = True

    # Benefit/feature tags shown on the page.
    tags = [tag.text for tag in soup.select('.sp4')]
    job.job_tags = ','.join(tags)
    job.get_tags(tags)

    h2_span = soup.select_one('h2 span')
    description_div = (h2_span.parent.find_next('div')
                       if h2_span is not None else None)
    if description_div is None:
        return None
    job.job_description = description_div.text.strip()
    # Lower-case the title together with the description so keyword
    # matching also covers mixed-case titles.
    job_description_lower = (job.title + " " + job.job_description).lower()

    job.get_programming_languages(job_description_lower) \
        .get_databases(job_description_lower) \
        .get_big_data_stats(job_description_lower) \
        .get_machine_learning_stats(job_description_lower)

    # A posting written in English is taken to require English.
    if is_article_english(job_description_lower):
        job.english = True

    # Company description lives under <span class="bname">公司信息</span>.
    company_info_tag = soup.find('span', text='公司信息')
    if company_info_tag:
        job.company_description = company_info_tag.parent.find_next(
            'div').text.replace('\xa0', ' ').strip()

    company_title_tag = soup.select_one('.com_name')
    if not company_title_tag:
        company_title_tag = soup.select_one('.catn')
    if company_title_tag is None:
        return None
    job.company_title = company_title_tag.text.strip()
    # Link used for the get_company_tags() fallbacks below; may be absent.
    company_link = company_title_tag.attrs.get('href')

    # Example: ['民营公司', '150-500人', '服装/纺织/皮革']
    company_tags = [p.text.strip() for p in soup.select('.com_tag .at')]

    if company_tags:
        job.get_company_type(company_tags[0])
    if job.company_type == '' and company_link:
        # Fall back to the tags returned by get_company_tags() for the link.
        company_tags = get_company_tags(company_link)
        for tag in company_tags:
            if job.get_company_type(tag).check_company_type():
                break

    if job.company_type == '':
        return None

    # The second tag is expected to be the company size; it may be missing.
    if len(company_tags) > 1:
        job.get_company_size(company_tags[1])
    if not job.check_company_size() and company_link:
        for tag in get_company_tags(company_link):
            if job.get_company_size(tag).check_company_size():
                break

    # Industry example: 计算机/互联网/通信/电子
    industry_tags = [
        p.text.strip() for p in soup.select('.com_tag .at a')
        if not p.text == ''
    ]
    # Use the tag resolved above (it may be the '.catn' fallback) instead of
    # looking up '.com_name' again.
    if not industry_tags and company_link:
        industry_tags = get_company_tags(company_link)

    for industry_tag in industry_tags:
        job.get_industry(industry_tag)

    # Manual industry overrides for specific company names.
    if job.company_title in ['系统集成有限责任公司', '博彦科技股份有限公司']:
        job.industry = 'computer'
    if job.company_title == '软件与服务中心':
        job.industry = 'trade'
    if job.company_title == '中核集团技术经济总院':
        job.industry = 'energy'

    # Blacklisted companies are dropped.
    if job.company_title in company_blacklist:
        return None

    return job
def file2job(file, city):
    """Parse one saved job-posting HTML page into a developer ``Job``.

    The source file is renamed (``51jobs`` -> ``51jobs_back`` in its path)
    right after it is read, before any filtering takes place.

    Args:
        file: Path to the saved page; the file name minus ``.html`` becomes
            ``job.job_id``. The file is read as GBK text.
        city: City key; the attribute ``city_<city>`` is set to True on the
            job.

    Returns:
        The populated ``Job``, or ``None`` when a required element is
        missing, the posting is not a developer role, or it has no usable
        salary or company type.
    """
    job = Job()
    setattr(job, "city_" + city, True)
    job.job_id = path.split(file)[-1].replace(".html", "")

    # The context manager closes the file; no explicit close() is needed.
    with open(file, mode='r', encoding='gbk') as f:
        content = f.read()
    os.rename(file, file.replace("51jobs", "51jobs_back"))
    soup = BeautifulSoup(content, "html.parser")

    # Career detection: postings without a developer job function are
    # skipped.
    zhineng_tag = soup.find('span', {'class': 'label'}, text='职能类别:')
    if zhineng_tag is None:
        return None
    careers = [
        a_tag.text.strip() for a_tag in zhineng_tag.parent.find_all('a')
    ]
    for career in careers:
        if career in [
                '软件工程师', '高级软件工程师', 'ERP技术开发', '互联网软件开发工程师', '多媒体/游戏开发工程师',
                '手机应用开发工程师', 'WEB前端工程师', '脚本开发工程师', '语音/视频/图形开发工程师'
        ]:
            job.career_software_engineer = True
        if career == '算法工程师':
            job.career_algorithm = True
        if career in ['系统架构设计师', '网站架构设计师']:
            job.career_architect = True

    if not (job.career_software_engineer or job.career_algorithm
            or job.career_architect):
        return None

    # Salary text examples: 20-40万/年, 1-1.5万/月, 10万以上/月, 100万以上/年,
    # 3-4.5千/月, 17元/小时
    salary_tag = soup.select_one('.cn strong')
    if not salary_tag:
        return None
    job.get_salary(salary_tag.text)
    # -1 marks a salary that should not be counted (the original comment
    # mentions temporary/hourly work).
    if job.monthly_salary == -1:
        return None

    h1_tag = soup.find("h1")
    if h1_tag is None:
        return None
    job.title = h1_tag.text.strip()

    # Summary line example: '深圳-福田区|5-7年经验|本科|招1人|04-01发布'
    msg_tag = soup.select_one(".msg")
    if msg_tag is None:
        return None
    basic_info = msg_tag.text.replace('\xa0', '').replace(' ', '').strip()

    # Skip the first field (the location).
    for info in basic_info.split('|')[1:]:
        if '经验' in info and not job.check_working_experience():
            job.get_working_experience(info)

        # Education: each field is offered to get_edu() until check_edu()
        # reports success.
        if not job.check_edu():
            job.get_edu(info)

        if info.endswith('发布'):
            # The page only carries month-day; the year is hard-coded.
            date_string = "2019-" + info.replace("发布", '')
            job.publish_date = datetime.strptime(date_string, '%Y-%m-%d')
            # weekday() is 5/6 for Saturday/Sunday.
            job.published_on_weekend = job.publish_date.weekday() > 4

        # Language requirements mentioned in the summary line.
        if '英语' in info or '英文' in info:
            job.english = True
        if '日语' in info or '日文' in info:
            job.japanese = True

    # Benefit/feature tags shown on the page.
    tags = [tag.text for tag in soup.select('.sp4')]
    job.get_tags(tags)

    h2_span = soup.select_one('h2 span')
    description_div = (h2_span.parent.find_next('div')
                       if h2_span is not None else None)
    if description_div is None:
        return None
    job.job_description = description_div.text.strip()
    # Lower-case the title together with the description so the keyword
    # checks below also match mixed-case titles.
    job_description_lower = (job.title + " " + job.job_description).lower()

    # Age discrimination: the description mentions an age ('岁').
    job.ageism = '岁' in job.job_description

    # Second pass for architects: the keyword marks the architect career
    # (the original set career_algorithm here by mistake).
    if '架构师' in job_description_lower:
        job.career_architect = True
    # Second pass for algorithm engineers, based on ML-related keywords.
    algorithm_keywords = ('tensorflow', 'keras', 'caffe', 'pytorch', '机器学习',
                          'nlp', '自然语言处理', '算法工程师', 'sklearn', '深度学习',
                          '图像识别')
    if any(keyword in job_description_lower
           for keyword in algorithm_keywords):
        job.career_algorithm = True

    # Algorithm engineers and architects also count as software engineers.
    if job.career_algorithm or job.career_architect:
        job.career_software_engineer = True

    job.get_programming_languages(job_description_lower).get_databases(
        job_description_lower)

    # English / Japanese requirements mentioned in the description.
    if '英语' in job_description_lower or '英文' in job_description_lower:
        job.english = True
    # A posting written in English is taken to require English.
    if is_article_english(job_description_lower):
        job.english = True
    if '日语' in job_description_lower or '日文' in job_description_lower:
        job.japanese = True

    # Mobile developers are not a separate career; they are flagged through
    # the phone_* attributes.
    if '手机应用开发工程师' in careers:
        job.phone_app = True

    # The keyword is 'ios' (the original 'iso' was a typo); the attribute
    # name phone_iso is kept because Job defines it.
    if 'ios' in job_description_lower or 'iphone' in job_description_lower:
        job.phone_iso = True
        job.phone_app = True
    if 'android' in job_description_lower:
        job.phone_android = True
        job.phone_app = True

    # Company description lives under <span class="bname">公司信息</span>.
    company_info_tag = soup.find('span', text='公司信息')
    if company_info_tag is not None:
        job.company_description = company_info_tag.parent.find_next(
            'div').text.replace('\xa0', ' ').strip()

    company_title_tag = soup.select_one('.com_name')
    if company_title_tag is None:
        return None
    job.company_title = company_title_tag.text.strip()
    # Link used for the get_company_tags() fallbacks below; may be absent.
    company_link = company_title_tag.attrs.get('href')

    # Example: ['民营公司', '150-500人', '服装/纺织/皮革']
    company_tags = [p.text.strip() for p in soup.select('.com_tag .at')]

    if company_tags:
        job.get_company_type(company_tags[0])
    if not job.check_company_type() and company_link:
        # Fall back to the tags returned by get_company_tags() for the link.
        company_tags = get_company_tags(company_link)
        for tag in company_tags:
            if job.get_company_type(tag).check_company_type():
                break

    if not job.check_company_type():
        return None

    # The second tag is expected to be the company size; it may be missing.
    if len(company_tags) > 1:
        job.get_company_size(company_tags[1])
    if not job.check_company_size() and company_link:
        for tag in get_company_tags(company_link):
            if job.get_company_size(tag).check_company_size():
                break

    # Industry example: 计算机/互联网/通信/电子
    industry_tags = [
        p.text.strip() for p in soup.select('.com_tag .at a')
        if not p.text == ''
    ]
    if not industry_tags and company_link:
        industry_tags = get_company_tags(company_link)

    for industry_tag in industry_tags:
        job.get_industry(industry_tag)

    # Manual industry overrides for specific company names.
    if job.company_title in ['系统集成有限责任公司', '博彦科技股份有限公司']:
        job.industry_computer = True
    if job.company_title == '软件与服务中心':
        job.industry_trade = True
    if job.company_title == '中核集团技术经济总院':
        job.industry_energy = True

    # 996 detection. Positive work-life phrases: 9-to-5, 9-to-6, two-day
    # weekends, no overtime.
    if '朝九晚五' in job.job_description \
        or '朝九晚六' in job.job_description \
        or '双休' in job.job_description \
        or '不加班' in job.job_description:
        job.non_996 = True
    if '朝九晚九' in job.job_description:
        job.icu_996 = True
    if job.tag_rest_two_days:
        job.non_996 = True
    if in_996_list(job.company_title):
        job.icu_996 = True
    if in_non_996_list(job.company_title):
        job.non_996 = True
    # A posting published on a weekend is counted as a 996 signal.
    if job.published_on_weekend:
        job.icu_996 = True

    return job
# Example #4
# 0
def file2job(file, zhinengleibie, province):
    """Parse one saved job-posting HTML page into a developer ``Job``.

    Args:
        file: Path to the saved page; the file name minus ``.html`` becomes
            ``job.job_id``. The file is read as GBK text.
        zhinengleibie: Job-function category supplied by the caller; stored
            on the job.
        province: Province label supplied by the caller; stored as-is.

    Returns:
        The populated ``Job``, or ``None`` when the posting is on the
        hard-coded skip list, cannot be decoded, is an '异地招聘' posting,
        lacks a required element, matches no known career, has no usable
        salary or company type, or has a blacklisted title keyword.
    """
    job = Job()
    job.zhinengleibie = zhinengleibie
    job.province = province
    job.job_id = path.split(file)[-1].replace(".html", "")

    # Hard-coded postings that are always skipped.
    if job.job_id in ['110455749', '77612262', '107681687']:
        return None

    try:
        # The context manager closes the file; no explicit close() is needed.
        with open(file, mode='r', encoding='gbk') as f:
            content = f.read()
    except UnicodeDecodeError:
        print("UnicodeDecodeError")
        return None

    soup = BeautifulSoup(content, "html.parser")
    title_tag = soup.select_one('title')
    if title_tag is None:
        return None
    job.page_title = title_tag.text
    if '异地招聘' in job.page_title:
        return None

    # Career detection from the job-function links on the page.
    zhineng_tag = soup.find('span', {'class': 'label'}, text='职能类别:')
    if not zhineng_tag:
        return None
    careers = [
        a_tag.text.strip() for a_tag in zhineng_tag.parent.find_all('a')
    ]
    for career in careers:
        if career in [
                '软件工程师', '高级软件工程师', 'ERP技术开发', '互联网软件开发工程师', '多媒体/游戏开发工程师',
                '手机应用开发工程师', 'WEB前端工程师', '脚本开发工程师', '语音/视频/图形开发工程师'
        ]:
            job.career = '一般程序员'
        if career == '算法工程师':
            job.career = '算法工程师'
        if career in ['系统架构设计师', '网站架构设计师']:
            job.career = '系统架构师'

    # Salary text examples: 20-40万/年, 1-1.5万/月, 10万以上/月, 100万以上/年,
    # 3-4.5千/月, 17元/小时
    salary_tag = soup.select_one('.cn strong')
    if not salary_tag:
        return None
    job.get_salary(salary_tag.text)
    # -1 marks a salary that should not be counted (the original comment
    # mentions temporary/hourly work).
    if job.monthly_salary == -1:
        return None

    h1_tag = soup.find("h1")
    if h1_tag is None:
        return None
    job.title = h1_tag.text.strip()

    # Title-based career overrides. These must run after the title has been
    # parsed; the original tested job.title before it was assigned, so they
    # never matched and the empty-career rejection fired too early.
    if '爬虫' in job.title:
        job.career = '爬虫工程师'

    if '生物信息' in job.title:
        job.career = '生物信息工程师'

    if job.career == '':
        return None

    if any(key in job.title for key in [
            '安全工程师', 'seo', '测试', '前端', '信息工程师', '运维', '经理', '嵌入式', '讲师', '老师',
            '负责人', '合伙人', '计算机技术员', '主任', '总监', 'cto', '需求工程师', '需求分析',
            '系统集成工程师', '系统工程师', '系统分析师', '计算机辅助设计', 'DBA', '实施', '售前', '售后',
            '数据库'
    ]):
        return None

    job_title_lower = job.title.lower()
    # NOTE(review): these flags are set to False on a keyword match, which
    # looks inverted (or deliberately disabled) — confirm the intent before
    # relying on expert_* values.
    if '专家' in job_title_lower:
        job.expert_expert = False
    if 'blockchain' in job_title_lower or '区块链' in job_title_lower:
        job.expert_blockchain = False
    if 'adas' in job_title_lower:
        job.expert_adas = False
    if '嵌入式' in job_title_lower:
        job.expert_embed = False
    if 'gis' in job_title_lower:
        job.expert_gis = False

    if '架构师' in job_title_lower:
        job.career = '系统架构师'
    if '算法工程师' in job_title_lower:
        job.career = '算法工程师'

    # Summary line example: '深圳-福田区|5-7年经验|本科|招1人|04-01发布'
    msg_tag = soup.select_one(".msg")
    if msg_tag is None:
        return None
    job.job_summary = msg_tag.text.replace('\xa0', '').replace(' ',
                                                               '').strip()
    infos = job.job_summary.split('|')
    # The first field is the location ('city-district').
    job.city = infos[0].split('-')[0]

    for info in infos[1:]:
        if '经验' in info and not job.check_working_experience():
            job.get_working_experience(info)

        # Education: each field is offered to get_edu() until check_edu()
        # reports success.
        if not job.check_edu():
            job.get_edu(info)

        # Head count, e.g. '招1人'; '若干' ("several") is counted as 5.
        if '招' in info and '人' in info:
            headcount_string = info.replace('招', '').replace('人', '')
            if headcount_string == '若干':
                headcount_string = '5'
            job.headcount = int(headcount_string)

        if info.endswith('发布'):
            # The page only carries month-day; the year is hard-coded.
            date_string = "2019-" + info.replace("发布", '')
            job.publish_date = datetime.strptime(date_string, '%Y-%m-%d')
            # weekday() is 5/6 for Saturday/Sunday.
            job.published_on_weekend = job.publish_date.weekday() > 4

        # Language requirements mentioned in the summary line.
        if '英语' in info or '英文' in info:
            job.english = True
        if '日语' in info or '日文' in info:
            job.japanese = True

    # Benefit/feature tags shown on the page.
    tags = [tag.text for tag in soup.select('.sp4')]
    job.job_tags = ','.join(tags)
    job.get_tags(tags)

    h2_span = soup.select_one('h2 span')
    description_div = (h2_span.parent.find_next('div')
                       if h2_span is not None else None)
    if description_div is None:
        return None
    job.job_description = description_div.text.strip()
    # Lower-case the title together with the description so the keyword
    # checks below also match mixed-case titles.
    job_description_lower = (job.title + " " + job.job_description).lower()

    # Age discrimination: the description mentions an age ('岁').
    job.ageism = '岁' in job.job_description

    # Second pass for algorithm engineers, based on ML-related keywords.
    algorithm_keywords = ('tensorflow', 'keras', 'caffe', 'pytorch', '机器学习',
                          'nlp', '自然语言处理', '算法工程师', 'sklearn', '深度学习',
                          '图像识别')
    if any(keyword in job_description_lower
           for keyword in algorithm_keywords):
        job.career = '算法工程师'

    job.get_programming_languages(job_description_lower).get_databases(
        job_description_lower)

    # English / Japanese requirements mentioned in the description.
    if '英语' in job_description_lower or '英文' in job_description_lower:
        job.english = True
    # A posting written in English is taken to require English.
    if is_article_english(job_description_lower):
        job.english = True
    if '日语' in job_description_lower or '日文' in job_description_lower:
        job.japanese = True

    # Mobile developers are not a separate career; they are flagged through
    # the phone_* attributes.
    if '手机应用开发工程师' in careers:
        job.phone_app = True

    # The keyword is 'ios' (the original 'iso' was a typo); the attribute
    # name phone_iso is kept because Job defines it.
    if 'ios' in job_description_lower or 'iphone' in job_description_lower:
        job.phone_iso = True
        job.phone_app = True
    if 'android' in job_description_lower:
        job.phone_android = True
        job.phone_app = True

    # Company description lives under <span class="bname">公司信息</span>.
    company_info_tag = soup.find('span', text='公司信息')
    if company_info_tag is not None:
        job.company_description = company_info_tag.parent.find_next(
            'div').text.replace('\xa0', ' ').strip()

    company_title_tag = soup.select_one('.com_name')
    if company_title_tag is None:
        return None
    job.company_title = company_title_tag.text.strip()
    # Link used for the get_company_tags() fallbacks below; may be absent.
    company_link = company_title_tag.attrs.get('href')

    # Example: ['民营公司', '150-500人', '服装/纺织/皮革']
    company_tags = [p.text.strip() for p in soup.select('.com_tag .at')]

    if company_tags:
        job.get_company_type(company_tags[0])
    if job.company_type == '' and company_link:
        # Fall back to the tags returned by get_company_tags() for the link.
        company_tags = get_company_tags(company_link)
        for tag in company_tags:
            if job.get_company_type(tag).check_company_type():
                break

    if job.company_type == '':
        return None

    # The second tag is expected to be the company size; it may be missing.
    if len(company_tags) > 1:
        job.get_company_size(company_tags[1])
    if not job.check_company_size() and company_link:
        for tag in get_company_tags(company_link):
            if job.get_company_size(tag).check_company_size():
                break

    # Industry example: 计算机/互联网/通信/电子
    industry_tags = [
        p.text.strip() for p in soup.select('.com_tag .at a')
        if not p.text == ''
    ]
    if not industry_tags and company_link:
        industry_tags = get_company_tags(company_link)

    for industry_tag in industry_tags:
        job.get_industry(industry_tag)

    # Manual industry overrides for specific company names.
    if job.company_title in ['系统集成有限责任公司', '博彦科技股份有限公司']:
        job.industry = 'computer'
    if job.company_title == '软件与服务中心':
        job.industry = 'trade'
    if job.company_title == '中核集团技术经济总院':
        job.industry = 'energy'

    # 996 detection. Positive work-life phrases: 9-to-5, 9-to-6, two-day
    # weekends, no overtime.
    if '朝九晚五' in job.job_description \
        or '朝九晚六' in job.job_description \
        or '双休' in job.job_description \
        or '不加班' in job.job_description:
        job._996_no = True
    if '朝九晚九' in job.job_description:
        job._996_yes = True
    if job.tag_rest_two_days:
        job._996_no = True
    if in_996_list(job.company_title):
        job._996_yes = True
    if in__996_no_list(job.company_title):
        job._996_no = True
    # A posting published on a weekend is counted as a 996 signal.
    if job.published_on_weekend:
        job._996_yes = True

    # Manual override: clear both 996 flags for this company.
    if job.company_title in ['青岛云指针软件有限公司']:
        job._996_no = False
        job._996_yes = False

    return job
def file2job(file, zhinengleibie, province):
    """Parse one saved job-posting HTML page into a developer ``Job``.

    Args:
        file: Path to the saved page; the file name minus ``.html`` becomes
            ``job.job_id``. The file is read as GBK text.
        zhinengleibie: Job-function category supplied by the caller; stored
            on the job and consulted for the machine-learning career.
        province: Province label supplied by the caller; '深圳' is recorded
            as '广东'.

    Returns:
        The populated ``Job``, or ``None`` when the posting is on the
        hard-coded skip list, cannot be decoded, is an '异地招聘' posting,
        lacks a required element, matches no known career, has no usable
        salary or company type, or is filtered by the title/company
        blacklists.
    """
    job = Job()
    job.zhinengleibie = zhinengleibie
    job.province = '广东' if province == '深圳' else province
    job.job_id = path.split(file)[-1].replace(".html", "")

    # Hard-coded postings that are always skipped.
    if job.job_id in ['110455749', '77612262', '107681687']:
        return None

    try:
        # The context manager closes the file; no explicit close() is needed.
        with open(file, mode='r', encoding='gbk') as f:
            content = f.read()
    except UnicodeDecodeError:
        print("UnicodeDecodeError")
        return None

    soup = BeautifulSoup(content, "html.parser")
    title_tag = soup.select_one('title')
    if not title_tag:
        return None
    job.page_title = title_tag.text
    if '异地招聘' in job.page_title:
        return None

    # Career detection from the job-function links on the page.
    zhineng_tag = soup.find('span', {'class': 'label'}, text='职能类别:')
    if not zhineng_tag:
        return None
    careers = [
        a_tag.text.strip() for a_tag in zhineng_tag.parent.find_all('a')
    ]
    for career in careers:
        if career in [
                '软件工程师', '高级软件工程师', 'ERP技术开发', '互联网软件开发工程师', '多媒体/游戏开发工程师',
                '手机应用开发工程师', 'WEB前端工程师', '脚本开发工程师', '语音/视频/图形开发工程师'
        ]:
            job.career = '一般程序员'
        if career in [
                '算法工程师', '机器学习工程师', '深度学习工程师', '图像算法工程师', '图像处理工程师',
                '语音识别工程师', '图像识别工程师', '机器视觉工程师', '自然语言处理(NLP)'
        ]:
            job.career = '算法工程师'
        if career in ['系统架构设计师', '网站架构设计师']:
            job.career = '系统架构师'

    # Salary text examples: 20-40万/年, 1-1.5万/月, 10万以上/月, 100万以上/年,
    # 3-4.5千/月, 17元/小时
    salary_tag = soup.select_one('.cn strong')
    if not salary_tag:
        return None
    job.get_salary(salary_tag.text)
    # -1 marks a salary that should not be counted (the original comment
    # mentions temporary/hourly work).
    if job.monthly_salary == -1:
        return None

    h1_tag = soup.find("h1")
    if h1_tag is None:
        return None
    job.title = h1_tag.text.strip()

    if '爬虫' in job.title:
        job.career = '爬虫工程师'

    if '生物信息' in job.title:
        job.career = '生物信息工程师'

    # No career recognised from the job functions or the title.
    if job.career == '':
        return None

    if any(key in job.title for key in title_key_blacklist):
        return None

    job_title_lower = job.title.lower()
    if '专家' in job_title_lower:
        job.expert_expert = True
    if 'blockchain' in job_title_lower or '区块链' in job_title_lower:
        job.expert_blockchain = True
    if 'adas' in job_title_lower:
        job.expert_adas = True
    if '嵌入式' in job_title_lower:
        job.expert_embed = True
    if 'gis' in job_title_lower:
        job.expert_gis = True

    # Title-based career overrides; later matches win.
    if '架构师' in job_title_lower:
        job.career = '系统架构师'
    if '算法工程师' in job_title_lower:
        job.career = '算法工程师'
    if '数据分析' in job_title_lower:
        job.career = '数据分析'
    if '图像算法' in job_title_lower:
        job.career = '图像算法'

    # Summary line example: '深圳-福田区|5-7年经验|本科|招1人|04-01发布'
    msg_tag = soup.select_one(".msg")
    if msg_tag is None:
        return None
    job.job_summary = msg_tag.text.replace('\xa0', '').replace(' ',
                                                               '').strip()
    infos = job.job_summary.split('|')
    # The first field is the location ('city-district').
    job.city = infos[0].split('-')[0]

    for info in infos[1:]:
        if '经验' in info and not job.check_working_experience():
            job.get_working_experience(info)

        # Education: each field is offered to get_edu() until check_edu()
        # reports success.
        if not job.check_edu():
            job.get_edu(info)

        # Head count, e.g. '招1人'; '若干' ("several") is counted as 5.
        if '招' in info and '人' in info:
            headcount_string = info.replace('招', '').replace('人', '')
            if headcount_string == '若干':
                headcount_string = '5'
            job.headcount = int(headcount_string)

        if info.endswith('发布'):
            # The page only carries month-day; the year is hard-coded.
            date_string = "2020-" + info.replace("发布", '')
            job.publish_date = datetime.strptime(date_string, '%Y-%m-%d')
            # weekday() is 5/6 for Saturday/Sunday.
            job.published_on_weekend = job.publish_date.weekday() > 4

        # Language requirements mentioned in the summary line.
        if '英语' in info or '英文' in info:
            job.english = True
        if '日语' in info or '日文' in info:
            job.japanese = True

    # Benefit/feature tags shown on the page.
    tags = [tag.text for tag in soup.select('.sp4')]
    job.job_tags = ','.join(tags)
    job.get_tags(tags)

    h2_span = soup.select_one('h2 span')
    description_div = (h2_span.parent.find_next('div')
                       if h2_span is not None else None)
    if description_div is None:
        return None
    job.job_description = description_div.text.strip()
    # Lower-case the title together with the description so the keyword
    # checks below also match mixed-case titles.
    job_description_lower = (job.title + " " + job.job_description).lower()

    # Age discrimination: the description mentions an age ('岁').
    job.ageism = '岁' in job.job_description

    # Machine-learning career: case-sensitive title keywords, two
    # case-insensitive ones, or the caller-supplied category.
    ml_title_keywords = ('机器学习', '深度学习', '推荐系统', '推荐算法', '图像识别', '人工智能',
                         '自然语言', 'aml', 'AI', '数据科学家', '知识图谱')
    if any(keyword in job.title for keyword in ml_title_keywords) \
        or 'nlp' in job_title_lower \
        or 'data scientist' in job_title_lower \
        or job.zhinengleibie in ('机器学习工程师', '深度学习工程师'):
        job.career = '机器学习'

    if 'dsp' in job_description_lower:
        job.career = 'DSP'

    if 'slam' in job_description_lower:
        job.career = 'SLAM'

    # Further case-sensitive title overrides, applied in order so that the
    # last matching entry wins (same precedence as the original if-chain).
    title_career_overrides = (
        (('CT重建',), 'CT重建'),
        (('大数据',), '大数据'),
        (('FPGA',), 'FPGA'),
        (('信号处理',), '信号处理'),
        (('架构师',), '系统架构师'),
        (('视觉',), '视觉软件工程师'),
        (('三维重建',), '视觉软件工程师'),
        (('规划算法',), '规划算法工程师'),
        (('遥感',), '遥感'),
        (('光学算法',), '光学算法工程师'),
        (('机器人', 'ROS'), '机器人'),
        (('爬虫',), '爬虫工程师'),
        (('ADAS',), 'adas'),
        (('GIS',), 'gis'),
        (('CAE',), 'cae'),
        (('ETL',), 'etl'),
    )
    for keywords, career_name in title_career_overrides:
        if any(keyword in job.title for keyword in keywords):
            job.career = career_name

    if 'unity3d' in job_title_lower or 'u3d' in job_title_lower:
        job.career = 'Unity3d'

    job.get_programming_languages(job_description_lower) \
        .get_databases(job_description_lower) \
        .get_big_data_stats(job_description_lower) \
        .get_machine_learning_stats(job_description_lower)

    # English / Japanese requirements mentioned in the description.
    if '英语' in job_description_lower or '英文' in job_description_lower:
        job.english = True
    # A posting written in English is taken to require English.
    if is_article_english(job_description_lower):
        job.english = True
    if '日语' in job_description_lower or '日文' in job_description_lower:
        job.japanese = True

    # Mobile developers are not a separate career; they are flagged through
    # the phone_* attributes.
    if '手机应用开发工程师' in careers:
        job.phone_app = True

    # The keyword is 'ios' (the original 'iso' was a typo); the attribute
    # name phone_iso is kept because Job defines it.
    if 'ios' in job_description_lower or 'iphone' in job_description_lower:
        job.phone_iso = True
        job.phone_app = True
    if 'android' in job_description_lower:
        job.phone_android = True
        job.phone_app = True

    # Company description lives under <span class="bname">公司信息</span>.
    company_info_tag = soup.find('span', text='公司信息')
    if company_info_tag:
        job.company_description = company_info_tag.parent.find_next(
            'div').text.replace('\xa0', ' ').strip()

    company_title_tag = soup.select_one('.com_name')
    if not company_title_tag:
        company_title_tag = soup.select_one('.catn')
    if company_title_tag is None:
        return None
    job.company_title = company_title_tag.text.strip()
    # Link used for the get_company_tags() fallbacks below; may be absent.
    company_link = company_title_tag.attrs.get('href')

    # Example: ['民营公司', '150-500人', '服装/纺织/皮革']
    company_tags = [p.text.strip() for p in soup.select('.com_tag .at')]

    if company_tags:
        job.get_company_type(company_tags[0])
    if job.company_type == '' and company_link:
        # Fall back to the tags returned by get_company_tags() for the link.
        company_tags = get_company_tags(company_link)
        for tag in company_tags:
            if job.get_company_type(tag).check_company_type():
                break

    if job.company_type == '':
        return None

    # The second tag is expected to be the company size; it may be missing.
    if len(company_tags) > 1:
        job.get_company_size(company_tags[1])
    if not job.check_company_size() and company_link:
        for tag in get_company_tags(company_link):
            if job.get_company_size(tag).check_company_size():
                break

    # Industry example: 计算机/互联网/通信/电子
    industry_tags = [
        p.text.strip() for p in soup.select('.com_tag .at a')
        if not p.text == ''
    ]
    # Use the tag resolved above (it may be the '.catn' fallback) instead of
    # looking up '.com_name' again.
    if not industry_tags and company_link:
        industry_tags = get_company_tags(company_link)

    for industry_tag in industry_tags:
        job.get_industry(industry_tag)

    # Manual industry overrides for specific company names.
    if job.company_title in ['系统集成有限责任公司', '博彦科技股份有限公司']:
        job.industry = 'computer'
    if job.company_title == '软件与服务中心':
        job.industry = 'trade'
    if job.company_title == '中核集团技术经济总院':
        job.industry = 'energy'

    # Blacklisted companies are dropped.
    if job.company_title in company_blacklist:
        return None

    # 996 detection. Positive work-life phrases: 9-to-5, 9-to-6, two-day
    # weekends, no overtime.
    if '朝九晚五' in job.job_description \
        or '朝九晚六' in job.job_description \
        or '双休' in job.job_description \
        or '不加班' in job.job_description:
        job._996_no = True
    if '朝九晚九' in job.job_description:
        job._996_yes = True
    if job.tag_rest_two_days:
        job._996_no = True
    # A posting published on a weekend is counted as a 996 signal.
    if job.published_on_weekend:
        job._996_yes = True

    # Manual override: clear both 996 flags for this company.
    if job.company_title in ['青岛云指针软件有限公司']:
        job._996_no = False
        job._996_yes = False

    return job