def lagouSalaryDetail(): sql = database.getLagouPositionInfo() # 获取数据库连接 conn = database.getDatabaseConn() df = pd.read_sql(sql, conn) tmp = df['salary'].apply(salarySplit).rename(columns={0:'salary_low', 1:'salary_high', 2:'salary_mean'}) df = df.combine_first(tmp) return df
def get51IndustryNum(): # 获取行业数据 sql = database.get51IndustrySql() # 获取数据库连接 conn = database.getDatabaseConn() data = pd.read_sql(sql, conn) data = data[(True ^ data.industry.isin(['1000-5000人', '5000-10000人', '500-1000人', '少于50人', '150-500人', '50-150人']))] data_sort = data.groupby('industry').size().sort_values(ascending=False)[:60] return data_sort.to_json(orient='index', force_ascii=False)
def programingPositionRank(sql, query_key): # 获取数据库连接 conn = database.getDatabaseConn() # 获取常用语言 programing_language_list = helper.getProgramingLanguage() # 获取数据 lg_rd_df = pd.read_sql(sql, conn) conn.close() # 查询数据 lg_df_programing_rank = getNumByKeyWords(lg_rd_df, query_key, programing_language_list) return lg_df_programing_rank
def job51CompanyPositionNum(): # 获取51job的sql sql_position = database.get51JobPositionSql() sql_company = database.get51JobCompanySql() # 获取数据库连接 conn = database.getDatabaseConn() j5_df = pd.read_sql(sql_position, conn) j5c_df = pd.read_sql(sql_company, conn) res_j5 = pd.merge(j5_df, j5c_df, on='company_md5', how='left') res_j5_g = res_j5.groupby('full_name').size().sort_values(ascending=False) return res_j5_g[0:60].to_json(orient='index', force_ascii=False)
def zhilianCompanyPositionNum(): # 获取51job的sql sql_position = database.getZhilianPositionSql() sql_company = database.getZhilianCompanySql() # 获取数据库连接 conn = database.getDatabaseConn() zp_df = pd.read_sql(sql_position, conn) zc_df = pd.read_sql(sql_company, conn) res_zp = pd.merge(zp_df, zc_df, on='company_md5', how='left') res_zp_g = res_zp.groupby('full_name').size().sort_values(ascending=False) return res_zp_g[0:60].to_json(orient='index', force_ascii=False)
def lagouPositionRank(): # 获取拉钩的sql sql = database.getLagouSecondtype() # 获取数据库连接 conn = database.getDatabaseConn() position_res_df = pd.read_sql(sql, conn) res_choose = getPositionRank(position_res_df, 'second_type') return res_choose.sort_values(ascending=False)[0:60].to_json(orient='index', force_ascii=False)
def zhilianPositionRank(): # 获取51job的sql sql = database.getZhilianPositionType() sql_Other = database.getZhilianPositionName() # 获取数据库连接 conn = database.getDatabaseConn() position_res_df = pd.concat([pd.read_sql(sql, conn), pd.read_sql(sql_Other, conn)]) res_choose = getPositionRank(position_res_df, 'position_type') return res_choose.sort_values(ascending=False)[0:60].to_json(orient='index', force_ascii=False)
def getZLSalaryByPosition(type = 1, sort_type = 1): sql = database.getZLSalaryByPositionSql() conn = database.getDatabaseConn() df = pd.read_sql(sql, conn) if type == 1: condition = ['position_type'] elif type == 2: condition = ['position_type', 'work_year', 'city'] sort_status = True if sort_type == 1 else False df_res = df.groupby(condition).describe().salary_high.sort_values('mean', ascending=sort_status)[:100] res = pd.concat([df_res['std'].apply(lambda x: int(x)), df_res['mean'].apply(lambda x: int(x)), df_res['50%'].apply(lambda x: int(x))], axis=1).rename(columns={'std':'标准差','mean':'平均值','50%':'中位数'}) res.to_excel('output/PositionSalaryByWE.xlsx') return res.to_json(orient='index', force_ascii=False)
def job51SalaryPosition(type=1): import os # 获取数据库连接 if type == 1 & os.path.exists('./salaryWE.h5'): df_res = pd.read_hdf('./salaryWE.h5') else: conn = database.getDatabaseConn() sql = database.getJ5ZCSql() df_51job_salary = pd.read_sql(sql, conn) df_51job_salary = df_51job_salary[df_51job_salary.salary != 'NULL'] df_51job_salary_deal = df_51job_salary[True ^ df_51job_salary['salary'].str.contains('天|小时|\+')] df_tmp = df_51job_salary_deal.salary.apply(job51SalaryDeal) df_res = df_51job_salary_deal.combine_first(df_tmp.rename(columns={0: 'low', 1: 'high', 2: 'mean'})) df_res.to_hdf('./salaryWE.h5', 'salary_all') return df_res
def lagouCompanyHrRankDf(): # 获取数据库连接和sql conn = database.getDatabaseConn() sql_lagou_hr = database.getLagouHrInfo() sql_lagou_recruit_day = database.getLagourHrComId() sql_lagou_company = database.getLagouCompanyInfo() # 读取数据 lagou_hr_df = pd.read_sql(sql_lagou_hr, conn) lagou_recruit_day = pd.read_sql(sql_lagou_recruit_day, conn) lagou_company = pd.read_sql(sql_lagou_company, conn) # 连接三张表,首先获取公司hr数量 res_lrd_df = pd.merge(lagou_recruit_day, lagou_hr_df, on='publisher_id', how='left') res_com_df = pd.merge(lagou_company, res_lrd_df, on='company_id', how='left') res_com_df = res_com_df.set_index('company_id') res_g = res_com_df.groupby(['company_id']) res_hr_num_df = res_g.size().to_frame().rename(columns={0: 'hr_num'}) # 然后将数量信息和公司表连接 company_hr_num_df = pd.concat([lagou_company.set_index('company_id'), res_hr_num_df], axis=1) res = company_hr_num_df.sort_values('hr_num', ascending=False) return res
def job51CompanyHighNumType(type=1): # 获取51job的sql sql_position = database.get51JobPositionSql() sql_company = database.get51JobCompanySql() # 获取数据库连接 conn = database.getDatabaseConn() j5_df = pd.read_sql(sql_position, conn) j5c_df = pd.read_sql(sql_company, conn) res_j5 = pd.merge(j5_df, j5c_df, on='company_md5', how='left') res_j5_g = res_j5.groupby('full_name').size() res_j5_g = pd.DataFrame(list(zip(res_j5_g.index, res_j5_g.values))).rename( columns={0: 'full_name', 1: "total_recruit_num"}) res_total = pd.merge(j5c_df, res_j5_g, on='full_name', how='left').sort_values('total_recruit_num', ascending=False)[:100] if type == 1: query_column = 'size' elif type == 2: query_column = 'company_nature' elif type == 3: query_column = 'industry' res = res_total.groupby(query_column).size().sort_values(ascending=False) return res.to_json(orient='index', force_ascii=False)
def getLagouPositionWordCloud(query_name, type=1, db_type=1): if db_type == 1: if type == 1: sql = getLGPositionContent() elif type == 2: sql = getZLPositionContent() elif type == 3: sql = getJ5PositionContent() res_df = pd.read_sql(sql, getDatabaseConn()) res_df.to_hdf('/Users/monstar/Desktop/lg_wc.h5', 'wordCloud') else: res_df = pd.read_hdf('/Users/monstar/Desktop/lg_wc.h5') res_df = res_df[res_df['position_name'].str.contains( re.escape(query_name))] words = ' '.join(res_df.content) getWordCloud(content=words, type=1, status=1, pic_file_path=os.path.join('/Users/monstar/Desktop', query_name + '.jpg'))
def get51jobRecruitNumByEducation(): # 获取数据库连接 conn = database.getDatabaseConn() sql = database.getJ5ZCSql() df_51job_salary = pd.read_sql(sql, conn) return df_51job_salary.groupby('education').size().sort_values(ascending=False).to_json(orient='index', force_ascii=False)
def workYearNum(sql): # 获取数据库连接 conn = database.getDatabaseConn() df = pd.read_sql(sql, conn) df_g = df.groupby('work_year') return df_g.size().sort_values(ascending=False)[:7]
def getJ5PositionWordCloud(): res_df = pd.read_sql(getJ5Advantage(), getDatabaseConn()) words = ' '.join(res_df.advantage) getWordCloud(content=words, type=1, status=2)