def get_total_hirings_per_office():
    """Load every office from etablissements_backoffice with its total hirings.

    total_hirings is derived from the regression score as
    greatest(0, floor(score_regr)). Outside the development environment only
    offices with a strictly positive total are kept; in development all rows
    are returned so a small local dataset still yields data.

    Returns:
        pandas.DataFrame: one row per office with columns siret,
        raisonsociale, enseigne, email, tel, website, codenaf, codepostal,
        codecommune, trancheeffectif, total_hirings.
    """
    engine = import_util.create_sqlalchemy_engine()
    query = "select\
             siret,\
             raisonsociale,\
             enseigne,\
             email,\
             tel,\
             website,\
             codenaf,\
             codepostal,\
             codecommune,\
             trancheeffectif,\
             greatest(0, floor(score_regr)) as total_hirings\
             from\
             etablissements_backoffice"
    # To make it work in local dev, we have to take all the offices that have score > 0
    if get_current_env() != ENV_DEVELOPMENT:
        query += " where greatest(0, floor(score_regr)) > 0;"
    df_total_hirings = pd.read_sql_query(query, engine)
    engine.close()
    logger.info("Datas selected from etablissements_backoffice")
    # FIX: removed the stray debug `print(df_total_hirings)` that dumped the
    # whole dataframe to stdout; log the row count at debug level instead.
    logger.debug("%s rows selected from etablissements_backoffice", len(df_total_hirings))
    return df_total_hirings
def get_nb_entreprise_par_cycle_et_naf_ou_dep_isLBX(colonne, nom, is_lbb):
    """Count per importer cycle the companies flagged "bonne boite" (LBB) or
    "bonne alternance" (LBA), with the sum of their effective hirings.

    Args:
        colonne: "global" for one row per cycle, otherwise the column to group
            by in addition to the cycle (e.g. naf or departement).
        nom: alias for `colonne` in the result set (unused when "global").
        is_lbb: True for LBB figures, False for LBA figures.

    Returns:
        pandas.DataFrame with columns cycle, [nom,] nbTotalLBXHirings, nbTotalLBX.
    """
    if is_lbb:
        flag_column, column_prefix = "is_a_bonne_boite", "lbb"
    else:
        flag_column, column_prefix = "is_a_bonne_alternance", "lba"
    if colonne == "global":
        query = (
            f'SELECT importer_cycle_infos_id as cycle,'
            f'sum({column_prefix}_nb_effective_hirings) as nbTotalLBXHirings, '
            'count(*) as nbTotalLBX '
            'FROM perf_prediction_and_effective_hirings ppaeh '
            'INNER JOIN perf_importer_cycle_infos pici on pici.id = ppaeh.importer_cycle_infos_id '
            'WHERE pici.computed is true '
            'AND pici.on_google_sheets is false '
            f'AND {flag_column} is true '
            'GROUP BY importer_cycle_infos_id;'
        )
    else:  # colonne == Naf OR departement
        query = (
            f'SELECT importer_cycle_infos_id as cycle , {colonne} as {nom} ,'
            f'sum({column_prefix}_nb_effective_hirings) as nbTotalLBXHirings, '
            'count(*) as nbTotalLBX '
            'FROM perf_prediction_and_effective_hirings ppaeh '
            'INNER JOIN perf_importer_cycle_infos pici on pici.id = ppaeh.importer_cycle_infos_id '
            'WHERE pici.computed is true '
            'AND pici.on_google_sheets is false '
            f'AND {flag_column} is true '
            f'GROUP BY importer_cycle_infos_id , {colonne};'
        )
    engine = import_util.create_sqlalchemy_engine()
    df_nb_entreprise_isLBX = pd.read_sql_query(query, engine)
    engine.close()
    return df_nb_entreprise_isLBX
def needs_parse_json_activity_log(self, json_file_name):
    """Tell whether an activity-log JSON file still needs to be parsed and
    saved in database.

    Args:
        json_file_name: file name shaped like activity-lbb-2019.09.13.json.

    Returns:
        bool: True when no row exists for that day, or when the latest row
        recorded for that day is before 3am (i.e. the day looks incomplete).
    """
    # activity-lbb-2019.09.13.json -> "2019-09-13"
    log_date = json_file_name.replace('activity-lbb-', '').replace('.json', '').replace('.', '-')
    latest_entry_query = ('select dateheure '
                          'from logs_activity '
                          'where date(dateheure) = "{}" '
                          'ORDER BY dateheure desc '
                          'LIMIT 1').format(log_date)
    engine = import_util.create_sqlalchemy_engine()
    latest_row = engine.execute(latest_entry_query).fetchone()
    engine.close()
    if latest_row is None:
        return True
    # If the most recent hour stored for that day is before 3am, the logs for
    # the day are considered incomplete and the file must be (re)parsed.
    return latest_row[0].hour <= 3
def get_logs_activities_by_sirets(self, sirets):
    """Fetch all activity logs for the given sirets, oldest first.

    Args:
        sirets: non-empty list of siret identifiers.

    Returns:
        pandas.DataFrame of logs_activity rows, every column cast to str.
    """
    if len(sirets) > 1:
        # str(tuple(...)) renders string sirets quoted: ('123', '456')
        sub_query = "in %s" % str(tuple(sirets))
    else:
        # FIX: the single-siret branch used to interpolate the value without
        # quotes ("siret = 123..."), unlike the tuple branch above; quote it
        # so varchar sirets (possibly with leading zeros) compare correctly.
        sub_query = "= '%s'" % sirets[0]
    engine = import_util.create_sqlalchemy_engine()
    # NOTE(review): sirets are interpolated into the SQL text; fine for
    # trusted internal callers, but not safe for untrusted input.
    query = "select * from logs_activity where siret %s order by dateheure asc;" % sub_query
    df_activity = pd.read_sql_query(query, engine)
    engine.close()
    return df_activity.astype(str)
def insert_in_database(self, dataframe, table_name):
    """Append `dataframe` to the SQL table `table_name`, 10 000 rows at a time."""
    sql_engine = import_util.create_sqlalchemy_engine()
    dataframe.to_sql(
        name=table_name,
        con=sql_engine,
        if_exists='append',
        index=False,
        chunksize=10000,
    )
    sql_engine.close()
def get_cycle_infos():
    """Return importer cycles that are computed but not yet on Google Sheets.

    Returns:
        pandas.DataFrame with columns cycle (id) and dateExecution.
    """
    query = ('SELECT id as cycle, execution_date as dateExecution '
             'FROM perf_importer_cycle_infos '
             'WHERE on_google_sheets = 0 '
             'and computed is true;')
    engine = import_util.create_sqlalchemy_engine()
    df_predict = pd.read_sql_query(query, engine)
    engine.close()
    return df_predict
def get_df_evol_idpe_connect(self):
    """Evolution, month by month, of the number of distinct IDPEC (PE Connect
    users) who logged into LBB.

    Returns:
        pandas.DataFrame with columns count_distinct_idpe and date_month
        ("M-YYYY"), in chronological order.
    """
    query = ('SELECT count(DISTINCT idutilisateur_peconnect) as count_distinct_idpe, '
             'concat(MONTH(dateheure),"-",YEAR(dateheure)) as date_month '
             'FROM logs_idpe_connect '
             'GROUP BY MONTH(dateheure), YEAR(dateheure) '
             'ORDER BY YEAR(dateheure), MONTH(dateheure);')
    engine = import_util.create_sqlalchemy_engine()
    df_evol_idpe_connect = pd.read_sql_query(query, engine)
    engine.close()
    return df_evol_idpe_connect
def get_df_evol_dpae(self):
    """Evolution, month by month, of the number of DPAE linked to an LBB
    activity (rows of logs_activity_dpae_clean).

    Returns:
        pandas.DataFrame with columns count_dpae_lbb and date_month
        ("M-YYYY"), in chronological order.
    """
    query = ('SELECT count(idutilisateur_peconnect) as count_dpae_lbb, '
             'concat(MONTH(date_embauche),"-",YEAR(date_embauche)) as date_month '
             'FROM logs_activity_dpae_clean '
             'GROUP BY MONTH(date_embauche), YEAR(date_embauche) '
             'ORDER BY YEAR(date_embauche), MONTH(date_embauche);')
    engine = import_util.create_sqlalchemy_engine()
    df_evol_dpae = pd.read_sql_query(query, engine)
    engine.close()
    return df_evol_dpae
def save_activity_logs(self):
    """Overwrite TABLE_NAME with self.df_dpae_act, storing the (reset)
    dataframe index as the 'id' column."""
    self.df_dpae_act.reset_index(drop=True, inplace=True)
    sql_engine = import_util.create_sqlalchemy_engine()
    self.df_dpae_act.to_sql(
        name=TABLE_NAME,
        con=sql_engine,
        if_exists='replace',
        index=True,
        index_label='id',
        chunksize=10000,
    )
    sql_engine.close()
def check_existing_data_in_table(self):
    """Return True when TABLE_NAME already contains at least one row."""
    engine = import_util.create_sqlalchemy_engine()
    row_count = engine.execute(f"SELECT COUNT(*) FROM {TABLE_NAME}").fetchone()[0]
    engine.close()
    return row_count > 0
def insert_into_etablissements_predicted_and_effective_hirings(
        importer_cycle_infos_id, file_name):
    """Copy predicted hiring scores from etablissements_new into
    perf_prediction_and_effective_hirings, tagged with the importer cycle.

    Args:
        importer_cycle_infos_id: id of the importer cycle these scores belong to.
        file_name: source file name (used for logging only).
    """
    logger.info(
        f"\n Start : Insert data into perf_prediction_and_effective_hirings from file {file_name}"
    )
    query = 'SELECT siret, \
             raisonsociale, \
             enseigne, \
             codenaf, \
             codepostal, \
             codecommune, \
             departement, \
             score, \
             score_alternance \
             FROM etablissements_new;'
    # FIX: use a single engine for both the read and the write; the original
    # created and closed two engines back to back for no benefit.
    engine = import_util.create_sqlalchemy_engine()
    df = pd.read_sql_query(query, engine)
    df['importer_cycle_infos_id'] = importer_cycle_infos_id
    # The raw scores are kept under *_score; the actual predicted hiring
    # counts are derived from them later in the pipeline.
    df = df.rename(
        columns={
            "score": "lbb_nb_predicted_hirings_score",
            "score_alternance": "lba_nb_predicted_hirings_score"
        })
    df.reset_index(drop=True, inplace=True)
    df.to_sql(con=engine,
              name="perf_prediction_and_effective_hirings",
              if_exists='append',
              index=False,
              chunksize=1000)
    engine.close()
    logger.info("Insertion into perf_prediction_and_effective_hirings OK")
    logger.info(f"Insertion of {len(df.index)} rows ")
def get_last_recorded_hiring_date(self):
    """Return the most recent date_embauche (as "YYYY-MM-DD") stored in the
    clean activity/DPAE table, or "2018-08-31" — the date of the first LBB
    activity log ever — when the table is empty."""
    table_name_act_dpae = LogsActivityDPAEClean.__tablename__
    engine = import_util.create_sqlalchemy_engine()
    nb_rows = engine.execute(f"SELECT COUNT(*) FROM {table_name_act_dpae}").fetchone()[0]
    if nb_rows > 0:
        latest = engine.execute(
            f"select date_embauche from {table_name_act_dpae} order by date_embauche DESC LIMIT 1"
        ).fetchone()
        # date_embauche is stored as text "YYYY-MM-DD HH:MM:SS"; keep the date part.
        date_last_recorded_hiring = latest[0].split()[0]
    else:
        date_last_recorded_hiring = "2018-08-31"
    engine.close()
    logger.info(f"the most recent date found is {date_last_recorded_hiring}")
    return date_last_recorded_hiring
def get_activity_logs(self):
    """Load the raw activity logs as strings.

    In DEBUG mode only a random sample of 10 000 rows is loaded.

    Returns:
        pandas.DataFrame of logs_activity rows, every column cast to str.
    """
    query = "select * from logs_activity"
    if DEBUG:
        query += " ORDER BY RAND() LIMIT 10000"
    engine = import_util.create_sqlalchemy_engine()
    df_activity = pd.read_sql_query(query, engine)
    engine.close()
    # TODO : Define how long we should use logs
    # https://valodata.slack.com/archives/C0QR8RYL8/p1562319224015200
    # TODO : Uncomment these lines after the first initialization of the project
    # one_year_date = datetime.date.today() - datetime.timedelta(365)
    # df_activity['dateheure'] = pd.to_datetime(df_activity['dateheure'])
    # df_activity = df_activity[df_activity.dateheure > one_year_date]
    logger.info('Activities logs are loaded')
    return df_activity.astype(str)
def get_predict_par_cycle_et_naf_ou_dep(cycle, colonne, value=None, is_lbb=True):
    """Fetch predicted vs effective hirings for one importer cycle, sorted by
    predicted hirings (descending).

    Args:
        cycle: importer_cycle_infos_id to filter on.
        colonne: "global" for the whole cycle, otherwise a column name
            (e.g. codenaf or departement) to restrict to `value`.
        value: value of `colonne` to filter on (ignored when "global").
        is_lbb: True for LBB columns, False for LBA columns.

    Returns:
        pandas.DataFrame with columns predict, effective, codenaf, departement.
    """
    prefix = "lbb" if is_lbb else "lba"
    base_query = (
        f'SELECT {prefix}_nb_predicted_hirings as predict, '
        f'{prefix}_nb_effective_hirings as effective, codenaf, departement '
        'FROM perf_prediction_and_effective_hirings '
        f'WHERE importer_cycle_infos_id = {cycle}'
    )
    if colonne == "global":
        query = base_query + f' ORDER BY {prefix}_nb_predicted_hirings desc '
    else:
        query = base_query + (f' and {colonne} = "{value}" '
                              f'ORDER BY {prefix}_nb_predicted_hirings desc')
    engine = import_util.create_sqlalchemy_engine()
    df_predict = pd.read_sql_query(query, engine)
    engine.close()
    return df_predict
def get_sum_predict_par_cycle_et_naf_ou_dep(cycle, colonne, nom=None, value=None, is_lbb=True):
    """Sum the predicted hirings of one importer cycle, either globally or for
    a single value of a grouping column.

    Args:
        cycle: importer_cycle_infos_id to filter on.
        colonne: "global" for one row for the whole cycle, otherwise the
            column (e.g. naf or departement) restricted to `value`.
        nom: alias for `colonne` in the result set (unused when "global").
        value: value of `colonne` to filter on (unused when "global").
        is_lbb: True for LBB predictions, False for LBA predictions.

    Returns:
        pandas.DataFrame with columns cycle, [nom,] sommeTotal.
    """
    prefix = "lbb" if is_lbb else "lba"
    if colonne == "global":
        query = (
            f'SELECT importer_cycle_infos_id as cycle, sum({prefix}_nb_predicted_hirings) as sommeTotal '
            'FROM perf_prediction_and_effective_hirings ppaeh '
            f'WHERE importer_cycle_infos_id = {cycle} '
            'GROUP BY importer_cycle_infos_id;'
        )
    else:  # colonne == Naf OR departement
        query = (
            f'SELECT importer_cycle_infos_id as cycle , {colonne} as {nom},sum({prefix}_nb_predicted_hirings) as sommeTotal '
            'FROM perf_prediction_and_effective_hirings ppaeh '
            f'WHERE importer_cycle_infos_id = {cycle} and {colonne} = "{value}" '
            f'GROUP BY importer_cycle_infos_id , {colonne};'
        )
    engine = import_util.create_sqlalchemy_engine()
    df_sum_predict = pd.read_sql_query(query, engine)
    engine.close()
    return df_sum_predict
def get_old_activities_logs_saved(self):
    """Reload the activity/DPAE rows already stored in TABLE_NAME, with the
    siret column forced back to str.

    Returns:
        pandas.DataFrame with the columns listed below.
    """
    wanted_columns = [
        "idutilisateur_peconnect",
        "siret",
        "date_activite",
        "date_embauche",
        "type_contrat",
        "duree_activite_cdd_mois",
        "duree_activite_cdd_jours",
        "diff_activite_embauche_jrs",
        "dc_lblprioritede",
        "tranche_age",
        "dc_prive_public",
        "duree_prise_en_charge",
        "dn_tailleetablissement",
        "code_postal",
    ]
    query = "select {} from {}".format(", ".join(wanted_columns), TABLE_NAME)
    engine = import_util.create_sqlalchemy_engine()
    df_dpae_act_existing = pd.read_sql_query(query, engine)
    engine.close()
    df_dpae_act_existing['siret'] = df_dpae_act_existing.siret.astype(str)
    return df_dpae_act_existing
def get_df_evol_idpe_connect_sign(self, did_specific_activity=None):
    """Monthly count of distinct IDPEC users who performed significant
    activities about a company, optionally restricted to one activity kind.

    Args:
        did_specific_activity: activity name (e.g. with dashes) to filter on,
            or None for all activities. Dashes become underscores in the
            result column name.

    Returns:
        pandas.DataFrame with the count column and date_month ("M-YYYY").
    """
    engine = import_util.create_sqlalchemy_engine()
    if did_specific_activity is None:
        column_name = 'count_distinct_activity'
        where_clause = ''
    else:
        column_name = 'count_distinct_activity_{}'.format(did_specific_activity.replace('-', '_'))
        where_clause = f'WHERE nom="{did_specific_activity}"'
    query = (
        f'SELECT count(DISTINCT idutilisateur_peconnect) as {column_name}, '
        'concat(MONTH(dateheure),"-",YEAR(dateheure)) as date_month '
        'FROM logs_activity '
        + where_clause
        + ' GROUP BY MONTH(dateheure), YEAR(dateheure) '
          'ORDER BY YEAR(dateheure), MONTH(dateheure);'
    )
    df_evol_idpe_connect_sign = pd.read_sql_query(query, engine)
    engine.close()
    return df_evol_idpe_connect_sign
def compute_effective_and_predicted_hirings():
    """For each importer cycle not yet computed: count the effective hirings
    per siret over the cycle's prediction window, store them next to the
    predicted hirings in perf_prediction_and_effective_hirings, flag each
    company as "bonne boite" (LBB) / "bonne alternance" (LBA) according to
    per rome/naf thresholds, and persist per naf/rome counters in
    PerfDivisionPerRome. Marks each cycle as computed when done.
    """
    logger.info(f"\n Start : Computing effective hirings")
    importer_cycles_infos = PerfImporterCycleInfos.query.filter(
        PerfImporterCycleInfos.computed == False).all()
    importer_cycles_infos_to_compute = []
    for ici in importer_cycles_infos:
        # In dev/test environments, compute every pending cycle unconditionally.
        if os.environ["LBB_ENV"] in ["development", "test"]:
            importer_cycles_infos_to_compute.append(ici)
            continue
        # Otherwise only cycles whose prediction window has already ended.
        if ici.prediction_end_date < datetime.now():
            importer_cycles_infos_to_compute.append(ici)
    logger.info(
        f"Importer cycles infos which have not been computed yet : {[i.file_name for i in importer_cycles_infos_to_compute]}"
    )
    for ici in importer_cycles_infos_to_compute:
        # Assumed layout (inferred from accesses below — confirm in
        # load_perf_division_per_rome_dict):
        # {naf: {rome: {"threshold_lbb", "threshold_lba",
        #               "nb_bonne_boites_lbb", "nb_bonne_boites_lba"}}}
        perf_division_per_rome_dict = load_perf_division_per_rome_dict()
        naf_not_founds = set()
        nb_companies_with_naf_not_found = 0
        logger.info(
            f"Start computing for importer cycle infos : {ici._id} - {ici.file_name}"
        )
        engine = import_util.create_sqlalchemy_engine()
        ppaeh = PerfPredictionAndEffectiveHirings.query.filter(
            PerfPredictionAndEffectiveHirings.importer_cycle_infos_id == ici._id)
        columns_companies = [
            "_id", "siret", "naf", "lbb_nb_predicted_hirings_score",
            "lba_nb_predicted_hirings_score"
        ]
        dict_df_companies = {}
        # _id -> ORM row, kept so the is_a_bonne_* flags can be set on the
        # mapped objects during the update loop below.
        dict_ppaeh = {}
        for col in columns_companies:
            dict_df_companies[col] = []
        for perf in ppaeh:
            dict_ppaeh[perf._id] = perf
            for col in columns_companies:
                dict_df_companies[col].append(getattr(perf, col))
        del ppaeh
        df_companies_list = pd.DataFrame(data=dict_df_companies)
        logger.info(f"Nb offices to compute : {len(df_companies_list)}")
        # Effective LBB hirings = CDD/CDI contracts within the prediction window.
        query_hirings_lbb = f"SELECT siret, count(*) as lbb_nb_effective_hirings \
            FROM hirings\
            WHERE hiring_date >= '{ici.prediction_start_date}'\
            and hiring_date <= '{ici.prediction_end_date}'\
            and (contract_type={Hiring.CONTRACT_TYPE_CDD} or contract_type={Hiring.CONTRACT_TYPE_CDI})\
            GROUP BY siret;"
        df_hirings_lbb = pd.read_sql_query(query_hirings_lbb, engine)
        logger.info(
            f"Nb offices found in hirings for lbb : {len(df_hirings_lbb)}")
        # Effective LBA hirings = APR/CP contracts within the same window.
        query_hirings_lba = f"SELECT siret, count(*) as lba_nb_effective_hirings \
            FROM hirings\
            WHERE hiring_date >= '{ici.prediction_start_date}'\
            and hiring_date <= '{ici.prediction_end_date}'\
            and (contract_type={Hiring.CONTRACT_TYPE_APR} or contract_type={Hiring.CONTRACT_TYPE_CP})\
            GROUP BY siret;"
        df_hirings_lba = pd.read_sql_query(query_hirings_lba, engine)
        logger.info(
            f"Nb offices found in hirings for lba: {len(df_hirings_lba)}")
        engine.close()
        # Left joins: offices with no hirings keep NaN, zeroed by fillna below.
        df_merge_hirings_tmp = pd.merge(df_companies_list,
                                        df_hirings_lbb,
                                        how='left',
                                        on="siret")
        df_merged = pd.merge(df_merge_hirings_tmp,
                             df_hirings_lba,
                             how='left',
                             on="siret")
        # Compute the predicted hirings from the score
        df_merged["lbb_nb_predicted_hirings"] = df_merged[
            "lbb_nb_predicted_hirings_score"].apply(
                lambda x: scoring_util.get_hirings_from_score(x))
        df_merged["lba_nb_predicted_hirings"] = df_merged[
            "lba_nb_predicted_hirings_score"].apply(
                lambda x: scoring_util.get_hirings_from_score(x))
        df_merged = df_merged.fillna(0)
        # Column order matters: the positional row[...] accesses below rely on it.
        cols_we_want_to_keep = [
            "_id",
            "siret",
            "naf",
            "lbb_nb_effective_hirings",
            "lba_nb_effective_hirings",
            "lbb_nb_predicted_hirings",
            "lba_nb_predicted_hirings",
            "lbb_nb_predicted_hirings_score",
            "lba_nb_predicted_hirings_score",
        ]
        df_merged = df_merged[cols_we_want_to_keep]
        values_to_update = df_merged.values.tolist()
        count = 0
        updated_ppaeh = []
        for row in values_to_update:
            row_id = row[0]
            siret = row[1]  # NOTE(review): bound but never used below
            naf = row[2]
            params = dict(
                zip([
                    "lbb_nb_effective_hirings", "lba_nb_effective_hirings",
                    "lbb_nb_predicted_hirings", "lba_nb_predicted_hirings"
                ], row[3:7]))
            lbb_nb_predicted_hirings_score = row[7]
            lba_nb_predicted_hirings_score = row[8]
            pred_effective_hirings = dict_ppaeh[row_id]
            updated_values = {"_id": row_id}
            for key, val in params.items():
                updated_values[key] = val
            is_a_bonne_boite = False
            is_a_bonne_alternance = False
            naf_present_in_mapping_rome_naf = naf in perf_division_per_rome_dict
            if naf_present_in_mapping_rome_naf:
                # A company is a "bonne boite"/"bonne alternance" as soon as
                # ONE rome code of its naf passes the matching threshold.
                for rome_code, values in perf_division_per_rome_dict[
                        naf].items():
                    score_lbb = scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
                        score=lbb_nb_predicted_hirings_score,
                        rome_code=rome_code,
                        naf_code=naf)
                    if score_lbb >= values["threshold_lbb"]:
                        perf_division_per_rome_dict[naf][rome_code][
                            "nb_bonne_boites_lbb"] += 1
                        is_a_bonne_boite = True
                    score_lba = scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
                        score=lba_nb_predicted_hirings_score,
                        rome_code=rome_code,
                        naf_code=naf)
                    if score_lba >= values["threshold_lba"]:
                        perf_division_per_rome_dict[naf][rome_code][
                            "nb_bonne_boites_lba"] += 1
                        is_a_bonne_alternance = True
            else:
                naf_not_founds.add(naf)
                nb_companies_with_naf_not_found += 1
            pred_effective_hirings.is_a_bonne_boite = is_a_bonne_boite
            pred_effective_hirings.is_a_bonne_alternance = is_a_bonne_alternance
            updated_values["is_a_bonne_boite"] = is_a_bonne_boite
            updated_values["is_a_bonne_alternance"] = is_a_bonne_alternance
            updated_ppaeh.append(updated_values)
            count += 1
            # Flush the pending updates in bulk every 100 000 rows.
            # NOTE(review): the original comment said "10 000" but the modulo
            # is 100000; the list is reset after each flush, so the condition
            # effectively means len(updated_ppaeh) == 100000.
            if len(updated_ppaeh) % 100000 == 0:
                logger.info(f"{count} companies have been treated")
                db_session.bulk_update_mappings(
                    PerfPredictionAndEffectiveHirings, updated_ppaeh)
                db_session.commit()
                updated_ppaeh = []
        # Commit for the remaining rows
        db_session.bulk_update_mappings(PerfPredictionAndEffectiveHirings,
                                        updated_ppaeh)
        db_session.commit()
        updated_ppaeh = []
        logger.info(
            f"Number of naf not found in the mapping rome naf for this importer cycle : {len(naf_not_founds)}"
        )
        logger.info(
            f"List of naf not found in the mapping rome naf for this importer cycle : {naf_not_founds}"
        )
        logger.info(
            f"Number of companies with naf not found in the mapping rome naf for this importer cycle : {nb_companies_with_naf_not_found}"
        )
        logger.info(f"Number of total companies : {count}")
        # Persist the per naf/rome counters accumulated above for this cycle.
        for naf_code, romes_list in perf_division_per_rome_dict.items():
            for rome_code, values in romes_list.items():
                division_per_rome = PerfDivisionPerRome(
                    importer_cycle_infos_id=ici._id,
                    naf=naf_code,
                    rome=rome_code,
                    threshold_lbb=values["threshold_lbb"],
                    threshold_lba=values["threshold_lba"],
                    nb_bonne_boites_lbb=values["nb_bonne_boites_lbb"],
                    nb_bonne_boites_lba=values["nb_bonne_boites_lba"],
                )
                db_session.add(division_per_rome)
        db_session.commit()
        # Mark the cycle as done so it is skipped on the next run.
        ici.computed = True
        db_session.add(ici)
        db_session.commit()