def insert_into_importer_cycle_infos(file_name, months_time):
    logger.info(f"\n Start : Insert data into importer_cycle_infos from file {file_name}")

    # Insert into importer_cycle_infos
    # TODO : Check that the prediction start date and end date match these ones
    execution_date = get_date_from_file_name(file_name)
    prediction_start_date = execution_date + relativedelta(months=+1) + relativedelta(day=1)  # First day of next month
    prediction_end_date = prediction_start_date + relativedelta(months=+months_time)

    importer_cycle_infos = PerfImporterCycleInfos(
        execution_date=execution_date,
        prediction_start_date=prediction_start_date,
        prediction_end_date=prediction_end_date,
        file_name=file_name,
        computed=False,
        on_google_sheets=False)
    db_session.add(importer_cycle_infos)
    db_session.commit()

    importer_cycle_infos = PerfImporterCycleInfos.query.filter(
        PerfImporterCycleInfos.file_name == file_name).first()

    logger.info(f"id = {importer_cycle_infos._id}")
    logger.info(f"execution_date = {execution_date}")
    logger.info(f"prediction_start_date = {prediction_start_date}")
    logger.info(f"prediction_end_date = {prediction_end_date}")
    logger.info(f"file_name = {file_name}")
    logger.info("insertion into importer_cycle_infos OK")

    return importer_cycle_infos

def set_importer_cycle_infos_google_sheets_boolean(importer_cycle_infos_id):
    for ici_id in importer_cycle_infos_id:
        ici = PerfImporterCycleInfos.query.filter(
            PerfImporterCycleInfos._id == ici_id).first()
        ici.on_google_sheets = True
        db_session.add(ici)
        db_session.commit()

def setUp(self):
    """
    Populate the DB with data required for these tests to work.
    """
    super(UserAccountTest, self).setUp()

    self.user = User(email='*****@*****.**', gender='male', first_name='John', last_name='Doe')
    db_session.add(self.user)
    db_session.flush()

    self.office1 = Office(
        departement='57',
        siret='00000000000001',
        company_name='1',
        headcount='5',
        city_code='57070',
        zipcode='57070',
        naf='4646Z',
        score=90,
        x=6.166667,
        y=49.133333,
    )
    self.office2 = Office(
        departement='57',
        siret='00000000000002',
        company_name='1',
        headcount='5',
        city_code='57070',
        zipcode='57070',
        naf='4646Z',
        score=90,
        x=6.166667,
        y=49.133333,
    )
    db_session.add_all([self.office1, self.office2])
    db_session.flush()

    self.user_social_auth = UserSocialAuth(
        provider=PEAMOpenIdConnect.name,
        extra_data={'id_token': 'fake'},
        user_id=self.user.id,
    )
    self.fav1 = UserFavoriteOffice(user_id=self.user.id, office_siret=self.office1.siret)
    self.fav2 = UserFavoriteOffice(user_id=self.user.id, office_siret=self.office2.siret)
    db_session.add_all([self.user_social_auth, self.fav1, self.fav2])
    db_session.flush()

    db_session.commit()

    self.assertEqual(db_session.query(User).count(), 1)
    self.assertEqual(db_session.query(Office).count(), 2)
    self.assertEqual(db_session.query(UserFavoriteOffice).count(), 2)
    self.assertEqual(db_session.query(UserSocialAuth).count(), 1)

def test_office_admin_add(self):
    form = {
        "siret": "78548035101646",
        "company_name": "SUPERMARCHES MATCH",
        "office_name": "SUPERMARCHES MATCH",
        "naf": "4711D",
        "street_number": "45",
        "street_name": "AVENUE ANDRE MALRAUX",
        "city_code": "57463",
        "zipcode": "57000",
        "email": "*****@*****.**",
        "tel": "0387787878",
        "website": "http://www.supermarchesmatch.fr",
        "flag_alternance": 0,
        "flag_junior": 0,
        "flag_senior": 0,
        "flag_handicap": 0,
        "departement": "57",
        "headcount": "12",
        "score": 90,
        "score_alternance": 75,
        "x": 6.17952,
        "y": 49.1044,
        "reason": "Demande de mise en avant",
    }

    with self.test_request_context():
        # Create an admin user.
        self.user = User(email='*****@*****.**', gender='male', first_name='John', last_name='Doe',
                         active=True, is_admin=True)
        db_session.add(self.user)
        db_session.flush()

        user_social_auth = UserSocialAuth(
            provider=PEAMOpenIdConnect.name,
            extra_data={'id_token': 'fake'},
            user_id=self.user.id,
        )
        db_session.add(user_social_auth)
        db_session.commit()

        # Log in as the admin user.
        self.user = db_session.query(User).filter_by(id=self.user.id).first()
        self.assertEqual(db_session.query(User).count(), 1)
        self.login(self.user)

        # Create OfficeAdminAdd.
        self.assertEqual(0, OfficeAdminAdd.query.filter_by(id=1).count())
        self.app.post(url_for('officeadminadd.create_view'), data=form)
        self.assertEqual(1, OfficeAdminAdd.query.filter_by(id=1).count())

        # Delete OfficeAdminAdd.
        self.app.post(url_for('officeadminadd.delete_view'), data={'id': 1})
        self.assertEqual(0, OfficeAdminAdd.query.filter_by(id=1).count())

def cbs_delete_records():
    try:
        print('> Deleting CBS records...')
        sql = text("""
            delete from labonneboite.etablissements_third_party_update where reason='%s';
        """ % (REASON_KEY))
        db_session.execute(sql)
        db_session.commit()
        print('> Done')
    except Exception as err:
        print('> error executing request', err)

def add_favorite(cls, user, office):
    """
    Add a favorite to a user.
    Avoid as much as possible replication errors by ignoring duplicates.
    """
    statement = cls.__table__.insert().prefix_with("IGNORE").values(
        user_id=user.id,
        office_siret=office.siret,
    )
    db_session.execute(statement)
    db_session.commit()

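# Hypothetical usage sketch, not part of the original module: it assumes `add_favorite`
# is exposed as a `@classmethod` on the `UserFavoriteOffice` model (the decorator is not
# shown in the snippet above). Because of the MySQL "INSERT IGNORE" prefix, calling it
# twice for the same (user, office) pair is harmless.
def example_add_favorite_twice(user, office):
    UserFavoriteOffice.add_favorite(user, office)
    UserFavoriteOffice.add_favorite(user, office)  # duplicate row is silently ignored
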
def cbs_insert_records():
    try:
        print('> Inserting CBS records...', file)
        sql = text("""
            LOAD DATA LOCAL INFILE '%s' into table etablissements_third_party_update
            FIELDS ENCLOSED BY '\"' TERMINATED BY ','
            LINES TERMINATED BY '\\n'
            IGNORE 1 ROWS
            (@score,@siret)
            SET score_alternance=@score, sirets=@siret, reason='%s', date_created=NOW();
        """ % (file, REASON_KEY))
        db_session.execute(sql)
        db_session.commit()
        print('> Done')
    except Exception as err:
        print('> error executing request', err,
              '\n> Did you forget to set the env var `ENABLE_DB_INFILE=1`?')

def load_csv_perf_division_per_rome(filename, delimiter=';'):
    # date_time_obj = datetime.datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S.%f')
    for row in load_data.load_csv_file(filename, delimiter):
        perf_div_per_rome = PerfDivisionPerRome(
            _id=row[0],
            importer_cycle_infos_id=row[1],
            naf=row[3],
            rome=row[2],
            threshold_lbb=row[4],
            nb_bonne_boites_lbb=row[5],
            threshold_lba=row[6],
            nb_bonne_boites_lba=row[7])
        db_session.add(perf_div_per_rome)
        db_session.commit()

def get_geocode_from_csv(self, csv_api_path):
    logger.info("Parsing CSV sent back by API : {}".format(csv_api_path))
    df_geocodes = pd.read_csv(csv_api_path, dtype={'siret': str})

    for index, row in df_geocodes.iterrows():
        if not numpy.isnan(row.latitude):
            coordinates = [row.longitude, row.latitude]
            geolocation = Geolocation.get(row.full_address)
            # There should not be an already existing geolocation,
            # but working on this job taught us that sometimes the coordinates
            # related to a siret do not update, even though the geolocation
            # is still added in the database.
            if geolocation:
                logger.info("Geolocation already found")
                GEOCODING_STATS['updatable_coordinates'] = GEOCODING_STATS.get('updatable_coordinates', 0) + 1
                coordinates_updates.append([row.siret, coordinates])
            else:
                logger.info("Geolocation not found")
                geolocation = Geolocation(
                    full_address=row.full_address,
                    x=coordinates[0],
                    y=coordinates[1]
                )
                db_session.add(geolocation)
                # as this method is run in parallel jobs,
                # let's commit often so that each job sees each other's changes
                # and rollback in case of rare simultaneous changes on same geolocation
                try:
                    db_session.commit()
                    # usually flush() is called as part of commit()
                    # however it is not the case in our project
                    # because autoflush=False
                    db_session.flush()
                    GEOCODING_STATS['flushes'] = GEOCODING_STATS.get('flushes', 0) + 1
                except IntegrityError:
                    # happens when a job tries to insert an already existing full_address
                    # rollback needed otherwise db_session is left
                    # in a state unusable by the other parallel jobs
                    db_session.rollback()
                    GEOCODING_STATS['rollbacks'] = GEOCODING_STATS.get('rollbacks', 0) + 1
                if coordinates:
                    GEOCODING_STATS['updatable_coordinates'] = GEOCODING_STATS.get('updatable_coordinates', 0) + 1
                    coordinates_updates.append([row.siret, coordinates])
                else:
                    GEOCODING_STATS['coordinates_not_found'] = GEOCODING_STATS.get('coordinates_not_found', 0) + 1

def remove_scam_emails():
    scam_emails = get_latest_scam_emails()

    for scam_emails_chunk in chunks(scam_emails, 100):
        query = Office.query.filter(Office.email.in_(scam_emails_chunk))
        office_count = query.count()
        if office_count:
            query.update({Office.email: ''}, synchronize_session="fetch")
            db_session.commit()
        logger.info(
            "Removed a chunk of %d scam emails from %d offices.",
            len(scam_emails_chunk),
            office_count,
        )

def get_or_create(cls, defaults=None, **kwargs):
    try:
        return db_session.query(cls).filter_by(**kwargs).one(), False
    except NoResultFound:
        if defaults:
            kwargs.update(defaults)
        instance = cls(**kwargs)
        try:
            db_session.add(instance)
            db_session.commit()
            return instance, True
        except IntegrityError:
            db_session.rollback()
            return db_session.query(cls).filter_by(**kwargs).one(), True

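# Hypothetical usage sketch, not part of the original module: it assumes `get_or_create`
# is exposed as a classmethod on the `Geolocation` model; the address and coordinates
# below are made up. The second element of the returned tuple tells the caller whether
# a new row was inserted.
def example_get_or_create_geolocation():
    geolocation, created = Geolocation.get_or_create(
        full_address="1 rue de l'exemple 57000 Metz",
        defaults={"x": 6.17, "y": 49.11},
    )
    # A concurrent insert of the same full_address is absorbed by the IntegrityError
    # branch above, so the caller never sees the duplicate-key error.
    return geolocation, created
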
def test_office_admin_remove(self):
    # Form used to create an OfficeAdminRemove.
    form = {
        'siret': '01234567891234',
        'name': 'Test company',
        'reason': 'N/A',
        'initiative': 'office',
    }

    with self.test_request_context():
        # Create an admin user.
        self.user = User(email='*****@*****.**', gender='male', first_name='John', last_name='Doe',
                         active=True, is_admin=True)
        db_session.add(self.user)
        db_session.flush()

        user_social_auth = UserSocialAuth(
            provider=PEAMOpenIdConnect.name,
            extra_data={'id_token': 'fake'},
            user_id=self.user.id,
        )
        db_session.add(user_social_auth)
        db_session.commit()

        # Log in as the admin user.
        self.user = db_session.query(User).filter_by(id=self.user.id).first()
        self.assertEqual(db_session.query(User).count(), 1)
        self.login(self.user)

        # Create OfficeAdminRemove.
        self.assertEqual(0, OfficeAdminRemove.query.filter_by(siret='01234567891234').count())
        self.app.post(url_for('officeadminremove.create_view'), data=form)
        self.assertEqual(1, OfficeAdminRemove.query.filter_by(siret='01234567891234').count())

        # Delete OfficeAdminRemove.
        self.app.post(url_for('officeadminremove.delete_view'), data={'id': 1})
        self.assertEqual(0, OfficeAdminRemove.query.filter_by(id=1).count())

def load_csv_perf_importer_cycle_infos(filename, delimiter=';'):
    # date_time_obj = datetime.datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S.%f')
    for row in load_data.load_csv_file(filename, delimiter):
        perf_importer_cycle_info = PerfImporterCycleInfos(
            _id=row[0],
            execution_date=datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S.%f'),
            prediction_start_date=datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S.%f'),
            prediction_end_date=datetime.strptime(row[3], '%Y-%m-%d %H:%M:%S.%f'),
            file_name=row[4],
            computed=(row[5] == 'True'),
            on_google_sheets=(row[6] == 'True'))
        db_session.add(perf_importer_cycle_info)
        db_session.commit()

def logout(user_social_auth=None):
    """
    Log a user out.

    Param `user_social_auth`: a `UserSocialAuth` instance. `None` most of the time,
    except when a user is coming from the `user.account_delete` view. This param is
    intended to be passed when the view is called directly as a Python function,
    i.e. not with a `redirect()`.
    """
    if not current_user.is_authenticated:
        return redirect(url_for('root.home'))

    logged_with_peam = session.get('social_auth_last_login_backend') == PEAMOpenIdConnect.name

    if logged_with_peam:
        if not user_social_auth:
            user_social_auth = get_user_social_auth(current_user.id)
        if user_social_auth:
            id_token = user_social_auth.extra_data['id_token']
            # Force delete PEAMU token.
            db_session.query(UserSocialAuth).filter_by(user_id=current_user.id).delete()
            db_session.commit()

    # Log the user out and destroy the LBB session.
    activity.log('deconnexion')
    logout_user()

    # Clean the session: drop Python Social Auth info because it isn't done by `logout_user`.
    if 'social_auth_last_login_backend' in session:
        # Some backends have a `backend-name_state` stored in session as required by e.g. Oauth2.
        social_auth_state_key = '%s_state' % session['social_auth_last_login_backend']
        if social_auth_state_key in session:
            session.pop(social_auth_state_key)
        session.pop('social_auth_last_login_backend')

    # Log the user out from PEAM and destroy the PEAM session.
    if logged_with_peam and user_social_auth:
        params = {
            'id_token_hint': id_token,
            'redirect_uri': url_for('auth.logout_from_peam_callback', _external=True),
        }
        peam_logout_url = '%s/compte/deconnexion?%s' % (settings.PEAM_AUTH_BASE_URL, urlencode(params))
        # After this redirect, the user will be redirected to the LBB website `logout_from_peam_callback` route.
        return redirect(peam_logout_url)

    return redirect(url_for('root.home'))

def remove_scam_emails():
    scam_emails = get_latest_scam_emails()

    for scam_emails_chunk in chunks(scam_emails, 100):
        query = Office.query.filter(Office.email.in_(scam_emails_chunk))
        office_count = query.count()
        if office_count:
            history = []
            for office in query.all():
                history.append(
                    HistoryBlacklist(email=office.email, datetime_removal=datetime.datetime.now())
                )
            db_session.add_all(history)
            query.update({Office.email: ''}, synchronize_session="fetch")
            db_session.commit()
        logger.info(
            "Removed a chunk of %d scam emails from %d offices.",
            len(scam_emails_chunk),
            office_count,
        )

def test_get_user_social_auth(self):
    """
    Test the `get_user_social_auth()` function.
    """
    user = User(email='*****@*****.**', gender='male', first_name='John', last_name='Doe')
    db_session.add(user)
    db_session.flush()

    expected_user_social_auth = UserSocialAuth(provider=PEAMOpenIdConnect.name, extra_data=None, user_id=user.id)
    db_session.add(expected_user_social_auth)
    db_session.flush()

    db_session.commit()

    self.assertEqual(db_session.query(User).count(), 1)
    self.assertEqual(db_session.query(UserSocialAuth).count(), 1)

    user_social_auth = get_user_social_auth(user.id)
    self.assertEqual(user_social_auth.id, expected_user_social_auth.id)

def test_logout(self):
    """
    Test that the session is cleaned after a logout.
    """
    user = User(email='*****@*****.**', gender='male', first_name='John', last_name='Doe')
    db_session.add(user)
    db_session.flush()

    # This `UserSocialAuth` entry will be required later by the logout function.
    user_social_auth = UserSocialAuth(
        provider=PEAMOpenIdConnect.name,
        extra_data={'id_token': 'fake'},
        user_id=user.id,
    )
    db_session.add(user_social_auth)
    db_session.commit()

    with self.test_request_context:
        with self.app.session_transaction() as sess:
            # This should not be deleted by a login or logout.
            sess['this_should_not_be_deleted'] = 'foo'

        self.login(user)

        with self.app.session_transaction() as sess:
            self.assertIn('this_should_not_be_deleted', sess)
            self.assertIn('user_id', sess)
            self.assertIn('social_auth_last_login_backend', sess)
            self.assertIn('peam-openidconnect_state', sess)

        self.logout()

        with self.app.session_transaction() as sess:
            self.assertIn('this_should_not_be_deleted', sess)
            self.assertNotIn('user_id', sess)
            self.assertNotIn('social_auth_last_login_backend', sess)
            self.assertNotIn('peam-openidconnect_state', sess)

def setUp(self, *args, **kwargs):
    super(AdminTest, self).setUp(*args, **kwargs)

    self.user = User(email='*****@*****.**', gender='male', first_name='John', last_name='Doe')
    db_session.add(self.user)
    db_session.flush()

    # Required for `self.logout` to work, which looks for the `extra_data` attribute.
    user_social_auth = UserSocialAuth(
        provider=PEAMOpenIdConnect.name,
        extra_data={'id_token': 'fake'},
        user_id=self.user.id,
    )
    db_session.add(user_social_auth)
    db_session.commit()

    self.assertEqual(db_session.query(User).count(), 1)

def test_clean(self):
    """
    Test `OfficeAdminExtraGeoLocation.clean()`.
    """
    extra_geolocation = OfficeAdminExtraGeoLocation(
        siret="38524664000176",
        codes="75110\n\n\n\n\n\n\n57616",
        reason="Paris 10 + Metz Saint Julien",
    )
    db_session.add(extra_geolocation)
    db_session.commit()

    # The `clean()` method should have been called automatically.
    extra_geolocation = db_session.query(OfficeAdminExtraGeoLocation).first()
    # Multiple newlines should have been removed.
    self.assertEqual(extra_geolocation.codes, '57616\n75110')
    # Corresponding Lat/Lon coords should have been found and stored.
    self.assertEqual(
        extra_geolocation.geolocations,
        '[[49.135208952059884, 6.207906756168173], [48.8815994262695, 2.36229991912841]]'
    )

def add_offices():
    """
    Add offices (complete the data provided by the importer).
    """
    for office_to_add in db_session.query(OfficeAdminAdd).all():
        office = Office.query.filter_by(siret=office_to_add.siret).first()

        # Only create a new office if it does not already exist.
        # This guarantees that the importer data will always have precedence.
        if not office:

            # The `headcount` field of an `OfficeAdminAdd` instance has a `code` attribute.
            if hasattr(office_to_add.headcount, 'code'):
                headcount = office_to_add.headcount.code
            else:
                headcount = office_to_add.headcount

            # Create the new office in DB.
            new_office = Office()
            # Use `inspect` because `Office` columns are named distinctly from attributes.
            for field_name in list(inspect(Office).columns.keys()):
                try:
                    value = getattr(office_to_add, field_name)
                except AttributeError:
                    # Some fields are not shared between `Office` and `OfficeAdminAdd`.
                    continue
                if field_name == 'headcount':
                    value = headcount
                setattr(new_office, field_name, value)
            db_session.add(new_office)
            db_session.commit()

            # Create the new office in ES.
            doc = get_office_as_es_doc(office_to_add)
            es.Elasticsearch().create(index=settings.ES_INDEX, doc_type=es.OFFICE_TYPE,
                                      id=office_to_add.siret, body=doc)

def load_csv_perf_prediction_and_effective_h(filename, delimiter=';'):
    # date_time_obj = datetime.datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S.%f')
    for row in load_data.load_csv_file(filename, delimiter):
        perf_prediction_and_effective_hirings = PerfPredictionAndEffectiveHirings(
            _id=row[0],
            importer_cycle_infos_id=row[1],
            siret=row[2],
            naf=row[3],
            city_code=row[4],
            zipcode=row[5],
            departement=row[6],
            company_name=row[7],
            office_name=row[8],
            lbb_nb_predicted_hirings_score=row[9],
            lba_nb_predicted_hirings_score=row[10],
            lbb_nb_predicted_hirings=row[11],
            lba_nb_predicted_hirings=row[12],
            lbb_nb_effective_hirings=row[13],
            lba_nb_effective_hirings=row[14],
            is_a_bonne_boite=(row[15] == "True"),
            is_a_bonne_alternance=(row[16] == "True"))
        db_session.add(perf_prediction_and_effective_hirings)
        db_session.commit()

def account_delete():
    """
    Ask for a confirmation, then delete the current user account and all of its information.
    """
    form = UserAccountDeleteForm(request.form)
    if request.method == 'POST' and form.validate():

        # Store the current `UserSocialAuth` instance in memory because it will be deleted
        # but it will also be needed later to properly logout the user from PEAM.
        user_social_auth = get_user_social_auth(current_user.id)

        # Now we can safely delete the current `UserSocialAuth` instance.
        # We have to delete it because it has a foreign key to the User table.
        # We don't need to deal with the other tables of Social Auth, see:
        # https://python-social-auth.readthedocs.io/en/latest/storage.html
        db_session.query(UserSocialAuth).filter_by(user_id=current_user.id).delete()

        # Delete the current user.
        # The user's favorites will be deleted at the same time because of the `ondelete='CASCADE'`
        # on the `user_id` field of the `UserFavoriteOffice` model.
        db_session.query(User).filter_by(id=current_user.id).delete()

        db_session.commit()

        message = "La suppression de votre compte a bien été effectuée."
        flash(message, 'warning')

        # Return the `logout` view directly. It allows us to pass the full
        # `user_social_auth` object as a parameter.
        return logout(user_social_auth=user_social_auth)

    context = {
        'form': form,
    }
    return render_template('user/account_delete.html', **context)

def fonction_with_history():
    # Presumably the inner wrapper of a decorator: `script_name` and
    # `function_to_execute` come from the enclosing scope.

    # Get the job_name argument and remove the .py extension from the job name.
    job = script_name.split('.')
    if job[1] == 'py':
        job_name = job[0]
    else:
        raise BadDecoratorUse

    # Check that the previous job is done before starting this one.
    # If the previous job is not done, raise an exception.
    info_previous_job = get_previous_job_info(job_name)
    if info_previous_job['is_completed'] is False:
        print(f"The previous job '{info_previous_job['name']}' is not done")
        raise PreviousJobNotDone
    else:
        print(f"The previous job '{info_previous_job['name']}' is done. We can run this one!")

    # Save the start of this job in the database.
    start_date = datetime.now()
    history = HistoryImporterJobs(
        start_date=start_date,
        end_date=None,
        job_name=job_name,
        status=StatusJobExecution['start'],
        exception=None,
        trace_log=None)
    db_session.add(history)
    db_session.commit()

    # If the job succeeds, save it with the "done" status.
    try:
        result = function_to_execute()
        history.end_date = datetime.now()
        history.status = StatusJobExecution['done']
        db_session.commit()
    # Otherwise, if an error occurred, save it in the DB with the "error" status and re-raise.
    except Exception as e:
        history.end_date = datetime.now()
        history.exception = type(e).__name__
        history.trace_log = traceback.format_exc()
        history.status = StatusJobExecution['error']
        db_session.commit()
        raise
    return result

def save(self, commit=True):
    db_session.add(self)
    if commit:
        db_session.commit()
    return self

def delete(self, commit=True):
    db_session.delete(self)
    return commit and db_session.commit()

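# Hypothetical usage sketch, not part of the original module: it assumes the `Office`
# model inherits the `save`/`delete` helpers above; the field values mirror the test
# fixtures shown earlier in this collection. Passing commit=False lets a caller stage
# several objects and commit them in a single transaction.
def example_save_and_delete_office():
    office = Office(departement='57', siret='00000000000003', company_name='ACME',
                    headcount='5', city_code='57070', zipcode='57070', naf='4646Z',
                    score=90, x=6.166667, y=49.133333)
    office.save(commit=False)  # added to the session, nothing written yet
    office.save()              # commits the pending insert
    office.delete()            # deletes the row and commits immediately
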
def find_coordinates_for_address(self):
    """
    Find coordinates for an address based on the BAN (base d'adresses nationale),
    an online governmental service.
    """
    coordinates = None
    # FIXME refer to settings.API_ADRESS_BASE_URL and make sure we don't
    # make real requests in unit tests
    BASE = "http://api-adresse.data.gouv.fr/search/?q="
    geocoding_request = "%s%s" % (BASE, self.full_address)
    geolocation = Geolocation.get(self.full_address)

    if geolocation:
        # coordinates were already queried and cached before
        coordinates = [geolocation.x, geolocation.y]
        GEOCODING_STATS['cache_hits'] = GEOCODING_STATS.get('cache_hits', 0) + 1
    else:
        # coordinates need to be queried and cached
        response = session.get(geocoding_request)
        response.close()
        GEOCODING_STATS['cache_misses'] = GEOCODING_STATS.get('cache_misses', 0) + 1
        if response.status_code == 200:
            try:
                results = response.json()['features']
                if len(results) >= 1:
                    coordinates = results[0]['geometry']['coordinates']
                    # let's cache the result for later computations
                    geolocation = Geolocation(
                        full_address=self.full_address,
                        x=coordinates[0],
                        y=coordinates[1])
                    db_session.add(geolocation)
                    # as this method is run in parallel jobs,
                    # let's commit often so that each job sees each other's changes
                    # and rollback in case of rare simultaneous changes on same geolocation
                    try:
                        db_session.commit()
                        # usually flush() is called as part of commit()
                        # however it is not the case in our project
                        # because autoflush=False
                        db_session.flush()
                        GEOCODING_STATS['flushes'] = GEOCODING_STATS.get('flushes', 0) + 1
                    except IntegrityError:
                        # happens when a job tries to insert an already existing full_address
                        # rollback needed otherwise db_session is left
                        # in a state unusable by the other parallel jobs
                        db_session.rollback()
                        GEOCODING_STATS['rollbacks'] = GEOCODING_STATS.get('rollbacks', 0) + 1
            except ValueError:
                logger.warning('ValueError in json-ing features result %s', response.text)

    if coordinates:
        if coordinates == self.initial_coordinates:
            GEOCODING_STATS['unchanged_coordinates'] = GEOCODING_STATS.get('unchanged_coordinates', 0) + 1
        else:
            GEOCODING_STATS['updatable_coordinates'] = GEOCODING_STATS.get('updatable_coordinates', 0) + 1
            self.updates.append([self.siret, coordinates])
    else:
        GEOCODING_STATS['coordinates_not_found'] = GEOCODING_STATS.get('coordinates_not_found', 0) + 1

def compute_effective_and_predicted_hirings():
    logger.info("\n Start : Computing effective hirings")

    importer_cycles_infos = PerfImporterCycleInfos.query.filter(
        PerfImporterCycleInfos.computed == False).all()

    importer_cycles_infos_to_compute = []
    for ici in importer_cycles_infos:
        if os.environ["LBB_ENV"] in ["development", "test"]:
            importer_cycles_infos_to_compute.append(ici)
            continue
        if ici.prediction_end_date < datetime.now():
            importer_cycles_infos_to_compute.append(ici)

    logger.info(
        f"Importer cycles infos which have not been computed yet : {[i.file_name for i in importer_cycles_infos_to_compute]}"
    )

    for ici in importer_cycles_infos_to_compute:
        perf_division_per_rome_dict = load_perf_division_per_rome_dict()
        naf_not_founds = set()
        nb_companies_with_naf_not_found = 0

        logger.info(f"Start computing for importer cycle infos : {ici._id} - {ici.file_name}")

        engine = import_util.create_sqlalchemy_engine()

        ppaeh = PerfPredictionAndEffectiveHirings.query.filter(
            PerfPredictionAndEffectiveHirings.importer_cycle_infos_id == ici._id)

        columns_companies = [
            "_id", "siret", "naf",
            "lbb_nb_predicted_hirings_score", "lba_nb_predicted_hirings_score"
        ]
        dict_df_companies = {}
        dict_ppaeh = {}
        for col in columns_companies:
            dict_df_companies[col] = []
        for perf in ppaeh:
            dict_ppaeh[perf._id] = perf
            for col in columns_companies:
                dict_df_companies[col].append(getattr(perf, col))
        del ppaeh

        df_companies_list = pd.DataFrame(data=dict_df_companies)
        logger.info(f"Nb offices to compute : {len(df_companies_list)}")

        query_hirings_lbb = f"SELECT siret, count(*) as lbb_nb_effective_hirings \
            FROM hirings \
            WHERE hiring_date >= '{ici.prediction_start_date}' \
            and hiring_date <= '{ici.prediction_end_date}' \
            and (contract_type={Hiring.CONTRACT_TYPE_CDD} or contract_type={Hiring.CONTRACT_TYPE_CDI}) \
            GROUP BY siret;"
        df_hirings_lbb = pd.read_sql_query(query_hirings_lbb, engine)
        logger.info(f"Nb offices found in hirings for lbb : {len(df_hirings_lbb)}")

        query_hirings_lba = f"SELECT siret, count(*) as lba_nb_effective_hirings \
            FROM hirings \
            WHERE hiring_date >= '{ici.prediction_start_date}' \
            and hiring_date <= '{ici.prediction_end_date}' \
            and (contract_type={Hiring.CONTRACT_TYPE_APR} or contract_type={Hiring.CONTRACT_TYPE_CP}) \
            GROUP BY siret;"
        df_hirings_lba = pd.read_sql_query(query_hirings_lba, engine)
        logger.info(f"Nb offices found in hirings for lba: {len(df_hirings_lba)}")

        engine.close()

        df_merge_hirings_tmp = pd.merge(df_companies_list, df_hirings_lbb, how='left', on="siret")
        df_merged = pd.merge(df_merge_hirings_tmp, df_hirings_lba, how='left', on="siret")

        # Compute the predicted hirings from the score
        df_merged["lbb_nb_predicted_hirings"] = df_merged["lbb_nb_predicted_hirings_score"].apply(
            lambda x: scoring_util.get_hirings_from_score(x))
        df_merged["lba_nb_predicted_hirings"] = df_merged["lba_nb_predicted_hirings_score"].apply(
            lambda x: scoring_util.get_hirings_from_score(x))

        df_merged = df_merged.fillna(0)

        cols_we_want_to_keep = [
            "_id",
            "siret",
            "naf",
            "lbb_nb_effective_hirings",
            "lba_nb_effective_hirings",
            "lbb_nb_predicted_hirings",
            "lba_nb_predicted_hirings",
            "lbb_nb_predicted_hirings_score",
            "lba_nb_predicted_hirings_score",
        ]
        df_merged = df_merged[cols_we_want_to_keep]

        values_to_update = df_merged.values.tolist()

        count = 0
        updated_ppaeh = []
        for row in values_to_update:
            row_id = row[0]
            siret = row[1]
            naf = row[2]
            params = dict(zip([
                "lbb_nb_effective_hirings", "lba_nb_effective_hirings",
                "lbb_nb_predicted_hirings", "lba_nb_predicted_hirings"
            ], row[3:7]))
            lbb_nb_predicted_hirings_score = row[7]
            lba_nb_predicted_hirings_score = row[8]

            pred_effective_hirings = dict_ppaeh[row_id]
            updated_values = {"_id": row_id}
            for key, val in params.items():
                updated_values[key] = val

            is_a_bonne_boite = False
            is_a_bonne_alternance = False

            naf_present_in_mapping_rome_naf = naf in perf_division_per_rome_dict
            if naf_present_in_mapping_rome_naf:
                for rome_code, values in perf_division_per_rome_dict[naf].items():
                    score_lbb = scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
                        score=lbb_nb_predicted_hirings_score,
                        rome_code=rome_code,
                        naf_code=naf)
                    if score_lbb >= values["threshold_lbb"]:
                        perf_division_per_rome_dict[naf][rome_code]["nb_bonne_boites_lbb"] += 1
                        is_a_bonne_boite = True

                    score_lba = scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
                        score=lba_nb_predicted_hirings_score,
                        rome_code=rome_code,
                        naf_code=naf)
                    if score_lba >= values["threshold_lba"]:
                        perf_division_per_rome_dict[naf][rome_code]["nb_bonne_boites_lba"] += 1
                        is_a_bonne_alternance = True
            else:
                naf_not_founds.add(naf)
                nb_companies_with_naf_not_found += 1

            pred_effective_hirings.is_a_bonne_boite = is_a_bonne_boite
            pred_effective_hirings.is_a_bonne_alternance = is_a_bonne_alternance
            updated_values["is_a_bonne_boite"] = is_a_bonne_boite
            updated_values["is_a_bonne_alternance"] = is_a_bonne_alternance
            updated_ppaeh.append(updated_values)
            count += 1

            # Commit the pending updates in batches of 100 000 rows
            if len(updated_ppaeh) % 100000 == 0:
                logger.info(f"{count} companies have been treated")
                db_session.bulk_update_mappings(PerfPredictionAndEffectiveHirings, updated_ppaeh)
                db_session.commit()
                updated_ppaeh = []

        # Commit for the remaining rows
        db_session.bulk_update_mappings(PerfPredictionAndEffectiveHirings, updated_ppaeh)
        db_session.commit()
        updated_ppaeh = []

        logger.info(f"Number of naf not found in the mapping rome naf for this importer cycle : {len(naf_not_founds)}")
        logger.info(f"List of naf not found in the mapping rome naf for this importer cycle : {naf_not_founds}")
        logger.info(f"Number of companies with naf not found in the mapping rome naf for this importer cycle : {nb_companies_with_naf_not_found}")
        logger.info(f"Number of total companies : {count}")

        for naf_code, romes_list in perf_division_per_rome_dict.items():
            for rome_code, values in romes_list.items():
                division_per_rome = PerfDivisionPerRome(
                    importer_cycle_infos_id=ici._id,
                    naf=naf_code,
                    rome=rome_code,
                    threshold_lbb=values["threshold_lbb"],
                    threshold_lba=values["threshold_lba"],
                    nb_bonne_boites_lbb=values["nb_bonne_boites_lbb"],
                    nb_bonne_boites_lba=values["nb_bonne_boites_lba"],
                )
                db_session.add(division_per_rome)
        db_session.commit()

        ici.computed = True
        db_session.add(ici)
        db_session.commit()

def test_admin_access(self):
    """
    Test admin access permissions.
    """
    admin_urls = [
        self.url_for('admin.index'),
        self.url_for('users.index_view'),
        self.url_for('officeadminadd.index_view'),
        self.url_for('officeadminremove.index_view'),
        self.url_for('officeadminupdate.index_view'),
        self.url_for('officeadminextrageolocation.index_view'),
    ]

    with self.test_request_context():
        for url in admin_urls:

            # Access should be denied when a user is not logged in.
            db_session.query(User).update({User.active: True, User.is_admin: False})
            db_session.commit()
            self.user = db_session.query(User).filter_by(id=self.user.id).first()
            self.assertTrue(self.user.active)
            self.assertFalse(self.user.is_admin)
            rv = self.app.get(url)
            self.assertEqual(rv.status_code, 404)

            self.login(self.user)

            # Access should be denied when a user is logged in but is not an admin.
            rv = self.app.get(url)
            self.assertEqual(rv.status_code, 404)

            # Access should be granted when a user is logged in and is admin.
            db_session.query(User).update({User.active: True, User.is_admin: True})
            db_session.commit()
            self.user = db_session.query(User).filter_by(id=self.user.id).first()
            self.assertTrue(self.user.active)
            self.assertTrue(self.user.is_admin)
            rv = self.app.get(url)
            self.assertEqual(rv.status_code, 200)

            # Access should be denied when a user is not active.
            db_session.query(User).update({User.active: False, User.is_admin: True})
            db_session.commit()
            self.user = db_session.query(User).filter_by(id=self.user.id).first()
            self.assertFalse(self.user.active)
            self.assertTrue(self.user.is_admin)
            rv = self.app.get(url)
            self.assertEqual(rv.status_code, 404)

            self.logout()

def run_task(self):
    date_insertion = datetime.now()
    logger.info("extracting %s ", self.input_filename)
    # this pattern matches the first date in the file name
    # e.g. 'lbb_xdpdpae_delta_201611102200.bz2' will match 20161110
    date_pattern = r'.*_(\d\d\d\d\d\d\d\d)\d\d\d\d'
    # We keep only the date in the file name, e.g. 20190910 = 10 September 2019
    date_match = re.match(date_pattern, self.input_filename)
    if date_match:
        date_part = date_match.groups()[0]
        self.last_historical_data_date_in_file = datetime.strptime(date_part, "%Y%m%d")
        logger.debug("identified last_historical_data_date_in_file=%s",
                     self.last_historical_data_date_in_file)
    else:
        raise Exception("couldn't find a date pattern in filename. "
                        "filename should be like lbb_xdpdpae_delta_YYYYMMDDHHMM.csv")

    count = 0
    statements = []
    something_new = False
    query = """
        INSERT into %s(
            siret,
            hiring_date,
            contract_type,
            departement,
            contract_duration,
            iiann,
            tranche_age,
            handicap_label,
            duree_pec,
            date_insertion
            )
        values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
    """ % settings.HIRING_TABLE
    imported_dpae = 0
    imported_dpae_distribution = {}
    not_imported_dpae = 0

    last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date)) \
        .filter(Hiring.contract_type.in_((Hiring.CONTRACT_TYPE_CDI,
                                          Hiring.CONTRACT_TYPE_CDD,
                                          Hiring.CONTRACT_TYPE_CTT))).first()[0]
    if last_historical_data_date_in_db is None:
        last_historical_data_date_in_db = DEFAULT_DATETIME_DPAE

    logger.info("will now extract all dpae with hiring_date between %s and %s",
                last_historical_data_date_in_db, self.last_historical_data_date_in_file)

    with import_util.get_reader(self.input_filename) as myfile:
        con, cur = import_util.create_cursor()
        header_line = myfile.readline().strip()  # FIXME detect column positions from header
        if b"siret" not in header_line:
            logger.debug(header_line)
            raise Exception("wrong header line")

        for line in myfile:
            line = line.decode()
            count += 1
            if not count % 100000:
                logger.debug("reading line %i", count)
                try:
                    try:
                        cur.executemany(query, statements)
                    except OperationalError:
                        # retry once in case of deadlock error
                        time.sleep(10)
                        cur.executemany(query, statements)
                    statements = []
                    con.commit()
                    something_new = True
                except:
                    logger.error("error in executing statement into dpae table: %s", sys.exc_info()[1])
                    statements = []
                    raise

            try:
                siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
            except ValueError:
                self.zipcode_errors += 1
                continue
            except InvalidRowException:
                logger.info("invalid_row met at row: %i", count)
                self.invalid_row_errors += 1
                continue

            dpae_should_be_imported = (
                hiring_date > last_historical_data_date_in_db
                and hiring_date <= self.last_historical_data_date_in_file
                # For DPAE contracts we only keep all CDI, only long enough CDD (at least 31 days)
                # and we ignore CTT.
                and (
                    contract_type == Hiring.CONTRACT_TYPE_CDI
                    or (
                        contract_type == Hiring.CONTRACT_TYPE_CDD
                        and contract_duration is not None
                        and contract_duration > 31
                    )
                )
            )

            if dpae_should_be_imported:
                statement = (siret, hiring_date, contract_type, departement, contract_duration,
                             iiann, tranche_age, handicap_label, duree_pec, date_insertion)
                statements.append(statement)
                imported_dpae += 1

                if hiring_date.year not in imported_dpae_distribution:
                    imported_dpae_distribution[hiring_date.year] = {}
                if hiring_date.month not in imported_dpae_distribution[hiring_date.year]:
                    imported_dpae_distribution[hiring_date.year][hiring_date.month] = {}
                if hiring_date.day not in imported_dpae_distribution[hiring_date.year][hiring_date.month]:
                    imported_dpae_distribution[hiring_date.year][hiring_date.month][hiring_date.day] = 0
                imported_dpae_distribution[hiring_date.year][hiring_date.month][hiring_date.day] += 1
            else:
                not_imported_dpae += 1

    # run remaining statements
    try:
        cur.executemany(query, statements)
        something_new = True
    except:
        logger.error("error in executing statement into dpae table: %s", sys.exc_info()[1])
        raise

    logger.info("processed %i dpae...", count)
    logger.info("imported dpae: %i", imported_dpae)
    logger.info("not imported dpae: %i", not_imported_dpae)
    logger.info("zipcode errors: %i", self.zipcode_errors)
    logger.info("invalid_row errors: %i", self.invalid_row_errors)
    if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
        raise IOError('too many zipcode errors')
    if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
        raise IOError('too many invalid_row errors')

    logger.info("verifying good number of dpae imported.")
    query = "select count(*) from hirings h where hiring_date > %s and hiring_date <= %s and h.contract_type in (1,2,3)"
    cur.execute(query, [last_historical_data_date_in_db, self.last_historical_data_date_in_file])
    res = cur.fetchone()
    if res[0] != imported_dpae:
        raise DoublonException(
            f"Too many DPAE ({res[0]}) in DB compared to DPAE file ({imported_dpae}).")
    logger.info("verifying number of DPAE: OK.")

    con.commit()
    cur.close()
    con.close()

    try:
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.last_historical_data_date_in_file,
            file_type=self.file_type)
        db_session.add(statistics)
        db_session.commit()
        logger.info("First way to insert DPAE statistics in DB : OK")
    except OperationalError:
        # For an obscure reason, the DpaeStatistics way to insert does not work on the bonaparte server
        # So we insert it directly via an SQL query
        # This job has been broken for more than a year, only way to fix it :
        db_session.rollback()
        last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        most_recent_date = self.last_historical_data_date_in_file.strftime('%Y-%m-%d %H:%M:%S')
        query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
        con, cur = import_util.create_cursor()
        cur.execute(query)
        con.commit()
        cur.close()
        con.close()
        logger.info("Second way to insert DPAE statistics in DB : OK")

    logger.info("finished importing dpae...")
    return something_new

def run_task(self):
    date_insertion = datetime.now()
    logger.info("extracting %s ", self.input_filename)
    # this pattern matches the first date in the file name
    # e.g. '20200803ExtractApp' will match 20200803
    date_string = self.input_filename.split('/')[-1][0:8]
    try:
        self.last_historical_data_date_in_file = datetime.strptime(date_string, "%Y%m%d")
    except ValueError:
        raise Exception("couldn't find a date pattern in filename. "
                        "filename should be like 20200803ExtractApp.csv")

    count = 0
    statements = []
    something_new = False
    query = """
        INSERT into %s(
            siret,
            hiring_date,
            contract_type,
            departement,
            contract_duration,
            iiann,
            tranche_age,
            handicap_label,
            duree_pec,
            date_insertion
            )
        values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
    """ % settings.HIRING_TABLE
    imported_alternance_contracts = 0
    imported_alternance_contracts_distribution = {}
    not_imported_alternance_contracts = 0

    last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date)) \
        .filter(Hiring.contract_type == self.contract_type).first()[0]

    logger.info("will now extract all alternance contracts with hiring_date between %s and %s",
                last_historical_data_date_in_db, self.last_historical_data_date_in_file)

    with import_util.get_reader(self.input_filename) as myfile:
        con, cur = import_util.create_cursor()
        header_line = myfile.readline().strip()  # FIXME detect column positions from header
        if b"SIRET" not in header_line:
            logger.debug(header_line)
            raise Exception("wrong header line")

        for line in myfile:
            line = line.decode()
            count += 1
            if not count % 10000:
                logger.debug("reading line %i", count)
                try:
                    try:
                        cur.executemany(query, statements)
                    except OperationalError:
                        # retry once in case of deadlock error
                        time.sleep(10)
                        cur.executemany(query, statements)
                    statements = []
                    con.commit()
                    something_new = True
                except:
                    logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
                    statements = []
                    raise

            try:
                siret, hiring_date, departement = parse_alternance_line(line)
            except InvalidRowException:
                logger.info("invalid_row met at row: %i", count)
                self.invalid_row_errors += 1
                continue
            except InvalidSiretException:
                error_message = traceback.format_exc()
                logger.info("invalid siret met at row: %i", count)
                logger.info(error_message)
                self.invalid_siret_errors += 1
                continue
            except InvalidZipCodeException:
                logger.info("invalid zip code met at row: %i", count)
                self.invalid_zipcode_errors += 1
                continue

            # This part of the code is useless:
            # the data used has a lot of late contract inputs,
            # so we have to insert ALL the contracts from different dates.
            # alternance_contract_should_be_imported = (
            #     hiring_date > last_historical_data_date_in_db
            #     and hiring_date <= self.last_historical_data_date_in_file
            # )

            if hiring_date <= self.last_historical_data_date_in_file:
                statement = (
                    siret,
                    hiring_date,
                    self.contract_type,
                    departement,
                    None,  # contract_duration
                    None,  # iiann
                    None,  # tranche_age
                    None,  # handicap_label
                    None,  # duree_pec
                    date_insertion
                )
                statements.append(statement)
                imported_alternance_contracts += 1

                if hiring_date.year not in imported_alternance_contracts_distribution:
                    imported_alternance_contracts_distribution[hiring_date.year] = {}
                if hiring_date.month not in imported_alternance_contracts_distribution[hiring_date.year]:
                    imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month] = {}
                if hiring_date.day not in imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month]:
                    imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] = 0
                imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] += 1

    # run remaining statements
    try:
        cur.executemany(query, statements)
        something_new = True
    except:
        logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
        raise

    logger.info(f"Types de contrats à importer : {self.contract_name}")
    logger.info(f"processed {count} lba_contracts...")
    logger.info(f"imported lba_contracts: {imported_alternance_contracts}")
    logger.info(f"not imported lba_contracts: {not_imported_alternance_contracts}")
    logger.info(f"zipcode errors: {self.invalid_zipcode_errors}")
    logger.info(f"invalid_row errors: {self.invalid_row_errors}")
    logger.info(f"invalid siret errors: {self.invalid_siret_errors}")

    # if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
    #     raise IOError('too many zipcode errors')
    # if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
    #     raise IOError('too many invalid_row errors')

    con.commit()
    cur.close()
    con.close()

    try:
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.last_historical_data_date_in_file,
            file_type=self.file_type
        )
        db_session.add(statistics)
        db_session.commit()
        logger.info("First way to insert DPAE statistics in DB : OK")
    except OperationalError:
        # For an obscure reason, the DpaeStatistics way to insert does not work on the bonaparte server
        # So we insert it directly via an SQL query
        # This job has been broken for more than a year, only way to fix it :
        db_session.rollback()
        last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        most_recent_date = self.last_historical_data_date_in_file.strftime('%Y-%m-%d %H:%M:%S')
        query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
        con, cur = import_util.create_cursor()
        cur.execute(query)
        con.commit()
        cur.close()
        con.close()
        logger.info("Second way to insert DPAE statistics in DB : OK")

    logger.info("finished importing alternance contracts...")
    return something_new