def run_missing_geocoding_jobs(self, csv_max_rows=5000, disable_multithreading=False):
    # The CSV file sent to the API must not exceed 8 MB (https://adresse.data.gouv.fr/api).
    # The line "03880702000011,2 RUE DE LA TETE NOIRE 14700 FALAISE,14258"
    # copied 100000 times gives a 5.77 MB file, so ~80000-100000 rows would fit.
    # --> However, after multiple tests, multithreading requires setting it to 5000, not more.
    csv_path_prefix = '/tmp/csv_geocoding'

    csv_files = []
    csv_api_back[:] = []
    for start in range(0, len(adresses_not_geolocated), csv_max_rows):
        end = start + csv_max_rows
        csv_path = "{}-{}-{}.csv".format(csv_path_prefix, start, end)
        with open(csv_path, 'w') as resultFile:
            wr = csv.writer(resultFile, dialect='excel')
            wr.writerow(("siret", "full_address", "city_code"))
            wr.writerows(adresses_not_geolocated[start:end])
        csv_files.append(csv_path)
        GEOCODING_STATS['number created CSV'] = GEOCODING_STATS.get('number created CSV', 0) + 1
        logger.debug("wrote CSV file to %s", csv_path)

    logger.info("GEOCODING_STATS = %s", GEOCODING_STATS)

    for csv_path in csv_files:
        self.get_csv_from_api(csv_path)

    logger.info("GEOCODING_STATS = %s", GEOCODING_STATS)

    if disable_multithreading:
        for csv_path in csv_api_back:
            self.get_geocode_from_csv(csv_path)
    else:
        pool = Pool(processes=pool_size)
        for csv_path in csv_api_back:
            pool.apply_async(self.get_geocode_from_csv, (csv_path,))
        pool.close()
        pool.join()

    logger.info("GEOCODING_STATS = %s", GEOCODING_STATS)
    return coordinates_updates
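# A minimal sketch of what the per-chunk API call above could look like, shown
# because the chunking logic only makes sense alongside the upload step. The
# endpoint is the public BAN bulk CSV geocoder documented at
# https://adresse.data.gouv.fr/api; the helper name, output path and column
# mapping are assumptions, not the actual get_csv_from_api implementation.
import requests

BAN_CSV_ENDPOINT = "https://api-adresse.data.gouv.fr/search/csv/"  # assumed endpoint

def post_csv_chunk_to_ban(csv_path):
    # upload one chunk written by run_missing_geocoding_jobs and save the
    # geocoded CSV (same rows plus latitude/longitude columns) next to it
    with open(csv_path, 'rb') as f:
        response = requests.post(
            BAN_CSV_ENDPOINT,
            files={'data': f},
            data={'columns': 'full_address', 'citycode': 'city_code'},
            timeout=60,
        )
    response.raise_for_status()
    result_path = csv_path.replace('.csv', '-geocoded.csv')
    with open(result_path, 'wb') as out:
        out.write(response.content)
    return result_path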
def run_task(self):
    date_insertion = datetime.now()
    logger.info("extracting %s ", self.input_filename)
    # this pattern matches the first date in the file name
    # e.g. 'lbb_xdpdpae_delta_201611102200.bz2' will match 20161110
    date_pattern = r'.*_(\d\d\d\d\d\d\d\d)\d\d\d\d'
    # we keep only the date part, e.g. 20190910 = 10 September 2019
    date_match = re.match(date_pattern, self.input_filename)
    if date_match:
        date_part = date_match.groups()[0]
        self.last_historical_data_date_in_file = datetime.strptime(date_part, "%Y%m%d")
        logger.debug("identified last_historical_data_date_in_file=%s",
                     self.last_historical_data_date_in_file)
    else:
        raise Exception("couldn't find a date pattern in filename. filename should be "
                        "like lbb_xdpdpae_delta_YYYYMMDDHHMM.csv")

    count = 0
    statements = []
    something_new = False
    query = """
        INSERT into %s(
            siret,
            hiring_date,
            contract_type,
            departement,
            contract_duration,
            iiann,
            tranche_age,
            handicap_label,
            duree_pec,
            date_insertion
        ) values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
    """ % settings.HIRING_TABLE
    imported_dpae = 0
    imported_dpae_distribution = {}
    not_imported_dpae = 0
    last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date)) \
        .filter(Hiring.contract_type.in_((
            Hiring.CONTRACT_TYPE_CDI,
            Hiring.CONTRACT_TYPE_CDD,
            Hiring.CONTRACT_TYPE_CTT))).first()[0]
    if last_historical_data_date_in_db is None:
        last_historical_data_date_in_db = DEFAULT_DATETIME_DPAE

    logger.info("will now extract all dpae with hiring_date between %s and %s",
                last_historical_data_date_in_db, self.last_historical_data_date_in_file)

    with import_util.get_reader(self.input_filename) as myfile:
        con, cur = import_util.create_cursor()
        header_line = myfile.readline().strip()  # FIXME detect column positions from header
        if b"siret" not in header_line:
            logger.debug(header_line)
            raise Exception("wrong header line")

        for line in myfile:
            line = line.decode()
            count += 1
            if not count % 100000:
                logger.debug("reading line %i", count)
                try:
                    try:
                        cur.executemany(query, statements)
                    except OperationalError:  # retry once in case of deadlock error
                        time.sleep(10)
                        cur.executemany(query, statements)
                    statements = []
                    con.commit()
                    something_new = True
                except:
                    logger.error("error in executing statement into dpae table: %s",
                                 sys.exc_info()[1])
                    statements = []
                    raise

            try:
                siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
            except ValueError:
                self.zipcode_errors += 1
                continue
            except InvalidRowException:
                logger.info("invalid_row met at row: %i", count)
                self.invalid_row_errors += 1
                continue

            dpae_should_be_imported = (
                hiring_date > last_historical_data_date_in_db
                and hiring_date <= self.last_historical_data_date_in_file
                # For DPAE contracts we keep all CDI, only long enough CDD
                # (at least 31 days), and we ignore CTT.
                and (contract_type == Hiring.CONTRACT_TYPE_CDI
                     or (contract_type == Hiring.CONTRACT_TYPE_CDD
                         and contract_duration is not None
                         and contract_duration > 31))
            )

            if dpae_should_be_imported:
                statement = (siret, hiring_date, contract_type, departement, contract_duration,
                             iiann, tranche_age, handicap_label, duree_pec, date_insertion)
                statements.append(statement)
                imported_dpae += 1

                if hiring_date.year not in imported_dpae_distribution:
                    imported_dpae_distribution[hiring_date.year] = {}
                if hiring_date.month not in imported_dpae_distribution[hiring_date.year]:
                    imported_dpae_distribution[hiring_date.year][hiring_date.month] = {}
                if hiring_date.day not in imported_dpae_distribution[hiring_date.year][hiring_date.month]:
                    imported_dpae_distribution[hiring_date.year][hiring_date.month][hiring_date.day] = 0
                imported_dpae_distribution[hiring_date.year][hiring_date.month][hiring_date.day] += 1
            else:
                not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into dpae table: %s", sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
            raise IOError('too many zipcode errors')
        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
            raise IOError('too many invalid_row errors')

        logger.info("verifying good number of dpae imported.")
        query = "select count(*) from hirings h where hiring_date > %s and hiring_date <= %s and h.contract_type in (1,2,3)"
        cur.execute(query, [last_historical_data_date_in_db,
                            self.last_historical_data_date_in_file])
        res = cur.fetchone()
        if res[0] != imported_dpae:
            raise DoublonException(
                f"Mismatch between DPAE count in DB ({res[0]}) and DPAE imported from file ({imported_dpae})."
            )
        logger.info("verifying number of DPAE: OK.")
        con.commit()
        cur.close()
        con.close()

    try:
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.last_historical_data_date_in_file,
            file_type=self.file_type)
        db_session.add(statistics)
        db_session.commit()
        logger.info("First way to insert DPAE statistics in DB: OK")
    except OperationalError:
        # For an obscure reason, inserting via the DpaeStatistics model does not work
        # on the bonaparte server, so we insert directly via an SQL query.
        # This job had been broken for more than a year; this is the only known fix.
        db_session.rollback()
        last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        most_recent_date = self.last_historical_data_date_in_file.strftime('%Y-%m-%d %H:%M:%S')
        query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
        con, cur = import_util.create_cursor()
        cur.execute(query)
        con.commit()
        cur.close()
        con.close()
        logger.info("Second way to insert DPAE statistics in DB: OK")

    logger.info("finished importing dpae...")
    return something_new
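# The three-level year/month/day bookkeeping above is correct but verbose; a
# behaviourally equivalent sketch using collections.Counter keyed on
# (year, month, day) tuples (count_hirings_per_day is a hypothetical helper):
from collections import Counter
from datetime import date

def count_hirings_per_day(hiring_dates):
    return Counter((d.year, d.month, d.day) for d in hiring_dates)

# count_hirings_per_day([date(2019, 9, 10), date(2019, 9, 10), date(2019, 9, 11)])
# -> Counter({(2019, 9, 10): 2, (2019, 9, 11): 1})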
def get_offices_from_file(self):
    # FIXME elegantly parallelize this stuff
    # see
    # https://stackoverflow.com/questions/8717179/chunking-data-from-a-large-file-for-multiprocessing
    # https://docs.python.org/2/library/itertools.html#itertools.islice
    logger.info("extracting %s...", self.input_filename)
    departements = dpt.DEPARTEMENTS
    count = 0
    no_zipcode_count = 0
    unprocessable_departement_errors = 0
    format_errors = 0
    # KPIs expected after the addition of the RGPD email column:
    # number of offices which had no email before and now have one
    emails_not_here_before_rgpd = 0
    # number of offices whose existing email has been replaced by the clean RGPD email
    emails_here_before_rgpd = 0
    departement_counter_dic = {}
    offices = {}

    with import_util.get_reader(self.input_filename) as myfile:
        header_line = myfile.readline().strip()  # FIXME detect column positions from header
        if b"siret" not in header_line:
            logger.debug(header_line)
            raise ValueError("wrong header line")
        for line in myfile:
            line = line.decode()
            count += 1
            try:
                fields = import_util.get_fields_from_csv_line(line)
                if len(fields) != 22:
                    logger.exception("wrong number of fields in line %s", line)
                    raise ValueError

                siret, raisonsociale, enseigne, codenaf, numerorue, \
                    libellerue, codecommune, codepostal, email, \
                    tel, trancheeffectif_etablissement, effectif_reel, _, _, \
                    website1, website2, better_tel, \
                    website3, _, contrat_afpr, contrat_poe, contrat_pmsmp = fields

                if not siret_util.is_siret(siret):
                    logger.exception("wrong siret : %s", siret)
                    raise ValueError
                if siret in WRONG_SIRETS:
                    logger.exception("wrong siret : %s, should not be here - need another extract from the datalake", siret)
                    raise WrongSiretException
            except ValueError:
                logger.exception("exception in line %s", line)
                format_errors += 1
                continue

            # We cannot rely on the field trancheeffectif_etablissement from the
            # etablissement file; we have to rely on the field effectif_reel.
            # We take the number of employees and use a dataframe to determine
            # which workforce bracket that number belongs to.
            # If effectif_reel is missing from the dataset OR is 0, we keep the
            # old field trancheeffectif_etablissement.
            if effectif_reel != '':
                if int(effectif_reel) > 0:
                    trancheeffectif_etablissement = DF_EFFECTIF_TO_LABEL[
                        (DF_EFFECTIF_TO_LABEL.start_effectif <= int(effectif_reel))
                        & (DF_EFFECTIF_TO_LABEL.end_effectif >= int(effectif_reel))
                    ]['code'].values[0]

            website = merge_and_normalize_websites([website1, website2, website3])

            if has_text_content(better_tel):
                tel = better_tel

            flag_pmsmp = 0
            if contrat_pmsmp == "O":
                flag_pmsmp = 1

            flag_poe_afpr = 0
            if contrat_poe == "O" or contrat_afpr == "O":
                flag_poe_afpr = 1

            if codecommune.strip():
                departement = import_util.get_departement_from_zipcode(codepostal)
                process_this_departement = departement in departements
                if process_this_departement:
                    # Trello Pz5UlnFh : supprimer-les-emails-pe-des-entreprises-qui-ne-sont-pas-des-agences-pe
                    if "@pole-emploi." in email and raisonsociale != "POLE EMPLOI":
                        email = ""

                    if len(codepostal) == 4:
                        codepostal = "0%s" % codepostal
                    etab_create_fields = siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                        codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                        website, flag_poe_afpr, flag_pmsmp
                    etab_update_fields = raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                        codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                        website, flag_poe_afpr, flag_pmsmp, siret
                    if codepostal.startswith(departement):
                        departement_counter_dic.setdefault(departement, 0)
                        departement_counter_dic[departement] += 1
                        offices[siret] = {
                            "create_fields": etab_create_fields,
                            "update_fields": etab_update_fields,
                        }
                    else:
                        logger.info(
                            "zipcode %s and departement %s don't match commune_id %s",
                            codepostal, departement, codecommune,
                        )
                else:
                    unprocessable_departement_errors += 1
            else:
                no_zipcode_count += 1

            if not count % 100000:
                logger.debug("processed %s lines", count)
                yield offices
                offices = {}

    logger.info("%i offices total", count)
    logger.info("%i offices without email before which now have one thanks to RGPD emails",
                emails_not_here_before_rgpd)
    logger.info("%i offices whose existing email has been replaced by the RGPD email",
                emails_here_before_rgpd)
    logger.info("%i offices with unprocessable departement", unprocessable_departement_errors)
    logger.info("%i offices with no zipcodes", no_zipcode_count)
    logger.info("%i offices not read because of format error", format_errors)
    logger.info("%i distinct departements from file", len(departement_counter_dic))
    departement_count = sorted(departement_counter_dic.items())
    logger.info("per departement read %s", departement_count)
    logger.info("finished reading offices...")

    if get_current_env() != ENV_TEST:
        if unprocessable_departement_errors > 2500:
            raise ValueError("too many unprocessable_departement_errors")
        if no_zipcode_count > 75000:
            raise ValueError("too many no_zipcode_count")
        if format_errors > 5:
            raise ValueError("too many format_errors")
        if len(departement_counter_dic) != settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES:
            msg = "incorrect total number of departements : %s instead of expected %s" % (
                len(departement_counter_dic),
                settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES
            )
            raise ValueError(msg)
        for departement, count in departement_count:
            if not count >= settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT:
                logger.exception("only %s offices in departement %s", count, departement)
                raise ValueError("not enough offices in at least one departement")

    yield offices
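# Illustration of the DF_EFFECTIF_TO_LABEL interval lookup used above. The real
# dataframe is loaded elsewhere in the project; the brackets and codes below
# are hypothetical, shown only to make the start/end masking concrete.
import pandas as pd

DF_EFFECTIF_TO_LABEL_EXAMPLE = pd.DataFrame([
    {"start_effectif": 1, "end_effectif": 2, "code": "01"},
    {"start_effectif": 3, "end_effectif": 5, "code": "02"},
    {"start_effectif": 6, "end_effectif": 9, "code": "03"},
])

def effectif_to_label(effectif_reel, df=DF_EFFECTIF_TO_LABEL_EXAMPLE):
    # select the single bracket whose [start, end] interval contains the headcount
    matches = df[(df.start_effectif <= effectif_reel) & (df.end_effectif >= effectif_reel)]
    return matches['code'].values[0]  # IndexError if no bracket matches, as in the code above

# effectif_to_label(4) -> "02"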
def run_task(self):
    logger.info("extracting %s ", self.input_filename)
    # this pattern matches the first date
    # e.g. 'LBB_XDPDPAE_2018-09-12_2017-08-01.bz2'
    # will match 2018-09-12
    date_pattern = r'.*_(\d\d\d\d-\d\d-\d\d)_'
    date_match = re.match(date_pattern, self.input_filename)
    if date_match:
        date_part = date_match.groups()[0]
        self.last_historical_data_date_in_file = datetime.strptime(date_part, "%Y-%m-%d")
        logger.debug("identified last_historical_data_date_in_file=%s",
                     self.last_historical_data_date_in_file)
    else:
        raise Exception("couldn't find a date pattern in filename. filename should be "
                        "like LBB_XDPDPAE_YYYY-MM-DD_YYYY-MM-DD.csv")

    count = 0
    statements = []
    something_new = False
    query = """
        INSERT into %s(
            siret,
            hiring_date,
            contract_type,
            departement,
            contract_duration,
            iiann,
            tranche_age,
            handicap_label,
            duree_pec
        ) values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
    """ % settings.HIRING_TABLE
    imported_dpae = 0
    imported_dpae_distribution = {}
    not_imported_dpae = 0
    last_historical_data_date_in_db = DpaeStatistics.get_last_historical_data_date()

    logger.info("will now extract all dpae with hiring_date between %s and %s",
                last_historical_data_date_in_db, self.last_historical_data_date_in_file)

    with import_util.get_reader(self.input_filename) as myfile:
        con, cur = import_util.create_cursor()
        header_line = myfile.readline().strip()  # FIXME detect column positions from header
        if b"siret" not in header_line:
            logger.debug(header_line)
            raise Exception("wrong header line")

        for line in myfile:
            line = line.decode()
            count += 1
            if not count % 100000:
                logger.debug("reading line %i", count)
                try:
                    try:
                        cur.executemany(query, statements)
                    except OperationalError:  # retry once in case of deadlock error
                        time.sleep(10)
                        cur.executemany(query, statements)
                    statements = []
                    con.commit()
                    something_new = True
                except:
                    logger.error("error in executing statement into dpae table: %s",
                                 sys.exc_info()[1])
                    statements = []
                    raise

            try:
                siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
            except ValueError:
                self.zipcode_errors += 1
                continue
            except InvalidRowException:
                logger.info("invalid_row met at row: %i", count)
                self.invalid_row_errors += 1
                continue

            dpae_should_be_imported = (
                hiring_date > last_historical_data_date_in_db
                and hiring_date <= self.last_historical_data_date_in_file
                # For DPAE contracts we only keep all CDI, only long enough CDD
                # (at least 31 days) and we ignore CTT.
                and (contract_type == Hiring.CONTRACT_TYPE_CDI
                     or (contract_type == Hiring.CONTRACT_TYPE_CDD
                         and contract_duration is not None
                         and contract_duration > 31))
            )

            if dpae_should_be_imported:
                statement = (
                    siret,
                    hiring_date,
                    contract_type,
                    departement,
                    contract_duration,
                    iiann,
                    tranche_age,
                    handicap_label,
                    duree_pec,
                )
                statements.append(statement)
                imported_dpae += 1

                if hiring_date.year not in imported_dpae_distribution:
                    imported_dpae_distribution[hiring_date.year] = {}
                if hiring_date.month not in imported_dpae_distribution[hiring_date.year]:
                    imported_dpae_distribution[hiring_date.year][hiring_date.month] = {}
                if hiring_date.day not in imported_dpae_distribution[hiring_date.year][hiring_date.month]:
                    imported_dpae_distribution[hiring_date.year][hiring_date.month][hiring_date.day] = 0
                imported_dpae_distribution[hiring_date.year][hiring_date.month][hiring_date.day] += 1
            else:
                not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into dpae table: %s", sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
            raise IOError('too many zipcode errors')
        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
            raise IOError('too many invalid_row errors')
        con.commit()
        cur.close()
        con.close()

    statistics = DpaeStatistics(
        last_import=datetime.now(),
        most_recent_data_date=self.last_historical_data_date_in_file,
    )
    statistics.save()
    logger.info("finished importing dpae...")
    return something_new
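# Quick self-contained check of the filename pattern handled above: the greedy
# '.*_' backtracks until the captured group is the FIRST date followed by '_'.
import re
from datetime import datetime

match = re.match(r'.*_(\d\d\d\d-\d\d-\d\d)_', 'LBB_XDPDPAE_2018-09-12_2017-08-01.bz2')
assert match is not None
assert datetime.strptime(match.groups()[0], "%Y-%m-%d") == datetime(2018, 9, 12)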
def run_task(self):
    date_insertion = datetime.now()
    logger.info("extracting %s ", self.input_filename)
    # the first 8 characters of the file name hold the date
    # e.g. '20200803ExtractApp' will match 20200803
    date_string = self.input_filename.split('/')[-1][0:8]
    try:
        self.last_historical_data_date_in_file = datetime.strptime(date_string, "%Y%m%d")
    except ValueError:
        raise Exception("couldn't find a date pattern in filename. filename should be "
                        "like 20200803ExtractApp.csv")

    count = 0
    statements = []
    something_new = False
    query = """
        INSERT into %s(
            siret,
            hiring_date,
            contract_type,
            departement,
            contract_duration,
            iiann,
            tranche_age,
            handicap_label,
            duree_pec,
            date_insertion
        ) values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
    """ % settings.HIRING_TABLE
    imported_alternance_contracts = 0
    imported_alternance_contracts_distribution = {}
    not_imported_alternance_contracts = 0
    last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date)) \
        .filter(Hiring.contract_type == self.contract_type).first()[0]

    logger.info("will now extract all alternance contracts with hiring_date between %s and %s",
                last_historical_data_date_in_db, self.last_historical_data_date_in_file)

    with import_util.get_reader(self.input_filename) as myfile:
        con, cur = import_util.create_cursor()
        header_line = myfile.readline().strip()  # FIXME detect column positions from header
        if b"SIRET" not in header_line:
            logger.debug(header_line)
            raise Exception("wrong header line")

        for line in myfile:
            line = line.decode()
            count += 1
            if not count % 10000:
                logger.debug("reading line %i", count)
                try:
                    try:
                        cur.executemany(query, statements)
                    except OperationalError:  # retry once in case of deadlock error
                        time.sleep(10)
                        cur.executemany(query, statements)
                    statements = []
                    con.commit()
                    something_new = True
                except:
                    logger.error("error in executing statement into hirings table: %s",
                                 sys.exc_info()[1])
                    statements = []
                    raise

            try:
                siret, hiring_date, departement = parse_alternance_line(line)
            except InvalidRowException:
                logger.info("invalid_row met at row: %i", count)
                self.invalid_row_errors += 1
                continue
            except InvalidSiretException:
                error_message = traceback.format_exc()
                logger.info("invalid siret met at row: %i", count)
                logger.info(error_message)
                self.invalid_siret_errors += 1
                continue
            except InvalidZipCodeException:
                logger.info("invalid zip code met at row: %i", count)
                self.invalid_zipcode_errors += 1
                continue

            # The date-range check below is disabled on purpose: the data source
            # contains many late contract entries, so we have to insert ALL the
            # contracts regardless of the last date already in the database.
            # alternance_contract_should_be_imported = (
            #     hiring_date > last_historical_data_date_in_db
            #     and hiring_date <= self.last_historical_data_date_in_file
            # )

            if hiring_date <= self.last_historical_data_date_in_file:
                statement = (
                    siret,
                    hiring_date,
                    self.contract_type,
                    departement,
                    None,  # contract_duration
                    None,  # iiann
                    None,  # tranche_age
                    None,  # handicap_label
                    None,  # duree_pec
                    date_insertion,
                )
                statements.append(statement)
                imported_alternance_contracts += 1

                if hiring_date.year not in imported_alternance_contracts_distribution:
                    imported_alternance_contracts_distribution[hiring_date.year] = {}
                if hiring_date.month not in imported_alternance_contracts_distribution[hiring_date.year]:
                    imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month] = {}
                if hiring_date.day not in imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month]:
                    imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] = 0
                imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] += 1

    # run remaining statements
    try:
        cur.executemany(query, statements)
        something_new = True
    except:
        logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
        raise

    logger.info(f"contract types to import: {self.contract_name}")
    logger.info(f"processed {count} lba_contracts...")
    logger.info(f"imported lba_contracts: {imported_alternance_contracts}")
    logger.info(f"not imported lba_contracts: {not_imported_alternance_contracts}")
    logger.info(f"zipcode errors: {self.invalid_zipcode_errors}")
    logger.info(f"invalid_row errors: {self.invalid_row_errors}")
    logger.info(f"invalid siret errors: {self.invalid_siret_errors}")
    # if self.invalid_zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
    #     raise IOError('too many zipcode errors')
    # if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
    #     raise IOError('too many invalid_row errors')
    con.commit()
    cur.close()
    con.close()

    try:
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.last_historical_data_date_in_file,
            file_type=self.file_type,
        )
        db_session.add(statistics)
        db_session.commit()
        logger.info("First way to insert DPAE statistics in DB: OK")
    except OperationalError:
        # For an obscure reason, inserting via the DpaeStatistics model does not work
        # on the bonaparte server, so we insert directly via an SQL query.
        # This job had been broken for more than a year; this is the only known fix.
        db_session.rollback()
        last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        most_recent_date = self.last_historical_data_date_in_file.strftime('%Y-%m-%d %H:%M:%S')
        query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
        con, cur = import_util.create_cursor()
        cur.execute(query)
        con.commit()
        cur.close()
        con.close()
        logger.info("Second way to insert DPAE statistics in DB: OK")

    logger.info("finished importing dpae...")
    return something_new
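# The fallback above interpolates values into the INSERT statement with an
# f-string; a parameterized sketch of the same statement, assuming the DB-API
# cursor returned by import_util.create_cursor() (the helper name
# insert_dpae_statistics is hypothetical):
def insert_dpae_statistics(con, cur, last_import, most_recent_data_date, file_type):
    query = ("insert into dpae_statistics "
             "(last_import, most_recent_data_date, file_type) "
             "values (%s, %s, %s)")
    cur.execute(query, (last_import, most_recent_data_date, file_type))
    con.commit()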
def get_offices_from_file(self):
    # FIXME elegantly parallelize this stuff
    # see
    # https://stackoverflow.com/questions/8717179/chunking-data-from-a-large-file-for-multiprocessing
    # https://docs.python.org/2/library/itertools.html#itertools.islice
    logger.info("extracting %s...", self.input_filename)
    departements = dpt.DEPARTEMENTS
    count = 0
    no_zipcode_count = 0
    unprocessable_departement_errors = 0
    format_errors = 0
    departement_counter_dic = {}
    offices = {}

    with import_util.get_reader(self.input_filename) as myfile:
        header_line = myfile.readline().strip()  # FIXME detect column positions from header
        if b"siret" not in header_line:
            logger.debug(header_line)
            raise ValueError("wrong header line")
        for line in myfile:
            line = line.decode()
            count += 1
            if not count % 100000:
                logger.debug("processed %s lines", count)
            try:
                fields = import_util.get_fields_from_csv_line(line)
                if len(fields) != 22:
                    logger.exception("wrong number of fields in line %s", line)
                    raise ValueError

                siret, raisonsociale, enseigne, codenaf, numerorue, \
                    libellerue, codecommune, codepostal, email, tel, \
                    trancheeffectif_etablissement, _, _, _, \
                    website1, website2, better_tel, \
                    website3, _, contrat_afpr, contrat_poe, contrat_pmsmp = fields

                if not siret_util.is_siret(siret):
                    logger.exception("wrong siret : %s", siret)
                    raise ValueError
            except ValueError:
                logger.exception("exception in line %s", line)
                format_errors += 1
                continue

            website = merge_and_normalize_websites([website1, website2, website3])

            if has_text_content(better_tel):
                tel = better_tel

            flag_pmsmp = 0
            if contrat_pmsmp == "O":
                flag_pmsmp = 1

            flag_poe_afpr = 0
            if contrat_poe == "O" or contrat_afpr == "O":
                flag_poe_afpr = 1

            email = encoding_util.strip_french_accents(email)

            if codecommune.strip():
                departement = import_util.get_departement_from_zipcode(codepostal)
                process_this_departement = departement in departements
                if process_this_departement:
                    # Trello Pz5UlnFh : supprimer-les-emails-pe-des-entreprises-qui-ne-sont-pas-des-agences-pe
                    if "@pole-emploi." in email and raisonsociale != "POLE EMPLOI":
                        email = ""

                    if len(codepostal) == 4:
                        codepostal = "0%s" % codepostal
                    etab_create_fields = siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                        codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                        website, flag_poe_afpr, flag_pmsmp
                    etab_update_fields = raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                        codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                        website, flag_poe_afpr, flag_pmsmp, siret
                    if codepostal.startswith(departement):
                        departement_counter_dic.setdefault(departement, 0)
                        departement_counter_dic[departement] += 1
                        offices[siret] = {
                            "create_fields": etab_create_fields,
                            "update_fields": etab_update_fields,
                        }
                    else:
                        logger.info(
                            "zipcode %s and departement %s don't match commune_id %s",
                            codepostal, departement, codecommune,
                        )
                else:
                    unprocessable_departement_errors += 1
            else:
                no_zipcode_count += 1

    logger.info("%i offices total", count)
    logger.info("%i offices with unprocessable departement", unprocessable_departement_errors)
    logger.info("%i offices with no zipcodes", no_zipcode_count)
    logger.info("%i offices not read because of format error", format_errors)
    logger.info("%i distinct departements from file", len(departement_counter_dic))
    departement_count = sorted(departement_counter_dic.items())
    logger.info("per departement read %s", departement_count)
    logger.info("finished reading offices...")

    if unprocessable_departement_errors > 2500:
        raise ValueError("too many unprocessable_departement_errors")
    if no_zipcode_count > 75000:
        raise ValueError("too many no_zipcode_count")
    if format_errors > 5:
        raise ValueError("too many format_errors")
    if len(departement_counter_dic) != settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES:
        msg = "incorrect total number of departements : %s instead of expected %s" % (
            len(departement_counter_dic),
            settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES
        )
        raise ValueError(msg)
    for departement, count in departement_count:
        if not count >= settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT:
            logger.exception("only %s offices in departement %s", count, departement)
            raise ValueError("not enough offices in at least one departement")

    return offices
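# Hedged sketch of how the offices dict returned above might be consumed. The
# table name and insert strategy are assumptions; only the column order mirrors
# the create_fields tuples actually built in get_offices_from_file.
OFFICE_COLUMNS = (
    "siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, "
    "codecommune, codepostal, email, tel, departement, "
    "trancheeffectif_etablissement, website, flag_poe_afpr, flag_pmsmp"
)

def insert_offices(con, cur, offices, table="etablissements_raw"):  # hypothetical table
    placeholders = ", ".join(["%s"] * 15)
    query = "insert into %s(%s) values(%s)" % (table, OFFICE_COLUMNS, placeholders)
    cur.executemany(query, [office["create_fields"] for office in offices.values()])
    con.commit()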