def siret_validator(form, field):
    """
    Validate that a form field holds a well-formed SIRET number.

    :param form: the form owning the field (unused here; kept because
        validators are called with ``(form, field)``).
    :param field: the field to check; its ``data`` attribute is passed to
        ``is_siret``.
    :raises ValidationError: when ``is_siret(field.data)`` is falsy.
    """
    if not is_siret(field.data):
        raise ValidationError(
            "Le numéro SIRET doit être composé de 14 chiffres")
 def get_sirets_from_database(self):
     """
     Return the list of well-formed sirets stored in the raw office table.

     Reads the ``siret`` column of ``settings.RAW_OFFICE_TABLE`` and keeps
     only the values accepted by ``siret_util.is_siret``.
     """
     # The table name comes from project settings, not from user input, so
     # it is interpolated directly (table names cannot be bound parameters).
     query = "select siret from %s" % settings.RAW_OFFICE_TABLE
     logger.info("get offices from database")
     con, cur = import_util.create_cursor()
     try:
         # The query must actually be executed before rows can be fetched.
         cur.execute(query)
         rows = cur.fetchall()
     finally:
         # Release the cursor and connection even if the query fails.
         cur.close()
         con.close()
     return [row[0] for row in rows if siret_util.is_siret(row[0])]
    def validate_form(self, form):
        # Validate the office admin update form.
        #
        # Reformats three URL fields, runs the parent class validation, then
        # applies the siret / ROME / NAF / recruiter checks below. Each failed
        # check flashes an error message and returns False; when nothing
        # fails, the parent's verdict (is_valid) is returned.
        #
        # NOTE(review): several statements in this method look truncated
        # (calls left open, `except` clauses without a visible `try:`).
        # Confirm against the complete file before relying on it.

        # Add http:// if missing (per the original author; format_url is
        # defined elsewhere).
        form['new_website'].data = format_url(form['new_website'].data)
        form['social_network'].data = format_url(form['social_network'].data)
        # NOTE(review): the call below is never closed in this view; its
        # argument seems to be missing.
        form['website_alternance'].data = format_url(

        is_valid = super(OfficeAdminUpdateModelView, self).validate_form(form)

        # All sirets must be well formed
        sirets = models.OfficeAdminUpdate.as_list(form.data['sirets'])
        only_one_siret = len(sirets) == 1

        # The sirets field is mandatory.
        if is_valid and not sirets:
            message = "Le champs 'Sirets' est obligatoire. Veuillez le renseigner."
            flash(message, 'error')
            return False

        if is_valid:
            for siret in sirets:
                if not is_siret(siret):
                    # NOTE(review): the .format( call is not closed here; the
                    # argument (presumably the offending siret) is missing.
                    message = "Ce siret suivant n'est pas composé de 14 chiffres : {}".format(
                    flash(message, 'error')
                    return False

        # If only one siret, we validate it (it must match an existing Office)
        # If more than one siret : no siret validation
        if is_valid and only_one_siret:
            siret = sirets[0]
            office = models.Office.query.filter_by(siret=siret).first()
            if not office:
                # NOTE(review): unclosed .format( call, argument missing.
                message = "Le siret suivant n'est pas présent sur LBB: {}".format(
                flash(message, 'error')
                return False

        if is_valid:

            # Reject a siret that is already referenced by another update record
            for siret in sirets:
                if 'id' in request.args:
                    # Avoid conflict with itself if update by adding id != request.args['id']
                    office_update_conflict = models.OfficeAdminUpdate.query.filter(
                        models.OfficeAdminUpdate.id != request.args['id'])
                    # NOTE(review): the second query below is left open and
                    # sits in the same branch as the first; an `else:` and the
                    # siret filter condition appear to be missing.
                    office_update_conflict = models.OfficeAdminUpdate.query.filter(

                if office_update_conflict.count() > 0:
                    # NOTE(review): the literal opens with four quote
                    # characters, so the flashed text starts with a stray
                    # double quote.
                    message = """"
                        Le siret {} est déjà présent dans la fiche n°{}
                    """.format(siret, office_update_conflict[0].id)
                    flash(message, 'error')
                    return False

        # Get company: Office row of the first siret, or None without sirets
        first_office = models.Office.query.filter_by(
            siret=sirets[0]).first() if sirets else None

        # Codes ROMES to boost or to add
        # NOTE(review): no `try:` is visible before the call, although an
        # `except` follows (same for the three ROME sections below).
        if is_valid and form.data.get('romes_to_boost'):
                self.validate_romes_to_boost(form, 'romes_to_boost', 'boost')
            except (RomeToBoostException, InvalidRome) as e:
                flash(e.args[0], 'error')
                return False

        # Codes ROMES to boost or to add (for alternance)
        # NOTE(review): the guard reads 'romes_alternance_to_boost' but the
        # call passes 'romes_to_boost'; the call is also cut off. Verify the
        # intended field name and remaining arguments.
        if is_valid and form.data.get('romes_alternance_to_boost'):
                self.validate_romes_to_boost(form, 'romes_to_boost',
            except (RomeToBoostException, InvalidRome) as e:
                flash(e.args[0], 'error')
                return False

        # Codes ROMES to remove for LBB
        if is_valid and form.data.get('romes_to_remove'):
                # The office NAF is only passed when a single siret is edited.
                office_naf = first_office.naf if only_one_siret else None
                # NOTE(review): call cut off after its second argument.
                self.validate_romes_to_remove(form, 'romes_to_remove',
            except (RomeToRemoveException, InvalidRome) as e:
                flash(e.args[0], 'error')
                return False

        # Codes ROMES to remove (for alternance)
        if is_valid and form.data.get('romes_alternance_to_remove'):
                office_naf = first_office.naf if only_one_siret else None
                # NOTE(review): no validation call is visible in this section;
                # office_naf is computed but not used here.
            except (RomeToRemoveException, InvalidRome) as e:
                flash(e.args[0], 'error')
                return False

        # Codes NAF to add
        if is_valid and form.data.get('nafs_to_add'):
            nafs_to_add = form.data.get('nafs_to_add')

            for naf in models.OfficeAdminUpdate.as_list(nafs_to_add):
                # Each code must be a known NAF code.
                if naf not in settings.NAF_CODES:
                    msg = "`%s` n'est pas un code NAF valide." % naf
                    flash(msg, 'error')
                    return False
                # NOTE(review): first_office.naf is evaluated before
                # only_one_siret; with several sirets the first office is not
                # checked for existence above, so first_office may be None
                # here. Confirm.
                if naf == first_office.naf and only_one_siret:
                    msg = "Le NAF `%s` est déjà associé à cette entreprise." % naf
                    flash(msg, 'error')
                    return False

        # Recruiter identifier: required when 'certified_recruiter' is checked
        if is_valid and form.data.get(
                'certified_recruiter') and not form.data.get('recruiter_uid'):
            msg = "La case 'Recruteur certifié' est cochée mais aucun identifiant n'est indiqué."
            flash(msg, 'error')
            return False

        return is_valid
    def get_offices_from_file(self):
        # Generator: read self.input_filename line by line and yield dicts
        # keyed by siret, each value holding the "create_fields" and
        # "update_fields" tuples built below. A batch is yielded every 100000
        # lines read, and the remainder is yielded after the sanity checks at
        # the end.
        #
        # NOTE(review): this block looks truncated in several places (an
        # `except` without a visible `try:`, an unclosed subscription, an
        # unclosed dict literal, an unclosed `%` tuple). Confirm against the
        # complete file.
        #
        # FIXME elegantly parallelize this stuff
        # see
        # https://stackoverflow.com/questions/8717179/chunking-data-from-a-large-file-for-multiprocessing
        # https://docs.python.org/2/library/itertools.html#itertools.islice
        logger.info("extracting %s...", self.input_filename)
        departements = dpt.DEPARTEMENTS
        count = 0
        no_zipcode_count = 0
        unprocessable_departement_errors = 0
        format_errors = 0
        # KPI expected after the add of the RGPD email column
        # NOTE(review): neither counter is incremented anywhere in the
        # visible code; they are only logged after the loop.
        emails_here_before_rgpd = 0 # Per the log message below: offices that already had an email, now replaced by the RGPD one
        emails_not_here_before_rgpd = 0 # Per the log message below: offices that had no email before and now have one
        departement_counter_dic = {}
        offices = {}

        with import_util.get_reader(self.input_filename) as myfile:
            # The first line is the header; it is compared as bytes.
            header_line = myfile.readline().strip()  # FIXME detect column positions from header
            if b"siret" not in header_line:
                raise ValueError("wrong header line")
            for line in myfile:
                line = line.decode()
                count += 1

                # NOTE(review): the lines below are indented one level deeper
                # with no opening statement; a `try:` matching the
                # `except ValueError` further down appears to be missing.
                    fields = import_util.get_fields_from_csv_line(line)

                    # Exactly 22 columns are expected per line.
                    # NOTE(review): logger.exception is called here without
                    # an active exception being handled.
                    if len(fields) != 22:
                        logger.exception("wrong number of fields in line %s", line)
                        raise ValueError

                    # Columns bound to `_` are ignored.
                    siret, raisonsociale, enseigne, codenaf, numerorue, \
                        libellerue, codecommune, codepostal, email, \
                        tel, trancheeffectif_etablissement, effectif_reel, _, _, \
                        website1, website2, better_tel, \
                        website3, _, contrat_afpr, contrat_poe, contrat_pmsmp = fields

                    if not siret_util.is_siret(siret):
                        logger.exception("wrong siret : %s", siret)
                        raise ValueError

                    # NOTE(review): WrongSiretException is only swallowed by
                    # the handler below if it derives from ValueError; verify.
                    if siret in WRONG_SIRETS:
                        logger.exception("wrong siret : %s, should not be here - need other extract from datalake", siret)
                        raise WrongSiretException

                # A malformed line is logged and counted. No `continue` is
                # visible, so processing carries on with whatever values the
                # variables currently hold.
                except ValueError:
                    logger.exception("exception in line %s", line)
                    format_errors += 1

                # We cant rely on the field trancheeffectif_etablissement which is in etablissement file
                # We have to rely on the field effectif_reel
                # We take the number of employees and we use a dataframe which will help us to determine which category the number of employees is related to 
                # If there is no effectif reel in the dataset OR it is 0, we use the old field tranche_effectif
                if effectif_reel != '':
                    if int(effectif_reel) > 0:
                        # NOTE(review): the subscription below is never
                        # closed; the end of the lookup expression is missing.
                        trancheeffectif_etablissement = DF_EFFECTIF_TO_LABEL[ 
                            (DF_EFFECTIF_TO_LABEL.start_effectif <= int(effectif_reel)) & 
                            (DF_EFFECTIF_TO_LABEL.end_effectif >= int(effectif_reel))

                website = merge_and_normalize_websites([website1, website2, website3])

                # better_tel replaces tel when it has text content.
                if has_text_content(better_tel):
                    tel = better_tel
                # Contract columns hold "O" when the contract applies; they
                # are turned into 0/1 flags.
                flag_pmsmp = 0
                if contrat_pmsmp == "O":
                    flag_pmsmp = 1

                flag_poe_afpr = 0
                if contrat_poe == "O" or contrat_afpr == "O":
                    flag_poe_afpr = 1

                if codecommune.strip():
                    departement = import_util.get_departement_from_zipcode(codepostal)
                    process_this_departement = departement in departements
                    if process_this_departement:
                        # Trello Pz5UlnFh : supprimer-les-emails-pe-des-entreprises-qui-ne-sont-pas-des-agences-pe
                        if  "@pole-emploi." in email and raisonsociale != "POLE EMPLOI":
                            email = ""
                        # Left-pad 4-character zipcodes with a zero.
                        if len(codepostal) == 4:
                            codepostal = "0%s" % codepostal
                        # Same values in both tuples; siret comes first for
                        # create and last for update.
                        etab_create_fields = siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                            codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                            website, flag_poe_afpr, flag_pmsmp
                        etab_update_fields = raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                            codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                            website, flag_poe_afpr, flag_pmsmp, siret
                        # The office is kept only when the zipcode starts
                        # with the departement code.
                        if codepostal.startswith(departement):
                            departement_counter_dic.setdefault(departement, 0)
                            departement_counter_dic[departement] += 1
                            # NOTE(review): the dict literal is never closed,
                            # and the format string after it looks like the
                            # remnant of a dropped logging call. The two
                            # counter increments that follow presumably
                            # belonged to dropped `else:` branches. Verify.
                            offices[siret] = {
                                "create_fields": etab_create_fields,
                                "update_fields": etab_update_fields,
                                "zipcode %s and departement %s don't match commune_id %s",
                        unprocessable_departement_errors += 1
                    no_zipcode_count += 1
                # Every 100000 lines, hand the current batch to the caller
                # and start a new one.
                if not count % 100000:
                    logger.debug("processed %s lines", count)
                    yield offices
                    offices = {}

        logger.info("%i offices total", count)
        logger.info("%i offices without email before and have now thanks to RGPD mails", emails_not_here_before_rgpd)
        logger.info("%i offices with emails before and have been replaced by RGPD mails", emails_here_before_rgpd)
        logger.info("%i offices with unprocessable departement", unprocessable_departement_errors)
        logger.info("%i offices with no zipcodes", no_zipcode_count)
        logger.info("%i offices not read because of format error", format_errors)
        logger.info("%i distinct departements from file", len(departement_counter_dic))
        departement_count = sorted(departement_counter_dic.items())
        logger.info("per departement read %s", departement_count)
        logger.info("finished reading offices...")

        # Sanity checks on the whole file, skipped in the test environment.
        if get_current_env() != ENV_TEST:
            if unprocessable_departement_errors > 2500:
                raise ValueError("too many unprocessable_departement_errors")
            if no_zipcode_count > 75000:
                raise ValueError("too many no_zipcode_count")
            if format_errors > 5:
                raise ValueError("too many format_errors")
            if len(departement_counter_dic) != settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES:
                # NOTE(review): the `%` tuple is never closed; its operands
                # are missing.
                msg = "incorrect total number of departements : %s instead of expected %s" % (
                raise ValueError(msg)
            # This loop rebinds `count`, the line counter used above.
            for departement, count in departement_count:
                if not count >= settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT:
                    logger.exception("only %s offices in departement %s", count, departement)
                    raise ValueError("not enough offices in at least one departement")

        # Final batch: offices collected since the last intermediate yield.
        yield offices
    def get_offices_from_file(self):
        # Read self.input_filename line by line and return one dict keyed by
        # siret, each value holding the "create_fields" and "update_fields"
        # tuples built below. Raises ValueError when the header is wrong or
        # when one of the sanity checks at the end fails.
        #
        # NOTE(review): this block looks truncated in several places (an
        # `except` without a visible `try:`, an unclosed dict literal, an
        # unclosed `%` tuple). Confirm against the complete file.
        #
        # FIXME elegantly parallelize this stuff
        # see
        # https://stackoverflow.com/questions/8717179/chunking-data-from-a-large-file-for-multiprocessing
        # https://docs.python.org/2/library/itertools.html#itertools.islice
        logger.info("extracting %s...", self.input_filename)
        departements = dpt.DEPARTEMENTS
        count = 0
        no_zipcode_count = 0
        unprocessable_departement_errors = 0
        format_errors = 0
        departement_counter_dic = {}
        offices = {}

        with import_util.get_reader(self.input_filename) as myfile:
            # The first line is the header; it is compared as bytes.
            header_line = myfile.readline().strip()  # FIXME detect column positions from header
            if b"siret" not in header_line:
                raise ValueError("wrong header line")
            for line in myfile:
                line = line.decode()
                count += 1
                # Progress trace every 100000 lines.
                # NOTE(review): as shown, the parsing lines below share the
                # indentation of the debug call and so sit inside this `if`;
                # a `try:` matching the `except ValueError` further down
                # appears to be missing.
                if not count % 100000:
                    logger.debug("processed %s lines", count)

                    fields = import_util.get_fields_from_csv_line(line)
                    # Exactly 22 columns are expected per line.
                    # NOTE(review): logger.exception is called here without
                    # an active exception being handled.
                    if len(fields) != 22:
                        logger.exception("wrong number of fields in line %s", line)
                        raise ValueError

                    # Columns bound to `_` are ignored.
                    siret, raisonsociale, enseigne, codenaf, numerorue, \
                        libellerue, codecommune, codepostal, email, tel, \
                        trancheeffectif_etablissement, _, _, _, \
                        website1, website2, better_tel, \
                        website3, _, contrat_afpr, contrat_poe, contrat_pmsmp = fields

                    if not siret_util.is_siret(siret):
                        logger.exception("wrong siret : %s", siret)
                        raise ValueError

                # A malformed line is logged and counted. No `continue` is
                # visible, so processing carries on with whatever values the
                # variables currently hold.
                except ValueError:
                    logger.exception("exception in line %s", line)
                    format_errors += 1

                website = merge_and_normalize_websites([website1, website2, website3])

                # better_tel replaces tel when it has text content.
                if has_text_content(better_tel):
                    tel = better_tel
                # Contract columns hold "O" when the contract applies; they
                # are turned into 0/1 flags.
                flag_pmsmp = 0
                if contrat_pmsmp == "O" :
                    flag_pmsmp = 1

                flag_poe_afpr = 0
                if contrat_poe == "O" or contrat_afpr == "O" :
                    flag_poe_afpr = 1
                email = encoding_util.strip_french_accents(email)

                if codecommune.strip():
                    departement = import_util.get_departement_from_zipcode(codepostal)
                    process_this_departement = departement in departements
                    if process_this_departement:
                        # Trello Pz5UlnFh : supprimer-les-emails-pe-des-entreprises-qui-ne-sont-pas-des-agences-pe
                        if  "@pole-emploi." in email and raisonsociale != "POLE EMPLOI":
                            email = ""
                        # Left-pad 4-character zipcodes with a zero.
                        if len(codepostal) == 4:
                            codepostal = "0%s" % codepostal
                        # Same values in both tuples; siret comes first for
                        # create and last for update.
                        etab_create_fields = siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                            codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                            website, flag_poe_afpr, flag_pmsmp
                        etab_update_fields = raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                            codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                            website, flag_poe_afpr, flag_pmsmp, siret
                        # The office is kept only when the zipcode starts
                        # with the departement code.
                        if codepostal.startswith(departement):
                            departement_counter_dic.setdefault(departement, 0)
                            departement_counter_dic[departement] += 1
                            # NOTE(review): the dict literal is never closed,
                            # and the format string after it looks like the
                            # remnant of a dropped logging call. The two
                            # counter increments that follow presumably
                            # belonged to dropped `else:` branches. Verify.
                            offices[siret] = {
                                "create_fields": etab_create_fields,
                                "update_fields": etab_update_fields,
                                "zipcode %s and departement %s don't match commune_id %s",
                        unprocessable_departement_errors += 1
                    no_zipcode_count += 1

        logger.info("%i offices total", count)
        logger.info("%i offices with unprocessable departement", unprocessable_departement_errors)
        logger.info("%i offices with no zipcodes", no_zipcode_count)
        logger.info("%i offices not read because of format error", format_errors)
        logger.info("%i distinct departements from file", len(departement_counter_dic))
        departement_count = sorted(departement_counter_dic.items())
        logger.info("per departement read %s", departement_count)
        logger.info("finished reading offices...")

        # Sanity checks on the whole file; any failure aborts with ValueError.
        if unprocessable_departement_errors > 2500:
            raise ValueError("too many unprocessable_departement_errors")
        if no_zipcode_count > 75000:
            raise ValueError("too many no_zipcode_count")
        if format_errors > 5:
            raise ValueError("too many format_errors")
        if len(departement_counter_dic) != settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES:
            # NOTE(review): the `%` tuple is never closed; its operands are
            # missing.
            msg = "incorrect total number of departements : %s instead of expected %s" % (
            raise ValueError(msg)
        # This loop rebinds `count`, the line counter used above.
        for departement, count in departement_count:
            if not count >= settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT:
                logger.exception("only %s offices in departement %s", count, departement)
                raise ValueError("not enough offices in at least one departement")

        return offices