Example #1
def get_n_lines(path, n=5, ignore_header=True):
    results = []
    count = 0
    with import_util.get_reader(path) as myfile:
        if ignore_header:
            myfile.readline()  # skip the header line; its content is discarded
        for line in myfile:
            results.append(line)
            count += 1
            if count >= n:
                break
    return results
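# A minimal usage sketch for the helper above (import_util.get_reader is assumed
# to open plain or compressed files transparently; the path is hypothetical):
preview = get_n_lines("dpae_extract.csv.bz2", n=10)
for raw_line in preview:
    print(raw_line)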
Example #2
def get_latest_scam_emails():
    list_of_files = glob.glob(
        os.path.join(settings.SCAM_EMAILS_FOLDER, 'lbb_blacklist_full_*.bz2'))
    if not list_of_files:
        raise ValueError(
            "No blacklist file found. Path is most likely incorrect.")
    latest_file = max(list_of_files, key=os.path.getctime)
    with importer_util.get_reader(latest_file) as myfile:
        logger.info("Processing scam emails file %s ...", latest_file)
        myfile.readline()  # ignore header
        emails = [email.decode().strip().replace('"', '') for email in myfile]
        emails = [email for email in emails if email != '']
    logger.info("The emails file contains {} emails".format(len(emails)))
    return emails
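# A usage sketch (assumes settings.SCAM_EMAILS_FOLDER contains at least one
# lbb_blacklist_full_*.bz2 dump; the address below is illustrative):
scam_emails = set(get_latest_scam_emails())
if "someone@example.com" in scam_emails:
    pass  # e.g. skip or flag this address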
Example #3
    def get_offices_from_file(self):
        # FIXME elegantly parallelize this stuff
        # see
        # https://stackoverflow.com/questions/8717179/chunking-data-from-a-large-file-for-multiprocessing
        # https://docs.python.org/2/library/itertools.html#itertools.islice
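        # A possible chunking sketch in the spirit of those links (an assumption,
        # not wired into this job):
        #   from itertools import islice
        #   def iter_chunks(lines, size=100000):
        #       lines = iter(lines)
        #       while True:
        #           chunk = list(islice(lines, size))
        #           if not chunk:
        #               return
        #           yield chunk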
        logger.info("extracting %s...", self.input_filename)
        departements = dpt.DEPARTEMENTS
        count = 0
        no_zipcode_count = 0
        unprocessable_departement_errors = 0
        format_errors = 0
        # KPI expected after the add of the RGPD email column
        emails_here_before_rgpd = 0  # Number of offices which already had an email, now replaced by the RGPD-cleaned one
        emails_not_here_before_rgpd = 0  # Number of offices which had no email before, and now have one
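        # FIXME the two RGPD counters above are logged at the end of this method
        # but are never incremented inside the loop below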
        departement_counter_dic = {}
        offices = {}

        with import_util.get_reader(self.input_filename) as myfile:
            header_line = myfile.readline().strip()  # FIXME detect column positions from header
            if b"siret" not in header_line:
                logger.debug(header_line)
                raise ValueError("wrong header line")
            for line in myfile:
                line = line.decode()
                count += 1

                try:
                    fields = import_util.get_fields_from_csv_line(line)

                    if len(fields) != 22:
                        logger.exception("wrong number of fields in line %s", line)
                        raise ValueError

                    siret, raisonsociale, enseigne, codenaf, numerorue, \
                        libellerue, codecommune, codepostal, email, \
                        tel, trancheeffectif_etablissement, effectif_reel, _, _, \
                        website1, website2, better_tel, \
                        website3, _, contrat_afpr, contrat_poe, contrat_pmsmp = fields

                    if not siret_util.is_siret(siret):
                        logger.exception("wrong siret : %s", siret)
                        raise ValueError

                    if siret in WRONG_SIRETS:
                        logger.exception("wrong siret : %s, should not be here - need other extract from datalake", siret)
                        raise WrongSiretException

                except ValueError:
                    logger.exception("exception in line %s", line)
                    format_errors += 1
                    continue

                # We can't rely on the field trancheeffectif_etablissement from the etablissement file;
                # we have to rely on the field effectif_reel instead.
                # We take the actual number of employees and use a dataframe to determine
                # which headcount category it belongs to.
                # If effectif_reel is missing OR equal to 0, we keep the old field tranche_effectif.
                if effectif_reel != '' and int(effectif_reel) > 0:
                    trancheeffectif_etablissement = DF_EFFECTIF_TO_LABEL[
                        (DF_EFFECTIF_TO_LABEL.start_effectif <= int(effectif_reel)) &
                        (DF_EFFECTIF_TO_LABEL.end_effectif >= int(effectif_reel))
                    ]['code'].values[0]
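                # DF_EFFECTIF_TO_LABEL is assumed to be a dataframe mapping headcount
                # ranges to category codes, e.g. (hypothetical values):
                #   pd.DataFrame({"start_effectif": [1, 10, 50],
                #                 "end_effectif":   [9, 49, 199],
                #                 "code":           ["01", "02", "03"]})
                # The lookup above picks the row whose range contains effectif_reel.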

                website = merge_and_normalize_websites([website1, website2, website3])

                if has_text_content(better_tel):
                    tel = better_tel
                flag_pmsmp = 0
                if contrat_pmsmp == "O":
                    flag_pmsmp = 1

                flag_poe_afpr = 0
                if contrat_poe == "O" or contrat_afpr == "O":
                    flag_poe_afpr = 1

                if codecommune.strip():
                    departement = import_util.get_departement_from_zipcode(codepostal)
                    process_this_departement = departement in departements
                    if process_this_departement:
                        # Trello Pz5UlnFh : supprimer-les-emails-pe-des-entreprises-qui-ne-sont-pas-des-agences-pe
                        if  "@pole-emploi." in email and raisonsociale != "POLE EMPLOI":
                            email = ""
                        if len(codepostal) == 4:
                            codepostal = "0%s" % codepostal
                        etab_create_fields = siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                            codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                            website, flag_poe_afpr, flag_pmsmp
                        etab_update_fields = raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                            codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                            website, flag_poe_afpr, flag_pmsmp, siret
                        if codepostal.startswith(departement):
                            departement_counter_dic.setdefault(departement, 0)
                            departement_counter_dic[departement] += 1
                            offices[siret] = {
                                "create_fields": etab_create_fields,
                                "update_fields": etab_update_fields,
                            }
                        else:
                            logger.info(
                                "zipcode %s and departement %s don't match commune_id %s",
                                codepostal,
                                departement,
                                codecommune,
                            )
                    else:
                        unprocessable_departement_errors += 1
                else:
                    no_zipcode_count += 1
                if not count % 100000:
                    logger.debug("processed %s lines", count)
                    yield offices
                    offices = {}

        logger.info("%i offices total", count)
        logger.info("%i offices without email before and have now thanks to RGPD mails", emails_not_here_before_rgpd)
        logger.info("%i offices with emails before and have been replaced by RGPD mails", emails_here_before_rgpd)
        logger.info("%i offices with unprocessable departement", unprocessable_departement_errors)
        logger.info("%i offices with no zipcodes", no_zipcode_count)
        logger.info("%i offices not read because of format error", format_errors)
        logger.info("%i distinct departements from file", len(departement_counter_dic))
        departement_count = sorted(departement_counter_dic.items())
        logger.info("per departement read %s", departement_count)
        logger.info("finished reading offices...")

        if get_current_env() != ENV_TEST:
            if unprocessable_departement_errors > 2500:
                raise ValueError("too many unprocessable_departement_errors")
            if no_zipcode_count > 75000:
                raise ValueError("too many no_zipcode_count")
            if format_errors > 5:
                raise ValueError("too many format_errors")
            if len(departement_counter_dic) != settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES:
                msg = "incorrect total number of departements : %s instead of expected %s" % (
                    len(departement_counter_dic),
                    settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES
                )
                raise ValueError(msg)
            for departement, count in departement_count:
                if not count >= settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT:
                    logger.exception("only %s offices in departement %s", count, departement)
                    raise ValueError("not enough offices in at least one departement")

        yield offices
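    # A minimal consumption sketch for the generator above (caller and sink names
    # are illustrative, not from this file):
    #   for office_chunk in extract_task.get_offices_from_file():
    #       save_offices(office_chunk)  # hypothetical sink, one batch per 100000 lines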
Example #4
    def run_task(self):
        date_insertion = datetime.now()
        logger.info("extracting %s ", self.input_filename)
        # this pattern captures the date part of the file name,
        # e.g. 'lbb_xdpdpae_delta_201611102200.bz2'
        # will match 20161110
        date_pattern = r'.*_(\d\d\d\d\d\d\d\d)\d\d\d\d'  # keep only the date, e.g. 20190910 = September 10, 2019
        date_match = re.match(date_pattern, self.input_filename)
        if date_match:
            date_part = date_match.groups()[0]
            self.last_historical_data_date_in_file = datetime.strptime(
                date_part, "%Y%m%d")
            logger.debug("identified last_historical_data_date_in_file=%s",
                         self.last_historical_data_date_in_file)
        else:
            raise Exception(
                "couldn't find a date pattern in filename. "
                "filename should be like lbb_xdpdpae_delta_YYYYMMDDHHMM.csv")

        count = 0
        statements = []
        something_new = False
        query = """
            INSERT into %s(
                siret,
                hiring_date,
                contract_type,
                departement,
                contract_duration,
                iiann,
                tranche_age,
                handicap_label,
                duree_pec,
                date_insertion
                )
            values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
        """ % settings.HIRING_TABLE
        imported_dpae = 0
        imported_dpae_distribution = {}
        not_imported_dpae = 0
        last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date)) \
                                        .filter(Hiring.contract_type.in_((Hiring.CONTRACT_TYPE_CDI,
                                                                          Hiring.CONTRACT_TYPE_CDD,
                                                                          Hiring.CONTRACT_TYPE_CTT))).first()[0]
        if last_historical_data_date_in_db is None:
            last_historical_data_date_in_db = DEFAULT_DATETIME_DPAE
        logger.info(
            "will now extract all dpae with hiring_date between %s and %s",
            last_historical_data_date_in_db,
            self.last_historical_data_date_in_file)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()  # FIXME detect column positions from header
            if b"siret" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")
            for line in myfile:
                line = line.decode()
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except Exception:
                        logger.error(
                            "error in executing statement into dpae table: %s",
                            sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
                except ValueError:
                    self.zipcode_errors += 1
                    continue
                except InvalidRowException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue

                dpae_should_be_imported = (
                    hiring_date > last_historical_data_date_in_db
                    and hiring_date <= self.last_historical_data_date_in_file
                    # For DPAE contracts we keep all CDI, only CDD longer than 31 days,
                    # and we ignore CTT.
                    and (contract_type == Hiring.CONTRACT_TYPE_CDI or
                         (contract_type == Hiring.CONTRACT_TYPE_CDD
                          and contract_duration is not None
                          and contract_duration > 31)))

                if dpae_should_be_imported:
                    statement = (siret, hiring_date, contract_type,
                                 departement, contract_duration, iiann,
                                 tranche_age, handicap_label, duree_pec,
                                 date_insertion)
                    statements.append(statement)
                    imported_dpae += 1

                    if hiring_date.year not in imported_dpae_distribution:
                        imported_dpae_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_dpae_distribution[
                            hiring_date.year]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month] = {}
                    if hiring_date.day not in imported_dpae_distribution[
                            hiring_date.year][hiring_date.month]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month][hiring_date.day] = 0
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] += 1
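                    # A more compact equivalent of the nested-dict bookkeeping above
                    # (a sketch only, not used here):
                    #   from collections import defaultdict
                    #   distribution = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
                    #   distribution[hiring_date.year][hiring_date.month][hiring_date.day] += 1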
                else:
                    not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except Exception:
            logger.error("error in executing statement into dpae table: %s",
                         sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
            raise IOError('too many zipcode errors')
        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
            raise IOError('too many invalid_row errors')
        logger.info("verifying good number of dpae imported.")
        query = "select count(*) from hirings h where hiring_date > %s and hiring_date <= %s and h.contract_type in (1,2,3)"
        cur.execute(query, [
            last_historical_data_date_in_db,
            self.last_historical_data_date_in_file
        ])
        res = cur.fetchone()
        if res[0] != imported_dpae:
            raise DoublonException(
                f"Too many DPAE ({res[0]}) in DB compared to DPAE file ({imported_dpae})."
            )
        logger.info("verifying number of DPAE: OK.")
        con.commit()
        cur.close()
        con.close()

        try:
            statistics = DpaeStatistics(
                last_import=datetime.now(),
                most_recent_data_date=self.last_historical_data_date_in_file,
                file_type=self.file_type)
            db_session.add(statistics)
            db_session.commit()
            logger.info("First way to insert DPAE statistics in DB : OK")
        except OperationalError:
            # For an obscure reason, inserting via the DpaeStatistics model does not work
            # on the bonaparte server, so we insert directly via an SQL query.
            # This job had been broken for more than a year; this is the only known fix.
            db_session.rollback()
            last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            most_recent_date = self.last_historical_data_date_in_file.strftime(
                '%Y-%m-%d %H:%M:%S')
            query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
            con, cur = import_util.create_cursor()
            cur.execute(query)
            con.commit()
            cur.close()
            con.close()
            logger.info("Second way to insert DPAE statistics in DB : OK")

        logger.info("finished importing dpae...")
        return something_new
Example #5
    def run_task(self):
        logger.info("extracting %s ", self.input_filename)
        # this pattern matches the first date
        # e.g. 'LBB_XDPDPAE_2018-09-12_2017-08-01.bz2'
        # will match 2018-09-12
        date_pattern = r'.*_(\d\d\d\d-\d\d-\d\d)_'
        date_match = re.match(date_pattern, self.input_filename)
        if date_match:
            date_part = date_match.groups()[0]
            self.last_historical_data_date_in_file = datetime.strptime(
                date_part, "%Y-%m-%d")
            logger.debug("identified last_historical_data_date_in_file=%s",
                         self.last_historical_data_date_in_file)
        else:
            raise Exception(
                "couldn't find a date pattern in filename. "
                "filename should be like LBB_XDPDPAE_YYYY-MM-DD_YYYY-MM-DD.csv")

        count = 0
        statements = []
        something_new = False
        query = """
            INSERT into %s(
                siret,
                hiring_date,
                contract_type,
                departement,
                contract_duration,
                iiann,
                tranche_age,
                handicap_label,
                duree_pec
                )
            values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
        """ % settings.HIRING_TABLE
        imported_dpae = 0
        imported_dpae_distribution = {}
        not_imported_dpae = 0
        last_historical_data_date_in_db = DpaeStatistics.get_last_historical_data_date()

        logger.info(
            "will now extract all dpae with hiring_date between %s and %s",
            last_historical_data_date_in_db,
            self.last_historical_data_date_in_file)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()  # FIXME detect column positions from header
            if b"siret" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")
            for line in myfile:
                line = line.decode()
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except Exception:
                        logger.error(
                            "error in executing statement into dpae table: %s",
                            sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
                except ValueError:
                    self.zipcode_errors += 1
                    continue
                except InvalidRowException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue

                dpae_should_be_imported = (
                    hiring_date > last_historical_data_date_in_db
                    and hiring_date <= self.last_historical_data_date_in_file
                    # For DPAE contracts we keep all CDI, only CDD longer than 31 days,
                    # and we ignore CTT.
                    and (contract_type == Hiring.CONTRACT_TYPE_CDI or
                         (contract_type == Hiring.CONTRACT_TYPE_CDD
                          and contract_duration is not None
                          and contract_duration > 31)))

                if dpae_should_be_imported:
                    statement = (
                        siret,
                        hiring_date,
                        contract_type,
                        departement,
                        contract_duration,
                        iiann,
                        tranche_age,
                        handicap_label,
                        duree_pec,
                    )
                    statements.append(statement)
                    imported_dpae += 1

                    if hiring_date.year not in imported_dpae_distribution:
                        imported_dpae_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_dpae_distribution[
                            hiring_date.year]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month] = {}
                    if hiring_date.day not in imported_dpae_distribution[
                            hiring_date.year][hiring_date.month]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month][hiring_date.day] = 0
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] += 1
                else:
                    not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except Exception:
            logger.error("error in executing statement into dpae table: %s",
                         sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
            raise IOError('too many zipcode errors')
        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
            raise IOError('too many invalid_row errors')
        con.commit()
        cur.close()
        con.close()
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.last_historical_data_date_in_file,
        )
        statistics.save()
        logger.info("finished importing dpae...")
        return something_new
Example #6
    def run_task(self):
        date_insertion = datetime.now()
        logger.info("extracting %s ", self.input_filename)
        # the file name starts with an 8-digit date,
        # e.g. '20200803ExtractApp'
        # will give 20200803
        date_string = self.input_filename.split('/')[-1][0:8]
        try:
            self.last_historical_data_date_in_file = datetime.strptime(date_string, "%Y%m%d")
        except ValueError:
            raise Exception("couldn't find a date pattern in filename. filename should be \
                like 20200803ExtractApp.csv")
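        # An equivalent, slightly more portable sketch (an assumption, not in the
        # original job):
        #   date_string = os.path.basename(self.input_filename)[:8]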

        count = 0
        statements = []
        something_new = False
        query = """
            INSERT into %s(
                siret,
                hiring_date,
                contract_type,
                departement,
                contract_duration,
                iiann,
                tranche_age,
                handicap_label,
                duree_pec,
                date_insertion
                )
            values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
        """ % settings.HIRING_TABLE
        imported_alternance_contracts = 0
        imported_alternance_contracts_distribution = {}
        not_imported_alternance_contracts = 0

        last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date))\
                                                            .filter(Hiring.contract_type == self.contract_type).first()[0]

        logger.info("will now extract all alternance contracts with hiring_date between %s and %s",
                    last_historical_data_date_in_db, self.last_historical_data_date_in_file)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()  # FIXME detect column positions from header

            if b"SIRET" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")

            for line in myfile:
                line = line.decode()
                count += 1
                if not count % 10000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except Exception:
                        logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, departement = parse_alternance_line(line)
                except InvalidRowException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue
                except InvalidSiretException:
                    error_message = traceback.format_exc()
                    logger.info("invalid siret met at row: %i", count)
                    logger.info(error_message)
                    self.invalid_siret_errors += 1
                    continue
                except InvalidZipCodeException:
                    logger.info("invalid zip code met at row: %i", count)
                    self.invalid_zipcode_errors += 1
                    continue
                
                # This part of the code is useless:
                #   the data contains a lot of late contract inputs,
                #   so we have to insert ALL the contracts from different dates.
                #
                # alternance_contract_should_be_imported = (
                #     hiring_date > last_historical_data_date_in_db
                #     and hiring_date <= self.last_historical_data_date_in_file
                # )

                if hiring_date <= self.last_historical_data_date_in_file:
                    statement = (
                        siret,
                        hiring_date,
                        self.contract_type,
                        departement,
                        None,  # contract_duration
                        None,  # iiann
                        None,  # tranche_age
                        None,  # handicap_label
                        None,  # duree_pec
                        date_insertion,
                    )
                    statements.append(statement)
                    imported_alternance_contracts += 1

                    if hiring_date.year not in imported_alternance_contracts_distribution:
                        imported_alternance_contracts_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_alternance_contracts_distribution[hiring_date.year]:
                        imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month] = {}
                    if hiring_date.day not in imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month]:
                        imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] = 0
                    imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] += 1
                else:
                    not_imported_alternance_contracts += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except Exception:
            logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
            raise

        logger.info(f"Types de contrats à importer : {self.contract_name}")
        logger.info(f"processed {count} lba_contracts...")
        logger.info(f"imported lba_contracts: {imported_alternance_contracts}")
        logger.info(f"not imported lba_contracts: {not_imported_alternance_contracts}")
        logger.info(f"zipcode errors: {self.invalid_zipcode_errors}")
        logger.info(f"invalid_row errors: {self.invalid_row_errors}")
        logger.info(f"invalid siret errors: {self.invalid_siret_errors}")
#        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
#            raise IOError('too many zipcode errors')
#        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
#            raise IOError('too many invalid_row errors')

        con.commit()
        cur.close()
        con.close()

        try:
            statistics = DpaeStatistics(
                last_import=datetime.now(),
                most_recent_data_date=self.last_historical_data_date_in_file,
                file_type=self.file_type
            )
            db_session.add(statistics)
            db_session.commit()
            logger.info("First way to insert DPAE statistics in DB : OK")
        except OperationalError:
            # For an obscure reason, inserting via the DpaeStatistics model does not work
            # on the bonaparte server, so we insert directly via an SQL query.
            # This job had been broken for more than a year; this is the only known fix.
            db_session.rollback()
            last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            most_recent_date = self.last_historical_data_date_in_file.strftime('%Y-%m-%d %H:%M:%S')
            query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
            con, cur = import_util.create_cursor()
            cur.execute(query)
            con.commit()
            cur.close()
            con.close()
            logger.info("Second way to insert DPAE statistics in DB : OK")

        logger.info("finished importing dpae...")
        return something_new
Example #7
    def get_offices_from_file(self):
        # FIXME elegantly parallelize this stuff
        # see
        # https://stackoverflow.com/questions/8717179/chunking-data-from-a-large-file-for-multiprocessing
        # https://docs.python.org/2/library/itertools.html#itertools.islice
        logger.info("extracting %s...", self.input_filename)
        departements = dpt.DEPARTEMENTS
        count = 0
        no_zipcode_count = 0
        unprocessable_departement_errors = 0
        format_errors = 0
        departement_counter_dic = {}
        offices = {}

        with import_util.get_reader(self.input_filename) as myfile:
            header_line = myfile.readline().strip()  # FIXME detect column positions from header
            if b"siret" not in header_line:
                logger.debug(header_line)
                raise ValueError("wrong header line")
            for line in myfile:
                line = line.decode()
                count += 1
                if not count % 100000:
                    logger.debug("processed %s lines", count)

                try:
                    fields = import_util.get_fields_from_csv_line(line)
                    if len(fields) != 22:
                        logger.exception("wrong number of fields in line %s", line)
                        raise ValueError

                    siret, raisonsociale, enseigne, codenaf, numerorue, \
                        libellerue, codecommune, codepostal, email, tel, \
                        trancheeffectif_etablissement, _, _, _, \
                        website1, website2, better_tel, \
                        website3, _, contrat_afpr, contrat_poe, contrat_pmsmp = fields

                    if not siret_util.is_siret(siret):
                        logger.exception("wrong siret : %s", siret)
                        raise ValueError

                except ValueError:
                    logger.exception("exception in line %s", line)
                    format_errors += 1
                    continue

                website = merge_and_normalize_websites([website1, website2, website3])

                if has_text_content(better_tel):
                    tel = better_tel
                flag_pmsmp = 0
                if contrat_pmsmp == "O":
                    flag_pmsmp = 1

                flag_poe_afpr = 0
                if contrat_poe == "O" or contrat_afpr == "O":
                    flag_poe_afpr = 1

                email = encoding_util.strip_french_accents(email)

                if codecommune.strip():
                    departement = import_util.get_departement_from_zipcode(codepostal)
                    process_this_departement = departement in departements
                    if process_this_departement:
                        # Trello Pz5UlnFh : supprimer-les-emails-pe-des-entreprises-qui-ne-sont-pas-des-agences-pe
                        if  "@pole-emploi." in email and raisonsociale != "POLE EMPLOI":
                            email = ""
                        if len(codepostal) == 4:
                            codepostal = "0%s" % codepostal
                        etab_create_fields = siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                            codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                            website, flag_poe_afpr, flag_pmsmp
                        etab_update_fields = raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                            codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                            website, flag_poe_afpr, flag_pmsmp, siret
                        if codepostal.startswith(departement):
                            departement_counter_dic.setdefault(departement, 0)
                            departement_counter_dic[departement] += 1
                            offices[siret] = {
                                "create_fields": etab_create_fields,
                                "update_fields": etab_update_fields,
                            }
                        else:
                            logger.info(
                                "zipcode %s and departement %s don't match commune_id %s",
                                codepostal,
                                departement,
                                codecommune,
                            )
                    else:
                        unprocessable_departement_errors += 1
                else:
                    no_zipcode_count += 1

        logger.info("%i offices total", count)
        logger.info("%i offices with unprocessable departement", unprocessable_departement_errors)
        logger.info("%i offices with no zipcodes", no_zipcode_count)
        logger.info("%i offices not read because of format error", format_errors)
        logger.info("%i distinct departements from file", len(departement_counter_dic))
        departement_count = sorted(departement_counter_dic.items())
        logger.info("per departement read %s", departement_count)
        logger.info("finished reading offices...")

        if unprocessable_departement_errors > 2500:
            raise ValueError("too many unprocessable_departement_errors")
        if no_zipcode_count > 75000:
            raise ValueError("too many no_zipcode_count")
        if format_errors > 5:
            raise ValueError("too many format_errors")
        if len(departement_counter_dic) != settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES:
            msg = "incorrect total number of departements : %s instead of expected %s" % (
                len(departement_counter_dic),
                settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES
            )
            raise ValueError(msg)
        for departement, count in departement_count:
            if not count >= settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT:
                logger.exception("only %s offices in departement %s", count, departement)
                raise ValueError("not enough offices in at least one departement")

        return offices
Example #8
    def get_offices_from_file(self):
        logger.info("extracting %s...", self.input_filename)
        departements = settings.DEPARTEMENTS
        count = 0
        no_zipcode_count = 0
        departement_errors = 0
        unprocessable_departement_errors = 0
        format_errors = 0
        con, cur = import_util.create_cursor()
        departement_counter_dic = {}
        etablissements = {}

        with import_util.get_reader(self.input_filename) as myfile:
            header_line = myfile.readline().strip()
            if "siret" not in header_line:
                logger.debug(header_line)
                raise "wrong header line"
            for line in myfile:
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)

                try:
                    fields = import_util.get_fields_from_csv_line(line)
                    if len(fields) != 16:
                        logger.exception("wrong number of fields in line %s" %
                                         line)
                        raise ValueError

                    siret, raisonsociale, enseigne, codenaf, numerorue, \
                        libellerue, codecommune, codepostal, email, tel, \
                        trancheeffectif_etablissement, effectif_etablissement, \
                        trancheeffectif_entreprise, date_creation_entreprise, \
                        website1, website2 = fields
                except ValueError:
                    logger.exception("exception in line %s" % line)
                    format_errors += 1
                    continue

                website1 = encoding_util.strip_french_accents(website1)
                website2 = encoding_util.strip_french_accents(website2)
                email = encoding_util.strip_french_accents(email)

                if codecommune.strip():
                    try:
                        departement = extract_departement_from_zipcode(
                            codepostal, siret)
                        process_this_departement = departement in departements
                        if process_this_departement:

                            if len(codepostal) == 4:
                                codepostal = "0%s" % codepostal
                            etab_create_fields = siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                                codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                                website1, website2
                            etab_update_fields = raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                                codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                                website1, website2, siret
                            if codepostal.startswith(departement):
                                departement_counter_dic.setdefault(
                                    departement, 0)
                                departement_counter_dic[departement] += 1
                                etablissements[siret] = {
                                    "create_fields": etab_create_fields,
                                    "update_fields": etab_update_fields,
                                }
                            else:
                                logger.info(
                                    "zipcode and departement don't match: code commune: %s, code postal: %s, departement: %s",
                                    codecommune, codepostal, departement)
                        else:
                            unprocessable_departement_errors += 1
                    except DepartementException:
                        logger.exception("departement exception")
                        departement_errors += 1
                else:
                    no_zipcode_count += 1

        logger.info("%i etablissements total" % count)
        logger.info("%i etablissements with incorrect departement" %
                    departement_errors)
        logger.info("%i etablissements with unprocessable departement" %
                    unprocessable_departement_errors)
        logger.info("%i etablissements with no zipcodes", no_zipcode_count)
        logger.info("%i etablissements not read because of format error",
                    format_errors)
        logger.info("%i number of departements from file" %
                    len(departement_counter_dic))
        departement_count = sorted(departement_counter_dic.items())
        logger.info("per departement read %s", departement_count)
        logger.info("finished reading etablissements...")

        if departement_errors > 500:
            raise ValueError("too many departement_errors")
        if unprocessable_departement_errors > 2000:
            raise ValueError("too many unprocessable_departement_errors")
        if no_zipcode_count > 40000:
            raise ValueError("too many no_zipcode_count")
        if format_errors > 5:
            raise ValueError("too many format_errors")
        if len(departement_counter_dic) not in [96, 15]:  # 96 in production, 15 in test
            logger.exception("incorrect total number of departements : %s",
                             len(departement_counter_dic))
            raise ValueError("incorrect total number of departements")
        if len(departement_counter_dic) == 96:
            for departement, count in departement_count:
                if count < 10000:
                    logger.exception("only %s etablissements in departement %s",
                                     count, departement)
                    raise ValueError("not enough etablissements in at least one departement")

        return etablissements
Example #9
    def run_task(self):
        logger.info("extracting %s ", self.input_filename)
        date_pattern = ".*_(\d\d\d\d\d\d\d\d)"
        date_match = re.match(date_pattern, self.input_filename)
        if date_match:
            date_part = date_match.groups()[-1]
            self.most_recent_data_date = datetime.strptime(date_part, "%Y%m%d")
            logger.debug("identified most_recent_data_date=%s" %
                         self.most_recent_data_date)
        else:
            raise Exception(
                "couldn't find a date pattern in filename. filename should be dpae_XYZ_20xxxxxx.tar.gz"
            )

        count = 0
        statements = []
        something_new = False
        query = "INSERT into %s(siret, hiring_date, zipcode, contract_type, departement, contract_duration, iiann, tranche_age, handicap_label) values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)" % settings.DPAE_TABLE
        imported_dpae = 0
        imported_dpae_distribution = {}
        not_imported_dpae = 0
        initial_most_recent_data_date = DpaeStatistics.get_most_recent_data_date()

        logger.info(
            "will now extract all dpae with hiring_date between %s and %s",
            initial_most_recent_data_date, self.most_recent_data_date)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()
            if "siret" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")
            for line in myfile:
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except Exception:
                        logger.error(
                            "error in executing statement into dpae table: %s",
                            sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, zipcode, contract_type, departement, contract_duration, iiann, tranche_age, handicap_label = parse_dpae_line(
                        line)
                except ValueError:
                    self.zipcode_errors += 1
                    continue
                except DepartementException:
                    self.zipcode_errors += 1
                    continue
                except TooFewFieldsException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue

                if hiring_date > initial_most_recent_data_date and hiring_date <= self.most_recent_data_date:
                    statement = (siret, hiring_date, zipcode, contract_type,
                                 departement, contract_duration, iiann,
                                 tranche_age, handicap_label)
                    statements.append(statement)
                    imported_dpae += 1

                    if hiring_date.year not in imported_dpae_distribution:
                        imported_dpae_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_dpae_distribution[
                            hiring_date.year]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month] = {}
                    if hiring_date.day not in imported_dpae_distribution[
                            hiring_date.year][hiring_date.month]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month][hiring_date.day] = 0
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] += 1
                else:
                    not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except Exception:
            logger.error("error in executing statement into dpae table: %s",
                         sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors >= 100:
            raise Exception('too many zipcode errors')
        if self.invalid_row_errors >= 100:
            raise Exception('too many invalid_row errors')
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.most_recent_data_date)
        statistics.save()
        con.commit()
        logger.info("finished importing dpae...")
        return something_new