Example #1
 def create_geocoding_jobs(self):
     query = """select siret, numerorue, libellerue, codepostal, codecommune, coordinates_x, coordinates_y from %s""" % (
         settings.EXPORT_ETABLISSEMENT_TABLE)
     con, cur = import_util.create_cursor()
     cur.execute(query)
     rows = cur.fetchall()
     geocoding_jobs = []
     count = 0
     for row in rows:
         siret, street_number, street_name, zipcode, codecommune, coordinates_x, coordinates_y = row
         try:
             city = CITY_NAMES[codecommune]
         except KeyError:
             logger.warning("wrong codecommune: %s", codecommune)
             continue
         try:
             full_address = self.get_full_adress(street_number, street_name,
                                                 zipcode, city)
             initial_coordinates = [coordinates_x, coordinates_y]
             geocoding_jobs.append(
                 [siret, full_address, initial_coordinates])
         except IncorrectAdressDataException:
             logger.warning("incorrect address for %s %s %s %s",
                            street_number, street_name, zipcode, city)
         count += 1
         GEOCODING_STATS['jobs'] = GEOCODING_STATS.get('jobs', 0) + 1
         if not count % 10000:
             logger.info("loading geocoding jobs from db... loaded %s rows",
                         count)
     logger.info("%i geocoding jobs created...", len(geocoding_jobs))
     return geocoding_jobs
 def get_sirets_from_database(self):
     query = "select siret from %s where siret != ''" % settings.OFFICE_TABLE
     logger.info("get etablissements from database")
     con, cur = import_util.create_cursor()
     cur.execute(query)
     rows = cur.fetchall()
     return [row[0] for row in rows]
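A note on the string formatting used throughout these examples: DB-API placeholders (the %s values passed to execute()) can only bind values, never identifiers, which is why the table name is interpolated with Python's % operator while row values should go through execute() parameters. A minimal sketch of that split, reusing this project's import_util.create_cursor() and assuming the table name comes from trusted settings:

def fetch_sirets(table_name):
    # identifier interpolated in Python (must be trusted); the value is bound via DB-API
    query = "select siret from %s where siret != %%s" % table_name
    con, cur = import_util.create_cursor()
    try:
        cur.execute(query, [""])  # '' is passed safely as a bound parameter
        return [row[0] for row in cur.fetchall()]
    finally:
        cur.close()
        con.close()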
Example #3
    def create_creatable_offices(self):
        """
        create new offices (that are not yet in our etablissement table)
        """
        con, cur = import_util.create_cursor()
        query = """INSERT into %s(siret, raisonsociale, enseigne, codenaf, numerorue,
            libellerue, codecommune, codepostal, email, tel, departement, trancheeffectif,
            website, flag_poe_afpr, flag_pmsmp)
        values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)""" % settings.RAW_OFFICE_TABLE

        count = 0  # start at zero so the final count matches the offices created
        logger.info("create new offices in table %s",
                    settings.RAW_OFFICE_TABLE)
        statements = []
        MAX_COUNT_EXECUTE = 500
        for siret in self.creatable_sirets:
            statement = self.csv_offices[siret]["create_fields"]
            statements.append(statement)
            count += 1
            if not count % MAX_COUNT_EXECUTE:
                cur.executemany(query, statements)
                con.commit()
                statements = []
                if not count % 10000:
                    logger.info("created %s offices", count)
        if statements:
            cur.executemany(query, statements)
            con.commit()
        cur.close()
        con.close()
        logger.info("%i new offices created.", count)
    def update_updatable_offices(self):
        con, cur = import_util.create_cursor()
        query = """UPDATE %s SET
            raisonsociale=%%s,
            enseigne=%%s,
            codenaf=%%s,
            numerorue=%%s,
            libellerue=%%s,
            codecommune=%%s,
            codepostal=%%s,
            email=%%s,
            tel=%%s,
            departement=%%s,
            trancheeffectif=%%s,
            website1=%%s,
            website2=%%s
        where siret=%%s""" % settings.OFFICE_TABLE

        count = 0  # start at zero so the final count matches the offices updated
        logger.info("update updatable etablissements in table %s" %
                    settings.OFFICE_TABLE)
        statements = []
        MAX_COUNT_EXECUTE = 500
        for siret in self.updatable_sirets:
            statement = self.csv_offices[siret]["update_fields"]
            statements.append(statement)
            count += 1
            if not count % MAX_COUNT_EXECUTE:
                cur.executemany(query, statements)
                con.commit()
                statements = []
        if statements:
            cur.executemany(query, statements)
            con.commit()
        logger.info("%i etablissements updated.", count)
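Both methods above repeat the same accumulate-then-flush pattern around executemany. A generic helper (a sketch, not part of the original project) could factor it out:

def executemany_in_batches(con, cur, query, rows, batch_size=500):
    # flush every batch_size rows, then flush the remainder
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) == batch_size:
            cur.executemany(query, batch)
            con.commit()
            batch = []
    if batch:
        cur.executemany(query, batch)
        con.commit()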
Example #5
    def update_coordinates(self, updates):
        con, cur = import_util.create_cursor()
        count = 0
        statements = []
        update_query = "update %s set coordinates_x=%%s, coordinates_y=%%s where siret=%%s" % \
            settings.SCORE_REDUCING_TARGET_TABLE

        logger.info("Nb of offices to update : {}".format(len(updates)))

        for siret, coordinates in updates:
            count += 1
            statements.append([coordinates[0], coordinates[1], siret])
            if len(statements) == 1000:
                logger.info("geocoding with ban... %i of %i done", count,
                            len(updates))
                cur.executemany(update_query, statements)
                con.commit()
                statements = []

        if len(statements) >= 1:
            logger.info("geocoding with ban... %i of %i done", count,
                        len(updates))
            cur.executemany(update_query, statements)
            con.commit()

        cur.close()
        con.close()
Example #6
def run_sql_script(sql_script):
    con, cur = import_util.create_cursor()

    for query in sql_script.split(';'):
        query = query.strip()
        if len(query) >= 1:
            cur.execute(query)
            con.commit()
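Splitting on ';' is naive: it breaks if any string literal in the script contains a semicolon, and the cursor is never closed. A slightly safer sketch under the same assumption (no semicolons inside literals):

def run_sql_script_and_close(sql_script):
    con, cur = import_util.create_cursor()
    try:
        for query in sql_script.split(';'):  # assumes no ';' inside string literals
            query = query.strip()
            if query:
                cur.execute(query)
                con.commit()
    finally:
        cur.close()
        con.close()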
Example #7
 def get_sirets_from_database(self):
     query = "select siret from %s" % settings.RAW_OFFICE_TABLE
     logger.info("get offices from database")
     con, cur = import_util.create_cursor()
     cur.execute(query)
     rows = cur.fetchall()
     cur.close()
     con.close()
     return [row[0] for row in rows if siret_util.is_siret(row[0])]
def check_departements(departements):
    for dep in departements:
        con, cur = import_util.create_cursor()
        cur.execute("select count(1) from %s where departement='%s'" %
                    (settings.OFFICE_TABLE, dep))
        con.commit()
        count = cur.fetchone()[0]
        if count < 1000:
            logger.error("only %s results for departement %s", count, dep)
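A hedged variant of check_departements that binds the departement as a DB-API parameter instead of interpolating it into the SQL string (the table name still has to be formatted in, since placeholders cannot bind identifiers):

def check_departements_parameterized(departements):
    con, cur = import_util.create_cursor()
    try:
        for dep in departements:
            cur.execute(
                "select count(1) from %s where departement=%%s" % settings.OFFICE_TABLE,
                [dep])  # dep is bound safely as a parameter
            count = cur.fetchone()[0]
            if count < 1000:
                logger.error("only %s results for departement %s", count, dep)
    finally:
        cur.close()
        con.close()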
Example #9
 def validate_coordinates(self):
     con, cur = import_util.create_cursor()
     query = """
     select
     sum(coordinates_x > 0 and coordinates_y > 0)/count(*)
     from %s
     """ % settings.EXPORT_ETABLISSEMENT_TABLE
     cur.execute(query)
     geocoding_ratio = cur.fetchall()[0][0]
     logger.info("geocoding_ratio = %s" % geocoding_ratio)
     if geocoding_ratio < 0.75:
         raise AbnormallyLowGeocodingRatioException
 def after_check(self):
     con, cur = import_util.create_cursor()
     for departement in settings.DEPARTEMENTS:
         query = "select count(1) from %s where departement='%s'" % (
             settings.OFFICE_TABLE, departement)
         cur.execute(query)
         result = cur.fetchone()
         count = result[0]
         logger.info("number of companies in departement %s : %i",
                     departement, count)
         if count <= 1000:
             raise Exception("too few companies")
Example #11
 def test_insert_data_from_file(self):
     file = get_available_files_list(
         path_folder=os.path.join(os.path.dirname(__file__), "data"))[0]
     insert_into_sql_table_old_prediction_file(file)
     insert_data(file, months_time=4)
     con, cur = import_util.create_cursor()
     cur.execute("select count(*) from etablissements_new;")
     number_new_offices = cur.fetchone()[0]
     self.assertTrue(number_new_offices == 2)
     self.assertTrue(
         PerfImporterCycleInfos.query.filter(
             PerfImporterCycleInfos.file_name == file).count() == 1)
Example #12
def populate_flag(flag):
    logger.info("populating %s ... " % flag)
    con, cur = import_util.create_cursor()
    query = """
        UPDATE
        %s e
        INNER JOIN %s f
        ON e.siret = f.siret
        SET e.%s = True;
    """ % (settings.EXPORT_ETABLISSEMENT_TABLE, flag, flag)
    cur.execute(query)
    con.commit()
    logger.info("completed populating %s ... " % flag)
def populate_flag(flag):
    logger.info("populating %s ... ", flag)
    con, cur = import_util.create_cursor()
    query = """
        UPDATE
        %s e
        INNER JOIN %s f
        ON e.siret = f.siret
        SET e.%s = True;
    """ % (settings.SCORE_REDUCING_TARGET_TABLE, flag, flag)
    cur.execute(query)
    con.commit()
    logger.info("completed populating %s ... ", flag)
    cur.close()
    con.close()
 def delete_deletable_offices(self):
     con, cur = import_util.create_cursor()
     if self.deletable_sirets:
         stringified_siret_list = ",".join(self.deletable_sirets)
         logger.info("going to delete %i offices...",
                     len(self.deletable_sirets))
         query = """DELETE FROM %s where siret IN (%s)""" % (
             settings.OFFICE_TABLE, stringified_siret_list)
         try:
             cur.execute(query)
             con.commit()
         except:
             logger.warning("deletable_sirets=%s" % self.deletable_sirets)
             raise
         logger.info("%i old offices deleted.", len(self.deletable_sirets))
Example #15
 def update_coordinates(self, coordinates_updates):
     con, cur = import_util.create_cursor()
     count = 0
     statements = []
     update_query = "update %s set coordinates_x=%%s, coordinates_y=%%s where siret=%%s" % settings.EXPORT_ETABLISSEMENT_TABLE
     for siret, coordinates in coordinates_updates:
         statements.append([coordinates[0], coordinates[1], siret])
         if not count % 1000:
             logger.info(
                 "geocoding with ban... %i done (example: coordinates_x=%s, coordinates_y=%s)",
                 count, statements[0][0], statements[0][1])
             cur.executemany(update_query, statements)
             con.commit()
             statements = []
         count += 1
     if statements:  # flush the remaining statements, otherwise they are never written
         cur.executemany(update_query, statements)
         con.commit()
Example #16
 def delete_deletable_offices(self):
     con, cur = import_util.create_cursor()
     if self.deletable_sirets:
         for sirets in chunks(list(self.deletable_sirets), 500):
             stringified_siret_list = ",".join(sirets)
             logger.info("deleting a chunk of %i offices...", len(sirets))
             query = """DELETE FROM %s where siret IN (%s)""" % (settings.RAW_OFFICE_TABLE, stringified_siret_list)
             try:
                 cur.execute(query)
                 con.commit()
             except:
                 logger.warning("error while deleting chunk of sirets : %s", sirets)
                 raise
     cur.close()
     con.close()
     logger.info("%i no longer existing offices deleted.", len(self.deletable_sirets))
Example #17
 def validate_coordinates(self):
     con, cur = import_util.create_cursor()
     query = """
     select
     sum(
         (coordinates_x > 0 or coordinates_x < 0)
         and
         (coordinates_y > 0 or coordinates_y < 0)
     )/count(*)
     from %s
     """ % settings.SCORE_REDUCING_TARGET_TABLE
     cur.execute(query)
     geocoding_ratio = cur.fetchall()[0][0]
     logger.info("geocoding_ratio = %s", geocoding_ratio)
     if geocoding_ratio < settings.MINIMUM_GEOCODING_RATIO:
         raise AbnormallyLowGeocodingRatioException
     cur.close()
     con.close()
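In MySQL the boolean expression evaluates to 0/1, so SUM(...)/COUNT(*) is the fraction of rows whose coordinates are non-zero. The equivalent check computed client-side (a hedged sketch, trading the single aggregate query for a full fetch):

def geocoding_ratio(coordinate_pairs):
    # coordinate_pairs: iterable of (coordinates_x, coordinates_y)
    pairs = list(coordinate_pairs)
    if not pairs:
        return 0.0
    non_zero = sum(1 for x, y in pairs if x != 0 and y != 0)
    return non_zero / len(pairs)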
Example #18
def insert_into_sql_table_old_prediction_file(file):
    file_name = os.path.basename(file)
    logger.info(
        f"\n Start : Insert data into etablissements_new from file {file_name}"
    )
    con, cur = import_util.create_cursor()
    sql_file = gzip.open(file, 'rt', encoding='utf8')
    sql_as_string = sql_file.read()

    # Can't load the whole table at once, the file is too large (~400 MB),
    # so we have to split the sql file into multiple transactions
    drop_statement = "DROP TABLE IF EXISTS `etablissements_new`;"
    cur.execute(drop_statement)

    start_create_text = "CREATE TABLE "
    end_create_text = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;"
    start_create_statement_index = sql_as_string.find(start_create_text) + len(
        start_create_text)
    end_create_statement_index = sql_as_string.find(
        end_create_text, start_create_statement_index)
    create_statement = start_create_text + sql_as_string[
        start_create_statement_index:
        end_create_statement_index] + end_create_text
    cur.execute(create_statement)

    cur.execute("LOCK TABLES `etablissements_new` WRITE;")

    insert_statements = sql_as_string.split(
        "INSERT INTO `etablissements_new` VALUES")[1:]
    for statement in insert_statements:
        if "/*!40000 ALTER TABLE `etablissements_new` ENABLE KEYS */;" in statement:
            clean_insert_statement = "INSERT INTO `etablissements_new` VALUES" + \
                                     statement.split("/*!40000 ALTER TABLE `etablissements_new` ENABLE KEYS */;")[0]
        else:
            clean_insert_statement = "INSERT INTO `etablissements_new` VALUES" + statement
        cur.execute(clean_insert_statement)

    cur.execute("UNLOCK TABLES;")

    con.commit()  # test fix for the timeout (TO)
    cur.close()
    con.close()
    logger.info(
        f"\n End : Insert data into etablissements_new from file {file_name}")
Example #19
    def update_updatable_offices(self):
        # FIXME parallelize and/or batch for better performance
        con, cur = import_util.create_cursor()
        query = """UPDATE %s SET
            raisonsociale=%%s,
            enseigne=%%s,
            codenaf=%%s,
            numerorue=%%s,
            libellerue=%%s,
            codecommune=%%s,
            codepostal=%%s,
            email=%%s,
            tel=%%s,
            departement=%%s,
            trancheeffectif=%%s,
            website=%%s,
            flag_poe_afpr=%%s,
            flag_pmsmp=%%s
        where siret=%%s""" % settings.RAW_OFFICE_TABLE

        count = 0
        logger.info("update updatable offices in table %s",
                    settings.RAW_OFFICE_TABLE)
        statements = []
        MAX_COUNT_EXECUTE = 500
        for siret in self.updatable_sirets:
            statement = self.csv_offices[siret]["update_fields"]
            statements.append(statement)
            count += 1
            if not count % MAX_COUNT_EXECUTE:
                cur.executemany(query, statements)
                con.commit()
                statements = []
                if not count % 100000:
                    logger.info("updated %s offices", count)
        if statements:
            cur.executemany(query, statements)
            con.commit()
        cur.close()
        con.close()
        logger.info("%i offices updated.", count)
Example #20
def clear_useless_data(importer_cycle_infos_id):
    con, cur = import_util.create_cursor()
    for ici_id in importer_cycle_infos_id:
        cur.execute(
            "DELETE FROM perf_prediction_and_effective_hirings WHERE importer_cycle_infos_id = %s",
            [ici_id])
    con.commit()  # commit the deletes; without it they may be rolled back on close
    cur.close()
    con.close()
Example #21
    def run_task(self):
        logger.info("extracting %s ", self.input_filename)
        date_pattern = ".*_(\d\d\d\d\d\d\d\d)"
        date_match = re.match(date_pattern, self.input_filename)
        if date_match:
            date_part = date_match.groups()[-1]
            self.most_recent_data_date = datetime.strptime(date_part, "%Y%m%d")
            logger.debug("identified most_recent_data_date=%s" %
                         self.most_recent_data_date)
        else:
            raise Exception(
                "couldn't find a date pattern in filename. filename should be dpae_XYZ_20xxxxxx.tar.gz"
            )

        count = 0
        statements = []
        something_new = False
        query = "INSERT into %s(siret, hiring_date, zipcode, contract_type, departement, contract_duration, iiann, tranche_age, handicap_label) values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)" % settings.DPAE_TABLE
        imported_dpae = 0
        imported_dpae_distribution = {}
        not_imported_dpae = 0
        initial_most_recent_data_date = DpaeStatistics.get_most_recent_data_date()

        logger.info(
            "will now extract all dpae with hiring_date between %s and %s" %
            (initial_most_recent_data_date, self.most_recent_data_date))

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()
            if "siret" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")
            for line in myfile:
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except:
                        logger.error(
                            "error in executing statement into dpae table: %s",
                            sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, zipcode, contract_type, departement, contract_duration, iiann, tranche_age, handicap_label = parse_dpae_line(
                        line)
                except ValueError:
                    self.zipcode_errors += 1
                    continue
                except DepartementException:
                    self.zipcode_errors += 1
                    continue
                except TooFewFieldsException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue

                if hiring_date > initial_most_recent_data_date and hiring_date <= self.most_recent_data_date:
                    statement = (siret, hiring_date, zipcode, contract_type,
                                 departement, contract_duration, iiann,
                                 tranche_age, handicap_label)
                    statements.append(statement)
                    imported_dpae += 1

                    if hiring_date.year not in imported_dpae_distribution:
                        imported_dpae_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_dpae_distribution[
                            hiring_date.year]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month] = {}
                    if hiring_date.day not in imported_dpae_distribution[
                            hiring_date.year][hiring_date.month]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month][hiring_date.day] = 0
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] += 1
                else:
                    not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into dpae table: %s",
                         sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors >= 100:
            raise Exception('too many zipcode errors')
        if self.invalid_row_errors >= 100:
            raise Exception('too many invalid_row errors')
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.most_recent_data_date)
        statistics.save()
        con.commit()
        logger.info("finished importing dpae...")
        return something_new
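The three-level year/month/day bookkeeping above can be collapsed with a Counter keyed on the date, a sketch of which (not the project's code) might look like:

from collections import Counter

imported_dpae_distribution = Counter()

def record_hiring(hiring_date):
    # one counter entry per (year, month, day) instead of nested dicts
    imported_dpae_distribution[(hiring_date.year, hiring_date.month, hiring_date.day)] += 1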
Example #22
import logging
import random
import urllib.request, urllib.parse, urllib.error
from operator import itemgetter

from locust import HttpLocust, TaskSet, task
from slugify import slugify

from labonneboite.common import geocoding
from labonneboite.conf import settings
from labonneboite.importer import util as import_util
from labonneboite.web.api import util


logger = logging.getLogger(__name__)
logger.info("loading locustfile")

con, cur = import_util.create_cursor()

# For each locust, number of seconds between its tasks. Default value: 1.
SECONDS_BETWEEN_TASKS = 1


def generate_siret_choices():
    cur.execute("select siret from %s limit 100000" % (settings.OFFICE_TABLE))
    con.commit()
    rows = cur.fetchall()
    return [row[0] for row in rows]


def generate_city_choices():
    cities_by_population = sorted(geocoding.get_cities(), key=itemgetter('population'), reverse=True)
    city_choices = []
    def run_task(self):
        date_insertion = datetime.now()
        logger.info("extracting %s ", self.input_filename)
        # this pattern captures the date embedded in the file name,
        # e.g. 'lbb_xdpdpae_delta_201611102200.bz2' will match 20161110
        date_pattern = r'.*_(\d\d\d\d\d\d\d\d)\d\d\d\d'  # keep only the date part, e.g. 20190910 = 10 September 2019
        date_match = re.match(date_pattern, self.input_filename)
        if date_match:
            date_part = date_match.groups()[0]
            self.last_historical_data_date_in_file = datetime.strptime(
                date_part, "%Y%m%d")
            logger.debug("identified last_historical_data_date_in_file=%s",
                         self.last_historical_data_date_in_file)
        else:
            raise Exception(
                "couldn't find a date pattern in filename. filename should be \
                like lbb_xdpdpae_delta_YYYYMMDDHHMM.csv")

        count = 0
        statements = []
        something_new = False
        query = """
            INSERT into %s(
                siret,
                hiring_date,
                contract_type,
                departement,
                contract_duration,
                iiann,
                tranche_age,
                handicap_label,
                duree_pec,
                date_insertion
                )
            values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
        """ % settings.HIRING_TABLE
        imported_dpae = 0
        imported_dpae_distribution = {}
        not_imported_dpae = 0
        last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date)) \
                                        .filter(Hiring.contract_type.in_((Hiring.CONTRACT_TYPE_CDI,
                                                                          Hiring.CONTRACT_TYPE_CDD,
                                                                          Hiring.CONTRACT_TYPE_CTT))).first()[0]
        if last_historical_data_date_in_db is None:
            last_historical_data_date_in_db = DEFAULT_DATETIME_DPAE
        logger.info(
            "will now extract all dpae with hiring_date between %s and %s",
            last_historical_data_date_in_db,
            self.last_historical_data_date_in_file)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()  # FIXME detect column positions from header
            if b"siret" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")
            for line in myfile:
                line = line.decode()
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except:
                        logger.error(
                            "error in executing statement into dpae table: %s",
                            sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
                except ValueError:
                    self.zipcode_errors += 1
                    continue
                except InvalidRowException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue

                dpae_should_be_imported = (
                    hiring_date > last_historical_data_date_in_db
                    and hiring_date <= self.last_historical_data_date_in_file
                    # For DPAE contracts we only keep all CDI, only long enough CDD (at least 31 days)
                    # and we ignore CTT.
                    and (contract_type == Hiring.CONTRACT_TYPE_CDI or
                         (contract_type == Hiring.CONTRACT_TYPE_CDD
                          and contract_duration is not None
                          and contract_duration > 31)))

                if dpae_should_be_imported:
                    statement = (siret, hiring_date, contract_type,
                                 departement, contract_duration, iiann,
                                 tranche_age, handicap_label, duree_pec,
                                 date_insertion)
                    statements.append(statement)
                    imported_dpae += 1

                    if hiring_date.year not in imported_dpae_distribution:
                        imported_dpae_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_dpae_distribution[
                            hiring_date.year]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month] = {}
                    if hiring_date.day not in imported_dpae_distribution[
                            hiring_date.year][hiring_date.month]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month][hiring_date.day] = 0
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] += 1
                else:
                    not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into dpae table: %s",
                         sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
            raise IOError('too many zipcode errors')
        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
            raise IOError('too many invalid_row errors')
        logger.info("verifying good number of dpae imported.")
        query = "select count(*) from hirings h where hiring_date > %s and hiring_date <= %s and h.contract_type in (1,2,3)"
        cur.execute(query, [
            last_historical_data_date_in_db,
            self.last_historical_data_date_in_file
        ])
        res = cur.fetchone()
        if res[0] != imported_dpae:
            raise DoublonException(
                f"Too many DPAE ({res[0]}) in DB compared to DPAE file ({imported_dpae})."
            )
        logger.info("verifying number of DPAE: OK.")
        con.commit()
        cur.close()
        con.close()

        try:
            statistics = DpaeStatistics(
                last_import=datetime.now(),
                most_recent_data_date=self.last_historical_data_date_in_file,
                file_type=self.file_type)
            db_session.add(statistics)
            db_session.commit()
            logger.info("First way to insert DPAE statistics in DB : OK")
        except OperationalError:
            # For an obscure reason, the DpaeStatistics way to insert does not work on the bonaparte server
            # So we insert it directly via an SQL query
            # This job has been broken for more than a year, only way to fix it :
            db_session.rollback()
            last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            most_recent_date = self.last_historical_data_date_in_file.strftime(
                '%Y-%m-%d %H:%M:%S')
            query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
            con, cur = import_util.create_cursor()
            cur.execute(query)
            con.commit()
            cur.close()
            con.close()
            logger.info("Second way to insert DPAE statistics in DB : OK")

        logger.info("finished importing dpae...")
        return something_new
    def get_offices_from_file(self):
        logger.info("extracting %s...", self.input_filename)
        departements = settings.DEPARTEMENTS
        count = 0
        no_zipcode_count = 0
        departement_errors = 0
        unprocessable_departement_errors = 0
        format_errors = 0
        con, cur = import_util.create_cursor()
        departement_counter_dic = {}
        etablissements = {}

        with import_util.get_reader(self.input_filename) as myfile:
            header_line = myfile.readline().strip()
            if "siret" not in header_line:
                logger.debug(header_line)
                raise "wrong header line"
            for line in myfile:
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)

                try:
                    fields = import_util.get_fields_from_csv_line(line)
                    if len(fields) != 16:
                        logger.exception("wrong number of fields in line %s" %
                                         line)
                        raise ValueError

                    siret, raisonsociale, enseigne, codenaf, numerorue, \
                        libellerue, codecommune, codepostal, email, tel, \
                        trancheeffectif_etablissement, effectif_etablissement, \
                        trancheeffectif_entreprise, date_creation_entreprise, \
                        website1, website2 = fields
                except ValueError:
                    logger.exception("exception in line %s" % line)
                    format_errors += 1
                    continue

                website1 = encoding_util.strip_french_accents(website1)
                website2 = encoding_util.strip_french_accents(website2)
                email = encoding_util.strip_french_accents(email)

                if codecommune.strip():
                    try:
                        departement = extract_departement_from_zipcode(
                            codepostal, siret)
                        process_this_departement = departement in departements
                        if process_this_departement:

                            if len(codepostal) == 4:
                                codepostal = "0%s" % codepostal
                            etab_create_fields = siret, raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                                codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                                website1, website2
                            etab_update_fields = raisonsociale, enseigne, codenaf, numerorue, libellerue, \
                                codecommune, codepostal, email, tel, departement, trancheeffectif_etablissement, \
                                website1, website2, siret
                            if codepostal.startswith(departement):
                                departement_counter_dic.setdefault(
                                    departement, 0)
                                departement_counter_dic[departement] += 1
                                etablissements[siret] = {
                                    "create_fields": etab_create_fields,
                                    "update_fields": etab_update_fields,
                                }
                            else:
                                logger.info(
                                    "zipcode and departement dont match code commune: %s, code postal: %s, departement: %s",
                                    codecommune, codepostal, departement)
                        else:
                            unprocessable_departement_errors += 1
                    except DepartementException:
                        logger.exception("departement exception")
                        departement_errors += 1
                else:
                    no_zipcode_count += 1

        logger.info("%i etablissements total" % count)
        logger.info("%i etablissements with incorrect departement" %
                    departement_errors)
        logger.info("%i etablissements with unprocessable departement" %
                    unprocessable_departement_errors)
        logger.info("%i etablissements with no zipcodes", no_zipcode_count)
        logger.info("%i etablissements not read because of format error",
                    format_errors)
        logger.info("%i number of departements from file" %
                    len(departement_counter_dic))
        departement_count = sorted(departement_counter_dic.items())
        logger.info("per departement read %s", departement_count)
        logger.info("finished reading etablissements...")

        if departement_errors > 500:
            raise Exception("too many departement_errors")
        if unprocessable_departement_errors > 2000:
            raise Exception("too many unprocessable_departement_errors")
        if no_zipcode_count > 40000:
            raise Exception("too many no_zipcode_count")
        if format_errors > 5:
            raise Exception("too many format_errors")
        if len(departement_counter_dic) not in [96, 15]:  # 96 in production, 15 in test
            logger.exception("incorrect total number of departements : %s" %
                             len(departement_counter_dic))
            raise Exception("incorrect total number of departements")
        if len(departement_counter_dic) == 96:
            for departement, count in departement_count:
                if count < 10000:
                    logger.exception(
                        "only %s etablissements in departement %s" %
                        (count, departement))
                    raise Exception("not enough etablissements in at least one departement")

        return etablissements
Example #25
    def run_task(self):
        date_insertion = datetime.now()
        logger.info("extracting %s ", self.input_filename)
        # this pattern matches the first date
        # e.g. '20200803ExtractApp'
        # will match 20200803
        date_string = self.input_filename.split('/')[-1][0:8] 
        try:
            self.last_historical_data_date_in_file = datetime.strptime(date_string, "%Y%m%d")
        except ValueError:
            raise Exception("couldn't find a date pattern in filename. filename should be \
                like 20200803ExtractApp.csv")

        count = 0
        statements = []
        something_new = False
        query = """
            INSERT into %s(
                siret,
                hiring_date,
                contract_type,
                departement,
                contract_duration,
                iiann,
                tranche_age,
                handicap_label,
                duree_pec,
                date_insertion
                )
            values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
        """ % settings.HIRING_TABLE
        imported_alternance_contracts = 0
        imported_alternance_contracts_distribution = {}
        not_imported_alternance_contracts = 0

        last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date))\
                                                            .filter(Hiring.contract_type == self.contract_type).first()[0]

        logger.info("will now extract all alternance contracts with hiring_date between %s and %s",
                    last_historical_data_date_in_db, self.last_historical_data_date_in_file)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()   # FIXME detect column positions from header
            
            if b"SIRET" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")

            for line in myfile:
                line = line.decode()
                count += 1
                if not count % 10000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except:
                        logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, departement = parse_alternance_line(line)
                except InvalidRowException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue
                except InvalidSiretException:
                    error_message = traceback.format_exc()
                    logger.info("invalid siret met at row: %i", count)
                    logger.info(error_message)
                    self.invalid_siret_errors += 1
                    continue
                except InvalidZipCodeException:
                    logger.info("invalid zip code met at row: %i", count)
                    self.invalid_zipcode_errors += 1
                    continue
                
                # This check is intentionally disabled: the data source has
                # many late contract inputs, so we insert ALL the contracts
                # regardless of date.

                #  alternance_contract_should_be_imported = (
                #      hiring_date > last_historical_data_date_in_db
                #      and hiring_date <= self.last_historical_data_date_in_file
                #  )

                if hiring_date <= self.last_historical_data_date_in_file:
                    statement = (
                        siret,
                        hiring_date,
                        self.contract_type,
                        departement,
                        None, #contract_duration
                        None, #iiann
                        None, #tranche_age
                        None, #handicap_label
                        None,  #duree_pec
                        date_insertion

                    )
                    statements.append(statement)
                    imported_alternance_contracts += 1

                    if hiring_date.year not in imported_alternance_contracts_distribution:
                        imported_alternance_contracts_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_alternance_contracts_distribution[hiring_date.year]:
                        imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month] = {}
                    if hiring_date.day not in imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month]:
                        imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] = 0
                    imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
            raise

        logger.info(f"Types de contrats à importer : {self.contract_name}")
        logger.info(f"processed {count} lba_contracts...")
        logger.info(f"imported lba_contracts: {imported_alternance_contracts}")
        logger.info(f"not imported lba_contracts: {not_imported_alternance_contracts}")
        logger.info(f"zipcode errors: {self.invalid_zipcode_errors}")
        logger.info(f"invalid_row errors: {self.invalid_row_errors}")
        logger.info(f"invalid siret errors: {self.invalid_siret_errors}")
#        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
#            raise IOError('too many zipcode errors')
#        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
#            raise IOError('too many invalid_row errors')

        con.commit()
        cur.close()
        con.close()

        try:
            statistics = DpaeStatistics(
                last_import=datetime.now(),
                most_recent_data_date=self.last_historical_data_date_in_file,
                file_type=self.file_type
            )
            db_session.add(statistics)
            db_session.commit()
            logger.info("First way to insert DPAE statistics in DB : OK")
        except OperationalError:
            # For an obscure reason, the DpaeStatistics way to insert does not work on the bonaparte server
            # So we insert it directly via an SQL query
            # This job has been broken for more than a year, only way to fix it : 
            db_session.rollback()
            last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            most_recent_date = self.last_historical_data_date_in_file.strftime('%Y-%m-%d %H:%M:%S')
            query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
            con, cur = import_util.create_cursor()
            cur.execute(query)
            con.commit()
            cur.close()
            con.close()
            logger.info("Second way to insert DPAE statistics in DB : OK")


        logger.info("finished importing dpae...")
        return something_new
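Taking the date prefix with split('/')[-1][0:8] only works on POSIX-style paths; os.path.basename is the portable spelling. A minimal sketch of the same extraction:

import os
from datetime import datetime

def extract_date_prefix(path):
    stem = os.path.basename(path)[:8]  # e.g. '20200803' from '20200803ExtractApp.csv'
    return datetime.strptime(stem, "%Y%m%d")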
Example #26
    def run_task(self):
        logger.info("extracting %s ", self.input_filename)
        # this pattern matches the first date
        # e.g. 'LBB_XDPDPAE_2018-09-12_2017-08-01.bz2'
        # will match 2018-09-12
        date_pattern = r'.*_(\d\d\d\d-\d\d-\d\d)_'
        date_match = re.match(date_pattern, self.input_filename)
        if date_match:
            date_part = date_match.groups()[0]
            self.last_historical_data_date_in_file = datetime.strptime(
                date_part, "%Y-%m-%d")
            logger.debug("identified last_historical_data_date_in_file=%s",
                         self.last_historical_data_date_in_file)
        else:
            raise Exception(
                "couldn't find a date pattern in filename. filename should be \
                like LBB_XDPDPAE_YYYY-MM-DD_YYYY-MM-DD.csv")

        count = 0
        statements = []
        something_new = False
        query = """
            INSERT into %s(
                siret,
                hiring_date,
                contract_type,
                departement,
                contract_duration,
                iiann,
                tranche_age,
                handicap_label,
                duree_pec
                )
            values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
        """ % settings.HIRING_TABLE
        imported_dpae = 0
        imported_dpae_distribution = {}
        not_imported_dpae = 0
        last_historical_data_date_in_db = DpaeStatistics.get_last_historical_data_date()

        logger.info(
            "will now extract all dpae with hiring_date between %s and %s",
            last_historical_data_date_in_db,
            self.last_historical_data_date_in_file)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()  # FIXME detect column positions from header
            if b"siret" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")
            for line in myfile:
                line = line.decode()
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except:
                        logger.error(
                            "error in executing statement into dpae table: %s",
                            sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
                except ValueError:
                    self.zipcode_errors += 1
                    continue
                except InvalidRowException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue

                dpae_should_be_imported = (
                    hiring_date > last_historical_data_date_in_db
                    and hiring_date <= self.last_historical_data_date_in_file
                    # For DPAE contracts we only keep all CDI, only long enough CDD (at least 31 days)
                    # and we ignore CTT.
                    and (contract_type == Hiring.CONTRACT_TYPE_CDI or
                         (contract_type == Hiring.CONTRACT_TYPE_CDD
                          and contract_duration is not None
                          and contract_duration > 31)))

                if dpae_should_be_imported:
                    statement = (
                        siret,
                        hiring_date,
                        contract_type,
                        departement,
                        contract_duration,
                        iiann,
                        tranche_age,
                        handicap_label,
                        duree_pec,
                    )
                    statements.append(statement)
                    imported_dpae += 1

                    if hiring_date.year not in imported_dpae_distribution:
                        imported_dpae_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_dpae_distribution[
                            hiring_date.year]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month] = {}
                    if hiring_date.day not in imported_dpae_distribution[
                            hiring_date.year][hiring_date.month]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month][hiring_date.day] = 0
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] += 1
                else:
                    not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into dpae table: %s",
                         sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
            raise IOError('too many zipcode errors')
        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
            raise IOError('too many invalid_row errors')
        con.commit()
        cur.close()
        con.close()
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.last_historical_data_date_in_file,
        )
        statistics.save()
        logger.info("finished importing dpae...")
        return something_new
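Several of these examples repeat the same retry-once-on-deadlock pattern around executemany. Factored into a helper it might look like this (a sketch; pass the driver's OperationalError as exc, since the exception class depends on the MySQL driver in use):

import time

def executemany_with_retry(cur, query, statements, exc=Exception, retries=1, delay=10):
    # exc should be the driver's OperationalError (deadlock) class
    for attempt in range(retries + 1):
        try:
            cur.executemany(query, statements)
            return
        except exc:
            if attempt == retries:
                raise
            time.sleep(delay)  # back off, then retry in case of deadlock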